# Copyright (c) 2017, 2023  Luke Shumaker <lukeshu@lukeshu.com>
#
# This work is free.  You can redistribute it and/or modify it under
# the terms of the Do What The Fuck You Want To Public License,
# Version 2, as published by Sam Hocevar.  See the COPYING file for
# more details.

SHELL=bash -o pipefail
PATH:=$(CURDIR)/bin:$(PATH)
export PATH
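
# The PATH override above puts ./bin/ first, so the bare commands used
# in the recipes below (`cdxget`, `urlkey2url`, `fmt-metadata`,
# `poolify`, `gitify`) resolve to helper scripts from that directory;
# `cdxget` presumably wraps the Wayback Machine CDX API
# (https://web.archive.org/cdx/search/cdx), passing its arguments as
# query parameters.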

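# url2murl/murl2url convert between a URL and a "mangled URL" that can
# safely be used as a file name / Make target, by ^-escaping `^`, `:`,
# and `%` (presumably because `:` and `%` collide with Make's rule and
# pattern syntax).  A hypothetical example:
#
#     $(call url2murl,http://example.com/100%.txt)
#       => http^3A//example.com/100^25.txt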
url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
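# dirfail cleans up after a failed recipe whose target is a directory:
# it stashes the partial output as `$@.bak` (so a later run doesn't
# mistake it for a finished build) and re-raises the original exit
# status.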
dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )

# The build is split into stages because Make has to make decisions
# about the build tree based on the output of a previous stage: each
# `$(MAKE)` below re-reads this Makefile, so the `include` directives
# further down can see the files generated by the stage before.  That
# is, the stages exist for a technical GNU Make reason, not for
# human-comprehensibility reasons; consequently their sizes are
# lopsided: the first two are very small, and almost everything is in
# the third stage.
all:
	# Stage 1 ######################################################################
	$(MAKE) dat/urlkeys.mk
	# Stage 2 ######################################################################
	$(MAKE) dat/index.mk
	# Stage 3 ######################################################################
	$(MAKE) dat/git
.PHONY: all

COPYING:
	curl -L http://www.wtfpl.net/txt/copying/ >$@

# Stage 1 ######################################################################
#
# Fetch a listing of all relevant URLs.
#
#  - `dat/cdxindex.txt`
#  - `dat/urlkeys.txt`
#  - `dat/urlkeys.mk`

dat:
	mkdir -p $@
dat/cdxindex.txt: | dat
	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'collapse=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' | grep -vFx 'org,unicode)/public/2.0-update/cvtutf7.c' > $@
dat/urlkeys.txt: dat/cdxindex.txt
	< $< cut -d '?' -f1 | sort -u > $@
dat/urlkeys.mk: dat/urlkeys.txt
	< $< sed 's/^/urlkeys+=/' > $@
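
# For illustration: a CDX "urlkey" is a SURT-form URL (like the one in
# the `grep -vFx` filter above), so `dat/urlkeys.mk` ends up with lines
# such as this hypothetical one:
#
#     urlkeys+=org,unicode)/public/programs/cvtutf/convertutf.c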

# Stage 2 ######################################################################
#
# Fetch the history for each relevant URL.
#
# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`)
#
# - `dat/index.txt`
#   has a line for each relevant URL:
#
#       ${wayback_timestamp:YYYYmmddHHMMSS} ${url}
#
# - `dat/index.mk`
ifneq ($(wildcard dat/urlkeys.mk),)
include dat/urlkeys.mk

dat/each-cdx/%.txt:
	@mkdir -p '$(@D)'
	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@'
dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
	cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@
dat/index.mk: dat/index.txt
	< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@
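
# For illustration (hypothetical values): a `dat/index.txt` line such as
#
#     20040131123456 http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
#
# becomes the `dat/index.mk` line
#
#     index+=20040131123456/http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c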

# Stage 3 ######################################################################
#
# The main stage.
ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk

# Part 1: Directory indexes:
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt`
#   has a line for each file mentioned in index.html (this format is
#   controlled by `bin/fmt-metadata`):
#
#          ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM}
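#
#   for example (hypothetical values):
#
#          ConvertUTF.c 2004-01-13 17:21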
dat/content-dir/%/index.html:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
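# (The `id_` spliced into the Wayback timestamp by the `subst` above
# asks web.archive.org for the original archived bytes, without the
# Wayback Machine's link-rewriting; the same trick is used for file
# contents in Part 2 below.)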
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
	< $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,'  > $@
dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
	< $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))

# Part 2: File contents:
# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)`
dat/content-file/%:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
download += $(content-file)

# `download` is a convenience target to download files without
# processing them.  It isn't depended on by anything.
download: $(download)
.PHONY: download

# Part 3: Aggregate:
# - `dat/metadata.txt`
#   has a line for each file mentioned in any index.html:
#
#          ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM}
#
#   where the ${dirindex_wayback_timestamp} and ${branch_name} are
#   determined from the path to the relevant index.html.
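#
#   for example (hypothetical values):
#
#          20040131123456 PROGRAMS/CVTUTF/ConvertUTF.c 2004-01-13 17:21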
#
# - `dat/pools/`
#   + pass 1 and pass 1.5:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/`
#     * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file)
#   + pass 2 and pass 3:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /files/ dir)
#
dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
	grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
dat/pools: $(download) dat/metadata.txt dat/index.txt
	rm -rf -- $@ $@.bak
	poolify dat/metadata.txt dat/index.txt || $(dirfail)

# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit
#
# - `dat/git/`
dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
	rm -rf -- $@ $@.bak
	gitify $@ || $(dirfail)
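
# After a successful run, `dat/git/` should be a Git repository whose
# history presumably contains one commit per `dat/pools/snaps/*`
# snapshot; a hypothetical way to inspect it:
#
#     git -C dat/git log --oneline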

################################################################################
endif
endif

.DELETE_ON_ERROR:
.SECONDARY: