SHELL=bash -o pipefail
PATH:=$(CURDIR)/bin:$(PATH)
export PATH
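
# The cdxget, urlkey2url, fmt-metadata, poolify, and gitify commands
# used in the recipes below are not standard tools; they are expected
# to be project-local scripts in `bin/`, which the PATH override above
# puts ahead of the system directories.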

url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )
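#
# Notes on the helpers above:
#  - url2murl escapes a URL so that it can be used as a file name and
#    as a Make pattern stem, mapping '^' -> '^5E', ':' -> '^3A', and
#    '%' -> '^25'; murl2url undoes the mapping.  For example, with a
#    hypothetical URL:
#        $(call url2murl,http://example.org/100%.txt)
#          => http^3A//example.org/100^25.txt
#  - dirfail is used as `|| $(dirfail)` in recipes whose target is a
#    directory: on failure it renames the partially-built '$@' to
#    '$@.bak' and exits with the recipe's original exit status.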

# The build is split into stages because Make has to make decisions
# about the build tree based on the output of a previous stage: each
# `$(MAKE)` below re-reads this Makefile, so the wildcard-guarded
# `include`s further down can pick up the files generated by the stage
# before.  That is: the stages exist for a technical GNU Make reason,
# not for human-comprehensibility reasons, which is why their sizes
# are so lopsided; the first two are tiny, and almost everything
# happens in the third stage.
all:
	# Stage 1 ######################################################################
	$(MAKE) dat/urlkeys.mk
	# Stage 2 ######################################################################
	$(MAKE) dat/index.mk
	# Stage 3 ######################################################################
	$(MAKE) dat/git
.PHONY: all

# Stage 1 ######################################################################
#
# Fetch a listing of all relevant URLs.
#
#  - `dat/cdxindex.txt`
#  - `dat/urlkeys.txt`
#  - `dat/urlkeys.mk`
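#
# The urlkeys come back from the CDX API in SURT form; for instance,
# the one key explicitly excluded by the `grep -vFx` below is
# `org,unicode)/public/2.0-update/cvtutf7.c`.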

dat:
	mkdir -p $@
dat/cdxindex.txt: | dat
	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'collapse=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' | grep -vFx 'org,unicode)/public/2.0-update/cvtutf7.c' > $@
dat/urlkeys.txt: dat/cdxindex.txt
	< $< cut -d '?' -f1 | sort -u > $@
dat/urlkeys.mk: dat/urlkeys.txt
	< $< sed 's/^/urlkeys+=/' > $@

# Stage 2 ######################################################################
#
# Fetch the history for each relevant URL.
#
# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`)
#
# - `dat/index.txt`
#   has a line for each relevant URL:
#
#       ${wayback_timestamp:YYYYmmddHHMMSS} ${url}
#
# - `dat/index.mk`
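#   is the same list reformatted into Make assignments so that Stage 3
#   can `include` it: each `${timestamp} ${url}` line of
#   `dat/index.txt` becomes `index+=${timestamp}/${url}`.
#
#   For illustration, a hypothetical `dat/index.txt` line:
#
#       20170115063142 http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c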
ifneq ($(wildcard dat/urlkeys.mk),)
include dat/urlkeys.mk

dat/each-cdx/%.txt:
	@mkdir -p '$(@D)'
	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@'
dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
	cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@
dat/index.mk: dat/index.txt
	< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@

# Stage 3 ######################################################################
#
# The main stage.
ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk

# Part 1: Directory indexes:
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt`
#   has a line for each file mentioned in index.html (this format is
#   controlled by `bin/fmt-metadata`):
#
#          ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM}
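#
#   For illustration, a hypothetical metadata.txt entry:
#
#          ConvertUTF.c 2004-06-02 20:52
#
# (index.html is an archived directory listing, apparently in Apache
# "fancy index" form: readme.txt is the contents of its <pre> block,
# and metadata.txt is scraped from its <img>-prefixed file rows.)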
dat/content-dir/%/index.html:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
	< $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,'  > $@
dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
	< $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))

# Part 2: File contents:
# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)`
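#
# (In this recipe and in the index.html recipe of Part 1, the `id_`
# spliced in after the snapshot timestamp asks the Wayback Machine for
# the original archived bytes, without its usual URL rewriting and
# toolbar injection.)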
dat/content-file/%:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
download += $(content-file)

# `download` is a convenience target to download files without
# processing them.  It isn't depended on by anything.
download: $(download)
.PHONY: download

# Part 3: Aggregate:
# - `dat/metadata.txt`
#   has a line for each file mentioned in any index.html:
#
#          ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM}
#
#   where the ${dirindex_wayback_timestamp} and ${branch_name} are
#   determined from the path to the relevant index.html.
#
# - `dat/pools/`
#   + pass 1 and pass 1.5
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/`
#     * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file)
#   + pass 2 and pass 3:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /file/ dir)
#
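# For illustration, a hypothetical `dat/metadata.txt` line:
#
#          20170115063142 PROGRAMS/CVTUTF/ConvertUTF.c 2004-06-02 20:52
#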
dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
	grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
dat/pools: $(download) dat/metadata.txt dat/index.txt
	rm -rf -- $@ $@.bak
	poolify dat/metadata.txt dat/index.txt || $(dirfail)

# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit
#
# - `dat/git/`
dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
	rm -rf -- $@ $@.bak
	gitify $@ || $(dirfail)

################################################################################
endif
endif

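# Delete the target of any rule whose recipe fails, so that a
# truncated or partial file is never mistaken for being up to date;
# and treat every target as secondary, so that Make never auto-deletes
# chain-built intermediate files.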
.DELETE_ON_ERROR:
.SECONDARY: