summaryrefslogtreecommitdiff
path: root/Makefile
blob: 2ee9a4265f825cbe29bc1df6e0d5bc60b9accfaf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Run every recipe with bash and `pipefail`, so that a failing command
# anywhere in a pipeline makes the whole recipe line fail.
SHELL = bash -o pipefail
# Prepend $(CURDIR)/bin to PATH and export it to the recipes' environment.
export PATH := $(CURDIR)/bin:$(PATH)

# url2murl / murl2url: reversible escaping between a URL and the form used
# in file/target names in this Makefile.
#   encode:  ^ -> ^5E,  : -> ^3A,  % -> ^25   (the escape char ^ goes first)
#   decode:  the same three mappings undone in the opposite order
# Usage: $(call url2murl,<url>)  /  $(call murl2url,<escaped url>)
url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$(1))))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$(1))))
# dirfail: meant to be used as `command || $(dirfail)` in a recipe.  It
# remembers the failed command's exit status, renames the target to
# <target>.bak, and then exits with the remembered status.
dirfail = ( r=$$?; mv -- '$@' '$@.bak'; exit $$r; )

# Default target: build the three stages in order, each in its own sub-make.
# Stages 2 and 3 are only parsed when dat/urlkeys.mk / dat/index.mk already
# exist (see the `ifneq ($(wildcard ...` guards further down), so a fresh
# $(MAKE) is started after each stage to re-read the Makefile together with
# the file the previous stage generated.
all:
	$(MAKE) dat/urlkeys.mk
	$(MAKE) dat/index.mk
	$(MAKE) dat/git

# Delete every file under dat/ that contains the opening of an HTML "503"
# error page, so that the next build regenerates it.
#
# - grep exits 1 when nothing matches; with `pipefail` that would fail the
#   recipe even though "nothing to fix" is a success, so status 1 (and only
#   1 -- real errors still fail) is mapped to success.
# - File names are passed NUL-separated (-Z / -0) so names containing
#   quotes, backslashes or whitespace reach rm intact; -r skips running rm
#   when there is nothing to delete.
fix:
	{ grep -rlZ -- '<html><body><h1>503' dat || [ $$? -eq 1 ]; } | xargs -0 -r rm -fv --

.PHONY: all fix

# Stage 1 ######################################################################

# Output directory for everything this Makefile generates.
dat:
	mkdir -p $@

# Raw list of urlkeys as printed by `cdxget` for the query below (status 200
# captures under www.unicode.org/Public/ whose urlkey matches cvtutf or
# convertutf).  `dat` is order-only: it must exist, but its mtime is ignored.
dat/cdxindex.txt: | dat
	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@

# Keep only the part of each line before the first '?', sorted and unique.
dat/urlkeys.txt: dat/cdxindex.txt
	cut -d '?' -f1 < $< | sort -u > $@

# Makefile fragment: one `urlkeys+=<key>` assignment per line of urlkeys.txt.
dat/urlkeys.mk: dat/urlkeys.txt
	sed 's/^/urlkeys+=/' < $< > $@

# Stage 2 ######################################################################
# Everything from here to the final `endif` is parsed only once Stage 1 has
# produced dat/urlkeys.mk, which defines $(urlkeys).
ifneq ($(wildcard dat/urlkeys.mk),)
include dat/urlkeys.mk

# One file per urlkey (the pattern stem $* is the urlkey): the
# `timestamp original` output of `cdxget` for that URL, restricted to
# status 200.  `urlkey2url` is run by the shell (hence the doubled `$$`)
# to convert the urlkey into the URL that is queried.
dat/each-cdx/%.txt:
	@mkdir -p '$(@D)'
	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@'
# Sorted concatenation of all per-urlkey capture lists.  dat/urlkeys.txt is
# a prerequisite too, but only the dat/each-cdx/ files are fed to `cat`;
# each of those names is wrapped in single quotes for the shell.
dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
	cat -- $(patsubst %,'%',$(filter dat/each-cdx/%,$^)) | sort > $@
# Makefile fragment: each index.txt line gets `index+=` prepended and its
# first space replaced by `/`.
dat/index.mk: dat/index.txt
	sed -e 's,^,index+=,' -e 's, ,/,' < $< > $@

# Stage 3 ######################################################################
# Stage 3 is parsed only once Stage 2 has produced dat/index.mk, which
# defines $(index).  This conditional is closed by the first of the two
# `endif` lines near the end of the file.
ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk

# Download the page at web.archive.org/web/<stem>, where the stem $* is
# un-escaped with murl2url to rebuild the URL path.
#
# -f makes curl exit non-zero on an HTTP error status instead of saving the
# error page as if it were content; together with .DELETE_ON_ERROR the
# bogus file is then removed.  -S still prints the error despite -s.
# '$@' is quoted because the path is derived from a URL.
dat/content-dir/%/index.wahtml:
	@mkdir -p '$(@D)'
	curl -sSfL 'http://web.archive.org/web/$(call murl2url,$*)' > '$@'
# Files derived from a downloaded directory-listing page.  Source and target
# paths are derived from URLs, so they are single-quoted for the shell, as
# the Stage 2 rules already do.

# index.html: index.wahtml filtered through `wayfore`.
dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
	< '$<' wayfore > '$@'

# readme.txt: the text between a line that is exactly `<pre>` and the next
# `</pre>`; the opening line is dropped and `</pre>` plus anything after it
# on its line is removed.
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
	< '$<' sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > '$@'

# metadata.txt: lines starting with `<img`, with all tags stripped and the
# "parent directory" entry removed, passed through `fmt-metadata`.  Its
# argument is the first path component of the stem.
dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
	< '$<' grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata '$(firstword $(subst /, ,$*))' > '$@'
# content-dir: one dat/content-dir/<escaped entry> path for every $(index)
# entry that ends in `/`; the trailing slash is kept, so a file name can be
# appended directly.
content-dir = $(addprefix dat/content-dir/,$(call url2murl,$(filter %/,$(index))))
# Each such directory contributes a readme.txt and a metadata.txt to the
# download set (all readme.txt paths first, then all metadata.txt paths).
download += $(foreach f,readme.txt metadata.txt,$(addsuffix $(f),$(content-dir)))

# Download a single file from web.archive.org/web/<stem>, where the stem $*
# is un-escaped with murl2url to rebuild the URL path.
#
# -f makes curl exit non-zero on an HTTP error status instead of saving the
# error page as the file's content; together with .DELETE_ON_ERROR the
# bogus file is then removed.  -S still prints the error despite -s.
# '$@' is quoted because the path is derived from a URL.
dat/content-file/%:
	@mkdir -p '$(@D)'
	curl -sSfL 'http://web.archive.org/web/$(call murl2url,$*)' > '$@'
# content-file: one dat/content-file/<escaped entry> path for every $(index)
# entry that does NOT end in `/`.  All of them are part of the download set.
content-file = $(addprefix dat/content-file/,$(call url2murl,$(filter-out %/,$(index))))
download += $(content-file)

# Convenience target: build every file collected in $(download) (the
# readme.txt/metadata.txt of each content-dir plus every content-file).
download: $(download)
.PHONY: download

# Merge all metadata.txt files into one sorted, de-duplicated list.  grep
# prefixes each line with `<file>:`; sed then rewrites that prefix:
#   - drop the leading `dat/content-dir/`
#   - replace everything from the first `/` through the last `/Public/`
#     with a single space
#   - turn `/metadata.txt:` into `/`
#   - squeeze runs of whitespace into one space
#
# `-H` together with the extra /dev/null operand guarantees the `<file>:`
# prefix even when there is exactly one metadata.txt (plain grep omits it
# for a single file, which would break the sed expressions) and keeps grep
# from reading stdin when there are none.
dat/pools.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
	grep -H ^ /dev/null $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
# Rebuild dat/pools from scratch: remove the old target and any old .bak,
# then run `poolify` on the two list files.  If poolify fails, $(dirfail)
# renames whatever is left at dat/pools to dat/pools.bak and re-raises
# poolify's exit status.
# NOTE(review): poolify is not given $@; presumably it writes dat/pools
# itself -- confirm against bin/poolify.
dat/pools: $(download) dat/pools.txt dat/index.txt
	rm -rf -- $@ $@.bak
	poolify dat/pools.txt dat/index.txt || $(dirfail)
# Final product: remove the old target and any old .bak, then run
# `gitify dat/git`.  On failure $(dirfail) renames whatever is left at
# dat/git to dat/git.bak and re-raises gitify's exit status.
dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
	rm -rf -- $@ $@.bak
	gitify $@ || $(dirfail)

################################################################################
# closes: ifneq ($(wildcard dat/index.mk),)    -- Stage 3
endif
# closes: ifneq ($(wildcard dat/urlkeys.mk),)  -- Stage 2
endif

# Remove a target file whose recipe failed, so a partial output is never
# mistaken for an up-to-date one.
.DELETE_ON_ERROR:
# With no prerequisites, treat every target as secondary: intermediate files
# are never deleted automatically.
.SECONDARY: