diff options
-rw-r--r-- | Makefile | 33 | ||||
-rwxr-xr-x | bin/dateify | 14 | ||||
-rwxr-xr-x | bin/fmt-metadata | 4 | ||||
-rwxr-xr-x | bin/gitthing | 5 | ||||
-rwxr-xr-x | bin/poolify | 3 |
5 files changed, 28 insertions, 31 deletions
@@ -6,41 +6,52 @@ url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1))) murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1))) dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; ) -all: dat/git dat/pools +all: + $(MAKE) dat/urlkeys.mk + $(MAKE) dat/index.mk + $(MAKE) dat/pools fix: grep -rl '<html><body><h1>503' dat | xargs rm -fv -- +.PHONY: all fix + +# Stage 1 ###################################################################### + dat: mkdir -p $@ dat/cdxindex.txt: | dat cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@ dat/urlkeys.txt: dat/cdxindex.txt - cat $^ | cut -d '?' -f1 | sort -u > $@ + < $< cut -d '?' -f1 | sort -u > $@ dat/urlkeys.mk: dat/urlkeys.txt - cat $^ | sed 's/^/urlkeys+=/' < $< > $@ + < $< sed 's/^/urlkeys+=/' > $@ --include dat/urlkeys.mk +# Stage 2 ###################################################################### +ifneq ($(wildcard dat/urlkeys.mk),) +include dat/urlkeys.mk dat/each-cdx/%.txt: @mkdir -p '$(@D)' cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@' -dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) urlkeys.txt +dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@ dat/index.mk: dat/index.txt < $< sed -e 's,^,index+=,' -e 's, ,/,' > $@ +# Stage 3 ###################################################################### +ifneq ($(wildcard dat/index.mk),) -include dat/index.mk dat/content-dir/%/index.wahtml: @mkdir -p '$(@D)' curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@ dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml - wayfore < $< > $@ + < $< wayfore > $@ dat/content-dir/%/readme.txt: dat/content-dir/%/index.html < $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@ dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html - < $< grep '^<img' | sed 's/<[^>]*>//g' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ + < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u))) download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) @@ -51,9 +62,10 @@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url download += $(content-file) download: $(download) +.PHONY: download dat/pools.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt - grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | bin/dateify | sed -r -e 's,.*web\.archive\.org/web/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt: ,/,' -e 's,\s+, ,g' | sort -u > $@ + grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@ dat/pools: $(download) dat/pools.txt dat/index.txt rm -rf -- $@ $@.bak poolify dat/pools.txt dat/index.txt || $(dirfail) @@ -62,6 +74,9 @@ dat/git: $(download) dat/index.txt rm -rf -- $@ $@.bak gitthing dat/git < dat/index.txt || $(dirfail) -.PHONY: all fix download +################################################################################ +endif +endif + .DELETE_ON_ERROR: .SECONDARY: diff --git a/bin/dateify b/bin/dateify deleted file mode 100755 index 7aefdd9..0000000 --- a/bin/dateify +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/sed -rf -s/\b([0-9]{2})-([A-Z][a-z][a-z])-([0-9]{4})(\b|T|_)/\3-\2-\1\4/ -s/Jan/01/ -s/Feb/02/ -s/Mar/03/ -s/Apr/04/ -s/May/05/ -s/Jun/06/ -s/Jul/07/ -s/Aug/08/ -s/Sep/09/ -s/Oct/10/ -s/Nov/11/ -s/Dec/12/ diff --git a/bin/fmt-metadata b/bin/fmt-metadata index 0682414..7867d63 100755 --- a/bin/fmt-metadata +++ b/bin/fmt-metadata @@ -4,15 +4,13 @@ require 'time' snapshot = ARGV.first.to_i $stdin.each_line do |line| - m = /^ (\S+) +(..-\S+-.... ..:..) +([0-9.]+)(\S+) *$/.match(line) + m = /^ (\S+) +(..-\S+-.... ..:..) +([0-9.-]+)(\S+) *$/.match(line) raise "Malformed line: #{line}" unless m name = m[1] datetime = m[2] size_numb = m[3] size_unit = m[4] - next if name.downcase == "parent directory" - # The Unicode.org web server switched the timezone of timestamps # in May 2004 if snapshot < 20040500000000 diff --git a/bin/gitthing b/bin/gitthing index 7bac2e2..ff7b5ac 100755 --- a/bin/gitthing +++ b/bin/gitthing @@ -43,12 +43,11 @@ main() { git checkout -b "$branch" || true git checkout "$branch" - waurl="http://web.archive.org/web/$time/$url" if [[ -n "$filepart" ]]; then - file="$top/dat/content-file/$(url2murl "${waurl#http://}")" + file="$top/dat/content-file/$time/$(url2murl "$url")" cp "$file" . else - dir="$top/dat/content-dir/$(url2murl "${waurl#http://}")" + dir="$top/dat/content-dir/$time/$(url2murl "$url")" comm -23 \ <(git ls-files) \ <(< "$dir/metadata.txt" awk '{print $1}') \ diff --git a/bin/poolify b/bin/poolify index 34e0b42..af8bf40 100755 --- a/bin/poolify +++ b/bin/poolify @@ -46,8 +46,7 @@ main() { false fi - waurl="http://web.archive.org/web/$time/$url" - file="dat/content-file/$(url2murl "${waurl#http://}")" + file="dat/content-file/$time/$(url2murl $url)" declare -i i=0 while true; do |