diff options
-rw-r--r-- | Makefile | 34 | ||||
-rwxr-xr-x | bin/dateify | 14 | ||||
-rwxr-xr-x | bin/fmt-metadata | 25 | ||||
-rwxr-xr-x | bin/gitthing | 4 | ||||
-rwxr-xr-x | bin/poolify | 69 |
5 files changed, 133 insertions, 13 deletions
@@ -4,8 +4,9 @@ export PATH url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1))) murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1))) +dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; ) -all: download +all: dat/git dat/pools fix: grep -rl '<html><body><h1>503' dat | xargs rm -fv -- @@ -24,34 +25,43 @@ dat/urlkeys.mk: dat/urlkeys.txt dat/each-cdx/%.txt: @mkdir -p '$(@D)' cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@' -dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) - cat -- $(foreach c,$^,'$c') | sort > $@ +dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) urlkeys.txt + cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@ dat/index.mk: dat/index.txt - < $< sed 's,^,index+=web.archive.org/web/,;s, ,/,' > $@ + < $< sed -e 's,^,index+=,' -e 's, ,/,' > $@ -include dat/index.mk dat/content-dir/%/index.wahtml: @mkdir -p '$(@D)' - curl -s 'http://$(call murl2url,$*)' > $@ + curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@ dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml wayfore < $< > $@ dat/content-dir/%/readme.txt: dat/content-dir/%/index.html < $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@ dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html - < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > $@ + < $< grep '^<img' | sed 's/<[^>]*>//g' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u))) -download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) +download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) dat/content-file/%: @mkdir -p '$(@D)' - curl -s 'http://$(call murl2url,$*)' > $@ + curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u))) -download: $(content-file) +download += $(content-file) -git: download - gitthing dat/git < dat/index.txt +download: $(download) -.PHONY: all fix download git +dat/pools.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt + grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | bin/dateify | sed -r -e 's,.*web\.archive\.org/web/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt: ,/,' -e 's,\s+, ,g' | sort -u > $@ +dat/pools: $(download) dat/pools.txt dat/index.txt + rm -rf -- $@ $@.bak + poolify dat/pools.txt dat/index.txt || $(dirfail) + +dat/git: $(download) dat/index.txt + rm -rf -- $@ $@.bak + gitthing dat/git < dat/index.txt || $(dirfail) + +.PHONY: all fix download .DELETE_ON_ERROR: .SECONDARY: diff --git a/bin/dateify b/bin/dateify new file mode 100755 index 0000000..7aefdd9 --- /dev/null +++ b/bin/dateify @@ -0,0 +1,14 @@ +#!/usr/bin/sed -rf +s/\b([0-9]{2})-([A-Z][a-z][a-z])-([0-9]{4})(\b|T|_)/\3-\2-\1\4/ +s/Jan/01/ +s/Feb/02/ +s/Mar/03/ +s/Apr/04/ +s/May/05/ +s/Jun/06/ +s/Jul/07/ +s/Aug/08/ +s/Sep/09/ +s/Oct/10/ +s/Nov/11/ +s/Dec/12/ diff --git a/bin/fmt-metadata b/bin/fmt-metadata new file mode 100755 index 0000000..0682414 --- /dev/null +++ b/bin/fmt-metadata @@ -0,0 +1,25 @@ +#!/usr/bin/env ruby +require 'time' + +snapshot = ARGV.first.to_i + +$stdin.each_line do |line| + m = /^ (\S+) +(..-\S+-.... ..:..) +([0-9.]+)(\S+) *$/.match(line) + raise "Malformed line: #{line}" unless m + name = m[1] + datetime = m[2] + size_numb = m[3] + size_unit = m[4] + + next if name.downcase == "parent directory" + + # The Unicode.org web server switched the timezone of timestamps + # in May 2004 + if snapshot < 20040500000000 + datetime = Time.parse("#{datetime} +01:00").utc.strftime('%Y-%m-%d %H:%M') + else + datetime = Time.parse("#{datetime} +00:00").utc.strftime('%Y-%m-%d %H:%M') + end + + puts ("%-22s %s %3s%s" % [ name, datetime, size_numb, size_unit ]) +end diff --git a/bin/gitthing b/bin/gitthing index a54c017..7bac2e2 100755 --- a/bin/gitthing +++ b/bin/gitthing @@ -62,7 +62,9 @@ main() { if [[ -n "$(git status -s .)" ]]; then gitdate="$(sed -r 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6/' <<<"$time")" git add . - GIT_AUTHOR_DATE=$gitdate GIT_AUTHOR_DATE=$gitdate git commit -m "$time $url" + export GIT_AUTHOR_DATE=$gitdate + export GIT_COMMITTER_DATE=$gitdate + git commit -m "$time $url" fi done } diff --git a/bin/poolify b/bin/poolify new file mode 100755 index 0000000..34e0b42 --- /dev/null +++ b/bin/poolify @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +url2murl() { + local x + x=$1 + x=${x//'^'/'^5E'} + x=${x//':'/'^3A'} + x=${x//'%'/'^25'} + printf '%s' "$x" +} + +main() { + set -euE -o pipefail + shopt -s nullglob + + while read -r snap name date time size; do + dirpart="${name%/*}" + filepart="${name##*/}" + filedir=dat/pools/files/"${date//-/}${time//:/}-${name//\//_}" + snapdir=dat/pools/snaps/"${snap}-${dirpart//\//_}" + mkdir -p -- "$filedir" "$snapdir" + ln -sr "$filedir/$filepart" "$snapdir" + done < "$1" + + while read -r time url; do + name="${url##*/Public/}" + dirpart="${name%/*}" + filepart="${name##*/}" + + if [[ -z "$filepart" ]]; then + continue + fi + + pools=(dat/pools/files/*-"${name//\//_}") + pools=("${pools[@]##*/}") + + mypool='' + for pool in "${pools[@]}"; do + pooltime="${pool%%-*}" + if [[ "${pooltime}00" -le "$time" ]]; then + mypool=$pool + fi + done + if [[ -z "$mypool" ]]; then + >&2 printf 'Could not find pool for %s %s' "$time" "$url" + false + fi + + waurl="http://web.archive.org/web/$time/$url" + file="dat/content-file/$(url2murl "${waurl#http://}")" + + declare -i i=0 + while true; do + link="dat/pools/files/$mypool/$filepart.$i" + link="${link%.0}" + a="$(readlink -f "$link")" || true + b="$(readlink -f "$file")" + if cmp -s -- "$a" "$b"; then + break + fi + if ln -sr "$b" "$link"; then + break + fi + i+=1 + done + done < "$2" +} + +main "$@" |