summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2017-07-01 18:17:38 -0400
committerLuke Shumaker <lukeshu@lukeshu.com>2017-07-01 18:17:38 -0400
commit55f9bd8d5226eb49f5899c5ddca89c655ef96e9b (patch)
tree53096770e331f2d571fc9e49af5000cfed1fa0d8
parente4109fab0bece003dc53c78c7cc8608b68328312 (diff)
stuff
-rw-r--r--Makefile34
-rwxr-xr-xbin/dateify14
-rwxr-xr-xbin/fmt-metadata25
-rwxr-xr-xbin/gitthing4
-rwxr-xr-xbin/poolify69
5 files changed, 133 insertions, 13 deletions
diff --git a/Makefile b/Makefile
index 12c5bb6..2b96cd7 100644
--- a/Makefile
+++ b/Makefile
@@ -4,8 +4,13 @@ export PATH
url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
+# Failure handler for recipes that build a directory: written as
+# `cmd || $(dirfail)`, it remembers cmd's exit status, renames the target to
+# '<target>.bak', and exits with the remembered status.  Both paths are written
+# out in full rather than with brace expansion, which POSIX sh lacks.
+dirfail = ( r=$$?; mv -- '$@' '$@.bak'; exit $$r; )
-all: download
+all: dat/git dat/pools
fix:
grep -rl '<html><body><h1>503' dat | xargs rm -fv --
@@ -24,34 +25,51 @@ dat/urlkeys.mk: dat/urlkeys.txt
dat/each-cdx/%.txt:
@mkdir -p '$(@D)'
cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
-dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
- cat -- $(foreach c,$^,'$c') | sort > $@
+# dat/urlkeys.txt is a prerequisite but not an input: the recipe filters it
+# out and concatenates only the per-key CDX files.
+dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
+ cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@
dat/index.mk: dat/index.txt
- < $< sed 's,^,index+=web.archive.org/web/,;s, ,/,' > $@
+ < $< sed -e 's,^,index+=,' -e 's, ,/,' > $@
-include dat/index.mk
dat/content-dir/%/index.wahtml:
@mkdir -p '$(@D)'
- curl -s 'http://$(call murl2url,$*)' > $@
+ curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@
dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
wayfore < $< > $@
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
< $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
- < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > $@
+ < $< grep '^<img' | sed 's/<[^>]*>//g' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
-download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
dat/content-file/%:
@mkdir -p '$(@D)'
- curl -s 'http://$(call murl2url,$*)' > $@
+ curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@
content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
-download: $(content-file)
+download += $(content-file)
-git: download
- gitthing dat/git < dat/index.txt
+download: $(download)
-.PHONY: all fix download git
+# One line per listed file: "<snapshot> <dir>/<name> <date> <time> <size>".
+# The leading path is stripped whether or not the content-dir tree carries a
+# web.archive.org/web/ component, and whitespace after "metadata.txt:" is
+# optional because fmt-metadata prints names without leading whitespace.
+dat/pools.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
+ grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | bin/dateify | sed -r -e 's,^dat/content-dir/(web\.archive\.org/web/)?,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:\s*,/,' -e 's,\s+, ,g' | sort -u > $@
+# Directory targets (dat/pools, dat/git): rebuild from scratch, and on failure
+# let $(dirfail) move the partial result aside to $@.bak.
+dat/pools: $(download) dat/pools.txt dat/index.txt
+ rm -rf -- $@ $@.bak
+ poolify dat/pools.txt dat/index.txt || $(dirfail)
+
+dat/git: $(download) dat/index.txt
+ rm -rf -- $@ $@.bak
+ gitthing dat/git < dat/index.txt || $(dirfail)
+
+.PHONY: all fix download
.DELETE_ON_ERROR:
.SECONDARY:
diff --git a/bin/dateify b/bin/dateify
new file mode 100755
index 0000000..7aefdd9
--- /dev/null
+++ b/bin/dateify
@@ -0,0 +1,22 @@
+#!/usr/bin/sed -rf
+# Rewrite the first DD-Mon-YYYY date on each input line as YYYY-MM-DD.
+#
+# While it is being translated, the month abbreviation is fenced between
+# newline characters (which this script never otherwise puts in the pattern
+# space), so the same three letters elsewhere on the line, e.g. in a file
+# name, are left untouched.
+s/\b([0-9]{2})-([A-Z][a-z][a-z])-([0-9]{4})(\b|T|_)/\3-\n\2\n-\1\4/
+s/\nJan\n/01/
+s/\nFeb\n/02/
+s/\nMar\n/03/
+s/\nApr\n/04/
+s/\nMay\n/05/
+s/\nJun\n/06/
+s/\nJul\n/07/
+s/\nAug\n/08/
+s/\nSep\n/09/
+s/\nOct\n/10/
+s/\nNov\n/11/
+s/\nDec\n/12/
+# Unrecognised abbreviation: drop the fence and keep the letters as they were.
+s/\n([A-Z][a-z][a-z])\n/\1/
diff --git a/bin/fmt-metadata b/bin/fmt-metadata
new file mode 100755
index 0000000..0682414
--- /dev/null
+++ b/bin/fmt-metadata
@@ -0,0 +1,28 @@
+#!/usr/bin/env ruby
+# Usage: fmt-metadata SNAPSHOT < listing
+#
+# Reads tag-stripped directory-listing lines from stdin and prints, for each
+# entry, its name, its modification time converted to UTC as
+# "YYYY-MM-DD HH:MM", and its size.  SNAPSHOT is parsed as an integer and
+# compared against 20040500000000 to choose the timezone of the input times.
+require 'time'
+
+snapshot = ARGV.first.to_i
+
+# The Unicode.org web server switched the timezone of timestamps
+# in May 2004
+utc_offset = snapshot < 20040500000000 ? '+01:00' : '+00:00'
+
+$stdin.each_line do |line|
+  # A "Parent Directory" entry has a space in its name, so it can never
+  # satisfy the single-word name group below; drop it before matching.
+  next if line =~ /parent directory/i
+
+  m = /^ (\S+) +(..-\S+-.... ..:..) +([0-9.]+)(\S+) *$/.match(line)
+  raise "Malformed line: #{line}" unless m
+  name, datetime, size_numb, size_unit = m.captures
+
+  datetime = Time.parse("#{datetime} #{utc_offset}").utc.strftime('%Y-%m-%d %H:%M')
+
+  puts("%-22s %s %3s%s" % [name, datetime, size_numb, size_unit])
+end
diff --git a/bin/gitthing b/bin/gitthing
index a54c017..7bac2e2 100755
--- a/bin/gitthing
+++ b/bin/gitthing
@@ -62,7 +62,9 @@ main() {
if [[ -n "$(git status -s .)" ]]; then
gitdate="$(sed -r 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6/' <<<"$time")"
git add .
- GIT_AUTHOR_DATE=$gitdate GIT_AUTHOR_DATE=$gitdate git commit -m "$time $url"
+ export GIT_AUTHOR_DATE=$gitdate
+ export GIT_COMMITTER_DATE=$gitdate
+ git commit -m "$time $url"
fi
done
}
diff --git a/bin/poolify b/bin/poolify
new file mode 100755
index 0000000..34e0b42
--- /dev/null
+++ b/bin/poolify
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+# Usage: poolify POOLS_TXT INDEX_TXT
+#
+# Builds a tree of symlinks under dat/pools/:
+#   files/<date><time>-<name>/  one "pool" directory per listed file version
+#   snaps/<snap>-<dir>/         per-snapshot directories linking into the pools
+
+# url2murl URL
+# Print URL with '^', ':' and '%' replaced by '^5E', '^3A' and '^25'.
+url2murl() {
+	local x
+	x=$1
+	x=${x//'^'/'^5E'}
+	x=${x//':'/'^3A'}
+	x=${x//'%'/'^25'}
+	printf '%s' "$x"
+}
+
+main() {
+	set -euE -o pipefail
+	shopt -s nullglob
+
+	# Pass 1 (POOLS_TXT): create a pool directory for every listed file
+	# version and link it from the directory of the snapshot that listed it.
+	while read -r snap name date time size; do
+		dirpart="${name%/*}"
+		filepart="${name##*/}"
+		filedir=dat/pools/files/"${date//-/}${time//:/}-${name//\//_}"
+		snapdir=dat/pools/snaps/"${snap}-${dirpart//\//_}"
+		mkdir -p -- "$filedir" "$snapdir"
+		ln -sr "$filedir/$filepart" "$snapdir"
+	done < "$1"
+
+	# Pass 2 (INDEX_TXT): file each downloaded capture into a pool.
+	while read -r time url; do
+		name="${url##*/Public/}"
+		filepart="${name##*/}"
+
+		# Entries whose URL ends in "/" have no file part; skip them.
+		if [[ -z "$filepart" ]]; then
+			continue
+		fi
+
+		pools=(dat/pools/files/*-"${name//\//_}")
+		pools=("${pools[@]##*/}")
+
+		# Globs expand in sorted order, so the last pool that passes the
+		# test is the newest one not dated after this capture.
+		mypool=''
+		for pool in "${pools[@]}"; do
+			pooltime="${pool%%-*}"
+			if [[ "${pooltime}00" -le "$time" ]]; then
+				mypool=$pool
+			fi
+		done
+		if [[ -z "$mypool" ]]; then
+			>&2 printf 'Could not find pool for %s %s\n' "$time" "$url"
+			exit 1
+		fi
+
+		# Downloads live at dat/content-file/<url2murl of "time/url">; fall
+		# back to the layout with a web.archive.org/web/ prefix if that
+		# path does not exist.
+		file="dat/content-file/$(url2murl "$time/$url")"
+		if [[ ! -e "$file" ]]; then
+			file="dat/content-file/$(url2murl "web.archive.org/web/$time/$url")"
+		fi
+
+		# Link the capture as NAME, NAME.1, NAME.2, ... using the first
+		# name that is either free or already resolves to identical content.
+		declare -i i=0
+		while true; do
+			link="dat/pools/files/$mypool/$filepart.$i"
+			link="${link%.0}"
+			a="$(readlink -f "$link")" || true
+			b="$(readlink -f "$file")"
+			if cmp -s -- "$a" "$b"; then
+				break
+			fi
+			if ln -sr "$b" "$link"; then
+				break
+			fi
+			i+=1
+		done
+	done < "$2"
+}
+
+main "$@"