summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Makefile 33
-rwxr-xr-x bin/dateify 14
-rwxr-xr-x bin/fmt-metadata 4
-rwxr-xr-x bin/gitthing 5
-rwxr-xr-x bin/poolify 3
5 files changed, 28 insertions, 31 deletions
diff --git a/Makefile b/Makefile
index 2b96cd7..55041ff 100644
--- a/Makefile
+++ b/Makefile
@@ -6,41 +6,52 @@ url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )
-all: dat/git dat/pools
+all:
+ $(MAKE) dat/urlkeys.mk
+ $(MAKE) dat/index.mk
+ $(MAKE) dat/pools
fix:
grep -rl '<html><body><h1>503' dat | xargs rm -fv --
+.PHONY: all fix
+
+# Stage 1 ######################################################################
+
dat:
mkdir -p $@
dat/cdxindex.txt: | dat
cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@
dat/urlkeys.txt: dat/cdxindex.txt
- cat $^ | cut -d '?' -f1 | sort -u > $@
+ < $< cut -d '?' -f1 | sort -u > $@
dat/urlkeys.mk: dat/urlkeys.txt
- cat $^ | sed 's/^/urlkeys+=/' < $< > $@
+ < $< sed 's/^/urlkeys+=/' > $@
--include dat/urlkeys.mk
+# Stage 2 ######################################################################
+ifneq ($(wildcard dat/urlkeys.mk),)
+include dat/urlkeys.mk
dat/each-cdx/%.txt:
@mkdir -p '$(@D)'
cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
-dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) urlkeys.txt
+dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@
dat/index.mk: dat/index.txt
< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@
+# Stage 3 ######################################################################
+ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk
dat/content-dir/%/index.wahtml:
@mkdir -p '$(@D)'
curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@
dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
- wayfore < $< > $@
+ < $< wayfore > $@
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
< $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
- < $< grep '^<img' | sed 's/<[^>]*>//g' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
+ < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
@@ -51,9 +62,10 @@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url
download += $(content-file)
download: $(download)
+.PHONY: download
dat/pools.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
- grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | bin/dateify | sed -r -e 's,.*web\.archive\.org/web/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt: ,/,' -e 's,\s+, ,g' | sort -u > $@
+ grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
dat/pools: $(download) dat/pools.txt dat/index.txt
rm -rf -- $@ $@.bak
poolify dat/pools.txt dat/index.txt || $(dirfail)
@@ -62,6 +74,9 @@ dat/git: $(download) dat/index.txt
rm -rf -- $@ $@.bak
gitthing dat/git < dat/index.txt || $(dirfail)
-.PHONY: all fix download
+################################################################################
+endif
+endif
+
.DELETE_ON_ERROR:
.SECONDARY:
diff --git a/bin/dateify b/bin/dateify
deleted file mode 100755
index 7aefdd9..0000000
--- a/bin/dateify
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/sed -rf
-s/\b([0-9]{2})-([A-Z][a-z][a-z])-([0-9]{4})(\b|T|_)/\3-\2-\1\4/
-s/Jan/01/
-s/Feb/02/
-s/Mar/03/
-s/Apr/04/
-s/May/05/
-s/Jun/06/
-s/Jul/07/
-s/Aug/08/
-s/Sep/09/
-s/Oct/10/
-s/Nov/11/
-s/Dec/12/
diff --git a/bin/fmt-metadata b/bin/fmt-metadata
index 0682414..7867d63 100755
--- a/bin/fmt-metadata
+++ b/bin/fmt-metadata
@@ -4,15 +4,13 @@ require 'time'
snapshot = ARGV.first.to_i
$stdin.each_line do |line|
- m = /^ (\S+) +(..-\S+-.... ..:..) +([0-9.]+)(\S+) *$/.match(line)
+ m = /^ (\S+) +(..-\S+-.... ..:..) +([0-9.-]+)(\S+) *$/.match(line)
raise "Malformed line: #{line}" unless m
name = m[1]
datetime = m[2]
size_numb = m[3]
size_unit = m[4]
- next if name.downcase == "parent directory"
-
# The Unicode.org web server switched the timezone of timestamps
# in May 2004
if snapshot < 20040500000000
diff --git a/bin/gitthing b/bin/gitthing
index 7bac2e2..ff7b5ac 100755
--- a/bin/gitthing
+++ b/bin/gitthing
@@ -43,12 +43,11 @@ main() {
git checkout -b "$branch" || true
git checkout "$branch"
- waurl="http://web.archive.org/web/$time/$url"
if [[ -n "$filepart" ]]; then
- file="$top/dat/content-file/$(url2murl "${waurl#http://}")"
+ file="$top/dat/content-file/$time/$(url2murl "$url")"
cp "$file" .
else
- dir="$top/dat/content-dir/$(url2murl "${waurl#http://}")"
+ dir="$top/dat/content-dir/$time/$(url2murl "$url")"
comm -23 \
<(git ls-files) \
<(< "$dir/metadata.txt" awk '{print $1}') \
diff --git a/bin/poolify b/bin/poolify
index 34e0b42..af8bf40 100755
--- a/bin/poolify
+++ b/bin/poolify
@@ -46,8 +46,7 @@ main() {
false
fi
- waurl="http://web.archive.org/web/$time/$url"
- file="dat/content-file/$(url2murl "${waurl#http://}")"
+ file="dat/content-file/$time/$(url2murl $url)"
declare -i i=0
while true; do