diff options
-rw-r--r-- | Makefile | 28 | ||||
-rwxr-xr-x | bin/wayfore | 4 |
2 files changed, 31 insertions, 1 deletion
# Patch summary (commentary; ignored by patch tools):
#   Makefile
#     * `all` now depends on `download` instead of dat/index.txt.
#     * dat/index.mk is generated from dat/index.txt by sed: every index line
#       becomes `index+=web.archive.org/web/<timestamp>/<original-url>` and the
#       file is then included (optionally, via `-include`).
#     * url2murl / murl2url escape and unescape `^`, `:` and `%` (as ^5E, ^3A,
#       ^25) so that URLs can be used inside make target names.
#     * Index entries ending in `/` are fetched under dat/content-dir/: the raw
#       page is saved as index.wahtml, filtered through bin/wayfore into
#       index.html, and readme.txt / metadata.txt are extracted from that.
#     * All other index entries are fetched as-is under dat/content-file/.
#   bin/wayfore
#     * New sed script that strips the Wayback "Rewrite JS Include" block from
#       <head> and the Wayback toolbar insert from the page.
#
# Review adjustments relative to the scraped patch (line counts unchanged):
#     * Redirections in the new recipes quote '$@' and '$<', matching the
#       existing `> '$@'` in the dat/each-cdx rule; target names are derived
#       from URLs and may contain shell metacharacters.
#       NOTE(review): a URL containing a single quote would still break the
#       quoting - confirm none occur in the index.
#     * curl runs with -sSf so an HTTP error response fails the recipe (and
#       .DELETE_ON_ERROR removes the target) instead of being stored as content.
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,10 @@ SHELL=bash -o pipefail
 PATH:=$(CURDIR)/bin:$(PATH)
 export PATH
 
-all: dat/index.txt
+url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
+murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
+
+all: download
 
 fix:
 	grep -rl '<html><body><h1>503' dat | xargs rm -fv --
@@ -23,6 +26,29 @@ dat/each-cdx/%.txt:
 	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
 dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
 	cat -- $(foreach c,$^,'$c') | sort > $@
+dat/index.mk: dat/index.txt
+	< '$<' sed 's,^,index+=web.archive.org/web/,;s, ,/,' > '$@'
+
+-include dat/index.mk
+
+dat/content-dir/%/index.wahtml:
+	@mkdir -p '$(@D)'
+	curl -sSf 'http://$(call murl2url,$*)' > '$@'
+dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
+	wayfore < '$<' > '$@'
+dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
+	< '$<' sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > '$@'
+dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
+	< '$<' grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > '$@'
+content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
+download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+
+dat/content-file/%:
+	@mkdir -p '$(@D)'
+	curl -sSf 'http://$(call murl2url,$*)' > '$@'
+content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
+download: $(content-file)
 
+.PHONY: all fix download
 .DELETE_ON_ERROR:
 .SECONDARY:
diff --git a/bin/wayfore b/bin/wayfore
new file mode 100755
index 0000000..b0bde8a
--- /dev/null
+++ b/bin/wayfore
@@ -0,0 +1,4 @@
+#!/usr/bin/sed -zrf
+# The opposite of 'wayback'
+s/(<[hH][eE][aA][dD]>).*<!-- End Wayback Rewrite JS Include -->/\1/
+s/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->//