diff options
author | Luke Shumaker <lukeshu@lukeshu.com> | 2017-06-30 22:27:24 -0400 |
---|---|---|
committer | Luke Shumaker <lukeshu@lukeshu.com> | 2017-06-30 22:27:24 -0400 |
commit | 2631b600d153aeda1d4201164dafc023dfdceedb (patch) | |
tree | eb3458af979ed98230e5d5ce40709202df14f2ed /Makefile | |
parent | 99011e7fcebeccc26a3da591e3445a93ffadad3c (diff) |
download the actual data
Diffstat (limited to 'Makefile')
-rw-r--r-- | Makefile | 28 |
1 files changed, 27 insertions, 1 deletions
@@ -2,7 +2,10 @@ SHELL=bash -o pipefail
 PATH:=$(CURDIR)/bin:$(PATH)
 export PATH
 
-all: dat/index.txt
+url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
+murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
+
+all: download
 
 fix:
 	grep -rl '<html><body><h1>503' dat | xargs rm -fv --
@@ -23,6 +26,29 @@ dat/each-cdx/%.txt:
 	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
 dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
 	cat -- $(foreach c,$^,'$c') | sort > $@
 
+dat/index.mk: dat/index.txt
+	< $< sed 's,^,index+=web.archive.org/web/,;s, ,/,' > $@
+
+-include dat/index.mk
+
+dat/content-dir/%/index.wahtml:
+	@mkdir -p '$(@D)'
+	curl -s 'http://$(call murl2url,$*)' > $@
+dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
+	wayfore < $< > $@
+dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
+	< $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
+dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
+	< $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > $@
+content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
+download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+
+dat/content-file/%:
+	@mkdir -p '$(@D)'
+	curl -s 'http://$(call murl2url,$*)' > $@
+content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
+download: $(content-file)
+.PHONY: all fix download
 .DELETE_ON_ERROR:
 .SECONDARY: