From 2631b600d153aeda1d4201164dafc023dfdceedb Mon Sep 17 00:00:00 2001
From: Luke Shumaker
Date: Fri, 30 Jun 2017 22:27:24 -0400
Subject: download the actual data

---
 Makefile | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'Makefile')

diff --git a/Makefile b/Makefile
index 262d3af..5cb9fdc 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,10 @@
 SHELL=bash -o pipefail
 PATH:=$(CURDIR)/bin:$(PATH)
 export PATH
-all: dat/index.txt
+url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
+murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
+
+all: download
 
 fix:
 	grep -rl '<h1>503' dat | xargs rm -fv --
@@ -23,6 +26,29 @@ dat/each-cdx/%.txt:
 	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
 dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
 	cat -- $(foreach c,$^,'$c') | sort > $@
+dat/index.mk: dat/index.txt
+	< $< sed 's,^,index+=web.archive.org/web/,;s, ,/,' > $@
+
+-include dat/index.mk
+
+dat/content-dir/%/index.wahtml:
+	@mkdir -p '$(@D)'
+	curl -s 'http://$(call murl2url,$*)' > $@
+dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
+	wayfore < $< > $@
+dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
+	< $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
+dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
+	< $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > $@
+content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
+download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+
+dat/content-file/%:
+	@mkdir -p '$(@D)'
+	curl -s 'http://$(call murl2url,$*)' > $@
+content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
+download: $(content-file)
+.PHONY: all fix download
 
 .DELETE_ON_ERROR:
 .SECONDARY:
-- 
cgit v1.2.3