From 838a73363333bd2eda5aaf08e41701760406987a Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 1 Jul 2017 19:55:21 -0400 Subject: Almost there Now it just needs to figure out that it needs to synthesize directory listings for dat/pools/files/*/*.1 files. --- Makefile | 8 ++++---- bin/cdxget | 2 +- bin/fmt-metadata | 5 ++++- bin/poolify | 9 +++++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 55041ff..0272edb 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; ) all: $(MAKE) dat/urlkeys.mk $(MAKE) dat/index.mk - $(MAKE) dat/pools + $(MAKE) dat/pools dat/git fix: grep -rl '

503' dat | xargs rm -fv -- @@ -33,7 +33,7 @@ include dat/urlkeys.mk dat/each-cdx/%.txt: @mkdir -p '$(@D)' - cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@' + cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@' dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@ dat/index.mk: dat/index.txt @@ -45,7 +45,7 @@ ifneq ($(wildcard dat/index.mk),) dat/content-dir/%/index.wahtml: @mkdir -p '$(@D)' - curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@ + curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml < $< wayfore > $@ dat/content-dir/%/readme.txt: dat/content-dir/%/index.html @@ -57,7 +57,7 @@ download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(co dat/content-file/%: @mkdir -p '$(@D)' - curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@ + curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u))) download += $(content-file) diff --git a/bin/cdxget b/bin/cdxget index a54612d..46d56c4 100755 --- a/bin/cdxget +++ b/bin/cdxget @@ -4,4 +4,4 @@ url='http://web.archive.org/cdx/search/cdx?' for arg in "$@"; do url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&" done -curl -s "$url" +curl -sL "$url" diff --git a/bin/fmt-metadata b/bin/fmt-metadata index 7867d63..c92419b 100755 --- a/bin/fmt-metadata +++ b/bin/fmt-metadata @@ -19,5 +19,8 @@ $stdin.each_line do |line| datetime = Time.parse("#{datetime} +00:00").utc.strftime('%Y-%m-%d %H:%M') end - puts ("%-22s %s %3s%s" % [ name, datetime, size_numb, size_unit ]) + # discard the size, I guess. The number of digits of precision was + # inconsistent over the years. 
+ #puts ("%-22s %s %3s%s" % [ name, datetime, size_numb, size_unit ]) + puts ("%-22s %s" % [ name, datetime ]) end diff --git a/bin/poolify b/bin/poolify index af8bf40..e33821b 100755 --- a/bin/poolify +++ b/bin/poolify @@ -13,6 +13,7 @@ main() { set -euE -o pipefail shopt -s nullglob + echo '# Pass 1' while read -r snap name date time size; do dirpart="${name%/*}" filepart="${name##*/}" @@ -22,6 +23,7 @@ main() { ln -sr "$filedir/$filepart" "$snapdir" done < "$1" + echo '# Pass 2' while read -r time url; do name="${url##*/Public/}" dirpart="${name%/*}" @@ -63,6 +65,13 @@ main() { i+=1 done done < "$2" + + echo '# Pass 3' + while read -r missing; do + if [[ -f "${missing/.OLD}/${missing##*_}" ]]; then + ln -sr "${missing/.OLD}/${missing##*_}" "$missing" + fi + done < <(find dat/pools/files/*-PROGRAMS_CVTUTF.OLD_* -type d -empty) } main "$@" -- cgit v1.2.3-54-g00ecf