summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@lukeshu.com> 2017-07-01 19:55:21 -0400
committer: Luke Shumaker <lukeshu@lukeshu.com> 2017-07-01 19:55:21 -0400
commit: 838a73363333bd2eda5aaf08e41701760406987a (patch)
tree: 59e8dfea595959d93108249a79f828aadce7e0af
parent: bc0107813be28ab9532f136047d23bbaa158ee8b (diff)
Almost there
Now it just needs to figure out that it needs to synthesize directory listings for dat/pools/files/*/*.1 files.
-rw-r--r--  Makefile          8
-rwxr-xr-x  bin/cdxget        2
-rwxr-xr-x  bin/fmt-metadata  5
-rwxr-xr-x  bin/poolify       9
4 files changed, 18 insertions, 6 deletions
diff --git a/Makefile b/Makefile
index 55041ff..0272edb 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )
all:
$(MAKE) dat/urlkeys.mk
$(MAKE) dat/index.mk
- $(MAKE) dat/pools
+ $(MAKE) dat/pools dat/git
fix:
grep -rl '<html><body><h1>503' dat | xargs rm -fv --
@@ -33,7 +33,7 @@ include dat/urlkeys.mk
dat/each-cdx/%.txt:
@mkdir -p '$(@D)'
- cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
+ cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@'
dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@
dat/index.mk: dat/index.txt
@@ -45,7 +45,7 @@ ifneq ($(wildcard dat/index.mk),)
dat/content-dir/%/index.wahtml:
@mkdir -p '$(@D)'
- curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@
+ curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@
dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
< $< wayfore > $@
dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
@@ -57,7 +57,7 @@ download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(co
dat/content-file/%:
@mkdir -p '$(@D)'
- curl -s 'http://web.archive.org/web/$(call murl2url,$*)' > $@
+ curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@
content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
download += $(content-file)
diff --git a/bin/cdxget b/bin/cdxget
index a54612d..46d56c4 100755
--- a/bin/cdxget
+++ b/bin/cdxget
@@ -4,4 +4,4 @@ url='http://web.archive.org/cdx/search/cdx?'
for arg in "$@"; do
url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&"
done
-curl -s "$url"
+curl -sL "$url"
diff --git a/bin/fmt-metadata b/bin/fmt-metadata
index 7867d63..c92419b 100755
--- a/bin/fmt-metadata
+++ b/bin/fmt-metadata
@@ -19,5 +19,8 @@ $stdin.each_line do |line|
datetime = Time.parse("#{datetime} +00:00").utc.strftime('%Y-%m-%d %H:%M')
end
- puts ("%-22s %s %3s%s" % [ name, datetime, size_numb, size_unit ])
+ # discard the size, I guess. The number of digits precision was
+ # inconsistent over the years.
+ #puts ("%-22s %s %3s%s" % [ name, datetime, size_numb, size_unit ])
+ puts ("%-22s %s" % [ name, datetime ])
end
diff --git a/bin/poolify b/bin/poolify
index af8bf40..e33821b 100755
--- a/bin/poolify
+++ b/bin/poolify
@@ -13,6 +13,7 @@ main() {
set -euE -o pipefail
shopt -s nullglob
+ echo '# Pass 1'
while read -r snap name date time size; do
dirpart="${name%/*}"
filepart="${name##*/}"
@@ -22,6 +23,7 @@ main() {
ln -sr "$filedir/$filepart" "$snapdir"
done < "$1"
+ echo '# Pass 2'
while read -r time url; do
name="${url##*/Public/}"
dirpart="${name%/*}"
@@ -63,6 +65,13 @@ main() {
i+=1
done
done < "$2"
+
+ echo '# Pass 3'
+ while read -r missing; do
+ if [[ -f "${missing/.OLD}/${missing##*_}" ]]; then
+ ln -sr "${missing/.OLD}/${missing##*_}" "$missing"
+ fi
+ done < <(find dat/pools/files/*-PROGRAMS_CVTUTF.OLD_* -type d -empty)
}
main "$@"