From 57266bea1b5fd82c6cc199c538783306c668b661 Mon Sep 17 00:00:00 2001
From: Luke Shumaker
Date: Fri, 30 Jun 2017 19:21:29 -0400
Subject: initial commit

---
Reviewer notes (not part of the commit; text between the three-dash line and
the first "diff --git" line is ignored when the patch is applied):
  * Line breaks, and the tab after "+" on recipe and loop-body lines, were
    restored from a copy that had been collapsed onto two lines.  make
    requires the tab on the Makefile recipe lines; in the two bash scripts
    the tab is assumed -- TODO confirm against the repository.
  * Makefile: the urlkeys.txt recipe runs "cat $^", and "Makefile" is one of
    that rule's prerequisites, so the Makefile text is piped through the
    filter together with ftp.json and www.json.
  * cdx_json: each value is piped through an external "urlencode" command
    that this patch does not add.  It also calls "curl -s" without --fail;
    presumably an HTTP error page would then be written to the target with
    exit status 0, which .DELETE_ON_ERROR would not catch -- verify.

 .gitignore |  4 ++++
 Makefile   | 25 +++++++++++++++++++++++++
 cdx_json   |  7 +++++++
 urlkey2url |  8 ++++++++
 4 files changed, 44 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100755 cdx_json
 create mode 100755 urlkey2url

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..745d6d4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.json
+*/
+*.mk
+*.txt
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..14db651
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+all: each-cdx
+
+ftp.json: cdx_json
+	./cdx_json 'url=ftp.unicode.org/Public/*' 'collapse=urlkey' > $@
+www.json: cdx_json
+	./cdx_json 'url=www.unicode.org/Public/*' 'collapse=urlkey' > $@
+urlkeys.txt: ftp.json www.json Makefile
+	cat $^ | cut -d '"' -f2,10 | sed -n 's/"200$$//p' | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u | grep -i -e cvtutf -e convertutf > $@
+urlkeys.mk: urlkeys.txt
+	sed 's/^/urlkeys+=/' < $< > $@
+
+-include urlkeys.mk
+rp = )
+c = ,
+all_urlkeys = $(urlkeys) $(subst $(rp),$(c)ftp$(rp),$(urlkeys))
+
+each-cdx/%.json: cdx_json urlkey2url
+	mkdir -p '$(@D)'
+	./cdx_json "url=$$(./urlkey2url '$*')" 'collapse=digest' > '$@'
+
+each-cdx: $(addprefix each-cdx/,$(addsuffix .json,$(all_urlkeys)))
+.PHONY: each-cdx
+
+.DELETE_ON_ERROR:
+.SECONDARY:
diff --git a/cdx_json b/cdx_json
new file mode 100755
index 0000000..81284af
--- /dev/null
+++ b/cdx_json
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+url='http://web.archive.org/cdx/search/cdx?output=json'
+for arg in "$@"; do
+	url+="&${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)"
+done
+curl -s "$url"
diff --git a/urlkey2url b/urlkey2url
new file mode 100755
index 0000000..5d0ec3d
--- /dev/null
+++ b/urlkey2url
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+for arg in "$@"; do
+	keydomain="${arg%%)*}"
+	keypath="${arg#*)}"
+	domain="$(IFS=,; printf '%s\n' $keydomain|tac|xargs|tr ' ' '.')"
+	echo "$domain$keypath"
+done
-- 
cgit v1.2.3-54-g00ecf