summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-30 20:05:23 -0400
committer: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-30 20:05:23 -0400
commit: e46a74fe8a143936eee2b9be1fd6b5f963357d9d (patch)
tree: 457e959fc8bcdf1222092a29435e468c824bb697
parent: 57266bea1b5fd82c6cc199c538783306c668b661 (diff)
work on it
-rw-r--r--  .gitignore  5
-rw-r--r--  Makefile  29
-rw-r--r--  bin/cdxcat  0
-rw-r--r--  bin/cdxcut  0
-rwxr-xr-x  bin/cdxget  7
-rwxr-xr-x  bin/urlkey2url (renamed from urlkey2url)  0
-rwxr-xr-x  cdx_json  7
7 files changed, 25 insertions, 23 deletions
diff --git a/.gitignore b/.gitignore
index 745d6d4..a15fceb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1 @@
-*.json
-*/
-*.mk
-*.txt
+/dat/
diff --git a/Makefile b/Makefile
index 14db651..eb8ae8e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,24 +1,29 @@
+PATH:=$(CURDIR)/bin:$(PATH)
+export PATH
+
all: each-cdx
-ftp.json: cdx_json
- ./cdx_json 'url=ftp.unicode.org/Public/*' 'collapse=urlkey' > $@
-www.json: cdx_json
- ./cdx_json 'url=www.unicode.org/Public/*' 'collapse=urlkey' > $@
-urlkeys.txt: ftp.json www.json Makefile
- cat $^ | cut -d '"' -f2,10 | sed -n 's/"200$$//p' | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u | grep -i -e cvtutf -e convertutf > $@
-urlkeys.mk: urlkeys.txt
- sed 's/^/urlkeys+=/' < $< > $@
+dat:
+ mkdir -p $@
+dat/ftp.txt: | dat
+ cdxget 'url=ftp.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@
+dat/www.txt: | dat
+ cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@
+dat/urlkeys.txt: dat/ftp.txt dat/www.txt
+ cat $^ | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u > $@
+dat/urlkeys.mk: dat/urlkeys.txt
+ cat $^ | sed 's/^/urlkeys+=/' < $< > $@
--include urlkeys.mk
+-include dat/urlkeys.mk
rp = )
c = ,
all_urlkeys = $(urlkeys) $(subst $(rp),$(c)ftp$(rp),$(urlkeys))
-each-cdx/%.json: cdx_json urlkey2url
+dat/each-cdx/%.txt:
mkdir -p '$(@D)'
- ./cdx_json "url=$$(./urlkey2url '$*')" 'collapse=digest' > '$@'
+ cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
-each-cdx: $(addprefix each-cdx/,$(addsuffix .json,$(all_urlkeys)))
+each-cdx: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(all_urlkeys)))
.PHONY: each-cdx
.DELETE_ON_ERROR:
diff --git a/bin/cdxcat b/bin/cdxcat
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/bin/cdxcat
diff --git a/bin/cdxcut b/bin/cdxcut
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/bin/cdxcut
diff --git a/bin/cdxget b/bin/cdxget
new file mode 100755
index 0000000..a54612d
--- /dev/null
+++ b/bin/cdxget
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+url='http://web.archive.org/cdx/search/cdx?'
+for arg in "$@"; do
+ url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&"
+done
+curl -s "$url"
diff --git a/urlkey2url b/bin/urlkey2url
index 5d0ec3d..5d0ec3d 100755
--- a/urlkey2url
+++ b/bin/urlkey2url
diff --git a/cdx_json b/cdx_json
deleted file mode 100755
index 81284af..0000000
--- a/cdx_json
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-
-url='http://web.archive.org/cdx/search/cdx?output=json'
-for arg in "$@"; do
- url+="&${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)"
-done
-curl -s "$url"