diff options
-rw-r--r-- | .gitignore | 6 | ||||
-rw-r--r-- | COPYING | 14 | ||||
-rw-r--r-- | Makefile | 93 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rwxr-xr-x | bin/cdxget | 14 | ||||
-rwxr-xr-x | bin/fmt-metadata | 7 | ||||
-rwxr-xr-x | bin/gitify | 28 | ||||
-rwxr-xr-x | bin/poolify | 54 | ||||
-rwxr-xr-x | bin/urlkey2url | 6 | ||||
-rwxr-xr-x | bin/wayfore | 4 |
10 files changed, 186 insertions, 42 deletions
@@ -1 +1,7 @@ +# Copyright (c) 2017 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. /dat/ @@ -0,0 +1,14 @@ + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+ Version 2, December 2004
+
+ Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
+
+ Everyone is permitted to copy and distribute verbatim or modified
+ copies of this license document, and changing it is allowed as long
+ as the name is changed.
+
+ DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. You just DO WHAT THE FUCK YOU WANT TO.
+
@@ -1,3 +1,10 @@ +# Copyright (c) 2017, 2023 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. + SHELL=bash -o pipefail PATH:=$(CURDIR)/bin:$(PATH) export PATH @@ -6,28 +13,53 @@ url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1))) murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1))) dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; ) +# This is split into stages for when Make has to make decisions about +# the build tree based on the output of a previous stage. That is: +# these stages exist for a technical GNU Make reason, not for +# human-comprehensibility reasons; so stages have lopsided sizes; the +# first two are very small, and almost everything is in the third +# stage. all: + # Stage 1 ###################################################################### $(MAKE) dat/urlkeys.mk + # Stage 2 ###################################################################### $(MAKE) dat/index.mk + # Stage 3 ###################################################################### $(MAKE) dat/git +.PHONY: all -fix: - grep -rl '<html><body><h1>503' dat | xargs rm -fv -- - -.PHONY: all fix +COPYING: + curl -L http://www.wtfpl.net/txt/copying/ >$@ # Stage 1 ###################################################################### +# +# Fetch a listing of all relevant URLs. +# +# - `dat/cdxindex.txt` +# - `dat/urlkeys.txt` +# - `dat/urlkeys.mk` dat: mkdir -p $@ dat/cdxindex.txt: | dat - cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@ + cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'collapse=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' | grep -vFx 'org,unicode)/public/2.0-update/cvtutf7.c' > $@ dat/urlkeys.txt: dat/cdxindex.txt < $< cut -d '?' -f1 | sort -u > $@ dat/urlkeys.mk: dat/urlkeys.txt < $< sed 's/^/urlkeys+=/' > $@ # Stage 2 ###################################################################### +# +# Fetch the history for each relevant URL. +# +# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`) +# +# - `dat/index.txt` +# has a line for each relevant URL: +# +# ${wayback_timestamp:YYYYmmddHHMMSS} ${url} +# +# - `dat/index.mk` ifneq ($(wildcard dat/urlkeys.mk),) include dat/urlkeys.mk @@ -40,35 +72,70 @@ dat/index.mk: dat/index.txt < $< sed -e 's,^,index+=,' -e 's, ,/,' > $@ # Stage 3 ###################################################################### +# +# The main stage. ifneq ($(wildcard dat/index.mk),) -include dat/index.mk -dat/content-dir/%/index.wahtml: +# Part 1: Directory indexes: +# +# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html` +# +# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt` +# +# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt` +# has a line for each file mentioned in index.html (this format is +# controlled by `bin/fmt-metadata`): +# +# ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM} +dat/content-dir/%/index.html: @mkdir -p '$(@D)' - curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ -dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml - < $< wayfore > $@ + curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@ dat/content-dir/%/readme.txt: dat/content-dir/%/index.html - < $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@ + < $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,' > $@ dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html - < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ + < $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u))) download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) +# Part 2: File contents: +# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)` dat/content-file/%: @mkdir -p '$(@D)' - curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ + curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u))) download += $(content-file) +# `download` is a convenience target to download files without +# processing them. It isn't depended on by anything. download: $(download) .PHONY: download +# Part 3: Aggregate: +# - `dat/metadata.txt` +# has a line for each file mentioned in any index.html: +# +# ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM} +# +# where the ${dirindex_wayback_timestamp} and ${branch_name} are +# determined from the path to the relevant index.html. +# +# - `dat/pools/` +# + pass 1 and pass 1.5 +# * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/` +# * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file) +# + pass 2 and pass 3: +# * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /file/ dir) +# dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt - grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@ + grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@ dat/pools: $(download) dat/metadata.txt dat/index.txt rm -rf -- $@ $@.bak poolify dat/metadata.txt dat/index.txt || $(dirfail) + +# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit +# +# - `dat/git/` dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) rm -rf -- $@ $@.bak gitify $@ || $(dirfail) @@ -13,7 +13,7 @@ The license contains a use-restriction that makes it non-Free and incompatible with the GNU GPL. Additionally, every revision of it supposedly has multiple bugs; though I have not studied it closely enough to verify that claim. I am publishing its history because it -is of historic interest, not because it should be used. +is of historical interest, not because it should be used. -- Happy hacking, @@ -1,7 +1,15 @@ #!/usr/bin/env bash +# Copyright (c) 2017, 2023 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. -url='http://web.archive.org/cdx/search/cdx?' +url='http://web.archive.org/cdx/search/cdx' +s='?' for arg in "$@"; do - url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&" + url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)" + s='&' done -curl -sL "$url" +curl -sfL "$url" diff --git a/bin/fmt-metadata b/bin/fmt-metadata index c92419b..ec82451 100755 --- a/bin/fmt-metadata +++ b/bin/fmt-metadata @@ -1,4 +1,11 @@ #!/usr/bin/env ruby +# Copyright (c) 2017 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. + require 'time' snapshot = ARGV.first.to_i @@ -1,4 +1,10 @@ #!/usr/bin/env bash +# Copyright (c) 2017, 2023 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. empty() { [[ $(stat -c %s "$1") -eq 0 ]] @@ -74,20 +80,36 @@ main() { waurl="http://web.archive.org/web/$(murl2url "${listingdir##*/content-dir/}/")" msg="$waurl" fi - gitdate="$(sed -r 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6/' <<<"$time")" + + HACK_TZ=-0500 + if [[ "$branch" == BETA/CVTUTF-1-4 || "$branch" == ALPHA/CVTUTF-1-5-draft ]]; then + HACK_TZ=-0400 + fi + HACK_NAME='Luke Shumaker' + HACK_EMAIL='lukeshu@lukeshu.com' + + gitdate="$(sed -E 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6 '"$HACK_TZ"'/' <<<"$time")" + git add . + export GIT_AUTHOR_DATE=$gitdate + export GIT_AUTHOR_NAME=$HACK_NAME + export GIT_AUTHOR_EMAIL=$HACK_EMAIL + export GIT_COMMITTER_DATE=$gitdate + export GIT_COMMITTER_NAME=$HACK_NAME + export GIT_COMMITTER_EMAIL=$HACK_EMAIL + git commit --allow-empty -m "$msg" if [[ "$branch" != *.OLD ]]; then lastbranch="$branch" fi if [[ "$branch" == PROGRAMS/CVTUTF ]] && git log -n1 --stat|grep -qF 'ExpectedOutput.txt'; then - git filter-branch -f --parent-filter 'cat; echo " -p BETA/CVTUTF-1-3"' HEAD^..HEAD + FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --parent-filter 'cat; echo " -p BETA/CVTUTF-1-3"' HEAD^..HEAD git update-ref -d refs/original/refs/heads/"$branch" fi if [[ "$branch" == PROGRAMS/CVTUTF.OLD ]] && git log -n1 --stat|grep -qi '.*\.c\s'; then - git filter-branch -f --parent-filter 'cat; echo " -p PROGRAMS/CVTUTF^"' HEAD^..HEAD + FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --parent-filter 'cat; echo " -p PROGRAMS/CVTUTF^"' HEAD^..HEAD git update-ref -d refs/original/refs/heads/"$branch" fi fi diff --git a/bin/poolify b/bin/poolify index 6f9a109..b49cbd9 100755 --- a/bin/poolify +++ b/bin/poolify @@ -1,4 +1,10 @@ #!/usr/bin/env bash +# Copyright (c) 2017, 2023 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. url2murl() { local x @@ -13,14 +19,27 @@ main() { set -euE -o pipefail shopt -s nullglob - echo '# Pass 1' - declare -A rewrite - rewrite[200109261739]=200303310700 - while read -r snap name date time size; do + arg_metadata_txt=$1 + arg_index_txt=$2 + + # Overrides ############################################################ + + declare -A override_datetime + override_datetime[200109261739]=200303310700 + + override_synthetic_listings=( + #YYYYMMDDHHMM branch_name newfiles + '200307291500 ALPHA/CVTUTF-1-1 ExpectedOutput.txt readme.txt' + ) + + # Main ################################################################# + + echo '# Pass 1 (initialize snapshots from $arg_metadata_txt)' + while read -r snap name date time; do dirpart="${name%/*}" filepart="${name##*/}" datetime="${date//-/}${time//:/}" - datetime="${rewrite[$datetime]:-$datetime}" + datetime="${override_datetime[$datetime]:-$datetime}" filedir=dat/pools/files/"${datetime}-${name//\//_}" snapdir=dat/pools/snaps/"${snap}-${dirpart//\//_}" if [[ -d "${filedir/.OLD/}" ]]; then @@ -28,9 +47,9 @@ main() { fi mkdir -p -- "$filedir" "$snapdir" ln -sr "$filedir/$filepart" "$snapdir" - done < "$1" + done < "$arg_metadata_txt" - echo '# Pass 1.5' + echo '# Pass 1.5 (initialize synthetic snapshots)' # Looking at the data, there are 3 revisions that we DON'T # have directory listings for. So we need to synthesize # those. @@ -39,7 +58,8 @@ main() { # synthesizing anything, then looking for files ending in # ".1". They are created during pass 2 if we have a file with # no matching listing. - while read -r datetime dirpart newfiles; do + for line in "${override_synthetic_listings[@]}"; do + read -r datetime dirpart newfiles <<<"$line" # We need to figure out which files to put in the # directory listing. We're going to do that by # mimicking the previous listing with that dirpart. @@ -65,20 +85,18 @@ main() { rm -- "$snapdir/$filepart" ln -sr "$filedir/$filepart" "$snapdir" done - done < <(printf '%s\n' \ - '200307291500 ALPHA/CVTUTF-1-1 ExpectedOutput.txt readme.txt' \ - ) + done - echo '# Pass 2' + echo '# Pass 2 (resolve files)' while read -r time url; do + if [[ "$url" == */ ]]; then + # Skip directories + continue + fi name="${url##*/Public/}" dirpart="${name%/*}" filepart="${name##*/}" - if [[ -z "$filepart" ]]; then - continue - fi - pools=(dat/pools/files/*-"${name//\//_}") if [[ "$name" = *.OLD* ]]; then pname="${name//\//_}" @@ -115,9 +133,9 @@ main() { fi i+=1 done - done < "$2" + done < "$arg_index_txt" - echo '# Pass 3' + echo '# Pass 3 (resolve missing files)' while read -r missing; do if [[ -f "${missing/.OLD}/${missing##*_}" ]]; then ln -sr "${missing/.OLD}/${missing##*_}" "$missing" diff --git a/bin/urlkey2url b/bin/urlkey2url index 5d0ec3d..813e66f 100755 --- a/bin/urlkey2url +++ b/bin/urlkey2url @@ -1,4 +1,10 @@ #!/usr/bin/env bash +# Copyright (c) 2017 Luke Shumaker <lukeshu@lukeshu.com> +# +# This work is free. You can redistribute it and/or modify it under +# the terms of the Do What The Fuck You Want To Public License, +# Version 2, as published by Sam Hocevar. See the COPYING file for +# more details. for arg in "$@"; do keydomain="${arg%%)*}" diff --git a/bin/wayfore b/bin/wayfore deleted file mode 100755 index b0bde8a..0000000 --- a/bin/wayfore +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/sed -zrf -# The opposite of 'wayback' -s/(<[hH][eE][aA][dD]>).*<!-- End Wayback Rewrite JS Include -->/\1/ -s/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->// |