10 files changed, 186 insertions, 42 deletions
diff --git a/.gitignore b/.gitignore
index a15fceb..434f67d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,7 @@
+# Copyright (c) 2017  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
 /dat/
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..ee7d6a5
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,14 @@
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+                    Version 2, December 2004
+
+ Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
+
+ Everyone is permitted to copy and distribute verbatim or modified
+ copies of this license document, and changing it is allowed as long
+ as the name is changed.
+
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. You just DO WHAT THE FUCK YOU WANT TO.
+
diff --git a/Makefile b/Makefile
index ad7e53e..6d12db3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,10 @@
+# Copyright (c) 2017, 2023  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
+
 SHELL=bash -o pipefail
 PATH:=$(CURDIR)/bin:$(PATH)
 export PATH
@@ -6,28 +13,53 @@ url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
 murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
 dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )
 
+# The build is split into stages at each point where Make has to
+# make decisions about the build tree based on the output of a
+# previous stage.  That is: these stages exist for a technical GNU
+# Make reason, not for human-comprehensibility reasons, so the stages
+# have lopsided sizes: the first two are very small, and almost
+# everything is in the third stage.
 all:
+	# Stage 1 ######################################################################
 	$(MAKE) dat/urlkeys.mk
+	# Stage 2 ######################################################################
 	$(MAKE) dat/index.mk
+	# Stage 3 ######################################################################
 	$(MAKE) dat/git
+.PHONY: all
 
-fix:
-	grep -rl '<html><body><h1>503' dat | xargs rm -fv --
-
-.PHONY: all fix
+COPYING: # Re-download the license text; -f makes curl fail on an HTTP error instead of saving the error page.
+	curl -fL http://www.wtfpl.net/txt/copying/ >$@
 
 # Stage 1 ######################################################################
+#
+# Fetch a listing of all relevant URLs.
+#
+#  - `dat/cdxindex.txt`
+#  - `dat/urlkeys.txt`
+#  - `dat/urlkeys.mk`
 
 dat:
 	mkdir -p $@
 dat/cdxindex.txt: | dat
-	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@
+	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'collapse=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' | grep -vFx 'org,unicode)/public/2.0-update/cvtutf7.c' > $@
 dat/urlkeys.txt: dat/cdxindex.txt
 	< $< cut -d '?' -f1 | sort -u > $@
 dat/urlkeys.mk: dat/urlkeys.txt
 	< $< sed 's/^/urlkeys+=/' > $@
 
 # Stage 2 ######################################################################
+#
+# Fetch the history for each relevant URL.
+#
+# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`)
+#
+# - `dat/index.txt`
+#   has a line for each relevant URL:
+#
+#       ${wayback_timestamp:YYYYmmddHHMMSS} ${url}
+#
+# - `dat/index.mk`
 ifneq ($(wildcard dat/urlkeys.mk),)
 include dat/urlkeys.mk
 
@@ -40,35 +72,70 @@ dat/index.mk: dat/index.txt
 	< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@
 
 # Stage 3 ######################################################################
+#
+# The main stage.
 ifneq ($(wildcard dat/index.mk),)
 -include dat/index.mk
 
-dat/content-dir/%/index.wahtml:
+# Part 1: Directory indexes:
+#
+# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html`
+#
+# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt`
+#
+# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt`
+#   has a line for each file mentioned in index.html (this format is
+#   controlled by `bin/fmt-metadata`):
+#
+#          ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM}
+dat/content-dir/%/index.html:
 	@mkdir -p '$(@D)'
-	curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@
-dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
-	< $< wayfore > $@
+	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
 dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
-	< $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,'  > $@
+	< $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,'  > $@
 dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
-	< $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
+	< $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@
 content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
 download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
 
+# Part 2: File contents:
+# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)`
 dat/content-file/%:
 	@mkdir -p '$(@D)'
-	curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@
+	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@
 content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
 download += $(content-file)
 
+# `download` is a convenience target to download files without
+# processing them.  It isn't depended on by anything.
 download: $(download)
 .PHONY: download
 
+# Part 3: Aggregate:
+# - `dat/metadata.txt`
+#   has a line for each file mentioned in any index.html:
+#
+#          ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM}
+#
+#   where the ${dirindex_wayback_timestamp} and ${branch_name} are
+#   determined from the path to the relevant index.html.
+#
+# - `dat/pools/`
+#   + pass 1 and pass 1.5
+#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/`
+#     * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file)
+#   + pass 2 and pass 3
+#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /files/ dir)
+#
 dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
-	grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
+	grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@
 dat/pools: $(download) dat/metadata.txt dat/index.txt
 	rm -rf -- $@ $@.bak
 	poolify dat/metadata.txt dat/index.txt || $(dirfail)
+
+# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit
+#
+# - `dat/git/`
 dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
 	rm -rf -- $@ $@.bak
 	gitify $@ || $(dirfail)
diff --git a/README.md b/README.md
index 6e0a06a..b4cbcd3 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ The license contains a use-restriction that makes it non-Free and
 incompatible with the GNU GPL.  Additionally, every revision of it
 supposedly has multiple bugs; though I have not studied it closely
 enough to verify that claim.  I am publishing its history because it
-is of historic interest, not because it should be used.
+is of historical interest, not because it should be used.
 
 -- 
 Happy hacking,
diff --git a/bin/cdxget b/bin/cdxget
index 46d56c4..1dbc8f0 100755
--- a/bin/cdxget
+++ b/bin/cdxget
@@ -1,7 +1,15 @@
 #!/usr/bin/env bash
+# Copyright (c) 2017, 2023  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
 
-url='http://web.archive.org/cdx/search/cdx?'
+url='http://web.archive.org/cdx/search/cdx'
+s='?'
 for arg in "$@"; do
-	url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&"
+	url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)"
+	s='&'
 done
-curl -sL "$url"
+curl -sfL "$url"
diff --git a/bin/fmt-metadata b/bin/fmt-metadata
index c92419b..ec82451 100755
--- a/bin/fmt-metadata
+++ b/bin/fmt-metadata
@@ -1,4 +1,11 @@
 #!/usr/bin/env ruby
+# Copyright (c) 2017  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
+
 require 'time'
 
 snapshot = ARGV.first.to_i
diff --git a/bin/gitify b/bin/gitify
index 2c73bcd..d655f22 100755
--- a/bin/gitify
+++ b/bin/gitify
@@ -1,4 +1,10 @@
 #!/usr/bin/env bash
+# Copyright (c) 2017, 2023  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
 
 empty() {
 	[[ $(stat -c %s "$1") -eq 0 ]]
@@ -74,20 +80,36 @@ main() {
 				waurl="http://web.archive.org/web/$(murl2url "${listingdir##*/content-dir/}/")"
 				msg="$waurl"
 			fi
-			gitdate="$(sed -r 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6/' <<<"$time")"
+
+			HACK_TZ=-0500
+			if [[ "$branch" == BETA/CVTUTF-1-4 || "$branch" == ALPHA/CVTUTF-1-5-draft ]]; then
+				HACK_TZ=-0400
+			fi
+			HACK_NAME='Luke Shumaker'
+			HACK_EMAIL='lukeshu@lukeshu.com'
+
+			gitdate="$(sed -E 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6 '"$HACK_TZ"'/' <<<"$time")"
+
 			git add .
+
 			export GIT_AUTHOR_DATE=$gitdate
+			export GIT_AUTHOR_NAME=$HACK_NAME
+			export GIT_AUTHOR_EMAIL=$HACK_EMAIL
+
 			export GIT_COMMITTER_DATE=$gitdate
+			export GIT_COMMITTER_NAME=$HACK_NAME
+			export GIT_COMMITTER_EMAIL=$HACK_EMAIL
+
 			git commit --allow-empty -m "$msg"
 			if [[ "$branch" != *.OLD ]]; then
 				lastbranch="$branch"
 			fi
 			if [[ "$branch" == PROGRAMS/CVTUTF ]] && git log -n1 --stat|grep -qF 'ExpectedOutput.txt'; then
-				git filter-branch -f --parent-filter 'cat; echo " -p BETA/CVTUTF-1-3"' HEAD^..HEAD
+				FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --parent-filter 'cat; echo " -p BETA/CVTUTF-1-3"' HEAD^..HEAD
 				git update-ref -d refs/original/refs/heads/"$branch"
 			fi
 			if [[ "$branch" == PROGRAMS/CVTUTF.OLD ]] && git log -n1 --stat|grep -qi '.*\.c\s'; then
-				git filter-branch -f --parent-filter 'cat; echo " -p PROGRAMS/CVTUTF^"' HEAD^..HEAD
+				FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --parent-filter 'cat; echo " -p PROGRAMS/CVTUTF^"' HEAD^..HEAD
 				git update-ref -d refs/original/refs/heads/"$branch"
 			fi
 		fi
diff --git a/bin/poolify b/bin/poolify
index 6f9a109..b49cbd9 100755
--- a/bin/poolify
+++ b/bin/poolify
@@ -1,4 +1,10 @@
 #!/usr/bin/env bash
+# Copyright (c) 2017, 2023  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
 
 url2murl() {
 	local x
@@ -13,14 +19,27 @@ main() {
 	set -euE -o pipefail
 	shopt -s nullglob
 
-	echo '# Pass 1'
-	declare -A rewrite
-	rewrite[200109261739]=200303310700
-	while read -r snap name date time size; do
+	arg_metadata_txt=$1
+	arg_index_txt=$2
+
+	# Overrides ############################################################
+
+	declare -A override_datetime
+	override_datetime[200109261739]=200303310700
+
+	override_synthetic_listings=(
+		#YYYYMMDDHHMM branch_name         newfiles
+		'200307291500 ALPHA/CVTUTF-1-1    ExpectedOutput.txt readme.txt'
+	)
+
+	# Main #################################################################
+
+	echo "# Pass 1 (initialize snapshots from $arg_metadata_txt)"  # double quotes so the metadata path is actually expanded
+	while read -r snap name date time; do
 		dirpart="${name%/*}"
 		filepart="${name##*/}"
 		datetime="${date//-/}${time//:/}"
-		datetime="${rewrite[$datetime]:-$datetime}"
+		datetime="${override_datetime[$datetime]:-$datetime}"
 		filedir=dat/pools/files/"${datetime}-${name//\//_}"
 		snapdir=dat/pools/snaps/"${snap}-${dirpart//\//_}"
 		if [[ -d "${filedir/.OLD/}" ]]; then
@@ -28,9 +47,9 @@ main() {
 		fi
 		mkdir -p -- "$filedir" "$snapdir"
 		ln -sr "$filedir/$filepart" "$snapdir"
-	done < "$1"
+	done < "$arg_metadata_txt"
 
-	echo '# Pass 1.5'
+	echo '# Pass 1.5 (initialize synthetic snapshots)'
 	# Looking at the data, there are 3 revisions that we DON'T
 	# have directory listings for.  So we need to synthesize
 	# those.
@@ -39,7 +58,8 @@ main() {
 	# synthesizing anything, then looking for files ending in
 	# ".1".  They are created during pass 2 if we have a file with
 	# no matching listing.
-	while read -r datetime dirpart newfiles; do
+	for line in "${override_synthetic_listings[@]}"; do
+		read -r datetime dirpart newfiles <<<"$line"
 		# We need to figure out which files to put in the
 		# directory listing.  We're going to do that by
 		# mimicking the previous listing with that dirpart.
@@ -65,20 +85,18 @@ main() {
 			rm -- "$snapdir/$filepart"
 			ln -sr "$filedir/$filepart" "$snapdir"
 		done
-	done < <(printf '%s\n' \
-			'200307291500 ALPHA/CVTUTF-1-1    ExpectedOutput.txt readme.txt' \
-		)
+	done
 
-	echo '# Pass 2'
+	echo '# Pass 2 (resolve files)'
 	while read -r time url; do
+		if [[ "$url" == */ ]]; then
+			# Skip directories
+			continue
+		fi
 		name="${url##*/Public/}"
 		dirpart="${name%/*}"
 		filepart="${name##*/}"
 
-		if [[ -z "$filepart" ]]; then
-			continue
-		fi
-
 		pools=(dat/pools/files/*-"${name//\//_}")
 		if [[ "$name" = *.OLD* ]]; then
 			pname="${name//\//_}"
@@ -115,9 +133,9 @@ main() {
 			fi
 			i+=1
 		done
-	done < "$2"
+	done < "$arg_index_txt"
 
-	echo '# Pass 3'
+	echo '# Pass 3 (resolve missing files)'
 	while read -r missing; do
 		if [[ -f "${missing/.OLD}/${missing##*_}" ]]; then
 			ln -sr "${missing/.OLD}/${missing##*_}" "$missing"
diff --git a/bin/urlkey2url b/bin/urlkey2url
index 5d0ec3d..813e66f 100755
--- a/bin/urlkey2url
+++ b/bin/urlkey2url
@@ -1,4 +1,10 @@
 #!/usr/bin/env bash
+# Copyright (c) 2017  Luke Shumaker <lukeshu@lukeshu.com>
+#
+# This work is free.  You can redistribute it and/or modify it under
+# the terms of the Do What The Fuck You Want To Public License,
+# Version 2, as published by Sam Hocevar.  See the COPYING file for
+# more details.
 
 for arg in "$@"; do
 	keydomain="${arg%%)*}"
diff --git a/bin/wayfore b/bin/wayfore
deleted file mode 100755
index b0bde8a..0000000
--- a/bin/wayfore
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/sed -zrf
-# The opposite of 'wayback'
-s/(<[hH][eE][aA][dD]>).*<!-- End Wayback Rewrite JS Include -->/\1/
-s/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->//