From b15e310abe81a952624c3e96dd117699de7359e1 Mon Sep 17 00:00:00 2001 From: "Luke T. Shumaker" Date: Thu, 12 Oct 2023 18:33:50 -0600 Subject: Always pass -f (--fail) to curl --- Makefile | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'Makefile') diff --git a/Makefile b/Makefile index ad7e53e..07bfa56 100644 --- a/Makefile +++ b/Makefile @@ -10,11 +10,7 @@ all: $(MAKE) dat/urlkeys.mk $(MAKE) dat/index.mk $(MAKE) dat/git - -fix: - grep -rl '

503' dat | xargs rm -fv -- - -.PHONY: all fix +.PHONY: all # Stage 1 ###################################################################### @@ -45,7 +41,7 @@ ifneq ($(wildcard dat/index.mk),) dat/content-dir/%/index.wahtml: @mkdir -p '$(@D)' - curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ + curl -sfL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml < $< wayfore > $@ dat/content-dir/%/readme.txt: dat/content-dir/%/index.html @@ -57,7 +53,7 @@ download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(co dat/content-file/%: @mkdir -p '$(@D)' - curl -sL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ + curl -sfL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u))) download += $(content-file) -- cgit v1.2.3-54-g00ecf From 8722fff2231107011d4b87104b7a024e48a6e0a3 Mon Sep 17 00:00:00 2001 From: "Luke T. Shumaker" Date: Thu, 12 Oct 2023 18:34:46 -0600 Subject: Ditch bin/wayfore --- Makefile | 12 +++++------- bin/wayfore | 4 ---- 2 files changed, 5 insertions(+), 11 deletions(-) delete mode 100755 bin/wayfore (limited to 'Makefile') diff --git a/Makefile b/Makefile index 07bfa56..d72b0c3 100644 --- a/Makefile +++ b/Makefile @@ -39,21 +39,19 @@ dat/index.mk: dat/index.txt ifneq ($(wildcard dat/index.mk),) -include dat/index.mk -dat/content-dir/%/index.wahtml: +dat/content-dir/%/index.html: @mkdir -p '$(@D)' - curl -sfL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ -dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml - < $< wayfore > $@ + curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@ dat/content-dir/%/readme.txt: dat/content-dir/%/index.html - < $< sed -n '/^

<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>

.*,,' > $@ + < $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,.*,,' > $@ dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html - < $< grep '^]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ + < $< grep -i '^]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@ content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u))) download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) dat/content-file/%: @mkdir -p '$(@D)' - curl -sfL 'http://web.archive.org/web/$(call murl2url,$*)' > $@ + curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u))) download += $(content-file) diff --git a/bin/wayfore b/bin/wayfore deleted file mode 100755 index b0bde8a..0000000 --- a/bin/wayfore +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/sed -zrf -# The opposite of 'wayback' -s/(<[hH][eE][aA][dD]>).*/\1/ -s/.*// -- cgit v1.2.3-54-g00ecf From 84d09b97bf85096e98b8f6f7e95008788ab15f5f Mon Sep 17 00:00:00 2001 From: "Luke T. 
Shumaker" Date: Sat, 14 Oct 2023 17:41:48 -0600 Subject: sed -r is deprecated in favor of -E --- Makefile | 2 +- bin/gitify | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Makefile') diff --git a/Makefile b/Makefile index d72b0c3..ed1dbad 100644 --- a/Makefile +++ b/Makefile @@ -59,7 +59,7 @@ download: $(download) .PHONY: download dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt - grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -r -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@ + grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@ dat/pools: $(download) dat/metadata.txt dat/index.txt rm -rf -- $@ $@.bak poolify dat/metadata.txt dat/index.txt || $(dirfail) diff --git a/bin/gitify b/bin/gitify index 1e5d43d..7282dc3 100755 --- a/bin/gitify +++ b/bin/gitify @@ -82,7 +82,7 @@ main() { HACK_NAME='Luke Shumaker' HACK_EMAIL='lukeshu@lukeshu.com' - gitdate="$(sed -r 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6 '"$HACK_TZ"'/' <<<"$time")" + gitdate="$(sed -E 's/(....)(..)(..)(..)(..)(..)/\1-\2-\3T\4:\5:\6 '"$HACK_TZ"'/' <<<"$time")" git add . -- cgit v1.2.3-54-g00ecf From e35a01b00eb39366b6c8b1294c6a766838313f38 Mon Sep 17 00:00:00 2001 From: "Luke T. 
Shumaker" Date: Sat, 14 Oct 2023 18:41:23 -0600 Subject: Makefile: Add comments --- Makefile | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'Makefile') diff --git a/Makefile b/Makefile index ed1dbad..63fc135 100644 --- a/Makefile +++ b/Makefile @@ -6,13 +6,28 @@ url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1))) murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1))) dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; ) +# This is split into stages for when Make has to make decisions about +# the build tree based on the output of a previous stage. That is: +# these stages exist for a technical GNU Make reason, not for +# human-comprehensibility reasons; so stages have lopsided sizes; the +# first two are very small, and almost everything is in the third +# stage. all: + # Stage 1 ###################################################################### $(MAKE) dat/urlkeys.mk + # Stage 2 ###################################################################### $(MAKE) dat/index.mk + # Stage 3 ###################################################################### $(MAKE) dat/git .PHONY: all # Stage 1 ###################################################################### +# +# Fetch a listing of all relevant URLs. +# +# - `dat/cdxindex.txt` +# - `dat/urlkeys.txt` +# - `dat/urlkeys.mk` dat: mkdir -p $@ @@ -24,6 +39,17 @@ dat/urlkeys.mk: dat/urlkeys.txt < $< sed 's/^/urlkeys+=/' > $@ # Stage 2 ###################################################################### +# +# Fetch the history for each relevant URL. 
+# +# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`) +# +# - `dat/index.txt` +# has a line for each relevant URL: +# +# ${wayback_timestamp:YYYYmmddHHMMSS} ${url} +# +# - `dat/index.mk` ifneq ($(wildcard dat/urlkeys.mk),) include dat/urlkeys.mk @@ -36,9 +62,22 @@ dat/index.mk: dat/index.txt < $< sed -e 's,^,index+=,' -e 's, ,/,' > $@ # Stage 3 ###################################################################### +# +# The main stage. ifneq ($(wildcard dat/index.mk),) -include dat/index.mk +# Part 1: Directory indexes: +# +# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html` +# +# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt` +# +# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt` +# has a line for each file mentioned in index.html (this format is +# controlled by `bin/fmt-metadata`): +# +# ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM} dat/content-dir/%/index.html: @mkdir -p '$(@D)' curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@ @@ -49,20 +88,44 @@ dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u))) download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) +# Part 2: File contents: +# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)` dat/content-file/%: @mkdir -p '$(@D)' curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@ content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u))) download += $(content-file) +# `download` is a convenience target to download files without +# processing them. It isn't depended on by anything. 
download: $(download) .PHONY: download +# Part 3: Aggregate: +# - `dat/metadata.txt` +# has a line for each file mentioned in any index.html: +# +# ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM} +# +# where the ${dirindex_wayback_timestamp} and ${branch_name} are +# determined from the path to the relevant index.html. +# +# - `dat/pools/` +# + pass 1 and pass 1.5 +# * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/` +# * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file) +# + pass 2 and pass 3: +# * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /file/ dir) +# dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@ dat/pools: $(download) dat/metadata.txt dat/index.txt rm -rf -- $@ $@.bak poolify dat/metadata.txt dat/index.txt || $(dirfail) + +# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit +# +# - `dat/git/` dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir)) rm -rf -- $@ $@.bak gitify $@ || $(dirfail) -- cgit v1.2.3-54-g00ecf