summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <LukeShu@sbcglobal.net>2014-06-20 15:18:33 -0400
committerLuke Shumaker <LukeShu@sbcglobal.net>2014-06-20 15:18:33 -0400
commitf1559ccab1f9de1f03659a9de6572a593abbbda5 (patch)
treeb537836edefc0d40feaa9563778912ad30a24a64
parent50aedb15997bae463b6676c10c05ab1b0384563e (diff)
Optimize jh-checksource
I didn't think it needed optimizing, but on my abslibre checkout, it was taking 48-point-something seconds. Now it takes 14-point-something seconds.
-rw-r--r--jh-checksource.sh49
1 files changed, 32 insertions, 17 deletions
diff --git a/jh-checksource.sh b/jh-checksource.sh
index e3eff21..9de91c0 100644
--- a/jh-checksource.sh
+++ b/jh-checksource.sh
@@ -8,15 +8,19 @@ sep='<no-filename-ever-contains-this>'
safe_types_regexp=('^(inode|text|image|video|audio)/')
safe_types_string=('application/pdf' 'application/postscript' 'application/xml' 'application/ogg' 'message/rfc822')
-safe_files_regexp=('/\.(git|hg|svn)/' '/[^/]*\.git/' '/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$')
+safe_dirs_glob=(.{git,hg,svn} '*.git')
+safe_files_regexp=('/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$')
safe_files_string=()
# don't care about files less than 3 bytes.
min_size=3
-normalize_filename() {
- local cwd="$(readlink -m -- "$PWD")"
- readlink -m -- "$1"|sed "s|^$cwd/|./|"
+cwd="$(readlink -m -- "$PWD")"
+
+normalize_filenames() {
+ while IFS='' read -r filename; do
+ readlink -m -- "$filename"
+ done | sed "s|^$cwd/|./|"
}
matches_string() {
@@ -53,10 +57,11 @@ print-machine() {
main() {
local format=human
# Parse arguments
+ local arg
for arg in "$@"; do
case "$arg" in
-m) format=machine;;
- *) safe_files_string+=("$(normalize_filename "$arg")");;
+ *) safe_files_string+=("$(normalize_filenames <<<"$arg")");;
esac
done
@@ -65,23 +70,33 @@ main() {
trap "$(printf 'rm -f -- %q' "$unsafe_files")" EXIT
# Heavy lifting
- find . -type f -printf '%s %h/%f\n' | # find all files
+ local filter_dirs=()
+ local glob
+ for glob in "${safe_dirs_glob[@]}"; do
+ filter_dirs+=(-type d -name "$glob" -prune -o)
+ done
+ find . "${filter_dirs[@]}" -type f -printf '%s %p\n' | # find all files
while read -r size file; do # filter out files smaller than $min_size
[[ $size -lt $min_size ]] || printf '%s\n' "$file"
done |
- xargs -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes
+ normalize_filenames |
+ xargs -r -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes
sed -r "s@(.*)${sep}\s*(.*)@\2:\1@" | # reformat the output to be easier to parse
while IFS=: read -r type file; do
- file="$(normalize_filename "$file")"
-
- if \
- matches_string "$file" "${safe_files_string[@]}" || \
- matches_string "$type" "${safe_types_string[@]}" || \
- matches_regexp "$file" "${safe_files_regexp[@]}" || \
- matches_regexp "$type" "${safe_types_regexp[@]}" ; then
- : # do nothing
- else
- printf "%s\n" "$file"
+ declare -A cached_types
+ if ! { matches_string "$file" "${safe_files_string[@]}" || \
+ matches_regexp "$file" "${safe_files_regexp[@]}" ;}; then
+ if [[ -z ${cached_types[$type]} ]]; then
+ if matches_string "$type" "${safe_types_string[@]}" || \
+ matches_regexp "$type" "${safe_types_regexp[@]}" ; then
+ cached_types[$type]=false
+ else
+ cached_types[$type]=true
+ fi
+ fi
+ if "${cached_types[$type]}"; then
+ printf "%s\n" "$file"
+ fi
fi
done > "$unsafe_files"