diff options
author | Luke Shumaker <LukeShu@sbcglobal.net> | 2014-06-20 15:18:33 -0400 |
---|---|---|
committer | Luke Shumaker <LukeShu@sbcglobal.net> | 2014-06-20 15:18:33 -0400 |
commit | f1559ccab1f9de1f03659a9de6572a593abbbda5 (patch) | |
tree | b537836edefc0d40feaa9563778912ad30a24a64 | |
parent | 50aedb15997bae463b6676c10c05ab1b0384563e (diff) |
Optimize jh-checksource
I didn't think it needed optimizing, but on my abslibre checkout, it was
taking 48-point-something seconds. Now it takes 14-point-something
seconds.
-rw-r--r-- | jh-checksource.sh | 49 |
1 files changed, 32 insertions, 17 deletions
diff --git a/jh-checksource.sh b/jh-checksource.sh index e3eff21..9de91c0 100644 --- a/jh-checksource.sh +++ b/jh-checksource.sh @@ -8,15 +8,19 @@ sep='<no-filename-ever-contains-this>' safe_types_regexp=('^(inode|text|image|video|audio)/') safe_types_string=('application/pdf' 'application/postscript' 'application/xml' 'application/ogg' 'message/rfc822') -safe_files_regexp=('/\.(git|hg|svn)/' '/[^/]*\.git/' '/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$') +safe_dirs_glob=(.{git,hg,svn} '*.git') +safe_files_regexp=('/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$') safe_files_string=() # don't care about files less than 3 bytes. min_size=3 -normalize_filename() { - local cwd="$(readlink -m -- "$PWD")" - readlink -m -- "$1"|sed "s|^$cwd/|./|" +cwd="$(readlink -m -- "$PWD")" + +normalize_filenames() { + while IFS='' read -r filename; do + readlink -m -- "$filename" + done | sed "s|^$cwd/|./|" } matches_string() { @@ -53,10 +57,11 @@ print-machine() { main() { local format=human # Parse arguments + local arg for arg in "$@"; do case "$arg" in -m) format=machine;; - *) safe_files_string+=("$(normalize_filename "$arg")");; + *) safe_files_string+=("$(normalize_filenames <<<"$arg")");; esac done @@ -65,23 +70,33 @@ main() { trap "$(printf 'rm -f -- %q' "$unsafe_files")" EXIT # Heavy lifting - find . -type f -printf '%s %h/%f\n' | # find all files + local filter_dirs=() + local glob + for glob in "${safe_dirs_glob[@]}"; do + filter_dirs+=(-type d -name "$glob" -prune -o) + done + find . 
"${filter_dirs[@]}" -type f -printf '%s %p\n' | # find all files while read -r size file; do # filter out files smaller than $min_size [[ $size -lt $min_size ]] || printf '%s\n' "$file" done | - xargs -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes + normalize_filenames | + xargs -r -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes sed -r "s@(.*)${sep}\s*(.*)@\2:\1@" | # reformat the output to be easier to parse while IFS=: read -r type file; do - file="$(normalize_filename "$file")" - - if \ - matches_string "$file" "${safe_files_string[@]}" || \ - matches_string "$type" "${safe_types_string[@]}" || \ - matches_regexp "$file" "${safe_files_regexp[@]}" || \ - matches_regexp "$type" "${safe_types_regexp[@]}" ; then - : # do nothing - else - printf "%s\n" "$file" + declare -A cached_types + if ! { matches_string "$file" "${safe_files_string[@]}" || \ + matches_regexp "$file" "${safe_files_regexp[@]}" ;}; then + if [[ -z ${cached_types[$type]} ]]; then + if matches_string "$type" "${safe_types_string[@]}" || \ + matches_regexp "$type" "${safe_types_regexp[@]}" ; then + cached_types[$type]=false + else + cached_types[$type]=true + fi + fi + if "${cached_types[$type]}"; then + printf "%s\n" "$file" + fi fi done > "$unsafe_files" |