From f1559ccab1f9de1f03659a9de6572a593abbbda5 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 20 Jun 2014 15:18:33 -0400 Subject: Optimize jh-checksource I didn't think it needed optimizing, but on my abslibre checkout, it was taking 48-point-something seconds. Now it takes 14-point-something seconds. --- jh-checksource.sh | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/jh-checksource.sh b/jh-checksource.sh index e3eff21..9de91c0 100644 --- a/jh-checksource.sh +++ b/jh-checksource.sh @@ -8,15 +8,19 @@ sep='' safe_types_regexp=('^(inode|text|image|video|audio)/') safe_types_string=('application/pdf' 'application/postscript' 'application/xml' 'application/ogg' 'message/rfc822') -safe_files_regexp=('/\.(git|hg|svn)/' '/[^/]*\.git/' '/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$') +safe_dirs_glob=(.{git,hg,svn} '*.git') +safe_files_regexp=('/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$') safe_files_string=() # don't care about files less than 3 bytes. min_size=3 -normalize_filename() { - local cwd="$(readlink -m -- "$PWD")" - readlink -m -- "$1"|sed "s|^$cwd/|./|" +cwd="$(readlink -m -- "$PWD")" + +normalize_filenames() { + while IFS='' read -r filename; do + readlink -m -- "$filename" + done | sed "s|^$cwd/|./|" } matches_string() { @@ -53,10 +57,11 @@ print-machine() { main() { local format=human # Parse arguments + local arg for arg in "$@"; do case "$arg" in -m) format=machine;; - *) safe_files_string+=("$(normalize_filename "$arg")");; + *) safe_files_string+=("$(normalize_filenames <<<"$arg")");; esac done @@ -65,23 +70,33 @@ main() { trap "$(printf 'rm -f -- %q' "$unsafe_files")" EXIT # Heavy lifting - find . -type f -printf '%s %h/%f\n' | # find all files + local filter_dirs=() + local glob + for glob in "${safe_dirs_glob[@]}"; do + filter_dirs+=(-type d -name "$glob" -prune -o) + done + find . 
"${filter_dirs[@]}" -type f -printf '%s %p\n' | # find all files while read -r size file; do # filter out files smaller than $min_size [[ $size -lt $min_size ]] || printf '%s\n' "$file" done | - xargs -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes + normalize_filenames | + xargs -r -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes sed -r "s@(.*)${sep}\s*(.*)@\2:\1@" | # reformat the output to be easier to parse while IFS=: read -r type file; do - file="$(normalize_filename "$file")" - - if \ - matches_string "$file" "${safe_files_string[@]}" || \ - matches_string "$type" "${safe_types_string[@]}" || \ - matches_regexp "$file" "${safe_files_regexp[@]}" || \ - matches_regexp "$type" "${safe_types_regexp[@]}" ; then - : # do nothing - else - printf "%s\n" "$file" + declare -A cached_types + if ! { matches_string "$file" "${safe_files_string[@]}" || \ + matches_regexp "$file" "${safe_files_regexp[@]}" ;}; then + if [[ -z ${cached_types[$type]} ]]; then + if matches_string "$type" "${safe_types_string[@]}" || \ + matches_regexp "$type" "${safe_types_regexp[@]}" ; then + cached_types[$type]=false + else + cached_types[$type]=true + fi + fi + if "${cached_types[$type]}"; then + printf "%s\n" "$file" + fi fi done > "$unsafe_files" -- cgit v1.2.3-54-g00ecf