#! /bin/bash
# Last edited on 2024-04-24 05:33:28 by stolfi
#
# Lists every ordinary file under the given directories (default: the
# current directory), writing one line per file to stdout:
#
#   {CHECKSUM} {SIZE} {PATHNAME}
#
# where {CHECKSUM} is the cksum(1) CRC as a zero-padded 10-digit decimal
# and {SIZE} is the size in bytes, right-aligned in 14 columns.
#
# Usage:
#   PROG [-exclude {PATTERN} | -exclude-path {PATTERN} ].. {DIR}..
#
# Diagnostics (option echo, directory list, rejected file names) go to
# stderr.  Exits with status 1 on a bad option, on failure to create the
# temporary directory, or when the final formatting step detects a
# malformed cksum line; exits 0 otherwise.
#
# Requires: find with "-printf" and "-wholename", sed, xargs with "-r",
# cksum, gawk, mktemp, and the helper program "list_bad_filenames.gawk"
# somewhere in the PATH.

PROG_NAME=${0##*/}
PROG_DESC="find all files under given directories; print cksum, size, name"
PROG_HELP=(
  "${PROG_NAME} [-exclude {PATTERN} | -exclude-path {PATTERN} ].. {DIR}.."
)
# Manual-page style text.  NOTE(review): it is not referenced anywhere else
# in the visible script; kept so that tools or future options can print it.
PROG_INFO=(
  "\nNAME"
  "\n ${PROG_NAME} - ${PROG_DESC}."
  "\n"
  "\nSYNOPSIS"
  "\n ${PROG_HELP[@]}"
  "\n"
  "\nDESCRIPTION"
  "\n Writes to stdout a list of all ordinary files"
  "\n in the specified directories (which usually should be disjoint)."
  "\n"
  "\n For each file, prints: the checksum (as a zero-padded"
  "\n 10-digit decimal integer), the size in bytes, and the file's pathname."
  "\n"
  "\n Rejects any directory or file names contain blanks, line"
  "\n breaks, backslashes, single or double quotes. Assumes"
  "\n that the full paths found do not contain any double slashes \"//\"."
  "\n"
  "\n Does not list symbolic links, pipes, etc. Excludes a few"
  "\n trash directories that are known to contain many"
  "\n invalid file names, such as {.cache}, {.config/chromium}, etc."
  "\n"
  "\nOPTIONS"
  "\n -exclude {NAME}"
  "\n -exclude-path {PATTERN}"
  "\n Each occurrence of these options specifies the name"
  "\n (without slashes) of a file to be excluded from the listing. See the \"-name\" and"
  "\n \"-wholename\" options of \"find(1)\", respectively, for the syntax of {PATTERN}."
  "\n If it is a directory, also excludes all sub-directories and files therein."
  "\n"
  "\nSEE ALSO"
  "\n find(1), find-all-files-size-date"
  "\nAUTHOR"
  "\n Created 2017-05-04 by Jorge Stolfi, Unicamp"
)

# ----------------------------------------------------------------------
# COMMAND LINE PARSING

# Parse command line switches.  Each "-exclude"/"-exclude-path" option is
# translated into a "{TEST} {PATTERN} -prune -o" clause for find(1).
# An argument counts as an option only when it *begins* with "-", so that
# directory arguments like "foo/-bar" are not mistaken for options.
exclop=( )
while [[ $# -ge 1 && "$1" == -* ]]; do
  if [[ $# -ge 2 && "$1" == "-exclude" ]]; then
    exclop+=( -name "$2" -prune -o )
    shift 2
  elif [[ $# -ge 2 && "$1" == "-exclude-path" ]]; then
    exclop+=( -wholename "$2" -prune -o )
    shift 2
  else
    # Also reached when a known option is missing its argument.
    echo "unknown option $1" 1>&2
    echo -e "usage:\n ${PROG_HELP[@]}" 1>&2
    exit 1
  fi
done
dirs=( "$@" )

# END COMMAND LINE PARSING
# ----------------------------------------------------------------------

echo "exclop = [" "${exclop[@]}" "]" 1>&2

# Regularize directory names to start with "./" (unless absolute) and end
# with "/"; assume that "//" is same as "/".  Each argument is handled as
# one line, so it is neither word-split nor glob-expanded by the shell.
if [[ ${#dirs[@]} -eq 0 ]]; then
  # Default is current directory:
  dirs=( ./ )
else
  regdirs=( )
  while IFS= read -r dir; do
    regdirs+=( "${dir}" )
  done < <(
    printf '%s\n' "${dirs[@]}" \
      | sed \
          -e '/^$/d' \
          -e 's:^\([^/]\):./\1:' \
          -e 's:^[.]/[.]/:./:' \
          -e 's:[/]*$:/:' \
          -e 's:[/][/]*:/:g'
  )
  dirs=( "${regdirs[@]}" )
fi
echo "dirs = [" "${dirs[@]}" "]" 1>&2

# Private scratch directory, removed on every exit path.
tmp="$(mktemp -d)" || { echo "** cannot create temporary directory" 1>&2; exit 1; }
trap 'rm -rf -- "${tmp}"' EXIT

tfile="${tmp}/files.dir" # Output of {find}.
efile="${tmp}/files.err" # Errors.
gfile="${tmp}/files.gud" # Good files.

# Find files. Write only wholename (between "//..//") to ${tfile}.
# Also strip "/./" components; the substitution is repeated because
# overlapping occurrences ("/././") are not all caught by a single pass.
# echo "finding files ..." 1>&2
export TZ=UTC
find "${dirs[@]}" \
    -type l -prune -o \
    -name 'cache' -prune -o \
    -name '.cache' -prune -o \
    -name '.local' -prune -o \
    -name '.thumbnails' -prune -o \
    -name '.macromedia' -prune -o \
    -name '.mozilla' -prune -o \
    -name '.xsession-errors' -prune -o \
    -name '.all-files-*' -prune -o \
    -wholename '*/testfn/bad' -prune -o \
    -wholename '*/.config/chromium' -prune -o \
    -wholename '*/Crash Reports' -prune -o \
    -wholename '*/ChromiumUnsnapped' -prune -o \
    -wholename '*/snap/chromium' -prune -o \
    "${exclop[@]}" \
    \( -type f -printf "//%p//\n" \) \
  | sed -e 's:/[.]/:/:g' -e 's:/[.]/:/:g' -e 's:/[.]/:/:g' -e 's:/[.]/:/:g' \
  > "${tfile}"

# Check for bad characters and line breaks.  The helper reads the raw list
# on stdin; its stdout becomes the list of good names, and it is given the
# name of the error file through the gawk variable {efile}.
# NOTE(review): the helper is presumably equivalent to the commented-out
# gawk program below -- confirm against list_bad_filenames.gawk.
list_bad_filenames.gawk \
    -v efile="${efile}" \
  < "${tfile}" \
  > "${gfile}"
wc -l "${gfile}" 1>&2

# gawk \
#   -v efile="${efile}" \
#   ' (NF != 1) { printf "(0) %s\n", $0 > efile; next; } # Embedded blanks.
#     /[^ ][\/][\/][^ ]/ { printf "(1) %s\n", $0 > efile; next; } # Embedded "//".
#     /["\\]/ { printf "(2) %s\n", $0 > efile; next; } # Embedded doublequotes, backslashes.
#     /['"'"']/ { printf "(3) %s\n", $0 > efile; next; } # Embedded single quotes.
#     ($1 !~ /^[\/][\/].+[\/][\/]$/) { printf "(6) %s\n", $0 > efile; next; } # Embedded newlines?
#     // { print; }
#   ' \
#   ${tfile} \
#   > ${gfile}

# If there were any bad filenames, show them:
if [[ -s "${efile}" ]]; then
  echo '** bad filenames found:' 1>&2
  cat "${efile}" 1>&2
fi

# Compute the checksums.  The "//" delimiters are turned into single quotes
# so that xargs takes each line as exactly one argument (names containing
# quotes were rejected above).  "-r" keeps xargs from running cksum on an
# empty list, which would emit a 2-field line and trip the NF check below;
# "--" keeps names that begin with "-" from being parsed as cksum options.
# echo "computing checksums ..." 1>&2
sed -e "s:^//:':" -e "s://\$:':" "${gfile}" \
  | xargs -r -L 500 cksum -- \
  | gawk \
      ' (NF != 3) { printf "** bug NF\n[[%s]]\n", $0 > "/dev/stderr"; exit(1); }
        /^[0-9]+ [0-9]+ / { printf "%010d %14d %s\n", $1, $2, $3; next; }
        // { printf "** bug format\n[[%s]]\n", $0 > "/dev/stderr"; exit(1); }
      '
# Status of the last pipeline stage (gawk): non-zero iff a malformed cksum
# line was seen.  Failures of cksum on individual files do not change it.
status=$?

# The scratch directory is removed by the EXIT trap.
exit "${status}"