#! /bin/bash
#
# Reports files under the given {TARGETDIR}s that apparently also occur
# under {REFDIR} (outside the target directories), judging by equal
# checksum and equal size.
#
# Inputs : "{ALLFNAME}.sdf" (fields: size, date, name) and
#          "{ALLFNAME}.csf" (fields: checksum, size, name).
# Output : the duplicates report on stdout; progress and diagnostics on stderr.
# Needs  : gawk, join, sort, grep, wc, mktemp and the external "bool" tool.
#
# Pipeline:
#   1. drop entries smaller than ${minsize}, prefix names with "./",
#      and sort both lists by name;
#   2. join them by name into (checksum, size, date, name) records;
#   3. select the records under {REFDIR} and under the {TARGETDIR}s;
#   4. remove the target records from the reference set ("bool 1-2");
#   5. join the two sets by checksum, keep pairs of equal size, print them.

set -u   # a misspelled or never-assigned variable is an error, not an empty string

cmd="${0##*/}"
PROG_NAME="${cmd}"   # the help texts below expand ${PROG_NAME}
PROG_DESC="finds files in given directories that are duplicated in other places"
PROG_HELP=(
  "${PROG_NAME} [-sameDate] [-sameTime] [-sameName] {ALLFNAME} {REFDIR} {TARGETDIR}... > {REPORT}.txt"
)
# Embedded manual page; not printed by this script.
# shellcheck disable=SC2034
PROG_INFO=(
  "\nNAME"
  "\n ${PROG_NAME} - ${PROG_DESC}."
  "\n"
  "\nSYNOPSIS"
  "\n ${PROG_HELP[@]}"
  "\n"
  "\nDESCRIPTION"
  "\n Finds files in one of the given {TARGETDIR}s that apparently also occur in {REFDIR} "
  "\n (excluding from the latter all {TARGETDIR}s that are subdirectories of it)."
  "\n"
  "\n In principle, two files are flagged as possibly equal if they"
  "\n have the same size and checksum. The user may request more stringent criteria."
  "\n"
  "\n Files that are too short are ignored, as they could give too many checksum coincidences."
  "\n"
  "\n The scrips uses data collected by {find_all_files_size_date.sh} in"
  "\n file \"{ALLFNAME}.sdf\" and by {find_all_files_cksum_size.sh} in"
  "\n file \"{ALLFNAME}.csf\". Entries in those files"
  "\n that cannot be paired by wholename are flagged and ignored."
  "\n"
  "\n The directories {REFDIR} and {TARGETDIR} must be prefixes of"
  "\n the file names in those reports. Any leading \"./\" are removed"
  "\n from these arguments, so {REFDIR} may be \"./\" to mean \"all files"
  "\n listed in those reports minus the target diretories\"."
  "\n"
  "\nSEE ALSO"
  "\n "
  "\nAUTHOR"
  "\n Created 2020-09-06 by Jorge Stolfi, Unicamp"
)

# Prints the arguments to stderr and aborts the script with status 1.
die() {
  printf '%s\n' "$*" 1>&2
  exit 1
}

# Prints message $1 followed by the usage line to stderr, then exits with status 1.
usage_error() {
  echo "$1" 1>&2
  echo "usage: ${PROG_HELP[*]}" 1>&2
  exit 1
}

# Writes to stdout the canonical form "./a/b/" of the relative directory $1:
# exactly one leading "./", no repeated slashes, exactly one trailing "/".
# Both "." and "./" (and the empty string) become "./".
normalize_dir() {
  printf '%s\n' "$1" \
    | sed \
        -e 's:^:./:' \
        -e 's:/\{1,\}:/:g' \
        -e 's:/*$:/:' \
        -e ':a' -e 's:^[.]/[.]/:./:' -e 'ta'
}

# ----------------------------------------------------------------------
# Command line parsing.

# NOTE(review): these three flags are parsed but nothing below reads them,
# so the "more stringent criteria" options currently have no effect.
# shellcheck disable=SC2034
sameDate=0
# shellcheck disable=SC2034
sameTime=0
# shellcheck disable=SC2034
sameName=0
while [[ $# -ge 1 && "$1" == -* ]]; do
  case "$1" in
    -sameDate) sameDate=1 ;;
    -sameTime) sameTime=1 ;;
    -sameName) sameName=1 ;;
    *) usage_error 'unrecognized option "'"$1" ;;
  esac
  shift
done

# {ALLFNAME}, {REFDIR} and at least one {TARGETDIR} are required.  With no
# target directory every entry would be treated as a target and the report
# would be silently empty.
if [[ $# -lt 3 ]]; then
  usage_error "missing directory names"
fi

allfname="$1"; shift
refdir="$1"; shift
thedirs=( "$@" )

minsize=8   # entries smaller than this are ignored

# Standardize leading "./" and trailing "/".  The expansions are quoted so
# that an empty or odd argument cannot vanish or split and shift the list.
newdirs=()
for xdir in "${refdir}" "${thedirs[@]}"; do
  if [[ "${xdir}" == /* ]]; then
    echo "${xdir}: absolute directories not allowed" 1>&2; exit 1
  fi
  newdirs+=( "$(normalize_dir "${xdir}")" )
done
refdir="${newdirs[0]}"
thedirs=( "${newdirs[@]:1}" )

# Fail early, with a clear message, on missing inputs or tools.
for ext in sdf csf; do
  [[ -r "${allfname}.${ext}" ]] || die "${allfname}.${ext}: cannot read input file"
done
for tool in gawk join sort grep bool; do
  command -v "${tool}" > /dev/null || die "${tool}: required command not found"
done

# Private scratch directory, removed on every exit path.
tmpdir="$(mktemp -d)" || die "cannot create temporary directory"
cleanup() { rm -rfv -- "${tmpdir:?}" 1>&2; }
trap cleanup EXIT
tmp="${tmpdir}/$$"

echo "allfname = [ ${allfname} ]" 1>&2
echo "refdir = [ ${refdir} ]" 1>&2
echo "thedirs (${#thedirs[@]}) = [ ${thedirs[*]} ]" 1>&2
echo "tmp = [ ${tmp} ]" 1>&2

# ----------------------------------------------------------------------
echo "joining the two all-files lists..." 1>&2

sfname="${tmp}_sort"
exts=( sdf csf )
for kk in 0 1; do
  ext="${exts[kk]}"
  # The size is field 1 of the ".sdf" file and field 2 of the ".csf" file.
  gawk \
      -v kk="${kk}" \
      -v minsize="${minsize}" \
      ' ($(1+kk)+0 >= minsize) { $3 = ("./" $3); print; }' \
      < "${allfname}.${ext}" \
    | sort -k3,3 \
    > "${sfname}.${ext}"
  wc -l "${sfname}.${ext}" 1>&2
done

# Pair the entries by name; "???" marks fields of entries present in only one list.
join \
    -1 3 -2 3 \
    -a1 -a2 \
    -e '???' \
    -o 1.1,1.2,2.1,2.2,0 \
    "${sfname}.csf" "${sfname}.sdf" \
  | gawk \
      '
        /[?][?][?]/ { printf "%d: !! unpaired, ignored [%s]\n", FNR, $0 > "/dev/stderr"; next }
        (NF != 5) { printf "%d: ** bad NF = %d [%s]\n", FNR, NF, $0 > "/dev/stderr"; exit(1) }
        ($2 != $3) { printf "%d: !! discrepant sizes in .csf and .sdf, ignored = [%s]\n", FNR, $0 > "/dev/stderr"; next }
        // { print $1, $2, $4, $5; }
      ' \
  > "${sfname}.csdf"
pstat=( "${PIPESTATUS[@]}" )
# The gawk stage exits non-zero only on malformed records; do not go on with truncated data.
(( pstat[1] == 0 )) || die "** malformed entries in the joined all-files list, aborted"
wc -l "${sfname}.csdf" 1>&2

# ----------------------------------------------------------------------
echo "extracting the relevant entries from the all-files list..." 1>&2

xfpref="${tmp}_ex"
for key in ref the; do
  # One fixed-string pattern per directory; the leading blank anchors the
  # pattern at the start of the name field.
  pfile="${xfpref}_${key}.pats"
  if [[ "${key}" == ref ]]; then
    printf ' %s\n' "${refdir}" > "${pfile}"
  else
    printf ' %s\n' "${thedirs[@]}" > "${pfile}"
  fi
  xfname="${xfpref}_${key}"
  grep -F -f "${pfile}" < "${sfname}.csdf" \
    | sort \
    > "${xfname}.csdf"
  wc -l "${xfname}.csdf" 1>&2
done

echo "removing targets from reference..." 1>&2
bool 1-2 "${xfpref}_ref.csdf" "${xfpref}_the.csdf" > "${xfpref}_oth.csdf"

echo "sorting _oth and _the data by cksum..." 1>&2
for key in oth the; do
  ifile="${xfpref}_${key}.csdf"
  ofile="${xfpref}_${key}_sort.csdf"
  sort -k 1,1 < "${ifile}" > "${ofile}"
  wc -l "${ofile}" 1>&2
done
othlist="${xfpref}_oth_sort.csdf"
thelist="${xfpref}_the_sort.csdf"

# ----------------------------------------------------------------------
echo "joining _oth and _the data by cksum, filtering by same size..." 1>&2

# Joined fields: cksum, size(the), size(oth), date(the), date(oth), name(the), name(oth).
duplist="${tmp}_dups.csdfdf"
join \
    -1 1 -2 1 \
    -o 0,1.2,2.2,1.3,2.3,1.4,2.4 \
    "${thelist}" "${othlist}" \
  | gawk \
      '
        (NF != 7) { printf "%d: bad NF = %d [%s]\n", FNR, NF, $0 > "/dev/stderr"; exit(1) }
        ( ($2+0) != ($3+0) ) {
          printf "%d: !! same checsum (%d) but different sizes\n", FNR, $1 > "/dev/stderr";
          printf " [%14s %s %s]\n", $2, $4, $6 > "/dev/stderr";
          printf " [%14s %s %s]\n", $3, $5, $7 > "/dev/stderr";
          next
        }
        // { printf "%010d %14s %s %-80s %s %-80s\n", $1, $2, $4, $6, $5, $7 }
      ' \
  | sort -k6,6 -k4,4 \
  > "${duplist}"
pstat=( "${PIPESTATUS[@]}" )
(( pstat[1] == 0 )) || die "** malformed entries in the duplicates list, aborted"
wc -l "${duplist}" 1>&2

cat "${duplist}"

# The scratch files are removed by the EXIT trap.
exit 0