#! /bin/bash

# Program identification and built-in documentation.
#
#   cmd        basename of the path this script was invoked by.
#   PROG_NAME  same value; it is the name interpolated into the usage and
#              manual text below, so it must be assigned before they are built.
#   PROG_DESC  one-line description.
#   PROG_HELP  usage synopsis (printed on command-line errors).
#   PROG_INFO  manual-page style text, one array element per line.  The "\n"
#              sequences are stored literally (double quotes do not expand
#              them).  PROG_INFO is not referenced anywhere else in this script.
cmd="${0##*/}"
PROG_NAME="${cmd}"
PROG_DESC="finds files in given directories that are duplicated in other places"
PROG_HELP=( "${PROG_NAME} [-sameDate] [-sameTime] [-sameName] {ALLFNAME} {REFDIR} {TARGETDIR}... > {REPORT}.txt" )
PROG_INFO=(
  "\nNAME"
  "\n  ${PROG_NAME} - ${PROG_DESC}."
  "\n"
  "\nSYNOPSIS"
  "\n  ${PROG_HELP[@]}"
  "\n"
  "\nDESCRIPTION"
  "\n  Finds files in one of the given {TARGETDIR}s that apparently also occur in {REFDIR} "
  "\n (excluding from the latter all {TARGETDIR}s that are subdirectories of it)."
  "\n"
  "\n  In principle, two files are flagged as possibly equal if they"
  "\n have the same size and checksum.  The user may request more stringent criteria."
  "\n"
  "\n  Files that are too short are ignored, as they could give too many checksum coincidences."
  "\n"
  "\n  The scrips uses data collected by {find_all_files_size_date.sh} in"
  "\n file \"{ALLFNAME}.sdf\" and by {find_all_files_cksum_size.sh} in"
  "\n file \"{ALLFNAME}.csf\".  Entries in those files"
  "\n that cannot be paired by wholename are flagged and ignored."
  "\n"
  "\n  The directories {REFDIR} and {TARGETDIR} must be prefixes of"
  "\n the file names in those reports.  Any leading \"./\" are removed"
  "\n from these arguments, so {REFDIR} may be \"./\" to mean \"all files"
  "\n listed in those reports minus the target diretories\"."
  "\n"
  "\nSEE ALSO"
  "\n  "
  "\nAUTHOR"
  "\n  Created 2020-09-06 by Jorge Stolfi, Unicamp"
)

# parse_options ARG...
#   Consumes the leading arguments that start with "-" and records them in
#   global variables:
#     sameDate, sameTime, sameName   0 by default, 1 when the flag was given;
#     sortDate, sortTime, sortName   a sort key option by default, emptied
#                                    when the corresponding flag was given;
#     nopts                          number of arguments recognized as options,
#                                    i.e. how many the caller must shift away.
#   An unknown option prints a diagnostic plus the usage line on stderr and
#   terminates the script with status 1.
#   NOTE(review): none of the same*/sort* variables is read again anywhere in
#   this script, so the three flags currently have no effect on the report --
#   confirm whether the filtering was meant to be wired into the final join.
parse_options() {
  sameDate=0; sortDate=( -k2,2 )
  sameTime=0; sortTime=( -k3,3 )
  sameName=0; sortName=( -k7,7 )
  nopts=0
  while [[ $# -ge 1 && "$1" == -* ]]; do
    case "$1" in
      -sameDate) sameDate=1; sortDate=( ) ;;
      -sameTime) sameTime=1; sortTime=( ) ;;
      -sameName) sameName=1; sortName=( ) ;;
      *)
        # The offending option is shown between a balanced pair of quotes.
        echo "unrecognized option \"$1\"" 1>&2
        echo "usage: ${PROG_HELP[*]}" 1>&2
        exit 1
        ;;
    esac
    shift
    nopts=$(( nopts + 1 ))
  done
}

parse_options "$@"
# A function cannot shift its caller's parameters, so drop the options here.
shift "${nopts}"

# At least two positional arguments must remain once the options are gone;
# otherwise complain on stderr, show the usage line and stop with status 1.
# NOTE(review): the synopsis lists three positionals ({ALLFNAME} {REFDIR}
# {TARGETDIR}...) -- confirm whether the minimum accepted here should be 3.
if (( $# < 2 )); then
  printf '%s\n' "missing directory names" >&2
  printf 'usage: %s\n' "${PROG_HELP[*]}" >&2
  exit 1
fi

# Positional arguments: common prefix of the two all-files lists, the
# reference directory, and the remaining words as target directories.
allfname="$1"; shift
refdir="$1"; shift
thedirs=( "$@" )

# Scratch files are named "${tmp}_*".  They live inside a private directory
# created by mktemp, so their names cannot be predicted or pre-created by
# another user of /tmp, and the EXIT trap removes the directory on every
# exit path, including the error exits.
tmpdir="$(mktemp -d "/tmp/$$.XXXXXXXX")" \
  || { echo "cannot create temporary directory" 1>&2; exit 1; }
trap 'rm -rf -- "${tmpdir}"' EXIT
tmp="${tmpdir}/$$"

# Size threshold: list entries whose size field is below it are dropped when
# the all-files lists are loaded.
minsize=8

# normalize_dir DIR
#   Writes DIR to stdout in the canonical form used for matching list
#   entries: exactly one leading "./", exactly one trailing "/", and every
#   run of slashes collapsed to a single "/".
#   Examples: "foo" -> "./foo/", "././a//b/" -> "./a/b/", "." and "./" -> "./".
normalize_dir() {
  # 1. prepend "./"; 2. force a single trailing "/"; 3. collapse slash runs
  # everywhere (needs the g flag); 4. reduce repeated leading "./" to one.
  printf '%s\n' "$1" \
    | sed \
        -e 's:^:./:' \
        -e 's:/*$:/:' \
        -e 's://*:/:g' \
        -e 's:^\([.]/\)\{1,\}:./:'
}

# Standardize leading "./" and trailing "/":
# The expansions are quoted so that an argument is never word-split,
# glob-expanded, or (when empty) silently dropped from the list.
newdirs=()
for xdir in "${refdir}" "${thedirs[@]}"; do
  if [[ "${xdir}" == /* ]]; then
    echo "${xdir}: absolute directories not allowed" 1>&2; exit 1
  fi
  newdirs+=( "$(normalize_dir "${xdir}")" )
done
# First entry is the reference directory, the rest are the targets.
refdir="${newdirs[0]}"
thedirs=( "${newdirs[@]:1}" )

# Show the effective parameters on stderr.
echo "allfname = [ ${allfname} ]" 1>&2
echo "refdir = [ ${refdir} ]" 1>&2
echo "thedirs (${#thedirs[@]}) = [ ${thedirs[*]} ]" 1>&2
echo "tmp = [ ${tmp} ]" 1>&2

# Load the two all-files lists and pair their entries by file name.
#
# "${allfname}.sdf" carries the size in field 1 and "${allfname}.csf" in
# field 2 (hence the $(1+kk) test); both carry the file name in field 3.
# Entries smaller than ${minsize} are dropped, "./" is prepended to the name,
# and each list is sorted by name so that join(1) can pair them.
echo "joining the two all-files lists..." 1>&2
sfname="${tmp}_sort"
exts=( sdf csf )
for kk in 0 1 ; do
  ext="${exts[$kk]}"
  infile="${allfname}.${ext}"
  # A missing list would otherwise just produce an empty report.
  if [[ ! -r "${infile}" ]]; then
    echo "${infile}: cannot read input list" 1>&2
    rm -f -- "${tmp:?}"_*
    exit 1
  fi
  gawk \
      -v kk="${kk}" \
      -v minsize="${minsize}" \
      ' ($(1+kk)+0 >= minsize) { $3 = ("./" $3); print; }' \
      < "${infile}" \
    | sort -k3,3 \
    > "${sfname}.${ext}"
  wc -l "${sfname}.${ext}" 1>&2
done

# Joined record: cksum, size (.csf), size (.sdf), second .sdf field, name.
# Entries present in only one list get "???" fillers and are reported and
# skipped, as are entries whose two sizes disagree.  Output record: cksum,
# size, second .sdf field, name.
# The pipeline status is that of gawk (its last stage), so a malformed
# record (gawk exit 1) now stops the script instead of being ignored.
join \
    -1 3 -2 3 \
    -a1 -a2 \
    -e '???' \
    -o 1.1,1.2,2.1,2.2,0 \
    "${sfname}.csf" "${sfname}.sdf" \
  | gawk \
      ' /[?][?][?]/ { printf "%d: !! unpaired, ignored [%s]\n", FNR, $0 > "/dev/stderr"; next }
        (NF != 5) { printf "%d: ** bad NF = %d [%s]\n", FNR, NF, $0 > "/dev/stderr"; exit(1) }
        ($2 != $3) { 
          printf "%d: !! discrepant sizes in .csf and .sdf, ignored = [%s]\n", FNR, $0 > "/dev/stderr";
          next
        }
        // { print $1, $2, $4, $5; }
      ' \
  > "${sfname}.csdf" \
  || {
    echo "** aborted: could not build the joined all-files list" 1>&2
    rm -f -- "${tmp:?}"_*
    exit 1
  }
wc -l "${sfname}.csdf" 1>&2

# Split the joined list into the entries under the reference directory
# ("ref") and the entries under any target directory ("the").
#
# Each directory becomes a fixed-string pattern with a leading blank, i.e.
# the field separator followed by the start of the file-name field.
echo "extracting the relevant entries from the all-files list..." 1>&2
xfpref="${tmp}_ex"
for key in ref the ; do
  pfile="${xfpref}_${key}.pats"
  if [[ "${key}" == ref ]]; then
    dpats=( "${refdir}" )
  else
    dpats=( "${thedirs[@]}" )
  fi
  if (( ${#dpats[@]} > 0 )); then
    printf ' %s\n' "${dpats[@]}" > "${pfile}"
  else
    # No directories: an empty pattern file selects nothing, whereas a lone
    # " " pattern would select every record.
    : > "${pfile}"
  fi
  xfname="${xfpref}_${key}"
  # grep -F replaces the obsolescent fgrep; the whole-line sort is kept
  # because these two files are compared with each other afterwards.
  grep -F -f "${pfile}" < "${sfname}.csdf" \
    | sort \
    > "${xfname}.csdf"
  wc -l "${xfname}.csdf" 1>&2
done

# Build the "other" list from the reference and target extracts.
# NOTE(review): `bool` is an external command not defined in this script;
# judging by the message below, "1-2" keeps the entries of the first file
# that do not occur in the second -- confirm against that tool.
echo "removing targets from reference..." 1>&2
bool 1-2 "${xfpref}_ref.csdf" "${xfpref}_the.csdf" > "${xfpref}_oth.csdf"

# Re-sort both lists on their first field (the checksum), which is the key
# of the join that follows.  Each "*_sort.csdf" is produced from the file of
# the same name without the "_sort" part.
echo "sorting _oth and _the data by cksum..." 1>&2
othlist="${xfpref}_oth_sort.csdf"
thelist="${xfpref}_the_sort.csdf"
for sorted in "${othlist}" "${thelist}"; do
  unsorted="${sorted%_sort.csdf}.csdf"
  cat "${unsorted}" | sort -k 1,1 > "${sorted}"
  wc -l "${sorted}" 1>&2
done

# Pair every target entry with every "other" entry that has the same
# checksum.  Joined record: cksum, size (target), size (other), third field
# (target), third field (other), name (target), name (other).
# Pairs whose sizes differ are reported on stderr and dropped; the rest are
# written as: cksum, size, target third field, target name, other third
# field, other name -- sorted by the other name, then the target name.
echo "joining _oth and _the data by cksum, filtering by same size..." 1>&2
duplist="${tmp}_dups.csdfdf"
join \
    -1 1 -2 1 \
    -o 0,1.2,2.2,1.3,2.3,1.4,2.4 \
    "${thelist}" "${othlist}" \
  | gawk \
      ' (NF != 7) { printf "%d: bad NF = %d [%s]\n", FNR, NF, $0 > "/dev/stderr"; exit(1) }
        ( ($2+0) != ($3+0) ) {
          printf "%d: !! same checsum (%d) but different sizes\n", FNR, $1  > "/dev/stderr";
          printf "  [%14s %s %s]\n", $2, $4, $6  > "/dev/stderr";
          printf "  [%14s %s %s]\n", $3, $5, $7  > "/dev/stderr";
          next
        }
        // { printf "%010d %14s %s %-80s %s %-80s\n", $1, $2, $4, $6, $5, $7 }
      ' \
  | sort -k6,6 -k4,4 \
  > "${duplist}"
# gawk is the middle stage, so its status must be read from PIPESTATUS right
# after the pipeline; a malformed record must not yield a partial report.
pstat=( "${PIPESTATUS[@]}" )
if [[ "${pstat[1]}" -ne 0 ]]; then
  echo "** aborted: malformed record while pairing entries by checksum" 1>&2
  rm -f -- "${tmp:?}"_*
  exit 1
fi

wc -l "${duplist}" 1>&2

# Deliver the report on stdout.
cat "${duplist}"

# Discard every scratch file (names are listed on stderr) and finish.
rm -fv "${tmp}"_* >&2
exit 0