#! /bin/bash
# Last edited on 2024-07-18 17:21:23 by stolfi

# Name under which the script was invoked, without the directory part:
PROG_NAME=${0##*/}

# One-line summary, used in the NAME section of the manual text below:
PROG_DESC="merges a \".csdf\" file with an \".sdf\" file"

# Command synopsis; one array element per usage form:
PROG_HELP=(
  "${PROG_NAME} {FILE1} {FILE2}"
)

# Manual-page style description of the script.  Every element starts
# with "\n", so the text is presumably meant to be printed with
# {echo -e}, like the usage message in the command line parsing section.
PROG_INFO=(
  "\nNAME"
  "\n  ${PROG_NAME} - ${PROG_DESC}."
  "\n"
  "\nSYNOPSIS"
  "\n  ${PROG_HELP[@]}"
  "\n"
  "\nDESCRIPTION"
  "\n  Reads a \".csdf\" file {FILE1} as would be produced by"
  "\n {find_all_files_cksum_size_date.sh}, and a \".sdf\" file {FILE2}"
  "\n as would be produced by {find_all_files_size_date.sh}.  Assumes"
  "\n that the latter is recent.  Writes out a file like {FILE1} with"
  "\n all files in {FILE2}, computing their checksums as needed."
  "\n"
  "\n  Assumes that each line of {FILE1} has fields"
  "\n \"{CKSUM} {SIZE} {DATE} {FNAME}\", and each line"
  "\n of {FILE2} has fields \"{SIZE} {DATE} {FNAME}\".  The"
  "\n {DATE} field (mod-date) in either file must not have"
  "\n a decimal fraction part (nanoseconds)."
  "\n"
  "\n  The script pairs up the lines of the two files by the {SIZE}, {DATE}"
  "\n and {FNAME} fields, and outputs a file in the same format"
  "\n as {FILE1} with the data for all files in {FILE2}."
  "\n"
  "\n  More precisely, for each paired line, the script assumes that"
  "\n the file still exists and its size, date, and checksum have not"
  "\n changed, and writes the line to standard output.  Each unpaired line"
  "\n from  {FILE1} is flagged, written to {stderr}, and excluded.  For"
  "\n each unpaired line from {FILE2}, the script checks whether"
  "\n the file exists and its size has not changed, then"
  "\n computes the checksum on the spot and writes the line out, trusting"
  "\n that the mod-date is the same as in {FILE2}."
  "\n"
  "\n  Both files are filtered through {check_cksum_size_date_file_format.gawk}.  Lines"
  "\n that are rejected by this filter (e.g. because they have forbidden characters"
  "\n in the {FNAME}) are flagged to {stderr} and excluded."
  "\n"
  "\nOPTIONS"
  "\nSEE ALSO"
  "\n  find(1), find_all_files_size_date, find_all_files_cksum_size"
  "\nAUTHOR"
  "\n  Created 2007-01-17, 2017-05-04, 2022-09-21 by Jorge Stolfi, Unicamp"
)

# ----------------------------------------------------------------------
# COMMAND LINE PARSING

# Parses the script arguments given as "$@": consumes any leading
# switches, then stores the next two arguments in the global variables
# {file1} and {file2} (empty when the argument is missing).  An
# unrecognized switch prints a usage message to stderr and exits with
# status 1.
#
# An argument counts as a switch only if it BEGINS with "-", so a file
# name like "dir/-x.csdf" is accepted as a file and not misread as an
# option.
parse_command_line() {
  # Parse command line switches:
  while [[ ( $# -ge 1 ) && ( "$1" == -* ) ]]; do
    if [[ "$1" == "-BOGUS" ]]; then
      # Placeholder switch; accepted and ignored.
      shift
    else
      echo "unknown option $1" 1>&2
      echo -e "usage:\n  ${PROG_HELP[*]}" 1>&2
      exit 1
    fi
  done
  file1="${1-}"
  file2="${2-}"
}

parse_command_line "$@"

# END COMMAND LINE PARSING
# ----------------------------------------------------------------------

# Validate the file name extensions: the first argument must be the
# ".csdf" file (with checksums), the second the ".sdf" file (without).
# Each message names the offending argument.
if [[ "@${file1##*.}" != "@csdf" ]]; then
  echo "** file \"${file1}\" should be a \".csdf\" file" 1>&2; exit 1
fi

if [[ "@${file2##*.}" != "@sdf" ]]; then
  echo "** file \"${file2}\" should be an \".sdf\" file" 1>&2; exit 1
fi

# Intermediate files live in a private directory created by {mktemp},
# instead of predictable "/tmp/{PID}-..." names that another user could
# pre-create or symlink.  The directory is placed directly under "/tmp"
# (as the old names were) and has no blanks in its name.  It is removed
# automatically when the script exits, whatever the exit path.
tmp_dir="$(mktemp -d /tmp/merge-csdf-sdf.XXXXXX)"
if [[ -z "${tmp_dir}" || ! -d "${tmp_dir}" ]]; then
  echo "** cannot create a temporary directory" 1>&2; exit 1
fi
trap 'rm -rf -- "${tmp_dir}"' EXIT

tmp="${tmp_dir}/$$"
tmp_file1="${tmp}-f1.csdf"     # Re-keyed, sorted version of {file1}.
tmp_file2="${tmp}-f2.sdf"      # Re-keyed, sorted version of {file2}.
tmp_join="${tmp}-f12.sdfcc"    # Result of joining the two.

# Rewrite each input as lines "{SIZE}#{DATE}#{FNAME} {CKSUM}", so that
# the three identifying fields form a single join key, and sort on that
# key.  Lines of {file2} get the placeholder checksum "unknown".
#
# {sort} and {join} must agree on the ordering of the key, so both run
# with LC_ALL=C and the sort key is limited to the first field
# ("-k 1b,1").  Sorting whole lines under the locale of the user can
# order two keys differently from how {join} compares them (e.g. when
# one key is a prefix of the other), which makes {join} complain about
# unsorted input and miss pairings.

echo "sorting ${file1}..." 1>&2
check_cksum_size_date_file_format.gawk -v format="csdf" < "${file1}" \
  | gawk '{ printf "%s#%s#%s %s\n", $2, $3, $4, $1; }' \
  | LC_ALL=C sort -k 1b,1 \
  > "${tmp_file1}"

echo "sorting ${file2}..." 1>&2
check_cksum_size_date_file_format.gawk -v format="sdf" < "${file2}" \
  | gawk '{ printf "%s#%s#%s unknown\n", $1, $2, $3; }' \
  | LC_ALL=C sort -k 1b,1 \
  > "${tmp_file2}"

# Merge into file with fields "{SIZE}#{DATE}#{FNAME} {CKSUM1} {CKSUM2}"
# then write "{CKSUM} {SIZE} {DATE} {FNAME}".  Unpaired lines are kept
# ("-a1 -a2"), with the missing checksum shown as "??????????":
echo "joining the files..." 1>&2
LC_ALL=C join \
    -a1 -a2 -e '??????????' -j1 -o 0,1.2,2.2 \
    "${tmp_file1}" \
    "${tmp_file2}" \
  > "${tmp_join}"
  
# Merges the joined lines read from standard input, each of the form
# "{SIZE}#{DATE}#{FNAME} {CKSUM1} {CKSUM2}", and writes the resulting
# "{CKSUM} {SIZE} {DATE} {FNAME}" lines to standard output:
#
#   * a paired line ({CKSUM1} known, {CKSUM2} = "unknown") is written
#     out with the checksum taken from file 1;
#   * a line only in file 1 ({CKSUM2} = "??????????") is reported on
#     stderr and excluded;
#   * a line only in file 2 ({CKSUM1} = "??????????") is reported on
#     stderr, and its checksum is computed on the spot with {cksum};
#     if the size reported by {cksum} differs, a warning is printed
#     and the new size is used.
#
# Diagnostics and the final line counts go to stderr.  The return
# status is that of {gawk}: 1 when an inconsistency ("** bug ...") made
# it stop early.
merge_joined_lines() {
  gawk '
      # Returns {str} quoted so that the shell takes it as one literal
      # word.  The quote character is written as \047 because this
      # whole program sits inside a single-quoted shell string.
      function shell_quote(str,    pieces, npieces, i, quoted) {
        npieces = split(str, pieces, "\047");
        quoted = "\047" pieces[1];
        for (i = 2; i <= npieces; i++) {
          quoted = quoted "\047\\\047\047" pieces[i];
        }
        return quoted "\047";
      }

      BEGIN {
        npa = 0; nunp1 = 0; nunp2 = 0
      }

      {
        # Split the join key back into its three fields:
        gsub(/[#]/, " ", $0);
        if (NF != 5) { printf "** bug NF\n[[%s]]\n", $0 > "/dev/stderr";  exit(1); }
        sz = $1; dt = $2; na = $3; ck1 = $4; ck2 = $5;
        ok = 1;
        if (ck2 == "??????????") {
          printf "only in file 1: %10s %14s %s %s\n", ck1, sz, dt, na > "/dev/stderr";
          ck = ck1;
          nunp1++;
          ok = 0;
        } else {
          if (ck2 != "unknown") {
            printf "** bug {CKSUM2}\n[[%s]]\n", $0 > "/dev/stderr";  exit(1);
          }
          if (ck1 == "??????????") {
            printf "only in file 2: %10s %14s %s %s\n", "---", sz, dt, na > "/dev/stderr";
            nunp2++;
            # Read the output of {cksum} through a pipe, with the file
            # name quoted for the shell.  {ckout} is cleared first so
            # that a failed {cksum} cannot leave the result of a
            # previous file in it.
            cmd = "cksum -- " shell_quote(na);
            ckout = "";
            if ((cmd | getline ckout) <= 0) { ckout = ""; }
            close(cmd);
            nck = split(ckout, ckflds)
            if (nck != 3) { printf "** bug cksum (n) = \"%s\"\n[[%s]]\n",ckout,$0 > "/dev/stderr";  exit(1); }
            ckck = ckflds[1]
            szck = ckflds[2]
            nack = ckflds[3]
            if ((! match(ckck, /^[0-9]+$/)) || (nack != na)) {
              printf "** bug cksum (f) = \"%s\"\n[[%s]]\n",ckout,$0 > "/dev/stderr";  exit(1);
            }
            if (szck != sz) {
              printf "!! size changed: %d -> %d\n[[%s]]\n",sz,szck,$0 > "/dev/stderr";
              sz = szck;
            }
            ck = sprintf("%010d", ckck);
          } else {
            ck = ck1;
            npa++;
          }
        }
        # Whatever its origin, the checksum must be exactly ten digits:
        if (! match(ck, /^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$/)) {
          printf "** bug {CKSUM1} = %s\n[[%s]]\n", ck, $0 > "/dev/stderr";  exit(1);
        }
        if (ok) { printf "%010s %14s %s %s\n", ck, sz, dt, na; }
      }

      END {
        printf "%10d paired lines\n", npa > "/dev/stderr";
        printf "%10d lines only in file 1\n", nunp1 > "/dev/stderr";
        printf "%10d lines only in file 2\n", nunp2 > "/dev/stderr";
      }
    '
}

merge_joined_lines < "${tmp_join}"

# Remember the status of the merge step.  This must stay the first
# command after the merge, since "$?" reports only the most recent one.
# The merge stops with a non-zero status on any "** bug ..." condition,
# and the script must report that instead of always claiming success.
merge_status=$?

# rm -f ${tmp_file2} ${tmp_file1} ${tmp_join}
exit "${merge_status}"