#! /usr/bin/gawk -f
# Last edited on 2024-07-14 15:08:19 by stolfi

# Reads a ".csdf" file produced by {find_all_files_cksum_size_date.sh},
# sorted in ascending order of checksum, size, and mod date.
# 
# Outputs a list of all files that seem to be duplicated
# because they have the same checksum, size, and mod date.
# In the output, groups of two or more apparently identical files
# are printed together. Each group is followed by a blank line.
# Each group is prefixed with a line "# group {cksum} {size} {date}",
# followed by one line "# file {lin}" for each file in the group,
# where {lin} is the corresponding line of the input file.

BEGIN {
  # {aborted} is negative while no data error has been detected;
  # {data_error} sets it to the exit status when the input is bad.
  aborted = -1;
  # Make sure {lines} exists as an (empty) array before first use:
  split("", lines);
  # Start with an empty current group:
  clear_group();
}

aborted >= 0 { exit aborted }  # Stop reading input once an error was flagged.

{
  # Each input record must have exactly four fields:
  # "{cksum} {size} {date} {name}".
  if (NF != 4) { data_error("bad NF"); }
  # Appending "" turns the keys into plain strings, so that the tests
  # below are string comparisons. Input fields that look like numbers
  # would otherwise be compared numerically, and distinct keys such as
  # "1e3" and "1000", or values that differ only beyond floating-point
  # precision, would be wrongly treated as equal.
  ck = $1 ""; sz = $2 ""; dt = $3 "";
  if ((ck != ock) || (sz != osz) || (dt != odt))
    { # The key changed: flush the previous group and start a new one.
      dump_group();
      clear_group();
    }
  # Save the whole input line as a member of the current group:
  store_line(ck, sz, dt, $0);
  next;
}

END {
  # After a data error, just propagate the failure status; otherwise
  # flush the last group, which is still pending at end of input.
  if (aborted < 0)
    { dump_group(); }
  else
    { exit aborted; }
}
  
function clear_group() {
  # Makes the current group empty.
  # Forget the checksum, size, and date of the current group:
  ock = "";
  osz = "";
  odt = "";
  # The lines of the current group are {lines[0..n-1]}, so setting
  # {n} to zero discards them:
  n = 0;
}

function store_line(ck,sz,dt,lin  ) {
  # Appends the input line {lin} to the current group, and records
  # {ck}, {sz}, {dt} as the checksum, size, and date of that group.
  ock = ck;
  osz = sz;
  odt = dt;
  lines[n++] = lin;
}

function dump_group(   i) {
  # Prints the current group to standard output, if it has two or more
  # lines; a group with fewer lines is not a duplicate set and is
  # silently skipped. Does not modify the group.
  # {i} is a local variable, not a parameter; call with no arguments.
  if (n < 2) { return; }
  # Header line with the key shared by all files in the group:
  printf "# group %s %s %s\n", ock, osz, odt;
  # One line per file, carrying the original input line:
  for (i = 0; i < n; i++)
    { printf "# file %s\n", lines[i]; }
  # A blank line terminates the group:
  print "";
}

function data_error(msg) {
  # Reports a problem with the current input record and terminates
  # the program with a failure status.
  # Flag the failure first, so that the {END} rule, which still runs
  # after {exit}, does not dump any pending group:
  aborted = 1;
  # The message is prefixed with the input file name and line number:
  printf("%s:%d: ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit aborted;
}