#! /usr/bin/gawk -f
# Last edited on 2022-08-17 18:34:18 by stolfi

# Reads a file produced by {find_all_files_cksum_size.sh}, sorted.
# Looks for two or more consecutive entries that have the same checksum and size.
# Outputs a file where each line has the form "{CKSUM} {SIZE} {FILE1} {FILE2} ..."
# where {FILE1}, {FILE2}, ... are the duplicated files.
#
# Only input lines that start with a digit are examined; each must have
# exactly three fields "{CKSUM} {SIZE} {FILE}". Entries must be in
# non-decreasing order of checksum and, for equal checksums, of size.
# On a malformed or out-of-order entry, a message is written to
# "/dev/stderr" and the script exits with status 1 without writing
# any further output.

BEGIN {
  ock = "";       # Checksum field of previous line.
  osz = "";       # Size field of previous line.
  nd = 0;         # Number of files with the {ock,osz} attributes.
  split("", fd);  # Names of files with {ock,osz} attributes, indexed {0..nd-1}.
  err = 0;        # Set to 1 by {data_error}; tells the END rule to skip the final dump.
}

/^[0-9]/ {
  if (NF != 3) { data_error("bad {NF}"); }
  ck = $1; sz = $2; fn = $3;
  if ((ck < ock) || ((ck == ock) && (sz < osz))) { data_error("out of order"); }
  if ((ck == ock) && (sz == osz)) {
    # Same attributes as the previous entry: add the file to the current group.
    fd[nd] = fn; nd++;
  } else {
    # New attributes: flush the previous group if it has duplicates, then start a new one.
    if (nd >= 2) { dumpem(); }
    ock = ck; osz = sz;
    fd[0] = fn; nd = 1;
  }
  next
}

END {
  # An {exit} executed in a main rule still runs this END rule, so an aborted
  # run must be detected here; otherwise the pending group would be printed
  # after the error message.
  if (err) { exit(1); }
  if (nd >= 2) { dumpem(); }
}

function dumpem(   i) {
  # Writes one output line with {ock}, {osz} and the {nd} file names {fd[0..nd-1]}.
  # The extra parameter {i} is a local loop index.
  printf "%010d %14d", ock, osz;
  for (i = 0; i < nd; i++) { printf " %s", fd[i]; }
  printf "\n";
}

function data_error(msg) {
  # Reports {msg} on "/dev/stderr", marks the run as failed, and exits with status 1.
  printf "** error: %s\n", msg > "/dev/stderr";
  err = 1;
  exit(1)
}