#! /usr/bin/gawk -f
# Last edited on 2024-07-14 15:08:19 by stolfi

# Reads a ".csdf" file produced by {find_all_files_cksum_size_date.sh},
# sorted in ascending order of checksum, size, and mod date.
# Every input line must have exactly four whitespace-separated fields:
# checksum, size, mod date, and a fourth field (presumably the file
# name -- confirm against the producer script).
#
# Outputs a list of all files that seem to be duplicated
# because they have the same checksum, size, and mod date.
# In the output, groups of two or more apparently identical files
# are printed together.  Each group is prefixed with a line
# "# group {cksum} {size} {date}", then has one line "# file {lin}"
# per file, where {lin} is a line of the input file, and is
# followed by a blank line.
#
# On a malformed input line, writes a message to stderr and exits
# with status 1 without dumping the group being accumulated.

BEGIN {
  split("", lines);  # Makes {lines} an empty array.
  clear_group();
  aborted = -1;      # Exit status set by {data_error}; negative means no error.
}

{
  if (NF != 4) { data_error("bad NF"); }
  if (! same_group($1, $2, $3)) {
    # The key changed: flush the previous group and start a new one.
    dump_group();
    clear_group();
  }
  store_line($1, $2, $3, $0);
}

END {
  # An {exit} in a main rule still runs the END rule, so an error
  # must be re-checked here to avoid dumping a partial group:
  if (aborted >= 0) { exit(aborted); }
  dump_group();
}

function clear_group() {
  # Resets the current group to empty.
  # The lines of the current group are {lines[0..n-1]}.
  n = 0;
  # Checksum, size, and date of the current group:
  ock = ""; osz = ""; odt = "";
}

function same_group(ck,sz,dt) {
  # Returns 1 if {ck,sz,dt} equals the key of the current group, 0 otherwise.
  # The values are compared as STRINGS (forced by appending ""): input
  # fields that look like numbers would otherwise be compared numerically,
  # so that distinct keys such as "0e12" and "0e99", or "007" and "7",
  # would wrongly be considered equal.
  return ((ck "") == (ock "")) && ((sz "") == (osz "")) && ((dt "") == (odt ""));
}

function store_line(ck,sz,dt,lin) {
  # Appends the input line {lin} to the current group and records
  # {ck,sz,dt} (as strings) as the key of the group.
  lines[n] = lin; n++;
  ock = ck ""; osz = sz ""; odt = dt "";
}

function dump_group(   i) {
  # Prints the current group, if it has 2 or more lines.
  # Groups with fewer lines are not duplicates and are skipped.
  if (n < 2) { return; }
  printf "# group %s %s %s\n", ock, osz, odt;
  for (i = 0; i < n; i++) {
    printf "# file %s\n", lines[i];
  }
  print "";
}

function data_error(msg) {
  # Reports {msg} on stderr with the current file name and line number,
  # records the failure in {aborted}, and terminates with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  aborted = 1;
  exit(1);
}