#! /usr/bin/gawk -f
# Last edited on 2003-09-18 23:23:51 by stolfi

# To be used as a filter by find-dup-files.
#
# Input records should contain
#   SIZE.CHECKSUM FILENAME
# The input must be sorted so that the entries whose relevant attributes
# are the same occur next to each other.
#
# Blank lines and lines whose first non-space character is "#" are skipped.
# Records that do not have exactly two fields are reported on stderr and
# ignored (so file names containing blanks are not handled).
#
# For every run of two or more consecutive records with the same
# SIZE.CHECKSUM key, the script prints a "dups \" line, then one
# "KEY NAME \" line per entry (sorted by file name), then a "-end" line
# and a blank line.
#
# Keys and names are always compared as STRINGS.  Input fields that look
# like numbers (e.g. "100.10" and "100.1") would otherwise be compared
# numerically by awk and be reported as duplicates of each other.

BEGIN {
  usage = ( \
    ARGV[0] " < IN > OUT" \
  );
  abort = -1;

  # Entries of the current batch live in ps[0..ndups-1] and pf[0..ndups-1].
  # Index -1 holds an empty sentinel, so that the first record of a batch
  # (when ndups == 0) is compared against "" and never matches.
  split("", ps); ps[-1] = ""; # Size and checksum
  split("", pf); pf[-1] = ""; # File name
  ndups = 0;
}

# Stop processing as soon as an error code has been posted in `abort'.
(abort >= 0) { exit abort; }

# Skip blank lines and comment lines.
/^ *([#]|$)/ { next; }

# Reject records that are not exactly "KEY NAME".
(NF != 2) { data_warning(("bad format, ignored [" $0 "]")); next; }

# Accumulate the record, closing the previous batch when the key changes.
/./ {
  same = (str_cmp($1, ps[ndups-1]) == 0);
  if (! same) { flush_batch(); }
  ps[ndups] = $1;
  pf[ndups] = $2;
  ndups++;
}

END {
  flush_batch();
}

function str_cmp(a, b)
{
  # Compares `a' and `b' as strings; returns -1, 0 or +1.
  # Concatenating "" strips the numeric-string attribute that awk attaches
  # to numeric-looking input fields, forcing a string comparison.
  a = a "";
  b = b "";
  if (a == b) { return 0; }
  return (a < b ? -1 : 1);
}

function flush_batch(   i,j,tmp,sgn)
{
  # Prints the current batch if it has at least two entries, then empties it.
  # Single-entry batches are discarded silently.
  if (ndups > 1)
    { printf "dups \\\n";
      # Sort matched entries by name (selection sort on the parallel arrays);
      # after the inner loop, position `i' holds its final entry.
      for (i = 0; i < ndups; i++)
        { for (j = i+1; j < ndups; j++)
            { sgn = str_cmp(pf[i], pf[j]);
              if (sgn > 0)
                { tmp = ps[i]; ps[i] = ps[j]; ps[j] = tmp;
                  tmp = pf[i]; pf[i] = pf[j]; pf[j] = tmp;
                }
            }
          printf " %s %s \\\n", \
            ps[i], pf[i];
        }
      printf " -end\n\n";
    }
  ndups = 0;
}

function data_warning(msg)
{
  # Reports `msg' on stderr, prefixed with the current input line number.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
}