#! /usr/bin/gawk -f
# Last edited on 2003-09-10 23:21:52 by stolfi

# Groups consecutive input records that share the same relevant
# attributes and prints every group of two or more records as one
# "dups ... -end" block on standard output.
#
# To be used as a filter by find-possibly-dup-files.
#
# Input records should contain
#   SIZE.CHECKSUM YYYY-MM-DD HH:MM:SS IDIR ROOTDIR RELPATH FILENAME INODE
# The file must be sorted so that the entries whose relevant attributes
# are the same occur next to each other.
#
# Options (set with "-v NAME=BOOL"; all default to 0):
#   ignoreDate  if true, the date need not match within a group
#   ignoreTime  if true, the time need not match within a group
#   ignorePath  if true, the relative path need not match within a group
#   ignoreName  if true, the file name need not match within a group
# The SIZE.CHECKSUM field must always match.

BEGIN {
  usage = ( \
    ARGV[0] " \\\n" \
    " [-v ignoreDate=BOOL] \\\n" \
    " [-v ignoreTime=BOOL] \\\n" \
    " [-v ignorePath=BOOL] \\\n" \
    " [-v ignoreName=BOOL] < IN > OUT" \
  );
  abort = -1;

  if (ignoreDate == "") { ignoreDate = 0; } # Date must be the same
  if (ignoreTime == "") { ignoreTime = 0; } # Time must be the same
  if (ignorePath == "") { ignorePath = 0; } # Rel path must be the same
  if (ignoreName == "") { ignoreName = 0; } # Filename must be the same

  # Parallel arrays holding the entries of the current batch, indexed
  # from 0 to ndups-1.  Index -1 holds an empty sentinel so that the
  # first record of the input never matches a "previous" entry.
  split("", ps); ps[-1] = ""; # Size and checksum
  split("", pd); pd[-1] = ""; # Day
  split("", pt); pt[-1] = ""; # Time
  split("", pk); pk[-1] = ""; # Sequence of root directory
  split("", ph); ph[-1] = ""; # Root directory
  split("", pp); pp[-1] = ""; # Root-relative path of subdirectory
  split("", pf); pf[-1] = ""; # File name in subdirectory
  split("", pi); pi[-1] = ""; # Inode
  ndups = 0;
}

# Stop right away if an error code was set.
(abort >= 0) { exit abort; }

# Skip blank lines and comment lines.
/^ *([#]|$)/ { next; }

# Reject records that do not have exactly eight fields.
(NF != 8) {
  data_warning(("bad format, ignored [" $0 "]"));
  next;
}

# Main rule: append the record to the current batch, flushing the batch
# first when the record differs from the last stored entry in any
# relevant attribute.
/./ {
  # The keys are compared as text.  A bare "==" between input fields
  # compares numeric-looking values as numbers, which would make
  # e.g. "1234.5" and "1234.50" (or file names "1.0" and "1.00") match.
  same = same_text($1, ps[ndups-1]);
  if (! ignoreDate) { same = same && same_text($2, pd[ndups-1]); }
  if (! ignoreTime) { same = same && same_text($3, pt[ndups-1]); }
  if (! ignorePath) { same = same && same_text($6, pp[ndups-1]); }
  if (! ignoreName) { same = same && same_text($7, pf[ndups-1]); }
  if (! same) { flush_batch(); }

  ps[ndups] = $1;
  pd[ndups] = $2;
  pt[ndups] = $3;
  pk[ndups] = $4;
  ph[ndups] = $5;
  pp[ndups] = $6;
  pf[ndups] = $7;
  pi[ndups] = $8;
  ndups++;
}

END {
  flush_batch();
}

function same_text(a, b)
{
  # Returns 1 if {a} and {b} are identical as character strings, else 0.
  # Concatenating "" turns both operands into plain strings, so the
  # comparison is never done numerically.
  return ((a "") == (b ""));
}

function entry_order(i, j)
{
  # Compares batch entries {i} and {j} by root sequence, path, name,
  # date, time (in that order).  Returns -1, 0 or +1.  Uses awk's
  # native comparison, so numeric-looking values are ordered as numbers.
  if (pk[i] != pk[j]) { return (pk[i] < pk[j] ? -1 : 1); }
  if (pp[i] != pp[j]) { return (pp[i] < pp[j] ? -1 : 1); }
  if (pf[i] != pf[j]) { return (pf[i] < pf[j] ? -1 : 1); }
  if (pd[i] != pd[j]) { return (pd[i] < pd[j] ? -1 : 1); }
  if (pt[i] != pt[j]) { return (pt[i] < pt[j] ? -1 : 1); }
  return 0;
}

function swap_entries(i, j,   tmp)
{
  # Exchanges batch entries {i} and {j} in all eight parallel arrays.
  tmp = ps[i]; ps[i] = ps[j]; ps[j] = tmp;
  tmp = pd[i]; pd[i] = pd[j]; pd[j] = tmp;
  tmp = pt[i]; pt[i] = pt[j]; pt[j] = tmp;
  tmp = pk[i]; pk[i] = pk[j]; pk[j] = tmp;
  tmp = ph[i]; ph[i] = ph[j]; ph[j] = tmp;
  tmp = pp[i]; pp[i] = pp[j]; pp[j] = tmp;
  tmp = pf[i]; pf[i] = pf[j]; pf[j] = tmp;
  tmp = pi[i]; pi[i] = pi[j]; pi[j] = tmp;
}

function flush_batch(   i,j,tmp,sgn)
{
  # Prints the current batch, if it has two or more entries, as a
  # "dups" block with one entry per line, then empties the batch.
  # Batches with a single entry are discarded silently.
  if (ndups > 1) {
    printf "dups \\\n";
    # Sort matched entries by iseq, path, name, date, time.  After the
    # inner loop, entry {i} is the smallest of entries {i..ndups-1},
    # so it can be printed immediately.
    for (i = 0; i < ndups; i++) {
      for (j = i+1; j < ndups; j++) {
        sgn = entry_order(i, j);
        if (sgn > 0) { swap_entries(i, j); }
      }
      printf " %s %s %s %s %s %s %s %s \\\n", \
        ps[i], pd[i], pt[i], pk[i], ph[i], pp[i], pf[i], pi[i];
    }
    printf " -end\n\n";
  }
  ndups = 0;
}

function data_warning(msg)
{
  # Writes {msg} to standard error, prefixed with the input line number.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
}