#! /usr/bin/gawk -f
# Last edited on 2011-10-07 22:29:14 by stolfi

# Reads a list of groups of possibly duplicated files from standard input
# and writes to standard output a list of shell commands that remove
# the redundant copies.
#
# Input format, as parsed by the rules below:
#   * blank lines and lines whose first non-blank character is "#" are skipped;
#   * a line starting with "dups " opens a group;
#   * each file record has exactly 9 blank-separated fields, the last one
#     being a backslash; field 4 is the directory index, fields 5, 6 and 7
#     are joined with "/" to form the file name, field 8 is the inode;
#   * a line consisting of "-end" closes the group.

BEGIN {
  usage = ( ARGV[0] \
    " [-v command=COMMAND] \\\n" \
    " [-v pairsOnly=BOOL] \\\n" \
    " [-v makeLinks=BOOL] \\\n" \
    " [-v allFiles=BOOL] \\\n" \
    " < DUPLIST > REMLIST" \
  );

  # Takes the output of find-possibly-dup-files and turns it into
  # a list of "/bin/rm" commands for all but the last copy of every file.
  #
  # Considers only "dups" entries with at least two files
  # with distinct inodes.  If {pairsOnly} is specified,
  # requires exactly two files from different directories (1 and 2).
  #
  # The {command}, if given, is used instead of "/bin/rm".
  #
  # If {allFiles} is true, issues commands for all files in each
  # entry, including the last one.
  #
  # If {makeLinks} is true, after each "/bin/rm" command issues
  # one or more commands that create symbolic links from the removed files
  # to the surviving one.  This option is incompatible with {allFiles}.

  # {abort} is negative while everything is fine; otherwise it is the exit status.
  abort = -1;

  # Number of file records collected so far for the current group.
  ndups = 0;

  if (pairsOnly == "") { pairsOnly = 0; }
  if (allFiles == "") { allFiles = 0; }
  if (makeLinks == "") { makeLinks = 0; }
  if (command == "") { command = "/bin/rm"; }
  if (allFiles && makeLinks) {
    arg_error("cannot set both \"allFiles\" and \"makeLinks\"");
  }

  # Per-group tables, indexed from 0 to {ndups-1}:
  split("", f);      # File names.
  split("", idir);   # Directory indices.
  split("", inode);  # Inode numbers.
}

# Stop as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines and comments.
/^ *([#]|$)/ { next; }

# Start of a group: the previous one must have been closed.
/^dups / {
  if (ndups != 0) { data_error(("missing \"-end\"")); }
  next;
}

# End of a group: decide what to do with the collected files.
/^ *-end *$/ {
  if (ndups < 2) {
    data_error("single-file dups");
  } else if (all_inodes_are_equal()) {
    data_warning("same inode, ignored");
  } else if (pairsOnly && (ndups > 2)) {
    data_warning("more than two files, ignored");
  } else if (pairsOnly && ((idir[0] != 1) || (idir[1] != 2))) {
    data_warning("wrong directories, ignored");
  } else {
    # One command line that names every file to be removed:
    printf "%s", command;
    nr = (allFiles ? ndups : ndups-1);
    for (k = 0; k < nr; k++) { printf " %s", shell_quote(f[k]); }
    printf "\n";
    if (! allFiles) {
      # Record the surviving copy as a shell comment:
      printf "# %s\n", shell_quote(f[ndups-1]);
      if (makeLinks) {
        # Replace each removed file by a symbolic link to the survivor:
        for (k = 0; k < nr; k++) {
          dst = create_link_target(f[ndups-1], f[k]);
          printf "/bin/ln -s %s %s\n", shell_quote(dst), shell_quote(f[k]);
        }
      }
    }
  }
  ndups = 0;
  next;
}

function shell_quote(s) {
  # Returns {s} enclosed in single quotes for use as one shell word.
  # Every embedded single quote is rewritten as the four characters
  # quote-backslash-quote-quote, so that file names containing quotes
  # cannot break or alter the generated commands.
  gsub(/'/, "'\\\\''", s);
  return ("'" s "'");
}

function create_link_target(dst,name,   dir) {
  # Returns the text to be stored in a symbolic link located at {name}
  # so that it points to the file {dst}.  The result is an absolute path
  # if either argument is (or becomes) absolute, otherwise it is a path
  # relative to the directory of {name}.

  # If {dst} and/or {name} starts with "..", convert it to absolute:
  dst = remove_dot_steps(dst);
  name = remove_dot_steps(name);

  # If {name} is absolute, make sure {dst} is absolute too.
  # A {dst} that is already absolute must be left alone; prefixing
  # it with the current directory would yield a bogus target.
  if ((name ~ /^[\/]/) && (dst !~ /^[\/]/)) { dst = (ENVIRON["PWD"] "/" dst); }

  # If {dst} is absolute, return it:
  if (dst ~ /^[\/]/) { return dst; }

  # Now both paths are relative.  Strip common prefix:
  while ( \
    match(dst, /^[^\/]+[\/]/) && \
    (substr(dst, 1, RLENGTH) == substr(name, 1, RLENGTH)) \
  ) {
    dst = substr(dst, 1+RLENGTH);
    name = substr(name, 1+RLENGTH);
  }

  # Now make {dst} relative to the directories of {name}:
  # one "../" for each directory level that remains in {name}.
  dir = "";
  while (match(name, /^[^\/]+[\/]/)) {
    name = substr(name, 1+RLENGTH);
    dir = ("../" dir);
  }
  return (dir dst);
}

function remove_dot_steps(path,   prev) {
  # Returns {path} with "." steps, ".." steps and repeated slashes
  # removed.  A path that begins with ".." is first made absolute by
  # prefixing it with the {PWD} environment variable.

  # Remove "./" components.  Consecutive "./" steps share a slash,
  # so a single global substitution may leave some behind; repeat
  # until nothing changes.
  do {
    prev = path;
    path = gensub(/(^|[\/])[.]([\/]|$)/, "\\1", "g", path);
  } while (path != prev);

  # Remove "../" components, each together with the step before it:
  while (match(path, /(^|[\/])[.][.]([\/]|$)/)) {
    if (path ~ /^[.][.]([\/]|$)/) { path = ( ENVIRON["PWD"] "/" path ); }
    prev = path;
    path = gensub(/(^|[\/])([^\/]*)[\/][.][.]([\/]|$)/, "\\1", "g", path);
    # Give up if a ".." step cannot be resolved (e.g. {PWD} is not set),
    # instead of looping forever:
    if (path == prev) { break; }
  }

  # Replace any run of slashes by a single "/":
  path = gensub(/[\/][\/]+/, "/", "g", path);
  return path;
}

function all_inodes_are_equal(   i) {
  # Returns 1 if all files of the current group have the same inode
  # as the first one, 0 otherwise.
  for (i = 1; i < ndups; i++) {
    if (inode[0] != inode[i]) { return 0; }
  }
  return 1;
}

# Anything else must be a file record: 9 fields, ending with a backslash.
((NF != 9) || ($(NF) != "\\")) { data_error("bad record format"); next; }

# Valid file record: append it to the current group.
{
  filename = ($5 "/" $6 "/" $7);
  gsub(/[\/][.][\/]/, "/", filename);
  f[ndups] = filename;
  idir[ndups] = $4;
  inode[ndups] = $8;
  ndups++;
}

END {
  # Propagate the status of an earlier error:
  if (abort >= 0) { exit abort; }
  # The last group must have been closed:
  if (ndups != 0) { data_error(("missing final \"-end\"")); }
}

function data_warning(msg) {
  # Reports a non-fatal problem at the current input line to standard error.
  printf "warning: line %d: %s\n", FNR, msg > "/dev/stderr";
}

function data_error(msg) {
  # Reports a fatal problem at the current input line to standard error
  # and terminates the script with exit status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function arg_error(msg) {
  # Reports an invalid combination of options to standard error, prints
  # the usage message, and terminates the script with exit status 1.
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}