#! /usr/bin/gawk -f # Last edited on 1998-09-20 03:50:52 by stolfi BEGIN { abort = -1; usage = ( \ "perturb-words \\\n" \ " -v stringField=NUM \\\n" \ " -v distField=NUM \\\n" \ " [ -v increment=NUM ] \\\n" \ " [ -v replaceChars=CHARS ] \\\n" \ " [ -v deleteChars=CHARS ] \\\n" \ " [ -v insertChars=CHARS ] \\\n" \ " [ -v dupOnly=BOOL ] \\\n" \ " < INFILE > OUTFILE" \ ); # # Field number "stringField" of each input record must contain a # non-empty string over the given alphabet plus "_". Field number # "distField" must be a non-negative integer. # # For each input record, this script enumerates all strings that # differ from field "stringField" by one edit step (insertion, # deletion, or replacement of a letter by another from the same # class). Note that the same perturbed string may be generated # more than once. # # Each input record is copied verbatim to the output. # One additional record, in the same format, is written for each of the # "perturbed" strings, with field "distField" incremented by the # given "increment" (default 1). # # The "_" character is used as a null, to avoid empty strings. # It is deleted from the input string, before the perturbation step, # and substituted for any perturbed string that happens to be empty. # # The "replaceChars" string specifies one or more sets of characters # that can be substituted for each other, separated by "/"s. The # classes need not be disjoint. E.g., "aeiouyw/td/pb/kcg/sz/yjx/mn/lrs/wvf" # says that any of the letters { a, e, i, o, u, y w } can be replaced by # any other letter of that same set; but { w v f }. If this # parameter is not specified, no substitutions will be performed. # # The "insertChars" string specifies a list of characters that can # be inserted. Any of these characters can be inserted at any # position. If this parameter is not specified, no insertions will # be performed. # # The "deleteChars" string specifies a list of characters that can # be deleted. Any of these characters can be deleted from any # position. If this parameter is not specified, no deletions will # be performed. # # If "dupOnly" is set, deletion is restricted to adjacent duplicated # letters (e.g. "address" -> { "adress", "addres" }), and insertion # is confined to duplication of existing letters # (e.g. "fall" -> { "ffall", "faall", "falll", "falll" }) # Thus general insertions and deletions will become 2-step operations. # In any case, the "insertChars" and "deleteChars" strings will # limit the characters that can be inserted or deleted. # if (stringField == "") { arg_error("must define \"stringField\""); } if (distField == "") { arg_error("must define \"distField\""); } if (increment == "") { increment = 1; } if (dupOnly == "") { dupOnly = 0; } # # Build the table "simil[ch]" with the charactes that can be # substituted for character "ch". # nReplace = split(replaceChars, class, "/"); split("", simil); for (icl in class) { cl = class[icl]; nc = length(cl); for(i=1;i<=nc;i++) { x = substr(cl,i,1); for(j=1;j<i;j++) { y = substr(cl,j,1); if ((x != y) && (index(simil[x],y)==0)) simil[x] = (simil[x] y); if ((y != x) && (index(simil[y],x)==0)) simil[y] = (simil[y] x); } } } nInsert = length(insertChars); nDelete = length(deleteChars); } /./ { if (abort >= 0) { exit abort; } if ((NF < distField) || (NF < stringField)) { error(("line " NR ": bad format")); } # Copy the input record: printf "%s\n", $0; # Generate the perturbed versions: w = $(stringField); gsub(/[_]/, "", w); d = $(distField); n = length(w); # "p[0..m-1]" are the mutated words: split("", p); m = 0; # Letter substitution: if (nReplace > 0) { for(i=1;i<=n;i++) { pre = substr(w,1,i-1); let = substr(w,i,1); pos = substr(w,i+1,n-i); if (let in simil) { reps = simil[let]; nreps = length(reps); for(j=1; j<=nreps; j++) { rep = substr(reps, j, 1); if (rep == let) { error(("line " NR ": inconsistent simil[]")); } p[m] = (pre rep pos ); m++; } } } } if (nDelete > 0) { if (dupOnly) { # Deletion of duplicate letters: for(i=1;i<n;i++) { let = substr(w,i,1); nex = substr(w,i+1,1); if((let == nex) && (index(deleteChars, let) != 0)) { p[m] =(substr(w,1,i-1) substr(w,i+1)); m++; } } } else { # Deletion of specified letters: for(i=1;i<=n;i++) { let = substr(w,i,1); if(index(deleteChars, let) != 0) { p[m] =(substr(w,1,i-1) substr(w,i+1)); m++; } } } } if (nInsert > 0) { if(dupOnly) { # Duplication of letters: for(i=0;i<n;i++) { pre = substr(w,1,i); pos = substr(w,i+1,n-i); let = substr(w,i+1,1); if (index(insertChars, let) != 0) { p[m] = (pre let pos); m++; } } } else { # Insertion of specified letters: for(i=0;i<=n;i++) { pre = substr(w,1,i); pos = substr(w,i+1,n-i); for(k=1;k<=nInsert;k++) { let = substr(insertChars,k,1); p[m] = (pre let pos); m++; } } } } # Increment distance: $(distField) += increment; # Output strings: for(k=0;k<m;k++) { $(stringField) = p[k]; if ($(stringField) == "") { $(stringField) = "_"; } printf "%s\n", $0; } } function error(msg) { printf "*** %s\n", msg > "/dev/stderr"; abort = 1; exit 1 } function arg_error(msg) { printf "*** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", msg > "/dev/stderr"; abort = 1; exit 1 }