#! /usr/bin/gawk -f
# Last edited on 1998-09-20 03:50:52 by stolfi

BEGIN { 
  abort = -1;
  usage = ( \
    "perturb-words \\\n" \
    "  -v stringField=NUM \\\n" \
    "  -v distField=NUM \\\n" \
    "  [ -v increment=NUM ] \\\n" \
    "  [ -v replaceChars=CHARS ] \\\n" \
    "  [ -v deleteChars=CHARS ] \\\n" \
    "  [ -v insertChars=CHARS ] \\\n" \
    "  [ -v dupOnly=BOOL ] \\\n" \
    "  < INFILE > OUTFILE" \
  );
  #
  # Field number "stringField" of each input record must contain a
  # non-empty string over the given alphabet plus "_".  Field number
  # "distField" must be a non-negative integer.
  #
  # For each input record, this script enumerates all strings that
  # differ from field "stringField" by one edit step (insertion,
  # deletion, or replacement of a letter by another from the same
  # class).  Note that the same perturbed string may be generated 
  # more than once. 
  #
  # Each input record is copied verbatim to the output.
  # One additional record, in the same format, is written for each of the
  # "perturbed" strings, with field "distField" incremented by the 
  # given "increment" (default 1). 
  #
  # The "_" character is used as a null, to avoid empty strings.
  # It is deleted from the input string, before the perturbation step,
  # and substituted for any perturbed string that happens to be empty.
  #
  # The "replaceChars" string specifies one or more sets of characters
  # that can be substituted for each other, separated by "/"s. The
  # classes need not be disjoint.  E.g., "aeiouyw/td/pb/kcg/sz/yjx/mn/lrs/wvf"
  # says that any of the letters { a, e, i, o, u, y w } can be replaced by 
  # any other letter of that same set; but { w v f }.  If this 
  # parameter is not specified, no substitutions will be performed.
  #
  # The "insertChars" string specifies a list of characters that can
  # be inserted.  Any of these characters can be inserted at any
  # position. If this parameter is not specified, no insertions will
  # be performed.
  #
  # The "deleteChars" string specifies a list of characters that can
  # be deleted.  Any of these characters can be deleted from any
  # position. If this parameter is not specified, no deletions will
  # be performed.
  #
  # If "dupOnly" is set, deletion is restricted to adjacent duplicated
  # letters (e.g. "address" -> { "adress", "addres" }), and insertion
  # is confined to duplication of existing letters 
  # (e.g. "fall" -> { "ffall", "faall", "falll", "falll" })
  # Thus general insertions and deletions will become 2-step operations.
  # In any case, the "insertChars" and "deleteChars" strings will 
  # limit the characters that can be inserted or deleted.
  # 
  if (stringField == "") { arg_error("must define \"stringField\""); }
  if (distField == "") { arg_error("must define \"distField\""); }
  if (increment == "") { increment = 1; }
  if (dupOnly == "") { dupOnly = 0; }
  #
  # Build the table "simil[ch]" with the charactes that can be
  # substituted for character "ch".
  #
  nReplace = split(replaceChars, class, "/");
  split("", simil);
  for (icl in class)
    { cl = class[icl];
      nc = length(cl);
      for(i=1;i<=nc;i++) 
        { x = substr(cl,i,1); 
          for(j=1;j<i;j++) 
            { y = substr(cl,j,1); 
              if ((x != y) && (index(simil[x],y)==0)) simil[x] = (simil[x] y);
              if ((y != x) && (index(simil[y],x)==0)) simil[y] = (simil[y] x);
            }
        }
    }
  nInsert = length(insertChars);
  nDelete = length(deleteChars);
}

/./ {
  if (abort >= 0) { exit abort; }
  if ((NF < distField) || (NF < stringField)) { error(("line " NR ": bad format")); }
  # Copy the input record:
  printf "%s\n", $0;
  # Generate the perturbed versions:
  w = $(stringField); gsub(/[_]/, "", w);
  d = $(distField);
  n = length(w);
  # "p[0..m-1]" are the mutated words:
  split("", p); m = 0;
  # Letter substitution:
  if (nReplace > 0)
    { 
      for(i=1;i<=n;i++)
        { pre = substr(w,1,i-1); 
          let = substr(w,i,1); 
          pos = substr(w,i+1,n-i);
          if (let in simil)
            { reps = simil[let];
              nreps = length(reps);
              for(j=1; j<=nreps; j++)
                { rep = substr(reps, j, 1);
                  if (rep == let) { error(("line " NR ": inconsistent simil[]")); }
                  p[m] = (pre rep pos ); m++;
                }
            }
        }
    }
  if (nDelete > 0)
    { if (dupOnly)
        { # Deletion of duplicate letters: 
          for(i=1;i<n;i++)
            { let = substr(w,i,1); nex = substr(w,i+1,1);
              if((let == nex) && (index(deleteChars, let) != 0))
                { p[m] =(substr(w,1,i-1) substr(w,i+1)); m++; }
            }
        }
      else
        { # Deletion of specified letters: 
          for(i=1;i<=n;i++)
            { let = substr(w,i,1);
              if(index(deleteChars, let) != 0)
                { p[m] =(substr(w,1,i-1) substr(w,i+1)); m++; }
            }
        }
    }
  if (nInsert > 0)
    { if(dupOnly)
        { # Duplication of letters:
          for(i=0;i<n;i++)
            { pre = substr(w,1,i); 
              pos = substr(w,i+1,n-i);
              let = substr(w,i+1,1);
              if (index(insertChars, let) != 0)
                { p[m] = (pre let pos); m++; }
            }
        }
      else
        {
          # Insertion of specified letters:
          for(i=0;i<=n;i++)
            { pre = substr(w,1,i); 
              pos = substr(w,i+1,n-i);
              for(k=1;k<=nInsert;k++)
                { let = substr(insertChars,k,1);
                  p[m] = (pre let pos); m++;
                }
            }
        }
    } 

  # Increment distance:
  $(distField) += increment;
  # Output strings:
  for(k=0;k<m;k++)
    { $(stringField) = p[k]; 
      if ($(stringField) == "") { $(stringField) = "_"; }
      printf "%s\n", $0;
    }
}

function error(msg)
{
  printf "*** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function arg_error(msg)
{
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}