#! /usr/bin/gawk -f
# Last edited on 1998-09-20 03:50:52 by stolfi
BEGIN {
abort = -1;
usage = ( \
"perturb-words \\\n" \
" -v stringField=NUM \\\n" \
" -v distField=NUM \\\n" \
" [ -v increment=NUM ] \\\n" \
" [ -v replaceChars=CHARS ] \\\n" \
" [ -v deleteChars=CHARS ] \\\n" \
" [ -v insertChars=CHARS ] \\\n" \
" [ -v dupOnly=BOOL ] \\\n" \
" < INFILE > OUTFILE" \
);
#
# Field number "stringField" of each input record must contain a
# non-empty string over the given alphabet plus "_". Field number
# "distField" must be a non-negative integer.
#
# For each input record, this script enumerates all strings that
# differ from field "stringField" by one edit step (insertion,
# deletion, or replacement of a letter by another from the same
# class). Note that the same perturbed string may be generated
# more than once.
#
# Each input record is copied verbatim to the output.
# One additional record, in the same format, is written for each of the
# "perturbed" strings, with field "distField" incremented by the
# given "increment" (default 1).
#
# The "_" character is used as a null, to avoid empty strings.
# It is deleted from the input string, before the perturbation step,
# and substituted for any perturbed string that happens to be empty.
#
# The "replaceChars" string specifies one or more sets of characters
# that can be substituted for each other, separated by "/"s. The
# classes need not be disjoint. E.g., "aeiouyw/td/pb/kcg/sz/yjx/mn/lrs/wvf"
# says that any of the letters { a, e, i, o, u, y w } can be replaced by
# any other letter of that same set; but { w v f }. If this
# parameter is not specified, no substitutions will be performed.
#
# The "insertChars" string specifies a list of characters that can
# be inserted. Any of these characters can be inserted at any
# position. If this parameter is not specified, no insertions will
# be performed.
#
# The "deleteChars" string specifies a list of characters that can
# be deleted. Any of these characters can be deleted from any
# position. If this parameter is not specified, no deletions will
# be performed.
#
# If "dupOnly" is set, deletion is restricted to adjacent duplicated
# letters (e.g. "address" -> { "adress", "addres" }), and insertion
# is confined to duplication of existing letters
# (e.g. "fall" -> { "ffall", "faall", "falll", "falll" })
# Thus general insertions and deletions will become 2-step operations.
# In any case, the "insertChars" and "deleteChars" strings will
# limit the characters that can be inserted or deleted.
#
if (stringField == "") { arg_error("must define \"stringField\""); }
if (distField == "") { arg_error("must define \"distField\""); }
if (increment == "") { increment = 1; }
if (dupOnly == "") { dupOnly = 0; }
#
# Build the table "simil[ch]" with the charactes that can be
# substituted for character "ch".
#
nReplace = split(replaceChars, class, "/");
split("", simil);
for (icl in class)
{ cl = class[icl];
nc = length(cl);
for(i=1;i<=nc;i++)
{ x = substr(cl,i,1);
for(j=1;j<i;j++)
{ y = substr(cl,j,1);
if ((x != y) && (index(simil[x],y)==0)) simil[x] = (simil[x] y);
if ((y != x) && (index(simil[y],x)==0)) simil[y] = (simil[y] x);
}
}
}
nInsert = length(insertChars);
nDelete = length(deleteChars);
}
/./ {
if (abort >= 0) { exit abort; }
if ((NF < distField) || (NF < stringField)) { error(("line " NR ": bad format")); }
# Copy the input record:
printf "%s\n", $0;
# Generate the perturbed versions:
w = $(stringField); gsub(/[_]/, "", w);
d = $(distField);
n = length(w);
# "p[0..m-1]" are the mutated words:
split("", p); m = 0;
# Letter substitution:
if (nReplace > 0)
{
for(i=1;i<=n;i++)
{ pre = substr(w,1,i-1);
let = substr(w,i,1);
pos = substr(w,i+1,n-i);
if (let in simil)
{ reps = simil[let];
nreps = length(reps);
for(j=1; j<=nreps; j++)
{ rep = substr(reps, j, 1);
if (rep == let) { error(("line " NR ": inconsistent simil[]")); }
p[m] = (pre rep pos ); m++;
}
}
}
}
if (nDelete > 0)
{ if (dupOnly)
{ # Deletion of duplicate letters:
for(i=1;i<n;i++)
{ let = substr(w,i,1); nex = substr(w,i+1,1);
if((let == nex) && (index(deleteChars, let) != 0))
{ p[m] =(substr(w,1,i-1) substr(w,i+1)); m++; }
}
}
else
{ # Deletion of specified letters:
for(i=1;i<=n;i++)
{ let = substr(w,i,1);
if(index(deleteChars, let) != 0)
{ p[m] =(substr(w,1,i-1) substr(w,i+1)); m++; }
}
}
}
if (nInsert > 0)
{ if(dupOnly)
{ # Duplication of letters:
for(i=0;i<n;i++)
{ pre = substr(w,1,i);
pos = substr(w,i+1,n-i);
let = substr(w,i+1,1);
if (index(insertChars, let) != 0)
{ p[m] = (pre let pos); m++; }
}
}
else
{
# Insertion of specified letters:
for(i=0;i<=n;i++)
{ pre = substr(w,1,i);
pos = substr(w,i+1,n-i);
for(k=1;k<=nInsert;k++)
{ let = substr(insertChars,k,1);
p[m] = (pre let pos); m++;
}
}
}
}
# Increment distance:
$(distField) += increment;
# Output strings:
for(k=0;k<m;k++)
{ $(stringField) = p[k];
if ($(stringField) == "") { $(stringField) = "_"; }
printf "%s\n", $0;
}
}
function error(msg)
{
printf "*** %s\n", msg > "/dev/stderr";
abort = 1;
exit 1
}
function arg_error(msg)
{
printf "*** %s\n", msg > "/dev/stderr";
printf "usage: %s\n", msg > "/dev/stderr";
abort = 1;
exit 1
}