#! /usr/bin/gawk -f
# Needs -f eva2erg.gawk
# Last edited on 1999-01-05 22:04:00 by stolfi

# Adds a match key for bringing together "similar" phrases
# in badly spellt Engllishe toungge.  Based on phonetic similarity.
#
#    cat INFILE \
#      | add-lac-match-key -f eva2erg.gawk \
#          [ -v inField=IFLDNUM ] \
#          [ -v outField=OFLDNUM ] \
#      > OUTFILE
#
# This script reads from stdin one or more records that contain an
# Engllishe word KEY. It outputs the same records, each augmented with a
# "reduced" version RKEY of the same word.
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default), and the RKEY is inserted as field
# OFLDNUM (first by default).

function error(msg)
{
  # Reports a fatal error on stderr, prefixed with the current input
  # record number (NR), sets the global "abort" flag, and terminates
  # the program with a non-zero exit status so that the calling
  # pipeline can detect the failure.
  #
  #   msg   the message text; trailing newlines are ignored, since
  #         the output format supplies the line terminator itself.

  sub(/\n+$/, "", msg);
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function recode_text(str,    singles, n, i)
{
  # Reduces the text "str" to a crude phonetic key, so that variant
  # spellings of the same word tend to produce the same result.
  #
  #   str       the text to reduce; presumably free of embedded "{}"
  #             comments (the caller strips them first -- confirm).
  #   singles, n, i   local variables only; callers must not pass them.
  #
  # Returns the reduced key, which is never empty ("o" is returned
  # when nothing else is left).
  #
  # The rules below are applied strictly in sequence, and several of
  # them depend on the output of earlier ones, so their order matters.
  # While the rules run, "." is used as the word separator.

  # Delete EVMT fillers (and blanks):
  str = gensub(/[!% ]/, "", "g", str);

  # Turn every run of separators into a single ".", and make sure
  # the text both starts and ends with one:
  str = gensub(/[-/=,.]+/,   ".", "g", str);
  str = gensub(/^[-/=,.]*/,  ".", "g", str);
  str = gensub(/[-/=,.]*$/,  ".", "g", str);

  # Letter "x" by its "ks" sound (from here on, "x" is free to be
  # used as the symbol for the "sh" sound):
  str = gensub(/x/,       "ks",  "g", str);

  # Letters "sch", "ch", "sh" by "x".  The "sch" group must be handled
  # first: otherwise its "ch" is rewritten before the "sch" rule runs,
  # and "sch" ends up as "sx" instead of "x".
  str = gensub(/sch/,     "x",  "g", str);
  str = gensub(/[cs]h/,   "x",  "g", str);

  # Group "ph" becomes "f":
  str = gensub(/ph/,    "f",  "g", str);

  # Group "wr" becomes "r":
  str = gensub(/wr/,    "r",  "g", str);

  # Letter "f" often sounds as "v" and fice-fersa:
  str = gensub(/v/,       "f",  "g", str);

  # Group "ight" is equivalent to "ite"
  str = gensub(/ight/,    "ite",  "g", str);

  # Group "ought" is equivalent to "aut"
  str = gensub(/ought/,    "aut",  "g", str);

  # Otherwise the group "gh" is soundless except before vowels
  str = gensub(/gh([.bcdfghjklmnpqrstvwxz])/,    "\\1",  "g", str);

  # Otherwise the non-initial letter "h" is often soundless:
  str = gensub(/([^.])h/,       "\\1",  "g", str);

  # Letters "qu" and "gu" are "k" and "g", usually:
  str = gensub(/qu/,    "k",  "g", str);
  str = gensub(/gu([aeiou])/,    "g\\1",  "g", str);

  # Soften "c" before "e" and "i".  This has to happen before "c" and
  # "k" are merged below, because after that step no "c" is left.
  str = gensub(/[s][c]([ei])/,  "s\\1", "g", str);
  str = gensub(/[c]([ei])/,     "s\\1", "g", str);

  # Remove duplicated letters, merging the pairs c/k, m/n and s/z.
  # (No rule is needed for "v": every "v" has already become "f".)
  str = gensub(/[ck]+/,     "k",  "g", str);
  str = gensub(/[mn]+/,     "n",  "g", str);
  str = gensub(/[sz]+/,     "s",  "g", str);
  n = split("b d f g l p r t", singles, " ");
  for (i = 1; i <= n; i++)
    { str = gensub(singles[i] "+", singles[i], "g", str); }

  # Soften "g" before "e" and "i":
  # NOTE(review): "gue"/"gui" were reduced to "ge"/"gi" above, so they
  # are softened to "j" here as well -- confirm that this is intended.
  str = gensub(/[g]([ei])/,     "j\\1", "g", str);

  # Remove plural endings, possessives, 3rd person:
  str = gensub(/[i][e][s][.]/,      "y.",     "g", str);
  str = gensub(/([^.])[e][s][.]/,   "\\1.",   "g", str);
  str = gensub(/([^.])[s][.]/,   "\\1.",   "g", str);

  # Remove past tense endings:
  str = gensub(/([^.])[ei][dt][.]/, "\\1.",  "g", str);
  str = gensub(/([^.])[d][.]/,      "\\1.",  "g", str);

  # Remove present continuous endings:
  str = gensub(/([^.])ing[.]/, "\\1.",  "g", str);

  # Remove final "e", if not alone:
  str = gensub(/([^.])[e][.]/,      "\\1.",   "g", str);

  # The "pt" group is reduced to "p".
  # NOTE(review): the original comment said that "pt" may be spelled
  # "t", which suggests "t" as the replacement -- confirm which one
  # is intended.
  str = gensub(/pt/,    "p",  "g", str);

  # Groups "ai", "ay", "ea" becomes "e":
  str = gensub(/a[iy]/,   "e", "g", str);
  str = gensub(/ea/,      "e", "g", str);

  # Collapse back vowels (with any adjacent front ones):
  str = gensub(/[eiy]*[oawu]+[oaueiwy]*/,   "o", "g", str);

  # Delete back vowels between consonants.  Matches cannot overlap, so
  # a second pass is needed for patterns that share a consonant:
  str = gensub(/([b-df-hj-np-tvxz])o([b-df-hj-np-tvxz])/,   "\\1\\2", "g", str);
  str = gensub(/([b-df-hj-np-tvxz])o([b-df-hj-np-tvxz])/,   "\\1\\2", "g", str);

  # Collapse front vowels to "e"
  str = gensub(/[eiy]+/,   "e", "g", str);

  # Ensure no words have disappeared (two passes, since adjacent
  # empty words share a separator):
  str = gensub(/[.][.]/,   ".o.", "g", str);
  str = gensub(/[.][.]/,   ".o.", "g", str);

  # Delete all separators
  str = gensub(/[.]/,  "", "g", str);

  # Guard against empty string:
  if (str == "") { str = "o"; }

  return str;
}

function printout(mw, fn,  i)
{
  # Prints the current record with "mw" inserted as field number "fn";
  # the existing fields from "fn" onwards are shifted one place right.
  #
  #   mw   the text to insert.
  #   fn   the position of the new field, from 1 to NF+1.
  #   i    local loop index only; callers must not pass it.
  #
  # Calls error() if the record has fewer than fn-1 fields.
  #
  # When "mw" goes first or last, $0 is printed as it stands; otherwise
  # the record is rebuilt field by field, with OFS between the fields.

  if (NF < fn-1) { error("not enough output fields\n"); }
  if (fn == 1)
    { print mw, $0; }
  else if (fn == NF+1)
    { print $0, mw; }
  else
    { for (i = 1; i < fn; i++) { printf "%s%s", $(i), OFS; }
      printf "%s", mw;
      for (i = fn; i <= NF; i++) { printf "%s%s", OFS, $(i); }
      # Terminate the record, as "print" does in the other two cases;
      # without this, successive records would run together.
      printf "%s", ORS;
    }
}

BEGIN {
  # Both field numbers default to 1 when they were not given
  # on the command line (with "-v").
  if (outField == "") { outField = 1; }
  if (inField == "") { inField = 1; }

  # No fatal error has been reported yet.
  abort = 0;
}

/./ {
  # Handles every non-empty input record: takes the word in field
  # "inField", strips it with erg_erase_comments() (not defined in
  # this file), reduces it with recode_text(), and prints the record
  # with the result inserted as field "outField".
  if (abort) { exit; }
  if (inField > NF) { error("not enough input fields\n"); }
  printout(recode_text(erg_erase_comments($inField)),
           outField);
  next;
}