#! /usr/bin/gawk -f
# Needs -f eva2erg.gawk
# Last edited on 1999-01-05 22:06:30 by stolfi

# Adds a match key adequate for gathering together "similar" Geez phrases.
#
#    cat INFILE \
#      | add-eno-match-key -f eva2erg.gawk \
#          [ -v inField=IFLDNUM ] \
#          [ -v outField=OFLDNUM ] \
#      > OUTFILE
#
# This script reads from stdin one or more records that contain a Geez
# (classical Ethiopian) word KEY. It outputs the same records, each
# preceded by a "reduced" version RKEY of the same word (and one
# blank).
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default).

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input record
  # number (NR), sets the global flag "abort", and terminates the
  # program with a non-zero exit status.
  #
  # Callers in this file pass messages that already end in "\n"; any
  # trailing newlines are dropped here so that exactly one is printed.
  sub(/\n+$/, "", msg);
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  # A bare "exit" would leave the exit status at 0, hiding the failure
  # from the calling pipeline.  An END rule that later executes a bare
  # "exit" keeps this status.
  exit 1
}

function recode_text(str)
{
  # Reduces the text string "str" (assumed to contain no embedded
  # "{}"s) to a coarse match key and returns it.  The result is never
  # empty: if nothing survives the reduction, "u" is returned.
  #
  # The steps below are order-sensitive; see the notes on each one.

  # Delete EVMT fillers:
  str = gensub(/[!% ]/, "", "g", str);

  # Normalize word separators: every run of [-/=,.] becomes a single
  # ".", and the string is made to start and end with exactly one ".".
  str = gensub(/[-/=,.]+/,   ".", "g", str);
  str = gensub(/^[-/=,.]*/,  ".", "g", str);
  str = gensub(/[-/=,.]*$/,  ".", "g", str);

  # Remove the phonetic markers [`W].  A backquote is removed only when
  # a letter follows it; other backquotes are handled with the digits.
  str = gensub(/W/, "", "g", str);
  str = gensub(/[`]([a-zA-Z])/,  "\\1", "g", str);

  # Collapse all numbers (Arabic and Ethiopian) to "0":
  str = gensub(/[`0-9]+/,  "0", "g", str);

  # Letter "a" is used (rarely) instead of "e".  This runs BEFORE the
  # case folding, so an uppercase "A" is not affected and survives as
  # "a" in the key.
  str = gensub(/a/,  "e", "g", str);

  # Map uppercase to lowercase.  (Any "W" was already removed above.)
  str = tolower(str);

  # Collapse similar(?) consonants:
  str = gensub(/[jyzx]/,  "j", "g", str);
  str = gensub(/[khqgc]/, "k", "g", str);
  str = gensub(/[pb]/,    "b", "g", str);
  str = gensub(/[fvw]/,   "v", "g", str);

  # Collapse similar(?) vowels:
  str = gensub(/[ei]/,    "e", "g", str);
  str = gensub(/[ou]/,    "u", "g", str);

  # Ensure no words have disappeared: a word emptied by the deletions
  # above shows up as "..", which gets a placeholder "u".  Matches do
  # not overlap, so a second pass is needed for runs of three or more
  # separators.
  str = gensub(/[.][.]/,   ".u.", "g", str);
  str = gensub(/[.][.]/,   ".u.", "g", str);

  # Delete the word separators and the syllable separator [']
  str = gensub(/[.']/,  "", "g", str);

  # Guard against empty string:
  if (str == "") { str = "u"; }

  return str;
}

function printout(mw, fn,  i)
{
  # Prints the current record with "mw" inserted as field "$(fn)"; the
  # original fields fn..NF move one position to the right.  "i" is a
  # local loop index, not a real argument.
  #
  # Inserting at the very front or the very end keeps $0 verbatim;
  # inserting in the middle re-joins the fields with OFS.
  if (fn < 1) { error("invalid output field number"); }
  if (NF < fn-1) { error("not enough output fields"); }
  if (fn == 1)
    { print mw, $0; }
  else if (fn == NF+1)
    { print $0, mw; }
  else
    { for (i = 1; i < fn; i++) { printf "%s%s", $(i), OFS; }
      printf "%s", mw;
      for (i = fn; i <= NF; i++) { printf "%s%s", OFS, $(i); }
      # Terminate the record, as "print" does in the other branches.
      printf "%s", ORS;
    }
}

BEGIN {
  # Field numbers not supplied with "-v" default to the first field.
  if (outField == "") { outField = 1; }
  if (inField == "")  { inField = 1; }

  # No error has been reported yet.
  abort = 0;
}

/./ {
  # Runs for every record containing at least one character: computes
  # the reduced key of field "inField" and prints the record with that
  # key inserted as field "outField".
  # NOTE(review): erg_erase_comments is not defined in this file; it
  # presumably comes from eva2erg.gawk -- confirm.
  if (abort) { exit; }
  if (inField > NF) { error("not enough input fields\n"); }
  printout(recode_text(erg_erase_comments($inField)), outField);
  next;
}