#! /usr/bin/gawk -f
# Last edited on 1999-01-31 23:10:38 by stolfi

BEGIN {
  # Command-line synopsis.  Kept for reference; nothing in the visible
  # code prints it.
  usage = "factor-line-OK < INFILE > OUTFILE";

  # Pending exit status.  A negative value means "no error so far";
  # the error routines set it to 1 just before exiting.
  abort = -1;

  # What this filter does
  # ---------------------
  # Lines beginning with "#" are copied to the output unchanged, empty
  # lines produce no output, and every other line is replaced by its
  # factored form, as follows.
  #
  # The line is cut into words at runs of the separator characters
  # "-", "=", "/", ",", "." and ASCII space.  Separators at either end
  # of the line are dropped, and the factored words are joined with a
  # single "." each.  Words are processed independently of each other.
  #
  # Within a word the output alternates between delimited elements and
  # undelimited "O" groups:
  #
  #   * The word always starts with a "{q}" element, or with the dummy
  #     element "{ø}" when the word does not begin with "q".
  #
  #   * Each maximal run of the letters "a", "o", "y" is copied as is.
  #     Where such a run is expected but absent, a "_" is written
  #     instead.  This happens between two consecutive "{}" elements,
  #     right after the leading "{q}"/"{ø}" element, and at the end of
  #     a word that finishes with a "{}" element.
  #
  #   * Every other letter, together with any leading "i"s or one
  #     trailing "e"/"h", is wrapped in "{}".
  #
  # The whole line is always factored: the visible code does not read
  # any field-selection variables such as inField or outField.
}

# Per-record driver.  The tests are made in this order for each input line:
#   1. if an exit status is pending in "abort", stop with that status;
#   2. a line starting with "#" is echoed untouched;
#   3. any other non-empty line is printed in factored form.
# An empty line matches none of the tests and yields no output.
{
  if (abort >= 0) { exit abort; }

  if ($0 ~ /^#/) { print; next; }

  if ($0 ~ /./) { print factor_text($0); next; }
}

function factor_text(x,   out,nwords,words,k,rest,piece,fold,nfold,j)
{
  # Returns the text "x" factored word by word.
  #
  # Parameter:
  #   x  one line of lowercase EVA text.  Word separators are runs of
  #      "-", "=", "/", ",", "." and space.
  # (The remaining names in the parameter list are local variables.)
  #
  # Result: the factored words joined by ".".  Each word begins with
  # "{q}" or the dummy "{ø}", followed by alternating "O" groups (runs
  # of "a", "o", "y", or "_" when the run is empty) and "{...}"
  # elements.  An empty string is returned when "x" holds no words.
  #
  # Calls error(), which terminates the program, if "x" contains a
  # character other than lowercase letters, separators, "*", "?", "%"
  # once the clean-up below has been applied.

  # Clean-up: drop brace-delimited groups (no nesting) and "!"
  # characters; turn "&" + three digits (+ optional ";") and the
  # characters "%", "?", "*" into a plain "?".
  gsub(/[{][^{}]*[}]/, "", x);
  gsub(/[!]/, "", x);
  gsub(/[&][0-9][0-9][0-9][;]?/, "?", x);
  gsub(/[%?*]/, "?", x);
  if (match(x, /[^-=\/,. *?%a-z]/)) { error(("invalid char in word \"" x "\"")); }

  # Fold multi-letter groups into single characters so that the parser
  # below can work one character at a time.  The table is a list of
  # (pattern, replacement) pairs and is applied strictly in this order:
  # three-letter groups must be folded before the two-letter groups
  # that are their prefixes ("ckh" before "ck", and so on).  Since
  # gsub() scans left to right, an "eee" run becomes "Ee".
  nfold = split("ch C sh S ee E" \
                " ckh K cth T cfh F cph P c[?]h ?" \
                " ikh G ith H ifh M iph N i[?]h ?" \
                " ck U ct V cf X cp Y c[?] ?", fold, " ");
  for (j = 1; j < nfold; j += 2) { gsub(fold[j], fold[j+1], x); }

  # Trim separators at both ends, then cut the line into words.
  sub(/^[-=\/,. ]+/, "", x);
  sub(/[-=\/,. ]+$/, "", x);
  nwords = split(x, words, /[-=\/,. ]+/);

  out = "";
  for (k = 1; k <= nwords; k++)
    {
      rest = words[k];

      # Words are joined by a single "."
      if (k > 1) { out = ( out "." ); }

      # Leading element: the initial "q", or a dummy if there is none.
      if (match(rest, /^[q]/))
        { piece = substr(rest, 1, RLENGTH); rest = substr(rest, RLENGTH+1); }
      else
        { piece = "ø"; }
      out = ( out "{" piece "}" );

      # First "O" slot (possibly empty, written as "_").
      if (match(rest, /^[aoy]+/))
        { piece = substr(rest, 1, RLENGTH); rest = substr(rest, RLENGTH+1); }
      else
        { piece = "_"; }
      out = ( out piece );

      # Then repeat: one main letter with its complements, one "O" slot.
      while (rest != "")
        {
          # Main letter: either "i"s plus the letter that follows them,
          # or a letter plus one optional "e"/"h", or any single
          # character that is not "a", "o", "y".
          if (match(rest, /^([i]+[^aoyi]|[^aoyehi][eh]?|[^aoy])/))
            { piece = substr(rest, 1, RLENGTH); rest = substr(rest, RLENGTH+1); }
          else
            { error(("got stuck with \"" rest "\"")); }
          out = ( out "{" piece "}" );

          # "O" slot following the element just written.
          if (match(rest, /^[aoy]+/))
            { piece = substr(rest, 1, RLENGTH); rest = substr(rest, RLENGTH+1); }
          else
            { piece = "_"; }
          out = ( out piece );
        }
    }

  # Undo the folding.  The replacements contain no capital letters, so
  # the pairs do not interact with one another.
  nfold = split("U ck V ct X cf Y cp" \
                " G ikh H ith M ifh N iph" \
                " K ckh T cth P cph F cfh" \
                " C ch S sh E ee", fold, " ");
  for (j = 1; j < nfold; j += 2) { gsub(fold[j], fold[j+1], out); }

  return out;
}

function error(msg)
{
  # Reports a problem with the current input record and stops the program.
  #   msg  text of the diagnostic.
  # Writes "line NR: msg" to standard error, records status 1 in the
  # global "abort", and exits with that status.  Does not return.
  abort = 1;
  printf("line %d: %s\n", NR, msg) >> "/dev/stderr";
  exit abort;
}

function arg_error(msg)
{
  # Reports a problem that is not tied to an input record and stops the
  # program.
  #   msg  text of the diagnostic.
  # Writes "msg" on a line of its own to standard error, records status 1
  # in the global "abort", and exits with that status.  Does not return.
  abort = 1;
  printf("%s\n", msg) >> "/dev/stderr";
  exit abort;
}