# A library of GAWK functions for removing 
# easily confused details from EVA text.
# Last edited on 1999-02-01 00:07:27 by stolfi

# ========================================================================
# Functions for erasing comments:

function erg_erase_string(str)
{
  # Yields a run of "!" characters exactly as long as "str",
  # i.e. "str" with every character overwritten by "!".
  # ("str" is a scalar passed by value, so the caller's copy is untouched.)
  gsub(/./, "!", str);
  return str;
}

function erg_erase_comments(old,   new, i)
{
  # Returns a copy of "old" in which every "{...}" comment (braces
  # included) is overwritten with "!"s, so that the remaining
  # characters keep their original positions.
  # If a "{" does not start a well-formed comment (no closing "}",
  # or another "{" before it), a message with the current NR is
  # written to stderr and the rest of the text is copied unchanged.
  new = "";
  while (length(old) > 0) {
    i = index(old, "{");
    if (i == 0) {
      # No further comments: keep the remainder as is.
      new = (new old);
      break;
    }
    if (i > 1) {
      # Plain text precedes the next "{": copy it through.
      new = (new substr(old, 1, i-1));
      old = substr(old, i);
      continue;
    }
    # Here "old" begins with "{".
    if (match(old, /^{[^{}]*}/)) {
      new = (new erg_erase_string(substr(old, 1, RLENGTH)));
      old = substr(old, RLENGTH + 1);
    } else {
      printf "line %d, bad {}-comment\n", NR > "/dev/stderr";
      new = (new old);
      break;
    }
  }
  return new;
}

# ========================================================================
# Functions for finding similar words:
#
# The following recoding operations may map two or more EVA
# characters to fewer characters. In that case the shortfall
# is padded with " ", not "!", for the benefit of
# erg_char_offsets below.
#
# These functions should be applied in the order in which they
# are declared here.

function erg_map_ee_to_ch(txt)
{
  # Introduces ligatures: first every "se" becomes "sh", then every
  # remaining "ee" becomes "ch".  The order matters ("see" -> "she").
  
  return gensub(/ee/, "ch", "g", gensub(/se/, "sh", "g", txt));
}

function erg_map_sh_to_ch(txt)
{
  # Drops the plume of "sh" by rewriting each "sh" as "ch".
  # An "s" that is not followed by "h" is left alone.
  
  return gensub(/sh/, "ch", "g", txt);
}

function erg_erase_ligatures(txt)
{
  # Removes ligature information by turning every "c" and every "h"
  # into "e", so that e.g. "sh" and "se", "ch" and "ee", "ckh" and
  # "eke" become indistinguishable.
  
  gsub(/[ch]/, "e", txt);
  return txt;
}

function erg_erase_plumes(txt)
{
  # Removes plume information, in this order:
  #   1. "sh" becomes "ch" (in case an earlier pass left any "sh");
  #   2. each remaining "s" or "r" becomes "e";
  #   3. "u" becomes "a";
  #   4. a detached plume mark (', " or +) that directly follows one
  #      of [oayechi] is replaced by a blank, keeping the length.
  # The letter "n" is not touched.
  
  gsub(/sh/, "ch", txt);
  gsub(/[sr]/, "e", txt);
  gsub(/u/, "a", txt);

  return gensub(/([oayechi])['"+]/, "\\1 ", "g", txt);
}

function erg_ignore_gallows_eyes(txt)
{
  # Discards the "eye" distinction between gallows letters:
  # every "k" is rewritten as "t" and every "f" as "p".

  return gensub(/f/, "p", "g", gensub(/k/, "t", "g", txt));
}
  
function erg_join_ei(txt)
{
  # Rewrites each "ei" pair as "a" followed by a blank, so the
  # text keeps its length.

  return gensub(/ei/, "a ", "g", txt);
}
  
function erg_equate_aoy(txt)
{
  # Makes "a", "o" and "y" indistinguishable by rewriting every
  # "a" and every "y" as "o".
  
  gsub(/[ay]/, "o", txt);
  return txt;
}

function erg_equate_bn(txt)
{
  # Rewrites every "b" as "n".
  
  return gensub(/b/, "n", "g", txt);
}

function erg_collapse_ii(txt)
{
  # Shrinks groups of four, three or two consecutive "i"s to a single
  # "i", padding with blanks so the length is unchanged.  The longest
  # groups are handled first, so a run of five "i"s yields "i   i".
  
  txt = gensub(/iiii/, "i   ", "g", txt);
  txt = gensub(/iii/,  "i  ",  "g", txt);
  txt = gensub(/ii/,   "i ",   "g", txt);
  return txt;
}

function erg_equate_eights(txt)
{
  # Treats "j", "g" and "m" as variants of "d": each occurrence
  # of any of them is rewritten as "d".
  
  gsub(/[jgm]/, "d", txt);
  return txt;
}

function erg_equate_pt(txt)
{
  # Rewrites every "p" as "t".
  
  return gensub(/p/, "t", "g", txt);
}

function erg_erase_q(txt)
{
  # For comment-free EVA text: drops a "q" that is immediately
  # followed by one of [oayeclktp], writing "X " in place of "qX".
  # The padding blank goes AFTER the kept letter so that offsets
  # computed later stay correct.  A "q" before any other character
  # is left as is.
  # Preferably run "erg_join_ei" before this function.
  
  txt = gensub(/q([oayeclktp])/, "\\1 ", "g", txt);
  return txt;
}

# ========================================================================
# Functions for dealing with word spaces:
# These functions consider "/" a space, too.

function erg_unify_word_spaces(str)
{ 
  # Normalizes word separators: each of "-", "/", "=", "," and "."
  # is rewritten as ".".
  gsub(/[-/=,.]/, ".", str);
  return str;
}

function erg_erase_word_spaces(str)
{ 
  # Blanks out word separators: each of "-", "/", "=", "," and "."
  # is rewritten as the filler "!".
  gsub(/[-/=,.]/, "!", str);
  return str;
}

# ========================================================================
# Functions for discarding words that contain invalid characters:

function erg_crush_invalid_words(str,  res)
{ 
  # Replaces every word of "str" that contains an invalid character
  # ("?", "*" or "%") by a single "?" followed by "!" fillers, so that
  # the result has the same length as "str".  Words are delimited by
  # the characters [-.,/=]; valid words and the delimiters themselves
  # are copied unchanged.
  # If the text cannot be parsed, prints a message with the current NR
  # to stderr, sets the global "abort" to 1 and exits with status 1.
  res = "";
  while (str != "")
    { if (match(str, /^[^*?%]*([-.,/=]+|$)/))
        { # A prefix free of invalid characters that ends at a
          # delimiter run (or at the end of the text): copy it.
          res = (res substr(str, 1, RLENGTH));
          str = substr(str,RLENGTH+1);
        }
      else if (match(str, /^[^-.,/=]*[?*%][^-.,/=]*/))
        { # The leading word (RLENGTH chars) is invalid: emit "?" plus
          # RLENGTH-1 fillers.  The filler count must come from a
          # PREFIX of length RLENGTH-1, not from the tail of "str",
          # otherwise the output length would not match the input.
          res = (res "?" erg_erase_string(substr(str, 1, RLENGTH-1)));
          str = substr(str,RLENGTH+1);
        }
      else
        { printf "line %d, erg_crush_invalid_words lost\n", NR > "/dev/stderr";
          abort = 1; exit 1;
        }
    }
  return res
}

# ========================================================================
# Functions for squeezing out fillers (and remembering them):

function erg_pack(old)
{
  # Returns "old" with every filler character ("!", "%" and blank)
  # deleted.
  gsub(/[!% ]/, "", old);
  return old;
}

function erg_char_offsets(txt, iof, fof,  i, k, m, c, goon)
{
  # Expects "iof" and "fof" to be empty arrays (they are not cleared
  # here).  Let pck = erg_pack(txt).  For the i-th character of pck,
  # stores in iof[i] the 0-based offset in "txt" where that character
  # starts, and in fof[i] the offset just past the end of its span.
  #
  # The span of a non-filler character consists of the character
  # itself plus the blanks that immediately follow it; a "!" or "%"
  # ends the span, and blanks after such a filler belong to no span.
  #
  # E.g. for txt = "!!!ab !c  !" (pck = "abc"), iof[1..3] becomes
  # {3,4,7} and fof[1..3] becomes {4,6,10}.
  # Also sets fof[0] to 0 and iof[length(pck)+1] to length(txt).
  m = length(txt);
  i = 0;
  goon = 0;
  fof[0] = 0;
  k = 0;
  while (k < m) {
    k++;
    c = substr(txt, k, 1);
    if (c == " ") {
      # A blank extends the current span only while one is open.
      if (goon) { fof[i] = k; }
    } else if (c == "!" || c == "%") {
      # Hard fillers close the current span.
      goon = 0;
    } else {
      # A real character opens a new span.
      i++;
      iof[i] = k - 1;
      fof[i] = k;
      goon = 1;
    }
  }
  iof[i + 1] = m;
}