#! /usr/bin/gawk -f
# Must specify -f eva2erg.gawk
# Last edited on 1999-01-05 22:14:45 by stolfi

# Adds a match key that can be used to find similar words in Voynichese 
# text.
#
#    cat INFILE \
#      | add-match-key -f eva2erg.gawk \
#          [ -v inField=IFLDNUM ] \
#          [ -v outField=OFLDNUM ] \
#      > OUTFILE
#
# This script reads from stdin one or more records that contain a
# Voynichese word KEY. It outputs the same records, each augmented with a
# "reduced" version RKEY of the same word.
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default), and the RKEY is inserted as field
# OFLDNUM (first by default).
#

BEGIN {
  # Set to 1 by error(); the main rule checks it before processing a record.
  abort = 0;

  # Field numbers default to 1 when not supplied on the command line (-v).
  if (inField == "")  { inField = 1; }
  if (outField == "") { outField = 1; }

  # Reductions applied by recode_text(); all are currently switched on.
  erase_ligatures     = 1;
  erase_plumes        = 1;
  ignore_gallows_eyes = 1;
  join_ei             = 1;
  equate_aoy          = 1;
  equate_bn           = 1;
  collapse_ii         = 1;
  equate_eights       = 1;
  equate_pt           = 1;
  erase_q             = 1;
  erase_word_spaces   = 1;

  # Report the active reductions on stderr, one per line.
  printf "options:\n" > "/dev/stderr";
  if (erase_ligatures)     { printf "  erase_ligatures\n"     > "/dev/stderr"; }
  if (erase_plumes)        { printf "  erase_plumes\n"        > "/dev/stderr"; }
  if (ignore_gallows_eyes) { printf "  ignore_gallows_eyes\n" > "/dev/stderr"; }
  if (join_ei)             { printf "  join_ei\n"             > "/dev/stderr"; }
  if (equate_aoy)          { printf "  equate_aoy\n"          > "/dev/stderr"; }
  if (equate_bn)           { printf "  equate_bn\n"           > "/dev/stderr"; }
  if (collapse_ii)         { printf "  collapse_ii\n"         > "/dev/stderr"; }
  if (equate_eights)       { printf "  equate_eights\n"       > "/dev/stderr"; }
  if (equate_pt)           { printf "  equate_pt\n"           > "/dev/stderr"; }
  if (erase_q)             { printf "  erase_q\n"             > "/dev/stderr"; }
  if (erase_word_spaces)   { printf "  erase_word_spaces\n"   > "/dev/stderr"; }
}

/./ {
  # Applies to every record containing at least one character.
  # Stop immediately if an earlier error has been flagged.
  if (abort) { exit; }

  # The KEY must be present in the record.
  if (inField > NF) { error("not enough input fields\n"); }

  # Strip comments from the KEY, reduce it, and emit the record with the
  # reduced key inserted as field number outField.
  printout(recode_text(erg_erase_comments($inField)), outField);
  next;
}

function error(msg)
{ 
  # Reports a fatal problem with the current input record on stderr,
  # prefixed with the record number, then terminates the program.
  #
  #   msg  - text of the message; trailing newlines are ignored because
  #          the output format already supplies one (callers in this file
  #          pass messages ending in "\n", which used to yield a blank line).
  #
  # Sets the global "abort" flag so that any rule that still runs after
  # "exit" can detect the failure, and exits with status 1 so that
  # the calling shell/pipeline sees the error (a bare "exit" reports 0).
  sub(/\n+$/, "", msg);
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function recode_text(str)
{
  # Converts a text string with no embedded "{}"s to the
  # requested (reduced) encoding and returns the result.
  #
  #   str  - the word/text to reduce (comments already removed).
  #
  # Which reductions are applied is controlled by the global option
  # flags set in the BEGIN block.  The erg_* helpers are not defined in
  # this file (presumably they come from eva2erg.gawk -- see the usage
  # note at the top); their exact effect is not visible here.
  
  # Delete EVMT fillers:
  str = gensub(/[!% ]/, "", "g", str);
  
  # Normalize word separators: collapse each run of "-/=,." into a single
  # "." and make sure the string both starts and ends with one "."
  str = gensub(/[-/=,.]+/,   ".", "g", str);
  str = gensub(/^[-/=,.]*/,  ".", "g", str);
  str = gensub(/[-/=,.]*$/,  ".", "g", str);

  # Special hacks not handled by eva2erg.gawk routines:
  gsub(/u/, "en", str);
  gsub(/z/, "k", str);
  
  # Apply the selected reductions, in this fixed order:
  if (erase_ligatures)     { str = erg_erase_ligatures(str); }
  if (erase_plumes)        { str = erg_erase_plumes(str); }
  if (ignore_gallows_eyes) { str = erg_ignore_gallows_eyes(str); }
  if (join_ei)             { str = erg_join_ei(str); }
  if (equate_aoy)          { str = erg_equate_aoy(str); }
  if (equate_bn)           { str = erg_equate_bn(str); }
  if (collapse_ii)         { str = erg_collapse_ii(str); }
  if (equate_eights)       { str = erg_equate_eights(str); }
  if (equate_pt)           { str = erg_equate_pt(str); }
  if (erase_q)             { str = erg_erase_q(str); }

  # Word separators are either erased or unified, never left as is:
  if (erase_word_spaces)
    { str = erg_erase_word_spaces(str); }
  else
    { str = erg_unify_word_spaces(str); }

  # NOTE: an older post-processing stage used to follow this return
  # (re-inserting "o" for words that had vanished, deleting the "."
  # separators, and substituting "o" for an empty result).  It could
  # never execute and has been removed; the result of erg_pack() is,
  # as before, what callers receive.
  return erg_pack(str);
}

function printout(mw, fn,  i)
{
  # Prints the current record ($0) with "mw" inserted as field "$(fn)".
  #
  #   mw  - the string to insert.
  #   fn  - the 1-based position the new field will occupy in the output;
  #         valid values are 1 .. NF+1.
  #   i   - local loop index (not an argument).
  #
  # When "mw" goes first or last the original record text is printed
  # unchanged next to it; when it goes in the middle the record is
  # re-assembled field by field with OFS as separator.

  # Reject positions that cannot exist in the output record.
  if ((fn + 0) < 1) { error("invalid output field number"); }
  if (NF < fn-1) { error("not enough output fields\n"); }

  if (fn == 1)
    { print mw, $0; }
  else if (fn == NF+1)
    { print $0, mw; }
  else
    { for (i = 1; i < fn; i++) { printf "%s%s", $(i), OFS; }
      printf "%s", mw;
      for (i = fn; i <= NF; i++) { printf "%s%s", OFS, $(i); }
      # Terminate the record, as "print" does in the other two branches;
      # without this, successive output records run together on one line.
      printf "%s", ORS;
    }
}