#! /usr/bin/gawk -f
# Last edited on 1999-02-01 09:55:25 by stolfi

# Usage: 
#   cat INFILE \
#     | collapse-words -f eva2erg.gawk \
#       [-v describe_equiv=BOOL] \
#       [-v append_tilde=BOOL] \
#       [-v field=FIELDNUM] \
#       [EQUIVOPTIONS] \
#     > OUTFILE
#
# Maps each word in INFILE to an equivalence class, by a
# built-in equivalence function. 
#
# If "append_tilde" is set, appends a tilde "~" to each mapped
# word, to indicate that it is a word class and not a 
# raw word.
#
# If "field" is specified, only the indicated field gets
# mapped to its equivalence class. Otherwise, the 
# input may be in EVT format (with location code in columns 1-19)
# or in pure text format; either way, all text words are mapped.
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.
#
# If "describe_equiv" is set, prints to stderr a description 
# of the equivalence used, and exits without processing any records. 

function error(msg)
{
  # Fatal-error helper: records the failure status in the global
  # "abort", writes "msg" (plus a newline) to stderr, and terminates
  # the program with exit status 1.
  abort = 1;
  printf "%s\n", msg > "/dev/stderr";
  exit 1;
}

function print_equiv(   report)
{
  # Writes to stderr a header line followed by the name of every
  # equivalence option that is currently enabled, one per line,
  # each indented by two blanks.  "report" is a local accumulator.

  report = "word equivalence:\n";

  if (erase_ligatures)     { report = report "  erase_ligatures\n"; }
  if (map_sh_to_ch)        { report = report "  map_sh_to_ch\n"; }
  if (erase_plumes)        { report = report "  erase_plumes\n"; }
  if (ignore_gallows_eyes) { report = report "  ignore_gallows_eyes\n"; }
  if (join_ei)             { report = report "  join_ei\n"; }
  if (equate_aoy)          { report = report "  equate_aoy\n"; }
  if (collapse_ii)         { report = report "  collapse_ii\n"; }
  if (equate_eights)       { report = report "  equate_eights\n"; }
  if (equate_pt)           { report = report "  equate_pt\n"; }
  if (erase_q)             { report = report "  erase_q\n"; }
  if (erase_word_spaces)   { report = report "  erase_word_spaces\n"; }
  if (unify_word_spaces)   { report = report "  unify_word_spaces\n"; }
  if (crush_invalid_words) { report = report "  crush_invalid_words\n"; }
  if (append_tilde)        { report = report "  append_tilde\n"; }

  # Emit the whole description in one go.
  printf "%s", report > "/dev/stderr";
}

function reduce_text(str)
{
  # Maps the text string "str" (which must contain no embedded "{}"
  # comments) to its equivalence-class encoding and returns the result.
  #
  # Each enabled option applies the corresponding erg_* transformation,
  # in the fixed order below; the result is then always passed through
  # erg_pack.  The erg_* functions are not defined in this file
  # (presumably they come from eva2erg.gawk -- see the usage header).

  if (erase_ligatures)     { str = erg_erase_ligatures(str); }
  if (map_sh_to_ch)        { str = erg_map_sh_to_ch(str); }
  if (erase_plumes)        { str = erg_erase_plumes(str); }
  if (ignore_gallows_eyes) { str = erg_ignore_gallows_eyes(str); }
  if (join_ei)             { str = erg_join_ei(str); }
  if (equate_aoy)          { str = erg_equate_aoy(str); }
  if (collapse_ii)         { str = erg_collapse_ii(str); }
  if (equate_eights)       { str = erg_equate_eights(str); }
  if (equate_pt)           { str = erg_equate_pt(str); }
  if (erase_q)             { str = erg_erase_q(str); }
  if (erase_word_spaces)   { str = erg_erase_word_spaces(str); }
  if (unify_word_spaces)   { str = erg_unify_word_spaces(str); }
  if (crush_invalid_words) { str = erg_crush_invalid_words(str); }

  str = erg_pack(str);

  if (append_tilde)
    { # Tag every maximal run of characters other than "-", "=", ",",
      # "." and blank with a trailing "~".
      str = gensub(/([^-=,. ]+[~]*)/, "\\1~", "g", str);
    }

  return str;
}

BEGIN {
  # "abort" holds the pending exit status; a negative value means
  # that no exit has been requested.
  abort = -1;

  # An unset "field" selects whole-line mode (field 0).
  if (field == "")
    { field = 0; }

  # In describe-only mode, report the equivalence and quit with
  # status 0 before any input record is read.
  if (describe_equiv)
    { print_equiv();
      abort = 0;
      exit 0;
    }
}

/^#/ {
  # Lines that start with "#" are copied to the output unchanged.
  # If an exit has already been requested, honour it first.
  if (abort >= 0)
    { exit abort; }
  print;
  next;
}

/./ {
  # Handles every non-empty line that was not consumed by the "#"
  # comment rule.
  #
  # Whole-line mode (field == 0): an optional leading location code
  # "<f...>" is split off and padded to 19 columns; the rest of the
  # line is stripped of comments, mapped by reduce_text, and printed
  # after the location code.
  #
  # Field mode (field > 0): only field number "field" is replaced by
  # its reduce_text image, and the record is printed.

  if (abort >= 0) exit abort;

  if (field == 0)
    { loc = (""); skip = 0;
      # "txt" is a global, so it must be cleared for each record;
      # otherwise a line that has a location code but no text after
      # it would be printed with the text of the previous line.
      txt = ("");

      # Extracts the location code:
      if (match($0, /^<f[0-9][0-9]*[vr][0-9]*\.[^ >]*>/)) 
        { loc = sprintf("%-19s", substr($0,1,RLENGTH));
          skip = RLENGTH;
        }
      else if (substr($0,1,1) == "<") 
        { error(("line " NR ": bad location code"));
        }

      if (skip < length($0)) 
        { txt = erg_erase_comments(substr($0,1+skip));
          # Replace every blank by "!" (presumably a filler that the
          # erg_* functions ignore -- confirm in eva2erg.gawk), so
          # that spurious spaces are not taken as word breaks.
          gsub(/[ ]/, "!", txt);
          # Now map words:
          txt = reduce_text(txt);
        }
      printf "%s%s\n", loc, txt;
    }
  else
    { if (field > NF) 
        { # Same message and exit status as before, but routed through
          # error() so that "abort" is set like for other failures.
          error(sprintf("** line %d: not enough fields", NR));
        }
      $(field) = reduce_text($(field));
      print;
    }
  next;
}