#! /usr/bin/gawk -f
# Last edited on 2004-02-18 15:36:30 by stolfi

BEGIN {
  # Exit status requested by an error handler; negative means "no error".
  abort = -1;

  # Command-line synopsis, printed by {arg_error}.
  usage = ( \
    "cat main.wds \\\n" \
    "  | fix-tagged-words \\\n" \
    "      -f AWKLIB.gawk \\\n" \
    "      -v smp=SMP \\\n" \
    "      -v sec=SEC \\\n" \
    "      [ -v field=FIELD ] \\\n" \
    "      [ -v table=TABLE ] \\\n" \
    "  > GUD.wds" \
  );

  # PURPOSE
  #
  #   Adjusts the words extracted from a text sample so that they are
  #   suitable for statistical analysis: e.g. changing the encoding,
  #   removing capitalization, dropping some words, etc.
  #
  # INPUT
  #
  #   The word to be fixed is field number {field} of each input
  #   record. When {field} is not given it is set to 0, so the
  #   whole record is taken as the word.
  #
  #   The {TABLE} file, if specified, must contain pairs of words
  #   {OLD NEW}. Whenever the word {OLD} appears in the input, it is
  #   replaced by {NEW} before being handed to {fix_word}.
  #
  # LIBRARY
  #
  #   The library "{AWKLIB}.gawk" must define these functions:
  #
  #     fix_word(smp, sec, wd)
  #
  #       Receives a word {wd} and the client-specified strings {smp}
  #       and {sec}. It should return a cleaned copy of {wd}, e.g.
  #       without capitalization or undesired markings. It may split
  #       {wd} by inserting blanks. It may adjust its behavior based
  #       on {smp} and {sec}; this script only passes them along
  #       (and uses {smp} in the final report).
  #
  #     define_patterns(smp, sec)
  #
  #       Called once, before the first record is read, to set up
  #       any tables and patterns that {fix_word} may need.
  #
  # OUTPUT
  #
  #   A word that gets remapped to "*DELETE*" or "*delete*" is
  #   discarded. Otherwise the result of {fix_word} is split at
  #   blanks, and one output record is written for each non-empty
  #   piece.

  # Both client tags are mandatory.
  if (smp == "") arg_error("must define \"smp\"");
  if (sec == "") arg_error("must define \"sec\"");

  # Counters reported by the END rule.
  nread = 0;   # Number of words read
  nwrite = 0;  # Number of words written

  # Start from an empty remapping table; fill it from {table} if given.
  split("", wmap);
  if (table != "") load_remapping_table(table);

  # No {field} given: operate on the whole record.
  if (field == "") field = 0;

  # Let the library prepare whatever {fix_word} needs.
  define_patterns(smp, sec);
}

# Safeguard: if an error handler has set {abort} to a non-negative
# status, stop processing input and exit with that status.
(abort >= 0) { exit abort; }

# Skip records that are empty, consist only of spaces, or whose first
# non-space character is "#" (comment lines).
/^[ ]*([\#]|$)/ { next; }

# Main rule: remap, clean and re-emit the word of every non-empty record.
/./ {
  nread++;

  # Fetch the word and apply the client-supplied remapping table first.
  word = $(field);
  if (word in wmap) {
    word = wmap[word];
    if ((word == "*DELETE*") || (word == "*delete*")) next;
  }

  # Let the library clean the word; it may also ask for deletion.
  word = fix_word(smp, sec, word);
  if ((word == "*DELETE*") || (word == "*delete*")) next;

  # The cleaned word may have been split into several pieces separated
  # by blanks or newlines; write one record per non-empty piece, with
  # the piece stored in place of the original word.
  nwds = split(word, wds, /[ \012]/);
  i = 1;
  while (i <= nwds) {
    if (wds[i] != "") {
      $(field) = wds[i];
      print;
      nwrite++;
    }
    i++;
  }
  next;
}

END {
  # After an error, just propagate the requested exit status.
  if (abort >= 0) exit abort;

  # Normal termination: report the word counts on standard error.
  printf("%s: %7d words read, %7d written\n", smp, nread, nwrite) > "/dev/stderr";
}

function load_remapping_table(file,    nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs 
  # of the form ORIGINAL NEW, one pair per line. Lines that are
  # empty, all spaces, or start with "#" are ignored, and a trailing
  # "#"-comment is stripped from the remaining lines.
  # Stores the table in the global "wmap[ORIGINAL] = NEW",
  # discarding any previous contents of "wmap".
  #
  # Calls {tbl_error} on a malformed line or a repeated key, and
  # {arg_error} if the file cannot be read. Prints a summary
  # (or a warning, if no pairs were found) to standard error.
  
  nMap = 0;
  split("", wmap);
  # Keep the result of {getline} so that a read failure (-1) can be
  # told apart from a normal end of file (0) after the loop.
  while ((status = (getline lin < file)) > 0) { 
    gsub(/^[ ]*/, "", lin);
    if (! match(lin, /^([\#]|$)/))
      { gsub(/[ ]*[\#].*$/, "", lin);
        nfld = split(lin, fld, " ");
        if (nfld != 2) tbl_error(file, ("bad table entry = \"" lin "\""));
        if (fld[1] in wmap) tbl_error(file, ("repeated key = \"" lin "\""));
        wmap[fld[1]] = fld[2];
        nMap++;
      }
  }
  # Test the {getline} status rather than comparing ERRNO with "0":
  # the value ERRNO holds when no error occurred is not the same in
  # all gawk versions, so that comparison can flag a successful read.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close (file);
  if (nMap == 0) 
    { # The file name is passed as an argument, not spliced into the
      # format, so that a "%" in the name cannot corrupt the message.
      printf "warning: file \"%s\" empty or missing\n", file > "/dev/stderr";
    }
  else
    { printf "loaded %6d map pairs\n", nMap > "/dev/stderr"; }
}

function arg_error(msg)
{
  # Reports a command-line error: prints {msg} followed by the global
  # {usage} synopsis to standard error, records exit status 1 in the
  # global {abort}, and terminates with that status.
  printf("%s\nusage: %s\n", msg, usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error in the input data: prints {msg} to standard
  # error, prefixed with the current record number {FNR}, records
  # exit status 1 in the global {abort}, and terminates.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(file, msg)
{
  # Reports an error in an auxiliary table: prints {msg} to standard
  # error, prefixed with the name {file} and the value of {FNR},
  # records exit status 1 in the global {abort}, and terminates.
  #
  # NOTE(review): {FNR} counts records of the main input; it is
  # presumably not advanced by "getline var < file", so the line
  # number shown may not point into {file} -- confirm and, if so,
  # have the caller supply the table line number.
  printf("file %s, line %s: %s\n", file, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}