#! /bin/gawk -f
# Last edited on 2002-01-15 05:18:38 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "cat RAW.wds \\\n" \
    "  | select-gud-bad-words \\\n" \
    "      -f SAMPLEFNS.gawk \\\n" \
    "      -v sample=SAMPLE \\\n" \
    "      -v writeGood=BOOL \\\n" \
    "  > GUD.wds" \
  );
  
  # Selects from a raw list of words (one per input line) taken from
  # the text sample SAMPLE those words that are suitable for
  # statistical analysis.  Generally that means discarding symbols,
  # numerals, unreadable words, etc.
  #
  # If "writeGood" is 1 (true), writes the good words; if it is 0
  # (false, which is also the default), writes the bad words instead.
  #
  # The package SAMPLEFNS.gawk must define the functions
  # define_patterns() and is_good_word(wd).
  #
  # The global "abort" holds the pending exit status; it stays
  # negative until arg_error() or data_error() flags a failure.
  
  if (sample == "") { arg_error("must define \"sample\""); }
  if (writeGood == "") { writeGood = 0; }
  # The word rule compares "writeGood" against a 0/1 flag, so any other
  # value would silently select nothing; reject it up front instead.
  if ((writeGood != 0) && (writeGood != 1)) 
    { arg_error("\"writeGood\" must be 0 or 1"); }
  
  nread = 0;  # Number of words read
  nwrite = 0; # Number of words written
  define_patterns();
}

# An error was flagged earlier: stop, using the recorded status.
abort >= 0 {
  exit abort;
}

# Blank lines and lines whose first non-blank character is "#" 
# are skipped without being counted.
$0 ~ /^[ ]*([#]|$)/ {
  next;
}

# Each remaining non-empty line is taken whole as one word.
# It is echoed only when its good/bad status equals "writeGood".
$0 ~ /./ {
  nread++;
  word = $0;
  good = ((word != "") && is_good_word(word));
  if (good != writeGood) { next; }
  nwrite++;
  print $0;
  next;
}

END {
  # When a failure was flagged, exit with its status and skip the summary.
  if (abort >= 0) {
    exit abort;
  }
  # Summary of the word counts, sent to stderr so stdout stays clean.
  printf("%s: %7d words read, %7d written\n", sample, nread, nwrite) > "/dev/stderr";
}

function arg_error(msg)
{
  # Prints "msg" and then the usage text (global "usage") to stderr,
  # records the failure in the global "abort", and exits with status 1.
  abort = 1;
  printf("%s\nusage: %s\n", msg, usage) > "/dev/stderr";
  exit 1;
}

function data_error(msg)
{
  # Prints "msg" to stderr prefixed with the current value of FNR,
  # records the failure in the global "abort", and exits with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit 1;
}

function load_remapping_table(file,    nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs 
  # of the form ORIGINAL NEW, one pair per line, blank-separated.
  # Lines beginning with "#" are ignored.
  # Stores the table in the global array "wmap[ORIGINAL] = NEW",
  # discarding any previous contents of "wmap".
  #
  # Aborts through arg_error() when the file cannot be read, when it
  # yields no pairs, or when an entry is malformed or repeats a key.
  # The names after "file" in the parameter list are local variables.
  
  nMap = 0;
  split("", wmap);
  while ((status = (getline lin < file)) > 0) { 
    if (lin ~ /^[#]/) { continue; }
    nfld = split(lin, fld, " ");
    if (nfld != 2) { arg_error(("bad table entry = \"" lin "\"")); }
    if (fld[1] in wmap) { arg_error(("repeated key = \"" lin "\"")); }
    wmap[fld[1]] = fld[2];
    nMap++;
  }
  # getline yields 0 at end of file and -1 on an open/read failure;
  # ERRNO is only meaningful in the latter case, so test the status
  # rather than comparing ERRNO against a presumed initial value.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "loaded %6d map pairs\n", nMap > "/dev/stderr"
}
