#! /bin/gawk -f
# Last edited on 2004-02-17 15:06:10 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "cat RAW.wds \\\n" \
    "  | select-gud-bad-words \\\n" \
    "      -f AWKLIB.gawk \\\n" \
    "      -v smp=SMP \\\n" \
    "      -v sec=SEC \\\n" \
    "      [ -v inField=NUM ] \\\n" \
    "      [ -v maxGud=NUM ] \\\n" \
    "      -v writeGud=BOOL \\\n" \
    "      -v writeBad=BOOL \\\n" \
    "  > GUD.wds" \
  );
  # 
  # Selects "good" and/or "bad" words for statistical analysis;
  # generally that means discarding symbols, numerals,
  # unreadable words, etc.
  # 
  # Looks at field number {inField} of each input record
  # (default 1).
  #
  # If {writeGud} is TRUE (1), writes every record with a good word.
  # If {writeBad} is TRUE (1), writes every record with a bad word.
  # At least one of the two must be TRUE.
  #
  # If {maxGud} is specified and non-negative, stops after seeing
  # that many good words. A negative value (the default) means
  # no limit.
  # 
  # The library "{AWKLIB}.gawk" must define the procedure
  # define_patterns(smp, sec) (which will be called before the first
  # record) and the predicate is_good_word(smp, sec, wd). The latter
  # may depend on the {smp} and {sec}, which are arbitrary
  # client-given strings.
  # 
  if (inField == "")  { inField = 1; }
  if (writeGud == "") { writeGud = 0; }
  if (writeBad == "") { writeBad = 0; }
  if (maxGud == "")   { maxGud = -1; }
  
  # A "-v" value that does not look like a number is a plain string, and
  # comparing it with a number is then done as a STRING comparison.
  # So "inField=abc" would pass a bare (inField < 1) test and later
  # select $0, and "maxGud=abc" would silently disable the limit.
  # The test (x != int(x)) rejects such values, as well as fractions.
  if ((inField != int(inField)) || (inField < 1)) { arg_error("bad inField"); }
  if (maxGud != int(maxGud)) { arg_error("bad maxGud"); }
  if ((writeGud != 0) && (writeGud != 1)) { arg_error("bad writeGud"); }
  if ((writeBad != 0) && (writeBad != 1)) { arg_error("bad writeBad"); }
  if ((writeGud == 0) && (writeBad == 0)) { arg_error("no output"); }

  nread = 0;  # Number of words read
  nwrite = 0; # Number of words written
  ngud = 0;   # Number of good words seen
  nbad = 0;   # Number of bad words seen
  define_patterns(smp, sec);
}

# If an error has already been flagged ({abort} set to a status code),
# stop processing and exit with that status.
abort >= 0 {
  exit abort;
}

# A non-negative {maxGud} is a quota: quit (successfully) as soon as
# that many good words have been counted.
(maxGud >= 0) && (ngud >= maxGud) {
  exit 0;
}

# Skip records that are empty, consist only of spaces, or whose
# first non-space character is "#".
/^[ ]*([\#]|$)/ {
  next;
}

# Main rule: classify the word in field {inField} of the current
# record with is_good_word(), update the counters, and echo the
# record when its class was requested for output.
/./ {
  nread++;
  if (NF < inField) { data_error("too few fields"); }
  word = $(inField);
  # An empty word is counted as read but is neither good nor bad.
  if (word == "") { next; }
  gud = is_good_word(smp, sec, word);
  if (gud) {
    ngud++;
    if (writeGud) { print; nwrite++; }
  } else {
    nbad++;
    if (writeBad) { print; nwrite++; }
  }
  next;
}

END {
  # On an error exit, just propagate the status; no statistics.
  if (abort >= 0) { exit abort; }
  # Report counts, and the good/bad fractions of all classified words,
  # on the standard error stream.
  ntot = nbad + ngud;
  denom = (ntot > 0 ? ntot : 1);  # Avoids division by zero when nothing was classified.
  printf "%7d gud records (%5.3f)\n", ngud, ngud/denom > "/dev/stderr";
  printf "%7d bad records (%5.3f)\n", nbad, nbad/denom > "/dev/stderr";
  printf "%7d words read, %7d written\n", nread, nwrite > "/dev/stderr";
}

function arg_error(msg)
{
  # Reports a bad command-line parameter: prints {msg} and the
  # {usage} text to the standard error stream, sets {abort} to 1,
  # and exits with status 1.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit abort;
}

function data_error(msg)
{
  # Reports a malformed input record: prints {msg}, prefixed by the
  # current record number {FNR}, to the standard error stream,
  # sets {abort} to 1, and exits with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}