#! /usr/bin/gawk -f
# Last edited on 2001-01-02 01:40:52 by stolfi

BEGIN {
  # Exit status recorded by the error handlers; negative means "no error".
  abort = -1;
  usage = ( ARGV[0] " [ -v inField=NUM ] [ -v writeBad=NUM ] < INFILE > OUTFILE" );
  
  # Selects "good" words for statistical analysis.
  # Looks at field number `inField' of each input record.
  # Normally, writes the entire record out only if that field is "good".
  # If `writeBad' is 1, writes the record only if the word is bad.
  
  # Defaults for options not given with `-v'.
  if (inField == "") { inField = 1; }
  if (writeBad == "") { writeBad = 0; }
  
  # `inField' must be a plain positive integer.  Checking its spelling first
  # matters: a non-numeric `-v' value is compared as a string, so a bare
  # `inField < 1' test would let something like "abc" through, and a
  # fractional value would be accepted silently.
  if (inField !~ /^[+]?[0-9]+$/) { arg_error("bad inField"); }
  # Force a numeric value so that later comparisons against NF are numeric.
  inField = inField + 0;
  if (inField < 1) { arg_error("bad inField"); }
  if ((writeBad != 0) && (writeBad != 1)) { arg_error("bad writeBad"); }

  # Counters of good and bad records, reported by the END rule.
  ngud = 0;
  nbad = 0;
}

# Defensive guard: stop reading input as soon as an error status is recorded.
abort >= 0 {
  exit abort;
}

# Blank lines and `#' comment lines are copied to the output unchanged;
# they are neither classified nor counted.
/^[ ]*([#]|$)/ {
  print $0;
  next;
}

// {
  # Every data record must contain the selected field.
  if (NF < inField) { data_error("too few fields"); }
  word = $(inField);

  # Malformed words are fatal; data_error() exits and does not return.
  if (word ~ /[A-Z]/) { data_error("field contains capital letters"); }
  if (word ~ /[^a-z*?]/) { data_error("field contains special characters"); }

  if (word ~ /[^a-z]/) {
    # Only `*' or `?' can reach this point: such words are always bad.
    bad = 1;
  } else {
    # Fold each multi-letter group into one capital letter, so that a
    # stray `c' or `h' left over afterwards marks the word as bad.
    # The patterns cannot overlap, so the order of these calls is immaterial.
    gsub(/ckh/, "K", word);
    gsub(/cth/, "T", word);
    gsub(/cfh/, "F", word);
    gsub(/cph/, "P", word);
    gsub(/ch/, "C", word);
    gsub(/sh/, "S", word);
    # Anything outside the accepted letter set makes the word bad.
    if (word ~ /[^eiaoqydlrsnmktfpCSKTPF]/) { bad = 1; } else { bad = 0; }
  }

  # Tally the record, then emit it if it belongs to the requested class.
  if (bad) { nbad++; } else { ngud++; }
  if (writeBad == bad) { print; }
}

END {
  # An error was already reported: just propagate its exit status.
  if (abort >= 0) { exit abort; }

  # Summarize the counts and their fractions of the total on stderr.
  ntot = nbad + ngud;
  # Use 1 as the denominator when no records were classified,
  # so the fractions print as 0.000 instead of dividing by zero.
  den = (ntot > 0) ? ntot : 1;
  printf("%5d good records (%5.3f)\n", ngud, ngud/den) > "/dev/stderr";
  printf("%5d bad records  (%5.3f)\n", nbad, nbad/den) > "/dev/stderr";
}

# Reports a bad command-line option: prints `msg' and the usage line on
# stderr, records exit status 1 in `abort', and terminates the program.
function arg_error(msg)
{
  printf("*** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  # The END rule re-issues this status when it sees abort >= 0.
  abort = 1;
  exit abort;
}

# Reports a malformed input record: prints the input file name, the current
# line number (FNR) and `msg' on stderr, records exit status 1 in `abort',
# and terminates the program.
# `src' is a local variable, not an argument; callers pass only `msg'.
function data_error(msg,    src)
{
  # FILENAME is empty when reading standard input (the documented usage),
  # which would print "file , line N"; show "-" in that case instead.
  src = (FILENAME == "" ? "-" : FILENAME);
  printf "file %s, line %d: %s\n", src, FNR, msg > "/dev/stderr";
  # The END rule re-issues this status when it sees abort >= 0.
  abort = 1; exit abort;
}
