#! /usr/bin/gawk -f
# Last edited on 2004-02-17 15:12:52 by stolfi

BEGIN {
  # {abort} stays negative while all is well; the error handlers set it
  # to the desired exit status, which the END rule then honors.
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    " [ -v field=NUM ] \\\n" \
    " -v smp=LANG/BOOK -v sec=SEC \\\n" \
    " [ -v writeGud=NUM ] [ -v writeBad=NUM ] \\\n" \
    " < INFILE > OUTFILE" \
  );
  
  # Selects "good" Voynichese words for statistical analysis.
  # Looks at field number {field} of each input record (default 1).
  # If {writeGud} is 1, writes the entire record if that field is "good".
  # If {writeBad} is 1, writes the entire record if the word is "bad".
  # At least one of {writeGud} and {writeBad} must be 1.
  # The {smp} and {sec} strings are provided just in case
  # the decision depends on them.
  
  # Mandatory parameters:
  if (smp == "")   { arg_error("must define \"smp\""); }
  if (sec == "")   { arg_error("must define \"sec\""); }

  # Defaults for the optional parameters:
  if (field == "")  { field = 1; }
  if (writeGud == "") { writeGud = 0; }
  if (writeBad == "") { writeBad = 0; }
  
  # {field} must be a positive decimal integer.  A plain {field < 1} test
  # is not enough: a non-numeric value is compared as a string and can
  # slip through, and a fractional value would be truncated silently
  # when used as a field index.
  if ((field !~ /^[0-9]+$/) || (field + 0 < 1)) { arg_error("bad field"); }
  field = field + 0;  # Force numeric type for the later comparison with NF.

  if ((writeGud != 0) && (writeGud != 1)) { arg_error("bad writeGud"); }
  if ((writeBad != 0) && (writeBad != 1)) { arg_error("bad writeBad"); }
  if ((writeBad == 0) && (writeGud == 0)) { arg_error("no output"); }

  nread = 0;  # Number of words read
  nwrite = 0; # Number of words written
  ngud = 0;   # Number of records whose word was "good"
  nbad = 0;   # Number of records whose word was "bad"
}

# If an error handler has set {abort} to a non-negative status,
# stop processing input and exit with that status.
abort >= 0 { exit abort; }

# Lines that are empty, contain only spaces, or whose first non-space
# character is '#' are copied to the output verbatim and are not counted
# or classified.  ('#' is not a regexp metacharacter, so it is written
# as a plain bracket expression with no backslash escape.)
/^[ ]*([#]|$)/ {
  print; next;
}

# Every remaining record: classify the word in field {field} as good or
# bad, update the counters, and print the record if its class was requested.
{
  nread++;

  # Guard clauses: each {data_error} call reports the problem and exits.
  if (NF < field)
    { data_error("too few fields"); }
  word = $(field);
  if (word ~ /[A-Z]/)
    { data_error("field contains capital letters"); }
  if (word ~ /[^a-z*?]/)
    { data_error("field contains special characters"); }

  if (word ~ /[^a-z]/)
    {
      # Only '*' or '?' can reach this point; such words are bad.
      isBad = 1;
    }
  else
    {
      # Collapse each multi-letter group into a single capital letter,
      # then reject any word with a character outside the accepted set.
      gsub(/ch/, "C", word);
      gsub(/sh/, "S", word);
      gsub(/ckh/, "K", word);
      gsub(/cth/, "T", word);
      gsub(/cfh/, "F", word);
      gsub(/cph/, "P", word);
      isBad = (word ~ /[^eiaoqydlrsnmktfpCSKTPF]/);
    }

  if (isBad)
    {
      nbad++;
      if (writeBad) { nwrite++; print; }
    }
  else
    {
      ngud++;
      if (writeGud) { nwrite++; print; }
    }
}

END {
  # After an error, just propagate the recorded exit status.
  if (abort >= 0) { exit abort; }

  # Summary statistics go to stderr so they do not mix with the records.
  ntot = nbad + ngud;
  denom = (ntot > 0 ? ntot : 1);  # Avoids division by zero on empty input.
  printf("%7d good records (%5.3f)\n", ngud, ngud/denom) > "/dev/stderr";
  printf("%7d bad records  (%5.3f)\n", nbad, nbad/denom) > "/dev/stderr";
  printf("%7d words read, %7d written\n", ntot, nwrite) > "/dev/stderr";
}

function arg_error(msg)
{
  # Reports an invalid command-line parameter: prints {msg} and the
  # {usage} text to stderr, then terminates with exit status 1.
  printf("*** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports a malformed input record: prints {msg} to stderr, tagged with
  # the current FILENAME and FNR, then terminates with exit status 1.
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}