#! /usr/bin/gawk -f
# Last edited on 2004-02-26 22:58:02 by stolfi
# Sampling functions for enrc/wow
# English - Wells's "War of the Worlds", lowercased,
# converted to Roman numerals code.
# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each
# lowercase word that appears in the input to a distinct 
# Roman numeral, in lowercase, prefixed by "@".

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns; the only work done here is to
  # check that the caller asked for the sample and section that this
  # file implements, reporting any mismatch through {data_error}.
  if (! (smp == "enrc/wow"))
    { data_error("wrong sample \"" smp "\""); }
  if (! (sec == "tot.1"))
    { data_error("invalid section \"" sec "\""); }
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type of word {wd}, given its
  # original {type} and the section tag {cursec} where it occurs:
  #   "n"  - outside the ordinary chapters, inside a title, or
  #          punctuation other than a paragraph break ("=");
  #   "x"  - inside an ordinary chapter but in a subsection that is
  #          not main text;
  #   {type} unchanged otherwise.
  # The arguments {smp}, {sec} and {curlin} are not used here.

  if (cursec ~ /^{p[12]}{c[1-9][0-9]*}{tx}/)
    { # Inside an ordinary chapter; titles are dropped:
      if (cursec ~ /{(tt|cn)}$/) { return "n"; }

      if (cursec ~ /{tx}$/)
        { # Main text: keep everything except punctuation that is
          # not a paragraph break.
          if ((type == "p") && (wd != "=")) { return "n"; }
          return type;
        }

      # Some other subsection that is not main text:
      return "x";
    }

  # Not an ordinary chapter:
  return "n";
}

function smp_fix_word(smp,sec,type,wd,   nfld,fld,res,sep,i,wdi)
{
  # Returns the cleaned-up form of word {wd}.  For alphabetic words
  # (type "a") the word is lowercased, split into pieces, and each
  # piece is replaced by its code from the global table {wmap}, with
  # the leading "@" removed; the coded pieces are joined with single
  # blanks.  Words of any other type are returned unchanged.
  #
  # A piece that is not in {wmap} is reported through {data_error};
  # if that call returns, the piece is kept and wrapped in "?...?",
  # as is any table entry that lacks the "@" prefix.
  #
  # The parameters after {wd} are local variables; callers pass only
  # the first four arguments.  {wdi} is declared there so that it
  # does not leak into (or clobber) the global namespace of the
  # including script.  {smp} and {sec} are not used here.
  if (type == "a")
    { # Map word to lowercase:
      wd = tolower(wd);
      # There are no accented letters in this sample.

      # Break compound words into pieces at each "~"
      # (presumably the input encodes hyphens as "~" - TODO confirm):
      gsub(/[~]/, " ", wd);

      # Apply word table to each piece (the word should be there):
      nfld = split(wd, fld);
      res = ""; sep = "";
      for (i = 1; i <= nfld; i++)
        { wdi = fld[i];
          if (wdi in wmap)
            { wdi = wmap[wdi]; }
          else
            { data_error(("word not in table \"" wdi "\"")); }
          # Remove the leading "@", or mark with "?"
          if (wdi ~ /^[@]/)
            { sub(/^[@]/, "", wdi); }
          else
            { wdi = ("?" wdi "?"); }
          res = ( res sep wdi ); sep = " ";
        }
      wd = res;
    }
  return wd;
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a well-formed Roman numeral code, 0 otherwise.
  # A code is a non-empty string of the lowercase letters "ivxlcdm"
  # plus the two extensions p = 5000 and b = 10000.
  # The arguments {smp}, {sec} and {type} are not used here.
  if (wd !~ /^[ivxlcdmpb]+$/) { return 0; }
  return 1;
}