#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:46:39 by stolfilocal
# Sampling functions for engl/cul
# English - main text from Culpeper's herbal, mapped to lowercase.
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~", abbrev dots as "/".
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample type has no patterns to set up; the only work done here
  # is to check that {sec} is one of the four section names that the
  # rest of this file knows how to handle.  {smp} is not used.
  if (sec == "pre.1") { return; }
  if (sec == "her.1") { return; }
  if (sec == "rec.1") { return; }
  if (sec == "tot.1") { return; }
  # Anything else is reported through {data_error} (defined elsewhere):
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is an acceptable word, 0 otherwise.
  # {smp}, {sec} and {type} are not consulted.
  #
  # The two ampersand forms are accepted verbatim:
  if ((wd == "&") || (wd == "&c/")) { return 1; }
  #
  # Otherwise the word must be one or more lowercase letters, each
  # optionally preceded by a single apostrophe, optionally followed by
  # ONE final apostrophe or "/".  Hence an apostrophe may start or end
  # the word ("'tis", "an'") but two apostrophes can never be adjacent.
  if (wd ~ /^([']?[a-z])+['\/]?$/) { return 1; }
  return 0;
  #
  # Alternative test, kept for reference, that would also admit
  # hyphenated compounds (every component must have at least one letter):
  #   return (wd ~ /^(([']?[a-z])+[']?)([-]([']?[a-z])+['\/]?)*$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd, valid)
{
  # Returns the (possibly revised) type code for word {wd}, given the
  # requested output section {sec} and the location tag {cursec} of the
  # word.  {smp} and {curlin} are not consulted.  The local {valid}
  # holds the verdict of the section filter: "" means "keep", "n" means
  # the material lies outside the wanted section (deleted), and "x"
  # means a subsection that is not plain text (rejected).
  #
  # For "tot.1" the rule set is chosen by the leading tag of {cursec}.
  valid = "";
  if ((sec == "pre.1") || ((sec == "tot.1") && (cursec ~ /^{pro}/)))
    { # Prologue: only the plain text of the "To the reader" section.
      if (cursec !~ /^{pro}{tordr}{tx}/) 
        { valid = "n"; }
      else if (cursec !~ /{tx}(|{ius}{txu})$/)
        { valid = "x"; }
    }
  else if ((sec == "her.1") || ((sec == "tot.1") && (cursec ~ /^{hb}/)))
    { # Herbal proper: herbs numbered 1 and up (herb 0, in the
      # prologue, is left out because it may be atypical).
      if (cursec !~ /^{hb}{h[1-9][0-9]*}{tx}/)
        { valid = "n"; }
      else if (cursec !~ /{tx}(|{s[A-Z]}{txs})$/)
        { valid = "x"; }
    }
  else if ((sec == "rec.1") || ((sec == "tot.1") && (cursec ~ /^{rx}/)))
    { # Recipes proper: text in numbered paragraphs, including the
      # initial "Directions" section.
      if (cursec !~ /^{rx}{s[12]}{bd}{c[1-9][0-9]*}{tx/)
        { valid = "n"; }
      else if (cursec !~ /{tx[0-9]*}$/)
        { valid = "x"; }
    }
  else  
    { # NOTE(review): this branch is also reached when {sec} is "tot.1"
      # but {cursec} starts with none of {pro}, {hb}, {rx}; the message
      # then blames {sec} -- confirm that this is intended.
      arg_error(("bad output section \"" sec "\""));
    }
  if (valid != "") { return valid; }

  # Words that survive the section filter:
  # "&" and "&c/" always count as alpha, whatever their input type.
  if ((wd == "&") || (wd == "&c/")) { return "a"; }

  # Of the punctuation (type "p") only the parag break "=" is kept;
  # every other type passes through unchanged.
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the normalized form of {wd}.  {smp}, {sec} and {type} are
  # not consulted.
  #
  # Each "~" (the encoding of a hyphen) becomes a blank, so that a
  # compound comes out as several blank-separated pieces:
  gsub(/[~]/, " ", wd);

  # The result is folded to lower case:
  return tolower(wd);
}