#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:45:07 by stolfilocal
# Sampling functions for geez/gok
# Ge'ez (classic Ethiopian) - "Glory of the Kings",
# in the standard SERA encoding.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns; the only work done here is
  # checking the section name. "tot.1" is the sole section accepted;
  # any other value of {sec} is reported through {data_error}
  # (defined elsewhere, presumably in the including script).
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{
  # A good word is a non-empty sequence of letters "a-z"/"A-Z",
  # each of which may be preceded by at most one apostrophe or
  # backquote. Those two marks are accepted only as prefixes of a
  # letter, so a word may not end with one, nor have two in a row.
  # Returns 1 if {wd} qualifies, 0 otherwise. The arguments
  # {smp}, {sec} and {type} are not consulted.
  if (wd ~ /^([`']?[a-zA-Z])+$/) { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Chooses the class of a word from the section tag {cursec} where
  # it occurs, its current class {type}, and its text {wd}.
  # Returns "n" for deleted material, "x" for rejected material,
  # and {type} unchanged otherwise. The arguments {smp}, {sec}
  # and {curlin} are not consulted.

  if (! (cursec ~ /^({b}{c[1-9][0-9]*}|{i}{p}){tx}/))
    { # Neither an ordinary chapter of the book proper ("{b}{cN}{tx}")
      # nor part of the introduction ("{i}{p}{tx}"): delete.
      return "n";
    }
  else if (cursec ~ /{(tt|cn)}$/)
    { # Titles are deleted too.
      return "n";
    }
  else if (! (cursec ~ /{tx}$/))
    { # Inside the selected sections, but not prose text: reject.
      return "x";
    }

  # Of the punctuation (class "p"), only the parag break "=" is kept;
  # everything else retains its class.
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd)
{
  # The input text is in SERA and needs no adjustment, so {wd}
  # is handed back exactly as received. The arguments {smp},
  # {sec} and {type} are not consulted.
  return (wd);
}