#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:16:00 by stolfilocal
# Sampling functions for grek/nwt
# The New Testament in Greek, lowercased, in an ad-hoc encoding.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample set defines no patterns; the only work done here is
  # checking the section tag {sec}.
  #
  # Recognized tags: "mat.1", "mrk.1", "luk.1", "joh.1", "tot.1".
  # Any other tag is reported through {data_error}.
  # The {smp} argument is not used.
  if (! ((sec == "mat.1") || (sec == "mrk.1") || (sec == "luk.1") ||
         (sec == "joh.1") || (sec == "tot.1")))
    {
      data_error(("invalid section \"" sec "\""));
    }
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a non-empty string consisting only of the
  # lowercase codes accepted as Greek letters, 0 otherwise.
  #
  # Accepted: [a], [b], [d-g], [i], [k-u], [x], [z], plus [ë] (eta),
  # [ô] (omega), [ð] and [ç].  Note that [f] and [q] are inside the
  # accepted ranges.  Rejected plain letters: [c], [h], [j], [v], [w], [y].
  # Hyphens and apostrophes are rejected too (the text uses neither).
  #
  # The {smp}, {sec} and {type} arguments are not used.
  if (wd ~ /^([abd-gik-uxz]|ë|ô|ð|ç)+$/) { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the type of the word {wd}, whose incoming type is {type},
  # given the section tag {sec} being sampled and the locator {cursec}
  # of the word in the source text.
  #
  # Returns "n" for words outside the selected book(s), for title
  # material, and for punctuation other than "="; returns "x" for
  # material whose locator does not end in a chapter-and-verse pair;
  # otherwise returns {type} unchanged.
  # NOTE(review): the precise meaning of the "n" and "x" codes is
  # defined by the caller (not visible here) -- confirm there.
  #
  # The {smp} and {curlin} arguments are not used.

  # Delete anything outside Book {sec}
  # ("tot.1" accepts any of books 1 to 4):
  if (sec == "mat.1")
    { if (cursec !~ /^{b1}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; } }
  else if (sec == "mrk.1")
    { if (cursec !~ /^{b2}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; } }
  else if (sec == "luk.1")
    { if (cursec !~ /^{b3}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; } }
  else if (sec == "joh.1")
    { if (cursec !~ /^{b4}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-4]}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; } }
  else
    {
      # Report the offending tag itself.  (This used to interpolate an
      # unset variable, so the message always showed an empty name.)
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles (locator ends in {tt} or {cn}):
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a normalized copy of the word {wd}: folded to lower case,
  # with any digraph spelling of a Greek letter replaced by its
  # single-character code.
  #
  # The {smp}, {sec} and {type} arguments are not used.

  # Step 1: fold case.  Normally a no-op, since the text should
  # already be in lower case.
  wd = tolower(wd);

  # Step 2: fold the accented capitals by hand, since {tolower}
  # depends on the locale and may leave them untouched.
  # (Keep these four in this order.)
  gsub(/[Ë]/, "ë", wd);
  gsub(/[Ð]/, "ð", wd);
  gsub(/[Ç]/, "ç", wd);
  gsub(/[Ô]/, "ô", wd);

  # Step 3: collapse digraphs into single codes.  Also normally a
  # no-op: the source text should already use the single codes.
  gsub(/eh/, "ë", wd);  # "eh" becomes [ë]
  gsub(/ph/, "f", wd);  # "ph" becomes [f]
  gsub(/th/, "ð", wd);  # "th" becomes [ð]
  gsub(/ps/, "ç", wd);  # "ps" becomes [ç]
  gsub(/ch/, "q", wd);  # "ch" becomes [q]

  return wd;
}