# Last edited on 2002-01-16 14:15:28 by stolfi
# The New Testament in Greek, lowercased, in an ad-hoc encoding.
# Sampling functions for grek/nwt
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum)
{
  # Decides whether a line belongs to the sample subsection `subsec`.
  # Returns 1 (true) or 0 (false).  Each of the four Gospels is its own
  # subsection, recognized by the first letter of `chapter`:
  #   "mat.1" -> A,  "mrk.1" -> B,  "luk.1" -> C,  "joh.1" -> D.
  # `linenum` is accepted but not consulted.

  # Only running text (unit type starting with "P") is ever selected;
  # this test comes first, so non-"P" units never reach the
  # subsection check below.
  if (unit !~ /^[P]/) return 0;

  if (subsec == "mat.1") return (chapter ~ /^[A]/);
  if (subsec == "mrk.1") return (chapter ~ /^[B]/);
  if (subsec == "luk.1") return (chapter ~ /^[C]/);
  if (subsec == "joh.1") return (chapter ~ /^[D]/);

  # Unknown subsection: report it through arg_error (defined by the
  # including script).  No value is returned on this path.
  arg_error(("bad subsection \"" subsec "\""));
}

function fix_raw_word(word)
{
  # Normalizes a raw word: folds it to lower case, then collapses each
  # two-letter digraph that stands for one Greek letter into a single
  # character.  Returns the normalized word.

  # Fold case first so that the digraph patterns below also catch
  # capitalized input.
  word = tolower(word)

  # Digraph -> single-character code.
  gsub(/ph/, "f", word)   # normally a no-op: the source already uses "f"
  gsub(/th/, "ð", word)
  gsub(/ps/, "ç", word)
  gsub(/ch/, "q", word)

  return word
}

function define_patterns()
{
  # Intentionally empty: this sample set needs no patterns.
  # NOTE(review): presumably kept so that the including scripts can call
  # define_patterns() uniformly for every sample set -- confirm against
  # the callers.
}

function is_good_word(word)
{
  # Returns 1 if `word` is non-empty and made up solely of the lowercase
  # codes that stand for Greek letters in this encoding, 0 otherwise.
  # Accepted: a b d e f g i k..u x z, plus [ë] (eta), [ô] (omega),
  # [ð] (from "th") and [ç] (from "ps"); note that [q] (from "ch") and
  # [f] (from "ph") fall inside the accepted ranges.
  # Rejected: c h j v w y, upper case, digits and punctuation.
  # The text uses neither hyphen nor apostrophe, so those are not allowed.
  if (word ~ /^[abd-gik-uxzëôðç]+$/) {
    return 1;
  }
  return 0;
}