#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:12:02 by stolfilocal
# Sampling functions for russ/ptt
# The "Pentateuch" in Russian (Synodal), lowercased, in KOI8-R encoding.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec,   i,uc,lc)
{
  # Validates the section tag {sec} and (re)builds the global
  # case-folding table {koi_tolow} that {smp_fix_word} reads.
  #
  #   smp   not used here.
  #   sec   section tag; must be one of "gen.1", "exo.1", "lev.1",
  #         "num.1", "deu.1", "tot.1", else {data_error} is called.
  #
  # Sets the globals {koi_ucs}, {koi_lcs} and {koi_tolow}.
  # The locals {i,uc,lc} are scratch.

  if (! ((sec == "gen.1") || (sec == "exo.1") || (sec == "lev.1") || \
         (sec == "num.1") || (sec == "deu.1") || (sec == "tot.1")))
    { data_error(("invalid section \"" sec "\"")); }

  # Two parallel alphabets: the k-th letter of {koi_ucs} is mapped to
  # the k-th letter of {koi_lcs}.  The literals are raw bytes of this
  # file's encoding (KOI8-R according to the header) and must not be
  # re-typed or re-encoded.
  # NOTE(review): depending on how the file is decoded for display, the
  # glyphs may look case-swapped relative to the variable names --
  # confirm against the raw bytes before editing.
  koi_ucs = "ёюабцдефгхийклмнопярстужвьызшэщчъ";
  koi_lcs = "ЁЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ";

  # Explode both alphabets into one-letter elements, indexed from 1
  # (splitting on the empty separator is a gawk extension):
  split(koi_ucs, uc, "");
  split(koi_lcs, lc, "");

  # Start from an empty table, then enter every letter of {koi_ucs}
  # (mapped to its {koi_lcs} partner) and every letter of {koi_lcs}
  # (mapped to itself):
  split("", koi_tolow);
  for (i = 1; (i in uc); i++) { koi_tolow[uc[i]] = lc[i]; }
  for (i = 1; (i in lc); i++) { koi_tolow[lc[i]] = lc[i]; }

  # A bracket-expression pattern built from {koi_lcs} was tried but,
  # unfortunately, does not work:
  # koi_word_pat = ( "^[" koi_lcs "]+$" );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a non-empty string consisting solely of the
  # letters listed in the pattern below, and 0 otherwise (including
  # for the empty string).  The arguments {smp}, {sec} and {type} are
  # not used.
  #
  # The letters are spelled out as an alternation instead of a
  # bracket expression; the literals are raw bytes of this file's
  # encoding and must not be re-typed.
  if (wd ~ /^(Ё|Ю|А|Б|Ц|Д|Е|Ф|Г|Х|И|Й|К|Л|М|Н|О|П|Я|Р|С|Т|У|Ж|В|Ь|Ы|З|Ш|Э|Щ|Ч|Ъ)+$/)
    { return 1; }
  else
    { return 0; }
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the class of the token {wd}, whose current class is {type},
  # from the section tag {sec} and the token's locator {cursec}.
  #
  #   sec     section tag: "gen.1", "exo.1", "lev.1", "num.1", "deu.1"
  #           (one book each) or "tot.1" (books 1 through 5).
  #   cursec  locator string, tested below for a leading "{bN}" book
  #           marker and a trailing marker such as "{tt}", "{cn}" or
  #           "{cN}{vN}".
  #   smp, curlin   not used here.
  #
  # Returns "n" for tokens to be deleted, "x" for rejected material,
  # and {type} unchanged otherwise.  (How the caller treats "n" versus
  # "x" is not visible in this file.)  Calls {arg_error} if {sec} is
  # not one of the known tags.

  # Delete any text outside Book {sec}:
  if (sec == "gen.1")
    { if (cursec !~ /^{b1}/) { return "n"; } }
  else if (sec == "exo.1")
    { if (cursec !~ /^{b2}/) { return "n"; } }
  else if (sec == "lev.1")
    { if (cursec !~ /^{b3}/) { return "n"; } }
  else if (sec == "num.1")
    { if (cursec !~ /^{b4}/) { return "n"; } }
  else if (sec == "deu.1")
    { if (cursec !~ /^{b5}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-5]}/) { return "n"; } }
  else
    { # Report the offending tag itself.  (The message used to be built
      # from a variable that is not defined in this function, so the
      # tag never appeared in it.)
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text, i.e. whose locator
  # does not end with a chapter marker followed by a verse marker:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks ("="):
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd,   n,i,tw)
{
  # Returns a cleaned-up copy of {wd}: every letter is replaced by its
  # image in the global table {koi_tolow}, and every hyphen is replaced
  # by a blank, so that a hyphenated compound comes back as several
  # blank-separated words.  Any other character that has no entry in
  # {koi_tolow} is dropped.  An empty {wd} yields an empty string.
  #
  #   smp, sec, type   not used here.
  #   n, i, tw         scratch locals.
  #
  # {koi_tolow} must have been filled beforehand (in this file that is
  # done by {smp_define_patterns}).

  # Must do the mapping by hand, one character at a time:
  n = length(wd); tw = "";
  for (i = 1; i <= n; i++)
    { if (substr(wd,i,1) == "-")
        { # The hyphen has no entry in {koi_tolow}, so it must be
          # handled here; mapping it through the table would silently
          # delete it and glue the two halves of the word together.
          tw = ( tw " " );
        }
      else
        { tw = ( tw koi_tolow[substr(wd,i,1)] ); }
    }
  return tw;
}