#! /usr/bin/gawk -f
# Last edited on 2023-05-14 13:22:31 by stolfi
# Sampling functions for russ/ptt
# The "Pentateuch" in Russian (Synodal), lowercased, in KOI8-R encoding.
# To be included in wds-to-tlw

# ** MUST BE SAVED IN ISO-LATIN-1 **

function smp_define_patterns(smp,sec,   i,uc,lc)
{
  # Checks that {sec} is one of the known section names and (re)builds
  # the global case-folding table {koi_tolow}, together with the global
  # alphabet strings {koi_ucs} and {koi_lcs}. The parameter {smp} is
  # not used here; {i}, {uc}, {lc} are local work variables.

  # Complain unless {sec} names one of the six recognized sections:
  if (!((sec == "gen.1") || (sec == "exo.1") || (sec == "lev.1") ||
        (sec == "num.1") || (sec == "deu.1") || (sec == "tot.1")))
    { data_error(("invalid section \"" sec "\"")); }

  # The two alphabets, letter {i} of {koi_ucs} pairing with letter {i}
  # of {koi_lcs}. NOTE(review): the file header says the text is KOI8-R,
  # so these glyphs may look case-swapped when viewed in another
  # encoding -- confirm before editing the literals.
  koi_ucs = "іабвгдежзийклмнопрстуфхцчшщъыьэюя";
  koi_lcs = "ЈАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ";

  # Explode both alphabets into one-character arrays:
  split(koi_ucs, uc, "");
  split(koi_lcs, lc, "");

  # Start from an empty table, then map every {koi_ucs} letter to its
  # {koi_lcs} partner, and every {koi_lcs} letter to itself:
  delete koi_tolow;
  for (i in uc)
    { koi_tolow[uc[i]] = lc[i]; }
  for (i in lc)
    { koi_tolow[lc[i]] = lc[i]; }

  # This does not work unfortunately:
  # koi_word_pat = ( "^[" koi_lcs "]+$" );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a non-empty string consisting solely of the
  # letters listed in the pattern below (the lowercase alphabet),
  # and 0 otherwise. The parameters {smp}, {sec}, {type} are not used.

  # The pattern is an alternation of one-letter bracket expressions,
  # anchored at both ends of the word:
  if (wd ~ /^([Ј]|[А]|[Б]|[В]|[Г]|[Д]|[Е]|[Ж]|[З]|[И]|[Й]|[К]|[Л]|[М]|[Н]|[О]|[П]|[Р]|[С]|[Т]|[У]|[Ф]|[Х]|[Ц]|[Ч]|[Ш]|[Щ]|[Ъ]|[Ы]|[Ь]|[Э]|[Ю]|[Я])+$/)
    { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of the token {wd}, whose current type
  # is {type} and whose location tag is {cursec}, for section {sec}.
  # Returns "n" for tokens to be deleted, "x" for rejected material,
  # and {type} unchanged otherwise. The parameters {smp} and {curlin}
  # are used only by the commented-out debugging line below.

  # Delete any text outside Book {sec}:
  # printf " ## %s/%s {%s}{%s} %s:[%s]\n", smp,sec,cursec,curlin,type,wd > "/dev/stderr";

  if (sec == "gen.1")
    { if (cursec !~ /^{b1}/) { return "n"; } }
  else if (sec == "exo.1")
    { if (cursec !~ /^{b2}/) { return "n"; } }
  else if (sec == "lev.1")
    { if (cursec !~ /^{b3}/) { return "n"; } }
  else if (sec == "num.1")
    { if (cursec !~ /^{b4}/) { return "n"; } }
  else if (sec == "deu.1")
    { if (cursec !~ /^{b5}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-5]}/) { return "n"; } }
  else
    { # Report the offending section name. (This used to interpolate
      # an unset variable, so the message showed an empty name.)
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text, namely anything whose
  # location tag does not end with a chapter tag and a verse tag:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd,   n,i,tw,ch)
{
  # Returns a cleaned copy of {wd}: each letter is replaced by its
  # lowercase form according to the global table {koi_tolow}, and each
  # hyphen is replaced by a blank, so that hyphenated compounds break
  # into separate words. Any other character that has no entry in
  # {koi_tolow} is dropped. The parameters {smp}, {sec}, {type} are not
  # used; {n}, {i}, {tw}, {ch} are local work variables.

  # Must do the mapping by hand:
  n = length(wd); tw = "";
  for (i = 1; i <= n; i++)
    { ch = substr(wd,i,1);
      if (ch == "-")
        { # Hyphens must be handled here: they are not in {koi_tolow},
          # so a plain table lookup would silently delete them.
          tw = ( tw " " );
        }
      else if (ch in koi_tolow)
        { tw = ( tw koi_tolow[ch] ); }
      # Else drop {ch}. Testing with {in} avoids creating empty
      # entries in the global table for unknown characters.
    }

  return tw;
}