#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:13:46 by stolfilocal
# Sampling functions for russ/pic
# Russian - "Piknik na obochine" by the Brothers Strugatsky
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample uses no patterns, so there is nothing to set up.
  # The only job here is to validate the section name: "tot.1" is
  # the sole section accepted; anything else is reported through
  # {data_error}.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is a well-formed word of this sample, 0 otherwise.
  # The arguments {smp}, {sec} and {type} are not used by this test.
  #
  # A well-formed word consists of:
  #   * letters, namely lowercase "a".."z" or "é"; the first
  #     character must be a letter;
  #   * at most one apostrophe or "^" after each letter, so these
  #     marks are never doubled, never adjacent to each other, and
  #     never word-initial;
  #   * optionally, one "°" as the very last character.
  #
  # The original notes say that "é" seems to occur only at the
  # beginning of words in this text; the pattern does not enforce
  # that, and accepts "é" wherever a letter is allowed.
  #
  # Every non-ASCII character is enclosed in its own group, so that
  # "|" and "?" apply to the whole character even when the regexp
  # engine works byte by byte (C locale, or gawk -b).  A bare "°?"
  # would in that case make only the last byte of "°" optional and
  # leave its first byte mandatory, rejecting every ordinary word.
  return (wd ~ /^([a-z]|é)(['^]?([a-z]|é))*['^]?(°)?$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type of the token {wd}, whose provisional type
  # is {type}, found in the section whose tag string is {cursec}.
  # Returns "n", "x", or the unchanged {type}.  The arguments
  # {smp}, {sec} and {curlin} are not consulted.

  # Result "n": anything outside the ordinary numbered chapters
  # "{b}{cN}..." (this leaves out the opening quote), and, inside
  # those chapters, the title sections "{tt}" and "{cn}".
  if ((cursec !~ /^{b}{c[1-9][0-9]*}/) || (cursec ~ /{(tt|cn)}$/)) { return "n"; }

  # Running text "{tx}": keep the token with its current type, except
  # that punctuation (type "p") other than the paragraph break "="
  # becomes "n".
  if (cursec ~ /{tx}$/)
    { return (((type != "p") || (wd == "=")) ? type : "n"); }

  # Result "x": whatever else remains in the selected chapters is
  # not Russian prose.
  return "x";
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the normalized form of {wd}: folded to lower case, with
  # every hyphen replaced by a blank.  The arguments {smp}, {sec}
  # and {type} are not used.

  # Fold to lower case.  The capital "É" is then mapped by hand,
  # because {tolower} handles it only under a suitable locale.
  # The soft-sign "'" and hard-sign "^" need no such treatment: they
  # are never word-initial, so (assuming there are no all-caps words)
  # they never occur in upper case.
  wd = tolower(wd);
  gsub(/É/, "é", wd);

  # Split compound words at their hyphens:
  gsub(/[-]/, " ", wd);

  return wd;
}