#! /usr/bin/gawk -f
# Last edited on 2023-05-10 14:36:36 by stolfi
# Sampling functions for germ/sim
# German - Grimmelshausen's "Der Abenteuerliche Simplicissimus Teutsch"
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample needs no patterns at all; the only job here is to 
  # validate the section name. "tot.1" is the sole section accepted; 
  # any other name is reported through {data_error}.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is an acceptable word, 0 otherwise.  
  # The arguments {smp}, {sec} and {type} are not consulted.
  #
  # A word is accepted when all three conditions hold:
  #   (1) it is a non-empty string of lowercase "a"-"z", "ä", "ö", "ü", "ß" 
  #       and apostrophes, optionally followed by a single final "^";
  #   (2) it has no two apostrophes in a row;
  #   (3) it does not begin with the sharp-ss "ß".
  #
  # NOTE(review): an earlier comment also said that an apostrophe cannot
  # occur next to "^", but no such test is implemented -- confirm
  # whether one is wanted.
  return ((wd ~ /^(['a-z]|ä|ö|ü|ß)+[\^]?$/) && (wd !~ /['][']/) && (wd !~ /^ß/));
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for word {wd} of type {type},
  # found in the section whose tag string is {cursec}.  The arguments
  # {smp}, {sec} and {curlin} are not consulted.

  # Type "n" for anything outside an ordinary chapter, i.e. a section
  # tag that does not start with "{bN}{cM}", N in 1..6 and M a positive
  # number without leading zeros (this leaves out the Beschluss), 
  # and also for titles (tags ending in "{tt}" or "{cn}"):
  if ((cursec !~ /^{b[1-6]}{c[1-9][0-9]*}/) || (cursec ~ /{(tt|cn)}$/)) { return "n"; }
  
  # Inside the selected chapters, type "x" for anything that is not
  # German prose (main text or body of letters), i.e. whose tag
  # does not end in "{tx}":
  if (cursec !~ /{tx}$/) { return "x"; }
  
  # Punctuation (type "p") becomes "n" unless it is the parag break "=";
  # everything else keeps the type it came with:
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a cleaned-up version of the raw word {wd}: hyphens become
  # blanks, a leading "d'" is detached from a following capitalized
  # word, and all letters (including "Ä", "Ö", "Ü") are lowercased.  
  # The arguments {smp}, {sec} and {type} are not consulted.
  #
  # The umlauts are written as plain sequences/alternatives, never
  # inside a bracket expression "[...]": in a byte-oriented locale
  # (e.g. LC_ALL=C) a bracket holding a multibyte UTF-8 character is a
  # set of single bytes, so "[Ä]" would match -- and clobber -- the
  # lead byte shared by "ä", "ö", "ü", "ß", "Ä", "Ö" and "Ü".

  # Break at hyphens:
  gsub(/[-]/, " ", wd);

  # Break between "d'" and a noun (before lowercasing it!):
  wd = gensub(/^d[']([A-Z]|Ä|Ö|Ü)/, "d' \\1", 1, wd);

  # Map upper case to lower case:
  wd = tolower(wd);
  # {tolower} handles non-ASCII letters only in a suitable locale, 
  # so map the uppercase umlauts explicitly:
  gsub(/Ä/, "ä", wd);
  gsub(/Ö/, "ö", wd);
  gsub(/Ü/, "ü", wd);
  # Sharp-ss is never word-initial so it has no uppercase 
  # (It is written "SS" in all-caps, which is ambiguous, 
  # but we should have excluded titles...)
  return wd;
}