#! /usr/bin/gawk -f
# Last edited on 2023-05-10 14:36:26 by stolfi
# Sampling functions for ital/psp
# Italian - "I Promessi Sposi"
# Only alpha words from the text, split at hyphens and after 
# apostrophes, mapped to lowercase.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns; the only work done here is
  # validating the section name.  "tot.1" is the single section
  # accepted; any other value of {sec} is reported through
  # {data_error} (defined by the including script).  {smp} is unused.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} has the shape of an acceptable word, 0 otherwise.
  # Only {wd} is examined; {smp}, {sec} and {type} are ignored.
  #
  # An acceptable word is, in order:
  #   * an optional leading apostrophe;
  #   * one or more letters, each either "a".."z" or one of
  #     "ā", "č", "ė", "ō", "ų", "é" (note that "ņ" is NOT accepted);
  #   * an optional final apostrophe or "°".
  # Hence an apostrophe can never appear in the middle of the word,
  # and two apostrophes can never be adjacent.
  #
  # The accented letters are written as an alternation, not inside a
  # bracket expression, so the pattern itself spells out each letter.
  if (wd ~ /^[']?([a-z]|ā|č|ė|ō|ų|é)+('|°)?$/) { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the type code to use for word {wd}, given its original
  # {type} and the section tag string {cursec} it was found in.
  # {smp}, {sec} and {curlin} are not consulted.
  #
  # Result is "n" (delete), "x" (reject), or {type} unchanged.

  # Delete everything outside the text of ordinary numbered chapters,
  # i.e. unless {cursec} begins with "{b}{cN}{tx}", N = 1, 2, ...;
  # also delete titles, i.e. sections ending in "{tt}" or "{cn}":
  if ((cursec !~ /^{b}{c[1-9][0-9]*}{tx}/) || (cursec ~ /{(tt|cn)}$/)) { return "n"; }

  # What remains must end in "{tx}"; any other subsection of the
  # selected chapters is rejected as not being Italian prose:
  if (cursec !~ /{tx}$/) { return "x"; }

  # Non-punctuation keeps its type, and so does the paragraph
  # break "="; every other punctuation item is deleted:
  if ((type != "p") || (wd == "=")) { return type; }
  return "n";
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a normalized copy of {wd}: folded to lower case and with
  # blanks inserted at the points where it should be split into
  # separate words.  {smp}, {sec} and {type} are ignored.
  #
  # The steps below depend on each other and must stay in this order.

  # Step 1: lower case.  {tolower} is applied first; the accented
  # capitals used in this text are then mapped one by one, so that
  # the result does not depend on {tolower} handling them.
  # ("ņ" is mapped here too, even though {smp_is_good_word}
  # does not accept it.)
  wd = tolower(wd);
  gsub(/Ā/, "ā", wd);
  gsub(/Č/, "č", wd);
  gsub(/Ė/, "ė", wd);
  gsub(/Ō/, "ō", wd);
  gsub(/Ų/, "ų", wd);
  gsub(/É/, "é", wd);
  gsub(/Ņ/, "ņ", wd);

  # Step 2: every "~" becomes a blank.
  # NOTE(review): the file header speaks of splitting at hyphens, so
  # presumably the input encodes hyphens as "~" -- confirm.
  gsub(/[~]/, " ", wd);

  # Step 3: insert a blank after every apostrophe, so that the
  # apostrophe stays attached to the preceding piece ...
  gsub(/[']/, "' ", wd);
  # ... then re-attach an apostrophe that begins the whole string to
  # the piece that follows it.  The pattern is anchored, so a single
  # substitution is all that can ever happen.
  sub(/^['][ ]+/, "'", wd);

  return wd;
}