#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:26:13 by stolfilocal
# Sampling functions for fran/tal
# French - "De la Terre a la Lune"
# Only alpha words from the text, split at hyphens and after 
# apostrophes, mapped to lowercase.
# Note that hyphens are encoded as "~", abbrev dots as "^".
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns at all; the only work done here is
  # to validate the section name.  "tot.1" is the single section that
  # is accepted; any other name is reported through data_error.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if the word "wd" is well-formed for this sample, 0 otherwise.
  # The arguments "smp", "sec" and "type" are not used by this test.
  #
  # A good word consists only of lowercase ASCII letters, lowercase
  # letters with grave, acute, circumflex or diaeresis accents, "ç",
  # apostrophes, hyphens (encoded as "~") and abbreviation dots
  # (encoded as "^"), subject to the placement rules checked below.
  # The empty string is not a good word.

  # Check for invalid characters.  Accented letters are written as
  # alternatives rather than inside a bracket expression so that the
  # match also works when the regex engine treats the text as bytes.
  # The last group ("ā", "č", "ė", "ō", "ų", "ę", "į") is what the
  # French letters "à", "è", "ì", "ò", "ù", "ê", "ç" turn into when
  # Latin-1 text is decoded as ISO-8859-4; those forms were the only
  # ones accepted previously and are kept for compatibility.
  if (wd !~ /^(['~^a-z]|à|è|ì|ò|ù|á|é|í|ó|ú|â|ê|î|ô|û|ä|ë|ï|ö|ü|ç|ā|č|ė|ō|ų|ę|į)+$/) { return 0; }

  # Apostrophes, hyphens and "^" can't be doubled or adjacent to each other:
  if (wd ~ /['~^]['~^]/) { return 0; }

  # The abbreviation mark "^" can only occur at the end:
  if (wd ~ /[\^]./) { return 0; }

  # An apostrophe cannot occur at the beginning of a word (but can occur at the end):
  if (wd ~ /^[']/) { return 0; }

  # A hyphen can only occur inside a word:
  if (wd ~ /^[~]/) { return 0; }
  if (wd ~ /[~]$/) { return 0; }

  # Enough checking:
  return 1;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of the word "wd", given its current
  # "type" and the section tag string "cursec" it was found in.
  # Returns "n" for material to be dropped, "x" for material to be
  # rejected, and the unchanged "type" otherwise.  The arguments
  # "smp", "sec" and "curlin" are not used.

  # Drop everything outside the text of a numbered chapter, and also
  # any title material (sections ending in "{tt}" or "{cn}"):
  if ((cursec !~ /^{b}{c[1-9][0-9]*}{tx}/) || (cursec ~ /{(tt|cn)}$/)) { return "n"; }

  # Inside a chapter, only sections that end in "{tx}" are wanted;
  # anything else is rejected:
  if (cursec !~ /{tx}$/) { return "x"; }

  # The only punctuation that survives is the paragraph break "=":
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd)
{
  # Normalizes the raw word "wd" and returns the result.  The returned
  # string may contain blanks marking the points where the word is
  # split into pieces.  The arguments "smp", "sec" and "type" are not used.
  #
  # Steps: map to lowercase, replace hyphens ("~") by blanks, and
  # insert a blank after every apostrophe, with the two exceptions
  # described below.

  # Map upper case to lower case:
  wd = tolower(wd);

  # tolower may leave accented capitals alone, depending on the locale,
  # so they are mapped explicitly.  Each capital is written as a plain
  # literal, NOT inside a bracket expression: in a byte-oriented locale
  # a bracket such as [X] around a multibyte letter matches each of its
  # bytes separately, and the substitution would then corrupt every
  # accented letter that shares a byte with it.

  # Grave accents:
  gsub(/À/, "à", wd);
  gsub(/È/, "è", wd);
  gsub(/Ì/, "ì", wd);
  gsub(/Ò/, "ò", wd);
  gsub(/Ù/, "ù", wd);

  # Acute accents:
  gsub(/Á/, "á", wd);
  gsub(/É/, "é", wd);
  gsub(/Í/, "í", wd);
  gsub(/Ó/, "ó", wd);
  gsub(/Ú/, "ú", wd);

  # Circumflex accents:
  gsub(/Â/, "â", wd);
  gsub(/Ê/, "ê", wd);
  gsub(/Î/, "î", wd);
  gsub(/Ô/, "ô", wd);
  gsub(/Û/, "û", wd);

  # Diaeresis:
  gsub(/Ä/, "ä", wd);
  gsub(/Ë/, "ë", wd);
  gsub(/Ï/, "ï", wd);
  gsub(/Ö/, "ö", wd);
  gsub(/Ü/, "ü", wd);

  # Cedilla:
  gsub(/Ç/, "ç", wd);

  # Letters that the French grave/circumflex/cedilla capitals turn into
  # when Latin-1 text is decoded as ISO-8859-4.  These were the only
  # forms mapped previously and are kept for compatibility:
  gsub(/Ā/, "ā", wd);
  gsub(/Č/, "č", wd);
  gsub(/Ė/, "ė", wd);
  gsub(/Ō/, "ō", wd);
  gsub(/Ų/, "ų", wd);
  gsub(/Ę/, "ę", wd);
  gsub(/Į/, "į", wd);

  # Break at hyphens:
  gsub(/[~]/, " ", wd);
  # ... except before epenthetic "~t~": the "t" stays attached to the
  # preceding piece, so "va~t~il" becomes "va~t il":
  gsub(/[ ][t][ ]/, "~t ", wd);
  # Note that "J.-T." is encoded as "J^~T^" hence it will stay split.

  # Break after apostrophes:
  gsub(/[']/, "' ", wd);
  # ... except before English Genitive "'s", either followed by
  # another piece or at the very end of the word:
  gsub(/['][ ][s][ ]/, "'s ", wd);
  gsub(/['][ ][s]$/, "'s", wd);
  return wd;
}