#! /usr/bin/gawk -f
# Last edited on 2012-05-05 23:26:20 by stolfilocal
# Sampling functions for ital/psp
# Portuguese - Dom Casmurro
# Only alpha words from the text, split at hyphens except obliques. 
# Note that hyphens are encoded as "~", abbreviation dots as "^".
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns; the only work done here is to
  # validate the section name.  "tot.1" is the sole accepted section;
  # any other value is reported through {data_error}.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is an acceptable word for this sample, 0 otherwise.
  # The {smp}, {sec} and {type} arguments are not consulted.

  # Accepted outright: the single-letter words "a", "e", "o", "é", "ó", "à",
  # and any single lowercase letter followed by the abbreviation mark "^":
  if ((wd ~ /^([aeo]|é|ó|à)$/) || (wd ~ /^[a-z][\^]$/)) { return 1; }
  
  # Everything else must be a non-empty run of lowercase letters,
  # "áéíóúâêôàãõçü" or "~", optionally followed by a single "^":
  if (wd !~ /^([a-z]|á|é|í|ó|ú|â|ê|ô|à|ã|õ|ç|ü|~)+[\^]?$/) { return 0; }
  
  # Within that alphabet, reject a word that
  #   - begins with "ã", "õ", "ç", "ü" or "~", or
  #   - ends with "õ", "ç", "ü" or "~", or
  #   - contains two consecutive "~".
  # Anything that survives is considered good.
  return !((wd ~ /^(ã|õ|ç|ü|~)/) || (wd ~ /(õ|ç|ü|~)$/) || (wd ~ /[~][~]/));
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code for word {wd} of type {type},
  # found in section {cursec}.  The {smp}, {sec} and {curlin} arguments
  # are not consulted.

  # Outside the ordinary numbered chapters, and inside titles
  # (sections ending in "{tt}" or "{cn}"), the word is deleted ("n"):
  if ((cursec !~ /^{b}{c[1-9][0-9]*}{tx}/) || (cursec ~ /{(tt|cn)}$/)) { return "n"; }
  
  # Inside the selected chapters, anything whose section does not
  # end in "{tx}" (plain text) is rejected ("x"):
  if (cursec !~ /{tx}$/) { return "x"; }
  
  # Punctuation: only the paragraph break "=" is kept, the rest is deleted:
  if (type == "p") { return ((wd == "=") ? type : "n"); }
  
  # Types other than alpha words pass through unchanged:
  if (type != "a") { return type; }

  # Alpha words.  A lone letter other than "a", "e", "o" (either case),
  # or a lone "~", is a symbol:
  if (wd ~ /^[b-df-np-zB-DF-NP-Z~]$/) { return "s"; }

  # A word with no vowel and no abbreviation mark "^" is a symbol too;
  # otherwise it stays an alpha word:
  return ((wd ~ /([AEIOU\^]|Á|É|Í|Ó|Ú|Â|Ê|Ô|À|a|e|i|o|u|á|é|í|ó|ú|â|ê|ô|à)/) ? type : "s");
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a normalized copy of {wd}: folded to lower case, with each
  # hyphen ("~") turned into a blank so that the compound is split into
  # separate words, except that certain trailing pieces (see below) are
  # re-attached with "~".  The {smp}, {sec} and {type} arguments are
  # not consulted.

  # Map upper case to lower case:
  wd = tolower(wd);
  # Cannot trust the locale to make tolower() handle accented letters,
  # so map them explicitly.  The letters are written as plain literals
  # rather than bracket expressions: if this file is stored in a
  # multi-byte encoding (e.g. UTF-8) and awk runs in a byte-oriented
  # locale, a bracket expression would match the individual bytes of
  # the letter and corrupt the word.
  gsub(/À/, "à", wd);
  gsub(/Á/, "á", wd);
  gsub(/É/, "é", wd);
  gsub(/Í/, "í", wd);
  gsub(/Ó/, "ó", wd);
  gsub(/Ú/, "ú", wd);

  gsub(/Â/, "â", wd);
  gsub(/Ê/, "ê", wd);
  gsub(/Ô/, "ô", wd);
  
  gsub(/Ã/, "ã", wd);
  gsub(/Õ/, "õ", wd);

  gsub(/Ü/, "ü", wd);
  
  gsub(/Ç/, "ç", wd);
  
  # Break at hyphens ...
  gsub(/[~]/, " ", wd);

  # ... and then undo the break, working from the end of the word
  # backwards.  Each step below looks for a blank-preceded piece that is
  # either at the end of {wd} or immediately followed by an already
  # restored "~", and replaces that blank by "~".

  # A final piece that is one of "ia", "ias", "iam", "íamos", "íeis",
  # "ei", "á", "ás", "emos", "eis", "ão":
  if (match(wd, /[ ](ia[sm]?|íamos|íeis|ei|á[s]?|emos|eis|ão)$/))
    { wd = gensub(/[ ]([^ ]+)$/, "~\\1", "g", wd); }
  # Oblique pronouns "a", "o", "la", "lo", "na", "no" (and plurals),
  # "lha", "lho", "lhe" (and plurals):
  if (match(wd, /[ ]([ln]?[ao][s]?|lh[aoe][s]?)([~]|$)/))
    { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }
  # "te", "to", "ta" (and plurals), "vo", "vos":
  if (match(wd, /[ ](te|t[oa][s]?|vo[s]?)([~]|$)/))
    { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }
  # "me", "mo", "ma" (and plurals), "no", "nos":
  if (match(wd, /[ ](me|m[oa][s]?|no[s]?)([~]|$)/))
    { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }
  # "se":
  if (match(wd, /[ ](se)([~]|$)/))
    { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }
  return wd;
}