#! /usr/bin/gawk -f
# Last edited on 2012-05-05 23:12:50 by stolfilocal
# Sampling functions for span/qvi
# Spanish - "Don Quijote", in old spelling, lowercased, split at hyphens.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns; the only work done here is to
  # check that {sec} names one of the known sections ("one.1",
  # "two.1" or "tot.1"), reporting anything else through {data_error}.
  # The {smp} argument is not consulted.
  if ((sec == "one.1") || (sec == "two.1") || (sec == "tot.1")) { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is an acceptable word for this sample, 0 otherwise.
  # The {smp}, {sec} and {type} arguments are not consulted here.
  #
  # A good word consists only of lowercase letters (excluding "k" and
  # "w"), acute-accented vowels, "ç", "ñ", "ü" and apostrophe, subject
  # to the positional restrictions below.  Note that "ç" and "ñ" *can*
  # start words in the old spanish spelling.
  #
  # The non-ASCII letters are written as alternatives (not inside the
  # bracket expression) so that each one is matched as a whole literal.
  
  # Allowed alphabet only; this also rejects the empty string:
  if (wd !~ /^(['a-jl-vx-z]|á|é|í|ó|ú|ç|ñ|ü)+$/) { return 0; }
  
  # Apostrophe and "ü" are internal only, so they cannot begin a word:
  if (wd ~ /^('|ü)/) { return 0; }
  
  # Apostrophe, "ü", "ç", "ñ" cannot end a word:
  if (wd ~ /('|ç|ñ|ü)$/) { return 0; }
  
  # Apostrophe, "ç", "ñ", "ü" cannot be doubled:
  if (wd ~ /''/) { return 0; }
  if (wd ~ /çç/) { return 0; }
  if (wd ~ /ññ/) { return 0; }
  if (wd ~ /üü/) { return 0; }
  
  # Enough already:
  return 1;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for word {wd} of type
  # {type} found at section tag {cursec}, when extracting section {sec}.
  # The result is "n" for material to be deleted, "x" for material to
  # be rejected, or {type} itself when the word is kept as is.
  # The {smp} and {curlin} arguments are not consulted here.

  # Validate {sec}, then drop anything that is not inside a chapter
  # body of the installment(s) selected by {sec}:
  if ((sec != "one.1") && (sec != "two.1") && (sec != "tot.1"))
    { arg_error(("bad output section \"" sec "\"")); }
  else if ((sec == "one.1") && (cursec !~ /^{m1}{p[0-9]+}{c[1-9][0-9]*}{tx}/))
    { # Outside chapter bodies of the "{m1}" installment.
      return "n";
    }
  else if ((sec == "two.1") && (cursec !~ /^{m2}{p[0-9]+}{c[1-9][0-9]*}{tx}/))
    { # Outside chapter bodies of the "{m2}" installment.
      return "n";
    }
  else if ((sec == "tot.1") && (cursec !~ /^{m[12]}{p[0-9]+}{c[1-9][0-9]*}{tx}/))
    { # Outside chapter bodies of both installments.
      return "n";
    }

  # Titles are deleted too:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # What remains must be Spanish prose (main text or the body of a
  # letter); anything else inside the selection is rejected:
  if (cursec !~ /{tx}(|{let}{txl})$/) { return "x"; }

  # Of the punctuation, only the parag break "=" survives:
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a normalized copy of {wd}: folded to lower case, with
  # every "~" replaced by a blank.  The {smp}, {sec} and {type}
  # arguments are not consulted here.

  # Map upper case to lower case.
  wd = tolower(wd);
  # Cannot trust that LOCALE crap: {tolower} may leave the accented
  # capitals untouched, so fold them explicitly.  The patterns are
  # plain literals rather than bracket expressions because a bracket
  # around a multi-byte character matches each of its bytes separately
  # when the locale is byte-oriented, which would corrupt the text;
  # a literal matches the exact byte sequence in any locale.
  gsub(/Ç/, "ç", wd);
  gsub(/Ñ/, "ñ", wd);
  gsub(/Á/, "á", wd);
  gsub(/É/, "é", wd);
  gsub(/Í/, "í", wd);
  gsub(/Ó/, "ó", wd);
  gsub(/Ú/, "ú", wd);
  gsub(/Ü/, "ü", wd);
  # Break at hyphens (presumably encoded as "~" in the input -- TODO confirm):
  gsub(/~/, " ", wd);
  return wd;
}