# Last edited on 2002-01-16 02:59:56 by stolfi
# Sampling functions for span/qvi
# Spanish - Cervantes's "Don Quijote", lowercased, in old spelling.
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum)
{
  # Decides whether an input line belongs to the sample subsection `subsec`.
  #
  #   subsec   - name of the subsection being extracted (only "bod.1" is known).
  #   chapter  - chapter/part tag of the line; its first letter is tested.
  #   unit     - unit type tag of the line; its first letter is tested.
  #   linenum  - line number (not used by this sample's selection rule).
  #
  # Returns 1 if the line should be kept, 0 otherwise.
  # Calls `arg_error` (defined by the including script) for an unknown `subsec`.
  #
  # Consider only running text (unit type "P") in
  # the main body of books (parts "cdefp"),
  # skipping Cervantes's prologues (parts "bn")
  # and non-Cervantes material.

  # Must be a comparison ("=="): a plain "=" would assign "bod.1" to subsec,
  # which is always true and would make the error branch unreachable.
  if (subsec == "bod.1")
    { return ((chapter ~ /^[cdefp]/) && (unit ~ /^[P]/)); }
  else
    { arg_error(("bad subsection \"" subsec "\"")); }
}

function fix_raw_word(word)
{
  # Returns `word` mapped to lower case, including the accented
  # letters and "ç"/"ñ" used in the text.
  #
  # "tolower" is applied first; it seems that it does not handle
  # uppercase accents, so those are mapped explicitly afterwards.
  #
  # Each pattern is the literal character, not a bracket expression:
  # in a byte-oriented awk reading UTF-8, "[Ç]" would be a set of single
  # bytes and every matching byte would be replaced by the whole
  # lowercase letter, corrupting the word.  A literal pattern matches
  # the complete character in both byte- and character-oriented awks.
  word = tolower(word);
  gsub(/Ç/, "ç", word);
  gsub(/Ñ/, "ñ", word);
  gsub(/Á/, "á", word);
  gsub(/É/, "é", word);
  gsub(/Í/, "í", word);
  gsub(/Ó/, "ó", word);
  gsub(/Ú/, "ú", word);
  return word;
}

function define_patterns()
{
  # Intentionally empty: this sample (Spanish, "Don Quijote") sets up
  # no patterns here; the word tests use literal regexes instead.
  # NOTE(review): presumably kept so that the including scripts can
  # call it unconditionally -- confirm against the callers.
  # No patterns needed 
}

function is_good_word(word)
{
  # Returns 1 if `word` is an acceptable word of the sample, 0 otherwise.
  #
  # The alphabet is lowercase "a"-"z" without "k" and "w", plus the
  # acute-accented vowels.  "ç" and "ñ" are also letters, and in the
  # old Spanish spelling they *can* start a word.  Apostrophe, hyphen
  # and "ü" may appear only inside a word, never at either end.

  # Case 1: a single plain or accented letter.
  if (word ~ /^[a-jl-vx-záéíóú]$/)
    { return 1; }

  # Case 2: a leading letter (possibly "ç" or "ñ"), then one or more
  # plain/accented letters, each optionally preceded by a single
  # apostrophe, hyphen, "ü", "ç" or "ñ".
  if (word ~ /^[a-jl-vx-záéíóúçñ]([-'üçñ]?[a-jl-vx-záéíóú])+$/)
    { return 1; }

  return 0;
}