# Last edited on 2002-01-16 02:59:56 by stolfi

# Sampling functions for span/qvi
# Spanish - Cervantes's "Don Quijote", lowercased, in old spelling.

# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum)
{
  # Decides whether a text line belongs to the sample for subsection {subsec}.
  #
  # Returns 1 if the line should be kept, 0 otherwise.  The {linenum}
  # argument is accepted but is not used by this function.
  #
  # Consider only running text (unit type "P") in
  # the main body of books (parts "cdefp"),
  # skipping Cervantes's prologues (parts "bn")
  # and non-Cervantes material.

  # NOTE: this must be a comparison ("=="), not an assignment ("=").
  # An assignment here would overwrite {subsec} and always be true,
  # so that invalid subsections would never be reported.
  if (subsec == "bod.1")
    {
      # Keep the line if {chapter} begins with one of "cdefp"
      # and {unit} begins with "P".
      return ((chapter ~ /^[cdefp]/) && (unit ~ /^[P]/));
    }
  else
    {
      # {arg_error} is defined elsewhere; presumably it reports the
      # message and aborts -- verify against its definition.
      arg_error(("bad subsection \"" subsec "\""));
    }
}

function fix_raw_word(word)
{
  # Returns {word} converted to lower case.
  #
  # Map upper case to lower case.
  # It seems that "tolower" does not handle uppercase accents,
  # so the accented capitals, "Ç" and "Ñ" are mapped explicitly.
  word = tolower(word);
  gsub(/[Ç]/, "ç", word);
  gsub(/[Ñ]/, "ñ", word);
  gsub(/[Á]/, "á", word);
  gsub(/[É]/, "é", word);
  gsub(/[Í]/, "í", word);
  gsub(/[Ó]/, "ó", word);
  gsub(/[Ú]/, "ú", word);
  return word;
}

function define_patterns()
{
  # No patterns needed
}

function is_good_word(word)
{
  # Returns 1 if {word} is an acceptable word for this text, 0 otherwise.
  #
  # Accept lowercase alpha (with acute accents,
  # "ç" and "ñ" but no "k" or "w"); also
  # apostrophe, hyphen, and "ü" (these are internal only).
  # Note that "ç" and "ñ" *can* start words in the old spanish spelling.
  #
  # The first pattern accepts a single-letter word (plain or acute-accented
  # letter only).  The second accepts longer words: a first letter (which
  # may also be "ç" or "ñ"), followed by one or more letters, each
  # optionally preceded by one of "-", "'", "ü", "ç", "ñ".  Thus those
  # special characters can never end a word nor occur twice in a row.
  return ( \
    (word ~ /^[a-jl-vx-záéíóú]$/) || \
    (word ~ /^[a-jl-vx-záéíóúçñ]([-'üçñ]?[a-jl-vx-záéíóú])+$/) \
  );
}