#! /usr/bin/gawk -f
# Last edited on 2012-05-05 23:12:50 by stolfilocal

# Sampling functions for span/qvi
# Spanish - "Don Quijote", in old spelling, lowercased, split at hyphens.
# To be included in wds-to-tlw
#
# The functions below call {data_error} and {arg_error}, which are not
# defined in this file; they must be supplied by the including program.

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec}; this sample needs no patterns,
  # so nothing else is set up.  The parameter {smp} is not used.
  # Calls {data_error} unless {sec} is "one.1", "two.1", or "tot.1".
  if ((sec != "one.1") && (sec != "two.1") && (sec != "tot.1")) {
    data_error(("invalid section \"" sec "\""));
  }
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is an acceptable word for this sample, 0 otherwise.
  # Only {wd} is examined; {smp}, {sec} and {type} are not used.
  #
  # Accepted characters are lowercase alpha (minus "k" and "w"),
  # acute-accented vowels, "ç", "ñ", "ü", and apostrophe.
  # Apostrophe and "ü" are internal only.  Note that "ç" and "ñ"
  # *can* start words in the old Spanish spelling.

  # The special letters are written as alternatives rather than inside
  # the bracket expression, so that they are matched as whole
  # characters whatever the locale or encoding:
  if (wd !~ /^(['a-jl-vx-z]|á|é|í|ó|ú|ç|ñ|ü)+$/) { return 0; }

  # Apostrophe and "ü" cannot begin word:
  if (wd ~ /^('|ü)/) { return 0; }

  # Apostrophe, "ü", "ç", "ñ" cannot end word:
  if (wd ~ /('|ç|ñ|ü)$/) { return 0; }

  # Apostrophe, "ç", "ñ", "ü" cannot be doubled:
  if (wd ~ /''/) { return 0; }
  if (wd ~ /çç/) { return 0; }
  if (wd ~ /ññ/) { return 0; }
  if (wd ~ /üü/) { return 0; }

  # Enough already:
  return 1;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for the word {wd} of
  # type {type} found in the source section {cursec}, when the
  # requested output section is {sec}:
  #
  #   "n"     for material outside the chapter bodies selected by {sec},
  #           for titles, and for punctuation other than "=";
  #   "x"     for material inside the selected chapters that is not
  #           main text or body of letters;
  #   {type}  unchanged, in every other case.
  #
  # Calls {arg_error} if {sec} is not "one.1", "two.1", or "tot.1".
  # The parameters {smp} and {curlin} are not used.

  # Delete any material outside the section {sec}:
  if (sec == "one.1") {
    # Omit material outside chapter bodies of the "{m1}" installment:
    if (cursec !~ /^{m1}{p[0-9]+}{c[1-9][0-9]*}{tx}/) { return "n"; }
  } else if (sec == "two.1") {
    # Omit material outside chapter bodies of the "{m2}" installment:
    if (cursec !~ /^{m2}{p[0-9]+}{c[1-9][0-9]*}{tx}/) { return "n"; }
  } else if (sec == "tot.1") {
    # Omit material outside chapter bodies of either installment:
    if (cursec !~ /^{m[12]}{p[0-9]+}{c[1-9][0-9]*}{tx}/) { return "n"; }
  } else {
    arg_error(("bad output section \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Within the selected sections, reject anything that is not Spanish
  # prose (main text or body of letters):
  if (cursec !~ /{tx}(|{let}{txl})$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns {wd} normalized for this sample: mapped to lower case
  # (including the accented capitals listed below) and with every "~"
  # replaced by a blank.  Only {wd} is examined; {smp}, {sec} and
  # {type} are not used.

  # Map upper case to lower case:
  wd = tolower(wd);

  # Cannot trust that LOCALE crap, so map the accented capitals by hand.
  # Each capital is matched as a plain literal, NOT as a bracket
  # expression: in a byte-oriented locale a bracket expression holding
  # a multi-byte character is a set of single bytes, and the
  # substitution would then replace each byte separately and corrupt
  # the text.  A literal matches the whole character in any locale.
  gsub(/Ç/, "ç", wd);
  gsub(/Ñ/, "ñ", wd);
  gsub(/Á/, "á", wd);
  gsub(/É/, "é", wd);
  gsub(/Í/, "í", wd);
  gsub(/Ó/, "ó", wd);
  gsub(/Ú/, "ú", wd);
  gsub(/Ü/, "ü", wd);

  # Break at hyphens.  NOTE(review): the character actually replaced
  # is "~", presumably the hyphen marker of the input encoding --
  # confirm against the producer of the word stream.
  gsub(/[~]/, " ", wd);

  return wd;
}