#! /usr/bin/gawk -f
# Last edited on 2012-05-05 23:26:20 by stolfilocal

# Sampling functions for ital/psp
# Portuguese - Dom Casmurro
# Only alpha words from the text, split at hyphens except obliques.
# Note that hyphens are encoded as "~", abbrev dots as "/".
# NOTE(review): the functions below test for a trailing "^" (not "/") as
# the abbreviation marker -- confirm which encoding is the current one.

# To be included in wds-to-tlw

# Accented letters are always written as plain literal sequences or as
# alternations, never inside bracket expressions.  A bracket expression
# such as [À] degenerates into a set of single bytes when gawk runs in a
# byte-oriented locale on UTF-8 text, and would then damage or miss every
# accented letter.

function smp_define_patterns(smp,sec)
  {
    # Validates the section name {sec}; this sample needs no patterns.
    # The only accepted section is "tot.1"; anything else is reported
    # through {data_error} (defined by the including program).
    # {smp} is not used here.
    if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
  }

function smp_is_good_word(smp,sec,type,wd)
  {
    # Returns 1 if the (already normalized) word {wd} is acceptable for
    # the sample, 0 otherwise.  {smp}, {sec} and {type} are not used.

    # Single-letter words:
    if (wd ~ /^([aeo]|é|ó|à)$/) { return 1; }

    # Single-letter abbreviations:
    if (wd ~ /^[a-z][\^]$/) { return 1; }

    # Accept only lowercase alpha, plus "áéíóúâêôàãõçü~", optional "^" at end:
    if (wd !~ /^([a-z]|á|é|í|ó|ú|â|ê|ô|à|ã|õ|ç|ü|~)+[\^]?$/) { return 0; }

    # No "ã|õ|ç|ü|~" at beginning:
    if (wd ~ /^(ã|õ|ç|ü|~)/) { return 0; }

    # No "õ|ç|ü|~" at end:
    if (wd ~ /(õ|ç|ü|~)$/) { return 0; }

    # No double hyphens:
    if (wd ~ /[~][~]/) { return 0; }

    # Should be enough:
    return 1;
  }

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
  {
    # Returns the (possibly changed) type code for word {wd} of type
    # {type} found in section {cursec}:
    #   "n"    for words outside the ordinary chapters, for titles, and
    #          for punctuation other than "=";
    #   "x"    for words in a selected chapter whose section does not
    #          end in "{tx}";
    #   "s"    for alpha words that are reclassified as symbols;
    #   {type} unchanged otherwise.
    # {smp}, {sec} and {curlin} are not used.

    # Delete all but ordinary chapters:
    if (cursec !~ /^{b}{c[1-9][0-9]*}{tx}/) { return "n"; }

    # Delete titles:
    if (cursec ~ /{(tt|cn)}$/) { return "n"; }

    # Within the selected sections, reject anything that is not prose
    # text (that is, whose section does not end in "{tx}"):
    if (cursec !~ /{tx}$/) { return "x"; }

    if (type == "p")
      {
        # Discard punctuation other than parag breaks:
        if (wd != "=") { return "n"; }
      }
    else if (type == "a")
      {
        # Single-letter words are symbols, with few exceptions:
        if (wd ~ /^[b-df-np-zB-DF-NP-Z~]$/) { return "s"; }

        # Words without vowels that are not abbrevs are symbols too:
        if (wd !~ /([AEIOU\^]|Á|É|Í|Ó|Ú|Â|Ê|Ô|À|a|e|i|o|u|á|é|í|ó|ú|â|ê|ô|à)/) { return "s"; }
      }
    return type;
  }

function smp_fix_word(smp,sec,type,wd)
  {
    # Returns {wd} normalized for sampling: mapped to lower case and
    # broken at hyphens ("~" becomes a blank), except that trailing
    # hyphenated pronouns/endings are glued back with "~".
    # {smp}, {sec} and {type} are not used.

    # Map upper case to lower case:
    wd = tolower(wd);

    # Cannot trust that LOCALE crap, so map the accented capitals by
    # hand.  Literal sequences (not bracket expressions) so that this
    # also works when gawk treats the text as bytes:
    gsub(/À/, "à", wd);
    gsub(/Á/, "á", wd);
    gsub(/É/, "é", wd);
    gsub(/Í/, "í", wd);
    gsub(/Ó/, "ó", wd);
    gsub(/Ú/, "ú", wd);
    gsub(/Â/, "â", wd);
    gsub(/Ê/, "ê", wd);
    gsub(/Ô/, "ô", wd);
    gsub(/Ã/, "ã", wd);
    gsub(/Õ/, "õ", wd);
    gsub(/Ü/, "ü", wd);
    gsub(/Ç/, "ç", wd);

    # Break at hyphens:
    gsub(/[~]/, " ", wd);

    # ... unless it is an oblique pronoun.  The rules below run once
    # each, in this order.  Every rule looks for a piece that is either
    # last or already followed by "~", so the glued part always grows
    # leftwards from the end of the word.

    # Final verb ending after a pronoun (these look like the
    # future/conditional endings of mesoclisis, as in "dar~se~ia"):
    if (match(wd, /[ ](ia[sm]?|íamos|íeis|ei|ás?|emos|eis|ão)$/))
      { wd = gensub(/[ ]([^ ]+)$/, "~\\1", "g", wd); }

    # "o", "a", "lo", "la", "no", "na", "lhe", "lho", "lha" and plurals:
    if (match(wd, /[ ]([ln]?[ao][s]?|lh[aoe][s]?)([~]|$)/))
      { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }

    # "te", "to", "ta", "tos", "tas", "vo", "vos":
    if (match(wd, /[ ](te|t[oa][s]?|vo[s]?)([~]|$)/))
      { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }

    # "me", "mo", "ma", "mos", "mas", "no", "nos":
    if (match(wd, /[ ](me|m[oa][s]?|no[s]?)([~]|$)/))
      { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }

    # "se":
    if (match(wd, /[ ](se)([~]|$)/))
      { wd = gensub(/[ ]([^ ]+)([~]|$)/, "~\\1\\2", "g", wd); }

    return wd;
  }