#! /usr/bin/gawk -f
# Last edited on 2023-05-10 14:36:26 by stolfi

# Sampling functions for ital/psp
# Italian - "I Promessi Sposi"
# Only alpha words from the text, split at hyphens and after
# apostrophes, mapped to lowercase.

# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Checks the section name; this sample needs no patterns.
  #   smp  not used by this function.
  #   sec  section name; only "tot.1" is accepted.
  # Any other section is reported through data_error(), which is
  # defined outside this file. Nothing is returned.
  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if wd is an acceptable word for this sample, 0 otherwise.
  #   smp, sec, type  not used by this function.
  #   wd              the candidate word.
  #
  # Accept only lowercase alpha, "àèìòùé" (not "ñ"), plus apostrophe and "°".
  # Apostrophes can't be doubled.
  # Note that the apostrophe may occur first or last but not in the middle;
  # "°" is accepted only as the final character, and at least one letter
  # is required.
  #
  # The accented letters are written as alternatives rather than inside
  # a bracket expression, so each one is matched as a literal sequence.
  return (wd ~ /^[']?([a-z]|à|è|ì|ò|ù|é)+('|°)?$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for word wd.
  #   smp, sec, curlin  not used by this function.
  #   cursec            locator of the text unit containing the word, a
  #                     string of "{...}" tags such as "{b}{c12}{tx}".
  #   type              incoming type code of the word.
  #   wd                the word itself.
  # The result is "n" or "x" for words that are to be dropped (see the
  # individual tests below; the exact meaning of these codes is defined
  # by the including script), otherwise the incoming type unchanged.
  # The tests are order-dependent: the first one that fires wins.

  # Delete all but ordinary chapters (omit opening quote)
  if (cursec !~ /^{b}{c[1-9][0-9]*}{tx}/) { return "n"; }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Within the selected sections, reject anything that is not Italian prose
  if (cursec !~ /{tx}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns wd normalized for this sample.  The result may contain
  # blanks, i.e. it may stand for several words.
  #   smp, sec, type  not used by this function.
  #   wd              the word to normalize.

  # Map upper case to lower case:
  wd = tolower(wd);

  # Do not rely on the locale for the accented capitals; map them explicitly:
  gsub(/À/, "à", wd);
  gsub(/È/, "è", wd);
  gsub(/Ì/, "ì", wd);
  gsub(/Ò/, "ò", wd);
  gsub(/Ù/, "ù", wd);
  gsub(/É/, "é", wd);
  gsub(/Ñ/, "ñ", wd);

  # Break at hyphens (the character replaced is "~"; presumably hyphens
  # are encoded as "~" in the input -- TODO confirm against the source text):
  gsub(/[~]/, " ", wd);

  # Break after apostrophes (a final apostrophe thus leaves a trailing blank):
  gsub(/[']/, "' ", wd);

  # ... unless the word starts with an apostrophe:
  gsub(/^['][ ]+/, "'", wd);

  return wd;
}