#! /usr/bin/gawk -f
# Last edited on 2023-05-10 14:36:36 by stolfi

# Sampling functions for germ/sim
# German - Grimmelshausen's "Der Abenteuerliche Simplicissimus Teutsch"
# To be included in wds-to-tlw

# NOTE: this file contains UTF-8 literals ("äöüßÄÖÜ").  Multibyte
# characters are deliberately never placed inside bracket expressions:
# in a byte-oriented locale (e.g. LC_ALL=C, or "gawk -b") a bracket
# expression such as "[Ä]" matches each of its bytes separately, which
# corrupts every character that shares the same UTF-8 lead byte.
# Plain literals and alternations work the same in both kinds of locale.

function smp_define_patterns(smp,sec)
{
  # Validates the section tag {sec}; the only section accepted for this
  # sample is "tot.1".  No patterns are needed, so {smp} is not touched.
  # Any other section is reported through {data_error} (defined by the
  # including program).

  if (sec != "tot.1")
    { data_error(("invalid section \"" sec "\"")); }
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} looks like an acceptable word, 0 otherwise.
  # The parameters {smp}, {sec} and {type} are not used here.

  # Accept only lowercase alpha, "äöüß", plus apostrophe and optional "^" at end:
  if (wd !~ /^(['a-z]|ä|ö|ü|ß)+[\^]?$/) { return 0; }

  # Apostrophes can't be doubled:
  # NOTE(review): the original comment also said that an apostrophe
  # cannot occur next to "^", but no such check is implemented -- confirm
  # whether one is wanted before adding it.
  if (wd ~ /['][']/) { return 0; }

  # The sharp-ss "ß" cannot occur at the beginning:
  if (wd ~ /^ß/) { return 0; }

  # Should be enough:
  return 1;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for the word {wd} of type
  # {type} found in the text section {cursec}.  Returns "n" or "x" for
  # words that are to be left out (the precise meaning of those two codes
  # is defined by the including program), otherwise returns {type}
  # unchanged.  The parameters {smp}, {sec} and {curlin} are not used here.
  #
  # The braces of the section tags are written as "[{]" and "[}]" so that
  # they can never be taken for the start of an interval expression.

  # Take only ordinary chapters (omit Beschluss):
  if (cursec !~ /^[{]b[1-6][}][{]c[1-9][0-9]*[}]/) { return "n"; }

  # Delete titles:
  if (cursec ~ /[{](tt|cn)[}]$/) { return "n"; }

  # Within the selected sections, reject anything that is not German prose
  # (main text or body of letters):
  if (cursec !~ /[{]tx[}]$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a normalized copy of {wd}: hyphens become blanks, a leading
  # "d'" is split from a following capitalized word, and everything is
  # mapped to lower case.  The parameters {smp}, {sec} and {type} are
  # not used here.

  # Break at hyphens:
  gsub(/[-]/, " ", wd);

  # Break between "d'" and a noun (before lowercasing it!).
  # The umlauts are alternatives, not bracket members, so that the
  # match is locale-independent:
  wd = gensub(/^d[']([A-Z]|Ä|Ö|Ü)/, "d' \\1", 1, wd);

  # Map upper case to lower case:
  wd = tolower(wd);

  # Cannot trust that LOCALE crap: {tolower} may leave non-ASCII letters
  # alone, so map the umlauts explicitly.  Plain literals (no brackets)
  # are used so that only the complete character is matched, never one
  # of its bytes:
  gsub(/Ä/, "ä", wd);
  gsub(/Ö/, "ö", wd);
  gsub(/Ü/, "ü", wd);

  # Sharp-ss is never word-initial so it has no uppercase
  # (It is written "SS" in all-caps, which is ambiguous,
  # but we should have excluded titles...)

  return wd;
}