#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:26:13 by stolfilocal

# Sampling functions for fran/tal
# French - "De la Terre a la Lune"
# Only alpha words from the text, split at hyphens and after
# apostrophes, mapped to lowercase.
# Note that hyphens are encoded as "~", abbrev dots as "^".

# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Validates the requested section {sec}.  This sample needs no
  # patterns; the only accepted section is "tot.1".  Any other value
  # is reported through {data_error} (defined by the including program).
  # The {smp} argument is not used here.
  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is an acceptable word, 0 otherwise.
  # The {smp}, {sec} and {type} arguments are not used here.
  #
  # Accepts only lowercase alpha, alpha with accents (acute, grave,
  # circumflex, diaeresis) and "ç", plus apostrophe, hyphen ("~"),
  # and the abbreviation mark "^".

  # Check for invalid characters.  The accented letters are listed as
  # literal alternatives (not inside a bracket expression) so that the
  # test does not depend on the locale's notion of a character:
  if (wd !~ /^(['~^a-z]|à|è|ì|ò|ù|á|é|í|ó|ú|â|ê|î|ô|û|ä|ë|ï|ö|ü|ç)+$/) { return 0; }

  # Apostrophes, hyphens and "^" can't be doubled:
  if (wd ~ /['~^]['~^]/) { return 0; }

  # The abbreviation mark "^" can only occur at the end:
  if (wd ~ /[\^]./) { return 0; }

  # An apostrophe cannot occur at the beginning of a word
  # (but can occur at the end):
  if (wd ~ /^[']/) { return 0; }

  # A hyphen can only occur inside a word:
  if (wd ~ /^[~]/) { return 0; }
  if (wd ~ /[~]$/) { return 0; }

  # Enough checking:
  return 1;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code for word {wd} of type
  # {type} found in section {cursec}: "n" for items to be deleted,
  # "x" for items to be rejected, otherwise {type} unchanged.
  # (The exact downstream meaning of "n" and "x" is defined by the
  # including program.)  The {smp}, {sec} and {curlin} arguments are
  # not used here.

  # Delete all but ordinary chapters (omit opening quote):
  if (cursec !~ /^{b}{c[1-9][0-9]*}{tx}/) { return "n"; }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Within the selected sections, reject anything that is not French prose:
  if (cursec !~ /{tx}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns {wd} normalized: mapped to lowercase, split (with blanks)
  # at hyphens and after apostrophes.  The {smp}, {sec} and {type}
  # arguments are not used here.

  # Map upper case to lower case:
  wd = tolower(wd);

  # Cannot trust the locale to make {tolower} handle accented letters,
  # so map them explicitly.  The patterns are plain literals rather
  # than bracket expressions: in a byte-oriented locale a bracket
  # expression around a multibyte letter would match each of its bytes
  # separately and corrupt other accented letters sharing a byte.
  gsub(/À/, "à", wd);
  gsub(/È/, "è", wd);
  gsub(/Ì/, "ì", wd);
  gsub(/Ò/, "ò", wd);
  gsub(/Ù/, "ù", wd);

  gsub(/Á/, "á", wd);
  gsub(/É/, "é", wd);
  gsub(/Í/, "í", wd);
  gsub(/Ó/, "ó", wd);
  gsub(/Ú/, "ú", wd);

  gsub(/Â/, "â", wd);
  gsub(/Ê/, "ê", wd);
  gsub(/Î/, "î", wd);
  gsub(/Ô/, "ô", wd);
  gsub(/Û/, "û", wd);

  gsub(/Ä/, "ä", wd);
  gsub(/Ë/, "ë", wd);
  gsub(/Ï/, "ï", wd);
  gsub(/Ö/, "ö", wd);
  gsub(/Ü/, "ü", wd);

  gsub(/Ç/, "ç", wd);

  # Break at hyphens:
  gsub(/[~]/, " ", wd);

  # ... except before epenthetic "~t~" (e.g. "a~t~il" becomes "a~t il"):
  gsub(/[ ][t][ ]/, "~t ", wd);
  # Note that "J.-T." is encoded as "J^~T^" hence it will stay split.

  # Break after apostrophes (e.g. "l'homme" becomes "l' homme"):
  gsub(/[']/, "' ", wd);

  # ... except before English Genitive "'s":
  gsub(/['][ ][s][ ]/, "'s ", wd);
  gsub(/['][ ][s]$/, "'s", wd);

  return wd;
}