#! /usr/bin/gawk -f
# Last edited on 2025-09-24 15:05:49 by stolfi

# Sampling functions for latn/ahl
# Transcription of the so-called "Alchemical herbal" by Marco Ponzi.

# To be included in wds-to-tlw
#
# NOTE(review): {data_error} and {arg_error} are not defined in this file;
# presumably they are supplied by the including program -- confirm.

function smp_define_patterns(smp,sec)
  {
    # Pattern setup hook for sample {smp}, section {sec}.
    # No patterns are needed for this sample, so the only work done
    # here is validating {sec}: "tot.1" is the sole section accepted;
    # any other value is reported through {data_error}.
    # The {smp} parameter is not used.
    if (sec != "tot.1")
      { data_error(("invalid section \"" sec "\"")); }
  }

function smp_is_good_token(smp,sec,type,wd)
  {
    # Returns true (1) iff {wd} is a non-empty string consisting only
    # of the letters [a-z], false (0) otherwise.
    # The text uses neither hyphen nor apostrophe, so those are not
    # accepted.  The {smp}, {sec}, and {type} parameters are not used.
    return (wd ~ /^[a-z]+$/);
  }

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
  {
    # Decides the token type to use for word {wd}, whose incoming type
    # is {type} and which was found under the section tag {cursec}.
    # Returns "n" (delete) or "x" (reject) for tokens that are to be
    # excluded, otherwise returns {type} unchanged.  The checks are
    # applied in the order below; the first one that fires wins.
    # The {smp} and {curlin} parameters are not used.

    # Delete anything outside section {sec}:
    if (sec == "tot.1")
      { # The section tag must start with "{B}{h<digits>}".
        if (cursec !~ /^{B}{h[0-9]+}/) { return "n"; }
      }
    else
      { # Report the offending section name.  (This used to interpolate
        # an unset variable, so the message always showed an empty name.)
        arg_error(("bad subsection \"" sec "\""));
      }

    # Delete titles (section tag ends in "{tt}", trailing blanks allowed):
    if (cursec ~ /{tt} *$/) { return "n"; }

    # Reject any subsection that is not prose text, i.e. whose tag
    # does not end in "{p<digits>}" (trailing blanks allowed):
    if (cursec !~ /{p[0-9]+} *$/) { return "x"; }

    # Discard punctuation other than parag breaks ("="):
    if ((type == "p") && (wd != "=")) { return "n"; }

    return type;
  }

function smp_fix_token(smp,sec,type,wd)
  {
    # Returns {wd} with upper case mapped to lower case (just in case).
    # The {smp}, {sec}, and {type} parameters are not used.
    return tolower(wd);
  }