#! /usr/bin/gawk -f
# Last edited on 2012-05-05 23:10:07 by stolfilocal

# Sampling functions for hebr/tav
# Hebrew - Tanakh (Bible) with vowel points

# To be included in wds-to-tlw
# NOTE(review): {data_error} and {arg_error} are not defined here; they are
# presumably supplied by the including program -- confirm.

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec}.  This sample needs no patterns, so
  # the only work done is to call {data_error} when {sec} is not one of
  # the six recognized sections.  The {smp} argument is not used.

  # No patterns needed
  if ((sec != "gen.1") &&
      (sec != "exo.1") &&
      (sec != "lev.1") &&
      (sec != "num.1") &&
      (sec != "deu.1") &&
      (sec != "tot.1"))
    { data_error(("invalid section \"" sec "\"")); }
}

function smp_is_good_token(smp,sec,type,wd)
{
  # Returns 1 if the word {wd} passes all the well-formedness checks
  # below, 0 as soon as one of them fails.  Only {wd} is examined; the
  # arguments {smp}, {sec} and {type} are not used.

  # Check for invalid JSHB characters:
  if (wd !~ /^([bdghklmnpqrstwyz]|¡|¤|°|±|²|¹|¿|â|ä|ç|ê|ë|ï|ö|ü|þ)+$/) { return 0; }

  # Shin/sin dot should come only after "s" or "s¤".
  # First pattern: a dot at the start of the word, or directly after
  # any symbol other than "s" or the dagesh "¤":
  if (wd ~ /(^|[bdghklmnpqrtwyz]|¡|°|±|²|¹|¿|â|ä|ç|ê|ë|ï|ö|ü|þ)(¹|²)/) { return 0; }
  # Second pattern: a "¤" + dot pair whose "¤" is not preceded by "s":
  if (wd ~ /(^|[bdghklmnpqrtwyz]|¡|¤|°|±|²|¹|¿|â|ä|ç|ê|ë|ï|ö|ü|þ)¤(¹|²)/) { return 0; }

  # Dagesh must follow a consonant:
  if (wd ~ /(^|¤|°|ï|ë|ê|ä|ö|â|ü)¤/) { return 0; }

  # Sheva may be followed only by consonant or [êäâ¹²]:
  if (wd ~ /°(°|ï|ë|ö|ü|¤)/) { return 0; }

  # Vowels and sheva must follow consonant, dagesh, sheva, or shin/sin dot:
  if (wd ~ /(^|ï|ë|ê|ä|ö|â|ü)(°|ï|ë|ê|ä|ö|â|ü)/) { return 0; }

  # Cannot think of anything else...
  return 1;
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of a token, given the requested section
  # {sec} and the location string {cursec} where the token was found.
  # Returns "n" for tokens to be deleted (outside the requested book,
  # inside a title, or punctuation other than "="), "x" for tokens that
  # are not inside verse text, and the incoming {type} otherwise.
  # The arguments {smp} and {curlin} are not used.
  # The patterns below imply that {cursec} is a sequence of tags such
  # as "{b1}", "{c12}", "{v3}", "{tt}", "{cn}".

  # Delete any text outside Book {sec}:
  if (sec == "gen.1")
    { if (cursec !~ /^{b1}/) { return "n"; } }
  else if (sec == "exo.1")
    { if (cursec !~ /^{b2}/) { return "n"; } }
  else if (sec == "lev.1")
    { if (cursec !~ /^{b3}/) { return "n"; } }
  else if (sec == "num.1")
    { if (cursec !~ /^{b4}/) { return "n"; } }
  else if (sec == "deu.1")
    { if (cursec !~ /^{b5}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-5]}/) { return "n"; } }
  else
    { # Report the offending section name.  (The message used to
      # concatenate an unset variable, so it always showed "".)
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_token(smp,sec,type,wd)
{
  # Returns {wd} unchanged; no per-token correction is applied for this
  # sample.  The arguments {smp}, {sec} and {type} are not used.

  # Input should be monocase.
  return wd;
}