#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:38:25 by stolfilocal

# Sampling functions for arab/qud
# Arabic - Holy Quran WITH ALL MARKS REMOVED
#
# To be included in wds-to-tlw
#
# NOTE(review): every function below takes a leading {smp} argument that
# is never read in this file; it is kept so the signatures stay unchanged
# for the including program.

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec}; this sample defines no patterns.
  # The only accepted section is "tot.1"; anything else is reported
  # through {data_error}, which must be supplied by the including program.
  if (sec != "tot.1")
    { data_error(("invalid section \"" sec "\"")); }
}

function smp_is_good_token(smp,sec,type,wd)
{
  # Returns 1 if the word {wd} is acceptable for this sample, 0 otherwise.
  # {sec} and {type} are not consulted.
  #
  # Non-ASCII characters are written as alternatives rather than inside a
  # bracket expression, so each one is matched as a whole character.

  # Check for invalid JSAR characters:
  if (wd !~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/)
    { return 0; }

  # Check for superscript hamza, subscript hamza, superscript madda, vowels, sukun:
  if (wd ~ /(!|~|¡|â|ä|î|ï|û|ü|°)/)
    { return 0; }

  # Teh-marbuta must come last in word:
  if (wd ~ /¨./)
    { return 0; }

  # Cannot think of anything else...
  return 1;
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for the token {wd}:
  #   "n"    for tokens that are to be deleted,
  #   "x"    for tokens that are to be rejected,
  #   {type} unchanged for everything else.
  # The decision depends on the section tag {cursec}, on {type} and on {wd};
  # {sec} and {curlin} are not consulted.
  #
  # NOTE(review): the braces in the patterns below are presumably meant as
  # literal characters of the section tag (they cannot be parsed as
  # interval expressions) -- confirm against the gawk version in use.

  # Delete anything that is not surat verses
  # (omit chapter titles and bismillah, except in sura 1).
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/)
    { return "n"; }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/)
    { return "n"; }

  # Reject any material that is not verse text, i.e. whose tag has
  # anything after the verse element:
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/)
    { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "="))
    { return "n"; }

  return type;
}

function smp_fix_token(smp,sec,type,wd)
{
  # Returns the cleaned-up form of the word {wd}.
  # {sec} and {type} are not consulted.
  # Input should be monocase.

  # We replace "»" by the preceding consonant for compatibility
  # with other electronic editions.  Besides, use of the "»"
  # may be inconsistent.  A "»" with no preceding character is left alone.
  wd = gensub(/(.)»/, "\\1\\1", "g", wd);

  # Remove script-hamza, script-madda, vowels, sukuns.
  # The non-ASCII characters are listed as alternatives, as in
  # {smp_is_good_token}, instead of inside a bracket expression: that way
  # each one is matched as a whole character even if gawk is matching
  # bytes and the characters are multi-byte, so unrelated characters that
  # share a byte with them are not damaged.
  gsub(/[~!]|¡|â|ä|î|ï|û|ü|°/, "", wd);

  return wd;
}