#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:59:51 by stolfilocal

# Sampling functions for arab/quf
# Arabic - Holy Quran with vowels AND SUKUNS
#
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Validates the section name for this sample. No patterns are
  # defined here: the only accepted section is "tot.1", and any
  # other value is reported through data_error() (defined by the
  # including program).
  if (sec == "tot.1") { return; }
  data_error("invalid section \"" sec "\"");
}

function smp_is_good_token(smp,sec,type,wd)
{
  # Returns 1 when the word {wd} passes every check below, 0 otherwise.
  # Only {wd} is examined; {smp}, {sec} and {type} are not used here.
  #
  # The checks are evaluated in this order, stopping at the first failure:
  #
  #   1. The word must consist solely of valid JSAR characters.
  #   2. No vowel mark or sukun may start the word or directly
  #      follow another vowel mark or sukun.
  #   3. Teh-marbuta may not be followed by any character other than
  #      a vowel mark or sukun.
  #   4. Superscript hamza ("!") may appear only right after one of
  #      the long vowels "a", "w", "y".
  #   5. Subscript hamza and superscript madda ("~") may appear only
  #      right after "a".
  return ( \
    (wd ~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/) && \
    (wd !~ /(^|â|ä|î|ï|û|ü|°)(â|ä|î|ï|û|ü|°)/) && \
    (wd !~ /¨([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|å|ç|ð|þ)/) && \
    (wd !~ /(^|[^awy])!/) && \
    (wd !~ /(^|[^a])(¡|~)/) \
  );
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code for a token, based on the
  # location string {cursec} and on the token's own {type} and {wd}.
  # The codes "n" and "x" are returned as-is to the caller; how they
  # are interpreted is decided by the including program.

  # Tokens located outside a "{b}{sN}{vM}" unit, and tokens inside a
  # title element ("{tt}" or "{cn}"), get "n". Per the original
  # author's note this is meant to drop chapter titles and the
  # bismillah, keeping only sura verses.
  if ((cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/) || (cursec ~ /{(tt|cn)}$/))
    { return "n"; }

  # Anything nested deeper than the verse itself is not verse text: "x".
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Of the punctuation tokens only the parag break "=" keeps its type;
  # every other token passes through unchanged.
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_token(smp,sec,type,wd)
{
  # Returns {wd} with every "»" replaced by a copy of the character
  # that precedes it. Input is expected to be monocase.
  #
  # The doubling is done for compatibility with other electronic
  # editions; besides, use of the "»" may be inconsistent.
  return gensub(/(.)[»]/, "\\1\\1", "g", wd);
}