#! /usr/bin/gawk -f
# Last edited on 2025-09-24 16:43:41 by stolfi

# Sampling functions for arab/qcs
# Arabic - Holy Quran - consonant-only source text
# To be included in wds-to-tlw

# Validates the section name {sec}.  This source defines no patterns;
# the only accepted section is "tot.1", and any other name is reported
# through {data_error}.  The {smp} argument is not used here.
function smp_define_patterns(smp,sec)
{
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

# Returns 1 if the word {wd} passes all the spelling checks below,
# and 0 otherwise.  Only {wd} is examined; {smp}, {sec} and {type}
# are not used.
#
# The checks, in the order they are evaluated:
#   1. {wd} is a non-empty string of valid JSAR characters.
#   2. {wd} has no short vowel marks and no sukun.
#   3. Teh-marbuta is not followed by any other character.
#   4. Superscript hamza ("!") follows a long vowel only: it is
#      rejected at the start of the word and after every character
#      of the alphabet other than "a", "w" and "y".
#   5. Subscript hamza follows "a" only: it is rejected at the start
#      of the word and after every other character of the alphabet.
function smp_is_good_token(smp,sec,type,wd)
{
  return ((wd ~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/) &&
          (wd !~ /(ä|ï|ü|°)/) &&
          (wd !~ /¨./) &&
          (wd !~ /(^|[!'bdfhjklmnqrstxz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)!/) &&
          (wd !~ /(^|[!'bdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)¡/));
}

# Decides the final class of a token from its location {cursec},
# its incoming class {type} and its text {wd}.  Returns:
#   "n"   when the location does not start with a sura-and-verse
#         locator (this omits chapter titles and the bismillah,
#         except in sura 1), or when it ends in a title marker
#         ({tt} or {cn});
#   "x"   when the location has anything besides the plain
#         sura-and-verse locator, i.e. the material is not verse text;
#   "n"   for punctuation tokens (class "p") other than the
#         paragraph break "=";
#   {type} unchanged in every other case.
# The {smp}, {sec} and {curlin} arguments are not used.
function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Outside the surat verses, or inside a title: delete.
  if ((cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/) || (cursec ~ /{(tt|cn)}$/))
    { return "n"; }

  # Extra locator fields after the verse number: reject.
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/)
    { return "x"; }

  # Non-punctuation tokens and parag breaks keep their class.
  if ((type != "p") || (wd == "="))
    { return type; }

  # Any other punctuation is discarded.
  return "n";
}

# Returns {wd} unchanged: the input is expected to be monocase
# already, so no normalization is applied.  The other arguments
# are not used.
function smp_fix_token(smp,sec,type,wd)
{
  return wd;
}