#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:01:29 by stolfilocal

# Sampling functions for arab/qph
# Arabic - Holy Quran with vowels recovered from phonetic transcript

# To be included in wds-to-tlw
#
# NOTE: `data_error` is called below but is not defined in this file;
# it is expected to be supplied by the including program.

# Validates the section name for this sample.
#   smp  sample identifier (not used here).
#   sec  section name; only "tot.1" is accepted.
# Defines no patterns. Any other section is reported via data_error.
function smp_define_patterns(smp,sec) {
  # No patterns needed
  if (sec != "tot.1") {
    data_error(("invalid section \"" sec "\""));
  }
}

# Decides whether the word `wd` is an acceptable token.
#   smp, sec, type  not used by this function.
#   wd              the candidate word.
# Returns 1 if `wd` passes all checks below, 0 otherwise.
# An empty `wd` fails the first check (the pattern requires at least
# one character) and therefore yields 0.
function smp_is_good_token(smp,sec,type,wd) {
  # Check for invalid JSAR characters: the whole word must consist
  # only of characters from the set below.
  if (wd !~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/) {
    return 0;
  }

  # Check for subscript hamza, superscript madda, teh-marbuta,
  # short vowels, alef-maksura, sukun: any of these rejects the word.
  if (wd ~ /(~|¡|¨|ä|å|ï|ü|°)/) {
    return 0;
  }

  # Vowel marks cannot be doubled, and cannot be first in word:
  if (wd ~ /(^|â|î|û)(â|î|û)/) {
    return 0;
  }

  # Cannot think of anything else...
  return 1;
}

# Reclassifies a token according to the section tag it occurs in.
#   smp, sec, curlin  not used by this function.
#   cursec            current section tag, e.g. "{b}{s2}{v3}".
#   type              the token's current type.
#   wd                the token text.
# Returns "n" for material to be deleted/discarded, "x" for rejected
# material, and `type` unchanged otherwise.
# The braces in the patterns are intended as literal characters of
# the section tag.
function smp_reclassify_token(smp,sec,cursec,curlin,type,wd) {
  # Delete anything that is not surat verses
  # (omit chapter titles and bismillah, except in sura 1).
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/) {
    return "n";
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) {
    return "n";
  }

  # Reject any material that is not verse text, i.e. anything with
  # extra tags after the verse tag:
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/) {
    return "x";
  }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) {
    return "n";
  }

  return type;
}

# Normalizes the text of a token.
#   smp, sec, type  not used by this function.
#   wd              the token text; input should be monocase.
# Returns `wd` with every hyphen removed, which joins the parts of a
# hyphenated word.
function smp_fix_token(smp,sec,type,wd) {
  # Remove hyphens, joining the words:
  gsub(/[-]/, "", wd);
  return wd;
}