#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:59:51 by stolfilocal
# Sampling functions for arab/quf
# Arabic - Holy Quran with vowels AND SUKUNS
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Validates the section tag {sec} for this sample.  This sample
  # needs no patterns, so nothing is actually defined here.
  #
  #   smp  sample identifier (not used by this function).
  #   sec  section tag; the only accepted value is "tot.1".
  #
  # Any other {sec} is reported through {data_error}, which is defined
  # elsewhere (presumably it aborts the run -- confirm in the includer).
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_token(smp,sec,type,wd)
{ 
  # Returns 1 if the word {wd} looks like a well-formed token in the
  # JSAR encoding used by this sample, and 0 otherwise.
  #
  #   smp, sec, type  not used by this function.
  #   wd              the word to be checked.
  #
  # The empty string is rejected, since the first test requires at
  # least one valid character.
  
  # Check for invalid JSAR characters:
  if (wd !~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/) { return 0; } 
  
  # Vowel marks and sukuns cannot be doubled, and cannot be first in word:
  if (wd ~ /(^|â|ä|î|ï|û|ü|°)(â|ä|î|ï|û|ü|°)/) { return 0; }
  
  # Teh-marbuta must come last in word, except for vowel marks.
  # Any run of vowel marks/sukuns is skipped before looking for an
  # offending character, so that "teh-marbuta, vowel mark, consonant"
  # is rejected too (testing only the character right after the
  # teh-marbuta would let such words through):
  if (wd ~ /¨(â|ä|î|ï|û|ü|°)*([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|å|ç|ð|þ)/) { return 0; }
  
  # Superscript hamza must come after long vowels only:
  if (wd ~ /(^|[^awy])!/) { return 0; }
  
  # Subscript hamza and superscript madda must come after "a" only:
  if (wd ~ /(^|[^a])(¡|~)/) { return 0; }
  
  # Cannot think of anything else...
  return 1;
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code for the token {wd} of
  # type {type}, found in the text section whose locator is {cursec}.
  #
  #   smp, sec, curlin  not used by this function.
  #   cursec            section locator, e.g. "{b}{s2}{v17}".
  #   type              type code of the token as given by the caller.
  #   wd                the token itself.
  #
  # Result is "n" for tokens that are silently dropped, "x" for
  # tokens in unexpected sections, and {type} itself otherwise.

  # Drop whatever lies outside the numbered verses of a sura (chapter
  # titles and bismillah, except in sura 1), as well as title
  # subsections nested inside a verse:
  if ((cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/) || (cursec ~ /{(tt|cn)}$/))
    { return "n"; }

  # Any other subsection of a verse is not verse text; reject it:
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Among punctuation tokens, only the parag break "=" is kept:
  if ((type == "p") && (wd != "=")) { return "n"; }

  # Everything else keeps the type it came with:
  return type;
}

function smp_fix_token(smp,sec,type,wd)
{
  # Returns {wd} with every "»" replaced by a copy of the character
  # that immediately precedes it.  A "»" with nothing before it is
  # left alone.
  #
  #   smp, sec, type  not used by this function.
  #   wd              the token to be fixed; expected to be monocase.
  #
  # Expanding the "»" into a doubled consonant keeps the output
  # compatible with other electronic editions, and sidesteps any
  # inconsistent use of the "»" in the source text.
  #
  # NOTE(review): {.} matches one character of the current locale, so
  # this assumes the encoding of this file and of the input agree with
  # the locale gawk runs under -- confirm.
  return gensub(/(.)[»]/, "\\1\\1", "g", wd);
}