#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:38:25 by stolfilocal
# Sampling functions for arab/qud
# Arabic - Holy Quran WITH ALL MARKS REMOVED
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Pattern setup hook for this sample.  Nothing is actually defined
  # here; the function only validates the section name {sec}.
  # The only accepted section is "tot.1"; any other value is reported
  # through {data_error} (defined outside this file).
  # The {smp} argument is not used.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_token(smp,sec,type,wd)
{
  # Returns 1 if the word {wd} is acceptable, 0 otherwise.
  # The {smp}, {sec} and {type} arguments are not used.
  #
  # A word is accepted only when all three conditions hold:
  #   1. it is a non-empty string made exclusively of valid JSAR
  #      characters;
  #   2. it contains none of the characters for superscript hamza,
  #      subscript hamza, superscript madda, vowels or sukun;
  #   3. no character follows a teh-marbuta (it must be word-final).
  return ((wd ~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/) &&
          (wd !~ /(!|~|¡|â|ä|î|ï|û|ü|°)/) &&
          (wd !~ /¨./)) ? 1 : 0;
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for the token {wd} of
  # type {type}, found in the section whose locator is {cursec}.
  # The {smp}, {sec} and {curlin} arguments are not used.
  #
  # Result:
  #   "n"    if {cursec} does not begin with "{b}{sNNN}{vNNN}" (not a
  #          sura verse), or if it ends with a "{tt}" or "{cn}" element
  #          (titles);
  #   "x"    if {cursec} has anything after the "{b}{sNNN}{vNNN}" prefix
  #          (material that is not plain verse text);
  #   "n"    if the token is punctuation (type "p") other than the
  #          parag break "=";
  #   {type} unchanged in every other case.
  #
  # NOTE(review): an earlier comment spoke of keeping the bismillah in
  # sura 1; no special case for sura 1 is visible in this code.

  # Outside verses, or inside a title: delete.
  if ((cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/) || (cursec ~ /{(tt|cn)}$/))
    { return "n"; }

  # Inside a verse but in some nested element: reject.
  if (cursec !~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/)
    { return "x"; }

  # Plain verse text: keep everything except punctuation that is
  # not a parag break.
  if ((type != "p") || (wd == "="))
    { return type; }
  return "n";
}

function smp_fix_token(smp,sec,type,wd)
{
  # Returns a normalized copy of the word {wd}, which is expected to
  # be monocase already.  The {smp}, {sec} and {type} arguments are
  # not used.
  #
  # Two rewrites are applied, in this order:
  #
  #   1. Every "»" is replaced by a copy of the character that
  #      precedes it (so "X»" becomes "XX"), for compatibility with
  #      other electronic editions and because the use of "»" may
  #      be inconsistent.  A "»" with no preceding character is
  #      left alone.
  #
  #   2. All script-hamza, script-madda, vowel and sukun characters
  #      are deleted.
  #
  # Both steps use {gensub}, so this function requires GNU awk.
  return gensub(/[~!¡âäîïûü°]/, "", "g", gensub(/(.)[»]/, "\\1\\1", "g", wd));
}