#! /usr/bin/gawk -f
# Last edited on 2025-09-24 16:43:41 by stolfi
# Sampling functions for arab/qcs
# Arabic - Holy Quran - consonant-only source text
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Pattern-setup hook for this sample.  This text needs no patterns,
  # so the only work done here is validating the section name {sec}:
  # "tot.1" is the single section accepted; any other value is
  # reported through {data_error}.  The {smp} argument is not used.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_token(smp,sec,type,wd)
{
  # Returns 1 if the word {wd} is acceptable, 0 otherwise.
  # The arguments {smp}, {sec} and {type} are not consulted.
  #
  # The word is rejected when any of these holds, tested in order:
  #   1. it is empty or contains a character outside the JSAR set;
  #   2. it contains a short vowel mark or a sukun;
  #   3. a teh-marbuta is followed by another character
  #      (it may only be the last one in the word);
  #   4. a superscript hamza ("!") starts the word or follows
  #      anything other than a long vowel ("a", "w", "y");
  #   5. a subscript hamza starts the word or follows
  #      anything other than "a".
  if ((wd !~ /^([!'abdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)+$/) ||
      (wd ~ /(ä|ï|ü|°)/) ||
      (wd ~ /¨./) ||
      (wd ~ /(^|[!'bdfhjklmnqrstxz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)!/) ||
      (wd ~ /(^|[!'bdfhjklmnqrstwxyz~]|¡|£|¤|©|¨|±|µ|¿|ß|â|ä|å|ç|î|ï|ð|û|ü|þ|°)¡/))
    { return 0; }

  # No further restrictions are known.
  return 1;
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code of token {wd}, given its
  # current {type} and the locator {cursec} of the place where it occurs.
  # The arguments {smp}, {sec} and {curlin} are not consulted.
  #
  # Result:
  #   "n"    for tokens outside the sura verses, tokens in titles
  #          (locator ending in "{tt}" or "{cn}"), and punctuation
  #          other than the paragraph break "=";
  #   "x"    for tokens nested in some other sub-unit of a verse;
  #   {type} unchanged, for everything else directly in a verse.

  if (cursec ~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}$/)
    { # Plain verse text.  A locator of this form cannot end in
      # "{tt}" or "{cn}", so the title test is not needed here.
      # Keep the token, but drop punctuation that is not a parag break:
      return (((type == "p") && (wd != "=")) ? "n" : type);
    }

  if ((cursec ~ /^{b}{s[1-9][0-9]*}{v[1-9][0-9]*}/) && (cursec !~ /{(tt|cn)}$/))
    { # Inside a verse, but in some nested non-title unit: reject.
      return "x";
    }

  # Not in a verse at all (chapter titles, bismillah, etc.),
  # or a title nested inside a verse: delete.
  return "n";
}

function smp_fix_token(smp,sec,type,wd)
{
  # Token clean-up hook.  The input is expected to be monocase
  # already, so no case folding (or any other change) is applied:
  # the word {wd} is handed back exactly as received.
  # The arguments {smp}, {sec} and {type} are not used.
  return wd;
}