#! /usr/bin/gawk -f
# Last edited on 2012-05-05 20:38:39 by stolfilocal
# Sampling functions for chrc/red
# Chinese - Dream of the Red Mansion in GB code,
# converted to Voynichese-looking Roman-like code
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

# These functions assume that the word table {wmap} maps each
# lowercase pinyin word that appears in the input to a distinct 
# "pseudo-Voynich" Roman-style numeral, in lowercase, prefixed by "@".

function smp_define_patterns(smp,sec)
{
  # Validates the sample/section pair and initializes the global
  # string {gb_bytes}. Only sample "chrc/red", section "tot.1" is
  # accepted; anything else is reported through {data_error}.
  if (! (smp == "chrc/red"))
    { data_error(("wrong sample \"" smp "\"")); }
  if (! (sec == "tot.1"))
    { data_error(("invalid section \"" sec "\"")); }

  # {gb_bytes} lists every character that may occur in either position
  # of a two-character GB code; it is built one row at a time.
  # NOTE(review): the literals presumably stand for the byte values
  # 0xA1..0xFE, so the result depends on this file's encoding and on
  # the locale gawk runs in - confirm.
  gb_bytes = "¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿";
  gb_bytes = (gb_bytes "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞ");
  gb_bytes = (gb_bytes "ßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþ");
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a well-formed word of the Voynichese-like
  # pseudo-Roman code, 0 otherwise. A well-formed word is a non-empty
  # string made only of the lowercase letters "aoydsrlciektph".
  # The arguments {smp}, {sec} and {type} are not consulted.
  #
  # The test must be applied to the parameter {wd}: an undeclared
  # variable is an empty string in awk, which never matches the
  # pattern below, so testing one would reject every word.
  return (wd ~ /^[aoydsrlciektph]+$/);
}

function smp_is_GB_code(wd)
{
  # Returns 1 if {wd} is exactly two characters long and each of
  # them occurs in the global string {gb_bytes}; returns 0 otherwise.
  if (length(wd) != 2) { return 0; }
  if (index(gb_bytes, substr(wd,1,1)) == 0) { return 0; }
  return (index(gb_bytes, substr(wd,2,1)) > 0);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final class of the word {wd}, whose tentative class
  # is {type}, given the requested section {sec} and the tag {cursec}
  # of the input section where the word occurs. Returns "n" for
  # material that is to be deleted, "x" for material that is
  # rejected, and otherwise the (possibly revised) class.
  #
  # The arguments {smp} and {curlin} are not consulted. Scalars are
  # passed by value in awk, so the changes made here to {type} and
  # {wd} are not seen by the caller.

  # Delete anything outside section {sec}:
  if (sec == "tot.1")
    { if (cursec !~ /^{b[12]}/) { return "n"; } }
  else
    { # Report the offending section name; it is held in {sec}.
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not prose text:
  if (cursec !~ /{c[1-9][0-9]*}{tx}$/) { return "x"; }

  if (type == "a")
    { # The word should be actually a GB byte-pair.
      # Map it through the code table:
      if (! smp_is_GB_code(wd)) { data_error(("malformed GB code \"" wd "\"")); }
      if (wd in wmap)
        { wd = wmap[wd];
          # The shape of the mapped word may reveal a different class:
          # "«...»" means punctuation ("p"), "(...)" means class "s".
          if (wd ~ /^[«].+[»]$/)
            { type = "p"; }
          else if (wd ~ /^[(].+[)]$/)
            { type = "s"; }
        }
      else
        { data_error(("word not in code table \"" wd "\"")); }
    }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd,   nfld,fld,res,sep)
{
  # Returns the output form of the word {wd} of class {type}.
  # Words of any class other than "a" are returned unchanged.
  if (type != "a") { return wd; }

  # A class "a" word should be a GB byte-pair; translate it
  # through the code table {wmap}, complaining if it is missing:
  if (! (wd in wmap))
    { data_error(("word not in table \"" wd "\"")); }
  else
    { wd = wmap[wd]; }

  # A proper code starts with "@", which is stripped off;
  # anything else is returned wrapped in "?" marks:
  if (wd ~ /^[@]/)
    { sub(/^[@]/, "", wd);
      return wd;
    }
  return ("?" wd "?");
}