#! /usr/bin/gawk -f
# Last edited on 2012-05-05 18:50:59 by stolfilocal
# Sampling functions for envt/wow
# English - Wells's "War of the Worlds", lowercased,
# with each word replaced by one or two Vietnamese words.
# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each
# lowercase English word that appears in the input to a distinct 
# Vietnamese word, in lowercase, or to a hyphenated compound.

function smp_define_patterns(smp,sec)
{
  # Per-sample initialization hook.  This sample defines no patterns;
  # the function only checks that it was called for the sample and
  # section it was written for, reporting any mismatch through
  # {data_error}.

  if (smp != "envt/wow")
    { data_error(("wrong sample \"" smp "\"")); }

  if (sec != "tot.1")
    { data_error(("invalid section \"" sec "\"")); }

  # An earlier attempt to build the word pattern from named pieces
  # is kept below for reference.  It does not work, unfortunately:
  # # Some handy patterns:
  # viqr_vowel_pat = "(([a][\\(\\^]?)|([o][\\+\\^]?)|([u][\\+]?)|([e][\\^]?)|[iy])";
  # viqr_letter_pat = ( "((" viqr_vowel_pat "[`'.\\?~]?)|[b-df-hj-np-tvwxz])" );
  # viqr_word_pat = ( "^" viqr_letter_pat "+$" );
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is a single well-formed lowercase word in
  # VIQR notation, and 0 otherwise.  The arguments {smp}, {sec}
  # and {type} are not used.
  #
  # The whole of {wd} must consist of, in this order:
  #   an optional initial consonant or consonant cluster;
  #   one or more vowels, each optionally followed by a postfix
  #     VIQR modifier ("(" or "^" after "a", "+" or "^" after "o",
  #     "+" after "u", "^" after "e"; none after "i" or "y");
  #   at most one of the postfix marks "`", "'", ".", "?", "~";
  #   an optional final consonant or consonant cluster.
  #
  # The pattern has no alternative for "-", so a word that
  # contains a hyphen is rejected.
  #
  # The circumflex after "e" is written as "\^" outside a bracket
  # expression.  Writing it as "[^]?" is wrong: a "]" that comes
  # right after "[^" is a literal member of the set, so "[^]?|[iy]"
  # would be read as ONE negated set, which drops the "i"/"y"
  # alternative and makes "e" swallow an arbitrary next character.
  return (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e]\^?|[iy])+([`'.?~]|)([c][h]?|[n][gh]?|[mpt]|)$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for the word {wd} of
  # type {type} found in the section whose tag string is {cursec}.
  # The arguments {smp}, {sec} and {curlin} are not used.
  #
  # Result "n": the section is not the text of an ordinary chapter
  #   (its tag does not begin with "{pN}{cM}{tx}", N being 1 or 2),
  #   or it is a title subsection (tag ends with "{tt}" or "{cn}").
  # Result "x": the section is inside a chapter's text but its tag
  #   does not end with "{tx}", i.e. it is not main text.
  # Otherwise punctuation (type "p") other than the paragraph
  #   break "=" becomes "n", and anything else keeps its {type}.

  if ((cursec !~ /^{p[12]}{c[1-9][0-9]*}{tx}/) || (cursec ~ /{(tt|cn)}$/))
    { return "n"; }

  if (cursec !~ /{tx}$/)
    { return "x"; }

  return (((type == "p") && ( wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd,   nfld,fld,res,sep,i,wdi)
{
  # Returns the cleaned-up replacement for the word {wd} of type
  # {type}.  The arguments {smp} and {sec} are not used.
  #
  # A word of type "a" is lowercased, broken into pieces, and each
  # piece is replaced by its entry in the global table {wmap}.
  # Hyphens in the replacements are turned into blanks, and the
  # results are joined with single blanks.  A piece that is not
  # in {wmap} is reported through {data_error}.
  #
  # A word of any other type is returned unchanged.
  #
  # The names after the gap in the parameter list are local
  # variables.  {wdi} is declared there too, so that it does not
  # overwrite a global variable of the including program.
  if (type == "a")
    { # Map word to lowercase: 
      wd = tolower(wd);
      # There are no accented letters in this sample.

      # Break English word at each "~":
      # NOTE(review): presumably the English hyphens are encoded
      # as "~" before this point - confirm against the caller.
      gsub(/[~]/, " ", wd);

      # Apply word table to each piece (the word should be there).
      # The explicit " " separator forces splitting at runs of
      # whitespace whatever the current value of {FS} is:
      nfld = split(wd, fld, " ");
      res = ""; sep = "";
      for (i = 1; i <= nfld; i++)
        { wdi = fld[i];
          if (wdi in wmap)
            { wdi = wmap[wdi]; }
          else
            { data_error(("word not in table \"" wdi "\"")); } 
          # Break at Vietnamese hyphens:
          gsub(/[-]/, " ", wdi);
          res = ( res sep wdi ); sep = " ";
        }
      wd = res;
    }
  return wd;
}