#! /usr/bin/gawk -f
# Last edited on 2004-02-28 02:36:31 by stolfi

# Preprocess the New Testament in Vietnamese (VIQR)

BEGIN {
  # Purpose of this script: fix the encoding of accents in
  # Vietnamese VIQR text.

  # Synopsis string printed by arg_error().
  usage = ( ARGV[0] " < INPUT.src > OUTPUT.src" );

  # Negative while no error has been recorded; the error routines
  # overwrite it with the exit status to use.
  abort = -1;
}

# Once an error status has been recorded, stop consuming input records.
(abort >= 0) {
  exit abort;
}

# Source-file header lines of the form "# File NAME.txt".
# When NAME looks like "20N-REST" (N a single digit) the line is replaced
# by an "@section 1 {bN}" marker, a blank line, the original file comment
# and trailing blank lines.  Any other "# File ....txt" line is printed
# unchanged.  No further rule is applied to the line.
/^[\#] +File +.*[.]txt/ {
  # The regexp is anchored at the start, so there is at most one match;
  # request match number 1 explicitly.  (A how-argument such as "s" is
  # neither "g"/"G" nor a number: gawk treats it as 1 but prints a
  # warning on every call.)
  $0 = gensub( \
    /^[\#] +File +20([0-9])-(.*)[.]txt/, \
    "@section 1 {b\\1}\n\n# File 20\\1-\\2.txt\n\n", 1, $0 \
  );
  print;
  next;
}

# Lines that begin (after optional spaces) with a three-capital-letter
# code and a space, e.g. "XXX 3:16 ..." -- presumably book code,
# chapter and verse.  The reference is rewritten as "{XXX:3:16}" and
# preceded by section markers:
#   - verse 1 gets both "@section 2 {cCHAPTER}" and "@section 3 {v1}";
#   - any other verse gets only "@section 3 {vVERSE}".
# Lines matching the rule pattern but not the CODE CHAPTER:VERSE shape
# are printed unchanged.  No further rule is applied to the line.
/^ *[A-Z][A-Z][A-Z][ ]/ {
  # Both regexps are anchored, so at most one match exists; the
  # how-argument is therefore the number 1 (a non-numeric string such
  # as "s" makes gawk warn on every call).

  # Verse 1: also opens a new level-2 section.  The character that
  # follows the verse number (group 4) is put back after the reference,
  # exactly as in the general case below, so that the separator between
  # the reference and the text is not lost for first verses only.
  $0 = gensub( \
    /^ *([A-Z][A-Z][A-Z])[ ]([0-9]+)[:](1)([^0-9]|$)/, \
    "@section 2 {c\\2}\n\n@section 3 {v\\3}\n\n  {\\1:\\2:\\3}\\4", 1, $0 \
  );
  # General case.  After a verse-1 rewrite the line starts with "@",
  # so this second substitution cannot match again.
  $0 = gensub( \
    /^ *([A-Z][A-Z][A-Z])[ ]([0-9]+)[:]([0-9]+)([^0-9]|$)/, \
    "@section 3 {v\\3}\n\n  {\\1:\\2:\\3}\\4", 1, $0 \
  );
  print;
  next;
}

# Blank lines, and lines whose first non-blank character is "#" or "@"
# (leading spaces and tabs allowed), are copied to the output untouched.
/^[ \011]*([\#@]|$)/ {
  print $0;
  next;
}

# Every remaining non-empty line is treated as ordinary text.
/./ {
  # Recode the accent marks and space out punctuation.
  $0 = remap_accents($0);

  # Whitespace normalization: strip trailing blanks, squeeze every run
  # of spaces/tabs to a single space, then force an indentation of
  # exactly two spaces (whether or not the line had leading blanks).
  sub(/[ \011]+$/, "");
  gsub(/[ \011]+/, " ");
  sub(/^[ ]*/, "  ");

  print;
  next;
}

END {
  # Propagate the status recorded by an error routine, if there is one.
  if (abort >= 0)
    exit abort;
}

function remap_accents(w)
{
  # Returns a copy of the text line "w" in which the VIQR accent codes
  # that look like punctuation have been replaced by other characters,
  # double quotes have been turned into opening/closing quote marks, and
  # punctuation has been surrounded by spaces.  The substitutions below
  # are applied in sequence and later ones see the output of earlier
  # ones, so their order matters.
  #
  # NOTE(review): the non-ASCII characters used below may have been
  # altered by a character-set conversion of this file -- confirm
  # against the intended output encoding.
  #
  # Remap accent codes to avoid confusion with punctuation:
  #
  #  dot-below "." -> "°"
  #  breve     "(" -> "ľ"
  #  hook      "?" -> "ß" 
  #  
  # Breve: "(" immediately after "A" or "a".  This runs before the
  # parentheses are spaced out below, so any "(" directly following
  # an "a" is taken as a breve.
  w = gensub(/([Aa])[\(]/, "\\1ľ", "g", w);
  # Dot-below and hook: "." or "?" immediately after a vowel, where the
  # vowel may carry one modifier (breve or "^" on a; "+" or "^" on o;
  # "+" on u; "^" on e; none on i and y).
  w = gensub(/([Aa][ľ\^]?|[Oo][\+\^]?|[Uu][\+]?|[Ee][\^]?|[IiYy])[.]/, "\\1°", "g", w);
  w = gensub(/([Aa][ľ\^]?|[Oo][\+\^]?|[Uu][\+]?|[Ee][\^]?|[IiYy])[?]/, "\\1ß", "g", w);
  
  # Remove "\"-protection from puncts
  # (a backslash followed by "." or "?" becomes that character with a
  # space on each side).
  w = gensub(/[\\]([.?])/, " \\1 ", "g", w);
  
  # Space out brackets and parentheses
  w = gensub(/([][()])/, " \\1 ", "g", w);

  # Replace some common quote patterns
  # A quote after a colon is an opening quote.
  w = gensub(/[:][ ]*[\"]/, ": Ť ", "g", w);
  # A quote followed (possibly after spaces) by , ; . ? ! or ) is a
  # closing quote.
  w = gensub(/[\"] *([,;.?!)])/, " ť \\1", "g", w);
  # A quote after . ! or ? and followed by "-" or the end of the line
  # is a closing quote.
  w = gensub(/([.!?]) *[\"] *([-]|$)/, "\\1 ť \\2", "g", w);
  # A quote at the start of the line is an opening quote.
  w = gensub(/^[ ]*[\"]/, "Ť ", "g", w);

  # Remaining quotes: space-quote-nonspace opens, nonspace-quote-space
  # closes.
  w = gensub(/[ ][\"]([^ ])/, " Ť \\1", "g", w);
  w = gensub(/([^ ])[\"][ ]/, "\\1 ť ", "g", w);
  
  # Isolate all punctuation:
  w = gensub(/([,.:;?!])/, " \\1 ", "g", w);

  # Except colons in verse numbers: a colon preceded by spaces and
  # followed (possibly after spaces) by digits is re-attached to the
  # digits, with the spaces before it removed.
  w = gensub(/ +[:] *([0-9]+)/, ":\\1", "g", w);
  return w;
}

function arg_error(msg)
{
  # Reports a usage/argument problem: writes "msg" and the usage line
  # to stderr, records failure in the global "abort", and terminates
  # with exit status 1.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("** usage: %s\n", usage) > "/dev/stderr";
  exit 1;
}

function data_error(msg)
{
  # Reports a problem in the input data: writes "msg" to stderr,
  # prefixed with the current value of FNR, records failure in the
  # global "abort", and terminates with exit status 1.
  abort = 1;
  printf("line %d: ** %s\n", FNR, msg) > "/dev/stderr";
  exit 1;
}

function load_lowercase_table(file,    nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW, one pair per line.  Lines that start
  # with "#" are skipped.
  # Stores the table in the global array "wmap[ORIGINAL] = NEW",
  # discarding whatever "wmap" held before.
  #
  # Every failure (read error, entry that does not have exactly two
  # fields, repeated key, no entries at all) is reported through
  # arg_error, which terminates the program.

  nMap = 0;
  split("", wmap);

  # Keep getline's result: 1 = a line was read, 0 = end of file,
  # -1 = the file could not be opened or read.
  while ((status = (getline lin < file)) > 0) {
    if (lin !~ /^[#]/) {
      nfld = split(lin, fld, " ");
      if (nfld != 2) { arg_error(("bad table entry = \"" lin "\"")); }
      if (fld[1] in wmap) { arg_error(("repeated key = \"" lin "\"")); }
      wmap[fld[1]] = fld[2];
      nMap++;
    }
  }

  # Decide from getline's status, not from ERRNO's value: ERRNO only
  # describes an error after one has happened, and comparing it with
  # "0" reports a failure whenever it is the empty string.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  # printf "** loaded %6d map pairs\n", nMap > "/dev/stderr"
}