#! /usr/bin/gawk -f
# Last edited on 2004-02-04 18:40:54 by stolfi

# Preprocess the Pentateuch in Vietnamese (VIQR)

BEGIN {
  # Filter for the Vietnamese (VIQR) text:
  #
  #  * every word is passed through the table read from "up-to-low.tbl",
  #    which is expected to lowercase ordinary Vietnamese words and to
  #    capitalize proper names and transliterated Hebrew words;
  #
  #  * a paragraph-final "=" is turned into a separate "@=" line
  #    (per the original notes, org-to-evt converts those back
  #    into "="-lines).

  # Exit status requested by an error routine; negative means "none yet".
  abort = -1;

  # Sample pipeline shown by arg_error().
  usage = "cat main.org \\\n";
  usage = usage "  | preprocess-org \\\n";
  usage = usage "  | evt-from-org -v omitControls=1 -v omitBlanks=1 \\\n";
  usage = usage "  > main.evt";

  # Running state:
  lastword = "";       # Last word of the most recent contents line.
  lastwasblank = 0;    # TRUE if last line written was a blank #-comment
                       # (not read anywhere in the visible code).
  nunitlines = 0;      # Contents lines seen since the last end_current_unit().

  load_lowercase_table("up-to-low.tbl");
}

(abort >= 0) {
  # An error routine already requested termination: stop reading input.
  exit abort;
}

/^[#]/ {
  # Line starting with "#": copied to the output.  When its second field
  # is "ALPHA", the fourth field first has its accent codes remapped.
  if ($2 != "ALPHA") { print; next; }
  $4 = remap_accents($4);
  print;
  next;
}

/^[ \011]*$/ {
  # Empty line, or only blanks and tabs: copied through untouched.
  print $0;
  next;
}

/.[@\#]/ {
  # An "@" or "#" preceded by at least one character is a fatal error.
  data_error("@ or # not on column 1");
}

/^[@](part|chapter|end)/ {
  # Structural directive: finish the unit in progress, then copy the line.
  end_current_unit();
  print $0;
  next;
}

/^[@]unit[ ][A-Z][ ][+]?[0-9]+[ ][-A-Za-z0-9:.]+[ ]*$/ {
  # Well-formed "@unit TYPE NUM NAME" directive.
  # Remember its three arguments in globals:
  un_name = $4;
  un_num = $3;
  un_type = $2;

  # Close the previous unit, copy the directive, and open the new unit.
  end_current_unit();
  print $0;
  begin_new_unit();
  next;
}

/^[@](format|include)/ {
  # These directives are copied through without further processing.
  print $0;
  next;
}

/[@]/ {
  # Any "@" that reaches this rule was not accepted by the rules above.
  data_error("invalid @-directive");
}

/./ {
  # Ordinary contents line.

  # Whitespace cleanup: drop trailing blanks/tabs, squeeze each run of
  # blanks/tabs to one space, then widen any leading run to two spaces.
  gsub(/[ \011]+$/, "", $0);
  gsub(/[ \011]+/, " ", $0);
  gsub(/^[ \011]+/, "  ", $0);

  # NOTE(review): fix_words() assigns to every field, and awk then rebuilds
  # $0 from the fields joined by OFS, so the two-space indent produced just
  # above presumably does not reach the output -- confirm what is intended.
  fix_words();

  # A final field "=" marks the end of a paragraph.
  lastword = $(NF);
  endparag = (lastword == "=");
  if (endparag) {
    # Remove the "=" from the line; what precedes it should be ".", "?",
    # "!" or "...", otherwise a warning goes to stderr.
    NF = NF - 1;
    if ((NF == 0) || ($(NF) !~ /^([.?!]|[.][.][.])$/)) {
      printf " «line %s : unexpected parag» ", FNR > "/dev/stderr";
    }
  }

  nunitlines++;
  print;
  if (endparag) {
    print "@=";
  }
  next;
}

END {
  # Propagate the status of an earlier error, if there was one.
  if (abort >= 0) {
    exit abort;
  }
  # Otherwise finish the unit that is still in progress, if any.
  if (nunitlines > 0) {
    end_current_unit();
  }
}

function fix_words(   k,word)
{
  # Rewrites every field of the current record: a field that is a key
  # of the global table "wmap" is replaced by its mapped value, and the
  # result then goes through remap_accents().
  k = 0;
  while (k < NF) {
    k++;
    word = $(k);
    word = (word in wmap) ? wmap[word] : word;
    $(k) = remap_accents(word);
  }
}

function remap_accents(w)
{
  # Returns "w" with its accent codes replaced by characters that
  # cannot be mistaken for punctuation:
  #
  #   code   accent      replacement
  #   "."    dot-below   "°"
  #   "("    breve       "µ"
  #   "?"    hook        "ß"
  #
  # A word that is exactly ".", "...", "(" or "?" is left untouched
  # with respect to that character.

  if (! ((w == ".") || (w == "...")))
    { gsub(/[.]/, "°", w); }

  if (! (w == "("))
    { gsub(/[\(]/, "µ", w); }

  if (! (w == "?"))
    { gsub(/[?]/, "ß", w); }

  return w;
}

function end_current_unit()
{
  # Closes the unit in progress.  If the unit had at least one contents
  # line and the last word recorded is not one of "=", ":", ";", ",",
  # "»" or "--", a warning is written to stderr.  In every case the
  # per-unit state ("nunitlines", "lastword") is reset.
  if ((nunitlines > 0) && (lastword !~ /^([=:;,»]|[-][-])$/)) {
    printf " «line %s : no punct» ", FNR > "/dev/stderr";
  }
  # (disabled in the original) output_comment_line("#");
  lastword = "";
  nunitlines = 0;
}

function begin_new_unit()
{
  # Sanity check made when a unit is opened: no contents line may have
  # been counted yet; otherwise the run is aborted through data_error().
  if (nunitlines == 0) { return; }
  data_error("inconsistent nunitlines (2)");
}

function arg_error(msg)
{
  # Writes "msg" followed by the usage text to stderr, records exit
  # status 1 in "abort", and terminates the program.
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Writes "msg" to stderr, prefixed with the current value of FNR,
  # records exit status 1 in "abort", and terminates the program.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function load_lowercase_table(file,    nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file" into the global array "wmap".
  #
  # Every line that does not start with "#" must consist of exactly two
  # blank-separated fields, ORIGINAL NEW, and is stored as
  # "wmap[ORIGINAL] = NEW".  Lines starting with "#" are skipped.
  #
  # Any problem -- unreadable file, malformed entry, repeated key, or a
  # table without entries -- is reported through arg_error(), which
  # terminates the program.  On success the number of pairs loaded is
  # written to stderr.

  nMap = 0;
  split("", wmap);   # Clears any previous contents of the table.

  while ((status = (getline lin < file)) > 0) {
    if (lin ~ /^[#]/) { continue; }
    nfld = split(lin, fld, " ");
    if (nfld != 2) { arg_error(("bad table entry = \"" lin "\"")); }
    if (fld[1] in wmap) { arg_error(("repeated key = \"" lin "\"")); }
    wmap[fld[1]] = fld[2];
    nMap++;
  }

  # getline yields a negative value when the file cannot be opened or
  # read.  That is tested instead of comparing ERRNO with "0": ERRNO is
  # documented as an error-description string, which is empty (not "0")
  # when nothing failed.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);

  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "loaded %6d map pairs\n", nMap > "/dev/stderr";
}