#! /usr/bin/gawk -f
# Last edited on 2001-12-31 14:04:43 by stolfi

# Process the Pentateuch in Vietnamese (VIQR)

BEGIN {
  # Exit status requested by an error handler; stays negative
  # until arg_error() or data_error() sets it.
  abort = -1;

  # Synopsis printed by arg_error().
  usage = "viet-bib-process \\\n" "  < main.org > main.evt";

  # Every contents line is written with a locator <Xppp.U.nnk>, where
  #   X    is a letter identifying the book (a,b,c...),
  #   ppp  is the chapter number (sequential through the book),
  #   U    is a letter identifying the unit type,
  #   nn   is the versicle number within the unit,
  #   k    is the line number within the versicle.
  #
  # The file "up-to-low.tbl" supplies an upper-to-lower mapping
  # for words that are not proper names.

  # Position in the book/chapter/unit/verse hierarchy ("" = none open).
  curbook = "";        # Current book code.
  curchapter = "";     # Current chapter number.
  curunit = "";        # Current unit code.
  curverse = "";       # Current verse number.

  # Bookkeeping on the lines written so far.
  nunitlines = 0;      # Number of contents lines already seen in current unit.
  nverselines = 0;     # Number of contents lines already seen in current verse.
  lastwasblank = 0;    # TRUE if last line written was a blank #-comment.
  lastword = "";       # Last word in current verse.

  load_lowercase_table("up-to-low.tbl");
}

# Once an error handler has set `abort', stop reading input.
abort >= 0 {
  exit abort;
}

/^ *[#]/ {
  # "#"-comment line: copy it to the output with leading blanks removed.
  # Field 4 is rewritten first when field 2 is one of two keywords.
  if ($2 == "ALPHA")
    { # Apply the same accent remapping used for contents words.
      $4 = remap_accents($4);
    }
  else if ($2 == "BLANK")
    { # Force field 4 to "_" (with the quotes) and drop anything after it.
      $4 = "\"_\"";
      NF = 4;
    }
  lin = $0;
  sub(/^ */, "", lin);
  output_comment_line(lin);
  next;
}

# An "@" preceded by at least one character is rejected; this rule
# runs before the whitespace cleanup, so leading blanks count too.
/.+[@]/ {
  data_error("@ not on column 1");
}

/./ {
  # Whitespace normalization for every non-empty line (contents and
  # control lines alike): trim both ends, then squeeze each inner run
  # of blanks/tabs to a single space.  Modifying $0 re-splits the fields.
  sub(/^[ 	]+/, "");
  sub(/[ 	]+$/, "");
  gsub(/[ 	]+/, " ");
}

/^[@]verse[ ][A-Z][A-Z]+[ ][a-z][ ][0-9]+[ ][A-Z][ ][0-9]+[ ]*$/ {
  # "@verse TAG b ccc U nn": start of a new verse.
  #   $2 = book tag, $3 = book letter, $4 = chapter number,
  #   $5 = unit letter, $6 = verse number.
  booktag = $2;
  book = $3;
  chapter = $4;
  unit = $5;

  # The new verse number is kept in `newverse' and only copied to the
  # global `verse' AFTER the previous verse has been closed.
  # output_contents_line() builds the line locator from `verse', and
  # end_current_verse() may still emit the "=" terminator line of the
  # previous verse; that line must carry the previous verse's number,
  # not the new one (which could collide with a real line of the new
  # verse).
  newverse = $6;

  # Close whatever level of the hierarchy changed (closing a level
  # closes every level nested inside it).
  if (book != curbook)
    { end_current_book(); }
  else if (chapter != curchapter)
    { end_current_chapter(); }
  else if (unit != curunit)
    { end_current_unit(); }
  else
    { if (newverse == curverse)
        { data_error(("repeated verse \"" newverse "\"")); }
      end_current_verse();
    }

  verse = newverse;

  # output_control_line($0);
  # Re-open the levels that were closed above.
  if (book != curbook) { begin_new_book(book); }
  if (chapter != curchapter) { begin_new_chapter(chapter); }
  if (unit != curunit) { begin_new_unit(unit); }
  begin_new_verse(verse);
  next;
}

/^[@]end[ ]*$/ {
  # "@end" directive: close the current book (and everything nested
  # inside it).  The directive itself is not copied to the output.
  # output_control_line($0);
  end_current_book();
  next;
}

/^[@]/ {
  # Any "@" line that reached this point was not consumed (via `next')
  # by the "@verse" or "@end" rules above, so it is not a recognized
  # directive.  data_error() exits, so no later rule sees this record.
  data_error("invalid @ directive");
}

/^ *$/ {
  # Blank lines (after whitespace cleanup) produce no output.
  next;
}

/./ {
  # Anything that survived the rules above is a line of text:
  # fix the case and accents of each word, then write it out with
  # a trailing blank appended.
  fix_words();
  output_contents_line($0 " ");
  next;
}

END {
  # If an error handler requested termination, just propagate its status.
  if (abort >= 0)
    { exit abort; }
  # Otherwise close whatever is still open and append the
  # encoding notes as comment lines.
  end_current_book();
  insert_file("viqr-encoding.txt");
}

function fix_words(   i,w)
{
  # Rewrites every field of the current record in place: a word found
  # in the global `up_to_low' table is replaced by its mapped form,
  # and the result is then passed through remap_accents().
  i = 0;
  while (++i <= NF)
    { w = ($(i) in up_to_low) ? up_to_low[$(i)] : $(i);
      $(i) = remap_accents(w);
    }
}

function remap_accents(w)
{
  # Returns `w' with the accent codes that look like punctuation
  # replaced by distinct characters:
  #
  #  dot-below "." -> "°"
  #  breve     "(" -> "µ"
  #  hook      "?" -> "ß"
  #
  # A word that consists solely of ".", "...", "(" or "?" is genuine
  # punctuation, so that particular character is left alone in it.
  if (! ((w == ".") || (w == "...")))
    { gsub(/[.]/, "°", w); }
  if (! (w == "("))
    { gsub(/[\(]/, "µ", w); }
  if (! (w == "?"))
    { gsub(/[?]/, "ß", w); }
  return w;
}

function end_current_book()
{
  # Closes the current book, if any: closes its open chapter, logs
  # the event on stderr, and resets `curbook' to "".
  if (curbook == "")
    { # No book is open, so no chapter may be open either.
      if (curchapter != "")
        { data_error("inconsistent curchapter (0)"); }
      return;
    }
  end_current_chapter();
  printf "end book %s\n", curbook > "/dev/stderr";
  curbook = "";
}

function end_current_chapter()
{
  # Closes the current chapter, if any: closes its open unit, resets
  # `curchapter' to "", and writes the closing "]" of the progress
  # trace to stderr.
  if (curchapter == "")
    { # No chapter is open, so no unit may be open either.
      if (curunit != "")
        { data_error("inconsistent curunit"); }
      return;
    }
  end_current_unit();
  curchapter = "";
  printf "]\n" > "/dev/stderr";
}

function end_current_unit()
{
  # Closes the current unit, if any: closes its open verse, resets
  # `curunit' to "", writes the unit's line count followed by ")" to
  # the progress trace on stderr, and zeroes `nunitlines'.
  if (curunit != "")
    { end_current_verse();
      curunit = "";
      printf "%d)", nunitlines > "/dev/stderr";
      nunitlines = 0;
    }
  else
    { # No unit is open: there must be no open verse and no lines
      # counted against a unit.
      if (curverse != "")
        { data_error("inconsistent curverse"); }
      if (nunitlines != 0)
        { data_error("inconsistent nunitlines"); }
    }
}

function end_current_verse()
{
  # Closes the current verse, if any.  When the verse produced at
  # least one contents line, its last word decides what follows:
  #   - sentence-final punctuation (".", "?", "!", "..."): an extra
  #     contents line "=" is written;
  #   - other accepted punctuation (":", ";", ",", "»", "--"): nothing;
  #   - anything else: a warning goes to stderr.
  # In every such case a blank "#" comment line is then written.
  if (curverse == "")
    { # No verse is open, so no verse lines may have been counted.
      if (nverselines != 0)
        { data_error("inconsistent nverselines"); }
      return;
    }
  curverse = "";
  if (nverselines > 0)
    { if (lastword ~ /^([.?!]|[.][.][.])$/)
        { output_contents_line("="); }
      else if (lastword !~ /^([:;,»]|[-][-])$/)
        { printf " «line %s : no punct» ", FNR > "/dev/stderr"; }
      output_comment_line("#");
    }
  nverselines = 0;
  lastword = "";
}

function begin_new_book(book)
{
  # Opens book `book'.  The previous book must already be closed.
  # The event is logged on stderr.
  if (curbook != "")
    { data_error("inconsitent curbook (2)"); }
  curbook = book;
  printf "begin book %s\n", book > "/dev/stderr";
}

function begin_new_chapter(chapter)
{
  # Opens chapter `chapter' of the current book.  A book must be open
  # and the previous chapter closed.  Writes a "## <Xppp>" header
  # comment to the output and "[ppp=" to the progress trace on stderr.
  if (curbook == "")
    { data_error("unspecified book"); }
  if (curchapter != "")
    { data_error("inconsistent curchapter (2)"); }
  curchapter = chapter;
  output_comment_line(sprintf("## <%s%03d>", curbook, chapter));
  printf "[%d=", chapter > "/dev/stderr";
}

function begin_new_unit(unit)
{
  # Opens a new unit with the given letter code.  A chapter must be
  # open, the previous unit must be closed, and the per-unit line
  # counter must have been reset.  Writes "(U:" to the progress trace
  # on stderr, and a blank "#" comment line to the output unless the
  # last line written was already one.
  if (curchapter == "") { data_error("unspecified chapter"); }
  if (curunit != "") { data_error("inconsistent curunit (2)"); }
  if (nunitlines != 0) { data_error("inconsistent nunitlines (2)"); }
  curunit = unit;
  printf "(%s:", curunit > "/dev/stderr";
  if (! lastwasblank) { output_comment_line("#"); }
}

function begin_new_verse(verse)
{
  # Opens verse number `verse' in the current unit.  A unit must be
  # open, and the previous verse must be closed with its line counter
  # reset.
  if (curunit == "")
    { data_error("unspecified unit"); }
  if (curverse != "")
    { data_error("inconsistent curverse (2)"); }
  if (nverselines != 0)
    { data_error("inconsistent nverselines (2)"); }
  curverse = verse;
}

function output_contents_line(lin,   loc,i)
{
  # Writes `lin' as one contents line, prefixed by its locator
  # <Xppp.U.nnk>, and updates the line counters, `lastwasblank'
  # and `lastword'.
  #
  # The verse part of the locator is taken from the global `verse'
  # (not from `curverse'); the line-in-verse digit is `nverselines',
  # so a verse may have at most 9 lines.
  nunitlines += 1;
  nverselines += 1;
  if (nverselines >= 10)
    { data_error(("too many lines in verse")); }
  curline = 10*verse + nverselines;

  # All three levels above the verse must be open.
  if (curbook == "")    { data_error("unspecified book (2)"); }
  if (curchapter == "") { data_error("unspecified chapter (2)"); }
  if (curunit == "")    { data_error("unspecified unit (2)"); }

  # Blanks become underscores in the written text.
  gsub(/[ 	]+/, "_", lin);

  loc = sprintf("<%s%03d.%s.%03d>", 
    curbook, curchapter, curunit, curline);
  printf "%-18s %s\n", loc, lin;
  lastwasblank = 0;

  # Remember the last word of the line: strip trailing underscores,
  # then everything up to and including the last remaining underscore.
  lastword = lin;
  sub(/[_][_]*$/, "", lastword);
  sub(/^.*[_]/, "", lastword);
}

function output_comment_line(lin)
{
  # Writes the comment line `lin' (which must start with "#") with
  # trailing blanks removed.  Sets `lastwasblank' to 1 when the line
  # is just "#" optionally followed by spaces, and to 0 otherwise.
  sub(/[ 	]+$/, "", lin);
  if (lin !~ /^[#]/)
    { data_error("bad comment"); }
  lastwasblank = (lin ~ /^[#][ ]*$/) ? 1 : 0;
  print lin;
}

function output_control_line(lin)
{
  # Copies an "@" control line to the output unchanged.
  # (Every call to this function in this file is commented out.)
  print (lin);
}

function arg_error(msg)
{
  # Reports an invocation/setup problem: prints `msg' and the usage
  # synopsis on stderr, records exit status 1 in `abort', and exits.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a problem with the input data: prints `msg' on stderr,
  # tagged with the current input line number (FNR), records exit
  # status 1 in `abort', and exits.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function load_lowercase_table(file,    nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW, one per line.  Lines starting with
  # "#" are comments.
  # Stores the table in the global "up_to_low[ORIGINAL] = NEW".
  #
  # Aborts through arg_error() when the file cannot be read, when a
  # line does not have exactly two fields, when a key is repeated,
  # or when the file contributes no pairs at all.

  nMap = 0;
  split("", up_to_low);   # Empty the table.

  # getline yields 1 per line read, 0 at end of file and -1 on error;
  # the final status is kept so that a read failure can be told apart
  # from a normal end of file.
  while ((status = (getline lin < file)) > 0)
    { if (lin ~ /^[#]/) { continue; }
      nfld = split(lin, fld, " ");
      if (nfld != 2)
        { arg_error(("bad table entry = \"" lin "\"")); }
      if (fld[1] in up_to_low)
        { arg_error(("repeated key = \"" lin "\"")); }
      up_to_low[fld[1]] = fld[2];
      nMap++;
    }
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "loaded %6d map pairs\n", nMap > "/dev/stderr";
}

function insert_file(file,    nRead,lin,status)
{
  # Copies the specified file into the output stream, as comments:
  # leading blanks/tabs are removed from each line, and "# " is
  # prepended to any line that does not already start with "#".
  #
  # Aborts through arg_error() when the file cannot be read or
  # yields no lines.

  nRead = 0;

  # getline yields 1 per line read, 0 at end of file and -1 on error;
  # the final status is kept so that a read failure can be told apart
  # from a normal end of file.
  while ((status = (getline lin < file)) > 0)
    {
      gsub(/^[ \011]+/, "", lin);
      if (lin !~ /^[#]/) { lin = ("# " lin); }
      print lin;
      nRead++;
    }
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);
  if (nRead == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "copied %6d lines from %s\n", nRead, file > "/dev/stderr";
}