#! /usr/bin/gawk -f
# Last edited on 2016-05-09 23:28:02 by stolfilocal
 

# Reads a file in ".wds" format and prints the text as LaTeX source.

# !!! Maybe too specific to {port/cso} and {port/csm} !!!

BEGIN {
  # Initializes the global conversion state and writes the header
  # comment line of the generated LaTeX output.

  # Exit status requested by an error routine; negative means "no error".
  abort = -1;
  # Command-line synopsis printed by {arg_error}.
  # NOTE(review): "recode-tex.gawk" presumably supplies {tex_recode}, which
  # is called by the word rule but not defined in this file -- confirm.
  usage = ( ARGV[0] " \\\n" \
    "  -f recode-tex.gawk < some.wds > some.txt" \
  );
  # 
  # See "wds-format.txt" for a description of the input file format.

  nlines = 0; # Number of lines read.
  nwdin = 0;  # Number of words/symbols read.
  nptin = 0;  # Number of punctuation symbols read.
  
  olin = "";  # Current output line.
 
  # Global state:
  # 0 waiting for {b} section (book start)
  # 1 after {b}, waiting for {b}{tpg} (title page).
  # 2 after {b}{tpg}, waiting for {b}{tpg}{tt} (title)
  # 3 after {b}{tpg}{tt}, waiting for word or {b}{tpg}{au} (author)
  # 4 after {b}{tpg}{au}, waiting for word or {b}{XX}.. (start of chapter)
  # 5 after {b}{XX}, waiting for {b}{XX}{cn} (chapter number)
  # 6 after {b}{XX}{cn}, waiting for word or {b}{XX}{tt} (chapter title)
  # 7 after {b}{XX}{tt}, waiting for word or subsections, *{tx}, {b}{YY}, or EOF.
  # 8 after {b}{XX}{tx}, waiting for word or subsections, *{tx}, {b}{YY}, or EOF.
  gstate = 0;  
   
  # Paragraph state:
  intx = 0;    # 1 within horizontal mode text, 0 vertical mode
  ital = 0;    # 1 within italic text.
  math = 0;    # 1 within math formula.

  pword = "";  # Previous unconverted word in the paragraph.
  ptype = "";  # The type ("a", "p", "s") of the previous word.
 
  # The output starts with a LaTeX comment line.  A literal "%" in a printf
  # format must be written "%%"; a bare "% C" is not a valid conversion.
  printf "%% Created by wds-to-tex.gawk\n"
}

# Once an error routine has set {abort}, stop reading input and exit with that status.
abort >= 0 { exit abort; }

// {
  # Runs on every input record before the more specific rules:
  # counts the record and normalizes its whitespace in place.
  nlines += 1;
  # Octal 011, 014, 015 and 240 (tab, form feed, carriage return, and
  # presumably a Latin-1 no-break space) each become a plain blank:
  gsub(/[\011\014\015\240]/, " ", $0);
  # Strip the run of blanks at the end of the record, if any:
  sub(/[ ]+$/, "", $0);
}

/^[\#][ ]*$/ {
  # A lone "#", possibly followed by blanks: an empty comment. Skip the record.
  next;
}

# ----------------------------------------------------------------------
# Book

/^[\$][ ]+{b} *$/ {
  # Section line "$ {b}": the book begins.
  # Only expected in the initial state and outside any text; otherwise warn.
  if (!((gstate == 0) && (intx == 0)))
    { data_warning("misplaced start-of-book"); }
  gstate = 1;
  next;
}

# ----------------------------------------------------------------------
# Title page

/^[\$][ ].*{tpg} *$/ {
  # Section line ending in "{tpg}": the title page begins.
  # Expected right after the start of book (state 1), outside any text.
  if (!((gstate == 1) && (intx == 0)))
    { data_warning("misplaced title page"); }
  # No italic or math group should be open at this point:
  if (!((ital == 0) && (math == 0)))
    { data_warning("funny {math,ital} state"); }
  gstate = 2;
  next;
}

/^[\$][ ].*{tpg}{tt} *$/ {
  # Section line ending in "{tpg}{tt}": the book title begins.
  # Expected right after the title page start (state 2), outside any text.
  if (!((gstate == 2) && (intx == 0)))
    { data_warning("misplaced book title"); }
  if (!((ital == 0) && (math == 0)))
    { data_warning("funny {math,ital} state"); }
  # The title is the first argument of the "\titlepage" macro:
  start_text("\\titlepage{");
  gstate = 3;
  next;
}

/^[\$][ ].*{tpg}{au} *$/ {
  # Section line ending in "{tpg}{au}": the book author begins.
  # Expected while the book title (state 3) is still open as text.
  if (!((gstate == 3) && (intx != 0)))
    { data_warning("misplaced book author"); }
  # Close whatever is open, innermost group first; this ends the title argument:
  close_math();
  close_ital();
  close_text();
  # The author is emitted as a further brace-delimited argument:
  start_text("{");
  gstate = 4;
  next;
}

# ----------------------------------------------------------------------
# Chapter headers:
# !!! Must generalize !!!

/^[\$][ ].*{c[0-9]+} *$/ {
  # Section line ending in "{cN}" (N a decimal number): a chapter begins.
  # Expected after the book author (state 4) or after a chapter title
  # or chapter prose (states 7 and 8).
  if (!((gstate == 4) || (gstate == 7) || (gstate == 8)))
    { data_warning("misplaced start of chapter"); }
  # Finish any text still open, innermost group first:
  close_math();
  close_ital();
  close_text();
  gstate = 5;
  next;
}

/^[\$][ ].*{c[0-9]+}{cn} *$/ {
  # Section line ending in "{cN}{cn}": the chapter number begins.
  # Expected right after the chapter start (state 5), outside any text.
  if (!((gstate == 5) && (intx == 0)))
    { data_warning("misplaced start of chapter number"); }
  if (!((ital == 0) && (math == 0)))
    { data_warning("funny {math,ital} state"); }
  # Flush the pending output line; the second call then prints an empty line:
  output_olin();
  output_olin();
  # The chapter number is the first argument of the "\chapt" macro:
  start_text("\\chapt{");
  gstate = 6;
  next;
}

/^[\$][ ].*{c[0-9]+}{tt} *$/ {
  # Section line ending in "{cN}{tt}": the chapter title begins.
  # Expected while the chapter number (state 6) is still open as text.
  if (!((gstate == 6) && (intx != 0)))
    { data_warning("misplaced start of chapter title"); }
  # Close whatever is open, innermost group first; this ends the number argument:
  close_math();
  close_ital();
  close_text();
  # The title is emitted as a further brace-delimited argument:
  start_text("{");
  gstate = 7;
  next;
}

/^[\$][ ].*{tx} *$/ {
  # Section line ending in "{tx}": normal prose starts or resumes.
  # Expected after a chapter title (state 7) or within chapter prose (state 8).
  if (!((gstate == 7) || (gstate == 8))) {
    data_warning("misplaced start of normal prose (gstate = " gstate ")");
  }
  # Finish any text still open, innermost group first:
  close_math();
  close_ital();
  close_text();
  # Flush the pending output line; the second call then prints an empty line:
  output_olin();
  output_olin();
  # Open a fresh paragraph:
  start_text("\\parag{");
  gstate = 8;
  next;
}

/^[\$][ ].*{(lat|frc|ita|eng)[pv]} *$/ { 
  # Start of a foreign phrase section; the tag is one of "lat", "frc",
  # "ita", "eng" followed by "p" or "v" (e.g. {latp}).  Emits the macro
  # "\TAG{" and enters italic mode through {start_ital}.  The current
  # paragraph is not broken and {intx} is left unchanged.
  if ((gstate != 3) && (gstate != 7) && (gstate != 8)) { data_warning(("misplaced foreign phrase")); }
  if (gstate != 8) { data_warning(("foreign phrase in book/chapter title")); }
  # Close any open math or italic group before opening the new one:
  close_math();
  close_ital();
  # Locate the tag in the section line.  The matched span includes the
  # enclosing braces, so drop the first and last character to get the bare
  # tag name; keeping them would emit "\{latp}{" instead of "\latp{".
  match($0, /{(lat|frc|ita|eng)[pv]}/);
  sectag = substr($0, RSTART + 1, RLENGTH - 2);
  start_ital(("\\" sectag "{"));
  next;
}

/^[\$][ ].*{math} *$/ { 
  # Start of a math formula section: closes any formula still open, then
  # emits "\mth{" and enters math mode.  The current paragraph is not
  # broken and {intx} is left unchanged.
  if ((gstate != 3) && (gstate != 7) && (gstate != 8)) { data_warning(("misplaced math formula")); }
  close_math();
  # {start_math} sets {math = 1} itself and aborts with a data error when
  # {math} is already nonzero, so the flag must NOT be raised here first.
  start_math("\\mth{");
  next;
}

# ----------------------------------------------------------------------
# Other sections:

/^[\$][ ]/ {
  # Any section line not recognized by the rules above.
  # It is reported, then treated as a paragraph break: the open text is
  # closed and a new paragraph is opened at once.
  data_warning("unhandled section «" $0 "»");
  if (!((gstate == 3) || (gstate == 7) || (gstate == 8)))
    { data_warning("misplaced section"); }
  # Close whatever is open, innermost group first:
  close_math();
  close_ital();
  close_text();
  # {start_text} sets {intx} to 1, so no separate assignment is needed:
  start_text("\\parag{");
  next;
}

# ----------------------------------------------------------------------
# Non-sections:

/^[@\#][ ]/ {
  # Record starting with "@ " (internal info) or "# " (comment):
  # it contributes nothing to the output, skip it.
  next;
}

/^[asp][ ]/ {
  # Word record: a type letter ("a" alpha, "s" symbol, "p" punctuation),
  # one blank, then the token.  Translates the token to LaTeX and appends
  # it to the output line {olin}, flushing the line when it does not fit.
  if ((gstate <= 2) || (gstate == 5)) { data_warning("misplaced word (gstate = " gstate ")"); }

  # A word seen in vertical mode implicitly opens a paragraph:
  if (intx == 0) { start_text("\\parag{"); }

  # Split the record into type and token, and update the input counters:
  type = substr($0, 1, 1);
  word = substr($0, 3);
  if (type != "p") { nwdin++; } else { nptin++; }

  # Defaults: no space before, token copied verbatim, no forced line break.
  befo = ""; wtex = word; neol = 0;

  # Standard punctuation codes are handled here; everything else goes to
  # the text-specific {tex_recode}:
  if ((type == "p") && (word == "=")) {
    # End of paragraph: emits nothing itself; the open groups and the
    # paragraph are closed further below.
    wtex = "";
    neol = 2;
  } else if ((type == "p") && (word == "÷")) {
    # Hard line break:
    wtex = "\\\\";
    neol = 1;
  } else if ((type == "p") && (word == "_") && (! math)) {
    # Italic toggle (not honored inside a math formula):
    if (! ital) {
      befo = " "; wtex = "\\emph{"; ital = 1;
    } else {
      befo = ""; wtex = "}"; ital = 0;
    }
  } else {
    # {act} is cleared, then filled in by {tex_recode}:
    split("", act);
    tex_recode(pword, ptype, math, type, word, act);
    befo = act[1];  # Space to insert before {wtex}, if no line break.
    wtex = act[2];  # The {word} converted to LaTeX.
  }

  # A line break is possible only where a space would go.  Break when the
  # token, with its space, would bring the line to 72 characters or more:
  if ((befo != "") && (length(olin) + length(befo) + length(wtex) >= 72)) {
    # Flush the line and start the next one with {wtex}, without the space:
    output_olin();
    olin = wtex;
  } else {
    # Append the token with its space before:
    olin = (olin befo wtex);
  }

  if (neol == 1) {
    # Hard break: just flush the current line.
    output_olin();
  } else if (neol == 2) {
    # End of paragraph: close all open groups, flush the line, print an
    # empty line, and open the next paragraph right away.
    close_math();
    close_ital();
    close_text();
    output_olin();
    output_olin();
    start_text("\\parag{");
  }

  # Remember this token as context for the next one:
  pword = word;
  ptype = type;
  next;
}

// {
  # No earlier rule claimed this record (each of them ends with "next"),
  # so its format is unknown: report it as a fatal data error.
  data_error("bad line format");
  next;
}

END {
  # After an error, just propagate the requested exit status:
  if (abort >= 0) { exit abort; }

  # Close whatever is still open, innermost group first, and flush the
  # last output line unless it is empty:
  close_math();
  close_ital();
  close_text();
  if (olin != "") { output_olin(); }

  # Input statistics go to the error stream, not into the generated text:
  printf "%8d lines read\n", nlines > "/dev/stderr";
  printf "%8d words/symbols read\n", nwdin > "/dev/stderr";
  printf "%8d punctuation read\n", nptin > "/dev/stderr";
}

function start_math(macro)
{
  # Opens a math formula: appends the opening text {macro} to the output
  # line and sets {math} to 1.  It is a fatal data error to call this
  # while a formula is already open.
  # Also sets the previous-word context to {pword="$"}, {ptype="."}.
  # NOTE(review): "." is not one of the documented types "a", "p", "s";
  # confirm what {tex_recode} expects here.
  if (math != 0) { data_error("close {math} first"); }
  pword = "$";
  ptype = ".";
  math = 1;
  olin = ( olin macro );
}

function close_math()
{
  # Closes the math formula, if one is open: appends "}" to the output
  # line and sets the previous-word context to {pword="$"}, {ptype="."}.
  # In every case {math} is 0 on return.
  if (math != 0) {
    pword = "$";
    ptype = ".";
    olin = ( olin "}" );
  }
  math = 0;
}

function start_ital(macro)
{
  # Opens an italic section: appends the opening text {macro} to the
  # output line and sets {ital} to 1.  It is a fatal data error to call
  # this while a math formula or another italic section is open.
  # {pword} and {ptype} are left untouched.
  if (!((math == 0) && (ital == 0))) { data_error("close {math,ital} first"); }
  ital = 1;
  olin = ( olin macro );
}

function close_ital()
{
  # Closes the italic section, if one is open, by appending "}" to the
  # output line; {ital} is 0 on return.  A math formula must have been
  # closed first, otherwise this is a fatal data error.
  # {pword} and {ptype} are left untouched.
  if (math != 0) { data_error("close {math} first"); }
  if (ital != 0) {
    olin = ( olin "}" );
  }
  ital = 0;
}

function start_text(macro)
{
  # Opens a text item (paragraph, title, ...): appends the opening text
  # {macro} to the output line and enters horizontal mode ({intx} = 1).
  # The paragraph state is reset: no italic, no math, no previous word.
  olin = ( olin macro );
  intx = 1;
  ital = 0;
  math = 0;
  pword = "";
  ptype = "";
}

function close_text()
{
  # Closes the current text item (paragraph or book/chapter title), if
  # one is open: appends "}" to the output line and sets the
  # previous-word context to {pword="="}, {ptype="p"}.
  # {intx} is 0 on return.  Math and italic must have been closed
  # beforehand, otherwise this is a fatal data error.
  if (!((math == 0) && (ital == 0))) { data_error("close {math,ital} first"); }
  if (intx != 0) {
    pword = "=";
    ptype = "p";
    olin = ( olin "}" );
  }
  intx = 0;
}

function output_olin()
{
  # Writes the current output line {olin} to standard output, then
  # empties it.  Called on an empty {olin}, it prints an empty line.
  print (olin);
  olin = "";
}

function arg_error(msg)
{
  # Reports the command-line problem {msg} together with the usage
  # synopsis on the error stream, records the failure in {abort}, and
  # terminates with status 1.
  abort = 1;
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  exit 1;
}

function data_warning(msg)
{
  # Non-fatal diagnostic: prints {msg} on the error stream, prefixed with
  # the current input line number.  Processing continues.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
}

function data_error(msg)
{
  # Fatal diagnostic: prints {msg} with the current input line number,
  # followed by the offending record itself, on the error stream; then
  # records the failure in {abort} and terminates with status 1.
  abort = 1;
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  printf "  %s\n", $0 > "/dev/stderr";
  exit 1;
}

function tbl_error(f,n,msg)
{
  # Fatal diagnostic for a problem at line {n} of file {f}: prints
  # {msg} on the error stream, records the failure in {abort}, and
  # terminates with status 1.
  abort = 1;
  printf "file %s, line %d: %s\n", f, n, msg > "/dev/stderr";
  exit 1;
}