#! /usr/bin/gawk -f
# Last edited on 2012-02-19 06:00:08 by stolfilocal
 

# Reads a file in ".wds" format and prints its text as plain text,
# marked up with a few macros (\titpg, \chapt, \emph, \mth, \finpg).

# !!! Maybe too specific to {port/cso} and {port/csm} !!!

BEGIN {
  # Exit status requested by an error routine; negative means "no error".
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    "  < some.wds > some.txt" \
  );
  # 
  # See "wds-format.txt" for a description of the input file format.

  nlines = 0; # Number of lines read.
  nwdin = 0;  # Number of words/symbols read.
  nptin = 0;  # Number of punctuation symbols read.
  
  olin = "";  # Current output line.
  
  # Parsing state (should use current section):
  ital = 0;   # 1 within italic text.
  math = 0;   # 1 within math formula.
  
  intx = 0;   # 1 within normal text.
  inct = 0;   # 1 within book title, book author, chapter number or chapter title
  
  # Banner line written to standard output.  The percent sign must be
  # written as "%%" in a printf format; a lone "%" followed by " C" is
  # not a valid conversion and only printed correctly by accident.
  printf "%% Created by wds-to-plaintext.gawk\n"
}

# Once an error routine has stored a status in {abort}, stop reading input.
abort >= 0 {
  exit abort;
}

// {
  # Runs for every record and falls through to the rules below.
  # Count the record first, then tidy it up in place.
  nlines = nlines + 1;
  # Tabs, form feeds, carriage returns and octal-240 characters become blanks:
  gsub(/[\011\014\015\240]/, " ");
  # Drop any blanks left at the end of the record:
  gsub(/[ ]+$/, "");
}

/^[\#][ ]*$/ {
  # A "#" followed by nothing but blanks carries no data: skip the record.
  next;
}

# ----------------------------------------------------------------------
# Title page

/^[\$][ ].*{tpg} *$/ {
  # Title page begins.  It is expected only while neither prose nor a
  # title-like field is open; otherwise complain but carry on.
  if (intx || inct) {
    data_warning("misplaced title page");
  }
  # Reset the section state; nothing is appended to {olin} here.
  intx = inct = math = 0;
  next;
}

/^[\$][ ].*{tpg}{tt} *$/ {
  # Book title begins: warn unless the state is completely idle, then
  # start a fresh output line holding the opening of the title macro.
  if (!(intx == 0 && inct == 0)) {
    data_warning("misplaced book title");
  }
  olin = "\\titpg{";
  math = 0;
  intx = 0;
  inct = 1;  # A title-like field is now open.
  next;
}

/^[\$][ ].*{tpg}{au} *$/ {
  # Book author begins.  It should directly follow the book title, i.e.
  # a title-like field is open ({inct} is 1) and no prose is ({intx} is 0).
  if (!(intx == 0 && inct == 1)) {
    data_warning("misplaced book author");
  }
  # Close the current macro argument and open the next one:
  olin = olin "}{";
  intx = math = 0;
  next;
}

# ----------------------------------------------------------------------
# Chapter headers:

/^[\$][ ].*{c[0-9]+} *$/ { 
  # Start of chapter: close whatever is still open on the pending line,
  # flush it, and reset the section state.
  if (inct != 0)
    { # Assume it was in a book title:
      olin = ( olin "}" );
    }
  if (math != 0)
    { # Must have been in math formula:
      olin = ( olin "}" );
    }
  if (ital != 0) 
    { # Italics were never closed by a matching "_":
      data_warning(("unclosed italics"));
      olin = ( olin "}" );
      # The group is closed now; forget it, or the next "_" token would
      # be taken as a closer and emit a stray "}".
      ital = 0;
    }
  if (intx != 0)
    { # Handle as parag break (gives one extra blank line):
      output_olin();
    }
  output_olin();
  intx = 0;
  inct = 0;
  math = 0;
  next;
}

/^[\$][ ].*{c[0-9]+}{cn} *$/ {
  # Chapter number begins.  A title-like field that is still open here
  # means the chapter number was given twice.
  if (inct) {
    data_warning("dup chapter number");
  }
  # Flush the pending line, then begin the chapter macro on a new one:
  output_olin();
  olin = "\\chapt{";
  math = 0;
  intx = 0;
  inct = 1;
  next;
}

/^[\$][ ].*{c[0-9]+}{tt} *$/ {
  # Chapter title begins.  It should come right after the chapter
  # number, i.e. with {inct} set and {intx} clear.
  if (!(intx == 0 && inct == 1)) {
    data_warning("chapter title without chapter number");
  }
  # End the number argument and open the title argument:
  olin = olin "}{";
  intx = math = 0;
  next;
}

/^[\$][ ].*{tx} *$/ {
  # Normal prose starts or resumes.
  # An open title-like field and an open math formula each get their
  # closing brace first:
  if (inct) { olin = olin "}"; }
  if (math) { olin = olin "}"; }
  # Coming from outside prose counts as a paragraph break: flush the
  # pending line and follow it with an empty one.
  if (!intx) {
    output_olin();
    output_olin();
  }
  math = 0;
  inct = 0;
  intx = 1;
  next;
}

/^[\$][ ].*{(latp|frcp|itap|engp)} *$/ {
  # A foreign-phrase section begins.  It is treated as running prose:
  # {intx} is set and the paragraph is not broken.
  if (inct) {
    data_warning("foreign phrase in chapter title");
  }
  # Leaving a math formula needs its closing brace:
  if (math) {
    olin = olin "}";
  }
  math = 0;
  intx = 1;
  next;
}

/^[\$][ ].*{math} *$/ { 
  # Start of math formula: append "\mth{" to the pending line and set
  # {intx,math}, but do not break the paragraph.
  # The warning used to repeat the foreign-phrase wording, which
  # misreported what was found inside the title.
  if (inct != 0) { data_warning(("math formula in chapter title")); }
  # A formula that is still open is closed before the new one starts:
  if (math != 0) {  olin = ( olin "}" ); }
  olin = ( olin "\\mth{" );
  intx = 1;
  math = 1;
  next;
}

# ----------------------------------------------------------------------
# Final page

/^[\$][ ].*{fpg} *$/ { 
  # Start of final page: close an open math formula, end the current
  # paragraph if prose was open, and reset the section state.
  if (inct != 0)
    { data_warning(("misplaced final page")); }
  if (math != 0)
    { olin = ( olin "}" );
      # The formula is closed now; clear the flag so that a later section
      # rule does not append a second "}" for the same formula.
      math = 0;
    }
  if (intx != 0)
   { # Handle as parag break:
     output_olin();
     output_olin();
   }
  intx = 0;
  inct = 0;
  next;
}

/^[\$][ ].*{fpg}{tt} *$/ {
  # Final-page title begins: warn unless the state is idle, then start
  # a fresh output line with the opening of the final-page macro.
  if (!(intx == 0 && inct == 0)) {
    data_warning("misplaced final title");
  }
  olin = "\\finpg{";
  intx = 0;
  inct = 1;  # Left open; the END rule supplies the closing brace.
  next;
}

# ----------------------------------------------------------------------
# Other sections:

/^[\$][ ]/ {
  # Any "$" line that none of the section rules above consumed.
  # Report it, treat it as the end of a paragraph, and leave both the
  # prose and the title-like state cleared.
  data_warning("unhandled section «" $0 "»");
  output_olin();
  output_olin();
  intx = inct = 0;
  next;
}

# ----------------------------------------------------------------------
# Non-sections:

/^[@\#][ ]/ {
  # "@" (internal info) and "#" (comment) lines contribute nothing to
  # the output: skip the record.
  next;
}

/^[asp][ ]/ {
  # Token line: a one-letter type ("a", "s" or "p"), a blank, then the
  # token text.  The token is appended to the pending line {olin},
  # preceded by the separator {befo}.
  type = substr($0, 1, 1);
  word = substr($0, 3);

  # {prev} is the last character of the pending line and {bisp} the one
  # before it; each is "" when the line is too short to have it.
  nolin = length(olin);
  prev = "";
  bisp = "";
  if (nolin >= 1) { prev = substr(olin, nolin, 1); }
  if (nolin >= 2) { bisp = substr(olin, nolin - 1, 1); }

  if (type == "p") {
    # Punctuation.
    nptin++;

    # Two marks only flush output and never reach the append step:
    if (word == "÷") {
      # Paragraph end: flush the line, add an empty one, forget markup state.
      output_olin();
      output_olin();
      ital = 0;
      math = 0;
      next;
    }
    if (word == "=") {
      # Hard line break: flush the line only.
      output_olin();
      next;
    }

    # Punctuation normally attaches to the preceding text; the cases
    # below override that where a blank is wanted.
    befo = "";
    if (word == "_") {
      if (math) {
        # Inside a formula the mark is kept and attached as it is.
      } else if (ital) {
        # Closes the italic group.
        word = "}";
        ital = 0;
      } else {
        # Opens an italic group.
        befo = " ";
        word = "\\emph{";
        ital = 1;
      }
    } else if (word == "(") {
      if (prev != "(") { befo = " "; }
    } else if (word == "-") {
      if (prev == "-") {
        # A dash directly after a dash: attach two more.
        word = "--";
      } else {
        befo = " ";
      }
    } else if (word == "«") {
      if (prev != "(") { befo = " "; }
    }
  } else {
    # Alpha or symbol.
    nwdin++;
    if (((bisp == "-") && (prev == "-")) || (prev == "»")) {
      # After a double dash or a closing quote a blank is always used.
      befo = " ";
    } else if ((prev == "(") || (prev == "«") || (prev == "-") || (prev == "{") || (prev == "~")) {
      # Attach directly to an opener, a single dash, or a "~".
      befo = "";
    } else {
      befo = " ";
    }
  }

  # Nothing goes in front of the first token on a line:
  if (prev == "") { befo = ""; }

  # Recode some characters of the token.  The order matters: "~" is
  # rewritten before "^" introduces new "~" characters.
  gsub(/[_]/, "/", word);
  gsub(/[~]/, "-", word);
  gsub(/\^/, ".~", word);

  # A token that needs a blank and would bring the line to 72 characters
  # or more starts a new line instead; anything else is appended.
  if ((befo != "") && (nolin + length(befo) + length(word) >= 72)) {
    output_olin();
    olin = word;
  } else {
    olin = (olin befo word);
  }
  next;
}

// {
  # Only records that no earlier rule consumed get this far, so the
  # line has an unknown format.  data_error() reports it and exits.
  data_error("bad line format");
  next;
}

END {
  # Leave with the recorded status when an error routine asked for it.
  if (abort >= 0) { exit abort; }

  # A title-like field still open at end of input is taken to be the
  # final page and gets its closing brace; otherwise warn.
  if (inct != 0) {
    olin = olin "}";
  } else {
    data_warning("missing final page?");
  }

  # Write out whatever is still pending:
  if (olin != "") { output_olin(); }

  # Summary counts go to the error stream:
  printf("%8d lines read\n", nlines) > "/dev/stderr";
  printf("%8d words/symbols read\n", nwdin) > "/dev/stderr";
  printf("%8d punctuation read\n", nptin) > "/dev/stderr";
}

function output_olin()
{
  # Writes the pending output line {olin} to standard output (an empty
  # {olin} yields an empty line) and then clears it.
  print olin;
  olin = "";
}

function arg_error(msg)
{
  # Reports the message {msg} and the usage text on the error stream,
  # records status 1 in {abort}, and exits.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_warning(msg)
{
  # Reports {msg} on the error stream, tagged with the current record
  # number of the input file.  Processing continues.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
}

function data_error(msg)
{
  # Reports {msg} with the current record number, echoes the offending
  # record, records status 1 in {abort}, and exits.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  printf("  %s\n", $0) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(f,n,msg)
{
  # Reports {msg} for line {n} of file {f} on the error stream, records
  # status 1 in {abort}, and exits.
  printf("file %s, line %d: %s\n", f, n, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}