#! /usr/bin/gawk -f
# Last edited on 2002-02-11 21:57:03 by stolfi

BEGIN {
  # "abort" stays negative until one of the error routines picks an
  # exit status.
  abort = -1;

  # Usage text, assembled one option line at a time.
  usage = "words-from-evt \\\n";
  usage = usage "  [ -v showLines=BOOL ] [ -v showParags=BOOL ] \\\n";
  usage = usage "  [ -v omitInitial=BOOL ] [ -v omitFinal=BOOL ] \\\n";
  usage = usage "  [ -v omitMedial=BOOL ] \\\n";
  usage = usage "  [ -v showLocation=BOOL ] \\\n";
  usage = usage "  < INFILE > OUTFILE";

  # Options (all default to false when not given with "-v"):
  #
  #   showLines     every break is rendered as one blank line and every
  #                 paragraph break as two blank lines.
  #   showParags    without "showLines": only paragraph breaks are
  #                 rendered, as a single blank line.
  #   omitInitial   drop each word that directly follows a break.
  #   omitFinal     drop each word that directly precedes a break.
  #   omitMedial    drop each word that is not adjacent to any break.
  #   showLocation  print the location code in front of every word
  #                 (blank lines get no location code).
  #
  # With neither "showLines" nor "showParags", breaks produce no blank
  # lines at all and act like ordinary word spaces.

  if (showLines == "") showLines = 0;
  if (showParags == "") showParags = 0;
  # "showLines" implies "showParags", so later code only has to add up
  # the two flags.
  if (showLines) showParags = 1;

  if (omitInitial == "") omitInitial = 0;
  if (omitMedial == "") omitMedial = 0;
  if (omitFinal == "") omitFinal = 0;
  # The three "omit" options together would discard every word.
  if (omitInitial && omitMedial && omitFinal) arg_error("omitting everything!");

  if (showLocation == "") showLocation = 0;

  # The input is reduced to a stream of (delimiter, word) pairs which is
  # passed through "output_word".  That function turns delimiters into
  # blank lines and drops break-initial, -medial or -final words, as
  # requested by the options above.

  # Delimiters carried over from the previous input line; starting with
  # "=" makes the first word of the file count as following a break.
  leftover = "=";
  # Last word parsed but not yet printed ("" when there is none).
  curToken = "";
  # True when "curToken" directly followed a break.
  curInitial = 0;
  # Blank lines printed since the last word; starting at 2 keeps blank
  # lines from being printed ahead of the first word.
  nBlanks = 2;
  # Blank lines that must precede the next printed word.
  nBlanksNeeded = 0;
}

# Once an error routine has chosen an exit status, stop processing input.
abort >= 0 { exit abort; }

# Skip lines that carry no text: blank lines, "#" comment lines, and
# lines that begin with a "<...>" tag containing no ";" (presumably
# page/unit headers rather than text lines -- confirm against the
# input format).
/^[ ]*([#]|$)/ || /^<[^<>;]*>/ { next; }

// {
  # Work on a copy of the input record.
  lin = $0;

  # The locator is the text between the leading "<" and ">"; a line
  # without one gets a placeholder locator.
  loc = "f0r.P0.0;X";
  if (match(lin, /^<[^>]*>/)) { loc = substr(lin, RSTART + 1, RLENGTH - 2); }

  # Strip the locator together with any blanks that follow it.
  sub(/^<[^>]*> */, "", lin);

  # Delete "{...}" comments; the second pass removes the outer braces
  # of a comment that contained one nested comment.
  for (pass = 1; pass <= 2; pass++) { gsub(/{[^{}]*}/, "", lin); }

  # Delete "!" fillers, then fold "*" and "%" into "?".
  gsub(/[!]/, "", lin);
  gsub(/[?*%]/, "?", lin);

  # Nothing left: keep "leftover" untouched for the next line.
  if (lin == "") { next; }

  # Delimiters pending from the previous line come first.
  lin = leftover lin;

  # Peel off (delimiter run, word) pairs from the front of the line.
  while (match(lin, /^[-\/=., ]+[^-\/=., ]+/))
    { pairLen = RLENGTH;
      # The delimiter run is the leading part of the matched pair.
      if (! match(lin, /^[-\/=., ]+/)) { prog_error(("missing delimiter")); }
      delimLen = RLENGTH;
      output_word(substr(lin, 1, delimLen), substr(lin, delimLen + 1, pairLen - delimLen));
      lin = substr(lin, pairLen + 1);
    }

  # Only trailing delimiters may remain at this point.
  if (lin !~ /^[-\/=., ]*$/) { prog_error(("left food on plate")); }

  # The end of an input line counts as a word space, so the trailing
  # delimiters plus "." are carried over to the next line.
  leftover = lin ".";
  next;
}

function output_word(delim,w,   isBreak,isParBreak,omit,nB,curFinal)
{
  # Accepts the next (delimiter, word) pair of the token stream.
  #
  #   delim  non-empty run of delimiter characters that precedes "w".
  #   w      non-empty word that follows "delim".
  #
  # Printing lags one word behind: whether the pending word "curToken"
  # is break-final is only known once the delimiter after it has been
  # seen.  So each call prints (or drops) the pending word and then
  # makes "w" the new pending word.
  #
  # Reads the globals "loc", "showLines", "showParags", "showLocation",
  # "omitInitial", "omitMedial" and "omitFinal"; updates "curToken",
  # "curInitial", "curLoc", "nBlanks" and "nBlanksNeeded".
  if (delim == "") {prog_error(("empty delim")); }
  if (w == "") { prog_error(("empty w")); }
  # A delimiter containing "-", "/" or "=" is a break; one containing
  # "=" is also a paragraph break.  So "isParBreak" implies "isBreak".
  isBreak = (delim ~ /[-\/=]/);
  isParBreak = (delim ~ /[=]/);
  # Output "curToken", if appropriate:
  if (curToken != "")
    { curFinal = isBreak;
      omit = 0;
      omit += (omitInitial && curInitial);
      omit += (omitFinal && curFinal);
      omit += (omitMedial && (!curInitial) && (!curFinal));
      if (omit == 0) {
        # Emit the blank lines owed since the last printed word.
        while (nBlanks < nBlanksNeeded) { print ""; nBlanks++; }
        # Use the locator saved with the token: by now "loc" may already
        # describe the next input line.
        if (showLocation) { printf "%s ", curLoc; }
        print curToken; nBlanks = 0; nBlanksNeeded = 0;
      }
    }
  # Update nBlanksNeeded according to "delim" and options.  It is only
  # ever raised here, so blank lines owed before a dropped word are
  # still printed before the next word that is kept.
  nB = (isBreak && showLines) + (isParBreak && showParags);
  if (nBlanksNeeded < nB) { nBlanksNeeded = nB; }
  # Save "w" as the pending token, with the locator of its input line:
  curToken = w;
  curInitial = isBreak;
  curLoc = loc;
}

END {
  if (abort >= 0)
    { # An error routine already chose the exit status; just pass it on.
      exit abort;
    }
  else
    { # Push a dummy word after a paragraph break so that the pending
      # token is printed; the dummy "EOF" itself is never printed.
      output_word("=", "EOF");
    }
}

function arg_error(msg)
{
  # Reports an invalid option setting: writes "msg" and the usage text
  # to standard error, records the failure in "abort", and exits with
  # status 1.
  abort = 1;
  printf("%s\nusage: %s\n", msg, usage) >> "/dev/stderr";
  exit abort;
}

function data_error(msg)
{
  # Reports a problem with the input data: writes "msg", prefixed with
  # the current input line number, to standard error, records the
  # failure in "abort", and exits with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) >> "/dev/stderr";
  exit abort;
}

function prog_error(msg)
{
  # Reports a failed internal consistency check: writes "msg", prefixed
  # with the current input line number and "prog error", to standard
  # error, records the failure in "abort", and exits with status 1.
  abort = 1;
  printf("line %d: prog error - %s\n", FNR, msg) >> "/dev/stderr";
  exit abort;
}