#! /usr/bin/gawk -f
# Last edited on 2025-05-04 15:44:11 by stolfi

BEGIN {
  # Negative means "no error so far"; the error functions set it to the
  # exit status before calling {exit}.
  abort = -1;

  # Command-line synopsis printed by {arg_error}:
  usage = "tokens-from-evt \\\n";
  usage = usage "  [ -v showLines=BOOL ] [ -v showParags=BOOL ] \\\n";
  usage = usage "  [ -v omitInitial=BOOL ] [ -v omitFinal=BOOL ] \\\n";
  usage = usage "  [ -v omitMedial=BOOL ] \\\n";
  usage = usage "  [ -v showLocation=BOOL ] \\\n";
  usage = usage "  < INFILE > OUTFILE";

  # OUTPUT LAYOUT OPTIONS
  #
  # With no options, line breaks and paragraph breaks of the input are
  # handled exactly like ordinary word spaces.
  #
  # {showLines} true: each input line break becomes one blank line and
  # each input paragraph break becomes two blank lines.
  #
  # {showLines} false and {showParags} true: line breaks count as word
  # spaces, and each paragraph break becomes one empty line.
  #
  # Since {showLines} implies {showParags}, the latter is forced on
  # whenever the former is set; otherwise it defaults to false.
  if (showLines == "") { showLines = 0; }
  if (showLines)
    { showParags = 1; }
  else if (showParags == "")
    { showParags = 0; }

  # TOKEN SELECTION OPTIONS
  #
  # {omitInitial} true: drop tokens that come right after a break.
  # {omitFinal} true: drop tokens that come right before a break.
  # {omitMedial} true: drop tokens that are *not* next to any break.
  # Asking for all three at once would leave nothing to print.
  if (omitInitial == "") { omitInitial = 0; }
  if (omitFinal   == "") { omitFinal   = 0; }
  if (omitMedial  == "") { omitMedial  = 0; }
  if (omitInitial && omitMedial && omitFinal)
    { arg_error("omitting everything!"); }

  # {showLocation} true: each token is preceded by a location code
  # (blank lines never carry one).
  if (showLocation == "") { showLocation = 0; }

  # PROCESSING STATE
  #
  # The input is first reduced to a stream of tokens alternating with
  # separators (".", "-", or "="). That stream goes through the filter
  # {output_token}, which turns separators into blank lines as
  # requested and drops line-initial, -medial, or -final tokens as
  # requested.
  leftover = "=";      # Separators carried over from the previous line.
  curToken = "";       # Token parsed but not yet printed ("" if none).
  curInitial = 0;      # TRUE if {curToken} directly followed a break.
  nBlanks = 2;         # Blank lines printed since the last non-blank.
  nBlanksNeeded = 0;   # Blank lines needed before the next non-blank.
}

# If an error has already been flagged, stop reading input and exit
# with the recorded status.
abort >= 0 { exit abort }

# Ignore lines that are blank or whose first non-space character is
# "#", and lines that begin with a "<...>" locator containing no ";".
/^[ ]*([#]|$)/ || /^<[^<>;]*>/ { next }

// {
  # Splits one data line into (delimiter, token) pairs and hands each
  # pair to {output_token}. Delimiters left at the end of the line are
  # carried over to the next line in {leftover}.
  text = $0;

  # The locator is whatever sits between a leading "<" and the next ">";
  # a line without one gets a dummy locator.
  loc = (match(text, /^<[^>]*>/) ? substr(text, RSTART+1, RLENGTH-2) : "f0r.P0.0;X");

  # Drop the locator and any blanks that follow it:
  gsub(/^<[^>]*> */, "", text);

  # Strip "{...}" comments; two passes take care of one level of nesting:
  for (pass = 1; pass <= 2; pass++)
    { gsub(/{[^{}]*}/, "", text); }

  # Delete "!" fillers, then map every bad-character code to "?":
  gsub(/[!]/, "", text);
  gsub(/[?*%]/, "?", text);

  # Nothing left on this line:
  if (text == "") { next; }

  # Put the delimiters pending from the previous line in front, so that
  # every token is preceded by at least one delimiter:
  text = ( leftover text );

  while (match(text, /^[-\/=., ]+[^-\/=., ]+/))
    { # Cut off the leading run of delimiters plus the token after it:
      pair = substr(text, 1, RLENGTH);
      text = substr(text, RLENGTH + 1);
      # Measure the delimiter part of the pair:
      sepLen = (match(pair, /^[-\/=., ]+/) ? RLENGTH : 0);
      if (sepLen == 0) { prog_error(("missing delimiter")); }
      # Pass delimiter and token on:
      output_token(substr(pair, 1, sepLen), substr(pair, sepLen + 1));
    }

  # Only delimiters (or nothing) may remain at this point:
  if (! (text ~ /^[-\/=., ]*$/)) { prog_error(("left food on plate")); }

  # The end of an input line counts as a word break:
  leftover = ( text ".");
  next;
}

function output_token(delim,w,   isBreak,isParBreak,isFinal,omit,nB)
{
  # Processes another delimiter {delim} and the token {w} that follows it.
  #
  # The token is not printed right away: whether it is "final" depends
  # on the delimiter that comes after it. So {w} is saved in {curToken}
  # and the token saved by the previous call is printed now, if it is
  # not to be omitted.
  #
  # Global state used:
  #   {curToken}       token held back from the previous call ("" if none).
  #   {curLoc}         value of {loc} when {curToken} was saved.
  #   {curInitial}     true if the delimiter before {curToken} was a break.
  #   {nBlanks}        blank lines printed since the last non-blank line.
  #   {nBlanksNeeded}  blank lines required before the next printed token.
  #
  # Both arguments must be non-empty; otherwise {prog_error} is called.

  if (delim == "") { prog_error(("empty delim")); }
  if (w == "") { prog_error(("empty w")); }

  # Classify the delimiter between {curToken} and {w}. A delimiter
  # containing "-", "/" or "=" is a break; one containing "=" is also a
  # paragraph break, so {isParBreak} implies {isBreak}.
  isBreak = (delim ~ /[-\/=]/);
  isParBreak = (delim ~ /[=]/);

  # Output {curToken}, if appropriate:
  if (curToken != "")
    { isFinal = isBreak;
      omit = ( (omitInitial && curInitial) || \
               (omitFinal && isFinal) || \
               (omitMedial && (!curInitial) && (!isFinal)) );
      if (! omit)
        { # Blank lines owed for the breaks seen since the last printed
          # token go *before* this token. The initial {nBlanks = 2}
          # keeps the output from starting with blank lines.
          while (nBlanks < nBlanksNeeded) { print ""; nBlanks++; }
          # Use the locator saved with the token, not {loc}: by now
          # {loc} may already belong to the next input line.
          if (showLocation) { printf "%s ", curLoc; }
          print curToken;
          nBlanks = 0; nBlanksNeeded = 0;
        }
    }

  # Record the blank lines needed between {curToken} and {w}. This is
  # done after printing so that the separator following {curToken} is
  # not rendered in front of it. The maximum is kept so that breaks
  # around omitted tokens are not lost.
  nB = (isBreak && showLines) + (isParBreak && showParags);
  if (nBlanksNeeded < nB) { nBlanksNeeded = nB; }

  # Save {w} as the current token, together with its locator:
  curToken = w;
  curLoc = loc;
  curInitial = isBreak;
}

END{
  if (abort >= 0) { exit abort; }
  # Flush out the last token. The sentinel "EOF" is only stored, never
  # printed, and the "=" makes the end of input act as a paragraph break.
  output_token("=", "EOF");
  # Emit the blank lines owed for that final break, so the output still
  # ends with the blank line(s) that close the last paragraph.
  while (nBlanks < nBlanksNeeded) { print ""; nBlanks++; }
}

function arg_error(msg)
{
  # Reports a bad command-line option: writes {msg} and the usage
  # synopsis to standard error, records exit status 1 in {abort}, and
  # terminates.
  printf("%s\n", msg) >> "/dev/stderr";
  printf("usage: %s\n", usage) >> "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports a problem in the input data: writes {msg}, prefixed with the
  # current input line number, to standard error, records exit status 1
  # in {abort}, and terminates.
  printf("line %d: %s\n", FNR, msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg)
{
  # Reports an internal inconsistency: writes {msg}, prefixed with the
  # current input line number and "prog error", to standard error,
  # records exit status 1 in {abort}, and terminates.
  printf("line %d: prog error - %s\n", FNR, msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}