#! /bin/gawk -f
# Last edited on 2002-02-12 00:31:41 by stolfi

# Extracts the text words from an EVT file. In other words, removes
# the location codes, #-comments, chapter headers, and replaces BLANK
# and all other punctuation by blanks.

BEGIN {
  # "abort" stays negative until an error handler stores an exit status in it.
  abort = -1;

  # Usage text printed by arg_error().
  usage = "evt-to-wds \\\n";
  usage = usage "  [ -v showBreaks=BOOL ] \\\n";
  usage = usage "  [ -v showParags=BOOL ] \\\n";
  usage = usage "  [ -v showPuncts=BOOL ] \\\n";
  usage = usage "  [ -v showLocation=BOOL ] \\\n";
  usage = usage "  [ -v smashSymbols=BOOL ] \\\n";
  usage = usage "  < INFILE > OUTFILE";

  # INPUT FORMAT
  #
  # The input is in EVT format, generalized as follows.  At the
  # beginning of the file (before the first non-comment line) there
  # may be one or more comments containing charset declarations of
  # the form NAME = "..." where NAME is one of the following:
  #
  #    NULL    characters to be deleted.
  #    BLANK   characters used for interword spaces.
  #    ALPHA   characters that may occur in words.
  #    SYMBOL  non-alpha symbols like $ or digits in English.
  #    PUNCT   punctuation symbols.
  #    BREAK   character(s) indicating a significant gap or line break.
  #    PARAG   character(s) indicating a paragraph break.
  #
  # These sets should be disjoint and should not include any of the
  # characters " <>#{}" which have special meaning in the EVT format.
  # Multiple declarations of the same NAME are concatenated.
  #
  # NULL, PUNCT and SYMBOL may be declared empty; BLANK, ALPHA, BREAK
  # and PARAG may not.  Any set that is not declared at all gets the
  # Voynichese EVA value, namely
  #
  #    NULL = "!"
  #    BLANK = ".,"
  #    ALPHA = "a...zA...Z"
  #    SYMBOL = "?%*"
  #    PUNCT = ""
  #    BREAK = "-"
  #    PARAG = "="
  #
  # Each non-comment line should contain a line locator, zero or more
  # spaces (SP), and a text string containing no SPs.  The original
  # spaces should have been replaced by BLANK characters.  A paragraph
  # break should be indicated by a PARAG character, either at the end
  # of a line or as a line by itself.  The line locator should match
  # the pattern /^<[a-zA-Z0-9.;]*>$/
  #
  # PROCESSING
  #
  # Each input line is first stripped of its locator, its {}-comments
  # and its NULL characters (any ASCII blanks left in the text simply
  # act as word separators).  Then:
  #
  #   * Each BLANK character is replaced by a space (ASCII SP).
  #
  #   * One space is inserted after each line.
  #
  #   * The lines are concatenated together.
  #
  #   * One PARAG character is placed before the first data line.
  #     NOTE(review): the original description said "inserted at the
  #     end of the last line"; confirm which is intended.
  #
  #   * If "showPuncts" is true, a space is inserted on either side of
  #     each PUNCT character, else every PUNCT character is replaced
  #     by a space.
  #
  #   * If "showParags" is true, any maximal string of space, BREAK, or
  #     PARAG characters containing at least one PARAG is replaced by a
  #     BREAK-PARAG-space sequence; otherwise any such string is
  #     replaced by a single BREAK.
  #
  #   * If "showBreaks" is true, any maximal string of space or BREAK
  #     characters containing at least one BREAK is replaced by a
  #     space-BREAK-space sequence; otherwise any such string is
  #     replaced by a space.
  #
  #   * If "smashSymbols" is true, every SYMBOL character is replaced
  #     by "?".
  #
  #   * Any space, BREAK, or PARAG characters at the beginning of the
  #     input stream are ignored.
  #
  #   * Every maximal sequence of one or more spaces is replaced by a
  #     single space.
  #
  # Finally, each maximal nonempty string delimited by spaces is
  # considered to be one token and is written out to a separate line.
  #
  # OUTPUT FORMAT
  #
  # If "showLocation" is false, each output line contains only the
  # token.  If "showLocation" is true, each output line has the format
  # LOC TYPE TOKEN where LOC is the EVT-style line locator where the
  # TOKEN occurred, and TYPE is
  #
  #     3 if the TOKEN contained any SYMBOL characters, otherwise
  #     2 if it contained any ALPHA characters, otherwise
  #     1 if it contained any PUNCT, BREAK, or PARAG characters, otherwise
  #     0.
  #
  # Line-final delimiters (BREAKs and PARAGs) will be reported as
  # belonging to the next line, if there is one.

  # Options that were not given on the command line are turned off:
  if (showBreaks == "")   { showBreaks = 0; }
  if (showParags == "")   { showParags = 0; }
  if (showPuncts == "")   { showPuncts = 0; }
  if (showLocation == "") { showLocation = 0; }
  if (smashSymbols == "") { smashSymbols = 0; }

  # Charsets start out empty; each "default_X" flag remains set until
  # the input file supplies a declaration for charset X.
  NULL = BLANK = ALPHA = SYMBOL = PUNCT = BREAK = PARAG = "";
  default_NULL = default_BLANK = default_ALPHA = default_SYMBOL = 1;
  default_PUNCT = default_BREAK = default_PARAG = 1;

  # Counters:
  ndata = 0;   # input data lines seen so far (comments excluded)
  nwdout = 0;  # words written so far
}

# Once an error handler has stored an exit status in "abort",
# terminate with that status instead of processing the record.
abort >= 0 { exit abort; }

# Process charset definitions, if any:

# A charset declaration is only legal before the first data line,
# because the patterns are built from the charsets when that line is seen.
/^[#] *[A-Z]+ *= *".*" *$/ {
  if (ndata > 0) { data_error("late charset definition"); }
}

# One rule per charset NAME.  Each appends the declared characters
# (escaped by get_val for use inside a bracket expression) to the
# charset and clears the corresponding "default_NAME" flag.  All seven
# rules accept the same value syntax as the guard rule above, so that
# a declaration is never silently treated as an ordinary comment for
# some names but not for others.

/^[#] *NULL *= *".*" *$/ {
  NULL = (NULL get_val($0));
  default_NULL = 0;
}

/^[#] *BLANK *= *".*" *$/ {
  BLANK = (BLANK get_val($0));
  default_BLANK = 0;
}

/^[#] *ALPHA *= *".*" *$/ {
  ALPHA = (ALPHA get_val($0));
  default_ALPHA = 0;
}

/^[#] *SYMBOL *= *".*" *$/ {
  SYMBOL = (SYMBOL get_val($0));
  default_SYMBOL = 0;
}

/^[#] *PUNCT *= *".*" *$/ {
  PUNCT = (PUNCT get_val($0));
  default_PUNCT = 0;
}

/^[#] *BREAK *= *".*" *$/ {
  BREAK = (BREAK get_val($0));
  default_BREAK = 0;
}

/^[#] *PARAG *= *".*" *$/ {
  PARAG = (PARAG get_val($0));
  default_PARAG = 0;
}

function get_val(def)
{
  # Extracts the quoted value from a charset declaration line of the
  # form `# NAME = "VALUE"` and returns it with the characters that
  # are special inside a bracket expression escaped.

  # Drop everything up to and including the opening quote ...
  sub(/^[#][ ]*[A-Z]+ *= *"/, "", def);
  # ... and the closing quote with any trailing spaces.
  sub(/" *$/, "", def);
  return quote_special(def);
}

function quote_special(chars)
{
  # Returns "chars" with a backslash placed in front of every
  # character that is special inside a regex bracket expression:
  # backslash, "-", "]" and "^".  The result can be pasted between
  # "[" and "]" to build a dynamic regexp.
  #
  # All four characters are handled in a single pass.  The replacement
  # "\\\\&" reaches gsub as `\\&`, i.e. a literal backslash followed by
  # the matched character.  (A replacement of "\\\\" alone yields just
  # ONE backslash under the POSIX sub/gsub rules, which would leave
  # backslashes unescaped.)
  gsub(/[\\\]\^\-]/, "\\\\&", chars);
  return chars;
}

# Nothing more to do for #-comment lines (charset declarations were
# already handled by the rules above) nor for lines that are empty or
# contain only spaces:

/^[#]/ || /^ *$/ { next; }

# Process contents lines (possibly empty):

/^[<]/ {
  # The patterns are built when the first data line shows up, i.e.
  # after all charset declarations have been read.  The pending
  # delimiter string starts out as one PARAG character.
  if (ndata == 0) {
    make_patterns();
    leftover = parag_char;
  }
  ndata++;

  # Separate the locator (kept without its angle brackets) from the text:
  if (match($0, /^<[a-zA-Z0-9.;]*>/) == 0) { data_error("bad locator"); }
  loc = substr($0, 2, RLENGTH-2);
  lin = substr($0, RLENGTH+1);

  # Delete {}-comments; the second pass removes what was the outer
  # level of a "{..{..}..}" nesting.
  gsub(/[{][^{}]*[}]/, "", lin);
  gsub(/[{][^{}]*[}]/, "", lin);

  # Delete NULL characters, if that charset is non-empty:
  if (NULL != "") { gsub(null_pat, "", lin); }

  # Normalize delimiters, then write the resulting words
  # (trailing delimiters stay pending in "leftover"):
  process_line();
  output_words(loc, lin);
  next;
}

END {
  # Flushes the delimiters still pending in "leftover" after the last
  # data line.

  # END also runs after an error handler has called "exit".  In that
  # case just propagate the recorded status; flushing would emit
  # output after the error message and might raise a second error.
  if (abort >= 0) { exit abort; }

  # With no data lines, make_patterns() was never called, so every
  # pattern is still unset and there is nothing pending to flush.
  if (ndata > 0) {
    # Process an empty final line; process_line() itself appends the
    # line-terminating space, and moves all trailing delimiters into
    # "leftover", so "lin" must come back empty.
    lin = "";
    process_line();
    if (lin != "") { prog_error(("invalid leftover \"" lin "\"")); }
    output_words(loc, leftover);
  }
  # printf "%7d data lines read\n", ndata > "/dev/stderr";
  # printf "%7d words written\n", nwdout > "/dev/stderr";
}

function output_words(loc,lin,   wd,nwd,j,wjOrg,wjSmash,wtype)
{
  # Splits "lin" at blanks and writes each resulting token on a line
  # of its own, incrementing the global word counter "nwdout".
  #
  #   loc   locator to report for every token (used only when
  #         "showLocation" is set).
  #   lin   text already normalized by process_line().
  #
  # With "showLocation" set, the line is "LOC TYPE TOKEN", where TYPE
  # is 3 for tokens containing a SYMBOL character, else 2 for ALPHA,
  # else 1 for PUNCT/BREAK/PARAG, else 0 (which is also reported on
  # stderr).  The type is computed on the token BEFORE symbol smashing.
  # A token with a character outside the valid output set aborts the
  # run through data_error().

  nwd = split(lin, wd);
  for (j = 1; j <= nwd; j++)
    { nwdout++;
      wjOrg = wd[j];
      # Map symbol chars to "?" if so requested
      wjSmash = wjOrg;
      if ((SYMBOL != "") && (smashSymbols)) { gsub(symbol_pat, "?", wjSmash); }
      if (match(wjSmash, invalid_out_char_pat))
        { data_error(("invalid output character = \"" substr(wjSmash,RSTART,RLENGTH) "\"")); }
      # Output word in desired format
      if (showLocation) 
        { # Determine word type (a pattern is only consulted when its
          # charset is non-empty, because it is unset otherwise):
          if ((SYMBOL != "") && (wjOrg ~ symbol_pat)) { wtype = 3; }
          else if ((ALPHA != "") && (wjOrg ~ alpha_pat)) { wtype = 2; }
          else if ((PUNCT != "") && (wjOrg ~ punct_pat)) { wtype = 1; }
          else if ((BREAK != "") && (wjOrg ~ break_pat)) { wtype = 1; }
          else if ((PARAG != "") && (wjOrg ~ parag_pat)) { wtype = 1; }
          else { 
            wtype = 0;
            # Plain ASCII quotes, like the other diagnostics in this
            # file (the former delimiters were mis-encoded characters).
            printf "! wtype = 0 loc = %s word = \"%s\"\n", loc, wjOrg > "/dev/stderr";
          }
          printf "%s %d %s\n", loc, wtype, wjSmash; }
      else
        { print wjSmash; }
    }
}

function process_line()
{
  # Operates on the globals "leftover" (delimiters pending from the
  # previous call) and "lin" (current text, already free of NULL
  # characters and comments).  Applies the delimiter normalizations,
  # then moves any trailing spaces, BREAKs and PARAGs of the result
  # into "leftover", so that "lin" ends with a word.

  # Word separators become plain spaces:
  gsub(blank_pat, " ", lin);

  # Pending delimiters go in front, the end-of-line space behind:
  lin = ( leftover lin " " );
  leftover = "";

  # Punctuation: either isolate it as a token or blank it out.
  if (PUNCT != "") {
    if (showPuncts) {
      lin = gensub(punct_pat, " \\0 ", "g", lin);
    } else {
      gsub(punct_pat, " ", lin);
    }
  }

  # Paragraph delimiters first, since they may turn into BREAKs ...
  gsub(parag_pat, (showParags ? break_parag_space_chars : break_char), lin);

  # ... then break delimiters:
  gsub(break_pat, (showBreaks ? space_break_space_chars : " "), lin);

  # Delimiters ahead of the very first word are dropped:
  if (nwdout == 0) { gsub(leading_delim_pat, "", lin); }

  # Squeeze runs of spaces:
  gsub(/[ ][ ]+/, " ", lin);

  # Save trailing delimiters in "leftover":
  if (match(lin, trailing_delim_pat)) {
    leftover = substr(lin,RSTART,RLENGTH);
    lin = substr(lin,1,RSTART-1);
  }

  # Validity checks on what is left (nothing to check if empty):
  if (lin == "") { return; }
  # if ((nwdout == 0) == (match(lin, /^[ ]/) != 0))
  #   { data_error(("inconsistent leading space \"" lin "\"")); }
  if (match(lin, /[ ]$/)) {
    data_error(("invalid trailing space \"" lin "\""));
  }
  if (match(lin, /[ ][ ]/)) {
    data_error(("invalid double space \"" lin "\""));
  }
}
  
function make_patterns()
{ # Fills in the charsets that were not declared in the input and
  # builds, as global strings, the dynamic regexps and replacement
  # strings used by the rest of the program: null_pat, blank_pat,
  # symbol_pat, alpha_pat, punct_pat, parag_pat, break_pat,
  # leading_delim_pat, trailing_delim_pat, invalid_out_char_pat,
  # parag_char, break_char, break_parag_space_chars and
  # space_break_space_chars.  Calls arg_error() if BLANK, ALPHA,
  # PARAG or BREAK is empty.  The patterns for NULL, SYMBOL and
  # PUNCT are left unset when the corresponding charset is empty.

  # Defaults for the charsets that had no declaration:
  if (default_NULL)   { NULL = "!"; }
  if (default_BLANK)  { BLANK = ".,"; }
  if (default_ALPHA)  { ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; }
  if (default_SYMBOL) { SYMBOL = "?*%"; }
  if (default_PUNCT)  { PUNCT = ""; }
  if (default_BREAK)  { BREAK = "-"; }
  if (default_PARAG)  { PARAG = "="; }
  
  # Each single-charset pattern is a bracket expression matching one
  # character of the set.
  if (NULL != "") 
    { null_pat = ( "[" NULL "]" );
      # printf "null_pat = /%s/\n", null_pat > "/dev/stderr";
    }
  
  if (BLANK == "") 
    { arg_error("BLANK charset cannot be empty"); }
  blank_pat = ( "[" BLANK "]" );
  # printf "blank_pat = /%s/\n", blank_pat > "/dev/stderr";
  
  if (SYMBOL != "") 
    { symbol_pat = ( "[" SYMBOL "]" ); 
      # printf "symbol_pat = /%s/\n", symbol_pat > "/dev/stderr";
    }

  if (ALPHA == "") { arg_error("ALPHA must be non-empty"); }
  alpha_pat = ( "[" ALPHA "]" ); 
  # printf "alpha_pat = /%s/\n", alpha_pat > "/dev/stderr";

  if (PUNCT != "") 
    { punct_pat = ( "[" PUNCT "]" );
      # printf "punct_pat = /%s/\n", punct_pat > "/dev/stderr";
    }

  # parag_pat matches a run of spaces, BREAKs and PARAGs that
  # contains at least one PARAG.
  if (PARAG == "") { arg_error("PARAG must be non-empty"); }
  parag_pat = ( "[ " BREAK "]*[" PARAG "][ " BREAK PARAG "]*" );
  # printf "parag_pat = /%s/\n", parag_pat > "/dev/stderr";

  # The last character of PARAG is the one written to the output.
  parag_char = substr(PARAG,length(PARAG),1);

  # break_pat matches a run of spaces and BREAKs that contains at
  # least one BREAK.
  if (BREAK == "") { arg_error("BREAK must be non-empty"); }
  break_pat = ( "[ ]*[" BREAK "][ " BREAK "]*" );
  # printf "break_pat = /%s/\n", break_pat > "/dev/stderr";

  # The last character of BREAK is the one written to the output.
  # The two strings below are the replacements used by process_line().
  break_char = substr(BREAK,length(BREAK),1);
  break_parag_space_chars = ( break_char parag_char " " );
  space_break_space_chars = ( " " break_char " " );
  
  # Runs of delimiters anchored at the start and at the end of the text:
  leading_delim_pat = ( "^[ " BREAK PARAG "]+" );
  trailing_delim_pat = ( "[ " BREAK PARAG "]+$" );
  
  # Matches any character that must NOT appear in an output token:
  # everything except space, ALPHA, the symbols (or just "?" when
  # they are smashed), and whichever of PUNCT, parag_char and
  # break_char the show* options let through.
  invalid_out_char_pat = ( "[^ " \
    ALPHA \
    (smashSymbols ? "?" : SYMBOL) \
    (showPuncts ? PUNCT : "") \
    (showParags ? quote_special(parag_char) : "") \
    (showBreaks ? quote_special(break_char) : "") \
    "]" \
  );
  # printf "invalid_out_char_pat = /%s/\n", invalid_out_char_pat > "/dev/stderr";
}

# Every recognized kind of line ends in a "next" above, so a record
# that gets this far is neither a comment, a blank line, nor text.
{
  data_error("neither text nor comment");
}

function arg_error(msg)
{
  # Reports a bad argument or setting: writes "msg" and the usage
  # text to stderr, records exit status 1 in "abort" and exits.
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a problem in the input: writes "msg", prefixed by the
  # current record number (FNR), to stderr, records exit status 1 in
  # "abort" and exits.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg)
{
  # Reports an internal inconsistency: writes "msg" with a
  # "PROG ERROR: " prefix to stderr, records exit status 1 in "abort"
  # and exits.
  printf("PROG ERROR: %s\n", msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}