#! /usr/bin/gawk -f
# Last edited on 2000-07-10 08:14:10 by stolfi

# Usage: 
#   cat INFILE.evt \
#     | colorize-text -f eva2erg.gawk \
#       -v verbose=BOOL \
#       -v indent=INDENT \
#       -v colorTable=COLORTABLE \
#       -v textColor=TEXTCOLOR \
#       [-v defaultColor=DEFCOLOR] \
#       [EQUIVOPTIONS] \
#     > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words.  Each word is reduced by some equivalence function
# and looked up in a user-provided color dictionary COLORTABLE.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# Lines are separated on output by "\n", or "\n\n" after a "=".  This
# is OK if the output is to be inserted in a <pre>...</pre>
# environment; in other contexts, it may be necessary to insert a <br>
# at the end of each line.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits).  
#
# The script assumes that words without an explicit <font color=..>
# directive will appear in TEXTCOLOR.  If a word is not found in the
# table, it is set in DEFCOLOR (a six-digit hex string; TEXTCOLOR if
# not specified).
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.

function iso_to_html(str)
{
  # Returns a copy of the ISO Latin-1 string "str" made safe for
  # inclusion in HTML text: the characters "&", "<" and ">" are
  # replaced by the entities "&amp;", "&lt;" and "&gt;".
  #
  # The ampersand is converted first; otherwise the "&" of the
  # entities generated for "<" and ">" would be escaped a second time.
  #
  # In a gsub() replacement an unescaped "&" stands for the matched
  # text, so a literal ampersand must reach gsub() as the two
  # characters \& -- which is written "\\&" as a string constant.
  gsub(/&/, "\\&amp;", str);
  gsub(/</, "\\&lt;", str);
  gsub(/>/, "\\&gt;", str);
  return str;
}

function print_word(w, color)
{
  # Prints word "w" (HTML-escaped by iso_to_html) in the given "color".
  #
  # Reads and updates the global "current_color", which records the
  # color in effect on the output; text in the global "textColor" is
  # emitted bare, any other color is wrapped in a <font color=...>
  # element that stays open until the color changes again.
  if (color != current_color)
    { # Leave the previous color first, if it needed a <font> element.
      if (current_color != textColor) { printf "</font>"; }
      # "#" needs no backslash in a string constant; "\#" is not a
      # valid awk escape sequence.
      if (color != textColor) { printf "<font color=#%s>", color; }
      current_color = color;
    }
  printf "%s", iso_to_html(w);
}

function reduce_word(str)
{
  # Maps a text string with no embedded "{}"s to its representative
  # under the equivalences selected by the global option variables
  # (set with "-v OPTION=BOOL").  Each enabled option applies the
  # corresponding erg_* function, which this file does not define
  # (the usage header loads eva2erg.gawk alongside this script).
  # The transformations are applied in the fixed order below, and
  # the result is always passed through erg_pack.

  if (erase_ligatures)     { str = erg_erase_ligatures(str); }
  if (erase_plumes)        { str = erg_erase_plumes(str); }
  if (ignore_gallows_eyes) { str = erg_ignore_gallows_eyes(str); }
  if (join_ei)             { str = erg_join_ei(str); }
  if (equate_aoy)          { str = erg_equate_aoy(str); }
  if (collapse_ii)         { str = erg_collapse_ii(str); }
  if (equate_eights)       { str = erg_equate_eights(str); }
  if (equate_pt)           { str = erg_equate_pt(str); }
  if (erase_q)             { str = erg_erase_q(str); }

  # Word spaces are always normalized one way or the other.
  if (erase_word_spaces)
    { str = erg_erase_word_spaces(str); }
  else
    { str = erg_unify_word_spaces(str); }

  return erg_pack(str);
}

function process_word(w, dic, \
  key, shade)
{
  # Prints word "w" through print_word, in the color chosen as follows:
  #   - the separators "-" and "=" always get the global textColor;
  #   - any other word is reduced with reduce_word and looked up in
  #     the table "dic"; a hit gives the table's color, a miss gives
  #     the global defaultColor.
  # Assumes the current color is "current_color".
  if ((w != "-") && (w != "="))
    { key = reduce_word(w);
      shade = ((key in dic) ? dic[key] : defaultColor);
    }
  else
    { shade = textColor; }
  print_word(w, shade);
}

function process_line(str, dic, \
  k, n, nw, wd)
{
  # Prints line "str" with each word colorized according to the
  # given "dic" table, with a single blank between consecutive words.
  # Assumes "str" has been cleaned of comments, and
  # words are separated by spaces.
  # Assumes the current color is "current_color"
  #
  # All working variables (including the word counter "n") are
  # locals, so the function leaves no global state behind.

  # Split on runs of spaces only (not tabs or newlines).  Leading or
  # trailing spaces yield empty pieces, which are skipped below.
  nw = split(str, wd, / +/);
  n = 0;  # Number of words printed so far on this line.
  for (k = 1; k <= nw; k++)
    { if (wd[k] != "")
        { if (n > 0) { printf " "; }
          process_word(wd[k], dic);
          n++;
        }
    }
}

BEGIN {
  # Validates the command-line variables, optionally reports the
  # enabled equivalence options on stderr, and loads the color table
  # file "colorTable" into the global array "dic" (key -> color).
  abort = 0;
  if (textColor == "") { error("must specify \"-v textColor=...\""); }
  if (defaultColor == "") { defaultColor = textColor; }
  # The output starts out in the plain text color (no <font> open).
  current_color = textColor;
  if (verbose) 
    { 
      printf "options:\n" > "/dev/stderr";
      if (erase_ligatures)     printf "  erase_ligatures\n"     > "/dev/stderr";
      if (erase_plumes)        printf "  erase_plumes\n"        > "/dev/stderr";
      if (ignore_gallows_eyes) printf "  ignore_gallows_eyes\n" > "/dev/stderr";
      if (join_ei)             printf "  join_ei\n"             > "/dev/stderr";
      if (equate_aoy)          printf "  equate_aoy\n"          > "/dev/stderr";
      if (collapse_ii)         printf "  collapse_ii\n"         > "/dev/stderr";
      if (equate_eights)       printf "  equate_eights\n"       > "/dev/stderr";
      if (equate_pt)           printf "  equate_pt\n"           > "/dev/stderr";
      if (erase_q)             printf "  erase_q\n"             > "/dev/stderr";
      if (erase_word_spaces)   printf "  erase_word_spaces\n"   > "/dev/stderr";
    }
  # error() terminates the message line itself, so no "\n" here.
  if (colorTable == "") 
    { error("must specify \"-v colorTable=FILE\""); }
  split("", dic);  # Make sure "dic" is an (empty) array.
  # Read color table:
  nMap=0;
  while((status = (getline lin < colorTable)) > 0) { 
    # Lines whose first non-blank character is "#" are comments.
    if (! match(lin, /^ *[#]/)) { 
      nfld = split(lin, fld);
      if (nfld != 2) 
        { error("bad colorTable entry = \"" lin "\""); }
      if (fld[1] in dic) 
        { error("repeated key = \"" lin "\""); }
      dic[fld[1]] = fld[2];
      nMap++;
    }
  }
  # getline returns -1 when the file cannot be opened or read; without
  # this check a missing table would silently color nothing.
  if (status < 0)
    { error("cannot read colorTable \"" colorTable "\": " ERRNO); }
  close (colorTable);
  if (verbose)
    { printf "loaded %6d color table entries\n", nMap > "/dev/stderr"; }
}

/^#/ { 
  # Comment line (starts with "#"): copied to the output in the plain
  # text color, followed by a newline.
  if (abort) exit;
  # print_word() applies iso_to_html() itself, so the raw line is
  # passed here; escaping it beforehand as well would turn "<" into
  # "&amp;lt;" in the output.
  print_word($0, textColor);
  printf "\n";
  next;
}

/./ {
  # Non-empty, non-comment line: prints the indentation, the location
  # code (if any) in the plain text color, and then the text with each
  # word colorized.  A line whose text ends with "=" is followed by an
  # extra blank line.
  if (abort) exit;

  # Start from an empty text, so that a line consisting only of a
  # location code does not inherit the previous line's text in the
  # "=" test at the end.
  txt = "";

  # Extracts the location code:
  
  if (match($0, /^<f[0-9][0-9]*[vr][0-9]*\.[^ >]*>/)) 
    { # Pad the locator to 19 columns.
      loc = sprintf("%-19s", substr($0,1,RLENGTH));
      skip = RLENGTH;
    }
  else if (substr($0,1,1) == "<") 
    { error("bad location code");
    }
  else 
    { # Pure text format: no location code.
      loc = (""); 
      skip = 0;
    }
  printf "%*s", indent, "";
  print_word(loc, textColor);
  if (skip < length($0)) 
    { # erg_erase_comments is not defined in this file (the usage
      # header loads eva2erg.gawk alongside this script).
      txt = erg_erase_comments(substr($0,1+skip));
      # Erase EVA fillers:
      gsub(/[!%]/, "", txt);
      # Replace ".," by spaces
      gsub(/[.,]/, " ", txt);
      # Insert spaces around "-" and "="
      gsub(/[-]/, " - ", txt);
      gsub(/[=]/, " = ", txt);
      # Remove spurious spaces
      gsub(/^  */, "", txt);
      gsub(/  *$/, "", txt);
      gsub(/   */, " ", txt);
      # Now process word by word:
      process_line(txt, dic);
    }
  printf "\n";
  # A trailing "=" gets an extra blank line after it.
  if (substr(txt,length(txt),1) == "=") printf "\n"
  next;
}

END {
  # If the last word printed was in a color other than the plain text
  # color, a <font> element is still open: close it.
  if (! (current_color == textColor)) { printf "%s", "</font>"; }
}

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current record number
  # NR, raises the global "abort" flag, and terminates the script
  # with exit status 1.
  abort = 1;
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  exit 1;
}