#! /usr/bin/gawk -f
# Usage:
# cat INFILE.evt \
# | colorize-text -f word-equiv.gawk \
# -v colorTable=COLORTABLE \
# [-v missing=MISSCOLOR] \
# [-v default=DEFCOLOR] \
# [EQUIVOPTIONS] \
# > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words. Each word is mapped by weq_reduce and looked up in
# a user-provided color dictionary COLORTABLE.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# Lines are separated on output by "\n", or "\n\n" after a "=". This
# is OK if the output is to be inserted in a <pre>...</pre>
# environment; in other contexts, it may be necessary to insert a
# <br> at the end of each line.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits).
#
# If a word is not found in the table, it is set in MISSCOLOR (a
# six-digit hex string, 000000 if not specified). The script assumes
# that words without explicit color will be shown in DEFCOLOR (a
# six-digit hex string, 000000 if not specified).
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of word-equiv.gawk, and BOOL is 0 or 1.
function iso_to_html(str)
{
    # Converts an ISO Latin-1 string to HTML by replacing the three
    # markup characters "&", "<" and ">" with character entities.
    # Returns the escaped copy; "str" is a scalar parameter, so the
    # caller's value is not modified.
    #
    # "&" must be handled first, otherwise the ampersands introduced
    # by the other two substitutions would themselves be escaped.
    # In a gsub replacement an unescaped "&" stands for the matched
    # text, so a literal ampersand is written "\\&".
    gsub(/&/, "\\&amp;", str);
    gsub(/</, "\\&lt;", str);
    gsub(/>/, "\\&gt;", str);
    return str;
}
function print_word(w, color)
{
    # Prints word "w", HTML-escaped, in the given color.
    #   w     - raw (unescaped) text to print.
    #   color - six-digit hex color, without the leading "#".
    # The global "current_color" holds the color in effect at the
    # current output position and is updated here. Text in the
    # "default" color is written with no markup; any other color is
    # wrapped in a <font> element that stays open until the color
    # changes again.
    if (color != current_color)
      { # Close the element opened for the previous color, if any:
        if (current_color != default) { printf "</font>"; }
        # Open a new element unless we are back to the default color:
        if (color != default) { printf "<font color=\"#%s\">", color; }
        current_color = color;
      }
    printf "%s", iso_to_html(w);
}
function process_word(w, dic,    key)
{
    # Prints word "w" in the color that table "dic" assigns to it.
    #   w   - one word, or one of the separators "-" and "=".
    #   dic - array mapping reduced words to colors.
    # Assumes the current color is "current_color".

    # The separators are never looked up; they get the default color.
    if ((w == "-") || (w == "="))
      { print_word(w, default);
        return;
      }

    # Look the word up under its weq_reduce form; words absent from
    # the table are shown in the "missing" color.
    key = weq_reduce(w);
    print_word(w, ((key in dic) ? dic[key] : missing));
}
function process_line(str, dic,    n, nparts, parts, p)
{
    # Prints line "str" with each word colorized according to the
    # given "dic" table.
    #   str - text already cleaned of comments, words separated by
    #         one or more spaces (leading/trailing spaces allowed).
    #   dic - array mapping reduced words to colors.
    # Words are written separated by exactly one space.
    # Assumes the current color is "current_color".
    #
    # "n" (number of words printed so far) is a local here, so the
    # function no longer overwrites a global of the same name.
    n = 0;
    # Split on runs of spaces only (not tabs), as a regex: the string
    # " " would instead select awk's default whitespace splitting.
    nparts = split(str, parts, /[ ]+/);
    for (p = 1; p <= nparts; p++)
      { # Leading or trailing spaces yield empty pieces; skip them.
        if (parts[p] == "") continue;
        if (n > 0) printf " ";
        process_word(parts[p], dic);
        n++;
      }
}
BEGIN {
    # Sets option defaults and loads the color table into "dic".
    abort = 0;
    # NOTE(review): "default" is a reserved word in gawk versions that
    # support the switch statement (on by default since 4.0) - confirm
    # that the gawk this script targets accepts it as a variable name.
    if (default == "") { default = "000000"; }
    if (missing == "") { missing = "000000"; }
    split("", dic);   # makes "dic" an empty array
    nMap = 0;
    if (colorTable == "")
      { error("must specify \"-v colorTable=FILE\"\n"); }
    else
      { # Read color table, one "PATTERN COLOR" entry per line:
        while ((colorTableStatus = (getline lin < colorTable)) > 0)
          { split(lin, fld);
            # Exactly two fields are required:
            if ((3 in fld) || ! (2 in fld))
              { error("bad colorTable entry = \"" lin "\""); }
            if (fld[1] in dic)
              { error("repeated key = \"" lin "\""); }
            dic[fld[1]] = fld[2];
            nMap++;
          }
        # getline returns -1 when the file cannot be opened or read;
        # report that instead of silently using an empty table.
        if (colorTableStatus < 0)
          { error("cannot read colorTable \"" colorTable "\": " ERRNO); }
        close(colorTable);
      }
    printf "loaded %6d color table entries\n", nMap > "/dev/stderr";
    current_color = default;
}
/^#/ {
    # Comment line of the input: copied to the output in the default
    # color, without word-by-word processing.
    if (abort) exit;
    # Pass the raw line: print_word does the HTML escaping itself, so
    # escaping here as well would escape the line twice.
    print_word($0, default);
    printf "\n";
    next;
}
/./ {
    # Non-empty, non-comment line: prints the location code (if any)
    # in the default color, then the text colorized word by word.
    if (abort) exit;
    # Start from clean values so nothing is carried over from an
    # earlier line:
    loc = "";
    skip = 0;
    txt = "";
    # Extracts the location code, a leading "<...>" group, padded to
    # 19 columns:
    if (match($0, /^<[^<>]*>/))
      { loc = sprintf("%-19s", substr($0,1,RLENGTH));
        skip = RLENGTH;
      }
    else if (substr($0,1,1) == "<")
      { error("bad location code");
      }
    print_word(loc, default);
    if (skip < length($0))
      { txt = weq_erase_comments(substr($0,1+skip));
        # Erase EVA fillers:
        gsub(/[!%]/, "", txt);
        # Replace ".," by spaces
        gsub(/[.,]/, " ", txt);
        # Insert spaces around "-" and "="
        gsub(/[-]/, " - ", txt);
        gsub(/[=]/, " = ", txt);
        # Remove spurious spaces: trim both ends, then collapse each
        # run of spaces to one. The pattern must require at least one
        # space; one that can match the empty string would insert a
        # space between every pair of characters.
        gsub(/^ */, "", txt);
        gsub(/ *$/, "", txt);
        gsub(/ +/, " ", txt);
        # Now process word by word:
        process_line(txt, dic);
      }
    printf "\n";
    # A line ending in "=" is followed by a blank line:
    if (substr(txt,length(txt),1) == "=") printf "\n";
    next;
}
END {
    # If the last word printed was in a non-default color, its
    # <font> element is still open; close it.
    if (current_color != default)
      { printf "</font>"; }
}