#! /usr/bin/gawk -f
# Last edited on 2025-04-29 20:51:20 by stolfi
# Usage:
# cat INFILE.evt \
# | colorize-text -f eva2erg.gawk \
# -v verbose=BOOL \
# -v indent=INDENT \
# -v headers=HEADERS \
# -v colorTable=COLORTABLE \
# -v commentColor=COMMCOLOR \
# [-v defaultColor=DEFCOLOR] \
# -v comments=COMMENTS \
# [EQUIVOPTIONS] \
# > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words. Each word is looked up in a user-provided color
# dictionary COLORTABLE. If the word is not found, it is reduced by
# some equivalence function and looked up again.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# An extra blank line is printed after each paragraph terminator "=".
# The spacing is done with "\n" not "<br>", assuming
# the output will be inserted in a <pre>...</pre> environment.
#
# If HEADERS is true, provides also the HTML headers and
# the <pre> directive.
#
# Lines beginning with "#" are assumed to be comments.
# If COMMENTS is 1 they are printed with COMMCOLOR.
# Otherwise they are treated as blank lines.
#
# In any case, blank lines or blank comments are suppressed when they
# occur between non-comment lines.
#
# In other contexts, multiple consecutive blank lines (or blank
# comments) are collapsed to a single blank line or blank comment,
# depending on the context.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits). If COLOR is omitted, DEFCOLOR (default "ff0000") is assumed.
#
# Words that are not found in the table are left uncolored.
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.
BEGIN {
    # Initialization: fills in option defaults, optionally reports the
    # active equivalence options, resets the line-state flags, loads the
    # color table into "dic" (and "rdic" when showSimilar is on), and
    # optionally prints the HTML preamble.
    abort = 0;

    # "noColor" is the sentinel color meaning "emit no color markup".
    noColor = "------";
    if (commentColor == "") { commentColor = noColor; }
    if (defaultColor == "") { defaultColor = "ff0000"; }
    if (version == "") { version = "*"; }
    if (showSimilar == "") { showSimilar = 0; }
    current_color = noColor;

    if (verbose) {
        printf "options:\n" > "/dev/stderr";
        if (erase_ligatures) printf " erase_ligatures\n" > "/dev/stderr";
        if (erase_plumes) printf " erase_plumes\n" > "/dev/stderr";
        if (ignore_gallows_eyes) printf " ignore_gallows_eyes\n" > "/dev/stderr";
        if (join_ei) printf " join_ei\n" > "/dev/stderr";
        if (equate_aoy) printf " equate_aoy\n" > "/dev/stderr";
        if (collapse_ii) printf " collapse_ii\n" > "/dev/stderr";
        if (equate_eights) printf " equate_eights\n" > "/dev/stderr";
        if (equate_pt) printf " equate_pt\n" > "/dev/stderr";
        if (erase_q) printf " erase_q\n" > "/dev/stderr";
        if (erase_word_spaces) printf " erase_word_spaces\n" > "/dev/stderr";
    }

    # "lastpar" tells whether previous non-blank line was a paragraph end.
    lastpar = 0;
    # "lastblank" tells whether previous line was blank (comment or not).
    lastblank = 0;
    # "lastcomm" tells whether the previous line (ignoring blanks) was a comment.
    lastcomm = 0;

    # Read color table:
    if (colorTable == "")
        { error("must specify \"-v colorTable=FILE\"\n"); }
    split("", dic);
    split("", rdic);
    nMap = 0;
    # Keep the getline status: it is > 0 for a record, 0 at end of file
    # and -1 on a read/open error, which is the reliable way to tell
    # success from failure after the loop.
    while ((tbl_status = (getline lin < colorTable)) > 0) {
        # Lines starting with "#" are table comments.
        if (lin ~ /^[#]/) { continue; }
        nfld = split(lin, fld);
        if (nfld == 0) { continue; }
        if (nfld > 2) { tbl_error("bad entry = \"" lin "\""); }
        w = fld[1];
        if (nfld == 2) {
            c = fld[2];
            # Colors are stored without the leading "#".
            gsub(/^#/, "", c);
            if (c !~ /^[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]$/)
                { tbl_error("bad color = \"" c "\""); }
        }
        else {
            c = defaultColor;
        }
        if (w in dic) { tbl_error("repeated key = \"" lin "\""); }
        dic[w] = c;
        if (showSimilar)
            { rdic[reduce_word(w)] = reduce_color(c); }
        nMap++;
    }
    if (tbl_status < 0) { error((colorTable ": " ERRNO)); }
    close (colorTable);

    if (verbose)
        { printf "loaded %6d color table entries\n", nMap > "/dev/stderr"; }
    if (headers) { print_html_head("Colored text"); }
}
# Lines starting with "##" are dropped without touching any state flag.
/^##/ { next; }

# A line that is empty or all spaces, or a lone "#" followed only by
# spaces, prints nothing; it just records that a blank line was seen.
/^#?[ ]*$/ {
    lastblank = 1;
    next;
}
# Non-blank comment line. With "comments" on it is printed in
# commentColor, preceded by a separator if a blank line or paragraph end
# came before it; with "comments" off it counts as a blank line.
/^#/ {
    if (abort) { exit; }
    txt = $0;

    if (! comments) {
        lastcomm = 0;
        lastblank = 1;
        next;
    }

    if (lastpar || lastblank) {
        # Between two comments the separator is a blank comment "#",
        # otherwise it is a plain empty line.
        if (lastcomm) { print_word("#", commentColor); }
        printf "\n";
    }
    print_word(txt, commentColor);
    printf "\n";

    lastcomm = 1;
    lastblank = 0;
    lastpar = 0;
    next;
}
# Text line: any non-empty line not consumed by the rules above.
# Prints the optional location code in commentColor, then the cleaned
# text word by word through process_line, and updates the line-state
# flags ("lastpar", "lastcomm", "lastblank").
/./ {
    if (abort) { exit; }

    # Extracts the location code and transcriber code. The location code
    # is a leading "<...>"; the transcriber code is taken to be the
    # character just before the closing ">".
    if (match($0, /^<[^>]*>/)) {
        loc = substr($0, 1, RLENGTH);
        ver = substr(loc, RLENGTH-1, 1);
        # NOTE(review): "version" defaults to "*" in BEGIN, so unless the
        # caller sets it, only lines whose transcriber code is "*" pass.
        if ((version != ".") && (ver != version)) { next; }
        skip = RLENGTH;
    }
    else if (substr($0, 1, 1) == "<") {
        # Starts like a location code but has no closing ">".
        file_error("bad location code");
    }
    else {
        loc = "";
        skip = 0;
    }

    # Print blanks if appropriate:
    if (version == ".") {
        if (lastblank) { printf "\n"; }
    }
    else {
        if ((lastblank && lastcomm) || lastpar) { printf "\n"; }
    }

    printf "%*s", indent, "";
    if (loc != "") { print_word(sprintf("%-19s", loc), commentColor); }

    # Start from an empty text so that a line with nothing after the
    # location code does not reuse "txt" from an earlier line when
    # "lastpar" is computed below.
    txt = "";
    if (skip < length($0)) {
        txt = erg_erase_comments(substr($0, 1+skip));
        # Erase EVA fillers:
        gsub(/[!%]/, "", txt);
        # Replace ".," by spaces
        gsub(/[.,]/, " ", txt);
        # Insert spaces around "-" and "="
        gsub(/[-]/, " - ", txt);
        gsub(/[=]/, " = ", txt);
        # Remove spurious spaces: trim both ends, then squeeze runs of
        # spaces. The pattern must require at least one space; a pattern
        # that can match the empty string would insert a space between
        # every pair of characters.
        gsub(/^ +/, "", txt);
        gsub(/ +$/, "", txt);
        gsub(/ +/, " ", txt);
        # Now process word by word:
        process_line(txt, dic, rdic);
    }
    printf "\n";

    # A line whose cleaned text ends with "=" closes a paragraph.
    lastpar = (substr(txt, length(txt), 1) == "=");
    lastcomm = 0; lastblank = 0;
    next;
}
# Final cleanup: closes any color span that is still open and, when
# "headers" is on, prints the HTML trailer.
END {
    # Switching to noColor with an empty word makes print_word emit its
    # own closing markup (if a color is active) and nothing else, so the
    # closing markup is defined in exactly one place.
    print_word("", noColor);
    if (headers) { print_html_tail(); }
}
function print_html_head(title)
{
    # Prints the HTML preamble: document head with a page title built
    # from "title", a heading with the same text, and the opening of the
    # <pre> environment that the colorized text is written into.
    # NOTE(review): tag attributes of the original markup (e.g. on
    # <body>) are not recoverable here; plain tags are used.
    printf "<html>\n";
    printf "<head><title>Voynich Manuscript - %s </title></head>\n", title;
    printf "<body>\n";
    printf "<h1>%s</h1>\n", title;
    printf "<pre>\n";
}

function print_html_tail(title)
{
    # Prints the HTML trailer, closing what print_html_head opened.
    # The "title" parameter is unused; it is kept so the signature
    # stays unchanged.
    printf "</pre>\n";
    printf "</body>\n";
    printf "</html>\n";
}
function tbl_error(msg)
{
    # Reports a problem found in the color table; does not return
    # normally because error() exits.
    error(sprintf("color table: %s", msg));
}
function file_error(msg)
{
    # Reports a problem in the input text, tagged with the current
    # record number (FNR); does not return normally because error() exits.
    error(sprintf("line %d: %s", FNR, msg));
}
function error(msg)
{
    # Writes "msg" on a line of its own in the output stream, in color
    # "ffdd00", then terminates with exit status 1. The "abort" flag is
    # raised so that the pattern rules stop if they are still entered.
    abort = 1;
    printf "\n";
    print_word(msg, "ffdd00");
    printf "\n";
    exit 1;
}
function iso_to_html(str)
{
    # Converts an ISO Latin-1 string to HTML.
    # Basically, protects the characters [<>&] by replacing them with
    # the corresponding character entities. Returns the converted string.
    #
    # "&" must be handled first, otherwise the "&" of the entities
    # produced for "<" and ">" would be escaped again.
    # In a gsub replacement a bare "&" stands for the matched text, so a
    # literal ampersand is written "\\&".
    gsub(/&/, "\\&amp;", str);
    gsub(/</, "\\&lt;", str);
    gsub(/>/, "\\&gt;", str);
    return str;
}
function print_word(w, color)
{
    # Prints word "w" in the given color, HTML-escaped via iso_to_html.
    # Assumes the current color is "current_color" and updates it.
    # Markup is emitted only when the color changes: the span of the
    # previous color is closed and a span for the new one is opened.
    # The sentinel "noColor" means no span at all.
    # Colors are six hex digits without "#", so the "#" is added here.
    if (color != current_color) {
        if (current_color != noColor) { printf "</font>"; }
        if (color != noColor) { printf "<font color=\"#%s\">", color; }
        current_color = color;
    }
    printf "%s", iso_to_html(w);
}
function reduce_word(str)
{
    # Maps a text string with no embedded "{}"s to its reduced form by
    # applying, in this fixed order, each equivalence whose option
    # variable is set, and finally packing the result with erg_pack.
    # The erg_* functions are not defined in this file; presumably they
    # come from eva2erg.gawk, as the usage header suggests.
    if (erase_ligatures)     { str = erg_erase_ligatures(str); }
    if (erase_plumes)        { str = erg_erase_plumes(str); }
    if (ignore_gallows_eyes) { str = erg_ignore_gallows_eyes(str); }
    if (join_ei)             { str = erg_join_ei(str); }
    if (equate_aoy)          { str = erg_equate_aoy(str); }
    if (collapse_ii)         { str = erg_collapse_ii(str); }
    if (equate_eights)       { str = erg_equate_eights(str); }
    if (equate_pt)           { str = erg_equate_pt(str); }
    if (erase_q)             { str = erg_erase_q(str); }

    # Word spaces are always processed: erased when requested,
    # unified otherwise.
    if (erase_word_spaces) {
        str = erg_erase_word_spaces(str);
    }
    else {
        str = erg_unify_word_spaces(str);
    }
    return erg_pack(str);
}
function reduce_color(col)
{
    # Meant to return a dimmed version of the color "col", for words
    # matched only after reduction. For the time being it is the
    # identity: "col" is returned unchanged.
    return col;
}
function process_word(w, dic, rdic,    x, color)
{
    # Prints word "w" colorized according to the color tables "dic"
    # (exact matches) and "rdic" (similar words, consulted only when
    # showSimilar is on and the exact lookup fails).
    # The separators "-" and "=" are never colored, and neither are
    # words found in no table.
    # Assumes the current color is "current_color".
    color = noColor;
    if ((w != "-") && (w != "=")) {
        if (w in dic) {
            color = dic[w];
        }
        else if (showSimilar) {
            x = reduce_word(w);
            if (x in rdic) { color = rdic[x]; }
        }
    }
    print_word(w, color);
}
function process_line(str, dic, rdic,    words, nw, j, n)
{
    # Prints line "str" with each word colorized according to the
    # given color tables "dic" (exact matches) and "rdic" (similar words).
    # Assumes "str" has been cleaned of comments, and
    # words are separated by spaces.
    # Assumes the current color is "current_color".
    #
    # Words are the maximal runs of non-space characters; they are
    # printed separated by single spaces. All working variables are
    # local, so no counter leaks into the global namespace.
    nw = split(str, words, / +/);
    n = 0;
    for (j = 1; j <= nw; j++) {
        # Leading or trailing spaces in "str" yield empty pieces.
        if (words[j] == "") { continue; }
        if (n > 0) { printf " "; }
        process_word(words[j], dic, rdic);
        n++;
    }
}