#! /usr/bin/gawk -f
# Last edited on 1999-07-14 21:10:20 by stolfi
# Usage:
# cat INFILE.evt \
# | colorize-text -f eva2erg.gawk \
# -v verbose=BOOL \
# -v indent=INDENT \
# -v headers=HEADERS \
# -v colorTable=COLORTABLE \
# -v commentColor=COMMCOLOR \
# [-v defaultColor=DEFCOLOR] \
# -v comments=COMMENTS \
# [EQUIVOPTIONS] \
# > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words. Each word is looked up in a user-provided color
# dictionary COLORTABLE. If the word is not found, it is reduced by
# some equivalence function and looked up again.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# An extra blank line is printed after each paragraph terminator "=".
# The spacing is done with "\n" not "<br>", assuming
# the output will be inserted in a <pre>...</pre>
# environment.
#
# If HEADERS is true, provides also the HTML headers and
# the
# <pre> directive.
#
# Lines beginning with "##" are discarded outright.
#
# Other lines beginning with "#" are assumed to be comments.
# If COMMENTS is 1 they are printed with COMMCOLOR.
# Otherwise they are treated as blank lines.
#
# In any case, blank lines or blank comments are suppressed when they
# occur between non-comment lines.
#
# In other contexts, multiple consecutive blank lines (or blank
# comments) are collapsed to a single blank line or blank comment,
# depending on the context.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits, optionally preceded by "#").  If COLOR is omitted, DEFCOLOR
# is assumed ("ff0000" when DEFCOLOR is not given).  Lines of the
# table that begin with "#" and blank lines are ignored.
#
# Words that are not found in the table are left uncolored.
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.
#
# Two further variables are read by the code below:
#
#   -v version=CHAR      For lines that carry a location code, keep only
#                        those whose transcriber code (the character just
#                        before the closing ">") equals CHAR.  The value
#                        "." keeps every line.  Defaults to "*".
#   -v showSimilar=BOOL  Also color words whose reduced form equals the
#                        reduced form of some table entry.  Defaults to 0.
#
# NOTE(review): the HTML tags inside the string literals of
# print_html_head, print_html_tail and print_word, and the "<" in the
# location-code and iso_to_html patterns, were reconstructed from
# context after being lost from the file text -- confirm them against
# a pristine copy of the script.

BEGIN {
  abort = 0;

  # Pseudo-color meaning "no <font> element is currently open".
  noColor = "------";

  if (commentColor == "") { commentColor = noColor; }
  if (defaultColor == "") { defaultColor = "ff0000"; }
  if (version == "")      { version = "*"; }
  if (showSimilar == "")  { showSimilar = 0; }

  current_color = noColor;

  if (verbose)
    {
      printf "options:\n" > "/dev/stderr";
      if (erase_ligatures)     printf " erase_ligatures\n" > "/dev/stderr";
      if (erase_plumes)        printf " erase_plumes\n" > "/dev/stderr";
      if (ignore_gallows_eyes) printf " ignore_gallows_eyes\n" > "/dev/stderr";
      if (join_ei)             printf " join_ei\n" > "/dev/stderr";
      if (equate_aoy)          printf " equate_aoy\n" > "/dev/stderr";
      if (collapse_ii)         printf " collapse_ii\n" > "/dev/stderr";
      if (equate_eights)       printf " equate_eights\n" > "/dev/stderr";
      if (equate_pt)           printf " equate_pt\n" > "/dev/stderr";
      if (erase_q)             printf " erase_q\n" > "/dev/stderr";
      if (erase_word_spaces)   printf " erase_word_spaces\n" > "/dev/stderr";
    }

  # "lastpar" tells whether previous non-blank line was a paragraph end.
  lastpar = 0;
  # "lastblank" tells whether previous line was blank (comment or not).
  lastblank = 0;
  # "lastcomm" tells whether the previous line (ignoring blanks) was a comment.
  lastcomm = 0;

  # Read color table:
  if (colorTable == "") { error("must specify \"-v colorTable=FILE\"\n"); }
  split("", dic);
  split("", rdic);
  nMap = 0;
  # Keep the getline status: it is 1 per record, 0 at end of file and
  # -1 on a read/open failure.  Testing ERRNO alone is unreliable
  # because its "no error" value is not "0" in current gawk.
  while ((tbl_status = (getline lin < colorTable)) > 0)
    {
      if (lin ~ /^#/) { continue; }
      nfld = split(lin, fld);
      if (nfld == 0) { continue; }
      if (nfld > 2) { tbl_error("bad entry = \"" lin "\""); }
      w = fld[1];
      if (nfld == 2)
        {
          c = fld[2];
          gsub(/^#/, "", c);
          # HTML hex colors are case-insensitive, so accept A-F too.
          if (c !~ /^[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]$/)
            { tbl_error("bad color = \"" c "\""); }
        }
      else
        { c = defaultColor; }
      if (w in dic) { tbl_error("repeated key = \"" lin "\""); }
      dic[w] = c;
      if (showSimilar) { rdic[reduce_word(w)] = reduce_color(c); }
      nMap++;
    }
  if (tbl_status < 0) { error((colorTable ": " ERRNO)); }
  close(colorTable);

  if (verbose)
    { printf "loaded %6d color table entries\n", nMap > "/dev/stderr"; }

  if (headers) { print_html_head("Colored text"); }
}

# Lines starting with "##" are dropped without affecting the blank-line state.
/^##/ { next; }

# Blank lines and blank comments are only remembered, not printed here.
/^[ ]*$/ { lastblank = 1; next; }

/^#[ ]*$/ { lastblank = 1; next; }

# Non-blank comment lines.
/^#/ {
  if (abort) exit;
  txt = $0;
  if (comments)
    {
      if (lastpar || lastblank)
        {
          # Between two comments the separator is a blank comment "#",
          # otherwise a plain blank line.
          if (lastcomm) { print_word("#", commentColor); }
          printf "\n";
          lastblank = 0;
        }
      print_word(txt, commentColor);
      printf "\n";
      lastcomm = 1;
      lastblank = 0;
      lastpar = 0;
    }
  else
    {
      # Comments are not shown: treat the line as a blank line.
      lastcomm = 0;
      lastblank = 1;
    }
  next;
}

# Text lines (with or without a location code).
/./ {
  if (abort) exit;

  # Extracts the location code and transcriber code:
  if (match($0, /^<[^>]*>/))
    {
      loc = substr($0, 1, RLENGTH);
      ver = substr(loc, RLENGTH-1, 1);
      if ((version != ".") && (ver != version)) { next; }
      skip = RLENGTH;
    }
  else if (substr($0,1,1) == "<")
    { file_error("bad location code"); }
  else
    { loc = ""; skip = 0; }

  # Print blanks if appropriate:
  if (version == ".")
    { if (lastblank) { printf "\n"; } }
  else
    { if ((lastblank && lastcomm) || lastpar) { printf "\n"; } }

  printf "%*s", indent, "";
  if (loc != "") { print_word(sprintf("%-19s", loc), commentColor); }

  # Reset "txt" so that a line holding only a location code does not
  # inherit the paragraph-end status of some earlier line.
  txt = "";
  if (skip < length($0))
    {
      txt = erg_erase_comments(substr($0, 1+skip));
      # Erase EVA fillers:
      gsub(/[!%]/, "", txt);
      # Replace ".," by spaces
      gsub(/[.,]/, " ", txt);
      # Insert spaces around "-" and "="
      gsub(/[-]/, " - ", txt);
      gsub(/[=]/, " = ", txt);
      # Remove spurious spaces.  The last pattern must require at least
      # one space: a pattern that can match the empty string would put
      # a blank between every pair of characters.
      gsub(/^ */, "", txt);
      gsub(/ *$/, "", txt);
      gsub(/ +/, " ", txt);
      # Now process word by word:
      process_line(txt, dic, rdic);
    }
  printf "\n";
  lastpar = (substr(txt, length(txt), 1) == "=");
  lastcomm = 0;
  lastblank = 0;
  next;
}

END {
  # Close the <font> element left open by the last print_word, if any.
  if (current_color != noColor) { printf "</font>"; }
  if (headers) { print_html_tail(); }
}

function print_html_head(title)
{
  # Writes the HTML preamble, a heading with "title", and opens the
  # <pre> environment.
  printf "<html>\n";
  printf "<head><title>Voynich Manuscript - %s</title></head>\n", title;
  printf "<body>\n";
  printf "<h1>%s</h1>\n", title;
  printf "<pre>\n";
}

function print_html_tail(title)
{
  # Closes what print_html_head opened.  The "title" parameter is unused.
  printf "</pre>\n";
  printf "</body>\n";
  printf "</html>\n";
}

function tbl_error(msg)
{
  # Reports a problem in the color table and aborts.
  error(("color table: " msg));
}

function file_error(msg)
{
  # Reports a problem in the current input line and aborts.
  error(("line " FNR ": " msg));
}

function error(msg)
{
  # Reports "msg" both on stderr and, highlighted, in the HTML output,
  # then terminates with status 1 (the END rule still runs).
  printf "%s\n", msg > "/dev/stderr";
  printf "\n";
  print_word(msg, "ffdd00");
  printf "\n";
  abort = 1;
  exit 1;
}

function iso_to_html(str)
{
  # Converts an ISO Latin-1 string to HTML.
  # Basically, protects the characters [<>&].
  # "&" must be handled first so the entities added below survive.
  # In a gsub replacement "\\&" yields a literal "&".
  gsub(/&/, "\\&amp;", str);
  gsub(/</, "\\&lt;", str);
  gsub(/>/, "\\&gt;", str);
  return str;
}

function print_word(w, color)
{
  # Prints word "w" in the given color.
  # Assumes the current color is "current_color", and updates it.
  if (color != current_color)
    {
      if (current_color != noColor) { printf "</font>"; }
      if (color != noColor) { printf "<font color=\"#%s\">", color; }
      current_color = color;
    }
  printf "%s", iso_to_html(w);
}

function reduce_word(str)
{
  # Converts a textstring with no embedded "{}"s to the
  # requested encoding, by applying the eva2erg.gawk reductions
  # selected through the EQUIVOPTIONS variables.
  if (erase_ligatures)     { str = erg_erase_ligatures(str); }
  if (erase_plumes)        { str = erg_erase_plumes(str); }
  if (ignore_gallows_eyes) { str = erg_ignore_gallows_eyes(str); }
  if (join_ei)             { str = erg_join_ei(str); }
  if (equate_aoy)          { str = erg_equate_aoy(str); }
  if (collapse_ii)         { str = erg_collapse_ii(str); }
  if (equate_eights)       { str = erg_equate_eights(str); }
  if (equate_pt)           { str = erg_equate_pt(str); }
  if (erase_q)             { str = erg_erase_q(str); }
  if (erase_word_spaces)
    { str = erg_erase_word_spaces(str); }
  else
    { str = erg_unify_word_spaces(str); }
  return erg_pack(str);
}

function reduce_color(col)
{
  # Returns a dimmed version of the color "col".
  # For the time being, just returns "col" itself.
  return col;
}

function process_word(w, dic, rdic,    x, color)
{
  # Prints word "w" colorized according to the given color tables
  # "dic" (exact matches) and "rdic" (similar words).
  # The separators "-" and "=" are never colored.
  # Assumes the current color is "current_color".
  color = noColor;
  if ((w != "-") && (w != "="))
    {
      if (w in dic)
        { color = dic[w]; }
      else if (showSimilar)
        {
          x = reduce_word(w);
          if (x in rdic) { color = rdic[x]; }
        }
    }
  print_word(w, color);
}

function process_line(str, dic, rdic,    words, nw, i)
{
  # Prints line "str" with each word colorized according to the
  # given color tables "dic" (exact matches) and "rdic" (similar words).
  # Assumes "str" has been cleaned of comments, and
  # words are separated by spaces.  Words are written separated by
  # single blanks.
  # Assumes the current color is "current_color".
  gsub(/^ +/, "", str);
  gsub(/ +$/, "", str);
  if (str == "") { return; }
  nw = split(str, words, / +/);
  for (i = 1; i <= nw; i++)
    {
      if (i > 1) { printf " "; }
      process_word(words[i], dic, rdic);
    }
}