#! /usr/bin/gawk -f
# Last edited on 1999-07-14 21:10:20 by stolfi
#
# Usage:
#
#   cat INFILE.evt \
#     | colorize-text -f eva2erg.gawk \
#         -v verbose=BOOL \
#         -v indent=INDENT \
#         -v headers=HEADERS \
#         -v colorTable=COLORTABLE \
#         -v commentColor=COMMCOLOR \
#         [-v defaultColor=DEFCOLOR] \
#         -v comments=COMMENTS \
#         [EQUIVOPTIONS] \
#     > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words. Each word is looked up in a user-provided color
# dictionary COLORTABLE. If the word is not found, it is reduced by
# some equivalence function and looked up again.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# An extra blank line is printed after each paragraph terminator "=".
# The spacing is done with "\n" not "<br>", assuming
# the output will be inserted in a <pre>...</pre>
# environment.
#
# If HEADERS is true, also provides the HTML headers and
# the <pre> directive.
#
# Lines beginning with "#" are assumed to be comments.
# If COMMENTS is 1 they are printed with COMMCOLOR.
# Otherwise they are treated as blank lines.
#
# In any case, blank lines or blank comments are suppressed when they
# occur between non-comment lines.
#
# In other contexts, multiple consecutive blank lines (or blank
# comments) are collapsed to a single blank line or blank comment,
# depending on the context.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits). If COLOR is omitted, DEFCOLOR is assumed.
#
# Words that are not found in the table are left uncolored.
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.

# ----------------------------------------------------------------------
# Initialization: set option defaults, load the color table, and
# optionally emit the HTML preamble.
BEGIN {
  abort = 0;

  # Pseudo-color meaning "no <font> element is currently open".
  noColor = "------";

  if (commentColor == "") { commentColor = noColor; }
  if (defaultColor == "") { defaultColor = "ff0000"; }
  if (version == "") { version = "*"; }
  if (showSimilar == "") { showSimilar = 0; }

  # Color of the <font> element currently open on the output.
  current_color = noColor;

  if (verbose)
    {
      printf "options:\n" > "/dev/stderr";
      if (erase_ligatures) printf " erase_ligatures\n" > "/dev/stderr";
      if (erase_plumes) printf " erase_plumes\n" > "/dev/stderr";
      if (ignore_gallows_eyes) printf " ignore_gallows_eyes\n" > "/dev/stderr";
      if (join_ei) printf " join_ei\n" > "/dev/stderr";
      if (equate_aoy) printf " equate_aoy\n" > "/dev/stderr";
      if (collapse_ii) printf " collapse_ii\n" > "/dev/stderr";
      if (equate_eights) printf " equate_eights\n" > "/dev/stderr";
      if (equate_pt) printf " equate_pt\n" > "/dev/stderr";
      if (erase_q) printf " erase_q\n" > "/dev/stderr";
      if (erase_word_spaces) printf " erase_word_spaces\n" > "/dev/stderr";
    }

  # "lastpar" tells whether previous non-blank line was a paragraph end.
  lastpar = 0;
  # "lastblank" tells whether previous line was blank (comment or not).
  lastblank = 0;
  # "lastcomm" tells whether the previous line (ignoring blanks) was a comment.
  lastcomm = 0;

  # Read color table:
  if (colorTable == "") { error("must specify \"-v colorTable=FILE\"\n"); }
  split("", dic);
  split("", rdic);
  nMap = 0;
  # Keep the getline status so that a read/open failure (negative
  # status) can be told apart from a normal end of file (zero).
  while ((status = (getline lin < colorTable)) > 0)
    {
      if (! match(lin, /^[#]/))
        {
          nfld = split(lin, fld);
          if (nfld == 0)
            { continue; }
          else if (nfld > 2)
            { tbl_error("bad entry = \"" lin "\""); }
          w = fld[1];
          if (nfld == 2)
            {
              c = fld[2];
              # An optional leading "#" on the color is accepted and dropped.
              gsub(/^#/, "", c);
              if (! match(c, /^[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]$/))
                { tbl_error("bad color = \"" c "\""); }
            }
          else
            { c = defaultColor; }
          if (w in dic) { tbl_error("repeated key = \"" lin "\""); }
          dic[w] = c;
          if (showSimilar) { rdic[reduce_word(w)] = reduce_color(c); }
          nMap++;
        }
    }
  # Test the getline status rather than comparing ERRNO with "0":
  # ERRNO is not guaranteed to hold "0" after a successful read, so
  # that comparison can report an error on every run.
  if (status < 0) { error((colorTable ": " ERRNO)); }
  close(colorTable);
  if (verbose)
    { printf "loaded %6d color table entries\n", nMap > "/dev/stderr"; }

  if (headers) { print_html_head("Colored text"); }
}

# ----------------------------------------------------------------------
# Lines starting with "##" are dropped without affecting the state.
/^##/ { next; }

# Blank lines and blank comments are only remembered, not printed.
/^[ ]*$/ { lastblank = 1; next; }

/^#[ ]*$/ { lastblank = 1; next; }

# ----------------------------------------------------------------------
# Non-blank comment lines: printed in "commentColor" when "comments"
# is true, otherwise treated like blank lines.
/^#/ {
  if (abort) exit;
  txt = $0;
  if (comments)
    {
      if (lastpar || lastblank)
        {
          # Separator line; it is a blank comment when the previous
          # printed line was itself a comment.
          if (lastcomm) { print_word("#", commentColor); }
          printf "\n";
          lastblank = 0;
        }
      print_word(txt, commentColor);
      printf "\n";
      lastcomm = 1;
      lastblank = 0;
      lastpar = 0;
    }
  else
    {
      lastcomm = 0;
      lastblank = 1;
    }
  next;
}

# ----------------------------------------------------------------------
# Text lines, with or without a leading "<f...>" location code.
/./ {
  if (abort) exit;

  # Extracts the location code and transcriber code:
  if (match($0, /^<f[0-9][0-9]*[vr][0-9]*\.[^ >]*>/))
    {
      loc = substr($0, 1, RLENGTH);
      # The transcriber code is the character just before the ">".
      ver = substr(loc, RLENGTH-1, 1);
      if ((version != ".") && (ver != version)) { next; }
      skip = RLENGTH;
    }
  else if (substr($0,1,1) == "<")
    { file_error("bad location code"); }
  else
    {
      loc = ("");
      skip = 0;
    }

  # Print blanks if appropriate:
  if (version == ".")
    { if (lastblank) { printf "\n"; } }
  else
    { if ((lastblank && lastcomm) || lastpar) { printf "\n"; } }

  printf "%*s", indent, "";
  if (loc != "") { print_word(sprintf("%-19s", loc), commentColor); }

  # Reset "txt" so that a line consisting only of a location code does
  # not inherit the text of an earlier line in the "lastpar" test below.
  txt = "";
  if (skip < length($0))
    {
      # erg_erase_comments is not defined in this file; presumably it
      # comes from eva2erg.gawk, which the usage header loads with "-f".
      txt = erg_erase_comments(substr($0,1+skip));
      # Erase EVA fillers:
      gsub(/[!%]/, "", txt);
      # Replace ".," by spaces
      gsub(/[.,]/, " ", txt);
      # Insert spaces around "-" and "="
      gsub(/[-]/, " - ", txt);
      gsub(/[=]/, " = ", txt);
      # Remove spurious spaces.  The last pattern must require at least
      # one blank: a pattern that also matches the empty string would
      # insert a blank between every pair of characters.
      gsub(/^ */, "", txt);
      gsub(/ *$/, "", txt);
      gsub(/[ ]+/, " ", txt);
      # Now process word by word:
      process_line(txt, dic, rdic);
    }
  printf "\n";
  lastpar = (substr(txt,length(txt),1) == "=");
  lastcomm = 0;
  lastblank = 0;
  next;
}

# ----------------------------------------------------------------------
# Finalization: close any open <font> and optionally emit the HTML tail.
END {
  if (current_color != noColor) { printf "</font>"; }
  if (headers) { print_html_tail(); }
}

function print_html_head(title)
{
  # Writes the HTML preamble, using "title" as page title and heading,
  # and opens the <b><pre> environment that will hold the text.
  printf "<html>\n";
  printf "<head><title>Voynich Manuscript - %s</title></head>\n", title;
  printf "<body bgcolor=\"#000000\" text=\"#cccccc\">\n";
  printf "<h1><font color=\"#ff3300\">%s</font></h1>\n", title;
  printf "<b><pre>\n";
}

function print_html_tail(title)
{
  # Closes the environments opened by "print_html_head".
  # The "title" parameter is not used.
  printf "</pre></b>\n";
  printf "</body>\n";
  printf "</html>\n";
}

function tbl_error(msg)
{
  # Reports an error in the color table and aborts.
  error(("color table: " msg));
}

function file_error(msg)
{
  # Reports an error at the current input line (FNR) and aborts.
  error(("line " FNR ": " msg));
}

function error(msg)
{
  # Prints "msg" on the standard output, in color "ffdd00" on a line
  # of its own, then sets "abort" and exits with status 1.
  printf "\n";
  print_word(msg, "ffdd00");
  printf "\n";
  abort = 1;
  exit 1;
}

function iso_to_html(str)
{
  # Converts an ISO Latin-1 string to HTML.
  # Basically, protects the characters [<>&] by replacing them with
  # the corresponding character entities.
  # The "&" must be handled first, so that the ampersands of the
  # entities generated afterwards are not escaped again.  In the
  # replacement strings, "\\&" stands for a literal "&" (a bare "&"
  # would mean "the matched text").
  gsub(/&/, "\\&amp;", str);
  gsub(/</, "\\&lt;", str);
  gsub(/>/, "\\&gt;", str);
  return str;
}

function print_word(w, color)
{
  # Prints word "w" in the given color, HTML-escaped.
  # Assumes the current color is "current_color"; a <font> element is
  # closed and/or opened only when the color actually changes.
  if (color != current_color)
    {
      if (current_color != noColor) { printf "</font>"; }
      if (color != noColor) { printf "<font color=#%s>", color; }
      current_color = color;
    }
  printf "%s", iso_to_html(w);
}

function reduce_word(str)
{
  # Converts a textstring with no embedded "{}"s to the
  # requested encoding, by applying each equivalence whose option
  # variable is true.  The erg_* functions are not defined in this
  # file; presumably they come from eva2erg.gawk.
  str = (erase_ligatures ? erg_erase_ligatures(str) : str);
  str = (erase_plumes ? erg_erase_plumes(str) : str);
  str = (ignore_gallows_eyes ? erg_ignore_gallows_eyes(str) : str);
  str = (join_ei ? erg_join_ei(str) : str);
  str = (equate_aoy ? erg_equate_aoy(str) : str);
  str = (collapse_ii ? erg_collapse_ii(str) : str);
  str = (equate_eights ? erg_equate_eights(str) : str);
  str = (equate_pt ? erg_equate_pt(str) : str);
  str = (erase_q ? erg_erase_q(str) : str);
  str = (erase_word_spaces ? erg_erase_word_spaces(str) : erg_unify_word_spaces(str));
  return erg_pack(str);
}

function reduce_color(col)
{
  # Returns a dimmed version of the color "col"
  # For the time being, just return "col" itself.
  return(col);
}

function process_word(w, dic, rdic,    x, color)
{
  # Prints word "w" colorized according to the given color tables
  # "dic" (exact matches) and "rdic" (similar words).  The separators
  # "-" and "=" are never colored; "rdic" is consulted only when
  # "showSimilar" is true.
  # Assumes the current color is "current_color"
  if ((w == "-") || (w == "="))
    { color = noColor; }
  else if (w in dic)
    { color = dic[w]; }
  else if (showSimilar)
    {
      x = reduce_word(w);
      if (x in rdic) { color = rdic[x]; } else { color = noColor; }
    }
  else
    { color = noColor; }
  print_word(w, color);
}

function process_line(str, dic, rdic,    k, kb, m, n, b, c)
{
  # Prints line "str" with each word colorized according to the
  # given color tables "dic" (exact matches) and "rdic" (similar words).
  # Assumes "str" has been cleaned of comments, and
  # words are separated by spaces.
  # Assumes the current color is "current_color"
  #
  # Locals: "k" scans the padded string, "kb" is the start of the
  # current word, "n" counts the words printed so far, and "b", "c"
  # are the previous and current characters.

  # Pad with blanks so that every word has a blank on both sides.
  str = (" " str " ");
  m = length(str);
  n = 0;
  b = substr(str,1,1);
  if (b != " ") { file_error("internal padding error"); }
  for (k = 2; k <= m; k++)
    {
      c = substr(str,k,1);
      # Blank-to-nonblank transition: a word begins here.
      if ((b == " ") && (c != " ")) { kb = k; }
      # Nonblank-to-blank transition: the word str[kb..k-1] is complete.
      if ((b != " ") && (c == " "))
        {
          if (n > 0) printf " ";
          process_word(substr(str, kb, k-kb), dic, rdic);
          n++;
        }
      b = c;
    }
  if (c != " ") { error("internal padding error"); }
}