#! /usr/bin/gawk -f # Last edited on 2025-05-10 13:56:14 by stolfi # Reads an EVT file as of release 16e6. Tries to convert it to an # equivalent file in XEVT format. This includes parsing the simple # glyphs that are presumably connected by ligatures into compound glyphs # (like "Sh", "CTh", etc, but more general) and enclosing each separate # glyph, simple or compound, into parens '()'. # # Lines that are '#'-comments are preserved. # # In the text, the alignment fillers '!' and trailing blanks are deleted # and any string of consecutive omitted-word markers '%' is condensed to # a single '%'. Comments embedded in the text with '{}' are preserved # but must not contain '{}' or start with '{&'. Blanks are allowed # inside comments and preserved but scrunched to single blanks. # # Also creates a file "cetx_weirdos.tbl" with a table that maps # weirdo codes seen in the input to the corresponding codes # in the output. A line locator is inserted in that file as a '#'-comment # before the weirdos found on that line. # # Must be executed with "-f convert_evt_to_xevt_funcs.gawk". 
#

# Initialization: reset the counters and the debugging state, and start a
# fresh weirdo mapping table file.
BEGIN {
  abort = -1;     # Exit status requested by an error routine; negative = none.
  nread = 0;      # Input lines read.
  ndata = 0;      # Data lines written.
  npages = 0;     # Page attribute lines passed through.
  nunits = 0;     # Unit attribute lines passed through.
  nplaces = 0;    # Location attribute lines passed through.
  ncomm = 0;      # Blank and '#'-comment lines passed through.

  # Not used in this file; presumably read by the conversion functions
  # loaded with "-f convert_evt_to_xevt_funcs.gawk" -- TODO confirm.
  cetx_old_weirdo_max = 199;
  cetx_new_weirdo_next = 400;

  # When {debug_step} is positive, the first data lines and then every
  # {debug_step}-th data line are traced to stderr.
  debug_step = 0;
  debug_line = 0;

  # Clear the weirdo table:
  weirdos_table = "cetx_weirdos.tbl";
  printf "# Weirdo mapping table\n" > weirdos_table;
}

# Stop as soon as an error routine has requested an abort:
(abort >= 0) { exit(abort); }

# Count lines read:
// { nread++; }

# Pass through blank and comment lines:
/^[ ]*([#]|$)/ { print; ncomm++; next; }

# Set the debugging flag, debug input:
// {
  if ((debug_step > 0) && ((ndata <= 50) || ((ndata % debug_step) == 0))) { debug_line = 1; }
  if (debug_line) { printf "%d: input = [[%s]]\n", nread, $0 > "/dev/stderr"; }
}

# Cleanup some blanks (TABs become blanks, then trim both ends):
// {
  gsub(/[\011]/, " ", $0);
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
}

# Record line locators in the weirdos table file:
# NOTE(review): the pattern and most of the action of this rule are missing
# from the source as received (text between '<' and '>' seems to have been
# stripped).  The surviving fragment is kept verbatim; restore the rule from
# the upstream file before running.
/> weirdos_table; }

# NOTE(review): the three pass-through rules below have identical patterns as
# received, so the second and third can never fire.  Each pattern presumably
# started with a distinct locator expression that was stripped -- restore
# them from the upstream file.

# Pass through EVT page attribute lines:
/^ *[{][$}]/ { wrout($0); npages++; next; }

# Pass through EVT unit attribute lines:
/^ *[{][$}]/ { wrout($0); nunits++; next; }

# Pass through EVT location attribute lines:
/^ *[{][$}]/ { wrout($0); nplaces++; next; }

# Process data lines:
# NOTE(review): the head of this rule (its pattern, the assignment of {eline},
# and the start of the 'if (! match(eline, ...' test) is missing from the
# source as received.  The surviving text is kept verbatim from here on.
/^/)) { data_error("can't find locator"); }
  else {
    if (RSTART != 1) { prog_error("RSTART"); }
    # Split the line into the locator and the EVT text that follows it:
    loc = substr(eline, RSTART, RLENGTH);
    etext = substr(eline, RSTART + RLENGTH);
    gsub(/^[ ]+/, "", etext);
    etext = undo_lowercasing_of_ligatures(etext);
    xtext = cetx_convert_evt_to_xevt(etext);
    # NOTE(review): {xline} already ends with "\n" and wrout() uses 'print',
    # so every data line is followed by an empty line -- confirm intended.
    xline = sprintf("%-18s %s\n", loc, xtext);
    # A conversion that left an "[ERROR]" marker forces a debug trace of this
    # line and then stops the run with status 1:
    failed = (xline ~ /\[ERROR\]/);
    debug_line = debug_line || failed;
    wrout(xline);
    ndata++;
    debug_line = 0;
    if (failed) { exit(1); }
  }
  next;
}

# Final report.  (This rule was spelled 'EBD', which awk reads as an
# always-false variable pattern, so it never ran.)
END {
  close(weirdos_table);
  fflush();
  if (abort > 0) { printf "aborted\n" > "/dev/stderr"; exit(abort); }
  printf "%6d lines read\n", nread > "/dev/stderr";
  printf "%6d comment lines\n", ncomm > "/dev/stderr";
  printf "%6d page header lines\n", npages > "/dev/stderr";
  printf "%6d section header lines\n", nunits > "/dev/stderr";
  printf "%6d location header lines\n", nplaces > "/dev/stderr";
  printf "%6d data lines written\n", ndata > "/dev/stderr";
}

function undo_lowercasing_of_ligatures(etext,   res,pref,code,suff,beg,fin,len) {
  # Undoes the EVT lowercasing of EVA uppercase ligatures
  # "C[H]+h", "S[H]*h", and /C[TKFP]+[H]*h/.
  # Namely, converts 'c'+gallows and 's' to upper case if followed by one or
  # more 'h', unless the group is enclosed in "(" meaning that there is no
  # ligation.  Returns the converted copy of {etext}.
  #
  # The variables after {etext} are locals; callers pass only {etext}.
  # The built-in {RSTART} and {RLENGTH} are read but not modified here.

  res = "";
  while (match(etext, /([^(cs]|^)(c[tkfpzw]*[h]+|[s][h]+)([^h)]|$)/)) {
    # The match may include one context char on each side; exclude them:
    beg = RSTART;
    fin = RSTART + RLENGTH - 1;
    if (substr(etext, beg, 1) !~ /[cs]/) { beg++; }
    if (substr(etext, fin, 1) != "h") { fin--; }
    len = fin - beg + 1;
    if (len < 1) { prog_error(("RLENGTH")); }

    # Trisect {etext}:
    pref = substr(etext, 1, beg-1);
    code = substr(etext, beg, len);
    suff = substr(etext, beg + len);
    if (substr(code, 1, 1) !~ /[cs]/) { prog_error(("code = '" code "' beg char")); }
    if (substr(code, length(code), 1) != "h") { prog_error(("code = '" code "' end char")); }

    # Uppercase all but last 'h'.  Only the final letter is lowered again,
    # so that "chh" becomes "CHh" rather than "ChH":
    code = toupper(code);
    sub(/H$/, "h", code);

    # Move on; the trailing context char stays in {suff} for the next round:
    res = (res pref code);
    etext = suff;
  }
  res = (res etext);
  return res;
}

function wrout(xline) {
  # Prints {xline} to standard output.  When {debug_line} is set, first
  # echoes the current input record and {xline} to stderr.
  if (debug_line) {
    printf "convert_evt_format_to_xevt_format.gawk:\n" > "/dev/stderr";
    printf "%d: input = [[%s]]\n", nread, $0 > "/dev/stderr";
    printf "%d: output = [[%s]]\n\n", nread, xline > "/dev/stderr";
  }
  print xline;
}

function data_error(msg) {
  # Reports a problem with the current input record (file name, record
  # number, {msg}, and the record itself) on stderr, sets {abort} and exits
  # with status 1.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg) {
  # Reports an internal inconsistency on stderr, sets {abort} and exits with
  # status 1.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}