#! /bin/gawk -f
# Last edited on 2025-07-03 09:33:17 by stolfi

# ??? Update to new IVTFF-like format.

# To be included in other gawk programs.  Defines a function that
# converts the EVA-like code used in the interlinear version
# 16e6 to XEVA with each compound glyph surrounded by parentheses.  A
# /glyph/ being defined as a maximal group of strokes that are connected
# (or which very likely were meant to be connected but got disconnected
# by accident).  Moreover, non-numeric weirdo codes like "c{&c~}" or "*{&foo}"
# are replaced by numeric "&NNN" ones.

# Also appends to file "cetx_weirdos.tbl" one line for each weirdo code generated
# in the resulting string, with the input (possibly non-numeric) and output
# (always numeric).

function cetx_convert_evt_to_xevt(lin,   res,xel,enext,xnext,xprev,nsimp) {
  # This function removes from {lin} all '!' alignment fillers, any
  # trailing '#'-comments, leading and trailing blanks.  It also
  # condenses any string of one or more consecutive '%' to a single '%'.
  # NOTE(review): earlier documentation also promised condensing of
  # interior blanks; the code below only strips leading/trailing blanks
  # (after mapping tabs to blanks) -- confirm which is intended.
  #
  # Then it parses {lin}, from left to right, as a sequence of tokens
  # satisfying these patterns (in priority order):
  #
  #   A word, line, or parag delimiter in [-=.,].
  #
  #   A sequence of one or more simple glyph codes that are ligated together.
  #
  #   A missing word marker '%'.
  #
  #   A comment in braces (that is not a weirdo code, see below)
  #
  # SIMPLE GLYPH CODES
  #
  # A simple glyph code in the input string may be any letter [a-z] or
  # [ACEFHIKO-TWYZ], or '*' to denote an unreadable glyph, both possibly
  # followed by a weirdo code.  The capital letters [BDGJLMNUVX] are
  # invalid in the input string.
  #
  # A letter or '*' followed by a weirdo code is assumed to be a
  # placeholder for the weirdo and thus discarded.
  #
  # Otherwise each letter in the input string denotes a simple glyph as
  # in the EVA encoding extended to the whole alphabet [a-z] as
  # explained elsewhere.  Capital letters [AHIORS] denote the same glyph
  # as the lowercase letter, but with a horizontal ligature line
  # extending to the right from the top of the glyph.  In the case of
  # 'R' and 'S' the plume of 'r' and 's' is displaced to the right, to
  # the approximate midpoint of this ligature line.
  #
  # For compatibility with EVA, the letters 'c' and 'C' denote an EVA
  # @e with that same top right ligature line.
  #
  # The letter 'E' denotes an eva @e glyph WITHOUT the top ligature
  # line but with an extra long lower end that can connect to some other
  # glyphs from below.
  #
  # As in EVA, the letter 'h' denotes an @e with a horizontal ligature
  # line extending to the LEFT at the top; and 'H' is an @e with both
  # left and right top ligature lines.
  #
  # The letter 'Q' denotes the EVA @q glyph but with a longer
  # horizontal stroke and raised until that stroke is level with the top
  # of an @o, so that it serves as the right ligature line of other
  # capital letters.
  #
  # As in EVA, input codes [ktfpzw] denote the plain gallows, where 'z'
  # and 'w' denote the same as 'f' and 'p' respectively but with a
  # @e-like hook at the end of the horizontal arm; and [KTFPZW] are
  # the same with the legs slashed by a horizontal stroke at
  # @o-height, which will be the floor of the gallows' "platform".
  #
  # A weirdo code is either [{][&][0-9][0-9][0-9][}] denoting a
  # weirdo glyph with that three-digit number (defined elsewhere), or
  # [{][&][^{}]*[}] denoting some ad-hoc encoding of the weirdo,
  # like "{&O'}" for an EVA @o with a plume and a top right ligature.
  # In both cases the braces are stripped.  In the second case,
  # the sequence between '&' and '}' is replaced by a freshly allocated
  # three-digit code.  Either way, the '&' and the three digits
  # are then treated like a letter, as above.
  #
  # LIGATURES OF SIMPLE GLYPHS
  #
  # Consecutive simple glyphs can be joined into a single XEVA glyph
  # code only if their shapes are such that they are connected
  # together.  Those that have a top right ligature line, as well as
  # slashed gallows with input codes [KTFPZW], will automatically
  # connect to any subsequent simple glyph with any of the codes
  # [a-eg-jl-orsu] or their upper-case versions, as well as with the
  # slashed gallows [KTFPZW].
  #
  # The interglyph space code ',' can be used in the input, if necessary, to
  # prevent the joining of simple glyphs even when they would normally
  # join.
  #
  # This function assumes that, as in original EVA, benches and platform
  # gallows are written /[CS][H]*h/ and /C[TKFPZW]+[H]*h/.
  # The "lowercased EVA" notations /csh+/ and /c[tkfpzw]+h+/ must be
  # converted to uppercase before calling this function.  (But beware
  # of cases like when "cth" actually means a "CTh" without the slash
  # across the "t".)
  #
  # The simple glyph 'E' connects to any subsequent glyph with a round
  # left outline, namely those with codes [a-egosuy] or their uppercase
  # variants that are valid.  Otherwise the input code 'E' is
  # meant to be converted to lowercase 'e'.
  # NOTE(review): the code below does not perform that 'E' -> 'e'
  # fallback; an unligated 'E' is emitted as 'E'.  Confirm intent.
  #
  # For compatibility with EVA, the input codes 'c' and 'C' denote the same
  # simple glyph, an @e with top right ligature.  Either character is
  # converted to 'c' if not ligated to a subsequent glyph, or to 'C' if
  # ligated.
  #
  # The codes 'h' and 'H' will connect with any PRECEDING simple
  # glyph with codes [acehioqrsy] or their uppercase versions, in which
  # case this preceding code is automatically converted to uppercase.
  # They also connect to preceding slashed gallows [KTFPZW].
  # This rule is applied after the EVT sequences /^c[ktfpzw]*/
  # and 's' preceding the 'h' are converted to upper-case,
  # as explained above.
  # NOTE(review): as implemented, {cetx_glyphs_will_connect} joins only
  # when the preceding code is 'c' or already uppercase, so the caller
  # must have done the uppercasing beforehand.
  #
  # The interpretation of 'c', 'C', and 'E' is unnecessarily complex.
  # By logic, 'E' (not 'c' or 'C') should be 'e' with a top right
  # ligature line.  Then 'c' (not 'E') could be the bottom-connecting
  # 'e', and 'C' would be invalid.  But the rules above were chosen in
  # order to maintain compatibility with the original EVA encoding.
  #
  # The input codes '*' and '%' do not connect to any others and
  # therefore each occurrence is parsed as a token by itself.
  #
  # A few weirdos may connect to the right (like 'c' and capital
  # input codes) or to the left (like 'h' and 'H').
  # NOTE(review): no per-weirdo ligature table is visible in this file;
  # as implemented, a "&NNN" code never ligates to its right neighbour.
  #
  # OUTPUT XEVA LINE
  #
  # The output string is a list of XEVA glyph codes, punctuation signs
  # [-=.,] and '{}'-comments.
  #
  # Punctuation codes have the same meaning as in the EVT encoding.
  # Ideally, if '{}' comments are ignored, there should be no two
  # punctuation signs in a row, and '=' should occur only at the end of
  # the line.  As in the EVT encoding, the '-' code may appear in the
  # middle of a line if the text is interrupted by a figure or some
  # other obstacle.
  #
  # A XEVA glyph code is either "%" denoting one or more words that were
  # omitted by the transcriber, or a /simple glyph/ code, or a /compound
  # glyph/ code consisting of a sequence of two or more simple glyph
  # codes enclosed in '()'.  Each simple glyph code is either a letter or
  # a three-digit numeric weirdo code "&NNN" (without braces).
  # NOTE(review): earlier documentation described the weirdo code as
  # /[&][0-9][0-9][0-9][;]/; this implementation emits no trailing ';'.
  # In either case, the meaning of these simple glyph codes is
  # the same as in the input string (original EVA extended with letters
  # [bcgju-zAEIHOQRWYZ]).
  #
  # In any compound XEVA glyph code (that has two or more simple glyph
  # codes) every pair of adjacent simple glyph codes must naturally
  # connect to each other, and the automatic case conversions described
  # above must have been applied.  Thus '(RCTHHo)' is valid but '(Cv)'
  # and '(cth)' are not.
  # The simple glyphs @q, @v, and @x cannot connect to anything, but
  # 'Q' can connect to the right like 'c' and other capitals.  In XEVA
  # (unlike the "lowercased EVA" used in EVMT 16e6) the lowercase
  # letters, including 'c', cannot connect to the right.  Uppercase
  # letters can connect to the right but only to some letters.
  # In particular, only the last simple glyph code in a compound XEVA
  # glyph code may be a lowercase letter.

  # If the parsing runs into an invalid syntax in the input string, the
  # function appends "[ERROR]" to the XEVA glyph code, instead of a XEVA
  # simple glyph code, discards the next character of the input, and keeps
  # trying to parse.

  # The function uses global variables {cetx_old_weirdo_max} and
  # {cetx_new_weirdo_next}.  When a new non-numeric weirdo code is
  # encountered, its code is replaced by {cetx_new_weirdo_next}, which is
  # incremented.  When a numeric
  # weirdo code is encountered, its value must be no greater than
  # {cetx_old_weirdo_max}.
  #
  # Whenever a weirdo code "&NNN" is appended to the result string,
  # a line is appended to file "cetx_weirdos.tbl" with format "{OLD} {NEW}"
  # where {OLD} is the original representation of the weirdo in the input
  # (possibly non-numeric) and {NEW} is the resulting code "&NNN".

  # Ensure that the globals are numeric and OK:
  cetx_old_weirdo_max += 0;
  cetx_new_weirdo_next += 0;
  if (cetx_new_weirdo_next <= cetx_old_weirdo_max) {
    cetx_prog_error(("counters " cetx_old_weirdo_max " " cetx_new_weirdo_next));
  }

  res = ""; # Converted line.

  # Remove '#'-comment if any:
  gsub(/[#].*$/, "", lin);

  # Remove alignment fillers:
  gsub(/[!]/, "", lin);

  # Normalize blanks:
  gsub(/[\011]/, " ", lin);
  gsub(/^[ ]+/, "", lin);
  gsub(/[ ]+$/, "", lin);

  # Reduce multiple '%' to single '%':
  gsub(/[%][%]+/, "%", lin);

  # Reduce multiple '.' and ',' to single item ('.' wins over ','):
  gsub(/[,.]*[.][,.]*/, ".", lin);
  gsub(/[,][,]+/, ",", lin);

  xel = "";   # Current incomplete XEVA glyph code.
  xprev = ""; # XEVA code of previous simple glyph in {xel}, or "" if none.
  nsimp = 0;  # Number of simple glyph codes in {xel}.

  while (lin != "") {
    # printf "@@ res = «%s» xel = «%s» lin = «%s»\n", res, xel, lin > "/dev/stderr";
    # Parse the next input code, setting {RSTART,RLENGTH}:
    if (! cetx_parse_next_evt_glyph(lin)) {
      cetx_data_error(("invalid EVT text line syntax '" substr(lin, 1, 6) "...'"));
      xel = (xel "[ERROR]");
      lin = substr(lin, 2);
    } else {
      if (RSTART != 1) { cetx_prog_error(("RSTART!=1 lin = '" lin "'")); }
      enext = substr(lin, RSTART, RLENGTH); # Next EVT glyph code, possibly with weirdo subst.
      lin = substr(lin, RSTART + RLENGTH);
      # A token that starts with '{' can only be a comment here, since
      # weirdo codes are always parsed together with their leading letter.
      if ( \
        (enext == ",") || \
        (enext == ".") || \
        (enext == "-") || \
        (enext == "=") || \
        (enext == "%") || \
        (enext ~ /^[{]/) \
      ) {
        # Punctuation, missing word, or embedded '{}'-comment.
        # Flush the pending glyph code, if any:
        if (xel != "") {
          if (nsimp >= 2) { xel = ( "(" xel ")" ); }
          res = (res xel);
          xel = ""; xprev = ""; nsimp = 0;
        }
        # Always copy the token itself, even when no glyph was pending
        # (e.g. at start of line, or after another delimiter or comment):
        res = (res enext);
      } else {
        # {enext} must be a single letter possibly followed by a weirdo code.
        # Apply weirdo replacement and convert EVT '{&...}' to XEVA '&NNN':
        xnext = cetx_convert_simple_eva_glyph_to_xeva(enext);
        if (xnext == "[ERROR]") {
          cetx_data_error(("EVA to XEVA conversion error"));
        }
        if ((nsimp >= 1) && cetx_glyphs_will_connect(xprev, xnext)) {
          # Ligated: everything before the last simple glyph becomes uppercase.
          xel = (toupper(xel) xnext);
          xprev = xnext;
          nsimp++;
        } else {
          # Not ligated: flush the pending glyph code and start a new one.
          if (xel != "") {
            if (nsimp >= 2) { xel = ( "(" xel ")" ); }
            res = ( res xel );
          }
          xel = xnext; xprev = xnext; nsimp = 1;
        }
      }
    }
  }

  # Finish off last XEVA glyph code, if any:
  if (xel != "") {
    if (nsimp >= 2) { xel = ( "(" xel ")" ); }
    res = ( res xel );
  }
  return res;
}

function cetx_parse_next_evt_glyph(lin) {
  # Tries to match the beginning of {lin} to a valid EVT token:
  # a punctuation sign, a missing word marker, a '{}'-comment, or a
  # simple glyph code including eventual weirdo replacement.  If it
  # succeeds, sets {RSTART,RLENGTH} and returns 1.  If it fails,
  # returns 0, and leaves {RSTART,RLENGTH} undefined.

  if (match(lin, /^([-=.,%]|[{]([^&{}][^{}]*|)[}])/)) {
    # Punctuation, missing word, or comment (possibly empty "{}"):
    return 1;
  } else if (match(lin, /^[*a-zA-Z]([{][&][^{}]*[}]|)/)) {
    # Simple glyph code, possibly followed by a weirdo code:
    return 1;
  } else {
    return 0;
  }
}

function cetx_convert_simple_eva_glyph_to_xeva(esym,   let,tail) {
  # The argument {esym} should be "*" or a single letter denoting a simple glyph in EVT
  # notation, possibly followed by a weirdo code like "{&312}" or "{&O'}".
  # Returns the XEVA representation of {esym}.  In particular,
  # converts non-numeric weirdos to numeric ones.
  # In case of error, prints a message and returns "[ERROR]".

  # If the result is a weirdo code, writes the corresponding line to
  # "cetx_weirdos.tbl".

  # Check for valid chars:
  let = substr(esym, 1, 1);
  tail = substr(esym, 2);
  if (let !~ /^[*a-zACEFHIKO-TWYZ]$/) {
    cetx_data_error(("invalid letter code '" let "' in '" esym "'"));
    return "[ERROR]";
  }

  # Convert 'C' to 'c' in case it does not ligate to the right.
  # If it does, it will be converted back to 'C' by the caller.
  if (let == "C") { let = "c"; }

  if (tail != "") {
    # Tail must be a code in braces of a weirdo that should replace {let}.
    let = cetx_normalize_weirdo_code(tail);
  }
  return let;
}

function cetx_normalize_weirdo_code(code,   res,num) {
  # The argument must be a three-digit weirdo code, like "&301", or a
  # non-numeric code, like "&O'", possibly enclosed in braces.  Converts
  # non-numeric codes to numeric and returns "&" followed by three
  # digits, without braces.  In case of error, prints a message and
  # returns "[ERROR]" instead.

  # Also appends the line "{code} {result}" to "cetx_weirdos.tbl"
  # (including when the result is "[ERROR]").

  res = code;
  # Strip the enclosing braces, if any:
  if (match(res, /^[{].*[}]$/)) { res = substr(res, 2, length(res) - 2); }
  if (match(res, /^[&][0-9][0-9][0-9]$/)) {
    # Numeric code: must be one of the previously defined weirdos.
    num = substr(res, 2) + 0;
    if (num > cetx_old_weirdo_max) {
      cetx_data_error(("invalid numeric weirdo code '" res "'"));
      res = "[ERROR]";
    }
  } else if (match(res, /^[&][^{}&]*$/)) {
    # Non-numeric code: allocate the next free number.
    num = cetx_new_weirdo_next;
    cetx_new_weirdo_next++;
    if (num > 999) {
      cetx_data_error(("too many non-num weirdos"));
      res = "[ERROR]";
    } else {
      res = sprintf("&%03d", num);
    }
  } else {
    cetx_data_error(("invalid weirdo code '" res "' in input"));
    res = "[ERROR]";
  }
  printf "%s %s\n", code, res >> "cetx_weirdos.tbl";
  return res;
}

function cetx_glyphs_will_connect(xprev, xnext) {
  # Returns 1 if the simple XEVA glyph code {xprev} ligates to the
  # simple XEVA glyph code {xnext} that follows it, 0 otherwise.
  # Only the last character of {xprev} and the first of {xnext} matter.

  if ((xprev ~ /[cA-DF-Z]$/) && (xnext ~ /^[a-eg-jl-orsuA-EG-JL-ORSUKTFPZW]/)) {
    # 'c' or a capital other than 'E' (top right ligature line or
    # slashed gallows), followed by a glyph that accepts it, including
    # the slashed gallows [KTFPZW]:
    return 1;
  } else if ((xprev ~ /[E]$/) && (xnext ~ /^[a-egosuyA-EFGKOPQSTUWYZ]/)) {
    # 'E' (bottom-connecting @e) followed by one of the codes it joins:
    return 1;
  } else {
    return 0;
  }
}

function cetx_data_error(msg) {
  # Reports a non-fatal input data error to standard error, tagged with
  # the current {FILENAME} and {FNR}, followed by the current record.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  [[%s]]\n\n", $0 > "/dev/stderr";
}

function cetx_prog_error(msg) {
  # Reports a program (internal consistency) error to standard error,
  # sets the global {abort} to 1 and exits with that status.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}