#! /bin/gawk -f
# Last edited on 2025-07-03 09:33:17 by stolfi

# ??? TODO: Update to new IVTFF-like format.

# To be included in other gawk programs. Defines a function that
# converts the EVA-like code used in the interlinear version
# 16e6 to XEVA with each compound glyph surrounded by parentheses.
# A /glyph/ is defined as a maximal group of strokes that are connected
# (or which very likely were meant to be connected but got disconnected
# by accident).  Moreover, non-numeric weirdo codes like "c{&c~}" or "*{&foo}"
# are replaced by numeric "&NNN" ones.

# Also appends to file "cetx_weirdos.tbl" one line for each weirdo code generated
# in the resulting string, with the input (possibly non-numeric) and output 
# (always numeric).

function cetx_convert_evt_to_xevt(lin,  res,xel,enext,xnext,xprev,nsimp) {

  # Converts the EVT text line {lin} to XEVA and returns the result.
  #
  # This function first removes from {lin} any trailing '#'-comment and
  # all '!' alignment fillers, converts tabs to blanks, and strips
  # leading and trailing blanks. It also condenses any string of two or
  # more consecutive '%' to a single '%', any string of two or more ','
  # to a single ',', and any string of '.' and ',' that contains a '.'
  # to a single '.'.
  #
  # NOTE(review): blanks embedded in the middle of the line are NOT
  # condensed or removed here, and the token parser does not accept
  # them, so each one is reported as a syntax error -- confirm intent.
  # 
  # Then it parses {lin}, from left to right, as a sequence of tokens 
  # satisfying these patterns (in priority order):
  # 
  #   A word, line, or parag delimiter in [-=.,].
  #
  #   A sequence of one or more simple glyph codes that are ligated together.
  #
  #   A missing word marker '%'.
  # 
  #   A comment in braces (that is not a weirdo code, see below)
  #
  # SIMPLE GLYPH CODES
  #
  # A simple glyph code in the input string may be any letter [a-z] or
  # [ACEFHIKO-TWYZ], or '*' to denote an unreadable glyph, both possibly
  # followed by a weirdo code. The capital letters [BDGJLMNUVX] are
  # invalid in the input string.
  #
  # A letter or '*' followed by a weirdo code is assumed to be a
  # placeholder for the weirdo and thus discarded.
  # 
  # Otherwise each letter in the input string denotes a simple glyph as
  # in the EVA encoding extended to the whole alphabet [a-z] as
  # explained elsewhere. Capital letters [AHIORS] denote the same glyph
  # as the lowercase letter, but with a horizontal ligature line
  # extending to the right from the top of the glyph. In the case of
  # 'R' and 'S' the plume of 'r' and 's' is displaced to the right, to
  # the approximate midpoint of this ligature line.
  #
  # For compatibility with EVA, the letters 'c' and 'C' denote an EVA
  # @e with that same top right ligature line.
  #
  # The letter 'E' denotes an EVA @e glyph WITHOUT the top ligature
  # line but with an extra long lower end that can connect to some other
  # glyphs from below. 
  #
  # As in EVA, the letter 'h' denotes an @e with a horizontal ligature
  # line extending to the LEFT at the top; and 'H' is an @e with both
  # left and right top ligature lines.
  #
  # The letter 'Q' denotes the EVA @q glyph but with a longer
  # horizontal stroke and raised until that stroke is level with the top
  # of an @o, so that it serves as the right ligature line of other
  # capital letters.
  # 
  # As in EVA, input codes [ktfpzw] denote the plain gallows, where 'z'
  # and 'w' denote the same as 'f' and 'p' respectively but with a
  # @e-like hook at the end of the horizontal arm; and [KTFPZW] are
  # the same with the legs slashed by a horizontal stroke at
  # @o-height, which will be the floor of the gallows' "platform".
  #
  # A weirdo code is either [{][&][0-9][0-9][0-9][}] denoting a 
  # weirdo glyph with that three-digit number (defined elsewhere), or 
  # [{][&][^{}]*[}] denoting some ad-hoc encoding of the weirdo, 
  # like "{&O'}" for an EVA @o with a plume and a top right ligature.
  # In both cases the braces are stripped. In the second case,
  # the sequence between '&' and '}' is mapped to a three-digit code.
  # Either way, the '&' and the three digits
  # are then treated like a letter, as above.
  #
  # LIGATURES OF SIMPLE GLYPHS
  # 
  # Consecutive simple glyphs can be joined into a single XEVA glyph
  # code only if their shapes are such that they are connected
  # together. Those that have a top right ligature line, as well as
  # slashed gallows with input codes [KTFPZW], will automatically
  # connect to any subsequent simple glyph with any of the codes
  # [a-eg-jl-orsu] or their upper-case versions, as well as with the
  # slashed gallows [KTFPZW].
  #
  # The interglyph space code ',' can be used in the input, if necessary, to
  # prevent the joining of simple glyphs even when they would normally 
  # join.
  # 
  # This function assumes that, as in original EVA, benches and platform
  # gallows are written /[CS][H]*h/ and /C[TKFPZW]+[H]*h/. 
  # The "lowercased EVA" notations /csh+/ and /c[tkfpzw]+h+/ must be 
  # converted to uppercase before calling this function.  (But beware
  # of cases like when "cth" actually means a "CTh" without the slash 
  # across the "t".)
  #
  # The simple glyph 'E' connects to any subsequent glyph with a round
  # left outline, namely those with codes [a-egosuy] or their uppercase
  # variants that are valid. Otherwise the input code 'E' is
  # meant to be converted to lowercase 'e'.
  #
  # NOTE(review): the code below does not perform that 'E' -> 'e'
  # conversion; an unligated 'E' is emitted as 'E' -- confirm intent.
  #
  # For compatibility with EVA, the input codes 'c' and 'C' denote the same
  # simple glyph, an @e with top right ligature. Either character is
  # converted to 'c' if not ligated to a subsequent glyph, or to 'C' if
  # ligated.
  # 
  # The codes 'h' and 'H' will connect with any PRECEDING simple
  # glyph with codes [acehioqrsy] or their uppercase versions, in which
  # case this preceding code is automatically converted to uppercase.
  # They also connect to preceding slashed gallows [KTFPZW]. 
  # This rule is applied after the EVT sequences /^c[ktfpzw]*/
  # and 's' preceding the 'h' are converted to upper-case,
  # as explained above.
  # 
  # The interpretation of 'c', 'C', and 'E' is unnecessarily complex.
  # By logic, 'E' (not 'c' or 'C') should be 'e' with a top right
  # ligature line. Then 'c' (not 'E') could be the bottom-connecting
  # 'e', and 'C' would be invalid. But the rules above were chosen in
  # order to maintain compatibility with the original EVA encoding.
  #
  # The input codes '*' and '%' do not connect to any others and 
  # therefore each occurrence is parsed as a token by itself.
  #
  # A few weirdos may connect to the right (like 'c' and capital
  # input codes) or to the left (like 'h' and 'H'). The ligature
  # behavior of each weirdo is defined internally.
  #
  # OUTPUT XEVA LINE
  #
  # The output string is a list of XEVA glyph codes, punctuation signs 
  # [-=.,] and '{}'-comments.  Every punctuation sign, '%' marker and
  # '{}'-comment of the (normalized) input is copied to the output,
  # whether or not a glyph code precedes it.
  #
  # Punctuation codes have the same meaning as in the EVT encoding.
  # Ideally, if '{}' comments are ignored, there should be no two
  # punctuation signs in a row, and '=' should occur only at the end of
  # the line. As in the EVT encoding, the '-' code may appear in the
  # middle of a line if the text is interrupted by a figure or some
  # other obstacle.
  #
  # A XEVA glyph code is either "%" denoting one or more words that were
  # omitted by the transcriber, or a /simple glyph/ code, or a /compound
  # glyph/ code consisting of a sequence of two or more simple glyph
  # codes enclosed in '()'. Each simple glyph code is either a letter or
  # a three-digit numeric weirdo code /[&][0-9][0-9][0-9]/ (without
  # braces). In either case, the meaning of these simple glyph codes is
  # the same as in the input string (original EVA extended with letters 
  # [bcgju-zAEIHOQRWYZ]).
  # 
  # In any compound XEVA glyph code (that is, one that has two or more
  # simple glyph codes) every pair of adjacent simple glyph codes must
  # naturally connect to each other,
  # and the automatic case conversions described above must have been
  # applied. Thus '(RCTHHo)' is valid but '(Cv)' and '(cth)' are not.
  # The simple glyphs @q, @v, and @x cannot connect to anything, but
  # 'Q' can connect to the right like 'c' and other capitals. In XEVA
  # (unlike the "lowercased EVA" used in EVMT 16e6) the lowercase
  # letters, including 'c', cannot connect to the right. Uppercase
  # letters can connect to the right but only to some letters.
  # In particular, only the last simple glyph code of a compound
  # XEVA glyph code may be a lowercase letter.
  
  # If the parsing runs into an invalid syntax in the input string, the
  # function appends "[ERROR]" to the XEVA glyph code, instead of a XEVA
  # simple glyph code, discards the next character of the input, and keeps
  # trying to parse.
  
  # The function uses global variables {cetx_old_weirdo_max} and
  # {cetx_new_weirdo_next}. When a non-numeric weirdo code is
  # encountered, its code is replaced by {cetx_new_weirdo_next}, which is 
  # incremented.  When a numeric
  # weirdo code is encountered, its value must be no greater than
  # {cetx_old_weirdo_max}.
  #
  # Whenever a weirdo code "&NNN" is appended to the result string,
  # a line is appended to file "cetx_weirdos.tbl" with format "{OLD} {NEW}" 
  # where {OLD} is the original representation of the weirdo in the input 
  # (possibly non-numeric) and {NEW} is the resulting code "&NNN".
  
  # Ensure that the globals are numeric and OK:
  cetx_old_weirdo_max += 0;
  cetx_new_weirdo_next += 0;
  if (cetx_new_weirdo_next <= cetx_old_weirdo_max) {
    cetx_prog_error(("counters " cetx_old_weirdo_max " " cetx_new_weirdo_next));
  }

  # Remove '#'-comment if any:
  gsub(/[#].*$/, "", lin);

  # Remove alignment fillers: 
  gsub(/[!]/, "", lin);

  # Normalize blanks:
  gsub(/[\011]/, " ", lin);
  gsub(/^[ ]+/, "", lin);
  gsub(/[ ]+$/, "", lin);

  # Reduce multiple '%' to single '%':
  gsub(/[%][%]+/, "%", lin);

  # Reduce multiple '.' and ',' to single item:
  gsub(/[,.]*[.][,.]*/, ".", lin);
  gsub(/[,][,]+/, ",", lin);
  
  res = "";      # Converted line so far (completed tokens only).
  xel = "";      # Current incomplete XEVA glyph code.
  xprev = "";    # XEVA code of previous simple glyph in {xel}, or "" if none.
  nsimp = 0;     # Number of simple glyph codes in {xel}.     

  while (lin != "") {
    # Parse the next input code, setting {RSTART,RLENGTH}:
    if (! cetx_parse_next_evt_glyph(lin)) {
      cetx_data_error(("invalid EVT text line syntax '" substr(lin, 1, 6) "...'")); 
      # Mark the error in the current glyph code and skip one input character:
      xel = (xel "[ERROR]"); lin = substr(lin, 2);
      continue;
    }
    if (RSTART != 1) { cetx_prog_error(("RSTART!=1 lin = '" lin "'")); }
    enext = substr(lin, RSTART, RLENGTH); # Next EVT token, possibly with weirdo subst.
    lin = substr(lin, RSTART + RLENGTH);

    if (enext ~ /^([-=.,%]|[{]([^&{}][^{}]*|)[}])$/) {
      # Punctuation, missing word, or embedded '{}'-comment (possibly empty).
      # Close the pending glyph code, if any, and ALWAYS copy the token, 
      # so that it is not lost when no glyph code precedes it:
      res = (res cetx_close_xeva_glyph(xel, nsimp) enext);
      xel = ""; xprev = ""; nsimp = 0;
    } else {
      # {enext} must be a single letter possibly followed by a weirdo code. 
      # Apply weirdo replacement and convert EVT '{&...}' to XEVA '&NNN':
      xnext = cetx_convert_simple_eva_glyph_to_xeva(enext);
      if (xnext == "[ERROR]") { cetx_data_error(("EVA to XEVA conversion error")); }
      if ((nsimp >= 1) && cetx_glyphs_will_connect(xprev, xnext)) {
        # Ligate: everything before the last simple glyph becomes uppercase.
        xel = (toupper(xel) xnext); nsimp++;
      } else {
        # Start a new glyph code after closing the pending one, if any:
        res = (res cetx_close_xeva_glyph(xel, nsimp));
        xel = xnext; nsimp = 1;
      }
      xprev = xnext;
    }
  }
  # Finish off last XEVA glyph code, if any:
  res = (res cetx_close_xeva_glyph(xel, nsimp));
  return res;
}

function cetx_close_xeva_glyph(xel, nsimp) {
  # Private helper of {cetx_convert_evt_to_xevt}: returns the finished
  # form of the pending XEVA glyph code {xel}, which is {xel} enclosed in
  # '()' if it has {nsimp >= 2} simple glyph codes, and {xel} itself
  # (possibly empty) otherwise.
  return (nsimp >= 2 ? ("(" xel ")") : xel);
}

function  cetx_parse_next_evt_glyph(lin) { 
  # Tries to match the beginning of {lin} to a valid EVT token: a 
  # punctuation sign in [-=.,], the missing word marker '%', a 
  # '{}'-comment (possibly empty, but not starting with '&'), or a
  # simple glyph code ('*' or a letter) optionally followed by a 
  # weirdo code '{&...}'.
  # 
  # If it succeeds, sets {RSTART,RLENGTH} to the extent of the token
  # (with {RSTART == 1}) and returns 1.  If it fails, returns 0, and 
  # {RSTART,RLENGTH} should be considered undefined.
  # 
  # A separate test for inline comments without '<>' is not needed:
  # any such comment is already matched by the first pattern below.
  
  if (match(lin, /^([-=.,%]|[{]([^&{}][^{}]*|)[}])/)) {
    # Punctuation, missing word, or comment: 
    return 1;
  }
  if (match(lin, /^[*a-zA-Z]([{][&][^{}]*[}]|)/)) {
    # Simple glyph code, possibly followed by a weirdo code:
    return 1;
  }
  return 0;
}

function cetx_convert_simple_eva_glyph_to_xeva(esym,  head,wcode) {
  # The argument {esym} should be "*" or a single letter denoting a
  # simple glyph in EVT notation, possibly followed by a weirdo code
  # like "{&312}" or "{&O'}".  Returns the XEVA representation of {esym}:
  #
  #   * if a weirdo code follows the first character, that character is
  #     only a placeholder and the result is whatever 
  #     {cetx_normalize_weirdo_code} returns for the weirdo code
  #     (which also logs the weirdo to "cetx_weirdos.tbl");
  #
  #   * otherwise the result is the first character itself, except that
  #     'C' is returned as 'c'. The caller turns it back into 'C' when
  #     it ligates to the right.
  #
  # If the first character is not a valid EVT glyph letter, prints a 
  # message and returns "[ERROR]".

  head = substr(esym, 1, 1);
  if (head !~ /^[*a-zACEFHIKO-TWYZ]$/) {
    cetx_data_error(("invalid letter code '" head "' in '" esym "'"));
    return "[ERROR]";
  }

  wcode = substr(esym, 2);
  if (wcode != "") {
    # The rest must be a weirdo code in braces, which replaces {head}:
    return cetx_normalize_weirdo_code(wcode);
  }

  # Plain letter or '*':
  return (head == "C" ? "c" : head);
}

function cetx_normalize_weirdo_code(code,  res,num) {
  # The argument must be a three-digit weirdo code, like "&301", or a
  # non-numeric code, like "&O'", possibly enclosed in braces. 
  #
  # A numeric code is returned unchanged (without braces), provided its 
  # value does not exceed the global {cetx_old_weirdo_max}. A 
  # non-numeric code is replaced by "&" followed by the current value 
  # of the global {cetx_new_weirdo_next}, formatted as three digits; 
  # that counter is then incremented.  Note that every occurrence of a
  # non-numeric code consumes a fresh number, even if the same code 
  # was seen before.
  #
  # In case of error (numeric code too large, counter above 999, or 
  # malformed code), prints a message and returns "[ERROR]" instead.
  
  # In all cases, appends a line "{code} {result}" to "cetx_weirdos.tbl",
  # where {code} is the argument as given.
  
  res = code;
  if (res ~ /^[{].*[}]$/) {
    # Strip the enclosing braces:
    res = substr(res, 2, length(res) - 2);
  }
  if (res ~ /^[&][0-9][0-9][0-9]$/) {
    # Numeric weirdo: must be one of the pre-existing codes.
    num = substr(res, 2) + 0;
    if (num > cetx_old_weirdo_max) {
      cetx_data_error(("invalid numeric weirdo code '" res "'")); res = "[ERROR]";
    }
  } else if (res ~ /^[&][^{}&]*$/) {
    # Non-numeric weirdo: assign the next free numeric code.
    num = cetx_new_weirdo_next; 
    cetx_new_weirdo_next++;
    if (num > 999) { 
      cetx_data_error(("too many non-num weirdos")); res = "[ERROR]";
    } else {
      res = sprintf("&%03d", num);
    }
  } else {
    cetx_data_error(("invalid weirdo code '" res "' in input")); res = "[ERROR]";
  }
  printf "%s %s\n", code, res >> "cetx_weirdos.tbl";
  return res;
}

function cetx_glyphs_will_connect(xprev, xnext) {
  # Returns 1 if the simple XEVA glyph code {xprev} ligates with the
  # simple glyph code {xnext} that immediately follows it, 0 otherwise.
  #
  # Only the last character of {xprev} and the first character of 
  # {xnext} are examined. Thus numeric weirdo codes "&NNN" and 
  # "[ERROR]" markers never connect on either side.

  # A glyph with a top right ligature line ('c' or any capital other
  # than 'E') connects to [a-eg-jl-orsu], to their uppercase versions,
  # and to the slashed gallows [KTFPZW]:
  if ((xprev ~ /[cA-DF-Z]$/) && (xnext ~ /^[a-eg-jl-orsuA-EG-JL-ORSUFKPTWZ]/)) {
    return 1;
  }
  # The bottom-connecting 'E' connects to the glyphs in this set:
  if ((xprev ~ /[E]$/) && (xnext ~ /^[a-egosuyA-EFGKOPQSTUWYZ]/)) {
    return 1;
  }
  return 0;
}

function cetx_data_error(msg) {
  # Reports a problem with the input data on standard error, without
  # stopping the program. The report has the current {FILENAME} and
  # {FNR}, the message {msg}, and then the current record {$0} 
  # enclosed in '[[...]]', followed by a blank line.
  printf "%s:%d: ** %s\n  [[%s]]\n\n", FILENAME, FNR, msg, $0 > "/dev/stderr";
}

function cetx_prog_error(msg) {
  # Reports an internal inconsistency (program bug) on standard error
  # and terminates the program with exit status 1. 
  # 
  # Also sets the global variable {abort} to 1 before exiting;
  # presumably the including program tests it in its END rule 
  # -- verify against the caller.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(1);
}