#! /usr/bin/gawk -f
# Last edited on 2025-05-10 13:56:14 by stolfi

# Reads an EVT file as of release 16e6. Tries to convert it to an
# equivalent file in XEVT format. This includes parsing the simple
# glyphs that are presumably connected by ligatures into compound glyphs
# (like "Sh", "CTh", etc, but more general) and enclosing each separate
# glyph, simple or compound, into parens '()'.
#
# Lines that are '#'-comments are preserved. 
#
# In the text, the alignment fillers '!' and trailing blanks are deleted
# and any string of consecutive omitted-word markers '%' is condensed to
# a single '%'.  Comments embedded in the text with '{}' are preserved
# but must not contain '{}' or start with '{&'. Blanks are allowed
# inside comments and preserved but scrunched to single blanks.
#
# Also creates a file "cetx_weirdos.tbl" with a table that maps
# weirdo codes seen in the input to the corresponding codes 
# in the output.  A line locator is inserted in that file as a '#'-comment
# before the weirdos found on that line.
#
# Must be executed with "-f convert_evt_to_xevt_funcs.gawk".
#

BEGIN {
  # Exit status requested by the error handlers. It stays negative
  # until an error is flagged:
  abort = -1;

  # Counters: lines read, data lines written, page/unit/location
  # header lines, and blank-or-comment lines:
  nread = 0;
  ndata = 0;
  npages = 0;
  nunits = 0;
  nplaces = 0;
  ncomm = 0;

  # Weirdo numbering parameters. They are not referenced in this file;
  # presumably they are used by the functions loaded with "-f" (confirm there):
  cetx_old_weirdo_max = 199;
  cetx_new_weirdo_next = 400;

  # Debug controls. When {debug_step} is positive, the main rules trace
  # the lines seen while {ndata <= 50} and then those seen when {ndata}
  # is a multiple of {debug_step}. {debug_line} is the per-line trace flag:
  debug_step = 0;
  debug_line = 0;

  # Start the weirdo table file afresh, with just a title line:
  weirdos_table = "cetx_weirdos.tbl";
  printf "# Weirdo mapping table\n" > weirdos_table;
}

# Once an error has been flagged ({abort} is no longer negative),
# stop processing input and leave with that status:
abort >= 0 {
  exit abort;
}

# Every record that gets this far counts as a line read:
{
  nread++;
}

# Blank lines and '#'-comment lines are copied to the output as they are: 
/^[ ]*([#]|$)/ {
  ncomm++;
  print;
  next;
}

# Decide whether this line is to be traced, and if so echo the raw input.
# Tracing is enabled only when {debug_step} is positive; then it applies
# while {ndata <= 50} and whenever {ndata} is a multiple of {debug_step}:
{
  if (debug_step > 0) {
    if ((ndata <= 50) || ((ndata % debug_step) == 0)) { debug_line = 1; }
  }
  if (debug_line) {
    printf "%d: input =  [[%s]]\n", nread, $0 > "/dev/stderr";
  }
}

# Normalize whitespace: turn every TAB into a blank, then strip the
# leading and the trailing blanks of the line:
{
  gsub(/[\011]/, " ");
  sub(/^[ ]+/, "");
  sub(/[ ]+$/, "");
}

# For every line that contains a '<', append its first blank-delimited
# field (normally the locator) to the weirdos table, as a '#'-comment:
/</ {
  printf "# %s\n", $1 >> weirdos_table;
}

# EVT page attribute lines ("<fNNNx>" followed by "{$" or "{}") are
# written out unchanged:
/^<f[0-9]+[rv][0-9]?> *[{][$}]/ {
  npages++;
  wrout($0);
  next;
}

# EVT unit attribute lines ("<fNNNx.UNIT>" followed by "{$" or "{}") are
# written out unchanged:
/^<f[0-9]+[rv][0-9]?[.][A-Za-z0-9]+> *[{][$}]/ {
  nunits++;
  wrout($0);
  next;
}

# EVT location attribute lines ("<fNNNx.UNIT.LINE>" followed by "{$" or
# "{}") are written out unchanged:
/^<f[0-9]+[rv][0-9]?[.][A-Za-z0-9]+[.][A-Za-z0-9]+> *[{][$}]/ {
  nplaces++;
  wrout($0);
  next;
}
    
# Any other line that starts with '<' is taken to be a data line:
# a locator "<fNNNx.UNIT.LINE;TRANSCRIBER>" followed by the EVT text.
# The text is converted to XEVT and written after the locator, which is
# left-justified in an 18-column field.
#
# The variables set here are globals and keep their original names,
# since the functions loaded with "-f" may consult them.
/^</ {
  eline = $0;
  if (match(eline, /^<f[0-9]+[rv][0-9]?[.][A-Za-z0-9.]+[;][A-Za-z]+>/) == 0) {
    # Does not return: {data_error} exits.
    data_error("can't find locator");
    next;
  }

  # The pattern is anchored, so the locator must start at column 1:
  if (RSTART != 1) { prog_error("RSTART"); }

  # Split the line into the locator and the text that follows it:
  loc = substr(eline, RSTART, RLENGTH);
  etext = substr(eline, RSTART + RLENGTH);
  sub(/^[ ]+/, "", etext);

  # Restore the uppercase ligature letters, then convert to XEVT:
  etext = undo_lowercasing_of_ligatures(etext);
  xtext = cetx_convert_evt_to_xevt(etext);

  # NOTE(review): {xline} already ends with a newline and {wrout} emits
  # it with 'print', so each data line is followed by an empty line in
  # the output. Confirm that this is intended.
  xline = sprintf("%-18s %s\n", loc, xtext);

  # A result containing "[ERROR]" means the conversion failed; force a
  # debug trace of this line and stop after writing it:
  failed = (xline ~ /\[ERROR\]/);
  if (failed) { debug_line = 1; }
  wrout(xline);
  ndata++;
  debug_line = 0;
  if (failed) { exit 1; }
  next;
}

# Final actions: close the weirdos table, flush the output, and report
# the line counts on the standard error output. The pattern must be the
# special word END; any other identifier would be read as an
# uninitialized variable, which is always false, so the rule would
# never run.
END {
  close(weirdos_table);
  fflush();
  # If an error handler set {abort}, skip the summary and keep the status:
  if (abort > 0) { printf "aborted\n" > "/dev/stderr"; exit(abort); }
  printf "%6d lines read\n", nread > "/dev/stderr";
  printf "%6d comment lines\n", ncomm > "/dev/stderr";
  printf "%6d page header lines\n", npages > "/dev/stderr";
  printf "%6d section header lines\n", nunits > "/dev/stderr";
  printf "%6d location header lines\n", nplaces > "/dev/stderr";
  printf "%6d data lines written\n", ndata > "/dev/stderr";
}
    
function undo_lowercasing_of_ligatures(etext,   res,pref,code,suff,ini,fin,len) {
  # Undoes the EVT lowercasing of the EVA uppercase ligatures
  # "C[H]*h", "S[H]*h", and "C[TKFPZW]+[H]*h".
  #
  # Namely, every group consisting of 'c', zero or more of the letters
  # "tkfpzw", and one or more 'h', or of 's' and one or more 'h', is
  # converted to upper case except for its last 'h'. For example,
  # "ch" -> "Ch", "sh" -> "Sh", "cth" -> "CTh", "chh" -> "CHh".
  #
  # A group is left alone if it is immediately preceded by '(' or
  # immediately followed by ')', meaning that there is no ligation.
  # A group immediately preceded by 'c' or 's' is not matched either,
  # because those letters are excluded from the left context.
  #
  # Returns the converted copy of {etext}. The remaining parameters
  # are local work variables.

  res = "";
  while (match(etext, /([^(cs]|^)(c[tkfpzw]*[h]+|[s][h]+)([^h)]|$)/)) {
    # The match may include one context character on either side.
    # Find the first and last positions {ini,fin} of the group proper:
    ini = RSTART;
    fin = RSTART + RLENGTH - 1;
    if (substr(etext, ini, 1) !~ /[cs]/) { ini++; }
    if (substr(etext, fin, 1) != "h") { fin--; }
    len = fin - ini + 1;
    if (len < 1) { prog_error(("RLENGTH")); }

    # Trisect {etext}:
    pref = substr(etext, 1, ini - 1);
    code = substr(etext, ini, len);
    suff = substr(etext, fin + 1);
    if (substr(code, 1, 1) !~ /[cs]/)  { prog_error(("code = '" code "' beg char")); }
    if (substr(code, length(code), 1) != "h")  { prog_error(("code = '" code "' end char")); }

    # Uppercase all but the last 'h'. The group is known to end with
    # 'h', so it suffices to uppercase everything before that letter:
    code = (toupper(substr(code, 1, length(code) - 1)) "h");

    # Move on. The right context character, if any, stays in {suff},
    # so it can serve as the start of the next group:
    res = (res pref code);
    etext = suff;
  }
  res = (res etext);

  return res;
}

function wrout(xline) {
  # Writes {xline} to the standard output with 'print'.
  # When the global {debug_line} is set, first reports the current
  # input line {$0} and {xline} on the standard error output, both
  # tagged with the input line count {nread}.
  if (! debug_line) {
    print xline;
    return;
  }
  printf "convert_evt_format_to_xevt_format.gawk:\n" > "/dev/stderr";
  printf "%d: input  = [[%s]]\n", nread, $0 > "/dev/stderr";
  printf "%d: output = [[%s]]\n\n", nread, xline > "/dev/stderr";
  print xline;
}    

function data_error(msg) {
  # Reports an error in the input data: prints the file name, the line
  # number, and {msg} to the standard error output, followed by the
  # current line {$0}. Then sets {abort} and exits with status 1.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit(1);
}

function prog_error(msg) {
  # Reports an internal inconsistency of the program: prints {msg} to
  # the standard error output, sets {abort}, and exits with status 1.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(1);
}