#! /usr/bin/python3
# Last edited on 2026-04-29 10:28:15 by stolfi

import os, sys, re
from sys import stderr as err
from math import sqrt, hypot, exp, log, sin, cos, floor, ceil
from math import inf, isfinite, nan, isnan

from process_funcs import basic_line_loop
from error_funcs import file_line_error

def main(dir, tsize):
    # Creates a dictionary mapping SBJ words with {tsize} hanzi
    # characters, to their apparent Voynichese equivalents, from the
    # ".dic" files created by "*_entry_src.py". Also builds a graph that
    # shows agreement between the tentative assignments of SPS parags to a
    # set of SBJ entries.
    #
    # Specifically, reads a file "{dir}/all_{tsize}_raw.dic" produced by
    # {merge_all_dics.sh}. Writes a graph "{dir}/all_{tsize}.gra" showing
    # the apparent agreements/disagreements between SBJ-SPS parag
    # assignments. Also writes a condensed dictionary
    # "{dir}/all_{tsize}_gra.dic" with pairs hanzi-Voynichese that seem to be
    # attested by at least two SBJ-SPS assignments.
    #
    # The vertices of the graph are SBJ entry codes, like 'ROOS' or 'IRON'.
    #
    # An edge is a pair {(score, glosses)}. There is at most one edge
    # between two vertices.
    #
    # In the edge between {code1} and {code2}, the {score} is a numeric
    # score that tells how much the SPS parags assigned to {code1} and
    # {code2} agree on the Voynichese translation of certain hanzi strings
    # that occur in both SBJ entries.
    #
    # The {glosses} is a list of triples {(tch, s1, s2)}. Each triple means
    # that the hanzi string {tch} occurs in both entries {code1} and
    # {code2}, and the parags assigned to those two entries have similar
    # strings {s1} and {s2} at about the right places.
    #
    # If {tsize} is positive, the string {tch} and the strings {s1} and
    # {s2} are taken from the strings (the /gaps/) between corresponding
    # keywords (the /hits/). The hits themselves are excluded from the
    # glosses. If {tsize} is zero, the strings {s1} and {s2} are
    # corresponding hits.
    #
    # If entries {code1} and {code2} have a shared hanzi string {tch}
    # but the assigned SPS parags don't have similar translations
    # for it, then {s1} and {s2} are {None}.
    #
    # The graph is represented as a dict of dicts. The edge {(score,
    # glosses)} between vertices {code1} and {code2} is stored in
    # {graph[code1][code2]} which should be equal to {graph[code2][code1]}
    # except that the fields {s1} and {s2} of all glosses are swapped.
    #
    # A positive score means that most of the hanzi strings that occur in
    # both SBJ entries have corresponding glosses on the assigned SPS
    # parags that mostly agree. A negative score means that most of those
    # shared hanzi strings do not seem to have matching translations in
    # the assigned SPS entries.
    #
    # If two SBJ entries have no strings in common, the score will be zero.
    #
    # Writes the graph to {gra_file} and the implied dictionary to
    # {gra_dic_file}.

    raw_dic_file = f"{dir}/all_{tsize}_raw.dic"
    rd = open(raw_dic_file, "r")
    rd.reconfigure(encoding='utf-8')

    # With {tsize == 0} the clips are whole hits and must match end-to-end:
    full_match = (tsize == 0)

    # Regex fragments for the fields of a raw dic line, in order:
    p_code_ch = r"([A-Z0-9]+)"        # SBJ entry code.
    p_loc_ch = r"([.a-z0-9]+)"        # Locus of the hanzi string in the SBJ entry.
    p_loc_ec = r"([.a-z0-9]+)"        # Locus ID of the assigned SPS parag.
    p_text_ch = r"([^\000-\377]+)"    # Hanzi string (any non-Latin-1 chars).
    p_text_ec = r"([·?a-z]+)"         # EVA clip from the SPS parag.
    p_bar = "[ ]*[|][ ]*"             # Field separator.
    p_line = \
        f"{p_code_ch}{p_bar}{p_loc_ch}{p_bar}{p_loc_ec}{p_bar}" + \
        f"{p_text_ch}{p_bar}{p_text_ec}{p_bar}"

    clips = []           # Triples {(code_ch, loc_ec, text_ec)} of the current hit block.
    cur_text_ch = None   # Hanzi string shared by all lines of the current block.
    graph = dict()       # The agreement graph being built.

    def process_line(nread, line):
        # Handles one line of the raw dic file. Blank lines terminate a
        # block of clips that share the same hanzi string; non-blank
        # lines must match {p_line} and are accumulated into {clips}.
        nonlocal clips, cur_text_ch, graph
        line = line.strip()

        def data_error(msg):
            # Reports a data format error at line {nread} of the input file.
            # NOTE(review): execution continues after this call in the
            # "invalid dic line format" branch, so {file_line_error} is
            # presumably expected to abort — confirm in {error_funcs}.
            file_line_error(raw_dic_file, nread, msg, line)
            return
        # ..................................................................

        if line == "":
            # End of a block of clips.  Guard against empty blocks
            # (leading or consecutive blank lines), consistently with the
            # end-of-file handling in the caller below; an empty block
            # would trip the {nh >= 2} assertion in {process_clips}.
            if len(clips) != 0:
                process_clips(graph, cur_text_ch, clips, full_match)
            clips = []
            cur_text_ch = None
            return
        m = re.fullmatch(p_line, line)
        if m is None: data_error("invalid dic line format")
        code_ch = m.group(1)
        loc_ch = m.group(2)    # Parsed but currently unused.
        loc_ec = m.group(3)
        text_ch = m.group(4)
        text_ec = m.group(5)
        if cur_text_ch is None:
            cur_text_ch = text_ch
        else:
            if text_ch != cur_text_ch:
                data_error(f"inconsistent ch text {cur_text_ch = !r} {text_ch = }")
        clips.append((code_ch, loc_ec, text_ec,))
        return
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    nread = basic_line_loop(rd, process_line)
    err.write(f"!& {nread = !r}\n")
    rd.close()
    # Flush a final block that was not terminated by a blank line:
    if len(clips) != 0:
        process_clips(graph, cur_text_ch, clips, full_match)

    gra_file = f"{dir}/all_{tsize}.gra"
    write_graph(gra_file, graph)

    gra_dic_file = f"{dir}/all_{tsize}_gra.dic"
    write_global_dic(gra_dic_file, graph)
    return
# ----------------------------------------------------------------------

def process_clips(graph, text_ch, clips, full_match):
    # This function takes a list {clips} of triples {(cd,loc_ec,text_ec)} where {cd}
    # is the 4-letter code from an SBJ entry, {loc_ec} is the locus ID
    # of the SPS parag assigned to it, and {text_ec}
    # is a fragment from the EVA text of the SPS parag assumed to correspond
    # to that entry, extracted from the position that corresponds to
    # the point in that entry where the hanzi string {text_ch} occurs.
    #
    # Saves {loc_ec} to {graph[code1]['loc_ec']}.
    #
    # For each pair of such clips {(cd1,loc1,tx1)} and {(cd2,loc2,tx2)}, computes a {score}
    # that measures the similarity of the two EVA strings with {compute_dic_score(tx1,tx2,full_match)},
    # and extracts from them two substrings {s1,s2} that seem to match.
    # Let {graph[cd1][cd2]} be {(old_score, glosses)}.  Updates the graph
    # by adding the {score} to {old_score} and, if {score} is positive,
    # appending the gloss {(text_ch, s1, s2)} to {glosses}.

    debug = False

    def bump(loc1, cd1, s1, cd2, s2, score):
        # Adds {score} to the edge {graph[cd1][cd2]}, creating vertex
        # {cd1} (with its parag locus {loc1}) and the edge as needed.
        # Records the gloss {(text_ch,s1,s2)} on a positive score, or a
        # failed gloss {(text_ch,None,None)} otherwise.
        if cd1 not in graph:
            graph[cd1] = dict()
            graph[cd1]['loc_ec'] = loc1
        else:
            # Each SBJ code must always come with the same SPS parag locus:
            assert graph[cd1]['loc_ec'] == loc1
        if cd2 not in graph[cd1]:
            graph[cd1][cd2] = ( 0.00, [ ] )
        score_old, glosses = graph[cd1][cd2]
        if score > 0:
            glosses.append((text_ch, s1, s2))
        else:
            glosses.append((text_ch, None, None))
        graph[cd1][cd2] = ( score_old + score, glosses )
        if debug: err.write(f"!& bump {cd1} {cd2} {graph[cd1][cd2][0]}\n")
        return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    nh = len(clips)
    assert nh >= 2, "hit block with less than 2 lines"
    # Compare every unordered pair of clips in the block:
    for ih1 in range(nh):
        for ih2 in range(ih1):
            cd1, loc1, tx1 = clips[ih1]
            cd2, loc2, tx2 = clips[ih2]
            score, s1, s2 = compute_dic_score(tx1, tx2, full_match)
            if debug: err.write(f"{cd1} {cd2} += {score:+8.3f} {s1 = !r} {s2 = !r}\n")
            if isfinite(score):
                assert s1 is not None and s2 is not None
                # Record the result symmetrically on both vertices:
                bump(loc1, cd1, s1, cd2, s2, score)
                bump(loc2, cd2, s2, cd1, s1, score)
    return
# ----------------------------------------------------------------------

def write_graph(gra_file, graph):
    # Writes {graph} to file {gra_file}: for each vertex, one header line
    # with its code, parag locus, and total score, followed by one line
    # per incident edge.

    wr = open(gra_file, "w")
    wr.reconfigure(encoding='utf-8')
    wr.write("# -*- coding: utf-8 -*-\n")

    def write_edge(edge):
        # The {edge} must be a pair {(score, glosses)}
        # where the glosses are triples {(tch,s1,s2)}
        # where {tch} is a hanzi string that occurs in the SBJ entries
        # of both vertices, and {s1} and {s2} are the corresponding
        # similar strings in the SPS parags assigned to them.
        # Or {None} if there are no such similar strings.
        sc12, gls12 = edge
        wr.write(f" {cd1} {cd2} {sc12:+8.3f}")
        for tch, s1, s2 in gls12:
            # {s1} and {s2} are either both {None} or both strings:
            assert (s1 == None) == (s2 == None)
            wr.write(f" {tch}")
            if s1 == None:
                wr.write(" !!")
            else:
                wr.write(f" {s1}")
                if s2 != s1: wr.write(f":{s2}")
        wr.write("\n")
        return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    codes1 = sorted(tuple(graph.keys()))
    for cd1 in codes1:
        codes2 = sorted(tuple(graph[cd1].keys()))
        # Compute and write the total score of vertex {cd1}:
        tot_sc12 = 0
        loc1 = None
        for cd2 in codes2:
            if cd2 == 'loc_ec':
                # Pseudo-neighbor holding the parag locus, not an edge:
                loc1 = graph[cd1][cd2]
            else:
                sc12, gls12 = graph[cd1][cd2]
                tot_sc12 += sc12
        wr.write(f"{cd1} {loc1} {tot_sc12:+8.3f}\n")
        # Write the edges out of {cd1}:
        for cd2 in codes2:
            if cd2 != 'loc_ec':
                write_edge(graph[cd1][cd2])
        wr.write("\n")
    wr.close()
    return
# ----------------------------------------------------------------------

def write_global_dic(gra_dic_file, graph):
    # Collects from {graph} the inferred hanzi-EVA dictionary and writes
    # it to {gra_dic_file}: for each hanzi string, its failure count and
    # one line per attesting pair of SBJ entries.

    # These dicts are indexed by a hanzi substring:
    defs = dict()   # Voynichese equivalents.
    bads = dict()   # Count of times when failed to get a match.
    # Each {defs[tch]} is a list of tuples {(s1_ec, cd1, s2_ec, cd2)}.

    codes1 = sorted(tuple(graph.keys()))
    for cd1 in codes1:
        codes2 = sorted(tuple(graph[cd1].keys()))
        for cd2 in codes2:
            # Visit each edge only once, from its lexicographically
            # smaller endpoint:
            if cd2 != 'loc_ec' and cd1 <= cd2:
                sc12, gls12 = graph[cd1][cd2]
                for tch, s1, s2 in gls12:
                    if tch not in defs:
                        defs[tch] = []
                        bads[tch] = 0
                    assert (s1 == None) == (s2 == None)
                    if s1 == None:
                        bads[tch] += 1
                    else:
                        defs[tch].append((s1, cd1, s2, cd2))

    # Write dictionary out:
    wr = open(gra_dic_file, "w")
    wr.reconfigure(encoding='utf-8')
    wr.write("# -*- coding: utf-8 -*-\n")
    strings_ch = sorted(tuple(defs.keys()))
    for tch in strings_ch:
        dfs = sorted(defs[tch])
        wr.write(f"{tch}\n")
        if bads[tch] > 0: wr.write(f" {bads[tch]:4d} failures")
        wr.write("\n")
        for s1, cd1, s2, cd2 in dfs:
            sec = s1 if s1 == s2 else f"{s1}={s2}"
            wr.write(f" {cd1} {cd2} {sec}\n")
        wr.write("\n")
    wr.close()
    return
# ----------------------------------------------------------------------

def compute_dic_score(tx1, tx2, full_match):
    # Looks for the "best" common substring of EVA texts {tx1,tx2}.
    # Specifically, looks for starting indices {j1,j2} and limit indices {i1,i2}
    # such that {s1=tx1[j1:i1]} and {s2=tx2[j2:i2]} have maximum similarity score {sc}.
    # Returns {sc,s1,s2}, or {-inf,None,None} if there is no valid path.
    #
    # If {full_match} is true the strings must be matched in their
    # entirety (the path must span both strings end to end).
    #
    # The texts {tx1} and {tx2} should not contain [-., ].
    # Ignores prefixes and suffixes of {tx1} and {tx2} with are all '·'.
    #
    # NOTE(review): a comment in the original said "NOT: Before the
    # comparison, maps 'ir' to 'iin' and then 'iii' to 'ii'", but
    # {cleanup} below DOES perform those substitutions — confirm which
    # is intended.

    debug = False

    def cleanup(tx):
        # Normalizes an EVA text before comparison.
        assert re.search(r"[-., ]", tx) is None, "bug invalid tx"
        tx = re.sub(r"ir", "iin", tx)
        tx = re.sub(r"iii", "ii", tx)
        return tx
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    tx1 = cleanup(tx1); nt1 = len(tx1)
    tx2 = cleanup(tx2); nt2 = len(tx2)
    if debug:
        err.write(f"!= {tx1 = !r} (cleaned)\n")
        err.write(f"!= {tx2 = !r} (cleaned)\n")

    # Dynamic programming tableau:
    tab = [ None ] * (nt1 + 1)
    for i1 in range(nt1+1): tab[i1] = [ (-inf,None,None) ] * (nt2+1)
    # Each entry {tab[i1][i2]} of the tableau is {(sc,k1,k2)} meaning that
    # the best path that ends at vertex {(i1,i2)} of the dyn prog graph
    # has score {sc} and is the best path that ends at vertex {(k1,k2)}
    # plus the edge from {(k1,k2)} to {(i1,i2)}. If {k1} and {k2} are
    # {None} it means that the best path starts at {(i1,i2)} itself.

    start_points = -2.00  # Penalty for starting a new substring.
    skip_points = -0.75   # Penalty for skipping a letter.
    match_points = +1.00  # Reward for letters that match.
    simil_points = +0.75  # Reward for letters with matching classes.
    diff_points = -1.00   # Penalty for letters that dont match.

    def eva_class(tec):
        # EVA class of commonly confused characters:
        if tec == "a" or tec == "o" or tec == "y":
            return "o"
        elif tec == "k" or tec == "d" or tec == "t" or tec == "p" or tec == "f":
            return "k"
        elif tec == "c" or tec == "s":
            return "c"
        elif tec == "h" or tec == "e":
            return "e"
        else:
            return tec
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    def eva_char_char_points(tec1, tec2):
        # Similarity points for a pair of EVA characters.
        if tec1 == tec2:
            return match_points
        else:
            z1 = eva_class(tec1)
            z2 = eva_class(tec2)
            if z1 == z2:
                return simil_points
            else:
                return diff_points
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    def step_score(k1,k2,i1,i2):
        # Score of the best path that ends at {(i1,i2)} coming from the
        # adjacent vertex {(k1,k2)}, or {-inf} if that move is not valid.
        assert k1 == i1 or k1 == i1-1
        assert k2 == i2 or k2 == i2-1
        if k1 < 0 or k2 < 0: return -inf
        if debug: err.write(f"!= {k1 = } {k2 = } {i1 = } {i2 = }\n")
        vsc = tab[k1][k2][0];
        if vsc == -inf: return -inf
        # Compute the raw edge points {esc}:
        esc = None
        if k1 == i1 or k2 == i2:
            # Horizontal or vertical move: a letter was skipped.
            esc = skip_points
        else:
            # Diagonal move: compare the two letters.
            tec1 = tx1[k1]; tec2 = tx2[k2]
            # Compute the position weight, favoring comparisons near the
            # middles of both strings and at matching relative positions:
            f1 = (k1+1)/(nt1+1); w1 = 4*f1*(1 - f1)
            f2 = (k2+1)/(nt2+1); w2 = 4*f2*(1 - f2)
            f12 = (f1 - f2); w12 = 1 - f12*f12;
            wt = w1*w2*w12; wt = wt*sqrt(wt)
            esc = wt * eva_char_char_points(tec1, tec2)
        if esc == -inf: return -inf
        # Combine with vertex points:
        sc = vsc + esc
        return sc
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    vsc_best = -inf
    i1_best = None; i2_best = None
    for i1 in range(nt1+1):
        for i2 in range(nt2+1):
            # Consider starting a path here (only at the origin when
            # {full_match} is required):
            sc_best = start_points if (i1 == 0 and i2 == 0) or not full_match else -inf
            k1_best = None; k2_best = None
            # Try extending with a diagonal, vertical, or horizontal move:
            for k1,k2 in (i1-1,i2-1), (i1-1,i2), (i1,i2-1):
                sck = step_score(k1,k2,i1,i2)
                if isfinite(sck) and sck > sc_best:
                    sc_best = sck; k1_best = k1; k2_best = k2
            tab[i1][i2] = ( sc_best, k1_best, k2_best )
            if sc_best > vsc_best:
                vsc_best = sc_best
                i1_best = i1; i2_best = i2
    if debug:
        err.write(f"!= tableau:\n")
        print_tableau(tab, nt1, nt2)

    if full_match:
        # The path must end at the end of both strings:
        vsc_best, k1x, k2x = tab[nt1][nt2]
        i1_best = nt1; i2_best = nt2
    if debug: err.write(f"!= {vsc_best = } {i1_best = } {i2_best = }\n")

    # Now recover the best path
    if vsc_best == -inf: return -inf, None, None
    j1 = i1_best; j2 = i2_best
    while True:
        assert j1 is not None and j2 is not None
        vsc, k1, k2 = tab[j1][j2]
        if k1 is None and k2 is None: break
        j1 = k1; j2 = k2
    return vsc_best, tx1[j1:i1_best], tx2[j2:i2_best]
# ----------------------------------------------------------------------

def test_comp(tx1, tx2, full_match):
    # Runs {compute_dic_score} on {tx1,tx2} and prints the result.
    err.write(f"======================================================================\n")
    err.write("testing {compute_dic_score} ...\n")
    err.write(f"{tx1 = !r}\n")
    err.write(f"{tx2 = !r}\n")
    score, s1, s2 = compute_dic_score(tx1, tx2, full_match)
    err.write(f"{score = :+9.4f} {s1 = !r} {s2 = !r}\n")
    err.write(f"======================================================================\n")
    return
# ----------------------------------------------------------------------

def print_tableau(tab,nt1,nt2):
    # Prints the dynamic programming tableau {tab} (scores scaled by 10)
    # to {err}, for debugging.
    err.write(" %s\n" % ("---" * (nt2+1)))
    for i1 in range(nt1+1):
        err.write("  ")
        for i2 in range(nt2+1):
            err.write(f"{tab[i1][i2][0]*10:+3.0f}")
        err.write("\n")
    err.write(" %s\n" % ("---" * (nt2+1)))
    return
# ----------------------------------------------------------------------

def do_tests():
    # Exercises {compute_dic_score} on a few fixed string pairs.
    tx1 = "·······chedyorochody"
    tx2 = "chedyachodyltaiinshedy·······"
    test_comp(tx1, tx2, False)

    tx1 = "·····chedyoroalorqorokairchody"
    tx2 = "dychodyaltaiinshe·······"
    test_comp(tx1, tx2, False)

    tx1 = "ychhedy"
    tx2 = "cheedo"
    test_comp(tx1, tx2, True)

    tx1 = "qokaiin"
    tx2 = "cheedy"
    test_comp(tx1, tx2, True)
    return
# ----------------------------------------------------------------------

if __name__ == "__main__":
    if sys.argv[1] == "MGF.TEST":
        do_tests()
    else:
        iarg = 1
        dir = sys.argv[iarg]; iarg += 1
        tsize = int(sys.argv[iarg]); iarg += 1
        main(dir, tsize)