#! /usr/bin/python3
# Last edited on 2026-05-01 04:50:38 by stolfi

import os, sys, re
from sys import stderr as err, stdout as out
import write_parsing_funcs as wpf
import bimatching_eval_funcs as bef
from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite
  
def match_bitemplate(text0, text1, bitemp, bias, eval_one_gap):
  # Matches two given strings {text0,text1} to a bitemplate {bitemp}. 
  # If there is more than one bimatching possible,
  # chooses the one with the lowest badness score. 
  #
  # A /bitemplate/ is a list {bitemp[0..nr]} of /bipatterns/.  
  # Each bipattern is a list of /variants/.  Each variant is a 
  # triple {(pena, pat0, pat1)} where {pena} is a non-negative float,
  # typically in {[0 _ 1]}; and {pat1,pat2} are two RE patterns,
  # neither of which matches the empty string.
  # 
  # Let {nt0} and {nt1} be the lengths of {text0} and {text1}. A
  # /bimatching/ of {text0,text1} to {bitemp} is a list {bimatch} of
  # {nr} /rungs/. A rung {bimatch[kr]} is a quintuple
  # {(i0,j0,i1,j1,bpvar)} where {bpvar} is a variant of the bipattern
  # {bitemp[kr]}, as above; {i0,j0} are indices in {0..nt0} with {i0 <=
  # j0}, and the string {text0[i0:j0]} matches the RE pattern {pat0} in
  # {bpvar}; and ditto for {i1,j1,text1} with {pat1} of {bpvar}. Among
  # successive
  # 
  # The procedure enumerates all the possible bimatchings
  # {bimatch[0..nr-1]} of {text0,text1} against the bitemplate
  # {bitemp[0..nr-1]}. For each bimatching it computes a badness
  # {score}, and selects the bimatching that yielded the lowest {score}.
  #
  # The badness {score} of a bimatching is computed by converting it
  # implicitly to a pair of macro-parsings {segs0,segs1} of {text0} and
  # {text1}, and calling {eval_one_gap(len(gap0),len(gap1),ig,ng)} on
  # each pair of corresponding gaps. The {score} is the sum of the given
  # {bias}, plus all the gap scores, plus the penalties of the variants
  # used to match the keywords.
  #
  # Returns the bimatching {opt_bimatch} that yielded the smallest
  # {score}, as well as its score.
  #
  # If there are no possible bimatchings, returns {None,+inf}.

  # debug_all = (text1[:10] == "fdeechdyop")
  debug_all = False
  verbose = True

  nt0 = len(text0); nt1 = len(text1)  # Lengths of the texts.
  nr = len(bitemp)                    # Number of bipatterns.
  nh = nr; ng = nh + 1; ns = ng + nh  # Number of hits, gaps, segs.

  # Universal keyword patterns (sans penalties and pairing)
  ukwpats0, ukwpats1 = get_universal_keyword_patterns(bitemp)

  if debug_all: 
    err.write(f"!~ {nr = } {bias = }\n")
    err.write(f"!~ {nt0 = :3d} {text0 = !r}\n")
    err.write(f"!~ {nt1 = :3d} {text1 = !r}\\n")
  
  tot = dict()
  tot['evals'] = 0  # Number of calls to {eval_one_gap}.
  tot['whole'] = 0  # Number of complete bimatchings evaluated.
  tot['match'] = 0  # Number of paired keyword matches found.
  tot['nohit'] = 0  # Number of times it failed to find any pair of hits.
  tot['prune'] = 0  # Number of bimatchings abandoned before complete.
  tot['bueno'] = 0  # Number of complete bimatchings that improved optimum score.

  bimatch = [ None ] * nr  # Rungs of partial bimatching.
  opt_bimatch = None       # Best bimatchng of {text0,text1}.
  opt_score = +inf         # The score of those parsings.

  def debug_bimatch(tag, kr):
    # Prints out {bimatch[0..kr-1]}.
    nonlocal bimatch
    for ir in range(kr):
      i0,j0,i1,j1,bpvar = bimatch[ir] 
      hit0 = text0[i0:j0].ljust(10,'　'); 
      hit1 = text1[i1:j1].ljust(10,' '); 
      err.write(f"{tag}")
      err.write(f"  {i0:3d}:{j0:3d} '{hit0:<10s}'")
      err.write(f"  {i1:3d}:{j1:3d} '{hit1:<10s}'\n")
    err.write("\n")
    return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  def match_one_bipat_at_cur_point(i0, i1, kr, score):
    # Tries to extend {bimatch[0..kr-1]} by 
    # matching all the variants of the bipattern {bpk = bitemp[kr]} at
    # positions {i0} of {text0} and {i1} of {text1}.
    #
    # For each variant that matches, sets {bimatch[kr]} to the appropriate rung
    # and calls {match_aux} to extend it further.
    #
    # The {score} should be the sum of all gaps and hits in
    # {bimatch[0..kr-1]} including the gap that ends at {i0,i1} but NOT
    # the penalty of the next rung.
    #
    # Returns true iff found one variant that matched.

    nonlocal bimatch, opt_bimatch, opt_score, tot
    debug = debug_all or False

    ind = ("." * kr) if debug else None
    if debug: err.write(f"!~ {ind} maob  {i0 = } {i1 = } {kr = } {score = :.3f}\n")
    assert i0 <= nt0 and i1 <= nt1
    assert kr < nr
    bpk = bitemp[kr]
    tail0 = text0[i0:]
    tail1 = text1[i1:]
    nv = len(bpk) # Number of variants of the bipat.
    assert bimatch[kr] == None
    matched_some_pvar = False
    for iv in range(nv):
      bpvk = bpk[iv]
      penkv, patkv0, patkv1 = bpvk
      assert isinstance(penkv, float) and penkv >= 0
      m0 = re.match(patkv0, tail0)
      m1 = re.match(patkv1, tail1)
      if m0 != None and m1 != None: 
        tot['match'] += 1
        matched_some_pvar = True
        assert m0.start(0) == 0;  assert m1.start(0) == 0
        # Update the score for the penalty of the additional rung:
        new_score = score + penkv
        if debug: 
          hitkv0 = m0.group(0); hitkv1 = m1.group(0)
          err.write(f"!~ {ind} maob  {iv = } {hitkv0 = !r} {hitkv1 = !r}")
          err.write(f" {penkv = :.3f} {new_score = :.3f}")
        if new_score >= opt_score:
          # Not worth extending with {bpk[iv]}:
          if debug: err.write(" >=\n")
        else:
          # Let's try extending:
          j0 = i0 + m0.end(0); j1 = i1 + m1.end(0);
          rngk = (i0,j0,i1,j1,bpvk)
          if debug: err.write(f" <\n")
          bimatch[kr] = rngk
          match_aux(j0, j1, kr+1, new_score)
          bimatch[kr] = None
    return matched_some_pvar
    # ----------------------------------------------------------------------
  
  def quick_next_match_positions(kt0, i0, upat0, kt1, i1, upat1, kr):
    # Scans the integer grid {{kt0..nt0}×{kt1..nt1}} in lex order starting
    # from {(i0,i1)},and returns the first pair {(r0,r1)} such that
    # {text0[r0:]} matches {upat0} at start and {text1[r1:]} matches {upat1}
    # at start.
    #
    # If there is no such pair, returns {(nt0+1,nt1+1)}
    #
    # Assumes that {upat0} and/or {upat1} may match the empty string,
    # even though that should not be allowed.
    
    nonlocal bimatch, opt_bimatch, opt_score, tot
    debug = debug_all or False
    
    ind = ("." * kr) if debug else None
    if debug: err.write(f"!Q {ind} { kt0 = } {i0 = } {nt0 = } {upat0 = }\n")   
    if debug: err.write(f"!Q {ind} { kt1 = } {i1 = } {nt1 = } {upat1 = }\n")   
    assert 0 <= kt0 and kt0 <= i0
    assert 0 <= kt1 and kt1 <= i1
    assert i0 <= nt0 and i1 <= nt1
    # Try the rest of the row {i0}, f any: 
    if i1 > kt1:
      if debug: err.write(f"!Q {ind} trying {upat0 = !r} on text1 at position r0 = {i0 = } ...")   
      m0 = re.match(upat0, text0[i0:])
      if m0 != None:
        if debug: err.write(f" matches.\n")   
        if debug: err.write(f"!Q {ind} trying {upat1 = !r} on text1 for r1 in {i1}..{nt1} ...")   
        m1 = re.search(upat1, text1[i1:])
        if m1 != None:
          r0 = i0; r1 = i1 + m1.start(0)
          if debug: err.write(f" matches at {r1 = }.\n")   
          return r0, r1 
        else:
          if debug: err.write(f"not match.\n")   
          i0 += 1; i1 = kt1
      else:
        if debug: err.write(f" does not match.\n") 
        i0 += 1; i1 = kt1
    # Try from row {i0} forward, all cols:
    if i0 > nt0: return nt0+1, nt1+1
    assert i1 == kt1
    if debug: err.write(f"!Q {ind} trying {upat0 = !r} on text0 for r0 in {i0}..{nt0} ...")   
    m0 = re.search(upat0, text0[i0:])
    if m0 != None: 
      r0 = i0 + m0.start(0); 
      if debug: err.write(f" matched at {r0 =}.\n") 
    else:
      if debug: err.write(f" no match.\n")   
      return nt0+1, nt1+1
    if debug: err.write(f"!Q {ind} trying trying {upat1 = !r} on text1 for r1 in {i0}..{nt0} ...")   
    m1 = re.search(upat1, text1[i1:])
    if m1 != None: 
      r1 =  i1 + m1.start(0)
      if debug: err.write(f" matched at {r1 =}.\n") 
    else:
      if debug: err.write(f" no match.\n")   
      return nt0+1, nt1+1
    
    if debug: err.write(f"!Q {ind} chose { r0 = } {r1 = }\n")   
    return r0, r1
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
  def match_one_bipat_from_here_on(kt0, kt1, kr, score):   
    # Tries to match the bipattern {bitemp[k]} at all possible pairs of
    # positions {i0,i1} in the integer grid {{kt0..nt0}×{kt1..nt1}}
    
    nonlocal bimatch, opt_bimatch, opt_score, tot
    debug = debug_all or False

    ind = ("." * kr) if debug else None
    assert kt0 <= nt0 and kt1 <= nt1
    bpk = bitemp[kr]
    upat0k = ukwpats0[kr]; upat1k = ukwpats1[kr]
    i0 = kt0; i1 = kt1
    nmatch = 0  # Positions {i0,i1} where {bpk} matched.
    while True:
      if i0 > nt0 or i1 > nt1: break
      i0, i1 = quick_next_match_positions(kt0, i0, upat0k, kt1, i1, upat1k, kr)
      if i0 > nt0 or i1 > nt1: break

      # There is a chance to match the bipattern {bpk} at {i0,i1}.
      # Accumlate the gap score:
      gszk0 = i0 - kt0; gszk1 = i1 - kt1
      ig = kr; assert ig < ng
      gsck = eval_one_gap(gszk0, gszk1, ig, ng)
      tot['evals'] += 1
      new_score = score + gsck
      if debug: 
        err.write(f"!~ {ind} ma  {i0 = } {gszk0 = } {i1 = } {gszk1 = }")      
        err.write(f" {gsck = :.3f} {new_score = :.3f}")      
      if new_score < opt_score:
        # Trying to add a rung at {(i0,i1)} 
        if debug: err.write(" <\n")
        matched = match_one_bipat_at_cur_point(i0, i1, kr, new_score)
        if matched: nmatch += 1
      else:
        if debug: err.write(" >=\n")
        tot['prune'] += 1

      # Try another pair of positions:
      i1 += 1
      if i1 > nt1:
        i0 += 1; i1 = kt1
    return nmatch
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  def match_aux(kt0, kt1, kr, score):
    # Assumes that {bimatch[0..kr-1]} are defined for
    # some {kr <= nr} and cover the first {kt0} characters of {text0}
    # and the first {kt1} characters of {text1}.
    # Enumerates all complete parsings that extend that parsing, updaing
    # the optimum.
    #
    # The {score} should be the sum of all gaps and hits in {bimatch[0..kr-1]}
    # except the tail gap.
    
    nonlocal bimatch, opt_bimatch, opt_score, tot
    debug = debug_all or False

    ind = ("." * kr) if debug else None
    if debug: err.write(f"!~ {ind} ma  {kt0 = } {kt1 = } {kr = } {score = :.3f}\n")
    assert kt0 <= nt0 and kt1 <= nt1
    assert score < opt_score
    # jprev0, jprev1 = (0,0) if kr == 0 else (bimatch[kr-1][1], bimatch[kr-1][3])
    if kr == nr:
      # Completed another bimatching:
      # Update score with the tail gaps:
      tot['whole'] += 1
      gszf0 = nt0 - kt0; gszf1 = nt1 - kt1
      ig = kr; assert ig == ng-1
      gscf = eval_one_gap(gszf0,gszf1,ig,ng)
      tot['evals'] += 1
      fin_score = score + gscf
      if debug: 
          err.write(f"!~ {ind} ma  {gszf0 = } {gszf1 = }")      
          err.write(f" {gscf = :.3f} {fin_score = :.3f}")      
      if fin_score < opt_score:
        if debug: err.write(" |\n")
        opt_bimatch = bimatch.copy()
        opt_score = fin_score
        tot['bueno'] += 1
      else:
        if debug: err.write(" >=\n")
        tot['prune'] += 1
      # No way to extend:
      return
    else:
      # Still need more hits:
      assert kr < nr
      nmatch = match_one_bipat_from_here_on(kt0, kt1, kr, score)
      if nmatch == 0: tot['nohit'] += 1
      if debug: err.write(f"!~ ma {tot['nohit'] = }\n")
    return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  debug = debug_all

  if debug: err.write(f"!~ starting {bias = :.3f}\n")
  if debug: err.write("!~ >>\n")
  match_aux(0,0,0, bias)
  if debug: err.write("!~ <<\n")

  if verbose:
    err.write(f"evaluated {tot['evals']} gap pairs.\n")
    err.write(f"considered {tot['whole']} whole matchings.\n")
    err.write(f"found {tot['match']} matching keyword pairs.\n")
    err.write(f"abandoned {tot['prune']} for excessive score.\n")
    err.write(f"abandoned {tot['nohit']} for failure to match key.\n")
    err.write(f"improved optimum {tot['bueno']} times.\n")
    err.write(f"final score {opt_score:.3f}.\n")

  # Checking result:
  if opt_bimatch != None:
    assert len(opt_bimatch) == nr
    j0_prev = 0; j1_prev = 0
    for kr in range(nr): 
      rngk = opt_bimatch[kr]
      if debug: err.write(f"!@ {rngk = !r}\n")
      assert rngk != None
      i0, j0, i1, j1, bpvar = rngk
      assert j0_prev <= i0 and i0 < nt0 and i0 <= j0 and j0 <= nt0
      assert j1_prev <= i1 and i1 < nt1 and i1 <= j1 and j1 <= nt1
      pena, pat0, pat1 = bpvar
      assert isinstance(pena, float) and pena >= 0
      assert isinstance(pat0, str)
      assert isinstance(pat1, str)
      j0_prev = j0; j1_prev = j1

  return opt_bimatch, opt_score 
  # ----------------------------------------------------------------------

def extract_segments(text0, text1, bimatch):
  # Given two texts {text0,text1} and a bimatching {bimatch[0..nr-1]} 
  # for them, returns the two macro-parsings {segs0,segs1} of those
  # texts implied by the bimatching.
  #
  # The list {segs0} has {ns=2*nr+1} strings. comprising the {nr}
  # substrings of {text0} that are defined by the indices {i0} and {j0}
  # of the rungs (the /hits/) and the substrings before, between,and
  # after those hits (the /gaps/). The list {segs1} has the {ns} gaps
  # and hits of {text1} defined by the indices {i1} and {j1} of the
  # rung.
  #
  # The procedre also returns a {key_penalty} which is the sum of the
  # {pena} components in their {bpvar} fields.
  
  nr = len(bimatch)
  nh = nr; ng = nh + 1; ns = ng + nh
  
  textp = ( text0, text1 )
  segs = [ [ ], [ ] ]
  jtp_prev = [ 0, 0 ]  # End or previous hit
  key_penalty = 0
  for ir in range(nr):
    i0, j0, i1, j1, bpvar = bimatch[ir]
    pena, pat0, pat1 = bpvar
    assert isinstance(pena, float) and pena >= 0
    itp = ( i0, i1 ) 
    jtp = ( j0, j1 )
    for side in range(2):
      jt_prev = jtp_prev[side]
      it = itp[side]
      jt = jtp[side]
      tt = textp[side]
      nt = len(tt)
      assert jt_prev <= it <= nt and it <= jt <= nt
      segs[side].append(tt[jt_prev:it]) # Gap
      segs[side].append(tt[it:jt])      # Hit
    assert isinstance(pena, float) and pena >= 0
    key_penalty += pena
    jtp_prev = jtp
  # Last gaps:
  segs[0].append(text0[jtp_prev[0]:])
  segs[1].append(text1[jtp_prev[1]:])
  return segs[0], segs[1], key_penalty
  # ----------------------------------------------------------------------

def get_universal_keyword_patterns(bitemp):
  # Given a list of {bitemp[0..nr-1]} of bipatterns,
  # returns two lists {ukwpats0,ukwpats1} each with {nr} 
  # strings.  The element {ukwpats0[kr]} is a single RE
  # pattern that will match all the substrings 
  # of a text that would be matched by any patterns 
  # {pat0} in any variant {pvar=(pena,pat0,pat1)} of 
  # the bipattern {bitemp[kr]}.  Similarly 
  # {ukwpats1[kr]} subsumes all patterns {pat1}
  # of those variants.
  
  nr = len(bitemp)
  ukwpats0 = [None] * nr; 
  ukwpats1 = [None] * nr; 
  for kr in range(nr):
    bpk = bitemp[kr]
    pats0 = []
    pats1 = []
    for pvar in bpk:
      pena, pat0, pat1 = pvar
      pats0 += pat0.split('|')
      pats1 += pat1.split('|')
    ukwpats0[kr] = "|".join(tuple(set(tuple(pats0))))
    ukwpats1[kr] = "|".join(tuple(set(tuple(pats1))))
    assert re.match(ukwpats0[kr], "") == None, f"bipatern {kr} side 0 matches empty"
    assert re.match(ukwpats1[kr], "") == None, f"bipatern {kr} side 1 matches empty"
  return ukwpats0,ukwpats1
  # ----------------------------------------------------------------------

def test_match(text0, utype0, text1, utype1, bitemp, pad0):

  err.write("=== testing {match_bitemplate} ===\n")
  
  debug = True
  
  nr = len(bitemp)
  nh = nr; ng = nh + 1
   
  tsize0 = len(text0)
  tsize1 = len(text1)

  def eval_one_gap(gsize0, gsize1, ig, ng):
    debug = False
    if debug: err.write("!e {gsize0 = !r} {gsize1 = !r}\n")
    gap_score = bef.compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng)
    if debug: err.write(f"!e {gap_score = :.3f}\n")
    return gap_score
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
     
  # Find {bimatch[0..nr]}, the best matching of {bitemp}:

  err.write("computing bias ...\n")
  bias = bef.compute_total_size_score(tsize0, utype0, tsize1, utype1, nh)
  if debug: err.write(f"{tsize0 = } {utype0 = !r} {tsize1 = } {utype1 = !r} {nh = } {bias = :.3f}\n")
  err.write("matching ...\n")
  bimatch, score = match_bitemplate(text0, text1, bitemp, bias, eval_one_gap)
  if bimatch == None:
    assert score == +inf
    assert False, "matching failed!"
  else:
    err.write(f"{score = :.3f}\n")
    assert len(bimatch) == nr

    err.write("extracting the segments ...\n")
    segs0, segs1, key_penalty = extract_segments(text0, text1, bimatch)
    err.write(f"{key_penalty = :.3f} {score = :.3f}\n")
    mpars = ((segs0, '?', pad0,), (segs1, '?', ' ',),)
    err.write("matching result:\n")
    wpf.write_parsings(err, "", True, mpars)

    # Checking the score computation:
    score_check = bef.compute_full_score_from_macro_parsings \
      ( segs0, utype0, segs1, utype1, key_penalty )
    if abs(score_check - score) > 1.0e-6:
      err.write(f"{score = :.8f} {score_check = :.8f}\n")
      assert False, "score does not check"

  err.write("=================================\n")
  return
  # ----------------------------------------------------------------------

def test_match_0():
  text0 = "...XX........XaX..."
  text1 = "...ZZ..ZaZ...ZaZ..."

  bpvar_A1 = ( 0.000, 'XX',   'YY',   )
  bpvar_A2 = ( 0.020, 'XX',   'ZZ',   )
  bpvar_A3 = ( 0.040, 'Xa?X', 'Za?Z', )

  bipatA = ( bpvar_A1, bpvar_A2, bpvar_A3 )

  bitemp = ( bipatA, bipatA )
  
  test_match(text0, "ec", text1, "ec", bitemp, ' ')
  return
  # ----------------------------------------------------------------------
  
def test_match_1():
  text0 = \
    "蜂子主治风头除蛊毒补虚羸伤中久服令人光泽好颜色不老大黄蜂子主治心腹胀满痛轻" + \
    "身益气土蜂子主治痈肿"
  text1 = \
    "deechdyopchedaiinypchedyodalychedyqopcheokaiinshedypodairochedalloiinchedy" + \
    "qokaiinchdydaiindchdoseedolchdolkchedychokaiinchdyqoeedyokchedydaiiinchedy" + \
    "daiinykeedyokeeedychedolchdaiinykardarycheold" + \
    "chedydkchsaiinchdedyqodaiinokchedaiinchain"
  
  bpvar_USES_1 = ( 0.000, '主治|主', 'daiin' )
  bpvar_USES_2 = ( 0.100, '主治|主', 'dair|dain|laiin' )
  bpvar_USES_3 = ( 0.300, '主治|主', 'kaiin' )
  bpvar_USES_4 = ( 0.600, '主治|主', '[dklrs][ao]ii?i?[rn]' )
  
  bipat_USES = ( bpvar_USES_1, bpvar_USES_2, bpvar_USES_3, bpvar_USES_4, )
  
  bpvar_QI_1 = ( 0.000, '气',  'chedy' )
  bpvar_QI_2 = ( 0.100, '气',  'ched[ao]' )
  bpvar_QI_3 = ( 0.500, '气',  '[cs]he?[dk][yao]' )
  
  bipat_QI = ( bpvar_QI_1, bpvar_QI_2, bpvar_QI_3, )
  
  bpvar_LONG_1 = ( 0.000, '久服|久食', 'okeedy|okaiin' )
  bpvar_LONG_2 = ( 0.020, '久|服|食',  'okeedy|okaiin' )
  bpvar_LONG_3 = ( 0.200, '久服|久食', '[oy][ktd]ee[dk][aoy]|[oy][kt][ao]ii?n' )
  bpvar_LONG_4 = ( 0.220, '久|服|食',  '[oy][ktd]ee[dk][aoy]|[oy][kt][ao]ii?n' )
  
  bipat_LONG = ( bpvar_LONG_1,  bpvar_LONG_2, bpvar_LONG_3, bpvar_LONG_4, )
  
  bitemp = ( bipat_USES, bipat_LONG, bipat_USES, bipat_QI, bipat_USES, )
  
  test_match(text0, "ch", text1, "ec", bitemp, '　')
  return
  # ----------------------------------------------------------------------

if len(sys.argv) > 1 and sys.argv[1] == "BMF.TEST":
  test_match_0()
  test_match_1()