#! /usr/bin/python3 # Last edited on 2026-07-03 00:43:57 by stolfi import os, sys, re from sys import stderr as err, stdout as out import write_parsing_funcs as wpf import bimatching_eval_funcs as bef from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite import standard_bipatterns as stdbip # For tests. import numpy as np def match_bitemplate(text0, text1, bitemp, eval_one_gap, max_score, debug_level): # Matches two given strings {text0,text1} to a bitemplate {bitemp}. # If there is more than one bimatching possible, # chooses the one with the lowest badness score. # # A /bitemplate/ is a list {bitemp[0..nr]} of /bipatterns/. # Each bipattern is a list of /variants/. Each variant is a # triple {(pena, pat0, pat1)} where {pena} is a non-negative float, # typically in {[0 _ 1]}; and {pat1,pat2} are two RE patterns, # neither of which matches the empty string. # # Let {nt0} and {nt1} be the lengths of {text0} and {text1}. A # /bimatching/ of {text0,text1} to {bitemp} is a list {bimatch} of # {nr} /rungs/. A rung {bimatch[kr]} is a quintuple # {(i0,j0,i1,j1,bpvar)} where {bpvar} is a variant of the bipattern # {bitemp[kr]}, as above; {i0,j0} are indices in {0..nt0} with {i0 <= # j0}, and the string {text0[i0:j0]} matches the RE pattern {pat0} in # {bpvar}; and ditto for {i1,j1,text1} with {pat1} of {bpvar}. Among # successive # # The procedure effectively enumerates all the possible bimatchings # {bimatch[0..nr-1]} of {text0,text1} against the bitemplate # {bitemp[0..nr-1]}. For each bimatching it computes a badness # {score}, and selects the bimatching that yielded the lowest {score}. # # Any score greater than {max_score} will be mapped to {+inf} # A low {max-score} can significantly speed up the search. # # Actually, this version uses a dynamic programming algorithm # to find the best match in time {O(nr*nt0^2*nt1^2)} instead of # exponential time. # # The badness {score} of a bimatching is computed by converting it # implicitly to a pair of macro-parsings {segs0,segs1} of {text0} and # {text1}, and calling {eval_one_gap(len(gap0),len(gap1),ig,ng)} on # each pair of corresponding gaps. The {score} is the sum of all the # gap scores, plus the penalties of the variants used to match the # keywords. # # Returns the bimatching {opt_bimatch} that yielded the smallest # {score}, as well as its score. # # If there are no possible bimatchings, returns {None,+inf}. # debug_all = (text1[:15] == "pokararkeeyokee") # debug_all = (text1[:13] == "poarkeeodaiin") debug_top = debug_level >= 2 debug_steps = debug_level >= 4 debug_plane = debug_level >= 6 debug_tableau = debug_level >= 8 debug_outer_loop = debug_level >= 10 debug_inner_loop = debug_level >= 20 verbose = True nt0 = len(text0); nt1 = len(text1) # Lengths of the texts. nr = len(bitemp) # Number of bipatterns. nh = nr; ng = nh + 1; ns = ng + nh # Number of hits, gaps, segs. assert ns >= 1 if debug_steps: err.write(f"!~ enter match_bitemplate\n") err.write(f"!~ {nr = } {nh = } {ng = } {ns = }\n") err.write(f"!~ {nt0 = :3d} {text0 = !r}\n") err.write(f"!~ {nt1 = :3d} {text1 = !r}\n") tot = dict() tot['evgap'] = 0 # Number of gap scores computed. tot['evhit'] = 0 # Number of attempts to match a hit pair. tot['nohit'] = 0 # Number of hit pairs that failed to match. tot['finel'] = 0 # Number of finite elements in the tableaus. tot['prune'] = 0 # Number of times that a score exceeded {max_score}. # Dynamic programming tableaus: tb_segs_shape = (ns-1,nt0+1,nt1+1) # One plane per segment except the tail. tb_sc = np.full(tb_segs_shape, +inf) # Best score at each brakpoint. tb_j0 = np.full(tb_segs_shape, -1) # Position of prev breakpoint in {text0}. tb_j1 = np.full(tb_segs_shape, -1) # Position of prev breakpoint in {text1}. tb_nm = np.full(tb_segs_shape, -1) # Number of distinct matches. tb_rungs_shape = (nr,nt0+1,nt1+1) # One plane per rung. tb_iv = np.full(tb_rungs_shape, -1) # Position of prev breakpoint in {text1}. # A /breakpoint/ on text0 is an index in {0..nt0} (note: not {0..nt0-1}). # # A /biparsing/ of size {ks} is a list of pairs {bm[ts] = # (b0[ts],b1[ts])} for {ts} in {0..ks}, such that {b0[0..ks-1]} and # {b1[0..ks-1]} are non-decreasing breakpoints on each text. # # A biparsing of size {ms} defines a partition of {text0} and {text1} # into {ms} segments. For {ks} in {0..ms-1}, segment {segs0[ks]} is # the substring of {text0} between breakpoints {b0[ks-1]} and # {b0[ks]}; where {b0[-1]} is defined as 0. Ditto for # {segs1,b1,text1}. The /partial badness score/ of that biparsing is # the sum of the badness scores of those segments. The pair {bm[ms-1]} # is said to be the /end/ of the biparsing. # # If {ms == ns - 1} we extend the above definition to {segs0[ns-1]} by # defining {b0[ns-1]} as {nt0}. That is, {segs0[ns-1]}, the /closing # segment/, is the tail of {text0} from {b0[ns-2]} to the end of # {text0}. The /final score/ is defined as the partial score of the # biparsing plus the badness score of the closing segments. # # If {ks} is even, segments {segs0[ks]} and {segs1[ks]} are called # /gaps/. The badness of those two gaps is a function of the # discrepancy between the actual length of {segs1[ks]} and the value # predicted from the length of {segs0[ks]}. # # If {ks} is odd -- that is {ks = 2*kr+1} for some {kr} in {0..nr-1} # -- the segments {segs0[ks]} and {segs1[ks]} are called /hits/. Those # two strings must match some variant of the bipattern {bitemp[kr]}, # namely there should some index {iv} such that {bpvar = # bitemp[kr][iv] = (pena, pat0, pat1)} such {segs0[ks]} matches {pat0} # and {segs1[ks]} matches {pat1}. The badness score of the segments # {segs0[ks]} and {segs1[ks]} is then the variant penalty {pena}. (The # penalties must be strictly increasing with {iv}, so that the first # {iv} that matches on both sides will also be the one with least # penalty.) # # The goal is to fill the arrays {tb_sc,tb_j0,tb_j1} so that, for {ks} # in {0..ns-2}, {tb_sc[ks,i0,i1]} is the badness score the best # partial biparsing with {ks+1} segments ending at breakpoint {i0} on # {text0} and {i1} on {text1}. Then {k0 = tb_j0[ks,i0,i1]} and {k1 = # tb_j1[ks,i0,i1]} will the previous breakpoints of that partial # biparsing; or {-1} if {ks} is zero. # # If the segments {segs0[ks],segs1[ks]} are hits -- that is, if {ks} # is odd {2*kr+1} -- then {tb_iv[kr,i0,i1]} will store the index {iv} # of the variant {bpvar} of {bitemp[kr]} that matched those segments; # or {-1} if no variant matched both segments. # # The value of {tb_nm[ks,i0,i1]} will be the number of distinct # biparsings of size {ks+1} that end at {(i0,i1)}; or -1 if that # number was not computed. def gap_score(ks, k0,i0, k1,i1): # Badness score of the gap segments {seg00[ks]} and {segs1[ks]}, # between breakpoints {(k0,k1)} and {(i0,i1)}. # Requires {ks} even, {ks == 2*ig}. nonlocal tot assert 0 <= k0 and k0 <= i0 and i0 <= nt0 assert 0 <= k1 and k1 <= i1 and i1 <= nt1 assert ks % 2 == 0 gsz0 = i0 - k0; gsz1 = i1 - k1; ig = ks//2; tot['evgap'] += 1 score_gap = eval_one_gap(gsz0, gsz1, ig, ng) if score_gap > max_score: score_gap = +inf return score_gap # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def hit_score(ks, k0,i0, k1,i1): # Badness score of the hit segments {segs0[ks]} and {segs1[ks]}, # between breakpoints {(k0,k1)} and {(i0,i1)}. # Requires {ks} odd, {ks == 2*ih + 1}. # If the segments match the bipattern {bitemp[ih]}, returns # the penalty {pena} of the variant that matched, and its index {iv}. # Otherwise returns {+inf,-1}. nonlocal tot assert 0 <= k0 and k0 <= i0 and i0 <= nt0 assert 0 <= k0 and k0 <= i0 and i0 <= nt0 assert 0 <= k1 and k1 <= i1 and i1 <= nt1 assert ks % 2 == 1 kr = ks//2; bipat = bitemp[kr] nv = len(bipat) pena_prev = -1 tot['evhit'] += 1 seg0 = text0[k0:i0] seg1 = text1[k1:i1] for iv in range(nv): pena, pat0, pat1 = bipat[iv] assert pena > pena_prev, f"key penalties not increasing {pena_prev} {pena}" if re.fullmatch(pat0, seg0): if re.fullmatch(pat1, seg1): return pena, iv pena_prev = pena tot['nohit'] += 1 return +inf, -1 # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def print_tableau(ks): # Prints plane {ks} of the tableau. # Only the score for now. nt_max = 30 if nt0 <= nt_max and nt1 <= nt_max: rt0 = nt0; st0 = nt0 + 1 rt1 = nt1; st1 = nt1 + 1 else: ht = (nt_max - 2)//2; rt0 = ht; st0 = nt0 - ht; rt1 = ht; st1 = nt1 - ht err.write(f"!o tb[{ks},:,:] = \n") for i0 in list(range(0,rt0+1)) + list(range(st0,nt0)): err.write("!o ") for i1 in list(range(0,rt1+1)) + list(range(st1,nt1)): if i0 < nt0 and i1 < nt1: if i0 == rt0 or i1 == rt1: valx = " ... " else: val = tb_sc[ks,i0,i1] valx = f"{val:6.3f}" if isfinite(val) else " * " if i1 > 0: err.write(" ") err.write(valx) err.write("\n") return # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def extract_bimatch(j0_fin, j1_fin): # Recover the optimum bimatch from {tb_sc,tb_j0,tb_j1,tb_iv} # Assumes that the last breakpoints before the end {(nt0,nt1)} # were {j0_fin,j1_fin}: bimatch = [ None ] * nr # Rungs of partial bimatching. kr = nr - 1; ks = ns - 1 i0 = nt0; i1 = nt1 while kr >= 0: # Get rung {kr}. # Now {(i0,i1)} is LOW breakpoint of rung {kr+1}. # Skip the gaps ending at {(i0,i1)}: assert ks >= 1 assert ks % 2 == 0 ig = ks//2 assert i0 >= 0 and i1 >= 0 if kr == nr - 1: assert ks == ns - 1 j0 = j0_fin; j1 = j1_fin else: assert ks <= ns - 2 j0 = tb_j0[ks,i0,i1]; j1 = tb_j1[ks,i0,i1] assert isfinite(tb_sc[ks,j0,j1]) assert 0 <= j0 and j0 <= i0 and i0 <= nt0 assert 0 <= j1 and j1 <= i1 and i1 <= nt1 if debug_plane: err.write(f"!B skipped gaps {ig} segs {ks} {j0}..{i0} {j1}..{i1}\n") ks = ks - 1 # Get the hits that end at {(j0,j1)}: assert ks >= 0 assert ks % 2 == 1 assert isfinite(tb_sc[ks,j0,j1]) i0 = tb_j0[ks,j0,j1]; i1 = tb_j1[ks,j0,j1] assert 0 <= i0 and i0 <= j0 and j0 <= nt0 assert 0 <= i1 and i1 <= j1 and j1 <= nt1 # Asemble the rung: assert kr >= 0 iv = tb_iv[kr,j0,j1] assert iv >= 0 and iv < len(bitemp[kr]) bpvar = bitemp[kr][iv] bimatch[kr] = (i0,j0, i1,j1, bpvar) pena, pat0, pat1 = bpvar ih = kr if debug_plane: err.write(f"!B took hits {ih} segs {ks} {i0}..{j0} {i1}..{j1} {iv = } {pena = :.8f}\n") # Move back one more: ks = ks - 1 kr = kr - 1 assert ks == 0 assert kr == -1 return bimatch # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def check_bimatch(bimatch): assert len(bimatch) == nr k0 = 0; k1 = 0 for kr in range(nr): rngk = bimatch[kr] if debug_plane: err.write(f"!@ checking rung {kr} = {rngk = !r}\n") assert rngk != None i0, j0, i1, j1, bpvar = rngk assert k0 <= i0 and i0 <= j0 and j0 <= nt0 assert k1 <= i1 and i1 <= j1 and j1 <= nt1 pena, pat0, pat1 = bpvar assert isinstance(pena, float) and pena >= 0 assert isinstance(pat0, str) assert isinstance(pat1, str) assert re.fullmatch(pat0, text0[i0:j0]) != None assert re.fullmatch(pat1, text1[i1:j1]) != None k0 = j0; k1 = j1 return # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::~ # For each bipat {bitemp[kr]}, find all substrings in {text0} and {text1} # that match the corresponding side. Namely # {bghits0[kr][j0]} is a list of all indices {i0} such that {text0[i0:j0]} # matches {pat0} for some variant {(pena, pat0, pat1) = bitemp[kr][iv]} # for some {iv}. Similarly for {bghits1[kr][j1]} wrt {text1}. def bipat_matches_seg(bipat, side, seg): # Returns true iff some varaint of bipat # on side {side} (0 or 1) matches {seg}. for (pena, pat0, pat1) in bipat: pat = (pat0, pat1)[side] if re.fullmatch(pat, seg): return True return False # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: bghits0 = [ [ [ ] for j0 in range(nt0+1) ] for kr in range(nr) ] # Starts of possible hits on text0. bghits1 = [ [ [ ] for j1 in range(nt1+1) ] for kr in range(nr) ] # Starts of possible hits on text1. bghits = ( bghits0, bghits1, ) for side in 0, 1: text = (text0, text1)[side] nt = len(text) for jt in range(nt+1): for it in range(jt+1): seg = text[it:jt] for kr in range(nr): bipat = bitemp[kr] if bipat_matches_seg(bipat, side, seg): bghits[side][kr][jt].append(it) if debug_steps: for kr in range(nr): nhits = 0; for jt in range(nt+1): nhits += len(bghits[side][kr][jt]) err.write(f"!~ found {nhits} potential hits of bipattern {kr} on text{side}\n") def find_best_extension(ks,i0,i1, k0_k1_list): # Finds the best biparsing with {ks} segments from {(0,0)} to {(i0,i0)}. # Assumes that plane {ks-1} of tables {tb_sc,tb_j0,tb_j1} has been filled. # # Also, if {ks>0}, assumes that {k0_k1_list} is a list of pairs of # indices {(k0,k1)} such that {k0 <= i0 {k1 <= i1} and the segments # {k0..i0} and {k1..i1} are worth considering. The list must include # at least every pair {(k0,k1)} such that {tb-sc[ks-1,k0,k1]} is # finite and the score of those two segments is finite and the sum # of those two score less than {max_score}. # # Returns the score {score_best} of the extended biparsing # and the breakpoints {k0_best,k1_best} of that biparsing prior to # {i0,i1}. # # If the last segments are hits (that is, {ks} is odd), # also returns the index {iv_best} of the variant of the bipat that # matched those segments, or {-1} if no variant matched.] # If the last segments are gaps, returns {None} as {iv_best}. # # Also returns the number of matchings with finite badness # that end at {(i0,i1)}. If {score_best} is {+inf}, # this number will be zero. # # If there is no viable extension, returns {+inf} as the # score, {-1} as {k0_best,k1_best}, and {-1} as {iv_best}. nonlocal tot, bghits0, bghits1 if debug_outer_loop: err.write(f"!X segment{ks}, trying {len(k0_k1_list)} edges to ({i0},{i1})\n") assert ks >= 0 and ks <= ns - 1 score_best = +inf k0_best = -1; k1_best = -1; iv_best = None if ks % 2 == 0 else -1 nb = 0 if ks == 0: score_best = gap_score(ks, 0,i0, 0,i1) if isfinite(score_best): k0_best = 0; k1_best = 0; nb = 1 else: k0_best = -1; k1_best = -1; nb = 0 iv_best = None else: nb = 0 for k0, k1 in k0_k1_list: assert k0 <= i0 and k1 <= i1 score_old = tb_sc[ks-1,k0,k1] if debug_inner_loop: err.write(f"!X {k0 = } {k1 = } {score_old = :.8f}\n") if isfinite(score_old): nb_old = tb_nm[ks-1,k0,k1] assert nb_old >= 1 # Get the badness {score_seg} of the segs {(k0,k1)} to {(i0,i1)}: if ks % 2 == 0: score_seg = gap_score(ks, k0,i0, k1,i1) iv_seg = None else: score_seg, iv_seg = hit_score(ks, k0,i0, k1,i1) assert isinstance(iv_seg, int) if debug_inner_loop: err.write(f"!X {score_seg = :.8f} {iv_seg = }\n") if isfinite(score_seg): score_new = score_old + score_seg if debug_inner_loop: err.write(f"!X {score_new = :.8f}\n") if score_new > max_score: score_new = +inf; tot['prune'] += 1 else: nb += nb_old if score_new < score_best: score_best = score_new k0_best = k0; k1_best = k1; iv_best = iv_seg return score_best, k0_best, k1_best, iv_best, nb # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: # Fill the tableau with partial biparsings: finels_old = [ (0,0) ] # Finite elements on previous plane of the tableau. k0_min = 0; k1_min = 0; # Min indices of finite elems in prev plane. hard_test = False for ks in range(ns-1): # Fill {tb_sc[ks,*,*]}: if debug_plane: err.write(f"filling plane {ks} ...\n") finels_new = [] # Finite elements on plane {ks} of tableau. i0_min = nt0+1; i1_min = nt1+1 # First breakpoints for {ks} with finite badness. for i0 in range(k0_min,nt0+1): for i1 in range(k1_min,nt1+1): if debug_outer_loop: err.write(f"!~ filling tb[{ks},{i0},{i1}] ...\n") # Decide which elements of the previous plane are worth looking at: k0_k1_list = None; if hard_test: k0_k1_list = [ (k0,k1) for k0 in range(k0_min,i0) for k1 in range(k1_min, k1) ] elif ks % 2 == 0: # Gap segment: loop on finite elements of prev tableau plane: k0_k1_list = finels_old else: # Hit segment: loop on segments that matched the pattern kr = ks//2; k0_k1_list = [ (k0,k1) for k0 in bghits0[kr][i0] for k1 in bghits1[kr][i1] ] k0_k1_list = [ (k0,k1) for (k0,k1) in k0_k1_list if k0 <= i0 and k1 <= i1 ] score_best, k0_best, k1_best, iv_best, nb = \ find_best_extension(ks,i0,i1, k0_k1_list) if debug_outer_loop: err.write(f"!~ {score_best = :.8f} {k0_best = } {k1_best = }") err.write(f" {iv_best = } {nb = }\n") if isfinite(score_best): assert score_best >= 0 assert 0 <= k0_best and k0_best <= i0 and i0 <= nt0 assert 0 <= k1_best and k1_best <= i1 and i1 <= nt1 assert nb >= 1 finels_new.append((i0,i1,)) else: assert k0_best == -1 and k1_best == -1 assert nb == 0 tb_sc[ks,i0,i1] = score_best tb_j0[ks,i0,i1] = k0_best tb_j1[ks,i0,i1] = k1_best tb_nm[ks,i0,i1] = nb assert isinstance(tb_sc[ks,i0,i1], float) if ks % 2 == 1: # Hit - save the variant: assert isinstance(iv_best, int) assert (iv_best != -1) == isfinite(score_best) kr = ks//2 tb_iv[kr,i0,i1] = iv_best if debug_outer_loop: err.write(f"!~ saved tb_iv[{kr},{i0},{i1}] = {iv_best}\n") else: # Gap - variant not applicable: assert iv_best == None if isfinite(score_best): tot['finel'] += 1 i0_min = min(i0_min, i0) i1_min = min(i1_min, i1) finels_new.append((i0,i1,)) # Remember where it is worth starting: finels_old = finels_new; k0_min = i0_min; k1_min = i1_min if debug_plane: err.write(f"tableau plane {ks} has {len(finels_new)} finite elements.\n") if debug_plane: print_tableau(ks) # Now complete the biparsing with the closing segments: score_fin, k0_fin, k1_fin, iv_fin, nb_fin = find_best_extension(ns-1,nt0,nt1, finels_old) assert iv_fin == None # Last segs are gaps. if debug_steps: err.write(f"!~ FINAL {score_fin = :.8f}\n") if isfinite(score_fin): bimatch_fin = extract_bimatch(k0_fin, k1_fin) check_bimatch(bimatch_fin) else: assert score_fin == +inf bimatch_fin = None if verbose: err.write(f"compared lengths of {tot['evgap']} gap pairs.\n") err.write(f"considered {tot['evhit']} possible hit pairs ({tot['nohit']} failed).\n") err.write(f"total {tot['finel']} finite entries in tableau.\n") err.write(f"pruned {tot['prune']} tableau entries because of max score.\n") err.write(f"potentially {nb_fin} distinct bimatchings.\n") err.write(f"final score {score_fin:.8f}.\n") if debug_steps: err.write("!~ exit match_bitemplate\n") return bimatch_fin, score_fin # ---------------------------------------------------------------------- def extract_segments(text0, text1, bimatch): # Given two texts {text0,text1} and a bimatching {bimatch[0..nr-1]} # for them, returns the two macro-parsings {segs0,segs1} of those # texts implied by the bimatching. # # The list {segs0} has {ns=2*nr+1} strings. comprising the {nr} # substrings of {text0} that are defined by the indices {i0} and {j0} # of the rungs (the /hits/) and the substrings before, between,and # after those hits (the /gaps/). The list {segs1} has the {ns} gaps # and hits of {text1} defined by the indices {i1} and {j1} of the # rung. # # The procedre also returns a {key_penalty} which is the sum of the # {pena} components in their {bpvar} fields. debug = False nr = len(bimatch) nh = nr; ng = nh + 1; ns = ng + nh textp = ( text0, text1 ) segs = [ [ ], [ ] ] jtp_prev = [ 0, 0 ] # End or previous hit key_penalty = 0 for ir in range(nr): i0, j0, i1, j1, bpvar = bimatch[ir] pena, pat0, pat1 = bpvar if debug: err.write(f"!E {pena = :.8f}\n") assert isinstance(pena, float) and pena >= 0 itp = ( i0, i1 ) jtp = ( j0, j1 ) for side in range(2): jt_prev = jtp_prev[side] it = itp[side] jt = jtp[side] tt = textp[side] nt = len(tt) assert jt_prev <= it <= nt and it <= jt <= nt segs[side].append(tt[jt_prev:it]) # Gap segs[side].append(tt[it:jt]) # Hit key_penalty += pena jtp_prev = jtp # Last gaps: segs[0].append(text0[jtp_prev[0]:]) segs[1].append(text1[jtp_prev[1]:]) return segs[0], segs[1], key_penalty # ---------------------------------------------------------------------- def get_gap_sizes_from_bimatching(nt0, nt1, bimatch): # Similar to {extract_segments(text0, text1, bimatch)} except that it # takes the sizes {nt0,nt1} of the two texts, instead of the texts # themselves, and returns the sizes # {gsizes0[0..ng-1],gsizes1[0..ng-1]} of the gaps, rather than the # gap and hit strings themselves. # # The procedre also returns a {key_penalty} which is the sum of the # {pena} components in their {bpvar} fields. debug = False nr = len(bimatch) nh = nr; ng = nh + 1; ns = ng + nh jtp_prev = [ 0, 0 ] # End or previous hit key_penalty = 0 gsizes = [ [ ], [ ] ] ntp = ( nt0, nt1 ) for ir in range(nr): i0, j0, i1, j1, bpvar = bimatch[ir] pena, pat0, pat1 = bpvar if debug: err.write(f"!Z {i0 = } {j0 = } {i1 = } {j1 = } {pena = :.8f}\n") assert isinstance(pena, float) and pena >= 0 itp = ( i0, i1 ) jtp = ( j0, j1 ) for side in range(2): jt_prev = jtp_prev[side] it = itp[side] jt = jtp[side] nt = ntp[side] assert jt_prev <= it <= nt and it <= jt <= nt gszi = it - jt_prev if debug: err.write(f"!Z {side = } {gszi = }\n") gsizes[side].append(gszi) assert isinstance(pena, float) and pena >= 0 key_penalty += pena jtp_prev = jtp # Last gaps: assert jtp_prev[0] <= nt0; gsizes[0].append(nt0 - jtp_prev[0]) assert jtp_prev[1] <= nt1; gsizes[1].append(nt1 - jtp_prev[1]) return gsizes[0], gsizes[1], key_penalty # ---------------------------------------------------------------------- def write_bimatching(wr, lab, bimatch, text0, pad0, text1, pad1): # Writes out a bimatch {bimatch}. # # The {bimatch} must be a list of rungs {bimatch[0..nr-1]} # as defined above. Uses {text0} and {text1} to extract the # matched strings on the two sides. They are padded with {pad0} and {pad1} # so that the columns align properly. # # Each line of the output is prefixed with the given {lab} string. nr = len(bimatch) # Num of rungs. nt0 = len(text0) nt1 = len(text1) # Find the max width of each keyword: wd0_max = 0 wd1_max = 0 for ir in range(nr): i0, j0, i1, j1, bpvar = bimatch[ir] pena, pat0, pat1 = bpvar assert 0 <= i0 and i0 <= j0 and j0 <= nt0 assert 0 <= i1 and i1 <= j1 and j1 <= nt1 assert isinstance(pena, float) and pena >= 0 assert re.fullmatch(pat0, text0[i0:j0]) != None assert re.fullmatch(pat1, text1[i1:j1]) != None wd0_max = max(wd0_max, j0 - i0) wd1_max = max(wd1_max, j1 - i1) # Prints all rungs for ir in range(nr): wr.write(f"{lab}") i0, j0, i1, j1, bpvar = bimatch[ir] pena, pat0, pat1 = bpvar key0 = text0[i0:j0].ljust(wd0_max, pad0) wr.write(f" | {i0:3d} {j0:3d} {key0}") key1 = text1[i1:j1].ljust(wd1_max, pad1) wr.write(f" | {i1:3d} {j1:3d} {key1}") wr.write(f" | ({pat0},{pat1})") wr.write("\n") return # ---------------------------------------------------------------------- def test_match(text0, utype0, text1, utype1, bitemp, pad0): err.write("=== testing {match_bitemplate} ===\n") debug = True nr = len(bitemp) nh = nr; ng = nh + 1 max_score = 20.0 def eval_one_gap(gsize0, gsize1, ig, ng): debug = False if debug: err.write("!e {gsize0 = !r} {gsize1 = !r}\n") gap_score = bef.compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng) if debug: err.write(f"!e {gap_score = :.3f}\n") return gap_score # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: # Find {bimatch[0..nr]}, the best matching of {bitemp}: err.write("matching ...\n") debug_level = 4 bimatch, score = match_bitemplate \ ( text0, text1, bitemp, eval_one_gap, max_score, debug_level ) if bimatch == None: assert score == +inf assert False, "matching failed!" else: err.write(f"{score = :.8f}\n") assert len(bimatch) == nr err.write("bimatching:\n") pad0 = ' ' if utype0 == "ch" else " " pad1 = ' ' if utype1 == "ch" else " " write_bimatching(err, '', bimatch, text0, pad0, text1, pad1) err.write("extracting the segments ...\n") segs0, segs1, key_penalty = extract_segments(text0, text1, bimatch) err.write(f"{key_penalty = :.3f} {score = :.3f}\n") mpars = ((segs0, '?', pad0,), (segs1, '?', ' ',),) err.write("matching result:\n") wpf.write_parsings(err, "", True, mpars) # Checking the score computation: score_check = bef.compute_full_score_from_macro_parsings \ ( segs0, utype0, segs1, utype1, key_penalty ) if abs(score_check - score) > 1.0e-7: err.write(f"{score = :.9f} {score_check = :.9f}\n") assert False, "score does not check" err.write("=================================\n") return bimatch, score # ---------------------------------------------------------------------- def test_match_0(): err.write(f"=== test_match_0() ===========\n") text0 = "...XX........XaX..." text1 = "...ZZ..ZaZ...ZaZ..." bpvar_A1 = ( 0.000, 'XX', 'YY', ) bpvar_A2 = ( 0.020, 'XX', 'ZZ', ) bpvar_A3 = ( 0.040, 'Xa?X', 'Za?Z', ) bipatA = ( bpvar_A1, bpvar_A2, bpvar_A3 ) bitemp = ( bipatA, bipatA ) bimatch, score = test_match(text0, "ec", text1, "ec", bitemp, ' ') return # ---------------------------------------------------------------------- def test_match_1(with_dain): err.write("\n\n\n") err.write(f"=== test_match_1({with_dain=!r}) ===========\n") text0 = \ "主杀鬼肪主耳聋肠主遗溺肶胵裹黄皮" text1 = \ "daiinchsdqokeeeydaiinokaiinotaiindaiincheolkallkldaindoeeokcheeoltaiinotcheedychoraiino" if not with_dain: text1 = re.sub(r"dain", "xxxx", text1) bipat_USES = stdbip.get_bencao_starps_bipattern('USES') bitemp = ( bipat_USES, bipat_USES, bipat_USES, ) bimatch, score = test_match(text0, "ch", text1, "ec", bitemp, ' ') err.write(f"{bimatch = !r}\n") err.write(f"{score = :.8f}\n") assert len(bimatch) == 3 assert bimatch[0][0:4] == (0,1,0,5) # 'daiin' if with_dain: assert bimatch[1][0:4] == (4,5,22,27) # 'kaiin' assert bimatch[2][0:4] == (8,9,49,53) # 'dain' else: assert bimatch[1][0:4] == (4,5,16,21) # 'daiin' assert bimatch[2][0:4] == (8,9,33,38) # 'daiin' return # ---------------------------------------------------------------------- def test_match_2(): err.write("\n\n\n") err.write(f"=== test_match_2() ===========\n") text0 = \ "蜂子主治风头除蛊毒补虚羸伤中久服令人光泽好颜色不老大黄蜂子主心腹胀满痛轻" + \ "身益气土蜂子主治痈肿" text1 = \ "deechdyopchedaiinypchedyodalychedyqopcheokaiinshedypodairochedalloiinchedy" + \ "qokaiinchdydaiindchdoseedolchdolkchedychokaiinchdyqoeedyokchedydaiiinchedy" + \ "daiinykeedyokeeedychedolchdaiinykardarycheold" + \ "chedydkchsaiinchdedyqodaiinokchedaiinchain" bipat_USES = stdbip.get_bencao_starps_bipattern('USES') bipat_QI = stdbip.get_bencao_starps_bipattern('QI') bipat_LONG = stdbip.get_bencao_starps_bipattern('LONG') bitemp = ( bipat_USES, bipat_LONG, bipat_USES, bipat_QI, bipat_USES, ) bimatch, score = test_match(text0, "ch", text1, "ec", bitemp, ' ') err.write(f"{bimatch = !r}\n") err.write(f"{score = :.8f}\n") assert len(bimatch) == 5 return # ---------------------------------------------------------------------- def test_match_3(with_dain, with_AMENO): err.write("\n\n\n") err.write(f"=== test_match_3({with_dain=!r},{with_AMENO=!r}) ===========\n") # (w1a1d0g0) 74 hanzi text0 = \ "丹雄鸡主女人崩中漏下赤白沃补虚温中止血通神杀毒辟不祥头主杀鬼肪主耳聋肠主遗溺肶胵" + \ "裹黄皮主泄利屎白主消渴伤寒寒热翮羽主下血闭鸡子主除热火疮痫痉可作虎魄" assert len(text0) == 74 # 371 eva text1 = \ "poarkeeodaiinqoaiinaracpheeyqoeedeodyqokaiinqotedaiinaporaiinapylsheod" + \ "ytaiinoteeyoteeoolotaiinokeeyqokaiinoraiiinaldalsheeodaiinchsdqokeeeyd" + \ "aiinokaiinotaiinchedaiinolkallkldaindoeeokcheeoltaiinotcheedychoraiino" + \ "daiinchedyotaiinalkaishdlaiinsheodokeeodyqoaiinytaiinotaiinchdaldydaii" + \ "inchdaiinockheyysheyckhysheoqoeeolkaiinchsokoltchdysheeeyokaiinaraildy" + \ "cheodyoaiiinainokshey" assert len(text1) == 371 if not with_dain: text1 = re.sub(r"dain", "xxxx", text1) bipat_USES = stdbip.get_bencao_starps_bipattern('USES') bipat_AMENO = stdbip.get_bencao_starps_bipattern('AMENO') if with_AMENO: bitemp = ( bipat_USES, ) * 7 + ( bipat_AMENO, bipat_USES, ) else: bitemp = ( bipat_USES, ) * 8 bimatch, score = test_match(text0, "ch", text1, "ec", bitemp, ' ') err.write(f"{bimatch = !r}\n") err.write(f"{score = :.8f}\n") assert bimatch[0][0:4] == ( 3, 4, 8, 13) # 'daiin' assert bimatch[1][0:4] == ( 27, 28, 123, 128) # 'daiin' assert bimatch[2][0:4] == ( 31, 32, 139, 144) # 'daiin' assert bimatch[3][0:4] == ( 35, 36, 159, 164) # 'daiin' assert bimatch[4][0:4] == ( 43, 44, 210, 215) # 'daiin' assert bimatch[5][0:4] == ( 48, 49, 234, 239) # 'laiin' assert bimatch[6][0:4] == ( 57, 58, 284, 289) # 'daiin' if with_AMENO: assert bimatch[7][0:4] == ( 59, 61, 295, 304) # 'ysheyckhy' assert bimatch[8][0:4] == ( 63, 64, 314, 319) # 'laiin' else: assert bimatch[7][0:4] == ( 63, 64, 314, 319) # 'laiin' return # ---------------------------------------------------------------------- if len(sys.argv) > 1 and sys.argv[1] == "BTF.TEST": test_match_0() test_match_3(with_dain = False, with_AMENO = False) test_match_3(with_dain = True, with_AMENO = False) test_match_3(with_dain = False, with_AMENO = True ) test_match_1(with_dain = False) test_match_1(with_dain = True) test_match_2()