#! /usr/bin/python3 # Last edited on 2026-05-01 04:50:38 by stolfi import os, sys, re from sys import stderr as err, stdout as out import write_parsing_funcs as wpf import bimatching_eval_funcs as bef from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite def match_bitemplate(text0, text1, bitemp, bias, eval_one_gap): # Matches two given strings {text0,text1} to a bitemplate {bitemp}. # If there is more than one bimatching possible, # chooses the one with the lowest badness score. # # A /bitemplate/ is a list {bitemp[0..nr]} of /bipatterns/. # Each bipattern is a list of /variants/. Each variant is a # triple {(pena, pat0, pat1)} where {pena} is a non-negative float, # typically in {[0 _ 1]}; and {pat1,pat2} are two RE patterns, # neither of which matches the empty string. # # Let {nt0} and {nt1} be the lengths of {text0} and {text1}. A # /bimatching/ of {text0,text1} to {bitemp} is a list {bimatch} of # {nr} /rungs/. A rung {bimatch[kr]} is a quintuple # {(i0,j0,i1,j1,bpvar)} where {bpvar} is a variant of the bipattern # {bitemp[kr]}, as above; {i0,j0} are indices in {0..nt0} with {i0 <= # j0}, and the string {text0[i0:j0]} matches the RE pattern {pat0} in # {bpvar}; and ditto for {i1,j1,text1} with {pat1} of {bpvar}. Among # successive # # The procedure enumerates all the possible bimatchings # {bimatch[0..nr-1]} of {text0,text1} against the bitemplate # {bitemp[0..nr-1]}. For each bimatching it computes a badness # {score}, and selects the bimatching that yielded the lowest {score}. # # The badness {score} of a bimatching is computed by converting it # implicitly to a pair of macro-parsings {segs0,segs1} of {text0} and # {text1}, and calling {eval_one_gap(len(gap0),len(gap1),ig,ng)} on # each pair of corresponding gaps. The {score} is the sum of the given # {bias}, plus all the gap scores, plus the penalties of the variants # used to match the keywords. # # Returns the bimatching {opt_bimatch} that yielded the smallest # {score}, as well as its score. # # If there are no possible bimatchings, returns {None,+inf}. # debug_all = (text1[:10] == "fdeechdyop") debug_all = False verbose = True nt0 = len(text0); nt1 = len(text1) # Lengths of the texts. nr = len(bitemp) # Number of bipatterns. nh = nr; ng = nh + 1; ns = ng + nh # Number of hits, gaps, segs. # Universal keyword patterns (sans penalties and pairing) ukwpats0, ukwpats1 = get_universal_keyword_patterns(bitemp) if debug_all: err.write(f"!~ {nr = } {bias = }\n") err.write(f"!~ {nt0 = :3d} {text0 = !r}\n") err.write(f"!~ {nt1 = :3d} {text1 = !r}\\n") tot = dict() tot['evals'] = 0 # Number of calls to {eval_one_gap}. tot['whole'] = 0 # Number of complete bimatchings evaluated. tot['match'] = 0 # Number of paired keyword matches found. tot['nohit'] = 0 # Number of times it failed to find any pair of hits. tot['prune'] = 0 # Number of bimatchings abandoned before complete. tot['bueno'] = 0 # Number of complete bimatchings that improved optimum score. bimatch = [ None ] * nr # Rungs of partial bimatching. opt_bimatch = None # Best bimatchng of {text0,text1}. opt_score = +inf # The score of those parsings. def debug_bimatch(tag, kr): # Prints out {bimatch[0..kr-1]}. nonlocal bimatch for ir in range(kr): i0,j0,i1,j1,bpvar = bimatch[ir] hit0 = text0[i0:j0].ljust(10,' '); hit1 = text1[i1:j1].ljust(10,' '); err.write(f"{tag}") err.write(f" {i0:3d}:{j0:3d} '{hit0:<10s}'") err.write(f" {i1:3d}:{j1:3d} '{hit1:<10s}'\n") err.write("\n") return # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def match_one_bipat_at_cur_point(i0, i1, kr, score): # Tries to extend {bimatch[0..kr-1]} by # matching all the variants of the bipattern {bpk = bitemp[kr]} at # positions {i0} of {text0} and {i1} of {text1}. # # For each variant that matches, sets {bimatch[kr]} to the appropriate rung # and calls {match_aux} to extend it further. # # The {score} should be the sum of all gaps and hits in # {bimatch[0..kr-1]} including the gap that ends at {i0,i1} but NOT # the penalty of the next rung. # # Returns true iff found one variant that matched. nonlocal bimatch, opt_bimatch, opt_score, tot debug = debug_all or False ind = ("." * kr) if debug else None if debug: err.write(f"!~ {ind} maob {i0 = } {i1 = } {kr = } {score = :.3f}\n") assert i0 <= nt0 and i1 <= nt1 assert kr < nr bpk = bitemp[kr] tail0 = text0[i0:] tail1 = text1[i1:] nv = len(bpk) # Number of variants of the bipat. assert bimatch[kr] == None matched_some_pvar = False for iv in range(nv): bpvk = bpk[iv] penkv, patkv0, patkv1 = bpvk assert isinstance(penkv, float) and penkv >= 0 m0 = re.match(patkv0, tail0) m1 = re.match(patkv1, tail1) if m0 != None and m1 != None: tot['match'] += 1 matched_some_pvar = True assert m0.start(0) == 0; assert m1.start(0) == 0 # Update the score for the penalty of the additional rung: new_score = score + penkv if debug: hitkv0 = m0.group(0); hitkv1 = m1.group(0) err.write(f"!~ {ind} maob {iv = } {hitkv0 = !r} {hitkv1 = !r}") err.write(f" {penkv = :.3f} {new_score = :.3f}") if new_score >= opt_score: # Not worth extending with {bpk[iv]}: if debug: err.write(" >=\n") else: # Let's try extending: j0 = i0 + m0.end(0); j1 = i1 + m1.end(0); rngk = (i0,j0,i1,j1,bpvk) if debug: err.write(f" <\n") bimatch[kr] = rngk match_aux(j0, j1, kr+1, new_score) bimatch[kr] = None return matched_some_pvar # ---------------------------------------------------------------------- def quick_next_match_positions(kt0, i0, upat0, kt1, i1, upat1, kr): # Scans the integer grid {{kt0..nt0}×{kt1..nt1}} in lex order starting # from {(i0,i1)},and returns the first pair {(r0,r1)} such that # {text0[r0:]} matches {upat0} at start and {text1[r1:]} matches {upat1} # at start. # # If there is no such pair, returns {(nt0+1,nt1+1)} # # Assumes that {upat0} and/or {upat1} may match the empty string, # even though that should not be allowed. nonlocal bimatch, opt_bimatch, opt_score, tot debug = debug_all or False ind = ("." * kr) if debug else None if debug: err.write(f"!Q {ind} { kt0 = } {i0 = } {nt0 = } {upat0 = }\n") if debug: err.write(f"!Q {ind} { kt1 = } {i1 = } {nt1 = } {upat1 = }\n") assert 0 <= kt0 and kt0 <= i0 assert 0 <= kt1 and kt1 <= i1 assert i0 <= nt0 and i1 <= nt1 # Try the rest of the row {i0}, f any: if i1 > kt1: if debug: err.write(f"!Q {ind} trying {upat0 = !r} on text1 at position r0 = {i0 = } ...") m0 = re.match(upat0, text0[i0:]) if m0 != None: if debug: err.write(f" matches.\n") if debug: err.write(f"!Q {ind} trying {upat1 = !r} on text1 for r1 in {i1}..{nt1} ...") m1 = re.search(upat1, text1[i1:]) if m1 != None: r0 = i0; r1 = i1 + m1.start(0) if debug: err.write(f" matches at {r1 = }.\n") return r0, r1 else: if debug: err.write(f"not match.\n") i0 += 1; i1 = kt1 else: if debug: err.write(f" does not match.\n") i0 += 1; i1 = kt1 # Try from row {i0} forward, all cols: if i0 > nt0: return nt0+1, nt1+1 assert i1 == kt1 if debug: err.write(f"!Q {ind} trying {upat0 = !r} on text0 for r0 in {i0}..{nt0} ...") m0 = re.search(upat0, text0[i0:]) if m0 != None: r0 = i0 + m0.start(0); if debug: err.write(f" matched at {r0 =}.\n") else: if debug: err.write(f" no match.\n") return nt0+1, nt1+1 if debug: err.write(f"!Q {ind} trying trying {upat1 = !r} on text1 for r1 in {i0}..{nt0} ...") m1 = re.search(upat1, text1[i1:]) if m1 != None: r1 = i1 + m1.start(0) if debug: err.write(f" matched at {r1 =}.\n") else: if debug: err.write(f" no match.\n") return nt0+1, nt1+1 if debug: err.write(f"!Q {ind} chose { r0 = } {r1 = }\n") return r0, r1 # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def match_one_bipat_from_here_on(kt0, kt1, kr, score): # Tries to match the bipattern {bitemp[k]} at all possible pairs of # positions {i0,i1} in the integer grid {{kt0..nt0}×{kt1..nt1}} nonlocal bimatch, opt_bimatch, opt_score, tot debug = debug_all or False ind = ("." * kr) if debug else None assert kt0 <= nt0 and kt1 <= nt1 bpk = bitemp[kr] upat0k = ukwpats0[kr]; upat1k = ukwpats1[kr] i0 = kt0; i1 = kt1 nmatch = 0 # Positions {i0,i1} where {bpk} matched. while True: if i0 > nt0 or i1 > nt1: break i0, i1 = quick_next_match_positions(kt0, i0, upat0k, kt1, i1, upat1k, kr) if i0 > nt0 or i1 > nt1: break # There is a chance to match the bipattern {bpk} at {i0,i1}. # Accumlate the gap score: gszk0 = i0 - kt0; gszk1 = i1 - kt1 ig = kr; assert ig < ng gsck = eval_one_gap(gszk0, gszk1, ig, ng) tot['evals'] += 1 new_score = score + gsck if debug: err.write(f"!~ {ind} ma {i0 = } {gszk0 = } {i1 = } {gszk1 = }") err.write(f" {gsck = :.3f} {new_score = :.3f}") if new_score < opt_score: # Trying to add a rung at {(i0,i1)} if debug: err.write(" <\n") matched = match_one_bipat_at_cur_point(i0, i1, kr, new_score) if matched: nmatch += 1 else: if debug: err.write(" >=\n") tot['prune'] += 1 # Try another pair of positions: i1 += 1 if i1 > nt1: i0 += 1; i1 = kt1 return nmatch # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: def match_aux(kt0, kt1, kr, score): # Assumes that {bimatch[0..kr-1]} are defined for # some {kr <= nr} and cover the first {kt0} characters of {text0} # and the first {kt1} characters of {text1}. # Enumerates all complete parsings that extend that parsing, updaing # the optimum. # # The {score} should be the sum of all gaps and hits in {bimatch[0..kr-1]} # except the tail gap. nonlocal bimatch, opt_bimatch, opt_score, tot debug = debug_all or False ind = ("." * kr) if debug else None if debug: err.write(f"!~ {ind} ma {kt0 = } {kt1 = } {kr = } {score = :.3f}\n") assert kt0 <= nt0 and kt1 <= nt1 assert score < opt_score # jprev0, jprev1 = (0,0) if kr == 0 else (bimatch[kr-1][1], bimatch[kr-1][3]) if kr == nr: # Completed another bimatching: # Update score with the tail gaps: tot['whole'] += 1 gszf0 = nt0 - kt0; gszf1 = nt1 - kt1 ig = kr; assert ig == ng-1 gscf = eval_one_gap(gszf0,gszf1,ig,ng) tot['evals'] += 1 fin_score = score + gscf if debug: err.write(f"!~ {ind} ma {gszf0 = } {gszf1 = }") err.write(f" {gscf = :.3f} {fin_score = :.3f}") if fin_score < opt_score: if debug: err.write(" |\n") opt_bimatch = bimatch.copy() opt_score = fin_score tot['bueno'] += 1 else: if debug: err.write(" >=\n") tot['prune'] += 1 # No way to extend: return else: # Still need more hits: assert kr < nr nmatch = match_one_bipat_from_here_on(kt0, kt1, kr, score) if nmatch == 0: tot['nohit'] += 1 if debug: err.write(f"!~ ma {tot['nohit'] = }\n") return # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: debug = debug_all if debug: err.write(f"!~ starting {bias = :.3f}\n") if debug: err.write("!~ >>\n") match_aux(0,0,0, bias) if debug: err.write("!~ <<\n") if verbose: err.write(f"evaluated {tot['evals']} gap pairs.\n") err.write(f"considered {tot['whole']} whole matchings.\n") err.write(f"found {tot['match']} matching keyword pairs.\n") err.write(f"abandoned {tot['prune']} for excessive score.\n") err.write(f"abandoned {tot['nohit']} for failure to match key.\n") err.write(f"improved optimum {tot['bueno']} times.\n") err.write(f"final score {opt_score:.3f}.\n") # Checking result: if opt_bimatch != None: assert len(opt_bimatch) == nr j0_prev = 0; j1_prev = 0 for kr in range(nr): rngk = opt_bimatch[kr] if debug: err.write(f"!@ {rngk = !r}\n") assert rngk != None i0, j0, i1, j1, bpvar = rngk assert j0_prev <= i0 and i0 < nt0 and i0 <= j0 and j0 <= nt0 assert j1_prev <= i1 and i1 < nt1 and i1 <= j1 and j1 <= nt1 pena, pat0, pat1 = bpvar assert isinstance(pena, float) and pena >= 0 assert isinstance(pat0, str) assert isinstance(pat1, str) j0_prev = j0; j1_prev = j1 return opt_bimatch, opt_score # ---------------------------------------------------------------------- def extract_segments(text0, text1, bimatch): # Given two texts {text0,text1} and a bimatching {bimatch[0..nr-1]} # for them, returns the two macro-parsings {segs0,segs1} of those # texts implied by the bimatching. # # The list {segs0} has {ns=2*nr+1} strings. comprising the {nr} # substrings of {text0} that are defined by the indices {i0} and {j0} # of the rungs (the /hits/) and the substrings before, between,and # after those hits (the /gaps/). The list {segs1} has the {ns} gaps # and hits of {text1} defined by the indices {i1} and {j1} of the # rung. # # The procedre also returns a {key_penalty} which is the sum of the # {pena} components in their {bpvar} fields. nr = len(bimatch) nh = nr; ng = nh + 1; ns = ng + nh textp = ( text0, text1 ) segs = [ [ ], [ ] ] jtp_prev = [ 0, 0 ] # End or previous hit key_penalty = 0 for ir in range(nr): i0, j0, i1, j1, bpvar = bimatch[ir] pena, pat0, pat1 = bpvar assert isinstance(pena, float) and pena >= 0 itp = ( i0, i1 ) jtp = ( j0, j1 ) for side in range(2): jt_prev = jtp_prev[side] it = itp[side] jt = jtp[side] tt = textp[side] nt = len(tt) assert jt_prev <= it <= nt and it <= jt <= nt segs[side].append(tt[jt_prev:it]) # Gap segs[side].append(tt[it:jt]) # Hit assert isinstance(pena, float) and pena >= 0 key_penalty += pena jtp_prev = jtp # Last gaps: segs[0].append(text0[jtp_prev[0]:]) segs[1].append(text1[jtp_prev[1]:]) return segs[0], segs[1], key_penalty # ---------------------------------------------------------------------- def get_universal_keyword_patterns(bitemp): # Given a list of {bitemp[0..nr-1]} of bipatterns, # returns two lists {ukwpats0,ukwpats1} each with {nr} # strings. The element {ukwpats0[kr]} is a single RE # pattern that will match all the substrings # of a text that would be matched by any patterns # {pat0} in any variant {pvar=(pena,pat0,pat1)} of # the bipattern {bitemp[kr]}. Similarly # {ukwpats1[kr]} subsumes all patterns {pat1} # of those variants. nr = len(bitemp) ukwpats0 = [None] * nr; ukwpats1 = [None] * nr; for kr in range(nr): bpk = bitemp[kr] pats0 = [] pats1 = [] for pvar in bpk: pena, pat0, pat1 = pvar pats0 += pat0.split('|') pats1 += pat1.split('|') ukwpats0[kr] = "|".join(tuple(set(tuple(pats0)))) ukwpats1[kr] = "|".join(tuple(set(tuple(pats1)))) assert re.match(ukwpats0[kr], "") == None, f"bipatern {kr} side 0 matches empty" assert re.match(ukwpats1[kr], "") == None, f"bipatern {kr} side 1 matches empty" return ukwpats0,ukwpats1 # ---------------------------------------------------------------------- def test_match(text0, utype0, text1, utype1, bitemp, pad0): err.write("=== testing {match_bitemplate} ===\n") debug = True nr = len(bitemp) nh = nr; ng = nh + 1 tsize0 = len(text0) tsize1 = len(text1) def eval_one_gap(gsize0, gsize1, ig, ng): debug = False if debug: err.write("!e {gsize0 = !r} {gsize1 = !r}\n") gap_score = bef.compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng) if debug: err.write(f"!e {gap_score = :.3f}\n") return gap_score # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: # Find {bimatch[0..nr]}, the best matching of {bitemp}: err.write("computing bias ...\n") bias = bef.compute_total_size_score(tsize0, utype0, tsize1, utype1, nh) if debug: err.write(f"{tsize0 = } {utype0 = !r} {tsize1 = } {utype1 = !r} {nh = } {bias = :.3f}\n") err.write("matching ...\n") bimatch, score = match_bitemplate(text0, text1, bitemp, bias, eval_one_gap) if bimatch == None: assert score == +inf assert False, "matching failed!" else: err.write(f"{score = :.3f}\n") assert len(bimatch) == nr err.write("extracting the segments ...\n") segs0, segs1, key_penalty = extract_segments(text0, text1, bimatch) err.write(f"{key_penalty = :.3f} {score = :.3f}\n") mpars = ((segs0, '?', pad0,), (segs1, '?', ' ',),) err.write("matching result:\n") wpf.write_parsings(err, "", True, mpars) # Checking the score computation: score_check = bef.compute_full_score_from_macro_parsings \ ( segs0, utype0, segs1, utype1, key_penalty ) if abs(score_check - score) > 1.0e-6: err.write(f"{score = :.8f} {score_check = :.8f}\n") assert False, "score does not check" err.write("=================================\n") return # ---------------------------------------------------------------------- def test_match_0(): text0 = "...XX........XaX..." text1 = "...ZZ..ZaZ...ZaZ..." bpvar_A1 = ( 0.000, 'XX', 'YY', ) bpvar_A2 = ( 0.020, 'XX', 'ZZ', ) bpvar_A3 = ( 0.040, 'Xa?X', 'Za?Z', ) bipatA = ( bpvar_A1, bpvar_A2, bpvar_A3 ) bitemp = ( bipatA, bipatA ) test_match(text0, "ec", text1, "ec", bitemp, ' ') return # ---------------------------------------------------------------------- def test_match_1(): text0 = \ "蜂子主治风头除蛊毒补虚羸伤中久服令人光泽好颜色不老大黄蜂子主治心腹胀满痛轻" + \ "身益气土蜂子主治痈肿" text1 = \ "deechdyopchedaiinypchedyodalychedyqopcheokaiinshedypodairochedalloiinchedy" + \ "qokaiinchdydaiindchdoseedolchdolkchedychokaiinchdyqoeedyokchedydaiiinchedy" + \ "daiinykeedyokeeedychedolchdaiinykardarycheold" + \ "chedydkchsaiinchdedyqodaiinokchedaiinchain" bpvar_USES_1 = ( 0.000, '主治|主', 'daiin' ) bpvar_USES_2 = ( 0.100, '主治|主', 'dair|dain|laiin' ) bpvar_USES_3 = ( 0.300, '主治|主', 'kaiin' ) bpvar_USES_4 = ( 0.600, '主治|主', '[dklrs][ao]ii?i?[rn]' ) bipat_USES = ( bpvar_USES_1, bpvar_USES_2, bpvar_USES_3, bpvar_USES_4, ) bpvar_QI_1 = ( 0.000, '气', 'chedy' ) bpvar_QI_2 = ( 0.100, '气', 'ched[ao]' ) bpvar_QI_3 = ( 0.500, '气', '[cs]he?[dk][yao]' ) bipat_QI = ( bpvar_QI_1, bpvar_QI_2, bpvar_QI_3, ) bpvar_LONG_1 = ( 0.000, '久服|久食', 'okeedy|okaiin' ) bpvar_LONG_2 = ( 0.020, '久|服|食', 'okeedy|okaiin' ) bpvar_LONG_3 = ( 0.200, '久服|久食', '[oy][ktd]ee[dk][aoy]|[oy][kt][ao]ii?n' ) bpvar_LONG_4 = ( 0.220, '久|服|食', '[oy][ktd]ee[dk][aoy]|[oy][kt][ao]ii?n' ) bipat_LONG = ( bpvar_LONG_1, bpvar_LONG_2, bpvar_LONG_3, bpvar_LONG_4, ) bitemp = ( bipat_USES, bipat_LONG, bipat_USES, bipat_QI, bipat_USES, ) test_match(text0, "ch", text1, "ec", bitemp, ' ') return # ---------------------------------------------------------------------- if len(sys.argv) > 1 and sys.argv[1] == "BMF.TEST": test_match_0() test_match_1()