#! /usr/bin/python3
# Last edited on 2026-05-01 15:42:43 by stolfi

import os, sys, re
from sys import stderr as err, stdout as out
import write_parsing_funcs as wpf
import size_position_funcs as spf
from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite

# The procedures of this module are meant to evaluate a bimatching
# between the clean text {text0} of type {utype0} and the clean text
# {text1} of type {utype1}. The evaluation is summarized in a
# non-negative /badness score/ for the bimatching. The best possible
# bimatching has badness zero.
#
# The most important case is when {text0} is the clean text (hanzi only)
# of an SBJ entry, with {utype0 = "ch"}; and {text1} is the clean text
# (EVA letters [a-z?] only) of an SPS parag, with {utype1 = "ec"}.
#
# A bimatching {bimatch}, as produced by {match_bitemplate} in
# {bitemplate_match_funcs}, is a list of a certain number {nh} of
# /rungs/, quintuples {(i0,j0,i1,j1,bpvar)}. It describes two
# corresponding macro-parsings {segs0} of {text0} and {segs1} of
# {text1}. Each macro-parsing is a partition of the corresponding text
# into an odd number {ns = 2*nh+1} of /segments/, alternating {ng =
# nh+1} /gaps/ and {nh} /hits/.
#
# The full score of a bimatching is the sum of
#
#   * a /total size score/ that depends only on the
#     total sizes of the two texts and the number of hits {nh};
#
#   * a /gap size score/ that depends on the sizes
#     of the {ng} gaps of {segs0} and {segs1};
#
#   * a /key penalty/ which is the sum of the {pena} field
#     in the {pvar} field of the rungs of the bimatching.
#
# Note that the score does not depend on the actual texts or the sizes
# of the hits.
#
# Note also that the total size score does not depend on the bimatching
# except through the count {nh}.

def compute_full_score_from_bimatching(tsize0, utype0, tsize1, utype1, bimatch): # Computes the full badness score of the bimatching {bimatch}, # given the total sizes {tsize0,tsize1} of the two texts and their # types {utype0,utype1}. nh = len(bimatch) gsizes0, gsizes1, key_penalty = get_gap_sizes_from_bimatching(bimatch) score = compute_full_score_from_gap_sizes(tsize0, gsizes0, utype0, tsize1, gsizes1, utype1, key_penalty) return score # ---------------------------------------------------------------------- def compute_full_score_from_macro_parsings(segs0, utype0, segs1, utype1, key_penalty): # Computes the full badness score of a bimatching between two texts given their # macro-parsings {segs0,segs1} as defined by the bimatching, their types {utype0,utype1}, # and the total keyword penalty {key_penalty} of the rungs of the bimatching. ns = len(segs0); assert ns == len(segs1) nh = ns//2; ng = nh + 1; assert ns == ng + nh tsize0 = 0; for sg in segs0: tsize0 += len(sg) tsize1 = 0; for sg in segs1: tsize1 += len(sg) gsizes0, tgsize0 = get_gap_sizes(segs0) gsizes1, tgsize1 = get_gap_sizes(segs1) score = compute_full_score_from_gap_sizes(tsize0, gsizes0, utype0, tsize1, gsizes1, utype1, key_penalty) return score # ---------------------------------------------------------------------- def compute_full_score_from_gap_sizes(tsize0, gsizes0, utype0, tsize1, gsizes1, utype1, key_penalty): # Computes the full badness score of a bimatching with {nh} rungs between two texts, given the # sizes {tsize0,tsize1} of the texts, the {ng = nh+1} gap sizes # {gsizes0,gsizes1} implied by the bimatching, and the # total keyword penalty {key_penalty} in the rungs of the bimatching. 
debug = True ng = len(gsizes0); assert ng == len(gsizes1) nh = ng - 1 if debug: err.write(f"!S {tsize0 = } {utype0 = !r} {tsize1 = } {utype1 = !r} {nh = }\n") score = 0 tsize_score = compute_total_size_score(tsize0, utype0, tsize1, utype1, nh) score += tsize_score if debug: err.write(f"!S total size score = {tsize_score:.6f}\n") gaps_score = compute_gaps_score(gsizes0, utype0, gsizes1, utype1) score += gaps_score if debug: err.write(f"!S gaps score = {gaps_score:.6f} accum {score:.6f}\n") score += key_penalty if debug: err.write(f"!S key penalty = {key_penalty:.6f} accum {score:.6f}\n") return score # ---------------------------------------------------------------------- def compute_total_size_score(tsize0, utype0, tsize1, utype1, nh): # Computes the component of the badness score of a bimatching between # two texts that is due to the discrepancy in their total sizes # {tsize0,tsize1}, and the number {nh} of hits (which is the number of # rungs in the bimatching). etgsize0 = estimate_total_gap_size(tsize0, utype0, nh) etgsize1 = estimate_total_gap_size(tsize1, utype1, nh) exp_etgsize1 = expected_size1_from_size0(etgsize0, utype0, utype1) tsz_wt = 1.00 tsize_score = tsz_wt*compute_single_size_score(etgsize1, exp_etgsize1) return tsize_score # ---------------------------------------------------------------------- def estimate_total_gap_size(tsize, utype, nh): # Estimates the total gap size in {text} of type {utype} assuming that # it will contain {nh} hits and each hit corresponds to # a fixed number of hanzi. hitsz_ch = 1.500 # Estimated average hit size in hanzi. trimmed = True ch_per_un = spf.hanzi_per_unit(utype, trimmed) etsz_hit_un = nh*hitsz_ch/ch_per_un etsz_gap_un = max(0, tsize - etsz_hit_un) # Smooth cooking a minimum: min_avgsz_gap_un = 0.5 # Min assumed average gap size in {utype} units. 
etsz_gap_un = hypot((nh + 1)*min_avgsz_gap_un, etsz_gap_un) etsz_gap_un = int(ceil(etsz_gap_un)) return etsz_gap_un # ---------------------------------------------------------------------- def compute_gaps_score(gsizes0, utype0, gsizes1, utype1): # Computes the gap sizes component of the badness score of a bimatching # given the actual sizes {gsizes0[0..ng-1],gsizes1[0..ng-1]} # of the gaps determined by the bimatching. debug = False ng = len(gsizes0); assert len(gsizes1) == ng gaps_score = 0 for ig in range(ng): gsci = compute_single_gap_score(gsizes0[ig], utype0, gsizes1[ig], utype1, ig, ng) gaps_score += gsci if debug: err.write(f"!G gap score = {gsci:.6f} accum = {gaps_score:.6f}\n") if debug: err.write(f"!G gap score final = {gaps_score:.6f}\n") return gaps_score # ---------------------------------------------------------------------- def compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng): # Computes the contribution to the the badness score of a bimatching # due to a single gap, whose size is {gsize0} on a text of type {unit0} and # the size {gsize1} of a supposedly matchng text of a type {uptype1}. # # The contribution is properly weighted based on the index {ig} # of the gap and the number of {ng} of gaps. debug = False if debug: err.write(f"\n") exp_gsize1 = expected_size1_from_size0(gsize0, utype0, utype1) wt = 0.30 if ig == 0 else 0.30 if ig == ng-1 else 1.00 gap_score = wt * compute_single_size_score(gsize1, exp_gsize1) if debug: err.write(f"!g {gsize1 = } {exp_gsize1 = } {wt = :.6f}\n") return gap_score # ---------------------------------------------------------------------- def compute_single_size_score(sz, esz): # Mismatch score between a single size {sz} and the expected size range {esz}. 
debug = False frac = spf.compute_frac_error(sz, esz) score = frac**2 if debug: err.write(f"!s {frac = :.6f} {score = :.6f}\n") return score # ---------------------------------------------------------------------- def expected_size1_from_size0(size0, utype0, utype1): # Computes the expected size {exp_size1} of some text of type {utype1} # given the size of {size0} of the supposedly corresponding text of type {utype0} # The result is a range, computed assuming a fixed internal conversion # factor of hanzi characters to EVA letters. un1_per_un0 = spf.hanzi_per_unit(utype0, True)/spf.hanzi_per_unit(utype1, True) eps = 0.0001 exp_size1 = spf.scale_size_fuzzy(size0, un1_per_un0, eps) return exp_size1 # ---------------------------------------------------------------------- def get_gap_sizes(segs): # Given a macro-parsing {segs} with {nh} hits and {ng = nh+1} gaps, # returns a list {sizes[0..ng-1]} of the lengths of # gaps (omitting the hits), and their total {tgsize}. ns = len(segs); nh = ns//2; ng = nh+1; assert ns == ng + nh sizes = [ len(segs[2*ig]) for ig in range(ng) ] tgsize = 0; for sz in sizes: tgsize += sz return sizes, tgsize # ---------------------------------------------------------------------- def get_hit_sizes(segs): # Given a macro-parsing {segs} with {nh} hits and {ng=nh+1} gaps, # returns a list {sizes[0..nh-1]} of the lengths of # hits (omitting the gaps), and their total {thsize}. 
ns = len(segs); nh = ns//2; ng = nh+1; assert ns == ng + nh sizes = [ len(segs[2*ih + 1]) for ih in range(nh) ] thsize = 0; for sz in sizes: thsize += sz return sizes, thsize # ---------------------------------------------------------------------- def test_stuff(): test_compute_full_score_from_macro_parsings_1() test_compute_full_score_from_macro_parsings_2() return # ---------------------------------------------------------------------- def test_compute_full_score_from_macro_parsings_1(): segs_ch = \ ( "龙骨", "主治", "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热", "气", "惊痫齿", "主治", "惊痫癫疾狂走心下结", "气", "不能喘息诸痉杀精物", "久", "服轻身通神明延年", ) loc_ec = "f106r.42" segs_ec = \ ( "pcheodarshol", "kain", "okshchedyqoteeyshotchyqotylpaiinshedylardaiiralsheoldaiinotedyqokainakarcheoraltaiinchekalotarard", "shedy", "qoteeyotaiinchychealolchlch", "daiin", "otyotairotaiikamysheedalokainakainotarkaiin", "chda", "lkairolkaisalkeedyokalsotchdaiinshadaiinot?yqokeedyq", "okaiin", "ykarqokaincheedylolycheoarokainqokaincharokycheokam", ) key_penalty = 1.620 do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty) return # ---------------------------------------------------------------------- def test_compute_full_score_from_macro_parsings_2(): segs_ch = \ ( "龙骨", "主治", "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热", "气", "惊痫齿", "主治", "惊痫癫疾狂走心下结", "气", "不能喘息诸痉杀精物", "久服", "轻身通神明延年", ) loc_ec = "f104v.1" segs_ec = \ ( "pch", "daiin", "opcheedyoraroltcheeyopchedyolearaiiralycheodaiincheekaindamychedaiinqoteedchockhyotaiinydaiinqokamdyotararal", "chedo", "tairoramshodchedyqotaiino", "daiin", "okeolockhhycholqokeedyqotairoeedaiinoldlqoteedy", "cheda", "iinchokarqotolqotchedcholcheyqolchedyqoeeeyq", "okeedy", "dcheolchdeeyoeeodainsairolchedal", ) key_penalty = 0.200 do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty) return # ---------------------------------------------------------------------- def do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty): 
err.write(f"=== testing compute_full_score_from_macro_parsings ===\n") err.write(f"{loc_ec = }\n") score = compute_full_score_from_macro_parsings \ ( segs_ch, "ch", segs_ec, "ec", key_penalty ) err.write(f"{score = :.6f}\n") return # ---------------------------------------------------------------------- if len(sys.argv) == 2 and sys.argv[1] == "BEF.TEST": test_stuff()