#! /usr/bin/python3
# Last edited on 2026-05-18 07:54:51 by stolfi

import os, sys, re
from sys import stderr as err, stdout as out
import write_parsing_funcs as wpf
import size_position_funcs as spf
from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite

# The procedures of this module are meant to evaluate a bimatching
# between the clean text {text0} of type {utype0} and the clean text
# {text1} of type {utype1}. The evaluation is summarized in a
# non-negative /badness score/ for the bimatching. The best possible
# bimatching has badness zero.
#
# The most important case is when {text0} is the clean text (hanzi only)
# of an SBJ entry, with {utype0 = "ch"}; and {text1} is the clean text
# (EVA letters [a-z?] only) of an SPS parag, with {utype1 = "ec"}.
#
# A bimatching {bimatch}, as produced by {match_bitemplate} in
# {bitemplate_match_funcs}, is a list of a certain number {nh} of
# /rungs/, quintuples {(i0,j0,i1,j1,bpvar)}. It describes two
# corresponding macro-parsings {segs0} of {text0} and {segs1} of
# {text1}. Each macro-parsing is a partition of the corresponding text
# into an odd number {ns = 2*nh+1} of /segments/, alternating {ng =
# nh+1} /gaps/ and {nh} /hits/.
#
# The full score of a bimatching is the sum of
#
#   * a /gap size score/ that depends on the sizes
#     of the {ng} gaps of {segs0} and {segs1}.
#
#   * a /key penalty/ which is the sum of the {pena} field
#     in the {pvar} field of the rungs of the bimatching.
#
# Note that the score does not depend on the actual texts or the sizes
# of the hits.
#

def compute_full_score_from_bimatching(nt0, utype0, nt1, utype1, bimatch):
    # Computes the full badness score of the bimatching {bimatch},
    # given the sizes {nt0,nt1} of the two texts and their types
    # {utype0,utype1}.
    #
    # The gap sizes of both texts and the total key penalty are obtained
    # from {bmf.get_gap_sizes_from_bimatching}, which is expected to
    # return the triple {(gsizes0, gsizes1, key_penalty)}; the result is
    # then {compute_full_score_from_gap_sizes} applied to those values.
    #
    # The alias {bmf} was not bound anywhere in this file, so every call
    # used to fail with {NameError}. It is bound here to
    # {bitemplate_match_funcs}, the module named in the header comment as
    # the producer of bimatchings.
    # NOTE(review): confirm that this is the module that defines
    # {get_gap_sizes_from_bimatching}.
    # The import is done at call time so that it stays harmless if that
    # module happens to import this one.
    import bitemplate_match_funcs as bmf

    gsizes0, gsizes1, key_penalty = \
        bmf.get_gap_sizes_from_bimatching(nt0, nt1, bimatch)
    return compute_full_score_from_gap_sizes(
        gsizes0, utype0, gsizes1, utype1, key_penalty)
# ----------------------------------------------------------------------

def compute_full_score_from_macro_parsings(segs0, utype0, segs1, utype1, key_penalty):
    # Computes the full badness score of a bimatching between two texts,
    # given their macro-parsings {segs0,segs1} as defined by the
    # bimatching, their types {utype0,utype1}, and the total keyword
    # penalty {key_penalty} of the rungs of the bimatching.
    #
    # Both macro-parsings must have the same number {ns} of segments,
    # and {ns} must be odd ({ng = nh+1} gaps alternating with {nh} hits).
    ns = len(segs0)
    assert ns == len(segs1)
    nh = ns // 2
    ng = nh + 1
    assert ns == ng + nh  # Fails when {ns} is even.
    return compute_full_score_from_gap_sizes(
        get_gap_sizes(segs0), utype0,
        get_gap_sizes(segs1), utype1,
        key_penalty)
# ----------------------------------------------------------------------

def compute_full_score_from_gap_sizes(gsizes0, utype0, gsizes1, utype1, key_penalty):
    # Computes the full badness score of a bimatching with {nh} rungs
    # between two texts, given the {ng = nh+1} gap sizes
    # {gsizes0,gsizes1} implied by the bimatching, and the total keyword
    # penalty {key_penalty} in the rungs of the bimatching.
    #
    # The result is the gaps score (see {compute_gaps_score}) plus
    # {key_penalty}. The two lists must have the same length.
    debug = False
    ng = len(gsizes0)
    assert ng == len(gsizes1)
    nh = ng - 1
    if debug:
        err.write(f"!S {utype0 = !r} {utype1 = !r} {nh = }\n")

    score = 0

    gaps_score = compute_gaps_score(gsizes0, utype0, gsizes1, utype1)
    score += gaps_score
    if debug:
        err.write(f"!S gaps score = {gaps_score:.6f} accum {score:.6f}\n")

    score += key_penalty
    if debug:
        err.write(f"!S key penalty = {key_penalty:.6f} accum {score:.6f}\n")

    return score
# ----------------------------------------------------------------------

def compute_gaps_score(gsizes0, utype0, gsizes1, utype1):
    # Computes the gap sizes component of the badness score of a
    # bimatching, given the actual sizes
    # {gsizes0[0..ng-1],gsizes1[0..ng-1]} of the gaps determined by the
    # bimatching.
    #
    # The result is the sum of {compute_single_gap_score} over all {ng}
    # pairs of corresponding gaps. The two lists must have the same
    # length.
    debug = False
    ng = len(gsizes0)
    assert len(gsizes1) == ng
    gaps_score = 0
    for ig, (gsz0, gsz1) in enumerate(zip(gsizes0, gsizes1)):
        gsci = compute_single_gap_score(gsz0, utype0, gsz1, utype1, ig, ng)
        gaps_score += gsci
        if debug:
            err.write(f"!G gap score = {gsci:.6f} accum = {gaps_score:.6f}\n")
    if debug:
        err.write(f"!G gap score final = {gaps_score:.6f}\n")
    return gaps_score
# ----------------------------------------------------------------------

def compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng):
    # Computes the contribution to the badness score of a bimatching
    # due to a single gap, whose size is {gsize0} on a text of type
    # {utype0} and {gsize1} on the supposedly matching text of type
    # {utype1}.
    #
    # The contribution is weighted based on the index {ig} of the gap
    # and the number {ng} of gaps: the first and last gaps get relative
    # weight 0.50, all others 1.00, and the weight is then divided
    # by {ng}.
    debug = False
    if debug:
        err.write(f"\n!g {gsize0 = :3d} {gsize1 = :3d} {ig = :2d} {ng = :2d}")
    exp_gsize1 = expected_size1_from_size0(gsize0, utype0, utype1)
    # End gaps count half as much as interior gaps:
    wt = 0.50 if ig in (0, ng - 1) else 1.00
    wt /= ng
    score = wt * compute_single_size_score(gsize1, exp_gsize1)
    if debug:
        err.write(f" {wt = :.6f} {score = :7.3f}\n")
    return score
# ----------------------------------------------------------------------

def compute_single_size_score(sz, esz):
    # Mismatch score between a single size {sz} and the expected size
    # range {esz}. It is {0.20*frac**2}, where {frac} is the value
    # returned by {spf.compute_frac_error(sz, esz)}.
    debug = False
    if debug:
        err.write(f"!s {sz = :4.1f} esz = {esz[0]:3d}..{esz[1]:3d}")
    frac = spf.compute_frac_error(sz, esz)
    score = 0.20 * frac**2
    if debug:
        err.write(f" {frac = :.6f} {score = :.6f}\n")
    return score
# ----------------------------------------------------------------------

def expected_size1_from_size0(size0, utype0, utype1):
    # Computes the expected size {exp_size1} of some text of type
    # {utype1}, given the size {size0} of the supposedly corresponding
    # text of type {utype0}.
    #
    # The result is a narrow range, computed assuming a fixed internal
    # conversion factor of hanzi characters to EVA letters. The factor
    # is the ratio of {spf.hanzi_per_unit} for the two types, and the
    # range is whatever {spf.scale_size_fuzzy} returns for that factor
    # with tolerance parameter {eps = 0.0001}.
    hz_per_un0 = spf.hanzi_per_unit(utype0, True)
    hz_per_un1 = spf.hanzi_per_unit(utype1, True)
    un1_per_un0 = hz_per_un0/hz_per_un1
    eps = 0.0001
    return spf.scale_size_fuzzy(size0, un1_per_un0, eps)
# ----------------------------------------------------------------------

def get_gap_sizes(segs):
    # Given a macro-parsing {segs} with {nh} hits and {ng = nh+1} gaps,
    # returns a list {sizes[0..ng-1]} of the lengths of the gaps
    # (omitting the hits). The gaps are the segments with even index.
    # The number of segments must be odd.
    ns = len(segs)
    nh = ns // 2
    ng = nh + 1
    assert ns == ng + nh  # Fails when {ns} is even.
    return [len(segs[2*ig]) for ig in range(ng)]
# ----------------------------------------------------------------------

def get_hit_sizes(segs):
    # Given a macro-parsing {segs} with {nh} hits and {ng = nh+1} gaps,
    # returns a list {sizes[0..nh-1]} of the lengths of the hits
    # (omitting the gaps). The hits are the segments with odd index.
    # The number of segments must be odd.
    ns = len(segs)
    nh = ns // 2
    ng = nh + 1
    assert ns == ng + nh  # Fails when {ns} is even.
    return [len(segs[2*ih + 1]) for ih in range(nh)]
# ----------------------------------------------------------------------

def test_stuff():
    # Runs all the self-tests of this module, writing to {stderr}.
    test_compute_single_gap_score()
    test_compute_full_score_from_macro_parsings_1()
    test_compute_full_score_from_macro_parsings_2()
    return
# ----------------------------------------------------------------------

def test_compute_single_gap_score():
    # Prints to {stderr} the score of an interior gap (index 1 of 3)
    # with 5 units of type "ch" against 0..39 units of type "ec".
    err.write("----------------------------------------\n")
    err.write("testing compute_single_gap_score ...\n")
    gs_ch = 5
    for gs_ec in range(40):
        sc = compute_single_gap_score(gs_ch, "ch", gs_ec, "ec", 1, 3)
        err.write(f"{gs_ch = :4d} {gs_ec = :4d} {sc = :7.3f}\n")
# ----------------------------------------------------------------------

def test_compute_full_score_from_macro_parsings_1():
    # Scores a sample pair of macro-parsings (5 hits, 6 gaps) of a
    # "ch" text and the "ec" text labeled "f106r.42".
    segs_ch = \
      ( "龙骨",
        "主治",
        "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热",
        "气",
        "惊痫齿",
        "主治",
        "惊痫癫疾狂走心下结",
        "气",
        "不能喘息诸痉杀精物",
        "久",
        "服轻身通神明延年",
      )
    loc_ec = "f106r.42"
    segs_ec = \
      ( "pcheodarshol",
        "kain",
        "okshchedyqoteeyshotchyqotylpaiinshedylardaiiralsheoldaiinotedyqokainakarcheoraltaiinchekalotarard",
        "shedy",
        "qoteeyotaiinchychealolchlch",
        "daiin",
        "otyotairotaiikamysheedalokainakainotarkaiin",
        "chda",
        "lkairolkaisalkeedyokalsotchdaiinshadaiinot?yqokeedyq",
        "okaiin",
        "ykarqokaincheedylolycheoarokainqokaincharokycheokam",
      )
    key_penalty = 1.620
    do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty)
    return
# ----------------------------------------------------------------------

def test_compute_full_score_from_macro_parsings_2():
    # Scores a sample pair of macro-parsings (5 hits, 6 gaps) of a
    # "ch" text and the "ec" text labeled "f104v.1".
    segs_ch = \
      ( "龙骨",
        "主治",
        "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热",
        "气",
        "惊痫齿",
        "主治",
        "惊痫癫疾狂走心下结",
        "气",
        "不能喘息诸痉杀精物",
        "久服",
        "轻身通神明延年",
      )
    loc_ec = "f104v.1"
    segs_ec = \
      ( "pch",
        "daiin",
        "opcheedyoraroltcheeyopchedyolearaiiralycheodaiincheekaindamychedaiinqoteedchockhyotaiinydaiinqokamdyotararal",
        "chedo",
        "tairoramshodchedyqotaiino",
        "daiin",
        "okeolockhhycholqokeedyqotairoeedaiinoldlqoteedy",
        "cheda",
        "iinchokarqotolqotchedcholcheyqolchedyqoeeeyq",
        "okeedy",
        "dcheolchdeeyoeeodainsairolchedal",
      )
    key_penalty = 0.200
    do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty)
    return
# ----------------------------------------------------------------------

def do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty):
    # Computes the full score of the macro-parsings {segs_ch} (type
    # "ch") and {segs_ec} (type "ec") with the given {key_penalty}, and
    # prints it to {stderr} together with the label {loc_ec}.
    err.write("----------------------------------------\n")
    err.write("=== testing compute_full_score_from_macro_parsings ===\n")
    err.write(f"{loc_ec = }\n")
    score = compute_full_score_from_macro_parsings \
      ( segs_ch, "ch", segs_ec, "ec", key_penalty )
    err.write(f"{score = :.6f}\n")
    return
# ----------------------------------------------------------------------

# Self-test entry point: runs only when the sole command line argument
# is "BEF.TEST".
if len(sys.argv) == 2 and sys.argv[1] == "BEF.TEST":
    test_stuff()