# Last edited on 2026-04-30 03:12:06 by stolfi # def ???compute_gaps_score_from_gap_sizes(gsizes_ch, tgsize_ch, gsizes_ec, tgsize_ec): # ng = len(gsizes_ch); assert len(gsizes_ec) == ng # # exp_gsizes_ec, exp_tgsize_ec = \ # ??? spf.estimate_gap_size_ranges(gsizes_ch, tgsize_ch, 'ec') # # # Bias term based on total gap size: # ??? # # # Sum of individual gap scores: # gscore = compute_gaps_score(gsizes_ec, exp_gsizes_ec) # score = tscore + gscore/sqrt(ng) # return score # # ---------------------------------------------------------------------- # # def ???compute_badness_bias(tsize0, utype0, tsize1, utype1): # # Computes a badness term {bias} from the discrepancy # # betwwn the sizes {tsize0} in units of type {utype0} # # and {tsize1} of unit type {utype1}. # # # Convert both sizes to hanzi counts: # tsz0_ch = tsize0*spf.hanzi_per_unit(utype0, True) # tsz1_ch = tsize1*spf.hanzi_per_unit(utype0, True) # err.write(f"!+ {tsz0_ch = :.2f} {tsz1_ch = :.2f}\n") # # # Get a mean {tsza_ch} from the two: # tau = 1.0 # Fudge to avoid anomalies when either is near zero: # tsza_ch = sqrt((tsz0_ch + tau)*(tsz1_ch + tau)) - tau # # # Compare both to that mean size: # eps = 1.00 # Fudge to avoid anomalies etc. # rsz0 = (tsz0_ch + eps)/(tsza_ch + eps) # rsz1 = (tsz1_ch + eps)/(tsza_ch + eps) # bias = (rsz0-rsz1)**2 # return bias # # ---------------------------------------------------------------------- # # def compute_gaps_score(gsizes, exp_gsizes): # # Computes the badness score given the actual gap sizes {gsizes[0..ng-1]} # # and and expected gap sizes {exp_gsizes[0..ng-1]}. # ng = len(exp_gsizes); assert len(gsizes) == ng # # gaps_score = 0 # # def ???eval_gap_sizes(gsize0, etgsize0, gsize1, etgsize1, ng): # gap_score = 0 # rsz0 = gsize0/etgsize0 # rsz1 = gsize1/etgsize1 # gap_score = ng*4*(rsz0-rsz1)**2 # return gap_score # # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: # # # def ???estimate_gap_size_ranges(gsizes_ch, tgsize_ch, utype): # # Given the hanzi counts {gsizes_ch[0..ng-1]} of the gap # # substrings in the parsing of an entry, returns estimates # # {exp_gsizes_un[0..ng-1]} of the corresponding gap substrings # # in the compatible parsing of the SPS parag that corresponds to that # # entry. # # # # The starps_sizes are measured in EVA letters of {utype} is 'ec', # # and words (considering commas as spaces) if {utype} is 'wc'. # # # # These expected gap sizes are the hanzi gap sizes multiplied by # # an internal hanzi-to-utype scaling factor. The elements of # # {exp_gsizes_un} are ranges rather than integers to account for # # rounding errors and a modicum of variation in that factor. # # # # Also returns the expected range {exp_tgsize_un} for the sum of the . # # the gap sizes in that parsing of the SPS parag. # # # # units_per_hanzi = 1.0/hanzi_per_unit(utype,True) # Assumed avg units per hanzi. # eps = 0.0001 # exp_gsizes_un = scale_sizes_fuzzy(gsizes_ch, units_per_hanzi, eps) # exp_tgsize_un = scale_size_fuzzy(tgsize_ch, units_per_hanzi, eps) # return exp_gsizes_un, exp_tgsize_un # # ---------------------------------------------------------------------- # pinging = False # Printing pings. ping_step = 100 # Ping interval. ctping = -1 # Count of pings since pinging turned on, or {-1} if never. def do_ping(mark): # Reports an event of the current bimatching. nonlocal pinging, ping_step, ctping tev = tot['evals'] % ping_step ping = tev < 50 if ping: if not pinging and ctping >= 0: err.write("[...]"); ctping = 0 err.write(mark); err.flush() ctping += 1 else: if pinging: err.write("[...]\n"); err.flush(); ctping = 0 while ping_step * 10 < tot['evals']: ping_step *= 10; pinging = ping return # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: ??? def sub_score(s1, s2): ??? # Returns the score of substrings {s1,s2}. ??? @ns = len(s1); assert @ns == len(s2) ??? # Count matching and semi-matching characters: ??? score = 0 ??? for j in range(@ns): ??? if s1[j] == s2[j]: ??? score += 1.0 ??? else: ??? assert ec_class(s1[j]) == ec_class(s2[j]) ??? score += 0.5 ??? return score - 1 ??? # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: max_score = -inf max_pair = None for i1 in range(nt1-1): if tx1[i1] == '·': continue for i2 in range(nt2-1): if tx2[i2] == '·': continue j1 = i1; j2 = i2; while j1 < nt1 and j2 < nt2 and tx1[j1] != '·' and tx2[j2] != '·' and tec_class(tx1[j1]) == tec_class(tx2[j2]): j1 += 1; j2 += 1 @ns = j1 - i1; assert @ns == j2 - i2; if @ns >= 2: s1 = tx1[i1:j1]; f1 = (i1 + j1 + 1)/2/(nt1 + 1); w1 = 4*f1*(1 - f1) s2 = tx2[i2:j2]; f2 = (i2 + j2 + 1)/2/(nt2 + 1); w2 = 4*f2*(1 - f2) f12 = (f1 - f2); w12 = 1 - f12*f12; if debug: err.write(f" trying {s1 = !r} {w1 = :8.6f} {s2 = !r} {w2 = :8.6f} {w12 = :8.6f}") wt = w1*w2*w12; wt = wt*sqrt(wt) sc = wt*sub_score(s1, s2) if debug: err.write(f" {sc = :+9.4f}\n") if sc > max_score: max_score = sc; max_pair = (s1,s2,) score = max_score + start_points return score, max_pair ns = 0; if segs_wh != None: ns = max(ns, len(segs_wh)) if segs_ec != None: ns = max(ns, len(segs_ec)) wch = max(1, max_width(segs_wh)) wec = max(1, max_width(segs_ec)) for ks in range(ns): tag = "G" if ks % 2 == 0 else "H" sg_ch = "" if segs_wh == None else "?" if ks >= len(segs_wh) else segs_wh[ks] sg_ch = sg_ch.ljust(wch, ' ') sg_ec = "" if segs_ec == None else "??" if ks >= len(segs_ec) else segs_ec[ks] sg_ec = sg_ec.ljust(wec, ' ') wr.write(f"{lab} {tag} {sg_ec} | {sg_ch}\n") def get_text_ch_macro_parsing(cleantx_ch, kwords_ch): # Assumes that the cleaned text of the SBJ entry (hanzi only, without # punctuation or ASCII chars) is {cleantx_ch}. # # Let {nh} be {len(kwords_ch)}, and {ng???ns} ne {nh+1}. Partitions # {cleantx_ch} into alternating substrings {gaps???segs_ch[0..ng???ns-1]} and # {hits???segs_ch[0..nh-1]} where the latter are the substrings that match # the patterns {kwords_ch[0..nh-1]}, in that order. # # Returns the lists {gaps???segs_ch} and {hits???segs_ch}. nh = len(kwords_ch); ng???ns = nh + 1 # Just in case, remove spaces from the hanzi texts cleantx_ch = re.sub(r"<[^<>]*>", "", cleantx_ch) cleantx_ch = re.sub(r"[ \012]", "", cleantx_ch) gaps???segs_ch, hits???segs_ch = \ mmf.find_multi_pattern_occurrences(cleantx_ch, kwords_ch) err.write(f"!@ {gaps???segs_ch = } {hits???segs_ch = }\n") assert len(hits???segs_ch) == nh, "hit counts don't match" assert len(gaps???segs_ch) == ng???ns return gaps???segs_ch, hits???segs_ch # ---------------------------------------------------------------------- h.enum_item_parags(st, """ The second line has {N+2} numbers, where {N} is the number of specified keywords (such as '主治|主') in that SBJ entry which were matched in the SPS parag. They are the percent errors (deviation from predicted value ranges) of the parag's total gap length and of the lengths of the {N+1} individual gaps before, between, and after those {N} keywords.""") h.enum_item_parags(st, """The third line gives the actual total gap length and the {N+1} actual gap lengths.""") h.enum_item_parags(st, """The intervals in the fourth line are those expected ranges, estimated from the corresponding hanzi counts in the Chinese entry.""") h.enum_item_parags(st, """The fifth line shows the {N} substrings of the SPS parag that were assumed to be the translations of the {N} SBJ keywords.""") def find_keyword_occurrences(text, kword): # Splits {text} at every occurrence of the pattern {kword}. Returns # {gaps???segs,gsizes,hits???segs,hsizes} where {hits???segs} is a list of the {nh} # occurrences of {kword} in {text}, {gaps???segs} is a list of the {ng???ns=nh+1} # strings before, between,and after those strings, and {gsizes,hsizes} # are lists of the lengths of those strings. chops = re.split(f"({kword})", text) assert len(chops) >= 1 ng???ns = (len(chops) + 1) // 2 # Number of actual gaps???segs. nh = ng???ns - 1 # Number of actual hits???segs. assert len(chops) == 2*ng???ns - 1 gaps???segs = [ chops[2*i] for i in range(nh+1) ] hits???segs = [ chops[2*i + 1] for i in range(nh) ] gsizes = [ len(g) for g in gaps???segs ] hsizes = [ len(h) for h in hits???segs ] return gaps???segs, gsizes, hits???segs, hsizes # ---------------------------------------------------------------------- def combine_gaps???segs_and_hits???segs(gaps???segs, gsizes, new_gsizes, hits???segs, hsizes, new_hsizes): # Concatenates {gaps???segs} and {hits???segs} to match the lists {new_gsizes}. # Puts brackets around hits???segs. ng???ns = len(gsizes) assert len(hsizes) == ng???ns - 1 nog = len(new_gsizes) assert len(new_hsizes) == nog - 1 assert nog <= ng???ns fgaps???segs = [ '', ] fgsizes = [ 0 ] fhits???segs = [] fhsizes = [] iog = 0 for ig in range(ng???ns): h = '' if ig == 0 else '[' + hits???segs[ig-1] + ']' hs = 0 if ig == 0 else hsizes[ig-1] g = gaps???segs[ig] gs = gsizes[ig] if fgsizes[iog] + hs + gs <= new_gsizes[iog]: fgaps???segs[iog] += h + g fgsizes[iog] += hs + gs elif fgsizes[iog] == new_gsizes[iog]: fhits???segs.append(h); fhsizes.append(hs) fgaps???segs.append(g); fgsizes.append(gs) iog += 1 assert iog == len(fgsizes) - 1 else: err.write(f"!* {gaps???segs = !r}\n") err.write(f"!* {hits???segs = !r}\n") err.write(f"!* {gsizes = !r}\n") err.write(f"!* {hsizes = !r}\n") err.write(f"!* {new_gsizes = !r}\n") err.write(f"!* {new_hsizes = !r}\n") err.write(f"!* {iog = !r}\n") err.write(f"!* {fgaps???segs = !r}\n") err.write(f"!* {fhits???segs = !r}\n") err.write(f"!* {fgsizes = !r}\n") err.write(f"!* {fhsizes = !r}\n") assert False # Paranoia: assert len(fgaps???segs) == nog assert len(fgsizes) == nog assert len(fhits???segs) == nog - 1 assert len(fhsizes) == nog - 1 for iog in range(nog): assert fgsizes[iog] == new_gsizes[iog] if iog > 0: assert fhsizes[iog-1] == new_hsizes[iog-1] return fgaps???segs, fgsizes, fhits???segs, fhsizes # ---------------------------------------------------------------------- # Percent errors of gap sizes: cbits.append(f" |") cbits.append(f" {pcterr(tgsize_ec, exp_tgsize_ec):+4.0f}% ") cbits.append(f" |") if nh == 0: assert gsizes_ec[0] == tgsize_ec assert exp_gsizes_ec[0] == exp_tgsize_ec else: for ig in range(ng): gs = gsizes_ec[ig] egs = exp_gsizes_ec[ig] pe = nan if egs == None else pcterr(gs,egs) cbits.append(f" {pe:+4.0f}% ") cbits.append(f" |") cbits.append("\n") # Absolute gap sizes: cbits.append(f" |") cbits.append(f" {tgsize_ec:4d} ") cbits.append(" |") if nh > 0: for ig in range(ng): gs = gsizes_ec[ig] cbits.append(f" {gs:4d} ") cbits.append(f" |") cbits.append("\n") # Expected gap sizes: def write_exp(es): es_lo = es[0] es_hi = es[1] cbits.append(f" {es_lo:4d}") if es_hi != es_lo: cbits.append(f"..{es_hi:<3d}") return # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: cbits.append(f" |") write_exp(exp_tgsize_ec) cbits.append(" |") if nh > 0: for ig in range(ng): egs = exp_gsizes_ec[ig] write_exp(egs) cbits.append(f" |") cbits.append("\n") cbits.append("\n") ???gsizes_ch[iv] = [ len(gs) for gs in gaps_ch[iv] ] ???hsizes_ch[iv] = [ len(hs) for hs in hits_ch[iv] ] err.write(f"!@ {???gsizes_ch[iv] = } {???hsizes_ch[iv] = }\n") # The hits must be the same in both long and trim versions: assert hits_ch[0] == hits_ch[1], "hits don't match" # Merge the two gap size list into a single list of gap size ranges: tsize_ch = tuple(tsize_ch); assert len(tsize_ch) == 2 ???gsizes_ch = list(zip(???gsizes_ch[0], ???gsizes_ch[1])) hits_ch = list(zip(hits_ch[0], hits_ch[1])) # Cleanup the SBJ hanzi text, just in case: text_ch = re.sub(r"<[^<>]*>", "", text_ch) text_ch = re.sub(r"[ \012]", "", text_ch) text_ch = re.sub(r"[:[](),。; ]", "", text_ch) # Split the SBJ text with the hanzi pattern: tsize_ch = len(text_ch) gaps_ch, hits_ch = \ mmf.find_multi_pattern_occurrences(text_ch, ???kwords_ch|kwords_list) score, loc_ec, size, gaps_ec, ???gsizes_ec, hits_ec, ???hsizes_ec = cand assert len(gaps_ec) == ng assert len(hits_ec) == nh assert len(???gsizes_ec) == ng assert len(???hsizes_ec) == nh exp_gsizes_str = spf.format_size_ranges(exp_gsizes_ec) exp_gsizes_ec = ??? exp_???tgsize_ec = ??? ??? exp_gsizes_wc = anf.compute_wc_gap_size_ranges(gsizes_ch) ??? exp_gsizes_ec = anf.compute_ec_gap_size_ranges(gsizes_ch) ??? ??? exp_???tgsize_wc = anf.compute_total_ec_size_range(exp_gsizes_wc) ??? exp_???tgsize_ec = anf.compute_total_wc_size_range(exp_gsizes_ec) ??? ??? exp_gsizes_wc_str = spf.format_size_ranges(exp_gsizes_wc) ??? exp_gsizes_ec_str = spf.format_size_ranges(exp_gsizes_ec) # The {score} will be zero if {tgsize_ec} is in the interval # {exp_tgsize_ec}, and every gap size {gsizes_ec[ig]} is in the interval # {exp_gsizes_ec[ig]}. The score increases as those numbers lie further # and further from those intervals. # # The parameter {exp_tgsize_ec} must be a range, and {exp_gsizes_ec} # must be a list of {ng = nh+1} ranges. Each range is a pair # {(min,max)}. # # The parameter {gsizes_ec} must be a list of {ng} integers # which are interpreted as the counts of EVA letters # before, between, and after some {nh = ng-1} instances of the # relevant keyword(s) in some EVA text. # # The parameter {exp_gsizes_ec???} must be a list of {ng} integer pairs # {(lo,hi)}which are interpreted as the min and max values of {gsizes_ec} # predicted based on the gap sizes in the SBJ entry. The range should # consider excluding or includling fields of the entry that may be # omitted in the SPS. exp_kwsize_ec = (4,5,) # Expected range of keyword size in EVA letters. for gaps_ec, hits_ec, exp_gsizes_ec??? in ( \ ( [ 'nessuno', 'solouno', 'ambedue', 'tretreno', 'quattro', 'quinqux', ], [ 'ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', ], [ (15,18), (6,8), (28,29), (6,8), (2,3), (6,7), ] ), ( [ 'nessuno', 'solouno', 'ambedue', ], [ 'ONE', 'TWO', ], [ (15,18), (6,8), (28,29), ], ), ( [ 'nessuno', 'soloBEHuno', ], [ 'ONE', ], [ (15,18), (6,8), ], ), ): nh = len(hits_ec); ng = nh + 1 kwords_ec = [ r"[A-Z]+" ] * nh gsizes_ec = [ len(g) for g in gaps_ec ] hsizes_ec = [ len(h) for h in hits_ec ] ???tgsize_ec = gsizes_ec[0] for hs, gs in zip(hsizes_ec[0:], gsizes_ec[1:]): ???tgsize_ec += hs + gs err.write(f"{???tgsize_ec = !r} {gsizes_ec = !r}\n") exp_gsize_ec = spf.estimate_total_ec_gap_size_range(gsizes_ch) err.write(f"{exp_???tgsize_ec??? = !r} {exp_gsizes_ec??? = !r}\n") score = compute_score(???tgsize_ec, gsizes_ec, exp_???tgsize_ec???, exp_gsizes_ec???) err.write(f"{score = :6.1f} (from {{opt_gsizes_ec}})\n") all: test-ana-cand test-ana-cand: \ analyze_candidate_matches.py analyze_candidate_matches.py testo def compute_total_gap_size_range(gsizes): # Computes a total size range # given the estimated integer ranges {gsizes[0..ng-1]} of gap sizes # between certain keywords and and the estimated size range # {hsize} for the separating . ng = len(gsizes); nh = ng - 1; ???tgsize_lo = nh*hsize[0]; ???tgsize_hi = nh*hsize[1] for egs in gsizes: ???tgsize_lo += egs[0] ???tgsize_hi += egs[1] return (???tgsize_lo, ???tgsize_hi,) # ---------------------------------------------------------------------- def write_cands_file(cev_file, gsizes_ch, cands, kwords_ec, data, exp_tsize_ec???, exp_gsizes_ec???): # {cev_file} name of output file with evaluated candidates, or "-". # {gsizes_ch} gap sizes in the SBJ entry (for documentation). # {cands} sorted list of candidates. # {kwords_ec} list of EVA keyword pattern. # {data} counts from parag parsing and evaluation. # {exp_gsizes_ec???} expected gaps sizes in EVA letters. # {exp_tsize_ec???} expected total cand size in EVA letters. # # The {exp_gsizes_ec???} is a list of integer pairs (interpreted as a # range), and {exp_tsize_ec???} is an integer pair (ditto). # # Takes a list of evaluated candidates {cands} as produced by # {analize_starps_cands}. Each element should be a tuple # {(score, loc_ec, tsize_ec, gaps_ec, gsizes_ec, hits_ec, hsizes_ec)} # The function writes each candidate to {cev_file}, formatted # as described in {format_cand}. # # Candidates with empty {hits_ec} equal to {None} are not written. ng = len(exp_gsizes_ec???); nh = ng - 1 gsizes_ch_str = spf.format_size_ranges(gsizes_ch) exp_gsizes_ec???_str = spf.format_size_ranges(exp_gsizes_ec???) exp_tsize_ec???_str = spf.format_range(exp_tsize_ec???) err.write(f"writing the file '{cev_file}' ...\n") wr = out if cev_file == "-" else open(cev_file, "w") wr.reconfigure(encoding='utf-8') wr.write("# -*- coding: utf-8 -*-\n") wr.write(f"# npar_read = {data['npar_read']}\n") wr.write(f"# npar_with = {data['npar_with']}\n") wr.write(f"# nh_min = {nh}\n") wr.write(f"# min_size = {data['min_size']}\n") wr.write(f"# max_size = {data['max_size']}\n") wr.write(f"# {kwords_ec = !r}\n") wr.write(f"# gsizes_ch = {gsizes_ch_str!r}\n") wr.write(f"# exp_gsizes_ec??? = {exp_gsizes_ec???_str!r}\n") wr.write(f"# exp_tsize_ec??? = {exp_tsize_ec???_str}\n") for cand in cands: score, loc_ec, tsize_ec, gaps_ec, gsizes_ec, hits_ec, hsizes_ec = cand if hits_ec != None: output_cand(wr, cand, kwords_ec, exp_tsize_ec???, exp_gsizes_ec???) wr.write("\n") wr.close() return # ---------------------------------------------------------------------- def cands_eval_summary_from_cev_file(st, entry_name, kw_num): # Summary of search for matching candidates. cev_file = f"cands/{entry_name}_{kw_num}.cev" rd = open(cev_file, "r") data = read_parms_from_file_header(rd) rd.close() kwords_ec = data['kwords_ec'] cands_eval_summary(st, data, kwords_ec) return # ---------------------------------------------------------------------- cev_file = f"cands/{code_ch}_{kwnum}.cev" anf.write_cands_file \ ( cev_file, gsizes_ch, cands, kwords_ec, data, exp_tsize_ec, exp_gsizes_ec ) # {exp_tsize_ec} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec} ranges of expected sizes of gap, ditto. # between those occurrences are consistent # with the respctive expected values,looking for matching occurrences of the {nh} # patterns in {kwords_ec} and # # exp_tsize_ec, exp_gsizes_ec {exp_tsize_ec???} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec???} ranges of expected sizes of gap, ditto. # {exp_tsize_ec???} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec???} ranges of expected sizes of gap, ditto. # between those occurrences are consistent # with the respctive expected values,looking for matching occurrences of the {nh} # patterns in {kwords_ec} and {exp_tsize_ec???}and # {exp_gsizes_ec???[0..ng-1} where {ng=nh+1}. # exp_tsize_ec???, exp_gsizes_ec??? # {exp_tsize_ec???} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec???} ranges of expected sizes of gap, ditto. # corresponding elements of {exp_gsizes_ec???}, and the # total size {tsize_ec} of {tclean_ec} with {exp_tsize_ec???}. def optimal_split(gsizes_ec, hsizes_ec, exp_gsizes_ec): # The parameter exp_gsizes_ec shoule be a list of {ng_ch} expected gap # size ranges in EVA letters. If {len(gsizes_ec) > ng_ch}, condenses # {gsizes_ec,hsizes_ec} to {ng_ch} gaps in the optimal way. Returns condensed # {fgsizes_ec,fhsizes_ec} ng_ec = len(gsizes_ec) nh_ec = len(hsizes_ec) assert ng_ec == nh_ec + 1 ng_ch = len(exp_gsizes_ec) nh_ch = ng_ch - 1 min_gaps_score = +inf # Score due to gaps only. if nh_ec < nh_ch: # Not enough hits. Returns a placeholder solution. opt_gsizes_ec = gsizes_ec.copy() opt_hsizes_ec = hsizes_ec.copy() return opt_gsizes_ec, opt_hsizes_ec opt_gsizes_ec = None opt_hsizes_ec = None debug = False if debug: err.write(f"!- {ng_ch = } {nh_ch = }\n") def opt_aux(gsz, hsz, k): nonlocal min_gaps_score, opt_gsizes_ec, opt_hsizes_ec # if {len(hsz) > nh_ch} tries to condense {gsz[k:]} with the intervening # hits in all possible ways that result line {len(hsz) == nh_ch}. # Remembers the lowest scoring one in {min_gaps_score,opt_gsizes_ec,opt_hsizes_ec}. mg = len(gsz) mh = len(hsz) assert mg == mh + 1 if debug: err.write(f"\n") if debug: err.write(f"!- {' '*k} {mg = } {mh = } {k = }\n") if debug: err.write(f"!- {' '*k} {gsz = !r}\n") if debug: err.write(f"!- {' '*k} {hsz = !r}\n") if mh == nh_ch: sc = compute_gaps_score(gsz, exp_gsizes_ec) if debug: err.write(f"!- {' '*k} {sc = }\n\n") if sc < min_gaps_score: min_gaps_score = sc opt_gsizes_ec = gsz.copy() opt_hsizes_ec = hsz.copy() return else: assert mh > nh_ch if k+1 < mg: # Try condensing {gsz[k]} with {gsz[k+1]} maybe more: gsz1 = gsz[0:k] + [gsz[k] + hsz[k] + gsz[k+1],] + gsz[k+2:] hsz1 = hsz[0:k] + hsz[k+1:] opt_aux(gsz1, hsz1, k) if k < mh: # Try keeping {gsz[k]} and condensing the rest: opt_aux(gsz, hsz, k+1) return # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ opt_aux(gsizes_ec, hsizes_ec, 0) return opt_gsizes_ec, opt_hsizes_ec # ---------------------------------------------------------------------- opt_gsizes_ec, opt_hsizes_ec = optimal_split(gsizes_ec, hsizes_ec, exp_gsizes_ec) err.write(f"{opt_gsizes_ec = !r}\n") err.write(f"{opt_hsizes_ec = !r}\n") fgaps_ec, fgsizes_ec, fhits_ec, fhsizes_ec = \ combine_gaps_and_hits(gaps_ec, gsizes_ec, opt_gsizes_ec, hits_ec, hsizes_ec, opt_hsizes_ec) err.write(f"{fgaps_ec = !r}\n") err.write(f"{fgsizes_ec = !r}\n") err.write(f"{fhits_ec = !r}\n") err.write(f"{fhsizes_ec = !r}\n") opt_gsizes_ec, opt_hsizes_ec = optimal_split(???gsizes_ec, ???hsizes_ec, exp_gsizes_ec) fgaps_ec, fgsizes_ec, fhits_ec, fhsizes_ec = \ spf.combine_gaps_and_hits(gaps_ec, ???gsizes_ec, opt_gsizes_ec, hits_ec, ???hsizes_ec, opt_hsizes_ec) assert len(fgaps_ec) <= ng?????? assert len(fhits_ec) <= nh?????? loc4, tags4, starps_items = h77.split_formatted_entry(""" (A) | fcheokair.oke» (A1) | - (A3) | ··daiin. (A31) | ····chedy.qokeed.okain.chdy. (A32) | ····laiin.ofar. (A33) | ····chedy.tedam. (A34) | ····okeedy.lkal.daiin. (A35) | ····ykchedy.qokol.chedy. (A36) | ····qokedy.lkedy. (A4) | ··okalo.l.chedl.y. (A41) | ····dchedy.okeedar. (A411) | ······shchy.okol. (A42) | ····kedy.okeedy. (A43) | ····chal.raiin. (A44) | ····otedy.chtal.am. (A45) | ····dain.chey.qokeedy. (A46) | ····chol.cheey.dalkar.okedy. (A61) | - (A62) | - (A63) | - (A7) | - """) assert loc4 == best_loc assert tags4 == tags1 if chosen_loc != None: h.parags(st, f"""Thus we will tentatively assign {entry_code} ({loc_ch}) to {chosen_loc}.""") chosen_cand = cands_1[0] chosen_score = chosen_cand[0]; assert chosen_cand[1] == chosen_loc else: h.parags(st, f"""Thus we will not assign {entry_code} ({loc_ch}) to any SPS parag.""") chosen_cand = None chosen_score = None; def select_best_cand(cands_1, cands_2, max_score): # Selects the best candidate from two candidare lists, # sorted by inceasing badness score. best_cand_1 = cands_1[0] if cands_1 != None else None best_cand_2 = cands_2[0] if cands_2 != None else None score_1 = best_cand_1[0] if best_cand_1 != None else +inf score_2 = best_cand_2[0] if best_cand_2 != None else +inf best_score = score_1 if score_1 <= score_2 else score_2 best_cand = best_cand_1 if score_1 <= score_2 else best_cand_2 if best_score > max_score: best_cand = None return best_cand # ---------------------------------------------------------------------- def compute_total_gap_size???(gsizes, egsizes): tsize = 0; # Total size of all gaps. for gs in gsizes: tsize += gs etsize_min = 0 # Expected max total size of gaps. for egs in egsizes: etsize_min += egs[0] etsize_max = 0 # Expecetd min total size of gaps. for egs in egsizes: etsize_max += egs[1] return tsize, (etsize_min, etsize_max,) # ---------------------------------------------------------------------- avg_deltas=() echo " computing average delta ..." 1>&2 avg_delta=$( \ gawk ' /^ *[a-z]/{ s += $2; s += $3; n += 2 } END { print s/n }' ${wpd_file} \ ) echo " avg_delta = ${avg_delta}" 1>&2 avg_deltas+=( ${avg_delta} ) ######################################################################## # Remove junk not counted as punctuation: if utype == "ch": # Nothing to remove besides punctuation. pass elif utype == "ps": # Nothing to remove besides punctuation. pass elif utype == "ec" or utype == "wc" or utype == "wp": # Delete all markup: text = re.sub(r"[<][!][^<>]*[>]", "", text) text = re.sub(r"[«=» ]", "", text) text = re.sub(r"[{]([^{}]*)[}]", r"\1", text) text = re.sub(r"^<[%]>", "", text) text = re.sub(r"<[$]>$", "", text) # Map weirdos to '?': text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text) # Map all to lowercse: text = text.lower() else: arg_error(f"invalid unit type {utype = !r}") ###################################################################### ???if utype == "ec": m_bad = re.search(r"[^ac-fhik-tvxy?]") elif utype == "wc" or utype == "wp": m_bad = re.search(r"[^.ac-fhik-tvxy?]", text) else: arg_error(f"invalid {utype = !r}") pat_ipu = r"[:。,\[\]]" # Ideographic punctuation. pat_ann = r"[(][^()]*[)]" # Apocriphal annotations (ideographic parens). pat_ipu = r"[-,.;:'*]" # Pinyin punctuation. pat_ann = r"[(][^()]*[)]" # Apocriphal annotations (ascii parens). pat_junk???notneeded = f"{pat_ipu}|{pat_ann}" # Junk to be deleted. ###################################################################### def ttype_from_utype(utype): # Returns the text type {ttype} of an ".ivt" file required to extract # or count unts of type {utype}. ttype = None if utype == "ch": # Chinese characters: ttype = "chu" elif utype == "ps": # Isolated pinyin syllables: ttype = "pys" elif utype == "ec" or utype == "wc" or utype == "wp": # EVA characters or words: ttype = "eva" else: arg_error(f"invalid {utype = !r}") return ttype # ---------------------------------------------------------------------- ######################################################################## def add_all_word_pos_pos_plot_rules(pre, mak, tit): # ---------------------------------------------------------------------- # pos-pos-plots: res/bencao-fu-zhu3-starps-${SPS_TAG}-wpos.png # pos-pos-plots: res/bencao-fu-zhu3-starps-${SPS_TAG}-nwo-hist.png # res/starps-${SPS_TAG}.woc: \ # \ # res/starps-fu-par.ivt \ # ${MAKEFILE} # ./list_wpositions_in_parags.py voyn-eva '${SPS_WORD}' res/starps-fu-par.ivt \ # > res/starps-${SPS_TAG}.woc # # res/bencao-fu-zhu3.woc: \ # list_wpositions_in_parags.py \ # in/bencao-fu.chu \ # ${MAKEFILE} # ./list_wpositions_in_parags.py chin-chu '主' in/bencao-fu.chu \ # > res/bencao-fu-zhu3.woc # # res/bencao-fu-zhu3-starps-${SPS_TAG}-nwo-hist.png: \ # ${MAKEFILE} \ # \ # res/starps-${SPS_TAG}.woc \ # res/bencao-fu-zhu3.woc # ./plot_two_word_pos_histograms.sh \ # starps-${SPS_TAG} '${SPS_WORD}' \ # bencao-fu-zhu3 zhu3 # # res/bencao-fu-zhu3-starps-${SPS_TAG}-wpos.png: \ # ${MAKEFILE} \ # plot_two_word_pos_files.sh \ # res/starps-${SPS_TAG}.woc \ # res/bencao-fu-zhu3.woc # ./plot_two_word_pos_files.sh \ # starps-${SPS_TAG} '${SPS_WORD}' \ # bencao-fu-zhu3 zhu3 # return targets # ---------------------------------------------------------------------- ######################################################################## def add_all_ivt_rules(pre, mak, tit): # Rules to create the specialized "-lin.ivt" files, namely "in/starps-fu-lin.ivt" # and "in/starps-gd-lin.ivt" from the starred parags text of Note/074. targets = [] if True: # Complete "fu-eva-lin.ivt" file with all parags: target_full = f"starps-fu-eva-lin.ivt" source_full = "../074/st_files/str-parags.ivt" tit[target_full] = f"copying full SPS IVTFF file {target_full} from {source_full}" pre[target_full] = [ source_full ] mak[target_full] = ( f"cat {source_full} \\", f" | egrep -v -e '^]' \\", f" > res/{target_full}" ) targets.append(target_full) if True: # Subset "gd-eva-lin.ivt" file with good lines only: target_good = f"starps-gd-eva-lin.ivt" source_good = f"../074/st_files/str-parags.ivt" # The filtering script and its imported modules: filter_script = "remove_bad_lines_from_starps_ivt.gawk" erfn_gawk_lib = "work/error_funcs.gawk" tit[target_good] = f"extracting the good SPS source file {target_good} from {source_good}" pre[target_good] = [ source_good, filter_script, erfn_gawk_lib, ] mak[target_good] = ( f"cat {source_good} \\", f" | egrep -v -e '^]' \\", f" | {filter_script} \\", f" -i {erfn_gawk_lib} \\", f" > res/{target_good}", ) targets.append(target_good) return targets # ---------------------------------------------------------------------- add_all_ivt_rules(pre, mak, tit) + \ ######################################################################## ivt_target = f"bencao-fu-{ttype}-lin.ivt" tit[ivt_target] = f"making link {ivt_target} to {source_ivt}" pre[ivt_target] = [ source_ivt, ] mak[ivt_target] = ( f"( cd res && rm -f {ivt_target} ; ln -s ../{source_ivt} {ivt_target} )", ) targets.append(ivt_target) ######################################################################## def add_single_loc_word_pos_file_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag): # Adds rules and commands to create a file "res/{book}-{bsub}-{unit}-{sloc}-{tag}.wpo" # with the positions of {word} in parag {sloc} of file {name} # with format {fmt} (either "voyn-eva" or "chin-chu"). source = f"res/{book}-{bsub}-{unit}-{tag}.wpo" target = f"{book}-{bsub}-{unit}-{sloc}-{tag}.wpo" tit[target] = f"making the single-parag word positions file {target}" pre[target] = [ source, ] mak[target] = ( f"cat {source} | egrep -e '^{sloc}[ ]' | cat > res/{target}", ) return target # ---------------------------------------------------------------------- ######################################################################## # Outputs a list of all tuples {tsize} consecutive words, # ignoring those that contain words that contain '*'. # # For each tuple of {tsize} consecutive words in the same line of the input, # writes {tsize+1} lines in the output with the format # # "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}" # # where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL}, {SM}, {SR} # are the counts of words in each of these strings; {SEC} and {NLIN} specify the input line # where the tuple occurs; and {KW} is the index of the tuple's first # word in the input line. # # The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input # words, separated by '.'. The string {MIDDLE} has at least one word, but # {LEFT} and {RIGHT} may be empty. if book == "bencao": assert sub == "fu" enc = "chu" if unit == "ch" else "pys" if unit == "ps" else None elif book == "starps": assert sub == "fu" or sub == "gd" enc = "eva" else: assert False, f"bad {book = }" assert enc is not None, f"bad combo {book = } {unit = }" m = re.match(pat_punc, text) if enc == "utf": # Cleanup consists in deleting the Chinese punctuation: for ch in text: if debug: err.write(f"!! ch = '{ch}'") if re.fullmatch(ch, pat_punc): num_ignored += 1 if debug: err.write(" KO") else: good_chars.append(ch) if debug: err.write(" OK") if debug: err.write("\n") text = "".join(good_chars) elif enc == "eva": # Cleanup consists of deleting parag markers and ensuring simple EVA. if unit == "ec": # Remove all EVA punctuation: tlen = len(text) text = re.sub(r"[-,.]", "", text) num_ignored += tlen - len(text) elif unit == "wc": # Normalize all punc to single '.': tlen = len(text) text = re.sub(r"[-,]", ".", text) # Normalize punctuation: text = re.sub(r"[.][.]+", ".", text) text = re.sub(r"^[.]+", "", text) text = re.sub(r"[.]+$", "", text) num_ignored += tlen - len(text) else: assert False, f"invalid combo {enc = } {unit = }" ???charset = None # Sets of special hanzi characters (punct, blank, etc.) if unit == "ch": ???charset = dict() set_dir = "langbank/chin" ???charset['invalid'] = read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl") ???charset['bullets'] = read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl") ???charset['symbol'] = read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl") ???charset['punct'] = read_chinese_char_set(f"{set_dir}/utf8-punct.tbl") ???charset['blank'] = read_chinese_char_set(f"{set_dir}/utf8-blank.tbl") # Read tables of chinese character sets: pat_line = None # Matches a pinyin line, with groups {LOC} and {TEXT}. pat_punc = None # Matches pinyin punctuation (excluding blanks). pat_word = None # Matches a pinyin word (syllable or compound). if unit == "ch": elif unit == "ps" or unit == "pj": # Patterns for parsing pinyin: pat_loc = r"b[1-3][.][1-6][.][0-9][0-9][0-9]" pat_line = f"<({pat_loc})>[ ]+(.*)\n" pat_punc = r"[.,;()*]" else: assert False pat_sec = r"s[0-2]" # Section s-number, "s0" to "s2". pat_sub = r"[.][0-9]" # Subsection number, 0 to 9, with '.'. pat_lseq = r"[.][0-9][0-9][0-9]" # , with '.'. pat_locid = f"<({pat_sec})({pat_sub})({pat_lseq})>" # Is a data line: if m.lastindex != 4: prog_error("num fields = %d" % m.lastindex) sec = m.group(1) sub = m.group(2) lseq = m.group(3) text = m.group(4).strip() # {DATA} field. loc = f"{sec}{sub}{lseq}" else: # Non-parag data line - ignore: if re.search(r"([<][%$][>])", text): data_error(nline,line, f"spurious alignment marker '{m.group(1)}'") m = re.search(r"([^-,.a-z?]", text) if m != None: data_error(nline,line, f"invalid char '{m.group(1)}'")