#! /usr/bin/python3
# Last edited on 2026-04-30 22:46:30 by stolfi

# Original matching sections for the report -- using manual selection of
# strict/liberal patterns, no penalty for liberal ones.

import sys, re, string
from sys import stderr as err
import html_gen as h
import html_report_funcs as hr
import match_multi_funcs as mmf
import size_position_funcs as spf
import analyze_starps_parag_funcs as anf
import bimatching_eval_funcs as bef
from math import sqrt, hypot, exp, log, floor, ceil, isfinite, isnan, inf, nan


def analyze_starps_parags_org(code_ch, segs_ch, kwords_ec):
    #
    # Arguments:
    #
    #   {code_ch}    four-letter code of the SBJ entry (not used by this
    #                procedure; kept for interface compatibility).
    #   {segs_ch}    macro-parsing of the SBJ entry into gaps and hits.
    #   {kwords_ec}  list of EVA RE pattern(s) for keywords in EVA text.
    #
    # The parameter {kwords_ec} must be a list (not a single string) of
    # {nh} RE patterns that are supposed to match in {cleantx_ec} the
    # Voynichese EVA translations of certain hanzi keywords.
    #
    # Let {nh} be the number of patterns {kwords_ec}, and {ng} be {nh+1}.
    # The macro-parsing {segs_ch} must have exactly {ns = ng + nh}
    # elements. The procedure calls {anf.analyze_starps_parags} to scan
    # the file of good SPS parags {ivt_file}, evaluating each parag for
    # how well it matches the SBJ entry.
    #
    # For each parag considered, the procedure creates a version
    # {cleantx_ec} of its text, that has only lowercase EVA letters
    # [a-z?]. It then uses {mmf.match_multi_pattern} with the pattern
    # list {kwords_ec} to get a macro-parsing of {cleantx_ec} that is
    # most compatible with the macro-parsing {segs_ch} of the SBJ entry.
    #
    # The result of this procedure is a list {parevs} of parag evaluation
    # tuples (/parevs/) and a dictionary {data} with various counts of the
    # operation. There will be a parev for each parag that may possibly
    # match.

    ivt_file = "res/starps-gd-ec-par.ivt"

    # Check the type first: a bare string has a {len} too, and would
    # otherwise trip the unrelated size check below.
    assert not isinstance(kwords_ec, str), \
        "kwords_ec must be a list of patterns, not a string"  # To catch old uses.

    ns = len(segs_ch)
    nh = len(kwords_ec)
    ng = nh + 1
    assert ns == ng + nh, \
        f"segs_ch has {ns} elements, expected {ng + nh}"

    key_penalty = 0  # We don't have this for the original version.

    def match_func(cleantx_ec):
        # SBJ-SPS entry matching function.
        #
        # The function splits the EVA text {cleantx_ec} at the occurrences
        # of the patterns {kwords_ec[0..nh-1]}, using
        # {mmf.match_multi_pattern}, obtaining a macro-parsing {segs_ec}
        # for it, consisting of {nh} "hits" matched by the patterns and
        # {ng} "gaps" between those hits.
        #
        # The function then computes a badness {score} for the parag as a
        # candidate for the SPS version of the SBJ entry. The sizes of the
        # gaps (even-indexed elements) in the SPS parag macro-parsing
        # {segs_ec[0..ns-1]} are compared with the sizes of the
        # corresponding gaps in the macro-parsing {segs_ch[0..ns-1]} of
        # the SBJ entry, individually and as total, taking into account
        # the average hanzi-to-EVA conversion factor and roundoff errors
        # due to the sizes being integers.
        #
        # The returned result is the {score}, the macro-parsings
        # {segs_ch[0..ns-1]}, {segs_ec[0..ns-1]}, and the {key_penalty}
        # (always zero in this version). The parsing {segs_ec} may be
        # {None}, in which case {score} is whatever
        # {mmf.match_multi_pattern} returned with it.

        def eval_segs_ec(segs_ec):
            # Scores one candidate macro-parsing {segs_ec} of the parag
            # against the fixed macro-parsing {segs_ch} of the SBJ entry.
            assert len(segs_ec) == ns
            return bef.compute_full_score_from_macro_parsings(
                segs_ch, "ch", segs_ec, "ec", key_penalty
            )

        # ..................................................................

        # Search for {kwords_ec} in the {cleantx_ec}:
        segs_ec, score = mmf.match_multi_pattern(cleantx_ec, kwords_ec, eval_segs_ec)
        if segs_ec is not None:
            assert len(segs_ec) == ns
        return score, segs_ch, segs_ec, key_penalty

    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    parevs, data = anf.analyze_starps_parags(ivt_file, nh, match_func)
    return parevs, data

# ----------------------------------------------------------------------


def add_keywords_table(st, kwords_en, kwords_ch, kwords_ec):
    # Appends to {st} a "Keyword patterns:" paragraph followed by a table
    # with one row per keyword: the English keyword type, the hanzi
    # pattern, and the EVA pattern, separated by ' || ' columns.
    #
    # The lists {kwords_en}, {kwords_ch}, {kwords_ec} are combined with
    # {zip}, so the table has as many rows as the shortest of them.

    h.parags(st, "Keyword patterns:")
    bars = [' || '] * len(kwords_en)
    rows = list(zip(kwords_en, bars, kwords_ch, bars, kwords_ec))
    col_mods = [
        "style='padding-left:4ch; padding-right:4ch; text-align:left;'",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
    ]
    h.table(st, rows, col_mods = col_mods, centered = False)
    return

# ----------------------------------------------------------------------


def add_bencao_parsing_section(st, loc_ch, kwords_ch, segs_ch):
    # Appends to {st} a level-4 section "SBJ entry parsing:" showing the
    # macro-parsing {segs_ch} of the SBJ entry at {loc_ch}, formatted by
    # {anf.format_macro_parsing_ch} with the keywords of {kwords_ch}
    # as the highlight pattern, as HTML-protected preformatted text.

    h.section(st, 4, "SBJ entry parsing:")
    hipat_ch = get_keyword_highlight_pattern_org(kwords_ch)
    ch_str = anf.format_macro_parsing_ch(loc_ch, segs_ch, hipat_ch)
    ch_str = h.protect_html(ch_str)
    h.append_preformatted(st, ch_str, ind = 4, centered = False)
    return

# ----------------------------------------------------------------------


def get_ch_en_keyword_matching_templates(kwords_en, strict):
    # Returns lists {kwords_ch, kwords_ec} of patterns that should match
    # the keywords, in the same order as the English keyword types
    # {kwords_en}. Each pair comes from
    # {get_ch_ec_keyword_matching_patterns(kw_en, strict)}.
    #
    # Note: despite the "ch_en" in the name, the results are the hanzi
    # ("ch") and EVA ("ec") pattern lists.

    pairs = [
        get_ch_ec_keyword_matching_patterns(kw_en, strict)
        for kw_en in kwords_en
    ]
    kwords_ch = [kw_ch for kw_ch, _ in pairs]
    kwords_ec = [kw_ec for _, kw_ec in pairs]
    return kwords_ch, kwords_ec

# ----------------------------------------------------------------------


def get_keyword_highlight_pattern_org(kwords):
    # Given a list of keyword-matching patterns, returns a pattern useful
    # for highlighting keywords in each text.
    #
    # Each pattern in {kwords} is split at every '|' and the distinct
    # alternatives are joined back with '|'. The split is purely textual,
    # so it is only meaningful for patterns whose '|' are all top-level
    # alternations (not inside groups or character classes).
    #
    # Duplicates are removed keeping the order of first occurrence.
    # RE alternation tries the alternatives left to right, so a
    # deterministic order is needed for the highlighting to be the same
    # on every run (the iteration order of a {set} of strings is not).
    #
    # Fails if the resulting pattern can match the empty string, which
    # includes the case of an empty {kwords} list.

    pats = []
    for kw in kwords:
        pats += kw.split('|')
    hipat = "|".join(dict.fromkeys(pats))
    assert re.match(hipat, "") is None, "hipat matches empty"
    return hipat

# ----------------------------------------------------------------------


def get_ch_ec_keyword_matching_patterns(kw_en, strict):
    # Returns the hanzi and EVA keywords {kw_ch,kw_ec} corresponding to
    # the English keyword type {kw_en}.
    #
    # Valid values of {kw_en} are "MAINLY-FOR", "MAIN-USES", "LONG-TAKE",
    # "LONG", "TAKE", and "QI". If {strict} is true, {kw_ec} is the
    # strict EVA pattern for the keyword, otherwise the liberal one.
    #
    # Fails with {AssertionError} if {kw_en} is not one of those values.

    if kw_en in ("MAINLY-FOR", "MAIN-USES"):
        kw_ch = '主' if kw_en == "MAINLY-FOR" else '主治'
        if strict:
            kw_ec = 'daiin|dair|laiin'
        else:
            kw_ec = '[dlkrs][ao]ii?n|[dlkrs][ao]ir'
    elif kw_en in ("LONG-TAKE", "LONG", "TAKE"):
        kw_ch = {"LONG-TAKE": '久服', "LONG": '久', "TAKE": '服'}[kw_en]
        if strict:
            kw_okaiin = 'q?[aoy]kaiin'
            kw_okeedy = 'q?[aoy]keed[aoy]'
        else:
            kw_okaiin = 'q?[aoy][ktd][ao]ii?n'
            # NOTE(review): unlike the other three patterns of this
            # keyword, this one has no optional 'q?' prefix -- confirm
            # that this is intended.
            kw_okeedy = '[aoy][ktd]ee?[dk][aoy]'
        kw_ec = kw_okeedy + "|" + kw_okaiin
    elif kw_en == "QI":
        kw_ch = '气'
        kw_ec = 'chedy' if strict else '[cs]he?[kd][aoy]'
    else:
        # Explicit raise rather than {assert False}, so that the check
        # is not stripped when running with {-O}.
        raise AssertionError(f"invalid English keyword {kw_en}")
    return kw_ch, kw_ec

# ----------------------------------------------------------------------