#! /usr/bin/python3
last_edit = "Last edited on 2026-05-01 15:38:44 by stolfi"

import sys
import re
import string
from sys import stderr as err
import html_gen as h
from process_funcs import bash, basic_line_loop
import html_report_funcs as hr
import size_position_funcs as spf
import match_multi_funcs as mmf
import analyze_starps_parag_funcs as anf
import align_bencao_starps_items_funcs as alf
import report_077_alt_matching_funcs as r77alt
import report_077_org_matching_funcs as r77org
import bimatching_eval_funcs as bef
from math import sqrt, hypot, exp, log, floor, ceil, isfinite, isnan, inf, nan


def split_formatted_entry(entry):
    # Parses an SBJ entry (hanzi, pinyin, translation, or Voynichese)
    # that has been cast in multiline format.
    #
    # The {entry} must be a multiline string where the first line has the
    # format "<{LOC}>" and each subsequent line has the format
    # "{TAG}{SEP}{ITEM}" where the {TAG} is a string of [A-Za-z0-9] in parens
    # '()', {SEP} is one or more blanks or '|'s, {ITEM} is any string.
    # Lines that are blank or start with '#' are ignored.
    #
    # Removes leading and trailing ASCII spaces from {ITEM} (but not
    # ideographic spaces).
    #
    # Returns the {LOC}, the list of all {TAG}s (with their parentheses),
    # and the list of all {ITEM}s.
    #
    # Fails with {AssertionError} if the header or any line is malformed.

    # Cannot strip the entry -- must keep ideographic spaces.
    m = re.fullmatch(r"[ \012]*[<]([a-z0-9.]+)[>] *[\012](.*)", entry, re.DOTALL)
    assert m is not None, f"bad entry format '{entry[:12]}'"
    loc = m.group(1)
    lines = m.group(2).splitlines()

    items_tg = []
    items_wh = []
    for line in lines:
        # Skip blank lines and '#' comment lines:
        if re.match(r"[ \012]*([#]|$)", line):
            continue
        m = re.fullmatch(r"[ \012]*([(][A-Z0-9a-z]+[)])[ |]+(.+)[ \012]*", line)
        assert m is not None, f"bad entry line format '{line}'"
        tag = m.group(1)
        # Strip only ASCII blanks -- must keep ideographic spaces:
        item = m.group(2).strip(" ")
        items_tg.append(tag)
        items_wh.append(item)
    return loc, items_tg, items_wh
    # ----------------------------------------------------------------------


def split_formatted_entry_hanzi(entry):
    # Parses an SBJ entry hanzi that has been cast in multiline format.
    # See {split_formatted_entry} for the format of {entry}.
    # Then does some checking and cleanup of the items.
    #
    # The items must contain only hanzi, ideographic blanks and
    # punctuation (which are retained), and leading or trailing ASCII
    # blanks (which are stripped). Pads all items with ideographic blanks
    # (U+3000) to the same width.
    #
    # Returns the {LOC}, the list of all {TAG}s, and the list of all
    # {ITEM}s.
    #
    # Fails with {AssertionError} if any item has a character in
    # the range U+0001..U+00FF.

    loc, items_tg, items_wh = split_formatted_entry(entry)

    for item in items_wh:
        assert not re.search(r"[\001-\377]", item), \
            f"ascii character in hanzi item '{item}'"

    # Pad all items to the width of the longest one. The fill character
    # must be the ideographic space, not the ASCII one, so that columns
    # of hanzi stay aligned:
    max_item_sz = max((len(item) for item in items_wh), default=0)
    items_wh = [item.ljust(max_item_sz, "\u3000") for item in items_wh]
    return loc, items_tg, items_wh
    # ----------------------------------------------------------------------


def make_three_column_entry_table(items_tg, items_wh, items_aa, items_bb):
    # Returns the HTML of a table with the given {items_tg} on column 1, the
    # given {items_wh} in column 2, and arbitrary ascii entries
    # {items_aa,items_bb} in columns 3 and 4. A ' | ' separator column is
    # inserted between consecutive data columns.
    #
    # All four lists must have the same length, with corresponding
    # elements in the same positions.

    N = len(items_tg)
    assert len(items_wh) == N, f"{N = } {len(items_wh) = }"
    assert len(items_aa) == N, f"{N = } {len(items_aa) = }"
    assert len(items_bb) == N, f"{N = } {len(items_bb) = }"

    bars = [' | '] * N
    rows = list(zip(items_tg, bars, items_wh, bars, items_aa, bars, items_bb))
    # One modifier per table column (4 data columns + 3 separators):
    col_mods = [
        "style='padding-left:4ch; padding-right:4ch; text-align:left; font-weight:bold;'",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
    ]
    html_tb = h.make_table(rows, by_rows = True, col_mods = col_mods)
    return html_tb
    # ----------------------------------------------------------------------


def add_three_column_entry_table(st, items_tg, items_wh, items_aa, items_bb):
    # Appends to {st} a table with the given {items_tg} on column 1,
    # the given {items_wh} in column 2, and arbitrary ascii entries
    # {items_aa,items_bb} in columns 3 and 4.
    #
    # The elements of {items_wh} are assumed to consist of hanzi and/or
    # ideographic punctuation. The other columns are supposed to be
    # Latin (or pinyin) letters with ISO-Latin punctuation.
    #
    # All four lists must have the same length, with corresponding
    # elements in the same positions.

    html_tb = make_three_column_entry_table(items_tg, items_wh, items_aa, items_bb)
    h.append_centered(st, html_tb, centered = False)
    return
    # ----------------------------------------------------------------------


def entry_align_table(st, rows):
    # Appends to {st} a table with the tags on column 1, hanzi in column 2,
    # and arbitrary ascii entries in the remaining columns. The {rows}
    # are passed unchanged to {h.table}.

    # The list used to be bound to a different name than the one passed
    # to {h.table}, which made every call fail with {NameError}.
    col_mods = [
        "style='padding-left:4ch; padding-right:4ch; text-align:left; font-weight:bold;'",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
    ]
    h.table(st, rows, col_mods = col_mods, centered = False)
    return
    # ----------------------------------------------------------------------


def read_parms_from_file_header(rd):
    # Reads {rd} and looks for lines of the form "# {KEY} = {VALUE}".
    # Returns a dict with those keys and values.
    #
    # The {KEY} must start with a letter and continue with letters,
    # digits, or underscores.
    #
    # The {VALUE} may be an integer, a pair of integers in parentheses,
    # a float (with a decimal point), a string in single or double
    # quotes, or a list in square brackets. List elements are separated
    # by commas and/or blanks and are returned as strings with their
    # quotes removed; hence quoted list elements cannot contain blanks
    # or commas.
    #
    # Fails with {AssertionError} if a value has none of those forms.

    vms_dic = dict()

    err.write("!= beg\n")

    def process_line(nread, line):
        # Parses one line; stores the key and value in {vms_dic} if
        # the line is a parameter line, otherwise ignores the line.
        line = line.strip()
        err.write(f"!= {nread:5d} {line = !r}\n")
        m = re.fullmatch(r"# *([a-zA-Z][a-zA-Z_0-9]*) *[=] *(.*)", line)
        if m is None:
            return
        key = m.group(1)
        val = m.group(2).strip()
        err.write(f"!= {key = !r} {val = !r}\n")
        if re.fullmatch(r"[-+]?[0-9]+", val):
            # Integer:
            val = int(val)
        elif re.fullmatch(r"[(][-+0-9, ]+[)]", val):
            # Integer tuple; only pairs are supported:
            m = re.fullmatch(r"[(]([-+]?[0-9]+)[ ,]+([-+]?[0-9]+)[)]", val)
            assert m is not None, f"** bad value «{val}»"
            val = (int(m.group(1)), int(m.group(2)),)
        elif re.fullmatch(r"[-+]?[0-9]*([.][0-9]|[0-9][.])[0-9]*([Ee][-+]?[0-9]+)?", val):
            # Float:
            val = float(val)
        elif re.fullmatch(r"['][^']*[']", val):
            # Single-quoted string:
            val = re.sub(r"[']", "", val)
        elif re.fullmatch(r'["][^"]*["]', val):
            # Double-quoted string:
            val = re.sub(r'["]', "", val)
        elif re.fullmatch(r'\[.*\]', val):
            # List of (possibly quoted) strings:
            val = re.sub(r'^\[', "", val)
            val = re.sub(r'\]$', "", val)
            items = []
            for el in re.split(r'[, ]+', val):
                if el == "":
                    continue
                # Remove the quotes, whichever kind they are:
                if el[0] == '"':
                    el = re.sub(r'"', "", el)
                elif el[0] == "'":
                    el = re.sub(r"'", "", el)
                items.append(el)
            val = items
        else:
            assert False, f"** bad value «{val}»"
        vms_dic[key] = val
        return
        # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    basic_line_loop(rd, process_line)
    return vms_dic
    # ----------------------------------------------------------------------


def add_title_summary_and_intro(code_ch, loc_ch, name_ch, name_py, name_en):
    # Creates a new HTML document for the SBJ entry with code {code_ch},
    # hanzi name {name_ch} and English name {name_en}, and appends
    # a "Summary" section to it.
    #
    # Returns the document {st} and the arguments, except that the
    # returned {name_en} is in lowercase with underscores turned into blanks.

    name_en = re.sub(r"_", " ", name_en.lower())
    name_en_caps = string.capwords(name_en)

    title = f"[{code_ch}.077] The '{name_en_caps}' entry of the SBJ"
    st = h.new_doc(title, "#eeffdd", text_width = 1600)

    h.section(st, 2, "Summary")
    h.parags(st, f"""This webpage discusses the SBJ entry titled
      {name_ch} = "{name_en}", parsed into its sub-entries and their
      fields, and possible correspondences to parags
      of the SPS. The modern Mandarin reading of the text and an English
      translation are also shown. This entry may be referred as
      "{code_ch}" in tables.""")

    return st, code_ch, loc_ch, name_ch, name_py, name_en
    # ----------------------------------------------------------------------


def add_chinese_text_section(st, loc_ch, name_ch, name_py, name_en, nch, items_wh):
    # Appends to {st} a section showing the hanzi text of the SBJ entry
    # {loc_ch}, which is the concatenation of {items_wh}, stripped of
    # punctuation. The stripped text must have exactly {nch} characters.
    #
    # Returns the stripped text.

    h.section(st, 2, "The SBJ entry (Chinese)")
    h.parags(st, f"""This is the SBJ entry {name_ch} {name_py} = "{name_en}":""")

    hanzi_text = f"<{loc_ch}> " + "".join(items_wh)
    hanzi_pure = display_hanzi_pure_text(st, loc_ch, nch, hanzi_text)
    return hanzi_pure
    # ----------------------------------------------------------------------


def display_hanzi_pure_text(st, loc_ch, nch, hanzi_text):
    # Appends to {st} a preformatted block with the hanzi of {hanzi_text},
    # stripped of blanks, newlines, and punctuation, preceded by a header
    # line with {loc_ch} and {nch}. If {nch} is 45 or more, the hanzi are
    # broken into lines of 40.
    #
    # The {hanzi_text} must be "<{loc_ch}>" followed by the text proper.
    #
    # Returns the stripped hanzi text. Fails with {AssertionError} if the
    # format is invalid, the locus ID is not {loc_ch}, or the stripped text
    # does not have exactly {nch} characters.

    hanzi_text = hanzi_text.strip()
    hanzi_text = re.sub(r"[ \012]", "", hanzi_text)
    m = re.fullmatch(r"<([a-z0-9.]+)>(.*)", hanzi_text)
    assert m is not None, f"invalid chinese entry format {hanzi_text!r}"
    assert m.group(1) == loc_ch, f"loc ID mismtch: {loc_ch} actual {m.group(1)}"
    hanzi_body = m.group(2)

    # Remove punctuation. The brackets must be escaped, otherwise the
    # first ']' closes the character class and nothing gets removed.
    # Both the ASCII and the full-width forms are removed, as well as
    # the ideographic full stop and the ideographic space:
    hanzi_pure = re.sub(
        r"[:\[\](),;。\uFF1A\uFF3B\uFF3D\uFF08\uFF09\uFF0C\uFF1B\u3000 ]",
        "",
        hanzi_body,
    )
    nch_real = len(hanzi_pure)
    assert nch_real == nch, f"length error: {nch = } actual {nch_real}"

    if nch < 45:
        hanzi_chops = [hanzi_pure]
    else:
        hanzi_chops = [hanzi_pure[k:k+40] for k in range(0, nch, 40)]
    hanzi_display = f"<{loc_ch}> {nch:2d} hanzi\n" + "\n".join(hanzi_chops)
    h.append_preformatted(st, h.protect_html(hanzi_display), ind = 4, centered = False)
    return hanzi_pure
    # ----------------------------------------------------------------------


def add_formatted_entry_table(st, tags1, items_wh, items_py, items_en):
    # Appends to {st} a subsection with a table that has the tags {tags1},
    # the hanzi {items_wh}, the pinyin {items_py}, and the English
    # translation {items_en} side by side.

    h.section(st, 3, "Pinyin and translation")
    h.parags(st, """Here is the same entry, with punctuation added
      according to this parsing, the modern Mandarin readings in pinyin,
      and a somewhat literal English translation:""")
    add_three_column_entry_table(st, tags1, items_wh, items_py, items_en)
    return
    # ----------------------------------------------------------------------


def add_starps_matching_section(st, code_ch, loc_ch, variants, max_score, original = False):
    # Appends to document {st} the body of a section that searches the SPS
    # file for parags matching a given SBJ entry in various ways.
    #
    # The {variants} argument must be a list of triples {(cleantx_ch,
    # kwords_en, strict)} where {cleantx_ch} is a suitably cleaned and
    # trimmed hanzi text of the entry to be matched, {kwords_en} is a
    # sequence of strings, and {strict} is a boolean.
    #
    # If {original} is false, the {strict} field is ignored (taken to be
    # false), and variants that differ only in that field are merged,
    # keeping the order of their first occurrence.
    #
    # Calls {add_starps_matching_subsection} for each tuple
    # in {variants}, with the tuple fields as arguments.
    #
    # Returns a list of the results of those calls. Each element of
    # this list is a list of matching results. Each matching result is a
    # tuple as returned by {anf.analyze_starps_parags} (quod videt).

    h.section(st, 3, "Matching results")

    if not original:
        # Filter variants ignoring {strict} variation. Uses a dict rather
        # than a set so that the order of the subsections does not depend
        # on string hashing, which varies from run to run:
        err.write(f"{variants = !r}\n")
        variants = list(dict.fromkeys(
            (ctx_ch, tuple(kw_en), False) for ctx_ch, kw_en, _strict in variants
        ))

    parevs_list = []
    for cleantx_ch, kwords_en, strict in variants:
        parevs = add_starps_matching_subsection \
          ( st, code_ch, loc_ch, cleantx_ch, kwords_en, strict,
            max_score, original
          )
        parevs_list.append(parevs)
    return parevs_list
    # ----------------------------------------------------------------------


def add_starps_matching_subsection \
  ( st, code_ch, loc_ch, cleantx_ch, kwords_en, strict,
    max_score, original
  ):
    # Arguments:
    #
    #   {code_ch}    four-letter code of the SBJ entry.
    #   {loc_ch}     loc ID of the entry in the SBJ file.
    #   {cleantx_ch} text of that entry, suitably cleaned and trimmed.
    #   {kwords_en}  a list of bipattern codes.
    #   {strict}     a boolean modifying the interpretation of {kwords_en}
    #   {original}   a boolean that selects the matching method.
    #   {max_score}  maximum interesting badness score.
    #
    # Each element of {kwords_en} should be a /keyword code/, a string,
    # like 'USES' or 'QI' that specifies an abstract keyword.
    #
    # The procedure calls
    # {anf.analyze_starps_parags(segs_ch,ivt_file,match_func)} to scan the
    # file of good SPS parags {ivt_file} and evaluate each parag for how
    # well it matches the SBJ entry.
    #
    # Let {nh} be the number of pattern codes in {kwords_en}. Let {ng} be
    # {nh+1}, and {ns} be {ng + nh}.
    #
    # For each parag considered, the procedure creates a version
    # {cleantx_ec} of its text, that has only lowercase EVA letters
    # [a-z?]. It then splits {cleantx_ch} and {cleantx_ec} into two
    # macro-parsings {segs_ch[0..ns-1]} and {segs_ec[0..ns-1]}, each
    # consisting of {nh} /hits/ (strings matched by the keyword templates)
    # and {ng} /gaps/ (the strings before, between, and after the hits).
    #
    # For each parag the procedure also computes a badness {score} that
    # combines penalties for the use of non-perfect hits (like 'laiin'
    # instead of 'daiin') and discrepancies between the sizes of the
    # gaps in {segs_ch} and {segs_ec}.
    #
    # The result is a list of parag evaluation tuples (/parevs/), one for
    # each parag that may possibly match, with its badness score, the
    # locus ID of the parag, the macro-parsing {segs_ch[0..ns-1]} of the
    # SBJ entry, and the matching macro-parsing {segs_ec[0..ns-1]} of the
    # SPS parag.
    #
    # This procedure then trims that list of parevs after the first one with
    # score exceeding {max_score}, and inserts the list into the document
    # {st}, formatted as described in {anf.format_starps_parag_evaluation}.
    # At most 10 parevs are displayed.
    #
    # The result of the call is that trimmed list.
    #
    # If {original} is true, the list {kwords_en} and the {strict} argument
    # together also define a list {kwords_ch} of patterns that identify
    # certain keywords in the clean text {cleantx_ch} of the SBJ entry, and
    # a corresponding list {kwords_ec} of RE patterns that are expected to
    # match the Voynichese equivalents of those keywords in the clean EVA
    # text of the SPS parag that corresponds to the entry in question. See
    # {get_ch_ec_keyword_pattern_lists}. The parags are matched with
    # {mmf.match_multi_pattern}.
    #
    # If {original} is false, the list {kwords_en} defines a bitemplate
    # suitable for {bmf.match_bitemplate}, which is used for matching
    # each SPS parag to the SBJ entry. In that case {strict} must be false.

    debug = False
    verbose = True

    ctsize_ch = len(cleantx_ch)
    if original:
        h.section(st, 4, f"Trimmed to {ctsize_ch} hanzi")
    else:
        kwords_str = ", ".join(kwords_en)
        h.section(st, 4, f"Trimmed to {ctsize_ch} hanzi - keys {kwords_str}")

    h.parags(st, "Trimmed SBJ entry:")
    temptx_ch = f"<{loc_ch}>" + cleantx_ch
    cleantx_ch_check = display_hanzi_pure_text(st, loc_ch, ctsize_ch, temptx_ch)
    # The clean text must be really clean:
    assert cleantx_ch_check == cleantx_ch

    # The result of comparing an SBJ entry with clean text {cleantx_ch}
    # and an SPS parag with clean text {cleantx_ec} is either {None}
    # or a parag evaluation record (/parev/). Each parev has the format
    # {(score, loc_ec, segs_ch, segs_ec, key_penalty)} where {score} measures
    # the badness of the match, and {segs_ch,segs_ec} are two aligned
    # macro-parsings of {cleantx_ch} and {cleantx_ec}.

    nh = len(kwords_en); ng = nh + 1; ns = ng + nh

    hipat_ch = None  # Patterns used to highlight keywords in SBJ text.
    hipat_ec = None  # Patterns used to highlight keywords in SPS text.

    if original:
        # Original matching algorithm
        kwords_str = ", ".join(kwords_en)
        strict_str = "strict" if strict else "liberal"
        h.section(st, 4, f"Matching with keys {kwords_str} ({strict_str})")

        # Get and display the keyword patterns:
        kwords_ch, kwords_ec = r77org.get_ch_en_keyword_matching_templates(kwords_en, strict)
        r77org.add_keywords_table(st, kwords_en, kwords_ch, kwords_ec)
        hipat_ch = r77org.get_keyword_highlight_pattern_org(kwords_ch)
        hipat_ec = r77org.get_keyword_highlight_pattern_org(kwords_ec)

        # Parse the SBJ entry according to those patterns:
        segs_ch = mmf.find_multi_pattern_occurrences(cleantx_ch, kwords_ch)
        assert len(segs_ch) == ns
        err.write(f"!@ {segs_ch = }\n")
        r77org.add_bencao_parsing_section(st, loc_ch, kwords_ch, segs_ch)

        # Scan the SPS parags collecting reasonable matches:
        parevs, data = r77org.analyze_starps_parags_org(code_ch, segs_ch, kwords_ec)
    else:
        # New matching algorithm
        # Get and display the bitemplate {bitemp}:
        assert not strict
        bitemp = [r77alt.get_bencao_starps_bipattern(kw_en) for kw_en in kwords_en]
        r77alt.add_bitemplate_description(st, kwords_en, bitemp)
        hipat_ch, hipat_ec = r77alt.get_keyword_highlight_patterns_alt(bitemp)

        # Scan the SPS parags collecting reasonable matches:
        parevs, data = r77alt.analyze_starps_parags_alt(code_ch, cleantx_ch, bitemp)

    if debug: err.write(f"!@ SBJ highlight pattern = {hipat_ch!r}\n")
    if debug: err.write(f"!@ SPS highlight pattern = {hipat_ec!r}\n")

    # Each elem of {parevs} must be {(score, loc_ec, segs_ch, segs_ec, key_penalty)}:

    # Discard totally bad candidates:
    nc = len(parevs)
    nc_good = 0  # Parevs with acceptable score.
    while nc_good < nc and parevs[nc_good][0] <= max_score:
        nc_good += 1
    err.write(f" {nc_good = }\n")

    nc_show = nc_good
    # Include the first parev with unacceptable score:
    if nc_show < nc:
        nc_show += 1

    # Discard excessive parevs:
    max_show = 10
    nc_show = min(nc_show, max_show)
    if nc_show < max_show:
        # Ensure that some parevs are shown:
        nc_show = max(nc_show, min(nc, 3))

    # Salutary paranoia:
    err.write(f" {nc_show = }\n")
    assert nc_show <= max_show + 1
    validate_parevs(parevs, cleantx_ch, max_score, nc_good, verbose)

    # Show the candidates:
    h.parags(st, "Best matches:")
    ec_list_blocks = []
    prev_segs_ch = None
    for parev in parevs[:nc_show]:
        score, loc_ec, segs_ch, segs_ec, key_penalty = parev
        if not original and segs_ch != prev_segs_ch:
            # The SBJ parsing changed, so must show it again:
            ec_list_blocks.append("\n")
            ch_str = anf.format_macro_parsing_ch(loc_ch, segs_ch, hipat_ch)
            ch_str = h.indent_lines(4, ch_str)
            ec_list_blocks.append(ch_str)
            ec_list_blocks.append("\n")
            prev_segs_ch = segs_ch
        ec_str = anf.format_starps_parag_evaluation(parev, hipat_ec)
        ec_list_blocks.append(ec_str)
    ec_list_str = "\n".join(ec_list_blocks)
    ec_list_str = h.protect_html(ec_list_str)
    h.append_preformatted(st, ec_list_str, ind = 2, centered = False)

    return parevs[:nc_good+1]
    # ----------------------------------------------------------------------


def validate_parevs(parevs, cleantx_ch, max_score, nc_good, verbose):
    # Paranoia checks on the list {parevs}: consecutive parevs must have
    # distinct locus IDs, the list must be sorted by non-decreasing score,
    # the first {nc_good} scores must not exceed {max_score}, the hanzi
    # macro-parsing of each parev must concatenate to {cleantx_ch}, and
    # each score must agree with the one recomputed by
    # {bef.compute_full_score_from_macro_parsings} to within 1.0e-6.
    #
    # If {verbose}, writes the locus ID and score of each parev to {err}.
    # Fails with {AssertionError} if any check fails.

    prev_score = -inf
    prev_loc = "NONE"
    for ic, parev in enumerate(parevs):
        score, loc, segs_ch, segs_ec, key_penalty = parev
        assert loc != prev_loc  # No duplicate parevs.
        assert cleantx_ch == "".join(segs_ch)
        if verbose: err.write(f" parag {loc:<12s} {score = :6.2f}\n")
        assert score >= prev_score  # Badness must be non-decreasing.
        if ic < nc_good:
            assert score <= max_score  # No bad parevs except the last one.
        score_check = bef.compute_full_score_from_macro_parsings \
          (segs_ch, "ch", segs_ec, "ec", key_penalty)
        if score != score_check:
            err.write(f"{score = :24.16e}\n")
            err.write(f"{score_check = :24.16e}\n")
            assert abs(score - score_check) < 1.0e-6
        prev_score = score
        prev_loc = loc
    return
    # ----------------------------------------------------------------------


def write_dics_from_parev(st, code_ch, loc_ch, parev):
    # Writes a set of hanzi-to-EVA dictionaries based on the code {code_ch}
    # and locus ID {loc_ch} of an SBJ entry and on the parev {parev}, which
    # has the locus ID {loc_ec} of an SPS parag and the macro-parsings of
    # the two texts. The document {st} is not used.
    #
    # The dictionary for SBJ text fragments of length {fsize_ch}, for
    # {fsize_ch} in {0..4}, is written to file
    # "dics/{code_ch}_{loc_ec}_{fsize_ch}.dic", in UTF-8. See
    # {make_dic_from_parev} for the meaning of {fsize_ch}.

    score, loc_ec, segs_ch, segs_ec, key_penalty = parev  # Score and locus ID of parag.

    # Just in case, remove the angle brackets. Must remove only the
    # brackets, not what is between them:
    loc_ec = re.sub(r"[<>]", "", loc_ec)
    loc_ec = re.sub(r"[.]([0-9])$", r".0\1", loc_ec)  # Zero-pad the line number.

    max_fsize_ch = 4
    for fsize_ch in range(max_fsize_ch + 1):
        vms_dic = make_dic_from_parev(code_ch, loc_ch, parev, fsize_ch)
        assert vms_dic is not None
        dic_file = f"dics/{code_ch}_{loc_ec}_{fsize_ch}.dic"
        # The {with} ensures that the file is closed even if a check fails:
        with open(dic_file, "w", encoding = 'utf-8') as wr:
            wr.write("# -*- coding: utf-8 -*-\n")
            wr.write(f"# {loc_ch = }\n")
            wr.write(f"# {loc_ec = }\n")
            pref = f"{code_ch} | {loc_ch:<8s} | {loc_ec:<8s} |"
            for frag_ch, frag_ec in vms_dic:
                wr.write(pref)
                assert fsize_ch == 0 or len(frag_ch) == fsize_ch
                # NOTE(review): the hanzi column is padded with ASCII blanks
                # here; confirm whether ideographic blanks were intended.
                frag_ch = frag_ch.ljust(10," ")
                frag_ec = frag_ec.ljust(50," ")
                wr.write(f" {frag_ch} | {frag_ec} |\n")
    return
    # ----------------------------------------------------------------------


def make_dic_from_parev(code_ch, loc_ch, parev, fsize_ch):
    # Returns a list of pairs {(frag_ch,frag_ec)} of hanzi and EVA fragments
    # that are the conjectured matching parts of the SBJ entry {loc_ch}
    # and of the SPS parag, according to the macro-parsings {segs_ch}
    # and {segs_ec} in {parev}. The arguments {code_ch} and {loc_ch}
    # are not used.
    #
    # If {fsize_ch} is positive, the hanzi fragments {frag_ch} will be
    # all (overlapping) substrings of length {fsize_ch} of all the hanzi
    # gaps in that macro-parsing. Specifically, if {segs_ch[ks]} is a gap
    # (even {ks}), then, if {frag_ch} is centered at character position
    # {kch} of that gap, the corresponding EVA fragment {frag_ec} is taken
    # from {segs_ec[ks]} centered at a location {kec} that is {kch} scaled
    # by the ratio of the lengths of the two gaps. The fragment {frag_ec}
    # is padded with '·' (centered dots) for each position that would
    # fall outside the EVA gap.
    #
    # If {fsize_ch} is zero, the fragments {frag_ch} will be all the hits
    # in the macro-parsing {segs_ch}, whole; and they will be paired
    # with fragments {frag_ec} which are the corresponding hits of {segs_ec}.
    #
    # In both cases any '[' and ']' are removed from the EVA strings.

    score, loc_ec, segs_ch, segs_ec, key_penalty = parev
    assert segs_ch is not None
    ns = len(segs_ch); assert len(segs_ec) == ns
    nh = ns//2; ng = nh+1; assert ns == ng + nh

    def extract_frag_pair(it_ch, fsz_ch, str_ch, str_ec):
        # Extracts a fragment {frag_ch} of length {fsz_ch} at index {it_ch}
        # from {str_ch}. Then extracts from {str_ec}
        # the corresponding fragment {frag_ec}, assuming that
        # the whole of {str_ch} maps to the whole of {str_ec}.

        # Define the number of EVA letters to take on each side of frag center:
        mrg_ec = 15 + int(ceil(2.5*fsz_ch))

        nt_ch = len(str_ch)
        nt_ec = len(str_ec)
        # The '+1's avoid division by zero when the hanzi gap is empty:
        scale = (nt_ec+1)/(nt_ch+1)

        # Limit of fragment on the SBJ gap:
        jt_ch = it_ch + fsz_ch

        # Character indices {kt_ch,kt_ec} of the frag centers:
        kt_ch = (it_ch + jt_ch)/2
        kt_ec = int(floor(scale * kt_ch + 0.5))

        # Start of fragment on the SPS gap, and necessary padding:
        it_ec = kt_ec - mrg_ec
        lpad = 0 if it_ec >= 0 else -it_ec
        it_ec = min(nt_ec-1, max(0, it_ec))

        # Limit of fragment on the SPS gap, and necessary padding:
        jt_ec = kt_ec + mrg_ec
        rpad = 0 if jt_ec <= nt_ec else jt_ec - nt_ec
        jt_ec = min(nt_ec, max(1, jt_ec))

        frag_ch = str_ch[it_ch:jt_ch]
        frag_ec = ("·" * lpad) + str_ec[it_ec:jt_ec] + ("·" * rpad)
        return frag_ch, frag_ec
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    vms_dic = []
    if fsize_ch == 0:
        # Report the keyword hits, which are the odd-numbered segments:
        for ih in range(nh):
            frag_ch = segs_ch[2*ih+1]
            frag_ec = re.sub(r'[\[\]]', "", segs_ec[2*ih+1])
            vms_dic.append((frag_ch, frag_ec,))
    else:
        # Report substrings with length {fsize_ch} of the gaps, which
        # are the even-numbered segments:
        for ig in range(ng):
            gap_ch = segs_ch[2*ig]
            gap_ec = re.sub(r'[\[\]]', "", segs_ec[2*ig])
            nt_ch = len(gap_ch)
            for it_ch in range(nt_ch + 1 - fsize_ch):
                frag_ch, frag_ec = extract_frag_pair(it_ch, fsize_ch, gap_ch, gap_ec)
                vms_dic.append((frag_ch, frag_ec,))
    return vms_dic
    # ----------------------------------------------------------------------


def add_summary_of_parag_search(st, data):
    # Appends to {st} a paragraph with the parag counts and size range
    # found in the dict {data}, which must have the keys 'npar_read',
    # 'npar_with', 'min_size', and 'max_size'.

    h.parags(st, f"""There are {data['npar_read']} parags in the good part
      of the SPS, and only {data['npar_with']} matched the keyword pattern.
      Their EVA letter sizes range in {data['min_size']}..{data['max_size']}.""")
    return
    # ----------------------------------------------------------------------


def add_chosen_starps_parag_section \
  ( st, items_tg, loc_ch, code_ch, items_wh,
    loc_ec, parevs_list, items_wc, items_en
  ):
    # Inserts a parag saying that the SPS parag chosen to match
    # the SBJ entry {loc_ch} (code {code_ch}) is {loc_ec}.
    #
    # The {loc_ec} may be {None} to say that the SBJ entry will not be
    # assigned to any SPS parag.
    #
    # If {loc_ec} is not {None}, looks it up in the evaluation results
    # {parevs_list}. The latter must be a list of lists, each one of them
    # being either {None} or a list of tuples as returned by
    # {analyze_and_show_starps_parags}.
    #
    # If it finds the {loc_ec} in the {parevs_list}, chooses the parev
    # {best_parev} in those lists that has the specified {loc_ec} and
    # minimum badness score. Then displays the parsings of the SBJ entry
    # and of the SPS parag described therein. Also writes the hanzi-EVA
    # dictionary files implied by it.
    #
    # The display includes the "macro-parsing" of the EVA characters of
    # the SPS parag into gaps and hits, as contained in the {best_parev}.
    #
    # The display also includes a table with the "micro-parsing" of the
    # parag defined by the lists {items_tg,items_wh,items_wc,items_en} which
    # will be the columns of the table.
    #
    # If {items_wc} is not {None}, it must be a list with the same size as
    # the other three, containing the text of the parag (with punctuation)
    # suitably chopped to align with the hanzi fragments in {items_wh}.
    #
    # If {items_wc} is {None}, and a {best_parev} was found, the procedure
    # fetches the parag {loc_ec} from the good parags file, and chops it
    # into a list {items_wc} of Voynichese words that aligns with the hanzi
    # fragments in {items_wh}, as well as possible. This may require
    # subdividing some of the fragments in {items_wh} and in the other two
    # lists.
    #
    # Returns that {best_parev}.
    #
    # If it cannot find the {loc_ec} in the {parevs_list}, displays a
    # warning and returns {None}.

    debug = False

    h.section(st, 2, "Chosen match")

    best_parev = None
    if loc_ec is not None:
        h.parags(st, f"""We will tentatively assign {code_ch} ({loc_ch}) to
          {loc_ec}. However we must be aware that the true match may not
          have made it into the "good" subset.""")

        best_parev = find_best_starps_parag(loc_ec, parevs_list)
        if best_parev is None:
            msg = f"WARNING - PARAG {loc_ec} - MISSING PARAG EVALUATION RECORD"
            h.parags(st, f"{msg}")
            err.write(f"!! {msg}\n")
    else:
        h.parags(st, f"""We will not assign {code_ch} ({loc_ch}) to any SPS parag.""")

    if best_parev is not None:
        score, loc_ec_chek, segs_ch, segs_ec, key_penalty = best_parev
        if debug:
            # NOTE(review): {write_wh_ec_wc_macro_parsings} is not defined
            # in this part of the file; confirm before turning {debug} on.
            err.write(f"!& ### macro-parsing ch, ec from parev ###\n")
            write_wh_ec_wc_macro_parsings(err, "!&", segs_ch, segs_ec, None)
        assert loc_ec_chek == loc_ec

        # h.parags(st, "|[]目 [] 目[]  目 []  目[主 ]")
        # h.parags(st, "|目]目 目] 目目]  目 目]  目目主 ]")
        # h.parags(st, "|目目目 目目 目目目  目 目目  目目主 目")
        # h.parags(st, "|目目目目目目目目目目目目目目目目目目目目主目目")

        h.parags(st, "SBJ entry parsing:")
        ch_str = anf.format_macro_parsing_ch(loc_ch, segs_ch, None)
        ch_str = h.protect_html(ch_str)
        h.append_preformatted(st, ch_str, ind = 4, centered = False)

        h.parags(st, "SPS entry parsing:")
        ec_str = anf.format_starps_parag_evaluation(best_parev, None)
        ec_str = h.protect_html(ec_str)
        h.append_preformatted(st, ec_str, ind = 4, centered = False)

        write_dics_from_parev(st, code_ch, loc_ch, best_parev)

        # add_chosen_parag_analysis(st, best_parev)

        if items_wc is None:
            # Fetch the parag from the SPS file:
            ivt_file = "res/starps-gd-wc-par.ivt"
            text_wc = fetch_starps_line(ivt_file, loc_ec)
            assert text_wc is not None, f"** cannot find {loc_ec} in the starps file"
            items_tg, items_wh, items_wc, items_en = \
              alf.align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec \
                ( items_tg, items_wh, items_en, segs_ch, text_wc, segs_ec )

    if items_wc is not None:
        h.section(st, 3, "Aligning the two versions")
        h.parags(st, f""" Here is the same text with the conjectured
          correspondence with parag {loc_ec} of the SPS:""")
        add_three_column_entry_table(st, items_tg, items_wh, items_wc, items_en)
        h.parags(st, """Note that the alignment of the Voynichese column
          is only a rough guess based on the hanzi and EVA letter counts.""")

    return best_parev
    # ----------------------------------------------------------------------


def fetch_starps_line(ivt_file, loc_starps):
    # Reads file {ivt_file} (assumed to be UTF-8) and looks for the first
    # line that starts with the locus ID "<{loc_starps}>". Returns that line
    # minus the locus ID, stripped of leading and trailing blanks. If the
    # {loc_starps} is not found, returns {None}.
    #
    # Angle brackets in {loc_starps}, if any, are ignored.

    # Just in case:
    loc_starps = re.sub(r"[<>]", "", loc_starps)
    loc_tag = f"<{loc_starps}>"

    text = None
    with open(ivt_file, "r", encoding = 'utf-8') as rd:
        for line in rd:
            # Plain string comparison, because the '.'s in the locus ID
            # would match any character if it were used as an RE:
            if line.startswith(loc_tag):
                text = line[len(loc_tag):].strip()
                break
    return text
    # ----------------------------------------------------------------------


def find_best_starps_parag(loc_ec, parevs_list):
    # Scans a bunch of lists of parevs (parag evaluations), selecting the
    # one with smallest badness score.
    #
    # If not {None}, the {parevs_list} must be a list of lists, each one
    # of them being either {None} or a list of tuples as returned by
    # {analyze_and_show_starps_parags}. The procedure ignores elements of
    # {parevs_list} that are {None}.
    #
    # If {loc_ec} is {None}, chooses the absolute best {parev} in
    # {parevs_list}. If {loc_ec} is a string, considers only parevs that
    # have that SPS locus ID. If {loc_ec} is a set, considers only parevs
    # whose SPS locus ID is in that set.
    #
    # The returned result is the best parev among the parevs that were
    # considered. In case of ties, it is the first one found.
    #
    # If it cannot find any parev as requested, returns {None}.

    if parevs_list is None:
        return None

    gud_loc_ec = set((loc_ec,)) if isinstance(loc_ec, str) else loc_ec

    # Find the best candidate record:
    best_parev = None
    min_score = +inf
    for parevs in parevs_list:
        if parevs is None:
            continue
        for pev in parevs:
            sc = pev[0]
            lec = pev[1]
            gud_pass = (gud_loc_ec is None) or (lec in gud_loc_ec)
            if gud_pass and sc < min_score:
                best_parev = pev
                min_score = sc
    return best_parev
    # ----------------------------------------------------------------------


def test_stuff(original):
    # Runs the tests of this module with the matching method
    # selected by the boolean {original}.

    err.write("TESTING\n")
    err.write("----------------------------------------\n")
    test_add_starps_matching_section(original)
    err.write("----------------------------------------\n")
    return
    # ----------------------------------------------------------------------


def test_add_starps_matching_section(original):
    # Tests {add_starps_matching_section} on a sample SBJ entry,
    # writing the resulting HTML document to {sys.stdout}.

    err.write("----------------------------------------\n")
    err.write(f"@@@ testing add_starps_matching_section({original = !r})\n")

    st, code_ch, loc_ch, name_ch, name_py, name_en = \
      add_title_summary_and_intro \
        ( "WHOP", "b2.4.094", '白马茎', 'bái mǎ jīng', 'white horse penis' )

    err.write("@@@ calling display_hanzi_pure_text ...\n")
    # The text must begin with the locus ID, as required
    # by {display_hanzi_pure_text}:
    hanzi_trim_text = f"<{loc_ch}>" + """
      白马茎:[主治]伤中脉绝,阴不起,强志,益气,长肌肉。肥健,生子。
      眼:[主治]惊痫,腹满,疟疾。
      悬蹄:[主治]惊邪,瘈疭,乳难。辟恶气,鬼毒,蛊注,不祥。
      """
    hanzi_trim = display_hanzi_pure_text(st, loc_ch, 51, hanzi_trim_text)

    kwords_en_A = ( 'MAIN-USES', 'MAIN-USES', 'MAIN-USES', )
    kwords_en_B = ( 'MAIN-USES', 'QI', 'MAIN-USES', 'MAIN-USES', 'QI', )
    max_score = 6.0
    variants = \
      ( ( hanzi_trim, kwords_en_A, True, ),
        ( hanzi_trim, kwords_en_B, True, ),
        ( hanzi_trim, kwords_en_B, False, ),
      )

    err.write("@@@ calling add_starps_matching_section ...\n")
    h.section(st, 2, f"Original = {original!r}")
    parevs_list = add_starps_matching_section \
      ( st, code_ch, loc_ch, variants, max_score, original )

    err.write("@@@ finishing document ...\n")
    h.output_doc(st, sys.stdout, 0, last_edit)
    sys.stdout.flush()
    err.write("@@@ done.\n")
    err.write("----------------------------------------\n")
    return
    # ----------------------------------------------------------------------


# Runs the tests when invoked with arguments "R77.TEST {0|1}":
if len(sys.argv) == 3 and sys.argv[1] == "R77.TEST":
    original = (int(sys.argv[2]) != 0)
    test_stuff(original)