#! /usr/bin/python3
# -*- coding: utf-8 -*-
last_edit = "Last edited on 2026-06-04 06:15:31 by stolfi"
import sys, re, os, string, glob
from sys import stderr as err
import html_gen as h
from process_funcs import bash, basic_line_loop
import html_report_funcs as hr
import size_position_funcs as spf
import match_multi_funcs as mmf
import analyze_starps_parag_funcs as anf
import align_bencao_starps_items_funcs as alf
import report_077_alt_matching_funcs as r77alt
import standard_bipatterns as stdbip
import bimatching_eval_funcs as bef
from math import sqrt, hypot, exp, log, floor, ceil, isfinite, isnan, inf, nan
def split_formatted_entry(entry):
    # Splits an SBJ entry (hanzi, pinyin, translation, or Voynichese)
    # that has been cast in multiline format into its parts.
    #
    # The {entry} must be a multiline string.  After optional leading
    # ASCII blanks and newlines, its first line must be "<{LOC}>", where
    # {LOC} is a string of [a-z0-9.].  Each subsequent line must be
    # blank, or a comment starting with '#', or have the format
    # "{TAG}{SEP}{ITEM}", where {TAG} is a string of [A-Za-z0-9] in
    # parens '()', {SEP} is one or more ASCII blanks or '|'s, and {ITEM}
    # is any string with at least one character.
    #
    # Removes leading and trailing ASCII spaces from each {ITEM}, but
    # not ideographic spaces.  That is why the argument-less
    # {str.strip()} is never used here: it would remove them too.
    #
    # Returns the {LOC}, the list of all {TAG}s (parens included), and
    # the list of all {ITEM}s, in order of appearance.
    #
    # Fails with an assertion error if the header line or any item
    # line is malformed.
    head = re.fullmatch(r"[ \012]*[<]([a-z0-9.]+)[>] *[\012](.*)", entry, re.DOTALL)
    assert head is not None, f"bad entry format '{entry[:12]}'"
    loc, body = head.group(1), head.group(2)
    tags = []
    items = []
    for line in body.splitlines():
        # Skip blank lines and '#' comments (only ASCII blanks may precede the '#'):
        rest = line.lstrip(" \n")
        if rest == "" or rest.startswith("#"):
            continue
        parsed = re.fullmatch(r"[ \012]*([(][A-Z0-9a-z]+[)])[ |]+(.+)[ \012]*", line)
        assert parsed is not None, f"bad entry line format '{line}'"
        tags.append(parsed.group(1))
        # Trim ASCII blanks only, so that ideographic spaces survive:
        items.append(parsed.group(2).strip(" "))
    return loc, tags, items
# ----------------------------------------------------------------------
def split_formatted_entry_hanzi(entry):
    # Parses the hanzi version of an SBJ entry that has been cast in
    # multiline format, then checks and pads its items.
    # See {split_formatted_entry} for the format of {entry}.
    #
    # Leading and trailing ASCII blanks of each item are stripped by
    # {split_formatted_entry}.  After that, no item may contain any
    # character in the range [\001-\377]; that is, the items must
    # consist only of hanzi, ideographic blanks, and ideographic
    # punctuation, all of which are retained.  Fails with an assertion
    # error otherwise.
    #
    # All items are then padded on the right with ideographic blanks
    # (U+3000) to the length of the longest item.
    #
    # Returns the {LOC}, the list of all {TAG}s, and the list of all
    # padded {ITEM}s.
    loc_ch, itms_tg, itms_wh = split_formatted_entry(entry)
    for item in itms_wh:
        if re.search(r"[\001-\377]", item):
            assert False, f"ascii character in hanzi item '{item}'"
    # Pad all items.  The filler is written as an explicit escape so that
    # it cannot be confused with (or silently converted to) an ASCII blank,
    # which would be inconsistent with the check above:
    max_item_sz = max((len(item) for item in itms_wh), default = 0)
    itms_wh = [ item.ljust(max_item_sz, "\u3000") for item in itms_wh ]
    return loc_ch, itms_tg, itms_wh
# ----------------------------------------------------------------------
def make_three_column_entry_table(itms_tg, itms_wh, itms_aa, itms_bb):
    # Builds and returns the HTML of a table, as produced by
    # {h.make_table}, with one row per item: the tag from {itms_tg},
    # then the entry from {itms_wh}, then the arbitrary entries from
    # {itms_aa} and {itms_bb}.  A cell with ' | ' is inserted between
    # consecutive entries, so each row has seven cells in total.
    #
    # All four lists must have the same length, with corresponding
    # elements in the same positions.
    N = len(itms_tg)
    assert len(itms_wh) == N, f"{N = } {len(itms_wh) = }"
    assert len(itms_aa) == N, f"{N = } {len(itms_aa) = }"
    assert len(itms_bb) == N, f"{N = } {len(itms_bb) = }"
    bar = ' | '
    rows = [
        (tg, bar, wh, bar, aa, bar, bb)
        for tg, wh, aa, bb in zip(itms_tg, itms_wh, itms_aa, itms_bb)
    ]
    # The tag column is bold and padded; the six other columns are left-aligned:
    tag_mod = "style='padding-left:4ch; padding-right:4ch; text-align:left; font-weight:bold;'"
    col_mods = [ tag_mod ] + [ "align=left" ] * 6
    return h.make_table(rows, by_rows = True, col_mods = col_mods)
# ----------------------------------------------------------------------
def add_three_column_entry_table(st, itms_tg, itms_wh, itms_aa, itms_bb):
    # Appends to document {st}, not centered, the table built by
    # {make_three_column_entry_table} from the four given lists: the
    # tags {itms_tg}, the entries {itms_wh}, and the arbitrary entries
    # {itms_aa} and {itms_bb}.
    #
    # The elements of {itms_wh} are assumed to consist of hanzi and/or
    # ideographic punctuation.  The other columns are supposed to be
    # Latin (or pinyin) letters with ISO-Latin punctuation.
    #
    # All four lists must have the same length, with corresponding
    # elements in the same positions.
    h.append_centered(
        st,
        make_three_column_entry_table(itms_tg, itms_wh, itms_aa, itms_bb),
        centered = False
    )
    return
# ----------------------------------------------------------------------
def entry_align_table(st, rows):
    # Appends to document {st}, through {h.table} and not centered, a
    # table with the given {rows}.  The column modifiers assume five
    # columns: the first one bold and padded (meant for the tags), the
    # other four left-aligned (meant for the hanzi and for arbitrary
    # ascii entries).
    #
    # The modifier list used to be bound to a different name than the
    # one passed to {h.table}, so every call failed with a {NameError}.
    col_mods = [
        "style='padding-left:4ch; padding-right:4ch; text-align:left; font-weight:bold;'",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
    ]
    h.table(st, rows, col_mods = col_mods, centered = False)
    return
# ----------------------------------------------------------------------
def _parse_parm_value(val):
    # Converts the stripped text {val} of a header parameter to a Python value.
    #
    # Recognizes, in this order: a decimal integer; a parenthesized
    # tuple of integers separated by commas and/or blanks; a float with
    # a decimal point and optional exponent; a string in single or
    # double quotes (returned without the quotes); and a bracketed list
    # of elements separated by commas and/or blanks, each returned as a
    # string with its quotes (if any) removed.
    #
    # Fails with an assertion error on anything else.
    int_pat = r"[-+]?[0-9]+"
    if re.fullmatch(int_pat, val):
        return int(val)
    if re.fullmatch(r"[(][-+0-9, ]+[)]", val):
        # Integer tuple.  Pairs are parsed as before; other lengths are
        # now accepted instead of crashing on a failed pair match:
        tup_pat = r"[(] *" + int_pat + r"(?:[ ,]+" + int_pat + r")*[ ,]*[)]"
        assert re.fullmatch(tup_pat, val), f"** bad value «{val}»"
        return tuple(int(x) for x in re.findall(int_pat, val))
    if re.fullmatch(r"[-+]?[0-9]*([.][0-9]|[0-9][.])[0-9]*([Ee][-+]?[0-9]+)?", val):
        return float(val)
    if re.fullmatch(r"['][^']*[']", val) or re.fullmatch(r'["][^"]*["]', val):
        # Quoted string; the quote character cannot occur inside it:
        return val[1:-1]
    if re.fullmatch(r'\[.*\]', val):
        items = []
        for el in re.split(r'[, ]+', val[1:-1]):
            if el == "": continue
            # Drop the quotes of quoted elements.  Both quote styles are
            # now treated alike (the trailing "'" used to be left in):
            if el[0] == '"':
                el = el.replace('"', "")
            elif el[0] == "'":
                el = el.replace("'", "")
            items.append(el)
        return items
    assert False, f"** bad value «{val}»"


def read_parms_from_file_header(rd):
    # Reads {rd} with {basic_line_loop} and looks for lines of the form
    # "# {KEY} = {VALUE}".  Returns a dict with those keys and values.
    #
    # The {KEY} must start with an ASCII letter and continue with ASCII
    # letters, digits, or underscores.  The {VALUE} may be an integer, a
    # tuple of integers, a float, a quoted string, or a bracketed list;
    # see {_parse_parm_value}.  Lines that do not have that form are
    # ignored.  If a key occurs more than once, the last value wins.
    #
    # Writes debugging information about every line read to {err}.
    vms_dic = dict()
    err.write("!= beg\n")

    def process_line(nread, line):
        # Called by {basic_line_loop} for each line; {nread} is
        # presumably the count of lines read so far -- TODO confirm.
        line = line.strip()
        err.write(f"!= {nread:5d} {line = !r}\n")
        m = re.fullmatch(r"# *([a-zA-Z][a-zA-Z_0-9]*) *[=] *(.*)", line)
        if m is None: return
        key = m.group(1)
        val = m.group(2).strip()
        err.write(f"!= {key = !r} {val = !r}\n")
        vms_dic[key] = _parse_parm_value(val)
        return
    # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    basic_line_loop(rd, process_line)
    return vms_dic
# ----------------------------------------------------------------------
def add_intro(code_ch, loc_ch, name_ch, name_py, name_en, source):
    # Creates a new HTML document for the SBJ entry with code {code_ch}
    # and locus ID {loc_ch}, with a "Summary" section and a section
    # showing the entry itself with its pinyin reading and English
    # translation.
    #
    # The {source} must be 'ZHB' or 'CTP'; it selects the background
    # color and is shown in the title.  The {name_en} is converted to
    # lowercase with underscores replaced by blanks.
    #
    # The hanzi, pinyin, and English versions of the entry are read
    # from files "in/{ch,py,en}/{code_ch}-{loc_ch}.utf", which are
    # read as UTF-8.  They must have the same locus ID and the same
    # list of tags.
    #
    # Returns the document {st}, the arguments {code_ch}, {loc_ch},
    # {name_ch}, {name_py}, the normalized {name_en}, and the lists of
    # tags, hanzi items, pinyin items, and English items of the entry.
    assert source == 'ZHB' or source == 'CTP', f"invalid source {source!r}"
    name_en = re.sub(r"_", " ", name_en.lower())
    name_en_caps = string.capwords(name_en)
    title = f"[{code_ch}.077] The '{name_en_caps}' entry of the SBJ [{source}]"
    color = "#ddeeff" if source == 'ZHB' else "#eeffdd"
    st = h.new_doc(title, color, text_width = 1600)
    h.section(st, 2, "Summary")
    h.parags(st, f"""This webpage discusses the SBJ entry titled {name_ch}
{name_py} = "{name_en}", parsed into its sub-entries and their fields, and
possible correspondences to parags of the SPS. The modern Mandarin
reading of the text and an English translation are also shown. This
entry may be referred as "{code_ch}" in tables.""")

    def fetch_entry(utype):
        # Returns the whole contents of the {utype} version of the entry.
        # The encoding is explicit so that the hanzi and pinyin are read
        # correctly whatever the locale; the file is closed even on error.
        fname = f"in/{utype}/{code_ch}-{loc_ch}.utf"
        with open(fname, "r", encoding = "utf-8") as rd:
            return rd.read()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    h.section(st, 2, "The Shennong Bencaojing entry")
    loc1, tags1, itms_wh = split_formatted_entry_hanzi(fetch_entry("ch"))
    loc2, tags2, itms_py = split_formatted_entry(fetch_entry("py"))
    assert loc2 == loc1
    assert tags2 == tags1
    loc3, tags3, itms_en = split_formatted_entry(fetch_entry("en"))
    assert loc3 == loc1
    assert tags3 == tags1
    add_formatted_entry_table(st, tags1, itms_wh, itms_py, itms_en)
    return st, code_ch, loc_ch, name_ch, name_py, name_en, tags1, itms_wh, itms_py, itms_en
# ----------------------------------------------------------------------
def add_chinese_text_section(st, loc_ch, name_ch, name_py, name_en, nch, itms_wh):
    # Appends to {st} a subsection "The full entry" that shows the whole
    # hanzi text of entry {loc_ch}, obtained by concatenating the items
    # {itms_wh}, through {display_hanzi_pure_text}.  The text must have
    # {nch} hanzi once its punctuation is removed.
    #
    # Returns the pure hanzi text.  The parameters {name_ch}, {name_py},
    # and {name_en} are not used.
    h.section(st, 3, "The full entry")
    joined = "".join(itms_wh)
    hanzi_full, tvar_full = display_hanzi_pure_text(
        st, loc_ch, nch, f"<{loc_ch}> (full) {joined}"
    )
    assert tvar_full == "full"
    return hanzi_full
# ----------------------------------------------------------------------
def display_hanzi_pure_text(st, loc_ch, nch, hanzi_text):
    # Takes a {hanzi_text} of the form "<{LOC}> ({TVAR}) {TEXT}"
    # where {LOC} must match {loc_ch} and {TEXT} must be a hanzi
    # string possibly including hanzi punctuation.  ASCII blanks and
    # newlines anywhere in {hanzi_text} are ignored.
    #
    # Extracts a pure hanzi string {hanzi_pure} by removing all
    # punctuation from {TEXT}: colons, square brackets, parentheses,
    # commas, semicolons, and blanks, in both their ASCII and their
    # full-width/ideographic forms, plus the ideographic comma and
    # full stop.  The length of the result (in hanzi) must be {nch}.
    #
    # Appends to {st} a preformatted parag with {loc_ch}, {TVAR}, {nch},
    # and the {hanzi_pure} text, broken into lines of 40 hanzi if {nch}
    # is 45 or more.  Returns {hanzi_pure} and the {TVAR}.
    #
    # Fails with an assertion error if the format, the locus ID, or
    # the length is not as expected.
    hanzi_text = hanzi_text.strip()
    hanzi_text = re.sub(r"[ \012]", "", hanzi_text)
    m = re.fullmatch(r"<([a-z0-9.]+)> *[(]([^()]+)[)] *(.*)", hanzi_text)
    assert m is not None, f"invalid chinese entry format {hanzi_text!r}"
    assert m.group(1) == loc_ch, f"loc ID mismtch: {loc_ch} != {m.group(1)}"
    tvar_ch = m.group(2)
    hanzi_body = m.group(3)
    # Delete the punctuation with a translation table rather than a
    # regex character class: in the class, the unescaped ']' closed the
    # class early, so that nothing was ever removed.  The non-ASCII
    # characters are given as escapes so they cannot be mangled:
    punct = (
        ":[](),; "                                        # ASCII forms.
        "\u3001\u3002"                                    # Ideographic comma and full stop.
        "\uFF1A\uFF3B\uFF3D\uFF08\uFF09\uFF0C\uFF1B"      # Full-width ':[](),;'.
        "\u3000"                                          # Ideographic blank.
    )
    hanzi_pure = hanzi_body.translate({ ord(c): None for c in punct })
    nch_real = len(hanzi_pure)
    assert nch_real == nch, f"length error: {nch = } actual {nch_real}"
    if nch < 45:
        hanzi_chops = [ hanzi_pure, ]
    else:
        hanzi_chops = [ hanzi_pure[k:k+40] for k in range(0, nch, 40) ]
    hanzi_display = f"<{loc_ch}> ({tvar_ch}) {nch:2d} hanzi\n" + "\n".join(hanzi_chops)
    h.append_preformatted(st, h.protect_html(hanzi_display), ind = 4, centered = False)
    return hanzi_pure, tvar_ch
# ----------------------------------------------------------------------
def add_formatted_entry_table(st, itms_tg, itms_wh, itms_py, itms_en):
    # Appends to {st} a subsection "Pinyin and translation" with a short
    # explanation followed by the table of the tags {itms_tg}, the hanzi
    # items {itms_wh}, their pinyin readings {itms_py}, and their English
    # translations {itms_en}; see {add_three_column_entry_table}.
    intro = """Here is the same entry, with punctuation added according
to this parsing, the modern Mandarin readings in pinyin, and a somewhat
literal English translation:"""
    h.section(st, 3, "Pinyin and translation")
    h.parags(st, intro)
    add_three_column_entry_table(st, itms_tg, itms_wh, itms_py, itms_en)
    return
# ----------------------------------------------------------------------
def add_starps_matching_section(st, code_ch, loc_ch, variants, max_score, locs_to_try, locs_to_show):
    # Appends to document {st} a section "Matching results" that
    # searches the SPS file for parags matching a given SBJ entry in
    # various ways.
    #
    # The {variants} argument must be a list of pairs {((tvar_ch,
    # cleantx_ch), kwords_en)}, where {cleantx_ch} is a suitably cleaned
    # and trimmed hanzi text of the entry to be matched, {tvar_ch} is a
    # short string that identifies {cleantx_ch} among other variants of
    # the same SBJ recipe, and {kwords_en} is a list of strings
    # identifying the "cribs" to use in the matching.
    #
    # Calls {add_starps_matching_subsection} for each element of
    # {variants}, to compare that variant to the SPS parags, and then
    # {add_summary_of_matching_section} to add a summary of the matches
    # at the end.
    #
    # Returns a list of the results of those calls.  Each element of
    # this list is a list of matching results.  Each matching result is
    # a tuple as returned by {r77alt.compare_bencao_entry_to_starps_parags}
    # (quod videt).
    #
    # If {locs_to_try} is not {None}, it must be a set with the location
    # IDs of the SPS parags to be considered for the match.  If
    # {locs_to_try} is {None}, considers all parags.
    #
    # If {locs_to_show} is not {None}, tries to show the matches with
    # the parags whose names are in the set {locs_to_show}.
    h.section(st, 2, f"Matching results")
    parevs_list = []
    for (tvar_ch, cleantx_ch), kwords_en in variants:
        cribs = ",".join(kwords_en)
        parevs_list.append(
            add_starps_matching_subsection(
                st, code_ch, loc_ch, tvar_ch, cleantx_ch,
                kwords_en, f"Variant: {tvar_ch} Cribs: {cribs}",
                max_score, locs_to_try, locs_to_show
            )
        )
    add_summary_of_matching_section(st, loc_ch, variants, parevs_list, locs_to_show)
    return parevs_list
# ----------------------------------------------------------------------
def add_starps_matching_subsection(
        st, code_ch, loc_ch, tvar_ch, cleantx_ch,
        kwords_en, title, max_score, locs_to_try, locs_to_show
    ):
    # Appends to document {st} a subsection that searches the SPS
    # file for parags matching one variant of a given SBJ entry.
    # The parameters are:
    #
    #   {code_ch}       four-letter code of the SBJ entry.
    #   {loc_ch}        loc ID of the entry to be matched in the SBJ file.
    #   {tvar_ch}       a tag that identifies the variant (trimming etc.) of that entry.
    #   {cleantx_ch}    a suitably cleaned and trimmed hanzi text of that entry.
    #   {kwords_en}     a list of keyword codes (strings).
    #   {title}         a title for the subsection.
    #   {max_score}     maximum interesting badness score.
    #   {locs_to_try}   set of loc IDs of SPS parags to consider, or {None} for all.
    #   {locs_to_show}  set of loc IDs of SPS parags to show no matter what
    #                   their score, or {None} for none.
    #
    # Each element of {kwords_en} should be a /keyword code/, a string
    # like 'USES' or 'QI' that specifies an abstract keyword.  Each code
    # is converted by {stdbip.get_bencao_starps_bipattern} to a
    # bipattern; the list of these is the /bitemplate/ {bitemp}, which
    # is described in the document and passed on to
    # {r77alt.compare_bencao_entry_to_starps_parags}.  That procedure
    # evaluates the SPS parags for how well they match the SBJ entry.
    #
    # Let {nh} be the number of keyword codes, {ng} be {nh+1}, and {ns}
    # be {ng + nh}.  According to the original notes, for each parag
    # considered the texts {cleantx_ch} and {cleantx_ec} (the latter with
    # only lowercase EVA letters [a-z?]) are split into macro-parsings
    # {segs_ch[0..ns-1]} and {segs_ec[0..ns-1]}, each consisting of {nh}
    # /hits/ (strings matched by the keyword templates) and {ng} /gaps/
    # (the strings before, between, and after the hits); and the badness
    # {score} combines penalties for the use of non-perfect hits (like
    # 'laiin' instead of 'daiin') and for discrepancies between the sizes
    # of the gaps in {segs_ch} and {segs_ec}.
    #
    # The result of the comparison is a list of parag evaluation tuples
    # (/parevs/), each with the format {(score, loc_ch, tvar_ch, loc_ec,
    # segs_ch, segs_ec, key_penalty)}.  The list is checked with
    # {validate_parevs}, which expects it sorted by non-decreasing score.
    #
    # The procedure shows in {st} the first {nc_show} parevs: those with
    # score at most {max_score}, plus the first one that exceeds it, but
    # at most 30 in all, and at least three (or all, if fewer) when the
    # cap was not reached.  Parevs whose SPS loc ID is in {locs_to_show}
    # are shown in any case.  Each is formatted with
    # {anf.format_starps_parag_evaluation}, preceded by the macro-parsing
    # of the SBJ entry whenever it differs from that of the previous
    # parev shown.
    #
    # Returns the list of the parevs that were shown.
    debug = False
    verbose = True

    # Section title and the trimmed SBJ text, with a consistency check:
    ctsize_ch = len(cleantx_ch)
    h.section(st, 3, f"{title}")
    h.parags(st, f"Trimmed SBJ entry ({tvar_ch} - {ctsize_ch} hanzi):")
    shown_tx, shown_tvar = display_hanzi_pure_text(
        st, loc_ch, ctsize_ch, f"<{loc_ch}> ({tvar_ch}) " + cleantx_ch
    )
    assert shown_tx == cleantx_ch
    assert shown_tvar == tvar_ch

    # Get and display the bitemplate {bitemp}, and the patterns used to
    # highlight the keywords in the SBJ text and in the SPS text:
    h.parags(st, "Keyword patterns:")
    bitemp = [ stdbip.get_bencao_starps_bipattern(kw_en) for kw_en in kwords_en ]
    r77alt.add_bitemplate_description(st, kwords_en, bitemp)
    hipat_ch, hipat_ec = r77alt.get_keyword_highlight_patterns_alt(bitemp)
    if debug:
        err.write(f"!@ SBJ highlight pattern = {hipat_ch!r}\n")
        err.write(f"!@ SPS highlight pattern = {hipat_ec!r}\n")

    # Scan the SPS parags collecting reasonable matches:
    parevs, _data = r77alt.compare_bencao_entry_to_starps_parags(
        code_ch, loc_ch, tvar_ch, cleantx_ch, bitemp, max_score, locs_to_try
    )
    nc = len(parevs)
    h.parags(st, f"Obtained {nc} parag evaluations.")

    # Count the leading parevs with acceptable score:
    nc_good = 0
    for pev in parevs:
        if not (pev[0] <= max_score): break
        nc_good += 1
    err.write(f" {nc_good = }\n")
    h.parags(st, f"Found {nc_good} parags with max score {max_score}.")

    # Decide how many to show: the good ones plus the first bad one,
    # but not too many, and not too few either:
    max_show = 30
    nc_show = nc_good + 1 if nc_good < nc else nc_good
    nc_show = min(nc_show, max_show)
    if nc_show < max_show:
        nc_show = max(nc_show, min(nc, 3))
    # Salutary paranoia:
    err.write(f" {nc_show = }\n")
    assert nc_show <= max_show + 1
    validate_parevs(parevs, loc_ch, tvar_ch, cleantx_ch, max_score, nc_good, verbose)

    # Show the candidates:
    h.parags(st, f"Showing {nc_show} matches:")
    if locs_to_show is None: locs_to_show = set()
    ec_list_blocks = []
    last_segs_ch = None
    parevs_out = []
    for ic, pev in enumerate(parevs):
        score, loc_ch_1, tvar_ch_1, loc_ec, segs_ch, segs_ec, key_penalty = pev
        assert loc_ch_1 == loc_ch
        assert tvar_ch_1 == tvar_ch
        if not (ic < nc_show or loc_ec in locs_to_show):
            continue
        if segs_ch != last_segs_ch:
            # The SBJ parsing changed, so it must be shown again:
            ch_str = anf.format_macro_parsing_ch(loc_ch, tvar_ch, segs_ch, hipat_ch)
            ec_list_blocks.extend([ "\n", h.indent_lines(4, ch_str), "\n" ])
            last_segs_ch = segs_ch
        ec_list_blocks.append(anf.format_starps_parag_evaluation(pev, hipat_ec))
        parevs_out.append(pev)
    ec_list_str = h.protect_html("\n".join(ec_list_blocks))
    h.append_preformatted(st, ec_list_str, ind = 2, centered = False)
    return parevs_out
# ----------------------------------------------------------------------
def validate_parevs(parevs, loc_ch, tvar_ch, cleantx_ch, max_score, nc_good, verbose):
    # Paranoid consistency checks on the list {parevs} of parag
    # evaluation tuples {(score, loc_ch, tvar_ch, loc_ec, segs_ch,
    # segs_ec, key_penalty)}.  Checks that every parev:
    #
    #   has the given {loc_ch} and {tvar_ch};
    #   has an SPS loc ID different from that of the previous parev;
    #   has segments {segs_ch} that concatenate to {cleantx_ch};
    #   has a score not smaller than that of the previous parev;
    #   has a score not above {max_score}, if among the first {nc_good};
    #   has a score that agrees, to within 1.0e-6, with the one recomputed
    #     by {bef.compute_full_score_from_macro_parsings}.
    #
    # Fails with an assertion error at the first violation.  If
    # {verbose} is true, writes the loc ID and score of each parev to {err}.
    #
    # NOTE(review): the original indentation was lost; the score
    # recomputation is assumed to apply to every parev, not only to the
    # first {nc_good} -- confirm.
    last_score = -inf
    last_loc_ec = "NONE"
    for ic, pev in enumerate(parevs):
        score, loc_ch_1, tvar_ch_1, loc_ec, segs_ch, segs_ec, key_penalty = pev
        assert loc_ch_1 == loc_ch
        assert tvar_ch_1 == tvar_ch
        assert loc_ec != last_loc_ec # No duplicate parevs.
        assert cleantx_ch == "".join(segs_ch)
        if verbose: err.write(f" parag {loc_ec:<12s} {score = :6.2f}\n")
        assert score >= last_score # Badness must be non-decreasing.
        if ic < nc_good:
            assert score <= max_score # No bad parevs in this range.
        score_check = bef.compute_full_score_from_macro_parsings(
            segs_ch, "ch", segs_ec, "ec", key_penalty
        )
        if score != score_check:
            # Report both values in full before deciding if the difference matters:
            err.write(f"{score = :24.16e}\n")
            err.write(f"{score_check = :24.16e}\n")
            assert abs(score - score_check) < 1.0e-6
        last_score, last_loc_ec = score, loc_ec
    return
# ----------------------------------------------------------------------
def write_dics_from_parev(st, code_ch, pev):
    # Writes a set of hanzi-to-EVA dictionary files based on the parag
    # evaluation tuple {pev}, which has the locus ID {loc_ch} and trim
    # variant {tvar_ch} of an SBJ entry, the locus ID {loc_ec} of an SPS
    # parag, and the macro-parsings of the two texts.
    #
    # For each {fsize_ch} in {0..4}, gets a list of pairs {(frag_ch,
    # frag_ec)} from {make_dic_from_parev} and writes it to file
    # "dics/{code_ch}_{loc_ec}_{fsize_ch}.dic", in UTF-8, one pair per
    # line, after three header comment lines.  When {fsize_ch} is
    # positive, every {frag_ch} must have exactly that length.
    #
    # In the file names and contents, {loc_ec} is stripped of any angle
    # brackets and a single-digit final line number is zero-padded to
    # two digits.  The parameter {st} is not used.
    score, loc_ch, tvar_ch, loc_ec, segs_ch, segs_ec, key_penalty = pev
    # Just in case, remove the brackets of a bracketed locus ID.  (The
    # previous pattern removed the whole "<...>", leaving an empty ID.)
    loc_ec = re.sub(r"[<>]", "", loc_ec)
    loc_ec = re.sub(r"[.]([0-9])$", r".0\1", loc_ec) # Zero-pad the line number.
    max_fsize_ch = 4
    for fsize_ch in range(max_fsize_ch + 1):
        vms_dic = make_dic_from_parev(code_ch, loc_ch, pev, fsize_ch)
        assert vms_dic is not None
        dic_file = f"dics/{code_ch}_{loc_ec}_{fsize_ch}.dic"
        pref = f"{code_ch}:{tvar_ch} | {loc_ch:<8s} | {loc_ec:<8s} |"
        # The {with} ensures the file is closed even if the check below fails:
        with open(dic_file, "w", encoding = "utf-8") as wr:
            wr.write("# -*- coding: utf-8 -*-\n")
            wr.write(f"# {loc_ch = }\n")
            wr.write(f"# {loc_ec = }\n")
            for frag_ch, frag_ec in vms_dic:
                wr.write(pref)
                assert fsize_ch == 0 or len(frag_ch) == fsize_ch
                frag_ch = frag_ch.ljust(10," ")
                frag_ec = frag_ec.ljust(50," ")
                wr.write(f" {frag_ch} | {frag_ec} |\n")
    return
# ----------------------------------------------------------------------
def add_summary_of_matching_section(st, loc_ch, variants, parevs_list, locs_to_show):
    # Appends to {st} a section "Matching summary".
    #
    # The {variants} must be a list of pairs {((tvar_ch, cleantx_ch),
    # kwords_en)}.  The {parevs_list} must be a list of lists of parevs,
    # where {parevs_list[iv]} is the result of matching according to
    # variant {variants[iv]}.  Every parev must have the given {loc_ch}
    # and the {tvar_ch} of its variant.
    #
    # Selects a few parags that look like valid matches, or best
    # approximations thereof: those chosen by {select_best_cands} for
    # each variant, plus those in {locs_to_show}, which may be a
    # collection of SPS loc IDs or {None}.  The argument {locs_to_show}
    # itself is not modified.
    #
    # Then, for each variant, calls {add_summary_of_variant_matching} to
    # print a terse summary of each of those parags, listed in order of
    # increasing best score over all variants (ties broken by loc ID).
    h.section(st, 2, "Matching summary")
    nv = len(variants); assert nv == len(parevs_list)

    # First, select the best matches from each variant, and any other
    # parags that have similar badness.  Start from a private copy, since
    # {locs_to_show} may be {None} and must not be changed in any case:
    cands = set() if locs_to_show is None else set(locs_to_show)
    for parevs in parevs_list:
        cands |= select_best_cands(parevs)

    # Second, for each cand, get the best score among all the variants.
    # It remains {+inf} for a cand that has no parev at all:
    best_loc_scores = [] # List of pairs {(loc_ec, score_min)}.
    for loc_ec in cands:
        score_min = +inf
        for (tt_ch, kwords_en), parevs in zip(variants, parevs_list):
            tvar_ch, cleantx_ch = tt_ch
            for pev in parevs:
                score_p, loc_ch_p, tvar_ch_p, loc_ec_p, segs_ch_p, segs_ec_p, key_penalty_p = pev
                assert loc_ch_p == loc_ch
                assert tvar_ch_p == tvar_ch
                if loc_ec == loc_ec_p and score_p < score_min:
                    score_min = score_p
        best_loc_scores.append((loc_ec, score_min, ))
    # The loc ID is a secondary key because sets have no defined order,
    # and so ties would otherwise be listed in an unpredictable order:
    best_loc_scores.sort(key = lambda x: (x[1], x[0]))

    # Now print the summaries per variant:
    for (tt_ch, kwords_en), parevs in zip(variants, parevs_list):
        tvar_ch, cleantx_ch = tt_ch
        cribs = ",".join(kwords_en)
        title = f"Trim: {tvar_ch} Cribs: {cribs}"
        add_summary_of_variant_matching(st, best_loc_scores, parevs, title)
    return
# ----------------------------------------------------------------------
def select_best_cands(parevs):
    # Given a list {parevs} of parevs, selects the one with minimum
    # (finite) badness score and a few more with similar scores, if any.
    # Returns the /set/ of the SPS locus IDs of the parags selected.
    #
    # Expects the list {parevs} to be sorted by non-decreasing badness
    # scores.  Parevs with infinite score are ignored.  If {parevs} is
    # {None} or has no finite score, returns the empty set.
    #
    # A parev is selected if its score is at most {1 + tol} times the
    # minimum score.  The tolerance {tol} starts at 0.2 and is halved
    # after each selection, and becomes zero once 5 parags have been
    # selected.
    #
    # NOTE(review): the original indentation was lost; the tolerance
    # update is assumed to happen only when a parag is selected, not
    # for every parev -- confirm.
    max_cands = 5
    chosen = set()
    if parevs is None:
        return chosen
    score_min = None
    tol = 0.2
    for pev in parevs:
        score, _loc_ch, _tvar_ch, loc_ec, _segs_ch, _segs_ec, _key_penalty = pev
        if not (score < +inf):
            continue
        if score_min is None:
            # The first finite score is the minimum, since the list is sorted:
            score_min = score
        assert score >= score_min
        if score <= (1 + tol)*score_min:
            chosen.add(loc_ec)
            tol = 0 if len(chosen) >= max_cands else tol/2
    return chosen
# ----------------------------------------------------------------------
def add_summary_of_variant_matching(st, best_loc_scores, parevs, title):
    # Appends to {st} a subsection with a summary table of the matching
    # attempts of one variant.
    #
    # The {best_loc_scores} must be a list of pairs {(loc_ec, score_min)}
    # where {loc_ec} is the locus ID of an SPS parag and {score_min}
    # is its lowest badness score over all the variants.
    # The order of these pairs defines the order in which the
    # parags will be listed.
    #
    # The {parevs} must be the list of parevs that resulted from
    # the matching of this variant.
    #
    # The {title} should be an explanatory title for the variant.
    #
    # The table has one row per pair and seven columns: trim variant
    # tag, SPS locus ID, badness score, EVA hits (each prefixed with
    # '@'), keyword penalty, total size error, and gap size errors.  If
    # the parag has no parev in {parevs}, or its score is infinite, the
    # row has only "No match" in the first column.  Every row is padded
    # with empty cells to the full seven columns (it used to be padded
    # to six only, leaving those rows one cell short).
    #
    # NOTE(review): the original indentation was lost; all cells after
    # the first are assumed to be added only when the score is finite
    # -- confirm.

    def find_parev(loc_ec, parevs):
        # Finds the first parev of parag {loc_ec} in {parevs}, or {None}:
        for pev in parevs:
            if pev[3] == loc_ec:
                return pev
        return None
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    sty_tvar_ch = "padding-left:4ch; padding-right:1ch; text-align:left; font-weight:bold;"
    sty_loc_ec = "padding-left:1ch; padding-right:2ch; text-align:left; font-weight:bold;"
    sty_score = "padding-left:1ch; padding-right:1ch; text-align:right;"
    sty_hter = "padding-left:1ch; padding-right:1ch; text-align:left;"
    col_mods = [
        f"style='{sty_tvar_ch}'",
        f"style='{sty_loc_ec}'",
        f"style='{sty_score}'",
        f"style='{sty_hter}'",
        f"style='{sty_score}'",
        f"style='{sty_hter}'",
        f"style='{sty_hter}'",
    ]
    ncols = len(col_mods)

    h.section(st, 3, title)
    rows = []
    for loc_ec, score_min in best_loc_scores:
        pev = find_parev(loc_ec, parevs)
        row = [ ]
        if pev is None:
            row.append("No match")
        else:
            score_p, loc_ch_p, tvar_ch_p, loc_ec_p, segs_ch_p, segs_ec_p, key_penalty_p = pev
            # The overall minimum cannot exceed the score for this variant:
            assert score_p >= score_min
            if score_p == +inf:
                row.append("No match")
            else:
                row.append(tvar_ch_p)
                row.append(loc_ec)
                row.append(f"({score_p:.3f})")
                hits_ec, tot_err, gap_errs = short_parev_summary(segs_ch_p, segs_ec_p)
                row.append(",".join([ "@" + ht for ht in hits_ec ]))
                row.append(f"({key_penalty_p:.3f})")
                # Size errors are shown signed, with "00" for an exact fit:
                row.append("00" if tot_err == 0 else f"{tot_err:+d}")
                row.append(",".join([ "00" if ger == 0 else f"{ger:+d}" for ger in gap_errs ]))
        while len(row) < ncols: row.append("")
        rows.append(row)
    html_tb = h.make_table(rows, by_rows = True, col_mods = col_mods)
    h.append_centered(st, html_tb, centered = False)
    return
# ----------------------------------------------------------------------
def short_parev_summary(segs_ch, segs_ec):
    # Summarizes a pair of macro-parsings {segs_ch} and {segs_ec}.  Both
    # must have the same odd number {ns} of segments, where the segments
    # with even index are the gaps and those with odd index are the hits.
    #
    # Returns the list of the hits in {segs_ec}; the total size error,
    # computed by {anf.size_error} from the total lengths of {segs_ch}
    # and {segs_ec}; and the list of the size errors of the gaps,
    # computed by {anf.size_error} from the lengths of corresponding
    # gaps of {segs_ch} and {segs_ec}.
    ns = len(segs_ch); assert len(segs_ec) == ns
    nh = ns//2; ng = nh + 1; assert ns == nh + ng
    # Hits are at odd positions:
    hits_ec = list(segs_ec[1:ns:2])
    tsz_ch = sum(len(seg) for seg in segs_ch)
    tsz_ec = sum(len(seg) for seg in segs_ec)
    tot_err = anf.size_error(tsz_ch, tsz_ec)
    # Gaps are at even positions:
    gap_errs = [
        anf.size_error(len(gap_ch), len(gap_ec))
        for gap_ch, gap_ec in zip(segs_ch[0:ns:2], segs_ec[0:ns:2])
    ]
    return hits_ec, tot_err, gap_errs
# ----------------------------------------------------------------------
def add_chosen_starps_parag_section(
        st, itms_tg, loc_ch, code_ch, itms_wh, chosen_loc_ec, chosen_tvar_ch, parevs_list, itms_en
    ):
    # Appends to {st} a section "Chosen match", saying that the SPS
    # parag chosen to match the SBJ entry {loc_ch} (code {code_ch}) is
    # {chosen_loc_ec}.
    #
    # The {chosen_loc_ec} may be {None} to say that the SBJ entry will not
    # be assigned to any SPS parag.
    #
    # If {chosen_loc_ec} is not {None}, the procedure looks it up in the
    # evaluation results {parevs_list} with {find_best_starps_parag}.
    # The latter must be a list of lists, each one of them being either
    # {None} or a list of tuples as returned by
    # {analyze_and_show_starps_parags}.
    #
    # If {chosen_tvar_ch} is not {None}, considers only parevs that have
    # that specific trim variant tag.  If {chosen_tvar_ch} is {None},
    # considers all parevs, of any trim variant.
    #
    # If it finds the {chosen_loc_ec} in the {parevs_list}, chooses the
    # parev {best_parev} in those lists that has the specified
    # {chosen_loc_ec} and minimum badness score.  Then displays the
    # parsings of the SBJ entry and of the SPS parag described therein.
    # Also writes the hanzi-EVA dictionary files implied by it, with
    # {write_dics_from_parev}.
    #
    # The display includes the "macro-parsing" of the EVA characters of
    # the SPS parag into gaps and hits, as contained in the {best_parev}.
    #
    # The display includes also a table with the "micro-parsing" of the
    # parag obtained by refining and merging the micro-parsings {itms_tg},
    # {itms_wh}, {itms_en}, as well as the Voynichese word-split text of
    # parag {chosen_loc_ec} read from the good parags file
    # "res/starps-gd-wc-par.ivt".  This may require subdividing some of
    # the fragments in those micro-parsings.  Fails with an assertion
    # error if the parag is not in that file.
    #
    # Returns that {best_parev}.
    #
    # If it cannot find the {chosen_loc_ec} in the {parevs_list}, displays
    # a warning, also written to {err}, and returns {None}.
    debug = False
    h.section(st, 2, "Chosen match")
    best_parev = None
    if chosen_loc_ec is not None:
        h.parags(st, f"""We will tentatively assign {code_ch} ({loc_ch}) to
{chosen_loc_ec}. However we must be aware
that the true match may not have made it into the "good" subset.""")
        best_parev = find_best_starps_parag(chosen_tvar_ch, chosen_loc_ec, parevs_list)
        if best_parev is None:
            msg = f"WARNING - PARAG {chosen_loc_ec} - MISSING PARAG EVALUATION RECORD"
            h.parags(st, f"{msg}")
            err.write(f"!! {msg}\n")
    else:
        h.parags(st, f"""We will not assign {code_ch} ({loc_ch}) to any SPS parag.""")

    itms_wc = None
    if best_parev is not None:
        # From here on the SBJ loc ID used is the one in the parev; it has
        # its own name so that it does not hide the parameter {loc_ch}:
        score, loc_ch_p, tvar_ch, loc_ec, segs_ch, segs_ec, key_penalty = best_parev
        if debug:
            err.write(f"!& ### macro-parsing ch, ec from parev ###\n")
            write_wh_ec_wc_macro_parsings(err, "!&", segs_ch, segs_ec, None)
        assert loc_ec == chosen_loc_ec
        if chosen_tvar_ch is not None: assert tvar_ch == chosen_tvar_ch

        h.parags(st, f"SBJ entry parsing:")
        ch_str = anf.format_macro_parsing_ch(loc_ch_p, tvar_ch, segs_ch, None)
        ch_str = h.protect_html(ch_str)
        h.append_preformatted(st, ch_str, ind = 4, centered = False)

        h.parags(st, f"SPS entry parsing:")
        ec_str = anf.format_starps_parag_evaluation(best_parev, None)
        ec_str = h.protect_html(ec_str)
        h.append_preformatted(st, ec_str, ind = 4, centered = False)

        write_dics_from_parev(st, code_ch, best_parev)

        # Fetch the parag from the word-split SPS good parags file.  The
        # check must come before any use of the text (it used to come
        # after the clean-up calls, too late to be of any help):
        ivt_file = "res/starps-gd-wc-par.ivt"
        rawtx_wc, nlin = fetch_starps_line(ivt_file, chosen_loc_ec)
        assert rawtx_wc is not None, f"** cannot find {chosen_loc_ec} in the starps file"

        def data_error(msg):
            # NOTE(review): {file_line_error} is neither defined nor
            # imported in the visible part of this file -- confirm.
            file_line_error(ivt_file, nlin, msg, f"<{chosen_loc_ec}> {rawtx_wc}")
            assert False
        # ..................................................................

        utype = "wc"
        cleantx_wc, head, tail = spf.clean_up_starps_raw_text(rawtx_wc, utype, data_error)
        cleantx_wc = spf.normalize_starps_text(cleantx_wc, utype, data_error)
        itms_tg, itms_wh, itms_wc, itms_en = \
            alf.align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec(
                itms_tg, itms_wh, itms_en, segs_ch, cleantx_wc, segs_ec
            )

    if itms_wc is not None:
        h.section(st, 3, "Aligning the two versions")
        h.parags(st, f""" Here is the same text with the conjectured
correspondence with parag {chosen_loc_ec} of the
SPS:""")
        add_three_column_entry_table(st, itms_tg, itms_wh, itms_wc, itms_en)
        h.parags(st, """Note that the alignment of the Voynichese column
is only a rough guess based on the hanzi and EVA letter counts.""")
    return best_parev
# ----------------------------------------------------------------------
def fetch_starps_line(ivt_file, loc_starps):
    # Reads file {ivt_file} (as UTF-8) and looks for the first line that
    # begins with the locus ID "<{loc_starps}>".  Angle brackets in
    # {loc_starps}, if any, are ignored.
    #
    # Returns that line, minus the locus ID, stripped of leading and
    # trailing whitespace.  Also returns the line number (from 1).
    #
    # If the {loc_starps} is not found, returns {None} and the total
    # number of lines in the file.
    #
    # The locus ID is compared literally.  It used to be taken as a
    # regular expression, so that its '.' matched any character, and,
    # for example, "<f1.1>" could pick up a line starting with "<f1x1>".
    # Just in case:
    loc_starps = re.sub(r"[<>]", "", loc_starps)
    loc_tag = f"<{loc_starps}>"
    text = None
    nlin = 0
    # The {with} ensures that the file is closed even on a read error:
    with open(ivt_file, "r", encoding = "utf-8") as rd:
        for line in rd:
            nlin += 1
            if line.startswith(loc_tag):
                text = line.replace(loc_tag, "").strip()
                break
    return text, nlin
# ----------------------------------------------------------------------
def find_best_starps_parag(tvar_ch, loc_ec, parevs_list):
    # Picks, among several lists of parevs (parag evaluations), the
    # parev with the smallest badness score.
    #
    # The {parevs_list} may be {None}, or a list whose elements are each
    # either {None} (ignored) or a list of 7-tuples
    # {(score, loc_ch, tvar_ch, loc_ec, segs_ch, segs_ec, key_penalty)}.
    #
    # The {loc_ec} filter restricts the SPS locus ID of the candidates:
    # a string means "exactly this locus", a set means "any locus in the
    # set", and {None} means "any locus". The {tvar_ch} filter works the
    # same way on the trimming variant tag.
    #
    # A candidate replaces the current best only if its score is strictly
    # smaller, so among equal scores the first one seen is kept, and a
    # candidate whose score is not below {+inf} is never chosen.
    #
    # Returns the selected parev tuple, or {None} if none qualified.
    if parevs_list is None:
        return None

    def as_allowed_set(spec):
        # A lone string stands for the singleton set containing it.
        return {spec} if isinstance(spec, str) else spec

    ok_locs = as_allowed_set(loc_ec)
    ok_tvars = as_allowed_set(tvar_ch)

    winner = None
    winner_score = +inf
    for group in parevs_list:
        if group is None:
            continue
        for cand in group:
            cand_score, _loc_ch, cand_tvar, cand_loc, _segs_ch, _segs_ec, _key_pen = cand
            loc_fits = ok_locs is None or cand_loc in ok_locs
            tvar_fits = ok_tvars is None or cand_tvar in ok_tvars
            if not (loc_fits and tvar_fits):
                continue
            if cand_score < winner_score:
                winner = cand
                winner_score = cand_score
    return winner
# ----------------------------------------------------------------------
def make_dic_from_parev(code_ch, loc_ch, pev, fsize_ch):
    # Builds a list of pairs {(frag_ch, frag_ec)} of hanzi and EVA
    # fragments taken from the macro-parsings stored in the parev {pev},
    # which must be a 7-tuple
    # {(score, loc_ch, tvar_ch, loc_ec, segs_ch, segs_ec, key_penalty)}
    # whose {loc_ch} field equals the {loc_ch} argument.
    #
    # The lists {segs_ch} and {segs_ec} must have the same odd length,
    # with gaps at even indices and keyword hits at odd indices. Any
    # '[' and ']' are removed from the {segs_ec} elements before use.
    #
    # If {fsize_ch} is zero, there is one pair per hit: the whole hit of
    # {segs_ch} paired with the whole corresponding hit of {segs_ec}.
    #
    # Otherwise, for every gap and every substring {frag_ch} of length
    # {fsize_ch} of the hanzi gap, the EVA fragment {frag_ec} is a window
    # of the corresponding EVA gap. The window is centered at the center
    # of {frag_ch} scaled by the ratio {(nt_ec+1)/(nt_ch+1)} of the gap
    # lengths, and spans {15 + ceil(2.5*fsize_ch)} EVA letters to each
    # side. Where the window extends beyond the EVA gap, it is padded
    # with '·' (centered dots).
    #
    # The parameter {code_ch} is not used by this procedure.
    _score, pev_loc_ch, _tvar_ch, _loc_ec, segs_ch, segs_ec, _key_penalty = pev
    assert pev_loc_ch == loc_ch
    assert segs_ch is not None
    ns = len(segs_ch)
    assert len(segs_ec) == ns
    nh = ns // 2
    ng = nh + 1
    assert ns == ng + nh

    def strip_brackets(seg_ec):
        # Removes the '[' and ']' marks from an EVA segment.
        return re.sub(r'[\[\]]', "", seg_ec)

    def paired_window(start_ch, width_ch, text_ch, text_ec):
        # Returns the substring of {text_ch} with {width_ch} characters
        # starting at {start_ch}, and the matching padded window of
        # {text_ec}, assuming all of {text_ch} maps to all of {text_ec}.
        len_ec = len(text_ec)
        # Number of EVA letters to take on each side of the center:
        half_ec = 15 + int(ceil(2.5 * width_ch))
        ratio = (len_ec + 1) / (len(text_ch) + 1)
        stop_ch = start_ch + width_ch
        # Centers of the fragment on the hanzi and EVA sides:
        mid_ch = (start_ch + stop_ch) / 2
        mid_ec = int(floor(ratio * mid_ch + 0.5))
        # Unclipped window limits on the EVA side:
        raw_lo = mid_ec - half_ec
        raw_hi = mid_ec + half_ec
        # Padding needed where the window sticks out of {text_ec}:
        pad_left = max(0, -raw_lo)
        pad_right = max(0, raw_hi - len_ec)
        # Window limits clipped to {text_ec}:
        lo = min(len_ec - 1, max(0, raw_lo))
        hi = min(len_ec, max(1, raw_hi))
        window = ("·" * pad_left) + text_ec[lo:hi] + ("·" * pad_right)
        return text_ch[start_ch:stop_ch], window

    if fsize_ch == 0:
        # One pair per keyword hit (odd indices):
        return [(segs_ch[k], strip_brackets(segs_ec[k])) for k in range(1, 2 * nh, 2)]

    # All substrings of length {fsize_ch} of each gap (even indices):
    pairs = []
    for k in range(0, 2 * ng, 2):
        gap_ch = segs_ch[k]
        gap_ec = strip_brackets(segs_ec[k])
        for start_ch in range(len(gap_ch) + 1 - fsize_ch):
            pairs.append(paired_window(start_ch, fsize_ch, gap_ch, gap_ec))
    return pairs
# ----------------------------------------------------------------------
def get_recipe_pages_in_dir(dir):
    # Lists the entry pages that exist, or can be generated, in folder
    # "{dir}".
    #
    # For every {name} such that "{dir}/{name}_entry_src.py" or
    # "{dir}/{name}_entry.html" exists, the result has the string
    # "{dir}/{name}_entry.html", once. Leading "./" and embedded "/./"
    # are removed from the paths. Paths that contain "work", "JUNK", or
    # "SAVE" as a whole word are omitted. The result is sorted.
    #
    # Fails if "{dir}" does not exist.
    assert os.path.exists(dir), f"folder {dir} does not exist"
    stems = set()
    for pattern in (f"./{dir}/*_entry_src.py", f"./{dir}/*_entry.html"):
        for path in glob.glob(pattern):
            # Reduce the file name to "{dir}/{name}":
            stem = re.sub(r"(_entry_src[.]py|_entry[.]html)$", "", path)
            stem = re.sub(r"^([.]/)*", "", stem)
            stem = re.sub(r"/([.]/)*", "/", stem)
            if stem == "" or re.search(r"\b(work|JUNK|SAVE)\b", stem):
                continue
            stems.add(stem)
    return [f"{stem}_entry.html" for stem in sorted(stems)]
# ----------------------------------------------------------------------
def test_stuff():
    # Runs the self-tests of this module, writing progress messages
    # and separator lines to {stderr}.
    rule = "----------------------------------------\n"
    err.write("TESTING\n")
    err.write(rule)
    test_add_starps_matching_section()
    err.write(rule)
# ----------------------------------------------------------------------
def test_add_starps_matching_section():
    # Exercises {add_starps_matching_section} on a sample entry
    # (locus "b2.4.094"): starts a document with {add_intro}, displays
    # a trimmed hanzi text, runs the matching with two keyword lists
    # on that same text, and writes the finished document to {stdout}.
    # Progress messages go to {stderr}.
    rule = "----------------------------------------\n"
    err.write(rule)
    err.write(f"@@@ testing add_starps_matching_section\n")

    intro = add_intro ( "WHOP", "b2.4.094", '白马茎', 'bái mǎ jīng', 'white horse penis', 'ZHB' )
    st, code_ch, loc_ch, name_ch, name_py, name_en, tags1, itms_ch, itms_py, itms_en = intro

    err.write("@@@ calling display_hanzi_pure_text ...\n")
    # The lines of this text must stay at column 0, as in the original:
    hanzi_trim_text = """
(trim)
白马茎:[主]伤中脉绝,阴不起,强志,益气,长肌肉。肥健,生子。
眼:[主]惊痫,腹满,疟疾。
悬蹄:[主]惊邪,瘈疭,乳难。辟恶气,鬼毒,蛊注,不祥。
"""
    hanzi_trim, tvar_trim = display_hanzi_pure_text(st, loc_ch, 48, hanzi_trim_text)
    assert tvar_trim == "trim"

    # Two alternative keyword sequences to try on the same trimmed text:
    kwords_en_A = ( 'USES', 'USES', 'USES', )
    kwords_en_B = ( 'USES', 'QI', 'USES', 'USES', 'QI', )
    variants = tuple(((tvar_trim, hanzi_trim), kwords,) for kwords in (kwords_en_A, kwords_en_B))
    max_score = 8.0

    err.write("@@@ calling add_starps_matching_section ...\n")
    locs_to_try = None
    locs_to_show = set()
    parevs_list = add_starps_matching_section \
        ( st, code_ch, loc_ch, variants, max_score, locs_to_try, locs_to_show )

    err.write("@@@ finishing document ...\n")
    h.output_doc(st, sys.stdout, 0, last_edit)
    sys.stdout.flush()
    err.write("@@@ done.\n")
    err.write(rule)
# ----------------------------------------------------------------------
# Run the self-tests when the only command line argument is "R77.TEST":
if sys.argv[1:] == ["R77.TEST"]:
    test_stuff()