#! /usr/bin/python3
# Last edited on 2026-04-12 13:10:04 by stolfi
import sys, re, string
from sys import stderr as err
import html_gen as h
from process_funcs import bash, basic_line_loop
import html_report_funcs as hr
import size_position_funcs as spf
import match_multi_funcs as mmf
import analyze_starps_parag_funcs as anf
from math import sqrt, hypot, exp, log, floor, ceil, isfinite, isnan, inf, nan
def split_formatted_entry(entry):
    # Parses an SBJ entry (hanzi, pinyin, translation, or Voynichese)
    # that has been cast in multiline format.
    #
    # The {entry} must be a multiline string where the first line has the
    # format "<{LOC}>" and each subsequent line has the format
    # "{TAG}{SEP}{ITEM}" where the {TAG} is a string of [A-Za-z0-9] in parens
    # '()', {SEP} is one or more blanks or '|'s, {ITEM} is any string.
    #
    # Removes leading and trailing ASCII spaces from {ITEM} (but not
    # ideographic spaces).
    #
    # Returns the {LOC}, the list of all {TAG}s, and the list of all
    # {ITEM}s.
    # Cannot strip the whole {entry} -- must keep ideographic spaces.
    m = re.fullmatch(r"[ \012]*[<]([a-z0-9.]+)[>] *[\012](.*)", entry, re.DOTALL)
    assert m is not None, f"bad entry format '{entry[:12]}'"
    loc = m.group(1)
    entry = m.group(2)
    tags = []
    items = []
    for line in entry.splitlines():
        # Skip blank lines and '#'-comment lines:
        if re.match(r"[ \012]*([#]|$)", line): continue
        m = re.fullmatch(r"[ \012]*([(][A-Z0-9a-z]+[)])[ |]+(.+)[ \012]*", line)
        assert m is not None, f"bad entry line format '{line}'"
        tag = m.group(1)
        # Strip only ASCII spaces; ideographic spaces are significant:
        item = m.group(2).strip(" ")
        tags.append(tag)
        items.append(item)
    return loc, tags, items
# ----------------------------------------------------------------------
def split_formatted_entry_hanzi(entry):
    # Parses an SBJ entry hanzi that has been cast in multiline format.
    # See {split_formatted_entry} for the format of {entry}.
    # Then does some checking and cleanup of the items.
    #
    # The items must contain only hanzi, ideographic blanks and
    # punctuation (which are retained), and leading or trailing ASCII
    # blanks (which are stripped). Pads all items with ideographic blanks
    # to the same width.
    #
    # Returns the {LOC}, the list of all {TAG}s, and the list of all
    # {ITEM}s.
    loc, tags, items = split_formatted_entry(entry)
    # Reject any item with characters in the Latin-1 range:
    for item in items:
        assert not re.search(r"[\001-\377]", item), \
            f"ascii character in hanzi item '{item}'"
    # Pad all items to the width of the widest one.
    # NOTE(review): pad char copied from original -- confirm it is the
    # ideographic space the header comment promises.
    wd = max((len(item) for item in items), default = 0)
    items = [ item.ljust(wd, " ") for item in items ]
    return loc, tags, items
# ----------------------------------------------------------------------
def formatted_entry_table(st, tags, hanzi, colA, colB):
    # Prints a table with the given {tags} on column 1, the given {hanzi}
    # in column 2, and arbitrary ascii entries {colA,colB} in columns 3 and 4.
    #
    # All four lists must have the same length, with corresponding
    # elements in the same positions.
    n = len(tags)
    for col in (hanzi, colA, colB):
        assert len(col) == n
    sep = [ ' | ' ] * n
    rows = list(zip(tags, sep, hanzi, sep, colA, sep, colB))
    # One modifier per physical column (incl. the separator columns):
    tag_mod = "style='padding-left:4ch; padding-right:4ch; text-align:left; font-weight:bold;'"
    col_mods = [ tag_mod ] + [ "align=left" ] * 6
    h.table(st, rows, col_mods = col_mods, centered = False)
    return
# ----------------------------------------------------------------------
def entry_align_table(st, rows):
    # Prints a table with the tags on column 1, hanzi in column 2, and
    # arbitrary ascii entries in columns 3 and 4.
    #
    # Each element of {rows} must be a tuple with one element per column.
    #
    # BUG FIX: the modifier list was bound to a stray name
    # ("ch_ps_wp_en_wcol_mods") while {h.table} was called with the then
    # undefined name {col_mods}, raising {NameError} on every call.
    col_mods = [
        "style='padding-left:4ch; padding-right:4ch; text-align:left; font-weight:bold;'",
        "align=left",
        "align=left",
        "align=left",
        "align=left",
    ]
    h.table(st, rows, col_mods = col_mods, centered = False)
    return
# ----------------------------------------------------------------------
def read_parms_from_file_header(rd):
    # Reads {rd} and looks for lines of the form "# {KEY} = {VALUE}".
    # Returns a dict with those keys and values.
    # The {KEY} may be any python3-style identifier.
    # The {VALUE} for now may be an integer, an integer pair, a float,
    # a quoted string, or a bracketed list of strings.
    vms_dic = dict()
    err.write("!= beg\n")
    def process_line(nread, line):
        # Parses one header {line}; if it defines a parameter, stores the
        # parsed value into {vms_dic}.
        nonlocal vms_dic
        line = line.strip()
        err.write(f"!= {nread:5d} {line = !r}\n")
        m = re.fullmatch(r"# *([a-zA-Z][a-zA-Z_0-9]*) *[=] *(.*)", line)
        if m is None: return
        key = m.group(1)
        val = m.group(2).strip()
        err.write(f"!= {key = !r} {val = !r}\n")
        if re.fullmatch(r"[-+]?[0-9]+", val):
            # Integer:
            val = int(val)
        elif re.fullmatch(r"[(][-+0-9, ]+[)]", val):
            # Integer tuple; assume pair.
            # BUG FIX: previously a non-pair tuple caused an opaque
            # {AttributeError}; now it fails with an explicit message:
            m = re.fullmatch(r"[(]([-+]?[0-9]+)[ ,]+([-+]?[0-9]+)[)]", val)
            assert m is not None, f"** bad value «{val}»"
            val = (int(m.group(1)), int(m.group(2)),)
        elif re.fullmatch(r"[-+]?[0-9]*([.][0-9]|[0-9][.])[0-9]*([Ee][-+]?[0-9]+)?", val):
            # Float:
            val = float(val)
        elif re.fullmatch(r"['][^']*[']", val):
            # Single-quoted string:
            val = re.sub(r"[']", "", val)
        elif re.fullmatch(r'["][^"]*["]', val):
            # Double-quoted string:
            val = re.sub(r'["]', "", val)
        elif re.fullmatch(r'\[.*\]', val):
            # Bracketed list of comma- or blank-separated strings:
            val = re.sub(r'^\[', "", val)
            val = re.sub(r'\]$', "", val)
            elems = re.split(r'[, ]+', val)
            items = []
            for el in elems:
                if el != "":
                    # Remove quotes around the element.  BUG FIX: the
                    # single-quote case removed only the LEADING quote,
                    # leaving the trailing one in the value:
                    if el[0] == '"':
                        el = re.sub(r'"', "", el)
                    elif el[0] == "'":
                        el = re.sub(r"'", "", el)
                    items.append(el)
            val = items
        else:
            assert False, f"** bad value «{val}»"
        vms_dic[key] = val
        return
    # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    basic_line_loop(rd, process_line)
    return vms_dic
# ----------------------------------------------------------------------
def add_summary_of_parag_search_summary(st, data, kwords_ec):
    # Appends to document {st} a parag summarizing the scan of the SPS
    # parags for the EVA keyword patterns {kwords_ec}.
    #
    # {data} must be a dict with at least the keys 'npar_read',
    # 'npar_with', 'min_size', and 'max_size'.
    nh = len(kwords_ec)
    nh_only_text = ""
    if nh == 1:
        nh_only_text = f""", and only {data['npar_with']} have an occurrence of
          {kwords_ec[0]}"""
    elif nh >= 2:
        # BUG FIX: this text was assigned to a stray name
        # {minocc_only_text} and therefore never appeared in the output:
        nh_only_text = f""", and only {data['npar_with']} have
          occurrences of {kwords_ec!r}"""
    h.parags(st, f"""There are {data['npar_read']} parags in the good part
      of the SPS{nh_only_text}. Their EVA letter sizes
      range in {data['min_size']}..{data['max_size']}.""")
    return
# ----------------------------------------------------------------------
def add_title_summary_and_intro(code_ch, loc_ch, name_ch, name_py, name_en):
    # Creates a new HTML document for the SBJ entry with the given codes
    # and names, writes its title and the "Summary" section, and returns
    # the document {st} followed by the (normalized) input identifiers.
    name_en = name_en.lower().replace("_", " ")
    name_en_uscore = re.sub(r"[ ]+", "_", name_en)
    name_en_caps = string.capwords(name_en)
    title = f"[{code_ch}.077] The '{name_en_caps}' entry of the SBJ"
    st = h.new_doc(title, "#eeffdd", text_width = 1600)
    h.section(st, 2, "Summary")
    h.parags(st, f"""This webpage discusses the SBJ entry titled {name_ch}
      = "{name_en}", parsed into its sub-entries and their fields, and
      possible correspondences to parags of the SPS. The modern Mandarin
      reading of the text and an English translation are also shown. This
      entry may be referred as "{code_ch}" in tables.""")
    return st, code_ch, loc_ch, name_ch, name_py, name_en
# ----------------------------------------------------------------------
def add_chinese_text_section(st, loc_ch, name_ch, name_py, name_en, nch, hanzi_items):
    # Appends to {st} the section showing the original Chinese text of the
    # SBJ entry {loc_ch}; returns its pure-hanzi version, as produced by
    # {display_hanzi_pure_text}.
    h.section(st, 2, "The SBJ entry (Chinese)")
    h.parags(st, f"""This is the SBJ entry {name_ch} {name_py} = "{name_en}":""")
    full_text = f"<{loc_ch}> " + "".join(hanzi_items)
    return display_hanzi_pure_text(st, loc_ch, nch, full_text)
# ----------------------------------------------------------------------
def display_hanzi_pure_text(st, loc_ch, nch, hanzi_text):
    # Strips blanks, the locus ID "<{loc_ch}>", and punctuation from
    # {hanzi_text}; checks that exactly {nch} hanzi remain; appends them to
    # {st} as a preformatted block, chopped into lines of at most 40 hanzi.
    # Returns the pure hanzi string.
    hanzi_text = hanzi_text.strip()
    hanzi_text = re.sub(r"[ \012]", "", hanzi_text)
    m = re.fullmatch(r"<([a-z0-9.]+)>(.*)", hanzi_text)
    assert m is not None, f"invalid chinese entry format {hanzi_text!r}"
    assert m.group(1) == loc_ch, f"loc ID mismtch: {loc_ch} actual {m.group(1)}"
    hanzi_body = m.group(2)
    # BUG FIX: the punctuation class was written "[:[](),。; ]", which the
    # RE engine parses as the class [:[] followed by an empty group and the
    # literal text ",。; ]" -- so punctuation was never stripped and the
    # {nch} check below failed on any punctuated entry.  The brackets must
    # be escaped inside the class:
    hanzi_pure = re.sub(r"[:\[\](),。; ]", "", hanzi_body)
    nch_real = len(hanzi_pure)
    assert nch_real == nch, f"length error: {nch = } actual {nch_real}"
    if nch < 45:
        hanzi_chops = [ hanzi_pure, ]
    else:
        hanzi_chops = [ hanzi_pure[k:k+40] for k in range(0, nch, 40) ]
    # Hoisted out of the f-string (backslashes in f-string expressions
    # require Python 3.12):
    chops_str = "\n      ".join(hanzi_chops)
    hanzi_display = f"""
    <{loc_ch}> {nch:2d} hanzi
      {chops_str}
    """
    h.append_preformatted(st, h.protect_html(hanzi_display), ind = 4, centered = False)
    return hanzi_pure
# ----------------------------------------------------------------------
def add_formatted_entry_table(st, tags1, hanzi_items, pinyin_items, trans_items):
    # Appends to {st} the subsection showing the parsed entry together with
    # its pinyin readings and English translation, as a four-column table.
    h.section(st, 3, "Pinyin and translation")
    intro = """Here is the same entry, with punctuation added according
      to this parsing, the modern Mandarin readings in pinyin, and a somewhat
      literal English translation:"""
    h.parags(st, intro)
    formatted_entry_table(st, tags1, hanzi_items, pinyin_items, trans_items)
    return
# ----------------------------------------------------------------------
def add_starps_matching_section(st, code_ch, variants, max_score):
    # Appends to document {st} the body of a section that searches the SPS
    # file for parags matching a given SBJ entry in various ways.
    #
    # The {variants} argument must be a list of triples {(cleantx_ch,
    # kwords_en, strict)} where {cleantx_ch} is a suitably cleaned and
    # trimmed hanzi text of the entry to be matched, {kwords_en} is a list
    # of strings, and {strict} is a boolean.
    #
    # Calls {add_starps_matching_subsection} for each tuple in {variants},
    # with the tuple fields as arguments.
    #
    # Returns the list of the results of those calls.  Each element of
    # this list is a list of matching results; each matching result is a
    # tuple as returned by {anf.analyze_starps_parags} (quod videt).
    h.section(st, 3, f"Matching results")
    return [
        add_starps_matching_subsection(st, code_ch, cleantx_ch, kwords, strict, max_score)
        for cleantx_ch, kwords, strict in variants
    ]
# ----------------------------------------------------------------------
def add_starps_matching_subsection(st, code_ch, cleantx_ch, kwords_en, strict, max_score):
    # Arguments:
    #
    #   {code_ch} four-letter code of the SBJ entry.
    #   {cleantx_ch} text of that entry, suitably cleaned and trimmed.
    #   {kwords_en} a list of strings.
    #   {strict} a boolean.
    #   {max_score} a positive float.
    #
    # Each element of {kwords_en} should be a string that specifies the
    # /type/ of a keyword, like 'MAIN-USES' or 'QI'.
    #
    # From the list {kwords_en} the procedure derives a list {kwords_ch}
    # of RE patterns that are expected to appear in the text {cleantx_ch},
    # in that order.
    #
    # The list {kwords_en} and the {strict} argument together also define a
    # list {kwords_ec} of RE patterns that are expected to match the
    # Voynichese equivalent of the keywords {kwords_ch} in the clean eval
    # text of the SPS parag that corresponds to the entry in question.
    # See {get_bencao_starps_keywords}.
    #
    # For example, if {kwords_en} is {('MAIN-USES', 'QI', 'MAINLY-FOR')},
    # the list {kwords_ch} will be {('主治', '气', '主')}, and the
    # list {kwords_ec} will be
    #   {( daiin_strict, chedy_strict, daiin_strict )}
    # if {strict} is true and
    #   {( daiin_loose, chedy_loose, daiin_loose )} if strict is false,
    # where {daiin_strict}, {daiin_loose}, {chedy_strict}, {chedy_loose}
    # are the EVA patterns chosen by {get_bencao_starps_keywords}.
    #
    # Let {nh} be the number of keywords in {kwords_en}, and {ng} be {nh+1}.
    # The procedure splits {cleantx_ch} into {nh} /hit strings/
    # {hits_ch[0..nh-1]} that match {kwords_ch[0..nh-1]}, and {ng}
    # /gap strings/ {gaps_ch[0..ng-1]} before, between, and after them.
    #
    # The procedure then calls
    #   {anf.analyze_starps_parags(gaps_ch,hits_ch,ivt_file,kwords_ec)}
    # to scan the file of good SPS parags {ivt_file}, obtaining a list
    # {parevs} of tuples, each describing a parag that minimally matches.
    #
    # This procedure then trims that list after the first tuple with
    # score exceeding {max_score}, and inserts the list into the document
    # {st}, formatted as described in {anf.format_parev}.
    #
    # The result of the call is that trimmed list.
    kwords_str = ", ".join(kwords_en)
    strict_str = "strict" if strict else "liberal"
    ctsize_ch = len(cleantx_ch)
    h.section(st, 4, f"Trimmed to {ctsize_ch} hanzi - keys {kwords_str} ({strict_str})")
    h.parags(st, "Trimmed entry:")
    h.append_preformatted(st, h.protect_html(cleantx_ch), ind = 4, centered = False)
    nh = len(kwords_en); ng = nh + 1
    kwords_ch = []
    kwords_ec = []
    for kw_en in kwords_en:
        kw_ch, kw_ec = get_bencao_starps_keywords(kw_en, strict)
        kwords_ch.append(kw_ch)
        kwords_ec.append(kw_ec)
    add_keywords_table(st, kwords_en, kwords_ch, kwords_ec)
    h.parags(st, "Best matches:")
    gaps_ch, hits_ch = mmf.find_multi_pattern_occurrences(cleantx_ch, kwords_ch)
    assert len(gaps_ch) == ng
    assert len(hits_ch) == nh
    err.write(f"!@ {gaps_ch = } {hits_ch = }\n")
    # (Removed the unused local {pbits} that was defined here.)
    parevs, data = analyze_and_show_starps_parags \
      ( st, code_ch, gaps_ch, hits_ch, kwords_ec, max_score )
    return parevs
# ----------------------------------------------------------------------
def add_keywords_table(st, kwords_en, kwords_ch, kwords_ec):
    # Appends to {st} a table listing, for each keyword, its English type
    # {kwords_en}, its hanzi pattern {kwords_ch}, and its EVA pattern
    # {kwords_ec}.  All three lists must have the same length.
    h.parags(st, "Keyword patterns:")
    n = len(kwords_en)
    sep = [ ' || ' ] * n
    rows = list(zip(kwords_en, sep, kwords_ch, sep, kwords_ec))
    col_mods = [ "style='padding-left:4ch; padding-right:4ch; text-align:left;'" ]
    col_mods += [ "align=left" ] * 4
    h.table(st, rows, col_mods = col_mods, centered = False)
    return
# ----------------------------------------------------------------------
def get_bencao_starps_keywords(kw_en, strict):
    # Returns the hanzi and EVA keyword patterns {kw_ch,kw_ec} for the
    # English keyword type {kw_en}, which must be one of "MAINLY-FOR",
    # "MAIN-USES", "LONG-TAKE", or "QI".  The boolean {strict} selects
    # between the tighter and the more liberal EVA pattern.
    daiin_pat = 'daiin|dair|laiin' if strict else '[dlkrs][ao]iin|[dlkrs][ao]ir'
    if kw_en == "MAINLY-FOR":
        return '主', daiin_pat
    if kw_en == "MAIN-USES":
        return '主治', daiin_pat
    if kw_en == "LONG-TAKE":
        if strict:
            okaiin = 'q?[aoy]kaiin'
            okeedy = 'q?[aoy]keed[aoy]'
        else:
            okaiin = 'q?[aoy][ktd][ao]iin'
            okeedy = '[aoy][ktd]ee?[dk][aoy]'
        return '久服', okeedy + "|" + okaiin
    if kw_en == "QI":
        return '气', ('chedy' if strict else '[cs]he[kd][aoy]')
    assert False, f"invalid English keyword {kw_en}"
# ----------------------------------------------------------------------
def get_ch_gap_and_hits(cleantx_ch, kwords_ch):
    # Assumes that the cleaned text of the SBJ entry (hanzi only, without
    # punctuation or ASCII chars) is {cleantx_ch}.
    #
    # Let {nh} be {len(kwords_ch)}, and {ng} be {nh+1}.  Partitions
    # {cleantx_ch} into alternating substrings {gaps_ch[0..ng-1]} and
    # {hits_ch[0..nh-1]} where the latter are the substrings that match
    # the patterns {kwords_ch[0..nh-1]}, in that order.
    #
    # Returns the lists {gaps_ch} and {hits_ch}.
    nh = len(kwords_ch)
    # Just in case, remove locus IDs and spaces from the hanzi text:
    for pat in (r"<[^<>]*>", r"[ \012]"):
        cleantx_ch = re.sub(pat, "", cleantx_ch)
    gaps_ch, hits_ch = mmf.find_multi_pattern_occurrences(cleantx_ch, kwords_ch)
    err.write(f"!@ {gaps_ch = } {hits_ch = }\n")
    assert len(hits_ch) == nh, "hit counts don't match"
    assert len(gaps_ch) == nh + 1
    return gaps_ch, hits_ch
# ----------------------------------------------------------------------
def analyze_and_show_starps_parags(st, code_ch, gaps_ch, hits_ch, kwords_ec, max_score):
    # Scans the file of good SPS parags for parags that match the parsed
    # SBJ entry, and appends the best candidates to the document {st}.
    #
    # Arguments:
    #
    #   {gaps_ch} gaps of the parsed SBJ entry.
    #   {hits_ch} hits of the parsed SBJ entry.
    #   {kwords_ec} list of RE pattern(s) hopefully matching the keywords in EVA text.
    #   {max_score} print candidates only up until about this score.
    #
    # Returns {parevs[:nc_good+1]} -- the candidates with acceptable score,
    # plus the first unacceptable one, if any -- and the {data} dict from
    # {anf.analyze_starps_parags}.
    ivt_file = "res/starps-gd-ec-par.ivt"  # FIX: was a pointless f-string.
    parevs, data = anf.analyze_starps_parags \
      ( gaps_ch, hits_ch, ivt_file, kwords_ec)
    nc = len(parevs)
    # Each elem of {parevs} must be {(score, loc_ec, gaps_ec, hits_ec, gaps_ch, hits_ch)}:
    for parev in parevs: assert len(parev) == 6
    # Count the leading candidates with acceptable (low) badness score:
    nc_good = 0
    while nc_good < nc and parevs[nc_good][0] <= max_score: nc_good += 1
    err.write(f" {nc_good = }\n")
    nc_show = nc_good
    # Also show the first parev with unacceptable score, if any:
    if nc_show < nc: nc_show += 1
    # Discard excessive candidates:
    max_show = 10
    nc_show = min(nc_show, max_show)
    if nc_show < max_show:
        # Ensure that some parevs are shown even if all are bad:
        min_show = max(3, nc_show)
        while nc_show < nc and nc_show < min_show: nc_show += 1
    # Salutary paranoia -- check score ordering and locus uniqueness:
    err.write(f" {nc_show = }\n")
    assert nc_show <= max_show + 1
    prev_score = -inf; prev_loc = "NONE"
    for ic in range(nc_show):
        score = parevs[ic][0]
        loc = parevs[ic][1]
        err.write(f" parag {loc:<12s} {score = :6.2f}\n")
        if ic > 0:
            assert score >= prev_score  # Badness must be non-decreasing.
            assert loc != prev_loc      # No duplicate parevs.
        if ic < nc_good:
            assert score <= max_score   # No bad parevs except the last one.
        prev_score = score
        prev_loc = loc
    # Show the candidates:
    clist_blocks = []
    for ic in range(nc_show):
        cstr = anf.format_parev(parevs[ic], kwords_ec)
        clist_blocks.append(cstr)
    clist_str = "\n".join(clist_blocks)
    clist_str = h.protect_html(clist_str)
    h.append_preformatted(st, clist_str, ind = 4, centered = False)
    return parevs[:nc_good+1], data
# ----------------------------------------------------------------------
def write_dics_from_parev(st, code_ch, loc_ch, parev):
    # Writes a set of hanzi-to-EVA dictionaries based on the locus ID
    # {loc_ch} of an SBJ entry and the match record {parev}, which carries
    # the locus ID {loc_ec} of an SPS parag and the gaps and hits of the
    # two parsed texts.
    #
    # The dictionary for SBJ text fragments of length {fsize_ch}
    # is written to file "dics/{code_ch}_{loc_ec}_{fsize_ch}.dic".
    score, loc_ec, gaps_ec, hits_ec, gaps_ch, hits_ch = parev
    # Sanitize the locus ID of the parag for use in file names:
    loc_ec = re.sub(r"<[^<>]*>", "", loc_ec)          # Just in case.
    loc_ec = re.sub(r"[.]([0-9])$", r".0\1", loc_ec)  # Zero-pad the line number.
    max_fsize_ch = 4
    for fsize_ch in range(max_fsize_ch + 1):
        vms_dic = make_dic_from_parev(code_ch, loc_ch, parev, fsize_ch)
        assert vms_dic is not None
        dic_file = f"dics/{code_ch}_{loc_ec}_{fsize_ch}.dic"
        # FIX: open with explicit encoding and a context manager, so the
        # file is closed even if a write fails:
        with open(dic_file, "w", encoding = 'utf-8') as wr:
            wr.write("# -*- coding: utf-8 -*-\n")
            wr.write(f"# {loc_ch = }\n")
            wr.write(f"# {loc_ec = }\n")
            pref = f"{code_ch} | {loc_ch:<8s} | {loc_ec:<8s} |"
            for frag_ch, frag_ec in vms_dic:
                assert fsize_ch == 0 or len(frag_ch) == fsize_ch
                wr.write(pref)
                frag_ch = frag_ch.ljust(10," ")
                frag_ec = frag_ec.ljust(50," ")
                wr.write(f" {frag_ch} | {frag_ec} |\n")
    return
# ----------------------------------------------------------------------
def make_dic_from_parev(code_ch, loc_ch, parev, fsize_ch):
    # Returns a list of pairs {frag_ch,frag_ec} of hanzi and EVA fragments that
    # are the conjectured matching parts of the SBJ entry {loc_ch}.
    #
    # If {fsize_ch} is positive, the hanzi fragments {frag_ch} will be
    # all substrings of length {fsize_ch} of all the hanzi gap strings
    # {gaps_ch}. If {frag_ch} is centered at character position
    # {kch} of {gaps_ch[ig]}, the corresponding EVA fragment {frag_ec} is
    # taken from {gaps_ec[ig]} at a location {kec} that is {kch} scaled by
    # the ratio of the lengths of the two gaps. The fragment {frag_ec} is
    # padded if needed with '·' (centered dots).
    #
    # If {fsize_ch} is zero, the fragments {frag_ch} will be all the hit
    # strings in {hits_ch}. If {frag_ch} is {hits_ch[ih]}, then {frag_ec}
    # will be {hits_ec[ih]}.
    #
    # FIX: removed the unused nested helper {data_error}, a redundant
    # re-check of the gap/hit counts, and a spurious {nonlocal vms_dic}
    # declaration in {extract_frag_pair} (which never modifies that list).
    score, loc_ec, gaps_ec, hits_ec, gaps_ch, hits_ch = parev
    assert gaps_ch is not None and hits_ch is not None
    nh = len(hits_ch); assert len(hits_ec) == nh
    ng = len(gaps_ch); assert len(gaps_ec) == ng
    assert ng == nh + 1
    def extract_frag_pair(it_ch, fsz_ch, str_ch, str_ec):
        # Extracts a fragment {frag_ch} of length {fsz_ch} at index {it_ch}
        # from {str_ch}. Then extracts from {str_ec}
        # the corresponding fragment {frag_ec}, assuming that
        # the whole of {str_ch} maps to the whole of {str_ec}.
        # Define the number of EVA letters to take on each side of frag center:
        mrg_ec = 15 + int(ceil(2.5*fsz_ch))
        nt_ch = len(str_ch)
        nt_ec = len(str_ec)
        scale = (nt_ec+1)/(nt_ch+1)
        # Limit of fragment on the SBJ gap:
        jt_ch = it_ch + fsz_ch
        # Character indices {kt_ch,kt_ec} of the frag centers:
        kt_ch = (it_ch + jt_ch)/2
        kt_ec = int(floor(scale * kt_ch + 0.5))
        # Start of fragment on the SPS gap, and necessary padding:
        it_ec = kt_ec - mrg_ec
        lpad = 0 if it_ec >= 0 else -it_ec
        it_ec = min(nt_ec-1, max(0, it_ec))
        # Limit of fragment on the SPS gap, and necessary padding:
        jt_ec = kt_ec + mrg_ec
        rpad = 0 if jt_ec <= nt_ec else jt_ec - nt_ec
        jt_ec = min(nt_ec, max(1, jt_ec))
        frag_ch = str_ch[it_ch:jt_ch]
        frag_ec = ("·" * lpad) + str_ec[it_ec:jt_ec] + ("·" * rpad)
        return frag_ch, frag_ec
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    vms_dic = []
    if fsize_ch == 0:
        # Report the keyword hits:
        for ih in range(nh):
            frag_ch = hits_ch[ih]
            frag_ec = re.sub(r'[\[\]]', "", hits_ec[ih])
            vms_dic.append((frag_ch, frag_ec,))
    else:
        # Report substrings of the gaps with length {fsize_ch}:
        for ig in range(ng):
            gap_ch = gaps_ch[ig]
            gap_ec = re.sub(r'[\[\]]', "", gaps_ec[ig])
            nt_ch = len(gap_ch)
            for it_ch in range(nt_ch + 1 - fsize_ch):
                frag_ch, frag_ec = extract_frag_pair(it_ch, fsize_ch, gap_ch, gap_ec)
                vms_dic.append((frag_ch, frag_ec,))
    return vms_dic
# ----------------------------------------------------------------------
def add_chosen_starps_parag_section(st, loc_ch, code_ch, loc_ec, parev):
    # Inserts a parag saying that the SPS parag chosen to match
    # the SBJ entry {loc_ch} (code {code_ch}) is the one with locus ID
    # {loc_ec}, described in {parev}.
    #
    # The {loc_ec} (and {parev}) may be {None} to say that the SBJ entry
    # will not be assigned to any SPS parag.  (Note that the decision is
    # made on {loc_ec}; {parev} is currently unused here.)
    h.section(st, 3, "Chosen match")
    if loc_ec is not None:
        h.parags(st, f"""Thus we will tentatively assign {code_ch} ({loc_ch}) to
          {loc_ec}. However we must be aware
          that the true match may not have made it into the "good" subset.""")
    else:
        h.parags(st, f"""Thus we will not assign {code_ch} ({loc_ch}) to any SPS parag.""")
    # add_chosen_parag_analysis(st, parev)
    return
# ----------------------------------------------------------------------
def choose_best_starps_parag(gud_cands_ec, bad_cands_ec, parevs_list):
    # Selects the best SPS parag among one or more lists of candidate
    # parags, optionally restricted by predefined choices.
    #
    # The {parevs_list} must be a list of lists, each one of them
    # being either {None} or a list of tuples as returned by
    # {analyze_and_show_starps_parags}.
    #
    # Looks in each of the lists in {parevs_list} (skipping any lists that
    # are {None}) for a {parev} tuple with minimum badness score.
    #
    # If {gud_cands_ec} is not {None}, it should be a set of strings which
    # are loc IDs of zero or more SPS parags, and the procedure will
    # consider only {parev} tuples whose {loc_ec} field is in that set.
    # Ditto for {bad_cands_ec}, except that the procedure will IGNORE any
    # {parev} tuple whose {loc_ec} field is in that set.  Either argument
    # may also be a single loc ID string, taken as a singleton set.
    #
    # The returned result is the loc ID of the best {parev} considered,
    # and that parev itself.  If it cannot find any {parev} as requested,
    # returns {None,None}.
    # A favor to the caller:
    if isinstance(gud_cands_ec, str): gud_cands_ec = set((gud_cands_ec,))
    if isinstance(bad_cands_ec, str): bad_cands_ec = set((bad_cands_ec,))
    best_parev = None
    if parevs_list is not None:
        # Find the best candidate record:
        min_score = +inf
        for parevs in parevs_list:
            if parevs is None: continue
            for pev in parevs:
                loc_ec = pev[1]
                gud_pass = (gud_cands_ec is None) or (loc_ec in gud_cands_ec)
                bad_pass = (bad_cands_ec is None) or (loc_ec not in bad_cands_ec)
                if gud_pass and bad_pass:
                    if pev[0] < min_score:
                        best_parev = pev; min_score = pev[0]
                    break # There should be at most one acceptable in each list.
    if best_parev is None:
        # BUG FIX: this branch used to call {h.parags(st,...)} with an
        # undefined {st} (a {NameError}), and referenced a possibly unbound
        # {loc_ec}; now it reports only to {err}:
        err.write(f"!! WARNING - MISSING CAND RECORD\n")
        loc_ec = None
    else:
        loc_ec = best_parev[1]
    return loc_ec, best_parev
# ----------------------------------------------------------------------
def fetch_starps_line(ivt_file, loc_starps):
    # Reads file {ivt_file} (assumed to be UTF-8) and looks for a line that
    # begins with the locus ID "<{loc_starps}>".  Returns that line minus
    # the locus ID, stripped of surrounding blanks.  If {loc_starps} is
    # not found, returns {None}.
    # Just in case, accept a {loc_starps} already wrapped in '<>':
    loc_starps = re.sub(r"[<>]", "", loc_starps)
    loc_pat = f"<{loc_starps}>"
    text = None
    # BUG FIX: the locus was compared with {re.match}, so any '.' in the
    # loc ID matched an arbitrary character; also the file was not closed
    # if an exception occurred while reading.  Now uses a plain prefix
    # test and a context manager:
    with open(ivt_file, "r", encoding = 'utf-8') as rd:
        for line in rd:
            if line.startswith(loc_pat):
                text = line.removeprefix(loc_pat).strip()
                break
    return text
# ----------------------------------------------------------------------