#! /bin/python3
# Last edited on 2026-05-01 04:47:10 by stolfi

import sys, os, re
import regex as rex
from sys import stderr as err, stdin as inp, stdout as out
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
import size_position_funcs as spf
import match_multi_funcs as mmf
import bimatching_eval_funcs as bef
from math import sqrt, hypot, exp, log, pi, inf, nan, floor, ceil, isfinite

def analyze_starps_parags(ivt_file, nh, match_func):
    # Compares all parags of the SPS with an entry of the SBJ,
    # according to a specific macro-parsing of the latter into
    # keyword hits and gaps.
    #
    # Arguments:
    #
    #   {ivt_file}    name of input file with all candidate parags, or "-"
    #                 to read from standard input.
    #   {nh}          number of keywords that {match_func} will look for.
    #   {match_func}  an SBJ-SPS matching function.
    #
    # The function reads from {ivt_file} (assumed to be in Unicode UTF-8
    # encoding) the transcription of one or more SPS parags, which should
    # be in an IVTFF-like format, with one parag per line in the format
    # "<{LOC}> {TEXT}". The {TEXT} should be the complete text of one
    # parag in EVA encoding. Blank lines, #-comments, and page header
    # lines are ignored.
    #
    # The function then applies to the {TEXT} of each line the cleanup
    # appropriate for {utype = "ec"}, obtaining a cleaned EVA text
    # {cleantx_ec}. See {clean_up_starps_raw_text} in
    # {size_position_funcs.py} for details.
    #
    # The function then calls {match_func(cleantx_ec)}, which must
    # return a numeric badness {score}, two macro-parsings
    # {segs_ch[0..ns-1]} and {segs_ec[0..ns-1]}, and the total
    # {key_penalty}.
    #
    # The macro-parsings {segs_ch,segs_ec} must split the clean texts of
    # the SBJ entry and of the SPS parag into the same number {nh} of
    # hits and the same number {ng=nh+1} of gaps. The {key_penalty}
    # should be the part of the {score} that is due to non-canonical
    # keyword choices, like @dair instead of @daiin and @chedo instead
    # of @chedy. If not available, it should be set to zero.
    #
    # If the {match_func} returns a non-finite {score} (such as {+inf}),
    # it is assumed that {cleantx_ec} cannot be matched, and the parag
    # is omitted from the result.
    #
    # At the end, the procedure returns a list {parevs} of tuples of the
    # form {(score, loc_ec, segs_ch, segs_ec, key_penalty)} sorted by
    # increasing badness score; and the dictionary {data} with various
    # counts of the operation.

    debug = False

    ng = nh + 1
    ns = ng + nh  # Total number of segments (gaps plus hits).
    if debug: err.write(f"!~ {ivt_file = !r}\n")

    utype_ec = "ec"
    # Only the whole-line pattern is needed here:
    pat_line, _pat_unit, _pat_sepa, _clean_sepa = spf.get_parsing_patterns(utype_ec)

    data = dict()            # Counts of various things.
    data['npar_read'] = 0    # Count of input data lines (SPS parags).
    data['npar_with'] = 0    # Count of SPS parags for which {match_func} succeeded.
    data['min_size'] = +inf  # Minimum size of parags that matched.
    data['max_size'] = 0     # Maximum size of parags that matched.

    parevs = []  # Candidates after analysis.

    def process_input_line(nline, line):
        # Parses a line {line} assuming it is line {nline} of the file. The
        # {line} is always a string (never {None}), but may be "" if the
        # line is empty.
        #
        # Ignores the line if it is blank, a #-comment, or a page header.
        #
        # Otherwise the line must be a data line, matching {pat_line}.
        #
        # Increments {data['npar_read']} for each data line.
        #
        # Gets {cleantx_ec} by cleaning up the raw text. Calls {match_func}
        # to try to obtain viable parallel macro-parsings of the SBJ
        # entry's clean text and of {cleantx_ec}, with {nh} hits and
        # {nh+1} gaps.
        #
        # If succeeded, increments {data['npar_with']}, updates
        # {data['min_size'],data['max_size']}, creates the candidate
        # parsing tuple {parev} and appends it to the list {parevs}.

        def data_error(msg):
            # Reports {msg} as an error in the current input line.
            # The {assert} is a safeguard in case {file_line_error} returns.
            file_line_error(ivt_file, nline, msg, line)
            assert False
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        assert line is not None, "The {line} arg must not be {None}"

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line): return

        # Just in case, ignore IVTFF page headers, taken here to be lines
        # whose locator has no "." (that is, no line number part).
        # NOTE(review): the pattern here used to be the empty string,
        # which matches every line, so that all data lines were silently
        # discarded. Confirm this pattern against the actual input files.
        if re.match(r"<[^<>.]*>", line): return

        data['npar_read'] += 1

        m = re.match(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error("invalid line format")

        # Parse the line into locus ID {loc_ec} and raw text:
        loc_ec = m.group(1)
        traw_ec = m.group(2)

        # Clean up the raw text:
        cleantx_ec, _head, _tail = spf.clean_up_starps_raw_text(traw_ec, utype_ec, data_error)

        debug = False
        if debug: err.write(f"!~ {cleantx_ec = !r}\n")

        err.write(f"······································································\n")
        err.write(f"parag {loc_ec} size = {len(cleantx_ec)}:\n")

        score, segs_ch, segs_ec, key_penalty = match_func(cleantx_ec)

        if isfinite(score):
            # Was able to match:
            assert segs_ch is not None and segs_ec is not None
            assert 0 <= key_penalty and key_penalty <= score
            assert len(segs_ec) == ns
            for sg in segs_ch: assert isinstance(sg, str)
            for sg in segs_ec: assert isinstance(sg, str)
            data['npar_with'] += 1
            if debug: err.write(f"!~ {loc_ec:<12s} {score = :+8.3f}\n")

            # Collect statistics of parag lengths:
            psize_ec = len(cleantx_ec)
            data['min_size'] = min(data['min_size'], psize_ec)
            data['max_size'] = max(data['max_size'], psize_ec)

            parev = ( score, loc_ec, segs_ch, segs_ec, key_penalty )
            parevs.append(parev)
        return
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    # Open the input, making sure it is decoded as UTF-8:
    opened_here = (ivt_file != "-")
    if opened_here:
        rd = open(ivt_file, "r", encoding="utf-8")
    else:
        rd = inp
        rd.reconfigure(encoding='utf-8')

    try:
        err.write(f"reading file '{ivt_file}' ...\n")
        nread = basic_line_loop(rd, process_input_line)
    finally:
        # Close only what was opened here; leave {stdin} alone.
        if opened_here: rd.close()

    err.write(f"sorting parsed parags ...\n")
    parevs.sort(key = lambda x : x[0])

    err.write(f"{nread:6d} lines read\n")
    err.write(f"{data['npar_read']:6d} parags read\n")
    err.write(f"{data['npar_with']:6d} parags matched the keywords\n")
    err.write(f" size range = {data['min_size']}..{data['max_size']}\n")

    return parevs, data
    # ----------------------------------------------------------------------

def analyze_one_parag(loc_ec, cleantx_ec, nh, match_func):
    # Analyzes an SPS parag as a candidate for the Voynichese translation of
    # a given SBJ entry.
    #
    # Arguments:
    #
    #   {loc_ec}      locus ID of the parag (currently unused).
    #   {cleantx_ec}  an EVA text (without puncts).
    #   {nh}          number of expected keyword hits (currently unused).
    #   {match_func}  the SBJ-SPS matching function.
    #
    # Returns the tuple {(score, segs_ch, segs_ec, key_penalty)}
    # produced by {match_func(cleantx_ec)}.
    #
    # This function used to return four names that were never
    # assigned, so any call raised {NameError}.
    score, segs_ch, segs_ec, key_penalty = match_func(cleantx_ec)
    return score, segs_ch, segs_ec, key_penalty
    # ----------------------------------------------------------------------

def output_parev(wr, parev, hipat_ec):
    # Writes to {wr} the parag evaluation tuple {parev} in the
    # format suitable for the report.
    #
    # The {hipat_ec} should be an EVA RE pattern used to highlight
    # keywords in the gaps of the EVA macro-parsing, or {None}.
    wr.write(format_starps_parag_evaluation(parev, hipat_ec))
    return
    # ----------------------------------------------------------------------

def add_bencao_starps_matching_INFO(st):
    # Appends to {st} the explanation for how SPS parags are evaluated.
    #
    # NOTE(review): the name {h} is not imported anywhere in this file;
    # presumably it is an HTML generation module. Confirm the import.
    h.parags(st,
        """The subpages about individual SBJ entries contain the results
        of structural matching of the entry in question against the SPS
        parags. These reports consist of one or more /parag evaluation
        blocks/ or /parevs/, each evaluating one parag of the SPS, which
        is summarized in a numeric badness score.

        The reports assume that the SBJ entry, minus all markup and
        puntuation, is parsed according to a given list of {N} /hanzi
        keyword patterns/, for example [ '主治|主', '气' ]. None of
        these patterns should match the empty string. The parsing splits
        the entry (cleaned of all punctuation and metadata) into {N}
        non-overlapping hanzi strings that match those patterns (the
        /hanzi hits/), in the given order, and {N+1} hanzi strings
        before, between, and after those hits (the /hanzi gaps/). This
        parsing is shown before all the parevs.

        Each SPS parag is evauated by deleting all word space markers
        [,.-] and other markup, and parsing the resulting EVA string too
        into {N} /EVA hits/ and {N+1} /EVA gaps/, by another list of {N}
        /EVA keyword patterns/, for example [ 'daiin|dain|laiin',
        'chedy|chedo' ]. None of these patterns should match the empty
        string.

        Then the badness score is computed by comparing actual and
        predicted lengths (in EVA letters) of the whole SPS parag and of
        the {N+1} EVA gaps. The prediction for each of these substrings
        is based on the number of hanzi in the corresponding substring
        of the SBJ entry, multiplied by a fixed scale factor.

        If there are multiple ways to match the {N} EVA keywords within
        the parag's text, the parev data reflects the choice of the {N}
        matches that gave the lowest badness score. Conversely, if there
        is no way to match {N} EVA keywords within the parag's text, the
        parag is not considered to be a candidate match and is omitted
        from the matching report.""")
    return
    # ----------------------------------------------------------------------

def add_format_starps_parag_evaluation_INFO(st):
    # Appends to {st} the explanation for the output of
    # {format_starps_parag_evaluation}.
    #
    # NOTE(review): the name {h} is not imported anywhere in this file;
    # presumably it is an HTML generation module. Confirm the import.
    h.parags(st,
        """The first line of each block is the page and line number of
        the head line of the candidate SPS parag, followed by its its
        badness score, by the length discrepancy, and by keyword
        penalty.

        The length discrepancy difference between the actual length of
        the parag (in EVA letters, ignoring word spaces) and the length
        predicted from the count of hanzi in the SBJ entry. The key
        penalty is the the sum of of the penalty points associated with
        the specific alternatives of the keywords that were used.

        That line is followed by {N+1} lines showing the assumed parsing
        of the parag's text by the specified EVA keyword patterns. There
        is one line for each EVA gap of that parsing. Each of these
        lines has tree fields: the EVA hit that precedes the gap (blank
        for the first gap), the difference between the actual and
        predicted lengths of the gap, and the gap itself.

        Within the gaps, any substrings that matches any of the possible
        EVA keywords is highighted in boldface, even though they were
        not considered hits for the purposes of parsing or computation.

        All size discrepancies are in EVA characters. The badness score
        summarizes all the percent errors, and would be zero for a
        perfect match (the total size and all gap sizes match their
        predicted values).""")
    return
    # ----------------------------------------------------------------------

def format_starps_parag_evaluation(parev, hipat_ec):
    # Formats the SPS parag evaluation tuple {parev} as a multiline
    # string. Each line of it ends with "\n".
    #
    # The {parev} must be a tuple {(score, loc_ec, segs_ch, segs_ec, key_penalty)}
    # where {score} is the badness score of the parag, {loc_ec} is the
    # locus ID of its first line, {segs_ec} is the macro-parsing of its
    # pure EVA text into gaps and hits, and {segs_ch} is the corresponding
    # macro-parsing of the pure hanzi text of the SBJ entry.
    #
    # Let {ns} be {len(segs_ch) = len(segs_ec)}. Let {nh = ns//2} be the
    # number of hits in both parsings, and {ng = nh+1} be the number of
    # gaps.
    #
    # If {hipat_ec} is not {None}, it must be an EVA RE pattern
    # used to highlight substrings of the gaps of the EVA macro-parsing
    # {segs_ec} in {parev}.
    #
    # The size discrepancies shown in the output are the differences
    # between the actual and predicted lengths of the EVA gaps, total
    # and individual, in the parsing {segs_ec}. The predicted values are
    # derived from the hanzi counts in the gaps, total and individual,
    # of the parsing {segs_ch}. Each predicted value is used as a pair
    # {(lo, hi)}; a size inside that range has zero discrepancy.
    #
    # See {add_format_starps_parag_evaluation_INFO} for the output format.

    debug = False

    score, loc_ec, segs_ch, segs_ec, key_penalty = parev
    assert segs_ec is not None
    ns = len(segs_ec); assert len(segs_ch) == ns
    nh = ns//2; ng = nh + 1; assert ns == ng + nh

    # Predicted EVA gap sizes, from the hanzi gap sizes:
    gsizes_ch, tgsize_ch = bef.get_gap_sizes(segs_ch)
    exp_gsizes_ec = [ bef.expected_size1_from_size0(sz_ch, "ch", "ec") for sz_ch in gsizes_ch ]
    exp_tgsize_ec = bef.expected_size1_from_size0(tgsize_ch, "ch", "ec")
    if debug:
        err.write(f"!> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
        err.write(f"!> {segs_ch = !r}\n")
        err.write(f"!> {gsizes_ch = } {tgsize_ch = }\n")
        err.write(f"!> {loc_ec = }\n")
        err.write(f"!> {exp_gsizes_ec = } {exp_tgsize_ec = }\n")

    # Actual EVA gap and hit sizes:
    gsizes_ec, tgsize_ec = bef.get_gap_sizes(segs_ec)
    hsizes_ec, thsize_ec = bef.get_hit_sizes(segs_ec)
    if debug:
        err.write(f"!> {loc_ec = }\n")
        err.write(f"!> {segs_ec = !r}\n")
        err.write(f"!> {gsizes_ec = } {tgsize_ec = }\n")

    if nh == 0:
        # With a single gap, the gap and total figures must agree:
        assert gsizes_ch[0] == tgsize_ch
        assert gsizes_ec[0] == tgsize_ec
        assert exp_gsizes_ec[0] == exp_tgsize_ec

    def truerr(s, es):
        # Signed distance from the size {s} to the range {es[0]..es[1]};
        # zero if {s} is inside the range.
        if s < es[0]:
            aerr = s - es[0]
        elif s > es[1]:
            aerr = s - es[1]
        else:
            aerr = 0
        return aerr
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def pcterr(s, es):
        # Like {truerr}, but as a percentage of the violated range end.
        # The {eps} term avoids division by zero when that end is zero.
        # Currently not used by the output below.
        eps = 1
        if s < es[0]:
            ferr = (s - es[0])/hypot(es[0], eps)
        elif s > es[1]:
            ferr = (s - es[1])/hypot(es[1], eps)
        else:
            ferr = 0
        return 100*ferr
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    cbits = []

    # SPS parag locus ID, badness score, total gap size error, key penalty:
    loc_str = f"<{loc_ec}>"
    cbits.append(f" {loc_str:<10s}")
    cbits.append(f" {score:7.2f}")
    tgerr = truerr(tgsize_ec, exp_tgsize_ec)
    cbits.append(f" {tgerr:+4d}")
    cbits.append(f" {key_penalty:.3f}")
    cbits.append("\n")

    # EVA hits and gaps:
    hwd = max(hsizes_ec, default=0)  # Width of the hit column.
    for ig in range(ng):
        # The hit that precedes gap {ig} is segment {2*ig-1}; the first
        # gap has no preceding hit.
        hit_ec = f"{segs_ec[2*ig-1]}" if ig > 0 else ""
        gap_ec = segs_ec[2*ig]

        cbits.append(" ")

        # The hit string:
        cbits.append(hit_ec.ljust(hwd, " "))

        # The gap size error:
        gs = gsizes_ec[ig]; assert gs == len(gap_ec)
        egs = exp_gsizes_ec[ig]; assert egs is not None
        gerr = truerr(gs, egs)
        cbits.append(f" {gerr:+3d} ")

        # The gap text, with highlights:
        if hipat_ec is not None:
            # NOTE(review): both highlight markers are empty strings, so
            # this call leaves {gap_ec} unchanged. Confirm the intended
            # begin and end markers.
            gap_ec = highlight_keywords_in_text(gap_ec, hipat_ec, "", "")
        cbits.append(gap_ec)

        cbits.append("\n")

    parev_str = "".join(cbits)
    return parev_str
    # ----------------------------------------------------------------------

def highlight_keywords_in_text(text, hipat, hbeg, hend):
    # Inserts {hbeg} and {hend} around all substrings of {text} that match
    # the RE pattern {hipat} (a string), which should not match the
    # empty string.
    #
    # The substrings that match may overlap, but the markers {hbeg} and
    # {hend} will be simplified so that they are never nested or
    # tangled: overlapping or abutting matches become one highlight.

    # The "(?p)" flag of the {regex} module requests POSIX matching,
    # that is, the leftmost longest match. The flag "(?b)" used before
    # is BESTMATCH, which only affects fuzzy matching.
    hi_re = rex.compile("(?p)" + hipat)

    pieces = []  # Pieces of the highlighted string.
    end_hi = -1  # End of current highlight, or {-1}.
    nt = len(text)
    for it in range(nt + 1):
        matched = False
        m = hi_re.match(text[it:])
        if m is not None:
            end_this = it + m.end(0)
            if end_this > it:
                # Matched a non-empty prefix:
                matched = True
                if end_hi < it:
                    # No highlight is open at this point; start one:
                    pieces.append(hbeg)
                end_hi = max(end_hi, end_this)
        if not matched and end_hi == it:
            # The open highlight ends here and is not continued:
            pieces.append(hend)
        if it < nt:
            pieces.append(text[it])
    return "".join(pieces)
    # ----------------------------------------------------------------------

def format_macro_parsing_ch(loc_ch, segs_ch, hipat_ch):
    # Formats the macro-parsing {segs_ch} of the pure hanzi
    # text of an SBJ entry, with locus ID {loc_ch}, as a multiline
    # string. Each line ends with "\n".
    #
    # If {hipat_ch} is not {None}, it must be a hanzi RE pattern
    # used to highlight substrings of the gaps of the macro-parsing
    # {segs_ch}.

    debug = False

    assert segs_ch is not None
    ns = len(segs_ch)
    nh = ns//2; ng = nh + 1; assert ns == ng + nh

    gsizes_ch, tgsize_ch = bef.get_gap_sizes(segs_ch)
    hsizes_ch, thsize_ch = bef.get_hit_sizes(segs_ch)
    if debug:
        err.write(f"!< <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
        err.write(f"!< {segs_ch = !r}\n")
        err.write(f"!< {gsizes_ch = } {tgsize_ch = }\n")
        err.write(f"!< {hsizes_ch = } {thsize_ch = }\n")

    cbits = []

    # SBJ entry locus ID:
    loc_str = f"<{loc_ch}>"
    cbits.append(f" {loc_str:<10s}")
    cbits.append("\n")

    # Gaps and hits:
    hwd = max(hsizes_ch, default=0)  # Width of the hit column.
    for ig in range(ng):
        # The hit that precedes gap {ig} is segment {2*ig-1}; the first
        # gap has no preceding hit.
        hit_ch = segs_ch[2*ig-1] if ig > 0 else ""
        gap_ch = segs_ch[2*ig]

        cbits.append(" ")

        # Hit and gap sizes in hanzi. The blank placeholder has the
        # same width as the ":2d" field, so that the columns line up:
        hsz_str = f"{hsizes_ch[ig-1]:2d}" if ig > 0 else "  "
        cbits.append(hsz_str)
        cbits.append(f" {gsizes_ch[ig]:2d}")

        # The hit string:
        cbits.append(" ")
        hit_ch = hit_ch.ljust(hwd, " ")
        cbits.append(f"{hit_ch} ")

        # The gap string:
        if hipat_ch is not None:
            # NOTE(review): both highlight markers are empty strings, so
            # this call leaves {gap_ch} unchanged. Confirm the intended
            # begin and end markers.
            gap_ch = highlight_keywords_in_text(gap_ch, hipat_ch, "", "")
        cbits.append(gap_ch)

        cbits.append("\n")

    entry_str = "".join(cbits)
    return entry_str
    # ----------------------------------------------------------------------

def test_stuff():
    # Runs the self-tests of this module, writing to {stderr}.
    rule = "----------------------------------------\n"
    err.write("TESTING\n")
    err.write(rule)
    test_other_stuff()
    err.write(rule)
    return
    # ----------------------------------------------------------------------

def test_other_stuff():
    # Exercises size range parsing, score computation, and parev
    # formatting on small made-up data, writing to {stderr}.
    err.write("----------------------------------------\n")
    gsizes_ch_str = "20..30,7,15,12..13,8"
    gsizes_ch = spf.parse_size_ranges(gsizes_ch_str)
    err.write(f"{gsizes_ch_str =!r}\n")
    err.write(f"{gsizes_ch =!r}\n")

    err.write("----------------------------------------\n")
    segs_ch = [ '白', '补中', '寒主治', '益气', '寒热', '久服', '八疸消', '轻身', '渴', ]
    err.write(f"{segs_ch = !r}\n")
    segs_ec = [ 'fozar', 'FOO', 'boonarna', 'BAR', 'ooo', 'BAZ', '', 'QUX', 'cuxcux', ]
    err.write(f"{segs_ec = !r}\n")
    score = bef.compute_score(segs_ch, segs_ec)
    err.write(f"{score = :6.1f}\n")

    err.write("----------------------------------------\n")
    loc_ec = "f117r"
    key_penalty = 0.123
    parev = ( score, loc_ec, segs_ch, segs_ec, key_penalty )
    # The highlight pattern must be a single RE string, not a list:
    hipat = "|".join([ 'FO+', 'nar', 'BAR', 'BIR', 'BAZ', 'cux', 'QUX' ])
    output_parev(err, parev, hipat)

    # Paranoia: the score computation must be repeatable.
    score_check = bef.compute_score(segs_ch, segs_ec)
    if score != score_check:
        err.write(f"{score = :24.16e}\n")
        err.write(f"{score_check = :24.16e}\n")
    assert score == score_check
    return
    # ----------------------------------------------------------------------

if len(sys.argv) == 2 and sys.argv[1] == "ANN.TEST":
    test_stuff()