#! /usr/bin/python3
# Last edited on 2026-04-29 10:28:15 by stolfi

import os, sys, re
from sys import stderr as err
from math import sqrt, hypot, exp, log, sin, cos, floor, ceil
from math import inf, isfinite, nan, isnan

from process_funcs import basic_line_loop
from error_funcs import file_line_error

def main(dir, tsize):
    # Creates a dictionary mapping SBJ words with {tsize} hanzi
    # characters, to their apparent Voynichese equivalents, from the
    # ".dic" files created by "*_entry_src.py". Also builds a graph that
    # shows agreement between the tentative assignments of SPS parags to a
    # set of SBJ entries.
    #
    # Specifically, reads a file "{dir}/all_{tsize}_raw.dic" produced by
    # {merge_all_dics.sh}. Writes a graph "{dir}/all_{tsize}.gra" showing
    # the apparent agreements/disagreements between SBJ-SPS parag
    # assignments. Also writes a condensed dictionary
    # "{dir}/all_{tsize}_gra.dic" with pairs hanzi-Voynichese that seem to be
    # attested by at least two SBJ-SPS assignments.
    #
    # The vertices of the graph are SBJ entry codes, like 'ROOS' or 'IRON'.
    #
    # An edge is a pair {(score, glosses)}. There is at most one edge
    # between two vertices.
    #
    # In the edge between {code1} and {code2}, the {score} is a numeric
    # score that tells how much the SPS parags assigned to {code1} and
    # {code2} agree on the Voynichese translation of certain hanzi strings
    # that occur in both SBJ entries.
    #
    # The {glosses} is a list of triples {(tch, s1, s2)}. Each triple means
    # that the hanzi string {tch} occurs in both entries {code1} and
    # {code2}, and the parags assigned to those two entries have similar
    # strings {s1} and {s2} at about the right places.
    #
    # If {tsize} is positive, the string {tch} and the strings {s1} and
    # {s2} are taken from the strings (the /gaps/) between corresponding
    # keywords (the /hits/). The hits themselves are excluded from the
    # glosses. If {tsize} is zero, the strings {s1} and {s2} are
    # corresponding hits.
    #
    # If entries {code1} and {code2} have a shared hanzi string {tch}
    # but the assigned SPS parags don't have similar translations
    # for it, then {s1} and {s2} are {None}.
    #
    # The graph is represented as a dict of dicts. The edge {(score,
    # glosses)} between vertices {code1} and {code2} is stored in
    # {graph[code1][code2]} which should be equal to {graph[code2][code1]}
    # except that the fields {s1} and {s2} of all glosses are swapped.
    #
    # A positive score means that most of the hanzi strings that occur in
    # both SBJ entries have corresponding glosses on the assigned SPS
    # parags that mostly agree. A negative score means that most of those
    # shared hanzi strings do not seem to have matching translations in
    # the assigned SPS entries.
    #
    # If two SBJ entries have no strings in common, the score will be zero.
    #
    # Writes the graph to {gra_file} and the implied dictionary to
    # {gra_dic_file}.

    raw_dic_file = f"{dir}/all_{tsize}_raw.dic"
    rd = open(raw_dic_file, "r")
    rd.reconfigure(encoding='utf-8')

    # With {tsize == 0} the clips are whole hits and must match end-to-end:
    full_match = (tsize == 0)

    # Regex fragments for the fields of a raw dic line, in order:
    p_code_ch = r"([A-Z0-9]+)"        # SBJ entry code.
    p_loc_ch = r"([.a-z0-9]+)"        # Locus of the hanzi string in the SBJ entry.
    p_loc_ec = r"([.a-z0-9]+)"        # Locus ID of the assigned SPS parag.
    p_text_ch = r"([^\000-\377]+)"    # Hanzi string (any non-Latin-1 chars).
    p_text_ec = r"([·?a-z]+)"         # EVA clip from the SPS parag.
    p_bar = "[ ]*[|][ ]*"             # Field separator.
    p_line = \
        f"{p_code_ch}{p_bar}{p_loc_ch}{p_bar}{p_loc_ec}{p_bar}" + \
        f"{p_text_ch}{p_bar}{p_text_ec}{p_bar}"

    clips = []           # Triples {(code_ch, loc_ec, text_ec)} of the current hit block.
    cur_text_ch = None   # Hanzi string shared by all lines of the current block.
    graph = dict()       # The agreement graph being built.

    def process_line(nread, line):
        # Handles one line of the raw dic file. Blank lines terminate a
        # block of clips that share the same hanzi string; non-blank
        # lines must match {p_line} and are accumulated into {clips}.
        nonlocal clips, cur_text_ch, graph
        line = line.strip()

        def data_error(msg):
            # Reports a data format error at line {nread} of the input file.
            # NOTE(review): execution continues after this call in the
            # "invalid dic line format" branch, so {file_line_error} is
            # presumably expected to abort — confirm in {error_funcs}.
            file_line_error(raw_dic_file, nread, msg, line)
            return
        # ..................................................................

        if line == "":
            # End of a block of clips.  Guard against empty blocks
            # (leading or consecutive blank lines), consistently with the
            # end-of-file handling in the caller below; an empty block
            # would trip the {nh >= 2} assertion in {process_clips}.
            if len(clips) != 0:
                process_clips(graph, cur_text_ch, clips, full_match)
            clips = []
            cur_text_ch = None
            return
        m = re.fullmatch(p_line, line)
        if m is None: data_error("invalid dic line format")
        code_ch = m.group(1)
        loc_ch = m.group(2)    # Parsed but currently unused.
        loc_ec = m.group(3)
        text_ch = m.group(4)
        text_ec = m.group(5)
        if cur_text_ch is None:
            cur_text_ch = text_ch
        else:
            if text_ch != cur_text_ch:
                data_error(f"inconsistent ch text {cur_text_ch = !r} {text_ch = }")
        clips.append((code_ch, loc_ec, text_ec,))
        return
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    nread = basic_line_loop(rd, process_line)
    err.write(f"!& {nread = !r}\n")
    rd.close()
    # Flush a final block that was not terminated by a blank line:
    if len(clips) != 0:
        process_clips(graph, cur_text_ch, clips, full_match)

    gra_file = f"{dir}/all_{tsize}.gra"
    write_graph(gra_file, graph)

    gra_dic_file = f"{dir}/all_{tsize}_gra.dic"
    write_global_dic(gra_dic_file, graph)
    return
# ----------------------------------------------------------------------

def process_clips(graph, text_ch, clips, full_match):
    # This function takes a list {clips} of triples {(cd,loc_ec,text_ec)} where {cd}
    # is the 4-letter code from an SBJ entry, {loc_ec} is the locus ID
    # of the SPS parag assigned to it, and {text_ec}
    # is a fragment from the EVA text of the SPS parag assumed to correspond
    # to that entry, extracted from the position that corresponds to
    # the point in that entry where the hanzi string {text_ch} occurs.
    #
    # Saves {loc_ec} to {graph[code1]['loc_ec']}.
    #
    # For each pair of such clips {(cd1,loc1,tx1)} and {(cd2,loc2,tx2)}, computes a {score}
    # that measures the similarity of the two EVA strings with {compute_dic_score(tx1,tx2,full_match)},
    # and extracts from them two substrings {s1,s2} that seem to match.
    # Let {graph[cd1][cd2]} be {(old_score, glosses)}.  Updates the graph
    # by adding the {score} to {old_score} and, if {score} is positive,
    # appending the gloss {(text_ch, s1, s2)} to {glosses}.

    debug = False

    def bump(loc1, cd1, s1, cd2, s2, score):
        # Adds {score} to the edge {graph[cd1][cd2]}, creating vertex
        # {cd1} (with its parag locus {loc1}) and the edge as needed.
        # Records the gloss {(text_ch,s1,s2)} on a positive score, or a
        # failed gloss {(text_ch,None,None)} otherwise.
        if cd1 not in graph:
            graph[cd1] = dict()
            graph[cd1]['loc_ec'] = loc1
        else:
            # Each SBJ code must always come with the same SPS parag locus:
            assert graph[cd1]['loc_ec'] == loc1
        if cd2 not in graph[cd1]:
            graph[cd1][cd2] = ( 0.00, [ ] )
        score_old, glosses = graph[cd1][cd2]
        if score > 0:
            glosses.append((text_ch, s1, s2))
        else:
            glosses.append((text_ch, None, None))
        graph[cd1][cd2] = ( score_old + score, glosses )
        if debug: err.write(f"!& bump {cd1} {cd2} {graph[cd1][cd2][0]}\n")
        return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    nh = len(clips)
    assert nh >= 2, "hit block with less than 2 lines"
    # Compare every unordered pair of clips in the block:
    for ih1 in range(nh):
        for ih2 in range(ih1):
            cd1, loc1, tx1 = clips[ih1]
            cd2, loc2, tx2 = clips[ih2]
            score, s1, s2 = compute_dic_score(tx1, tx2, full_match)
            if debug: err.write(f"{cd1} {cd2} += {score:+8.3f} {s1 = !r} {s2 = !r}\n")
            if isfinite(score):
                assert s1 is not None and s2 is not None
                # Record the result symmetrically on both vertices:
                bump(loc1, cd1, s1, cd2, s2, score)
                bump(loc2, cd2, s2, cd1, s1, score)
    return
# ----------------------------------------------------------------------

def write_graph(gra_file, graph):
    # Writes {graph} to file {gra_file}: for each vertex, one header line
    # with its code, parag locus, and total score, followed by one line
    # per incident edge.

    wr = open(gra_file, "w")
    wr.reconfigure(encoding='utf-8')
    wr.write("# -*- coding: utf-8 -*-\n")

    def write_edge(edge):
        # The {edge} must be a pair {(score, glosses)}
        # where the glosses are triples {(tch,s1,s2)}
        # where {tch} is a hanzi string that occurs in the SBJ entries
        # of both vertices, and {s1} and {s2} are the corresponding
        # similar strings in the SPS parags assigned to them.
        # Or {None} if there are no such similar strings.
        sc12, gls12 = edge
        wr.write(f" {cd1} {cd2} {sc12:+8.3f}")
        for tch, s1, s2 in gls12:
            # {s1} and {s2} are either both {None} or both strings:
            assert (s1 == None) == (s2 == None)
            wr.write(f" {tch}")
            if s1 == None:
                wr.write(" !!")
            else:
                wr.write(f" {s1}")
                if s2 != s1: wr.write(f":{s2}")
        wr.write("\n")
        return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    codes1 = sorted(tuple(graph.keys()))
    for cd1 in codes1:
        codes2 = sorted(tuple(graph[cd1].keys()))
        # Compute and write the total score of vertex {cd1}:
        tot_sc12 = 0
        loc1 = None
        for cd2 in codes2:
            if cd2 == 'loc_ec':
                # Pseudo-neighbor holding the parag locus, not an edge:
                loc1 = graph[cd1][cd2]
            else:
                sc12, gls12 = graph[cd1][cd2]
                tot_sc12 += sc12
        wr.write(f"{cd1} {loc1} {tot_sc12:+8.3f}\n")
        # Write the edges out of {cd1}:
        for cd2 in codes2:
            if cd2 != 'loc_ec':
                write_edge(graph[cd1][cd2])
        wr.write("\n")
    wr.close()
    return
# ----------------------------------------------------------------------

def write_global_dic(gra_dic_file, graph):
    # Collects from {graph} the inferred hanzi-EVA dictionary and writes
    # it to {gra_dic_file}: for each hanzi string, its failure count and
    # one line per attesting pair of SBJ entries.

    # These dicts are indexed by a hanzi substring:
    defs = dict()   # Voynichese equivalents.
    bads = dict()   # Count of times when failed to get a match.
    # Each {defs[tch]} is a list of tuples {(s1_ec, cd1, s2_ec, cd2)}.

    codes1 = sorted(tuple(graph.keys()))
    for cd1 in codes1:
        codes2 = sorted(tuple(graph[cd1].keys()))
        for cd2 in codes2:
            # Visit each edge only once, from its lexicographically
            # smaller endpoint:
            if cd2 != 'loc_ec' and cd1 <= cd2:
                sc12, gls12 = graph[cd1][cd2]
                for tch, s1, s2 in gls12:
                    if tch not in defs:
                        defs[tch] = []
                        bads[tch] = 0
                    assert (s1 == None) == (s2 == None)
                    if s1 == None:
                        bads[tch] += 1
                    else:
                        defs[tch].append((s1, cd1, s2, cd2))

    # Write dictionary out:
    wr = open(gra_dic_file, "w")
    wr.reconfigure(encoding='utf-8')
    wr.write("# -*- coding: utf-8 -*-\n")
    strings_ch = sorted(tuple(defs.keys()))
    for tch in strings_ch:
        dfs = sorted(defs[tch])
        wr.write(f"{tch}\n")
        if bads[tch] > 0: wr.write(f" {bads[tch]:4d} failures")
        wr.write("\n")
        for s1, cd1, s2, cd2 in dfs:
            sec = s1 if s1 == s2 else f"{s1}={s2}"
            wr.write(f" {cd1} {cd2} {sec}\n")
        wr.write("\n")
    wr.close()
    return
# ----------------------------------------------------------------------

def compute_dic_score(tx1, tx2, full_match):
    # Looks for the "best" common substring of EVA texts {tx1,tx2}.
    # Specifically, looks for starting indices {j1,j2} and limit indices {i1,i2}
    # such that {s1=tx1[j1:i1]} and {s2=tx2[j2:i2]} have maximum similarity score {sc}.
    # Returns {sc,s1,s2}, or {-inf,None,None} if there is no valid path.
    #
    # If {full_match} is true the strings must be matched in their
    # entirety (the path must span both strings end to end).
    #
    # The texts {tx1} and {tx2} should not contain [-., ].
    # Ignores prefixes and suffixes of {tx1} and {tx2} with are all '·'.
    #
    # NOTE(review): a comment in the original said "NOT: Before the
    # comparison, maps 'ir' to 'iin' and then 'iii' to 'ii'", but
    # {cleanup} below DOES perform those substitutions — confirm which
    # is intended.

    debug = False

    def cleanup(tx):
        # Normalizes an EVA text before comparison.
        assert re.search(r"[-., ]", tx) is None, "bug invalid tx"
        tx = re.sub(r"ir", "iin", tx)
        tx = re.sub(r"iii", "ii", tx)
        return tx
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    tx1 = cleanup(tx1); nt1 = len(tx1)
    tx2 = cleanup(tx2); nt2 = len(tx2)
    if debug:
        err.write(f"!= {tx1 = !r} (cleaned)\n")
        err.write(f"!= {tx2 = !r} (cleaned)\n")

    # Dynamic programming tableau:
    tab = [ None ] * (nt1 + 1)
    for i1 in range(nt1+1): tab[i1] = [ (-inf,None,None) ] * (nt2+1)
    # Each entry {tab[i1][i2]} of the tableau is {(sc,k1,k2)} meaning that
    # the best path that ends at vertex {(i1,i2)} of the dyn prog graph
    # has score {sc} and is the best path that ends at vertex {(k1,k2)}
    # plus the edge from {(k1,k2)} to {(i1,i2)}. If {k1} and {k2} are
    # {None} it means that the best path starts at {(i1,i2)} itself.

    start_points = -2.00  # Penalty for starting a new substring.
    skip_points = -0.75   # Penalty for skipping a letter.
    match_points = +1.00  # Reward for letters that match.
    simil_points = +0.75  # Reward for letters with matching classes.
    diff_points = -1.00   # Penalty for letters that dont match.

    def eva_class(tec):
        # EVA class of commonly confused characters:
        if tec == "a" or tec == "o" or tec == "y":
            return "o"
        elif tec == "k" or tec == "d" or tec == "t" or tec == "p" or tec == "f":
            return "k"
        elif tec == "c" or tec == "s":
            return "c"
        elif tec == "h" or tec == "e":
            return "e"
        else:
            return tec
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    def eva_char_char_points(tec1, tec2):
        # Similarity points for a pair of EVA characters.
        if tec1 == tec2:
            return match_points
        else:
            z1 = eva_class(tec1)
            z2 = eva_class(tec2)
            if z1 == z2:
                return simil_points
            else:
                return diff_points
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    def step_score(k1,k2,i1,i2):
        # Score of the best path that ends at {(i1,i2)} coming from the
        # adjacent vertex {(k1,k2)}, or {-inf} if that move is not valid.
        assert k1 == i1 or k1 == i1-1
        assert k2 == i2 or k2 == i2-1
        if k1 < 0 or k2 < 0: return -inf
        if debug: err.write(f"!= {k1 = } {k2 = } {i1 = } {i2 = }\n")
        vsc = tab[k1][k2][0];
        if vsc == -inf: return -inf
        # Compute the raw edge points {esc}:
        esc = None
        if k1 == i1 or k2 == i2:
            # Horizontal or vertical move: a letter was skipped.
            esc = skip_points
        else:
            # Diagonal move: compare the two letters.
            tec1 = tx1[k1]; tec2 = tx2[k2]
            # Compute the position weight, favoring comparisons near the
            # middles of both strings and at matching relative positions:
            f1 = (k1+1)/(nt1+1); w1 = 4*f1*(1 - f1)
            f2 = (k2+1)/(nt2+1); w2 = 4*f2*(1 - f2)
            f12 = (f1 - f2); w12 = 1 - f12*f12;
            wt = w1*w2*w12; wt = wt*sqrt(wt)
            esc = wt * eva_char_char_points(tec1, tec2)
        if esc == -inf: return -inf
        # Combine with vertex points:
        sc = vsc + esc
        return sc
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    vsc_best = -inf
    i1_best = None; i2_best = None
    for i1 in range(nt1+1):
        for i2 in range(nt2+1):
            # Consider starting a path here (only at the origin when
            # {full_match} is required):
            sc_best = start_points if (i1 == 0 and i2 == 0) or not full_match else -inf
            k1_best = None; k2_best = None
            # Try extending with a diagonal, vertical, or horizontal move:
            for k1,k2 in (i1-1,i2-1), (i1-1,i2), (i1,i2-1):
                sck = step_score(k1,k2,i1,i2)
                if isfinite(sck) and sck > sc_best:
                    sc_best = sck; k1_best = k1; k2_best = k2
            tab[i1][i2] = ( sc_best, k1_best, k2_best )
            if sc_best > vsc_best:
                vsc_best = sc_best
                i1_best = i1; i2_best = i2
    if debug:
        err.write(f"!= tableau:\n")
        print_tableau(tab, nt1, nt2)

    if full_match:
        # The path must end at the end of both strings:
        vsc_best, k1x, k2x = tab[nt1][nt2]
        i1_best = nt1; i2_best = nt2
    if debug: err.write(f"!= {vsc_best = } {i1_best = } {i2_best = }\n")

    # Now recover the best path
    if vsc_best == -inf: return -inf, None, None
    j1 = i1_best; j2 = i2_best
    while True:
        assert j1 is not None and j2 is not None
        vsc, k1, k2 = tab[j1][j2]
        if k1 is None and k2 is None: break
        j1 = k1; j2 = k2
    return vsc_best, tx1[j1:i1_best], tx2[j2:i2_best]
# ----------------------------------------------------------------------

def test_comp(tx1, tx2, full_match):
    # Runs {compute_dic_score} on {tx1,tx2} and prints the result.
    err.write(f"======================================================================\n")
    err.write("testing {compute_dic_score} ...\n")
    err.write(f"{tx1 = !r}\n")
    err.write(f"{tx2 = !r}\n")
    score, s1, s2 = compute_dic_score(tx1, tx2, full_match)
    err.write(f"{score = :+9.4f} {s1 = !r} {s2 = !r}\n")
    err.write(f"======================================================================\n")
    return
# ----------------------------------------------------------------------

def print_tableau(tab,nt1,nt2):
    # Prints the dynamic programming tableau {tab} (scores scaled by 10)
    # to {err}, for debugging.
    err.write(" %s\n" % ("---" * (nt2+1)))
    for i1 in range(nt1+1):
        err.write("  ")
        for i2 in range(nt2+1):
            err.write(f"{tab[i1][i2][0]*10:+3.0f}")
        err.write("\n")
    err.write(" %s\n" % ("---" * (nt2+1)))
    return
# ----------------------------------------------------------------------

def do_tests():
    # Exercises {compute_dic_score} on a few fixed string pairs.
    tx1 = "·······chedyorochody"
    tx2 = "chedyachodyltaiinshedy·······"
    test_comp(tx1, tx2, False)

    tx1 = "·····chedyoroalorqorokairchody"
    tx2 = "dychodyaltaiinshe·······"
    test_comp(tx1, tx2, False)

    tx1 = "ychhedy"
    tx2 = "cheedo"
    test_comp(tx1, tx2, True)

    tx1 = "qokaiin"
    tx2 = "cheedy"
    test_comp(tx1, tx2, True)
    return
# ----------------------------------------------------------------------

if __name__ == "__main__":
    if sys.argv[1] == "MGF.TEST":
        do_tests()
    else:
        iarg = 1
        dir = sys.argv[iarg]; iarg += 1
        tsize = int(sys.argv[iarg]); iarg += 1
        main(dir, tsize)