#! /usr/bin/python3
# Last edited on 2026-01-14 17:58:22 by stolfi
# Compares the paragraph structure of two books, given their words-per-parag
# ".wpp" files.  Writes similarity-matrix images and searches for a
# low-weight pairing of the parags of book 1 to the parags of book 2.

import sys, os, re
from sys import stdout as out, stderr as err
from math import log, sqrt, hypot, floor, ceil, inf, pow
import numpy as np
import vms_linear_gray_image_funcs as gf
from process_funcs import bash
from error_funcs import file_line_error

def qdif(nw1, avg1, nw2, avg2):
    # Quadratic discrepancy between word counts {nw1,nw2} considering that
    # the respective average values should be {avg1,avg2}.
    #
    # The result is zero if both word counts have the same ratio to the
    # respective averages.
    assert nw1 > 0 and nw2 > 0, "invalid word counts"
    fw1 = nw1/avg1 - avg1/nw1
    fw2 = nw2/avg2 - avg2/nw2
    d = fw1 - fw2
    return d*d
    # ----------------------------------------------------------------------

def importance(nw1, avg1, nw2, avg2):
    # The relative importance of matching counts {nw1,nw2} considering that
    # the respective average values should be {avg1,avg2}.
    # The importance gets higher if either count is away from the
    # respective average.
    assert nw1 > 0 and nw2 > 0, "invalid word counts"
    fw1 = nw1/avg1 - avg1/nw1
    fw2 = nw2/avg2 - avg2/nw2
    imp = hypot(1, hypot(fw1, fw2))
    return imp
    # ----------------------------------------------------------------------

def main(name1, name2):
    # Reads "out/{name1}.wpp" and "out/{name2}.wpp", writes and displays the
    # similarity matrices {D} and {E} as images, then finds and displays a
    # heuristic parag matching.

    loc1, nwp1, totw1 = read_words_per_parag(f"out/{name1}.wpp")
    npara1 = len(nwp1)
    # NOTE(review): the "+ npara1" adds one word per parag, presumably as
    # deliberate smoothing of the average -- confirm.
    avg_nwp1 = (totw1 + npara1)/npara1

    loc2, nwp2, totw2 = read_words_per_parag(f"out/{name2}.wpp")
    npara2 = len(nwp2)
    avg_nwp2 = (totw2 + npara2)/npara2

    # Create image {D} such that {D[i1,i2]} is the similarity (in [0_1]) of
    # parag {i1} of book 1 and parag {i2} of book 2:
    err.write("computing the basic similarity matrix {D} ...\n")
    D = np.zeros((npara1,npara2))
    for i1 in range(npara1):
        for i2 in range(npara2):
            qd = qdif(nwp1[i1], avg_nwp1, nwp2[i2], avg_nwp2)
            sim = 1/(1 + qd)
            D[i1,i2] = pow(sim, 8)  # Sharpen the contrast of the image.
    dfile = f"out/{name1}-{name2}-rdif.png"
    gf.write_image_as_gray_png(dfile, D)
    bash(f"display -resize '200%' {dfile}")

    # Create an image {E} derived from {D} but taking adjacency into account:
    # (BUG FIX: this message used to say "{D}", a copy-paste slip.)
    err.write("computing the adjacency similarity matrix {E} ...\n")
    E = np.zeros((npara1,npara2))
    for i1 in range(1,npara1-1):
        for i2 in range(1,npara2-1):
            # Reinforce {D[i1,i2]} if either diagonal neighbor also matches:
            E[i1,i2] = max(D[i1-1,i2-1]*D[i1,i2], D[i1+1,i2+1]*D[i1,i2])
    efile = f"out/{name1}-{name2}-edif.png"
    gf.write_image_as_gray_png(efile, E)
    bash(f"display -resize '200%' {efile}")

    # Try to find a min weight matching, where the weight of pairing {i1} with {i2}
    # is proportional to the size of the parags times the abs difference of size,
    # corrected by the averages.
    passes = 20
    K12, M, P = find_best_matching(nwp1, avg_nwp1, loc1, nwp2, avg_nwp2, loc2, D, passes)
    mfile = f"out/{name1}-{name2}-mval-{passes:02d}.png"
    gf.write_image_as_gray_png(mfile, M)
    bash(f"display -resize '200%' {mfile}")
    pfile = f"out/{name1}-{name2}-mper-{passes:02d}.png"
    gf.write_image_as_gray_png(pfile, P)
    bash(f"display -resize '200%' {pfile}")
    return
    # ----------------------------------------------------------------------

def find_best_matching(nwp1, avg_nwp1, loc1, nwp2, avg_nwp2, loc2, D, passes):
    # Tries to assign to each parag of book 1 a parag of book 2 in a way
    # that minimizes the total quadratic discrepancy weight
    # plus jump weights.
    #
    # Returns an array {K12[0..npara1-1]} that specifies that parag {i1}
    # of book 1 should be matched to parag {i2 = K12[i1]} of book 2.
    # Also returns an array {M} that is {D} with columns permuted
    # as specified by {K12}, and a 0-1 matrix {P} that shows the permutation.

    npara1 = len(nwp1)
    npara2 = len(nwp2)
    assert npara1 <= npara2, "should match smaller book to larger book"

    def mismatch_weight(i1, i2):
        # Quadratic mismatch term for matching parag {i1} of file 1 with parag {i2 = K12[i1]} of file 2:
        nonlocal npara1, nwp1, avg_nwp1, npara2, nwp2, avg_nwp2
        assert 0 <= i1 < npara1, f"invalid index {i1 = }"
        # (BUG FIX: this message used to report {i1} instead of {i2}.)
        assert 0 <= i2 < npara2, f"invalid index {i2 = }"
        imp = importance(nwp1[i1], avg_nwp1, nwp2[i2], avg_nwp2)
        qd = qdif(nwp1[i1], avg_nwp1, nwp2[i2], avg_nwp2)
        return imp*qd
        # ....................................................................

    # Throw an initial matching:
    K12 = [ None ] * npara1 # The matching is of {loc1[i1]} to {loc2[K12[i1]]}.

    def jumps_at(ip):
        # Returns 0 if parags {ip} and {ip+1} of book 1 are mapped to
        # successive parags of book 2; else 1.
        # Returns 0 also if either of those two parags don't exist.
        nonlocal K12
        return 0 if ip < 0 or ip+1 >= npara1 or K12[ip+1] == K12[ip]+1 else 1
        # ....................................................................

    jump_wt = 0.01 # Weight penalty per jump point.

    def jump_weight(i1, j1):
        # Weight term due to jump of the current matchings
        # of {i1} and {j1} with the matchings of their neighbors.
        # We get a jump point when two successive parags of book 1
        # are NOT matched to successive parags of book 2.
        nonlocal jump_wt
        assert i1 != j1
        nj = 0 # Number of jumps around {i1} and {j1}.
        nj += jumps_at(i1) + jumps_at(j1)
        # Avoid double-counting the boundary between adjacent {i1,j1}:
        if j1 != i1 + 1: nj += jumps_at(j1-1)
        if i1 != j1 + 1: nj += jumps_at(i1-1)
        return nj * jump_wt
        # ....................................................................

    def weight_gain_of_swap(i1, j1):
        # How much the total weight changes if we swap {K12[i1]} and {K12[j1]}.
        # Weight of current matching, for those two edges:
        w_curr = mismatch_weight(i1, K12[i1]) + mismatch_weight(j1, K12[j1]) + jump_weight(i1, j1)
        # Perform the swap temporarily:
        K12[i1],K12[j1] = K12[j1],K12[i1]
        # Weight of modified matching, for those two edges:
        w_swap = mismatch_weight(i1, K12[i1]) + mismatch_weight(j1, K12[j1]) + jump_weight(i1, j1)
        # Undo the swap for now:
        K12[i1],K12[j1] = K12[j1],K12[i1]
        return w_swap - w_curr
        # ....................................................................

    # Create an initial match that pairs parags by size rank:
    J1 = [ (i1, nwp1[i1],) for i1 in range(npara1) ]
    J2 = [ (i2, nwp2[i2],) for i2 in range(npara2) ]
    for b in range(2):
        J, nwp, avg, loc = ((J1, nwp1, avg_nwp1, loc1), (J2, nwp2, avg_nwp2, loc2))[b]
        J.sort(key = lambda x: x[1])
        # (BUG FIX: this message lacked the "f" prefix and printed "{b}" literally.)
        err.write(f"most extreme parags in book {b}:\n")
        for k in range(5):
            i = J[k][0]
            err.write(f" {i:3d} {loc[i]:<12s} {nwp[i]:3d}\n")
        err.write("...\n")
        # (BUG FIX: index was {npara1-6+k}, which skipped the largest parag of
        # book 1 and used the wrong length for book 2.)
        for k in range(5):
            i = J[len(J)-5+k][0]
            err.write(f" {i:3d} {loc[i]:<12s} {nwp[i]:3d}\n")
        err.write("\n")
    for k1 in range(npara1):
        i1 = J1[k1][0]
        k2 = k1*(npara2-1)//(npara1-1)  # Corresponding rank in book 2.
        i2 = J2[k2][0]
        K12[i1] = i2
    err.write("initial matching:\n")
    for i1 in range(npara1):
        i2 = K12[i1]
        err.write(f" {i1:3d} : {nwp1[i1]:2d} -> {i2:3d} : {nwp2[i2]:2d}\n")

    improve_matching(K12, weight_gain_of_swap, passes)

    # Compute {M} a col-perm of {D} with matching parags on the diagonal:
    M = np.zeros((npara1,npara1))
    for i1 in range(npara1):
        i2 = K12[i1]
        for k1 in range(npara1):
            M[k1,i1] = D[k1,i2]

    # Create matrix P that shows the permutation:
    P = np.zeros((npara1,npara2))
    for i1 in range(npara1):
        i2 = K12[i1]
        P[i1,i2] = 1

    # Print runs of consecutive matches:
    err.write("consecutive matching runs:\n")
    prt = [ False ] * npara1 # {prt[i1]} is true if parag {i1} was listed already.
    for i1 in range(npara1-1):
        if not prt[i1] and K12[i1+1] == K12[i1]+1:
            # Start of a run:
            j1 = i1
            while j1 < npara1 and (j1 == i1 or K12[j1-1] == K12[j1]-1):
                j2 = K12[j1]
                qwt = mismatch_weight(j1, j2)
                fwp1 = nwp1[j1]/avg_nwp1
                fwp2 = nwp2[j2]/avg_nwp2
                err.write(f"{loc1[j1]:<12s} {nwp1[j1]:2d} {fwp1:5.3f} -> {loc2[j2]:<12s} {nwp2[j2]:2d} {fwp2:5.3f} w = {qwt:10.4f}\n")
                prt[j1] = True
                j1 += 1
            err.write("\n")
    return K12, M, P
    # ----------------------------------------------------------------------

def improve_matching(K12, weight_gain_of_swap, passes):
    # Tries to improve the matching {K12} by swapping matching parags.
    # The function {weight_gain_of_swap(i1,j1)} should return the change in the total weight
    # if one were to swap {K12[i1]} and {K12[j1]}.
    npara1 = len(K12)
    # Try to swap matchings to reduce the total weight:
    for p in range(passes):
        err.write(f"pass {p} ...\n")
        npair = 0 # Number of pairs tried in pass.
        nswap = 0 # Number of swaps done in pass.
        for i1 in range(npara1):
            for j1 in range(i1+1,npara1):
                dbg = (npair < 10)  # Trace only the first few pairs of each pass.
                npair += 1
                i2 = K12[i1]; j2 = K12[j1]
                if dbg:
                    err.write(f" swapping {i1:3d}->{i2:3d}, {j1:3d}->{j2:3d}")
                    err.write(f" for {i1:3d}->{j2:3d}, {j1:3d}->{i2:3d}")
                assert i1 != j1 and i2 != j2
                assert 0 <= i1 < npara1
                assert 0 <= j1 < npara1
                # Try to swap matchings of {i1} and {j1}:
                w_gain = weight_gain_of_swap(i1, j1)
                if dbg: err.write(f" {w_gain = :10.4f}")
                if w_gain < 0:
                    # Swap the matchings:
                    if dbg: err.write(f" (swapped)")
                    K12[i1],K12[j1] = K12[j1],K12[i1]
                    nswap += 1
                if dbg: err.write(f"\n")
        err.write(f"{nswap:8d} swaps executed in {npair:8d} attempts\n")
    return
    # ----------------------------------------------------------------------

def read_words_per_parag(fname):
    # Reads file {fname} with one data line per parag, in the format
    # "<loc> {count}" where {loc} is a location tag and {count} is the number
    # of words in that parag.  Blank and "#"-comment lines are ignored.
    # Returns the lists {loc} and {nwp} of location tags and word counts,
    # and the total word count {totw}.  Writes summary stats to stderr.
    nwp = []
    loc = []
    totw = 0
    maxw = -inf
    minw = +inf
    nread = 0 # Number of lines read from file.
    # (FIX: use a {with} block so the file is closed; it was left open before.)
    with open(fname, "r", encoding='iso-8859-1') as rd:
        for line in rd:
            nread += 1
            line = line.strip()
            if re.match(r"[ ]*([#]|$)", line): continue
            m = re.match(r" *([<][a-z0-9.;]+[>])[ ]+([0-9]+)", line)
            if m is None: file_line_error(fname,nread,"bad format",line)
            loc_k = m.group(1)
            nwp_k = int(m.group(2))
            loc.append(loc_k)
            nwp.append(nwp_k)
            totw += nwp_k
            maxw = max(maxw, nwp_k)
            minw = min(minw, nwp_k)
    err.write(f"{nread:5d} lines read from {fname}\n")
    err.write(f"{len(nwp):5d} parag sizes found\n")
    err.write(f"{totw:5d} total words\n")
    # (FIX: guard against a file with no parags, which divided by zero here.)
    if len(nwp) > 0:
        err.write(f"{totw/len(nwp):8.3f} average words per parag\n")
    err.write(f"word count range {minw}..{maxw}\n")
    err.write("\n")
    return loc, nwp, totw
    # ----------------------------------------------------------------------

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])