#! /usr/bin/python3
# Last edited on 2026-01-14 17:58:22 by stolfi
# Compares the paragraph structure of two books, given their words-per-parag
# ".wpp" files.  Writes similarity-matrix images and searches for a
# low-weight pairing of the parags of book 1 to the parags of book 2.

import sys, os, re
from sys import stdout as out, stderr as err
from math import log, sqrt, hypot, floor, ceil, inf, pow
import numpy as np
import vms_linear_gray_image_funcs as gf
from process_funcs import bash
from error_funcs import file_line_error

def qdif(nw1, avg1, nw2, avg2):
    # Quadratic discrepancy between word counts {nw1,nw2} considering that
    # the respective average values should be {avg1,avg2}.
    #
    # The result is zero if both word counts have the same ratio to the
    # respective averages.
    assert nw1 > 0 and nw2 > 0, "invalid word counts"
    fw1 = nw1/avg1 - avg1/nw1
    fw2 = nw2/avg2 - avg2/nw2
    d = fw1 - fw2
    return d*d
    # ----------------------------------------------------------------------

def importance(nw1, avg1, nw2, avg2):
    # The relative importance of matching counts {nw1,nw2} considering that
    # the respective average values should be {avg1,avg2}.
    # The importance gets higher if either count is away from the
    # respective average.
    assert nw1 > 0 and nw2 > 0, "invalid word counts"
    fw1 = nw1/avg1 - avg1/nw1
    fw2 = nw2/avg2 - avg2/nw2
    imp = hypot(1, hypot(fw1, fw2))
    return imp
    # ----------------------------------------------------------------------

def main(name1, name2):
    # Reads "out/{name1}.wpp" and "out/{name2}.wpp", writes and displays the
    # similarity matrices {D} and {E} as images, then finds and displays a
    # heuristic parag matching.

    loc1, nwp1, totw1 = read_words_per_parag(f"out/{name1}.wpp")
    npara1 = len(nwp1)
    # NOTE(review): the "+ npara1" adds one word per parag, presumably as
    # deliberate smoothing of the average -- confirm.
    avg_nwp1 = (totw1 + npara1)/npara1

    loc2, nwp2, totw2 = read_words_per_parag(f"out/{name2}.wpp")
    npara2 = len(nwp2)
    avg_nwp2 = (totw2 + npara2)/npara2

    # Create image {D} such that {D[i1,i2]} is the similarity (in [0_1]) of
    # parag {i1} of book 1 and parag {i2} of book 2:
    err.write("computing the basic similarity matrix {D} ...\n")
    D = np.zeros((npara1,npara2))
    for i1 in range(npara1):
        for i2 in range(npara2):
            qd = qdif(nwp1[i1], avg_nwp1, nwp2[i2], avg_nwp2)
            sim = 1/(1 + qd)
            D[i1,i2] = pow(sim, 8)  # Sharpen the contrast of the image.
    dfile = f"out/{name1}-{name2}-rdif.png"
    gf.write_image_as_gray_png(dfile, D)
    bash(f"display -resize '200%' {dfile}")

    # Create an image {E} derived from {D} but taking adjacency into account:
    # (BUG FIX: this message used to say "{D}", a copy-paste slip.)
    err.write("computing the adjacency similarity matrix {E} ...\n")
    E = np.zeros((npara1,npara2))
    for i1 in range(1,npara1-1):
        for i2 in range(1,npara2-1):
            # Reinforce {D[i1,i2]} if either diagonal neighbor also matches:
            E[i1,i2] = max(D[i1-1,i2-1]*D[i1,i2], D[i1+1,i2+1]*D[i1,i2])
    efile = f"out/{name1}-{name2}-edif.png"
    gf.write_image_as_gray_png(efile, E)
    bash(f"display -resize '200%' {efile}")

    # Try to find a min weight matching, where the weight of pairing {i1} with {i2}
    # is proportional to the size of the parags times the abs difference of size,
    # corrected by the averages.
    passes = 20
    K12, M, P = find_best_matching(nwp1, avg_nwp1, loc1, nwp2, avg_nwp2, loc2, D, passes)
    mfile = f"out/{name1}-{name2}-mval-{passes:02d}.png"
    gf.write_image_as_gray_png(mfile, M)
    bash(f"display -resize '200%' {mfile}")
    pfile = f"out/{name1}-{name2}-mper-{passes:02d}.png"
    gf.write_image_as_gray_png(pfile, P)
    bash(f"display -resize '200%' {pfile}")
    return
    # ----------------------------------------------------------------------

def find_best_matching(nwp1, avg_nwp1, loc1, nwp2, avg_nwp2, loc2, D, passes):
    # Tries to assign to each parag of book 1 a parag of book 2 in a way
    # that minimizes the total quadratic discrepancy weight
    # plus jump weights.
    #
    # Returns an array {K12[0..npara1-1]} that specifies that parag {i1}
    # of book 1 should be matched to parag {i2 = K12[i1]} of book 2.
    # Also returns an array {M} that is {D} with columns permuted
    # as specified by {K12}, and a 0-1 matrix {P} that shows the permutation.

    npara1 = len(nwp1)
    npara2 = len(nwp2)
    assert npara1 <= npara2, "should match smaller book to larger book"

    def mismatch_weight(i1, i2):
        # Quadratic mismatch term for matching parag {i1} of file 1 with parag {i2 = K12[i1]} of file 2:
        nonlocal npara1, nwp1, avg_nwp1, npara2, nwp2, avg_nwp2
        assert 0 <= i1 < npara1, f"invalid index {i1 = }"
        # (BUG FIX: this message used to report {i1} instead of {i2}.)
        assert 0 <= i2 < npara2, f"invalid index {i2 = }"
        imp = importance(nwp1[i1], avg_nwp1, nwp2[i2], avg_nwp2)
        qd = qdif(nwp1[i1], avg_nwp1, nwp2[i2], avg_nwp2)
        return imp*qd
        # ....................................................................

    # Throw an initial matching:
    K12 = [ None ] * npara1 # The matching is of {loc1[i1]} to {loc2[K12[i1]]}.

    def jumps_at(ip):
        # Returns 0 if parags {ip} and {ip+1} of book 1 are mapped to
        # successive parags of book 2; else 1.
        # Returns 0 also if either of those two parags don't exist.
        nonlocal K12
        return 0 if ip < 0 or ip+1 >= npara1 or K12[ip+1] == K12[ip]+1 else 1
        # ....................................................................

    jump_wt = 0.01 # Weight penalty per jump point.

    def jump_weight(i1, j1):
        # Weight term due to jump of the current matchings
        # of {i1} and {j1} with the matchings of their neighbors.
        # We get a jump point when two successive parags of book 1
        # are NOT matched to successive parags of book 2.
        nonlocal jump_wt
        assert i1 != j1
        nj = 0 # Number of jumps around {i1} and {j1}.
        nj += jumps_at(i1) + jumps_at(j1)
        # Avoid double-counting the boundary between adjacent {i1,j1}:
        if j1 != i1 + 1: nj += jumps_at(j1-1)
        if i1 != j1 + 1: nj += jumps_at(i1-1)
        return nj * jump_wt
        # ....................................................................

    def weight_gain_of_swap(i1, j1):
        # How much the total weight changes if we swap {K12[i1]} and {K12[j1]}.
        # Weight of current matching, for those two edges:
        w_curr = mismatch_weight(i1, K12[i1]) + mismatch_weight(j1, K12[j1]) + jump_weight(i1, j1)
        # Perform the swap temporarily:
        K12[i1],K12[j1] = K12[j1],K12[i1]
        # Weight of modified matching, for those two edges:
        w_swap = mismatch_weight(i1, K12[i1]) + mismatch_weight(j1, K12[j1]) + jump_weight(i1, j1)
        # Undo the swap for now:
        K12[i1],K12[j1] = K12[j1],K12[i1]
        return w_swap - w_curr
        # ....................................................................

    # Create an initial match that pairs parags by size rank:
    J1 = [ (i1, nwp1[i1],) for i1 in range(npara1) ]
    J2 = [ (i2, nwp2[i2],) for i2 in range(npara2) ]
    for b in range(2):
        J, nwp, avg, loc = ((J1, nwp1, avg_nwp1, loc1), (J2, nwp2, avg_nwp2, loc2))[b]
        J.sort(key = lambda x: x[1])
        # (BUG FIX: this message lacked the "f" prefix and printed "{b}" literally.)
        err.write(f"most extreme parags in book {b}:\n")
        for k in range(5):
            i = J[k][0]
            err.write(f" {i:3d} {loc[i]:<12s} {nwp[i]:3d}\n")
        err.write("...\n")
        # (BUG FIX: index was {npara1-6+k}, which skipped the largest parag of
        # book 1 and used the wrong length for book 2.)
        for k in range(5):
            i = J[len(J)-5+k][0]
            err.write(f" {i:3d} {loc[i]:<12s} {nwp[i]:3d}\n")
        err.write("\n")
    for k1 in range(npara1):
        i1 = J1[k1][0]
        k2 = k1*(npara2-1)//(npara1-1)  # Corresponding rank in book 2.
        i2 = J2[k2][0]
        K12[i1] = i2
    err.write("initial matching:\n")
    for i1 in range(npara1):
        i2 = K12[i1]
        err.write(f" {i1:3d} : {nwp1[i1]:2d} -> {i2:3d} : {nwp2[i2]:2d}\n")

    improve_matching(K12, weight_gain_of_swap, passes)

    # Compute {M} a col-perm of {D} with matching parags on the diagonal:
    M = np.zeros((npara1,npara1))
    for i1 in range(npara1):
        i2 = K12[i1]
        for k1 in range(npara1):
            M[k1,i1] = D[k1,i2]

    # Create matrix P that shows the permutation:
    P = np.zeros((npara1,npara2))
    for i1 in range(npara1):
        i2 = K12[i1]
        P[i1,i2] = 1

    # Print runs of consecutive matches:
    err.write("consecutive matching runs:\n")
    prt = [ False ] * npara1 # {prt[i1]} is true if parag {i1} was listed already.
    for i1 in range(npara1-1):
        if not prt[i1] and K12[i1+1] == K12[i1]+1:
            # Start of a run:
            j1 = i1
            while j1 < npara1 and (j1 == i1 or K12[j1-1] == K12[j1]-1):
                j2 = K12[j1]
                qwt = mismatch_weight(j1, j2)
                fwp1 = nwp1[j1]/avg_nwp1
                fwp2 = nwp2[j2]/avg_nwp2
                err.write(f"{loc1[j1]:<12s} {nwp1[j1]:2d} {fwp1:5.3f} -> {loc2[j2]:<12s} {nwp2[j2]:2d} {fwp2:5.3f} w = {qwt:10.4f}\n")
                prt[j1] = True
                j1 += 1
            err.write("\n")
    return K12, M, P
    # ----------------------------------------------------------------------

def improve_matching(K12, weight_gain_of_swap, passes):
    # Tries to improve the matching {K12} by swapping matching parags.
    # The function {weight_gain_of_swap(i1,j1)} should return the change in the total weight
    # if one were to swap {K12[i1]} and {K12[j1]}.
    npara1 = len(K12)
    # Try to swap matchings to reduce the total weight:
    for p in range(passes):
        err.write(f"pass {p} ...\n")
        npair = 0 # Number of pairs tried in pass.
        nswap = 0 # Number of swaps done in pass.
        for i1 in range(npara1):
            for j1 in range(i1+1,npara1):
                dbg = (npair < 10)  # Trace only the first few pairs of each pass.
                npair += 1
                i2 = K12[i1]; j2 = K12[j1]
                if dbg:
                    err.write(f" swapping {i1:3d}->{i2:3d}, {j1:3d}->{j2:3d}")
                    err.write(f" for {i1:3d}->{j2:3d}, {j1:3d}->{i2:3d}")
                assert i1 != j1 and i2 != j2
                assert 0 <= i1 < npara1
                assert 0 <= j1 < npara1
                # Try to swap matchings of {i1} and {j1}:
                w_gain = weight_gain_of_swap(i1, j1)
                if dbg: err.write(f" {w_gain = :10.4f}")
                if w_gain < 0:
                    # Swap the matchings:
                    if dbg: err.write(f" (swapped)")
                    K12[i1],K12[j1] = K12[j1],K12[i1]
                    nswap += 1
                if dbg: err.write(f"\n")
        err.write(f"{nswap:8d} swaps executed in {npair:8d} attempts\n")
    return
    # ----------------------------------------------------------------------

def read_words_per_parag(fname):
    # Reads file {fname} with one data line per parag, in the format
    # "<loc> {count}" where {loc} is a location tag and {count} is the number
    # of words in that parag.  Blank and "#"-comment lines are ignored.
    # Returns the lists {loc} and {nwp} of location tags and word counts,
    # and the total word count {totw}.  Writes summary stats to stderr.
    nwp = []
    loc = []
    totw = 0
    maxw = -inf
    minw = +inf
    nread = 0 # Number of lines read from file.
    # (FIX: use a {with} block so the file is closed; it was left open before.)
    with open(fname, "r", encoding='iso-8859-1') as rd:
        for line in rd:
            nread += 1
            line = line.strip()
            if re.match(r"[ ]*([#]|$)", line): continue
            m = re.match(r" *([<][a-z0-9.;]+[>])[ ]+([0-9]+)", line)
            if m is None: file_line_error(fname,nread,"bad format",line)
            loc_k = m.group(1)
            nwp_k = int(m.group(2))
            loc.append(loc_k)
            nwp.append(nwp_k)
            totw += nwp_k
            maxw = max(maxw, nwp_k)
            minw = min(minw, nwp_k)
    err.write(f"{nread:5d} lines read from {fname}\n")
    err.write(f"{len(nwp):5d} parag sizes found\n")
    err.write(f"{totw:5d} total words\n")
    # (FIX: guard against a file with no parags, which divided by zero here.)
    if len(nwp) > 0:
        err.write(f"{totw/len(nwp):8.3f} average words per parag\n")
    err.write(f"word count range {minw}..{maxw}\n")
    err.write("\n")
    return loc, nwp, totw
    # ----------------------------------------------------------------------

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])