#! /usr/bin/python3
# Last edited on 2025-07-11 12:35:18 by stolfi
# A {python3} library module for flexible alignment of VMS transcribed lines.

from math import inf

def optimum_alignment(st, data, pdiff, pskip, max_skip):
  # Given two strings {data[0],data[1]} with lengths {n0} and {n1},
  # computes the optimum alignment between their characters.
  # 
  # A /partial alignment/ {a} is a pair of lists with the same length {nk}. The
  # elements {a[i][0..nk-1]} are a subset of the indices of characters
  # of {data[i]}, in strictly increasing order; and {a[i][nk]} is the
  # length of {data[i]}. For each {k} in {0..nk-1}, the alignment /pairs
  # up/ character {c0[k]=data[0][a[0][k]]} with character
  # {c1[k]=data[1][a[1][k]]}. Not all characters will be paired up. 
  #
  # A /complete alignment/ is a partial alignment with {nk+1} pairs
  # where {a[nk] = (n0,n1)} pairs up the non-existent characters just
  # past the end of {data[0]] and {data[1]}.
  # 
  # This procedure computes a complete alignment {a[0..1][0..nk]} for
  # {data[0..1]} with mininmum /total penalty/, which is the sum of a
  # /mismatch term/ and a /slippage term/.
  #
  # The mismatch term is the sum for {k} in {0..nk-1} of a character
  # discrepancy function {pdiff(c0[k],c1[k])} that measures how
  # different are the two characters.  In particular, this term
  # should be zero if the characters can be assumed to perfectly match,
  # {+inf} if they cannot be matched at all, otherwise a positive
  # number.
  # 
  # The slippage term is the sum over {k} in {0..nk-1} of {pskip(d0[k])
  # + pskip(d1[k])}, where {d0[k] = a[0][k+1]-a[0][k]-1} and {d1[k] =
  # a[1][k+1]-a[1][k]-1}. In particular, {pskip(d)} should be zero if
  # {d==0}, {+inf} if {d} exceeds some limit {max_skip}, and a finite
  # positive increasing function of {d} otherwise.
  # 
  # The cost of this procedure is proportional to {n0*n1*max_skip^2},
  # so you want to keep {max_skip} as small as possible.
  #
  # The min value of {pskip} for positive argument should be large
  # compared to the max finite value of {pdiff}, otherwise the procedure
  # will rather skip over chars that are matchable but not identical,
  # instead of matching them.
  # 
  # The procedure returns the optimum matching {a} and its total penalty {Y}.
  # If {Y} is {+inf}, the matching failed, and {a} will be just the 
  # dummy pair {([n0],[n1])}.

  n0 = len(data[0])
  n1 = len(data[1])
  Yskip = None # Reset later.
    
  # Create the penalty table {Y} and the pointer table {P}.
  # 
  # Entry {Y[i0,i1]} says what is the total penalty for the best partial
  # alignment whose last pair is {data[0][i0]} and {data[1][i1]}; or
  # {+inf} if there is no valid alignment that includes that pair.
  # 
  # Entry {P[i0,i1]} is the next-to-last pair of that partial
  # alignment; or {None} if {Y[i0,i1]} is {+inf}, or if the best alignment
  # that includes that pair has no previous pair.
  #
  # In particular, {Y[i0,i1]==+inf} and {P[i0,i1]==None} 
  # if {i0==n0} or {i1==n1}, except when {i0==no} and {i1==n1}.
  #
  
  def compute_table_entries(i0, i1):
    # Computes the entries {Y[i0,i1]} and {P[i0,i1]} assuming 
    # computed all entries {Y[j0,j1]} and {P[j0,j1]} with {j0} in {0..i0}
    # and {j1} in {0..i1} except of course {j0,j1 == i0,i1}.
    nonlocal n0, n1, Y, P, Yskip
    
    # Compute the mismatch term {pm}:
    if i0 == n0 or i1 == n1:
      pm = 0 if i0 == n0 and i1 == n1 else +inf
    else:
      pm = pdiff(data[0][i0], data[1][i1])

    # Compute {Y_best}, the min value of an align that ends at {(i0,i1)},
    # and {P_best}, the pair {(j0,j1)} that precedes {(i0,i1)} in that 
    # path. Initialize {Y_best,P_best} with  the option to start the align at {(i0,i1)}:
    Y[i0][i1], P[i0][i1] = find_best_non_trivial_path(i0, i1, pm)
    return
    # .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .

  def find_best_non_trivial_path(i0, i1, pm):
    nonlocal n0, n1, Y, P, Yskip
    # Find the best non-trivial alignment that ends with {(i0,i1)},
    # given that the matching penalty for that pair is {pm}.
    # If there is 

    Y_best = pm + pskip(i0) + pskip(i1)
    P_best = None
    if pm != +inf:
      # Check all possibilities for previous pair {(j0,j1)}:
      for d0 in range(max_skip):
        j0 = i0 - d0 - 1 
        if j0 >= 0:
          for d1 in range(max_skip):
            j1 = i1 - d1 - 1
            if j1 >= 0:
              # get {Yj} = total penalty of align that ends ...,(j0,j1),(i0,i1).
              Yj = Y[j0][j1] + Yskip[d0] + Yskip[d1] + pm 
              if Yj < Y_best:
                Y_best = Yj
                P_best = (j0,j1)
    return Y_best, P_best
    # .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
        
  def extract_alignment():
    nonlocal n0, n1, Y, P
    i0 = n0; i1 = n1
    a = ([], [])
    while True:
      assert i0 >= 0 and i1 >= 0, "bug extract"
      a[0].append(i0)
      a[1].append(i1)
      if P[i0][i1] == None:
        a[0].reverse(); a[1].reverse()
        return a, Y[n0][n1]
      else:
        i0, i1 = P[i0][i1]
    assert False
    # .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
  
  Y = [None]*(n0+1); P = [None]*(n0+1)
  for i0 in range(n0+1):
    Y[i0] = [None]*(n1+1)
    P[i0] = [None]*(n1+1)
    
  # Tabulate {pskip(d)} for efficiency:
  Yskip = [ pskip(d) for d in range(max_skip+2) ]
  assert Yskip[0] == 0, "bug in {pskip} (0)"
  assert Yskip[max_skip] < +inf, "bug in {pskip} (m):" + f" {Yskip}"
  assert Yskip[max_skip+1] == +inf, "bug in {pskip} (m+1):" + f" {Yskip}"
  
  # Fill the tables:
  for i0 in range(n0+1):
    for i1 in range(n1+1):
      compute_table_entries(i0, i1)
  
  # Extract the optimum alignment from the {P} table:
  align, penalty = extract_alignment()
  return align, penalty
  # ......................................................................