#! /usr/bin/python3 # Last edited on 2025-07-11 12:35:18 by stolfi # A {python3} library module for flexible alignment of VMS transcribed lines. from math import inf def optimum_alignment(st, data, pdiff, pskip, max_skip): # Given two strings {data[0],data[1]} with lengths {n0} and {n1}, # computes the optimum alignment between their characters. # # A /partial alignment/ {a} is a pair of lists with the same length {nk}. The # elements {a[i][0..nk-1]} are a subset of the indices of characters # of {data[i]}, in strictly increasing order; and {a[i][nk]} is the # length of {data[i]}. For each {k} in {0..nk-1}, the alignment /pairs # up/ character {c0[k]=data[0][a[0][k]]} with character # {c1[k]=data[1][a[1][k]]}. Not all characters will be paired up. # # A /complete alignment/ is a partial alignment with {nk+1} pairs # where {a[nk] = (n0,n1)} pairs up the non-existent characters just # past the end of {data[0]] and {data[1]}. # # This procedure computes a complete alignment {a[0..1][0..nk]} for # {data[0..1]} with mininmum /total penalty/, which is the sum of a # /mismatch term/ and a /slippage term/. # # The mismatch term is the sum for {k} in {0..nk-1} of a character # discrepancy function {pdiff(c0[k],c1[k])} that measures how # different are the two characters. In particular, this term # should be zero if the characters can be assumed to perfectly match, # {+inf} if they cannot be matched at all, otherwise a positive # number. # # The slippage term is the sum over {k} in {0..nk-1} of {pskip(d0[k]) # + pskip(d1[k])}, where {d0[k] = a[0][k+1]-a[0][k]-1} and {d1[k] = # a[1][k+1]-a[1][k]-1}. In particular, {pskip(d)} should be zero if # {d==0}, {+inf} if {d} exceeds some limit {max_skip}, and a finite # positive increasing function of {d} otherwise. # # The cost of this procedure is proportional to {n0*n1*max_skip^2}, # so you want to keep {max_skip} as small as possible. # # The min value of {pskip} for positive argument should be large # compared to the max finite value of {pdiff}, otherwise the procedure # will rather skip over chars that are matchable but not identical, # instead of matching them. # # The procedure returns the optimum matching {a} and its total penalty {Y}. # If {Y} is {+inf}, the matching failed, and {a} will be just the # dummy pair {([n0],[n1])}. n0 = len(data[0]) n1 = len(data[1]) Yskip = None # Reset later. # Create the penalty table {Y} and the pointer table {P}. # # Entry {Y[i0,i1]} says what is the total penalty for the best partial # alignment whose last pair is {data[0][i0]} and {data[1][i1]}; or # {+inf} if there is no valid alignment that includes that pair. # # Entry {P[i0,i1]} is the next-to-last pair of that partial # alignment; or {None} if {Y[i0,i1]} is {+inf}, or if the best alignment # that includes that pair has no previous pair. # # In particular, {Y[i0,i1]==+inf} and {P[i0,i1]==None} # if {i0==n0} or {i1==n1}, except when {i0==no} and {i1==n1}. # def compute_table_entries(i0, i1): # Computes the entries {Y[i0,i1]} and {P[i0,i1]} assuming # computed all entries {Y[j0,j1]} and {P[j0,j1]} with {j0} in {0..i0} # and {j1} in {0..i1} except of course {j0,j1 == i0,i1}. nonlocal n0, n1, Y, P, Yskip # Compute the mismatch term {pm}: if i0 == n0 or i1 == n1: pm = 0 if i0 == n0 and i1 == n1 else +inf else: pm = pdiff(data[0][i0], data[1][i1]) # Compute {Y_best}, the min value of an align that ends at {(i0,i1)}, # and {P_best}, the pair {(j0,j1)} that precedes {(i0,i1)} in that # path. Initialize {Y_best,P_best} with the option to start the align at {(i0,i1)}: Y[i0][i1], P[i0][i1] = find_best_non_trivial_path(i0, i1, pm) return # . . . . . . . . . . . . . . . . . . . . . . . . def find_best_non_trivial_path(i0, i1, pm): nonlocal n0, n1, Y, P, Yskip # Find the best non-trivial alignment that ends with {(i0,i1)}, # given that the matching penalty for that pair is {pm}. # If there is Y_best = pm + pskip(i0) + pskip(i1) P_best = None if pm != +inf: # Check all possibilities for previous pair {(j0,j1)}: for d0 in range(max_skip): j0 = i0 - d0 - 1 if j0 >= 0: for d1 in range(max_skip): j1 = i1 - d1 - 1 if j1 >= 0: # get {Yj} = total penalty of align that ends ...,(j0,j1),(i0,i1). Yj = Y[j0][j1] + Yskip[d0] + Yskip[d1] + pm if Yj < Y_best: Y_best = Yj P_best = (j0,j1) return Y_best, P_best # . . . . . . . . . . . . . . . . . . . . . . . . def extract_alignment(): nonlocal n0, n1, Y, P i0 = n0; i1 = n1 a = ([], []) while True: assert i0 >= 0 and i1 >= 0, "bug extract" a[0].append(i0) a[1].append(i1) if P[i0][i1] == None: a[0].reverse(); a[1].reverse() return a, Y[n0][n1] else: i0, i1 = P[i0][i1] assert False # . . . . . . . . . . . . . . . . . . . . . . . . Y = [None]*(n0+1); P = [None]*(n0+1) for i0 in range(n0+1): Y[i0] = [None]*(n1+1) P[i0] = [None]*(n1+1) # Tabulate {pskip(d)} for efficiency: Yskip = [ pskip(d) for d in range(max_skip+2) ] assert Yskip[0] == 0, "bug in {pskip} (0)" assert Yskip[max_skip] < +inf, "bug in {pskip} (m):" + f" {Yskip}" assert Yskip[max_skip+1] == +inf, "bug in {pskip} (m+1):" + f" {Yskip}" # Fill the tables: for i0 in range(n0+1): for i1 in range(n1+1): compute_table_entries(i0, i1) # Extract the optimum alignment from the {P} table: align, penalty = extract_alignment() return align, penalty # ......................................................................