#! /usr/bin/python3 # Last edited on 2026-03-01 09:11:23 by stolfi from sys import stderr as err from math import sqrt, sin, cos, log, exp, floor, pi, inf import random def sub_sub_match_quality(LA, LB, max_val, pert_dev, prob_del, prob_add): # Given two lists {LA,LB} of occurrence positions of something, # computes a match quality measure {Q(LA,LB)} between them, assuming # that the occurrencs in {LA} may have been obtained by deleting some # elements of {LB}, perturbing the remaining positions sightly, # shifting them all by an arbitrary amount, then adding spurious new # elements to complete {LA}. Hopefully, the higher the quality measure, # the less likely that {LA} could have been been generated at random, # independently of {LB}. # # Each element of {LA} and {LB} should be a pair {(pos[i],pmu[i])} # where {pos[i]} is a real number in the interval {[0 _ max_val]} # and {pmu[i]} is a probability. Al positions in each list must be in # increasing order, and the gaps between them must be at least # {???}. For now all {pmu[i]} must be 1.0. # # The proceduere enumerates all possible subsets {IB} of the indices # of {LB} and all possible subsets {IA} of the indices of {LA}, with # {len(IA) == len(IB)}, and computes a quality measure # {Q(LA,IA,LB,IB)} for each sub-hypothesis that those were the # corresponding elements in the two lists that were preserved. Then it # ??? combines those quality measures as # # {Pr(LA,LB|H1) = sum{ Pr(LA,LB|H1[IA,IB])*Pr(IA,IB) : IA,IB }}. # # # # Each assumed deletion or insertion affects the match quality by the # negative amounts {log(prob_del)} and {log(prob-add)}, respectively. # In addition, each insertion between two preserved values {b0,b1} # gets penalized by {log((b-a)/max_val)}. The shift amount {sh} is # assumed to align the mean of the preserved positions of {LA} with # the mean of the corresponding positions of {LB}, provided that all # shifted values stay in the interbal {[0 _ max_val]}. # # For each position {pos_B} of {LB} that is preserved and assumed to # be position {pos_A} in {LA}, the match quality is penalized by # {-0.5*(d/pert_dev)^2} where {d = posB + sh - posA}. The # probability of adding each new element would have been {prob_add}, # and their positions {pos[ia]} would have been generated with uniform # distribution in {[0 _ max_val]}. # # ??? Note that {H1[IA,IB]} is the same as {H0} if {nP} is zero, # that is, if the sub-hypothesis is that all entries of {LB} # were lost and thus all entries of {LA} were random numbers independent of {LB}. # Thus, by definition, {H1} includes only the sub-hypotheses {H1[IA,IB]} # where {nP > 0}. max_LB_size = 8 # To avoid excessive computation time. assert max_LB_size < 30 # So that {IA,IB} can be saved as {int} values. nA = len(LA); assert nA <= max_LB_size + 2, "list LA too big" nB = len(LB); assert nB <= max_LB_size, "list LB too big" nP_max = min(nA,nB) # We enumerate all meaningful subsets {IA,IB} and collect their qualities, # to be winnowed later. QL = [] # List of triples {(IA, IB, Q(LA,LB,IA,IB)}. # We first enumerate all valid subsets {IB} of {0..nB-1}. # Those all those with size at least 1 and at most {nP_max}. # There are efficient ways to do that, but let's do it the dumb way for now. for bin_IB in range(1,2**nB): IB = [ i for i in range(nB) if (bin_IB & (1 << i)) != 0 ] nP = len(IB); assert nP > 0 and nP <= nB, "bug subset {IB}" if nP <= nA: # Now we enumerate all subsets {IA} of {0..nA-1} with {nP} # elements. Again there are efficient methods to do that, but let's do it the # dumb way for now. for bin_IA in range(2**nA): IA = [ i for i in range(nA) if (bin_IA & (1 << i)) != 0 ] assert len(IA) == nP, "bug subset{IA}" # Choose the shift amount {sh}: sh = choose_shift(LA, IA, LB, IB, max_val) if sh is not None and isfinite(sh): # Compute the probability of the match: log_H1_D_K = \ (nB-nP)*log(prob_del) + \ (nA-nP)*log(prob_add) + \ sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, sh, max_val, pert_dev) QL.append(log_H1_IA_IB) # Add probabilities avoiding underflow: mismatch = log_tot_prob_from_plogs(QL) return mismatch # ---------------------------------------------------------------------- def choose_shift(LA, IA, LB, IB, max_val): # Chooses a shift amount that best aligns the means of the elements of {LA,LB} # paired by {IA,IB}, subject to all selected and shifted {LB} positions # remain in {[0 _ max_val]}. nA = len(LA) nB = len(LB) nP = len(IA); assert len(IB) == nP assert nP <= nA and nP <= nB assert nP > 0 sum_val_A = 0; sum_val_B = 0 min_val_B = +inf; max_val_B = -inf for iP in range(nP): pos_A = LA[IA[iP]][0]; sum_val_A += pos_A pos_B = LB[IB[iP]][0]; sum_val_B += pos_B min_val_B = min(min_val_B, pos_B) max_val_B = max(max_val_B, pos_B) sh = (sum_val_A - sum_val_B)/nP if max_val_B + sh > max_val: sh = max_val - max_val_B if min_val_B + sh < 0: sh = 0 - min_val_B assert min_val_B + sh >= -1.0e-10 and max_val_B + sh <= (1.0 + 1.0e-10)*max_val return sh # ---------------------------------------------------------------------- def sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, pert_dev): # Computes the match quality of the sublist of {LB} selected by {IB} # and shifted bt {sh}, against the sublist of {LA} selected by {IA}. # Includes the terms for positional insertions. Assumes # negative quadratic match function with deviation {pert_dev} # for each matched pair. nA = len(LA) nB = len(LB) nP = len(IA); assert len(IB) == nP assert nP <= nA and nP <= nB assert nP > 0 sh = choose_shift(LA, IA, LB, IB, max_val) Q = 0 # Match quality. for iP in range(nP+1): # Account for prob of inserting the elems between {IA[ip-1]} and {IA[ip]}: iA_LO = -1 if iP == 0 else IA[iP-1] iA_HI = nA if iP == nP else IA[iP] assert iA_LO < iA_HI n_ins = iA_HI - iA_LO - 1 # Points added between LA[iA_LO] and LA[iA_HI]. if n_ins > 0: pos_A_LO = 0 if iA_LO == 0 else LA[iA_LO][0] pos_A_HI = max_val if iP == nP else LA[IA[iP]][0] assert pos_A_LO < pos_A_HI Q_ins = log((pos_A_HI - pos_A_LO)/max_val) Q += n_ins * Q_ins for iP in range(nP): iA = IA[iP]; assert iA < nA iB = IB[iP]; assert iB < nB pos_A = LA[iA][0] pos_B = LB[iB][0] d = pos_B + sh - pos_A Q_pert = -0.5*(d/pert_dev)**2 Q += Q_pert return Q # ---------------------------------------------------------------------- def test_sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, min_sep, pert_dev, Q_exp): err.write(f"!! with {LA = } {IA = } {LB = } {IB = } {Q_exp = :12.6f}") LA = list(zip(LA, ( 1, )*len(LA))) LB = list(zip(LB, ( 1, )*len(LB))) Q_cmp = sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, min_sep, pert_dev) err.write(f" {Q_cmp = :12.6f}\n") den = max(abs(Q_exp), 1.0e-200) assert abs(Q_cmp - Q_exp)/den < 1.0e-8 return # ---------------------------------------------------------------------- def test_sub_sub_match_quality_given_IA_IB_all(): max_val = 3.0 pert_dev = 0.1 min_sep = 0.5*pert_dev err.write("testing {sub_sub_match_quality_given_IA_IB}\n") err.write(f"with {max_val = } {pert_dev = } {min_sep = }\n") test_sub_sub_match_quality_given_IA_IB([1.0, ], [ 0, ], [1.2, ], [0, ], -0.2, max_val, pert_dev) test_sub_sub_match_quality_given_IA_IB([1.0, 2.0, ], [ 0, 1, ], [1.0, 2.2, ], [0, 1, ], -0.1, max_val, pert_dev) for trial in range(4): nB = 2 + trial # Generate a list {LB} of {nB} random positions: # ??? sep! LB = [ random.uniform(0,max_val) for iB in range(nB) ] nP0_min = 2 nP0_max = min(nA, nB) for nP0 in range(nP0_min, nP0_max+1): # Choose SOME subset {IB0} of {nP0} positions of {LB} to survive: IB0 = choose_surviving_valitions(nB, nP) # Genearte the true derived position list {LA0,IA0}: LA0,IA0 = # Now generate various candidates {LA,IA} and C = # Generate a list {LA1} with # Now generate various lists {LA} with various numbers of # preserved positions: nA_min = max(2, 2*nB//3) nA_max = max(3, 5*nB//4) for nA in range(nA_min, nA_max+1): # Enumerate the subsets of # Generate a list {LA0} of {nA} random positions, with min separation {min-sep}: LA0, IA0 = insert_random_valitions(nA, max_val, min_sep, []) assert len(IA0) == 0 ??? ??? # Select the {nP} indices in {LB} to preserve, and their matches in {LA}: IB = random_index_subset(nP, nB) IA = random_index_subset(nP, nA) # Generate a list {LA1} of {nA} positions from {LB} by the delete-shift-insert method: LA1 = simulate_del_shift_add(nA, IA, LB, IB, max_val, pert_dev) test_sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, pert_dev) return # ---------------------------------------------------------------------- test_sub_sub_match_quality_given_IA_IB_all()