#! /usr/bin/python3
# Last edited on 2026-03-01 09:11:23 by stolfi

from sys import stderr as err
from math import sqrt, sin, cos, log, exp, floor, pi, inf
import random

def sub_sub_match_quality(LA, LB, max_val, pert_dev, prob_del, prob_add):
  # Given two lists {LA,LB} of occurrence positions of something,
  # computes a match quality measure {Q(LA,LB)} between them, assuming
  # that the occurrencs in {LA} may have been obtained by deleting some
  # elements of {LB}, perturbing the remaining positions sightly,
  # shifting them all by an arbitrary amount, then adding spurious new
  # elements to complete {LA}.  Hopefully, the higher the quality measure,
  # the less likely that {LA} could have been been generated at random,
  # independently of {LB}.
  #
  # Each element of {LA} and {LB} should be a pair {(pos[i],pmu[i])}
  # where {pos[i]} is a real number in the interval {[0 _ max_val]}
  # and {pmu[i]} is a probability. Al positions in each list must be in
  # increasing order, and the gaps between them must be at least
  # {???}. For now all {pmu[i]} must be 1.0.
  #
  # The proceduere enumerates all possible subsets {IB} of the indices
  # of {LB} and all possible subsets {IA} of the indices of {LA}, with
  # {len(IA) == len(IB)}, and computes a quality measure
  # {Q(LA,IA,LB,IB)} for each sub-hypothesis that those were the
  # corresponding elements in the two lists that were preserved. Then it
  # ??? combines those quality measures as
  #
  #  {Pr(LA,LB|H1) = sum{ Pr(LA,LB|H1[IA,IB])*Pr(IA,IB) : IA,IB }}.
  #
  # 
  #
  # Each assumed deletion or insertion affects the match quality by the
  # negative amounts {log(prob_del)} and {log(prob-add)}, respectively.
  # In addition, each insertion between two preserved values {b0,b1}
  # gets penalized by {log((b-a)/max_val)}. The shift amount {sh} is
  # assumed to align the mean of the preserved positions of {LA} with
  # the mean of the corresponding positions of {LB}, provided that all
  # shifted values stay in the interbal {[0 _ max_val]}.
  # 
  # For each position {pos_B} of {LB} that is preserved and assumed to
  # be position {pos_A} in {LA}, the match quality is penalized by
  # {-0.5*(d/pert_dev)^2} where {d = posB + sh - posA}. The
  # probability of adding each new element would have been {prob_add},
  # and their positions {pos[ia]} would have been generated with uniform
  # distribution in {[0 _ max_val]}.
  #
  # ??? Note that {H1[IA,IB]} is the same as {H0} if {nP} is zero,
  # that is, if the sub-hypothesis is that all entries of {LB}
  # were lost and thus all entries of {LA} were random numbers independent of {LB}.
  # Thus, by definition, {H1} includes only the sub-hypotheses {H1[IA,IB]}
  # where {nP > 0}. 
  
  max_LB_size = 8 # To avoid excessive computation time.
  assert max_LB_size < 30 # So that {IA,IB} can be saved as {int} values.
  nA = len(LA); assert nA <= max_LB_size + 2, "list LA too big"
  nB = len(LB); assert nB <= max_LB_size, "list LB too big"
  nP_max = min(nA,nB)
  
  # We enumerate all meaningful subsets {IA,IB} and collect their qualities,
  # to be winnowed later.
  
  QL = [] # List of triples {(IA, IB, Q(LA,LB,IA,IB)}.

  # We first enumerate all valid subsets {IB} of {0..nB-1}.
  # Those all those with size at least 1 and at most {nP_max}.
  # There are efficient ways to do that, but let's do it the dumb way for now.
  
  for bin_IB in range(1,2**nB):
    IB = [ i for i in range(nB) if (bin_IB & (1 << i)) != 0 ]
    nP = len(IB);  assert nP > 0 and nP <= nB, "bug subset {IB}"
    if nP <= nA: 
      # Now we enumerate all subsets {IA} of {0..nA-1} with {nP}
      # elements. Again there are efficient methods to do that, but let's do it the
      # dumb way for now.
      for bin_IA in range(2**nA):
        IA = [ i for i in range(nA) if (bin_IA & (1 << i)) != 0 ]
        assert len(IA) == nP, "bug subset{IA}"
        # Choose the shift amount {sh}:
        sh = choose_shift(LA, IA, LB, IB, max_val)
        if sh is not None and isfinite(sh):
          # Compute the probability of the match:
          log_H1_D_K = \
            (nB-nP)*log(prob_del) + \
            (nA-nP)*log(prob_add) + \
            sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, sh, max_val, pert_dev)
          QL.append(log_H1_IA_IB)
  # Add probabilities avoiding underflow:
  mismatch = log_tot_prob_from_plogs(QL)
  return mismatch
  # ----------------------------------------------------------------------
  
def choose_shift(LA, IA, LB, IB, max_val):
  # Chooses a shift amount that best aligns the means of the elements of {LA,LB}
  # paired by {IA,IB}, subject to all selected and shifted {LB} positions 
  # remain in {[0 _ max_val]}. 
  nA = len(LA)
  nB = len(LB)
  nP = len(IA);
  assert len(IB) == nP
  assert nP <= nA and nP <= nB
  assert nP > 0
 
  sum_val_A = 0; sum_val_B = 0
  min_val_B = +inf; max_val_B = -inf
  for iP in range(nP):
    pos_A = LA[IA[iP]][0]; sum_val_A += pos_A
    pos_B = LB[IB[iP]][0]; sum_val_B += pos_B
    min_val_B = min(min_val_B, pos_B)
    max_val_B = max(max_val_B, pos_B)
  sh = (sum_val_A - sum_val_B)/nP
  if max_val_B + sh > max_val: sh = max_val - max_val_B
  if min_val_B + sh < 0: sh = 0 - min_val_B
  assert min_val_B + sh >= -1.0e-10 and max_val_B + sh <= (1.0 + 1.0e-10)*max_val
  return sh
  # ----------------------------------------------------------------------

def sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, pert_dev):
  # Computes the match quality of the sublist of {LB} selected by {IB}
  # and shifted bt {sh}, against the sublist of {LA} selected by {IA}.
  # Includes the terms for positional insertions. Assumes
  # negative quadratic match function with deviation {pert_dev}
  # for each matched pair.

  nA = len(LA)
  nB = len(LB)
  nP = len(IA);
  assert len(IB) == nP
  assert nP <= nA and nP <= nB
  assert nP > 0
  
  sh = choose_shift(LA, IA, LB, IB, max_val)

  Q = 0 # Match quality.
  for iP in range(nP+1):
    # Account for prob of inserting the elems between {IA[ip-1]} and {IA[ip]}:
    iA_LO = -1 if iP == 0 else IA[iP-1]
    iA_HI = nA if iP == nP else IA[iP]
    assert iA_LO < iA_HI
    n_ins = iA_HI - iA_LO - 1  # Points added between LA[iA_LO] and LA[iA_HI].
    if n_ins > 0:
      pos_A_LO = 0 if iA_LO == 0 else LA[iA_LO][0]
      pos_A_HI = max_val if iP == nP else LA[IA[iP]][0]
      assert pos_A_LO < pos_A_HI
      Q_ins = log((pos_A_HI - pos_A_LO)/max_val)
      Q += n_ins * Q_ins

  for iP in range(nP):
    iA = IA[iP]; assert iA < nA
    iB = IB[iP]; assert iB < nB
    pos_A = LA[iA][0]
    pos_B = LB[iB][0]
    d = pos_B + sh - pos_A
    Q_pert = -0.5*(d/pert_dev)**2
    Q += Q_pert
  return Q
  # ----------------------------------------------------------------------

def test_sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, min_sep, pert_dev, Q_exp):
  err.write(f"!! with {LA = } {IA = } {LB = } {IB = } {Q_exp = :12.6f}")
  LA = list(zip(LA, ( 1, )*len(LA)))
  LB = list(zip(LB, ( 1, )*len(LB)))
  Q_cmp = sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, min_sep, pert_dev)
  err.write(f" {Q_cmp = :12.6f}\n")
  den = max(abs(Q_exp), 1.0e-200)
  assert abs(Q_cmp - Q_exp)/den < 1.0e-8
  return
  # ----------------------------------------------------------------------

def test_sub_sub_match_quality_given_IA_IB_all():
  max_val = 3.0
  pert_dev = 0.1
  min_sep = 0.5*pert_dev
  err.write("testing {sub_sub_match_quality_given_IA_IB}\n")
  err.write(f"with {max_val = } {pert_dev = } {min_sep = }\n")
  test_sub_sub_match_quality_given_IA_IB([1.0, ], [ 0, ], [1.2, ], [0, ], -0.2, max_val, pert_dev) 
  test_sub_sub_match_quality_given_IA_IB([1.0, 2.0, ], [ 0, 1, ], [1.0, 2.2, ], [0, 1, ], -0.1, max_val, pert_dev) 
  for trial in range(4):
    nB = 2 + trial
    # Generate a list {LB} of {nB} random positions: 
    # ??? sep!
    LB = [ random.uniform(0,max_val) for iB in range(nB) ]

    nP0_min = 2
    nP0_max = min(nA, nB)
    for nP0 in range(nP0_min, nP0_max+1):
      # Choose SOME subset {IB0} of {nP0} positions of {LB} to survive:
      IB0 = choose_surviving_valitions(nB, nP)
      
      # Genearte the true derived position list {LA0,IA0}:
      LA0,IA0 = 
     
      # Now generate various candidates {LA,IA} and 
      
  
      C = 

    # Generate a list {LA1} with 
    
    # Now generate various lists {LA} with various numbers of
    # preserved positions:
    
    nA_min = max(2, 2*nB//3)
    nA_max = max(3, 5*nB//4)
    for nA in range(nA_min, nA_max+1):
      # Enumerate the subsets of 

      # Generate a list {LA0} of {nA} random positions, with min separation {min-sep}:
      LA0, IA0 = insert_random_valitions(nA, max_val, min_sep, [])
      assert len(IA0) == 0
      
       ???
       ???
      
        # Select the {nP} indices in {LB} to preserve, and their matches in {LA}: 
        IB = random_index_subset(nP, nB)
        IA = random_index_subset(nP, nA)
        # Generate a list {LA1} of {nA} positions from {LB} by the delete-shift-insert method:
        LA1 = simulate_del_shift_add(nA, IA, LB, IB, max_val, pert_dev)
        test_sub_sub_match_quality_given_IA_IB(LA, IA, LB, IB, max_val, pert_dev) 
  return
  # ----------------------------------------------------------------------
      

test_sub_sub_match_quality_given_IA_IB_all()