#ifndef pz_match_H #define pz_match_H /* Chain distance by dynamic programming. */ /* Last edited on 2015-01-20 16:49:24 by stolfilocal */ #include #include typedef i2_t pz_match_pair; typedef i2_vec_t pz_match_t; /* A {pz_match_t} is a `monotone pairing' between two arbitrary sequences {a[0..NA-1]} and {b[0..NB-1]}, defined as a list of pairs {mt[0..m-1] == (r[0],s[0]),.. (r[m-1],s[m-1])} such that | r[0] >= 0, s[0] >= 0; | | r[m-1] <= NA-1, s[m-1] <= NB-1; | | r[k+1] == r[k] ) || ( r[k+1] == r[k]+1, | s[k+1] == s[k] ) || ( s[k+1] == s[k]+1, | | mt[k+1] != mt[k] for all {k}. By definition, element {a[r[k]] == a[mt[k][0]]} is paired with sample {b[s[k]] == b[mt[k][1]]}, for all {k}. We say that a pair {(r1,s1)} `precedes' another pair {(r2,s2)} if {r1 <= r2} and {s1 <= s2}, and at least one of these inequalities is strict. In that case we also say that {(r2,s2)} `follows' {(r1,s1)}. Note that, in any monotone pairing {mt}, {mt[i]} precedes {mt[j]} iff {i < j}. Two consecutive pairs {mt[k]==(r[k],s[k])} and {mt[k+1]==(r[k+1],s[k+1])} are said to be a `step' of the match. The step is said to be `full' if both {r} and {s} increase by 1, and `half' if only one of them increases. A `perfect pairing' is one that uses only full steps. The pairing is said to be `complete' if {mt[0] == (0,0)} and {mt[m-1] == (NA-1,NB-1)}; and `incomplete' otherwise. */ #define pz_match_new i2_vec_new #define pz_match_expand i2_vec_expand /* MINIMUM-COST PAIRINGS */ /* The procedures in this section take as argument an arbitrary `step cost function' {step_cost_fn} (which may return negative values). The `total cost' of a pairing {mt} is the sum of {step_cost_fn(s[k])} over all steps {S[k] == mt[k]->mt[k+1]} of {mt}. The `step count' of a match is defined as the number of half-steps plus twice the number of full steps. It is also the number of samples matched in the two ckains, minus two; thus any complete pairing has step count {NA+NB-2}. The optimization procedures also use internally a {CostMatrix} {rCost} with at least {NA} rowns and {NB} columns, which may be preallocated and given by the client. The procedures will (re)allocate said matrix if it is NULL or too small. */ typedef double pz_match_step_function_t ( pz_match_pair *p1, pz_match_pair *p2 ); typedef double_vec_t pz_match_cost_matrix_t; /* Work array used by DP algorithm */ void pz_match_find_best ( int NA, int NB, pz_match_step_function_t step_cost_fn, pz_match_t *mt, /* (OUT) */ double *totCost, /* (OUT) */ pz_match_cost_matrix_t *rCost /* (WORK) */ ); /* Returns in {mt} a complete monotone pairing between two strings with {NA} and {NB} elements, with mimimum total cost (which is returned in {totCost}. */ void pz_match_refine ( int NA, int NB, pz_match_step_function_t step_cost_fn, pz_match_t *mtOld, int maxAdjust, pz_match_t *mt, /* (OUT) */ double *totCost, /* (OUT) */ double_vec_t *minTotCost, /* (OUT) */ pz_match_cost_matrix_t *rCost /* (WORK) */ ); /* Computes a partial monotone pairing, between two arbitrary sequences with {NA} and {NB} elements, that is not too far from {mtOld} and has minimum total cost. The `input pairing' {mtOld} is an arbitrary (but non-empty) pairing between the two sequences, which is to be improved by the procedure. On output, {mt} will point to the pairing of minimum total cost found by the procedure, within certain constraints (or to NULL if the constraints cannot be satisfied). The total cost of {mt} will be returned in {totCost}. Let's define the `initial pairs' as consisting of all pairs of the input pairing {mtOld}, plus all possible index pairs that precede its first pair, plus all possible index pairs that follow its last pair. One constraint on the refined pairing {mt} is that is that every pair {mt[k]} must lie within {maxAdjust} distance from some initial pair, in the Manhattan metric. In other words, the `valid pairs' for the refined pairing are the union of all sets of the form {{ (r+dr,s+ds) : |dr| <= maxAdjust, |ds| <= maxAdjust }} where {(r,s)} is an initial pair. A second constraint is that the refined pairing {mt} will contain exactly one pair {(ro,so)} such that {ro+so == rc+sc}, where {(rc,sc)} is the middle pair of the input pairing {mtOld}. (This constraint is meant to erclude partial pairings that cover only short initial or final segments of the two sequences.) Thus, the returned match will be one that: (1) contains only valid pairs, as defined above; (2) begins with some pair {(ri,si)} with {ri+si <= rc+sc}; (3) ends with some pair {(rf,sf)} with {rf+sf >= rc+sc}; (4) has minimum total cost, among all pairings that satisfy (1)-(3). Note that there is always a pairing that satisfies (1)-(3), namely the pairing {mtOld} itself (or any piece of it that includes its central pair). The procedure also returns in {minTotCost} a table such that {minTotCost[k]} is the minimum total cost (NOT the mismatch) of any pairing with {k} steps that satisfies the above constraints, for each {k} in {[0..NA+NB-2]}. */ /* UTILITY FUNCTIONS */ pz_match_t *pz_match_cut(pz_match_t *mt, int start, int count); /* Returns the segment {mt[start..start+count-1]} of pairing {mt}. */ pz_match_t *pz_match_adjust ( pz_match_t *mt, pz_segment_pair *sOld, pz_segment_pair *sNew ); /* Given a pairing between segments {sOld[0]} and {sOld[1]}, returns the equivalent pairing between {sNew[0]} and {sNew[1]}. This entails mapping the relative sample indices in {mt} so that they refer to the same samples, and possibly trimming pairs that fall outside the new segments. The result is NULL if none of the old pairs are contained in the new segments, or the intersection of the restriction of {mt} to the new segments is not a single, continuous pairing. */ pz_match_t *pz_match_remove_dangling_ends ( pz_match_t *mt ); /* Returns a new pairing obtained by removing the ``dangling ends'' of {mt}, i.e. all initial and final half-steps of {mt} where one of the ``legs'' remains stuck at the initial or final point of the corresponding chain. */ pz_match_t pz_match_most_perfect ( int n0, int n1 ); void pz_match_domost_perfect ( int n0, int n1, pz_match_t *mt ); /* Generates a maximally perfect pairing between two segments with {n0} and {n1} samples, respectively. The corrresponding path is a Bresenham line between {(0,0)} and {(n0,n1)}. {pz_match_domost_perfect} is like {pz_match_most_perfect} but stores the pairing in the client-provided array {mt}, which should have {max(n0,n1)} elements exactly. */ pz_match_pair pz_match_abs_pair_clip( pz_match_pair p, pz_segment_pair *s ); /* Given a pair {p} of sample indices, relative to segments {s[0]} and {s[1]}, returns the corresponding indices in the whole chains. */ pz_match_pair pz_match_rel_pair_clip( pz_match_pair p, pz_segment_pair *s ); /* Given a pair {p} of sample indices in two chains, returns the corresponding sample indices relative to segments {s[0]} and {s[1]}. Each returned index {p[j]} will be {(int.ne - 1)} if the corresponding sample lies outside the segment {s[j]}. */ typedef bool_t pz_match_step_predicate_t ( pz_match_pair *p1, pz_match_pair *p2 ); typedef double pz_match_step_measure_t ( pz_match_pair *p1, pz_match_pair *p2 ); pz_match_t *pz_match_remove_unmatched_ends( pz_match_t *mt, pz_match_step_predicate_t isMatched ); /* Removes all initial and final steps {mt[k]->mt[k+1]} of {mt} for which {isMatched(mt[k],mt[k+1])} returns FALSE. */ double pz_match_length( pz_match_t *mt, pz_match_step_measure_t lengthFn ); /* Returns the sum of of {lengthFn(S)} for all steps of the pairing {mt}. */ double pz_match_sum( pz_match_t *mt, pz_match_step_function_t stepFn ); /* Returns the sum of {stepFn(S)} over the pairing {mt}. */ double pz_match_integral( pz_match_t *mt, pz_match_step_measure_t lengthFn, pz_match_step_function_t stepFn ); /* Returns the integral of {stepFn(S)} over the pairing {mt}, with respect to step length as returned by {lengthFn(S)}. */ int pz_match_matched_step_count( pz_match_t *mt, pz_match_step_predicate_t isMatched ); /* Returns the step count of {mt} (number of half-steps plus twice the number of full steps), considering only steps {S} for which {isMatched(S)} is TRUE. */ double pz_match_matched_length( pz_match_t *mt, pz_match_step_measure_t lengthFn, pz_match_step_predicate_t isMatched ); /* Returns the sum of {lengthFn(S)} for all steps {S} in {mt} for which {isMatched(S)} is true. */ int pz_match_overlap( pz_match_t *a, pz_match_t *b, pz_segment_pair *aSeg, pz_segment_pair *bSeg, int maxAdjust ); /* Compares two pairings {a} and {b}, which are relative to segment pairs {aSeg} and {bSeg}, respectively. Returns the number of half-steps of {a} that lie at most {maxAdjust} from some half-step of {b} with same position. Note that {pz_match_overlap(a,b,...) == pz_match_overlap(b,a,...)}. */ /* PACKED PAIRINGS */ typedef struct pz_match_packed_t { /* A compact description of a pairing */ int np; /* Number of pairs (> 0). */ pz_match_pair ini; /* Initial pair */ pz_match_pair fin; /* Final pair */ char *steps; /* Encoded steps */ } pz_match_packed_t; /* Code for encoded steps '/' advance [0], '\\' advance [1], '|' both. */ pz_match_packed_t *pz_match_pack ( pz_match_t *mt ); /* Returns a packed encoding of {mt}. */ pz_match_packed_t *pz_match_packPerfect ( int nSamples ); /* Returns a packed perfect pairing between two segments, both with {nSamples} segments. */ pz_match_t pz_match_unpack ( pz_match_packed_t *pm ); void pz_match_do_unpack ( pz_match_packed_t *pm, pz_match_t *mt ); /* Expands a packed representation into an explicit pairing. {pz_match_do_unpack} is like {pz_match_unpack} but returns the result in the client-provided array {mt}. */ void pz_match_write_packed ( FILE *wr, pz_match_packed_t *pm ); /* Writes the pairing {pm} in compact form to writer {wr}. */ pz_match_packed_t *pz_match_read_packed ( FILE *rd ); /* Reads from {rd} a packed pairing, in the format written by {Write}. */ pz_match_packed_t *pz_match_adjustPacked ( pz_match_packed_t *rpm, pz_segment_pair *oldSeg, pz_segment_pair *newSeg ); /* Given a packed pairing {rpm^} between two segments {oldSeg[0]} and {oldSeg[1]}, returns a new packed pairing that describes the same pairing between the segments {newSeg[0]} and {newSeg[1]}. If the new segments are not contained in the original ones, the pairing may be trimmed. The result is NULL if {rpm} is NULL or if the intersection of the new segments and the old pairings is not a single interval. */ /* Copyright © 2001 Universidade Estadual de Campinas (UNICAMP). Authors: Helena C. G. Leitão and Jorge Stolfi. This file can be freely distributed, used, and modified, provided that this copyright and authorship notice is preserved, and that any modified versions are clearly marked as such. This software has NO WARRANTY of correctness or applicability for any purpose. Neither the authors nor their employers chall be held responsible for any losses or damages that may result from its use. */ #endif