INTERFACE PZCandidate; (* A `candidate' is a pair of segments from two different contours *) (* Last edited on 2001-11-16 17:15:41 by stolfi *) IMPORT Rd, Wr; IMPORT PZSymbolChain, PZSegment, PZMatch; FROM PZTypes IMPORT LONG, INT, NAT, NATS, BOOL, BOOLS; TYPE T = RECORD seg: PZSegment.Pair; (* Segments being matched. *) mismatch: LONG; (* Mismatch function, or 0 if not available. *) length: LONG; (* Mean length of the two segments. *) matchedLength: LONG; (* Length effectively matched. *) pm: REF PZMatch.PackedT; (* Matching for this candidate, or NIL *) END; (* A "PZCandaidate.T" consists of two segments of contour, with a tentative matching between them. *) TYPE List = ARRAY OF T; ReadData = RECORD cmt: TEXT; lambda: LONG; c: REF List END; CONST Empty = T{ seg := PZSegment.Pair{PZSegment.Empty, PZSegment.Empty}, mismatch := 0.0d0, length := 0.0d0, matchedLength := 0.0d0, pm := NIL }; (* An `empty' candidate (NOT the only one!). May be handy. *) PROCEDURE IsEmpty(READONLY c: T): BOOL; (* TRUE if the candidate is empty (either segment has zero samples). *) PROCEDURE Expand(READONLY c: T; iniExtra, finExtra: ARRAY [0..1] OF INT): T; (* Expands the two segments of candidate "c" by "extra" steps in each direction, preserving its matching (if any). If either argument is negative, removes that many steps from the respective end, and trims the matching "c.pm" accordingly. *) PROCEDURE TrimAndPack( READONLY cand: T; READONLY mt: PZMatch.T; mismatch: LONG; length: LONG; matchedLength: LONG; ): T; (* Assumes that "mt" is a partial matching between "cand.seg[0]" and "cand.seg[1]". Returns a copy "ct" of "cand", trimmed to the part actually covered by "mt", and sets "ct.mismatch", "ct.length", "ct.matchedLength" as given. Also packs "mt" and stores it in "ct.pm". *) PROCEDURE Write( wr :Wr.T; cmt: TEXT; READONLY c: List; lambda: LONG; ); (* Writes the list of candidates "c" and the parameter "lambda" to the writer "wr". *) PROCEDURE Read(rd :Rd.T): ReadData; (* Reads from "rd" a list of candidates that was written by "Write". *) PROCEDURE ReadOld( rd :Rd.T; READONLY m: NATS; rev: ARRAY [0..1] OF BOOL; ): ReadData; (* Reads from "rd" a list of candidates that was written by the old version of "Write". The "s.tot" field of each segment "s" is taken from the array "m", indexed by the corresponding contour number; and the "s.rev" flag is taken from the "rev" parameter, indexed by the candidate's side. *) PROCEDURE ChainsUsed(READONLY cand: List): REF BOOLS; (* Returns a vector "used" such that "used[k]" is TRUE iff chain "k" occurs in some candidate of "cand[i]". *) TYPE ChainPair = ARRAY [0..1] OF PZSymbolChain.ReadData; PROCEDURE Print(wr: Wr.T; READONLY cand: T; pairing: BOOL := TRUE); (* Prints the data of candidate "cand" to "wr", in a multiline readable format. If the candidate has a matching, prints it too (in compact form), unless "pairing" is FALSE. *) (* COMPARISON AND SORTING *) TYPE CompareProc = PROCEDURE(READONLY a, b: T): BOOL; (* A procedure that tells whether "a" must come before "b". *) PROCEDURE Sort(VAR c: List; before: CompareProc); (* Sorts the list "c" according to the given comparison procedure. *) PROCEDURE IndexSort(READONLY c: List; VAR x: NATS; before: CompareProc); (* Returns in "x" a permutation of the indices of "c" so that "c[x[k]]" is non-decreasing with "k", according to the given comparison procedure. *) PROCEDURE LexicallyBetter(READONLY a, b: T): BOOL; (* Compares by chain pair, then by then start of segments, by length of segments, and breaks ties by mismatch and other criteria. *) PROCEDURE PairMismatchBetter(READONLY a, b: T): BOOL; (* Compares by chain pair, then by mismatch and total length, and breaks ties by other criteria. *) PROCEDURE MismatchBetter(READONLY a, b: T): BOOL; (* Compares by "mismatch", breaking ties by other criteria. *) PROCEDURE Overlap( READONLY a, b: T; maxAdjust: NAT; VAR (*WORK*) ra, rb: REF PZMatch.T; (* Work areas *) ): NAT; (* Compares the matchings (explicit or implied) of candidates "a" and "b", and returns the number of half-steps of "a" that lie at most "maxAdjust" from some half-step of "b" with same position. Note that "Overlap(a,b,...) = Overlap(b,a,...)". In particular, if "a" and "b" are the same candidate, returns the total number of steps in the two segments of that candidate (i.e. twice the mean length of that candidate). Returns 0 if corresponding segments of the two candidates are on different chains or have different orientations. *) PROCEDURE FindSimilarCands( READONLY aCand: List; READONLY bCand: List; minSteps: NAT; maxAdjust: NAT; VAR (*OUT*) aOK: BOOLS; VAR (*OUT*) bOK: BOOLS; printAMatched: BOOL := FALSE; printAUnmatched: BOOL := FALSE; printBMatched: BOOL := FALSE; printBUnmatched: BOOL := FALSE; printMatchedCands: BOOL := FALSE; printMatchedChains: BOOL := FALSE; ); (* Compares the candidates "aCand" against the candidates "bCand". Any pair of candidates "acand[i]", "bCand[j]" that overlaps by at least "minSteps" steps, with alignments differing by at most "maxAdjust" samples, is flagged by setting "aOK[i] = TRUE" and "bOK[j] = TRUE". The options "printAMatched", "printBMatched" "printAUnmatched", and "printBUnmatched" cause printout of the corresponding matched or unmatched elements, once each. The "printMatchedCands" option prints all matching pairs (inclunding multiple matches of the same "aCand" or "bCand" element. The "printMatchedChains" option prints all chain pairs with at least one matching pair of candidates. *) PROCEDURE PrintSimilarityStatistics( wr: Wr.T; READONLY aOK: BOOLS; aName: TEXT; READONLY bOK: BOOLS; bName: TEXT; ); (* Prints to "wr" the basic statistics of the results of "FindSimilarCands", such as counts of false/true positives/negatives. Prints also the retrieval efficiency parameters of "a" (seen as a search result) relative to "b" (seen as the ideal result): sensitivity (fracion of "b"'s that were found in "a"), significance (fraction of "a" that are in "b"), and multiplicity (how many "a"s were found for each "b"). *) (* CANDIDATE LIST PRUNING *) (* The following procedures remove candidates from the list "cand[0..nCands]". Unless observed otherwise, they preserve the order of the candidates and normally change "nCands". *) PROCEDURE RemoveDuplicates(VAR cand: List; VAR nCands: NAT); (* Removes candidates that cover the same segments of the same chains as previous candidates. Assumes "cand" is sorted in such a way that duplicates will end up in adjacent positions. *) PROCEDURE RemoveShort(VAR cand: List; VAR nCands: NAT; minSteps: NAT); (* Removes any candidates that don't have at least "minSteps" steps on both segments. *) PROCEDURE PruneCandsPerPair(VAR cand: List; VAR nCands: NAT; maxCands: NAT); (* Removes the worst candidates from each pair of chains, leaving at most "maxCands" of them. Assumes that candidates are sorted by chain pair and then by decreasing quality, as in "PairMismatchBetter". *) PROCEDURE PruneCandsPerChain(VAR cand: List; VAR nCands: NAT; maxCands: NAT); (* Removes the worst candidates for chains that have more than "candsPerChain" candidates. Assumes that the candidates are sorted by decreasing quality, as in "MismatchBetter" or "LexicallyBetter". *) PROCEDURE MergeOverlaps( VAR cand: List; VAR nCands: NAT; minSteps: NAT; maxAdjust: NAT; ); (* Joins candidates FOR which "Overlap(... minSteps,maxAdjust)" is TRUE. Assumes candidates are sorted by chain pair first, as in "PairMismatchBetter". *) END PZCandidate. (* Copyright © 2001 Universidade Estadual de Campinas (UNICAMP). Authors: Helena C. G. Leitão and Jorge Stolfi. This file can be freely distributed, used, and modified, provided that this copyright and authorship notice is preserved, and that any modified versions are clearly marked as such. This software has NO WARRANTY of correctness or applicability for any purpose. Neither the authors nor their employers chall be held responsible for any losses or damages that may result from its use. *)