MODULE PZCandidate; (* Last edited on 2001-11-20 02:27:43 by stolfi *) IMPORT Rd, Wr, Fmt, FGet, NGet, FPut, NPut, FileFmt, Thread; IMPORT PZMatch, PZSegment; FROM PZTypes IMPORT LONG, INT, NAT, NATS, BOOL, BOOLS; FROM Stdio IMPORT stderr; <* FATAL Wr.Failure, Thread.Alerted *> CONST DebugOverlap = FALSE; PROCEDURE IsEmpty(READONLY c: T): BOOL = BEGIN RETURN c.seg[0].ns = 0 OR c.seg[1].ns = 0 END IsEmpty; PROCEDURE Expand(READONLY c: T; iniExtra, finExtra: ARRAY [0..1] OF INT): T = BEGIN WITH oldSteps = c.seg[0].ns + c.seg[1].ns - 2, newSeg = PZSegment.Pair{ PZSegment.Expand(c.seg[0], iniExtra[0], finExtra[0]), PZSegment.Expand(c.seg[1], iniExtra[1], finExtra[1]) }, newSteps = newSeg[0].ns + newSeg[1].ns - 2 DO RETURN T{ seg := newSeg, mismatch := c.mismatch, length := c.length * FLOAT(newSteps, LONG)/FLOAT(oldSteps, LONG), matchedLength := c.matchedLength, pm := PZMatch.AdjustPacked(c.pm, c.seg, newSeg) } END END Expand; PROCEDURE TrimAndPack( READONLY cand: T; READONLY mt: PZMatch.T; mismatch: LONG; length: LONG; matchedLength: LONG; ): T = VAR t: PZSegment.Pair; (* Matched segments *) BEGIN WITH nm = NUMBER(mt), rpm = PZMatch.Pack(mt) DO FOR j := 0 TO 1 DO WITH sj = cand.seg[j], tj = t[j] DO tj := sj; tj.cvx := sj.cvx; tj.tot := sj.tot; IF sj.rev THEN tj.ini := (sj.ini + sj.ns - 1 - mt[nm-1][j]) MOD sj.tot ELSE tj.ini := (sj.ini + mt[0][j]) MOD sj.tot END; tj.ns := mt[nm-1][j] - mt[0][j] + 1; tj.rev := sj.rev END END; RETURN T{ seg := t, mismatch := mismatch, length := length, matchedLength := matchedLength, pm := PZMatch.AdjustPacked(rpm, cand.seg, t) } END; END TrimAndPack; CONST OldFileVersion = "97-02-03"; NewFileVersion = "99-07-25"; PROCEDURE Write( wr :Wr.T; cmt: TEXT; READONLY c: List; lambda: LONG; ) = <* FATAL Thread.Alerted, Wr.Failure *> BEGIN WITH n = LAST(c) DO FileFmt.WriteHeader(wr, "PZCandidate.List", NewFileVersion); FileFmt.WriteComment(wr, cmt, '|'); NPut.Int(wr, "candidates", NUMBER(c)); FPut.EOL(wr); NPut.LongReal(wr, "lambda", lambda); FPut.EOL(wr); FOR i := 0 TO n DO WITH ci = c[i] DO FOR j := 0 TO 1 DO PZSegment.WriteOne(wr, ci.seg[j]); FPut.Space(wr) END; FPut.LongReal(wr, ci.mismatch, prec := 4); FPut.Space(wr); FPut.LongReal(wr, ci.length, prec := 2); FPut.Space(wr); FPut.LongReal(wr, ci.matchedLength, prec := 2); IF ci.pm # NIL THEN FPut.Space(wr); FPut.Space(wr); PZMatch.WritePacked(wr, ci.pm^); END; FPut.EOL(wr); END; END; FileFmt.WriteFooter(wr, "PZCandidate.List"); Wr.Flush(wr); END; END Write; PROCEDURE Read(rd :Rd.T): ReadData = VAR d: ReadData; <* FATAL Rd.EndOfFile, Rd.Failure, Thread.Alerted *> BEGIN FileFmt.ReadHeader(rd, "PZCandidate.List", NewFileVersion); d.cmt := FileFmt.ReadComment(rd, '|'); WITH n = NGet.Int(rd, "candidates") DO FGet.EOL(rd); d.c := NEW(REF List, n); d.lambda := NGet.LongReal(rd, "lambda"); FGet.EOL(rd); FOR i := 0 TO n-1 DO FGet.Skip(rd, FGet.Spaces); WITH ci = d.c[i] DO FOR j := 0 TO 1 DO ci.seg[j] := PZSegment.ReadOne(rd); FGet.Skip(rd, FGet.Spaces) END; <* ASSERT ci.seg[0].cvx < ci.seg[1].cvx *> ci.mismatch := FGet.LongReal(rd); FGet.Skip(rd, FGet.Spaces); ci.length := FGet.LongReal(rd); FGet.Skip(rd, FGet.Spaces); ci.matchedLength := FGet.LongReal(rd); FGet.Skip(rd, FGet.Spaces); WITH c = Rd.GetChar(rd) DO Rd.UnGetChar(rd); IF c = '\n' THEN ci.pm := NIL; ELSE ci.pm := PZMatch.ReadPacked(rd) END END; FGet.EOL(rd) END END; FileFmt.ReadFooter(rd, "PZCandidate.List"); RETURN d; END; END Read; PROCEDURE ReadOld( rd :Rd.T; READONLY m: NATS; rev: ARRAY [0..1] OF BOOL; ): ReadData = VAR d: ReadData; <* FATAL Rd.EndOfFile, Rd.Failure, Thread.Alerted *> BEGIN FileFmt.ReadHeader(rd, "PZCandidate.List", OldFileVersion); d.cmt := FileFmt.ReadComment(rd, '|'); WITH n = NGet.Int(rd, "candidates") DO FGet.EOL(rd); d.c := NEW(REF List, n); d.lambda := NGet.LongReal(rd, "lambda"); FGet.EOL(rd); FOR i := 0 TO n-1 DO FGet.Skip(rd, FGet.Spaces); WITH ci = d.c[i] DO FOR j := 0 TO 1 DO WITH cij = ci.seg[j] DO cij.cvx := FGet.Int(rd); FGet.Skip(rd, FGet.Spaces); cij.tot := m[cij.cvx]; cij.ini := FGet.Int(rd); FGet.Skip(rd, FGet.Spaces); WITH fin = FGet.Int(rd) DO cij.ns := (fin - cij.ini + 1) MOD cij.tot END; FGet.Skip(rd, FGet.Spaces); cij.rev := rev[j] END END; ci.mismatch := FGet.LongReal(rd); FGet.Skip(rd, FGet.Spaces); ci.length := FGet.LongReal(rd); FGet.Skip(rd, FGet.Spaces); ci.matchedLength := FGet.LongReal(rd); FGet.Skip(rd, FGet.Spaces); WITH c = Rd.GetChar(rd) DO Rd.UnGetChar(rd); IF c = '\n' THEN ci.pm := NIL; ELSE ci.pm := PZMatch.ReadPacked(rd) END END; FGet.EOL(rd) END END; FileFmt.ReadFooter(rd, "PZCandidate.List"); RETURN d; END; END ReadOld; PROCEDURE ChainsUsed(READONLY cand: List): REF BOOLS = VAR n: NAT := 0; BEGIN (* Find number of elements to allocate: *) FOR i := 0 TO LAST(cand) DO WITH si = cand[i].seg, k0 = si[0].cvx, k1 = si[1].cvx DO n := MAX(n, MAX(k0, k1) + 1) END END; (* Mark used chains: *) WITH r = NEW(REF BOOLS, n), used = r^ DO FOR k := 0 TO LAST(used) DO used[k] := FALSE END; FOR i := 0 TO LAST(cand) DO WITH si = cand[i].seg, k0 = si[0].cvx, k1 = si[1].cvx DO used[k0] := TRUE; used[k1] := TRUE END END; RETURN r END END ChainsUsed; PROCEDURE Print (wr: Wr.T; READONLY cand: T; pairing: BOOL := TRUE) = <* FATAL Wr.Failure, Thread.Alerted *> PROCEDURE PrintHalf(j: [0..1]; seg: PZSegment.T) = BEGIN Wr.PutText(wr, " "); Wr.PutText(wr, "seg[" & Fmt.Int(j) & "]: "); PZSegment.Print(wr, seg); Wr.PutText(wr, "\n"); END PrintHalf; BEGIN PrintHalf(0, cand.seg[0]); PrintHalf(1, cand.seg[1]); IF pairing AND cand.pm # NIL THEN PZMatch.WritePacked(wr, cand.pm^); Wr.PutText(wr, "\n"); END; Wr.PutText(wr,"mismatch = " & FLR(cand.mismatch, 10, 6)); Wr.PutText(wr, " "); Wr.PutText(wr,"length = " & FLR(cand.length, 9, 4)); Wr.PutText(wr, " "); Wr.PutText(wr,"matchedLength = " & FLR(cand.matchedLength, 9, 4)); Wr.PutText(wr, "\n"); END Print; PROCEDURE Sort(VAR c: List; before: CompareProc) = VAR q, r, mx: NAT; cc, smx: T; BEGIN WITH n = NUMBER(c) DO (* 1. Build heap with largest element at c[0]: *) FOR p := 0 TO n-1 DO cc := c[p]; r := p; LOOP IF r = 0 THEN EXIT END; q := (r-1) DIV 2; IF before(c[q], cc) THEN c[r] := c[q]; ELSE EXIT END; r := q END; c[r] := cc; END; (* 2. Remove elements from heap and insert at end: *) FOR p := n-1 TO 1 BY -1 DO (* save c[p] *) cc := c[p]; (* Move largest heap element to c[p]: *) c[p] := c[0]; (* Insert cc in remaining heap, from root down: *) q := 0; LOOP c[q] := cc; r := 2*q+1; (* Find largest among cc, c[LEFT(q)], c[RIGHT(q)]: *) mx := q; smx := cc; IF r < p AND before(smx, c[r]) THEN mx := r; smx := c[r] END; INC(r); IF r < p AND before(smx, c[r]) THEN mx := r; smx := c[r] END; (* See who won: *) IF mx = q THEN (* Stop here: *) EXIT ELSE (* Promote child and advance: *) c[q] := c[mx]; q := mx END END END; (* Paranoid check: *) FOR i := 1 TO n-1 DO <* ASSERT NOT before(c[i],c[i-1]) *> END END END Sort; PROCEDURE IndexSort(READONLY c: List; VAR x: NATS; before: CompareProc) = VAR q, r, mx: NAT; cc, smx: NAT; BEGIN <* ASSERT NUMBER(c) = NUMBER(x) *> WITH n = NUMBER(c) DO (* 1. Build heap with index of largest element at x[0]: *) FOR p := 0 TO n-1 DO x[p] := p END; FOR p := 0 TO n-1 DO cc := x[p]; r := p; LOOP IF r = 0 THEN EXIT END; q := (r-1) DIV 2; IF before(c[x[q]], c[cc]) THEN x[r] := x[q]; ELSE EXIT END; r := q END; x[r] := cc; END; (* 2. Remove elements from heap and insert at end: *) FOR p := n-1 TO 1 BY -1 DO (* save x[p] *) cc := x[p]; (* Move largest heap element to x[p]: *) x[p] := x[0]; (* Insert cc in remaining heap, from root down: *) q := 0; LOOP x[q] := cc; r := 2*q+1; (* Find largest among cc, c[LEFT(q)], c[RIGHT(q)]: *) mx := q; smx := cc; IF r < p AND before(c[smx], c[x[r]]) THEN mx := r; smx := x[r] END; INC(r); IF r < p AND before(c[smx], c[x[r]]) THEN mx := r; smx := x[r] END; (* See who won: *) IF mx = q THEN (* Stop here: *) EXIT ELSE (* Promote child and advance: *) x[q] := x[mx]; q := mx END END END; (* Paranoid check: *) FOR i := 1 TO n-1 DO <* ASSERT NOT before(c[x[i]],c[x[i-1]]) *> END END END IndexSort; PROCEDURE LexicallyBetter(READONLY a, b: T): BOOL = BEGIN IF a.seg[0].cvx # b.seg[0].cvx THEN RETURN a.seg[0].cvx < b.seg[0].cvx END; <* ASSERT a.seg[0].tot = b.seg[0].tot *> IF a.seg[1].cvx # b.seg[1].cvx THEN RETURN a.seg[1].cvx < b.seg[1].cvx END; <* ASSERT a.seg[1].tot = b.seg[1].tot *> IF a.seg[0].ini # b.seg[0].ini THEN RETURN a.seg[0].ini < b.seg[0].ini END; IF a.seg[1].ini # b.seg[1].ini THEN RETURN a.seg[1].ini < b.seg[1].ini END; IF a.seg[0].ns # b.seg[0].ns THEN RETURN a.seg[0].ns < b.seg[0].ns END; IF a.seg[1].ns # b.seg[1].ns THEN RETURN a.seg[1].ns < b.seg[1].ns END; IF a.mismatch # b.mismatch THEN RETURN a.mismatch < b.mismatch END; IF a.length # b.length THEN RETURN a.length > b.length END; IF a.matchedLength # b.matchedLength THEN RETURN a.matchedLength > b.matchedLength END; RETURN FALSE END LexicallyBetter; PROCEDURE PairMismatchBetter(READONLY a, b: T): BOOL = BEGIN IF a.seg[0].cvx # b.seg[0].cvx THEN RETURN a.seg[0].cvx < b.seg[0].cvx END; <* ASSERT a.seg[0].tot = b.seg[0].tot *> IF a.seg[1].cvx # b.seg[1].cvx THEN RETURN a.seg[1].cvx < b.seg[1].cvx END; <* ASSERT a.seg[1].tot = b.seg[1].tot *> IF a.mismatch # b.mismatch THEN RETURN a.mismatch < b.mismatch END; IF a.length # b.length THEN RETURN a.length > b.length END; IF a.matchedLength # b.matchedLength THEN RETURN a.matchedLength > b.matchedLength END; IF a.seg[0].ns # b.seg[0].ns THEN RETURN a.seg[0].ns > b.seg[0].ns END; IF a.seg[1].ns # b.seg[1].ns THEN RETURN a.seg[1].ns > b.seg[1].ns END; IF a.seg[0].ini # b.seg[0].ini THEN RETURN a.seg[0].ini < b.seg[0].ini END; IF a.seg[1].ini # b.seg[1].ini THEN RETURN a.seg[1].ini < b.seg[1].ini END; RETURN FALSE END PairMismatchBetter; PROCEDURE MismatchBetter(READONLY a, b: T): BOOL = (* Compares by "mismatch", breaking ties by other criteria *) BEGIN IF a.mismatch # b.mismatch THEN RETURN a.mismatch < b.mismatch END; IF a.length # b.length THEN RETURN a.length > b.length END; IF a.matchedLength # b.matchedLength THEN RETURN a.matchedLength > b.matchedLength END; IF a.seg[0].cvx # b.seg[0].cvx THEN RETURN a.seg[0].cvx < b.seg[0].cvx END; <* ASSERT a.seg[0].tot = b.seg[0].tot *> IF a.seg[1].cvx # b.seg[1].cvx THEN RETURN a.seg[1].cvx < b.seg[1].cvx END; <* ASSERT a.seg[1].tot = b.seg[1].tot *> IF a.seg[0].ini # b.seg[0].ini THEN RETURN a.seg[0].ini < b.seg[0].ini END; IF a.seg[1].ini # b.seg[1].ini THEN RETURN a.seg[1].ini < b.seg[1].ini END; IF a.seg[0].ns # b.seg[0].ns THEN RETURN a.seg[0].ns < b.seg[0].ns END; IF a.seg[1].ns # b.seg[1].ns THEN RETURN a.seg[1].ns < b.seg[1].ns END; RETURN FALSE END MismatchBetter; PROCEDURE RemoveDuplicates(VAR cand: List; VAR nCands: NAT) = VAR nKept: NAT := 0; BEGIN FOR i := 0 TO nCands-1 DO IF nKept > 0 THEN WITH ci = cand[i], cp = cand[nKept-1] DO IF (ci.seg[0] # cp.seg[0] OR ci.seg[1] # cp.seg[1]) THEN IF nKept # i THEN cand[nKept] := ci END; INC(nKept) END END END END; nCands := nKept; END RemoveDuplicates; PROCEDURE RemoveShort(VAR cand: List; VAR nCands: NAT; minSteps: NAT) = VAR nKept: NAT := 0; BEGIN FOR i := 0 TO nCands-1 DO IF (cand[i].seg[0].ns > minSteps) OR (cand[i].seg[1].ns > minSteps) THEN IF nKept # i THEN cand[nKept] := cand[i] END; INC(nKept) END END; nCands := nKept; END RemoveShort; PROCEDURE PruneCandsPerPair(VAR cand: List; VAR nCands: NAT; maxCands: NAT) = (* Assumes candidates are sorted by "PairMismatchBetter", so that all candidates for the same chain pair are in adjacent positions, sorted by decreasing quality. *) VAR nKept: NAT := 0; (* Number of candidates kept *) nPair: NAT; (* Number of candidates of current chain pair *) k0Old, k1Old: NAT := LAST(NAT); BEGIN FOR i := 0 TO nCands-1 DO WITH k0 = cand[i].seg[0].cvx, k1 = cand[i].seg[1].cvx DO IF k0 # k0Old OR k1 # k1Old THEN nPair := 0; k0Old := k0; k1Old := k1 END; INC(nPair); IF (nPair <= maxCands) THEN IF i > nKept THEN cand[nKept] := cand[i] END; INC(nKept) END END END; nCands := nKept; END PruneCandsPerPair; PROCEDURE PruneCandsPerChain(VAR cand: List; VAR nCands: NAT; maxCands: NAT) = VAR nKept: NAT := 0; (* Number of candidates kept *) BEGIN WITH chainUsed = ChainsUsed(SUBARRAY(cand, 0, nCands))^, nChains = NUMBER(chainUsed), ctChain = NEW(REF NATS, nChains)^ DO FOR i := 0 TO nChains-1 DO ctChain[i] := 0 END; FOR i := 0 TO nCands-1 DO WITH k0 = cand[i].seg[0].cvx, ct0 = ctChain[k0], k1 = cand[i].seg[1].cvx, ct1 = ctChain[k1] DO INC(ct0); INC(ct1); IF (ct0 <= maxCands OR ct1 <= maxCands) THEN IF i > nKept THEN cand[nKept] := cand[i] END; INC(nKept) END END END; nCands := nKept; END; END PruneCandsPerChain; PROCEDURE FindSimilarCands( READONLY aCand: List; READONLY bCand: List; minSteps: NAT; maxAdjust: NAT; VAR (*OUT*) aOK: BOOLS; VAR (*OUT*) bOK: BOOLS; printAMatched: BOOL := FALSE; printAUnmatched: BOOL := FALSE; printBMatched: BOOL := FALSE; printBUnmatched: BOOL := FALSE; printMatchedCands: BOOL := FALSE; printMatchedChains: BOOL := FALSE; ) = PROCEDURE CompareCandsLexChains(READONLY a, b: T): [-1..+1] = (* Compares the chain pair of "a" with the chain pair of "b", lexicographically. Returns 0 iff the two candidates belong to the same pair of chains. *) BEGIN WITH k0a = a.seg[0].cvx, k1a = a.seg[1].cvx, k0b = b.seg[0].cvx, k1b = b.seg[1].cvx DO IF k0a < k0b THEN RETURN -1 ELSIF k0a > k0b THEN RETURN +1 ELSIF k1a < k1b THEN RETURN -1 ELSIF k1a > k1b THEN RETURN +1 ELSE RETURN 0 END END END CompareCandsLexChains; PROCEDURE PrintCandidate(msg, tag: TEXT; index: NAT; READONLY cand: T) = BEGIN Wr.PutText(stderr, Fmt.Pad(msg & ":", 11, ' ', Fmt.Align.Left) & tag & "[" & FI(index,6) & "]\n" ); Print(stderr, cand := cand) END PrintCandidate; VAR aIni, bIni: NAT; aFin, bFin: NAT; cmp: [-1..+1]; VAR ra, rb: REF PZMatch.T := NIL; (* Work areas for "PZCandidate.Overlap" *) BEGIN WITH aN = NUMBER(aCand), bN = NUMBER(bCand), ax = NEW(REF NATS, aN)^, bx = NEW(REF NATS, bN)^ DO IndexSort(aCand, ax, LexicallyBetter); IndexSort(bCand, bx, LexicallyBetter); (* Paranoid check of sorting: *) FOR ai := 1 TO LAST(aCand) DO IF aIni > 0 THEN <* ASSERT CompareCandsLexChains(aCand[ax[ai-1]], aCand[ax[ai]]) <= 0 *> END END; FOR bi := 1 TO LAST(bCand) DO IF bIni > 0 THEN <* ASSERT CompareCandsLexChains(bCand[bx[bi-1]], bCand[bx[bi]]) <= 0 *> END END; FOR aIndex := 0 TO LAST(aCand) DO aOK[aIndex] := FALSE END; FOR bIndex := 0 TO LAST(bCand) DO bOK[bIndex] := FALSE END; bIni := 0; aIni := 0; WHILE bIni < bN OR aIni < aN DO (* Decide which should move: *) IF aIni <= LAST(aCand) AND bIni <= LAST(bCand) THEN cmp := CompareCandsLexChains(aCand[ax[aIni]], bCand[bx[bIni]]) ELSIF aIni > LAST(aCand) AND bIni <= LAST(bCand) THEN cmp := +1 ELSIF aIni <= LAST(aCand) AND bIni > LAST(bCand) THEN cmp := -1 ELSE <* ASSERT FALSE *> END; (* Move along: *) IF cmp < 0 THEN IF printAUnmatched THEN Wr.PutText(stderr, "\n"); PrintCandidate("unmatched", "a", ax[aIni], aCand[ax[aIni]]); Wr.PutText(stderr, "\n"); END; INC(aIni) ELSIF cmp > 0 THEN IF printBUnmatched THEN Wr.PutText(stderr, "\n"); PrintCandidate("unmatched", "b", bx[bIni], bCand[bx[bIni]]); Wr.PutText(stderr, "\n"); END; INC(bIni) ELSE (* Found two candidates with same chain pairs: *) (* Locate the last "a" candidate "aFin-1" of this chain pair: *) aFin := aIni + 1; WHILE aFin < aN AND CompareCandsLexChains(aCand[ax[aIni]], aCand[ax[aFin]]) = 0 DO INC(aFin) END; (* Locate the last "b" candidate "bFin-1" of this chain pair: *) bFin := bIni+1; WHILE bFin < bN AND CompareCandsLexChains(bCand[bx[bIni]], bCand[bx[bFin]]) = 0 DO INC(bFin) END; (* Compare all "a" candidates against all "b" candidates: *) VAR nMatches: NAT := 0; BEGIN FOR ai := aIni TO aFin-1 DO WITH aC = aCand[ax[ai]] DO FOR bi := bIni TO bFin-1 DO WITH bC = bCand[bx[bi]] DO WITH overlap = Overlap(bC, aC, maxAdjust := maxAdjust, ra := ra, rb := rb ) DO IF DebugOverlap THEN Wr.PutText(stderr, "=== overlap = " & Fmt.Int(overlap) & " ===\n"); Print(stderr, aC, pairing := FALSE); Print(stderr, bC, pairing := FALSE); END; IF overlap >= 2*minSteps THEN aOK[ax[ai]] := TRUE; bOK[bx[bi]] := TRUE; INC(nMatches); IF printMatchedCands THEN Wr.PutText(stderr, "\n"); Wr.PutText(stderr, "overlap = " & Fmt.Int(overlap) & "\n"); PrintCandidate("match", "a", ax[ai], aCand[ax[ai]]); PrintCandidate("match", "b", bx[bi], bCand[bx[bi]]); Wr.PutText(stderr, "\n"); END; END END END END END END; IF printMatchedChains AND nMatches > 0 THEN WITH s = aCand[ax[aIni]].seg DO Wr.PutText(stderr, "chains: " & FI(s[0].cvx,4) & " × " & FI(s[1].cvx,4)); END; Wr.PutText(stderr, ": " & FI(aFin - aIni,6) & " \"a\" cands, "); Wr.PutText(stderr, FI(aFin - aIni,6) & " \"b\" cands,"); Wr.PutText(stderr, FI(nMatches,6) & " matching pairs.\n"); END END; IF printAMatched THEN FOR ai := aIni TO aFin-1 DO IF aOK[ax[ai]] THEN Wr.PutText(stderr, "\n"); PrintCandidate("matched", "a", ax[ai], aCand[ax[ai]]); Wr.PutText(stderr, "\n"); END END END; IF printBMatched THEN FOR bi := bIni TO bFin-1 DO IF bOK[bx[bi]] THEN Wr.PutText(stderr, "\n"); PrintCandidate("matched", "b", bx[bi], bCand[bx[bi]]); Wr.PutText(stderr, "\n"); END END END; (* Start next chain pair: *) bIni := bFin; aIni := aFin END END END END FindSimilarCands; PROCEDURE PrintSimilarityStatistics( wr: Wr.T; READONLY aOK: BOOLS; aName: TEXT; READONLY bOK: BOOLS; bName: TEXT; ) = PROCEDURE CountOK(READONLY OK: BOOLS): NAT = VAR n: NAT := 0; BEGIN FOR i := 0 TO LAST(OK) DO IF OK[i] THEN INC(n) END END; RETURN n END CountOK; PROCEDURE PrParm(wd: NAT; name: TEXT; x: TEXT) = BEGIN Wr.PutText(wr, Fmt.Pad(name, wd, ' ', Fmt.Align.Left) & " = " & x & "\n"); END PrParm; BEGIN (* print statistics: *) WITH aN = NUMBER(aOK), aOKN = CountOK(aOK), bN = NUMBER(bOK), bOKN = CountOK(bOK), recall = FLOAT(bOKN,LONG)/FLOAT(MAX(1,bN),LONG), precision = FLOAT(aOKN,LONG)/FLOAT(MAX(1,aN),LONG), multiplicity = FLOAT(aOKN,LONG)/FLOAT(MAX(1,bOKN),LONG) DO Wr.PutText(wr, "\n"); PrParm(30, aName & " candidates", Fmt.Int(aN)); PrParm(30, bName & " candidates", Fmt.Int(bN)); PrParm(30, aName & " in " & bName, Fmt.Int(aOKN)); PrParm(30, aName & " not in " & bName, Fmt.Int(aN - aOKN)); PrParm(30, bName & " in " & aName, Fmt.Int(bOKN)); PrParm(30, bName & " not in " & aName, Fmt.Int(bN - bOKN)); Wr.PutText(wr,"\n"); PrParm(12, "recall", FLR(recall, 6,4)); PrParm(12, "precision", FLR(precision, 6,4)); PrParm(12, "multiplicity", FLR(multiplicity, 6,4)); END; Wr.PutText(wr,"\n"); END PrintSimilarityStatistics; PROCEDURE MergeOverlaps( VAR cand: List; VAR nCands: NAT; minSteps: NAT; maxAdjust: NAT; ) = VAR nKept: NAT; (* Number of candidates kept. *) ini: NAT; (* First kept candidate for this chain pair. *) ra, rb: REF PZMatch.T := NIL; BEGIN (* Copy the first candidate to the kept part of "cand": *) ini := 0; nKept := 0; (* Process the remaining candidates: *) FOR i := 0 TO nCands DO IF nKept > 0 THEN IF i = nCands OR cand[i].seg[0].cvx # cand[nKept-1].seg[0].cvx OR cand[i].seg[1].cvx # cand[nKept-1].seg[1].cvx THEN (* New pair; process all candidates of previous pair. *) (* Those candidates are "cand[ini..nKept-1]". *) MergePairOverlaps(cand, minSteps, maxAdjust, ini, (*IO*) nKept, (*WORK*) ra, rb ); (* Start copying next pair: *) ini := nKept END END; (* Save candidate for later merging: *) IF i < nCands THEN IF i > nKept THEN cand[nKept] := cand[i] END; INC(nKept) END END; nCands := nKept; END MergeOverlaps; PROCEDURE MergePairOverlaps( VAR cand: List; minSteps: NAT; maxAdjust: NAT; ini: NAT; (* Actual index of first candidate in "cand" *) VAR lim: NAT; (* Index of first non-candidate in "cand" *) (*WORK*) VAR ra, rb: REF PZMatch.T; ) = (* Given a bunch of candidates for the same chain pair, "cand[ini..lim-1]", removes all duplicates and joins all overlapping candidtes, decrementing "lim" accordingly. *) VAR r, s: NAT; BEGIN (* This code assumes that "dist(A, Join(B,C)) = MIN(dist(A,B), dist(A,C))". *) (* The assumption may not be precisely correct, but... *) r := ini; WHILE r < lim DO (* Invariant: "cand[0..r-1]" are well separated from all other cands. *) (* Try to condense "cand[r]" with the following ones: *) s := r + 1; WHILE s < lim DO IF Overlap(cand[r], cand[s], maxAdjust, ra, rb) >= 2* minSteps THEN cand[r] := Join(cand[r], cand[s]); (* Fill the hole left by "cand[s]": *) IF s < lim-1 THEN cand[s] := cand[lim-1] END; DEC(lim); (* Now start over again, since "cand[r]" changed: *) s := r + 1 ELSE INC(s); END END; (* We got to the end, so presumably "cand[r]" is now well-separated: *) INC(r); END; END MergePairOverlaps; PROCEDURE Overlap( READONLY a, b: T; maxAdjust: NAT; VAR (*WORK*) ra, rb: REF PZMatch.T; (* Work areas *) ): NAT = BEGIN RETURN PackedMatchOverlap(a.pm, b.pm, a.seg, b.seg, maxAdjust, ra,rb) END Overlap; PROCEDURE ReallocMatch(VAR r: REF PZMatch.T; np: NAT) = (* Reallocates "r^" if needed to store at least "np" elements. May discard the old contents. *) BEGIN IF r = NIL THEN r := NEW(REF PZMatch.T, np) ELSIF NUMBER(r^) < np THEN r := NEW(REF PZMatch.T, MAX(2*NUMBER(r^), np)) END END ReallocMatch; PROCEDURE PackedMatchOverlap( a, b: REF PZMatch.PackedT; READONLY aSeg, bSeg: PZSegment.Pair; maxAdjust: NAT; VAR (*WORK*) ra, rb: REF PZMatch.T; (* Work area *) ): NAT = VAR na, nb: NAT; BEGIN IF a = NIL THEN na := MAX(aSeg[0].ns, aSeg[1].ns) ELSE na := a.np END; ReallocMatch(ra, na); IF b = NIL THEN nb := MAX(bSeg[0].ns, bSeg[1].ns) ELSE nb := b.np END; ReallocMatch(rb, nb); WITH am = SUBARRAY(ra^, 0, na), bm = SUBARRAY(rb^, 0, nb) DO IF a = NIL THEN PZMatch.DoMostPerfect(aSeg[0].ns, aSeg[1].ns, am) ELSE PZMatch.DoUnpack(a^, am) END; IF b = NIL THEN PZMatch.DoMostPerfect(bSeg[0].ns, bSeg[1].ns, bm) ELSE PZMatch.DoUnpack(b^, bm) END; RETURN PZMatch.Overlap(am, bm, aSeg, bSeg, maxAdjust) END END PackedMatchOverlap; PROCEDURE Join(READONLY a, b: T): T = (* Joins two candidates, assuming that they belong to the same chain pair and they overlap. *) VAR c: T; oldSteps, newSteps: NAT := 0; BEGIN (* Compute the union of the two candidates on each chain: *) FOR j := 0 TO 1 DO c.seg[j] := PZSegment.Join(a.seg[j], b.seg[j]); oldSteps := oldSteps + (a.seg[j].ns + b.seg[j].ns) - 2; newSteps := newSteps + c.seg[j].ns END; <* ASSERT newSteps <= oldSteps *> WITH r = FLOAT(newSteps, LONG)/FLOAT(oldSteps, LONG) DO c.length := r*(a.length + b.length); c.mismatch := 0.0d0; c.matchedLength := 0.0d0; c.pm := NIL END; RETURN c END Join; PROCEDURE FLR(x: LONG; w, p: NAT): TEXT = BEGIN RETURN Fmt.Pad(Fmt.LongReal(x, Fmt.Style.Fix, prec := p), w) END FLR; PROCEDURE FI(x: NAT; w: NAT): TEXT = BEGIN RETURN Fmt.Pad(Fmt.Int(x), w) END FI; BEGIN END PZCandidate. (* Copyright © 2001 Universidade Estadual de Campinas (UNICAMP). Authors: Helena C. G. Leitão and Jorge Stolfi. This file can be freely distributed, used, and modified, provided that this copyright and authorship notice is preserved, and that any modified versions are clearly marked as such. This software has NO WARRANTY of correctness or applicability for any purpose. Neither the authors nor their employers chall be held responsible for any losses or damages that may result from its use. *)