/* See msm_cand_vec.h */ /* Last edited on 2018-03-04 22:57:55 by stolfilocal */ #define msm_cand_vec_C_COPYRIGHT \ "Copyright © 2005 by the State University of Campinas (UNICAMP)" #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define msm_cand_vec_checking_level 2 /* Define this as positive to get paranoid checking. */ #define CHKLEV(n) (msm_cand_vec_checking_level >= (n)) /* Use {if(CHKLEV(n)){...}} for checks of paranoia level {n}. */ #define msm_cand_vec_debug_level 2 /* Set this positive to get some debugging printouts. */ #define DEBLEV(n) (msm_cand_vec_debug_level >= (n)) /* Use {if(DEBLEV(n)){...}} for debug printouts of level {n}. */ vec_typeimpl(msm_cand_vec_t,msm_cand_vec,msm_cand_t); typedef double msm_subpairing_score_proc_t(int64_t r, int64_t s); /* A client procedure that computes a score for a subpairing {p[r..s]} of some perfect pairing {p}. */ typedef void msm_subpairing_use_proc_t ( int64_t r, int64_t s, double score ); /* A client procedure that uses a subpairing {p[r..s]} of some perfect pairing {p}, whose score is {score}. */ void msm_cand_vec_scan_alignment ( msm_seq_desc_t *seq0, int i0, msm_seq_desc_t *seq1, int i1, msm_rung_step_score_proc_t *step_score, int64_t minRungs, /* Min number of rungs in a valid subpairing. */ double minScore, /* Ignore candidates with score lower than this. */ msm_subpairing_use_proc_t *use /* What to do with candidates. */ ); /* Scans the perfect pairing {p} between sequences {seq0,seq1} that starts with rungs {(i0,i1)} and has {ng} rungs. Calls {use} on some set of pairwise disjoint candidates found in it that have at least {minRungs} rungs and score at least {minScore}, hopefully with large scores. The score of a subpairing {p[r..s]} is assumed to be the sum of scores of its steps, plus the steps from nowhere to the first rung and from the ar rung to nowhere. */ msm_cand_vec_t msm_cand_vec_get_best_perfect ( msm_seq_desc_t *seq0, msm_seq_desc_t *seq1, msm_rung_step_score_proc_t *step_score, int64_t minRungs, double minScore, int maxCands ) { int n0 = seq0->size; int n1 = seq1->size; msm_cand_vec_t cdv = msm_cand_vec_new(maxCands); int ncd = 0; /* The current set of candidates is kept in the array {cdv[0..ncd-1]}. During the scan, the array is sorted by *decreasing* score (i.e. the worst candidate is {cdv[ncd-1]}. */ double lowScore = minScore; /* Current minimum score for storage. */ auto void scan_alignment(int i0, int i1, int64_t ng); /* Scans the perfect pairing {p} that starts with rung {(i0,i1)} and has {ng} rungs, and adds all candidates found in it to the heap. A valid sub-pairing is a set of consecutive rungs from {p} that has at least {minRungs} and at most {min(n0,n1)} rungs. A candidate is a valid sub-pairing whose score is positive and better than that of any of proper valid sub-pairing or super-pairing. */ /* Scan all maximal-length perfect pairings, collect best cands: */ msm_pairing_enum_alignments(n0, n1, &scan_alignment); /* Trim the candidate vector to the number of cands actually found: */ msm_cand_vec_trim(&cdv, ncd); return cdv; void scan_alignment(int i0, int i1, int64_t ng) { auto void store_cand(int64_t kmin, int64_t kmax, double score); /* Stores into the sorted list {cdv} the pairing {(i0+k,i1+k)} for {k=kmin,..kmax}. Assumes that its score is {score}. The procedure is a no-op if the pairing is not among the best {maxCands} entries found so far, or is already in the list. */ msm_cand_vec_scan_alignment(seq0, i0, seq1, i1, step_score, minRungs, lowScore, &store_cand); void store_cand(int64_t kmin, int64_t kmax, double score) { /* Check inex validity: */ assert(kmin <= kmax); msm_rung_t gini = (msm_rung_t){{ (int32_t)(i0 + kmin), (int32_t)(i1 + kmin) }}; msm_rung_t gfin = (msm_rung_t){{ (int32_t)(i0 + kmax), (int32_t)(i1 + kmax) }}; assert((gini.c[0] >= 0) && (gfin.c[0] < n0)); assert((gini.c[1] >= 0) && (gfin.c[1] < n1)); int64_t ns = kmax - kmin + 1; if (DEBLEV(2)) { fprintf ( stderr, (" got candidate from (%d,%d) to (%d,%d) steps = %" int64_d_fmt " score = %24.16e\n"), gini.c[0], gini.c[1], gfin.c[0], gfin.c[1], ns, score ); } affirm(score >= lowScore, "bad score"); msm_pairing_t *pr = msm_pairing_perfect(gini, (int)ns); msm_cand_t cd = msm_cand_from_pairing(seq0, seq1, pr, score); double frac = +INF; msm_cand_vec_insert(&cdv, &ncd, (int)minRungs, maxCands, &cd, frac); /* Update the current min useful score: */ lowScore = (ncd < maxCands ? minScore : cdv.e[ncd-1].score); if (DEBLEV(1)) { fprintf(stderr, " %d candidates, min score = %24.16e\n", ncd, lowScore); } } } } void msm_cand_vec_scan_alignment ( msm_seq_desc_t *seq0, int i0, msm_seq_desc_t *seq1, int i1, msm_rung_step_score_proc_t *step_score, int64_t minRungs, /* Min number of rungs in a valid subpairing. */ double minScore, /* Ignore candidates with score lower than this. */ msm_subpairing_use_proc_t *use /* What to do with candidates. */ ) { int n0 = seq0->size; int n1 = seq1->size; affirm((i0 >= 0) && (i0 < n0), "bad {i0}"); affirm((i1 >= 0) && (i1 < n1), "bad {i1}"); affirm((i0 == 0) || (i1 == 0), "bad {i0,i1}"); /* Get max number of rungs {ng} with this alignment: */ int64_t ng = imin(n0 - i0, n1 - i1); if (DEBLEV(1)) { fprintf ( stderr, ("\nalignment (%d,%d:%" int64_d_fmt ") minScore = %12.6f\n"), i0, i1, ng, minScore ); } affirm(ng >= 0, "bad {ng}"); /* Trivial cases: */ if (ng < minRungs) { return; } /* Let {p} be the perfect pairing that starts with rung {i0,i1} and has {ng} rungs. Uses dynamic programming in one dimension to find the sub-pairing of {p} with maximum score that ends with each rung {k}. */ double psc[ng]; /* {psc[k]} is the score of the best sub-pairing that ends at rung {k}. */ /* Scans the rungs {(i0+k,i1+k)} for {k} from {0} to {ng-1}. At each rung there are two choices: start a new sub-pairing with that rung, or extend the current one to that rung. In the former case the current rung is terminated and reported through {use}. */ auto void use_previous(int64_t kini, int64_t kmax); /* Report the best candidate that starts with rung {kini}, ends at or before rung {kmax},and has at least {minRungs} rungs, provided its score is at least {minScore}. */ int64_t kcur = 0; /* First rung of current pairing. */ msm_rung_t h = msm_rung_none; /* Previous rung. */ int64_t k; for (k = 0; k < ng; k++) { msm_rung_t g = (msm_rung_t){{ (int32_t)(i0 + k), (int32_t)(i1 + k) }}; double score_new = step_score(seq0, seq1, NULL, &g); /* Score for starting a new sub-pairing at rung {k}. */ double score_old = ( k <= 0 ? -INF : psc[k-1] + step_score(seq0, seq1, &h, &g) ); /* Score for extending. */ if (score_new > score_old) { /* Better start a new candidate here: */ use_previous(kcur, k - 1); psc[k] = score_new; /* Remember candidate start: */ kcur = k; } else { /* Extend old candidate: */ psc[k] = score_old; } } use_previous(kcur, ng-1); return; void use_previous(int64_t kini, int64_t kmax) { /* Find the best sub-pairing that ends at or before rung {kmax}: */ int64_t kmin = kini + minRungs - 1; if (kmax < kmin) { /* Too short: */ return; } int64_t kopt = kmax; int64_t k = kmax; while (k > kmin) { msm_rung_t h = (msm_rung_t){{ (int32_t)(i0 + k), (int32_t)(i1 + k) }}; psc[k] += step_score(seq0, seq1, &h, NULL); if ((k < kopt) && (psc[k] > psc[kopt])) { kopt = k; } k--; } /* Get the best rung: */ double score = psc[kopt]; if (score >= minScore) { use(kini, kopt, score); } } } void msm_cand_vec_throw ( int ntr, msm_seq_desc_t *seq0, msm_seq_desc_t *seq1, int minlen, int maxlen, double atomProb, double diagProb, double skipProb, msm_cand_vec_t *cdv, int *ncdP ) { int ncd = (*ncdP); int i; for (i = 0; i < ntr; i++) { /* Choose between atomic or not: */ bool_t atomP = (drandom() < atomProb); /* Choose between near-diagonal and random: */ bool_t diagP = (drandom() < diagProb); /* Choose length of candidate: */ int len = int32_abrandom(minlen, maxlen); /* Generate random candidate: */ msm_cand_vec_expand(cdv, ncd); cdv->e[ncd] = msm_cand_throw(seq0, seq1, len, atomP, diagP, 0.10); ncd++; } (*ncdP) = ncd; } void msm_cand_vec_free(msm_cand_vec_t *cdv) { int i; for (i = 0; i < cdv->ne; i++) { msm_cand_free(&(cdv->e[i])); } free(cdv->e); } #define msm_cand_vec_type_name "msm_cand_vec" #define msm_cand_vec_old_format "2008-01-11" #define msm_cand_vec_format "2013-10-16" void msm_cand_vec_write(FILE *wr, msm_cand_vec_t *cdv) { int ncd = cdv->ne; /* Find maximum abs of {id,strlen(name),index}: */ int idmax = 1; int nlenmax = 1; int ixmax = 1; int i; for (i = 0; i < ncd; i++) { msm_cand_t *cd = &(cdv->e[i]); int j; for (j = 0; j < 2; j++) { msm_seq_desc_t *sj = &(cd->seq[j]); int idj = (sj->id >= 0 ? sj->id : 10*(-sj->id)); if (idj > idmax) { idmax = idj; } int lenj = (sj->name == NULL ? 0 : (int)strlen(sj->name)); if (lenj > nlenmax) { nlenmax = lenj; } int ixj = (sj->size <= 0 ? 0 : sj->size - 1); if (ixj > ixmax) { ixmax = ixj; } } } /* Compute the max number of digits needed for {i0,name,index}: */ int idSize = digits(idmax); int nameSize = nlenmax; int ixSize = digits(ixmax); filefmt_write_header(wr, msm_cand_vec_type_name, msm_cand_vec_format); fprintf(wr, "ncands = %d\n", ncd); for (i = 0; i < ncd; i++) { msm_cand_write(wr, NULL, &(cdv->e[i]), idSize, nameSize, ixSize, NULL,TRUE); fputc('\n', wr); } filefmt_write_footer(wr, msm_cand_vec_type_name); fflush(wr); } void msm_cand_vec_write_named(msm_cand_vec_t *cdv, char *name, char *tag, char *ext) { FILE *wr = msm_open_write(name, tag, ext, TRUE); msm_cand_vec_write(wr, cdv); fclose(wr); } msm_cand_vec_t msm_cand_vec_read(FILE *rd) { /* Check and skip the file header: */ filefmt_read_header(rd, msm_cand_vec_type_name, msm_cand_vec_format); /* Skip comment lines, if any: */ (void)filefmt_read_comment(rd, '|'); /* Read the header fields: */ bool_t ncd = nget_int(rd, "ncands"); fget_eol(rd); /* Read the candidates: */ msm_cand_vec_t cdv = msm_cand_vec_new(ncd); int k; for (k = 0; k < ncd; k++) { cdv.e[k] = msm_cand_read(rd, NULL, NULL); fget_eol(rd); } /* Check and skip file footer: */ filefmt_read_footer(rd, msm_cand_vec_type_name); return cdv; } msm_cand_vec_t msm_cand_vec_read_named(char *name, char *tag, char *ext) { FILE *rd = msm_open_read(name, tag, ext, TRUE); msm_cand_vec_t cdv = msm_cand_vec_read(rd); fclose(rd); return cdv; } msm_cand_vec_t msm_cand_vec_map(msm_cand_vec_t *cdv, msm_seq_desc_t *s0_new, msm_seq_desc_t *s1_new) { fprintf(stderr, " mapping %d candidates ...\n", cdv->ne); msm_cand_vec_t cdvnew = msm_cand_vec_new(cdv->ne); int ic; for (ic = 0; ic < cdv->ne; ic++) { cdvnew.e[ic] = msm_cand_map(&(cdv->e[ic]), s0_new, s1_new); } fprintf(stderr, " mapped %d candidates.\n", cdvnew.ne); return cdvnew; } msm_cand_vec_t msm_cand_vec_interpolate(msm_cand_vec_t *cdv) { msm_cand_vec_t cdvnew = msm_cand_vec_new(cdv->ne); int ic; for (ic = 0; ic < cdv->ne; ic++) { cdvnew.e[ic] = msm_cand_interpolate(&(cdv->e[ic])); } return cdvnew; } msm_cand_vec_t msm_cand_vec_make_increasing(msm_cand_vec_t *cdv, int minIncrEach, int minIncrSum) { msm_cand_vec_t cdvnew = msm_cand_vec_new(cdv->ne); int ic; for (ic = 0; ic < cdv->ne; ic++) { cdvnew.e[ic] = msm_cand_make_increasing(&(cdv->e[ic]), minIncrEach, minIncrSum); } return cdvnew; } msm_cand_vec_t msm_cand_vec_refine ( msm_cand_vec_t *cdvold, msm_seq_desc_t *seq0, msm_seq_desc_t *seq1, int delta, int kappa, int expand, int shrink, int maxUnp, msm_rung_step_score_proc_t *step_score, bool_t verbose, msm_dyn_tableau_t *tb, int minCover, int maxCands, double frac ) { msm_cand_vec_t cdvnew = msm_cand_vec_new(cdvold->ne); int ic; int ncnew = 0; /* Number of candidates that were kept. */ int n_entries = 0; int n_steps = 0; fprintf(stderr,"refining candidate vector...\n"); for (ic = 0; ic < cdvold->ne; ic++) { msm_cand_t *cdoi = &(cdvold->e[ic]); demand(msm_seq_desc_equal(&(cdoi->seq[0]), seq0, FALSE), "wrong seq 0"); demand(msm_seq_desc_equal(&(cdoi->seq[1]), seq1, FALSE), "wrong seq 1"); msm_cand_t cdni = msm_cand_refine ( cdoi, delta, kappa, expand, shrink, maxUnp, step_score, verbose, tb, &n_steps, &n_entries ); msm_cand_vec_insert(&cdvnew, &ncnew, minCover, maxCands, &cdni, frac); } msm_cand_vec_trim(&cdvnew, ncnew); fprintf ( stderr, "refined candidate vactor: %d entries, %.1f steps per entry\n", n_entries, ((double)n_steps)/n_entries ); return cdvnew; } void msm_cand_vec_insert ( msm_cand_vec_t *cdv, int *ncandP, int minCover, int maxCands, msm_cand_t *cd, double frac ) { auto int bubble_up(int j); /* Assumes that the list is OK except that {cdv[j]} may be worse than {cdv[j-1]}. Swaps the two. */ auto void delete(int j); /* Deletes the entry {cdv[j]}, displacing all subsequent entries. Decrements {ncand}. */ /* Check minimum coverage requirement: */ int j; for (j = 0; j < 2; j++) { if (msm_pairing_span(cd->pr, j) < minCover) { /* Candidate does not cover enough of sequence {j}: */ msm_cand_debug("N", cd); return; } } int k; /* Final position of inserted candidate, or {(*ncandP)} if not inserted. */ /* Find the smallest {k} such that {cdv[k]} is definitely worse than {cd}: */ k = (*ncandP); while ((k > 0) && (cdv->e[k-1].score < cd->score)) { k--; } if ((frac > 0.0) && (frac <= 1)) { /* Discard {cd} if there is any better or equal cand that contains or is similar to {cd}: */ for (j = 0; j < k; j++) { int ngnew = msm_pairing_num_rungs(cd->pr); /* Rungs in {cd}. */ int ngboth = msm_cand_count_common_rungs(&(cdv->e[j]), cd); /* Shared rungs. */ if (ngboth >= frac*ngnew) { /* Discard {cd}: */ msm_cand_debug("C", cd); return; } } /* Remove any candidates that are worse than {cd} but contained in or similar to {cd}: */ for (j = (*ncandP)-1; j >= k; j--) { int ngold = msm_pairing_num_rungs(cdv->e[j].pr); /* Rungs in {cdv->e[j]}. */ int ngboth = msm_cand_count_common_rungs(&(cdv->e[j]), cd); /* Shared rungs. */ if (ngboth >= frac*ngold) { /* Discard candidate {cvd.e[j]}: */ //msm_cand_debug("S", &(cdv->e[j])); delete(j); } } } if ((*ncandP) >= maxCands) { k = (*ncandP)-1; if (cd->score < cdv->e[k].score) { /* Discard {cd}: */ msm_cand_debug("B", cd); return; } else { /* Delete worst candidate: */ msm_cand_debug("-", &(cdv->e[k])); delete(k); } } /* Add {cd} to {cdv[0..ncand-1]}: */ assert((*ncandP) < maxCands); k = (*ncandP); msm_cand_vec_expand(cdv, k); cdv->e[k] = (*cd); (*ncandP)++; k = bubble_up(k); assert((k >= 0) && (k < (*ncandP))); msm_cand_debug("+", &(cdv->e[k])); int bubble_up(int j) { while (j > 0) { /* Get parent: */ int k = j-1; /* If {cdv[k]} is not worse than {cdv[j]}, stop: */ if (cdv->e[k].score >= cdv->e[j].score) { break; } /* Swap parent with {cdv[j]}: */ msm_cand_t pp = cdv->e[k]; cdv->e[k] = cdv->e[j]; cdv->e[j] = pp; j = k; } return j; } void delete(int j) { int i; for (i = j+1; i < (*ncandP); i++) { cdv->e[i-1] = cdv->e[i]; } (*ncandP)--; } }