/* Last edited on 2022-10-30 10:48:06 by stolfi */ /* ---------------------------------------------------------------------- */ /* dm_test_180_discrim.c */ auto double rung_score(msm_seq_desc_t *xp, msm_seq_desc_t *yp, msm_rung_t g); double rung_score(msm_seq_desc_t *xp, msm_seq_desc_t *yp, msm_rung_t g) { msm_seq_desc_same_orig_seq(xp, &(x->sd), TRUE); msm_seq_desc_same_orig_seq(yp, &(y->sd), TRUE); return dm_score_rung(&(o->sc[level]), &g, x, y); } msm_cand_vec_t dm_td_throw_cands_false ( int nc, int len, msm_seq_desc_t *x, msm_seq_desc_t *y, double skipProb ); /* Generates a list of {nc} random candidates for the two {dm_seq_t}s, each pairing about {len} elements on each sequence. Each step of each candidate will be a `skip' with probability {skipProb} */ msm_cand_vec_t dm_td_throw_cands_true ( int nc, int len, msm_seq_desc_t *x, msm_seq_desc_t *y, msm_pairing_t *p ); /* Generates a list of {nc} random candidates for the two {dm_seq_t}s, each pairing about {len} elements on each sequence. The candidates are cut out from the pairing {p}. */ msm_cand_vec_t dm_td_throw_cands_false ( int nc, int len, msm_seq_desc_t *x, msm_seq_desc_t *y, double skipProb ) { /* Allocate the candidate vector: */ msm_cand_vec_t cdv = msm_cand_vec_new(nc); int i; for (i = 0; i < nc; i++) { cdv.e[i] = msm_cand_throw ( x, y, len, /*circp*/ FALSE, /*atomic*/ TRUE, /*diagonal*/ FALSE, /*skipProb*/ skipProb ); } return cdv; } msm_cand_vec_t dm_td_throw_cands_true ( int nc, int len, msm_seq_desc_t *x, msm_seq_desc_t *y, msm_pairing_t *p ) { /* Create a single candidate spanning the whole string pair: */ msm_cand_t cd = msm_cand_from_pairing(x, y, p, 0.0); /* Allocate the candidate vector: */ msm_cand_vec_t cdv = msm_cand_vec_new(nc); int i; for (i = 0; i < nc; i++) { /* Select the next piece of {p}: */ cdv.e[i] = msm_cand_sub_throw(&cd, len); } return cdv; } void dm_test_discrim_for_level_kind ( options_t *o, char *levPrefix, dm_seq_t *x, dm_seq_t *y, msm_pairing_t *p ) { int len; /* Length of candidates. */ int lenMax = (nx < ny ? nx : ny); int lenStep = lenMax / 20; int lenMin = lenStep; for (len = lenMin; len <= lenMax; len += lenStep) { msm_cand_vec_t cdv[2]; bool_t homo; /* Kind of candidates: 0 = false, 1 = true. */ /* Generate the candidates: */ fprintf(stderr, "creating %d candidates of kind %d and length %d ...\n", nc, cdkind, len); msm_cand_vec_t *cdvp = &(cdv[cdkind]); if (cdkind == 0) { /* Generate some random candidates: */ (*cdvp) = dm_test_discrim_throw_cands_false(nc, len, &(x->sd), &(y->sd), o->mutProb); } else { /* Generate some true candidates: */ (*cdvp) = dm_test_discrim_throw_cands_true(nc, len, &(x->sd), &(y->sd), p); } /* Refine the candidates: */ fprintf(stderr, "refining candidates ...\n"); msm_dyn_tableau_t tb = msm_dyn_tableau_new(); /* Dynamic pogramming tableau. */ int minCover = 0; /* !!! FIND CORRECT VALUE !!! */ int maxCands = nc; cdv[cdkind] = msm_cand_vec_refine ( cdvp, &(x->sd), &(y->sd), o->delta, o->kappa, FALSE, o->maxUnp, &step_score, &tb, minCover, maxCands, -1.000 ); /* !!! Should save scores for plotting. */ } } fprintf(stderr, "writing scores ...\n"); /* !!! TO FINISH */ } { int nc = o->nCands; /* Sequence lengths: */ int nx = dm_seq_num_positions(xf); int ny = dm_seq_num_positions(yf); auto double step_score(msm_seq_desc_t *xp, msm_seq_desc_t *yp, msm_rung_t g, msm_rung_t h); double step_score(msm_seq_desc_t *xp, msm_seq_desc_t *yp, msm_rung_t g, msm_rung_t h) { msm_seq_desc_same_seq(xp, &(x->sd), TRUE); msm_seq_desc_same_seq(yp, &(y->sd), TRUE); return dm_score_step(&(o->sc), g, h, x, y); } } /* ---------------------------------------------------------------------- */ /* dm_test_tools.h */ msm_cand_t dm_test_tools_get_optimum_pairing ( dm_seq_t *xp, dm_seq_t *yp, int delta, msm_rung_step_score_proc_t *step_score, msm_dyn_tableau_t *tb, int maxIter ) { /* Get some parameters: */ int nx = dm_seq_num_positions(xp); int ny = dm_seq_num_positions(yp); int nmax = (nx > ny ? nx : ny); /* Build a Bresenham pairing {gv[0..ng-1]} betwen {xp} and {yp}: */ int ng = 0; msm_rung_vec_t gv = msm_rung_vec_new(nmax); /* Store the initial rung {gini}: */ msm_rung_t gini = (msm_rung_t){{ 0, 0 }}; msm_rung_vec_expand(&gv,ng); gv.el[ng] = gini; ng++; /* Interpolate between {gini} and {gfin}: */ msm_rung_t gfin = (msm_rung_t){{ nx-1, ny-1 }}; msm_rung_interpolate(gini, gfin, &ng, &gv); msm_rung_vec_trim(&gv, ng); /* Make it into a candidate: */ msm_pairing_t *pr = msm_pairing_from_rung_vec(&gv, FALSE); msm_cand_t cd = msm_cand_from_pairing(xp, yp, pr, /*score*/ 0.0); /* Refine it: */ int iter = 0; while (iter < maxIter) { msm_cand_t cdref = msm_cand_refine(&cdold, delta, 0, FALSE, 6, step_score, tb); if (msm_cand_equal(&cd, &cdref)) { msm_cand_free(&cdref); break; } else { msm_cand_free(&cd); cd = cdref; } } return cd; } /* ---------------------------------------------------------------------- */ /* dm_spectrum.h */ { /* Join {name} and {tag} into a file name prefix: */ char *prefix = NULL; asprintf(&prefix, "%s%s", name, tag); /* Add the extension ".eps" and open the file: */ FILE *wr = dm_open_write(name, tag, ".eps", TRUE); } /* ---------------------------------------------------------------------- */ /* dm_score.h */ double dm_score_half_step_data_term ( dm_score_args_t *sc, msm_rung_t g, int dx, int dy, dm_seq_t *xp, dm_seq_t *yp ); 9double dm_score_step_from_datums ( double eql, double dif, int kx, int ky, dm_datum_t *d0x, dm_datum_t *d0y, dm_datum_t *e0x, dm_datum_t *e0y, dm_datum_t *e1x, dm_datum_t *e1y, dm_datum_t *d1x, dm_datum_t *d1y ) { /* Compute the part {sc_diff} of score that is due to datum diffs along step: */ double S0 = dm_score_half_step_from_datums(eql,dif,kx,ky,d0x,d0y,e0x,e0y); double S1 = dm_score_half_step_from_datums(eql,dif,kx,ky,d1x,d1y,e1x,e1y); return S0 + S1; } double dm_score_half_step_from_datums ( double eql, double dif, int kx, int ky, dm_datum_t *dx, dm_datum_t *dy, dm_datum_t *ex, dm_datum_t *ey ) { double s = dm_datum_step_diffsq(dx, dy, ex, ey); double t = 1 - s; return 0.5*(t*eql + s*dif); } /* ---------------------------------------------------------------------- */ /* dm_filter.h */ void dm_filter_pairing_multi ( msm_pairing_t *pr, int nx, int ny, bool_t circx, bool_t circy, int maxLevel, int nw0, int nw1, msm_pairing_t *prf[] ); /* Given a pairing {(*pr)} between two abstract sequences {x,y}, computes the corresponding pairings {*(prf[0..maxLevel])} between the filtered versions of those two sequences. Namely, {(*prf[0])} is a storage-independent copy of {*pr}, and {prf[r]} is the result of {msm_pairing_map_to_coarser(prf[r-1],nxold,nyold,nxnew,nynew,circx,circy,nwtb)} for {r} in {1..maxLevel}. Assumes that the unfiltered sequences {x,y} paired by {*pr} have {nx} and {ny} elements, respectively; that the filter table used for level 0 to level 1 has {nw0} entries; and the filter table used in the following stages has {nw1} entries. */ void dm_filter_pairing_multi ( msm_pairing_t *pr, int nx, int ny, bool_t circx, bool_t circy, int maxLevel, int nw0, int nw1, msm_pairing_t *prf[] ) { prf[0] = msm_pairing_copy(pr); int nxold = nx, nyold = ny; int level; for (level = 1; level <= maxLevel; level++) { int nw = (level == 1 ? nw0 : nw1); int nxnew = (nxold - nw + 1)/2, nynew = (nyold - nw + 1)/2; prf[level] = msm_rung_vec_map_to_coarser (&(prf[level-1]), nxold, nyold, nxnew, nynew, nw); nxold = nxnew; nyold = nynew; } } /* ---------------------------------------------------------------------- */ /* main progs */ " [ -nsub {NSUB} ] \\\n" \ " -nsub {NSUB}\n" \ " Specifies the subsampling factor (number of matching" \ " positions per sample). The default is \"-nsub 1\".\n" \ "\n" \ typedef struct dm_options_t { int nsub; /* Subsampling factor. */ } dm_options_t; dm_options_t* dm_get_options(int argc, char**argv) { if (argparser_keyword_present(pp, "-nsub")) { o->nsub = argparser_get_next_int(pp, 1, MAX_NSUB); } else { o->nsub = DEFAULT_NSUB; } return o; } /* ---------------------------------------------------------------------- */ /* dm_score.h */ "\n" \ " The \"discrete\" keyword, if present, specifies" \ " that the {SD} term above should be" \ " computed only at the rungs of the pairing. If \"smooth\" is" \ " present, the {SD} term is computed by" \ " interpolating the paired elements linearly between" \ " consecutive rungs, and integrating the corresponding points.\n" \ /* ---------------------------------------------------------------------- */ /* dm_rung.c */ dm_rung_t dm_rung_throw_middle(dm_rung_t *glo, dm_rung_t *ghi, double skipProb) { /* Number of steps on each side: */ int DX = ghi->c[0] - glo->c[0]; int DY = ghi->c[1] - glo->c[1]; int nsmax = DX + DY; /* Max steps from {glo} to {ghi}. */ int cs = (nsmax + abrandom(0,1))/2; /* Half the steps, rounded randomly */ affirm((cs >= 0) && (cs <= nsmax), "bug"); dm_rung_t g = *glo; if (DX > DY) { int dy = msm_choose(0, DY, skipProb); g.c[1] += dy; g.c[0] += cs - dy; } else { int dx = msm_choose(0, DX, skipProb); g.c[0] += dx; g.c[1] += cs - dx; } /* Consistency check: */ affirm(dm_rung_X(*glo) <= dm_rung_X(g), "bug"); affirm(dm_rung_X(g) <= dm_rung_X(*ghi), "bug"); affirm(dm_rung_Y(*glo) <= dm_rung_Y(g), "bug"); affirm(dm_rung_Y(g) <= dm_rung_Y(*ghi), "bug"); return g; } void dm_rung_bresenham(dm_rung_t g, dm_rung_t h, int *ngP, dm_rung_vec_t *gv) { int ng = (*ngP); /* Bresenham's algorithm: */ /* Compute total displacement {sgnd[j]*absd[j]} in each axis {j}: */ int absd[2], sgnd[2]; int j; for (j = 0; j <= 1; j++) { int d = h.c[j] - g.c[j]; absd[j] = abs(d); sgnd[j] = (d == 0 ? 0 : (d < 0 ? -1 : +1)); } /* Find axis {jmax} where all steps are nonzero: */ int jmax = (absd[0] >= absd[1] ? 0 : 1); /* The other axis {jmin} may have zero steps */ int jmin = 1 - jmax; /* Initialize Bresenham's remainder {rem}. */ int rem = 0; /* Bresenham's remainder. */ /* Interpolate rungs between {g} (incl) and {h} (excl): */ while ((g.c[0] != h.c[0]) || (g.c[1] != h.c[1])) { /* Store the new rung {pnew}: */ dm_rung_vec_expand(gv, ng); gv->el[ng] = g; ng++; /* Update {pnew} by Bresenham'h algorithm: */ /* Advance {g.c[jmax]}, maybe {g.c[jmin]}: */ g.c[jmax] += sgnd[jmax]; rem += absd[jmin]; if (2*rem >= absd[jmax]) { g.c[jmin] += sgnd[jmin]; rem -= absd[jmax]; } } (*ngP) = ng; } /* ---------------------------------------------------------------------- */ /* dm_refine.c */ void dm_refine_cand_fill_tableau ( dm_cand_t *cdold, dm_seq_t *ap, dm_seq_t *bp, int delta, int kappa, bool_t shrink, dm_step_score_proc_t *step_score, dm_rung_t *goptp, /* OUT: last rung of optimum pairing. */ double *voptp, /* OUT: score of optimum pairing. */ dm_dyn_tableau_t *tb /* WORK: dynamic programming tableau. */ ) { fprintf(stderr, " filling matrices...\n"); dm_pairing_t *prold = cdold->pr; /* We can't handle circular pairings yet: */ demand((!dm_pairing_is_circular(prold)), "circular pairings not supported yet"); int minr, maxr; /* Min and max R-coord of any valid rung. */ dm_refine_compute_r_range(prold, delta, kappa, &minr, &maxr); /* Compute max differente {maxds} between two S-coordinates of valid rungs: */ int maxds = 2*(kappa + 2*delta); /* Make sure that the tableau has all the required elements: */ dm_dyn_tableau_resize(tb, minr, maxr, maxds); auto bool_t in_head(dm_rung_t g); auto bool_t in_tail(dm_rung_t g); /* TRUE if {g} is inside the `head' (resp. `tail') of the valid region, namely the square of side {kappa} whose upper right (resp. lower left) corner is the first (resp. last) rung of {prold}, widened by {delta} on all sides. */ bool_t in_head(dm_rung_t g) { dm_rung_t gini = dm_pairing_get_rung(prold, 0); int dx = g.c[0] - gini.c[0]; int dy = g.c[1] - gini.c[1]; if ((dx > delta) || (dx < -kappa-delta)) { return FALSE; } if ((dy > delta) || (dy < -kappa-delta)) { return FALSE; } return TRUE; } bool_t in_tail(dm_rung_t g) { int nr = dm_pairing_num_rungs(prold); dm_rung_t gfin = dm_pairing_get_rung(prold, nr-1); int dx = g.c[0] - gfin.c[0]; int dy = g.c[1] - gfin.c[1]; if ((dx < -delta) || (dx > +kappa+delta)) { return FALSE; } if ((dy < -delta) || (dy > +kappa+delta)) { return FALSE; } return TRUE; } /* Fills the matrix within the band, in diagonal order. We process entries one diagonal at a time, in order of increasing R-coordinate. */ double vopt = -INF; /* Score of the best path found so far. */ dm_rung_t gopt = dm_rung_none; /* Final rung {gopt} of that best path. */ int r; int kp = 0; /* Approximate index in {p} of a rung with R-coord R; */ for (r = minr; r <= maxr; r++) { /* Process tableau entries along the diagonal with R-coordinate {r}. */ int mins, maxs; /* Range of S-coordinates for the R-coordinate {r}. */ dm_refine_compute_s_range(prold, r, &kp, delta, kappa, &mins, &maxs); dm_dyn_tableau_set_s_range(tb, r, mins, maxs); /* Compute all elements in diagonal {r} of the tableau: */ int s; for (s = mins; s <= maxs; s += 2) { assert((r + s) % 2 == 0); int ix = (r + s) / 2; int iy = (r - s) / 2; /* fprintf(stderr, " ix = %5d iy = %5d\n", ix, iy); */ dm_rung_t g = (dm_rung_t){{ix, iy}}; dm_dyn_entry_t *eg = dm_dyn_tableau_get_entry_address(tb, g); assert(eg != NULL); bool_t ix_inside = ((ap->circ) || ((ix >= 0) && (ix < ap->dv.nel))); bool_t iy_inside = ((bp->circ) || ((iy >= 0) && (iy < bp->dv.nel))); if ((! ix_inside) || (! iy_inside)) { /* Rung {g} is not valid, set tableau entry to null values: */ eg->score = -INF; eg->prev = dm_rung_none; } else { /* Compute the optimum path ending at rung {g}. */ double vsel = -INF; /* Score of best path that ends at {g}. */ dm_rung_t fsel; /* Previous rung on that path. */ auto void try_step(dm_rung_t f); /* Computes the score of the best pairing that ends with rung {f}, plus the cost of the step {f->g}. If the step is valid, updates {fsel,vsel}. */ void try_step(dm_rung_t f) { double vtof; /* Value of best path to {f}. */ if (dm_rung_is_none(&f)) { vtof = 0; } else { dm_dyn_entry_t *ef = dm_dyn_tableau_get_entry_address(tb, f); vtof = (ef == NULL ? -INF : ef->score); } if (fabs(vtof) != INF) { /* Rung {f} is valid: */ double vfg = step_score(f, g, ap, bp); /* fprintf(stderr, "\n"); */ /* fprintf(stderr, " step from (%d,%d)", f->c[0], f->c[1]); */ /* fprintf(stderr, " to (%d,%d) ", g.c[0], g.c[1]); */ /* fprintf(stderr, " score %9.6f + %9.6f = %9.6f", vtof, vfg, vf+vfg); */ /* fprintf(stderr, "\n"); */ double v = vtof + vfg; if (v > vsel) { vsel = v; fsel = f; } } } /* Choose between the four optimum paths ending at rung {g}. Namely, consider coming from rung {f00 = dm_rung_none} (path starts at {g}) and from the three admissible previous rungs {f01,f10,f11}. In case of ties, consider first the diagonal step, then the axial steps, then starting at {g}. */ try_step((dm_rung_t){{ ix-1, iy-1 }}); try_step((dm_rung_t){{ ix, iy-1 }}); try_step((dm_rung_t){{ ix-1, iy }}); if (shrink || in_head(g)) { try_step(dm_rung_none); } /* Store optimum path and score in matrix entries for {g}: */ eg->score = vsel; eg->prev = fsel; /* fprintf(stderr, " prev[%5d,%5d]", g.c[0], g.c[1]); */ /* fprintf(stderr, " = (%5d,%5d)\n", gsel.c[0], gsel.c[1]); */ /* Update the best final rung {gopt} and score {vopt}: */ if (shrink || in_tail(g)) { if (vsel > vopt) { vopt = vsel; gopt = g; } } } } } /* Return last rung and score of optimum pairing: */ fprintf (stderr, " vopt = %23.15f gopt = (%5d,%5d)\n", vopt, gopt.c[0], gopt.c[1]); (*goptp) = gopt; (*voptp) = vopt; } /* ---------------------------------------------------------------------- */ /* dm_nucleic.c */ void dm_nucleic_string_mutate_pair ( char *b, bool_t dna, bool_t upper, double mutProb, double delProb, char **xP, char **yP, dm_rung_vec_t *gv ); /* Generates two copies of nucleotide string {b}. Each sequence is generated with parameters {dna,upper,mutPorb,delprob}, as explained under {dm_nucleic_string_mutate}. The procedure also returns in {*gv} the list of `true' rungs between between the two sequences, i.e. the list of pairs {ix,iy} such that {xP[ix]} and {yP[iy]} were both copied from the same letter of {b}, without mutation. Beware that {pr} will be monotonic but not unitary in general. */ void dm_nucleic_string_mutate_pair ( char *b, bool_t dna, bool_t upper, double mutProb, double delProb, char **xP, char **yP, dm_rung_vec_t *gv ) { /* Generate two mutated copies of {b}, note the indices of copied chars: */ char_vec_t xcv = char_vec_new(nxy); char_vec_t ycv = char_vec_new(nxy); int nx = 0, ny = 0; /* Number of chars in each new string. */ bool_t xok = TRUE, yok = TRUE; char *xb = b; char *yb = b; ???; } /* ---------------------------------------------------------------------- */ /* dm_match.c */ double_vec_t wtb0; char *wname0; double_vec_t wtb1; char *wname1; double var0 = o->initVar; double var1 = 3*var0; dm_weight_table_pair_make_binomial(var0, &wtb0, &wname0, var1, &wtb1, &wname1, TRUE); fprintf(stderr, " first filter width = %d desc = %s\n", wtb0.nel, wname0); fprintf(stderr, " later filter width = %d desc = %s\n", wtb1.nel, wname1); /* ---------------------------------------------------------------------- */ /* dm_psplot.c */ /* Get the plot area size in direction {axis} (mm): */ double cSize = dp->size[axis]; /* Get an estimate of the label's size (mm) in the direction {axis}: */ double cLabSize = dm_psplot_estimate_label_size(dp, fontsz, axis, fmt); /* Adjust span {cMin,cMax} to allow space for labels: */ if (cMin < cLabSize/2) { cMin = cLabSize/2; } if (cMax > cSize - cLabSize/2) { cMax = cSize - cLabSize/2; } dm_psplot_choose_tick_coords(dp, axis, cMin, cMax, labsp, &zbMin, &zbMax, &zbStep); fprintf(stderr, " zbMin = %24.16e zbMax = %24.16e zbStep = %24.16e\n", zbMin, zbMax, zbStep); if (zbStep == 0) { /* No major ticks in range: */ labper = labphase = 0; } else { /* Some major ticks in range: */ assert(zbStep >= ztStep); assert(zbMin >= ztMin); assert(zbMax <= ztMax); /* Compute the minor-to-major tick ratio {labper}: */ labper = (int)msm_round(zbStep/ztStep); assert(labper > 0); /* Compute the {labphase} of minor ticks before the first major tick: */ labphase = ((int)msm_round((zbMin - ztMin)/ztStep)) % labper; } /* ---------------------------------------------------------------------- */ /* RUNG SCORES */ /* dm_cand.h */ typedef double dm_rung_score_proc_t(dm_rung_t g, dm_seq_t *xp, dm_seq_t *yp); /* A procedure that computes a numerical score for a trivial pairing between sequences {xp,yp}, consisting of a single rung {g}, namely a single pair of samples {xp[g[0]],yp[g[1]]}. */ dm_rung_score_proc_t *rung_score, /* Rung scoring function. */ dm_rung_score_proc_t *rung_score, /* , plus the {rung_scores} of all its rungs */ /* dm_multi.h */ dm_rung_score_proc_t *rung_score, /* dm_refine.h */ dm_rung_score_proc_t *rung_score, dm_rung_score_proc_t *rung_score, /* dm_cand.c */ dm_rung_score_proc_t *rung_score, /* Namely, {R+S} where {R = SUM{rung_score(p[k]) : k \in r..s}}, {S = } */ double rsck = (rung_score == NULL ? 0 : rung_score(h, xp, yp)); rsc.el[kk] = rsck; psc.el[kk] = psc.el[kmax % ksize] + ssck + rsck; double s_slow = dm_cand_perfect_score ( xp, ix + r, yp, iy + r, s - r + 1, FALSE, rung_score, step_score ); dm_rung_score_proc_t *rung_score, /* Rung scoring function. */ double sc = (rung_score == NULL ? 0 : rung_score(g, xp, yp)); double rsck = (rung_score == NULL ? 0 : rung_score(h, xp, yp)); sc += ssck + rsck; /* dm_match.c */ dm_rung_score_proc_t *rung_score, auto double rung_score(dm_rung_t g, dm_seq_t *xp, dm_seq_t *yp); double rung_score(dm_rung_t g, dm_seq_t *xp, dm_seq_t *yp) { return dm_score_rung(&(o->sc), g, xp, yp); } dm_rung_score_proc_t *rung_score, /* dm_refine.c */ dm_rung_score_proc_t *rung_score, dm_rung_score_proc_t *rung_score, if (rung_score != NULL) { /* Add the score of the rung {g}: */ double sg = rung_score(g, ap, bp); if (fabs(sg) != INF) { vsel += sg; } } /* dm_seq.c */ dm_rung_score_proc_t *rung_score, if (rung-score != NULL) { score += rung_score(g); } /* dm_test_refining */ auto double rung_score(dm_rung_t g, dm_seq_t *xp, dm_seq_t *yp); double rung_score(dm_rung_t g, dm_seq_t *xp, dm_seq_t *yp) { if (o->rungScores) { return dm_score_rung(&(o->sc), g, xp, yp); } else { return dm_score_rung(&(o->sc), g, xp, yp); } } /* ---------------------------------------------------------------------- */ /* Get ranges {lox,hix} and {loy,hiy} of sample indices: */ int lox = rini.c[0]; int loy = rini.c[1]; int hix = rfin.c[0]; int hiy = rfin.c[1]; fprintf(stderr, " lox = %5d hix = %5d\n", lox, hix); fprintf(stderr, " loy = %5d hiy = %5d\n", loy, hiy); /* Values {lor,hir} of {r = ix + iy} for rungs {rini,rfin}: */ int lor = lox + loy; int hir = hix + hiy; /* Values {los,his} (possibly negative) of {s = ix - iy} for rungs {rini,rfin}: */ int los = lox - loy; int his = hix - hiy; /* Finds ranges {minx,maxx} and {miny,maxy} (possibly negative) of sample indices in matrix: */ int minx = lox - kappa - delta; int miny = loy - kappa - delta; int maxx = hix + kappa + delta; int maxy = hiy + kappa + delta; fprintf(stderr, " minx = %5d maxx = %5d\n", minx, maxx); fprintf(stderr, " miny = %5d maxy = %5d\n", miny, maxy); /* Range {minr,maxr} of variation for {r = ix + iy}: */ int minr = minx + miny; int maxr = maxx + maxy; #define huge 1073741823 /* 2^30 - 1 */ if (dm_pairing_is_circular(p)) { /* Get two rungs of {p} spaced {nr} rungs apart, map them, use difference: */ dm_rung_t rini = rvnew.el[0]; /* Rung index 0. */ dm_rung_t rbak = map(dm_pairing_get_rung(p, nr)); /* Rung index {nr}. */ DX = rbak.c[0] - rini.c[0]; DY = rbak.c[1] - rini.c[1]; assert((DX != 0) && (DY != 0)); } else { DX = DY = 0; } int ix = p->ini.c[0], iy = p->ini.c[1]; if (nr == 0) { /* Empty pairing: */ demand((ix == 0) && (iy == 0), "invalid empty pairing"); } else { /* Non-empty pairing: */ demand((ix >= 0) && (ix < nx), "bad init index for {x} sequence"); demand((iy >= 0) && (iy < ny), "bad init index for {y} sequence"); } if (nr == 0) { /* Empty pairing: */ if (p->per.c[0] != 0) { return FALSE; } if (p->per.c[1] != 0) { return FALSE; } if (p->rgv.nel != 0) { return FALSE; } if (p->rgv.el != NULL) { return FALSE; } } else /* {(nr > 0)} */ { } if (nr == 0) { /* Empty pairing: */ demand((DX == 0) && (DY == 0), "an empty pairing cannot be circular"); } /* interspersed with new rungs so that every step is and each new rung {q[i]} is derived from the corresponding old rung {r[i]} by rescaling each coordinate independently An old X coordinate {x} gets mapped to {floor(x/ox)*nx + (x%ox) filtered at some positive scale {p}, returns the corresponding pairing {prnew} for the sequences at scale {p-1}. The procedure assumes that {nxnew,nynew} are the lengths of the two sequences at scale {p-1}. The parameter {nwtb} should be the number of elements in the weight table used for filtering. If {prold} is perfect, {prnew} will be perfect, too. Otherwise, each rung is mapped to the finer scale, and extra interpolating rungs are added to preserve the pairing invariants. , and each new rung {q[i]} is derived from the corresponding old rung {r[i]} by rescaling each coordinate independently. Every X coordinate {x} is mapped to {floor(x*(nx/ox) + dxy)}, and every Y coordinate {y} to {floor(y*(ny/oy) + dxy)}. An old X coordinate {x} gets mapped to {floor(x/ox)*nx + (x%ox) filtered at some positive scale {p}, returns the corresponding pairing {prnew} for the sequences at scale {p-1}. The procedure assumes that {nxnew,nynew} are the lengths of the two sequences at scale {p-1}. The parameter {nwtb} should be the number of elements in the weight table used for filtering. If {prold} is perfect, {prnew} will be perfect, too. Otherwise, each rung is mapped to the finer scale, and extra interpolating rungs are added to preserve the pairing invariants. A partial pairing between two sequences of samples. The pairing begins with the samples indexed by {ini.c[0],ini.c[1]} in sequences 0 and 1, respectively; and ends with the samples indexed {fin.c[0],fin.c[1]} If either sequence is circular, the corresponding indices must be taken modulo its length {ns[j]}; otherwise the indices must lie between 0 and {n[j]-1}. In any case, the index {fin.c[j]} is greater than or equal to {ini.c[j]}, for {j} in {{0,1}}. If {rv.nel} is zero, the pairing is a perfect one. Otherwise the pairs are {rv.el[i]} for {i} in {0..nr-1}. A pair {q==p[k]} means that samples {q[0]} of sequence {seq[0]} and {q[1]} of {seq[1]} are paired. One must have {rv.nel == nr}, {p.el[0]==ini}, and {rv.el[p.nel-1]==fin}. Successive pairs {p[k]} increase by at most 1 in each coordinate and do not repeat. If {circ} is true, the pairing itself is circular, which includes an implicit step from the last rung to the first one. In that case the two sequences must be circular, and the final rung must be a valid predecessor of the initial rung, modulo the respective sequence lengths. */ dm_cand_t dm_cand_from_rung_vec ( dm_seq_t *xp, dm_seq_t *yp, dm_rung_vec_t rv, double score, bool_t circ ); /* Creates a candidate between the two given sequences, with the pairing defined by the rung vector {rv}. If {circ} is true, generates a candidate that is a circular pairing. In that case, the two sequences must be circular, and the first rung must be a proper sucessor of the last one. */ dm_cand_t dm_cand_from_rung_vec ( dm_seq_t *xp, dm_seq_t *yp, dm_rung_vec_t rv, double score, bool_t circ ) { /* Get the rung count {nr}: */ int nr = rv->nel; /* Number of (fundamental) rungs. */ /* Get the datum counts {nx,ny} in the two sequences: */ int nx = xp->dv.nel; /* Samples in X sequence. */ int ny = yp->dv.nel; /* Samples in Y sequence. */ /* Fill fields {id,name,nbas,score} of candidate: */ dm_cand_t cd; cd.id[0] = xp->id; cd.name[0] = xp->name; cd.nbas[0] = nx; cd.id[1] = yp->id; cd.name[1] = yp->name; cd.nbas[1] = ny; cd.score = score; /* The pairing must start inside the fundamental cell: */ demand((ix >= 0) && (ix < nx), "bad initial X"); demand((iy >= 0) && (iy < ny), "bad initial Y"); int Dx, Dy; if (circ) { /* Circular pairing. */ /* X- or Y-stationary cands are not allowed, so both sequences must be circular: */ demand(xp->circ && yp->circ, "invalid circular candidate"); /* Get last fundamental rung: */ dm-rung_t } else { Dx = Dy = 0; } cd.pr = dm_pairing_from_rung_vec(&rv, Dx, Dy); return cd; } struct dm_pairing_t { int np; /* Number of pairs in pairing. */ dm_rung_t ini; /* First index pair in pairing. */ dm_rung_t fin; /* Last index pair in pairing. */ dm_rung_vec_t rv; /* Rungs of pairing, or empty vec if perfect. */ bool_t circ; /* TRUE implies an extra step from the last rung to the first. */ }; /* A partial pairing between two sequences of samples. The pairing begins with the samples indexed by {ini.c[0],ini.c[1]} in sequences 0 and 1, respectively; and ends with the samples indexed {fin.c[0],fin.c[1]}. If either sequence is circular, the corresponding indices must be taken modulo its length {ns[j]}; otherwise the indices must lie between 0 and {n[j]-1}. In any case, the index {fin.c[j]} is greater than or equal to {ini.c[j]}, for {j} in {{0,1}}. If {rv.nel} is zero, the pairing is a perfect one. Otherwise the pairs are {rv.el[i]} for {i} in {0..np-1}. A pair {q==p[k]} means that samples {q[0]} of sequence {seq[0]} and {q[1]} of {seq[1]} are paired. One must have {rv.nel == np}, {p.el[0]==ini}, and {rv.el[p.nel-1]==fin}. Successive pairs {p[k]} increase by at most 1 in each coordinate and do not repeat. If {circ} is true, the pairing itself is circular, which includes an implicit step from the last rung to the first one. In that case the two sequences must be circular, and the final rung must be a valid predecessor of the initial rung, modulo the respective sequence lengths. If {np} is zero, the pairing is an empty pairing (with no rungs). In that case the other fields are immaterial. */ dm_pairing_t dm_pairing_new (int npmax); /* Creates a pairing, initially with space for {npmax} index pairs. */ dm_pairing_t dm_pairing_new (int npmax) { dm_pairing_t p; p.np = 0; int j; for (j = 0; j < 2; j++) { p.ini.c[j] = p.fin.c[j] = 0; } p.rv = dm_rung_vec_new(npmax); p.circ = FALSE; return p; } /* Check if candidate refers to these two strings: */ bool_t cand_is_relevant; if ((xp != NULL) && (yp != NULL)) { bool_t eq0x = (cd->id[0] == xp->id); bool_t eq0y = (cd->id[0] == yp->id); bool_t eq1x = (cd->id[1] == xp->id); bool_t eq1y = (cd->id[1] == yp->id); cand_is_relevant = (eq0x && eq1y) || (eq1x && eq0y); } else { cand_is_relevant = TRUE; } void dm_sort_and_prune_candidates(dm_cand_vec_t *cdv, int nnew); /* Sorts the list {cdv} by decreasing value, and trims it to size {nnew}. */ dm_sample_t dm_datum_get_sample(dm_datum_t *d, int k); void dm_datum_set_sample(dm_datum_t *d, int k, dm_sample_t smp); /* Gets/sets the sample in channel {k} of datum {*d}. */ dm_sample_t dm_datum_get_sample(dm_datum_t *d, int k) { if ((k >= 0) && (k < dm_CHANNELS)) { return d->c[k]; } else { demand(FALSE, "bad channel"); return 0; } } void dm_datum_set_sample(dm_datum_t *d, int k, dm_sample_t smp) { if ((k >= 0) && (k < dm_CHANNELS)) { d->c[k] = smp; } else { demand(FALSE, "bad channel"); } } double dm_at_value(char c); /* The A-minus-T value of the character {c}, namely +1 if {c='A'}, -1 if {c='T'} or {c='U'}, zero otherwise (ignoring case). */ double dm_cg_value(char c); /* The C-minus-G value of the character {c}, namely +1 if {c='C'}, -1 if {c='G'}, zero otherwise (ignoring case). */ double dm_at_value(char c) { if ((c == 'A') || (c == 'a')) { return +1.0; } else if ((c == 'T') || (c == 't') || (c == 'U') || (c == 'u')) { return -1.0; } else { return 00.0; } } double dm_cg_value(char c) { if ((c == 'C') || (c == 'c')) { return +1.0; } else if ((c == 'G') || (c == 'g')) { return -1.0; } else { return 00.0; } } double dm_datum_diffabsq(dm_datum_t *xd, dm_datum_t *yd); /* Returns the abs difference squared between {xd} and {yd}, defined as {((|Dat|+|Dcg|)/2)^2} where {Dat = xd.at - yd.at} and {Dcg = xd.cg - yd.cg}. For unsmoothed bases, the distance is 1 between distinct bases, and 0 between equal bases. */ double dm_datum_step_diffabsq ( dm_datum_t *x0, dm_datum_t *y0, dm_datum_t *x1, dm_datum_t *y1 ); /* Average value of {dm_datum_diffabsq(x,y)} when {x} and {y} interpolate linearly between {x0,y0} and {x1,y1}. */ double dm_datum_diffabsq(dm_datum_t *xd, dm_datum_t *yd) { double Dat = dm_sample_decode(xd->at) - dm_sample_decode(yd->at); /* In [-2 _ +2] */ double Dcg = dm_sample_decode(xd->cg) - dm_sample_decode(yd->cg); /* In [-2 _ +2] */ double Dabs = (fabs(Dat) + fabs(Dcg))/2; return Dabs*Dabs; } #define DM_DATUM_STEP_SUBSTEPS 4 /* Number of substeps for step cost integration. */ double dm_datum_step_diffabsq ( dm_datum_t *x0, dm_datum_t *y0, dm_datum_t *x1, dm_datum_t *y1 ) { double Dat0 = dm_sample_decode(x0->at) - dm_sample_decode(y0->at); /* In [-2 _ +2] */ double Dcg0 = dm_sample_decode(x0->cg) - dm_sample_decode(y0->cg); /* In [-2 _ +2] */ double Dat1 = dm_sample_decode(x1->at) - dm_sample_decode(y1->at); /* In [-2 _ +2] */ double Dcg1 = dm_sample_decode(x1->cg) - dm_sample_decode(y1->cg); /* In [-2 _ +2] */ /* Integral of {dm_datum_diffabsq} between the two points: */ int n = DM_DATUM_STEP_SUBSTEPS, k; double sum = 0.0; for (k = 1; k < n; k++) { double r = ((double)k)/((double)n); double s = 1 - r; double Dat = s*Dat0 + r*Dat1; double Dcg = s*Dcg0 + r*Dcg1; double Dabs = (fabs(Dat) + fabs(Dcg))/2; sum += Dabs*Dabs; } return sum/(n-1); } /* Maps zero to white, {hi} to black, where {hi} is the value expected if a string of homologous pairs crosses the pixel diagonally amidst a background of random pairs: */ /* Avg value added for a homologous pair: */ double avg_homo_v = 0.0; /* Avg value added for a random pair: */ double avg_rand_v = (4*sqrt(4.0) + 4*sqrt(0.0) + 8*sqrt(2.0))/16.0; /* Expected total when there are no homologous pairs: */ double avg_bad_tot = scale*scale*avg_rand_v; /* Expected total when diagonal is homologous: */ double avg_gud_tot = scale*(scale - 1)*avg_rand_v + scale*avg_homo_v; /* Broaden range a bit to preserve some info: */ double diff = avg_gud_tot - avg_bad_tot; double gud_lim = avg_gud_tot + 0.25*diff; double bad_lim = avg_bad_tot - 0.25*diff; double dm_sample_step_corrsq ( dm_sample_t *x0, dm_sample_t *y0, dm_sample_t *x1, dm_sample_t *y1 ); /* difference between {dm_step_diffsq(x0,y0,x1,y1)} and {(dm_sample_diffsq(x0,y0) + dm_sample_diffsq(x1,y1))/2}. */ double dm_sample_step_corrsq ( dm_sample_t *x0, dm_sample_t *y0, dm_sample_t *x1, dm_sample_t *y1 ) { double at0 = dm_decode(x0->at) - dm_decode(y0->at); double cg0 = dm_decode(x0->cg) - dm_decode(y0->cg); double at1 = dm_decode(x1->at) - dm_decode(y1->at); double cg1 = dm_decode(x1->cg) - dm_decode(y1->cg); double d00 = at0*at0 + cg0*cg0; double d01 = at0*at1 + cg0*cg1; double d11 = at1*at1 + cg1*cg1; return (2*d01 - d00 - d11)/6.0; } if (prune) { int nprune = (maxMatches >> level); fprintf(stderr, "sorting %d candidates, pruning to %d ...\n", cdvref.nel, nprune); dm_sort_and_prune_candidates(&cdvref, nprune); } void dm_sort_and_prune_candidates(dm_cand_vec_t *cdv, int nprune) { auto int cmp_candidates(const void* ca, const void *cb); /* Returns -1 if {ca} is better than {cb}, +1 if worse, 00 if same. */ int cmp_candidates(const void* ca, const void *cb) { fprintf(stderr, " comparing candidates\n"); fprintf(stderr, " ca: "); dm_cand_write(stderr, (dm_cand_t *)ca); fprintf(stderr, "\n"); fprintf(stderr, " cb: "); dm_cand_write(stderr, (dm_cand_t *)cb); fprintf(stderr, "\n"); double sa = ((dm_cand_t *)ca)->score; double sb = ((dm_cand_t *)cb)->score; if (sa > sb) { return -1; } else if (sa < sb) { return +1; } else { return 0; } } fprintf(stderr, " input candidates:\n"); int ic; for (ic = 0; ic < cdv->nel; ic++) { fprintf(stderr, " cdv[%03d]: ", ic); dm_cand_write(stderr, &(cdv->el[ic])); fprintf(stderr, "\n"); } qsort((void *)cdv->el, cdv->nel, sizeof(dm_cand_t), &cmp_candidates); fprintf(stderr, " sorted candidates:\n"); for (ic = 0; ic < cdv->nel; ic++) { fprintf(stderr, " cdv[%03d]: ", ic); dm_cand_write(stderr, &(cdv->el[ic])); fprintf(stderr, "\n"); } if (cdv->nel > nprune) { fprintf(stderr, " pruning to %d candidates\n", nprune); dm_cand_vec_trim(cdv, nprune); } } int dm_cand_list_search_similar(dm_cand_vec_t *cdv, int ncand, double frac) { int k; for (k = 0; k < ncand; k++) { int pair10, pair01, pair11; dm_cand_compare(&(cdv->el[k]), cd, &pair10, &pair01, &pair11); int totpairs = pair10 + pair01 + pair11; int difpairs = pair10 + pair01; if (difpairs <= frac*totpairs) { return k; } } } auto int bubble_down(int j); /* Assumes that the heap is OK except that {cdv[j]} may be better than one or both of its children. Swaps that item with its worst child until the heap is fixed. Returns the final position of that item. */ auto dm_cand_t pop_worst(void); /* Removes the worst candidate from the heap, returns it. */ int bubble_down(int j) { int c; while ((c=(2*j+1)) < ncand) { /* Find worst child {c}: */ if ((c + 1 < ncand) && (cdv->el[c+1].score < cdv->el[c].score)) { c++; } /* If worst child is no worse than {cdv[j]}, stop: */ if (cdv->el[j].score <= cdv->el[c].score) { break; } /* Swap worst child with {cdv[j]}: */ dm_cand_t pp = cdv->el[c]; cdv->el[c] = cdv->el[j]; cdv->el[j] = pp; j = c; } return j; } dm_cand_t pop_worst(void) { /* Save worst candidate: */ dm_cand_t cd = cdv->el[0]; /* Push the hole to the fringe: */ int j = 0, c; while ((c=(2*j+1)) < ncand) { /* Find worst child {c}: */ if ((c + 1 < ncand) && (cdv->el[c+1].score < cdv->el[c].score)) { c++; } /* Pull worst child up to {cdv[j]}: */ cdv->el[j] = cdv->el[c]; j = c; } ncand--; if (j < ncand) { /* Fill the hole with the last element: */ cdv->el[j] = cdv->el[ncand]; (void)bubble_up(j); } return cd; } /* Finish sorting the candidates by decreasing score: */ while(ncand > 1) { /* Remove the worst candidate in the heap: */ dm_cand_t cd = pop_worst(); /* Insert popped cand in vacated hole: */ cdv.el[ncand] = cd; } void dm_fill_image(dm_image_t *img, float v); /* Fill the image {img} with value {v}. */ void dm_fill_image(dm_image_t *img, float v) { int ipy; for (ipy = 0; ipy < img->sz[2]; ipy++) { float *row = img->sample[ipy]; int ipx; for (ipx = 0; ipx < img->sz[1]; ipx++) { row[ipx] = v; } } } double tol = 1.0; auto double score(dm_pairing_t p, dm_seq_t *xp, dm_seq_t *yp); /* Returns the score of the pairing {p}, where each pair {(i,j)} counts as {-tol} if {xp[i]} and {yp[j]} are equal, {1-tol} if they differ. */ double score(dm_pairing_t p, dm_seq_t *xp, dm_seq_t *yp) { /* FIX - USE {dm_step_badness}!!! */ double sum = 0.0; if (p.nel == 0) { return sum; } int k; dm_rung_t p0 = p.el[0]; if (p.nel == 1) { dm_sample_t xs = dm_get_sample(xp, p0.c[0]); dm_sample_t ys = dm_get_sample(yp, p0.c[1]); sum += (tol - dm_sample_diffsq(&xs, &ys)); } else { for (k = 1; k < p.nel; k++) { dm_rung_t p1 = p.el[k]; double term = tol - dm_step_badness(p0, p1, xp, yp); sum += term; p0 = p1; } } return sum; } double perf_score(int ix, int iy, int np, bool_t circ) { double sum = 0.0; int k; if (np == 0) { return sum; } for (k = 1; k < np + (circ ? 1 : 0); k++) { int fx = ix + 1; int fy = iy + 1; dm_rung_t ini = (dm_rung_t){{ix, iy}}; dm_rung_t fin = (dm_rung_t){{fx, fy}}; double term = tol - dm_step_badness(ini, fin, xp, yp); sum += term; } return sum; } /* ---------------------------------------------------------------------- */ /* dm_classify.c */ bool_t dm_classify_adjustment_iteration ( int np[], int nc, double p[][], double coef[], double cind, double dist[][] ) { ... double baricenter[nc]; dm_classify_barycenter(baricenter,&cdv_border,seq_a,seq_b); //project point into hyperplane and find pivot double pivot[nc]; dm_classify_project_onto_hyperplane(coef,nK, baricenter,pivot,nc); //STAGE 2 - introduce a perturbation in the normal over this point //projects each misclassified point into P double vm[nc]; rn_zero(nc,vm); for(i = 0; i < cdv_border.ne; i++) { //get statistics of candidate double p[nc]; dm_classify_get_stats_vec(&(cdv_border.e[i]),seq_a,seq_b,p); double pp[nc]; dm_classify_project_onto_hyperplane(coef,nK, p,pp,nc); double v[nc]; rn_sub(nc,pp,baricenter,v); if(!cand_is_good[i]) { rn_neg(nc,v,v); } rn_add(nc,v,vm,vm); } rn_scale(nc,1/(double)cdv_border.ne,vm,vm); //new hyperplane should be the average of perturbations entered on pivot double new_normal[nc]; rn_add(nc,coef,vm,new_normal); rn_dir(nc,new_normal,new_normal); double new_cind = dm_classify_indep_coefficient(new_normal,baricenter,nc); //just in case, if Kv is negative, invert all the values if(new_cind < 0) { new_cind = -new_cind; for(i = 0; i < nc; i++){ coef[i] = -coef[i]; } } //convert back to score_weights dm_classify_hyperplane_to_score_weights(new_normal, new_cind,&nEql,&nDif,&nBrk,&nSkp,&nUnp,&nK); fprintf(stderr,"[%d] Updated score_weights\nEQL: %9.6lf\nDIF: %9.6lf\nBRK: %9.6lf\nSKP: %9.6lf\nUNP: %9.6lf\nK: %9.6lf\n",num_iter, nEql,nDif,nBrk,nSkp,nUnp,K); } free(distF); free(distV); } *aEql = nEql; *aDif = nDif; *aBrk = nBrk; *aSkp = nSkp; *aUnp = nUnp; *aK = K; } /* ---------------------------------------------------------------------- */ /* dm_score.{h,c} */ double dm_score_step_from_datums ( double eql, double dif, dm_datum_t *d0x, dm_datum_t *d0y, dm_datum_t *d1x, dm_datum_t *d1y ); /* Computes the data-related score for a step {g0-->g1}, with scoring weights {eql} and {dif}. The parameters {d0x,d0y} and {d1x,d1y} are the datums paired by rungs {g0} and {g1}, respectively, in the two sequences. The step score is defined as the mean of {dm_score_rung_from_datums(eql,dif,d0x,d0y)} and {dm_score_rung_from_datums(eql,dif,d1x,d1y)}. Note that this function returns only the data-related part of the step's score, without the move-related part. */ double dm_score_step_data_term ( dm_score_args_t *wt, msm_rung_t *g0, msm_rung_t *g1, dm_seq_t *xp, dm_seq_t *yp ); /* Computes the data-related part {Sd} of the score of a step {g0-->g1} between sequences {*xp} and {*yp}, according to the scoring criterion {wt}. The result measures the similarity between the datums paired along the step. */ double dm_score_step_from_datums ( double eql, double dif, dm_datum_t *d0x, dm_datum_t *d0y, dm_datum_t *d1x, dm_datum_t *d1y ) { /* Compute the part {sc_diff} of score that is due to datum diffs along step: */ double S0 = dm_score_rung_from_datums(eql,dif,d0x,d0y); double S1 = dm_score_rung_from_datums(eql,dif,d1x,d1y); return 0.5*(S0 + S1); } double dm_score_step_data_term ( dm_score_args_t *wt, msm_rung_t *g0, msm_rung_t *g1, dm_seq_t *xp, dm_seq_t *yp ) { double Sd0 = dm_score_rung(wt, g0, xp, yp); double Sd1 = dm_score_rung(wt, g1, xp, yp); return 0.5*(Sd0 + Sd1); } /* ---------------------------------------------------------------------- */ /* dm_match.c */ void report_lav_alignment ( msm_cand_vec_t *cdv, char* seq_filename_a, char* seq_filename_b, char *name, char *tag ); /* Writes alignments in LAV format, given a list of pairings {cdv} and the sequence file names. The output file is "{name}{tag}.lav". */ void report_lav_alignment ( msm_cand_vec_t *cdv, char* seq_filename_a, char* seq_filename_b, char *name, char *tag ) { FILE* arq = msm_open_write(name,tag,".lav",TRUE); //Write reader dm_lav_write_from_cands(arq,cdv,seq_filename_a,seq_filename_b); fclose(arq); } char* reportFilenameA = NULL; char* reportFilenameB = NULL; asprintf(&reportFilenameA,"%s.bas",o->seqNameA); asprintf(&reportFilenameB,"%s.bas",o->seqNameB); report_lav_alignment(&cdvfin,reportFilenameA,reportFilenameB,o->outName,"-fin-cd"); free(reportFilenameA); free(reportFilenameB); char* lavTag = NULL; asprintf(&lavTag,"-lav-%s",refTag); report_lav_alignment( cdv,"a","b",levName,lavTag); free(lavTag); /* ---------------------------------------------------------------------- */ /* dm_test_180_discrim.c */ "\n" \ " -nsub {N_SUB}\n" \ " This mandatory argument specifies the number" \ " of matching points per sample in the filtered sequences.\n" \ int nsub; /* Number of matching points per datum in the filtered sequences. */ argparser_get_keyword(pp, "-nsub"); o->nsub = argparser_get_next_int(pp, 0, INT_MAX); /* ---------------------------------------------------------------------- */ /* dm_lav_to_cdv.c : */ void dmvc_define_pairing() { assert(perm[j] != i); int k1 = abrandom(0,i); int k2 = abrandom(k1,i); int k3 = abrandom(k2,i); j = k3; if (perm[j] == i) { j = (j+1)%(i+1); } } void dmvc_read_lav_file ( lav_file_t *L, int min_rungs, double min_good_frac, msm_cand_vec_t *cdvP, int *ncP ); /* Read the candidates from the file described ny {L} and appends them to {cdv}, starting at position {*ncP}, incrementing {*ncP}. Expands {cdv} if needed, and but does not trim {cdv}. Discards candidates whose fraction of equal nucleotides is less than {min_good_frac} or have less than {min_rungs} rungs. */ void dmvc_read_lav_file ( lav_file_t *L, int min_rungs, double min_good_frac, msm_cand_vec_t *cdvP, int *ncP ) { FILE *rd = open_read(L->file_name, TRUE); dm_lav_read_cands ( rd, L->id_a, L->label_a, L->size_a, L->id_b, L->label_b, L->size_b, min_rungs, min_good_frac, cdvP, ncP ); fclose(rd); } /* ---------------------------------------------------------------------- */ /* dm_svm_training.c */ auto msm_cand_t *copy_cand_header(msm_cand_t *cdo); msm_cand_t *copy_cand_header(msm_cand_t *cdo) { msm_cand_t *new_cd = (msm_cand_t*)malloc(sizeof(msm_cand_t)); new_cd->seq[0] = cdo->seq[0]; new_cd->seq[1] = cdo->seq[1]; new_cd->score = cdo->score; return new_cd; } /* ---------------------------------------------------------------------- */ /* dm_cand_filter.c */ auto double rung_score(msm_seq_desc_t *xp, msm_seq_desc_t *yp, msm_rung_t g); double rung_score(msm_seq_desc_t *xp, msm_seq_desc_t *yp, msm_rung_t g) { msm_seq_desc_same_orig_seq(xp, &(x->sd), TRUE); msm_seq_desc_same_orig_seq(yp, &(y->sd), TRUE); return dm_score_rung(&(o->sc[level]), &g, x, y); } char *dm_td_lsq_basis_txt(int32_t ib) { switch(ib) { case 0: return "1"; case 1: return ""; case 2: return "(|X|^2 + |Y|^2)"; case 3: return "(X0 + X1 + X2 + Y0 + Y1 + Y2)"; case 4: return "(X0*X1 + X1*X2 + X2*X0 + Y0*Y1 + Y1*Y2 + Y2*Y0)"; case 5: return "(X0*Y1 + X1*Y2 + X2*Y0 + X1*Y0 + X2*Y1 + X0*y2)"; default: affirm(FALSE, "bad ib"); return "BAD"; } } /* ---------------------------------------------------------------------- */ /* dm_test_120_distance */ " -basis { full | reduced } \\\n" \ " The basis used for least squares fitting is specified by" \ " the \"-basis\" option. The \"full\" basis has all {NB=16} monomials" \ " of degree 0 and 2 on the coordinates" \ " of {A} and {B} as separate elements. The \"reduced\" basis" \ " takes into account that the squared distance (raw or smoothed) is unchanged" \ " by swapping {A} with {B} and permuting the coordinate axes, and keeps only" \ " the {NB=5} independent polynomials of degree 0 and 2 that are unchanged by those" \ " symmetries.\n" \ " -basis { full | reduced }\n" \ " This argument specifies the type of quadratic basis" \ " used to fit the squared distances (raw and smoothed) to the smoothed" \ " datums {A,B}.\n" \ "\n" \ bool_t basis_full; /* True to use the full basis. */ void dmtd_lsq_basis(bool_t full, r3_t *A, r3_t *B, int32_t nb, double bas[], char *name[]); /* (full or reduced, depending on {full}). Requires {nb = dmtd_lsq_basis_size(full)}. */ " [ -estimator {COEF[0]} .. {COEF[NB-1]} ] \\\n" \ "\n" \ " -estimator {COEF[0]} .. {COEF[NB-1]}\n" \ " This optional argument specifies the coefficients of a conjectured" \ " estimator for the filtered distance {D2} as a function of the smoothed" \ " samples {A} and {B}. It is followed by {NB} coefficients of the" \ " estimator in the fitting basis. If omitted, it defaults" \ " to the distance squared {DD2@@} between the filtered damples {A} and {B}.\n" \ double_vec_t estimator; /* Coeffs of the conjectured estimator, or empty vector. */ argparser_get_keyword(pp, "-basis"); char *btype = argparser_get_next_non_keyword(pp); if (strcmp(btype, "full") == 0) { o->basis_full = TRUE; } else if (strcmp(btype, "reduced") == 0) { o->basis_full = FALSE; } else { argparser_error(pp, "invalid basis type"); } if (argparser_keyword_present(pp, "-estimator")) { o->estimator = double_vec_new(nb); for (int32_t ib = 0; ib < nb; ib++) { o->estimator.e[ib] = argparser_get_next_double(pp, -100.0, +100.0); } } else { o->estimator = double_vec_new(0); } void dmtd_throw_case ( r3_t *A, r3_t *B, double *DD2@@, double *D2, bool_t verbose ); /* Generates two matched filtered datums {A[0..nG-1],B[0..nG-1]} and computes the corresponding smoothed square distance signal {*D2}, where {ng} is {dnae_CHANNELS} in {dnae_datum.Specifically}. generates two random nucleotide strings with {nw} letters. Computes their numeric equivalents {a[0..nw-1]} and {b[0..nw-1]}, where each {a[iab]} or {b[iab]} is an {ng}-vector with the convention described under {dnae_datum_decoded_from_nucleic_char}. Then computes the filtered datums {A[c] = SUM{ ws[iab]*a[iab][c] : iab in 0..nw-1}} for {c} in {0..ng-1}, and similarly for {B[0..ng-1]}. Then computes the squared difference of the smoothed datums {DD2@@ = |A - B|^2}. Also computes the individual squared differences {d2[iab] = |a[iab]-b[iab]|^2} for {iab} in {0..nw-1} and the smoothed squared difference {(*D2) = SUM{wd[iab]*d2[iab] : iab in 0..nw-1}}. If {verbose} is true, writes the test case to {stderr}. */ void dmtd_apply_filter(int32_t nw, double w[], int32_t *a, double A[]); /* Applies a filter with weights {w[0..nw-1]} to the columns of the integer array {a}. Assumes that {a} has {nw} rows and {ng} columns, where {ng=dnae_CHANNELS}. Returns the result in the float vector {A[0..ng-1]}. */ auto void throw_raw(int32_t nw, r3_t a[], r3_t b[], double *d2); /* Assumes that {a,b,d2} are vectors of {nw} elements. Shifts their contents down by 1 position. Inserts a random pair of gemetrically encoded nucleorites in {a[nw-1]} and {b[nw-1]}. Computes the squared distance between those datums and puts it into {d2[nw-1]}. Also writes those new datums to the {wr_raw}. */ auto void gen_case(int32_t it, int32_t nv, double vi[], int32_t nf, double f[], double *wi_P); /* Case generator. */ void gen_case(int32_t it, int32_t nv, double bvi[], int32_t nf, double fi[], double *wi_P) { assert(nv == nV); assert(nf == 1); double A[nc], B[nc]; double DD2@@; bool_t verbgen = verbose & (it < 2); /* Debug the test case generator? */ for (int32_t s = 0; s < str; s++) { dmtd_throw_raw(nw, nc, a, b, d2, verbgen); } dmtd_smooth_signals(ws, wd, nw, nc, a, b, d2, A, B, &DD2@@, verbgen); double D2 = dist?(A,B}; dmtd_lsq_basis(full, nc, A, B, nV, bvi, NULL); fi[0] = D2; (*wi_P) = 1.0; /* Compute the conjectural estimator {EE2@@}: */ double EE2@@; if (ne == 0) { EE2@@ = DD2@@; } else { assert(ne == nV); EE2@@ = 0; for (int32_t ib = 0; ib < nV; ib++) { EE2@@ += estimator->e[ib]*bvi[ib]; } } } /* Generate two random nucleotide strings {a,b}: */ randchars(xc, nw, "atcg", 4); randchars(yc, nw, "atcg", 4); if (verbose) { fprintf(stderr, " a = %s", xc); fprintf(stderr, " b = %s", yc); fprintf(stderr, "\n"); } /* Convert them to numeric vectors: */ for (int32_t iw = 0; iw < nw; iw++) { dnae_datum_decoded_from_nucleic_char(xc[iw], &(a[iw*nc])); dnae_datum_decoded_from_nucleic_char(yc[iw], &(b[iw*nc])); if (verbose) { fprintf(stderr, " a[%02d] = %c = ", iw, xc[iw]); msm_debug_int32_vec(&(a[iw*nc]), nc, "%+d"); fprintf(stderr, " b[%02d] = %c = ", iw, yc[iw]); msm_debug_int32_vec(&(b[iw*nc]), nc, "%+d"); fprintf(stderr, "\n"); } }