#define PROG_NAME "dm_scramble_cands" #define PROG_DESC "Generates false (random) candidates from true ones" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:04:22 by stolfi */ #define dm_scramble_cands_C_COPYRIGHT \ "Copyright © 2012 by the State University of Campinas (UNICAMP) " \ " and the Federal Fluminense University (UFF)" #define PROG_HELP \ PROG_NAME " \\\n" \ " [ -verbose ] \\\n" \ " " argparser_help_info_HELP " \\\n" \ " < {IN_CANDS_FILE} \\\n" \ " > {OUT_CANDS_FILE}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ " " PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " This program reads from standard input a candidate" \ " list (in the {msm_cand_vec_write} \".cdv\" file format), and" \ " scrambles them so that they pair the same subsequences" \ " but in a different pairs. This is useful to generate false" \ " candidates from true ones. The resulting candidate list is" \ " written to standard output in the same format.\n " \ "\n" \ "OPTIONS\n" \ " -verbose \n" \ " If present, the program will print some diagnostic messages.\n" \ "\n" \ argparser_help_info_HELP_INFO "\n" \ "SEE ALSO\n" \ " dm_cands_refine(1)\n" \ "\n" \ "AUTHOR\n" \ " This program was created in oct/2012 by R. Saracchini" \ " and J. Stolfi as the \"-scramble\" option of {dm_lav_to_cdv.c}." \ "\n" \ "MODIFICATION HISTORY\n" \ " 2013-10-12 J. Stolfi:\n" \ " * Split the code out from {dm_lav_to_cdv.c}.\n" \ " * Improved the scrambling to take the lengths into account.\n" \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dm_scramble_cands_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef struct options_t { bool_t sort; /* TRUE to sort by decreasing total span. */ bool_t verbose; /* TRUE to print report. */ } options_t; int main(int argc, char**argv); options_t *dmsc_parse_options(int argc, char**argv); /* Parses the command line options, packs them into a {options_t} record. */ void dmsc_scramble_cands(msm_cand_vec_t *cdvP); /* Replaces the candidates in {*cdvP} by a set of fake candidates that span the same sequences but pair them in a completely different way. Useful to generate false candidates from a set of true ones. */ int32_t* dmsc_define_pairing(msm_cand_vec_t *cdvP); /* Creates permutation {perm[0..nc-1]} of the indices {0..nc-1} that defines a `wrong' re-pairing of the candidates in {*cdvP}. The permutation specifies that the 0-side sub-sequence spanned candidate {i} should be paired with the 1-side sub-sequence spanned by cnadidate {perm[i]}. Assumes that {cdvP} is sorted by increasing total span. */ int main(int argc, char**argv) { options_t *o = dmsc_parse_options(argc, argv); msm_cand_vec_t cdv = msm_cand_vec_read(stdin); dmsc_scramble_cands(&cdv); msm_cand_vec_write(stdout, &cdv); if (o->verbose) { fprintf(stderr, PROG_NAME " done.\n"); } return 0; } void dmsc_scramble_cands(msm_cand_vec_t *cdvP) { int nc = cdvP->ne; int *perm = dmsc_define_pairing(cdvP); /*derangement */ /* New candidates are {cdvn.e[0..ncn-1]}: */ msm_cand_vec_t cdvn = msm_cand_vec_new(nc); int ncn = 0; /* Scan old candidates: */ int r; for(r = 0; r < nc; r++) { /* Pick the partner {s} of candidate {r}. */ int s = perm[r]; msm_cand_t *cd_r = &(cdvP->e[r]); msm_cand_t *cd_s = &(cdvP->e[s]); /* Pair side 0 of candidate {r} with side 1 of candidate {s}. */ msm_cand_t cd_n; cd_n.seq[0] = cd_r->seq[0]; cd_n.seq[1] = cd_s->seq[1]; cd_n.score = 0; int ng_r = msm_pairing_num_rungs(cd_r->pr); int ng_s = msm_pairing_num_rungs(cd_s->pr); int ng_min = (int)imin(ng_r,ng_s); msm_rung_vec_t gvn; if (ng_min == 1) { /* One of the candidates has only one rung, result has one rung too: */ msm_rung_t gmid_r = msm_pairing_get_rung(cd_r->pr, ng_r/2); msm_rung_t gmid_s = msm_pairing_get_rung(cd_s->pr, ng_s/2); gvn = msm_rung_vec_new(1); gvn.e[0] = (msm_rung_t){{ gmid_r.c[0], gmid_s.c[1] }}; } else { /* Get the spans of side 0 or {r} and side 1 of {s}: */ msm_rung_t gini_r = msm_pairing_get_rung(cd_r->pr, 0); msm_rung_t gfin_r = msm_pairing_get_rung(cd_r->pr, ng_r-1); int nb_r = gfin_r.c[0] - gini_r.c[0] + 1; assert(nb_r >= 1); msm_rung_t gini_s = msm_pairing_get_rung(cd_s->pr, 0); msm_rung_t gfin_s = msm_pairing_get_rung(cd_s->pr, ng_s-1); int nb_s = gfin_s.c[1] - gini_s.c[1] + 1; assert(nb_s >= 1); int nb_min = (int)imin(nb_r,nb_s); /* Trim the longest span, : */ int skip_r = (nb_r - nb_min)/2; int kini_r = gini_r.c[0] + skip_r; int kfin_r = kini_r + nb_min - 1; int skip_s = (nb_s - nb_min)/2; int kini_s = gini_s.c[1] + skip_s; int kfin_s = kini_s + nb_min - 1; /* Assemble the first and last rung of the new candidate: */ msm_rung_vec_t gvt = msm_rung_vec_new(2); gvt.e[0] = (msm_rung_t){{ kini_r, kini_s }}; gvt.e[1] = (msm_rung_t){{ kfin_r, kfin_s }}; /* Interpolate those two rungs to get the full candidate: */ gvn = msm_rung_vec_interpolate(&gvt); free(gvt.e); } /* Convert to pairing: */ cd_n.pr = msm_pairing_from_rung_vec(&gvn); cdvn.e[ncn] = cd_n; ncn++; } msm_cand_vec_free(cdvP); *cdvP = cdvn; } int32_t* dmsc_define_pairing(msm_cand_vec_t *cdvP) { int nc = cdvP->ne; int* perm = notnull(malloc(nc*sizeof(int32_t)), "no mem"); int i; for (i = 0; i < nc; i++) { perm[i] = i;} auto double span_mismatch(int32_t r, int32_t s); /* Returns a number between 0 and 1 that indicates how approriate is the 1-span of candidate {s} as a `false' match for the 0-span of candidate {r}. The result is 0 if {s==r}, otherwise it is always positive. It is low if the 1-span of candidate {s} is similar to the 1-span of candidate {r}, or if the 1-span of candidate {s} is much longer or shorter than the 0-span of candidate {r}. */ for (i = nc-1; i >= 0; i--) { /* The 1-side spans that are still unassigned are {perm[0..i]}. */ /* Find an unassigned index {perm[j]} that can be assigned to {i}: */ int j; if (i == 0) { /* No choice, must use {perm[0]}: */ j = 0; } else if (i == 1) { /* Choose {perm[0]} or {perm[1]} so as to avoid loop on either one: */ j =((perm[0] == 0) || (perm[1] == 1) ? 0 : 1); } else { /* Chooose {j} such that {perm[j]} is distinct from {i}, the 1-side span of candidate {perm[j]} is about the same length as the 0-span of candidate {i}, and as disjoint as possible from the 1-span of candidate {i}. */ j = -1; double dj = -INF; /* Desirability of {perm[j]}. */ int k; for (k = 0; k <= i; k++) { double dk = span_mismatch(i, perm[k]); if (dk > dj) { j = k; dj = dk; } } } assert (perm[j] != i); int t = perm[j]; perm[j] = perm[i]; perm[i] = t; } return perm; double span_mismatch(int32_t r, int32_t s) { if (r == s) { return 0.0; } /* Get the two candidates: */ msm_cand_t* cd_r = &(cdvP->e[r]); msm_cand_t* cd_s = &(cdvP->e[s]); /* Get the candidates' rung counts: */ int32_t ng_r = msm_pairing_num_rungs(cd_r->pr); int32_t ng_s = msm_pairing_num_rungs(cd_s->pr); /* Get their first and last rungs: */ msm_rung_t gini_r = msm_pairing_get_rung(cd_r->pr, 0); msm_rung_t gini_s = msm_pairing_get_rung(cd_s->pr, 0); msm_rung_t gfin_r = msm_pairing_get_rung(cd_r->pr, ng_r-1); msm_rung_t gfin_s = msm_pairing_get_rung(cd_s->pr, ng_s-1); /* Compute the spans on each side: */ int32_t span_r0 = gfin_r.c[0] - gini_r.c[0] + 1; int32_t span_r1 = gfin_r.c[1] - gini_r.c[1] + 1; /* int32_t span_s0 = gfin_s.c[0] - gini_s.c[0] + 1; */ int32_t span_s1 = gfin_s.c[1] - gini_s.c[1] + 1; /* Compare the sizes of the segments to be matched (span 0 of {r} and span 1 of {s}): */ int32_t smin_r0s1 = (int32_t)imin(span_r0, span_s1); /* Min of the two spans. */ double savg_r0s1 = 0.5*(span_r0 + span_s1); /* Avg of the two spans. */ assert(savg_r0s1 > 0); double fsize = pow(((double)smin_r0s1)/savg_r0s1, 5); /* Size mismatch factor. */ assert(fsize > 0.0); /* Consider the overlap of span 1 of {s} with that of {r}: */ int32_t ini_r1s1 = (int32_t)imax(gini_r.c[1], gini_s.c[1]); /* Start of overlap of the 1-spans. */ int32_t fin_r1s1 = (int32_t)imin(gfin_r.c[1], gfin_s.c[1]); /* End of overlap of the 1-spans. */ int32_t sovr_r1s1 = (ini_r1s1 > fin_r1s1 ? 0 : fin_r1s1 - ini_r1s1 + 1); /* Amount of overlap. */ int32_t smin_r1s1 = (int32_t)imin(span_r1, span_s1); assert(sovr_r1s1 <= smin_r1s1); double fover = 1.0 - 0.999*((double)sovr_r1s1)/smin_r1s1; /* Old overlap factor. */ assert(fover > 0.0); return fsize*fover; } } options_t *dmsc_parse_options(int argc, char**argv) { options_t *o = (options_t *)notnull(malloc(sizeof(options_t)), "no mem"); argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); o->verbose = argparser_keyword_present(pp, "-verbose"); argparser_skip_parsed(pp); argparser_finish(pp); return o; }