#define PROG_NAME "dm_chop_cands" #define PROG_DESC "Reads a candidate list and chops large candidates into smaller pieces" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:04:46 by stolfi */ #define dm_chop_cands_C_COPYRIGHT \ "Copyright © 2013 by the State University of Campinas (UNICAMP)" #define PROG_HELP \ PROG_NAME " \\\n" \ " -maxSize {MAX_SIZE} \\\n" \ " [ -minSize {MIN_SIZE} ] \\\n" \ " [ -idealSize {IDEAL_SIZE} ] \\\n" \ " [ -verbose ] \\\n" \ " " argparser_help_info_HELP " \\\n" \ " < {IN_CANDS_FILE} \\\n" \ " > {OUT_CANDS_FILE}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ " " PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " This program reads from standard input a candidate" \ " list (in the {msm_cand_vec_write} \".cdv\" file format), and" \ " cuts any large candidates into smaller separate candidates. Candidates" \ " or candidate pieces that are have too few rungs are discarded. The" \ " resulting candidate list is written to standard output in the same format.\n " \ "\n" \ "OPTIONS\n" \ " -maxSize {MAX_SIZE}\n" \ " This mandatory argument specifies the maximum length of output" \ " candidates. Input candidates that have more than {MAX_SIZE} rungs are" \ " chopped into one or more non-overlapping pieces smaller than that limit. If this argument is" \ " omitted, there is no limit to the candidate's length.\n" \ "\n" \ " -minSize {MIN_SIZE}\n" \ " This optional argument specifies the minimum length of output" \ " candidates. Input candidates or candidate pieces that have fewer than {MIN_SIZE} rungs" \ " are discarded. If this argument is omitted, there is no minimum length.\n" \ "\n" \ " -idealSize {IDEAL_SIZE}\n" \ " This optional argument specifies the preferred size of pieces if a candidate" \ " has to be split because it exceeds {MAX_SIZE}. If this argument is" \ " omitted, assumes {IDEAL_SIZE = MAX_SIZE}.\n " \ "\n" \ " -verbose \n" \ " If present, the program will print the main parameters of" \ " every candidate read from the input file and its fate (accepted," \ " rejected or chopped).\n" \ "\n" \ argparser_help_info_HELP_INFO "\n" \ "SEE ALSO\n" \ " dm_cands_scramble(1)\n" \ "\n" \ "AUTHOR\n" \ " This program was created in oct/2013 by J. Stolfi." \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dm_chop_cands_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef struct options_t { int32_t maxSize; /* Maximum candidate size. */ int32_t minSize; /* Minimum candidate size. */ int32_t idealSize; /* Ideal piece size when chopping a candidate. */ bool_t verbose; /* TRUE to print report. */ } options_t; int main(int argc, char**argv); options_t *dmcc_parse_options(int argc, char**argv); /* Parses the command line options, packs them into a {options_t} record. */ void dmcc_chop_all_cands ( msm_cand_vec_t *cdvP, int32_t max_rungs, int32_t min_rungs, int32_t ideal_rungs, bool_t verbose ); /* Chops any candidate with more than {max_rungs} rungs into pieces with about {ideal_rungs} rungs. Discards candidates or pieces with fewer than {min_rungs} rungs. */ void dmcc_chop_cand ( msm_cand_t* cdP, int32_t max_rungs, int32_t min_rungs, int32_t ideal_rungs, bool_t verbose, msm_cand_vec_t* cdvP, int *ncP ); /* Chops the candidate {*cdP} (which is expected to have more than {max_rungs} rungs) into candidates with about {ideal_rungs} rungs. The pieces will have at most {max_rungs} and at least {min_rungs} rung each. Requires {min_rungs <= ideal_rungs <= max_rungs}. The pieces are appended to the list {*cdvP} starting at element {*ncP}, which is incremented accordingly. The vector {*cdvP} is expanded if needed but not trimmed at the end. */ int main(int argc, char**argv) { options_t *o = dmcc_parse_options(argc, argv); msm_cand_vec_t cdv = msm_cand_vec_read(stdin); dmcc_chop_all_cands(&cdv, o->maxSize, o->minSize, o->idealSize, o->verbose); msm_cand_vec_write(stdout, &cdv); return 0; } void dmcc_chop_all_cands ( msm_cand_vec_t *cdvP, int32_t max_rungs, int32_t min_rungs, int32_t ideal_rungs, bool_t verbose ) { int nc_in = cdvP->ne; msm_cand_vec_t cdv_out = msm_cand_vec_new(nc_in); int nc_cut = 0; /* Number of candidates chopped. */ int nc_acp = 0; /* Number of candidates accepted whole. */ int nc_rej = 0; /* Number of candidates rejected for too small. */ int nc_out = 0; /* Total number of candidates returned. */ int ic; for(ic = 0; ic < nc_in; ic++) { /* Get the next input candidate {cd}: */ msm_cand_t *cdP = &(cdvP->e[ic]); if (verbose) { msm_cand_write(stderr, NULL, cdP, 7, 6, 9, NULL, FALSE); } int nr = msm_pairing_num_rungs(cdP->pr); if (nr > max_rungs) { if (verbose) { fprintf(stderr, " (chopped)\n"); } dmcc_chop_cand(cdP, max_rungs, min_rungs, ideal_rungs, verbose, &cdv_out, &nc_out); msm_pairing_free(cdP->pr); nc_cut++; } else if (nr >= min_rungs) { msm_cand_vec_expand(&cdv_out, nc_out); cdv_out.e[nc_out] = (*cdP); nc_out++; if (verbose) { fprintf(stderr, " (accepted)\n"); } nc_acp++; } else { if (verbose) { fprintf(stderr, " (rejected)\n"); } nc_rej++; } } msm_cand_vec_trim(&cdv_out, nc_out); free(cdvP->e); (*cdvP) = cdv_out; fprintf(stderr, "%7d candidates scanned\n", nc_in); fprintf(stderr, "%7d accepted\n", nc_acp); fprintf(stderr, "%7d rejected\n", nc_rej); fprintf(stderr, "%7d chopped\n", nc_cut); fprintf(stderr, "%7d candidates returned\n", nc_out); } void dmcc_chop_cand ( msm_cand_t* cdP, int32_t max_rungs, int32_t min_rungs, int32_t ideal_rungs, bool_t verbose, msm_cand_vec_t* cdvP, int *ncP ) { assert(min_rungs <= ideal_rungs); assert(ideal_rungs <= max_rungs); msm_pairing_t *pr = cdP->pr; int nr = msm_pairing_num_rungs(pr); assert(nr > max_rungs); msm_seq_desc_t *xp = &(cdP->seq[0]); msm_seq_desc_t *yp = &(cdP->seq[1]); /* Compute ideal number of pieces {np}: */ int np = (int)msm_round(((double)nr)/ideal_rungs); /* Make sure that pieces are greater than {min_rungs}: */ while (np*min_rungs > nr) { np--; } assert(np > 0); /* Extract the pieces: */ int nc = (*ncP); /* Number of candidates. */ int32_t ini = 0; /* Start of new piece (raw). */ int ip; for (ip = 0; ip < np; ip++) { /* Compute start of next piece for even division into {np} pieces: */ int32_t nxt = (int32_t)msm_round(((double)(ip+1)*nr)/np); /* Trim piece ends if needed to fit max length: */ int32_t ng_raw = nxt - ini; /* Length of raw piece in rungs. */ assert(ng_raw >= min_rungs); int32_t ng_cut = (int32_t)imin(max_rungs, ng_raw); /* Length of piece to cut. */ int32_t skip = (ng_raw - ng_cut)/2; /* How many rungs to skip from start of raw piece. */ int32_t ini_cut = ini + skip; /* Start of cutout. */ int32_t fin_cut = ini_cut + ng_cut - 1; /* End of cutout. */ /* Extract the piece's rungs: */ msm_pairing_t *pr_cut = msm_pairing_sub_copy(pr, ini_cut, fin_cut); /* Output the piece as a new candidate: */ msm_cand_vec_expand(cdvP, nc); msm_cand_t* coP = &(cdvP->e[nc]); (*coP) = msm_cand_from_pairing(xp, yp, pr_cut, cdP->score/np); if (verbose) { msm_cand_write(stderr, " ", coP, 7, 6, 9, "\n", FALSE); } /* Prepare for next chunk: */ ini = nxt; nc++; } (*ncP) = nc; } options_t *dmcc_parse_options(int argc, char**argv) { options_t *o = (options_t *)notnull(malloc(sizeof(options_t)), "no mem"); argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); argparser_get_keyword(pp, "-maxSize"); o->maxSize = (int)argparser_get_next_int(pp, 1, INT_MAX); if (argparser_keyword_present(pp, "-idealSize")) { o->idealSize = (int)argparser_get_next_int(pp, 1, o->maxSize); } else { o->idealSize = o->maxSize; } if (argparser_keyword_present(pp, "-minSize")) { o->minSize = (int)argparser_get_next_int(pp, 0, o->idealSize); } else { o->minSize = 0; } o->verbose = argparser_keyword_present(pp, "-verbose"); argparser_skip_parsed(pp); argparser_finish(pp); return o; }