#define PROG_NAME "dm_compute_weights" #define PROG_DESC "computes the scoring weights for true/false candidate discrimination" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:04:39 by stolfi */ #define dm_cand_filter_C_COPYRIGHT \ "Copyright © 2012 by the the State University of Campinas (UNICAMP)" \ " and the Federal Fluminense University (UFF)" #define PROG_HELP \ PROG_NAME " \\\n" \ " -seqA {ID_A} {NAME_A} {FILENAME_A} \\\n" \ " -seqB {ID_B} {NAME_B} {FILENAME_B} \\\n" \ " -candF {CAND_FILE_F} \\\n" \ " -candT {CAND_FILE_T} \\\n" \ " [ -minCandSize {MINCANDSIZE} ] \\\n" \ " [ -maxCandSize {MAXCANDSIZE} ] \\\n" \ " [ -adjust {MAXITERATIONS} ] \\\n" \ " -outPrefix {OUT_PREFIX}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ " " PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " This program reads a candidate file {CAND_FILE_T} containing correct pairings," \ " another one {CAND_FILE_F} containing random parings," \ " and the corresponding sequences {FILENAME_A} and {FILENAME_B}, and computes" \ " a estimate for the weights for dm_match for such candidates.\n " \ "\n" \ "OUTPUT FILES\n" \ " All output files will have names starting with {OUT_PREFIX}. They include:\n" \ "\n" \ " {OUT_PREFIX}-win.txt The initial guess for the scoring weights.\n" \ " {OUT_PREFIX}-wgt.txt The final scoring weights.\n" \ " {OUT_PREFIX}-svt.txt The score counts for all cands in the SVM training format.\n" \ "\n" \ "OPTIONS\n" \ " -minCandSize {MINCANDSIZE}\n" \ " -maxCandSize {MAXCANDSIZE}\n" \ " These optional arguments specify the minimum and maximum number of rungs in" \ " the candidates. Any candidate that exceeds these" \ " limits (if given) will be excluded from the analysis." \ "\n" \ " -adjust MAXITERATIONS}\n" \ " This optional argument, if given and positive, specifies the number of iterations" \ " of the local barycenter bisector weight-adjustment heuristic to use after the" \ " initial barycenter bisector guess. If not given or zero, the heuristic will not be used." \ "\n" \ argparser_help_info_HELP_INFO "\n" \ "SEE ALSO\n" \ " dnae_seq_filter(1)\n" \ "\n" \ "AUTHOR\n" \ " This program was created on 12/nov/2012 by J. Stolfi and R. Saracchini.\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dm_cand_filter_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef struct options_t { /* Sequence parameters: */ msm_seq_id_t seqA_id; char *seqA_name; char *seqA_filename; msm_seq_id_t seqB_id; char *seqB_name; char *seqB_filename; /* Candidate parameters: */ char *candF_filename; char *candT_filename; /* Analysis parameters: */ int minCandSize; /* Min number of rungs in candidates. */ int maxCandSize; /* Max number of rungs in candidates. */ int adjust_maxIter; /* Max iteration of the local barycenter weight adjustment heuristic. */ /* Output parameters: */ char *outPrefix; /* Output file name prefix (minus extensions). */ } options_t; int main(int argc, char**argv); options_t *dmcw_parse_options(int argc, char**argv); /* Parses the command line options, packs them into a {options_t} record. */ msm_cand_vec_t dmcw_read_cand_vec(char* filename); dnae_seq_t dmcw_read_encoded_seq(char* filename); void dmcw_trim_cand_vec(msm_cand_vec_t *cdv, int min, int max); void dmcw_write_weights(char *outPrefix, char *tag, char *title, dm_score_rec_t *wt); void dmcw_write_stats(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *wt); void dmcw_write_SVM_training_data(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b); void dmcw_write_weights(char *outPrefix, char *tag, char *title, dm_score_rec_t *wt); int main(int argc, char**argv) { options_t *o = dmcw_parse_options(argc, argv); msm_cand_vec_t cdv[2]; int class; for (class = 0; class <= 1; class++) { char *fname = (class == 0 ? o->candF_filename : o->candT_filename); char *cname = (class == 0 ? "false" : "true"); cdv[class] = dmcw_read_cand_vec(fname); int n_orig = cdv[class].ne; dmcw_trim_cand_vec(&(cdv[class]), o->minCandSize, o->maxCandSize); int n_trim = cdv[class].ne; fprintf(stderr,"%s candidates - selected %d out of %d\n", cname, n_trim, n_orig); } dnae_seq_t seq_a = dmcw_read_encoded_seq(o->seqA_filename); dnae_seq_t seq_b = dmcw_read_encoded_seq(o->seqB_filename); /* Write data for SVM training: */ dmcw_write_SVM_training_data(o->outPrefix, cdv, &seq_a, &seq_b); /* Compute score weights by the barycenter bisector heuristic: */ dm_score_rec_t wt; dm_classify_determine_score_weights(&(cdv[0]), &(cdv[1]), &seq_a, &seq_b, &wt); dm_score_normalize_weights(&wt); dmcw_write_weights(o->outPrefix, "win", "initial weights", &wt); /* Adjust score weights by the local barycenter bisector heuristic: */ if (o->adjust_maxIter > 0) { fprintf(stderr,"adjusting weights - %d iterations\n", o->adjust_maxIter); dm_classify_adjust_score_weights(&(cdv[0]), &(cdv[1]), &seq_a, &seq_b, o->adjust_maxIter, &wt); } dmcw_write_weights(o->outPrefix, "wgt", "final weights", &wt); dmcw_write_stats(o->outPrefix, cdv, &seq_a, &seq_b, &wt); return 0; } void dmcw_write_stats(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *wt) { char* fname = NULL; char *fname = jsprintf("%s-sts.txt", outPrefix); FILE* wr = open_write(fname, TRUE); int ngud[2]; /* Number of correctly classified cands. */ int nbad[2]; /* Number of incorrectly classified cands. */ int class; for (class = 0; class <= 1; class++) { char *cname = (class == 0 ? "false" : "true"); int sgn = 2*class - 1; /* {-1} for false, {+1} for true. */ ngud[class] = 0; nbad[class] = 0; fprintf(wr,"# %s candidates\n", cname); int i; for (i = 0; i < cdv[class].ne; i++) { msm_cand_t *cd = &(cdv[class].e[i]); msm_rung_vec_t gv = msm_pairing_get_rungs(cd->pr); dm_score_rec_t sc; dm_score_count_clear(&sc); dm_score_count_rung_vec(&sc, &gv, seq_a, seq_b); dm_score_count_rung_vec_check(&sc, &gv); cd->score = dm_score_rung_vec(wt, &gv, seq_a, seq_b); fprintf(wr,"%+2d %d %+9.6lf", sgn, i, cd->score); dm_score_rec_print(wr, NULL, &sc, "\n"); if (cd->score*sgn >= 0) { ngud[class]++; } else { nbad[class]++; } free(gv.e); } } for (class = 0; class <= 1; class++) { char *cname = (class == 0 ? "false" : "true"); fprintf(wr,"\n"); fprintf(wr,"# CLASSIFICATION SUMMARY\n"); int ntot = ngud[class] + nbad[class]; double pcerr = (100.0*nbad[class])/((double)ntot); fprintf ( wr, "# %-5s candidates: %5d total, %5d correct, %5d incorect (%4.2f%%)\n", cname, ntot, ngud[class], nbad[class], pcerr ); } fclose(wr); } void dmcw_write_SVM_training_data(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b) { char* fname = NULL; char *fname = jsprintf("%s-svm.txt", outPrefix); FILE* wr = open_write(fname, TRUE); dm_classify_write_svm_file(wr, &(cdv[0]), &(cdv[1]), seq_a, seq_b); fclose(wr); free(fname); } void dmcw_write_weights(char *outPrefix, char *tag, char *title, dm_score_rec_t *wt) { char *fname = jsprintf("%s-%s.txt", outPrefix, tag); FILE *wr = open_write(fname, TRUE); dm_score_rec_print(stderr, NULL, wt, "\n"); fclose(wr); free(fname); /* Report to [stderr}: */ fprintf(stderr,"%s:\n", title); dm_score_rec_print(stderr, " ", wt, "\n"); } msm_cand_vec_t dmcw_read_cand_vec (char* filename) { FILE* arq_cdv = open_read(filename,TRUE); msm_cand_vec_t cdv = msm_cand_vec_read(arq_cdv); fclose(arq_cdv); return cdv; } dnae_seq_t dmcw_read_encoded_seq (char* filename) { FILE* arq_seq = open_read(filename,TRUE); dnae_seq_t seq = dnae_seq_read(arq_seq); fclose(arq_seq); return seq; } void dmcw_trim_cand_vec (msm_cand_vec_t *cdv, int minCandSize, int maxCandSize) { int count = 0; int i; for(i = 0; i < cdv->ne; i++){ msm_cand_t *cd = &(cdv->e[i]); msm_pairing_t *pr = cd->pr; int nr = msm_pairing_num_rungs(pr); if ((nr >= minCandSize) && (nr <= maxCandSize)) { cdv->e[count] = *cd; count++; } } msm_cand_vec_trim(cdv, count); } options_t *dmcw_parse_options(int argc, char**argv) { options_t *o = (options_t *)notnull(malloc(sizeof(options_t)), "no mem"); argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); argparser_get_keyword(pp, "-seqA"); o->seqA_id = (msm_seq_id_t)argparser_get_next_int(pp, 0, INT_MAX); o->seqA_name = argparser_get_next_non_keyword(pp); o->seqA_filename = argparser_get_next_non_keyword(pp); argparser_get_keyword(pp, "-seqB"); o->seqB_id = (msm_seq_id_t)argparser_get_next_int(pp, 0, INT_MAX); o->seqB_name = argparser_get_next_non_keyword(pp); o->seqB_filename = argparser_get_next_non_keyword(pp); argparser_get_keyword(pp,"-candF"); o->candF_filename = argparser_get_next_non_keyword(pp); argparser_get_keyword(pp,"-candT"); o->candT_filename = argparser_get_next_non_keyword(pp); if(argparser_keyword_present(pp,"-minCandSize")) { o->minCandSize = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->minCandSize = 0; } if(argparser_keyword_present(pp,"-maxCandSize")) { o->maxCandSize = (int)argparser_get_next_int(pp, o->minCandSize, INT_MAX); } else { o->maxCandSize = INT_MAX; } if (argparser_keyword_present(pp, "-adjust")) { o->adjust_maxIter = (int)argparser_get_next_int(pp, o->minCandSize, INT_MAX); } else { o->adjust_maxIter = 0; } argparser_get_keyword(pp,"-outPrefix"); o->outPrefix = argparser_get_next(pp); argparser_finish(pp); return o; }