#define PROG_NAME "dm_svm_training" #define PROG_DESC "generation of training files for LIBLINEAR/LIBSVM for weight determination" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:04:18 by stolfi */ #define dm_svm_training_C_COPYRIGHT \ "Copyright © 2013 by the State University of Campinas (UNICAMP)" \ " and the Federal Fluminense University (UFF)" #define PROG_HELP \ PROG_NAME " \\\n" \ " -seqA {ID_A} {NAME_A} {FILENAME_A} \\\n" \ " -seqB {ID_B} {NAME_B} {FILENAME_B} \\\n" \ " -candFile {CANDFILE} \\\n" \ " -minLevel {MIN_LEVEL} -maxLevel {MAX_LEVEL} \\\n" \ " -initFilter " wt_table_args_parse_weights_HELP " \\\n" \ " -incrFilter " wt_table_args_parse_weights_HELP " \\\n" \ " [ -maxCands {MAX_CANDS} ] \\\n" \ " -lMin {LMIN} -lMax {LMAX} \\\n" \ " [ -refine \n" \ " { -scores {LEVEL} " dm_score_args_HELP " }.. \\\n" \ " -delta {DELTA} ] [ -kappa {KAPPA} ] [ -maxUnp {MAX_UNP}\n" \ " [ -expand {EXPAND} ] [ -shrink {SHRINK} ] \\\n" \ " ] \\\n" \ " -outPrefix {OUT_PREFIX}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ " " PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " This program reads a candidate file and generates training files " \ " suitable for the determination of a hyperplane or a set of supporting" \ " vector computed by the LIBLINEAR or LIBSVM binaries." \ "\n" \ " The program first reads two nucleotide sequences" \ " and filters them at various scales of resolution." \ " Then, for each level {K} between {MIN_LEVEL} and {MAX_LEVEL} the" \ " program then trims each of the candidates to various" \ " lengths, obtaining a set of nested sub-candidates that will have size" \ " between [{LMIN},{LMAX}] at level {K}." \ " Each of these sub-candidates is filtered and optionally refined (once), and" \ " its length and score" \ " are written out to disk as well the equivalent SVM training file.\n" \ "\n" \ "OUTPUT FILES\n" \ " All output files will have names starting with {OUT_NAME}.\n" \ "\n" \ " The main output files are called \"{OUT_NAME}-L{LL}-svvmtrain.txt\" where" \ " {LL} is a two digit filtering level. These files are the SVM training files." \ " The program also writes out the candidates for each" \ " level {LL} and eack kind {K}, to a file called \"{OUT_NAME}-{LL}-{K}.cdv\"," \ " using the procedure {msm_cand_vec_write}, where {K} is {true}" \ " or {false} depending if it is a set of good or false candidates.\n" \ "\n" \ "OPTIONS\n" \ " -minLevel {MIN_LEVEL}\n" \ " -maxLevel {MAX_LEVEL}\n" \ " These mandatory arguments specify the" \ " minimum and maximum level of filtering to be" \ " applied. Level 0 is the unfiltered" \ " sequence, and each subsequent level is sampled at half" \ " the frequency of the previous one.\n" \ "\n" \ " -initFilter " wt_table_args_parse_weights_HELP " \n" \ " -incrFilter " wt_table_args_parse_weights_HELP " \n" \ " These mandatory arguments specify the" \ " weights of the filter to be used at the" \ " first filtering step and at subsequent" \ " filtering steps. " wt_table_args_parse_weights_norm_sum_INFO "\n" \ "\n" \ " -refine\n" \ " This optional argument specifies that each sub-candidates must be refined.\n" \ "\n" \ " -scores " dm_score_args_HELP "\n" \ " This mandatory argument specifies the" \ " scoring of steps in a pairing. " dm_score_args_INFO "\n" \ "\n" \ " -delta {DELTA}\n" \ " This optional argument specifies the" \ " amount of adjustment allowed for the X and Y coordinates of" \ " internal rungs of each pairing. The default is 3.\n" \ "\n" \ " -kappa {KAPPA}\n" \ " This optional argument specifies the" \ " amount of X and Y extension allowed" \ " at either end of each pairing. The default is 6.\n" \ "\n" \ " -maxCands {MAX_CANDS}\n" \ " If this optional argument is specfied, each training set" \ " (true or false) will be truncated after generating {MAX_CANDS} candidates.\n" \ "\n" \ " -maxUnp {MAX_UNP}\n" \ " Max unpaired samples at each step used for candidate refining.\n" \ "\n" \ " -expand {EXPAND}\n" \ " This optional argument specifies by how much the {R}-range" \ " of a refined candidate may extend beyond the" \ " original candidate's {R}-range, in both directions. If omitted, or" \ " if {EXPAND} is zero, the refined" \ " {R}-range will be a subset of the orginal range.\n" \ "\n" \ " -shrink {SHRINK}\n" \ " This optional argument specifies by how much the {R}-range of" \ " the refined candidate may shrink into the original" \ " candidate's {R}-range, in each direction. If omitted, or" \ " if {SHRINK} is zero, the refined {R}-range" \ " will be a superset of the original range.\n" \ "\n" \ " -lMin {LMIN} - lMax{LMAX} \n" \ " Size range of subcandidates allowed to be used for training at level {K}.\n" \ "\n" \ argparser_help_info_HELP_INFO "\n" \ "SEE ALSO\n" \ " dm_match(1)\n" \ "\n" \ "AUTHOR\n" \ " This program was created on 21/dec/2006 by J. Stolfi.\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dm_svm_training_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef struct options_t { int lMin; int lMax; /* Input data*/ char *candFile; /* Candidate file */ msm_seq_id_t idA,idB; /* Sequence IDs */ char *nameA, *nameB; /* Sequence Names */ char *seqA, *seqB; /* Sequence filenames*/ /* Filtering parameters: */ int maxCands; int minLevel; /* Minimum analysis level. */ int maxLevel; /* Maximum filtering and analysis level. */ double_vec_t w0; /* Filter weights for first stage (level 0 to level 1). */ double_vec_t w1; /* Filter weights for subsequent stages. */ /* Scoring parameters: */ dm_score_rec_t *sc; /* Scoring criterion. */ /* Refinement parameters: */ bool_t refine; /*TRUE refine the sub-candidates*/ int delta; /* Half-width of tableau around original pairing. */ int kappa; /* Extension of tableau beyond ends of pairing. */ int expand; /* How much the refined {R}-range may expand beyond original {R}-range at level 0. */ int shrink; /* How much the refined {R}-range may shrink into the original {R}-range at level 0. */ int maxUnp; /* We found out: the maximum unpaired samples per step */ /* Output parameters: */ char *outPrefix; /* Output file name prefix (minus extensions). */ } options_t; options_t *parse_args(int argc, char**argv); msm_cand_t *partition_candidate(msm_cand_t *cd, int lmin, int lmax, int k, int *num_new_cands); /* Given a candidate {cd} of level 0 , a given desired level{k} for weight adjusting, it partitions {cd} into a list of small candidates of size [2^k*lmin, 2^k lmax]. The number of candidates is stored into {num_new_cands}. It {cd} is smaller than 2^k*lmin, returns NULL.*/ msm_cand_vec_t prepare_candidate_vector_for_level(msm_cand_vec_t *cdv, int lmin, int lmax, int k, int maxCands); /*Creates a new cand vector with candidates trimmed with {partition_candidate} given a level */ void filter_sequence(options_t *o, dnae_seq_t *x, dnae_seq_t xf[], int *maxUsefulLevel); void filter_candidates ( options_t *o, msm_cand_vec_t *cdv, dnae_seq_t xf[], dnae_seq_t yf[], msm_cand_vec_t cdvf[], int *maxUsefulLevel ); void gen_biased_derangement(int n, int perm[]); /* Fills {perm[0..n-1]} with a derangement of {0..n-1} that is biased towards the identity, that is, {perm[i]} tends to be close to (but distinct from) {i}. Uses the {jsrandom.h} random generator, in its current state. */ msm_cand_vec_t scramble_cand_vec( msm_cand_vec_t *cdv); msm_cand_vec_t readCDV(char *filename); /* Reads a candidate vector from {filename}. */ int main(int argc, char** argv) { options_t *o = parse_args(argc, argv); bool_t verbose = FALSE; srandom(46150001); /* Initialize the randomizer, for repeatability. */ /* Read input data: */ fprintf(stderr,"Reading input files\n"); msm_cand_vec_t cdv = readCDV(o->candFile); dnae_seq_t seq_a = dnae_seq_read_from_nucleic_file_named(o->seqA, "", "", o->idA, o->nameA, FALSE); dnae_seq_t seq_b = dnae_seq_read_from_nucleic_file_named(o->seqB, "", "", o->idB, o->nameB, FALSE); fprintf(stderr,"Filtering sequences\n"); /* This needs to be done only once: */ dnae_seq_t xf[o->maxLevel + 1]; /* Left sequence of genome at each scale. */ dnae_seq_t yf[o->maxLevel + 1]; /* Right sequence of genome at each scale. */ int maxUsefulLevel_seq = o->maxLevel; filter_sequence(o, &seq_a, xf, &maxUsefulLevel_seq); filter_sequence(o, &seq_b, yf, &maxUsefulLevel_seq); int i; for (i = 0; i <= maxUsefulLevel_seq; i++) { assert((xf[i].sd.id == o->idA) && (strcmp(xf[i].sd.name,o->nameA) == 0)); assert((yf[i].sd.id == o->idB) && (strcmp(yf[i].sd.name,o->nameB) == 0)); } int k; for (k = o->minLevel; k <= o->maxLevel; k++) { fprintf(stderr,"Processing Level %02d \n",k); if(k > maxUsefulLevel_seq) { fprintf(stderr,"Sequences are too short for level %02d\n",k); break; } /* Adjust sequences for the desired level: */ msm_cand_vec_t cdvT = prepare_candidate_vector_for_level(&cdv, o->lMin, o->lMax, k, o->maxCands); /* Create false candidates: */ msm_cand_vec_t cdvF = scramble_cand_vec(&cdvT); fprintf(stderr,"Trimmed candidates - %d good and %d false (%d from original)\n",cdvT.ne, cdvF.ne,cdv.ne); /* Now filter sequences and candidates: */ fprintf(stderr, "Filtering sequences and candidates Level %02d\n",k); /* This has to be done for each level ! */ int maxUsefulLevel = k; msm_cand_vec_t tcvf[maxUsefulLevel + 1]; /*True candidates at each scale*/ msm_cand_vec_t fcvf[maxUsefulLevel + 1]; /*False candidates at each scale*/ filter_candidates(o, &cdvT, xf, yf, tcvf, &maxUsefulLevel); for(i = 0; i <= maxUsefulLevel_seq; i++) { assert((xf[i].sd.id == o->idA) && (strcmp(xf[i].sd.name,o->nameA) == 0)); assert((yf[i].sd.id == o->idB) && (strcmp(yf[i].sd.name,o->nameB) == 0)); } filter_candidates(o, &cdvF, xf, yf, fcvf, &maxUsefulLevel); for(i = 0; i <= maxUsefulLevel_seq; i++) { assert((xf[i].sd.id == o->idA) && (strcmp(xf[i].sd.name,o->nameA) == 0)); assert((yf[i].sd.id == o->idB) && (strcmp(yf[i].sd.name,o->nameB) == 0)); } if(maxUsefulLevel >= k) { msm_cand_vec_t cdv_rT; msm_cand_vec_t cdv_rF; if(o->refine) { auto double step_score(msm_seq_desc_t *ad, msm_seq_desc_t *bd, msm_rung_t *g0, msm_rung_t *g1); /* Step scoring function. */ double step_score(msm_seq_desc_t *ad, msm_seq_desc_t *bd, msm_rung_t *g0, msm_rung_t *g1) { (void)msm_seq_desc_same_orig_seq(&(xf[k].sd), ad, TRUE); (void)msm_seq_desc_same_orig_seq(&(yf[k].sd), bd, TRUE); return dm_score_step(&(o->sc[k]), g0, g1, &(xf[k]), &(yf[k])); } fprintf(stderr, "Refining candidates Level %02d\n",k); msm_dyn_tableau_t tb = msm_dyn_tableau_new(); /* int n_entries = 0; */ /* int n_steps = 0; */ cdv_rT = msm_cand_vec_refine ( &(tcvf[k]), &(xf[k].sd), &(yf[k].sd), o->delta, o->kappa, o->expand, o->shrink, o->maxUnp, step_score, verbose, &tb, 0, INT_MAX, 0.99 ); cdv_rF = msm_cand_vec_refine ( &(fcvf[k]), &(xf[k].sd), &(yf[k].sd), o->delta, o->kappa, o->expand, o->shrink, o->maxUnp, step_score, verbose, &tb, 0, INT_MAX, 0.99 ); msm_dyn_tableau_free(&tb); } else { cdv_rT = (tcvf[k]); cdv_rF = (fcvf[k]); } fprintf(stderr,"Saving processed files\n"); char *true_cand_prefix = jsprintf("%s-L%02d-true", o->outPrefix, k); msm_cand_vec_write_named(&cdv_rT,true_cand_prefix , "", ".cdv"); char *false_cand_prefix = jsprintf("%s-L%02d-false", o->outPrefix, k); msm_cand_vec_write_named(&cdv_rF,false_cand_prefix , "", ".cdv"); char *arq_svm_filename = jsprintf("%s-L%02d-svmtraining.txt",o->outPrefix, k); FILE *arq_svm = open_write(arq_svm_filename,TRUE); dm_classify_write_svm_file(arq_svm, &cdv_rT, &cdv_rF,&(xf[k]),&(yf[k])); free(true_cand_prefix); free(false_cand_prefix); free(arq_svm_filename); } else { fprintf(stderr,"No candidates at level %02d !!! Maximum level - %d \n",k,maxUsefulLevel); } /* clean-up mess */ int i; for (i = 1; i <= maxUsefulLevel; i++) { msm_cand_vec_free(&(tcvf[i])); msm_cand_vec_free(&(fcvf[i])); } if (maxUsefulLevel < k) { fprintf(stderr,"Skiping further levels\n"); break; } } return 0; } msm_cand_vec_t prepare_candidate_vector_for_level(msm_cand_vec_t *cdv, int lmin, int lmax, int k, int maxCands) { /* first compute how much suitable candidates will remain */ int i; msm_cand_vec_t new_cdv = msm_cand_vec_new(cdv->ne); int num_cands = 0; for (i = 0; i < cdv->ne; i++) { msm_cand_t *cd = &(cdv->e[i]); int num_new_cands; msm_cand_t *cd_list = partition_candidate(cd,lmin,lmax,k, &num_new_cands); /* resize new_cdv if it is too small */ if (num_new_cands > 0) { if (num_new_cands + num_cands > maxCands) { num_new_cands = maxCands - num_cands; } msm_cand_vec_expand(&new_cdv, num_new_cands + num_cands -1); /* copy trimmed candidates to new_cdv */ int j; for(j = 0; j < num_new_cands; j++) { new_cdv.e[num_cands + j] = cd_list[j]; } num_cands+=num_new_cands; if(num_cands >= maxCands) { break; } } } msm_cand_vec_trim(&new_cdv,num_cands); return new_cdv; } msm_cand_t *partition_candidate(msm_cand_t *cd, int lmin, int lmax, int k, int *num_new_cands) { int nr = msm_pairing_num_rungs(cd->pr); if (nr < (pow(2,k)*lmin)) { /* rejected - cant be partitioned */ *num_new_cands = 0; return NULL; } msm_cand_t *cd_list = NULL; /* Points to array of pieces. */ if (nr < (pow(2,k)*lmax)) { /* between limits, copy candidate */ cd_list = (msm_cand_t*)malloc(sizeof(msm_cand_t)); cd_list[0] = (*cd); cd_list[0].pr = msm_pairing_copy(cd->pr); *num_new_cands = 1; } else { /* we have to split the candidate... */ int lmed = (int)floor((3.0*lmin + lmax)/4.0); double lambda = pow(2,k)*lmed; int t = (int)floor(nr/lambda); int size_part = (int)floor(nr/t); int num_cands = (int)floor(nr/(double)size_part); cd_list = (msm_cand_t*)malloc(sizeof(msm_cand_t)*(num_cands)); int sta_v = 0; int current_rung = 0; int ic = 0; while (current_rung < nr) { int size_vr = (current_rung - sta_v + 1); if (size_vr == size_part) { /* copy current rungs to new candidate */ /* fprintf(stderr,"STA: %d END: %d - SIZE %d\n",sta_v,current_rung, size_vr); */ cd_list[ic] = (*cd); /* To copy the header. */ cd_list[ic].pr = msm_pairing_sub_copy(cd->pr, sta_v, current_rung); /* update variables */ sta_v = current_rung + 1; ic++; } current_rung++; } assert(ic == num_cands); *num_new_cands = num_cands; } return cd_list; } void filter_sequence(options_t *o, dnae_seq_t *x, dnae_seq_t xf[], int *maxUsefulLevel) { int maxLev = *maxUsefulLevel; fprintf(stderr, "getting the weight tables ...\n"); char *wname0 = wt_table_make_descr(o->w0.ne, o->w0.e, "%6.4f"); char *wname1 = wt_table_make_descr(o->w1.ne, o->w1.e, "%6.4f"); int8_t ek0 = 0; /* Subsampling of first stage. */ dnae_seq_multi_filter(x, maxLev, &(o->w0), wname0, &(o->w1), wname1, ek0, xf); /* Update max useful level {maxLev}: */ while ((maxLev >= 0) && (dnae_seq_num_datums(&(xf[maxLev])) < 0)) { maxLev--; } fprintf(stderr, "maximum useful level is %d\n", maxLev); *maxUsefulLevel = maxLev; free(wname0); free(wname1); } void filter_candidates ( options_t *o, msm_cand_vec_t *cdv, dnae_seq_t xf[], dnae_seq_t yf[], msm_cand_vec_t cdvf[], int *maxUsefulLevel ) { int maxLev = *maxUsefulLevel; /* Map the candidate vector {cdv} to sucessive scales: */ cdvf[0] = *cdv; int level; for (level = 1; level <= maxLev; level++) { msm_seq_desc_t *xsd_old = &(xf[level-1].sd); msm_seq_desc_t *ysd_old = &(yf[level-1].sd); msm_seq_desc_t *xsd_new = &(xf[level].sd); msm_seq_desc_t *ysd_new = &(yf[level].sd); int nc_old = cdvf[level-1].ne; /* Start new candidate vector: */ cdvf[level] = msm_cand_vec_new(nc_old); int nc_new = 0; /* New candidates are {cdvf[level].e[0..nc_new-1]}. */ int ic; for (ic = 0; ic < nc_old; ic++) { /* Map the candidate {cdvf[level-1].e[ic]} to scale {level}*/ msm_cand_t *ci_old = &(cdvf[level-1].e[ic]); msm_cand_t ci_new; ci_new.seq[0] = xf[level].sd; ci_new.seq[1] = yf[level].sd; assert((ci_new.seq[0].id == o->idA) && (strcmp(ci_new.seq[0].name,o->nameA) == 0)); assert((ci_new.seq[1].id == o->idB) && (strcmp(ci_new.seq[1].name,o->nameB) == 0)); msm_pairing_t *pr_map = msm_pairing_map(ci_old->pr, xsd_old, ysd_old, xsd_new, ysd_new); msm_pairing_free(pr_map); msm_pairing_t *pr_inc = msm_pairing_make_increasing(pr_map, 1, 0); ci_new.pr = msm_pairing_interpolate(pr_inc); msm_pairing_free(pr_inc); if (msm_pairing_num_rungs(ci_new.pr) > 0) { msm_cand_vec_expand(&(cdvf[level]),nc_new); cdvf[level].e[nc_new] = ci_new; nc_new++; } } msm_cand_vec_trim(&(cdvf[level]),nc_new); fprintf(stderr, "level %02d - %04d candidates\n", level, nc_new); } /* Update max useful level {maxLev}: */ while ((maxLev >= 0) && (cdvf[maxLev].ne == 0)) { maxLev--; } fprintf(stderr, "maximum useful level is %d\n", maxLev); *maxUsefulLevel = maxLev; } void gen_biased_derangement(int n, int perm[]) { perm[0] = 0; int i; for (i = 1; i < n ; i++) { /* Choose a random index {j} in {0..i-1}, make {{perm[i],perm[j] = {perm[j],i}}. */ /* So bias {j} towards small {i-perm[j]} and {i-j}. */ int j = i-1; while (j > 0) { int di = i-perm[j]; int dj = i-j; double d2 = ((double)di)*di + ((double)dj)*dj; double r = drandom(); if (r*r*d2 < 6.0) { break; } j--; } assert((j >= 0) && (j < i)); assert((perm[j] >= 0) && (perm[j] < i)); /* Send {i} to {perm[j]}, {j} to {i}: */ int t = perm[j]; perm[j] = i; perm[i] = t; } } msm_cand_vec_t scramble_cand_vec( msm_cand_vec_t *cdv) { auto int compare_cands_by_size(const void *a, const void *b); int compare_cands_by_size(const void *a, const void *b) { msm_cand_t *ca = (msm_cand_t*) a; msm_cand_t *cb = (msm_cand_t*) b; int size_a = msm_pairing_num_rungs(ca->pr); int size_b = msm_pairing_num_rungs(cb->pr); if( size_a == size_b) return 0; return (size_a > size_b ? 1 : -1); } qsort(cdv->e,cdv->ne, sizeof(msm_cand_t), compare_cands_by_size); int nc = cdv->ne; int perm[nc]; /*derangement */ int i; gen_biased_derangement(nc, perm); msm_cand_vec_t cdn = msm_cand_vec_new(nc); for (i = 0; i < nc; i++) { msm_cand_t ci; int j = perm[i]; ci.seq[0] = cdv->e[i].seq[0]; ci.seq[1] = cdv->e[j].seq[1]; ci.score = 0; int ng_i = msm_pairing_num_rungs(cdv->e[i].pr); int ng_j = msm_pairing_num_rungs(cdv->e[j].pr); int ng_min = (int) imin(ng_i,ng_j); /* msm_rung_t gini_i = msm_pairing_get_rung(cdv->e[i].pr,0); */ /* msm_rung_t gfin_i = msm_pairing_get_rung(cdv->e[i].pr,ng_min -1); */ /* */ /* msm_rung_t gini_j = msm_pairing_get_rung(cdv->e[j].pr,0); */ /* msm_rung_t gfin_j = msm_pairing_get_rung(cdv->e[j].pr,ng_min -1); */ /* msm_rung_vec_t rg = msm_rung_vec_new(2); */ /* */ /* rg.e[0] = (msm_rung_t) {{ gini_i.c[0],gini_j.c[1] }}; */ /* rg.e[1] = (msm_rung_t) {{ gfin_i.c[0],gfin_j.c[1] }}; */ /* */ /* msm_pairing_t *pr = msm_pairing_from_rung_vec(&rg); */ /* ci.pr = msm_pairing_interpolate(pr); */ msm_rung_vec_t rg = msm_rung_vec_new(ng_min); int k; for(k = 0; k < ng_min; k++) { msm_rung_t g = msm_pairing_get_rung(cdv->e[i].pr,k); msm_rung_t h = msm_pairing_get_rung(cdv->e[j].pr,k); msm_rung_t nr = (msm_rung_t) {{ g.c[0],h.c[1] }}; rg.e[k] = nr; } msm_rung_vec_t rgn = msm_rung_vec_interpolate(&rg); ci.pr = msm_pairing_from_rung_vec(&rgn); /* msm_pairing_free(pr); */ free(rg.e); /* free(rgn.e); */ cdn.e[i] = ci; } return cdn; } msm_cand_vec_t readCDV(char *filename) { FILE *arq_cdv = open_read(filename,TRUE); msm_cand_vec_t cdv = msm_cand_vec_read(arq_cdv); fclose(arq_cdv); return cdv; } options_t *parse_args(int argc, char**argv) { options_t *o = (options_t *)notnull(malloc(sizeof(options_t)), "no mem"); argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); o->refine = argparser_keyword_present(pp,"-refine"); argparser_get_keyword(pp, "-seqA"); o->idA = (int)argparser_get_next_int(pp,0,INT_MAX); o->nameA = argparser_get_next_non_keyword(pp); o->seqA = argparser_get_next_non_keyword(pp); argparser_get_keyword(pp, "-seqB"); o->idB = (int)argparser_get_next_int(pp,0,INT_MAX); o->nameB = argparser_get_next_non_keyword(pp); o->seqB = argparser_get_next_non_keyword(pp); argparser_get_keyword(pp,"-candFile"); o->candFile = argparser_get_next_non_keyword(pp); argparser_get_keyword(pp, "-initFilter"); o->w0 = wt_table_args_parse_weights(pp, TRUE); argparser_get_keyword(pp, "-incrFilter"); o->w1 = wt_table_args_parse_weights(pp, TRUE); argparser_get_keyword(pp, "-minLevel"); o->minLevel = (int)argparser_get_next_int(pp, 0, 20); argparser_get_keyword(pp, "-maxLevel"); o->maxLevel = (int)argparser_get_next_int(pp, o->minLevel, 20); argparser_get_keyword(pp, "-lMin"); o->lMin = (int)argparser_get_next_int(pp, 0, INT_MAX); argparser_get_keyword(pp, "-lMax"); o->lMax = (int)argparser_get_next_int(pp, o->lMin, INT_MAX); o->sc = NULL; o->delta = 0; o->kappa = 0; o->maxUnp = 0; if (o->refine ) { o->sc = (dm_score_rec_t*)malloc(sizeof(dm_score_rec_t)*(o->maxLevel+1)); int i; for (i =0; i <= o->maxLevel; i++) { o->sc[i].eql = NAN; } for (i =0; i <= o->maxLevel; i++) { argparser_get_keyword(pp, "-scores"); int lvl = (int)argparser_get_next_int(pp,0,o->maxLevel); assert(isnan(o->sc[lvl].eql)); o->sc[lvl] = dm_score_args_parse(pp); } if (argparser_keyword_present(pp, "-delta")) { o->delta = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->delta = 3; } if (argparser_keyword_present(pp, "-kappa")) { o->kappa = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->kappa = 6; } if (argparser_keyword_present(pp, "-expand")) { o->expand = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->expand = 0; } if (argparser_keyword_present(pp, "-shrink")) { o->shrink = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->shrink = 0; } if (argparser_keyword_present(pp, "-maxUnp")) { o->maxUnp = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->maxUnp = 0; /* !!! FIND PROPER DEFAULT */ } } if (argparser_keyword_present(pp, "-maxCands")) { o->maxCands = (int)argparser_get_next_int(pp, 0, INT_MAX); } else { o->maxCands = INT_MAX; } argparser_get_keyword(pp, "-outPrefix"); o->outPrefix = argparser_get_next(pp); argparser_finish(pp); return o; }