/* Program identification strings; {PROG_NAME} and {PROG_DESC} appear in
  the help and info texts defined below. */
#define PROG_NAME "dm_svm_training"
#define PROG_DESC "generation of training files for LIBLINEAR/LIBSVM for weight determination"
#define PROG_VERS "1.0"

/* Last edited on 2024-12-21 14:04:18 by stolfi */

/* Copyright notice; it is inserted in the RIGHTS section of {PROG_INFO}. */
#define dm_svm_training_C_COPYRIGHT \
  "Copyright © 2013  by the State University of Campinas (UNICAMP)" \
  " and the Federal Fluminense University (UFF)"

/* Command-line synopsis.  It is registered with {argparser_set_help} in
  {parse_args} and embedded in the SYNOPSIS section of {PROG_INFO}.
  The bracketing mirrors {parse_args}: "-scores", "-delta", "-kappa",
  "-maxUnp", "-expand" and "-shrink" are only parsed when "-refine" is
  present, and all but "-scores" are optional there. */
#define PROG_HELP \
  PROG_NAME " \\\n" \
  "  -seqA {ID_A} {NAME_A} {FILENAME_A} \\\n" \
  "  -seqB {ID_B} {NAME_B} {FILENAME_B} \\\n" \
  "  -candFile {CANDFILE} \\\n" \
  "  -minLevel {MIN_LEVEL} -maxLevel {MAX_LEVEL}  \\\n" \
  "  -initFilter " wt_table_args_parse_weights_HELP " \\\n" \
  "  -incrFilter " wt_table_args_parse_weights_HELP " \\\n" \
  "  [ -maxCands {MAX_CANDS} ] \\\n" \
  "  -lMin {LMIN} -lMax {LMAX}  \\\n" \
  "  [ -refine \\\n" \
  "    { -scores {LEVEL} " dm_score_args_HELP " }.. \\\n" \
  "    [ -delta {DELTA} ] [ -kappa {KAPPA} ] [ -maxUnp {MAX_UNP} ] \\\n" \
  "    [ -expand {EXPAND} ] [ -shrink {SHRINK} ] \\\n" \
  "  ] \\\n" \
  "  -outPrefix {OUT_PREFIX}"
  
/* Full manual-page text, registered with {argparser_set_info} in
  {parse_args}.  File names, defaults and option dependencies stated here
  follow what {main} and {parse_args} actually do. */
#define PROG_INFO \
  "NAME\n" \
  "  " PROG_NAME " - " PROG_DESC "\n" \
  "\n" \
  "SYNOPSIS\n" \
  "  " PROG_HELP "\n" \
  "\n" \
  "DESCRIPTION\n" \
  "  This program reads a candidate file and generates training files" \
  " suitable for the determination of a hyperplane or a set of support" \
  " vectors by the LIBLINEAR or LIBSVM programs.\n" \
  "\n" \
  "  The program first reads two nucleotide sequences" \
  " and filters them at various scales of resolution." \
  " Then, for each level {K} between {MIN_LEVEL} and {MAX_LEVEL}, the" \
  " program splits each of the candidates into consecutive" \
  " sub-candidates whose size at level {K} is meant to lie" \
  " in the range [{LMIN},{LMAX}].  A matching set of false candidates" \
  " is produced by pairing the two sides of different sub-candidates." \
  " Each of these sub-candidates is filtered and optionally refined (once), and" \
  " the candidates" \
  " are written out to disk as well as the equivalent SVM training file.\n" \
  "\n" \
  "OUTPUT FILES\n" \
  "  All output files will have names starting with {OUT_PREFIX}.\n" \
  "\n" \
  "  The main output files are called \"{OUT_PREFIX}-L{LL}-svmtraining.txt\" where" \
  " {LL} is a two digit filtering level. These files are the SVM training files." \
  "  The program also writes out the candidates for each" \
  " level {LL} and each kind {KIND}, to a file called \"{OUT_PREFIX}-L{LL}-{KIND}.cdv\"," \
  " using the procedure {msm_cand_vec_write_named}, where {KIND} is \"true\"" \
  " or \"false\" depending on whether it is the set of good or of false candidates.\n" \
  "\n" \
  "OPTIONS\n" \
  "  -seqA {ID_A} {NAME_A} {FILENAME_A}\n" \
  "  -seqB {ID_B} {NAME_B} {FILENAME_B}\n" \
  "    These mandatory arguments specify the numeric identifier, the" \
  " name, and the file name of each of the two nucleotide sequences.\n" \
  "\n" \
  "  -candFile {CANDFILE}\n" \
  "    This mandatory argument specifies the name of the file with" \
  " the input candidates.\n" \
  "\n" \
  "  -minLevel {MIN_LEVEL}\n" \
  "  -maxLevel {MAX_LEVEL}\n" \
  "    These mandatory arguments specify the" \
  " minimum and maximum level of filtering to be" \
  " applied.  Level 0 is the unfiltered" \
  " sequence, and each subsequent level is sampled at half" \
  " the frequency of the previous one.\n" \
  "\n" \
  "  -initFilter " wt_table_args_parse_weights_HELP " \n" \
  "  -incrFilter " wt_table_args_parse_weights_HELP " \n" \
  "    These mandatory arguments specify the" \
  " weights of the filter to be used at the" \
  " first filtering step and at subsequent" \
  " filtering steps.  " wt_table_args_parse_weights_norm_sum_INFO "\n" \
  "\n" \
  "  -lMin {LMIN}\n" \
  "  -lMax {LMAX}\n" \
  "    These mandatory arguments specify the size range of the" \
  " sub-candidates allowed to be used for training at level {K}.\n" \
  "\n" \
  "  -maxCands {MAX_CANDS}\n" \
  "    If this optional argument is specified, each training set" \
  " (true or false) will be truncated after generating {MAX_CANDS} candidates.\n" \
  "\n" \
  "  -refine\n" \
  "    This optional argument specifies that each sub-candidate must be" \
  " refined.  The arguments \"-scores\", \"-delta\", \"-kappa\", \"-maxUnp\"," \
  " \"-expand\" and \"-shrink\" are accepted only together with \"-refine\".\n" \
  "\n" \
  "  -scores {LEVEL} " dm_score_args_HELP "\n" \
  "    This argument is required when \"-refine\" is given, and must" \
  " appear once for each {LEVEL} from 0 to {MAX_LEVEL}.  It specifies the" \
  " scoring of steps in a pairing at that level.  " dm_score_args_INFO "\n" \
  "\n" \
  "  -delta {DELTA}\n" \
  "    This optional argument specifies the" \
  " amount of adjustment allowed for the X and Y coordinates of" \
  " internal rungs of each pairing.  The default is 3.\n" \
  "\n" \
  "  -kappa {KAPPA}\n" \
  "    This optional argument specifies the" \
  " amount of X and Y extension allowed" \
  " at either end of each pairing.  The default is 6.\n" \
  "\n" \
  "  -maxUnp {MAX_UNP}\n" \
  "    This optional argument specifies the maximum number of unpaired" \
  " samples at each step used for candidate refining.  The default is 0.\n" \
  "\n" \
  "  -expand {EXPAND}\n" \
  "    This optional argument specifies by how much the {R}-range" \
  " of a refined candidate may extend beyond the" \
  " original candidate's {R}-range, in both directions.  If omitted, or" \
  " if {EXPAND} is zero, the refined" \
  " {R}-range will be a subset of the original range.\n" \
  "\n" \
  "  -shrink {SHRINK}\n" \
  "    This optional argument specifies by how much the {R}-range of" \
  " the refined candidate may shrink into the original" \
  " candidate's {R}-range, in each direction.  If omitted, or" \
  " if {SHRINK} is zero, the refined {R}-range" \
  " will be a superset of the original range.\n" \
  "\n" \
  "  -outPrefix {OUT_PREFIX}\n" \
  "    This mandatory argument specifies the common prefix of all" \
  " output file names.\n" \
  "\n" \
  argparser_help_info_HELP_INFO "\n" \
  "SEE ALSO\n" \
  "  dm_match(1)\n" \
  "\n" \
  "AUTHOR\n" \
  "  This program was created on 21/dec/2006 by J. Stolfi.\n" \
  "\n" \
  "WARRANTY\n" \
  argparser_help_info_NO_WARRANTY "\n" \
  "\n" \
  "RIGHTS\n" \
  "  " dm_svm_training_C_COPYRIGHT ".\n" \
  "\n" \
  argparser_help_info_STANDARD_RIGHTS

#include <stdio.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>

#include <affirm.h>
#include <wt_table.h>
#include <wt_table_args_parse.h>
#include <float_image.h>
#include <uint16_image.h>
#include <jsmath.h>
#include <jsrandom.h>
#include <jsfile.h>
#include <argparser.h>

#include <dnae_seq.h>
#include <dnae_seq_multi.h>
#include <dnae_nucleic.h>
#include <dnae_test_tools.h>

#include <msm_rung.h>
#include <msm_pairing.h>
#include <msm_cand.h>
#include <msm_cand_refine.h>
#include <msm_dyn.h>
#include <msm_multi.h>

#include <msm_test_tools.h>
#include <msm_image_tools.h>
#include <msm_image.h>

#include <dm_basic.h>
#include <dm_classify.h>
#include <dm_score.h>

/* Command-line options, as filled in by {parse_args}. */
typedef struct options_t
  { /* Sub-candidate size limits (passed on to {partition_candidate}): */
    int lMin;              /* Value of "-lMin". */
    int lMax;              /* Value of "-lMax"; parsed with {lMin} as its lower bound. */

    /* Input data: */
    char *candFile;        /* Name of the candidate file ("-candFile"). */
    msm_seq_id_t idA;      /* Identifier of sequence A ("-seqA", first argument). */
    msm_seq_id_t idB;      /* Identifier of sequence B ("-seqB", first argument). */
    char *nameA;           /* Name of sequence A ("-seqA", second argument). */
    char *nameB;           /* Name of sequence B ("-seqB", second argument). */
    char *seqA;            /* File name of sequence A ("-seqA", third argument). */
    char *seqB;            /* File name of sequence B ("-seqB", third argument). */

    /* Filtering parameters: */
    int maxCands;          /* Limit on candidates per level ("-maxCands"); {INT_MAX} if not given. */
    int minLevel;          /* Minimum analysis level. */
    int maxLevel;          /* Maximum filtering and analysis level. */
    double_vec_t w0;       /* Filter weights for the first stage (level 0 to level 1). */
    double_vec_t w1;       /* Filter weights for the subsequent stages. */

    /* Scoring parameters: */
    dm_score_rec_t *sc;    /* Scoring criteria {sc[0..maxLevel]}; NULL unless {refine}. */

    /* Refinement parameters: */
    bool_t refine;         /* TRUE to refine the sub-candidates ("-refine"). */
    int delta;             /* Half-width of tableau around original pairing. */
    int kappa;             /* Extension of tableau beyond ends of pairing. */
    int expand;            /* How much the refined {R}-range may expand beyond the original {R}-range at level 0. */
    int shrink;            /* How much the refined {R}-range may shrink into the original {R}-range at level 0. */
    int maxUnp;            /* Maximum unpaired samples per step ("-maxUnp"). */

    /* Output parameters: */
    char *outPrefix;       /* Output file name prefix (minus extensions). */
  } options_t;

options_t *parse_args(int argc, char**argv);
  /* Parses the command line {argv[0..argc-1]} with the {argparser.h}
    tools and returns a newly allocated {options_t} record. */

msm_cand_t *partition_candidate(msm_cand_t *cd, int lmin, int lmax, int k, int *num_new_cands);
  /* Given a level-0 candidate {cd} and a target level {k}, cuts {cd}
    into pieces meant for training at level {k}. If {cd} has fewer than
    {2^k*lmin} rungs, stores 0 in {*num_new_cands} and returns NULL. If
    it has fewer than {2^k*lmax} rungs, the result is a single copy of
    {cd}. Otherwise {cd} is cut into consecutive pieces of equal length;
    trailing rungs that do not fill a whole piece are left out. The
    result is a {malloc}ed array with {*num_new_cands} elements, each
    with its own copy of the relevant rungs. */

msm_cand_vec_t prepare_candidate_vector_for_level(msm_cand_vec_t *cdv, int lmin, int lmax, int k, int maxCands);
  /* Creates a new candidate vector with the pieces that
    {partition_candidate} produces for each element of {cdv} at level
    {k}, stopping once {maxCands} pieces have been collected. */

void filter_sequence(options_t *o, dnae_seq_t *x, dnae_seq_t xf[], int *maxUsefulLevel);
  /* Fills {xf[0..]} with filtered versions of {x} by calling
    {dnae_seq_multi_filter} with the weight tables {o->w0} and {o->w1},
    up to level {*maxUsefulLevel} as given on entry. On return,
    {*maxUsefulLevel} is the highest level judged useful, based on
    {dnae_seq_num_datums} of the entries of {xf}. */

void filter_candidates
  ( options_t *o,
    msm_cand_vec_t *cdv,
    dnae_seq_t xf[],
    dnae_seq_t yf[],
    msm_cand_vec_t cdvf[],
    int *maxUsefulLevel
  );
  /* Sets {cdvf[0]} to a (shallow) copy of {*cdv} and then, for each
    {level} from 1 to the entry value of {*maxUsefulLevel}, builds
    {cdvf[level]} by mapping the candidates of {cdvf[level-1]} onto the
    sequences {xf[level]} and {yf[level]}; candidates whose mapped
    pairing has no rungs are omitted. On return, {*maxUsefulLevel} is
    the highest level whose vector is not empty (possibly {-1}). */

void gen_biased_derangement(int n, int perm[]);
  /* Fills {perm[0..n-1]} with a derangement of {0..n-1} that is biased
    towards the identity, that is, {perm[i]} tends to be close to (but
    distinct from) {i}. Uses the {jsrandom.h} random generator, in its
    current state. Callers should pass {n >= 2} to get a true
    derangement; with {n = 1} the only possible result is {perm[0] = 0}. */

msm_cand_vec_t scramble_cand_vec( msm_cand_vec_t *cdv);
  /* Returns a new vector of "false" candidates with the same number of
    elements as {cdv}. As a side effect, {cdv} is sorted in place by
    number of rungs. Element {i} of the result combines the {seq[0]}
    side of candidate {i} with the {seq[1]} side of candidate {perm[i]},
    where {perm} comes from {gen_biased_derangement}. */

msm_cand_vec_t readCDV(char *filename);
  /* Reads a candidate vector from {filename}. */

int main(int argc, char** argv)
  {
    /* Program entry point.  Reads the candidates and the two sequences,
      filters the sequences once for all levels, and then, for each level
      {k} in {minLevel..maxLevel}: builds the true sub-candidates, derives
      the false ones from them, maps both sets up to level {k}, optionally
      refines them, and writes the two ".cdv" files and the SVM training
      file for that level.  Stops early when the sequences or the
      candidates do not reach level {k}.  Returns 0. */
    options_t *o = parse_args(argc, argv);
    bool_t verbose = FALSE;
    srandom(46150001); /* Initialize the randomizer, for repeatability. */

    /* Read input data: */
    fprintf(stderr,"Reading input files\n");
    msm_cand_vec_t cdv = readCDV(o->candFile);
    dnae_seq_t seq_a =  dnae_seq_read_from_nucleic_file_named(o->seqA, "", "", o->idA, o->nameA, FALSE);
    dnae_seq_t seq_b =  dnae_seq_read_from_nucleic_file_named(o->seqB, "", "", o->idB, o->nameB, FALSE);

    fprintf(stderr,"Filtering sequences\n");
    /* This needs to be done only once: */
    dnae_seq_t xf[o->maxLevel + 1];  /* Left sequence of genome at each scale. */
    dnae_seq_t yf[o->maxLevel + 1];  /* Right sequence of genome at each scale. */
    int maxUsefulLevel_seq =  o->maxLevel;
    /* The second call starts from the level found useful for the first sequence: */
    filter_sequence(o, &seq_a, xf, &maxUsefulLevel_seq);
    filter_sequence(o, &seq_b, yf, &maxUsefulLevel_seq);

    /* Sanity check: every filtered sequence keeps the id and name given on the command line. */
    int i;
    for (i = 0; i <= maxUsefulLevel_seq; i++)
      { assert((xf[i].sd.id == o->idA) && (strcmp(xf[i].sd.name,o->nameA) == 0));
        assert((yf[i].sd.id == o->idB) && (strcmp(yf[i].sd.name,o->nameB) == 0));
      }

    int k;
    for (k = o->minLevel; k <= o->maxLevel; k++)
      { fprintf(stderr,"Processing Level %02d \n",k);
        if(k > maxUsefulLevel_seq)
          { fprintf(stderr,"Sequences are too short for level %02d\n",k);
            break;
          }
        /* Adjust sequences for the desired level: */
        msm_cand_vec_t cdvT = prepare_candidate_vector_for_level(&cdv, o->lMin, o->lMax, k, o->maxCands);
        /* Create false candidates (this also sorts {cdvT} in place): */
        msm_cand_vec_t cdvF = scramble_cand_vec(&cdvT);
        fprintf(stderr,"Trimmed candidates - %d good and %d false (%d from original)\n",cdvT.ne, cdvF.ne,cdv.ne);
        /* Now filter sequences and candidates: */
        fprintf(stderr, "Filtering sequences and candidates Level %02d\n",k);

        /* This has to be done for each level ! */
        int maxUsefulLevel = k;
        msm_cand_vec_t tcvf[maxUsefulLevel + 1]; /*True candidates at each scale*/
        msm_cand_vec_t fcvf[maxUsefulLevel + 1]; /*False candidates at each scale*/
        filter_candidates(o, &cdvT, xf, yf, tcvf, &maxUsefulLevel);
        for(i = 0; i <= maxUsefulLevel_seq; i++)
          { assert((xf[i].sd.id == o->idA) && (strcmp(xf[i].sd.name,o->nameA) == 0));
            assert((yf[i].sd.id == o->idB) && (strcmp(yf[i].sd.name,o->nameB) == 0));
          }
        /* The false set is only mapped up to the level that the true set reached: */
        filter_candidates(o, &cdvF, xf, yf, fcvf, &maxUsefulLevel);
        for(i = 0; i <= maxUsefulLevel_seq; i++)
          { assert((xf[i].sd.id == o->idA) && (strcmp(xf[i].sd.name,o->nameA) == 0));
            assert((yf[i].sd.id == o->idB) && (strcmp(yf[i].sd.name,o->nameB) == 0));
          }

        if(maxUsefulLevel >= k)
          {
            msm_cand_vec_t cdv_rT;
            msm_cand_vec_t cdv_rF;

            if(o->refine)
              { auto double step_score(msm_seq_desc_t *ad, msm_seq_desc_t *bd, msm_rung_t *g0, msm_rung_t *g1);
                /* Step scoring function. */

                /* GCC nested function: it captures {xf}, {yf}, {o} and {k} from the enclosing scope. */
                double step_score(msm_seq_desc_t *ad, msm_seq_desc_t *bd, msm_rung_t *g0, msm_rung_t *g1)
                  { (void)msm_seq_desc_same_orig_seq(&(xf[k].sd), ad, TRUE); 
                    (void)msm_seq_desc_same_orig_seq(&(yf[k].sd), bd, TRUE); 
                    return dm_score_step(&(o->sc[k]), g0, g1, &(xf[k]), &(yf[k]));
                  }

                fprintf(stderr, "Refining candidates Level %02d\n",k);
                msm_dyn_tableau_t tb = msm_dyn_tableau_new();
                cdv_rT =  msm_cand_vec_refine
                  ( &(tcvf[k]), &(xf[k].sd),  &(yf[k].sd),
                    o->delta, o->kappa, o->expand, o->shrink, o->maxUnp,
                    step_score, verbose, 
                    &tb, 0, INT_MAX, 0.99
                  );
                cdv_rF =  msm_cand_vec_refine
                  ( &(fcvf[k]), &(xf[k].sd),  &(yf[k].sd), 
                    o->delta, o->kappa, o->expand, o->shrink,
                    o->maxUnp, step_score, verbose, 
                    &tb, 0, INT_MAX, 0.99
                  );
                msm_dyn_tableau_free(&tb);
              }
            else
              { /* No refinement: write the mapped candidates as they are (shallow copies). */
                cdv_rT = (tcvf[k]);
                cdv_rF = (fcvf[k]);
              }
            fprintf(stderr,"Saving processed files\n");

            char *true_cand_prefix = jsprintf("%s-L%02d-true", o->outPrefix, k);
            msm_cand_vec_write_named(&cdv_rT,true_cand_prefix , "", ".cdv");

            char *false_cand_prefix = jsprintf("%s-L%02d-false", o->outPrefix, k);
            msm_cand_vec_write_named(&cdv_rF,false_cand_prefix , "", ".cdv");

            char *arq_svm_filename = jsprintf("%s-L%02d-svmtraining.txt",o->outPrefix, k);
            FILE *arq_svm = open_write(arq_svm_filename,TRUE);
            dm_classify_write_svm_file(arq_svm, &cdv_rT, &cdv_rF,&(xf[k]),&(yf[k]));
            /* Close the stream opened above, so that one file does not stay
              open per level.  NOTE(review): this assumes that
              {dm_classify_write_svm_file} leaves the stream open, as
              {msm_cand_vec_read} does in {readCDV} - confirm. */
            fclose(arq_svm);

            free(true_cand_prefix);
            free(false_cand_prefix);
            free(arq_svm_filename);
          }
        else
          { fprintf(stderr,"No candidates at level %02d !!! Maximum level - %d \n",k,maxUsefulLevel); }

        /* Clean-up: release the per-level vectors created by {filter_candidates}. */
        /* Level 0 is skipped because {tcvf[0]} and {fcvf[0]} alias {cdvT} and {cdvF}. */
        for (i = 1; i <= maxUsefulLevel; i++)
          { msm_cand_vec_free(&(tcvf[i]));
            msm_cand_vec_free(&(fcvf[i]));
          }

        if (maxUsefulLevel < k)
          { fprintf(stderr,"Skiping further levels\n");
            break;
          }
      }
    return 0;
  }

msm_cand_vec_t prepare_candidate_vector_for_level(msm_cand_vec_t *cdv, int lmin, int lmax, int k, int maxCands)
  {
    /* Builds the vector of training candidates for level {k}: each
      element of {cdv} is handed to {partition_candidate} with the size
      limits {lmin,lmax}, and the resulting pieces are appended to a new
      vector until {maxCands} pieces have been collected.  The pairings
      of the stored pieces become owned by the returned vector; {cdv}
      itself is not modified.  Returns an empty vector if {maxCands} is
      zero or no candidate yields any piece. */
    msm_cand_vec_t new_cdv = msm_cand_vec_new(cdv->ne);
    int num_cands = 0; /* The pieces stored so far are {new_cdv.e[0..num_cands-1]}. */
    int i;
    for (i = 0; (i < cdv->ne) && (num_cands < maxCands); i++)
      { msm_cand_t *cd = &(cdv->e[i]);
        int num_pieces = 0;
        msm_cand_t *cd_list = partition_candidate(cd, lmin, lmax, k, &num_pieces);

        if (num_pieces > 0)
          { /* Keep only as many pieces as still fit under {maxCands}: */
            int num_keep = num_pieces;
            if (num_keep > maxCands - num_cands) { num_keep = maxCands - num_cands; }
            /* The loop condition ensures {num_keep >= 1}, so the index below is never negative. */
            msm_cand_vec_expand(&new_cdv, num_cands + num_keep - 1);
            int j;
            for (j = 0; j < num_keep; j++)
              { new_cdv.e[num_cands + j] = cd_list[j]; }
            /* The pairings of the pieces that did not fit are not referenced anywhere else: */
            for (j = num_keep; j < num_pieces; j++)
              { msm_pairing_free(cd_list[j].pr); }
            num_cands += num_keep;
          }
        /* The piece headers were copied by value, so the array itself can go: */
        free(cd_list);
      }
    msm_cand_vec_trim(&new_cdv,num_cands);
    return new_cdv;
  }

msm_cand_t *partition_candidate(msm_cand_t *cd, int lmin, int lmax, int k, int *num_new_cands)
  {
    /* Cuts the candidate {cd} into pieces for training at level {k}.
      Let {nr} be the number of rungs of {cd}:

        * if {nr} is zero or less than {2^k*lmin}, stores 0 in
          {*num_new_cands} and returns NULL;

        * else, if {nr < 2^k*lmax}, returns an array with one element,
          a copy of {cd} with its own copy of the pairing;

        * otherwise returns an array of consecutive pieces, all with the
          same number of rungs, chosen near {2^k*lmed} where {lmed} is
          {floor((3*lmin + lmax)/4)}.  Trailing rungs that do not fill
          a whole piece are left out.

      The returned array is {malloc}ed and has {*num_new_cands}
      elements; the caller owns the array and the pairings in it. */
    int nr = msm_pairing_num_rungs(cd->pr);
    double scale = pow(2,k); /* Number of level-0 rungs per level-{k} rung. */

    *num_new_cands = 0;
    if ((nr <= 0) || (nr < scale*lmin))
      { /* Rejected - cannot be partitioned. */
        return NULL;
      }

    if (nr < scale*lmax)
      { /* Between limits: copy the whole candidate. */
        msm_cand_t *one = (msm_cand_t *)notnull(malloc(sizeof(msm_cand_t)), "no mem");
        one[0] = (*cd); /* To copy the header. */
        one[0].pr = msm_pairing_copy(cd->pr);
        *num_new_cands = 1;
        return one;
      }

    /* We have to split the candidate.  Choose the piece size: */
    int lmed = (int)floor((3.0*lmin + lmax)/4.0);
    if (lmed < 1) { lmed = 1; } /* Avoids a zero {lambda} when {lmin} and {lmax} are tiny. */
    double lambda = scale*lmed;  /* Ideal piece size, in level-0 rungs. */
    int t = (int)floor(nr/lambda);
    if (t < 1) { t = 1; }        /* Can only happen when {lmed} was clamped above. */
    int size_part = nr/t;        /* Actual piece size; at least 1 since {1 <= t <= nr}. */
    int num_cands = nr/size_part;

    msm_cand_t *cd_list = (msm_cand_t *)notnull(malloc(sizeof(msm_cand_t)*num_cands), "no mem");
    int ic;
    for (ic = 0; ic < num_cands; ic++)
      { /* Piece {ic} covers rungs {ini..fin} of {cd}, inclusive: */
        int ini = ic*size_part;
        int fin = ini + size_part - 1;
        cd_list[ic] = (*cd); /* To copy the header. */
        cd_list[ic].pr = msm_pairing_sub_copy(cd->pr, ini, fin);
      }
    *num_new_cands = num_cands;
    return cd_list;
  }

void filter_sequence(options_t *o, dnae_seq_t *x, dnae_seq_t xf[], int *maxUsefulLevel)
  {
    /* Builds the multiscale version of the sequence {x}.  On entry,
      {*maxUsefulLevel} is the highest level wanted; the filtered
      sequences for levels {0..*maxUsefulLevel} are stored in {xf} by
      {dnae_seq_multi_filter}, using the weights {o->w0} for the first
      stage and {o->w1} for the following ones.  On return,
      {*maxUsefulLevel} is lowered to the highest level whose filtered
      sequence still has at least one datum (or {-1} if there is none). */
    int maxLev = *maxUsefulLevel;

    fprintf(stderr, "getting the weight tables ...\n");
    char *wname0 = wt_table_make_descr(o->w0.ne, o->w0.e, "%6.4f");
    char *wname1 = wt_table_make_descr(o->w1.ne, o->w1.e, "%6.4f");

    int8_t ek0 = 0; /* Subsampling of first stage. */
    dnae_seq_multi_filter(x, maxLev, &(o->w0), wname0, &(o->w1), wname1, ek0, xf);

    /* Update max useful level {maxLev}: */
    /* A datum count is never negative, so the test must be for "no datums"; */
    /* with a strict {< 0} this loop could never lower {maxLev}. */
    while ((maxLev >= 0) &&  (dnae_seq_num_datums(&(xf[maxLev])) <= 0)) { maxLev--; }
    fprintf(stderr, "maximum useful level is %d\n", maxLev);
    *maxUsefulLevel = maxLev;

    free(wname0);
    free(wname1);
  }
  
void filter_candidates
  ( options_t *o,
    msm_cand_vec_t *cdv,
    dnae_seq_t xf[],
    dnae_seq_t yf[],
    msm_cand_vec_t cdvf[],
    int *maxUsefulLevel
  )
  {
    /* Maps the candidates of {cdv} to successive scales.  Sets
      {cdvf[0]} to a shallow copy of {*cdv}; then, for each {level} in
      {1..*maxUsefulLevel} (entry value), builds {cdvf[level]} by mapping
      each candidate of {cdvf[level-1]} from the sequences
      {xf[level-1],yf[level-1]} to {xf[level],yf[level]}, making the
      result increasing and interpolating it.  Candidates that end up
      with no rungs are dropped.  On return, {*maxUsefulLevel} is the
      highest level whose vector is not empty, or {-1} if all are.
      The options {o} are used only to check the sequence ids and names. */
    int maxLev = *maxUsefulLevel;
    /* Map the candidate vector {cdv} to successive scales: */
    cdvf[0] = *cdv;
    int level;
    for (level = 1; level <= maxLev; level++) 
      { 
        msm_seq_desc_t *xsd_old = &(xf[level-1].sd);
        msm_seq_desc_t *ysd_old = &(yf[level-1].sd);
        
        msm_seq_desc_t *xsd_new = &(xf[level].sd);
        msm_seq_desc_t *ysd_new = &(yf[level].sd);

        int nc_old = cdvf[level-1].ne;
        /* Start new candidate vector: */
        cdvf[level] = msm_cand_vec_new(nc_old);
        int nc_new = 0; /* New candidates are {cdvf[level].e[0..nc_new-1]}. */
        int ic;
        for (ic = 0; ic < nc_old; ic++)
          { /* Map the candidate {cdvf[level-1].e[ic]} to scale {level}: */
            msm_cand_t *ci_old = &(cdvf[level-1].e[ic]);
            /* Start from the old header so that no field (e.g. the score) is left indeterminate: */
            msm_cand_t  ci_new = (*ci_old);
            ci_new.seq[0] = (*xsd_new);
            ci_new.seq[1] = (*ysd_new);
            assert((ci_new.seq[0].id == o->idA) && (strcmp(ci_new.seq[0].name,o->nameA) == 0));
            assert((ci_new.seq[1].id == o->idB) && (strcmp(ci_new.seq[1].name,o->nameB) == 0));

            msm_pairing_t *pr_map = msm_pairing_map(ci_old->pr, xsd_old, ysd_old, xsd_new, ysd_new);
            msm_pairing_t *pr_inc = msm_pairing_make_increasing(pr_map, 1, 0);
            /* {pr_map} may be released only now, after its last use: */
            msm_pairing_free(pr_map);
            ci_new.pr = msm_pairing_interpolate(pr_inc);
            msm_pairing_free(pr_inc);
            if (msm_pairing_num_rungs(ci_new.pr) > 0)
              { msm_cand_vec_expand(&(cdvf[level]),nc_new);
                cdvf[level].e[nc_new] = ci_new;
                nc_new++;
              }
            else
              { /* The empty pairing is not stored anywhere, so release it: */
                msm_pairing_free(ci_new.pr);
              }
          }
        msm_cand_vec_trim(&(cdvf[level]),nc_new);
        fprintf(stderr, "level %02d - %04d candidates\n", level, nc_new);
      }

    /* Update max useful level {maxLev}: */
    while ((maxLev >= 0) &&  (cdvf[maxLev].ne == 0)) { maxLev--; }
    fprintf(stderr, "maximum useful level is %d\n", maxLev);
    *maxUsefulLevel = maxLev;
  }
  
void gen_biased_derangement(int n, int perm[])
  {
    /* Fills {perm[0..n-1]} with a permutation of {0..n-1} built by
      inserting the elements one at a time: element {i} takes over the
      image of some earlier index {j}, and {j} is sent to {i}.  For
      {n >= 2} the result has no fixed points.  For {n = 1} the result
      is {perm[0] = 0}, since no derangement exists.  Does nothing if
      {n <= 0}.  Uses {drandom()} in its current state. */
    if (n <= 0) { return; } /* Nothing to fill; {perm} may even have no elements. */
    perm[0] = 0;
    int i;
    for (i = 1; i < n ; i++)
      {
        /* Choose a random index {j} in {0..i-1}, make {{perm[i],perm[j]} = {perm[j],i}}. */
        /* So bias {j} towards small {i-perm[j]} and {i-j}. */
        int j = i-1;
        while (j > 0)
          { /* Accept {j} when a random draw, scaled by the squared distance, is small enough: */
            int di = i-perm[j];
            int dj = i-j;
            double d2 = ((double)di)*di + ((double)dj)*dj;
            double r = drandom();
            if (r*r*d2 < 6.0) { break; }
            j--;
          }
        /* If no {j} was accepted, the loop ends with {j = 0}. */
        assert((j >= 0) && (j < i));
        assert((perm[j] >= 0) && (perm[j] < i));
        /* Send {i} to {perm[j]}, {j} to {i}: */
        int t = perm[j];
        perm[j] = i;
        perm[i] = t;
      }
  }

msm_cand_vec_t scramble_cand_vec( msm_cand_vec_t *cdv)
  {
    /* Builds a vector of "false" candidates from the candidates in
      {cdv}.  Side effect: {cdv} is sorted in place by increasing number
      of rungs.  Element {i} of the result has {seq[0]} taken from
      candidate {i} and {seq[1]} from candidate {j = perm[i]}, where
      {perm} is obtained from {gen_biased_derangement}; its rungs pair
      the {c[0]} coordinate of rung {r} of candidate {i} with the {c[1]}
      coordinate of rung {r} of candidate {j}, for {r} up to the shorter
      of the two, and are then interpolated.  The score is set to 0.
      Returns an empty vector if {cdv} is empty.  NOTE: when {cdv} has a
      single element, {perm[0]} is necessarily 0, so the "false"
      candidate is built from the true one alone. */
    auto int compare_cands_by_size(const void *a, const void *b);

    /* Comparison function for {qsort}: orders candidates by number of rungs. */
    int compare_cands_by_size(const void *a, const void *b)
      { const msm_cand_t *ca = (const msm_cand_t*) a;
        const msm_cand_t *cb = (const msm_cand_t*) b;
        int size_a = msm_pairing_num_rungs(ca->pr);
        int size_b = msm_pairing_num_rungs(cb->pr);
        if( size_a == size_b) return 0;
        return (size_a > size_b ? 1 : -1);
      }

    int nc = cdv->ne;
    if (nc == 0) { return msm_cand_vec_new(0); } /* Nothing to scramble. */

    qsort(cdv->e,cdv->ne, sizeof(msm_cand_t), compare_cands_by_size);

    /* The permutation lives on the heap: {nc} is unbounded, and a */
    /* variable-length array must not have zero elements. */
    int *perm = (int *)notnull(malloc(((size_t)nc)*sizeof(int)), "no mem");
    gen_biased_derangement(nc, perm);

    msm_cand_vec_t cdn = msm_cand_vec_new(nc);
    int i;
    for (i = 0; i < nc; i++)
      { msm_cand_t ci;
        int j = perm[i];
        ci.seq[0] = cdv->e[i].seq[0];
        ci.seq[1] = cdv->e[j].seq[1];
        ci.score = 0;

        /* Only the rungs that exist in both candidates can be combined: */
        int ng_i = msm_pairing_num_rungs(cdv->e[i].pr);
        int ng_j = msm_pairing_num_rungs(cdv->e[j].pr);
        int ng_min = (int) imin(ng_i,ng_j);

        msm_rung_vec_t rg = msm_rung_vec_new(ng_min);
        int k;
        for(k = 0; k < ng_min; k++)
          { msm_rung_t g = msm_pairing_get_rung(cdv->e[i].pr,k);
            msm_rung_t h = msm_pairing_get_rung(cdv->e[j].pr,k);
            msm_rung_t nr = (msm_rung_t) {{ g.c[0],h.c[1] }};
            rg.e[k] = nr;
          }

        msm_rung_vec_t rgn = msm_rung_vec_interpolate(&rg);
        ci.pr = msm_pairing_from_rung_vec(&rgn);
        free(rg.e);
        /* {rgn.e} is deliberately not freed here: presumably the pairing */
        /* keeps using that storage - TODO confirm against {msm_pairing.h}. */
        cdn.e[i] = ci;
      }

    free(perm);
    return  cdn;
  }

msm_cand_vec_t readCDV(char *filename)
  { /* Opens {filename} with {open_read}, loads the candidates with
      {msm_cand_vec_read}, closes the file, and returns what was read. */
    FILE *rd = open_read(filename, TRUE);
    msm_cand_vec_t result = msm_cand_vec_read(rd);
    fclose(rd);
    return result;
  }

options_t *parse_args(int argc, char**argv)
  { 
    options_t *o = (options_t *)notnull(malloc(sizeof(options_t)), "no mem");
    
    argparser_t *pp = argparser_new(stderr, argc, argv);
    argparser_set_help(pp, PROG_HELP);
    argparser_set_info(pp, PROG_INFO);
    argparser_process_help_info_options(pp);
     
    o->refine = argparser_keyword_present(pp,"-refine");
    
    argparser_get_keyword(pp, "-seqA");
    o->idA = (int)argparser_get_next_int(pp,0,INT_MAX);
    o->nameA = argparser_get_next_non_keyword(pp);
    o->seqA = argparser_get_next_non_keyword(pp);
    
    argparser_get_keyword(pp, "-seqB");
    o->idB = (int)argparser_get_next_int(pp,0,INT_MAX);
    o->nameB = argparser_get_next_non_keyword(pp);
    o->seqB = argparser_get_next_non_keyword(pp);
    
    argparser_get_keyword(pp,"-candFile");
    o->candFile = argparser_get_next_non_keyword(pp);
    
     
    argparser_get_keyword(pp, "-initFilter");
    o->w0 = wt_table_args_parse_weights(pp, TRUE);

    argparser_get_keyword(pp, "-incrFilter");
    o->w1 = wt_table_args_parse_weights(pp, TRUE);

    argparser_get_keyword(pp, "-minLevel");
    o->minLevel = (int)argparser_get_next_int(pp, 0, 20);
    
    argparser_get_keyword(pp, "-maxLevel");
    o->maxLevel = (int)argparser_get_next_int(pp, o->minLevel, 20);
    
    argparser_get_keyword(pp, "-lMin");
    o->lMin = (int)argparser_get_next_int(pp, 0, INT_MAX);
    
    argparser_get_keyword(pp, "-lMax");
    o->lMax = (int)argparser_get_next_int(pp, o->lMin, INT_MAX);
    
    o->sc = NULL;
    o->delta = 0;
    o->kappa = 0;
    o->maxUnp = 0;
    if (o->refine ) 
      { o->sc = (dm_score_rec_t*)malloc(sizeof(dm_score_rec_t)*(o->maxLevel+1));
        int i;
        for (i =0; i <= o->maxLevel; i++) { o->sc[i].eql = NAN; }
      
        for (i =0; i <= o->maxLevel; i++)
          {  argparser_get_keyword(pp, "-scores");
            int lvl = (int)argparser_get_next_int(pp,0,o->maxLevel);
            assert(isnan(o->sc[lvl].eql));
            o->sc[lvl] = dm_score_args_parse(pp);
          }

        if (argparser_keyword_present(pp, "-delta"))
          { o->delta = (int)argparser_get_next_int(pp, 0, INT_MAX); }
        else
          { o->delta = 3; }

        if (argparser_keyword_present(pp, "-kappa"))
          { o->kappa = (int)argparser_get_next_int(pp, 0, INT_MAX); }
        else
          { o->kappa = 6; }

        if (argparser_keyword_present(pp, "-expand"))
          { o->expand = (int)argparser_get_next_int(pp, 0, INT_MAX); }
        else
          { o->expand = 0; }

        if (argparser_keyword_present(pp, "-shrink"))
          { o->shrink = (int)argparser_get_next_int(pp, 0, INT_MAX); }
        else
          { o->shrink = 0; }

        if (argparser_keyword_present(pp, "-maxUnp"))
          { o->maxUnp = (int)argparser_get_next_int(pp, 0, INT_MAX); }
        else
          { o->maxUnp = 0; /* !!! FIND PROPER DEFAULT */ }
      }
    
    if (argparser_keyword_present(pp, "-maxCands"))
      { o->maxCands = (int)argparser_get_next_int(pp, 0, INT_MAX); }
    else
      { o->maxCands = INT_MAX; }
    
    argparser_get_keyword(pp, "-outPrefix");
    o->outPrefix = argparser_get_next(pp);
      
    argparser_finish(pp);
    
    return o;
  }