#define PROG_NAME "dm_compute_weights"
#define PROG_DESC "computes the scoring weights for true/false candidate discrimination"
#define PROG_VERS "1.0"

/* Last edited on 2024-12-21 14:04:39 by stolfi */

#define dm_cand_filter_C_COPYRIGHT \
  "Copyright © 2012  by the the State University of Campinas (UNICAMP)" \
  " and the Federal Fluminense University (UFF)"

#define PROG_HELP \
  PROG_NAME " \\\n" \
  "  -seqA {ID_A} {NAME_A} {FILENAME_A} \\\n" \
  "  -seqB {ID_B} {NAME_B} {FILENAME_B} \\\n" \
  "  -candF {CAND_FILE_F} \\\n" \
  "  -candT {CAND_FILE_T} \\\n" \
  "  [ -minCandSize {MINCANDSIZE} ] \\\n" \
  "  [ -maxCandSize {MAXCANDSIZE} ] \\\n" \
  "  [ -adjust {MAXITERATIONS} ] \\\n" \
  "  -outPrefix {OUT_PREFIX}"

#define PROG_INFO \
  "NAME\n" \
  "  " PROG_NAME " - " PROG_DESC "\n" \
  "\n" \
  "SYNOPSIS\n" \
  "  " PROG_HELP "\n" \
  "\n" \
  "DESCRIPTION\n" \
  "  This program  reads a candidate file {CAND_FILE_T} containing correct pairings," \
  " another one {CAND_FILE_F} containing random parings," \
  " and the corresponding sequences {FILENAME_A} and {FILENAME_B}, and computes" \
  " a estimate for the weights for dm_match for such candidates.\n " \
  "\n" \
  "OUTPUT FILES\n" \
  "  All output files will have names starting with {OUT_PREFIX}.  They include:\n" \
  "\n" \
  "    {OUT_PREFIX}-win.txt  The initial guess for the scoring weights.\n" \
  "    {OUT_PREFIX}-wgt.txt  The final scoring weights.\n" \
  "    {OUT_PREFIX}-svt.txt  The score counts for all cands in the SVM training format.\n" \
  "\n" \
  "OPTIONS\n" \
  "  -minCandSize {MINCANDSIZE}\n" \
  "  -maxCandSize {MAXCANDSIZE}\n" \
  "    These optional arguments specify the minimum and maximum number of rungs in" \
  " the candidates.  Any candidate that exceeds these" \
  " limits (if given) will be excluded from the analysis." \
  "\n" \
  "  -adjust MAXITERATIONS}\n" \
  "    This optional argument, if given and positive, specifies the number of iterations" \
  " of the local barycenter bisector weight-adjustment heuristic to use after the" \
  " initial barycenter bisector guess.  If not given or zero, the heuristic will not be used." \
  "\n" \
  argparser_help_info_HELP_INFO "\n" \
  "SEE ALSO\n" \
  "  dnae_seq_filter(1)\n" \
  "\n" \
  "AUTHOR\n" \
  "  This program was created on 12/nov/2012 by J. Stolfi and R. Saracchini.\n" \
  "WARRANTY\n" \
  argparser_help_info_NO_WARRANTY "\n" \
  "\n" \
  "RIGHTS\n" \
  "  " dm_cand_filter_C_COPYRIGHT ".\n" \
  "\n" \
  argparser_help_info_STANDARD_RIGHTS  

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>

#include <affirm.h>
#include <wt_table.h>
#include <jsfile.h>
#include <argparser.h>
#include <jsrandom.h>

#include <dnae_seq.h>
#include <dnae_nucleic.h>

#include <msm_rung.h>
#include <msm_pairing.h>
#include <msm_cand.h>
#include <msm_cand_vec.h>
#include <msm_dyn.h>
#include <msm_multi.h>

#include <dm_basic.h>
#include <dm_cand.h>
#include <dm_classify.h>
#include <dm_score.h>

typedef struct options_t 
  { /* Sequence parameters: */
    msm_seq_id_t seqA_id;
    char *seqA_name;
    char *seqA_filename;
    msm_seq_id_t seqB_id;
    char *seqB_name;
    char *seqB_filename;
    /* Candidate parameters: */
    char *candF_filename;
    char *candT_filename;
    /* Analysis parameters: */
    int minCandSize;  /* Min number of rungs in candidates. */
    int maxCandSize;  /* Max number of rungs in candidates. */
    int adjust_maxIter;  /* Max iteration of the local barycenter weight adjustment heuristic. */
    /* Output parameters: */
    char *outPrefix;  /* Output file name prefix (minus extensions). */
  } options_t;
  
int main(int argc, char**argv);

options_t *dmcw_parse_options(int argc, char**argv);
  /* Parses the command line options, packs them into a {options_t} record. */
 
msm_cand_vec_t dmcw_read_cand_vec(char* filename);
dnae_seq_t dmcw_read_encoded_seq(char* filename);
void dmcw_trim_cand_vec(msm_cand_vec_t *cdv, int min, int max);
void dmcw_write_weights(char *outPrefix, char *tag, char *title, dm_score_rec_t *wt);
void dmcw_write_stats(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *wt);
void dmcw_write_SVM_training_data(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b);
void dmcw_write_weights(char *outPrefix, char *tag, char *title, dm_score_rec_t *wt);

int main(int argc, char**argv)
  { 
    options_t *o = dmcw_parse_options(argc, argv);
    
    msm_cand_vec_t cdv[2];
    int class;
    for (class = 0; class <= 1; class++) 
      { char *fname = (class == 0 ? o->candF_filename : o->candT_filename);
        char *cname = (class == 0 ? "false" : "true");
        cdv[class] = dmcw_read_cand_vec(fname);
        int n_orig = cdv[class].ne;
        dmcw_trim_cand_vec(&(cdv[class]), o->minCandSize, o->maxCandSize);
        int n_trim = cdv[class].ne;
        fprintf(stderr,"%s candidates - selected %d out of %d\n", cname, n_trim, n_orig);
      }
    
    dnae_seq_t seq_a = dmcw_read_encoded_seq(o->seqA_filename);
    dnae_seq_t seq_b = dmcw_read_encoded_seq(o->seqB_filename);
    
    /* Write data for SVM training: */
    dmcw_write_SVM_training_data(o->outPrefix, cdv, &seq_a, &seq_b);
    
    /* Compute score weights by the barycenter bisector heuristic: */
    dm_score_rec_t wt;
    dm_classify_determine_score_weights(&(cdv[0]), &(cdv[1]), &seq_a, &seq_b, &wt);
    dm_score_normalize_weights(&wt);
    dmcw_write_weights(o->outPrefix, "win", "initial weights", &wt);

    /* Adjust score weights by the local barycenter bisector heuristic: */
    if (o->adjust_maxIter > 0)
      { fprintf(stderr,"adjusting weights - %d iterations\n", o->adjust_maxIter);
        dm_classify_adjust_score_weights(&(cdv[0]), &(cdv[1]), &seq_a, &seq_b, o->adjust_maxIter, &wt);
      }
      
    dmcw_write_weights(o->outPrefix, "wgt", "final weights", &wt);
    
    dmcw_write_stats(o->outPrefix, cdv, &seq_a, &seq_b, &wt);
    
    return 0;
  }

void dmcw_write_stats(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *wt)
  {
    char* fname = NULL;
    char *fname = jsprintf("%s-sts.txt", outPrefix);
    FILE* wr = open_write(fname, TRUE);
    
    int ngud[2]; /* Number of correctly classified cands. */
    int nbad[2]; /* Number of incorrectly classified cands. */
    int class;
    for (class = 0; class <= 1; class++) 
      { char *cname = (class == 0 ? "false" : "true");
        int sgn = 2*class - 1; /* {-1} for false, {+1} for true. */
        ngud[class] = 0;
        nbad[class] = 0;
        fprintf(wr,"# %s candidates\n", cname);
        int i;
        for (i = 0; i < cdv[class].ne; i++)
          { msm_cand_t *cd = &(cdv[class].e[i]);
            msm_rung_vec_t gv = msm_pairing_get_rungs(cd->pr);
            dm_score_rec_t sc;
            dm_score_count_clear(&sc);
            dm_score_count_rung_vec(&sc, &gv, seq_a, seq_b);
            dm_score_count_rung_vec_check(&sc, &gv);
            cd->score = dm_score_rung_vec(wt, &gv, seq_a, seq_b);
            fprintf(wr,"%+2d %d %+9.6lf", sgn, i, cd->score);
            dm_score_rec_print(wr, NULL, &sc, "\n");
            if (cd->score*sgn >= 0) { ngud[class]++; } else { nbad[class]++; }
            free(gv.e);
          }
      }
    for (class = 0; class <= 1; class++) 
      { char *cname = (class == 0 ? "false" : "true");
        fprintf(wr,"\n");
        fprintf(wr,"# CLASSIFICATION SUMMARY\n");
        int ntot = ngud[class] + nbad[class];
        double pcerr = (100.0*nbad[class])/((double)ntot);
        fprintf
          ( wr, "# %-5s candidates: %5d total, %5d correct, %5d incorect (%4.2f%%)\n",
            cname, ntot, ngud[class], nbad[class], pcerr
          );
      }
    fclose(wr);
  }
    
void dmcw_write_SVM_training_data(char *outPrefix, msm_cand_vec_t cdv[], dnae_seq_t *seq_a, dnae_seq_t *seq_b)
  {
    char* fname = NULL;
    char *fname = jsprintf("%s-svm.txt", outPrefix);
    FILE* wr = open_write(fname, TRUE);
    dm_classify_write_svm_file(wr, &(cdv[0]), &(cdv[1]), seq_a, seq_b);
    fclose(wr);
    free(fname);
  }

void dmcw_write_weights(char *outPrefix, char *tag, char *title, dm_score_rec_t *wt)
  {
    char *fname = jsprintf("%s-%s.txt", outPrefix, tag);
    FILE *wr = open_write(fname, TRUE);
    dm_score_rec_print(stderr, NULL, wt, "\n");
    fclose(wr);
    free(fname);
    /* Report to [stderr}: */
    fprintf(stderr,"%s:\n", title);
    dm_score_rec_print(stderr, "  ", wt, "\n");
  }

msm_cand_vec_t dmcw_read_cand_vec (char* filename)
  { FILE* arq_cdv = open_read(filename,TRUE);
    msm_cand_vec_t cdv =  msm_cand_vec_read(arq_cdv);
    fclose(arq_cdv);
    return cdv;
  }

dnae_seq_t dmcw_read_encoded_seq (char* filename)
  { FILE* arq_seq = open_read(filename,TRUE);
    dnae_seq_t seq = dnae_seq_read(arq_seq);
    fclose(arq_seq);
    return seq;
  }

void dmcw_trim_cand_vec (msm_cand_vec_t *cdv, int minCandSize, int maxCandSize)
  { int count = 0;
    int i;
    for(i = 0; i < cdv->ne; i++){
      msm_cand_t *cd = &(cdv->e[i]);
      msm_pairing_t *pr = cd->pr;
      int nr = msm_pairing_num_rungs(pr);
      if ((nr >= minCandSize) && (nr <= maxCandSize))
        { cdv->e[count] = *cd; count++; }
    }
    msm_cand_vec_trim(cdv, count);
  }


options_t *dmcw_parse_options(int argc, char**argv)
  { 
    options_t *o = (options_t *)notnull(malloc(sizeof(options_t)), "no mem");
    
    argparser_t *pp = argparser_new(stderr, argc, argv);
    argparser_set_help(pp, PROG_HELP);
    argparser_set_info(pp, PROG_INFO);
    argparser_process_help_info_options(pp);
     
       
    argparser_get_keyword(pp, "-seqA");
    o->seqA_id = (msm_seq_id_t)argparser_get_next_int(pp, 0, INT_MAX);
    o->seqA_name = argparser_get_next_non_keyword(pp);
    o->seqA_filename = argparser_get_next_non_keyword(pp);
    
    argparser_get_keyword(pp, "-seqB");
    o->seqB_id = (msm_seq_id_t)argparser_get_next_int(pp, 0, INT_MAX);
    o->seqB_name = argparser_get_next_non_keyword(pp);
    o->seqB_filename = argparser_get_next_non_keyword(pp);
    
    argparser_get_keyword(pp,"-candF");
    o->candF_filename = argparser_get_next_non_keyword(pp);
    
    argparser_get_keyword(pp,"-candT");
    o->candT_filename = argparser_get_next_non_keyword(pp);
    
    if(argparser_keyword_present(pp,"-minCandSize"))
      { o->minCandSize =  (int)argparser_get_next_int(pp, 0, INT_MAX);  }
    else
      { o->minCandSize = 0; }
    
    if(argparser_keyword_present(pp,"-maxCandSize"))
      { o->maxCandSize =  (int)argparser_get_next_int(pp, o->minCandSize, INT_MAX); }
    else
      { o->maxCandSize = INT_MAX; }
    
    if (argparser_keyword_present(pp, "-adjust"))
      { o->adjust_maxIter = (int)argparser_get_next_int(pp, o->minCandSize, INT_MAX); }
    else
      { o->adjust_maxIter = 0; }
    
    argparser_get_keyword(pp,"-outPrefix");
    o->outPrefix = argparser_get_next(pp);
    
    argparser_finish(pp);
    
    return o;
  }