#define PROG_NAME "dbd_tabulate"
#define PROG_DESC "computes {k}-tuple freqs and probs from training sets"
#define PROG_VERS "1.0"
/* Last edited on 2024-12-21 14:04:59 by stolfi */

#define dbd_tabulate_C_COPYRIGHT "Copyright © 2006,2008 by IC-UFF and IC-UNICAMP"

#define PROG_HELP \
  PROG_NAME " \\\n" \
  "    -dataBank {BANK_FILE} \\\n" \
  "    -windowSize {WINDOW_SIZE} \\\n" \
  "    -labelMap {LAB_SET} = {LAB_EQV} \\\n" \
  "    [ -eventList {EVENT_LIST} | -all ] \\\n" \
  "    [ -quiet ] \\\n" \
  "    " argparser_help_info_HELP " \\\n" \
  "    > {OUT_FILE}"
  
#define PROG_INFO \
  "NAME\n" \
  "  " PROG_NAME " - " PROG_DESC "\n" \
  "\n" \
  "SYNOPSIS\n" \
  PROG_HELP "\n" \
  "\n" \
  "DESCRIPTION\n" \
  "  This program takes two input parameters, an integer {k=WINDOW_SIZE}" \
  " and the name of a file {BANK_FILE} that contains pairs of file" \
  " names \"{BAS_FILE_i} {LAB_FILE_i}\", one pair per line.\n" \
  "\n" \
  "INPUTS\n" \
  "  In each file pair, the first file {BAS_FILE_i} should contain" \
  " the nucleotides of a DNA sequence, encoded with the" \
  " letters [ATCG].  The second file {LAB_FILE_i} should" \
  " contain the corresponding labels, which may be any" \
  " of the letters in the string {LAB_SET}.\n" \
  "\n" \
  "  For the user's convenience, the program automatically" \
  " converts lowercase nucleotide letters read from {BAS_FILE_i} to upper" \
  " case, and replaces the RNA nucleotide letter 'U' by 'T'.  It also" \
  " maps each label letter read from {LAB_FILE_i} that is listed" \
  " in the {LAB_SET} string to the corresponding letter in" \
  " the {LAB_EQV} string.  These conversions are" \
  " performed before counting the events and pairs, and" \
  " affect the output file output file.\n" \
  "\n" \
  "  The input files may begin with one or more comment lines, each" \
  " beginning with a '>' characters, which are silently ignored.\n" \
  "\n" \
  "OUTPUTS\n" \
  "  The program extracts from each pair of input files all the pairs {e,t}" \
  " where {t} is a /{k}-tuple/ --- a string of {k} consecutive base" \
  " letters --- and {e} is the corresponding /{k}-event/ --- the string" \
  " of {k} consecutive label letters with same indices." \
  " For each distinct pair {e,t} that occurs in the input files, the" \
  " program writes to standard output a line in the format" \
  " \"{e} {t} {FREQ} {PROB}\" where {FREQ} is the frequency (number" \
  " of occurrences) of that {k}-tuple with that" \
  " {k}-event; and {PROB} is the conditional probability {Pr(t|r)}" \
  " estimated from those frequencies.\n" \
  "\n" \
  "  The output {k}-tuples use only the uppercase letters [ATCG], and" \
  " the {k}-events use only the letters listed in the {LAB_EQV} string.\n" \
  " \
  ""SELECTED EVENTS\n" \
  "  The user may specify a list of `interesting' {k}-events, through" \
  " the \"-eventList\" option.  In that case, the" \
  " program will tabulate the conditional probabilities" \
  " for those events only.\n" \
  "\n" \
  "OPTIONS\n" \
  PROG_INFO_OPTS "\n" \
  "\n" \
  "SEE ALSO\n" \
  "  dbd_predict(1), dbd_summary(1), dbd_paint(1).\n" \
  "\n" \
  "AUTHOR\n" \
  "  Created 2005-10-05 by Renatha Oliva Capua, IC-UFF.\n" \
  "\n" \
  "MODIFICATION HISTORY\n" \
  "  2008-06-06 J. Stolfi, IC-UNICAMP Rewritten for libs, style.\n" \
  "  2008-06-09 J. Stolfi, IC-UNICAMP Added args {LAB_SET},{LAB_EQV}.\n" \
  "  2008-06-10 J. Stolfi, IC-UNICAMP Added arg {EVENT_LIST}.\n" \
  "  2008-06-13 J. Stolfi, IC-UNICAMP Changed to use" \
  "    {argparser.h}, added option \"-quiet\".\n" \
  "\n" \
  "WARRANTY\n" \
  argparser_help_info_NO_WARRANTY "\n" \
  "\n" \
  "RIGHTS\n" \
  "  " dbd_tabulate_C_COPYRIGHT ".\n" \
  "\n" \
  argparser_help_info_STANDARD_RIGHTS
  
#define PROG_INFO_OPTS \
  "  -dataBank {BANK_FILE}\n" \
  "    This mandatory argument specifies the name of the main file," \
  " that contains the list of input file names (the nucleotide" \
  " file names and the corresponding label file names).\n" \
  "\n" \
  "  -windowSize {WIN_SIZE}\n" \
  "    This mandatory argument specifies the window size {k = WIN_SIZE}.\n" \
  "\n" \
  "  -labelMap {LAB_SET} = {LAB_EQV}\n" \
  "    This mandatory argument specifies the mapping to be applied" \
  " to the label letters before tabulating the event counts.\n" \
  "\n" \
  "  -eventList {EVENT_LIST}\n" \
  "  -all\n" \
  "    These optional and mutually exclusive arguments specify" \
  " the set of interesting events, whose counts and probabilities" \
  " are to be written to the output.  If \"-eventList\" is present, the" \
  " next argument {EVENT_LIST} must be the name of a file that contains" \
  " zero or more {k}-events, one per line.  Note that these events" \
  " should consist of" \
  " letters from the output alphabet {LAB_EQV}, rather than the input" \
  " alphabet {LAB_SET}.  The file {EVENT_LIST} may also contain" \
  " blank lines, or comments that start with '#' and extend to the end of" \
  " the current line.  If \"-all\" is present, or neither option is used, the" \
  " program will write the probabilities and counts for all" \
  " events.\n" \
  "\n" \
  "  -quiet\n" \
  "    Normally the program prints out the names of each" \
  " file {LAB_FILE_i} and {BAS_FILE_i} it reads.  This option supresses" \
  " the printout of those file names.\n" \
  "\n" \
  "DOCUMENTATION OPTIONS\n" \
  argparser_help_info_HELP_INFO

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <jsfile.h>
#include <jsmath.h>
#include <affirm.h>
#include <fget.h>
#include <vec.h>
#include <argparser.h>

#include <dbd_lib.h>

/* PROTOTYPES */

typedef struct Options_t
  { int windowSize;  /* Size of tuples and events to consider. */
    char *labSet;    /* List of valid input label letters. */
    char *labEqv;    /* List of output label letters. */
    char *dataBank;  /* Name of file that describes the input data bank. */
    char *eventList; /* Name of file containing the interesting events, or NULL. */
    bool_t quiet;    /* If TRUE, does not print the names of ".bas" and ".lab" files. */
  } Options_t;
  
Options_t *get_options(int argc, char **argv);
  /* Parses the command line, returns them as an {Options_t} record. */
  
void process_file_pair
  ( int k,
    char *basName, 
    char mapB[],
    code_table_t *tbB,
    char *labName,
    char mapL[],
    code_table_t *tbL, 
    bool_t quiet,
    int64_vec_t *ctE,
    int64_vec_t *ctP
  );
  /* Reads a string of bases from {basFile}, and a matching string of
    labels from {labFile}. Maps each character through the char maps
    {mapB} and {mapL}, respectively. Extracts all pairs {t,r} where
    {t} is a {k}-tuple and {r} is a {k}-event. Maps {r} to an event
    index {ixE}, using the table {tbL}, and increments {ctE[ixE]};
    then maps and the pair {t,r} to a pair index {ixP}, using the
    tables {tbB,tbL}, and increments {ctP[ixP]}.

    The procedure ignores line breaks, blanks, and '>'-headers
    in the base and label files. Each remaining base (resp. label) character {ch}
    is mapped to {mapB[(unsigned)ch]} (resp. {mapL[(unsigned)ch]}). 
    Ignores any {k}-event/{k}-tuple pairs that, after this mapping,
    contain invalid characters --- that is, characters 
    that are mapped to {-1} by {tbB} or {tbL}. 
    
    If {quiet} is FALSE, the procedure writes to {stderr}
    a warning that the files {basFile} and {labFile} are being opened. */

bool_t read_event
  ( FILE *rd,
    char *fName,
    int *lineNumP,
    int k,
    code_table_t *tbL,
    int64_t *ixE
  );
  /* Reads from {rd} a {k}-event, skipping blanks and newlines.
    Returns the event index in {*ixE}. Updates {*lineNumP}. Fails on
    syntax error (showing {fName} in the error message), returns TRUE
    on success, and returns FALSE if there are no more events in {rd}. */

void write_freqs_probs
  ( FILE *wr,
    int k,
    char *eventList,
    int64_t nTfull,
    int64_t nEfull,
    code_table_t *tbB,
    code_table_t *tbL,
    int64_vec_t *ctE,
    int64_vec_t *ctP
  );
  /* Writes to file {wr} the raw occurrence counts and the conditional
    probabilities {Pr(t|e)} for intersting {k}-event/{k}-tuple pairs {(e,t)}.
    
    If {eventList} is not null, it must be the name of a file
    containing the list of interesting {k}-events, one per line. If
    {eventList} is NULL, but there are more than {nEfull} possible
    {k}-events, the procedure assumes that only those events with
    nonzero occurrences are interesting. Otherwise the procedure
    considers all possible {k}-events interesting.
    
    In any case, if the number of possible {k}-tuples is greater
    than {nTfull}, the procedure writes only those pairs 
    with non-zero occurence counts. */

int main(int argc, char **argv);

/* IMPLEMENTATIONS */

int main(int argc, char **argv)
  {
    Options_t *o = get_options(argc, argv);
    
    int k = o->windowSize;        /* Window size: */
    
    /* Build the nucleotide character map and encoding/decoding table: */
    char *basSet = "ATCGatcgUu";  /* Valid input nucleotide chars. */
    char *basEqv = "ATCGATCGTT";  /* Equivalent nucleotide chars. */
    char mapB[256];
    build_char_map(basSet, basEqv, strlen(basSet), '?', mapB);
    code_table_t tbB = build_code_table(basEqv, strlen(basEqv));
    
    /* Build the lable character map and encoding/decoding table: */
    char mapL[256];
    build_char_map(o->labSet, o->labEqv, strlen(o->labSet), '?', mapL);
    code_table_t tbL = build_code_table(o->labEqv, strlen(o->labEqv));
    
    /* Check whether we have enough memory: */
    double maxMem = 64.0e6;  /* Max reasonable table size in bytes. */
    double maxAlt = exp(log(maxMem)/k); /* Max alternatives per letter. */
    demand(tbB.n*tbL.n <= maxAlt, "tables would be too big");
    fprintf(stderr, "alphabet size: bases = %d  labels = %d\n", tbB.n, tbL.n);
    
    /* Total number of distinct events, tuples, pairs: */
    int64_t nT = ipow(tbB.n,k); 
    int64_t nE = ipow(tbL.n,k); 
    int64_t nP = nT*nE; 
    fprintf(stderr, "table size: tuples = %lld events = %lld  pairs = %lld\n", nT, nE, nP);
    
    /* Allocate and initialize the count arrays {ctE,ctP}: */
    int64_vec_t ctE = int64_vec_new(nE); /* Counts occurrences of each {k}-event. */
    int64_vec_t ctP = int64_vec_new(nP); /* Counts occurrences of each {k}-tuple/{k}-event pair. */
    int64_t ix;
    for (ix = 0; ix < nE; ix++) { ctE.e[ix] = 0; }
    for (ix = 0; ix < nP; ix++) { ctP.e[ix] = 0; }
    
    /* Loop on file pairs: */
    FILE  *bankFile = open_read(o->dataBank, TRUE);
    int nSeqs = 0; /* Number of biosequences (file pairs) processed. */
    while(TRUE)
      { 
        /* Skip initial spaces on line: */
        fget_skip_spaces(bankFile);
        
        /* Check for end-of-file: */
        if (feof(bankFile)) { break; }
        
        /* Omit blank lines: */
        if (fget_test_char(bankFile, '\n')) { continue; }
        
        /* Omit comment lines; */
        if (fget_test_char(bankFile, '>'))
          { int c = '>'; 
            while ((! feof(bankFile)) && (c != '\n')) { c = fgetc(bankFile); } 
            continue;
          }
        
        /* Get names of input files (bases and labels): */
        char *basName = fget_string(bankFile);
        fget_skip_spaces(bankFile);
        char *labName = fget_string(bankFile);
        fget_skip_spaces(bankFile);
        
        /* Process the two files: */
        process_file_pair(k, basName, mapB, &tbB, labName, mapL, &tbL, o->quiet, &ctE, &ctP);
        nSeqs++;
        
        /* Skip to next line: */
        fget_eol(bankFile);
      }
    fclose(bankFile);
    fprintf(stderr, "%d biosequences processed.\n", nSeqs); 
    
    /* Max number of entries to print the full table: */
    int64_t nTfull = ipow(4,6); /* Allows {A,T,C,G} with {k=6}. */
    int64_t nEfull = ipow(2,6); /* Allows {N,K} with {k=6}. */

    /* Output the interesting counts: */
    write_freqs_probs(stdout, k, o->eventList, nTfull, nEfull, &tbB, &tbL, &ctE, &ctP);

    return(0);
  }

void process_file_pair
  ( int k,
    char *basName, 
    char mapB[],
    code_table_t *tbB,
    char *labName,
    char mapL[],
    code_table_t *tbL, 
    bool_t quiet,
    int64_vec_t *ctE,
    int64_vec_t *ctP
  )
  {
    FILE *basFile = open_read(basName, ! quiet);
    FILE *labFile = open_read(labName, ! quiet);

    /* Total number of distinct events, tuples, pairs: */
    int64_t nE = ctE->ne; 
    int64_t nP = ctP->ne; 
    int64_t nT = nP/nE; 

    int nBases = 0; /* Count of non-blank, non-comment chars read from each file. */
    
    char tuple[k];    /* Current {k}-tuple (circular queue). */
    char event[k];    /* Current {k}-event (circular queue). */
    bool_t bad[k];    /* {bad[j]} is TRUE iff {tuple[j]} or {event[j]} is invalid. */
    
    int nBad = 0;     /* Number of `bad' base/label pairs in {tuple}/{event}. */
    
    /* Tuple and event indices (computed assuming code 0 for bad slots). */
    int64_t ixT = 0;  /* Index of {tuple} in list of all {k}-tuples. */
    int64_t ixE = 0;  /* Index of {event} in list of all {k}-events. */

    /* Line numbers in data files: */
    int basLine = 1;
    int labLine = 1;
    
    /* Skip comment blocks: */
    skip_bio_file_header(basFile, &basLine);
    skip_bio_file_header(labFile, &labLine);

    /* Loop on character pairs: */
    int basChar = fgetc(basFile);
    int labChar = fgetc(labFile);
    while ((basChar != EOF) && (labChar != EOF))
      {
        /* Skip blanks and newlines: */
        if (basChar == '\n')
          { basLine++; basChar = fgetc(basFile); continue; }
        if (basChar == ' ')
          { basChar = fgetc(basFile); continue; }
        if (labChar == '\n')
          { labLine++; labChar = fgetc(labFile); continue;  }
        if (labChar == ' ')
          { labChar = fgetc(labFile); continue;  }
          
        /* Got the next pair of non-blank characters, process it: */

        /* Apply the character maps: */
        basChar = mapB[(unsigned)basChar];
        labChar = mapL[(unsigned)labChar];

        /* Map nucleotide and label to numeric codes: */
        int basCode = tbB->num[(unsigned)basChar];
        int labCode = tbL->num[(unsigned)labChar];
 
        /* Compute slot {j} of current chars in {tuple} and {event}: */
        int j = nBases % k;

        /* Shift tuple and event indices, discarding oldest base/label pair: */
        ixT = (tbB->n*ixT) % nT;
        ixE = (tbL->n*ixE) % nE;

        /* Update {nBad} to account for discarded base/label pair: */
        if ((nBases >= k) && bad[j]) { nBad--; }

        /* Store characters in {tuple} and [event} queues: */
        tuple[j] = basChar;
        event[j] = labChar;
        bad[j] = (basCode < 0) | (basCode >= tbB->n) | (labCode < 0) | (labCode >= tbL->n);
        nBases++;

        if (bad[j]) 
          { nBad++; }
        else
          { /* Add base and label codes to the tuple and event indices: */
            ixT += basCode;
            ixE += labCode;
          }

        if ((nBases >= k) && (nBad == 0))
          { /* Count one more occurrence of pair and event: */
            int64_t ixP = ixE*nT + ixT;
            ctP->e[ixP]++;
            ctE->e[ixE]++;
          }

        /* Get next characters from each file: */
        basChar = fgetc(basFile);
        labChar = fgetc(labFile);
      }
    fclose(basFile);
    fclose(labFile);
 }

bool_t read_event
  ( FILE *rd,
    char *fName,
    int *lineNumP, 
    int k,
    code_table_t *tbL,
    int64_t *ixE
  )
{
  auto void error(char *msg);
  
  void error(char *msg)
    { fprintf(stderr, "%s:%d: ** %s\n", fName, *lineNumP, msg); 
      exit(1);
    }

  int ch = fgetc(rd);
  int i = 0; /* Number of label letters read so far. */
  (*ixE) = 0;
  while (TRUE)
    { assert(i <= k);
      if ((ch == EOF) && (i == 0))
        { /* Normal end-of-file, no event: */ return FALSE; }
      else if ((ch == EOF) || (ch == ' ') || (ch == '\011') || (ch == '\n') || (ch == '\015') || (ch == '#'))
        { /* Character {ch} is a valid event separator/terminator: */
          if (ch == '#')
            { /* Skip rest of comment, leaving only the final '\n' or {EOF}: */
              do { ch = fgetc(rd); } while ((ch != '\n') && (ch != EOF));
            }
          /* Finish the event: */
          if (i == k)
            { /* Normal end of event: */ 
              if (ch == '\n')
                { /* Put back the newline so that the next read increments {*lineNumP}: */
                  ungetc(ch, rd);
                }
              return TRUE;
            }
          else if (i > 0) 
            { error("truncated event"); }
          /* We have't found the first letter of the event yet.  Skip this one: */
          if (ch == '\n') { (*lineNumP) ++; }
        }
      else 
        { int num = tbL->num[(unsigned)ch];
          if ((num < 0) || (num > tbL->n))
            { error("Invalid character"); num = 0; }
          if (i >= k)
            { error("event is too long"); }
          else
            { (*ixE) = (*ixE)*tbL->n + num; i++; }
        }
      /* Get next character: */
      ch = fgetc(rd);
    } 
}

void write_freqs_probs
  ( FILE *wr,
    int k,
    char *eventList,
    int64_t nTfull,
    int64_t nEfull,
    code_table_t *tbB,
    code_table_t *tbL,
    int64_vec_t *ctE,
    int64_vec_t *ctP
  )
  {
    /* Total number of distinct events, tuples, pairs: */
    int64_t nE = ctE->ne; 
    int64_t nP = ctP->ne; 
    int64_t nT = nP/nE; 
    
    fprintf(wr, "> Frequencies and probabilities of %d-tuples and %d-events\n", k, k);
    fprintf(wr, "> Columns are: event, tuple, occurrences(tuple,event), cond-prob(tuple|event)\n");
    fprintf(wr, "\n");

    auto void write_event(int64_t ixE);
      /* Writes out all pairs for the {k}-event whose index is {ixE}. */
    
    if (eventList != NULL)
      { FILE *rd = open_read(eventList, TRUE);
        /* Read the interesting events and set all their counts to 0: */
        int lineNum = 1;
        int64_t ixE;
        while (read_event(rd, eventList, &lineNum, k, tbL, &ixE))
          { write_event(ixE); }
        fclose(rd);
      }
    else
      { int64_t ixE;
        for(ixE = 0; ixE < nE; ixE++)
          { if ((nE <= nEfull) || (ctE->e[ixE] > 0))
              { write_event(ixE); }
          }
      }
      
    fclose(wr);
    return;

    void write_event(int64_t ixE)
      {
        /* Decode the index {ixE} into {event[0..k-1]}: */
        char event[k];
        int64_t d;
        d = nE;
        int64_t jxE = ixE;
        int r;
        for (r = 0; r < k; r++)
          { d /= tbL->n; int labCode = jxE / d;  jxE = jxE % d;
            event[r] = tbL->chr[labCode];
          }
           
        /* Enumerate all {k}-tuples: */
        int64_t ixT;
        for (ixT = 0; ixT < nT; ixT++)
          { /* Compute the pair index {ixP}: */
            int64_t ixP = ixE*nT + ixT;
            
            if ((nT < nTfull) || (ctP->e[ixP] > 0))
              { /* Pair is interesting: */

                /* Write the {k}-event: */
                for (r = 0; r < k; r++) { fputc(event[r], wr); }
                fputc(' ', wr);

                /* Write the {k}-tuple: */
                d = ipow(tbB->n, k);
                int64_t jxT = ixT;
                for (r = 0; r < k; r++)
                  { d /= tbB->n;  int basCode = jxT / d; jxT = jxT % d;
                    fputc(tbB->chr[basCode], wr);
                  }

                /* Write raw count: */
                fprintf(wr, " %10lld", ctP->e[ixP]);

                /* Write the estimated conditional probability: */
                double prob = ((double)ctP->e[ixP]+1)/((double)ctE->e[ixE]+nT); 
                fprintf(wr, " %10.7f", prob);

                fputc('\n', wr);
              }
          }
        fputc('\n', wr);
      }

  }

Options_t *get_options(int argc, char **argv)
  {
    /* Get command-line parameters: */
    Options_t *o = notnull(malloc(sizeof(Options_t)), "no mem");

    argparser_t *pp = argparser_new(stderr, argc, argv);
    argparser_set_help(pp, PROG_HELP);
    argparser_set_info(pp, PROG_INFO);
    argparser_process_help_info_options(pp);
    
    /* Parse keyword parameters: */

    argparser_get_keyword(pp, "-dataBank");
    o->dataBank = argparser_get_next(pp);

    argparser_get_keyword(pp, "-windowSize");
    o->windowSize = argparser_get_next_int(pp, 0, 100);

    argparser_get_keyword(pp, "-labelMap");
    o->labSet = argparser_get_next(pp);
    argparser_get_keyword_next(pp, "=");
    o->labEqv = argparser_get_next(pp);

    { /* Check for invalid characters in the label map: */
      int m = strlen(o->labSet);
      if (m != strlen(o->labEqv))
        { fprintf(stderr, "label char lists have different lengths\n"); exit(1); }
      char *str[2] = { o->labSet, o->labEqv};
      int j;
      for (j = 0; j < 2; j++)
        { int i;
          for (i = 0; i < m; i++)
            { char s = str[j][i];
              if
                ( ((s < 'A') || (s > 'Z')) &&
                  ((s < 'a') || (s > 'z')) &&
                  ((s < '0') || (s > '9')) &&
                  (s != '?') && 
                  (s != '*') && 
                  (s != '@') && 
                  (s != '.') && 
                  (s != '-')
                )
                { fprintf(stderr, "invalid label character '%c'\n", s); exit(1); }
            }
        }
    }
    
    o->quiet = argparser_keyword_present(pp, "-quiet");
    
    if (argparser_keyword_present(pp, "-eventList"))
      { o->eventList = argparser_get_next(pp); }
    else if (argparser_keyword_present(pp, "-all"))
      { o->eventList = NULL; }
    else
      { o->eventList = NULL; }
    
    /* Get positional parameters: */
    argparser_skip_parsed(pp);
    
    /* Check for spurious arguments: */
    argparser_finish(pp);

    return o;
  }