#define PROG_NAME "dbd_predict"
#define PROG_DESC "statistical recognition of coding regions in DNA"
#define PROG_VERS "1.0"
/* Last edited on 2024-12-21 14:03:52 by stolfi */

#define dbd_predict_C_COPYRIGHT "Copyright © 2006,2008 by IC-UFF and IC-UNICAMP"

#define PROG_HELP \
  "  " PROG_NAME " \\\n" \
  "    {EVENT_FILE} \\\n" \
  "    {PROB_FILE_1} {PROB_FILE_2} {PROB_FILE_3} \\\n" \
  "    {BAS_FILE} {LAB_FILE} {IDEBUG} \\\n" \
  "    {OUT_FILE}"
#define PROG_INFO \
  "NAME\n" \
  "  " PROG_NAME " - " PROG_DESC "\n" \
  "\n" \
  "SYNOPSIS\n" \
  PROG_HELP "\n" \
  "\n" \
  "DESCRIPTION\n" \
  "  Reads a file {EVENT_FILE} with a list of valid events to be considered.\n" \
  "\n" \
  "  !!! TO BE WRITTEN !!! \n" \
  "\n" \
  "OPTIONS\n" \
  "  None.\n" \
  "\n" \
  "DOCUMENTATION OPTIONS\n" \
  argparser_help_info_HELP_INFO "\n" \
  "\n" \
  "SEE ALSO\n" \
  "  dbd_tabulate(1), dbd_summary(1), dbd_paint(1).\n" \
  "\n" \
  "AUTHOR\n" \
  "  Created 2005-10-05 by Renatha Oliva Capua, IC-UFF.\n" \
  "\n" \
  "MODIFICATION HISTORY\n" \
  "  2008-06-06 Various changes by by J. Stolfi, IC-UNICAMP.\n" \
  "\n" \
  "WARRANTY\n" \
  argparser_help_info_NO_WARRANTY "\n" \
  "\n" \
  "RIGHTS\n" \
  "  " dbd_predict_C_COPYRIGHT ".\n" \
  "\n" \
  argparser_help_info_STANDARD_RIGHTS

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

#include <jsfile.h>
#include <jstime.h>
#include <jsmath.h>
#include <rn.h>
#include <affirm.h>
#include <fget.h>
#include <vec.h>

#include <dbd_lib.h>

typedef struct Options_t
  { char *eventName;  /* Name of file with valid events. */
    char *prob1Name;  /* Name of file with single-letter cond probs. */
    char *prob2Name;  /* Name of file with 2-letter cond probs. */
    char *prob3Name;  /* Name of file with 3-letter cond probs. */
    char *basName;    /* Name of file with base sequence to classify. */
    char *labName;    /* Name of file with `correct' classification, or NULL if none. */
    char *outName;    /* Prefix for output file names. */
    int iDebug;       /* Index of basis to debug, or -1 if none. */
  } Options_t;

Options_t *get_options(int argc, char **argv);
  /* Parses the command line, returns them as an {Options_t} record. */

/* VALID EVENT TABLE */

typedef struct event_t
  { char *event;       /* Whole event. */
    double prob;       /* A priori probability for this event. */
    int_vec_t szFac;   /* Lengths of elementary factors. */
    int_vec_t ixFac;   /* Indices of elementary factors. */
  } event_t;
  /* This is an entry of the table that lists all {k}-events that are
    valid according to our DNA model, and their partitions into
    elementary events. 
    
    The {event} is a string of {k} label letters ('D', 'E', 'F', or
    'N'). Its a priori probability is {prob}. It is broken into {nF =
    szFac.ne == ixFac.ne} elementary factors, where factor {j} has
    length {kj = szFac[j]} and its index in the list of all
    {kj}-events is {ixFac[j]}, for {j} in {0..nF-1}. */

vec_typedef(event_vec_t,event_vec,event_t);

/* CONDITIONAL TUPLE PROBABILITY TABLES */

typedef struct cprob_table_t
  { int k;         /* Size of tuples and events in this table. */   
    int nE;        /* Number of all {k}-events {E}. */
    int nT;        /* Number of all {k}-tuples {T}. */
    double *cprob; /* Probs {Pr(T|E)} of {k}-tuples {T} cond to {k}-events {E}. */
  } cprob_table_t;
  /* {cprob[nE*ixE + ixT]} is the cond prob {Pr(T|E)} where {ixE}
    is the index of {E} and {ixT} the index of {T}, for every
    {k}-event {E} and every {k}-tuple {T}. */

/* PROTOTYPES */

event_vec_t read_valid_events(char *eventName, int *k);
  /* Reads the window size {k} and the list of valid {k}-events 
    from file {eventName}. */

cprob_table_t read_tuple_cond_probs(char *tabName, int k);
  /* Reads from file {tabName} a table of conditional probabilities
    {Pr(T|E)} for all {k}-tuples {T} and all {k}-events {E}. */

void get_event_factors(char *evFac, char *event, int_vec_t *szfac, int_vec_t *ixFac);
  /* Breaks the given {factorization} string (assumed to be of the
    given {event}) into elementary events at the '.' characters.
    Stores the factor lengths in {szFac} and their indices in {ixFac}.
    Allocates the {szFac} and {ixFac} vectors. */

void classify_bases
  ( char_vec_t *bas,
    char_vec_t *lab,
    cprob_table_t *vfreq1,
    cprob_table_t *vfreq2,
    cprob_table_t *vfreq3,
    int k,
    event_vec_t *vev,
    double *prob,
    int iDebug
  );
  /* Stores into {prob[4*i + j]} the probability that the true
    label of {bas.el[i]} has label {ej} where {ej=label_char(j)}.
    
    The probability is computed by trying to classify the string
    {tuple} of {k} bases centered on {bas.el[i]} as each of the valid
    {k}-events listed in {vev} (see {compute_event_probs}), and then
    collapsing those events into four based on the central letter.
    
    If {iDebug} is non-negative, prints the details of the computation
    for base {bas[iDebug]}.  For interesting results, the {k}-tuple centered
    at that base must be wholly contained in the sequence. The {lab} argument,
    if not null or empty, is assumed to be the `official' classification
    of {bas}. It is used only for debugging. */

void compute_event_probs
  ( int k, 
    char *tuple, 
    cprob_table_t *vfreq1, 
    cprob_table_t *vfreq2, 
    cprob_table_t *vfreq3, 
    event_vec_t *vev,
    double *evProb,
    bool_t show
  );
  /* Computes {evProb[t] = Pr(event[i]|tuple)}, for each valid event 
    {event[v]} where {v} is in {0..vev.ne-1}.
    
    The probability {Pr(event|tuple)} of each {event} is computed by
    Bayes's theorem. The a priori probabilities {Pr(event)} are taken
    from the {vev} table. The conditional probabilities
    {Pr(tuple|event)} are computed by factoring {tuple} and {event} as
    specified in the {vev} table, and assuming that the factors are
    independent.
    
    If {show} is TRUE, prints the details of the computation.  */

void condense_event_probs(event_vec_t *vev, double *evProb, int which, double *prob);
  /* Reduces the computed probabilities of valid {k}-events centered
    at some base to probabilities of the four possible labels for that
    base.
    
    More precisely, for each label letter {ej=label_char(j)} in
    [DEFN], stores into {prob[j]} the sum of all probabilities
    {evProb[v]} of all events {vev[v]} with have letter {ej} at
    position {which}. */

void write_classification
  ( char *estName, 
    char *ertName,
    char *scrName,
    char *hitName,
    char_vec_t *bas,
    char_vec_t *lab,
    double *prob
  );
  /* Writes to a file named {estName} the computed probabilistic
    classification of each base of sequence {bas}, and compares it with
    the corresponding label in the `official' classification {lab}. Also
    writes to a file named {ertName} the most likely categorical
    classification, and to files named {scrName} and {hitName} 
    a set of tables showing the classifier's performance.

    Each line {estName} refers to one letter {bi=bas[i]} of the given
    DNA sequence, in the format

      "{i} {bi} {prDEFN} {emDEFN}  {prKN} {emKN}  {prDEF} {emDEF}"

    The field {prDEFN} consists of four computed probabilities
    {prob[4*i+j]=Pr(ei=ej)}, for {j} in {0..3}; where {ei} is the
    (unknown) true label of {bi}, and {ej=label_char(j)} is each of 'D',
    'E', 'F', 'N'. To improve readability, the largest of these four
    probabilities is prefixed with a '+'. Field {emDEFN} shows the
    `program's favorite' label {emi}, i.e. the label {ej} with largest
    probability {Pr(ei=ej)}. This letter is followed by ':', then the
    `official' label {eoi=lab[i]}, and then the `score mark' --- '·' if
    the program's favorite {emi} and the official label {eoi} are
    equal, and '*' if they differ.

    The field {prKN} is similar to {prDEFN}, except that the labels
    {DEF} are condensed into a single label {K}. So there are only two
    probabilities, {Pr(ei='D')+Pr(ei='E')+Pr(ei='F')} and {Pr(ei='N')}.
    Likewise, field {emKN} is like {emDEFN}, with [DEF] mapped to 'K'
    (both in the favorite and official parts).  Note that the 
    program may score a hit ('·') in {emKN} even if it got miss ('*')
    in {emDEFN}.

    Finally field {prDEF} is obtained from {prDEFN} by discarding the
    'N' entry and normalizing the three remaining probabilities to unit
    sum. That is, we recompute {Pr(ei='D')} under the assumption that
    'N' has zero probability `a priori'. Likewise {emDEF} lists the
    program's favorite label among [DEF] (excluding 'N'). The score mark
    is '·' if the favorite label matches the official one, '+' if it is
    one step ahead ("E:D", "F:E", or "D:F") and '-' if it is one step
    behind ("F:D", "D:E", or "E:F").

    The favorite and official labels, as well as the score marks, may be
    '?' if the information is not avaliable. This happens, for instance,
    in the first {h-1} and last {h-1} bases of the sequence, where the
    predicted probabilities are all zero (meaning not computed). The
    official label is '?' also when {lab} is NULL or empty, and, in the
    {emDEF} column, when {lab[i]='N'}.  If either the favorite or official
    label is '?', the score mark is also '?'.

    At the end, writes to file {scrName} the probabilistic score table,
    that is, a matrix {scrDEFN}, where row {jo} is the sum of the
    probability vectos {prDEFN} over all bases with offcial label {jo}.
    This table is followed by tables {scrCN} (total of {prKN}) and
    {scrDEF} (total of {prDEF}), with analogous conventions. These
    tables do not count bases where the official label is '?'.

    Also writes to file {hitName} the categorical score table {hitDEFN},
    where element in row {jo} and column {jm} is the number of bases
    with official label {jo} and favorite label {jn}. This table is
    followed by tables {hitKN} and {hitDEF}, computed in the same way.
    These tables do not count bases where either label is '?'.  */

char write_base_probs
  ( FILE *estFile, 
    char *labChars, 
    double *pr, 
    char charOff, 
    double *scr, 
    double *hit,
    bool_t diff
  );
  /* Writes one set of labeling probabilities and the corresponding
    favorite label {em} for a single base. Also compares that label to
    the `official' label {eo=charOff} and updates the predictor's score
    matrix accordingly. Returns the favorite label.
    
    The argument {labChars} lists the possible label values being
    considered (not including '?'). The procedure assumes that {pr[j]}
    is the probability assigned by the program to label value
    {labChars[j]}, for {j} in {0..nC-1} where {nC=strlen(labChars)}.
    These probabilities must either be all zero (meaning that the
    program did not produce a classification), or must add to 1. The
    `official' label is assumed to be {eo}, and must be either
    '?' or one of the {labChars}.
    
    The probabilities {pr[0..nC-1]} are printed first. To improve
    readability, the largest probability is prefixed with a '+'. Then
    th eprocedure prints the `favorite' label {em=labChars[jm]}, where
    {jm} is the index of the largest probability in {pr}. This letter
    is followed by ':', then by the `official' label {eo}, and
    then the `score mark' --- '·' if the favorite and official labels
    {em,eo} are equal, and '*' if they differ.
    
    If {diff} is TRUE, the {labChars} must have exactly 3 choices
    "DEF"; then the printed score mark will be either '·' (correct),
    '+' (one ahead), or '-' (one behind). Otherwise the score mark
    will be either '.' (correct) or '*' (incorrect).
    
    If the official label {eo} is not {'?'}, the {score} and {hits}
    tables are updated to record the predictor's performance. The
    probabilities {pr[j]}, for {j} in {0..nC-1}, are added to
    {scr[nC*jo+j]}, where {jo} is such that {eo=labChars[jo]}. Also,
    if the favorite label {em} is not '?', adds 1 to {hit[nC*jo+jm]}. */

void write_classification_header(FILE *estFile, bool_t dashes);
  /* Writes the header lines for a classification file. If 
    {dashes} is FALSE, writes the column headers; if {dashes}
    is true, writes the dashed lines. */

void write_score_tables
  ( char *tabName, 
    int n, 
    double *tabDEFN, 
    double *tabKN, 
    double *tabDEF, 
    bool_t categ
  );
  /* Writes the score tables {tabDEFN}, {tabKN}, and {tabDEF} to file
    {tabName}. The parameter {n} is the number of bases in the
    original sequence.
    
    If {categ} is FALSE, assumes that the tables refer to the
    probabilistic classification; if {categ} is TRUE, to the
    categorical classification. Also writes below each table some
    useful totals (and hence the entries must be integers). */

void write_score_table(FILE *tabFile, int n, char *labChars, double *tab, bool_t categ);
  /* Writes the matrix {tab} to {tabFile}, followed by a summary. 
    The {categ} parameter has the same meaning as in {write_score_tables}. */

void write_score_table_header(FILE *tabFile, char *labChars, bool_t dashes);
  /* Writes to {tabFile} a header line for the score matrix relative
    to the label values listed in {labChars}. If {dashes} is false,
    prints the column headers; if {dashes} is true, prints the dashed
    lines. */

int main(int argc, char **argv);

/* IMPLEMENTATIONS */

int main(int argc, char **argv)
  {
    Options_t *o = get_options(argc, argv);

    int k; /* Window width. */
    
    /* List of valid events, their factorization, and a priori probs: */
    event_vec_t vev = read_valid_events(o->eventName, &k); /* Valid events and factoring. */
    
    char_vec_t bas = read_bio_seq_file(o->basName); /* Sequence to be classified. */
    char_vec_t lab = read_bio_seq_file(o->labName); /* Official classif of {bas}, or empty. */

    double *prob = rn_alloc(4*bas.ne);
    /* {prob[4*i +j]} is the prob of {bas.el[i]} having label code {j}. */
 
    cprob_table_t vfreq1 = read_tuple_cond_probs(o->prob1Name, 1);
    cprob_table_t vfreq2 = read_tuple_cond_probs(o->prob2Name, 2);
    cprob_table_t vfreq3 = read_tuple_cond_probs(o->prob3Name, 3);

    /* Compute the probabilistic classification of each base in {bas}: */
    double startTime = user_cpu_time_usec();
    classify_bases(&bas,&lab, &vfreq1,&vfreq2,&vfreq3, k, &vev, prob, o->iDebug);
    double stopTime = user_cpu_time_usec();

    /* Check running time: */
    double ts = (stopTime - startTime)/1.0e6; /* Running time in seconds. */
    fprintf(stderr, "\n");
    fprintf(stderr, "cpu time: %.3f sec total", ts);
    fprintf(stderr, " (%.6f sec per base)\n", ts/bas.ne);

    /* Print the probabilitic and categorical classifications: */
    char *estName = jsprintf("%s.est", o->outName);
    char *ertName = jsprintf("%s.ert", o->outName);
    char *scrName = jsprintf("%s.scr", o->outName);
    char *hitName = jsprintf("%s.hit", o->outName);
    write_classification(estName, ertName, scrName, hitName, &bas, &lab, prob);

    return 0;
  }

Options_t *get_options(int argc, char **argv)
  {
    /* Get command-line parameters: */
    Options_t *o = (Options_t *)notnull(malloc(sizeof(Options_t)), "no mem");
    
    if (argc != 9)
      { fprintf(stderr, "wrong number of arguments\n");
        fprintf(stderr, "usage: %s\n", PROG_HELP);
        exit(1);
      }

    o->eventName = argv[1];
    o->prob1Name = argv[2];
    o->prob2Name = argv[3];
    o->prob3Name = argv[4];
    o->basName = argv[5];
    o->labName = argv[6];

    o->iDebug = atoi(argv[7]);

    o->outName = argv[8];
    
    return o;
  }

event_vec_t read_valid_events(char *eventName, int *k)
  {
    FILE *eventFile = open_read(eventName, TRUE);

    int eventLine = 1; /* Line sequence number in file. */

    /* Skip ">" heder, if any: */
    skip_bio_file_header(eventFile, &eventLine);

    int nvev; /* Number of valid events in file. */
    int kvev; /* Tuple sizer as read from file. */

    /* Read tuple size from file, check with expected: */
    kvev = fget_int32(eventFile); fget_eol(eventFile); eventLine++;
    (*k) = kvev;
    demand(kvev % 2 == 1, "window size must be odd");
      
    /* Read number of events from file: */
    nvev = fget_int32(eventFile); fget_eol(eventFile);

    /* Read valid events: */
    event_vec_t vev = event_vec_new(nvev);
    int v;
    for(v = 0; v < nvev; v++)
      {
        do {
          eventLine++;
          fget_skip_spaces(eventFile);
        } while (fget_test_char(eventFile, '\n'));

        event_t *evv = &(vev.e[v]);
        evv->event = fget_string(eventFile); /* A valid event. */
        evv->prob = fget_double(eventFile);  /* A priori probability of event. */
        char *evFac = fget_string(eventFile); /* Event factorization. */

        /* Digest factorization of event: */
        get_event_factors(evFac, evv->event, &evv->szFac, &evv->ixFac);

        // /* Print factorization: */
        // { fprintf(stderr, "%s %12.10f %3d ", evv->event, evv->prob, evv->fac.ne);
        //   affirm(evv->szFac.ne == evv->ixFac.ne, "bug factors");
        //   int j;
        //   for (j = 0; j < evv->szFac.ne; j++)
        //     { fprintf(stderr, " d:%d", evv->szFac.e[j], evv->ixFac.e[j]); } 
        //   fprintf(stderr, "\n"); 
        // }
        fget_eol(eventFile);
      }
    fclose(eventFile);
    return vev;
  }

vec_typeimpl(event_vec_t,event_vec,event_t);

cprob_table_t read_tuple_cond_probs(char *tabName, int k)
  {
    int nE = ipow(4, k); /* Number of possible {k}-events. */
    int nT = ipow(4, k); /* Number of possible {k}-tuples. */
    int nP = nE * nT;    /* Number of possible event/tuple pairs. */

    /* Allocate probability table; */
    cprob_table_t tab;
    tab.k = k;
    tab.nE = nE;
    tab.nT = nT;
    tab.cprob = rn_alloc(nP);
    
    FILE *tabFile = open_read(tabName, TRUE);
    int tabLine = 1; /* Line sequence number in file. */

    /* Skip ">" heder, if any: */
    skip_bio_file_header(tabFile, &tabLine);

    /* Clear all probabilities: */
    int ixP;
    for (ixP = 0; ixP < nP; ixP++) { tab.cprob[ixP] = -1.0; }

    int ch;
    while(TRUE)
      { 
        fget_skip_spaces(tabFile);
        ch = fgetc(tabFile);
        if (ch == EOF) { break; }
        if (ch != '\n')
          { 
            ungetc(ch, tabFile);
            char *event = fget_string(tabFile);
            char *tuple = fget_string(tabFile);
            int64_t count = fget_int64(tabFile);
            double cprob = fget_double(tabFile);
            fget_skip_spaces(tabFile);
            
            affirm(strlen(event) == k, "bad event length");
            affirm(strlen(tuple) == k, "bad tuple length");
            affirm(count >= 0, "bad pair count");
            affirm((cprob >= 0) && (cprob <= 1.0), "bad cond prob");
            
            /* Store probability: */
            int ixE = event_index(event);
            int ixT = tuple_index(tuple);
            int ixP = ixE*nT + ixT;
            affirm(tab.cprob[ixP] == -1.0, "duplicated event/tuple pair");
            tab.cprob[ixP] = cprob;
          }
        tabLine++;
      }
      
    /* Normalize conditional probabilities of same event, just in case: */
    { double defProb = 1.0/nT; /* Default conditional prob of tuple. */
      int ixE, ixT;
      for (ixE = 0; ixE < nE; ixE++)
        { /* Totalize the probability of event: */
          double sum = 0.0;
          for (ixT = 0; ixT < nT; ixT++) 
            { int ixP = ixE*nT + ixT;
              if (tab.cprob[ixP] == -1.0) { tab.cprob[ixP] = 0.0; }
              sum += tab.cprob[ixP];
            }
            
          /* Normalize tuple probabilities for this event: */
          for (ixT = 0; ixT < nT; ixT++) 
            { int ixP = ixE*nT + ixT;
              if (sum == 0.0) 
                { tab.cprob[ixP] = defProb; }
              else
                { tab.cprob[ixP] /= sum; }
            }
        }
    }
    fclose(tabFile);
    return tab;
  }

void get_event_factors(char *evFac, char *event, int_vec_t *szFac, int_vec_t *ixFac)
  {
    /* Count number of factors, check consistency with {event}: */
     int nFac = 1;
    { char *p = evFac;
      char *q = event;
      while ((*p != '\000') && (*q != '\000'))
        { if (*p == '.') 
            { nFac++; p++; }
          else
            { affirm(*p == *q, "factorization differs from event"); 
             p++; q++; 
           }
       }
      affirm (*p == *q, "length mismatch in factorization and event");
     }

    /* Allocate vectors: */
    (*szFac) = int_vec_new(nFac);
    (*ixFac) = int_vec_new(nFac);

    /* Extract and process the factors: */
    { char *p = evFac;
      int j;
      for (j = 0; j < nFac; j++)
        {
          /* Now {p} points to factor {j}; find its end {q}: */
          char *q = p;
          while ((*q != '\000') && (*q != '.')) { q++; }

          /* Temporarily truncate the event at {q}, and extract the factor: */
          char ch = (*q); 
          (*q) = '\000';
          szFac->e[j] = q - p;
          ixFac->e[j] = event_index(p);

          /* Restore event and prepare for next factor: */
          (*q) = ch;
          p = q+1;
        }
    }
  }

void classify_bases
  ( char_vec_t *bas,
    char_vec_t *lab,
    cprob_table_t *vfreq1,
    cprob_table_t *vfreq2,
    cprob_table_t *vfreq3,
    int k,
    event_vec_t *vev,
    double *prob,
    int iDebug
  )
  {
    int n = bas->ne; /* Number of bases in sequence. */

    affirm(k > 0, "k must be positive");
    affirm(k % 2 == 1, "k must be odd");
    int h = (k-1)/2; /* Number of context bases on each side of current base. */
    
    char tuple[k+1]; /* A {k}-tuple containing the base letter of interest. */
    
    double evProb[vev->ne];  /* A posteriori probabilities for the {vev} events. */
    
    /* Loop on bases. */
    int i;
    for (i = 0; i < n; i++)
      {
        bool_t show = (iDebug == i); /* Should we print details for this base? */

        /* Extract the {k}-tuple of {bas} centered at {i}, check for bad chars: */
        int ini = i - h; /* Starting index of tuple in {bas}. */
        extract_substring(bas, ini, k, tuple);

        /* Check for bad chars: */
        bool_t bad = FALSE;
        int s;
        for (s = 0; s < k; s++)
          { int code = base_code(tuple[s]);
            if ((code < 0) || (code >= 4)) { bad = TRUE; }
          }
        
        if (show) 
          { /* Start diagnostic output: */
            fprintf(stderr, "\n");
            fprintf(stderr, "  | t = %s  i = %d [%d..%d]\n", tuple, i-h, i+h, i); 
            if ((lab != NULL) && (lab->ne > 0))
              { char event[k+1]; /* The official {k}-event corresponding to {tuple}. */
                extract_substring(lab, ini, k, event);
                fprintf(stderr, "  | e = %s (official)\n", event); 
              }
          }

        if (bad)
          { 
            /* The {k}-tuple centered at {i} overflows {bas}; balk out. */
            int j;
            for (j = 0; j < 4; j++) { prob[4*i+j] = 0.0; }
            if (show)
              { fprintf(stderr, "  |     bad tuple - bayes not applied.\n"); }
          }
        else
          { 
            /* Compute the probabilities of each event in {vev} given {tuple}: */
            compute_event_probs(k, tuple, vfreq1,vfreq2,vfreq3, vev, evProb, show);

            /* Condense the event probabilities according to the central label: */
            int which = i - ini; /* Index of relevant label in {k}-event. */
            condense_event_probs(vev, evProb, which, &(prob[4*i]));
          }
          
        if (show) 
          { /* Finish diagnostic output: */
            int j;
            fprintf(stderr, "  |\n");
            for (j = 0; j < 4; j++) 
              { char ej = label_char(j);
                double prj = prob[4*i + j];
                fprintf(stderr, "  | prob[%c] = %8.6f\n", ej, prj);
              }
             fprintf(stderr, "\n");
         }
      }
  }

void compute_event_probs
  ( int k, 
    char *tuple, 
    cprob_table_t *vfreq1, 
    cprob_table_t *vfreq2, 
    cprob_table_t *vfreq3, 
    event_vec_t *vev,
    double *evProb,
    bool_t show
  )
  {
    char evFac[k+1]; /* To store the current factor of the current event. */
    char tpFac[k+1]; /* To store the current factor of {tuple}. */

    double cprob[vev->ne]; /* Cond probs {cprob[v] = Pr(tuple|vev[v])}. */
    double den = 0.0;       /* Denominator of Bayes's theorem (normalization factor). */
    
    /* Compute the conditional probs {cprob[v]}. */
    int v;
    for (v = 0; v < vev->ne; v++)
      {
        event_t *evv = &(vev->e[v]);
        int ntf = 0; /* Total labels in event factors already processed. */
        /* Compute {cprob[v]} by enumerating the factors of {evv}: */
        int nFac = evv->szFac.ne;
        double cprv = 1.0; 
        int f;
        for (f = 0; f < nFac; f++)
          { 
            int kf = evv->szFac.e[f]; /* Length of event factor {f}. */
            
            /* Extract the factor {evFac} from the {event}: */
            strncpy(evFac, evv->event + ntf, kf);
            evFac[kf] = '\000';
            
            /* Extract the corresponding piece {tpFac} from the given {tuple}: */
            strncpy(tpFac, tuple + ntf, kf);
            tpFac[kf] = '\000';
            
            /* Compute the indices of {evFac} and {tpfac}: */
            int ixE = evv->ixFac.e[f];   /* Index of event factor {f}. */
            int ixT = tuple_index(tpFac); /* Index of tuple factor {f}. */
            
            /* Get the conditional probability {cprFac=Pr(tpFac|evFac)}: */
            double cprFac;
            switch(kf)
              {
                case 1:
                  cprFac = vfreq1->cprob[ixE*vfreq1->nT + ixT];
                  break;
                case 2:
                  cprFac = vfreq2->cprob[ixE*vfreq2->nT + ixT];
                  break;
                case 3:
                  cprFac = vfreq3->cprob[ixE*vfreq3->nT + ixT];
                  break;
                default:
                  fprintf(stderr, "factor \"%s\" has invalid length\n", evFac);
                  exit(1);
              }
            /* Multiply cond prob into the numerator: */
            cprv = cprv * cprFac;
            /* if (show) { fprintf(stderr, "  |     (%14.7e)\n", cprFac); } */

            /* Skip factor in tuple and event: */
            ntf += kf;
        }
        cprob[v] = cprv;
        
        /* Accumulate the denominator in {den}: */
        den += cprv * evv->prob;
    }

    if (show)
      { /* Print diagnostic output: */
        fprintf(stderr, "  |     Pr(t) = %10.4e\n", den); 
        fprintf(stderr, "  |\n"); 
        fprintf(stderr, "  |     %*s", k, ""); 
        fprintf(stderr, "  %10s %10s %10s\n", "Pr(t|e)", "Pr(e)", "Pr(e|t)"); 
        fprintf(stderr, "  |     %*s", k, ""); 
        fprintf(stderr, "  %10s %10s %10s\n", "----------", "----------", "----------"); 
      }

    /* Normalize the probabilities in {evProb} so that they add to 1: */
    for (v = 0; v < vev->ne; v++)
      { 
        event_t *evv = &(vev->e[v]);
        double cprv = cprob[v];
        evProb[v] = cprv * evv->prob / den;
        if (show)
          { /* Print diagnostic output: */
            fprintf(stderr, "  | e = %s", evv->event); 
            fprintf(stderr, "  %10.4e %10.8f %10.8f", cprv, evv->prob, evProb[v]); 
            fprintf(stderr, "\n"); 
          }
      }
  }

void condense_event_probs(event_vec_t *vev, double *evProb, int which, double *prob)
  {
    /* Clear out {prob}: */
    int j;
    for (j = 0; j < 4; j++) { prob[j] = 0; } 

    /* Scan and accumulate the event probabilities: */
    int v;
    for (v = 0; v < vev->ne; v++)
      { char ej = (vev->e[v].event)[which];
        int j = label_code(ej);
        affirm((j >= 0) && (j < 4), "bad label in valid event");
        prob[j] += evProb[v];
      }
  }

void write_classification
  ( char *estName, 
    char *ertName,
    char *scrName,
    char *hitName,
    char_vec_t *bas,
    char_vec_t *lab,
    double *prob
  )
  {
    FILE *estFile = open_write(estName, TRUE);  /* Probabilistic classification report. */
    FILE *ertFile = open_write(ertName, TRUE);  /* Categorical (`best guess') classification. */

    bool_t hasOfficial = ((lab != NULL) && (lab->ne > 0)); /* Are there official labels? */
    
    if (hasOfficial)
      { affirm(lab->ne == bas->ne, "bas/lab length mismatch"); }

    /* Probabilistic score tables: */
    double scrDEFN[4*4]; rn_zero(4*4, scrDEFN); 
    double scrKN  [2*2]; rn_zero(2*2, scrKN);
    double scrDEF [3*3]; rn_zero(3*3, scrDEF);

    /* Categorical score tables: */
    double hitDEFN[4*4]; rn_zero(4*4, hitDEFN); 
    double hitKN  [2*2]; rn_zero(2*2, hitKN);
    double hitDEF [3*3]; rn_zero(3*3, hitDEF);

    /* Write file headers: */
    write_classification_header(estFile, FALSE); /* Column titles. */
    write_classification_header(estFile, TRUE) ; /* Dashed lines. */

    /* Loop on bases: */
    int i;
    for (i = 0; i < bas->ne; i++)
      {
        char bi = bas->e[i];

        fprintf(estFile, " %11d", i);
        fprintf(estFile, " %c", bi);

        /* Get program's probs {prDEFN[0..3]} and official label {labDEFN}: */
        double *prDEFN = &(prob[4*i]); /* Program's probabilities for 'D', 'E', 'F', 'N'. */
        char labDEFN = (hasOfficial ? lab->e[i] : '?');
        
        /* Print fields {prDEFN} and {emDEFN}: */
        char estDEFN = write_base_probs(estFile, "DEFN", prDEFN, labDEFN, scrDEFN, hitDEFN, FALSE);
        
        /* Condense probs and official label into classes 'K' and 'N': */
        double prKN[2];
        prKN[0] = prDEFN[0] + prDEFN[1] + prDEFN[2];
        prKN[1] = prDEFN[3];
        char labKN = (labDEFN == '?' ? '?' : (labDEFN == 'N' ? 'N' : 'K'));

        /* Print fields {prKN} and {emKN}: */
        char estKN = write_base_probs(estFile, "CN", prKN, labKN, scrKN, hitKN, FALSE);
        estKN = estKN; /* To pacify the compiler. */
        
        /* Extract phase probabilities {prDEF} and official phase {labDEF}: */
        double prDEF[3];
        double normDEF = (prKN[0] == 0.0 ? 1.0 : prKN[0]); 
        prDEF[0] = prDEFN[0]/normDEF;
        prDEF[1] = prDEFN[1]/normDEF;
        prDEF[2] = prDEFN[2]/normDEF;
        char labDEF = ((labDEFN == '?') || (labDEFN == 'N') ? '?' : labDEFN);
        
        /* Print fields {prDEF} and {emDEF}: */
        char estDEF = write_base_probs(estFile, "DEF", prDEF, labDEF, scrDEF, hitDEF, TRUE);
        estDEF = estDEF; /* To pacify the compiler. */
        
        fprintf(estFile,"\n");

        /* Write favorite label to {ertFile}: */
        if ((i > 0) && (i % 60 == 0)) { fputc('\n', ertFile); }
        fputc(estDEFN, ertFile);

      }

    fclose(estFile);
    fprintf(ertFile, "\n"); fclose(ertFile);

    if (hasOfficial)
      { /* Write scoring tables: */
        write_score_tables(scrName, bas->ne, scrDEFN, scrKN, scrDEF, FALSE);
        write_score_tables(hitName, bas->ne, hitDEFN, hitKN, hitDEF, TRUE);
      }
  }

char write_base_probs
  ( FILE *estFile, 
    char *labChars, 
    double *pr, 
    char charOff, 
    double *scr, 
    double *hit,
    bool_t diff
  )
  { 
    int nC = strlen(labChars); /* Number of distinct label chars. */
    
    /* Compute total probability {prTot} and find code {codeOff} of official label: */
    int codeOff = nC; /* Index such that {labChars[codeOff]==charOff}, or {nC} if none. */
    double prTot = 0; /* Sum of all {pr} entries. */
    int j;
    for (j = 0; j < nC; j++) 
      { prTot += pr[j]; 
        if (labChars[j] == charOff) { codeOff = j; }
      } 
    
    /* Find program's favorite label {codeMax,charMax}: */
    int  codeMax; /* Index in {0..nC} such that {pr[codeMax]} is maximum, or {nc} if none. */
    char charMax; /* Program's favorite label {labChars[codeMax]} or '?' if undef. */

    if (prTot == 0)
      { codeMax = nC; charMax = '?'; }
    else
      { /* Find maximum {pr[codeMax]} of {pr[0..3]}: */
        codeMax = 0;
        for (j = 1; j < nC; j++)
          { if (pr[j] > pr[codeMax]) { codeMax = j; } } 
        charMax = labChars[codeMax];
      }

    /* Print probabilities {pr[j]}, with '+' before the largest one: */
    fprintf(estFile, "  "); 
    for (j = 0; j < nC; j++)
      { char mark = ((prTot > 0) && (j == codeMax) ? '+' : ' ');
        fprintf(estFile," %c%8.6lf", mark, pr[j]);
      }

    /* Print favorite label, official label, and score mark: */
    char scoreMark;
    if (diff)
      { /* Differential score mark (for phase identification only): */
        affirm(nC == 3, "wrong nC for diff scoring");
        if ((charOff == '?') || (codeOff == '?'))
          { scoreMark = '?'; }
        else
          { int delta = (codeMax - codeOff + 3) % 3;
            switch (delta)
              { case 0: scoreMark = '·'; break;
                case 1: scoreMark = '+'; break;
                case 2: scoreMark = '-'; break;
                default: affirm(FALSE, "duhh?"); scoreMark = '?';
              } 
          }
      }
    else
      { /* Simple score mark (correct or incorrect). */
        if ((charMax == '?') || (charOff == '?'))
          { scoreMark = '?'; }
        else
          { scoreMark = (charMax == charOff ? '·' : '*'); }
      }
    fprintf(estFile," %c:%c%c", charMax, charOff, scoreMark);
    
    if (charOff != '?')
      { /* Update the probabilistic scores: */
        for (j = 0; j < nC; j++)
          { scr[nC*codeOff + j] += pr[j]; }

        if (charMax != '?')
          { /* Update the categorical score: */
            hit[nC*codeOff + codeMax] += 1; 
          }
      }

    affirm((prTot == 0.0) || (fabs(prTot - 1) <= 0.00000001), "probs don't add to 1");

    return charMax;
  }

void write_classification_header(FILE *estFile, bool_t dashes)
  {
    fprintf(estFile, ">%11s", (dashes ? "----------" : "position"));
    fprintf(estFile, " %1s", (dashes ? "-" : "b"));
    fprintf(estFile, "  ");
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(D)"));
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(E)"));
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(F)"));
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(N)"));
    fprintf(estFile, " %4s", (dashes ? "----" : "labs"));
    fprintf(estFile, "  ");
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(K)"));
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(N)"));
    fprintf(estFile, " %4s", (dashes ? "----" : "codn"));
    fprintf(estFile, "  ");
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(D)"));
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(E)"));
    fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(F)"));
    fprintf(estFile, " %4s", (dashes ? "----" : "phas"));
    fprintf(estFile,"\n");
  }

void write_score_tables
  ( char *tabName, 
    int n, 
    double *tabDEFN, 
    double *tabKN, 
    double *tabDEF, 
    bool_t categ
  )
  {
    FILE *tabFile = open_write(tabName, TRUE);
    write_score_table(tabFile, n, "DEFN", tabDEFN, categ);
    write_score_table(tabFile, n, "KN",   tabKN,   categ);
    write_score_table(tabFile, n, "DEF",  tabDEF,  categ);
    fclose(tabFile);
  }

void write_score_table(FILE *tabFile, int n, char *labChars, double *tab, bool_t categ)
  { 
    double sum = 0, sumDiag = 0;

    int nC = strlen(labChars);
    
    write_score_table_header(tabFile, labChars, FALSE);
    write_score_table_header(tabFile, labChars, TRUE);
    fflush(tabFile);

    int  row, col;
    for (row = 0; row < nC; row++)
      { char *hdr = NULL;
        char *hdr = jsprintf(" Pr(%c)", labChars[row]);
        fprintf(tabFile, " %9s", hdr);
        free(hdr);
        
        double sumRow = 0.0;
        for (col = 0; col < nC; col++)
          {
            double elem = tab[nC * row + col];
            if (categ)
              { int ielem = (int)round(elem);
                affirm(fabs(ielem - elem) <= 1.0e-5*ielem, "non-integer count");
                fprintf(tabFile, " %9d", ielem);
              }
            else
              { fprintf(tabFile, " %9.2f", elem); }
            sum += elem;
            sumRow += elem;
            if (col == row) { sumDiag += elem; }
          }
        /* Print accuracy within this row: */
        double diagElem = tab[nC*row + row];
        double accRow = (diagElem + 1.0)/(sumRow + nC);
        fprintf(tabFile, "  %5.3f", accRow);
        
        fprintf(tabFile, "\n");
      }
    fprintf(tabFile, "\n\n");
    
    /* Table summary: */
    char *alf = jsprintf("[%s]", labChars);
    fprintf(tabFile, " %-8s", alf);
    free(alf);
    
    /* The number of entries considered is the table total: */
    int nok = round(sum);
    affirm(fabs(sum - nok) <= 1.0e-5*sum, "non-integer sum");
    
    /* The accuracy is the percentage of mass that is in the diagonal elements: */
    double acc = (sumDiag + 1.0)/(nok + nC);
    fprintf(tabFile, "  %9d bases ", n);
    fprintf(tabFile, "  %9d considered", nok);
    if (categ)
      { fprintf(tabFile, "  %11d", (int)round(sumDiag)); }
    else
      { fprintf(tabFile, "  %11.2f", sumDiag); }
    fprintf(tabFile, " matched");
    fprintf(tabFile, "  accuracy = %5.3f\n", acc); 
    fprintf(tabFile, "\n\n");
    fflush(tabFile);        
  }

void write_score_table_header(FILE *tabFile, char *labChars, bool_t dashes)
  {
    int nC = strlen(labChars); /* Number of distinct label values. */ 
    int col;
    fprintf(tabFile, " %9s", "");
    for (col = 0; col < nC; col++)
      { if (dashes)
          { fprintf(tabFile, " %9s", "---------"); }
        else
          { char *hdr = NULL;
            char *hdr = jsprintf(" Est(%c)", labChars[col]);
            fprintf(tabFile, " %9s", hdr);
            free(hdr);
          }
      }
    if (dashes)
      { fprintf(tabFile, "  %5s", "-----"); }
    else
      { fprintf(tabFile, "  %5s", "Accu."); }
    fprintf(tabFile, "\n");
  }