#define PROG_NAME "dbd_predict" #define PROG_DESC "statistical recognition of coding regions in DNA" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:03:52 by stolfi */ #define dbd_predict_C_COPYRIGHT "Copyright © 2006,2008 by IC-UFF and IC-UNICAMP" #define PROG_HELP \ " " PROG_NAME " \\\n" \ " {EVENT_FILE} \\\n" \ " {PROB_FILE_1} {PROB_FILE_2} {PROB_FILE_3} \\\n" \ " {BAS_FILE} {LAB_FILE} {IDEBUG} \\\n" \ " {OUT_FILE}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " Reads a file {EVENT_FILE} with a list of valid events to be considered.\n" \ "\n" \ " !!! TO BE WRITTEN !!! \n" \ "\n" \ "OPTIONS\n" \ " None.\n" \ "\n" \ "DOCUMENTATION OPTIONS\n" \ argparser_help_info_HELP_INFO "\n" \ "\n" \ "SEE ALSO\n" \ " dbd_tabulate(1), dbd_summary(1), dbd_paint(1).\n" \ "\n" \ "AUTHOR\n" \ " Created 2005-10-05 by Renatha Oliva Capua, IC-UFF.\n" \ "\n" \ "MODIFICATION HISTORY\n" \ " 2008-06-06 Various changes by by J. Stolfi, IC-UNICAMP.\n" \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dbd_predict_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #include #include #include #include #include #include #include #include #include #include #include #include typedef struct Options_t { char *eventName; /* Name of file with valid events. */ char *prob1Name; /* Name of file with single-letter cond probs. */ char *prob2Name; /* Name of file with 2-letter cond probs. */ char *prob3Name; /* Name of file with 3-letter cond probs. */ char *basName; /* Name of file with base sequence to classify. */ char *labName; /* Name of file with `correct' classification, or NULL if none. */ char *outName; /* Prefix for output file names. */ int iDebug; /* Index of basis to debug, or -1 if none. */ } Options_t; Options_t *get_options(int argc, char **argv); /* Parses the command line, returns them as an {Options_t} record. */ /* VALID EVENT TABLE */ typedef struct event_t { char *event; /* Whole event. */ double prob; /* A priori probability for this event. */ int_vec_t szFac; /* Lengths of elementary factors. */ int_vec_t ixFac; /* Indices of elementary factors. */ } event_t; /* This is an entry of the table that lists all {k}-events that are valid according to our DNA model, and their partitions into elementary events. The {event} is a string of {k} label letters ('D', 'E', 'F', or 'N'). Its a priori probability is {prob}. It is broken into {nF = szFac.ne == ixFac.ne} elementary factors, where factor {j} has length {kj = szFac[j]} and its index in the list of all {kj}-events is {ixFac[j]}, for {j} in {0..nF-1}. */ vec_typedef(event_vec_t,event_vec,event_t); /* CONDITIONAL TUPLE PROBABILITY TABLES */ typedef struct cprob_table_t { int k; /* Size of tuples and events in this table. */ int nE; /* Number of all {k}-events {E}. */ int nT; /* Number of all {k}-tuples {T}. */ double *cprob; /* Probs {Pr(T|E)} of {k}-tuples {T} cond to {k}-events {E}. */ } cprob_table_t; /* {cprob[nE*ixE + ixT]} is the cond prob {Pr(T|E)} where {ixE} is the index of {E} and {ixT} the index of {T}, for every {k}-event {E} and every {k}-tuple {T}. */ /* PROTOTYPES */ event_vec_t read_valid_events(char *eventName, int *k); /* Reads the window size {k} and the list of valid {k}-events from file {eventName}. */ cprob_table_t read_tuple_cond_probs(char *tabName, int k); /* Reads from file {tabName} a table of conditional probabilities {Pr(T|E)} for all {k}-tuples {T} and all {k}-events {E}. */ void get_event_factors(char *evFac, char *event, int_vec_t *szfac, int_vec_t *ixFac); /* Breaks the given {factorization} string (assumed to be of the given {event}) into elementary events at the '.' characters. Stores the factor lengths in {szFac} and their indices in {ixFac}. Allocates the {szFac} and {ixFac} vectors. */ void classify_bases ( char_vec_t *bas, char_vec_t *lab, cprob_table_t *vfreq1, cprob_table_t *vfreq2, cprob_table_t *vfreq3, int k, event_vec_t *vev, double *prob, int iDebug ); /* Stores into {prob[4*i + j]} the probability that the true label of {bas.el[i]} has label {ej} where {ej=label_char(j)}. The probability is computed by trying to classify the string {tuple} of {k} bases centered on {bas.el[i]} as each of the valid {k}-events listed in {vev} (see {compute_event_probs}), and then collapsing those events into four based on the central letter. If {iDebug} is non-negative, prints the details of the computation for base {bas[iDebug]}. For interesting results, the {k}-tuple centered at that base must be wholly contained in the sequence. The {lab} argument, if not null or empty, is assumed to be the `official' classification of {bas}. It is used only for debugging. */ void compute_event_probs ( int k, char *tuple, cprob_table_t *vfreq1, cprob_table_t *vfreq2, cprob_table_t *vfreq3, event_vec_t *vev, double *evProb, bool_t show ); /* Computes {evProb[t] = Pr(event[i]|tuple)}, for each valid event {event[v]} where {v} is in {0..vev.ne-1}. The probability {Pr(event|tuple)} of each {event} is computed by Bayes's theorem. The a priori probabilities {Pr(event)} are taken from the {vev} table. The conditional probabilities {Pr(tuple|event)} are computed by factoring {tuple} and {event} as specified in the {vev} table, and assuming that the factors are independent. If {show} is TRUE, prints the details of the computation. */ void condense_event_probs(event_vec_t *vev, double *evProb, int which, double *prob); /* Reduces the computed probabilities of valid {k}-events centered at some base to probabilities of the four possible labels for that base. More precisely, for each label letter {ej=label_char(j)} in [DEFN], stores into {prob[j]} the sum of all probabilities {evProb[v]} of all events {vev[v]} with have letter {ej} at position {which}. */ void write_classification ( char *estName, char *ertName, char *scrName, char *hitName, char_vec_t *bas, char_vec_t *lab, double *prob ); /* Writes to a file named {estName} the computed probabilistic classification of each base of sequence {bas}, and compares it with the corresponding label in the `official' classification {lab}. Also writes to a file named {ertName} the most likely categorical classification, and to files named {scrName} and {hitName} a set of tables showing the classifier's performance. Each line {estName} refers to one letter {bi=bas[i]} of the given DNA sequence, in the format "{i} {bi} {prDEFN} {emDEFN} {prKN} {emKN} {prDEF} {emDEF}" The field {prDEFN} consists of four computed probabilities {prob[4*i+j]=Pr(ei=ej)}, for {j} in {0..3}; where {ei} is the (unknown) true label of {bi}, and {ej=label_char(j)} is each of 'D', 'E', 'F', 'N'. To improve readability, the largest of these four probabilities is prefixed with a '+'. Field {emDEFN} shows the `program's favorite' label {emi}, i.e. the label {ej} with largest probability {Pr(ei=ej)}. This letter is followed by ':', then the `official' label {eoi=lab[i]}, and then the `score mark' --- '·' if the program's favorite {emi} and the official label {eoi} are equal, and '*' if they differ. The field {prKN} is similar to {prDEFN}, except that the labels {DEF} are condensed into a single label {K}. So there are only two probabilities, {Pr(ei='D')+Pr(ei='E')+Pr(ei='F')} and {Pr(ei='N')}. Likewise, field {emKN} is like {emDEFN}, with [DEF] mapped to 'K' (both in the favorite and official parts). Note that the program may score a hit ('·') in {emKN} even if it got miss ('*') in {emDEFN}. Finally field {prDEF} is obtained from {prDEFN} by discarding the 'N' entry and normalizing the three remaining probabilities to unit sum. That is, we recompute {Pr(ei='D')} under the assumption that 'N' has zero probability `a priori'. Likewise {emDEF} lists the program's favorite label among [DEF] (excluding 'N'). The score mark is '·' if the favorite label matches the official one, '+' if it is one step ahead ("E:D", "F:E", or "D:F") and '-' if it is one step behind ("F:D", "D:E", or "E:F"). The favorite and official labels, as well as the score marks, may be '?' if the information is not avaliable. This happens, for instance, in the first {h-1} and last {h-1} bases of the sequence, where the predicted probabilities are all zero (meaning not computed). The official label is '?' also when {lab} is NULL or empty, and, in the {emDEF} column, when {lab[i]='N'}. If either the favorite or official label is '?', the score mark is also '?'. At the end, writes to file {scrName} the probabilistic score table, that is, a matrix {scrDEFN}, where row {jo} is the sum of the probability vectos {prDEFN} over all bases with offcial label {jo}. This table is followed by tables {scrCN} (total of {prKN}) and {scrDEF} (total of {prDEF}), with analogous conventions. These tables do not count bases where the official label is '?'. Also writes to file {hitName} the categorical score table {hitDEFN}, where element in row {jo} and column {jm} is the number of bases with official label {jo} and favorite label {jn}. This table is followed by tables {hitKN} and {hitDEF}, computed in the same way. These tables do not count bases where either label is '?'. */ char write_base_probs ( FILE *estFile, char *labChars, double *pr, char charOff, double *scr, double *hit, bool_t diff ); /* Writes one set of labeling probabilities and the corresponding favorite label {em} for a single base. Also compares that label to the `official' label {eo=charOff} and updates the predictor's score matrix accordingly. Returns the favorite label. The argument {labChars} lists the possible label values being considered (not including '?'). The procedure assumes that {pr[j]} is the probability assigned by the program to label value {labChars[j]}, for {j} in {0..nC-1} where {nC=strlen(labChars)}. These probabilities must either be all zero (meaning that the program did not produce a classification), or must add to 1. The `official' label is assumed to be {eo}, and must be either '?' or one of the {labChars}. The probabilities {pr[0..nC-1]} are printed first. To improve readability, the largest probability is prefixed with a '+'. Then th eprocedure prints the `favorite' label {em=labChars[jm]}, where {jm} is the index of the largest probability in {pr}. This letter is followed by ':', then by the `official' label {eo}, and then the `score mark' --- '·' if the favorite and official labels {em,eo} are equal, and '*' if they differ. If {diff} is TRUE, the {labChars} must have exactly 3 choices "DEF"; then the printed score mark will be either '·' (correct), '+' (one ahead), or '-' (one behind). Otherwise the score mark will be either '.' (correct) or '*' (incorrect). If the official label {eo} is not {'?'}, the {score} and {hits} tables are updated to record the predictor's performance. The probabilities {pr[j]}, for {j} in {0..nC-1}, are added to {scr[nC*jo+j]}, where {jo} is such that {eo=labChars[jo]}. Also, if the favorite label {em} is not '?', adds 1 to {hit[nC*jo+jm]}. */ void write_classification_header(FILE *estFile, bool_t dashes); /* Writes the header lines for a classification file. If {dashes} is FALSE, writes the column headers; if {dashes} is true, writes the dashed lines. */ void write_score_tables ( char *tabName, int n, double *tabDEFN, double *tabKN, double *tabDEF, bool_t categ ); /* Writes the score tables {tabDEFN}, {tabKN}, and {tabDEF} to file {tabName}. The parameter {n} is the number of bases in the original sequence. If {categ} is FALSE, assumes that the tables refer to the probabilistic classification; if {categ} is TRUE, to the categorical classification. Also writes below each table some useful totals (and hence the entries must be integers). */ void write_score_table(FILE *tabFile, int n, char *labChars, double *tab, bool_t categ); /* Writes the matrix {tab} to {tabFile}, followed by a summary. The {categ} parameter has the same meaning as in {write_score_tables}. */ void write_score_table_header(FILE *tabFile, char *labChars, bool_t dashes); /* Writes to {tabFile} a header line for the score matrix relative to the label values listed in {labChars}. If {dashes} is false, prints the column headers; if {dashes} is true, prints the dashed lines. */ int main(int argc, char **argv); /* IMPLEMENTATIONS */ int main(int argc, char **argv) { Options_t *o = get_options(argc, argv); int k; /* Window width. */ /* List of valid events, their factorization, and a priori probs: */ event_vec_t vev = read_valid_events(o->eventName, &k); /* Valid events and factoring. */ char_vec_t bas = read_bio_seq_file(o->basName); /* Sequence to be classified. */ char_vec_t lab = read_bio_seq_file(o->labName); /* Official classif of {bas}, or empty. */ double *prob = rn_alloc(4*bas.ne); /* {prob[4*i +j]} is the prob of {bas.el[i]} having label code {j}. */ cprob_table_t vfreq1 = read_tuple_cond_probs(o->prob1Name, 1); cprob_table_t vfreq2 = read_tuple_cond_probs(o->prob2Name, 2); cprob_table_t vfreq3 = read_tuple_cond_probs(o->prob3Name, 3); /* Compute the probabilistic classification of each base in {bas}: */ double startTime = user_cpu_time_usec(); classify_bases(&bas,&lab, &vfreq1,&vfreq2,&vfreq3, k, &vev, prob, o->iDebug); double stopTime = user_cpu_time_usec(); /* Check running time: */ double ts = (stopTime - startTime)/1.0e6; /* Running time in seconds. */ fprintf(stderr, "\n"); fprintf(stderr, "cpu time: %.3f sec total", ts); fprintf(stderr, " (%.6f sec per base)\n", ts/bas.ne); /* Print the probabilitic and categorical classifications: */ char *estName = jsprintf("%s.est", o->outName); char *ertName = jsprintf("%s.ert", o->outName); char *scrName = jsprintf("%s.scr", o->outName); char *hitName = jsprintf("%s.hit", o->outName); write_classification(estName, ertName, scrName, hitName, &bas, &lab, prob); return 0; } Options_t *get_options(int argc, char **argv) { /* Get command-line parameters: */ Options_t *o = (Options_t *)notnull(malloc(sizeof(Options_t)), "no mem"); if (argc != 9) { fprintf(stderr, "wrong number of arguments\n"); fprintf(stderr, "usage: %s\n", PROG_HELP); exit(1); } o->eventName = argv[1]; o->prob1Name = argv[2]; o->prob2Name = argv[3]; o->prob3Name = argv[4]; o->basName = argv[5]; o->labName = argv[6]; o->iDebug = atoi(argv[7]); o->outName = argv[8]; return o; } event_vec_t read_valid_events(char *eventName, int *k) { FILE *eventFile = open_read(eventName, TRUE); int eventLine = 1; /* Line sequence number in file. */ /* Skip ">" heder, if any: */ skip_bio_file_header(eventFile, &eventLine); int nvev; /* Number of valid events in file. */ int kvev; /* Tuple sizer as read from file. */ /* Read tuple size from file, check with expected: */ kvev = fget_int32(eventFile); fget_eol(eventFile); eventLine++; (*k) = kvev; demand(kvev % 2 == 1, "window size must be odd"); /* Read number of events from file: */ nvev = fget_int32(eventFile); fget_eol(eventFile); /* Read valid events: */ event_vec_t vev = event_vec_new(nvev); int v; for(v = 0; v < nvev; v++) { do { eventLine++; fget_skip_spaces(eventFile); } while (fget_test_char(eventFile, '\n')); event_t *evv = &(vev.e[v]); evv->event = fget_string(eventFile); /* A valid event. */ evv->prob = fget_double(eventFile); /* A priori probability of event. */ char *evFac = fget_string(eventFile); /* Event factorization. */ /* Digest factorization of event: */ get_event_factors(evFac, evv->event, &evv->szFac, &evv->ixFac); // /* Print factorization: */ // { fprintf(stderr, "%s %12.10f %3d ", evv->event, evv->prob, evv->fac.ne); // affirm(evv->szFac.ne == evv->ixFac.ne, "bug factors"); // int j; // for (j = 0; j < evv->szFac.ne; j++) // { fprintf(stderr, " d:%d", evv->szFac.e[j], evv->ixFac.e[j]); } // fprintf(stderr, "\n"); // } fget_eol(eventFile); } fclose(eventFile); return vev; } vec_typeimpl(event_vec_t,event_vec,event_t); cprob_table_t read_tuple_cond_probs(char *tabName, int k) { int nE = ipow(4, k); /* Number of possible {k}-events. */ int nT = ipow(4, k); /* Number of possible {k}-tuples. */ int nP = nE * nT; /* Number of possible event/tuple pairs. */ /* Allocate probability table; */ cprob_table_t tab; tab.k = k; tab.nE = nE; tab.nT = nT; tab.cprob = rn_alloc(nP); FILE *tabFile = open_read(tabName, TRUE); int tabLine = 1; /* Line sequence number in file. */ /* Skip ">" heder, if any: */ skip_bio_file_header(tabFile, &tabLine); /* Clear all probabilities: */ int ixP; for (ixP = 0; ixP < nP; ixP++) { tab.cprob[ixP] = -1.0; } int ch; while(TRUE) { fget_skip_spaces(tabFile); ch = fgetc(tabFile); if (ch == EOF) { break; } if (ch != '\n') { ungetc(ch, tabFile); char *event = fget_string(tabFile); char *tuple = fget_string(tabFile); int64_t count = fget_int64(tabFile); double cprob = fget_double(tabFile); fget_skip_spaces(tabFile); affirm(strlen(event) == k, "bad event length"); affirm(strlen(tuple) == k, "bad tuple length"); affirm(count >= 0, "bad pair count"); affirm((cprob >= 0) && (cprob <= 1.0), "bad cond prob"); /* Store probability: */ int ixE = event_index(event); int ixT = tuple_index(tuple); int ixP = ixE*nT + ixT; affirm(tab.cprob[ixP] == -1.0, "duplicated event/tuple pair"); tab.cprob[ixP] = cprob; } tabLine++; } /* Normalize conditional probabilities of same event, just in case: */ { double defProb = 1.0/nT; /* Default conditional prob of tuple. */ int ixE, ixT; for (ixE = 0; ixE < nE; ixE++) { /* Totalize the probability of event: */ double sum = 0.0; for (ixT = 0; ixT < nT; ixT++) { int ixP = ixE*nT + ixT; if (tab.cprob[ixP] == -1.0) { tab.cprob[ixP] = 0.0; } sum += tab.cprob[ixP]; } /* Normalize tuple probabilities for this event: */ for (ixT = 0; ixT < nT; ixT++) { int ixP = ixE*nT + ixT; if (sum == 0.0) { tab.cprob[ixP] = defProb; } else { tab.cprob[ixP] /= sum; } } } } fclose(tabFile); return tab; } void get_event_factors(char *evFac, char *event, int_vec_t *szFac, int_vec_t *ixFac) { /* Count number of factors, check consistency with {event}: */ int nFac = 1; { char *p = evFac; char *q = event; while ((*p != '\000') && (*q != '\000')) { if (*p == '.') { nFac++; p++; } else { affirm(*p == *q, "factorization differs from event"); p++; q++; } } affirm (*p == *q, "length mismatch in factorization and event"); } /* Allocate vectors: */ (*szFac) = int_vec_new(nFac); (*ixFac) = int_vec_new(nFac); /* Extract and process the factors: */ { char *p = evFac; int j; for (j = 0; j < nFac; j++) { /* Now {p} points to factor {j}; find its end {q}: */ char *q = p; while ((*q != '\000') && (*q != '.')) { q++; } /* Temporarily truncate the event at {q}, and extract the factor: */ char ch = (*q); (*q) = '\000'; szFac->e[j] = q - p; ixFac->e[j] = event_index(p); /* Restore event and prepare for next factor: */ (*q) = ch; p = q+1; } } } void classify_bases ( char_vec_t *bas, char_vec_t *lab, cprob_table_t *vfreq1, cprob_table_t *vfreq2, cprob_table_t *vfreq3, int k, event_vec_t *vev, double *prob, int iDebug ) { int n = bas->ne; /* Number of bases in sequence. */ affirm(k > 0, "k must be positive"); affirm(k % 2 == 1, "k must be odd"); int h = (k-1)/2; /* Number of context bases on each side of current base. */ char tuple[k+1]; /* A {k}-tuple containing the base letter of interest. */ double evProb[vev->ne]; /* A posteriori probabilities for the {vev} events. */ /* Loop on bases. */ int i; for (i = 0; i < n; i++) { bool_t show = (iDebug == i); /* Should we print details for this base? */ /* Extract the {k}-tuple of {bas} centered at {i}, check for bad chars: */ int ini = i - h; /* Starting index of tuple in {bas}. */ extract_substring(bas, ini, k, tuple); /* Check for bad chars: */ bool_t bad = FALSE; int s; for (s = 0; s < k; s++) { int code = base_code(tuple[s]); if ((code < 0) || (code >= 4)) { bad = TRUE; } } if (show) { /* Start diagnostic output: */ fprintf(stderr, "\n"); fprintf(stderr, " | t = %s i = %d [%d..%d]\n", tuple, i-h, i+h, i); if ((lab != NULL) && (lab->ne > 0)) { char event[k+1]; /* The official {k}-event corresponding to {tuple}. */ extract_substring(lab, ini, k, event); fprintf(stderr, " | e = %s (official)\n", event); } } if (bad) { /* The {k}-tuple centered at {i} overflows {bas}; balk out. */ int j; for (j = 0; j < 4; j++) { prob[4*i+j] = 0.0; } if (show) { fprintf(stderr, " | bad tuple - bayes not applied.\n"); } } else { /* Compute the probabilities of each event in {vev} given {tuple}: */ compute_event_probs(k, tuple, vfreq1,vfreq2,vfreq3, vev, evProb, show); /* Condense the event probabilities according to the central label: */ int which = i - ini; /* Index of relevant label in {k}-event. */ condense_event_probs(vev, evProb, which, &(prob[4*i])); } if (show) { /* Finish diagnostic output: */ int j; fprintf(stderr, " |\n"); for (j = 0; j < 4; j++) { char ej = label_char(j); double prj = prob[4*i + j]; fprintf(stderr, " | prob[%c] = %8.6f\n", ej, prj); } fprintf(stderr, "\n"); } } } void compute_event_probs ( int k, char *tuple, cprob_table_t *vfreq1, cprob_table_t *vfreq2, cprob_table_t *vfreq3, event_vec_t *vev, double *evProb, bool_t show ) { char evFac[k+1]; /* To store the current factor of the current event. */ char tpFac[k+1]; /* To store the current factor of {tuple}. */ double cprob[vev->ne]; /* Cond probs {cprob[v] = Pr(tuple|vev[v])}. */ double den = 0.0; /* Denominator of Bayes's theorem (normalization factor). */ /* Compute the conditional probs {cprob[v]}. */ int v; for (v = 0; v < vev->ne; v++) { event_t *evv = &(vev->e[v]); int ntf = 0; /* Total labels in event factors already processed. */ /* Compute {cprob[v]} by enumerating the factors of {evv}: */ int nFac = evv->szFac.ne; double cprv = 1.0; int f; for (f = 0; f < nFac; f++) { int kf = evv->szFac.e[f]; /* Length of event factor {f}. */ /* Extract the factor {evFac} from the {event}: */ strncpy(evFac, evv->event + ntf, kf); evFac[kf] = '\000'; /* Extract the corresponding piece {tpFac} from the given {tuple}: */ strncpy(tpFac, tuple + ntf, kf); tpFac[kf] = '\000'; /* Compute the indices of {evFac} and {tpfac}: */ int ixE = evv->ixFac.e[f]; /* Index of event factor {f}. */ int ixT = tuple_index(tpFac); /* Index of tuple factor {f}. */ /* Get the conditional probability {cprFac=Pr(tpFac|evFac)}: */ double cprFac; switch(kf) { case 1: cprFac = vfreq1->cprob[ixE*vfreq1->nT + ixT]; break; case 2: cprFac = vfreq2->cprob[ixE*vfreq2->nT + ixT]; break; case 3: cprFac = vfreq3->cprob[ixE*vfreq3->nT + ixT]; break; default: fprintf(stderr, "factor \"%s\" has invalid length\n", evFac); exit(1); } /* Multiply cond prob into the numerator: */ cprv = cprv * cprFac; /* if (show) { fprintf(stderr, " | (%14.7e)\n", cprFac); } */ /* Skip factor in tuple and event: */ ntf += kf; } cprob[v] = cprv; /* Accumulate the denominator in {den}: */ den += cprv * evv->prob; } if (show) { /* Print diagnostic output: */ fprintf(stderr, " | Pr(t) = %10.4e\n", den); fprintf(stderr, " |\n"); fprintf(stderr, " | %*s", k, ""); fprintf(stderr, " %10s %10s %10s\n", "Pr(t|e)", "Pr(e)", "Pr(e|t)"); fprintf(stderr, " | %*s", k, ""); fprintf(stderr, " %10s %10s %10s\n", "----------", "----------", "----------"); } /* Normalize the probabilities in {evProb} so that they add to 1: */ for (v = 0; v < vev->ne; v++) { event_t *evv = &(vev->e[v]); double cprv = cprob[v]; evProb[v] = cprv * evv->prob / den; if (show) { /* Print diagnostic output: */ fprintf(stderr, " | e = %s", evv->event); fprintf(stderr, " %10.4e %10.8f %10.8f", cprv, evv->prob, evProb[v]); fprintf(stderr, "\n"); } } } void condense_event_probs(event_vec_t *vev, double *evProb, int which, double *prob) { /* Clear out {prob}: */ int j; for (j = 0; j < 4; j++) { prob[j] = 0; } /* Scan and accumulate the event probabilities: */ int v; for (v = 0; v < vev->ne; v++) { char ej = (vev->e[v].event)[which]; int j = label_code(ej); affirm((j >= 0) && (j < 4), "bad label in valid event"); prob[j] += evProb[v]; } } void write_classification ( char *estName, char *ertName, char *scrName, char *hitName, char_vec_t *bas, char_vec_t *lab, double *prob ) { FILE *estFile = open_write(estName, TRUE); /* Probabilistic classification report. */ FILE *ertFile = open_write(ertName, TRUE); /* Categorical (`best guess') classification. */ bool_t hasOfficial = ((lab != NULL) && (lab->ne > 0)); /* Are there official labels? */ if (hasOfficial) { affirm(lab->ne == bas->ne, "bas/lab length mismatch"); } /* Probabilistic score tables: */ double scrDEFN[4*4]; rn_zero(4*4, scrDEFN); double scrKN [2*2]; rn_zero(2*2, scrKN); double scrDEF [3*3]; rn_zero(3*3, scrDEF); /* Categorical score tables: */ double hitDEFN[4*4]; rn_zero(4*4, hitDEFN); double hitKN [2*2]; rn_zero(2*2, hitKN); double hitDEF [3*3]; rn_zero(3*3, hitDEF); /* Write file headers: */ write_classification_header(estFile, FALSE); /* Column titles. */ write_classification_header(estFile, TRUE) ; /* Dashed lines. */ /* Loop on bases: */ int i; for (i = 0; i < bas->ne; i++) { char bi = bas->e[i]; fprintf(estFile, " %11d", i); fprintf(estFile, " %c", bi); /* Get program's probs {prDEFN[0..3]} and official label {labDEFN}: */ double *prDEFN = &(prob[4*i]); /* Program's probabilities for 'D', 'E', 'F', 'N'. */ char labDEFN = (hasOfficial ? lab->e[i] : '?'); /* Print fields {prDEFN} and {emDEFN}: */ char estDEFN = write_base_probs(estFile, "DEFN", prDEFN, labDEFN, scrDEFN, hitDEFN, FALSE); /* Condense probs and official label into classes 'K' and 'N': */ double prKN[2]; prKN[0] = prDEFN[0] + prDEFN[1] + prDEFN[2]; prKN[1] = prDEFN[3]; char labKN = (labDEFN == '?' ? '?' : (labDEFN == 'N' ? 'N' : 'K')); /* Print fields {prKN} and {emKN}: */ char estKN = write_base_probs(estFile, "CN", prKN, labKN, scrKN, hitKN, FALSE); estKN = estKN; /* To pacify the compiler. */ /* Extract phase probabilities {prDEF} and official phase {labDEF}: */ double prDEF[3]; double normDEF = (prKN[0] == 0.0 ? 1.0 : prKN[0]); prDEF[0] = prDEFN[0]/normDEF; prDEF[1] = prDEFN[1]/normDEF; prDEF[2] = prDEFN[2]/normDEF; char labDEF = ((labDEFN == '?') || (labDEFN == 'N') ? '?' : labDEFN); /* Print fields {prDEF} and {emDEF}: */ char estDEF = write_base_probs(estFile, "DEF", prDEF, labDEF, scrDEF, hitDEF, TRUE); estDEF = estDEF; /* To pacify the compiler. */ fprintf(estFile,"\n"); /* Write favorite label to {ertFile}: */ if ((i > 0) && (i % 60 == 0)) { fputc('\n', ertFile); } fputc(estDEFN, ertFile); } fclose(estFile); fprintf(ertFile, "\n"); fclose(ertFile); if (hasOfficial) { /* Write scoring tables: */ write_score_tables(scrName, bas->ne, scrDEFN, scrKN, scrDEF, FALSE); write_score_tables(hitName, bas->ne, hitDEFN, hitKN, hitDEF, TRUE); } } char write_base_probs ( FILE *estFile, char *labChars, double *pr, char charOff, double *scr, double *hit, bool_t diff ) { int nC = strlen(labChars); /* Number of distinct label chars. */ /* Compute total probability {prTot} and find code {codeOff} of official label: */ int codeOff = nC; /* Index such that {labChars[codeOff]==charOff}, or {nC} if none. */ double prTot = 0; /* Sum of all {pr} entries. */ int j; for (j = 0; j < nC; j++) { prTot += pr[j]; if (labChars[j] == charOff) { codeOff = j; } } /* Find program's favorite label {codeMax,charMax}: */ int codeMax; /* Index in {0..nC} such that {pr[codeMax]} is maximum, or {nc} if none. */ char charMax; /* Program's favorite label {labChars[codeMax]} or '?' if undef. */ if (prTot == 0) { codeMax = nC; charMax = '?'; } else { /* Find maximum {pr[codeMax]} of {pr[0..3]}: */ codeMax = 0; for (j = 1; j < nC; j++) { if (pr[j] > pr[codeMax]) { codeMax = j; } } charMax = labChars[codeMax]; } /* Print probabilities {pr[j]}, with '+' before the largest one: */ fprintf(estFile, " "); for (j = 0; j < nC; j++) { char mark = ((prTot > 0) && (j == codeMax) ? '+' : ' '); fprintf(estFile," %c%8.6lf", mark, pr[j]); } /* Print favorite label, official label, and score mark: */ char scoreMark; if (diff) { /* Differential score mark (for phase identification only): */ affirm(nC == 3, "wrong nC for diff scoring"); if ((charOff == '?') || (codeOff == '?')) { scoreMark = '?'; } else { int delta = (codeMax - codeOff + 3) % 3; switch (delta) { case 0: scoreMark = '·'; break; case 1: scoreMark = '+'; break; case 2: scoreMark = '-'; break; default: affirm(FALSE, "duhh?"); scoreMark = '?'; } } } else { /* Simple score mark (correct or incorrect). */ if ((charMax == '?') || (charOff == '?')) { scoreMark = '?'; } else { scoreMark = (charMax == charOff ? '·' : '*'); } } fprintf(estFile," %c:%c%c", charMax, charOff, scoreMark); if (charOff != '?') { /* Update the probabilistic scores: */ for (j = 0; j < nC; j++) { scr[nC*codeOff + j] += pr[j]; } if (charMax != '?') { /* Update the categorical score: */ hit[nC*codeOff + codeMax] += 1; } } affirm((prTot == 0.0) || (fabs(prTot - 1) <= 0.00000001), "probs don't add to 1"); return charMax; } void write_classification_header(FILE *estFile, bool_t dashes) { fprintf(estFile, ">%11s", (dashes ? "----------" : "position")); fprintf(estFile, " %1s", (dashes ? "-" : "b")); fprintf(estFile, " "); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(D)")); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(E)")); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(F)")); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(N)")); fprintf(estFile, " %4s", (dashes ? "----" : "labs")); fprintf(estFile, " "); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(K)")); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(N)")); fprintf(estFile, " %4s", (dashes ? "----" : "codn")); fprintf(estFile, " "); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(D)")); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(E)")); fprintf(estFile, " %-9s", (dashes ? "---------" : " Pr(F)")); fprintf(estFile, " %4s", (dashes ? "----" : "phas")); fprintf(estFile,"\n"); } void write_score_tables ( char *tabName, int n, double *tabDEFN, double *tabKN, double *tabDEF, bool_t categ ) { FILE *tabFile = open_write(tabName, TRUE); write_score_table(tabFile, n, "DEFN", tabDEFN, categ); write_score_table(tabFile, n, "KN", tabKN, categ); write_score_table(tabFile, n, "DEF", tabDEF, categ); fclose(tabFile); } void write_score_table(FILE *tabFile, int n, char *labChars, double *tab, bool_t categ) { double sum = 0, sumDiag = 0; int nC = strlen(labChars); write_score_table_header(tabFile, labChars, FALSE); write_score_table_header(tabFile, labChars, TRUE); fflush(tabFile); int row, col; for (row = 0; row < nC; row++) { char *hdr = NULL; char *hdr = jsprintf(" Pr(%c)", labChars[row]); fprintf(tabFile, " %9s", hdr); free(hdr); double sumRow = 0.0; for (col = 0; col < nC; col++) { double elem = tab[nC * row + col]; if (categ) { int ielem = (int)round(elem); affirm(fabs(ielem - elem) <= 1.0e-5*ielem, "non-integer count"); fprintf(tabFile, " %9d", ielem); } else { fprintf(tabFile, " %9.2f", elem); } sum += elem; sumRow += elem; if (col == row) { sumDiag += elem; } } /* Print accuracy within this row: */ double diagElem = tab[nC*row + row]; double accRow = (diagElem + 1.0)/(sumRow + nC); fprintf(tabFile, " %5.3f", accRow); fprintf(tabFile, "\n"); } fprintf(tabFile, "\n\n"); /* Table summary: */ char *alf = jsprintf("[%s]", labChars); fprintf(tabFile, " %-8s", alf); free(alf); /* The number of entries considered is the table total: */ int nok = round(sum); affirm(fabs(sum - nok) <= 1.0e-5*sum, "non-integer sum"); /* The accuracy is the percentage of mass that is in the diagonal elements: */ double acc = (sumDiag + 1.0)/(nok + nC); fprintf(tabFile, " %9d bases ", n); fprintf(tabFile, " %9d considered", nok); if (categ) { fprintf(tabFile, " %11d", (int)round(sumDiag)); } else { fprintf(tabFile, " %11.2f", sumDiag); } fprintf(tabFile, " matched"); fprintf(tabFile, " accuracy = %5.3f\n", acc); fprintf(tabFile, "\n\n"); fflush(tabFile); } void write_score_table_header(FILE *tabFile, char *labChars, bool_t dashes) { int nC = strlen(labChars); /* Number of distinct label values. */ int col; fprintf(tabFile, " %9s", ""); for (col = 0; col < nC; col++) { if (dashes) { fprintf(tabFile, " %9s", "---------"); } else { char *hdr = NULL; char *hdr = jsprintf(" Est(%c)", labChars[col]); fprintf(tabFile, " %9s", hdr); free(hdr); } } if (dashes) { fprintf(tabFile, " %5s", "-----"); } else { fprintf(tabFile, " %5s", "Accu."); } fprintf(tabFile, "\n"); }