#define PROG_NAME "dbd_tabulate" #define PROG_DESC "computes {k}-tuple freqs and probs from training sets" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:04:59 by stolfi */ #define dbd_tabulate_C_COPYRIGHT "Copyright © 2006,2008 by IC-UFF and IC-UNICAMP" #define PROG_HELP \ PROG_NAME " \\\n" \ " -dataBank {BANK_FILE} \\\n" \ " -windowSize {WINDOW_SIZE} \\\n" \ " -labelMap {LAB_SET} = {LAB_EQV} \\\n" \ " [ -eventList {EVENT_LIST} | -all ] \\\n" \ " [ -quiet ] \\\n" \ " " argparser_help_info_HELP " \\\n" \ " > {OUT_FILE}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " This program takes two input parameters, an integer {k=WINDOW_SIZE}" \ " and the name of a file {BANK_FILE} that contains pairs of file" \ " names \"{BAS_FILE_i} {LAB_FILE_i}\", one pair per line.\n" \ "\n" \ "INPUTS\n" \ " In each file pair, the first file {BAS_FILE_i} should contain" \ " the nucleotides of a DNA sequence, encoded with the" \ " letters [ATCG]. The second file {LAB_FILE_i} should" \ " contain the corresponding labels, which may be any" \ " of the letters in the string {LAB_SET}.\n" \ "\n" \ " For the user's convenience, the program automatically" \ " converts lowercase nucleotide letters read from {BAS_FILE_i} to upper" \ " case, and replaces the RNA nucleotide letter 'U' by 'T'. It also" \ " maps each label letter read from {LAB_FILE_i} that is listed" \ " in the {LAB_SET} string to the corresponding letter in" \ " the {LAB_EQV} string. These conversions are" \ " performed before counting the events and pairs, and" \ " affect the output file output file.\n" \ "\n" \ " The input files may begin with one or more comment lines, each" \ " beginning with a '>' characters, which are silently ignored.\n" \ "\n" \ "OUTPUTS\n" \ " The program extracts from each pair of input files all the pairs {e,t}" \ " where {t} is a /{k}-tuple/ --- a string of {k} consecutive base" \ " letters --- and {e} is the corresponding /{k}-event/ --- the string" \ " of {k} consecutive label letters with same indices." \ " For each distinct pair {e,t} that occurs in the input files, the" \ " program writes to standard output a line in the format" \ " \"{e} {t} {FREQ} {PROB}\" where {FREQ} is the frequency (number" \ " of occurrences) of that {k}-tuple with that" \ " {k}-event; and {PROB} is the conditional probability {Pr(t|r)}" \ " estimated from those frequencies.\n" \ "\n" \ " The output {k}-tuples use only the uppercase letters [ATCG], and" \ " the {k}-events use only the letters listed in the {LAB_EQV} string.\n" \ " \ ""SELECTED EVENTS\n" \ " The user may specify a list of `interesting' {k}-events, through" \ " the \"-eventList\" option. In that case, the" \ " program will tabulate the conditional probabilities" \ " for those events only.\n" \ "\n" \ "OPTIONS\n" \ PROG_INFO_OPTS "\n" \ "\n" \ "SEE ALSO\n" \ " dbd_predict(1), dbd_summary(1), dbd_paint(1).\n" \ "\n" \ "AUTHOR\n" \ " Created 2005-10-05 by Renatha Oliva Capua, IC-UFF.\n" \ "\n" \ "MODIFICATION HISTORY\n" \ " 2008-06-06 J. Stolfi, IC-UNICAMP Rewritten for libs, style.\n" \ " 2008-06-09 J. Stolfi, IC-UNICAMP Added args {LAB_SET},{LAB_EQV}.\n" \ " 2008-06-10 J. Stolfi, IC-UNICAMP Added arg {EVENT_LIST}.\n" \ " 2008-06-13 J. Stolfi, IC-UNICAMP Changed to use" \ " {argparser.h}, added option \"-quiet\".\n" \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dbd_tabulate_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #define PROG_INFO_OPTS \ " -dataBank {BANK_FILE}\n" \ " This mandatory argument specifies the name of the main file," \ " that contains the list of input file names (the nucleotide" \ " file names and the corresponding label file names).\n" \ "\n" \ " -windowSize {WIN_SIZE}\n" \ " This mandatory argument specifies the window size {k = WIN_SIZE}.\n" \ "\n" \ " -labelMap {LAB_SET} = {LAB_EQV}\n" \ " This mandatory argument specifies the mapping to be applied" \ " to the label letters before tabulating the event counts.\n" \ "\n" \ " -eventList {EVENT_LIST}\n" \ " -all\n" \ " These optional and mutually exclusive arguments specify" \ " the set of interesting events, whose counts and probabilities" \ " are to be written to the output. If \"-eventList\" is present, the" \ " next argument {EVENT_LIST} must be the name of a file that contains" \ " zero or more {k}-events, one per line. Note that these events" \ " should consist of" \ " letters from the output alphabet {LAB_EQV}, rather than the input" \ " alphabet {LAB_SET}. The file {EVENT_LIST} may also contain" \ " blank lines, or comments that start with '#' and extend to the end of" \ " the current line. If \"-all\" is present, or neither option is used, the" \ " program will write the probabilities and counts for all" \ " events.\n" \ "\n" \ " -quiet\n" \ " Normally the program prints out the names of each" \ " file {LAB_FILE_i} and {BAS_FILE_i} it reads. This option supresses" \ " the printout of those file names.\n" \ "\n" \ "DOCUMENTATION OPTIONS\n" \ argparser_help_info_HELP_INFO #include #include #include #include #include #include #include #include #include #include #include #include /* PROTOTYPES */ typedef struct Options_t { int windowSize; /* Size of tuples and events to consider. */ char *labSet; /* List of valid input label letters. */ char *labEqv; /* List of output label letters. */ char *dataBank; /* Name of file that describes the input data bank. */ char *eventList; /* Name of file containing the interesting events, or NULL. */ bool_t quiet; /* If TRUE, does not print the names of ".bas" and ".lab" files. */ } Options_t; Options_t *get_options(int argc, char **argv); /* Parses the command line, returns them as an {Options_t} record. */ void process_file_pair ( int k, char *basName, char mapB[], code_table_t *tbB, char *labName, char mapL[], code_table_t *tbL, bool_t quiet, int64_vec_t *ctE, int64_vec_t *ctP ); /* Reads a string of bases from {basFile}, and a matching string of labels from {labFile}. Maps each character through the char maps {mapB} and {mapL}, respectively. Extracts all pairs {t,r} where {t} is a {k}-tuple and {r} is a {k}-event. Maps {r} to an event index {ixE}, using the table {tbL}, and increments {ctE[ixE]}; then maps and the pair {t,r} to a pair index {ixP}, using the tables {tbB,tbL}, and increments {ctP[ixP]}. The procedure ignores line breaks, blanks, and '>'-headers in the base and label files. Each remaining base (resp. label) character {ch} is mapped to {mapB[(unsigned)ch]} (resp. {mapL[(unsigned)ch]}). Ignores any {k}-event/{k}-tuple pairs that, after this mapping, contain invalid characters --- that is, characters that are mapped to {-1} by {tbB} or {tbL}. If {quiet} is FALSE, the procedure writes to {stderr} a warning that the files {basFile} and {labFile} are being opened. */ bool_t read_event ( FILE *rd, char *fName, int *lineNumP, int k, code_table_t *tbL, int64_t *ixE ); /* Reads from {rd} a {k}-event, skipping blanks and newlines. Returns the event index in {*ixE}. Updates {*lineNumP}. Fails on syntax error (showing {fName} in the error message), returns TRUE on success, and returns FALSE if there are no more events in {rd}. */ void write_freqs_probs ( FILE *wr, int k, char *eventList, int64_t nTfull, int64_t nEfull, code_table_t *tbB, code_table_t *tbL, int64_vec_t *ctE, int64_vec_t *ctP ); /* Writes to file {wr} the raw occurrence counts and the conditional probabilities {Pr(t|e)} for intersting {k}-event/{k}-tuple pairs {(e,t)}. If {eventList} is not null, it must be the name of a file containing the list of interesting {k}-events, one per line. If {eventList} is NULL, but there are more than {nEfull} possible {k}-events, the procedure assumes that only those events with nonzero occurrences are interesting. Otherwise the procedure considers all possible {k}-events interesting. In any case, if the number of possible {k}-tuples is greater than {nTfull}, the procedure writes only those pairs with non-zero occurence counts. */ int main(int argc, char **argv); /* IMPLEMENTATIONS */ int main(int argc, char **argv) { Options_t *o = get_options(argc, argv); int k = o->windowSize; /* Window size: */ /* Build the nucleotide character map and encoding/decoding table: */ char *basSet = "ATCGatcgUu"; /* Valid input nucleotide chars. */ char *basEqv = "ATCGATCGTT"; /* Equivalent nucleotide chars. */ char mapB[256]; build_char_map(basSet, basEqv, strlen(basSet), '?', mapB); code_table_t tbB = build_code_table(basEqv, strlen(basEqv)); /* Build the lable character map and encoding/decoding table: */ char mapL[256]; build_char_map(o->labSet, o->labEqv, strlen(o->labSet), '?', mapL); code_table_t tbL = build_code_table(o->labEqv, strlen(o->labEqv)); /* Check whether we have enough memory: */ double maxMem = 64.0e6; /* Max reasonable table size in bytes. */ double maxAlt = exp(log(maxMem)/k); /* Max alternatives per letter. */ demand(tbB.n*tbL.n <= maxAlt, "tables would be too big"); fprintf(stderr, "alphabet size: bases = %d labels = %d\n", tbB.n, tbL.n); /* Total number of distinct events, tuples, pairs: */ int64_t nT = ipow(tbB.n,k); int64_t nE = ipow(tbL.n,k); int64_t nP = nT*nE; fprintf(stderr, "table size: tuples = %lld events = %lld pairs = %lld\n", nT, nE, nP); /* Allocate and initialize the count arrays {ctE,ctP}: */ int64_vec_t ctE = int64_vec_new(nE); /* Counts occurrences of each {k}-event. */ int64_vec_t ctP = int64_vec_new(nP); /* Counts occurrences of each {k}-tuple/{k}-event pair. */ int64_t ix; for (ix = 0; ix < nE; ix++) { ctE.e[ix] = 0; } for (ix = 0; ix < nP; ix++) { ctP.e[ix] = 0; } /* Loop on file pairs: */ FILE *bankFile = open_read(o->dataBank, TRUE); int nSeqs = 0; /* Number of biosequences (file pairs) processed. */ while(TRUE) { /* Skip initial spaces on line: */ fget_skip_spaces(bankFile); /* Check for end-of-file: */ if (feof(bankFile)) { break; } /* Omit blank lines: */ if (fget_test_char(bankFile, '\n')) { continue; } /* Omit comment lines; */ if (fget_test_char(bankFile, '>')) { int c = '>'; while ((! feof(bankFile)) && (c != '\n')) { c = fgetc(bankFile); } continue; } /* Get names of input files (bases and labels): */ char *basName = fget_string(bankFile); fget_skip_spaces(bankFile); char *labName = fget_string(bankFile); fget_skip_spaces(bankFile); /* Process the two files: */ process_file_pair(k, basName, mapB, &tbB, labName, mapL, &tbL, o->quiet, &ctE, &ctP); nSeqs++; /* Skip to next line: */ fget_eol(bankFile); } fclose(bankFile); fprintf(stderr, "%d biosequences processed.\n", nSeqs); /* Max number of entries to print the full table: */ int64_t nTfull = ipow(4,6); /* Allows {A,T,C,G} with {k=6}. */ int64_t nEfull = ipow(2,6); /* Allows {N,K} with {k=6}. */ /* Output the interesting counts: */ write_freqs_probs(stdout, k, o->eventList, nTfull, nEfull, &tbB, &tbL, &ctE, &ctP); return(0); } void process_file_pair ( int k, char *basName, char mapB[], code_table_t *tbB, char *labName, char mapL[], code_table_t *tbL, bool_t quiet, int64_vec_t *ctE, int64_vec_t *ctP ) { FILE *basFile = open_read(basName, ! quiet); FILE *labFile = open_read(labName, ! quiet); /* Total number of distinct events, tuples, pairs: */ int64_t nE = ctE->ne; int64_t nP = ctP->ne; int64_t nT = nP/nE; int nBases = 0; /* Count of non-blank, non-comment chars read from each file. */ char tuple[k]; /* Current {k}-tuple (circular queue). */ char event[k]; /* Current {k}-event (circular queue). */ bool_t bad[k]; /* {bad[j]} is TRUE iff {tuple[j]} or {event[j]} is invalid. */ int nBad = 0; /* Number of `bad' base/label pairs in {tuple}/{event}. */ /* Tuple and event indices (computed assuming code 0 for bad slots). */ int64_t ixT = 0; /* Index of {tuple} in list of all {k}-tuples. */ int64_t ixE = 0; /* Index of {event} in list of all {k}-events. */ /* Line numbers in data files: */ int basLine = 1; int labLine = 1; /* Skip comment blocks: */ skip_bio_file_header(basFile, &basLine); skip_bio_file_header(labFile, &labLine); /* Loop on character pairs: */ int basChar = fgetc(basFile); int labChar = fgetc(labFile); while ((basChar != EOF) && (labChar != EOF)) { /* Skip blanks and newlines: */ if (basChar == '\n') { basLine++; basChar = fgetc(basFile); continue; } if (basChar == ' ') { basChar = fgetc(basFile); continue; } if (labChar == '\n') { labLine++; labChar = fgetc(labFile); continue; } if (labChar == ' ') { labChar = fgetc(labFile); continue; } /* Got the next pair of non-blank characters, process it: */ /* Apply the character maps: */ basChar = mapB[(unsigned)basChar]; labChar = mapL[(unsigned)labChar]; /* Map nucleotide and label to numeric codes: */ int basCode = tbB->num[(unsigned)basChar]; int labCode = tbL->num[(unsigned)labChar]; /* Compute slot {j} of current chars in {tuple} and {event}: */ int j = nBases % k; /* Shift tuple and event indices, discarding oldest base/label pair: */ ixT = (tbB->n*ixT) % nT; ixE = (tbL->n*ixE) % nE; /* Update {nBad} to account for discarded base/label pair: */ if ((nBases >= k) && bad[j]) { nBad--; } /* Store characters in {tuple} and [event} queues: */ tuple[j] = basChar; event[j] = labChar; bad[j] = (basCode < 0) | (basCode >= tbB->n) | (labCode < 0) | (labCode >= tbL->n); nBases++; if (bad[j]) { nBad++; } else { /* Add base and label codes to the tuple and event indices: */ ixT += basCode; ixE += labCode; } if ((nBases >= k) && (nBad == 0)) { /* Count one more occurrence of pair and event: */ int64_t ixP = ixE*nT + ixT; ctP->e[ixP]++; ctE->e[ixE]++; } /* Get next characters from each file: */ basChar = fgetc(basFile); labChar = fgetc(labFile); } fclose(basFile); fclose(labFile); } bool_t read_event ( FILE *rd, char *fName, int *lineNumP, int k, code_table_t *tbL, int64_t *ixE ) { auto void error(char *msg); void error(char *msg) { fprintf(stderr, "%s:%d: ** %s\n", fName, *lineNumP, msg); exit(1); } int ch = fgetc(rd); int i = 0; /* Number of label letters read so far. */ (*ixE) = 0; while (TRUE) { assert(i <= k); if ((ch == EOF) && (i == 0)) { /* Normal end-of-file, no event: */ return FALSE; } else if ((ch == EOF) || (ch == ' ') || (ch == '\011') || (ch == '\n') || (ch == '\015') || (ch == '#')) { /* Character {ch} is a valid event separator/terminator: */ if (ch == '#') { /* Skip rest of comment, leaving only the final '\n' or {EOF}: */ do { ch = fgetc(rd); } while ((ch != '\n') && (ch != EOF)); } /* Finish the event: */ if (i == k) { /* Normal end of event: */ if (ch == '\n') { /* Put back the newline so that the next read increments {*lineNumP}: */ ungetc(ch, rd); } return TRUE; } else if (i > 0) { error("truncated event"); } /* We have't found the first letter of the event yet. Skip this one: */ if (ch == '\n') { (*lineNumP) ++; } } else { int num = tbL->num[(unsigned)ch]; if ((num < 0) || (num > tbL->n)) { error("Invalid character"); num = 0; } if (i >= k) { error("event is too long"); } else { (*ixE) = (*ixE)*tbL->n + num; i++; } } /* Get next character: */ ch = fgetc(rd); } } void write_freqs_probs ( FILE *wr, int k, char *eventList, int64_t nTfull, int64_t nEfull, code_table_t *tbB, code_table_t *tbL, int64_vec_t *ctE, int64_vec_t *ctP ) { /* Total number of distinct events, tuples, pairs: */ int64_t nE = ctE->ne; int64_t nP = ctP->ne; int64_t nT = nP/nE; fprintf(wr, "> Frequencies and probabilities of %d-tuples and %d-events\n", k, k); fprintf(wr, "> Columns are: event, tuple, occurrences(tuple,event), cond-prob(tuple|event)\n"); fprintf(wr, "\n"); auto void write_event(int64_t ixE); /* Writes out all pairs for the {k}-event whose index is {ixE}. */ if (eventList != NULL) { FILE *rd = open_read(eventList, TRUE); /* Read the interesting events and set all their counts to 0: */ int lineNum = 1; int64_t ixE; while (read_event(rd, eventList, &lineNum, k, tbL, &ixE)) { write_event(ixE); } fclose(rd); } else { int64_t ixE; for(ixE = 0; ixE < nE; ixE++) { if ((nE <= nEfull) || (ctE->e[ixE] > 0)) { write_event(ixE); } } } fclose(wr); return; void write_event(int64_t ixE) { /* Decode the index {ixE} into {event[0..k-1]}: */ char event[k]; int64_t d; d = nE; int64_t jxE = ixE; int r; for (r = 0; r < k; r++) { d /= tbL->n; int labCode = jxE / d; jxE = jxE % d; event[r] = tbL->chr[labCode]; } /* Enumerate all {k}-tuples: */ int64_t ixT; for (ixT = 0; ixT < nT; ixT++) { /* Compute the pair index {ixP}: */ int64_t ixP = ixE*nT + ixT; if ((nT < nTfull) || (ctP->e[ixP] > 0)) { /* Pair is interesting: */ /* Write the {k}-event: */ for (r = 0; r < k; r++) { fputc(event[r], wr); } fputc(' ', wr); /* Write the {k}-tuple: */ d = ipow(tbB->n, k); int64_t jxT = ixT; for (r = 0; r < k; r++) { d /= tbB->n; int basCode = jxT / d; jxT = jxT % d; fputc(tbB->chr[basCode], wr); } /* Write raw count: */ fprintf(wr, " %10lld", ctP->e[ixP]); /* Write the estimated conditional probability: */ double prob = ((double)ctP->e[ixP]+1)/((double)ctE->e[ixE]+nT); fprintf(wr, " %10.7f", prob); fputc('\n', wr); } } fputc('\n', wr); } } Options_t *get_options(int argc, char **argv) { /* Get command-line parameters: */ Options_t *o = notnull(malloc(sizeof(Options_t)), "no mem"); argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); /* Parse keyword parameters: */ argparser_get_keyword(pp, "-dataBank"); o->dataBank = argparser_get_next(pp); argparser_get_keyword(pp, "-windowSize"); o->windowSize = argparser_get_next_int(pp, 0, 100); argparser_get_keyword(pp, "-labelMap"); o->labSet = argparser_get_next(pp); argparser_get_keyword_next(pp, "="); o->labEqv = argparser_get_next(pp); { /* Check for invalid characters in the label map: */ int m = strlen(o->labSet); if (m != strlen(o->labEqv)) { fprintf(stderr, "label char lists have different lengths\n"); exit(1); } char *str[2] = { o->labSet, o->labEqv}; int j; for (j = 0; j < 2; j++) { int i; for (i = 0; i < m; i++) { char s = str[j][i]; if ( ((s < 'A') || (s > 'Z')) && ((s < 'a') || (s > 'z')) && ((s < '0') || (s > '9')) && (s != '?') && (s != '*') && (s != '@') && (s != '.') && (s != '-') ) { fprintf(stderr, "invalid label character '%c'\n", s); exit(1); } } } } o->quiet = argparser_keyword_present(pp, "-quiet"); if (argparser_keyword_present(pp, "-eventList")) { o->eventList = argparser_get_next(pp); } else if (argparser_keyword_present(pp, "-all")) { o->eventList = NULL; } else { o->eventList = NULL; } /* Get positional parameters: */ argparser_skip_parsed(pp); /* Check for spurious arguments: */ argparser_finish(pp); return o; }