/* Tools for statistical genome classification. */
/* Last edited on 2024-12-21 14:03:47 by stolfi */

/* This header declares helpers for reading sequence files in the
  "bio" format, for building character translation tables, and for
  converting base letters, label letters, tuples and events to and
  from numeric codes. Only declarations live here; the contracts
  below are as stated by the comments that follow each declaration. */

#ifndef dbd_lib_H
#define dbd_lib_H

#define dbd_lib_H_COPYRIGHT "Copyright © 2006,2008 UFF, Niteroi, and UNICAMP, Campinas"

/* NOTE(review): the header names of the three {#include} directives
  below are missing in this copy of the file, so it will not
  preprocess as it stands. The declarations below use {FILE} (declared
  by <stdio.h>), {int64_t} (declared by <stdint.h>) and {char_vec_t}
  (a project-declared vector-of-char type; the header that provides it
  is not visible here -- TODO confirm and restore all three names). */
#include
#include
#include

void skip_bio_file_header(FILE *bioFile, int *bioLine);
  /* The procedure should be called when file {bioFile} is positioned
    just before the first character of a new line (or at EOF). If that
    character is '>', reads and discards the rest of the line,
    including the final end-of-line; and ditto for any subsequent
    lines that start with '>'. Returns with {bioFile} at EOF or poised
    to read the first char of a non-comment line. Increments
    {*bioLine} (the file's line counter) as appropriate. */

void build_char_map(char in[], char out[], int m, char bad, char map[]);
  /* Fills {map} with a character mapping table defined by the arrays
    {in,out}. Specifically, sets {map[0..255]} so that
    {map[(unsigned)in[i]]==out[i]} for all {i} in {0..m-1}. Also, for
    any character {ch} that does not occur in {in[0..m-1]}, the
    procedure sets {map[(unsigned)ch] = bad}.

    The caller must provide {map} with room for 256 entries, and
    {in,out} with at least {m} entries each.

    The arrays {in,out} should be consistent; that is, if {in[i]} is
    equal to {in[j]} then {out[i]} must be equal to {out[j]}. */

typedef struct code_table_t
  { int n;      /* Nominal alphabet size. */
    int *num;   /* Maps unsigned chars in {'\000'..'\377'} to integers. */
    char *chr;  /* Maps integers in {0..n-1} to characters. */
  } code_table_t;
  /* A pair of tables that define a mapping of arbitrary characters to
    integers in {0..n-1} and vice-versa. Namely, {num[(unsigned)ch]}
    is the integer code of any 8-bit character {ch}. Conversely,
    {chr[ix]} is an arbitrary character corresponding to any numeric
    code {ix} in {0..n-1}. Note that the tables need not be inverses
    of each other, and the values of {num} need not be in {0..n-1}. */

code_table_t build_code_table(char alpha[], int m);
  /* Creates an encoding/decoding table {tb} for an arbitrary alphabet
    defined by the string {alpha[0..m-1]}.

    The table assigns to each distinct letter that occurs in
    {alpha[0..m-1]} a distinct numeric code in the range
    {0..tb.n-1}, where {tb.n} is the number of distinct characters in
    {alpha[0..m-1]}. The code of a character {ch} is
    {tb.num[(unsigned char)ch]}, and is the number of distinct letters
    that precede the first occurrence of {ch} in {alpha[0..m-1]}.
    Characters that do not occur in {alpha} are mapped to -1.

    Conversely, {tb.chr[ix]} is the character of {alpha} with numeric
    code {ix}, provided that {ix} is in {0..tb.n-1}.

    The tables inside the result should be released with
    {free_code_table} (declared below) when no longer needed. */

void free_code_table(code_table_t *tb);
  /* Frees the code tables inside {tb} (but not {tb} itself). */

int base_code(char ch);
  /* The numeric code of basis letter {ch}: [Aa] = 0, [TtUu] = 1,
    [Cc] = 2, [Gg] = 3, (any other letter or '?') = 4, '*' = 5,
    invalid = -1. */

int label_code(char ch);
  /* The numeric code of label letter {ch}: [Dd] = 0, [Ee] = 1,
    [Ff] = 2, [Ii] = 3, '?' = 4, '*' = 5, invalid = -1.

    NOTE(review): this comment lists [Ii] for code 3, whereas the
    comments of {label_char} and {event_index} use 'N' for that code.
    Which letters are actually accepted cannot be determined from this
    header -- TODO confirm against the implementation. */

char base_char(int ix);
  /* The character code of basis code {ix}: 0 = 'A', 1 = 'T', 2 = 'C',
    3 = 'G', other = '?'. */

char label_char(int ix);
  /* The character code of label code {ix}: 0 = 'D', 1 = 'E', 2 = 'F',
    3 = 'N', other = '?'. */

char_vec_t read_bio_seq_file(char *seqName);
  /* Reads a sequence of bases or labels from file {seqName}, in the
    unified format. If the file name is NULL, returns an empty
    vector. */

void extract_substring(char_vec_t *seq, int start, int len, char *sub);
  /* Extracts the substring of {seq} that begins at {seq[start]} and
    has length {len}, and stores it into {sub[0..len-1]}, followed by
    '\000'. Thus {sub} must be allocated with at least {len+1} bytes.
    If any part of the substring falls outside {seq}, those characters
    are set to '?'. */

int64_t event_index(char *event);
  /* Computes the index of the given {event} among the list of all
    {k}-events, where {k = strlen(event)}. Considers {event} as a
    number in base 4 with digits [DEFN] (ignoring case), where the
    digit values are given by {label_code}.

    Fails if the event contains any other label. */

int64_t tuple_index(char *tuple);
  /* Computes the index of the given {tuple} among the list of all
    {k}-tuples, where {k = strlen(tuple)}. Considers {tuple} as a
    number in base 4 with digits [ATUCG] (ignoring case), where the
    digit values are given by {base_code}. Note that 'U' has the same
    value as 'T'; so, for example, "ATU", "AUT", "AUU" and "ATT" map
    to the same index.

    Fails if the tuple contains any other letter. */

#endif