/* See {dbd_lib.h}. */ /* Last edited on 2008-06-10 20:01:26 by stolfi */ #define dbd_lib_C_COPYRIGHT "Copyright © 2006,2008 UFF, Niteroi, and UNICAMP, Campinas" #define dbd_lib_C_AUTHORS "Renatha Oliva Capua (IC-UFF) and Jorge Stolfi (IC-UNICAMP)" #define dbd_lib_C_HISTORY \ "2005-02-27 R.Capua - created {biblioteca.c} with some string and file functions.\n" \ "2005-10-22 J.Stolfi - created {sgc_lib.c} with some DNA file functions.\n" \ "2008-06-06 J.Stolfi - merged {biblioteca.h,sgc_lib.h} into {dbd_lib.c}.\n" #include #include #include #include #include #include #include #include void build_char_map(char in[], char out[], int m, char bad, char map[]) { /* Set all table entries, check for conflicts: */ int i; for (i = 0; i < 256; i++) { map[i] = bad; } for (i = 0; i < m; i++) { int iU = (unsigned)(in[i]); if (map[iU] == bad) { map[iU] = out[i]; } else { demand(map[iU] == out[i], "inconsistent mapping"); } } /* We may have missed inconsistencies involving {bad}: */ for (i = 0; i < m; i++) { if (out[i] == bad) { int iU = (unsigned)(in[i]); demand(map[iU] == bad, "inconsistent mapping"); } } } code_table_t build_code_table(char alpha[], int m) { /* Allocate the {num,chr} tables: */ int nmax = (m < 256 ? m : 256); /* Upper bound on {n}. */ int *num = notnull(malloc(256*sizeof(int)), "no mem"); char *chr = notnull(malloc(nmax*sizeof(char)), "no mem"); /* Fill the {num,ochr} tables, get {n}: */ int i; for (i = 0; i < 256; i++) { num[i] = -1; } int n = 0; for (i = 0; i < m; i++) { int aU = (unsigned)(alpha[i]); if (num[aU] == -1) { num[aU] = n; chr[n] = alpha[i]; n++; } } /* Return the results: */ return (code_table_t){ .n = n, .num = num, .chr = chr }; } void free_code_table(code_table_t *tb) { free(tb->num); free(tb->chr); } void skip_bio_file_header(FILE *bioFile, int *bioLine) { int ch = fgetc(bioFile); while (ch == '>'){ /* Skip rest of line: */ (*bioLine)++; while((!feof(bioFile)) && (ch != '\n')) {ch = fgetc(bioFile); } ch = fgetc(bioFile); } ungetc(ch, bioFile); } int base_code(char ch) { if ((ch == 'a') || (ch == 'A')) { return 0; } else if ((ch == 't') || (ch == 'T')) { return 1; } else if ((ch == 'u') || (ch == 'U')) { return 1; } else if ((ch == 'c') || (ch == 'C')) { return 2; } else if ((ch == 'g') || (ch == 'G')) { return 3; } else if (ch == '?') { return 4; } else if ((ch >= 'a') || (ch <= 'z')) { return 4; } else if ((ch >= 'A') || (ch <= 'Z')) { return 4; } else if (ch == '*') { return 5; } else { return -1; } } int label_code(char ch) { if ((ch == 'D') || (ch == 'd')) { return 0; } else if ((ch == 'E') || (ch == 'e')) { return 1; } else if ((ch == 'F') || (ch == 'f')) { return 2; } else if ((ch == 'N') || (ch == 'n')) { return 3; } else if (ch == '?') { return 4; } else if (ch == '*') { return 5; } else { return -1; } } char base_char(int cd) { if (cd == 0) { return 'A'; } else if (cd == 1) { return 'T'; } else if (cd == 2) { return 'C'; } else if (cd == 3) { return 'G'; } else { return '?'; } } char label_char(int cd) { if (cd == 0) { return 'D'; } else if (cd == 1) { return 'E'; } else if (cd == 2) { return 'F'; } else if (cd == 3) { return 'N'; } else { return '?'; } } char_vec_t read_bio_seq_file(char *seqName) { char_vec_t seq = char_vec_new(0); /* To be expanded as needed. */ if ((seqName == NULL) || ((*seqName) == '\000') || (strcmp(seqName, "-") == 0)) { return seq; } FILE *seqFile = open_read(seqName, TRUE); int seqLine = 1; /* Line sequence number in file. */ /* Skip ">" heder, if any: */ skip_bio_file_header(seqFile, &seqLine); int nChars = 0; /* Number of bases/labels read so far. */ /* Loop on file characters: */ int ch = fgetc(seqFile); while(ch != EOF) { /* Skip newlines, counting lines: */ if (ch == '\n') { seqLine++; ch = fgetc(seqFile); continue; } /* Skip blanks: */ if (ch == ' ') { ch = fgetc(seqFile); continue; } /* Got a non-blank char, store it: */ ch = toupper(ch); char_vec_expand(&seq, nChars); seq.e[nChars] = ch; nChars++; ch = fgetc(seqFile); } fclose(seqFile); fprintf(stderr, "read %d characters\n", nChars); char_vec_trim(&seq, nChars); return seq; } void extract_substring(char_vec_t *seq, int start, int len, char *sub) { int i; for (i = 0; i < len; i++) { int j = start + i; char ch = ((j < 0) || (j >= seq->ne) ? '?' : seq->e[j]); sub[i] = ch; } sub[len] = '\000'; } int64_t event_index(char *event) { int64_t ix = 0; while (*event != '\000') { int labCode = label_code(*event); affirm((labCode >= 0) && (labCode < 4), "bad label in event"); ix = 4*ix + labCode; event++; } return ix; } int64_t tuple_index(char *tuple) { int64_t ix = 0; while (*tuple != '\000') { int basCode = base_code(*tuple); affirm((basCode >= 0) && (basCode < 4), "bad base in tuple"); ix = 4*ix + basCode; tuple++; } return ix; }