/* See dnae_nucleic.h */ /* Last edited on 2019-04-09 13:03:48 by jstolfi */ #define dnae_nucleic_C_COPYRIGHT \ "Copyright © 2006 by the State University of Campinas (UNICAMP)" #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include bool_t dnae_nucleic_mutate_step ( char c, bool_t dna, bool_t upper, double mutProb, double delProb, int *nP, char_vec_t *cv, int *skp, bool_t *mut ); /* Simulates one cycle of the natural duplication of a nucleotide sequence, given its next nucleotide letter {c}. With probability {mutProb}, the procedure discards the nucleotide {c} and outputs a random nucleotide instead. In that case, it sets {*mut=TRUE,*skp=0}. With probability {delProb/2}, the procedure discards {c} but outputs nothing; it then sets {*mut=FALSE,*skp=-1}. With probabilty {delProb/2}, the procedure does not discard {c}, but outputs an extra random nucleotide before it; it then sets {*mut=FALSE,*skp=+1}. With the remaining probability, the procedure simply copies {c} to the output, and sets {*mut=FALSE,*skp=0}. The random nucleotides are generated by {dnae_nucleic_throw(upper,dna)}. The procedure stores the output nucleotide, if any, as byte {cv[*nP]}, and then increments {*nP}. The vector {cv} is expanded as needed. The procedure returns FALSE (`failure'), without outputting anything, whenever it decides to consume or discard the nucleotide {c} but finds that {c} is zero ({NUL}). Otherwise it returns TRUE (`success'). */ char dnae_nucleic_throw(bool_t upper, bool_t dna) { int c = (dna ? "atcg" : "aucg")[abrandom(0,3)]; if (upper) { c = toupper(c); } return (char)c; } char *dnae_nucleic_string_throw(int nb, bool_t upper, bool_t dna) { char *b = (char *)notnull(malloc((nb+1)*sizeof(char)), "no mem"); int k; for (k = 0; k < nb; k++) { b[k] = dnae_nucleic_throw(upper, dna); } b[nb] = 0; return b; } bool_t dnae_nucleic_mutate_step ( char c, bool_t dna, bool_t upper, double mutProb, double delProb, int *nP, char_vec_t *cv, int *skp, bool_t *mut ) { /* Initialize {mut,skp} for normal case (copy without mutation): */ (*mut) = FALSE; (*skp) = 0; /* Throw a [0_1] die: */ double toss = drandom(); /* Get the next character {r} to be added to {cv}, or NUL if none. */ char r; /* Note that the order of the tests is important. */ if ((toss -= delProb/2) < 0) { /* Insert a random letter before {c}: */ r = dnae_nucleic_throw(upper, dna); (*skp) = +1; } else if (c == 0) { /* There is no next letter: */ return FALSE; } else if ((toss -= mutProb) < 0) { /* Mutate the next letter {c}: */ r = dnae_nucleic_throw(upper, dna); (*mut) = TRUE; } else if ((toss -= delProb/2) < 0) { /* Delete the next letter {c}: */ r = 0; (*skp) = -1; } else { /* Jusr copy the next letter {c}: */ r = c; } if (r != 0) { /* Append {c} to {cv}: */ char_vec_expand(cv, (*nP)); cv->e[(*nP)] = r; (*nP)++; } return TRUE; } void dnae_nucleic_string_mutate ( char *s, bool_t dna, bool_t upper, double mutProb, double delProb, char **rp, msm_rung_vec_t *gv ) { /* Allocate character and integer vectors {rv,*gv} slightly larger than {s}: */ int nguess = (11*((int)strlen(s)))/10; /* Guessed size of result. */ char_vec_t rv = char_vec_new(nguess); (*gv) = msm_rung_vec_new(nguess); int nr = 0; /* Number of chars in new string. */ int ng = 0; /* Number of indices in {iv}. */ int ix = 0; /* Index of next char in {s}. */ while (TRUE) { int skp; bool_t mut; bool_t ok = dnae_nucleic_mutate_step(s[ix], dna, upper, mutProb, delProb, &nr, &rv, &skp, &mut); if (! ok) { break; } if ((skp == 0) && (mut == 0)) { /* Character {s[ix]} was copied without mutation: */ msm_rung_vec_expand(gv, ng); gv->e[ng] = (msm_rung_t){{ ix, nr }}; ng++; } if (skp <= 0) { /* Character {s[ix]} was consumed: */ ix++; } } /* Close the string: */ char_vec_expand(&rv, nr); rv.e[nr] = 0; nr++; /* Trim and return: */ char_vec_trim(&rv, nr); msm_rung_vec_trim(gv, ng); (*rp) = rv.e; } void dnae_nucleic_string_write_named(char *fname, char *tag, char *ext, char *s, char *cmt) { FILE *wr = open_write_tag_ext(fname, tag, ext, TRUE); dnae_nucleic_string_write(wr, s, cmt); fclose(wr); } #define dnae_nucleic_BASES_PER_LINE 70 void dnae_nucleic_string_write(FILE *wr, char *s, char *cmt) { int ind = 0; /* Comment indentation. */ filefmt_write_comment(wr, cmt, ind, '>'); int nb = 0; /* Number of nucleotides written out. */ while ((*s) != 0) { if ((nb > 0) && ((nb % dnae_nucleic_BASES_PER_LINE) == 0)) { fputc('\n', wr); } fputc((*s), wr); s++; nb++; } fputc('\n', wr); fflush(wr); } void dnae_nucleic_string_read(FILE *rd, char **sp, char **cmtp) { /* Read comment lines: */ (*cmtp) = filefmt_read_comment(rd, '>'); /* Read bases, skipping blanks: */ char_vec_t sv = char_vec_new(1000); int nbas = 0; /* Counts base letters read. */ int nlin = 1; /* Current line number for diagnostics. */ int ch; bool_t ignore_lf = FALSE; /* Set to TRUE immediately after a '\015' */ while ((ch = getc(rd)) != EOF) { if (ch=='\015') { /* Count one more line: */ nlin++; } else if ((ch=='\012') && (! ignore_lf)) { /* Count one more line: */ nlin++; } else if ((ch==' ') || (ch=='\011') || (ch=='\012') || (ch=='\014')) { /* Ignore character */ } else if (is_dna_basis((char)ch)) { char_vec_expand(&sv, nbas); sv.e[nbas] = (char)ch; nbas++; } ignore_lf = (ch=='\015'); } /* Add final NUL byte: */ char_vec_expand(&sv, nbas); sv.e[nbas] = '\000'; nbas++; /* Trim and return: */ char_vec_trim(&sv, nbas); (*sp) = sv.e; } void dnae_nucleic_string_read_named(char *fname, char *tag, char *ext, char **sp, char **cmtp) { FILE *rd = open_read_tag_ext(fname, tag, ext, TRUE); dnae_nucleic_string_read(rd, sp, cmtp); fclose(rd); } bool_t is_dna_basis(char c) { return (c == 'A') || (c == 'T') || (c == 'U') || (c == 'C') || (c == 'G') || (c == 'a') || (c == 't') || (c == 'u') || (c == 'c') || (c == 'g') ; } void dnae_nucleic_value(char b, int *A, int *T, int *C, int *G) { (*A) = (*T) = (*C) = (*G) = 0; if ((b == 'A') || (b == 'a')) { (*A) = 1; } else if ((b == 'T') || (b == 't') || (b == 'U') || (b == 'u')) { (*T) = 1; } else if ((b == 'C') || (b == 'c')) { (*C) = 1; } else if ((b == 'G') || (b == 'g')) { (*G) = 1; } else { /* Leave all variables zero. */ } }