#! /usr/bin/gawk -f # Last edited on 2008-06-08 17:50:24 by stolfi BEGIN { usage = ( \ "dbd_synthesize_dna \\\n" \ " -v tabK=TAB3FILE.prb \\\n" \ " -v tabN=TAB1FILE.prb \\\n" \ " -v nBases=NUM \\\n" \ " -v seed=NUM \\\n" \ " -v tKMin=${NUM} \\\n" \ " -v tKAvg=${NUM} \\\n" \ " -v tKDev=${NUM} \\\n" \ " -v tNMin=${NUM} \\\n" \ " -v tNMid=${NUM} \\\n" \ " -v tNTrp=${NUM} \\\n" \ " -v tNTau=${NUM} \\\n" \ " -v outName=NAME" \ ); # Generates a random DNA sequence according to our stochastic model. # There are two generators, `non-coding' ({N}) and `coding' ({K}): # # The {N} generator simply concatenates bases independely chosen # with probabilities given by table {tabN}, and assigns them the # label 'N'. # # The {K} generator concatenates triplets of DNA # independently chosen with the probabilities of table {tabK}, and # assigns to its bases the labels 'D', 'E', 'F', according to their # positions in each codon. # # The output swaps between the two generators in such a way that # the stretches of each type have the appropriate distributions: # # The {K} regions have Gaussian-distributed lengths with mean # {tKAvg} and deviation {tKDev}, truncated at minimum length # {tKMin}. # # The {N} regions have uniformly distributed lengths between # {tNMin} and {tNMid}, and exponentially decaying lengths # above {tkMid}, with exponent factor {tKTau}. The first # case happens with probability {tNTrp} # # The {K} generator preserves its state while the {N} generator is # active; i.e. the concatenation of the {K} regions in order is a # string of whole codons. The random number generator is initialized with # the given {seed}. # # The bases and their labels are written to files "{outName}.bas" # and "{outName}.lab", respectively. Just in case, the sequence # of aminoacids corresponding to the codons in the coding regions # is written to "{outName}.trn". Note that the last codon of this # file may be incomplete in "{outName}.bas". # # Each line of a probability table must have 4 columns, # "{EVENT} {TUPLE} {FREQ} {PROB}" # where {EVENT} is a label string, {TUPLE} is a string # of bases, {FREQ} is the occurence count for that event/tuple # pair in the training dataset, and {PROB} is the estimated # probability {Pr(TUPLE|EVENT)}. The {FREQ} colum is not used. abort = -1; if (tabN == "") { arg_error("must define tabN"); } if (tabK == "") { arg_error("must define tabK"); } if (nBases == "") { arg_error("must define nBases"); } if (seed == "") { arg_error("must define seed"); } if (tKMin == "") { arg_error("must define \"tKMin\""); } if (tKAvg == "") { arg_error("must define \"tKAvg\""); } if (tKDev == "") { arg_error("must define \"tKDev\""); } if (tNMin == "") { arg_error("must define \"tNMin\""); } if (tNMid == "") { arg_error("must define \"tNMid\""); } if (tNTrp == "") { arg_error("must define \"tNTrp\""); } if (tNTau == "") { arg_error("must define \"tNTau\""); } if (outName == "") { arg_error("must define outName"); } # Make sure that all numeric parameters are indeed numeric: nBases += 0; tKMin += 0; tKAvg += 0; tKDev += 0; tNMin += 0; tNMid += 0; tNTrp += 0; tNTau += 0; # Parameter constraints: if (tKMin < 0) { arg_error("invalid \"tKMin\""); } if (tKDev < 0) { arg_error("invalid \"tKDev\""); } if (tNMin < 0) { arg_error("invalid \"tNMin\""); } if (tNMid <= tNMin) { arg_error("invalid \"tNMin,tNMid\""); } if ((tNTrp < 0) || (tNTrp > 1.0)) { arg_error("invalid \"tNTrp\""); } if (tNTau <= 0) { arg_error("invalid \"tNTau\""); } srand(seed + 1234567); basFile = ( outName ".bas" ); # DNA file name. labFile = ( outName ".lab" ); # Label file name. trnFile = ( outName ".trn" ); # Translation file name. split("", prob1); # Makes {prob1} into an array. read_cond_prob_table(tabN, prob1, 0); split("", prob3); # Makes {prob3} into an array. read_cond_prob_table(tabK, prob3, 1); outNum = 0; # Number of bases written so far. curType = 0; # Current region type: {0} = non-coding, {1} = coding. curLength = 0; # Chosen length of current region. curNum = 0; # Number of bases written since last type change. curPhase = 0; # Position of next coding base in its codon (0,1,2). curCodon = ""; # Codon that contains the last {K}-type base written. split("", tyNum); # {tyNum[type]} is the number of bases of type {ty} written so far. tyNum[0] = 0; tyNum[1] = 0; split("", tbAmino); setup_aminoacid_table(tbAmino); # Loop on bases: while(outNum < nBases) { swap = (curNum >= curLength); if (swap) { # Swap to the other model curType = 1 - curType; curNum = 0; curLength = choose_region_length(curType); # printf " %s[%d]", substr("NK",1+curType,1), curLength > "/dev/stderr" } if (curType == 0) { # Non-coding region: pick a random base from table {prob1}. bas = choose_string(prob1); lab = "N"; } else if (curType == 1) { # Coding region: pick next base of a random codon from table {prob3}. if (curPhase == 0) { # Choose the next codon: curCodon = choose_string(prob3); # Write its aminoacid to the translation file: printf "%s %s\n", codon, tbAmino[codon] > trnFile; } # Pick the next letter from the current codon: curPhase++; bas = substr(curCodon, curPhase, 1); lab = substr("DEF", curPhase, 1); curPhase = curPhase % 3; } else { prog_error("invalid region type"); } if ((outNum > 0) && (outNum % 60 == 0)) { printf "\n" > basFile; printf "\n" > labFile; } printf "%c", bas > basFile; printf "%c", lab > labFile; outNum++; curNum++; tyNum[curType]++; } printf "\n" > basFile; close(basFile); printf "\n" > labFile; close(labFile); printf "\n" > trnFile; close(trnFile); printf "%d bases written to %s and %s\n", outNum, basFile, labFile > "/dev/stderr"; printf "type K = %d type N = %d\n", tyNum[1], tyNum[0] > "/dev/stderr"; } function choose_region_length(type, g,n) { # Chooses the length {n} of the next region, assumed to be of the given {type}. if (type == 0) { # Non-coding: if (rand() < tNTrp) { # Transient part of distribution -- assume it is unform. n = tNMin + int((tNMid - tNMin)*rand()); } else { # Exponential tail of distribution. g = rand()*0.9999999; n = tNMid + int(-log(g)/tNTau + 0.5); } } else if (type == 1) { # Coding: do { n = int(tKDev * gauss_rand() + tKAvg + 0.5); } while (n < tKMin); } else { prog_error("invalid region type"); } return n; } function gauss_rand( u,v,r) { # Returns a Gaussian-distributed random number with # mean 0 and deviation 1. /* Polar method [Knuth II:3.4.1, Algorithm P] */ while (1) { u = 2.0*rand() - 1.0; v = 2.0*rand() - 1.0; r = sqrt(u*u + v*v); if ((r < 1.0) && (r > 1.0e-30)) { return (1.41421356*(u + v)*sqrt(-log(r))/r); } } } function choose_string(frq, s,pr,pra) { # Chooses a string {s} with probability proportional to {frq[s]}. pra = rand()*0.9999999; pr = pra; for (s in frq) { # printf " %s %.6f %.6f\n", s, pr, frq[s] > "/dev/stderr"; pr -= frq[s]; if (pr <= 0) { return s; } } printf "** choice failure - prob = %11.8f\n", pra > "/dev/stderr"; return "***BUG***"; } function compute_swap_prob(tavg ) { # Computes the probability of transition from a state A # to a state B such that the average number of steps # before (and excluding) that transition is {tavg}. return 1.0/(tavg + 1.0); } function read_cond_prob_table(name,prob,type, lin,fld,nfld,ok,tot,nok,nlin) { # Reads a probability table from file {name}, puts in {prob[s]} the # probability of tuple {s} given the corresponding event. # # The table must have the format "{EVENT} {TUPLE} {COUNT} {PROB}" # where {EVENT} is a string of labels [DEFN], {TUPLE} is a # string of bases [ATCG], {COUNT} (not used) is the number of # occurrences of that event/tuple pair in the training dataset, and # {PROB} is the estimated conditional probability {Pr(TUPLE|EVENT)}. # # The procedure reads only a subset of the entries present in the file. # If {type=1}, considers only tuples labeled "DEF" (i.e. codons). # If {type=0}, consider only tuples labeled "N". printf "reading table file \"%s\"...\n", name > "/dev/stderr"; tot = 0; nlin = 0; ERRNO = ""; while ((getline lin < name) > 0) { nlin++; if (lin !~ /^([>]| *$)/) { nfld = split(lin, fld); if (nfld != 4) { cond_prob_table_error(name,nlin, ("bad num fields")); } if (type) { ok = (fld[1] == "DEF"); } else { ok = (fld[1] == "N"); } if (ok) { prob[fld[2]] = fld[4]; tot += fld[4]; nok++; } } } if (ERRNO != "") { cond_prob_table_error(name,nlin, ("read error ERRNO = \"" ERRNO "\"")); } if (tot == 0) { cond_prob_table_error(name,nlin, ("total frequency is zero")); } printf " %d lines %d relevant entries", nlin, nok > "/dev/stderr"; printf " total prob = %8.6f", tot > "/dev/stderr"; printf "\n" > "/dev/stderr"; } function cond_prob_table_error(name,nlin, msg) { printf "%s:%d: ** %s\n", name, nlin, msg > "/dev/stderr"; exit 1; } function setup_aminoacid_table(tb ) { # Stores in {tb} the codon-to-aminoacid table. # Aminoacids are represented by capital letter. # The stop codons are mapped to "$". tb["AAA"] = "K"; tb["AAT"] = "N"; tb["AAC"] = "N"; tb["AAG"] = "K"; tb["ATA"] = "I"; tb["ATT"] = "I"; tb["ATC"] = "I"; tb["ATG"] = "M"; tb["ACA"] = "T"; tb["ACT"] = "T"; tb["ACC"] = "T"; tb["ACG"] = "T"; tb["AGA"] = "R"; tb["AGT"] = "S"; tb["AGC"] = "S"; tb["AGG"] = "R"; tb["TAA"] = "$"; tb["TAT"] = "Y"; tb["TAC"] = "Y"; tb["TAG"] = "$"; tb["TTA"] = "L"; tb["TTT"] = "F"; tb["TTC"] = "F"; tb["TTG"] = "L"; tb["TCA"] = "S"; tb["TCT"] = "S"; tb["TCC"] = "S"; tb["TCG"] = "S"; tb["TGA"] = "$"; tb["TGT"] = "C"; tb["TGC"] = "C"; tb["TGG"] = "W"; tb["CAA"] = "Q"; tb["CAT"] = "H"; tb["CAC"] = "H"; tb["CAG"] = "Q"; tb["CTA"] = "L"; tb["CTT"] = "L"; tb["CTC"] = "L"; tb["CTG"] = "L"; tb["CCA"] = "P"; tb["CCT"] = "P"; tb["CCC"] = "P"; tb["CCG"] = "P"; tb["CGA"] = "R"; tb["CGT"] = "R"; tb["CGC"] = "R"; tb["CGG"] = "R"; tb["GAA"] = "E"; tb["GAT"] = "D"; tb["GAC"] = "D"; tb["GAG"] = "E"; tb["GTA"] = "V"; tb["GTT"] = "V"; tb["GTC"] = "V"; tb["GTG"] = "V"; tb["GCA"] = "A"; tb["GCT"] = "A"; tb["GCC"] = "A"; tb["GCG"] = "A"; tb["GGA"] = "G"; tb["GGT"] = "G"; tb["GGC"] = "G"; tb["GGG"] = "G"; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit 1; } function prog_error(msg) { printf "%s:d: ** program error -- %s\n", FILENAME, FNR, msg > "/dev/stderr"; abort = 1; exit 1; }