#! /usr/bin/gawk -f # Last edited on 2005-10-22 20:46:53 by stolfi BEGIN { usage = ( \ "enum-valid-events \\\n" \ " -v windowSize=NUM \\\n" \ " -v tMin=NUM -v tC=NUM -v tN=NUM \\\n" \ " -v maxNFac=NUM \\\n" \ " > OUTFILE.pre" \ ); # Enumerates all {k}-events compatible with our DNA model, and their # "a priori" probabilities, where {k=windowSize}. # # The output file starts with one line "{k} {nE}" where {nE} is the # number of valid {k}-events in the model. Then come {nE} lines in # the format "{EVENT} {PROB} {PARTITION}" where the {EVENT} is a # valid {k}-event (a string of {k} symbols in [DEFN]), {PROB} is the # a priori probability of {EVENT} according to our model, and # {PARTITION} is the same {EVENT} with '.'s inserted between its # elementary factors. # # Our model assumes that the DNA is a strictly alternating sequence # of coding (C) and non-coding (N) regions. Each region has an exponential # distributon of lengths, with minimum {tMin} and average {tC} # and {tN}, respectively. The bases in an N region are all labeled 'N'. # The bases in a C region are labeled with the repeating string "DEF". # A C region may start with any of the three letters, and its length # need not be a multiple of 3; the starting phase is independent of th # ending phase of the previous C region. if (windowSize == "") { arg_error(("must specify \"windowSize\"")); } if (tMin == "") { arg_error(("must specify \"tMin\"")); } if (tC == "") { arg_error(("must specify \"tC\"")); } if (tN == "") { arg_error(("must specify \"tN\"")); } if (maxNFac == "") { arg_error(("must specify \"maxNFac\"")); } if (tC < tMin) { arg_error(("\"tC\" too small")); } if (tN < tMin) { arg_error(("\"tN\" too small")); } if ((maxNFac != 1) && (maxNFac != 3)) { arg_error(("bad \"maxNFac\"")); } k = windowSize; # A more convenient name. printf "%d\n", k; # Check whether the single-transition hypothesis is true: if (tMin <= k-2) { arg_error("multiple transitions not implemented yet"); } # There are 4 events with no transitions, # 3*(k-1) events with an N-C transition, # and the same number of events with a C-N transition. nvev = 4 + 6*(k-1); # Number of valid events. printf "%d\n", nvev; # Suppose we take all {k}-tuples of a very long DNA string. Since # the minimum length {tMin} of a C or N region is at least {k-1}, no # {k} tuple contains more than one C-N or N-C transition. Since the # average length of C and N regions is {tC} and {tN}, there will be # one C-N transition and one N-C transition every {tC+tN} bases. So, # the fraction of {k}-tuples of each type is # # mixed C-N: {(k-1)/(tC+tN)} # mixed N-C: {(k-1)/(tC+tN)} # pure N: {(tN-k+1)/(tC+tN)} # pure C: {(tC-k+1)/(tC+tN)} # # The mixed N-C events can be divided into {k-1} patterns, all # equally likely, based on the number of 'N's (1 through {k-1}). # For every pure-C pattern there are three events, corresponding # to the three possible start phases. Ditto for each N-C pattern # and for each C-N pattern. den = 1.0/(tC+tN); totpr = 0.0; # All-'N' event: ev = ""; for (i = 0; i < k; i++) { ev = (ev "N"); } output_event(ev, den*(tN-k+1)); printf "\n"; # Loop on initial codon phase: for (ph = 0; ph < 3; ph++) { # All-codon event starting with phase {ph}: ev = ""; for (i = 0; i < k; i++) { j = 1 + (i + ph) % 3; b = substr("DEF", j, 1) ev = (ev b); } output_event(ev, den*(tC-k+1)/3); if (k > 1) { printf "\n"; # Loop on length of 'N' prefix/suffix: eva = ev; evb = revevent(ev); ii = ""; for (sh = 1; sh < k; sh++) { eva = substr(("N" eva), 1, k); output_event(eva, den/3); evb = substr((evb "N"), 2, k); output_event(evb, den/3); } } printf "\n"; } close("/dev/stdout"); printf "%d events total prob = %8.6f\n", nvev, totpr > "/dev/stderr"; } function output_event(ev,pr ) { # Output one line of the valid event file. # Also adds {pr} to the global variable {totpr}. printf "%s %14.7e %s\n", ev, pr, punctuate(ev); totpr += pr; } function revevent(e, r,b) { r = ""; while (e != "") { b = substr(e, 1, 1); e = substr(e, 2); if (b == "D") { b = "F"; } else if (b == "F") { b = "D"; } r = (b r); } return r; } function punctuate(e) { # Factor the N part, if any, and the N-C and C-N transitions. if (maxNFac == 1) { e = punctuate_N_singlets(e); } else if (maxNFac == 3) { e = punctuate_N_triplets(e); } else { arg_error("bad \"maxNFac\""); } # now insert "." at every codon-codon boundary: gsub(/FD/, "F.D", e); # done: return e; } function punctuate_N_singlets(e, n,h,pre,mid,suf) { # Factors the N part of event {e} into isolated 'N' labels, # and inserts '.' between those factors. # Insert '.' around every 'N' label: gsub(/N/, ".N.", e); # Remove redunant '.'s: gsub(/^[.]+/, "", e); gsub(/[.]+$/, "", e); gsub(/[.][.]+/, ".", e); return e; } function punctuate_N_triplets(e, n,h,pre,mid,suf) { # Factors the N part of event {e} into chunks with at most 3 labels, # and inserts '.' between those factors. # # Tries to put the central label of the event (the pivot) # in the middle of a three-label factor, if possible. if (e ~ /N/) { # Get the event's length: n = length(e); # Compute the length {h} of left context part, before pivot letter: h = int(n/2); # break event into prefix, pivot, suffix: pre = substr(e, 1, h); mid = substr(e, 1+h, 1); suf = substr(e, 2+h); if (mid == "N") { # pivot lies in "N" string. # Must break the "N" string so that it has the longest parts over the pivot. # Put "()" around pivot letter: e = (pre "(" mid ")" suf); # try to get the letter {e[h]} centered in an "NNN" part: gsub(/N[(]N[)]N/, "", e); # try to get that letter as first or last of an "NNN" part: gsub(/[(]N[)]NN/, "", e); gsub(/NN[(]N[)]/, "", e); # try to get that letter as first or last of an "NN" part: gsub(/[(]N[)]N/, "", e); gsub(/N[(]N[)]/, "", e); # well, maybe just an "N" part: gsub(/[(]N[)]/, "", e); gsub(/[(]N[)]/, "", e); } else { # insert "<" at every intron-codon boundary in {pre}: gsub(/N$/, "N<", pre); gsub(/ND/, "N" at every codon-intron boundary in {suf}: gsub(/^N/, ">N", suf); gsub(/DN/, "D>N", suf); gsub(/EN/, "E>N", suf); gsub(/FN/, "F>N", suf); e = ( pre mid suf ); } # now eat "NNN" chunks while possible: while (e ~ /[>]NNN/) { gsub(/[>]NNN/, ".NNN>", e); } while (e ~ /NNN[<]/) { gsub(/NNN[<]/, "]NN/) { gsub(/[>]NN/, ".NN>", e); } while (e ~ /NN[<]/) { gsub(/NN[<]/, "]N/) { gsub(/[>]N/, ".N>", e); } while (e ~ /N[<]/) { gsub(/N[<]/, "": gsub(/^[<]/, "", e); gsub(/[>]$/, "", e); gsub(/[<>]/, ".", e); } return e; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; exit(1); }