#! /usr/bin/gawk -f # Last edited on 2008-06-15 08:20:22 by stolfi BEGIN{ usage = ( \ "gather_region_length_stats \\\n" \ " [ -v maxLength={NUM} ] \\\n" \ " -v outName={OUTNAME} \\\n" \ " {INFILE}.lab ..." \ ) ; abort = -1; # Reads the given label files "{INFILE}.lab ..." and # tabulates the lengths of coding and non-coding regions. # The label files must contain only letters # in {DEFHIJNXYPQRS}; except for spaces, '#'-comments, # and initial '>'-lines. # # Writes a file "{OUTNAME}-{T}.lens" containing vital statistics # about the lengths of regions of type {T}, where {T} is "N" (introns, including # splice markers) "X" (head UTR), "Y"(tail UTR), or "K" (coding, # including the stop codon). # # The output file begins with some structured '#'-comments # that give the main statistical parameters for the regions of that type: # # "# num = {NUM}" Number of regions analyzed. # "# tot = {NUM}" Total length of those regions. # "# avg = {NUM}" Average length. # "# dev = {NUM}" Standard deviation of length. # "# min = {NUM}" Minimum length. # "# max = {NUM}" Maximum length. # "# len50 = {NUM}" Median length (exceeds or equals 50% of all regions). # "# len90 = {NUM}" Length that exceeds or equals 90% of all regions. # "# maxfr = {NUM}" Count of the most popular length that is neither zero nor too big. # "# zerfr = {NUM}" Total count of regions that have zero length. # "# bigfr = {NUM}" Total count of regions that are too long. # # For the {maxfr} and {bigfr} parameters, a length is considered too # big if it exceeds the {maxLength} argument given in the command line. # # Afther these comments, the script writes the histogram of the lengths. # Each line of the histogram contains the following fields # # {LENGTH} {COUNT} {PROB_DENS} {CUM_COUNT} {CUM_PROB} # # where # # {LENGTH} is a region length, # # {COUNT} is the number of regions of type {T} with {LENGTH} bases, # # {PROB_DENS} is a smoothed estimate of the probability that # a region will have {LENGTH} bases, # # {CUM_COUNT} is the sum of the {LENGTH} fields up to this line, # # {CUM_PROB} is an estimate of the probability that a # region will have at most {LENGTH} bases. # # The {LENGTH} will vary from 0 to the maximum length present in the # data, or to the {maxLength} parameter, whichever is less, plus one. Thus, if the last # line of the histogram does not have zero {COUNT}, it # actually refers to all regions that have {LENGTH} # *or more* bases. Note that this truncation of the histogram may affect # the {len50} and {len90} parameters given in the '#'-comments. if (outName == "") { arg_error(("must define \"outName\"")); } # Max length for histograms: if (maxLength == "") { maxLength = 1000000; } maxLength += 0; if (maxLength < 1) { arg_error(("invalid \"maxLength\""));; } # The type of each region is "N", "K", or "*" for interfile gap. # These tables are indexed by region type {t}: split("", nr); # Number of complete regions of type {t}. split("", nb); # Sum of lengths (base counts) of said regions. split("", nb2); # Sum of lengths squared of said regions. split("", minb); # Min length of those regions. split("", maxb); # Max length of those regions. # These are indexed by file type {t} and length {r}, clipped at {maxLength}: split("", histb); curfile = ""; # Current file name. curlin = ""; # Current record in current file (for debugging). curfnr = 0; # Current record number in current file (for debugging). # Min `normal' length for each type: split("", mnormb); # Clear tables: zer("*"); mnormb["*"] = 0; zer("K"); mnormb["K"] = 3; zer("N"); mnormb["N"] = 30; zer("X"); mnormb["X"] = 0; zer("Y"); mnormb["Y"] = 0; } (abort >= 0) { exit(abort); } (FILENAME != curfile) { # Start of new file: if (curfile != "") { fin(curt); } # Pretend a zero-length region at file transition: ini("*"); # Make the new file current: curfile = FILENAME; } /^[>]/ { # Header record, ignore: if (curt != "*") { data_error(("misplaced header")); } next; } // { # Start processing one more line of the current file: curfnr = FNR; curlin = $0; # Grab the line of labels {lin}: lin = $0; # Delete spaces and comments: gsub(/[ ]/, "", lin) gsub(/[\#].*$/, "", lin); # Break {lin} into region pieces and process them: while(lin != ""){ if (match(lin,/^[DEFHIJ]+/)) { seq(RLENGTH, "K"); } else if (match(lin,/^[NPQRS]+/)) { seq(RLENGTH, "N"); } else if (match(lin,/^[X]+/)) { seq(RLENGTH, "X"); } else if (match(lin,/^[Y]+/)) { seq(RLENGTH, "Y"); } else { data_error(("bad format")); } lin = substr(lin, 1 + RLENGTH); } next; } END { if (abort > 0) { exit(abort); } # Finish current region, if any: if (curfile != "") { fin(curt); } # Print region statistics: out("*"); out("K"); out("N"); out("X"); out("Y"); } function zer(t, r) { # Clears the tables for region type {t}: nr[t] = 0; nb[t] = 0; nb2[t] = 0; minb[t] = 999999999; maxb[t] = 0; for (r = 0; r <= maxLength+1; r++) { histb[t,r] = 0; } } function ini(t) { # Start of a new run of type {t}: curt = t; curb = 0; } function fin(t, r) { # End of a region of type {t}. if (curb < mnormb[t]) { # The sequence is abnormally short, warn the user: printf "%-30s %s %3d %s\n", \ sprintf("%s:%s:", curfile, curfnr), \ curt, curb, \ curlin \ > "/dev/stderr"; } # Update run statistics: nr[t]++; nb[t] += curb; nb2[t] += curb*curb; if (curb < minb[t]) { minb[t] = curb; } if (curb > maxb[t]) { for (r = maxb[t]+1; r <= curb; r++) { histb[t,r] = 0; } maxb[t] = curb; } histb[t,curb]++; } function seq(n,t) { # Got another {n} bytes of region of type {t}: if (t != curt) { fin(curt); ini(t); } curb += n; } function out(t, fname,r,maxHLen,h,s,hs,f,p,q,Sq,Pq,avg,var,dev,len50,len90,maxfr) { # Writes the statistics for region {t} to the proper file. # User warning: printf "regions of type %s: %7d regions %7d bases\n", t, nr[t], nb[t] > "/dev/stderr"; # Compute mean {avg} and deviation {dev} of all lengths: avg = (nb[t] + 0.0)/nr[t]; var = ((nb2[t] + 0.0)/nr[t] - avg*avg); dev = sqrt(var); # Find parameters {len50}, {len90}, {zerfr}, {maxfr}, {bigfr} from {histb}: s = 0; # Cumulative number of runs. zerfr = histb[t,0]; maxfr = 0; bigfr = 0; len50 = -1; len90 = -1; for (r = 0; r <= maxb[t]; r++) { # Number of runs having length {r} h = histb[t,r]; s += h; if (r <= maxLength) { if ((r > 0) && (h > maxfr)) { maxfr = h; } } else { bigfr += h; } if ((s >= 0.50*nr[t]) && (len50 < 0)) { len50 = r; } if ((s >= 0.90*nr[t]) && (len90 < 0)) { len90 = r; } } if (s != nr[t]) { prog_error(("sum of histb[t,*] not equal to nr[t]")); } if (t != "*") { # Write the output file. # Choose the output file name {fname}; use {stderr} for "*" runs. fname = (t == "*" ? "/dev/stderr" : sprintf("%s-%s.lens", outName, t)); # Write the basic statistics. printf "# num = %10d\n", nr[t] > fname; printf "# tot = %10d\n", nb[t] > fname; printf "# avg = %10.1f\n", avg > fname; printf "# dev = %10.1f\n", dev > fname; printf "# min = %10d\n", minb[t] > fname; printf "# max = %10d\n", maxb[t] > fname; printf "# len50 = %10d\n", len50 > fname; printf "# len90 = %10d\n", len90 > fname; printf "# zerfr = %10d\n", zerfr > fname; printf "# maxfr = %10d\n", maxfr > fname; printf "# bigfr = %10d\n", bigfr > fname; printf "\n" > fname; # Choose an a priori length distribution for prob estimation. # We choose an exponential distribution {Pr(length=r) = q^{r+1}/Sq}. # The value of {q} should depend on region type, and we should # account for the remarkable absence of short 'N' regions. q = exp(-1.0/2001.0); Sq = q/(1 - q); # Determine the max length {maxHLen} to be printed: maxHLen = (maxLength < maxb[t] ? maxLength : maxb[t]) + 1; # Now write the length histogram and other per-length data: s = 0; # Cumulative number of runs. Pq = q/Sq; # A priori probability {q^{r+1}/Sq} for (r = 0; r <= maxHLen; r++) { # Number of runs having length {r} h = (r <= maxLength ? histb[t,r] : bigfr ); # Smoothed histogram (missing counts should not be a problem): hs = \ ( 0.0 + \ 01.0/64.0*histb[t,r-3] + \ 06.0/64.0*histb[t,r-2] + \ 15.0/64.0*histb[t,r-1] + \ 20.0/64.0*histb[t,r] + \ 15.0/64.0*histb[t,r+1] + \ 06.0/64.0*histb[t,r+2] + \ 01.0/64.0*histb[t,r+3] \ ); # Estimate the probability {p} of the run length being about {r}: p = (hs + Pq) / (nr[t] + 1.0); # Number of runs whose length does not exceed {r} s += h; # Estimate the probability {f} of the run length not exceeding {r}: f = (s + 1.0)/(nr[t] + 2.0); printf " %3d %10d %9.7f %10d %7.5f\n", r, h, p, s, f > fname; Pq *= q; } if (s != nr[t]) { prog_error(("sum of h not equal to nr[t]")); } printf "\n" > fname; close(fname); } } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; exit(1); } function data_error(msg) { printf "%s:%s: **%s\n", FILENAME, FNR, msg > "/dev/stderr"; exit 1; } function prog_error(msg) { printf "program error: **%s\n", msg > "/dev/stderr"; exit 1; }