#! /usr/bin/gawk -f
# Last edited on 2008-06-15 08:20:22 by stolfi

BEGIN{
  usage = ( \
    "gather_region_length_stats \\\n" \
    "  [ -v maxLength={NUM} ] \\\n" \
    "  -v outName={OUTNAME} \\\n" \
    "  {INFILE}.lab ..." \
  ) ;
  abort = -1;
  
  # Reads the given label files "{INFILE}.lab ..." and
  # tabulates the lengths of coding and non-coding regions.  
  # The label files must contain only letters
  # in {DEFHIJNXYPQRS}; except for spaces, '#'-comments,
  # and initial '>'-lines.
  # 
  # Writes a file "{OUTNAME}-{T}.lens" containing vital statistics
  # about the lengths of regions of type {T}, where {T} is "N" (introns, including
  # splice markers) "X" (head UTR), "Y"(tail UTR), or "K" (coding,
  # including the stop codon).
  # 
  # The output file begins with some structured '#'-comments
  # that give the main statistical parameters for the regions of that type:
  #
  #   "# num = {NUM}"    Number of regions analyzed.
  #   "# tot = {NUM}"    Total length of those regions.
  #   "# avg = {NUM}"    Average length.
  #   "# dev = {NUM}"    Standard deviation of length.
  #   "# min = {NUM}"    Minimum length.
  #   "# max = {NUM}"    Maximum length.
  #   "# len50 = {NUM}"  Median length (exceeds or equals 50% of all regions).
  #   "# len90 = {NUM}"  Length that exceeds or equals 90% of all regions.
  #   "# maxfr = {NUM}"  Count of the most popular length that is neither zero nor too big.
  #   "# zerfr = {NUM}"  Total count of regions that have zero length.
  #   "# bigfr = {NUM}"  Total count of regions that are too long.
  #
  # For the {maxfr} and {bigfr} parameters, a length is considered too
  # big if it exceeds the {maxLength} argument given in the command line.
  # 
  # Afther these comments, the script writes the histogram of the lengths.
  # Each line of the histogram contains the following fields
  #  
  #   {LENGTH} {COUNT} {PROB_DENS}  {CUM_COUNT} {CUM_PROB}
  #  
  # where 
  #   
  #   {LENGTH} is a region length,
  #   
  #   {COUNT} is the number of regions of type {T} with {LENGTH} bases,
  #   
  #   {PROB_DENS} is a smoothed estimate of the probability that 
  #      a region will have {LENGTH} bases,
  #   
  #   {CUM_COUNT} is the sum of the {LENGTH} fields up to this line,
  #   
  #   {CUM_PROB} is an estimate of the probability that a 
  #      region will have at most {LENGTH} bases.
  #   
  # The {LENGTH} will vary from 0 to the maximum length present in the
  # data, or to the {maxLength} parameter, whichever is less, plus one.  Thus, if the last 
  # line of the histogram does not have zero {COUNT}, it 
  # actually refers to all regions that have {LENGTH}
  # *or more* bases.  Note that this truncation of the histogram may affect
  # the {len50} and {len90} parameters given in the '#'-comments.
  
  if (outName == "") { arg_error(("must define \"outName\"")); }
  
  # Max length for histograms:
  if (maxLength == "") { maxLength = 1000000; }
  maxLength += 0;
  if (maxLength < 1) { arg_error(("invalid \"maxLength\""));; }
  
  # The type of each region is "N", "K", or "*" for interfile gap.
  # These tables are indexed by region type {t}:
  split("", nr);   # Number of complete regions of type {t}. 
  split("", nb);   # Sum of lengths (base counts) of said regions.
  split("", nb2);  # Sum of lengths squared of said regions.
  split("", minb); # Min length of those regions.
  split("", maxb); # Max length of those regions.
  
  # These are indexed by file type {t} and length {r}, clipped at {maxLength}:
  split("", histb);
  
  curfile = "";  # Current file name.
  curlin = "";   # Current record in current file (for debugging).
  curfnr = 0;    # Current record number in current file (for debugging).
  
  # Min `normal' length for each type:
  split("", mnormb);
  
  # Clear tables:
  zer("*"); mnormb["*"] = 0;
  zer("K"); mnormb["K"] = 3;
  zer("N"); mnormb["N"] = 30;
  zer("X"); mnormb["X"] = 0;
  zer("Y"); mnormb["Y"] = 0;
}

(abort >= 0) { exit(abort); }

(FILENAME != curfile) {
  # Start of new file:
  if (curfile != "") { fin(curt); }
  # Pretend a zero-length region at file transition:
  ini("*");
  # Make the new file current:
  curfile = FILENAME;
}

/^[>]/ {
  # Header record, ignore: 
  if (curt != "*") { data_error(("misplaced header")); }
  next;
}

// {
  # Start processing one more line of the current file:
  curfnr = FNR; curlin = $0;
  
  # Grab the line of labels {lin}:
  lin = $0;
  
  # Delete spaces and comments:
  gsub(/[ ]/, "", lin)
  gsub(/[\#].*$/, "", lin);
  
  # Break {lin} into region pieces and process them:
  while(lin != ""){
    if (match(lin,/^[DEFHIJ]+/))
      { seq(RLENGTH, "K"); }
    else if (match(lin,/^[NPQRS]+/))
      { seq(RLENGTH, "N"); } 
    else if (match(lin,/^[X]+/))
      { seq(RLENGTH, "X"); } 
    else if (match(lin,/^[Y]+/))
      { seq(RLENGTH, "Y"); } 
    else
      { data_error(("bad format")); }
    lin = substr(lin, 1 + RLENGTH);
  }
  next;
}

END {
  if (abort > 0) { exit(abort); }
  # Finish current region, if any: 
  if (curfile != "") { fin(curt); }
  # Print region statistics:
  out("*");
  out("K");
  out("N");
  out("X");
  out("Y");
}

function zer(t,   r) 
{ # Clears the tables for region type {t}:
  nr[t] = 0; 
  nb[t] = 0;
  nb2[t] = 0;
  minb[t] = 999999999;
  maxb[t] = 0;
  for (r = 0; r <= maxLength+1; r++) { histb[t,r] = 0; }
}

function ini(t) 
{ # Start of a new run of type {t}: 
  curt = t; curb = 0;
}

function fin(t,    r) 
{ # End of a region of type {t}.
  if (curb < mnormb[t]) 
    {  # The sequence is abnormally short, warn the user:
       printf "%-30s %s %3d %s\n", \
        sprintf("%s:%s:", curfile, curfnr), \
        curt, curb, \
        curlin \
        > "/dev/stderr";
    }
  # Update run statistics:
  nr[t]++; 
  nb[t] += curb;
  nb2[t] += curb*curb;
  if (curb < minb[t]) { minb[t] = curb; }
  if (curb > maxb[t])
    { for (r = maxb[t]+1; r <= curb; r++) { histb[t,r] = 0; }
      maxb[t] = curb;
    }
  histb[t,curb]++;
}

function seq(n,t) 
{ # Got another {n} bytes of region of type {t}:
  if (t != curt) { fin(curt); ini(t); }
  curb += n;
}

function out(t,    fname,r,maxHLen,h,s,hs,f,p,q,Sq,Pq,avg,var,dev,len50,len90,maxfr) 
{ # Writes the statistics for region {t} to the proper file.
  
  # User warning:
  printf "regions of type %s: %7d regions  %7d bases\n", t, nr[t], nb[t] > "/dev/stderr";

  # Compute mean {avg} and deviation {dev} of all lengths:
  avg = (nb[t] + 0.0)/nr[t];
  var = ((nb2[t] + 0.0)/nr[t] - avg*avg);
  dev = sqrt(var);
  
  # Find parameters {len50}, {len90}, {zerfr}, {maxfr}, {bigfr} from {histb}: 
  s = 0; # Cumulative number of runs.
  
  zerfr = histb[t,0];
  maxfr = 0;
  bigfr = 0;
  len50 = -1;
  len90 = -1;
  for (r = 0; r <= maxb[t]; r++) 
    { # Number of runs having length {r}
      h = histb[t,r];
      s += h;
      if (r <= maxLength)
        { if ((r > 0) && (h > maxfr)) { maxfr = h; } }
      else
        { bigfr += h; }
      if ((s >= 0.50*nr[t]) && (len50 < 0)) { len50 = r; }
      if ((s >= 0.90*nr[t]) && (len90 < 0)) { len90 = r; }
    }
  if (s != nr[t]) { prog_error(("sum of histb[t,*] not equal to nr[t]")); }

  if (t != "*")
    { # Write the output file.
    
      # Choose the output file name {fname}; use {stderr} for "*" runs.
      fname = (t == "*" ? "/dev/stderr" : sprintf("%s-%s.lens", outName, t));

      # Write the basic statistics.
      printf "# num = %10d\n", nr[t] > fname;
      printf "# tot = %10d\n", nb[t] > fname;
      printf "# avg = %10.1f\n", avg > fname;
      printf "# dev = %10.1f\n", dev > fname;
      printf "# min = %10d\n", minb[t] > fname;
      printf "# max = %10d\n", maxb[t] > fname;
      printf "# len50 = %10d\n", len50 > fname;
      printf "# len90 = %10d\n", len90 > fname;
      printf "# zerfr = %10d\n", zerfr > fname;
      printf "# maxfr = %10d\n", maxfr > fname;
      printf "# bigfr = %10d\n", bigfr > fname;
      printf "\n" > fname;

      # Choose an a priori length distribution for prob estimation.
      # We choose an exponential distribution {Pr(length=r) = q^{r+1}/Sq}.
      # The value of {q} should depend on region type, and we should 
      # account for the remarkable absence of short 'N' regions.
      q = exp(-1.0/2001.0);
      Sq = q/(1 - q);

      # Determine the max length {maxHLen} to be printed:
      maxHLen = (maxLength < maxb[t] ? maxLength : maxb[t]) + 1;

      # Now write the length histogram and other per-length data:
      s = 0;  # Cumulative number of runs.
      Pq = q/Sq; # A priori probability {q^{r+1}/Sq}
      for (r = 0; r <= maxHLen; r++) 
        { # Number of runs having length {r}
          h = (r <= maxLength ? histb[t,r] : bigfr );
          # Smoothed histogram (missing counts should not be a problem):
          hs = \
            ( 0.0 + \
              01.0/64.0*histb[t,r-3] + \
              06.0/64.0*histb[t,r-2] + \
              15.0/64.0*histb[t,r-1] + \
              20.0/64.0*histb[t,r]   + \
              15.0/64.0*histb[t,r+1] + \
              06.0/64.0*histb[t,r+2] + \
              01.0/64.0*histb[t,r+3]  \
            );
          # Estimate the probability {p} of the run length being about {r}:
          p = (hs + Pq) / (nr[t] + 1.0);
          # Number of runs whose length does not exceed {r}
          s += h;
          # Estimate the probability {f} of the run length not exceeding {r}:
          f = (s + 1.0)/(nr[t] + 2.0);
          printf "  %3d  %10d %9.7f  %10d %7.5f\n", r, h, p, s, f > fname;
          Pq *= q;
        }
      if (s != nr[t]) { prog_error(("sum of h not equal to nr[t]")); }
      printf "\n" > fname;
      close(fname);
    }
}
 
function arg_error(msg)
{ printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  exit(1);
}

function data_error(msg) 
{ printf "%s:%s: **%s\n", FILENAME, FNR, msg > "/dev/stderr"; 
  exit 1;
}

function prog_error(msg) 
{ printf "program error: **%s\n", msg > "/dev/stderr"; 
  exit 1;
}