#! /usr/bin/gawk -f
# Last edited on 2005-10-22 20:46:53 by stolfi

BEGIN {

  usage = ( \
    "enum-valid-events \\\n" \
    "   -v windowSize=NUM \\\n" \
    "   -v tMin=NUM -v tC=NUM -v tN=NUM \\\n" \
    "   -v maxNFac=NUM \\\n" \
    " > OUTFILE.pre" \
  );
  
  # Enumerates all {k}-events compatible with our DNA model, and their
  # "a priori" probabilities, where {k=windowSize}.  
  # 
  # The output file starts with one line "{k} {nE}" where {nE} is the
  # number of valid {k}-events in the model. Then come {nE} lines in
  # the format "{EVENT} {PROB} {PARTITION}" where the {EVENT} is a
  # valid {k}-event (a string of {k} symbols in [DEFN]), {PROB} is the
  # a priori probability of {EVENT} according to our model, and
  # {PARTITION} is the same {EVENT} with '.'s inserted between its
  # elementary factors.
  # 
  # Our model assumes that the DNA is a strictly alternating sequence 
  # of coding (C) and non-coding (N) regions. Each region has an exponential
  # distributon of lengths, with minimum {tMin} and average {tC}
  # and {tN}, respectively.  The bases in an N region are all labeled 'N'.
  # The bases in a C region are labeled with the repeating string "DEF".
  # A C region may start with any of the three letters, and its length 
  # need not be a multiple of 3; the starting phase is independent of th
  # ending phase of the previous C region.
  
  if (windowSize == "") { arg_error(("must specify \"windowSize\"")); }
  if (tMin == "") { arg_error(("must specify \"tMin\"")); }
  if (tC == "") { arg_error(("must specify \"tC\"")); }
  if (tN == "") { arg_error(("must specify \"tN\"")); }
  if (maxNFac == "") { arg_error(("must specify \"maxNFac\"")); }

  if (tC < tMin) { arg_error(("\"tC\" too small")); }
  if (tN < tMin) { arg_error(("\"tN\" too small")); }
  if ((maxNFac != 1) && (maxNFac != 3)) { arg_error(("bad \"maxNFac\"")); }
  
  k = windowSize; # A more convenient name.
  printf "%d\n", k;
    
  # Check whether the single-transition hypothesis is true:
  if (tMin <= k-2)
    { arg_error("multiple transitions not implemented yet"); }

  # There are 4 events with no transitions, 
  # 3*(k-1) events with an N-C transition,  
  # and the same number of events with a C-N transition.
  
  nvev = 4 + 6*(k-1);  # Number of valid events.
  printf "%d\n", nvev;
  
  # Suppose we take all {k}-tuples of a very long DNA string. Since
  # the minimum length {tMin} of a C or N region is at least {k-1}, no
  # {k} tuple contains more than one C-N or N-C transition. Since the
  # average length of C and N regions is {tC} and {tN}, there will be
  # one C-N transition and one N-C transition every {tC+tN} bases. So,
  # the fraction of {k}-tuples of each type is
  # 
  #   mixed C-N: {(k-1)/(tC+tN)}
  #   mixed N-C: {(k-1)/(tC+tN)}
  #   pure N: {(tN-k+1)/(tC+tN)}
  #   pure C: {(tC-k+1)/(tC+tN)}
  #
  # The mixed N-C events can be divided into {k-1} patterns, all
  # equally likely, based on the number of 'N's (1 through {k-1}).
  # For every pure-C pattern there are three events, corresponding 
  # to the three possible start phases.  Ditto for each N-C pattern
  # and for each C-N pattern.
  
  den = 1.0/(tC+tN);
  totpr = 0.0;
  
  # All-'N' event:
  ev = ""; 
  for (i = 0; i < k; i++) { ev = (ev "N"); }
  output_event(ev, den*(tN-k+1));

  printf "\n";
  
  # Loop on initial codon phase:
  for (ph = 0; ph < 3; ph++)
    { 
      # All-codon event starting with phase {ph}:
      ev = "";
      for (i = 0; i < k; i++)
        { j = 1 + (i + ph) % 3;
          b = substr("DEF", j, 1)
          ev = (ev b); 
        }
      output_event(ev, den*(tC-k+1)/3);

      if (k > 1)
        { printf "\n";

          # Loop on length of 'N' prefix/suffix:
          eva = ev;
          evb = revevent(ev);
          ii = "";
          for (sh = 1; sh < k; sh++)
            { eva = substr(("N" eva), 1, k);
              output_event(eva, den/3);
              evb = substr((evb "N"), 2, k);
              output_event(evb, den/3);
            }
        }

      printf "\n";
    }
  close("/dev/stdout");
  printf "%d events total prob = %8.6f\n", nvev, totpr > "/dev/stderr";
}

function output_event(ev,pr )
{
  # Output one line of the valid event file.
  # Also adds {pr} to the global variable {totpr}.
  printf "%s %14.7e %s\n", ev, pr, punctuate(ev);
  totpr += pr;
}

function revevent(e,   r,b)
{
  r = "";
  while (e != "")
    { 
      b = substr(e, 1, 1);
      e = substr(e, 2);
      if (b == "D") { b = "F"; } else if (b == "F") { b = "D"; }
      r = (b r);
    }
  return r;
}
  
function punctuate(e)
{
  # Factor the N part, if any, and the N-C and C-N transitions.
  if (maxNFac == 1)
    { e = punctuate_N_singlets(e); }
  else if (maxNFac == 3)
    { e = punctuate_N_triplets(e); }
  else
    { arg_error("bad \"maxNFac\""); }
  # now insert "." at every codon-codon boundary:
  gsub(/FD/, "F.D", e);
  
  # done:
  return e;
}

function punctuate_N_singlets(e, n,h,pre,mid,suf)
{
  # Factors the N part of event {e} into isolated 'N' labels,
  # and inserts '.' between those factors.
  
  # Insert '.' around every 'N' label:
  gsub(/N/, ".N.", e);
  
  # Remove redunant '.'s:
  gsub(/^[.]+/, "", e);
  gsub(/[.]+$/, "", e);
  gsub(/[.][.]+/, ".", e);
  
  return e;
}

function punctuate_N_triplets(e, n,h,pre,mid,suf)
{
  # Factors the N part of event {e} into chunks with at most 3 labels,
  # and inserts '.' between those factors.
  # 
  # Tries to put the central label of the event (the pivot) 
  # in the middle of a three-label factor, if possible.
  
  if (e ~ /N/)
    {
      # Get the event's length:
      n = length(e);

      # Compute the length {h} of left context part, before pivot letter:
      h = int(n/2);

      # break event into prefix, pivot, suffix:
      pre = substr(e, 1, h);
      mid = substr(e, 1+h, 1);
      suf = substr(e, 2+h);

      if (mid == "N")
        { 
          # pivot lies in "N" string.
          # Must break the "N" string so that it has the longest parts over the pivot.
          # Put "()" around pivot letter:
          e = (pre "(" mid ")" suf);
          # try to get the letter {e[h]} centered in an "NNN" part:
          gsub(/N[(]N[)]N/, "<NNN>", e);
          # try to get that letter as first or last of an "NNN" part:
          gsub(/[(]N[)]NN/, "<NNN>", e);
          gsub(/NN[(]N[)]/, "<NNN>", e);
          # try to get that letter as first or last of an "NN" part:
          gsub(/[(]N[)]N/, "<NN>", e);
          gsub(/N[(]N[)]/, "<NN>", e);
          # well, maybe just an "N" part:
          gsub(/[(]N[)]/, "<N>", e);
          gsub(/[(]N[)]/, "<N>", e);
        }
      else
        { # insert "<" at every intron-codon boundary in {pre}: 
          gsub(/N$/, "N<", pre);
          gsub(/ND/, "N<D", pre);
          gsub(/NE/, "N<E", pre);
          gsub(/NF/, "N<F", pre);
          
          # insert ">" at every codon-intron boundary in {suf}: 
          gsub(/^N/, ">N", suf);
          gsub(/DN/, "D>N", suf);
          gsub(/EN/, "E>N", suf);
          gsub(/FN/, "F>N", suf);
          e = ( pre mid suf );
        }

      # now eat "NNN" chunks while possible:
      while (e ~ /[>]NNN/) { gsub(/[>]NNN/, ".NNN>", e); }
      while (e ~ /NNN[<]/) { gsub(/NNN[<]/, "<NNN.", e); }
      # now eat "NN" chunks while possible:
      while (e ~ /[>]NN/) { gsub(/[>]NN/, ".NN>", e); }
      while (e ~ /NN[<]/) { gsub(/NN[<]/, "<NN.", e); }
      # now eat "N" chunks while possible:
      while (e ~ /[>]N/) { gsub(/[>]N/, ".N>", e); }
      while (e ~ /N[<]/) { gsub(/N[<]/, "<N.", e); }
      # now get rid of "<" and ">":
      gsub(/^[<]/, "", e);
      gsub(/[>]$/, "", e);
      gsub(/[<>]/, ".", e);
    }
  return e; 
}
  
function arg_error(msg)
{
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  exit(1);
}