#! /usr/bin/gawk -f
# Last edited on 2008-06-08 17:50:24 by stolfi

BEGIN {
  usage = ( \
    "dbd_synthesize_dna \\\n" \
    "   -v tabK=TAB3FILE.prb \\\n" \
    "   -v tabN=TAB1FILE.prb \\\n" \
    "   -v nBases=NUM \\\n" \
    "   -v seed=NUM \\\n" \
    "   -v tKMin=${NUM} \\\n" \
    "   -v tKAvg=${NUM} \\\n" \
    "   -v tKDev=${NUM} \\\n" \
    "   -v tNMin=${NUM} \\\n" \
    "   -v tNMid=${NUM} \\\n" \
    "   -v tNTrp=${NUM} \\\n" \
    "   -v tNTau=${NUM} \\\n" \
    "   -v outName=NAME" \
  );
  
  # Generates a random DNA sequence according to our stochastic model.
  # There are two generators, `non-coding' ({N}) and `coding' ({K}):
  #    
  #   The {N} generator simply concatenates bases independely chosen
  #   with probabilities given by table {tabN}, and assigns them the
  #   label 'N'.
  #   
  #   The {K} generator concatenates triplets of DNA
  #   independently chosen with the probabilities of table {tabK}, and
  #   assigns to its bases the labels 'D', 'E', 'F', according to their
  #   positions in each codon.
  # 
  # The output swaps between the two generators in such a way that
  # the stretches of each type have the appropriate distributions:
  #   
  #   The {K} regions have Gaussian-distributed lengths with mean
  #   {tKAvg} and deviation {tKDev}, truncated at minimum length 
  #   {tKMin}. 
  #   
  #   The {N} regions have uniformly distributed lengths between 
  #   {tNMin} and {tNMid}, and exponentially decaying lengths
  #   above {tkMid}, with exponent factor {tKTau}.  The first
  #   case happens with probability {tNTrp}
  # 
  # The {K} generator preserves its state while the {N} generator is
  # active; i.e. the concatenation of the {K} regions in order is a
  # string of whole codons. The random number generator is initialized with 
  # the given {seed}.
  # 
  # The bases and their labels are written to files "{outName}.bas"
  # and "{outName}.lab", respectively.  Just in case, the sequence
  # of aminoacids corresponding to the codons in the coding regions
  # is written to "{outName}.trn".  Note that the last codon of this
  # file may be incomplete in "{outName}.bas".
  #   
  # Each line of a probability table must have 4 columns, 
  # "{EVENT} {TUPLE} {FREQ} {PROB}"
  # where {EVENT} is a label string, {TUPLE} is a string
  # of bases, {FREQ} is the occurence count for that event/tuple
  # pair in the training dataset, and {PROB} is the estimated
  # probability {Pr(TUPLE|EVENT)}. The {FREQ} colum is not used.
  
  abort = -1;

  if (tabN == "") { arg_error("must define tabN"); }
  if (tabK == "") { arg_error("must define tabK"); }
  if (nBases == "") { arg_error("must define nBases"); }
  if (seed == "") { arg_error("must define seed"); }
  
  if (tKMin == "") { arg_error("must define \"tKMin\""); }
  if (tKAvg == "") { arg_error("must define \"tKAvg\""); }
  if (tKDev == "") { arg_error("must define \"tKDev\""); }
  if (tNMin == "") { arg_error("must define \"tNMin\""); }
  if (tNMid == "") { arg_error("must define \"tNMid\""); }
  if (tNTrp == "") { arg_error("must define \"tNTrp\""); }
  if (tNTau == "") { arg_error("must define \"tNTau\""); }
  if (outName == "") { arg_error("must define outName"); }
  
  # Make sure that all numeric parameters are indeed numeric:
  nBases += 0;
  tKMin  += 0;
  tKAvg  += 0;
  tKDev  += 0;
  tNMin  += 0;
  tNMid  += 0;
  tNTrp  += 0;
  tNTau  += 0;
  
  # Parameter constraints:
  if (tKMin < 0) { arg_error("invalid \"tKMin\""); }
  if (tKDev < 0) { arg_error("invalid \"tKDev\""); }
  if (tNMin < 0) { arg_error("invalid \"tNMin\""); }
  if (tNMid <= tNMin) { arg_error("invalid \"tNMin,tNMid\""); }
  if ((tNTrp < 0) || (tNTrp > 1.0)) { arg_error("invalid \"tNTrp\""); }
  if (tNTau <= 0) { arg_error("invalid \"tNTau\""); }
  
  srand(seed + 1234567);
  
  basFile = ( outName ".bas" );  # DNA file name.
  labFile = ( outName ".lab" );  # Label file name.
  trnFile = ( outName ".trn" );  # Translation file name.

  split("", prob1); # Makes {prob1} into an array.
  read_cond_prob_table(tabN, prob1, 0);

  split("", prob3); # Makes {prob3} into an array.
  read_cond_prob_table(tabK, prob3, 1);
  
  outNum = 0;   # Number of bases written so far.

  curType = 0;    # Current region type: {0} = non-coding, {1} = coding.
  curLength = 0;  # Chosen length of current region. 
  curNum = 0;     # Number of bases written since last type change.
  curPhase = 0;   # Position of next coding base in its codon (0,1,2). 
  curCodon = "";  # Codon that contains the last {K}-type base written.
  
  split("", tyNum); # {tyNum[type]} is the number of bases of type {ty} written so far. 
  tyNum[0] = 0; 
  tyNum[1] = 0;
  
  split("", tbAmino);
  setup_aminoacid_table(tbAmino);

  # Loop on bases:
  while(outNum < nBases)
    { 
      swap = (curNum >= curLength);
      if (swap)
        { # Swap to the other model
          curType = 1 - curType;
          curNum = 0;
          curLength = choose_region_length(curType);
          # printf " %s[%d]", substr("NK",1+curType,1), curLength > "/dev/stderr"
        }
      if (curType == 0)
        { # Non-coding region: pick a random base from table {prob1}.
          bas = choose_string(prob1);
          lab = "N";
        }
      else if (curType == 1)
        { # Coding region: pick next base of a random codon from table {prob3}.
          if (curPhase == 0) 
            { # Choose the next codon: 
              curCodon = choose_string(prob3);
              # Write its aminoacid to the translation file:
              printf "%s %s\n", codon, tbAmino[codon] > trnFile;
            }
          # Pick the next letter from the current codon:
          curPhase++;
          bas = substr(curCodon, curPhase, 1);
          lab = substr("DEF", curPhase, 1);
          curPhase = curPhase % 3;
        }
      else
        { prog_error("invalid region type"); }
      if ((outNum > 0) && (outNum % 60 == 0))
        { printf "\n" > basFile;
          printf "\n" > labFile;
        }
      printf "%c", bas > basFile; 
      printf "%c", lab > labFile; 
      outNum++;
      curNum++;
      tyNum[curType]++;
    }
  printf "\n" > basFile;  close(basFile);
  printf "\n" > labFile;  close(labFile);
  printf "\n" > trnFile;  close(trnFile);
  
  printf "%d bases written to %s and %s\n", outNum, basFile, labFile > "/dev/stderr";
  printf "type K = %d  type N = %d\n", tyNum[1], tyNum[0] > "/dev/stderr";
}

function choose_region_length(type,   g,n)
{
  # Chooses the length {n} of the next region, assumed to be of the given {type}.
  if (type == 0)
    { # Non-coding: 
      if (rand() < tNTrp)
        { # Transient part of distribution -- assume it is unform.
          n = tNMin + int((tNMid - tNMin)*rand());
        }
      else
        { # Exponential tail of distribution.
          g = rand()*0.9999999;
          n = tNMid + int(-log(g)/tNTau + 0.5);
        }
    }
  else if (type == 1)
    { # Coding: 
      do 
        { n = int(tKDev * gauss_rand() + tKAvg + 0.5); }
      while (n < tKMin);
    }
  else
    { prog_error("invalid region type"); }
  return n;
}

function gauss_rand(     u,v,r)
{
  # Returns a Gaussian-distributed random number with
  # mean 0 and deviation 1.
  /* Polar method [Knuth II:3.4.1, Algorithm P] */
  while (1)
    { u = 2.0*rand() - 1.0;
      v = 2.0*rand() - 1.0;
      r = sqrt(u*u + v*v);
      if ((r < 1.0) && (r > 1.0e-30))
        { return (1.41421356*(u + v)*sqrt(-log(r))/r); }
    }
}

function choose_string(frq,  s,pr,pra)
{
  # Chooses a string {s} with probability proportional to {frq[s]}.
  pra = rand()*0.9999999;
  pr = pra;
  for (s in frq)
    { # printf "  %s %.6f %.6f\n", s, pr, frq[s] > "/dev/stderr";
      pr -= frq[s]; 
      if (pr <= 0) { return s; }
    }
  printf "** choice failure - prob = %11.8f\n", pra > "/dev/stderr";
  return "***BUG***";
}

function compute_swap_prob(tavg  )
{
  # Computes the probability of transition from a state A
  # to a state B such that the average number of steps
  # before (and excluding) that transition is {tavg}. 

  return 1.0/(tavg + 1.0);
}

function read_cond_prob_table(name,prob,type,     lin,fld,nfld,ok,tot,nok,nlin)
{
  # Reads a probability table from file {name}, puts in {prob[s]} the
  # probability of tuple {s} given the corresponding event.
  # 
  # The table must have the format "{EVENT} {TUPLE} {COUNT} {PROB}" 
  # where {EVENT} is a string of labels [DEFN], {TUPLE} is a 
  # string of bases [ATCG], {COUNT} (not used) is the number of
  # occurrences of that event/tuple pair in the training dataset, and
  # {PROB} is the estimated conditional probability {Pr(TUPLE|EVENT)}. 
  # 
  # The procedure reads only a subset of the entries present in the file.
  # If {type=1}, considers only tuples labeled "DEF" (i.e. codons).
  # If {type=0}, consider only tuples labeled "N".
  
  printf "reading table file \"%s\"...\n", name > "/dev/stderr";
  tot = 0;
  nlin = 0;
  ERRNO = "";
  while ((getline lin < name) > 0)
    {
      nlin++;
      if (lin !~ /^([>]| *$)/)
        {
          nfld = split(lin, fld);
          if (nfld != 4) { cond_prob_table_error(name,nlin, ("bad num fields")); }
          if (type)
            { ok = (fld[1] == "DEF"); }
          else 
            { ok = (fld[1] == "N"); }
          if (ok)
            { prob[fld[2]] = fld[4]; tot += fld[4]; nok++; }
        }
    }
  if (ERRNO != "") { cond_prob_table_error(name,nlin, ("read error ERRNO = \"" ERRNO "\"")); }
  if (tot == 0) { cond_prob_table_error(name,nlin, ("total frequency is zero")); }
  printf "  %d lines  %d relevant entries", nlin, nok > "/dev/stderr";
  printf "  total prob = %8.6f", tot > "/dev/stderr";
  printf "\n" > "/dev/stderr";
}

function cond_prob_table_error(name,nlin, msg)
{
  printf "%s:%d: ** %s\n", name, nlin, msg > "/dev/stderr"; 
  exit 1;
}

function setup_aminoacid_table(tb )
{
  # Stores in {tb} the codon-to-aminoacid table.
  # Aminoacids are represented by capital letter.
  # The stop codons are mapped to "$".

  tb["AAA"] = "K";  
  tb["AAT"] = "N";  
  tb["AAC"] = "N";  
  tb["AAG"] = "K";  
  tb["ATA"] = "I";  
  tb["ATT"] = "I";  
  tb["ATC"] = "I";  
  tb["ATG"] = "M";  
  tb["ACA"] = "T";  
  tb["ACT"] = "T";  
  tb["ACC"] = "T";  
  tb["ACG"] = "T";  
  tb["AGA"] = "R";  
  tb["AGT"] = "S";  
  tb["AGC"] = "S";  
  tb["AGG"] = "R";  
  tb["TAA"] = "$";  
  tb["TAT"] = "Y";  
  tb["TAC"] = "Y";  
  tb["TAG"] = "$";  
  tb["TTA"] = "L";  
  tb["TTT"] = "F";  
  tb["TTC"] = "F";  
  tb["TTG"] = "L";  
  tb["TCA"] = "S";  
  tb["TCT"] = "S";  
  tb["TCC"] = "S";  
  tb["TCG"] = "S";  
  tb["TGA"] = "$";  
  tb["TGT"] = "C";  
  tb["TGC"] = "C";  
  tb["TGG"] = "W";  
  tb["CAA"] = "Q";  
  tb["CAT"] = "H";  
  tb["CAC"] = "H";  
  tb["CAG"] = "Q";  
  tb["CTA"] = "L";  
  tb["CTT"] = "L";  
  tb["CTC"] = "L";  
  tb["CTG"] = "L";  
  tb["CCA"] = "P";  
  tb["CCT"] = "P";  
  tb["CCC"] = "P";  
  tb["CCG"] = "P";  
  tb["CGA"] = "R";  
  tb["CGT"] = "R";  
  tb["CGC"] = "R";  
  tb["CGG"] = "R";  
  tb["GAA"] = "E";  
  tb["GAT"] = "D";  
  tb["GAC"] = "D";  
  tb["GAG"] = "E";  
  tb["GTA"] = "V";  
  tb["GTT"] = "V";  
  tb["GTC"] = "V";  
  tb["GTG"] = "V";  
  tb["GCA"] = "A";  
  tb["GCT"] = "A";  
  tb["GCC"] = "A";  
  tb["GCG"] = "A";  
  tb["GGA"] = "G";  
  tb["GGT"] = "G";  
  tb["GGC"] = "G";  
  tb["GGG"] = "G";  
}

function arg_error(msg)
{
  printf "** %s\n", msg > "/dev/stderr"; 
  printf "usage: %s\n", usage > "/dev/stderr"; 
  abort = 1; exit 1;
}

function prog_error(msg)
{
  printf "%s:d: ** program error -- %s\n", FILENAME, FNR, msg > "/dev/stderr"; 
  abort = 1; exit 1;
}