#! /usr/bin/gawk -f
# Last edited on 2008-06-15 08:21:27 by stolfi

BEGIN {
  # Error flag
  abort = -1; 
  
  USAGE = ( "cat INFILE | digest_Reese -v outName=NAME" );

  # Reads from standard input a GenBank data file, as 
  # in Reese's dataset. Writes four output files:
  # 
  #   "{outName}.doc" - the header files. 
  # 
  #   "{outName}.bas" - the sequence of bases [ATCG]. 
  # 
  #   "{outName}.lab" - the corresponding label sequence,
  #   with labels 
  #      
  #      'D', 'E', 'F'      The three codon positions in coding regions.
  #      'H', 'I', 'J'      Ditto for the stop codon.
  #      'P', 'Q', 'R' ,'S' Intron splice markers.
  #      'N'                Other intron bases.
  #  .   'X'                Initial UTR.
  #      'Y'                Final UTR.
  #
  #   "{outName}.ama" - the aminoacids from the 
  #   "/translation" entry.
  # 
  # All files begin with a ">" header. The ".doc" 
  # file keeps the original header. The other files
  # get a minimal ">" header with the data file id.
  #   
  # Uses the intron/exon/misc annotations in the 
  # input file's header to determine the labels.

  if (outName == "") { arg_error("must define \"outName\""); }

  state = -1 # Parsing state
  # state = -1 before "LOCUS" line
  # state =  0 before "FEATURES" line
  # state =  1 between "FEATURES" and "ORIGIN" lines
  # state =  2 between "ORIGIN" and "//" lines.
  # state =  3 after "//" line..
  
  source_ok = 0; # TRUE iff "source" entry has been found.
  mrna_ok = 0;   # TRUE iff "mRNA" entry has been found.
  cds_ok = 0;    # TRUE iff "CDS" entry has been found.
  trans_ok = 0;  # TRUE if "/translation" entry has been found.
  
  # Just in case: 
  locus = ""; # Locus identifier.

  split("", bas); # Base sequence will be {bas[seq_ini..seq_fin]}.
  split("", lab); # Label sequence will be {lab[seq_ini..seq_fin]}.
  split("", ama); # Translation sequence will be {ama[1..nama]}.
  doc = "";       # The documentation lines (including the original header).
}

(abort >= 0) { exit abort; }

((state <= 1) && ($0 !~ /^ORIGIN[ ]/)) { 
  # Append the line to the {doc} string: 
  doc = (doc $0 "\n");
  # ... but process it normally: 
}

# Ignore blank lines:
/^ *$/ { next; }

/^LOCUS[ ]/ {
  if (state != -1) { data_error(("\"LOCUS\" line out of place")); }
  state = 0;
  locus = $2;
  printf "locus = %s\n", locus > "/dev/stderr";
  next;
}

/^FEATURES[ ]/ {
  if (state != 0) { data_error(("\"FEATURES\" line out of place")); }
  state = 1;
  key = "";
  next;
}

/^ORIGIN[ ]/ {
  if (state != 1) { data_error(("\"ORIGIN\" line out of place")); }
  state = 2;
  if (! source_ok) { data_error(("missing \"source\" entry")); }
  
  # Set {bas[seq_ini..seq-fin]} to "?", just in case:
  for (i = seq_ini; i <= seq_fin; i++) { bas[i] = "?"; }
  
  # Prepare to parse the DNA data:
  ibas = seq_ini;
  next;
}

/^[\/][\/]/ {
  if (state != 2) { data_error(("\"//\" line out of place")); }
  if (ibas != seq_fin+1) { data_error(("DNA end = " ibas-1 " should be " seq_fin "")); }
  state = 3;
  next;
}

(state == -1) { 
   # Any other line before the "LOCUS" line:
   data_error(("missing \"LOCUS\" line"));
   next;
 }

(state == 0) { 
   # Any other line between "LOCUS" and "FEATURES" lines:
  next;
}

# Process annotation lines:
(state == 1) {
  # Any other line between "FEATURES" and "ORIGIN" lines:
  # Get keyword in columns 1--20:
  key = substr($0, 1, 20);  gsub(/[ ]/, "", key);
  # Get arguments in columns 21--end:
  arg = substr($0, 21);  gsub(/^[ ]+/, "", arg);
  if (key == "source")
    { process_source(arg); }
  else if (key == "mRNA")
    { process_mrna(arg); }
  else if (key == "CDS")
    { process_cds(arg); }
  else if (arg ~ /[\/]translation/)
    { process_translation(arg); }
  else
    { printf "  ignored annotation: %s\n", $0 > "/dev/stderr"; }
  next;
}

(state == 2) {
  # Any other line between "ORIGIN" and "//" lines:
  if ($0 !~ /^[ ]*[0-9]+[ ]+[a-z ]+[ ]*$/) { data_error(("bad DNA line \"" $0 "\"")); }
  if ($1 != ibas) { data_error(("sequence error \"" $1 "\" should be " ibas "")); }
  process_dna($0);
  next;
}

(state == 3) {
  # Any other line after "//" line:
  data_error(("extra line after \"//\"")); 
  next;
}

function process_source(arg,   i)
{
  # Processes the "source" (pre-splicing messenger RNA) entry.
  # Defines the variables {seq_ini} and {seq_fin}.
   
  if (source_ok) { data_error(("more than one \"source\" entry")); }
  source_ok = 1;
  seq_ini = get_ini(arg);
  seq_fin = get_fin(arg)
}

function process_mrna(arg,  k,ini,fin,i)
{
  # Processes the "mRNA" (spliced messenger RNA) entry.
  # sets the array {mrna_range[1..mrna_nranges]}.
  
  if (mrna_ok) { data_error(("more than one \"mRNA\" entry")); }
  mrna_ok = 1;

  # Get complete argument, remove "join( )"
  arg = cleanup_range_list(arg);

  printf "  mRNA = %s\n", arg > "/dev/stderr";
  
  # Remove '<' and '>' from ends:
  gsub(/^[<>]/, "", arg);
  gsub(/..[<>]/, "..", arg);

  # Split into separate ranges:
  mrna_nranges = split(arg, mrna_range, /[,]/);
}

function process_cds(arg,  phase,k,ini,fin,i)
{
  # Processes the "CDS" (coding region) entry.
  # Sets the array {cds_range[1..cds_nranges]}
  
  if (cds_ok) { data_error(("more than one \"CDS\" entry")); }
  cds_ok = 1;

  # Get complete argument, remove "join( )"
  arg = cleanup_range_list(arg);

  printf "  CDS = %s\n", arg > "/dev/stderr";

  # Split into separate ranges:
  cds_nranges = split(arg, cds_range, /[,]/);
}

function cleanup_range_list(arg)
{
  # Reduces {arg} to a bare list of comma-separated ranges.
  # If {arg} is a {join()} call, concatenates to it
  # additional lines if necessary until it is complete, then removes the 
  # "join(" and the final ")".  In any case removes all spaces. 
  # Assumes that the item does not contain any "complement()"
  # calls.
  if (arg ~ /^join/)
    { 
      while ((arg !~ /[\)] *$/) && getline) 
        { doc = ( doc $0 "\n") ;
          arg = ( arg $0 );
          if (arg ~ "complement") { data_error(("item has complementary segs")); }
        }
      gsub(/join *[\(]/, "", arg);
      gsub(/[\)] *$/, "", arg);
    }
  # Remove all spaces:
  gsub(/[ ]/, "", arg);
  
  return arg;
}

function process_translation(arg,   i)
{
  # Processes the "/translation" (aminoacid sequence) entry.
  # Sets {nama} to its length and stores the base letters into {ama[1..nama]}.
  
  if (trans_ok) { data_error(("more than one \"/translation\" entry")); }
  trans_ok = 1;
  
  # Get complete argument, remove "/translation=\"\""
  while ((arg !~ /[\"] *$/) && getline) 
    { doc = ( doc $0 "\n") ;
      arg = ( arg $0 );
    }
  gsub(/[\/]translation *[=] *[\"]/, "", arg);
  gsub(/[\"] *$/, "", arg);
  # Remove all spaces:
  gsub(/[ ]/, "", arg);
  
  printf "  translation = %s\n", arg > "/dev/stderr";
  
  # Save in global variables:
  nama = length(arg);
  for (i = 1; i <= nama; i++) { ama[i] = substr(arg, i, 1); }
}

function get_ini(r)
{
  if (r !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("bad range format \"" r "\"")); }
  gsub(/[.][.][0-9]+$/, "", r);
  return r+0;
}

function get_fin(r)
{
  if (r !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("bad range format \"" r "\"")); }
  gsub(/^[0-9]+[.][.]/, "", r);
  return r+0;
}

function process_dna(lin,   j,n,c)
{
  # Processes a DNA nucleotide line.
  # Stores the bases into {bas[ibas..]} and increments {ibas}.
  
  gsub(/^[ ]*[0-9]+[ ]+/, "", lin);
  gsub(/[ ]/, "", lin);
  n = length(lin);
  for (j = 1; j <= n; j++)
    { c = substr(lin, j, 1);
      if (c !~ /^[atcgATCG]$/) { data_error(("invalid base letter \"" c "\"")); }
      bas[ibas] = c;
      ibas++;
    }
}

END {
  if (abort >= 0) { exit abort; }
  
  # Check whether all necessary information has been parsed:
  if (locus == "") { data_error(("missing \"LOCUS\" line")); }
  if (state != 3) { data_error(("missing \"//\" line")); }
  if (! cds_ok) { data_error(("missing \"CDS\" entry")); }
  if (! trans_ok) { data_error(("missing \"/translation\" entry")); }

  setup_label_sequence();
  
  # Choose the output file names:
  doc_file = ( outName ".doc" );
  bas_file = ( outName ".bas" );
  lab_file = ( outName ".lab" );
  ama_file = ( outName ".ama" );

  # Write the ".doc" file (excluding the "/translation" entry): 
  gsub(/[\/]translation *[=] *["][^"\/]*["]/, "/translation=\"...\"", doc);
  printf "%s", doc > doc_file; doc = "";
  
  # Write minimal header lines:
  printf "> %s\n", locus > bas_file;
  printf "> %s\n", locus > lab_file;
  printf "> %s\n", locus > ama_file;
  
  # Write the base and label sequences: 
  for (i = seq_ini; i <= seq_fin; i++)
    { if ((i > seq_ini) && ((i - seq_ini) % 60 == 0)) 
        { printf "\n" > bas_file; printf "\n" > lab_file; }
      printf "%c", bas[i] > bas_file; printf "%c", lab[i] > lab_file;
    }
  # Write the aminoacid sequence:
  if (! trans_ok) { data_error(("missing \"/translation\" entry")); }
  for (iama = 1; iama <= nama; iama++)
    { if ((iama > 1) && (iama % 60 == 1)) { printf "\n" > ama_file; }
      printf "%s", ama[i] > ama_file; 
    }

  if (seq_ini <= seq_fin) { printf "\n" > bas_file; }
  if (seq_ini <= seq_fin) { printf "\n" > lab_file; }
  if (nama > 0) { printf "\n" > ama_file; }

  close(doc_file);
  close(bas_file);
  close(lab_file);
  close(ama_file);
}

function setup_label_sequence(     k,i,ini,fin,ch,ex)
{
  # Fills the array {lab[seq_ini..seq_fin]} with labels,
  # based on the arrays {mrna_range[1..mrna_nranges]}
  # and {cds_range[1..cds_nranges]}.
  
  # First, sets all {lab[...]} positions in the source to "N".
  # Later some of them will be turned into into [DEFHIJXYPQRS].
  for (i = seq_ini; i <= seq_fin; i++) { lab[i] = "N"; }
  
  if (mrna_ok)
    { # Item has information on the spliced mRNA range, hence 
      # we can identify the UTRs. 
      # Sets all {lab[...]} positions in the spliced mRNA to "X".
      
      # Later some of these "X" will be turned into [DEFHIJPQRS],
      # and then any post-CDS "X" will be changed to "Y".
      for (k = 1; k <= mrna_nranges; k++)
        { ini = get_ini(mrna_range[k]);
          fin = get_fin(mrna_range[k]);
          if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin))
            { data_error(("mRNA range \"" mrna_range[k] "\" invalid or outside of source range")); }
          printf "  mRNA range %s..%s\n", ini, fin > "/dev/stderr";
          for (i = ini; i <= fin; i++)
            { if (lab[i] != "N") { data_error(("lab[" i "] = \"" lab[i] "\" expected \"N\"")); }
              lab[i] = "X";\
            }
        }
    }
  else
    { printf "warning: missing \"mRNA\" entry" > "/dev/stderr"; }
    
  # Sets all {lab[...]} positions in the CDS to [DEF] according to phase.
  # Later the final codon will be turned into [HIJ].
  # If the "mRNA" entry was present, the CDS must be contained in the mRNA.
  ncds = 0; # Number of bases in the coding sequence.
  for (k = 1; k <= cds_nranges; k++)
    { ini = get_ini(cds_range[k]);
      fin = get_fin(cds_range[k]);
      if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin))
        { data_error(("CDS range \"" cds_range[k] "\" invalid or outside of source range")); }
      printf "  CDS range %s..%s\n", ini, fin > "/dev/stderr";
      for (i = ini; i <= fin; i++)
        { ex = (mrna_ok ? "X" : "N"); # Expected value of {lab[i]}.
          if (lab[i] != ex) { data_error(("lab[" i "] = \"" lab[i] "\" expected \"" ex "\"")); }
          lab[i] = substr("DEF", (ncds%3)+1, 1); 
          ncds ++;
        }
    }
    
  # Check consistency of CDS and translation lengths:
  if (ncds != 3*(nama+1))
    { data_error(("CDS length = \"" ncds "\" expected 3*(" nama "+1) = " 3*(nama+1) "")); }
  
  # Relabel the terminal UTR from "X" to "Y":
  i = seq_fin;
  while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F")))
    { if (lab[i] == "X")
        { lab[i] = "Y"; }
      else if (lab[i] != "N")
        { data_error(("lab[" i "] = \"" lab[i] "\" expected \"N\" or \"X\"")); }
      i--; 
    }
    
  # Show final codon:
  printf "  seq_ini = %d  seq_fin = %d  i = %d\n", seq_ini, seq_fin, i > "/dev/stderr";
  show_codon("last codon bas", bas, i - 2);
  show_codon("last codon lab", lab, i - 2);
    
  # Relabel the last codon of the CDS as "HIJ": 
  if ((i < seq_ini) || (lab[i] != "F")) { data_error(("missing final \"F\"")); }
  lab[i] = "J"; i--;
  while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; }
  if ((i < seq_ini) || (lab[i] != "E")) { data_error(("missing final \"E\"")); }
  lab[i] = "I"; i--;
  while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; }
  if ((i < seq_ini) || (lab[i] != "D")) { data_error(("missing final \"D\"")); }
  lab[i] = "H"; i--;
  
  # Label intron splice markers with "PQ" and "RS":
  for (i = seq_ini+1; i <= seq_fin; i++)
    { ch = lab[i-1];
      if (lab[i] == "N")
        { if (ch == "P") 
            { lab[i] = "Q"; }
          else if ((ch != "Q") && (ch != "N")) 
            { lab[i] = "P"; }
        }
      else if (ch == "P")
        { data_error(("intron too short - expected \"N\", found \"" lab[i] "\"")); }
    }
  for (i = seq_fin-1; i >= seq_ini; i--)
    { ch = lab[i+1];
      if (lab[i] == "N")
        { if (ch == "S") 
            { lab[i] = "R"; }
          else if ((ch != "R") && (ch != "N")) 
            { lab[i] = "R"; }
        }
      else if (ch == "S")
        { data_error(("intron too short - expected \"N\", found \"" lab[i] "\"")); }
    }
}

function show_codon(title,seq,pos,  j)
{
  # Prints to {stderr} the three letters {seq[pos..pos+2]},
  # with some context, clipped to the range {seq_ini..seq_fin}.
  printf "  %s = ", title > "/dev/stderr";
  for (j = pos - 5; j <= pos + 7; j++)
    { if ((j >= seq_ini) && (j <= seq_fin))
        { if (j == pos) { printf "[" > "/dev/stderr"; }
          printf "%s", seq[j] > "/dev/stderr";
          if (j == pos + 2) { printf "]" > "/dev/stderr"; }
        }
    }
  printf "\n" > "/dev/stderr";
}

function data_error(msg)
{
  printf "  %s\n", $0 > "/dev/stderr";
  printf "** %s:%s: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function arg_error(msg)
{
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit abort;
}