#! /usr/bin/gawk -f
# Last edited on 2008-06-16 02:22:18 by stolfi

# ----------------------------------------------------------------------

  doc_file = ( outName ".doc" );
  ama_file = ( outName ".ama" );

  # Write minimal header lines:
  printf "> %s\n", locus > ama_file;

  # Write the ".doc" file (excluding the "/translation" entry): 
  gsub(/[\/]translation *[=] *["][^"\/]*["]/, "/translation=\"...\"", doc);
  printf "%s", doc > doc_file; doc = "";

  # Write the aminoacid sequence:
  if (! trans_ok) { data_error(("missing «/translation» entry")); }
  for (iama = 1; iama <= nama; iama++)
    { if ((iama > 1) && (iama % 60 == 1)) { printf "\n" > ama_file; }
      printf "%s", ama[i] > ama_file; 
    }
  if (nama > 0) { printf "\n" > ama_file; }
  
  close(doc_file);
  close(ama_file);

# ----------------------------------------------------------------------

function cleanup_range_list(arg)
{
  # Reduces {arg} to a bare list of comma-separated ranges.
  # If {arg} is a {join()} call, concatenates to it
  # additional lines if necessary until it is complete, then removes the 
  # "join(" and the final ")".  In any case removes all spaces. 
  # Assumes that the item does not contain any "complement()"
  # calls.
  if (arg ~ /^join/)
    { 
      while ((arg !~ /[\)] *$/) && getline) 
        { doc = ( doc $0 "\n") ;
          arg = ( arg $0 );
          if (arg ~ "complement") { data_error(("item has complementary segs")); }
        }
      gsub(/join *[\(]/, "", arg);
      gsub(/[\)] *$/, "", arg);
    }
  # Remove all spaces:
  gsub(/[ ]/, "", arg);
  
  return arg;
}

# ----------------------------------------------------------------------

function process_translation(arg,   i)
{
  # Processes the "/translation" (aminoacid sequence) entry.
  # Sets {nama} to its length and stores the base letters into {ama[1..nama]}.

  # Abbreviation    Amino acid name
  # ------------    ---------------
  # 
  # Ala     A       Alanine
  # Arg     R       Arginine
  # Asn     N       Asparagine
  # Asp     D       Aspartic acid (Aspartate)
  # Cys     C       Cysteine
  # Gln     Q       Glutamine
  # Glu     E       Glutamic acid (Glutamate)
  # Gly     G       Glycine
  # His     H       Histidine
  # Ile     I       Isoleucine
  # Leu     L       Leucine
  # Lys     K       Lysine
  # Met     M       Methionine
  # Phe     F       Phenylalanine
  # Pro     P       Proline
  # Pyl     O       Pyrrolysine
  # Ser     S       Serine
  # Sec     U       Selenocysteine
  # Thr     T       Threonine
  # Trp     W       Tryptophan
  # Tyr     Y       Tyrosine
  # Val     V       Valine
  # Asx     B       Aspartic acid or Asparagine
  # Glx     Z       Glutamine or Glutamic acid.
  # Xaa     X       Any amino acid.
  # Xle     J       Leucine or Isoleucine
  # TERM            termination codon
  # 
  if (trans_ok) { data_error(("more than one «/translation» entry")); }
  trans_ok = 1;
  
  # Get complete argument, remove "/translation=\"\""
  while ((arg !~ /[\"] *$/) && getline) 
    { doc = ( doc $0 "\n") ;
      arg = ( arg $0 );
    }
  gsub(/[\/]translation *[=] *[\"]/, "", arg);
  gsub(/[\"] *$/, "", arg);
  # Remove all spaces:
  gsub(/[ ]/, "", arg);
  
  printf "  translation = %s\n", arg > "/dev/stderr";
  
  # Save in global variables:
  nama = length(arg);
  for (i = 1; i <= nama; i++) { ama[i] = substr(arg, i, 1); }
}

# ----------------------------------------------------------------------

function build_labeling(    i,k,ini,fin,ch,ex,)
{
  # Later some of them will be turned into into [DEFHIJXYPQRS].
  for (i = seq_ini; i <= seq_fin; i++) { lab[i] = "N"; }
  
  if (mrna_ok)
    { # Item has information on the spliced mRNA range, hence 
      # we can identify the UTRs. 
      # Sets all {lab[...]} positions in the spliced mRNA to "X".
      
      # Later some of these "X" will be turned into [DEFHIJPQRS],
      # and then any post-CDS "X" will be changed to "Y".
      for (k = 1; k <= mrna_nranges; k++)
        { ini = get_range_ini(mrna_range[k]);
          fin = get_range_fin(mrna_range[k]);
          if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin))
            { data_error(("mRNA range «" mrna_range[k] "» invalid or outside of source range")); }
          printf "  mRNA range %s..%s\n", ini, fin > "/dev/stderr";
          for (i = ini; i <= fin; i++)
            { if (lab[i] != "N") { data_error(("lab[" i "] = «" lab[i] "» expected «N»")); }
              lab[i] = "X";\
            }
        }
    }
  else
    { printf "warning: missing «mRNA» entry" > "/dev/stderr"; }
    
  # Sets all {lab[...]} positions in the CDS to [DEF] according to phase.
  # Later the final codon will be turned into [HIJ].
  # If the "mRNA" entry was present, the CDS must be contained in the mRNA.
  ncds = 0; # Number of bases in the coding sequence.
  for (k = 1; k <= cds_nranges; k++)
    { ini = get_range_ini(cds_range[k]);
      fin = get_range_fin(cds_range[k]);
      if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin))
        { data_error(("CDS range «" cds_range[k] "» invalid or outside of source range")); }
      printf "  CDS range %s..%s\n", ini, fin > "/dev/stderr";
      for (i = ini; i <= fin; i++)
        { ex = (mrna_ok ? "X" : "N"); # Expected value of {lab[i]}.
          if (lab[i] != ex) { data_error(("lab[" i "] = «" lab[i] "» expected «" ex "»")); }
          lab[i] = substr("DEF", (ncds%3)+1, 1); 
          ncds ++;
        }
    }
    
  # Check consistency of CDS and translation lengths:
  if (ncds != 3*(nama+1))
    { data_error(("CDS length = «" ncds "» expected 3*(" nama "+1) = " 3*(nama+1) "")); }
  
  # Relabel the terminal UTR from "X" to "Y":
  i = seq_fin;
  while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F")))
    { if (lab[i] == "X")
        { lab[i] = "Y"; }
      else if (lab[i] != "N")
        { data_error(("lab[" i "] = «" lab[i] "» expected «N» or «X»")); }
      i--; 
    }
    
  # Show final codon:
  printf "  seq_ini = %d  seq_fin = %d  i = %d\n", seq_ini, seq_fin, i > "/dev/stderr";
  show_codon("last codon bas", bas, i - 2);
  show_codon("last codon lab", lab, i - 2);
    
  # Relabel the last codon of the CDS as "HIJ": 
  if ((i < seq_ini) || (lab[i] != "F")) { data_error(("missing final «F»")); }
  lab[i] = "J"; i--;
  while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; }
  if ((i < seq_ini) || (lab[i] != "E")) { data_error(("missing final «E»")); }
  lab[i] = "I"; i--;
  while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; }
  if ((i < seq_ini) || (lab[i] != "D")) { data_error(("missing final «D»")); }
  lab[i] = "H"; i--;
  
  # Label intron splice markers with "PQ" and "RS":
  for (i = seq_ini+1; i <= seq_fin; i++)
    { ch = lab[i-1];
      if (lab[i] == "N")
        { if (ch == "P") 
            { lab[i] = "Q"; }
          else if ((ch != "Q") && (ch != "N")) 
            { lab[i] = "P"; }
        }
      else if (ch == "P")
        { data_error(("intron too short - expected «N», found «" lab[i] "»")); }
    }
  for (i = seq_fin-1; i >= seq_ini; i--)
    { ch = lab[i+1];
      if (lab[i] == "N")
        { if (ch == "S") 
            { lab[i] = "R"; }
          else if ((ch != "R") && (ch != "N")) 
            { lab[i] = "R"; }
        }
      else if (ch == "S")
        { data_error(("intron too short - expected «N», found «" lab[i] "»")); }
    }
}