#! /usr/bin/gawk -f
# Last edited on 2008-06-15 08:21:57 by stolfi

BEGIN {
  USAGE = ( \
    "reformat-ncbi-translation-table \\\n" \
    "  < {INFILE}.htbl \\\n" \
    "  > {OUTFILE}.vtbl" \
  );
  
  # Reads an NCBI genetic code table in "horizontal" format, e.g.
  #   
  #     AAs  = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
  #   Starts = ---M---------------M---------------M----------------------------
  #   Base1  = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
  #   Base2  = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
  #   Base3  = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
  #  
  # Column {k} of rows {Base1,Base2,Base3} spells one codon; column {k}
  # of {AAs} is its aminoacid and column {k} of {Starts} is its start flag.
  #
  # Each input row is echoed to the output as a '#'-comment.  After the
  # whole input has been read, the same info is written in vertical
  # format, one codon per line, with the bases of each position 
  # enumerated in the order A, T, C, G:
  #   
  #   # DEF AS
  #   # --- --
  #     AAA K- #
  #     AAT N- #
  #     AAC N- #
  #     AAG K- #
  #     ...
  #
  # where "DEF" is the codon, "A" the aminoacid and "S" the start flag.
  
  abort = -1; # Hack to get around gawk's "exit" misfeature. 

  # Rows of the input NCBI table (empty until the row is read):
  AAs = "";
  Starts = "";
  Base1 = "";
  Base2 = "";
  Base3 = "";

  # Columns of the output table:
  split("", tbA); # {tbA[c]} is the aminoacid for codon {c}. 
  split("", tbS); # {tbS[c]} is the start flag for codon {c}. 
}

# Once an error has been flagged, stop consuming input:
abort >= 0 { exit(abort); }

# Normalize every input line before it is examined:
{
  # Tabs and carriage returns become plain blanks:
  gsub(/[\011\015]/, " ", $0);
  # Drop everything from the first '#' to the end of the line:
  gsub(/[\#].*$/, "", $0);
  # Surround every "=" with blanks so that it is a field by itself:
  gsub(/[=]/, " = ", $0);
}

# Lines with nothing but blanks left on them are skipped:
/^[ ]*$/ { next; }

# A line of the form "{NAME} = {ROW}" supplies one row of the table.
# The row is validated, stored, and the line is echoed as a comment.
# A line with an unrecognized {NAME} falls through to the rule below.
$2 == "=" {
  if ($1 == "AAs")
    { AAs = check_input_row($3); prtc(); next; }
  else if ($1 == "Starts")
    { Starts = check_input_row($3); prtc(); next; }
  else if ($1 == "Base1")
    { Base1 = check_input_row($3); prtc(); next; }
  else if ($1 == "Base2")
    { Base2 = check_input_row($3); prtc(); next; }
  else if ($1 == "Base3")
    { Base3 = check_input_row($3); prtc(); next; }
}

# Anything that got this far is not a valid table line.
# NOTE(review): {data_error} is not defined in the visible part of this
# file; presumably it reports the message and sets {abort} - confirm.
{ data_error("invalid format"); next; }

function check_input_row(txt,   len) 
{
  # Validates the input table row {txt}, which must have exactly 64
  # characters (one per codon).  Calls {data_error} if the length is
  # wrong.  Returns {txt} itself, unmodified.  ({len} is a local.)
  
  len = length(txt);
  if (len != 64)
    { data_error(("wrong table row length = " len "")); }
  return txt;
}

function prtc()
{
  # Echoes the current input record {$0} to the output, 
  # prefixed with "# " so that it reads as a '#'-comment.
  printf("# %s\n", $0);
}

END {
  # In awk, an "exit" inside a rule still runs the END action, so an
  # error flagged earlier must be honored here before doing anything:
  if (abort >= 0) { exit(abort); }
  
  # Every row of the table must have been supplied.  A row that was
  # never read is still empty, and would otherwise yield truncated
  # codons and silently blank output columns:
  bad = "";
  if (length(AAs) != 64)    { bad = (bad " AAs"); }
  if (length(Starts) != 64) { bad = (bad " Starts"); }
  if (length(Base1) != 64)  { bad = (bad " Base1"); }
  if (length(Base2) != 64)  { bad = (bad " Base2"); }
  if (length(Base3) != 64)  { bad = (bad " Base3"); }
  if (bad != "")
    { printf "** missing or incomplete table row(s):%s\n", bad > "/dev/stderr";
      abort = 1; exit(abort);
    }
  
  # Unpack the input table:
  for (k = 1; k <= 64; k++)
    { # Extract the codon from column {k} of rows {Base1,Base2,Base3}:
      c = (substr(Base1,k,1) substr(Base2,k,1) substr(Base3,k,1));
      # Store the aminoacid and start flag from rows {AAs,Starts}:
      tbA[c] = substr(AAs,k,1);
      tbS[c] = substr(Starts,k,1);
    }
  
  # List the codons in output order, checking that each one is defined
  # before anything is printed.  Since the input has exactly 64 columns,
  # this test also catches repeated codons and bases other than
  # A, T, C, G (such as "U" or lowercase letters):
  bases = "ATCG";
  split("", cods); # {cods[i]} is the codon of output line {i}.
  nc = 0;
  for (d = 1; d <= 4; d++)
    { D = substr(bases, d, 1);
      for (e = 1; e <= 4; e++)
        { E = substr(bases, e, 1);
          for (f = 1; f <= 4; f++)
            { F = substr(bases, f, 1);
              # Assemble the codon {c} from bases {D,E,F}:
              c = (D E F);
              if (! (c in tbA))
                { printf "** codon \"%s\" is not defined by the input table\n", c > "/dev/stderr";
                  abort = 1; exit(abort);
                }
              nc++; cods[nc] = c;
            }
        }
    }
  
  # Write the output table: 
  printf "\n";
  printf "# DEF AS\n";
  printf "# --- --\n";
  for (i = 1; i <= nc; i++)
    { c = cods[i];
      # Print the row of {c}:
      printf "  %s %s%s # \n", c, tbA[c], tbS[c];
    }
}