#! /usr/bin/gawk -f
# Last edited on 2008-06-15 08:21:57 by stolfi

BEGIN {
  USAGE = ( \
    "reformat-ncbi-translation-table \\\n" \
    " < {INFILE}.htbl \\\n" \
    " > {OUTFILE}.vtbl" \
  );

  # Reads an NCBI translation table in "horizontal" format, e.g.
  #
  #   AAs    = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
  #   Starts = ---M---------------M---------------M----------------------------
  #   Base1  = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
  #   Base2  = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
  #   Base3  = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
  #
  # Tabs, carriage returns, '#'-comments and blank lines in the input
  # are ignored.  Each accepted input row is echoed to the output as a
  # '#'-comment.  Then the same information is written in "vertical"
  # format, one codon per row: the codon, its aminoacid letter and its
  # start flag.  The codons are enumerated with the bases in the order
  # A, T, C, G at every position (so the first row is AAA), which is
  # NOT the T, C, A, G column order of the input.  For the table
  # above the output begins:
  #
  #   # DEF AS
  #   # --- --
  #    AAA K- #
  #    AAT N- #
  #    AAC N- #
  #    AAG K- #
  #    ...
  #
  # Fails with exit status 1 if, at end of input, any of the five rows
  # is missing or is not 64 characters long, or if some codon over
  # {A,T,C,G} has no column in the input table.

  # Hack to get around gawk's "exit" misfeature: an {exit} executed in
  # a rule still runs the END block, so END must be told to quit.
  # A value {abort >= 0} means "stop now with that exit status".
  # NOTE(review): {abort} is never set in this section; presumably
  # {data_error} (defined elsewhere) sets it -- confirm.
  abort = -1;

  # Rows of the input NCBI table:
  AAs = "";
  Starts = "";
  Base1 = "";
  Base2 = "";
  Base3 = "";

  # Columns of the output table:
  split("", tbA); # {tbA[c]} is the aminoacid for codon {c}.
  split("", tbS); # {tbS[c]} is the start flag for codon {c}.
}

# Stop reading input as soon as an abort has been requested:
(abort >= 0) { exit(abort); }

# General input cleanup (applies to every line; modifying $0 makes
# awk re-split the fields):
{
  # Remove tabs and other crud:
  gsub(/[\011\015]/, " ", $0);
  # Remove '#'-comments ('#' needs no backslash inside brackets):
  gsub(/[#].*$/, "", $0);
  # Make sure that the "=" is a separate field.  The brackets keep
  # the regexp from being read as the "/=" operator:
  gsub(/[=]/, " = ", $0);
}

# Ignore blank lines:
/^[ ]*$/ { next; }

# Collect table rows:
(($1 == "AAs") && ($2 == "=")) {
  AAs = check_input_row($3); prtc(); next;
}

(($1 == "Starts") && ($2 == "=")) {
  Starts = check_input_row($3); prtc(); next;
}

(($1 == "Base1") && ($2 == "=")) {
  Base1 = check_input_row($3); prtc(); next;
}

(($1 == "Base2") && ($2 == "=")) {
  Base2 = check_input_row($3); prtc(); next;
}

(($1 == "Base3") && ($2 == "=")) {
  Base3 = check_input_row($3); prtc(); next;
}

# Complain if anything else:
{ data_error("invalid format"); next; }

function check_input_row(txt)
{
  # Checks whether the input table row {txt} is well-formed, namely
  # whether it has exactly 64 characters (one per codon).  If not,
  # calls {data_error}.  Returns {txt} unchanged.
  # NOTE(review): {data_error} is not defined in this section; it is
  # presumably provided elsewhere and aborts the run -- confirm.
  if (length(txt) != 64)
    { data_error(("wrong table row length = " length(txt) "")); }
  return txt;
}

function prtc()
{
  # Prints the current (already cleaned-up) line as a '#'-comment.
  printf "# %s\n", $0;
}

function end_error(msg)
{
  # Reports on standard error a problem with the table as a whole,
  # detected after all input was read, and exits with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  exit(1);
}

function check_table_row(name, txt)
{
  # Fails unless the row called {name} was read and has 64 columns.
  # An absent row still has the empty value assigned in BEGIN.
  if (length(txt) != 64)
    { end_error(("row \"" name "\" is missing or does not have 64 columns")); }
}

END {
  if (abort >= 0) { exit(abort); }

  # All five rows are needed to build the table; without this check
  # a missing row would silently yield empty codons or empty fields:
  check_table_row("AAs", AAs);
  check_table_row("Starts", Starts);
  check_table_row("Base1", Base1);
  check_table_row("Base2", Base2);
  check_table_row("Base3", Base3);

  # Unpack the input table:
  for (k = 1; k <= 64; k++)
    { # Extract the codon from column {k} of rows {Base1,Base2,Base3}:
      c = (substr(Base1,k,1) substr(Base2,k,1) substr(Base3,k,1));
      # Store the aminoacid and start flag from rows {AAs,Starts}:
      tbA[c] = substr(AAs,k,1);
      tbS[c] = substr(Starts,k,1);
    }

  # Enumerate the output codons in the order they will be written,
  # making sure that each one has an entry BEFORE printing anything,
  # so that a bad table never produces a partial output table:
  nc = 0;
  split("", codon);
  for (d = 1; d <= 4; d++)
    { D = substr("ATCG", d, 1);
      for (e = 1; e <= 4; e++)
        { E = substr("ATCG", e, 1);
          for (f = 1; f <= 4; f++)
            { F = substr("ATCG", f, 1);
              # Assemble the codon {c} from bases {D,E,F}:
              c = (D E F);
              if (! (c in tbA))
                { end_error(("input table has no column for codon " c)); }
              nc++;
              codon[nc] = c;
            }
        }
    }

  # Write the output table:
  printf "\n";
  printf "# DEF AS\n";
  printf "# --- --\n";
  for (k = 1; k <= nc; k++)
    { c = codon[k];
      # Print the row of {c}:
      printf " %s %s%s # \n", c, tbA[c], tbS[c];
    }
}