#! /usr/bin/gawk -f # Last edited on 2008-06-16 02:22:18 by stolfi # ---------------------------------------------------------------------- doc_file = ( outName ".doc" ); ama_file = ( outName ".ama" ); # Write minimal header lines: printf "> %s\n", locus > ama_file; # Write the ".doc" file (excluding the "/translation" entry): gsub(/[\/]translation *[=] *["][^"\/]*["]/, "/translation=\"...\"", doc); printf "%s", doc > doc_file; doc = ""; # Write the aminoacid sequence: if (! trans_ok) { data_error(("missing «/translation» entry")); } for (iama = 1; iama <= nama; iama++) { if ((iama > 1) && (iama % 60 == 1)) { printf "\n" > ama_file; } printf "%s", ama[i] > ama_file; } if (nama > 0) { printf "\n" > ama_file; } close(doc_file); close(ama_file); # ---------------------------------------------------------------------- function cleanup_range_list(arg) { # Reduces {arg} to a bare list of comma-separated ranges. # If {arg} is a {join()} call, concatenates to it # additional lines if necessary until it is complete, then removes the # "join(" and the final ")". In any case removes all spaces. # Assumes that the item does not contain any "complement()" # calls. if (arg ~ /^join/) { while ((arg !~ /[\)] *$/) && getline) { doc = ( doc $0 "\n") ; arg = ( arg $0 ); if (arg ~ "complement") { data_error(("item has complementary segs")); } } gsub(/join *[\(]/, "", arg); gsub(/[\)] *$/, "", arg); } # Remove all spaces: gsub(/[ ]/, "", arg); return arg; } # ---------------------------------------------------------------------- function process_translation(arg, i) { # Processes the "/translation" (aminoacid sequence) entry. # Sets {nama} to its length and stores the base letters into {ama[1..nama]}. # Abbreviation Amino acid name # ------------ --------------- # # Ala A Alanine # Arg R Arginine # Asn N Asparagine # Asp D Aspartic acid (Aspartate) # Cys C Cysteine # Gln Q Glutamine # Glu E Glutamic acid (Glutamate) # Gly G Glycine # His H Histidine # Ile I Isoleucine # Leu L Leucine # Lys K Lysine # Met M Methionine # Phe F Phenylalanine # Pro P Proline # Pyl O Pyrrolysine # Ser S Serine # Sec U Selenocysteine # Thr T Threonine # Trp W Tryptophan # Tyr Y Tyrosine # Val V Valine # Asx B Aspartic acid or Asparagine # Glx Z Glutamine or Glutamic acid. # Xaa X Any amino acid. # Xle J Leucine or Isoleucine # TERM termination codon # if (trans_ok) { data_error(("more than one «/translation» entry")); } trans_ok = 1; # Get complete argument, remove "/translation=\"\"" while ((arg !~ /[\"] *$/) && getline) { doc = ( doc $0 "\n") ; arg = ( arg $0 ); } gsub(/[\/]translation *[=] *[\"]/, "", arg); gsub(/[\"] *$/, "", arg); # Remove all spaces: gsub(/[ ]/, "", arg); printf " translation = %s\n", arg > "/dev/stderr"; # Save in global variables: nama = length(arg); for (i = 1; i <= nama; i++) { ama[i] = substr(arg, i, 1); } } # ---------------------------------------------------------------------- function build_labeling( i,k,ini,fin,ch,ex,) { # Later some of them will be turned into into [DEFHIJXYPQRS]. for (i = seq_ini; i <= seq_fin; i++) { lab[i] = "N"; } if (mrna_ok) { # Item has information on the spliced mRNA range, hence # we can identify the UTRs. # Sets all {lab[...]} positions in the spliced mRNA to "X". # Later some of these "X" will be turned into [DEFHIJPQRS], # and then any post-CDS "X" will be changed to "Y". for (k = 1; k <= mrna_nranges; k++) { ini = get_range_ini(mrna_range[k]); fin = get_range_fin(mrna_range[k]); if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin)) { data_error(("mRNA range «" mrna_range[k] "» invalid or outside of source range")); } printf " mRNA range %s..%s\n", ini, fin > "/dev/stderr"; for (i = ini; i <= fin; i++) { if (lab[i] != "N") { data_error(("lab[" i "] = «" lab[i] "» expected «N»")); } lab[i] = "X";\ } } } else { printf "warning: missing «mRNA» entry" > "/dev/stderr"; } # Sets all {lab[...]} positions in the CDS to [DEF] according to phase. # Later the final codon will be turned into [HIJ]. # If the "mRNA" entry was present, the CDS must be contained in the mRNA. ncds = 0; # Number of bases in the coding sequence. for (k = 1; k <= cds_nranges; k++) { ini = get_range_ini(cds_range[k]); fin = get_range_fin(cds_range[k]); if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin)) { data_error(("CDS range «" cds_range[k] "» invalid or outside of source range")); } printf " CDS range %s..%s\n", ini, fin > "/dev/stderr"; for (i = ini; i <= fin; i++) { ex = (mrna_ok ? "X" : "N"); # Expected value of {lab[i]}. if (lab[i] != ex) { data_error(("lab[" i "] = «" lab[i] "» expected «" ex "»")); } lab[i] = substr("DEF", (ncds%3)+1, 1); ncds ++; } } # Check consistency of CDS and translation lengths: if (ncds != 3*(nama+1)) { data_error(("CDS length = «" ncds "» expected 3*(" nama "+1) = " 3*(nama+1) "")); } # Relabel the terminal UTR from "X" to "Y": i = seq_fin; while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { if (lab[i] == "X") { lab[i] = "Y"; } else if (lab[i] != "N") { data_error(("lab[" i "] = «" lab[i] "» expected «N» or «X»")); } i--; } # Show final codon: printf " seq_ini = %d seq_fin = %d i = %d\n", seq_ini, seq_fin, i > "/dev/stderr"; show_codon("last codon bas", bas, i - 2); show_codon("last codon lab", lab, i - 2); # Relabel the last codon of the CDS as "HIJ": if ((i < seq_ini) || (lab[i] != "F")) { data_error(("missing final «F»")); } lab[i] = "J"; i--; while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; } if ((i < seq_ini) || (lab[i] != "E")) { data_error(("missing final «E»")); } lab[i] = "I"; i--; while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; } if ((i < seq_ini) || (lab[i] != "D")) { data_error(("missing final «D»")); } lab[i] = "H"; i--; # Label intron splice markers with "PQ" and "RS": for (i = seq_ini+1; i <= seq_fin; i++) { ch = lab[i-1]; if (lab[i] == "N") { if (ch == "P") { lab[i] = "Q"; } else if ((ch != "Q") && (ch != "N")) { lab[i] = "P"; } } else if (ch == "P") { data_error(("intron too short - expected «N», found «" lab[i] "»")); } } for (i = seq_fin-1; i >= seq_ini; i--) { ch = lab[i+1]; if (lab[i] == "N") { if (ch == "S") { lab[i] = "R"; } else if ((ch != "R") && (ch != "N")) { lab[i] = "R"; } } else if (ch == "S") { data_error(("intron too short - expected «N», found «" lab[i] "»")); } } }