#! /usr/bin/gawk -f # Last edited on 2008-06-15 08:21:27 by stolfi BEGIN { # Error flag abort = -1; USAGE = ( "cat INFILE | digest_Reese -v outName=NAME" ); # Reads from standard input a GenBank data file, as # in Reese's dataset. Writes four output files: # # "{outName}.doc" - the header files. # # "{outName}.bas" - the sequence of bases [ATCG]. # # "{outName}.lab" - the corresponding label sequence, # with labels # # 'D', 'E', 'F' The three codon positions in coding regions. # 'H', 'I', 'J' Ditto for the stop codon. # 'P', 'Q', 'R' ,'S' Intron splice markers. # 'N' Other intron bases. # . 'X' Initial UTR. # 'Y' Final UTR. # # "{outName}.ama" - the aminoacids from the # "/translation" entry. # # All files begin with a ">" header. The ".doc" # file keeps the original header. The other files # get a minimal ">" header with the data file id. # # Uses the intron/exon/misc annotations in the # input file's header to determine the labels. if (outName == "") { arg_error("must define \"outName\""); } state = -1 # Parsing state # state = -1 before "LOCUS" line # state = 0 before "FEATURES" line # state = 1 between "FEATURES" and "ORIGIN" lines # state = 2 between "ORIGIN" and "//" lines. # state = 3 after "//" line.. source_ok = 0; # TRUE iff "source" entry has been found. mrna_ok = 0; # TRUE iff "mRNA" entry has been found. cds_ok = 0; # TRUE iff "CDS" entry has been found. trans_ok = 0; # TRUE if "/translation" entry has been found. # Just in case: locus = ""; # Locus identifier. split("", bas); # Base sequence will be {bas[seq_ini..seq_fin]}. split("", lab); # Label sequence will be {lab[seq_ini..seq_fin]}. split("", ama); # Translation sequence will be {ama[1..nama]}. doc = ""; # The documentation lines (including the original header). } (abort >= 0) { exit abort; } ((state <= 1) && ($0 !~ /^ORIGIN[ ]/)) { # Append the line to the {doc} string: doc = (doc $0 "\n"); # ... but process it normally: } # Ignore blank lines: /^ *$/ { next; } /^LOCUS[ ]/ { if (state != -1) { data_error(("\"LOCUS\" line out of place")); } state = 0; locus = $2; printf "locus = %s\n", locus > "/dev/stderr"; next; } /^FEATURES[ ]/ { if (state != 0) { data_error(("\"FEATURES\" line out of place")); } state = 1; key = ""; next; } /^ORIGIN[ ]/ { if (state != 1) { data_error(("\"ORIGIN\" line out of place")); } state = 2; if (! source_ok) { data_error(("missing \"source\" entry")); } # Set {bas[seq_ini..seq-fin]} to "?", just in case: for (i = seq_ini; i <= seq_fin; i++) { bas[i] = "?"; } # Prepare to parse the DNA data: ibas = seq_ini; next; } /^[\/][\/]/ { if (state != 2) { data_error(("\"//\" line out of place")); } if (ibas != seq_fin+1) { data_error(("DNA end = " ibas-1 " should be " seq_fin "")); } state = 3; next; } (state == -1) { # Any other line before the "LOCUS" line: data_error(("missing \"LOCUS\" line")); next; } (state == 0) { # Any other line between "LOCUS" and "FEATURES" lines: next; } # Process annotation lines: (state == 1) { # Any other line between "FEATURES" and "ORIGIN" lines: # Get keyword in columns 1--20: key = substr($0, 1, 20); gsub(/[ ]/, "", key); # Get arguments in columns 21--end: arg = substr($0, 21); gsub(/^[ ]+/, "", arg); if (key == "source") { process_source(arg); } else if (key == "mRNA") { process_mrna(arg); } else if (key == "CDS") { process_cds(arg); } else if (arg ~ /[\/]translation/) { process_translation(arg); } else { printf " ignored annotation: %s\n", $0 > "/dev/stderr"; } next; } (state == 2) { # Any other line between "ORIGIN" and "//" lines: if ($0 !~ /^[ ]*[0-9]+[ ]+[a-z ]+[ ]*$/) { data_error(("bad DNA line \"" $0 "\"")); } if ($1 != ibas) { data_error(("sequence error \"" $1 "\" should be " ibas "")); } process_dna($0); next; } (state == 3) { # Any other line after "//" line: data_error(("extra line after \"//\"")); next; } function process_source(arg, i) { # Processes the "source" (pre-splicing messenger RNA) entry. # Defines the variables {seq_ini} and {seq_fin}. if (source_ok) { data_error(("more than one \"source\" entry")); } source_ok = 1; seq_ini = get_ini(arg); seq_fin = get_fin(arg) } function process_mrna(arg, k,ini,fin,i) { # Processes the "mRNA" (spliced messenger RNA) entry. # sets the array {mrna_range[1..mrna_nranges]}. if (mrna_ok) { data_error(("more than one \"mRNA\" entry")); } mrna_ok = 1; # Get complete argument, remove "join( )" arg = cleanup_range_list(arg); printf " mRNA = %s\n", arg > "/dev/stderr"; # Remove '<' and '>' from ends: gsub(/^[<>]/, "", arg); gsub(/..[<>]/, "..", arg); # Split into separate ranges: mrna_nranges = split(arg, mrna_range, /[,]/); } function process_cds(arg, phase,k,ini,fin,i) { # Processes the "CDS" (coding region) entry. # Sets the array {cds_range[1..cds_nranges]} if (cds_ok) { data_error(("more than one \"CDS\" entry")); } cds_ok = 1; # Get complete argument, remove "join( )" arg = cleanup_range_list(arg); printf " CDS = %s\n", arg > "/dev/stderr"; # Split into separate ranges: cds_nranges = split(arg, cds_range, /[,]/); } function cleanup_range_list(arg) { # Reduces {arg} to a bare list of comma-separated ranges. # If {arg} is a {join()} call, concatenates to it # additional lines if necessary until it is complete, then removes the # "join(" and the final ")". In any case removes all spaces. # Assumes that the item does not contain any "complement()" # calls. if (arg ~ /^join/) { while ((arg !~ /[\)] *$/) && getline) { doc = ( doc $0 "\n") ; arg = ( arg $0 ); if (arg ~ "complement") { data_error(("item has complementary segs")); } } gsub(/join *[\(]/, "", arg); gsub(/[\)] *$/, "", arg); } # Remove all spaces: gsub(/[ ]/, "", arg); return arg; } function process_translation(arg, i) { # Processes the "/translation" (aminoacid sequence) entry. # Sets {nama} to its length and stores the base letters into {ama[1..nama]}. if (trans_ok) { data_error(("more than one \"/translation\" entry")); } trans_ok = 1; # Get complete argument, remove "/translation=\"\"" while ((arg !~ /[\"] *$/) && getline) { doc = ( doc $0 "\n") ; arg = ( arg $0 ); } gsub(/[\/]translation *[=] *[\"]/, "", arg); gsub(/[\"] *$/, "", arg); # Remove all spaces: gsub(/[ ]/, "", arg); printf " translation = %s\n", arg > "/dev/stderr"; # Save in global variables: nama = length(arg); for (i = 1; i <= nama; i++) { ama[i] = substr(arg, i, 1); } } function get_ini(r) { if (r !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("bad range format \"" r "\"")); } gsub(/[.][.][0-9]+$/, "", r); return r+0; } function get_fin(r) { if (r !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("bad range format \"" r "\"")); } gsub(/^[0-9]+[.][.]/, "", r); return r+0; } function process_dna(lin, j,n,c) { # Processes a DNA nucleotide line. # Stores the bases into {bas[ibas..]} and increments {ibas}. gsub(/^[ ]*[0-9]+[ ]+/, "", lin); gsub(/[ ]/, "", lin); n = length(lin); for (j = 1; j <= n; j++) { c = substr(lin, j, 1); if (c !~ /^[atcgATCG]$/) { data_error(("invalid base letter \"" c "\"")); } bas[ibas] = c; ibas++; } } END { if (abort >= 0) { exit abort; } # Check whether all necessary information has been parsed: if (locus == "") { data_error(("missing \"LOCUS\" line")); } if (state != 3) { data_error(("missing \"//\" line")); } if (! cds_ok) { data_error(("missing \"CDS\" entry")); } if (! trans_ok) { data_error(("missing \"/translation\" entry")); } setup_label_sequence(); # Choose the output file names: doc_file = ( outName ".doc" ); bas_file = ( outName ".bas" ); lab_file = ( outName ".lab" ); ama_file = ( outName ".ama" ); # Write the ".doc" file (excluding the "/translation" entry): gsub(/[\/]translation *[=] *["][^"\/]*["]/, "/translation=\"...\"", doc); printf "%s", doc > doc_file; doc = ""; # Write minimal header lines: printf "> %s\n", locus > bas_file; printf "> %s\n", locus > lab_file; printf "> %s\n", locus > ama_file; # Write the base and label sequences: for (i = seq_ini; i <= seq_fin; i++) { if ((i > seq_ini) && ((i - seq_ini) % 60 == 0)) { printf "\n" > bas_file; printf "\n" > lab_file; } printf "%c", bas[i] > bas_file; printf "%c", lab[i] > lab_file; } # Write the aminoacid sequence: if (! trans_ok) { data_error(("missing \"/translation\" entry")); } for (iama = 1; iama <= nama; iama++) { if ((iama > 1) && (iama % 60 == 1)) { printf "\n" > ama_file; } printf "%s", ama[i] > ama_file; } if (seq_ini <= seq_fin) { printf "\n" > bas_file; } if (seq_ini <= seq_fin) { printf "\n" > lab_file; } if (nama > 0) { printf "\n" > ama_file; } close(doc_file); close(bas_file); close(lab_file); close(ama_file); } function setup_label_sequence( k,i,ini,fin,ch,ex) { # Fills the array {lab[seq_ini..seq_fin]} with labels, # based on the arrays {mrna_range[1..mrna_nranges]} # and {cds_range[1..cds_nranges]}. # First, sets all {lab[...]} positions in the source to "N". # Later some of them will be turned into into [DEFHIJXYPQRS]. for (i = seq_ini; i <= seq_fin; i++) { lab[i] = "N"; } if (mrna_ok) { # Item has information on the spliced mRNA range, hence # we can identify the UTRs. # Sets all {lab[...]} positions in the spliced mRNA to "X". # Later some of these "X" will be turned into [DEFHIJPQRS], # and then any post-CDS "X" will be changed to "Y". for (k = 1; k <= mrna_nranges; k++) { ini = get_ini(mrna_range[k]); fin = get_fin(mrna_range[k]); if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin)) { data_error(("mRNA range \"" mrna_range[k] "\" invalid or outside of source range")); } printf " mRNA range %s..%s\n", ini, fin > "/dev/stderr"; for (i = ini; i <= fin; i++) { if (lab[i] != "N") { data_error(("lab[" i "] = \"" lab[i] "\" expected \"N\"")); } lab[i] = "X";\ } } } else { printf "warning: missing \"mRNA\" entry" > "/dev/stderr"; } # Sets all {lab[...]} positions in the CDS to [DEF] according to phase. # Later the final codon will be turned into [HIJ]. # If the "mRNA" entry was present, the CDS must be contained in the mRNA. ncds = 0; # Number of bases in the coding sequence. for (k = 1; k <= cds_nranges; k++) { ini = get_ini(cds_range[k]); fin = get_fin(cds_range[k]); if ((ini < seq_ini) || (ini > fin) || (fin > seq_fin)) { data_error(("CDS range \"" cds_range[k] "\" invalid or outside of source range")); } printf " CDS range %s..%s\n", ini, fin > "/dev/stderr"; for (i = ini; i <= fin; i++) { ex = (mrna_ok ? "X" : "N"); # Expected value of {lab[i]}. if (lab[i] != ex) { data_error(("lab[" i "] = \"" lab[i] "\" expected \"" ex "\"")); } lab[i] = substr("DEF", (ncds%3)+1, 1); ncds ++; } } # Check consistency of CDS and translation lengths: if (ncds != 3*(nama+1)) { data_error(("CDS length = \"" ncds "\" expected 3*(" nama "+1) = " 3*(nama+1) "")); } # Relabel the terminal UTR from "X" to "Y": i = seq_fin; while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { if (lab[i] == "X") { lab[i] = "Y"; } else if (lab[i] != "N") { data_error(("lab[" i "] = \"" lab[i] "\" expected \"N\" or \"X\"")); } i--; } # Show final codon: printf " seq_ini = %d seq_fin = %d i = %d\n", seq_ini, seq_fin, i > "/dev/stderr"; show_codon("last codon bas", bas, i - 2); show_codon("last codon lab", lab, i - 2); # Relabel the last codon of the CDS as "HIJ": if ((i < seq_ini) || (lab[i] != "F")) { data_error(("missing final \"F\"")); } lab[i] = "J"; i--; while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; } if ((i < seq_ini) || (lab[i] != "E")) { data_error(("missing final \"E\"")); } lab[i] = "I"; i--; while ((i >= seq_ini) && ((lab[i] < "D") || (lab[i] > "F"))) { i--; } if ((i < seq_ini) || (lab[i] != "D")) { data_error(("missing final \"D\"")); } lab[i] = "H"; i--; # Label intron splice markers with "PQ" and "RS": for (i = seq_ini+1; i <= seq_fin; i++) { ch = lab[i-1]; if (lab[i] == "N") { if (ch == "P") { lab[i] = "Q"; } else if ((ch != "Q") && (ch != "N")) { lab[i] = "P"; } } else if (ch == "P") { data_error(("intron too short - expected \"N\", found \"" lab[i] "\"")); } } for (i = seq_fin-1; i >= seq_ini; i--) { ch = lab[i+1]; if (lab[i] == "N") { if (ch == "S") { lab[i] = "R"; } else if ((ch != "R") && (ch != "N")) { lab[i] = "R"; } } else if (ch == "S") { data_error(("intron too short - expected \"N\", found \"" lab[i] "\"")); } } } function show_codon(title,seq,pos, j) { # Prints to {stderr} the three letters {seq[pos..pos+2]}, # with some context, clipped to the range {seq_ini..seq_fin}. printf " %s = ", title > "/dev/stderr"; for (j = pos - 5; j <= pos + 7; j++) { if ((j >= seq_ini) && (j <= seq_fin)) { if (j == pos) { printf "[" > "/dev/stderr"; } printf "%s", seq[j] > "/dev/stderr"; if (j == pos + 2) { printf "]" > "/dev/stderr"; } } } printf "\n" > "/dev/stderr"; } function data_error(msg) { printf " %s\n", $0 > "/dev/stderr"; printf "** %s:%s: %s\n", FILENAME, FNR, msg > "/dev/stderr"; abort = 1; exit abort; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", USAGE > "/dev/stderr"; abort = 1; exit abort; }