#! /usr/bin/gawk -f
# Last edited on 2008-06-11 13:35:45 by stolfi

# Reads nucleotide, label, and aminoacid sequences from {stdin},
# writes the transcription file to {stdout}.
#
# Assumes that each of the three parts starts with a line
# beginning with "> {ITEM_ID}"; the item ID is taken from field 2 of
# that line and must be the same in all three headers.
#
# Output: one line per aminoacid, of the form "{CODON} {AMINO}", where
# {CODON} is the next three bases whose labels are "D", "E", "F".
#
# Exit status: 0 on success, 1 after any error reported on stderr.

BEGIN {
  usage = ( "cat {FOO}.{bas,lab,ama} | merge-bas-lab-ama > {FOO}.trn" );

  # Negative means "no error so far"; otherwise it is the exit status.
  abort = -1;

  # Parsing state:
  state = 0;   # 0 = init, 1--3 after headers of ".bas", ".lab", ".ama".
  itemId = "";

  # Accumulated sequence strings (comments and whitespace removed):
  bas = "";
  lab = "";
  ama = "";
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Header line: advances to the next part and checks the item ID.
/^[>]/ {
  state++;
  if (state > 3) { data_error(itemId, "too many headers"); }
  if (state == 1) {
    itemId = $2;
  } else if (itemId != $2) {
    # Report against the ID established by the first header.
    data_error(itemId, ("inconsistent item Id = \"" $2 "\""));
  }
  next;
}

# Data line: appended to the string of the current part.
{
  lin = $0;

  # Delete comments and whitespace:
  gsub(/[\#].*$/, "", lin);
  gsub(/[\011\012\015 ]/, "", lin);

  # Ignore empty lines:
  if (lin == "") { next; }

  # Append non-empty lines to proper string:
  if (state == 0) {
    data_error(itemId, "missing header");
  } else if (state == 1) {
    bas = (bas lin);
  } else if (state == 2) {
    lab = (lab lin);
  } else if (state == 3) {
    ama = (ama lin);
  } else {
    data_error(itemId, "duh?");
  }
  next;
}

END {
  # An {exit} in a main rule still runs END; do not emit output then.
  if (abort >= 0) { exit abort; }

  nBas = length(bas);
  if (nBas != length(lab)) {
    data_error(itemId, ("length mismatch: bas = " length(bas) " lab = " length(lab) ""));
  }

  # Purge non-coding bases (those whose label is not "D", "E", or "F"):
  basK = "";
  labK = "";
  for (i = 1; i <= nBas; i++) {
    basCh = substr(bas, i, 1);
    labCh = substr(lab, i, 1);
    if ((labCh == "D") || (labCh == "E") || (labCh == "F")) {
      basK = (basK basCh);
      labK = (labK labCh);
    }
  }

  nAma = length(ama);
  if (length(basK) != 3*nAma) {
    data_error(itemId, (" length mismatch: basK = " length(basK) " ama = " length(ama) ""));
  }

  # Pack codons and aminoacids:
  for (i = 1; i <= nAma; i++) {
    amino = substr(ama, i, 1);
    codon = substr(basK, 3*(i-1)+1, 3);
    event = substr(labK, 3*(i-1)+1, 3);
    # The labels of each retained triple must read exactly "DEF".
    if (event != "DEF") { data_error(itemId, ("loss of sync")); }
    printf "%s %s\n", codon, amino;
  }
}

function arg_error(msg) {
  # Prints {msg} and the usage line to stderr, then exits with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(itemId, msg) {
  # Prints {msg} to stderr, tagged with the current file name, line
  # number, and {itemId}, then exits with status 1.
  printf "%s:%d: item %s: ** %s\n", FILENAME, FNR, itemId, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}