#! /usr/bin/gawk -f
# Last edited on 2008-06-11 23:04:22 by stolfi

# Reads an ".hEID" (headers) file from an EID dataset.
# Outputs, for each item, one line "{ID} {OK} ({REASONS})", where {OK} is 1
# for items that are good for statistical studies and 0 otherwise, and
# {REASONS} is a comma-separated list of the rejection causes (empty when
# the item is good):
#
#   "exp"  {reqExp} is set and the item has neither a line matching
#          "/evidence = experimental" nor one matching "[evidence ...".
#   "inc"  the item contains "CDS_incomplete".
#   "mul"  the item corresponds to a gene with two or more variant
#          splicings (its identifier is digits followed by a capital letter).
#
# A summary with the item counts is written to "/dev/stderr" at the end.
#
# {reqExp} is coerced to a number, so it should be given as 0 or 1;
# when omitted it defaults to 0.

BEGIN {
  usage = ( "identify-good-hEID-items [ -v reqExp={BOOL} ] < {INFILE}.hEID > {OUTFILE}.gset" );

  # Exit status requested by {data_error}; negative means "no error so far".
  abort = -1;

  # Normalize {reqExp} to a number (unset means 0):
  if (reqExp == "") { reqExp = 0; } else { reqExp += 0; }

  # No item is open yet:
  beg_item("*");

  nItems = 0;  # Total number of items processed.
  nGood = 0;   # Number of approved items.
  nBad = 0;    # Number of discarded items.
}

# Stop reading records as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Blank line or "#" comment line, ignore:
/^[ ]*([\#]|$)/ { next; }

# Header line: close the previous item and start a new one.
# The item identifier is taken from the second field of the header.
/^[>]/ {
  end_item();
  beg_item($2);
  next;
}

# Any other line is a body line of the current item.
{
  if (curItem == "*") { data_error(("missing header line")); }
  if ($0 ~ /[\/] *evidence *[=] *experimental/) { experFlag = 1; }
  if ($0 ~ /[[] *evidence/) { experFlag = 1; }
  if ($0 ~ /CDS_incomplete/) { incomFlag = 1; }
  if ($0 ~ /[\/] *exception/) { excepFlag = 1; }
}

END {
  # Propagate the error status set by {data_error}, if any:
  if (abort >= 0) { exit abort; }

  # Flush the last item and report the totals:
  end_item();
  printf " %7d items %7d good %7d bad\n", nItems, nGood, nBad > "/dev/stderr";
}

function beg_item(id)
{
  # Starts a new item with identifier {id} and clears its per-item flags.
  # The identifier "*" means that no item has been seen yet.

  curItem = id;

  # Item has multiple labelings (identifier is digits then a capital letter):
  multiFlag = (match(id, /^[0-9]+[A-Z]/) != 0);

  experFlag = 0;  # Will be set to 1 if the item has experimental evidence.
  incomFlag = 0;  # Will be set to 1 if the item has "CDS_incomplete".
  excepFlag = 0;  # Will be set to 1 if the item has an "/exception" line.
}

function end_item(   ok,obs,sep)
{
  # Finishes the current item, if any: prints its verdict line to the
  # standard output and updates {nItems,nGood,nBad}. Does nothing while
  # {curItem} is still "*".
  #
  # Note that {excepFlag} is recorded by the main rule but is not
  # consulted here, so it does not affect the verdict.

  if (curItem == "*") { return; }

  ok = 1;
  obs = "(";
  sep = "";

  if (reqExp && (! experFlag)) { ok = 0; obs = (obs sep "exp"); sep = ","; }
  if (incomFlag)               { ok = 0; obs = (obs sep "inc"); sep = ","; }
  if (multiFlag)               { ok = 0; obs = (obs sep "mul"); sep = ","; }

  obs = (obs ")");
  printf "%s %d %s\n", curItem, ok, obs;

  if (ok) { nGood ++; } else { nBad ++; }
  nItems ++;
}

function data_error(msg)
{
  # Reports {msg} on "/dev/stderr", prefixed with the current file name
  # and line number, then terminates the script with status 1.
  # Setting {abort} lets the END rule re-issue the failure status
  # instead of printing results.

  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}