#! /usr/bin/gawk -f
# Last edited on 2008-06-11 23:16:27 by stolfi

# Reads a ".dEID" (DNA sequence) file from an EID dataset on standard
# input and outputs the identifiers of items that pass certain
# validity tests.
#
# For every item found in the input, one line is written to standard
# output in the format
#
#   {ITEM} {OK} ({REASONS})
#
# where {ITEM} is the item identifier taken from its header line, {OK}
# is 1 if the item was approved and 0 if it was discarded, and
# {REASONS} is a comma-separated list of rejection codes (empty for
# approved items).  The only code currently produced is "chr".
#
# The script rejects items that have any character other than
# [ATCGUatcgu] or whitespace in their data lines ("chr").
#
# A summary with the item counts is written to standard error at the
# end.  A data line that appears before the first header line is a
# fatal error: a message is written to standard error and the script
# exits with status 1.

BEGIN {
  # Usage message, kept for reference; this script never prints it.
  usage = ( "identify-good-dEID-items < {INFILE}.dEID > {OUTFILE}.gset" );

  # Exit status requested by {data_error}, or -1 while no error occurred.
  abort = -1;

  beg_item("*");

  nItems = 0;  # Total number of items processed.
  nGood = 0;   # Number of approved items.
  nBad = 0;    # Number of discarded items.
}

# If an error was flagged, stop processing further input lines.
(abort >= 0) { exit abort; }

/^[ ]*([#]|$)/ {
  # Blank line, or line whose first non-blank character is "#": ignore.
  next;
}

/^[>]/ {
  # Start of new item: close the previous one (if any) and open this one.
  # The identifier is the second whitespace-separated field of the line.
  # NOTE(review): presumably headers look like "> {ID} ..."; confirm
  # against the file format.
  end_item();
  beg_item($2);
  next;
}

{
  # Any other line is a data line of the current item.
  if (curItem == "*") { data_error(("missing header line")); }

  # Get a copy of the line, and remove any comments and whitespace
  # (space, TAB, LF, CR):
  lin = $0;
  sub(/[#].*$/, "", lin);
  gsub(/[ \011\012\015]/, "", lin);

  # Check for invalid features:
  if (lin ~ /[^ATCGUatcgu]/) { charFlag = 1; }
}

END {
  # {exit} inside a rule or function still runs this END rule, so
  # leave at once, with the requested status, if an error was flagged.
  if (abort >= 0) { exit abort; }

  # Close the last item and report the totals.
  end_item();
  printf " %7d items %7d good %7d bad\n", nItems, nGood, nBad > "/dev/stderr";
}

function beg_item(id)
{
  # Starts a new item with identifier {id}, resetting its per-item flags.
  curItem = id;  # Current item identifier, or "*" before the first one.
  charFlag = 0;  # Will be set to 1 if the item has invalid characters.
}

function end_item(   ok,obs,sep)
{
  # Finishes the current item, if there is one: writes its verdict line
  # to standard output and updates {nItems}, {nGood}, {nBad}.  Does
  # nothing while {curItem} is still the "*" placeholder.
  # The parameters {ok}, {obs}, {sep} are local variables.
  if (curItem != "*")
    { ok = 1;     # Becomes 0 as soon as any test fails.
      obs = "(";  # Parenthesized list of rejection codes.
      sep = "";   # Separator to insert before the next code.
      if (charFlag)
        { ok = 0; obs = ( obs sep "chr"); sep = ","; }
      obs = (obs ")");
      printf "%s %d %s\n", curItem, ok, obs;
      if (ok) { nGood ++; } else { nBad ++; }
      nItems ++;
    }
}

function data_error(msg)
{
  # Reports {msg} on standard error, prefixed by the current file name
  # and line number, records the failure in {abort}, and terminates
  # the script with exit status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}