#! /usr/bin/gawk -f
# Last edited on 2008-06-09 01:50:56 by stolfi

# Reads an EID file from standard input and writes to standard output
# one line per item, with the fields
#
#   {ITEM_NAME} {GENE} {NUM_LINES} {NUM_BYTES}
#
# Each item is assumed to start with a header line of the form
#
#   > {ITEM_NAME} ... /gene="{GENE}" ...
#
# The blank between ">" and {ITEM_NAME} is optional.  If the header has
# no /gene="..." attribute, the gene is reported as "*".
#
# The size of an item is the number of non-blank data lines and of
# non-blank characters in them (the ">" header line is excluded).
# Blanks, TABs and CRs are not counted.  Data lines that precede the
# first header are reported as an item with name "*" and gene "*",
# but only if at least one of them is non-blank.
#
# A one-line summary with the totals is written to standard error.

BEGIN {
  usage = ( "list-EID-items < {INFILE}" );

  # Exit status to abort with; a negative value means "do not abort".
  # Nothing in this script sets it to a non-negative value.
  abort = -1;

  # Parsing state (reset by {beg_item}):
  curItem = "";   # Current item ID.
  curGene = "";   # Current gene ID.
  itemLines = 0;  # Count of non-blank data lines in item (excl. header).
  itemBytes = 0;  # Count of non-blank data bytes in item (ditto).

  # Global state (updated by {end_item}):
  totItems = 0;   # Number of items reported.
  totLines = 0;   # Total number of non-blank data lines.
  totBytes = 0;   # Total number of non-blank data bytes.

  # Start the anonymous item that collects any data before the first header:
  beg_item("*", "*");
}

# Stop reading as soon as an abort has been requested:
(abort >= 0) { exit abort; }

# Header line -- close the current item and open a new one:
/^[>]/ {
  end_item();

  # The item name is the second field when ">" stands alone, otherwise
  # it is whatever is glued to the ">" in the first field:
  if ($1 == ">")
    { itemId = $2; }
  else
    { itemId = substr($1, 2); }

  # Extract the text between the quotes of the /gene="..." attribute, if any:
  if (match($0, /[\/]gene *[=] *"[^"]*"/))
    { geneId = substr($0, RSTART, RLENGTH);
      sub(/^[\/]gene *[=] *"/, "", geneId);
      sub(/"$/, "", geneId);
    }
  else
    { geneId = "*"; }

  beg_item(itemId, geneId);
  next;
}

# Data line -- any line that is not a header:
{
  # Delete blanks, TABs and CRs:
  lin = $0;
  gsub(/[ \011\015]/, "", lin);

  # Count bytes, and one more line, if not blank.  Note that {length}
  # counts characters, which equals bytes only for single-byte data:
  if (lin != "")
    { itemLines ++;
      itemBytes += length(lin);
    }
  next;
}

END {
  if (abort >= 0) { exit abort; }

  # Close the last item and report the grand totals:
  end_item();
  printf "%10d items %10d lines %10d bytes\n", totItems, totLines, totBytes > "/dev/stderr";
}

function beg_item(itemId,geneId)
{
  # Starts a new item with name {itemId} and gene {geneId},
  # resetting the per-item line and byte counters.
  curItem = itemId;
  curGene = geneId;
  itemLines = 0;
  itemBytes = 0;
}

function end_item()
{
  # Writes the output line for the current item and adds its counts
  # to the global totals.  The anonymous item "*" is skipped when it
  # has no non-blank data lines.
  if ((curItem != "*") || (itemLines > 0))
    { printf "%-20s %-20s %10d %10d\n", curItem, curGene, itemLines, itemBytes;
      totItems ++;
      totLines += itemLines;
      totBytes += itemBytes;
    }
}