#! /bin/csh -f 
# Last edited on 2008-06-12 23:18:37 by stolfi

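# Tabulates the number of items and a per-species total for each EID species.
# NOTE(review): the per-species total is the sum of field 4 ({N_BYTES} in the
# layout below), not of {N_LINES}; confirm which quantity is intended.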
# Usage: "tabulate-datasets {DATASET}..."
#
# Assumes that for each {DATASET} and each file type
# {T} in {"dEID","hEID","pEID"} there is a file called
# "orig/eids/{DATASET}/{T}.items" containing one 
# line for each EID item, with fields
#
#   {ITEM_ID} {GENE_ID} {N_LINES} {N_BYTES}
# 

# Dataset names are taken verbatim from the command-line arguments.
set datasets = ( $argv )

# Species whose item files each contribute one column to the table.
set species = ( dEID hEID pEID )

# Header row: a left-justified "Dataset" label, then one right-justified
# 10-character column title for the item count and for each species.
printf "%-10s" "Dataset"
foreach col ( items ${species} )
  printf " %10s" "${col}"
end
printf '\n'

# Body: one row per dataset named on the command line.
foreach ds ( ${datasets} )
  printf "%-10s" "${ds}"

  # "items" column: number of lines in the dataset's dEID item file.
  # The file is fed to gawk on standard input through cat.
  cat orig/eids/${ds}/dEID.items | gawk '{ n++ } END { printf " %10d", n }'

  # Species columns: sum of field 4 over all lines of each item file.
  # NOTE(review): field 4 is {N_BYTES} per the field layout in the file
  # header, while the header summary speaks of "lines"; confirm intent.
  foreach sp ( ${species} )
    cat orig/eids/${ds}/${sp}.items | gawk '{ n += $4 } END { printf " %10d", n }'
  end

  printf '\n'
end