#! /bin/bash
# Last edited on 2008-06-13 18:10:46 by stolfi

# Usage: get-region-lengths {SETNAME} {SETDIR} {BASDIR} {OUTDIR}
#
# Extracts region length statistics for a set of labeled genes.
#
# Reads from the file "{SETDIR}/{SETNAME}.gset" a list of full item
# names, of the form "{SPECIES}/{ITEM}" where {SPECIES} is the name of
# a species's dataset in the EID database (e.g. "at2004", "hs35p1")
# and {ITEM} is an EID item identifier (e.g. "10008_NC_003074").
# Then reads the label file "{BASDIR}/{SPECIES}/{ITEM}.lab" for all
# those items, and tabulates the lengths of exon and non-exon regions
# in them.  Writes the statistical parameters (average length, etc.)
# and the length histogram to the file
# "{OUTDIR}/{SETNAME}-{TYPE}.lens" where {TYPE} is "K" or "N".
#
# Environment:
#   STOLFIHOME  root of the tree that holds the statistics program
#               "programs/c/DNA/dnabayes/dbd_gather_region_length_stats".
#
# Exit status:
#   2  fewer than four arguments were given;
#   1  the gene set file is unreadable or STOLFIHOME is not set;
#   otherwise the exit status of the statistics program.

#######################################
# Lists the label files named by a gene set file.
# For every line that starts with a letter or digit, prints
# "{BASDIR}/{NAME}.lab", where {NAME} is the first whitespace-delimited
# field of the line.  All other lines are skipped.
# Arguments: $1 - gene set file; $2 - base directory of the label files
# Outputs:   one label file name per line on stdout
#######################################
list_label_files() {
  # The set file is fed through stdin so that a path containing "=" is
  # never mistaken by gawk for a command-line variable assignment.
  gawk \
    -v basDir="$2" \
    '/^[a-zA-Z0-9]/{ printf "%s/%s.lab\n", basDir, $1; }' \
    < "$1"
}

#######################################
# Script entry point; see the usage notes at the top of the file.
# Globals:   STOLFIHOME (read)
# Arguments: $1 - SETNAME, $2 - SETDIR, $3 - BASDIR, $4 - OUTDIR
#            (any further arguments are ignored)
# Returns:   see "Exit status" at the top of the file
#######################################
main() {
  local me="${0##*/}"

  if (( $# < 4 )); then
    printf 'usage: %s {SETNAME} {SETDIR} {BASDIR} {OUTDIR}\n' "${me}" >&2
    return 2
  fi

  local setName="$1"
  local setDir="$2"
  local basDir="$3"
  local outDir="$4"

  local setFile="${setDir}/${setName}.gset"
  local outName="${outDir}/${setName}"

  if [[ ! -r "${setFile}" ]]; then
    printf '%s: cannot read gene set file "%s"\n' "${me}" "${setFile}" >&2
    return 1
  fi

  if [[ -z "${STOLFIHOME:-}" ]]; then
    printf '%s: environment variable STOLFIHOME is not set\n' "${me}" >&2
    return 1
  fi
  local progdir="${STOLFIHOME}/programs/c/DNA/dnabayes"

  # Collect the label file names in an array instead of a scratch file with
  # a predictable name in /tmp: there is nothing to clean up afterwards, and
  # each name reaches the program as exactly one argument, with no word
  # splitting or glob expansion.
  local -a labFiles=()
  local labFile
  while IFS= read -r labFile; do
    labFiles+=("${labFile}")
  done < <(list_label_files "${setFile}" "${basDir}")

  # Last command of the function: its status becomes the script's status,
  # so a failure of the statistics program is visible to the caller.
  "${progdir}/dbd_gather_region_length_stats" \
    -v maxLength=2999 \
    -v outName="${outName}" \
    "${labFiles[@]}"
}

main "$@"