#! /bin/bash # Last edited on 2008-06-15 08:21:08 by stolfi usage=( "plot_region_length_stats {STATDIR} {OUTDIR} {SETNAME}..." ) # Plots the region length histograms found in files # "{STATDIR}/{SETNAME}-{T}.lens" for each of the given {SETNAME}s # and {T} either "N" "K", "X", and "Y". # The plots are called "{OUTDIR}/{SETNAME}-{T}-{TAG}.eps" for various # values of {TAG}. # Also generates comparative plots of all setnames, called # "{OUTDIR}/CMP-{T}-{TAG}.eps" for other values of {T}. # The plots use two statDir="$1"; shift outDir="$1"; shift setNames=( $@ ) lengthFull=3000; shift; # Max plot length for `all regions' plots. lengthShort=400; shift; # Max plot length for `short regions' plots. for t in N K X Y ; do # Prepare the comparative plot command: plotCmdFile=/tmp/$$.plt # File containing the plot command plotCmdSep="plot" # Separator from previous line of plot command plotCmdIx=0 # Dataset index in plot command for s in ${setNames[@]} ; do dataFile="${statDir}/${s}-${t}.lens" echo "dataFile = ${dataFile}" 1>&2 outName="${outDir}/${s}-${t}" # Grab the overall statistical parameters: parmFile=/tmp/$$.parms cat ${dataFile} \ | gawk \ ' /^[\#] *(num|tot|avg|dev|min|max|len50|len90|maxfr|zerfr|bigfr) *[=] *[0-9.]+ *$/ { gsub(/[\#=]/, " ", $0); printf "%sFromFile=%s\n", $1, $2; } \ ' \ > ${parmFile} cat ${parmFile} | sed -e 's:^: :' 1>&2 source ${parmFile} /bin/rm -f ${parmFile} # Plot the raw histograms of the "${t}" region lengths, full range: gnuplot <&2 ; exit 1 fi gnuplot <= mhiUnif) ? 0 : 1.0/(mhiUnif-mloUnif)) # Exponntial tail of theoretical model: mloExp = 132; tauExp = 0.00150; pExp(m) = (m < mloExp ? 0 : tauExp*exp(-tauExp*(m - mloExp))) # A priori probability estimate: qPriori = exp(-1.0/2001.0) sPriori = (qPriori/(1 - qPriori)) pPriori(m) = (qPriori**(m + 1))/sPriori # Combined theoretical model ${def_amps} N=${numFromFile} ideal(m) = ampGauss*pGauss(m) + ampUnif*pUnif(m) + ampExp*pExp(m) expect(m) = (N*ideal(m) + pPriori(m))/(N + 1.0) # Scales: maxProb = (1.0*(${maxfrFromFile})/(${numFromFile})) # Est. max prob. set yrange [(-0.05*maxProb):(+1.05*maxProb)] set xtics 500; set mxtics 5 plot \ "< sed -e '/=/d' ${dataFile}" using 1:(column(3)) title "observed" with histeps, \ "< sed -e '/=/d' ${dataFile}" using 1:(expect(column(1))) title "model" with lines lt 3 quit EOF gv "${outName}-pdf.eps" # Add the command to the comparative plot: printf "%s \\\\\n \"< sed -e '/=/d' ${dataFile}\"" "${plotCmdSep}" >> ${plotCmdFile} printf " using (column(1)+0.1*%d):(A+B*column(5))" ${plotCmdIx} >> ${plotCmdFile} printf " title \"%s\" with histeps" "${s}" >> ${plotCmdFile} plotCmdSep="," plotCmdIx=$(( ${plotCmdIx} + 1 )) done printf "\n" >> ${plotCmdFile} outName="${outDir}/CMP-${t}" # Plot the cumulative histograms of the region lengths: gnuplot <