#! /bin/csh -f
# Last edited on 2000-10-11 18:57:24 by stolfi

# Command-line synopsis; printed after "usage: " when the number of
# arguments is not exactly four.
set usage = "$0 GRCLASS SECTION OBSNAME GENNAME "

# Compares observed word frequencies with computed word probabilities
# Writes a file with the results, and generates a plot of the same.
#
# Input files:
#
#   prob/obs/SECTION/OBSNAME.frq
#     Observed word frequencies, in the format 
#     COUNT FREQ WORD.
#   
#   prob/gen/GRCLASS/SECTION/GENNAME.prb
#     Computed probabilities, in the format PROB WORD.
#     
# Output files:
#
#   prob/cmp/GRCLASS/SECTION/GENNAME.pr2
#     Comparison file, in the format
#     PROBS PRGEN PRTOT DELTA WORD
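#     where PROBS and PRGEN are the observed
#     and computed probabilities, PRTOT is
#     their sum, and DELTA is the log base 10
#     of PRGEN/PROBS (with both probabilities
#     fuzzified by 1/N, where N is the total
#     observed word COUNT).  A word missing
#     from one of the two inputs gets
#     probability 0.00000 on that side.
#     Lines are sorted by decreasing DELTA,
#     then by WORD.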
#
#   prob/cmp/GRCLASS/SECTION/GENNAME.gif
#     A plot of PRGEN against PROBS.  
#  

# Require exactly the four positional arguments named in ${usage}.
if ( $#argv != 4) then
  echo "usage: ${usage}"; exit 1
endif

set grclass = "$1"; shift;
set sec = "$1"; shift;
set obsname = "$1"; shift;
set genname = "$1"; shift;

# Input and output file names.
set obsfile = "prob/obs/${sec}/${obsname}.frq"
set genfile = "prob/gen/${grclass}/${sec}/${genname}.prb"
set cmpdir  = "prob/cmp/${grclass}/${sec}"
set cmpfile = "${cmpdir}/${genname}.pr2"
set cmpplot = "${cmpdir}/${genname}.gif"

echo "comparing ${genfile} against ${obsfile}..."

# Stop before writing anything if an input is missing; otherwise the
# join below would quietly produce a comparison against an empty table.
foreach infile ( "${obsfile}" "${genfile}" )
  if ( ! -r "${infile}" ) then
    echo "${0}: cannot read ${infile}"; exit 1
  endif
end

# eps = 1/N, where N is the sum of the COUNT column of the observed
# file.  Nothing is printed when N is not positive, so that the
# division by zero is reported here instead of aborting gawk.
set eps = `gawk '/./{s+=$1} END{if (s > 0) print 1.0/s}' "${obsfile}"`
if ( "${eps}" == "" ) then
  echo "${0}: total word COUNT in ${obsfile} is not positive"; exit 1
endif

# Private scratch directory (instead of predictable /tmp/PID-* names).
set tmpdir = `mktemp -d /tmp/cmp-probs.XXXXXX`
if ( "${tmpdir}" == "" ) then
  echo "${0}: cannot create a temporary directory"; exit 1
endif

# Make sure the output directory exists before redirecting into it.
mkdir -p "${cmpdir}"

# Observed table reduced to "FREQ WORD", sorted by WORD for join.
gawk '/./{print $2,$3;}' "${obsfile}" \
  | sort -b -k2,2 \
  > "${tmpdir}/obs.prb"

# Computed table "PROB WORD", sorted by WORD for join.
sort -b -k2,2 "${genfile}" \
  > "${tmpdir}/gen.prb"

# Full outer join on WORD (field 2 of both files); a word missing from
# one side gets 0.00000 there.  The gawk step appends PRTOT and DELTA,
# and the final sort orders by decreasing DELTA, then by WORD.
join \
  -1 2 -2 2 -o '1.1,2.1,0' \
  -a 1 -a 2 -e 0.00000 \
  "${tmpdir}/obs.prb" "${tmpdir}/gen.prb" \
  | gawk -v eps="${eps}" \
      ' BEGIN {log10 = log(10); } \
        /./{ \
          obs=$1; gen=$2; w=$3; \
          d = log(sqrt(gen*gen + eps*eps)/sqrt(obs*obs + eps*eps))/log10; \
          printf "%s %s %7.5f %+4.1f %s\n", obs, gen, obs+gen, d, w; \
        } \
      ' \
  | sort -b -k4,4gr -k5,5 \
  > "${cmpfile}"

/bin/rm -rf "${tmpdir}"

# Plot PRGEN against PROBS; this is the last command, so its status is
# the exit status of the script.
plot-joint-probs \
    -eps "${eps}" \
    -title "${sec}: obs/${obsname} x gen/${genname}" \
    < "${cmpfile}" \
    > "${cmpplot}"