#! /bin/csh -f 
# Last edited on 1999-12-10 04:14:23 by stolfi

# Synopsis string echoed whenever the command line is rejected;
# "$0" expands to the name this script was invoked as.
set usage = "$0 [ -maxLines NN ] [ -dir DIR ] [-title TIT] [-out OUT] SEC1 SEC2 ... "

# Reads a bunch of item count files DIR/SECi.frq (as written by compute-freqs),
# where each line has a COUNT, a FREQ, and an ITEM, and generates various reports:
#
#      DIR/OUT.cmp-cts  shows raw counts of each item and file; one line per item,
#                        one column of counts per file, plus one "total" column
#                        and one shared column with the items; sorted by decreasing
#                        total counts.
#
#      DIR/OUT.cmp-frq  ditto, with relative frequencies ×9999 instead
#                        of raw counts.
#
#      DIR/OUT.cmp-top  shows the item frequency rankings in each file;
#                        two columns per file, one with relative
#                        frequencies ×999, the other with the item; each column
#                        pair sorted by its own count, decreasing.
#
# The "-title" option specifies a title for the shared item column.
#
# The OUT defaults to "all".
#
# The "-maxLines NN" option truncates the output after "NN" lines
# (default 50)

# Option defaults.  "dir" is a path prefix: empty, or DIR followed by "/".
set dir = ""
set tit = ""
set out = "all"
set maxLines = 50

# Consume leading "-xxx VALUE" pairs.  The "/" prepended on both sides of
# each comparison keeps csh from mistaking an argument that begins with
# "-" for an operator.
while ( ( $#argv > 0 ) && ( "/$1" =~ /-* ) )
  # Every recognized option takes a value, so a lone trailing option
  # is rejected the same way as an unknown one.
  if ( $#argv < 2 ) then
    echo "invalid option $1"
    echo "usage: ${usage}"; exit 1
  endif
  if ( "/$1" == "/-dir" ) then
    set dir = "$2/"
  else if ( "/$1" == "/-out" ) then
    set out = "$2"
  else if ( "/$1" == "/-title" ) then
    set tit = "$2"
  else if ( "/$1" == "/-maxLines" ) then
    set maxLines = "$2"
  else
    echo "invalid option $1"
    echo "usage: ${usage}"; exit 1
  endif
  # Drop the option and its value.
  shift; shift
end

# After the options, at least two section names must remain.
if ( $#argv < 2 ) then
  echo "usage: ${usage}"; exit 1
endif

# Defensive guard: "dir" is given a default before option parsing, so
# this only triggers if that initialization is ever removed.  Both the
# specific complaint and the usage line are printed before exiting
# (previously an early "exit 1" made the usage line unreachable).
if ( ! ( $?dir ) ) then
  echo 'must specify "-dir"'
  echo "usage: ${usage}"; exit 1
endif

# Remaining arguments are the section names.
set secs = ( $* )

# Input file for section SEC is "${dir}SEC.frq".  The list is built
# directly in the shell rather than by splicing ${dir} into a sed
# replacement, so a directory name containing "@", "&" or "\" can no
# longer break or corrupt the generated paths.
set frfiles = ( )
foreach sec ( ${secs} )
  set frfiles = ( ${frfiles} "${dir}${sec}.frq" )
end

echo "Tabulating the raw word counts..."

# Common prefix for the scratch files of this run ($$ is the process id
# of this shell); the later report sections reuse it.
set tmp = "/tmp/$$"

# For each input file, write a scratch file holding "COUNT ITEM" lines
# (fields 1 and 3 of every non-empty line), and collect the scratch
# file names in input order.
set ctfiles = ( )
foreach src ( ${frfiles} )
  # Scratch name is derived from the input's basename minus extension.
  set stem = ${src:t}; set stem = ${stem:r}
  set dst = "${tmp}-${stem}.cts"
  echo "${src} -> ${dst}"
  cat ${src} | gawk '/./{printf "%d %s\n", $1, $3;}' > ${dst}
  set ctfiles = ( ${ctfiles} ${dst} )
end

# Merge the per-file counts into DIR/OUT.cmp-cts.  The column titles are
# the section names followed by the shared item-column title.
# NOTE(review): the meaning of -sort/-totals/-widths is defined by the
# external compare-counts tool -- confirm against its documentation.
compare-counts \
  -titles "${secs} ${tit}" -sort 1 -totals \
  -maxlines ${maxLines} -widths '6' \
  ${ctfiles} \
  > ${dir}${out}.cmp-cts

# The scratch files are no longer needed.
/bin/rm ${ctfiles}
 
echo "Tabulating the relative frequencies ( × 9999 ) per file..."

# For each input file, write a scratch file holding "SCALED ITEM" lines,
# where SCALED is field 2 multiplied by 9999 and truncated to an
# integer, and ITEM is field 3; empty lines are dropped.
set refiles = ( )
foreach src ( ${frfiles} )
  # Scratch name is derived from the input's basename minus extension.
  set stem = ${src:t}; set stem = ${stem:r}
  set dst = "${tmp}-${stem}.fri"
  echo "${src} -> ${dst}"
  cat ${src} | gawk '/./{printf "%d %s\n", int(9999*$2), $3;}' > ${dst}
  set refiles = ( ${refiles} ${dst} )
end

# Merge the per-file values into DIR/OUT.cmp-frq.  Dots are stripped
# from the section names before they are used as column titles.
# NOTE(review): the meaning of -sort/-widths is defined by the external
# compare-counts tool -- confirm against its documentation.
compare-counts \
  -titles "`echo ${secs} | tr -d '.'` ${tit}" \
  -sort 1 -maxlines ${maxLines} -widths '4' \
  ${refiles} \
  > ${dir}${out}.cmp-frq

# The scratch files are no longer needed.
/bin/rm ${refiles}
  
echo "Listing the frequency-ranked items per file..."

# For each input file, keep the first ${maxLines} lines in order of
# decreasing count (field 1) and write them to a scratch file as
# "SCALED ITEM", where SCALED is field 2 multiplied by 999 and truncated
# to an integer, and ITEM is field 3.
set ppfiles = ( )
foreach ffile ( ${frfiles} )
  # Scratch name is derived from the input's basename minus extension.
  set name = ${ffile:t}; set name = ${name:r}
  set pfile = "${tmp}-${name}.pct"
  echo "${ffile} -> ${pfile}"
  # "-k 1,1nr" is the POSIX spelling of the obsolete "+0 -1nr" key,
  # which current sort implementations take for a file name; likewise
  # "head -n N" replaces the obsolete "head -N".
  cat ${ffile} \
    | sort -b -k 1,1nr \
    | head -n ${maxLines} \
    | gawk '/./{printf "%3d %s\n", int(999*$2), $3;}' \
    > ${pfile}
  set ppfiles = ( $ppfiles $pfile )
end

# Lay the per-file rankings side by side in DIR/OUT.cmp-top.
# NOTE(review): "titles" and "colsep" are variables consumed by the
# external multicol tool -- confirm their meaning there.
multicol \
  -v titles="${secs}" \
  -v colsep=" " \
  ${ppfiles} \
> ${dir}${out}.cmp-top

# The scratch files are no longer needed.
/bin/rm ${ppfiles}

# Show the three reports that were generated.
ls -l ${dir}${out}.cmp-{cts,frq,top}