#! /bin/csh -f
# Last edited on 2025-05-01 18:57:02 by stolfi
# Usage summary; echoed by the command-line checks further down in this
# script whenever the arguments are rejected.
set usage = "$0 [ -maxLines NN ] [ -dir DIR ] [-title TIT] [-out OUT] SEC1 SEC2 ... "
# Reads a bunch of item count files DIR/SECi.frq (as written by compute-freqs),
# where each line has a COUNT, a FREQ, and an ITEM, and generates various reports:
#
# DIR/OUT.cmp-cts shows raw counts of each item and file; one line per item,
# one column of counts per file, plus one "total" column
# and one shared column with the items; sorted by decreasing
# total counts.
#
# DIR/OUT.cmp-frq ditto, with relative frequencies ×9999 instead
# of raw counts.
#
# DIR/OUT.cmp-top shows the item frequency rankings in each file, and
# overall; two columns per file, one with relative
# frequencies ×999, the other with the item; each column
# pair sorted by its own frequency, decreasing.
#
# The "-title" option specifies a title for the shared item column.
#
# The OUT defaults to "all".
#
# The "-maxLines NN" option truncates the output after "NN" lines
# (default 50)
# Option defaults.  An empty "dir" makes all file names relative to the
# current directory; "-dir DIR" stores "DIR/" so that the variable can be
# used directly as a file name prefix.
set dir = ""
set tit = ""
set out = "all"
set maxLines = 50

# Consume leading options.  Each word is compared with a "/" glued in
# front, presumably so that a word starting with "-" cannot be taken for
# an operator inside the csh expression.
while ( ( $#argv > 0 ) && ( "/$1" =~ /-* ) )
  if ( ( $#argv >= 2 ) && ( "/$1" == "/-dir" ) ) then
    set dir = "$2/"; shift; shift;
  else if ( ( $#argv >= 2 ) && ( "/$1" == "/-out" ) ) then
    set out = "$2"; shift; shift;
  else if ( ( $#argv >= 2 ) && ( "/$1" == "/-title" ) ) then
    set tit = "$2"; shift; shift;
  else if ( ( $#argv >= 2 ) && ( "/$1" == "/-maxLines" ) ) then
    set maxLines = "$2"; shift; shift;
  else
    # Reached for an unknown option, and also for a known option that
    # is the last word on the command line (its value is missing).
    echo "invalid option $1"
    echo "usage: ${usage}"; exit 1
  endif
end

# At least two section names must remain after the options.
if ( $#argv < 2 ) then
  echo "usage: ${usage}"; exit 1
endif

# No further check is made on "-dir": the option is optional (see the
# usage string) and "dir" always has a value from the defaults above.
# Section names: every word left on the command line.
set secs = ( $* )

# Input file of each section, namely the "dir" prefix, the section name
# and the ".frq" extension.  Plain concatenation is used so that any
# character in the prefix is taken literally (no sed metacharacters).
set frfiles = ( )
foreach sec ( ${secs} )
  set frfiles = ( ${frfiles} "${dir}${sec}.frq" )
end
# Stage 1: raw counts.  Each input file is reduced to lines holding its
# first and third fields in a scratch file; the scratch files are then
# merged by compare-counts into the ".cmp-cts" report.
echo "Tabulating the raw word counts..."

# Common prefix of all scratch file names, built from the shell process id.
set tmp = "/tmp/$$"

set ctfiles = ( )
foreach ffile ( ${frfiles} )
  # Scratch name: prefix plus the input name without directory and extension.
  set stem = ${ffile:t}
  set stem = ${stem:r}
  set cfile = "${tmp}-${stem}.cts"
  echo "${ffile} -> ${cfile}"
  # Empty lines are dropped; field 1 is printed as an integer.
  cat ${ffile} | gawk '/./{printf "%d %s\n", $1, $3;}' > ${cfile}
  set ctfiles = ( ${ctfiles} ${cfile} )
end

# One title per section, followed by the title of the shared item column.
compare-counts \
    -titles "${secs} ${tit}" \
    -sort 1 -totals \
    -maxlines ${maxLines} \
    -widths '6' \
    ${ctfiles} \
  > ${dir}${out}.cmp-cts

# The scratch files are no longer needed.
/bin/rm ${ctfiles}
# Stage 2: relative frequencies.  Field 2 of each input line is scaled
# by 9999 and truncated to an integer, then paired with field 3 in a
# scratch file; compare-counts merges these into the ".cmp-frq" report.
echo "Tabulating the relative frequencies ( × 9999 ) per file..."

set refiles = ( )
foreach ffile ( ${frfiles} )
  # Scratch name: prefix plus the input name without directory and extension.
  set stem = ${ffile:t}
  set stem = ${stem:r}
  set rfile = "${tmp}-${stem}.fri"
  echo "${ffile} -> ${rfile}"
  # Empty lines are dropped.
  cat ${ffile} | gawk '/./{printf "%d %s\n", int(9999*$2), $3;}' > ${rfile}
  set refiles = ( ${refiles} ${rfile} )
end

# Section titles have every "." deleted here; the title of the shared
# item column is appended unchanged.
compare-counts \
    -titles "`echo ${secs} | tr -d '.'` ${tit}" \
    -sort 1 \
    -maxlines ${maxLines} \
    -widths '4' \
    ${refiles} \
  > ${dir}${out}.cmp-frq

# The scratch files are no longer needed.
/bin/rm ${refiles}
# Stage 3: per-file rankings.  Each input file is sorted by decreasing
# value of its first field, cut to the first "maxLines" lines, and
# rewritten as field 2 scaled by 999 (truncated, printed 3 wide) followed
# by field 3.  multicol then places the per-file lists side by side in
# the ".cmp-top" report.
echo "Listing the frequency-ranked items per file..."
set ppfiles = ( )
foreach ffile ( ${frfiles} )
  # Scratch name: prefix plus the input name without directory and extension.
  set name = ${ffile:t}; set name = ${name:r}
  set pfile = "${tmp}-${name}.pct"
  echo "${ffile} -> ${pfile}"
  # "head -n" is the standard spelling of the line limit; the bare
  # "-NUM" form is obsolescent.  Empty lines are dropped by gawk after
  # the cut, so they still count toward the limit.
  cat ${ffile} \
    | sort -b -k1nr \
    | head -n ${maxLines} \
    | gawk '/./{printf "%3d %s\n", int(999*$2), $3;}' \
    > ${pfile}
  set ppfiles = ( $ppfiles $pfile )
end
multicol \
  -v titles="${secs}" \
  -v colsep=" " \
  ${ppfiles} \
  > ${dir}${out}.cmp-top

# The scratch files are no longer needed.
/bin/rm ${ppfiles}

# Show the three reports that the script writes.
ls -l ${dir}${out}.cmp-{cts,frq,top}