#! /bin/csh -f
# Last edited on 2002-01-03 02:54:56 by stolfi

set usage = "$0 NWORDS SAMPLE SECTAG... > list.tex"

# Produces a list of the NWORDS most common words and their
# frequencies, per section.  The arguments may be section tags
# like "bio", or subsection tags like "bio.1".
# The output records have the format
#
#    SECTAG COUNT FREQ CUMCOUNT CUMFREQ  WORD

# At least NWORDS, SAMPLE and one SECTAG are required.  The usage
# message is sent to stderr because stdout is normally redirected
# into the output list (see the usage string), where the message
# would otherwise go unseen and corrupt the file.
if ( $#argv < 3 ) then
  echo "usage: ${usage}" >> /dev/stderr
  exit 1
endif

# NWORDS: maximum number of records to output per section.
set nwords = "$1"; shift;
# SAMPLE: name of the subdirectory of "sample/" that holds the data.
set sample = "$1"; shift;

# All remaining arguments are the section tags to process.
set secs = ( $* )

# For each requested section: collect its "gud.wfr" files, merge
# their counts, rank the result, and output at most ${nwords} records
# prefixed with the section tag.
foreach sec ( ${secs} )
  # Input files are located relative to "sample/${sample}", one per
  # subdirectory whose name is "${sec}." plus a single character.
  set ifiles = ( `cd sample/${sample} && ls ${sec}.?/gud.wfr` )

  # With an empty file list the "cat" below would have no arguments
  # and would silently read this script's standard input instead.
  # Warn and move on to the next section.
  if ( $#ifiles == 0 ) then
    gawk -v sec="${sec}" \
      'BEGIN { printf "%s: no input files found, skipped\n", sec > "/dev/stderr"; }'
    continue
  endif

  # Log the section and its input files on stderr.
  gawk -v sec="${sec}" -v ifiles="${ifiles}" \
    'BEGIN { printf "%s: {%s}\n", sec, ifiles > "/dev/stderr"; }'

  # Pipeline: keep fields 1 and 3 of every non-empty input line,
  # merge them with "combine-counts", sort by field 1 numerically
  # descending and then by field 2, pass through "compute-cum-freqs",
  # truncate to ${nwords} lines, and prepend the section tag.
  # The sort keys use "-k" and the line count uses "-n" because the
  # obsolete "+POS -POS" and "-NUM" forms are rejected by current
  # POSIX-conforming tools.
  ( cd sample/${sample} && cat ${ifiles} ) \
    | gawk '/./{ print $1,$3; }' \
    | combine-counts \
    | sort -b -k1,1nr -k2,2 \
    | compute-cum-freqs \
    | head -n "${nwords}" \
    | gawk -v sec="${sec}" '/./{ print sec, $1,$2,$3,$4,$5; }'
end

