#! /bin/bash
# Last edited on 2012-05-05 14:59:26 by stolfilocal

# Program name: the invocation path with any leading directories removed.
cmd="$0"
cmd="${cmd##*/}"

# One-line synopsis shown when the command line is malformed.
usage="${cmd}"' {NWORDS} {SAMPLE} {SECTAG}... > {OUTNAME}.tex'

# Produces a list of the NWORDS most common words and their
# frequencies, per section.  The {SECTAG} arguments may be section tags
# like "bio", or subsection tags like "bio.1".  For each {SECTAG}, the
# input is read from the files "dat/{SAMPLE}/{SECTAG}.?/gud.wfr".
# The output records have the format
#
#    {SECTAG} {COUNT} {FREQ} {CUMCOUNT} {CUMFREQ} {WORD}

# At least NWORDS, SAMPLE and one SECTAG are required; otherwise print
# the synopsis on stderr and fail.
if (( $# < 3 )); then
  echo "usage: ${usage}" >&2
  exit 1
fi

# The first two arguments are NWORDS and SAMPLE; every remaining
# argument is a section tag.
nwords="$1"
sample="$2"
shift 2

secs=( "$@" )

#######################################
# Writes to stdout the most common words of one section.
# Arguments:
#   $1 - NWORDS: maximum number of records to output.
#   $2 - SAMPLE: sample name; input is looked up under "dat/${2}/".
#   $3 - SECTAG: section tag; the input files are those matching
#        "dat/${2}/${3}.?/gud.wfr".
# Outputs:
#   stdout - one record per word: SECTAG followed by the first five
#            fields of each line produced by "compute-cum-freqs".
#   stderr - a progress line with the section tag and its input files,
#            plus a warning when no input file is found.
# Returns:
#   0 when the section has no input files (it is skipped); otherwise
#   the status of the last stage of the pipeline.
#######################################
list_section_words() {
  local nwords="$1" sample="$2" sec="$3"
  local dir="dat/${sample}"
  local -a ifiles=()
  local f

  # The tag is data, not a format: pass it through %s so that a "%"
  # inside it is printed literally.
  printf 'sec = %s  ' "${sec}" 1>&2

  # Let the shell expand the pattern instead of parsing "ls" output.
  # An unmatched pattern stays literal, hence the existence check.
  # Names are stored relative to ${dir}, as shown in the progress line.
  for f in "${dir}/${sec}".?/gud.wfr; do
    if [[ -e "${f}" ]]; then
      ifiles+=( "${f#"${dir}/"}" )
    fi
  done
  printf '%s: {%s}\n' "${sec}" "${ifiles[*]}" 1>&2

  # With no operands "cat" would read this script's standard input
  # (hanging on a terminal), so skip the section explicitly.
  if (( ${#ifiles[@]} == 0 )); then
    printf '** no files match %s/%s.?/gud.wfr - section skipped\n' \
      "${dir}" "${sec}" 1>&2
    return 0
  fi

  ( cd "${dir}" && cat -- "${ifiles[@]}" ) \
    | gawk '/./{ print $1,$3; }' \
    | combine-counts \
    | sort -b -k1,1nr -k2,2 \
    | compute-cum-freqs \
    | head -n "${nwords}" \
    | gawk -v sec="${sec}" '/./{ print sec, $1,$2,$3,$4,$5; }'
}

# Process the requested sections in command-line order.
for sec in "${secs[@]}"; do
  list_section_words "${nwords}" "${sample}" "${sec}"
done