#! /bin/bash -f
# Last edited on 2012-05-05 14:42:46 by stolfilocal

cmd=${0##*/}
usage="${cmd} {NWORDS} {LANG}/{BOOK}/{SUBSEC} > {OUTNAME}.wfr"

# Produces a list of the NWORDS most frequent "good" words of the
# given language sample, with counts and frequencies, in the format
#   COUNT FREQ CUMCOUNT CUMFREQ WORD
#
# Arguments:
#   NWORDS                  maximum number of lines to output
#                           (a non-negative decimal integer).
#   {LANG}/{BOOK}/{SUBSEC}  sample section; the input file is
#                           dat/{LANG}/{BOOK}/{SUBSEC}/gud.wfr
#
# Output:
#   The word list on stdout; every diagnostic goes to stderr so that it
#   can never end up inside the redirected ".wfr" file.
#
# Exit status:
#   1 on bad arguments or unreadable input file; otherwise the status
#   of the last stage of the pipeline ("head").

if [[ $# -ne 2 ]]; then
  # Stdout is the data stream (normally redirected to a file), so the
  # usage message must go to stderr.
  echo "usage: ${usage}" 1>&2; exit 1
fi

nwords="$1"; shift
samplesec="$1"; shift

# NWORDS is handed to "head -n" below; reject anything that is not a
# plain count instead of letting "head" misread it as some other option.
if [[ ! "${nwords}" =~ ^[0-9]+$ ]]; then
  echo "${cmd}: NWORDS must be a non-negative integer, got \"${nwords}\"" 1>&2
  echo "usage: ${usage}" 1>&2; exit 1
fi

# Split {LANG}/{BOOK}/{SUBSEC} into its components:
#   sample = everything before the last "/"   ({LANG}/{BOOK})
#   subsec = everything after the last "/"    ({SUBSEC})
#   lang   = "sample" minus its last component
#   book   = last component of "sample"
dir="dat/${samplesec}"
sample=${samplesec%/*}
subsec=${samplesec##*/}
lang=${sample%/*}
book=${sample##*/}

echo "book = ${book} lang = ${lang} subsec = ${subsec}" 1>&2

# The only accepted input is a readable "gud.wfr" inside the sample directory.
if [[ -r "${dir}/gud.wfr" ]]; then
  ifile="${dir}/gud.wfr"
else
  echo 'no file "gud.wfr" in '"${dir}" 1>&2; exit 1
fi

# Pipeline:
#   gawk              - for every non-empty line, keep fields 1 and 3 only.
#   combine-counts    - external tool, not defined in this file; presumably
#                       merges the counts of repeated words (TODO confirm).
#   sort              - numeric descending order on field 1.
#   compute-cum-freqs - external tool, not defined in this file; presumably
#                       adds the frequency and cumulative columns (TODO confirm).
#   head              - keep only the first NWORDS lines.
#
# "pipefail" is deliberately NOT enabled: "head" exits as soon as it has
# printed NWORDS lines, which may terminate the upstream stages with
# SIGPIPE, and that must not be reported as a failure of the script.
#
# The input is attached by redirection (not as a file operand) so that gawk
# can never interpret the path as a "var=value" assignment.
gawk '/./{print $1, $3}' < "${ifile}" \
  | combine-counts \
  | sort -b -k1,1nr \
  | compute-cum-freqs \
  | head -n "${nwords}"