#! /bin/csh -f
# Last edited on 2002-01-16 11:57:16 by stolfi

set usage = "$0 NWORDS LANG/BOOK/SUBSEC > list.wfr"

# Produces a list of the NWORDS most frequent `good' words of the
# given language sample, with counts and frequencies, in the format
#   COUNT FREQ CUMCOUNT CUMFREQ WORD
#
# Arguments:
#   NWORDS            maximum number of lines to output.
#   LANG/BOOK/SUBSEC  sample identifier; the input is read from
#                     sample/LANG/BOOK/SUBSEC/gud.wfr
#
# Exits with status 1 on a usage error or when the input file is not
# readable.  Diagnostics are written to stderr so that they do not end
# up inside the redirected output file.  (csh cannot redirect stderr
# alone, hence the detour through "sh -c 'cat 1>&2'".)

if ( $#argv != 2 ) then
  echo "usage: ${usage}" | sh -c 'cat 1>&2'; exit 1
endif

set nwords = "$1"; shift;
set samplesec = "$1"; shift;

set dir = "sample/${samplesec}"
set ifile = "${dir}/gud.wfr"

if ( ! -r "${ifile}" ) then
  echo 'no file "gud.wfr" in '"${dir}" | sh -c 'cat 1>&2'; exit 1
endif

# Pipeline:
#   gawk               keeps fields 1 and 3 of every non-empty line
#                      (presumably COUNT and WORD -- confirm against
#                      the gud.wfr format);
#   combine-counts     project tool, not shown here; presumably merges
#                      the counts of repeated words;
#   sort               orders by the first field, numerically, descending
#                      ("-k1,1nr" is the POSIX spelling of the obsolete
#                      "+0 -1nr", which current sort implementations reject);
#   compute-cum-freqs  project tool, not shown here; presumably adds the
#                      frequency and cumulative columns;
#   head               keeps only the first NWORDS lines.
gawk '/./{print $1, $3}' < "${ifile}" \
  | combine-counts \
  | sort -b -k1,1nr \
  | compute-cum-freqs \
  | head -n "${nwords}"


