#! /bin/csh -f
# Last edited on 2012-05-05 14:42:28 by stolfilocal

# Synopsis printed when the script is called with the wrong number of
# arguments; "$0" expands to the name under which the script was invoked.
set usage = "$0 NWORDS LANG/BOOK/SUBSEC > list.wfr"

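# Produces a list of the NWORDS least frequent `good' words
# of the given language sample, with counts
# and frequencies, in the format
# COUNT FREQ CUMCOUNT CUMFREQ WORD
# Words with equal counts are ordered pseudo-randomly among themselves
# before the list is truncated to NWORDS entries.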

# Require exactly two arguments (NWORDS and LANG/BOOK/SUBSEC).
# The usage text is written to stderr through "sh", because csh cannot
# redirect stderr on its own and stdout is normally redirected into the
# output list (see the usage line), where the message would otherwise
# be mistaken for data.
if ( $#argv != 2 ) then
  sh -c 'printf "%s\n" "$1" 1>&2' sh "usage: ${usage}"
  exit 1
endif

# Positional arguments: how many words to keep, and the path of the
# sample section (LANG/BOOK/SUBSEC, as given in the usage text).
set nwords = "$argv[1]"
set samplesec = "$argv[2]"
shift
shift

# Directory holding the data of the section.
set dir = dat/${samplesec}

# Components of the section path: the last component is the subsection,
# the rest is the sample, which in turn splits into language and book.
set subsec = ${samplesec:t}
set sample = ${samplesec:h}
set book = ${sample:t}
set lang = ${sample:h}

# Locate the input file "gud.wfr" of the section and give up if it is
# not readable.  The complaint goes to stderr (through "sh", since csh
# cannot redirect stderr on its own) so that it does not end up inside
# the output list when stdout is redirected to a file.
if ( ! -r "${dir}/gud.wfr" ) then
  sh -c 'printf "%s\n" "$1" 1>&2' sh 'no "gud.wfr" in '"${dir}"
  exit 1
endif
set ifile = "${dir}/gud.wfr"

# Main pipeline, stage by stage:
#   1. gawk: from every non-empty line of the input keep fields 1 and 3
#      (presumably COUNT and WORD -- confirm against the "gud.wfr" format).
#   2. combine-counts: external filter (presumably merges repeated
#      words into a single entry -- confirm).
#   3. gawk: insert a tie-breaking key sin(i)^2 between the count and the
#      word, where i is the line index.  The key looks random but depends
#      only on the line position, so the result is reproducible.
#   4. sort: by count, ascending numeric; equal counts by the key.
#   5. gawk: drop the tie-breaking key again.
#   6. compute-cum-freqs: external filter (presumably produces the
#      COUNT FREQ CUMCOUNT CUMFREQ WORD lines -- confirm).
#   7. head: keep the first NWORDS lines, i.e. the lowest counts.
#   8. sort: order the surviving lines by field 5.
# The input is read by redirection instead of "cat", and "head -n" is
# used in place of the obsolescent "head -NUMBER" form.
gawk '/./{print $1, $3}' < "${ifile}" \
  | combine-counts \
  | gawk '/./{i++; s=sin(i); printf "%7d %10.8f %s\n", $1, s*s, $2; }' \
  | sort -b -k1,1n -k2,2g \
  | gawk '/./{print $1, $3}' \
  | compute-cum-freqs \
  | head -n "${nwords}" \
  | sort -b -k5,5