#! /bin/bash
# Last edited on 2025-04-29 22:34:34 by stolfi

# Command-line synopsis; printed on stderr (prefixed with "usage: ")
# when the script is not given exactly two arguments.
usage="$0 NWORDS LANG/BOOK/SUBSEC > list.wfr"

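# Produces a list of the NWORDS least frequent `good' words
# of the given language sample, with counts
# and frequencies, in the format
#   COUNT FREQ CUMCOUNT CUMFREQ WORD
# Ties between words with equal counts are broken pseudo-randomly
# (but repeatably). The output list is sorted by its fifth field.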

# Exactly two command-line arguments are required; with any other
# number, show the usage synopsis on stderr and quit with status 1.
if (( $# != 2 )); then
  echo "usage: ${usage}" >&2
  exit 1
fi

# First argument: how many words to list.
nwords="$1"
# Second argument: sample section, in the form LANG/BOOK/SUBSEC.
samplesec="$2"
# Consume both arguments, leaving the positional list empty.
shift 2

# Split the LANG/BOOK/SUBSEC argument into its components.
sample="${samplesec%/*}"     # everything before the last "/" (LANG/BOOK)
subsec="${samplesec##*/}"    # everything after the last "/" (SUBSEC)
lang="${sample%/*}"          # everything before the last "/" of the sample
book="${sample##*/}"         # everything after the last "/" of the sample

# Directory that holds the data files for this sample section.
# NOTE(review): of the variables set here, only "dir" is referenced by
# the rest of the script as far as can be seen -- confirm before pruning.
dir="dat/${samplesec}"

# The input is the file "gud.wfr" of the sample directory; give up
# with a message on stderr and status 1 when it is not readable.
if [[ ! -r "${dir}/gud.wfr" ]]; then
  echo 'no "gud.wfr" in '"${dir}" >&2
  exit 1
fi
ifile="${dir}/gud.wfr"

# Select the NWORDS rarest words and write them to stdout.
# Pipeline stages, in order:
#   1. keep fields 1 and 3 of every non-empty input line
#      (presumably COUNT and WORD -- confirm against the "gud.wfr" format);
#   2. "combine-counts": external tool (presumably merges repeated
#      words, adding their counts -- confirm);
#   3. insert sin(i)^2 as a second field, where i numbers the non-empty
#      lines; it serves as a repeatable pseudo-random tie-breaker;
#   4. sort by increasing count, then by the tie-breaker;
#   5. discard the tie-breaker field;
#   6. "compute-cum-freqs": external tool (presumably produces
#      COUNT FREQ CUMCOUNT CUMFREQ WORD lines -- confirm);
#   7. keep only the first NWORDS lines;
#   8. sort the survivors by their fifth field.
# The input file is redirected into the first stage (no extra "cat"
# process) and its name is quoted so that unusual characters in the
# path cannot cause word splitting or globbing.  The line count is
# given to "head" with the standard "-n" option instead of the
# obsolete "-NUM" form.
gawk '/./{print $1, $3}' < "${ifile}" \
  | combine-counts \
  | gawk '/./{i++; s=sin(i); printf "%7d %10.8f %s\n", $1, s*s, $2; }' \
  | sort -b -k1,1n -k2,2g \
  | gawk '/./{print $1, $3}' \
  | compute-cum-freqs \
  | head -n "${nwords}" \
  | sort -b -k5,5