#! /bin/bash
# Last edited on 2025-05-04 22:48:47 by stolfi

# Strict mode is enabled with "set" rather than on the "#!" line, so that
# it also takes effect when the script is started as "bash SCRIPT ...":
#   -u  expanding an unset variable is an error;
#   -e  the script exits as soon as a command fails.
# "pipefail" is deliberately NOT enabled: the final "head" of the pipeline
# may close its input early, which can make upstream stages fail.
set -ue

# Name of this script without its directory part, for the usage message.
cmd=${0##*/}
usage="${cmd} {NLEXEMES} {LANG}/{BOOK}/{SUBSEC} > {OUTNAME}.wfr"

# Produces a list of the NLEXEMES most frequent `good' lexemes of the 
# given language sample, with counts and frequencies, in the format
# COUNT FREQ CUMCOUNT CUMFREQ LEXEME

# Exactly two arguments are required.  The usage message goes to stderr:
# stdout is the data stream (normally redirected to {OUTNAME}.wfr) and
# must not receive diagnostics.
if [[ $# -ne 2 ]]; then
  echo "usage: ${usage}" 1>&2; exit 1
fi

# Number of output lines (lexemes) to keep.
nlexemes="$1"; shift;
# Sample section path, in the form {LANG}/{BOOK}/{SUBSEC}.
samplesec="$1"; shift;

# Split the sample section path by prefix/suffix stripping: the last
# "/"-separated field is the subsection, the field before it is the
# book, and whatever precedes the book is the language.
subsec="${samplesec##*/}"
sample="${samplesec%/*}"
book="${sample##*/}"
lang="${sample%/*}"

# Directory holding the data files of this sample section.
dir="dat/${samplesec}"

# Report the parsed components on stderr.
printf '%s\n' "book = ${book} lang = ${lang} subsec = ${subsec}" >&2

# The input is the file "gud.wfr" of the sample section directory.
# Stop with a message on stderr and status 1 if it cannot be read.
ifile="${dir}/gud.wfr"
if [[ ! -r "${ifile}" ]]; then
  echo "no file \"gud.wfr\" in ${dir}" 1>&2; exit 1
fi

# Build the output table and write it to stdout:
#   1. print fields 1 and 3 of every non-empty line of the input file;
#   2. pass them through "combine-counts" (external command; presumably
#      merges the counts of repeated lexemes -- confirm);
#   3. sort numerically on the first field, in decreasing order;
#   4. pass them through "compute-cum-freqs" (external command; presumably
#      adds the frequency and cumulative columns -- confirm);
#   5. keep only the first ${nlexemes} lines.
# The file is supplied through a quoted redirection, so a path containing
# blanks or glob characters is handled correctly, and the file name is
# never parsed by gawk as a VAR=VALUE operand.
gawk '/./{print $1, $3}' < "${ifile}" \
  | combine-counts \
  | sort -b -k1,1nr \
  | compute-cum-freqs \
  | head -n "${nlexemes}"