#! /bin/bash -ue
# Last edited on 2025-05-04 22:48:14 by stolfi

# The "unique" column is how many lexemes have a single occurrence in the sample.

# Usage: SCRIPT SIZEOPT [SAMPLE ...]
#
#   SIZEOPT  tag inserted into the output file names and headers;
#            "whole" or "trunc".
#   SAMPLE   sample path relative to "dat/".  The file
#            "dat/SAMPLE/sections-ok.tags" must hold the whitespace-separated
#            section tags of that sample.
#
# For each text kind (raw, gud, bad) writes "summary-SIZEOPT-KIND.txt" in the
# current directory, replacing any previous version.  The file has one row
# per SAMPLE/SECTION, plus a final row for the pseudo-section "tot.1" of each
# sample.  Each row is computed from "dat/SAMPLE/SECTION/KIND.wfr"
# (presumably one lexeme per line, with its occurrence count in field 1):
#
#   tokens   sum of field 1 over all non-empty lines;
#   lexemes  number of non-empty lines;
#   unique   number of non-empty lines whose field 1 equals 1.
#
# Fails with a message on stderr if SIZEOPT is missing or if any input file
# cannot be read.

# Set the strict options here as well as on the "#!" line, because the
# latter is ignored when the script is run as "bash SCRIPT ...".
# "pipefail" makes a failure in any pipeline stage fatal.
set -euo pipefail

# Prints the arguments as an error message on stderr and aborts the script.
die() {
  printf '%s: ** %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if (( $# < 1 )); then
  printf 'usage: %s {whole|trunc} [SAMPLE ...]\n' "${0##*/}" >&2
  exit 2
fi

sizeopt="$1"; shift  # "whole" or "trunc"
smps=( "$@" )        # Sample paths relative to "dat/".

for kind in raw gud bad; do
  sFile="summary-${sizeopt}-${kind}.txt"
  rm -fv -- "${sFile}" | sed -e 's:^:  :g'
  printf '  creating %s\n' "${sFile}"

  # Table header (the file was just removed, so this starts it afresh):
  {
    printf "# %-38s\n" "Counts for ${kind} text (${sizeopt})"
    printf "# %-14s %7s %7s %7s\n" "sample/sec" "tokens" "lexemes" "unique"
    printf "# %-14s %7s %7s %7s\n" "--------------" "-------" "-------" "-------"
  } > "${sFile}"

  # Loop over "$@" rather than "${smps[@]}": expanding an empty array
  # is an error under "set -u" in older versions of bash.
  for smp in "$@"; do
    tagsFile="dat/${smp}/sections-ok.tags"
    [[ -r "${tagsFile}" ]] || die "cannot read ${tagsFile}"

    # Split the whole file into whitespace-separated tags, without pathname
    # expansion.  With -d '' the "read" always hits end-of-file before
    # finding a delimiter and so returns non-zero; that is expected.
    secs=()
    IFS=$' \t\n' read -r -d '' -a secs < "${tagsFile}" || true
    secs+=( "tot.1" )  # Row with the totals for the whole sample.

    for sec in "${secs[@]}"; do
      smpsec="${smp}/${sec}"
      ifile="dat/${smpsec}/${kind}.wfr"
      # Without this check a missing file could yield a bogus all-zero row.
      [[ -r "${ifile}" ]] || die "cannot read ${ifile}"
      gawk -v smpsec="${smpsec}" \
          ' /./{ w++; t+=$1; if ($1==1) { u++; } }
            END{ printf "  %-14s %7d %7d %7d\n",smpsec,t,w,u;}
          ' \
        < "${ifile}" >> "${sFile}"
    done
  done
done