#! /bin/bash
# Last edited on 2025-05-04 22:48:14 by stolfi

# Usage: <this script> SIZEOPT SAMPLE...
#
# For each text kind ("raw", "gud", "bad") creates the file
# "summary-${sizeopt}-${kind}.txt" in the current directory, holding one
# table row per sample/section with three counts computed from
# "dat/${smp}/${sec}/${kind}.wfr":
#
#   tokens   sum of field 1 over all non-empty lines;
#   lexemes  number of non-empty lines;
#   unique   number of non-empty lines whose field 1 equals 1.
#
# The "unique" column is how many lexemes have a single occurrence in the sample.
#
# The sections of each sample are the whitespace-separated tags found in
# "dat/${smp}/sections-ok.tags", followed by the fixed section "tot.1".
#
# NOTE(review): presumably each ".wfr" line is "COUNT LEXEME" - confirm
# against the program that writes those files.

# Options are set here rather than on the "#!" line so that they also
# apply when the script is run as "bash script ...".
set -ue -o pipefail

# Prints the arguments to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 1 )); then
  printf 'usage: %s SIZEOPT SAMPLE...\n' "${0##*/}" >&2
  exit 2
fi

sizeopt="$1"; shift # "whole" or "trunc"
# The remaining arguments are the samples: {LANG}/{BOOK}/{sec} ...

for kind in raw gud bad ; do
  sFile="summary-${sizeopt}-${kind}.txt"

  # Start from a clean file; the removal message (if any) is indented.
  rm -fv -- "${sFile}" | sed -e 's:^: :g'
  echo " creating ${sFile}"

  # Table title and column headers.
  {
    printf "# %-38s\n" "Counts for ${kind} text (${sizeopt})"
    printf "# %-14s %7s %7s %7s\n" "sample/sec" "tokens" "lexemes" "unique"
    printf "# %-14s %7s %7s %7s\n" "--------------" "-------" "-------" "-------"
  } >> "${sFile}"

  for smp in "$@" ; do
    tagsFile="dat/${smp}/sections-ok.tags"
    [[ -r "${tagsFile}" ]] || die "** cannot read ${tagsFile}"

    # Split the whole tags file on whitespace without glob expansion.
    # "read -d ''" returns non-zero at end-of-file even though the array
    # has been filled, hence the "|| true".
    secs=()
    read -r -d '' -a secs < "${tagsFile}" || true
    secs+=( "tot.1" )

    for sec in "${secs[@]}" ; do
      smpsec="${smp}/${sec}"
      ifile="dat/${smpsec}/${kind}.wfr"
      # Fail loudly instead of emitting a misleading all-zero row.
      [[ -r "${ifile}" ]] || die "** cannot read ${ifile}"

      # The row prefix is two blanks so that the columns line up under
      # the "# "-prefixed header lines.
      gawk -v smpsec="${smpsec}" '
        BEGIN { t = 0; w = 0; u = 0; }
        /./ { w++; t += $1; if ($1 == 1) { u++; } }
        END { printf "  %-14s %7d %7d %7d\n", smpsec, t, w, u; }
      ' < "${ifile}" >> "${sFile}"
    done
  done
done