#! /bin/bash -ue
# Last edited on 2025-09-24 20:08:41 by stolfi

# Re-assert the shebang options: they are ignored when the script is
# started as "bash SCRIPT" instead of being executed directly.
set -ue

# Name of this script without its directory part, for use in messages.
# "##*/" removes the longest prefix that ends in a slash.
cmd="${0##*/}"
usage="${cmd} LANG BOOK SUBSEC > SUMMARY.tex"

# Reads a tabulation dat/LANG/BOOK/SUBSEC/raw.dupfr of consecutively
# duplicated words and the original list dat/LANG/BOOK/SUBSEC/raw.tlw
# of all words with locations from the sample. Writes to stdout a file
# with TeX definitions summarizing the frequency of duplications, the
# most commonly duplicated word, and its duplication probability.

# Exactly three positional arguments are required: LANG, BOOK and SUBSEC.
if (( $# != 3 )); then
  printf 'usage: %s\n' "${usage}" >&2
  exit 1
fi

# Pick up the three arguments, then consume them.
lang="$1"
book="$2"
subsec="$3"
shift 3

# Relative directory of the sample, and of the chosen subsection inside it.
sample="${lang}/${book}"
secdir="${sample}/${subsec}"

# Input files, both located under "dat/":
frfile="dat/${secdir}/raw.dupfr"   # tabulation of duplicated words
lwfile="dat/${secdir}/raw.tlw"     # list of all words with locations

# Scratch-file name derived from the PID of this shell.
tmpfile="/tmp/$$"

# Both input files are required. Stop with a clear message instead of
# letting a missing file be reported as "no duplications".
for infile in "${frfile}" "${lwfile}"; do
  if [[ ! -r "${infile}" ]]; then
    echo "${cmd}: cannot read ${infile}" 1>&2; exit 1
  fi
done

# Get most commonly duplicated word and its duplication count from the
# first line of ${frfile}. The line is used only when it has exactly
# three whitespace-separated fields; the count is taken from the first
# field and the word from the third. Otherwise the count is 0 and the
# word is "-".
# "read -a" splits the line without pathname expansion, so a word such
# as "*" is kept literally. "|| true" is needed because "read" returns
# non-zero on an empty file or on a last line without a newline.
tmp=()
read -r -a tmp < "${frfile}" || true
if [[ ${#tmp[@]} -eq 3 ]]; then
  maxdupct="${tmp[0]}"
  maxdupwd="${tmp[2]}"
else
  maxdupct=0
  maxdupwd="-"
fi

# Compute the total number of replications: the sum of the first field
# over all non-empty lines of ${frfile}. "s + 0" makes an empty file
# yield "0" rather than an empty string.
totdupct="$(gawk '/./{ s += $1; } END{ print s + 0; }' "${frfile}")"

# Compute the total number of tokens (excluding punct and breaks): the
# number of records of ${lwfile} whose second field is not 1.
tottokct="$(gawk '($2 != 1){ s++; } END{ print s + 0; }' "${lwfile}")"

# Frequencies relative to the token count, with five decimals. When
# there are no tokens the frequency is reported as 0, because dividing
# by zero is a fatal error in gawk.
pgm='BEGIN{ printf "%7.5f", ((tot + 0) > 0 ? ct/tot : 0); }'
maxdupfr="$(gawk -v tot="${tottokct}" -v ct="${maxdupct}" "${pgm}")"
totdupfr="$(gawk -v tot="${tottokct}" -v ct="${totdupct}" "${pgm}")"

# Output results: a two-line TeX comment header followed by five macro
# definitions. Every macro name starts with the LANG and BOOK arguments
# joined together. The shell variables holding them are ${lang} and
# ${book}; the upper-case ${LANG} is the locale setting of the
# environment and ${BOOK} is not set anywhere in this script.
# NOTE(review): TeX control words may contain only letters, so this
# assumes that LANG and BOOK are purely alphabetic -- confirm.
prefix="${lang}${book}"

printf '%s\n' "% created by ${cmd}"
printf '%s\n' '%'

# The word is piped through the sample's own filter before being
# written; presumably that filter re-encodes it for TeX (judging by its
# name) -- its behaviour is not visible here. The macro prefix is handed
# to gawk with -v instead of being spliced into the program text.
printf '%s\n' "${maxdupwd}" \
  | "dat/${sample}/reencode_words_for_tex.gawk" -v field=1 \
  | gawk -v prefix="${prefix}" '//{ printf "\\def\\%sDupMaxWd{%s}\n", prefix, $1; }'

# Counts and frequencies need no re-encoding.
printf '\\def\\%sDupMaxCt{%s}\n' "${prefix}" "${maxdupct}"
printf '\\def\\%sDupMaxFr{%s}\n' "${prefix}" "${maxdupfr}"
printf '\\def\\%sDupTotCt{%s}\n' "${prefix}" "${totdupct}"
printf '\\def\\%sDupTotFr{%s}\n' "${prefix}" "${totdupfr}"