#! /bin/csh -f
# Last edited on 2004-05-27 10:10:46 by stolfi

# Base name of this script (":t" keeps only the last path component of $0);
# it appears in the usage message and in the header of the generated file.
set cmd = "$0"
set cmd = "${cmd:t}"

# One-line synopsis printed when the argument count is wrong.
set usage = "${cmd} LANG BOOK SUBSEC > SUMMARY.tex"

# Reads a tabulation dat/LANG/BOOK/SUBSEC/raw.dupfr of consecutively
# duplicated words and the original list dat/LANG/BOOK/SUBSEC/raw.tlw
# of all words with locations from the sample. Writes to stdout a file
# with TeX definitions summarizing the frequency of duplications, the
# most commonly duplicated word, and its duplication probability.

# Exactly three arguments are required: LANG, BOOK and SUBSEC.
# The usage message goes to stderr because stdout is meant to be
# redirected into the generated TeX file (see the usage string).
if ( $#argv != 3 ) then
  echo "usage: ${usage}" > /dev/stderr
  exit 1
endif

set lang = "$1"; shift;
set book = "$1"; shift;
set subsec = "$1"; shift;

# Directory components, relative to "dat/":
#   sample = LANG/BOOK          (also where the re-encoding filter lives)
#   secdir = LANG/BOOK/SUBSEC   (where the input tables live)
set sample = "${lang}/${book}"
set secdir = "${sample}/${subsec}"

# Input files: the duplication tabulation and the word/location list.
set frfile = "dat/${secdir}/raw.dupfr"
set lwfile = "dat/${secdir}/raw.tlw"

# Get most commonly duplicated word and its duplication count.
# Only the first line of the tabulation is examined.  It is used when it
# splits into exactly three words: the first is taken as the count and the
# third as the word.  In any other case (e.g. an empty tabulation) the
# defaults below -- count 0 and word "-" -- are kept.
set maxdupct = 0
set maxdupwd = "-"
set firstline = ( `cat ${frfile} | head -1` )
if ( $#firstline == 3 ) then
  set maxdupct = "${firstline[1]}"
  set maxdupwd = "${firstline[3]}"
endif

# Compute the total number of replications: the sum of field 1 over all
# non-empty lines of the tabulation.  "s+0" forces a numeric result so
# that an empty tabulation yields 0 rather than an empty string.
set pgm = '/./{ s+=$1; } END{print s+0;}'
set totdupct = `gawk "${pgm}" ${frfile}`

# Compute the total number of tokens (excluding punct and breaks):
# counts the lines of the word list whose second field is not 1.
# Again "s+0" guarantees a number even when no line qualifies.
set pgm = '($2 != 1){ s++; } END{print s+0;}'
set tottokct = `gawk "${pgm}" ${lwfile}`

# Relative frequencies with respect to the total token count, printed
# with five decimals.  A zero (or missing) token count gives 0.00000
# instead of aborting gawk with a division-by-zero error.
set pgm = 'BEGIN{ printf "%7.5f", ((tot+0) > 0 ? ct/tot : 0); }'
set maxdupfr = `gawk -v tot="${tottokct}" -v ct="${maxdupct}" "${pgm}"`
set totdupfr = `gawk -v tot="${tottokct}" -v ct="${totdupct}" "${pgm}"`

# Output results: a two-line TeX comment header followed by five \def
# lines.  Every macro name starts with the LANG and BOOK arguments
# (shell variables "lang" and "book"; csh variable names are
# case-sensitive, so "LANG"/"BOOK" would not refer to them).
set prefix = "${lang}${book}"

echo '% created by '"${cmd}"
echo '%'

# The most duplicated word is piped through the sample's own
# "reencode-words-for-tex" filter before being wrapped in its \def.
# NOTE(review): that filter is not visible here -- presumably it rewrites
# field 1 into a TeX-safe encoding; confirm against the filter itself.
# The prefix is handed to gawk with -v rather than spliced into the program.
echo "${maxdupwd}" \
  | dat/${sample}/reencode-words-for-tex -v field=1 \
  | gawk -v pfx="${prefix}" '//{ printf "\\def\\%sDupMaxWd{%s}\n", pfx, $1; }'

# The numeric definitions are printed by gawk instead of "echo": some csh
# implementations let echo interpret backslash escapes, which could
# corrupt a macro name that happens to begin with such a letter.
set defpgm = 'BEGIN{ printf "\\def\\%s%s{%s}\n", pfx, key, val; }'
gawk -v pfx="${prefix}" -v key=DupMaxCt -v val="${maxdupct}" "${defpgm}"
gawk -v pfx="${prefix}" -v key=DupMaxFr -v val="${maxdupfr}" "${defpgm}"
gawk -v pfx="${prefix}" -v key=DupTotCt -v val="${totdupct}" "${defpgm}"
gawk -v pfx="${prefix}" -v key=DupTotFr -v val="${totdupfr}" "${defpgm}"