#! /bin/bash
# Last edited on 2023-05-10 12:08:33 by stolfi

# Program name without its directory part, used in the usage message.
cmd="${0##*/}"
usage="${cmd} {LANG}/{BOOK} [whole|trunc]"

# Exactly two arguments are required (the sample name and the size
# option), even though the usage text shows the second one in brackets.
if (( $# != 2 )); then
  echo "usage: ${usage}" 1>&2
  exit 1
fi

# Consume both positional arguments:
#   smp     - sample name, used as a subdirectory of "dat" and "exp".
#   sizeopt - "whole" or "trunc"; selects how many words are kept.
smp="$1"
sizeopt="$2"
shift 2

# Fetch the attributes of the whole-sample pseudo-section "tot.1".
# Word splitting of the helper's output is intentional: every
# whitespace-separated token becomes one array element.  Element 1 is
# used below as the langbank subdirectory holding the source text.
totattr=( $(get-sample-section-attrs.sh "${smp}/tot.1") )
echo "totattr = [${totattr[*]}]" 1>&2
langbook="${totattr[1]}"
if [[ -z "${langbook}" ]]; then
  # Without this check the link below would point at "langbank/" itself.
  echo "** could not determine the langbank directory for ${smp}" 1>&2; exit 1
fi

echo "=== creating link dat/${smp}/org to ../../langbank/${langbook} ===" 1>&2
relink-source-dir.sh "dat/${smp}" org "../../langbank/${langbook}"
srcfile="dat/${smp}/org/main.src"
wdsfile="dat/${smp}/org/main.wds"
if [[ ! -s "${srcfile}" ]]; then
  echo "** ${srcfile} not found" 1>&2; exit 1
fi

# Rebuild the words file in a subshell so the working directory of the
# script itself is not changed.  Stop if "make" fails, so that later
# steps do not run on a stale or missing words file.
echo "=== recreating ${wdsfile} from ${srcfile} ===" 1>&2
if ! ( cd "dat/${smp}/org" && make main.wds ); then
  echo "** failed to make ${wdsfile}" 1>&2; exit 1
fi

# Generating the word-map table, if appropriate
if [[ -r "dat/${smp}/word-map.make" ]]; then
  if ! ( cd "dat/${smp}" && make -f word-map.make ); then
    echo "** make -f word-map.make failed in dat/${smp}" 1>&2; exit 1
  fi
fi

# Ensure presence of the source links, and word-mapping tables for fix-words
for f in word-map.tbl org/Makefile sample-fns.gawk reencode-words-for-tex ; do
  if [[ ! -r "dat/${smp}/${f}" ]]; then
    echo "*** missing dat/${smp}/${f}" 1>&2; exit 1
  fi
done

# Build file "{sizeopt}.tlw" for each section

# The size option is the same for every section, so validate it once,
# before any directory or file is created.
case "${sizeopt}" in
  whole|trunc) ;;
  *) echo "invalid sizeopt = ${sizeopt}" 1>&2; exit 1 ;;
esac

# The section tags are read from "sections-ok.tags".  Word splitting of
# the file contents is intentional (one tag per whitespace-separated token).
tagsfile="dat/${smp}/sections-ok.tags"
if [[ ! -r "${tagsfile}" ]]; then
  echo "*** missing ${tagsfile}" 1>&2; exit 1
fi
secs=( $(cat "${tagsfile}") )

# The pseudo-section "tot.1" is processed last because its ".tlw" file
# is the concatenation of the files of all the real sections.
for sec in "${secs[@]}" "tot.1" ; do
  smpsec="${smp}/${sec}"

  # Create directories if needed
  for pd in dat exp ; do
    dir="${pd}/${smpsec}"
    if [[ ! -d "${dir}" ]]; then mkdir -pv "${dir}"; fi
  done

  # Obtain sample size for this section:
  if [[ "${sizeopt}" == "whole" ]]; then
    # Effectively "no limit".
    gudnum=9999999
  else
    # "trunc": the limit is element 2 of the section's attribute list.
    secattr=( $(get-sample-section-attrs.sh "${smpsec}") )
    gudnum="${secattr[2]}"
  fi

  # Create link to source file in "dat/{smpsec}" directory:
  # NOTE(review): the top-level link is made with "relink-source-dir.sh"
  # but this one uses "relink-source.sh" with the same argument shape;
  # confirm that both helpers exist and that this name is intended.
  echo "=== creating link to words file dir on langbank ===" 1>&2
  relink-source.sh "dat/${smpsec}" org "../../../langbank/${langbook}"
  wdsfile="dat/${smpsec}/org/main.wds"
  if [[ ! -s "${wdsfile}" ]]; then
    echo "** ${wdsfile} not found" 1>&2; exit 1
  fi

  # Creating main ".tlw" file:
  rFile="dat/${smpsec}/${sizeopt}.tlw"
  /bin/rm -fv "${rFile}"
  if [[ "${sec}" != "tot.1" ]]; then
    # Real section: convert the words file of the section.
    echo "  making ${rFile} (gudnum = ${gudnum}) from ${wdsfile}" 1>&2
    wds-to-tlw.sh \
        -f "dat/${smp}/sample-fns.gawk" \
        -v table="dat/${smp}/word-map.tbl" \
        -v smp="${smp}" \
        -v sec="${sec}" \
        -v maxGud="${gudnum}" \
      < "${wdsfile}" \
      > "${rFile}"
  else
    # Whole sample: concatenate the per-section files built above and
    # stop once ${gudnum} lines whose first field is "a" have been copied.
    echo "  making ${rFile} (gudnum = ${gudnum}) from ${secs[*]}" 1>&2
    sec_rFiles=()
    for s in "${secs[@]}"; do
      sec_rFiles+=( "dat/${smp}/${s}/${sizeopt}.tlw" )
    done
    # stdin is /dev/null so that "cat" cannot block waiting for input
    # when the section list happens to be empty.
    cat "${sec_rFiles[@]}" < /dev/null \
      | gawk -v maxGud="${gudnum}" '
          BEGIN { na = 0; }
          (na >= maxGud) { exit 0; }
          { print; }
          ($1 == "a") { na++; }
        ' \
      > "${rFile}"
  fi
  wc -l "${rFile}"

  # Split the file by the tag in field 1: "a" lines go to "gud.tlw",
  # "s" lines go to "bad.tlw", and "raw.tlw" is a copy of everything.
  echo "  creating the {raw,gud,bad}.tlw files" 1>&2
  /bin/rm -fv "dat/${smpsec}"/{raw,gud,bad}.{tlw,wfr,wdf}
  cp -a "${rFile}" "dat/${smpsec}/raw.tlw"
  gawk ' ($1 == "a") { print } ' "${rFile}" > "dat/${smpsec}/gud.tlw"
  gawk ' ($1 == "s") { print } ' "${rFile}" > "dat/${smpsec}/bad.tlw"

  for kind in raw gud bad ; do
    rkFile="dat/${smpsec}/${kind}.tlw"
    tkFile="dat/${smpsec}/${kind}.wdf"

    # Formatted word list: field 3 of every line that is neither blank
    # nor a "#" comment, filled to 72 columns.
    echo "    creating ${kind}.wdf" 1>&2
    gawk ' /^ *([#]|$)/ { next; } // { print $3; } ' "${rkFile}" \
      | format-words-filled.sh -v width=72 \
      > "${tkFile}"
    echo "    sample:" 1>&2
    head -20 "${tkFile}" | sed -e 's:^:      :g' 1>&2
    echo "      . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ." 1>&2
    tail -3 "${tkFile}" | sed -e 's:^:      :g' 1>&2

    # Word frequency table: count the occurrences of each distinct
    # field-3 value, then order by decreasing count and, for equal
    # counts, by word.
    echo "  creating ${kind}.wfr" 1>&2
    fkFile="dat/${smpsec}/${kind}.wfr"
    gawk '// { print $3; }' "${rkFile}" \
      | sort | uniq -c | expand \
      | sort -b -k1,1nr -k2,2 \
      | compute-freqs.sh \
      > "${fkFile}"
    echo "    the 10 most common words in ${rkFile}:" 1>&2
    head -10 "${fkFile}" | sed -e 's:^:      :g' 1>&2

    # TeX summary for this section and kind; the file name handed to
    # "update-paper-include" is the same path under "exp" instead of "dat".
    skFile="dat/${smpsec}/${sizeopt}-${kind}-wds-summary.tex"
    ekFile="${skFile/#dat/exp}"
    if [[ -r "${rkFile}" ]]; then
      tex-make-sample-summary.sh "${smpsec}" "${sizeopt}" "${kind}" > "${skFile}"
      cat "${skFile}"
      update-paper-include "${skFile}" "${ekFile}"
    else
      echo "*** no ${rkFile}, ${skFile} not created" 1>&2
    fi
  done

done

# Print summaries
# Everything in this part is written to stderr, so that the label, the
# counts and the line terminator of each per-section line stay together.

# Every real section followed by the whole-sample pseudo-section.
all_secs=( "${secs[@]}" "tot.1" )

# Counts for the main ".tlw" file of every section.
all_tFiles=()
for s in "${all_secs[@]}"; do
  all_tFiles+=( "dat/${smp}/${s}/${sizeopt}.tlw" )
done
dicio-wc "${all_tFiles[@]}" 1>&2

# Counts for the frequency tables, one report per kind.
for kind in raw gud bad ; do
  all_fkFiles=()
  for s in "${all_secs[@]}"; do
    all_fkFiles+=( "dat/${smp}/${s}/${kind}.wfr" )
  done
  dicio-wc "${all_fkFiles[@]}" 1>&2
done

# Awk program that adds up field 1 of every non-empty line.  The "+ 0"
# makes it print 0 instead of an empty string for an empty file.
pgm='/./{ n+=$1; next; } END{ print n + 0; }'

# One line per section: the section tag followed by the field-1 total
# of each of its ".wfr" files.
for sec in "${all_secs[@]}" ; do
  {
    printf "%s" "${sec}"
    for kind in raw gud bad ; do
      printf " %s = %7d" \
        "${kind}" \
        "$(gawk "${pgm}" "dat/${smp}/${sec}/${kind}.wfr")"
    done
    printf "\n"
  } 1>&2
done
