#! /bin/bash -eu
# Last edited on 2026-03-03 14:35:03 by stolfi

# For each (section,text type) pair {st}, like "hea-parags", extracts
# word occurrence counts and freqs from the ".ivt" transcription files
# "st_files/{st}.ivt", and writes them to "st_words/{st}.wcf".
# 
# The counts are fractional, considering each comma ',' as a word break
# with 50% probability.
#
# Words with '?' characters are excluded before computing the frequencies
# and saving in the ".wcf" files.

mkdir -p .errors

# Directory with transcription files per (section,txtype):
tr_dir="st_files"

# Directory with word counts and freqa per (section,txtype):
wd_dir="st_words"

rm -f ${wd_dir}/*.wcf

mkdir -p .errors

temp="/tmp/$$"

# Get list {sts} of (section,txtype) pairs:
# sts=( $( cd ${tr_dir}/ && ls *.ivt | sed -e 's:[.]ivt::g' | sort -t- -k2,2 -k1,1 ) )
sts=( $( cd ${tr_dir}/ && ls *.ivt | sed -e 's:[.]ivt::g' ) )
for st in ${sts[@]}; do
  echo "extracting word freqs for ${st} ..."
  # Extract fractional word counts and freqs "${wd_dir}/{st}.wcf" 
  # of (section,txtype) pair {st}:
  wcf_file_words="st_words/${st}.wcf"
  cat ${tr_dir}/${st}.ivt \
    | recode latin-1..utf-8 \
    | ( ivtff_frac_word_counts.py 2>> .errors/word-${st} ) \
    | cleanup_words.sed \
    | combine_counts.gawk \
    | egrep -v -e '[?]' \
    | compute_freqs.gawk \
    | sort -b -k1,1gr \
    > ${wcf_file_words}
done

# Get word count+frequency files for sections "hea" and "heb" together, txtype "parags":
wcf_both_file="${wd_dir}/hea-heb-parags.wcf"
echo "creating ${wcf_both_file} ..." 1>&2
cat ${wd_dir}/{hea,heb}-parags.wcf \
  | gawk '//{ print $1, $3 }' \
  | combine_counts.gawk \
  | compute_freqs.gawk \
  | sort -b -k1,1gr \
  > ${wcf_both_file}

ls -ld ${wd_dir}/*.wcf 1>&2
