#! /bin/bash -eu
# Last edited on 2026-01-16 15:31:45 by stolfi

# Reads "st_unparsed/{sectyp}.wff", filters out all words with invalid or rare 
# EVA characters and writes the result to "st_parsed/{sectyp}.vwff"
# Will map EVA 'w' and 'z' to 'p' and 'f', respectively.
#
# Currently considers only "parags" text.
#
# Also writes a file "st_parsed/{sectyp}.vlex" with the list of words
# that occur in the subset {sectyp} at least a minimum number of
# times (currently a fixed threshold of 3 occurrences).

# Abort on errors and unset variables even when invoked as "bash <script>"
# (options on the "#!" line are ignored in that case).  "pipefail" is
# deliberately NOT set: several grep stages below may legitimately match
# nothing and exit nonzero.
set -eu

# Scratch accumulator files; one line (or section) is appended per subset.
rm -f .all .gud .wrbad .rat .tst .nlex

# EVA characters allowed in a valid word (after the z->f, w->p remap below).
valid="acdefhiklmnopqrsty"

# Process every "parags" word-frequency file, plus a synthetic "tot-parags"
# subset built by concatenating all of them.
for wff_file in $( cd st_unparsed && ls *.wff | grep -E -e 'parags' ) tot-parags.wff; do
  sectyp="${wff_file%.wff}"   # strip only the ".wff" suffix
  echo "=== ${sectyp} ===" 1>&2
  tfile=".this.wff"
  # Remap EVA 'z'->'f' and 'w'->'p' while copying the input word list.
  if [[ "/${sectyp}" == "/tot-parags" ]]; then
    cat st_unparsed/*-parags.wff | tr "zw" "fp" > "${tfile}"
  else
    tr "zw" "fp" < "st_unparsed/${wff_file}" > "${tfile}"
  fi

  # Test run: full frequency table, each line tagged with the subset name.
  cat "${tfile}" \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %12.6f %s:%s\n", $1, $2, $3, sectyp }' \
    >> .tst

  # Count total input words:
  cat "${tfile}" \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    | grep -E -e 'TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %s\n", $1, sectyp }' \
    >> .all

  # Filter out the invalid ones, save the good ones to a ".vwff" file.
  # The '"${valid}"' splice injects the shell variable into the gawk regex.
  vwff_file="${sectyp}.vwff"
  cat "${tfile}" \
    | combine_counts.gawk \
    | gawk '//{ wd = $2; if (match(wd, /^['"${valid}"']+$/)) { print } }' \
    | sort -b -k 1,1gr \
    > "st_parsed/${vwff_file}"
  
  # Count the valid ones:
  cat "st_parsed/${vwff_file}" \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    | grep -E -e 'TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %s\n", $1, sectyp }' \
    >> .gud
    
  # Count the bad ones (excluding "?"): drop every line that contains a '?',
  # then keep only words with at least one character outside {valid}.
  cat "${tfile}" \
    | grep -E -v -e '[?]' \
    | gawk \
        -v sectyp="${sectyp}" \
        ' //{ 
            ct= $1; wd = $2 
            if (! match(wd, /^['"${valid}"'?]+$/)) {
              printf "%12.6f %s:%s\n", ct, sectyp, wd
            }
          }
        ' \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    >> .wrbad
  echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' >> .wrbad
  
  # Extract lexemes (words that occur at least a min number of times):
  vlex_file="${sectyp}.vlex"
  cat "st_parsed/${vwff_file}" \
    | grep -E -v -e '[?!bgjuvwxz]' \
    | compute_freqs.gawk \
    | gawk \
        ' //{
            # NOTE(review): threshold is fixed at 3, but the header comment
            # says it should depend on the total token count -- confirm.
            ct = $1; wd = $3;
            if (ct >= 3) { printf "%12.6f %s\n", ct, wd }
          }
        ' \
    | sort -b -k1,1gr \
    > "st_parsed/${vlex_file}"
  
  # One summary line per subset: "<sectyp> lexicon size = <n> last = <entry>".
  # Note: sectyp is passed as a printf ARGUMENT, never as the format string.
  printf '%s lexicon size = ' "${sectyp}" >> .nlex
  wc -l < "st_parsed/${vlex_file}" | gawk '//{ printf "%5d", $1}' >> .nlex
  printf ' last = ' >> .nlex
  tail -n 1 "st_parsed/${vlex_file}" >> .nlex
done

# Compute the stats summary: merge total (.all) and valid (.gud) token
# counts per subset.  join(1) requires both inputs sorted on the join field
# (field 2, the subset name), so sort them on the fly; the result is
# re-sorted by subset name below anyway.  Missing counts are filled with 0.
join -1 2 -2 2 -a 1 -a 2 -e 0 -o 1.1,2.1,0 \
    <( sort -b -k 2,2 .all ) \
    <( sort -b -k 2,2 .gud ) \
  | gawk \
      ' //{ 
          cta = $1; ctb = $2; sectyp = $3;
          # Guard against division by zero: "join -e 0" may have filled in
          # a zero total count for a subset absent from .all.
          pct = (cta > 0 ? 100*ctb/cta : 0);
          printf "%12.6f %12.6f %12.6f %6.2f %s\n", cta, ctb, cta-ctb, pct, sectyp
        }
      ' \
  | sort -b -k 5,5 \
  > .rat
cat .rat 1>&2
echo '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' 1>&2
echo "invalid words for chars other than '?':" 1>&2
cat .wrbad 1>&2

echo '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' 1>&2
echo "LEXICON SIZES WITH VALID CHARS" 1>&2
sort .nlex 1>&2

exit 0
