#! /bin/bash -eu
# Last edited on 2026-01-16 18:48:59 by stolfi

# Reads "st_parsed/{sectyp}.vwff" with counts of tokens that
# contain only valid EVA characters (no '?', weirdos, [bgjuvwxz]).
# Parses each word into elements, inserting '!' before unparseable 
# ones. 
#
# Writes to "st_parsed/{sectyp}.weff" the same file, except that the
# words are parsed into elements, delimited by '{}'. Words that failed
# parsing are written too (with the '!' markers).
#
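# Also reads "st_unparsed/{sectyp}.ivt" (except for "tot-parags") and
# writes out to "st_parsed/{sectyp}.etx" the same file, except that the
# words are parsed into elements, delimited by '{}'. Words that failed
# parsing are written too (with the '!' markers).
#
# Only the ".vwff" files whose name contains "parags" are processed.
#
# Also writes to standard error a summary of the valid and invalid word
# counts per {sectyp} and a list of the words that failed parsing, using
# the scratch files ".all", ".gud", ".bad", ".rat" and ".bad-pats" in
# the current directory. Finally writes to "st_parsed/{sectyp}.elf" the
# counts and frequencies of the elements of the successfully parsed
# words, and runs "format_elem_count_freq.py" on those sections.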

# The options on the "#!" line are ignored when the script is started
# as "bash {script}", so they are repeated here.
set -eu

# Scratch files in the current directory, filled by the loop below:
#   .all  total count of all parsed words, one line per {sectyp};
#   .gud  total count of the words that parsed cleanly (no '!');
#   .bad  the words that failed to parse, each tagged ":{sectyp}".
rm -f .all .gud .bad .rat .tst

# Only the word files whose name contains "parags" are processed.
for vwff_path in st_parsed/*parags*.vwff; do
  # A glob that matches nothing is left as the literal pattern; skip it.
  [[ -e "${vwff_path}" ]] || continue
  vwff_file="${vwff_path##*/}"
  sectyp="${vwff_file%.vwff}"

  if [[ "${sectyp}" != "tot-parags" ]]; then
    # Parse the IVT file into elements:
    etx_file="${sectyp}.etx"
    rm -f "st_parsed/${etx_file}"
    recode latin-1..utf-8 < "st_unparsed/${sectyp}.ivt" \
      | parse_ivt_file_into_elements.gawk \
          -i error_funcs.gawk \
          -i elem_parse_funcs.gawk \
      > "st_parsed/${etx_file}"
  fi

  # Parse the words frequency file into elements, after dropping the
  # entries that contain any of the characters '[', ']', '*', '!', '?'
  # or the letters "bgjuvx":
  weff_file="${sectyp}.weff"
  rm -f "st_parsed/${weff_file}"
  grep -E -v -e '[]*!?bgjuvx[]' "st_parsed/${vwff_file}" \
    | parse_wff_file_into_elements.gawk \
        -i error_funcs.gawk \
        -i elem_parse_funcs.gawk \
    > "st_parsed/${weff_file}"

  # Compute stats of valid and invalid words.

  # Total count of all the words, parsed or not:
  compute_freqs.gawk -v outputTotal='TOTAL' < "st_parsed/${weff_file}" \
    | grep -E -e 'TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %s\n", $1, sectyp }' \
    >> .all

  # Total count of the words without any '!' marker:
  grep -E -v -e '[!]' "st_parsed/${weff_file}" \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    | grep -E -e 'TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %s\n", $1, sectyp }' \
    >> .gud

  # List of the words that have a '!' marker. The "|| true" keeps a
  # "grep" that selects nothing from counting as a failure.
  # NOTE(review): the TOTAL line is not filtered out here, unlike in the
  # two pipelines above -- presumably intentional; confirm.
  ( grep -E -e '[!]' "st_parsed/${weff_file}" || true ) \
    | ( grep -E -v -e '[*?bgjuvx]' || true ) \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %12.6f %s:%s\n", $1, $2, $3, sectyp }' \
    >> .bad
  echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' >> .bad
done
# Tabulate, for each {sectyp}: the total word count (from ".all"), the
# count of cleanly parsed words (from ".gud"), their difference, and the
# percentage of cleanly parsed words. A {sectyp} that is missing from
# one of the files gets a count of 0 there.
#
# "join" requires both inputs to be sorted on the join field (field 2),
# which the order in which ".all" and ".gud" were appended to does not
# guarantee; so they are sorted here. The final order of ".rat" is set
# by the "sort" at the end of the pipeline.
join -1 2 -2 2 -a 1 -a 2 -e 0 -o1.1,2.1,0 \
    <( sort -k2b,2 .all ) \
    <( sort -k2b,2 .gud ) \
  | gawk \
      ' //{ 
          cta = $1; ctb = $2; sectyp = $3;
          # A zero total would make the division below a fatal error:
          pct = (cta != 0 ? 100*ctb/cta : 0);
          printf "%12.6f %12.6f %12.6f %6.2f %s\n", cta, ctb, cta-ctb, pct, sectyp
        }
      ' \
  | sort -b -k5,5 \
  > .rat
cat .rat 1>&2

# List the words that failed to parse, without the ":{sectyp}" tag that
# follows the closing '}' or '!' of each word:
echo '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' 1>&2
echo "INVALID WORDS BECAUSE OF ELEMS OTHER THAN BAD GLYPHS:" 1>&2
sed -e 's/[}][:].*$/}/g' -e 's/[!][:].*$/!/g' < .bad 1>&2

# Summarize the same list with "count_bad_patterns.sh":
echo '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' 1>&2
echo "PATTERNS OF INVALID WORDS OTHER THAN BAD GLYPHS:" 1>&2
sed \
    -e 's/[}][:].*$/}/g' \
    -e 's/[!][:].*$/!/g' \
    < .bad \
  | count_bad_patterns.sh \
  > .bad-pats

recode latin-1..utf-8 < .bad-pats 1>&2

echo "" 1>&2
echo "COUNTING ELEMENT FREQUENCIES" 1>&2

# Writes to "st_parsed/{sectyp}.elf" the counts and frequencies of the
# elements in the cleanly parsed words of each "st_parsed/{sectyp}.weff",
# then hands the list of sections to "format_elem_count_freq.py".

txty="parags"

# Arguments for "format_elem_count_freq.py": three per section. Start
# from an empty list so that nothing inherited from the environment can
# end up in it.
stys=()

for vwff_path in st_parsed/*"${txty}"*.vwff; do
  # A glob that matches nothing is left as the literal pattern; skip it.
  [[ -e "${vwff_path}" ]] || continue
  vwff_file="${vwff_path##*/}"
  sectyp="${vwff_file%.vwff}"
  # The part of {sectyp} before its first "-":
  sec="${sectyp%%-*}"

  weff_file="${sectyp}.weff"
  elf_file="${sectyp}.elf"
  # Words with a "!" marker failed to parse and are left out.
  grep -E -v -e '[!]' "st_parsed/${weff_file}" \
    | gawk \
        -i error_funcs.gawk \
        ' // {
            # Peel the "{...}" elements off the front of the word, one
            # at a time, writing each with the count of the whole word.
            ct = $1; we = $2
            while (we != "") {
              # NOTE(review): assumes data_error() does not return;
              # otherwise this loop would never end -- confirm.
              if (! match(we, /^{[^{}]*}/)) { data_error("FORMAT") }
              el = substr(we, RSTART, RLENGTH)
              we = substr(we, RSTART+RLENGTH)
              printf "%12.6f %s\n", ct, el
            }
          }
        ' \
    | combine_counts.gawk \
    | compute_freqs.gawk \
    | sort -b -k3,3 \
    > "st_parsed/${elf_file}"
  stys+=( "${sec}" "${txty}" "${sectyp}" )
done
format_elem_count_freq.py 1 4 "${stys[@]}"
