#! /bin/bash -eu
# Last edited on 2025-12-28 10:47:38 by stolfi

# Make the strict-mode options effective even when this script is started
# as "bash SCRIPT", in which case the options on the "#!" line are ignored.
set -eu

# Start from scratch: remove the report files left by a previous run.
rm -f .all .gud .bad .rat .tst

# Reads records from stdin, pipes them through {compute_freqs.gawk} with
# {outputTotal='TOTAL'}, and, for each output line that contains 'TOTAL',
# writes to stdout its first field (formatted "%12.6f") followed by the
# section type name given as argument ${1}.
section_total() {
  compute_freqs.gawk -v outputTotal='TOTAL' \
    | grep -E -e 'TOTAL' \
    | gawk -v sectyp="$1" '//{ printf "%12.6f %s\n", $1, sectyp }'
}

# Process every "st_parsed/{SECTYP}.weff" file whose {SECTYP} contains
# the string "parags".  A plain glob is used instead of parsing the output
# of "ls", so that odd file names cannot be split into several words.
for weff_file in st_parsed/*.weff; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "${weff_file}" ]] || continue

  # The section type is the file name without directory and extension:
  sectyp="${weff_file##*/}"
  sectyp="${sectyp%.weff}"
  [[ "${sectyp}" == *parags* ]] || continue

  echo "=== ${sectyp} ===" 1>&2

  # Parse the words frequency file into OKOKO patterns.  Input lines
  # that contain any of the characters "]*?!bgjuvx[" are discarded
  # before parsing.
  oko_file="st_parsed/${sectyp}.oko"
  rm -f "${oko_file}"
  grep -E -v -e '[]*?!bgjuvx[]' < "${weff_file}" \
    | parse_weff_file_into_okoko_pats.gawk \
        -i error_funcs.gawk \
        -i elem_parse_funcs.gawk \
    > "${oko_file}"
  wc -l "${oko_file}" 1>&2

  # Compute stats of valid and invalid words.

  # Total over all the parsed records:
  section_total "${sectyp}" < "${oko_file}" >> .all

  # Total over the records that contain none of "]*?!bgjuvx["
  # (presumably the words that parsed as valid):
  grep -E -v -e '[]*?!bgjuvx[]' < "${oko_file}" \
    | section_total "${sectyp}" \
    >> .gud

  # Frequencies of the records that contain "!" but none of the other
  # characters above (per the report header printed later, these are the
  # words that are invalid for reasons other than bad or rare glyphs).
  # The "|| true" keeps a no-match result of {grep} from counting as a
  # failure, as the original "|| cat /dev/null" did.
  { grep -E -e '[!]' < "${oko_file}" || true; } \
    | { grep -E -v -e '[]*?bgjuvx[]' || true; } \
    | compute_freqs.gawk -v outputTotal='TOTAL' \
    | gawk -v sectyp="${sectyp}" '//{ printf "%12.6f %12.6f %s:%s\n", $1, $2, $3, sectyp }' \
    >> .bad

  # Separator between the sections in the ".bad" report:
  echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' >> .bad
done
# Reads lines "CTA CTB SECTYP" from stdin and writes, for each of them,
# the line "CTA CTB CTA-CTB PCT SECTYP", where PCT is 100*CTB/CTA.
# When CTA is zero, PCT is printed as 0.00 instead of letting {gawk}
# abort with a fatal "division by zero" (which, in the middle of a
# pipeline, would silently truncate the output).
format_ratios() {
  gawk \
    ' //{
        cta = $1; ctb = $2; sectyp = $3;
        pct = (cta != 0) ? 100*ctb/cta : 0;
        printf "%12.6f %12.6f %12.6f %6.2f %s\n", cta, ctb, cta-ctb, pct, sectyp
      }
    '
}

# Pair the lines of ".all" and ".gud" by section type (field 2 of both).
# Lines without a partner in the other file are kept too ("-a 1 -a 2"),
# with the missing count replaced by 0 ("-e 0").  Since {join} requires
# both inputs to be sorted on the join field, they are sorted here
# rather than trusting the order in which they were written.
join -1 2 -2 2 -a 1 -a 2 -e 0 -o1.1,2.1,0 \
    <( sort -k 2b,2 .all ) <( sort -k 2b,2 .gud ) \
  | format_ratios \
  | sort -b -k5,5 \
  > .rat

# Show the ratios table, then the ".bad" report with everything after
# the first "}:" of each line removed.
cat .rat 1>&2
echo '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' 1>&2
echo "invalid words because of elems other than bad or rare glyphs:" 1>&2
cat .bad | sed -e 's:[}][:].*$:}:g' 1>&2

echo "" 1>&2
echo "COUNTING TOKENS BY OKOKO PATTERN" 1>&2

# Reads records from stdin whose first two fields are "COUNT WORD".
# Discards the records that contain "!" anywhere, and for the others
# writes "COUNT PATTERN", where PATTERN is WORD with everything from
# its first ":" onwards removed.
okoko_pattern_counts() {
  grep -E -v -e '[!]' \
    | gawk \
        ' //{
            ct = $1; wo = $2;
            gsub(/[:].*$/, "", wo);
            printf "%12.6f %s\n", ct, wo;
          }
        '
}

# Tally the patterns of the "tot-parags" section, most frequent first.
# {cat} is kept as the first pipeline stage so that a missing input file
# yields an empty table, as before, instead of aborting the script.
sectyp="tot-parags"
oko_file="st_parsed/${sectyp}.oko"
cat "${oko_file}" \
  | okoko_pattern_counts \
  | combine_counts.gawk \
  | compute_freqs.gawk \
  | sort -b -k1,1gr \
  > .okos

head -n 50 .okos 1>&2

# Reads lines "COUNT FREQ PATTERN" from stdin and writes "COUNT REDUCED"
# for each of them, where REDUCED is PATTERN with every occurrence of
# the letter ${1} removed, or "-" if nothing is left.
strip_pattern_letter() {
  gawk -v letter="$1" \
    ' //{
        ct = $1; wo = $3;
        gsub(letter, "", wo);
        if (wo == "") { wo = "-" }
        printf "%12.6f %s\n", ct, wo;
      }
    '
}

# Re-tallies the ".okos" table after removing the letter ${1} from every
# pattern, writes the result (highest count first) to file ${2}, and
# shows its first 50 lines on stderr.  {cat} is kept as the first stage
# so that a missing ".okos" yields an empty table instead of an abort.
tally_without_letter() {
  cat .okos \
    | strip_pattern_letter "$1" \
    | combine_counts.gawk \
    | compute_freqs.gawk \
    | sort -b -k1,1gr \
    > "$2"

  head -n 50 "$2" 1>&2
}

echo "" 1>&2
echo "COUNTING TOKENS BY NUMBER OF 'O'S" 1>&2

# Dropping the 'K's leaves only the 'O's of each pattern:
tally_without_letter 'K' .ooos

echo "" 1>&2
echo "COUNTING TOKENS BY NUMBER OF 'K'S" 1>&2

# Dropping the 'O's leaves only the 'K's of each pattern:
tally_without_letter 'O' .kkks
