#! /bin/bash -eu
# Last edited on 2026-03-03 08:22:10 by stolfi


# Join the per-section frequency files.
#
# For each section (hea, heb) the file "${ct_dir}/${sec}-parags.wct" is
# piped through cleanup_words.sed and combine_counts.gawk; only lines that
# start with a number and whose first field is >= 2 are kept; the result is
# sorted on field 3 (the join key) and written to "${temp}-${sec}.wcf".
# The two files are then joined on field 3 into "${temp}-hea-heb.wcf", one
# line per key: "{field 2 from hea} {field 2 from heb} {key}", with 0
# standing in for a key that is missing from one of the sections.
#
# Requires ${which} and ${temp} to be provided by the caller/environment.
# NOTE(review): field 1 is presumably a count, field 2 a frequency and
# field 3 the word -- confirm against the .wct format.

: "${which:?must be set (selects the st_<which>s input directory)}"
: "${temp:?must be set (prefix for temporary files)}"

ct_dir="st_${which}s"  # Directory with input count+freq files and output plot.
ex_dir="${HOME}/projects/voynich/VoynichNinja/092" # Export directory.
mkdir -p -- "${ex_dir}"

wcf_files=()
for sec in hea heb; do
  wct_file="${ct_dir}/${sec}-parags.wct"
  wcf_file="${temp}-${sec}.wcf"
  # Fail loudly on a missing input: inside the pipeline the error would be
  # masked, because the pipeline's status is that of the final "sort".
  if [[ ! -r "${wct_file}" ]]; then
    printf '%s\n' "** cannot read ${wct_file}" >&2
    exit 1
  fi
  cleanup_words.sed < "${wct_file}" \
    | combine_counts.gawk \
    | gawk '/^ *[0-9.]/ { if ($1+0 >= 2) { print } }' \
    | sort -b -k3,3 \
    > "${wcf_file}"
  wcf_files+=( "${wcf_file}" )
done

# Full outer join on field 3; keys containing "?" are discarded.
# Note that grep exits with status 1 when no line survives, which aborts
# the script when it runs with the -e option.
jfile="${temp}-hea-heb.wcf"
join -1 3 -2 3 -a 1 -a 2 -e 0 -o 1.2,2.2,0 "${wcf_files[@]}" \
  | grep -E -v -e '[?]' \
  > "${jfile}"


########################################################################

# Get the valid roots in each section and merge their counts.
#
# Reads "${rt_dir}/hea-parags.wct" and "${rt_dir}/heb-parags.wct", writes
# the scratch files ".hea-root.wct" and ".heb-root.wct" in the current
# directory, and writes the merged list to "${rt_dir}/hea-heb.rct".
#
# Requires ${rt_dir} to be provided by the caller/environment.
# NOTE(review): field 1 is presumably the count and field 2 the root --
# confirm against the .wct format.

: "${rt_dir:?must be set (directory with the per-section root count files)}"

# Per section: drop entries containing "?" and sort on field 2, which is
# the key used by the join below.
for sec in hea heb; do
  rwct_file="${rt_dir}/${sec}-parags.wct"
  # Fail loudly on a missing input: inside the pipeline the error would be
  # masked, because the pipeline's status is that of the final "sort".
  if [[ ! -r "${rwct_file}" ]]; then
    printf '%s\n' "** cannot read ${rwct_file}" >&2
    exit 1
  fi
  grep -E -v -e '[?]' -- "${rwct_file}" \
    | sort -b -k2,2 \
    > ".${sec}-root.wct"
done

# Merge the two lists of roots with counts in the two sections.
# The join is a full outer join on field 2; a root that is absent from one
# section gets 0 for that section.  Output lines are "{sum} {root}",
# sorted by decreasing sum.
rctmin=3.0 # Keep only roots whose combined count over the two sections is at least this.
rct_file="${rt_dir}/hea-heb.rct"
join -1 2 -2 2 -a 1 -a 2 -e 0 -o 1.1,2.1,0 .hea-root.wct .heb-root.wct \
  | gawk \
      -v ctmin="${rctmin}" \
      ' {
          cta = $1; ctb = $2; rt = $3
          ctab = cta + ctb;
          if (ctab >= ctmin) {
            printf "%8.6f %s\n", ctab, rt
          }
        }
      ' \
  | sort -b -k1,1gr \
  > "${rct_file}"

# Report how many roots passed the threshold.
wc -l "${rct_file}"
