#! /bin/bash -eu
# Last edited on 2026-01-15 19:25:54 by stolfi

# Create source files "out/{sec}.wdp" with fields
# "{SUBSEC} {FNUM} {LSEQ} {IWD} {JWD} {WD}"

# Start from an output directory that holds no stale word files.
mkdir -p out; rm -fv out/*.wdp

# Select the input transcription file.  This line used to read
#   "< ???text25rz-40.tx|full25rz.ivt"
# which the shell parses as a redirect from a glob pattern followed by a
# pipe into a command named "full25rz.ivt", so the extractor output never
# reached "out/all.wdp".  It is treated here as an unresolved choice
# between the two file names.
# NOTE(review): confirm which file is the intended input.  Until then the
# first command-line argument (if given) is used, else the first of the
# two candidates that is readable.
ifile="${1:-}"
if [[ -z "${ifile}" ]]; then
  for cand in text25rz-40.tx full25rz.ivt; do
    if [[ -r "${cand}" ]]; then ifile="${cand}"; break; fi
  done
fi
if [[ -z "${ifile}" || ! -r "${ifile}" ]]; then
  echo "** cannot read input file '${ifile:-text25rz-40.tx or full25rz.ivt}'" 1>&2
  exit 1
fi
echo "reading words from ${ifile}" 1>&2

# The extractor reads the transcription from stdin and writes the word
# records to stdout.
extract_parag_words_with_subsec.sh < "${ifile}" > out/all.wdp

# Split the full list into one file per section.  A record belongs to
# section {ss} when the first three characters of its first field
# ({SUBSEC}, e.g. "bio.1") equal {ss}.  A file is created for every
# section, even if it ends up empty.
for ss in hea heb bio str ; do
  gawk -v ss="${ss}" '(substr($1,1,3) == ss) { print }' out/all.wdp \
    > "out/${ss}.wdp"
done

# Report on every word file, "out/all.wdp" included.  {dicio-wc} is an
# external tool; presumably it prints word counts -- TODO confirm.
dicio-wc out/*.wdp

# Debugging aid, disabled by default: set {test} to 1 to append synthetic
# records to "out/bio.wdp".
test=0
if [[ ${test} -eq 1 ]]; then
  # Two rounds of sixteen records each, all with the same location fields
  # and differing only in the word.  In the second round the words built
  # from {g} get a "y" prefix.  The "-" and "=" entries are written as
  # words too; what they mean is up to the repeat counter.
  for g in '' 'y'; do
    # The format is reused for each argument, giving one record per word.
    printf 'bio.1 f84v    43   1   0 %s\n' \
      "${g}bagus" 'yea'  'begus' '-' \
      "${g}begus" 'ufa'  'big'   '-' \
      "${g}bigus" 'epa'  'bogus' '-' \
      "${g}gus"   'urra' 'bugus' '='
  done >> out/bio.wdp
fi

# Tabulate the output of the repeat counter for each section file, once
# for every combination of the {trim} and {quasi} settings.  Results go
# to "out/{ss}-t{trim}-q{quasi}.txt" and are also shown on stderr.

for trim in 0 1; do
  for quasi in 0 1; do
    for ss in hea heb bio str; do
      echo "== ${ss} trim = ${trim} quasi = ${quasi} ==" 1>&2
      words_in="out/${ss}.wdp"
      tally_out="out/${ss}-t${trim}-q${quasi}.txt"
      # The counter script reads the word records from stdin and gets
      # both settings as gawk variables.  Its output lines are then
      # tallied (identical lines merged with a count) and listed by
      # decreasing count.
      count_repeats_across_lines.gawk \
          -v trim="${trim}" -v quasi="${quasi}" \
          < "${words_in}" \
        | sort \
        | uniq -c \
        | sort -k1,1nr \
        > "${tally_out}"
      # Show the finished table on stderr.
      cat "${tally_out}" 1>&2
    done
  done
done