#! /bin/csh -f
# Last edited on 2025-05-01 18:45:53 by stolfi

# Usage message, printed when the argument count is wrong.
set usage = "$0 FTAG"

# Gathers statistics about a specific feature of Voynichese words
# (identified by the tag FTAG).  Assumes that the following files are available:
#
#   text-subsecs/all.names
#     a list of names of subsections
#
#   text-subsecs/XXX.evt
#     the text of subsection XXX (where XXX is found in all.names),
#     in EVMT format.
#
#   extract-FTAG-strings
#     a filter that takes a stream of Voynichese words, one per line,
#     and outputs the corresponding value(s) of the FTAG feature, 
#     one per line. Empty lines in the output will be ignored.
#     Will be called with "-f factor-text.gawk"
#   
#   unit-to-type.tbl 
#     a table that maps each unit number (e.g. f82v.P3) to the 
#     corresponding text type (e.g. "parags", "labels", etc.)
#
# Creates the following files:
#
#   stats-subsecs/FTAG/XXX.frq
#     counts and frequencies of the FTAG values in section XXX,
#     for each XXX.  Also the overall counts and frequencies
#     when XXX="tot".
#
#   stats-subsecs/FTAG/XXX.fcm
#     same as XXX.frq, with extra columns for cumulative counts 
#     and frequencies.
#
#   stats-subsecs/FTAG/all-cmp.cts
#   stats-subsecs/FTAG/all-cmp.frq
#     a table showing the counts and frequencies, respectively, of each 
#     FTAG value in each section, side by side.
#
#   stats-subsecs/FTAG/all-cmp.top
#     a table showing the FTAG values that occur in each section,
#     and their frequencies, sorted by the latter, side by side.
#

# Exactly one argument (the feature tag) is required.
if ( ${#argv} != 1 ) then
  echo "usage: ${usage}"
  exit 1
endif

# Feature tag: selects the extract-FTAG-strings filter and the
# stats-subsecs/FTAG output directory.
set ftag = "$argv[1]"
shift

# Common prefix of the scratch files; the process id of this shell
# keeps concurrent runs apart.
set tmp = "/tmp/$$"

# Make sure the per-feature output directory exists and holds no stale
# results from an earlier run.
if ( ! ( -d stats-subsecs/${ftag} ) ) then
  # The -p option also creates a missing "stats-subsecs" parent directory.
  mkdir -p stats-subsecs/${ftag}
  if ( $status != 0 ) then
    # Without the directory every later output redirection would fail.
    echo "** could not create stats-subsecs/${ftag}"
    exit 1
  endif
else
  echo "cleaning out stats-subsecs/${ftag}"
  # Runs in a subshell so the working directory of the script is unchanged.
  # NOTE(review): only all-cmp.* tables are removed here; if the "-out some"
  # tabulation further down writes some-cmp.* files they survive - confirm.
  ( cd stats-subsecs/${ftag}/ && /bin/rm -f *.frq *.fcm all-cmp.* )
endif

# Gather statistics by section

# One pass per subsection name listed in text-subsecs/all.names.
foreach f ( `cat text-subsecs/all.names` )
  # Output files for this subsection.
  set frqfile = "stats-subsecs/${ftag}/${f}.frq"
  set fcmfile = "stats-subsecs/${ftag}/${f}.fcm"
  echo ${frqfile}

  # Pipeline stages, in order:
  #   select-units            keeps only units of the listed text types,
  #                           using the unit-to-type.tbl mapping;
  #   words-from-evt          turns the text into a stream of words
  #                           (presumably one per line - confirm);
  #   grep -E -v '[?*]'       drops every line containing "?" or "*";
  #   extract-FTAG-strings    maps each word to its FTAG feature value(s);
  #   grep -E '.'             drops empty lines;
  #   sort | uniq -c | expand counts the occurrences of each distinct value;
  #   sort -b -k1nr           orders by decreasing count;
  #   compute-freqs           writes the final table (project tool).
  # "grep -E" replaces "egrep", which is obsolescent and makes recent
  # GNU grep releases print a warning on every call.
  cat text-subsecs/${f}.evt \
    | select-units \
        -v types='parags,starred-parags,circular-lines,circular-text,radial-lines,titles' \
        -v table=unit-to-type.tbl \
    | words-from-evt \
    | grep -E -v '[?*]' \
    | extract-${ftag}-strings \
        -f factor-text.gawk \
    | grep -E '.' \
    | sort | uniq -c | expand \
    | sort -b -k1nr \
    | compute-freqs \
    > ${frqfile}

  # Keep fields 1 and 3 of each non-empty line (presumably count and
  # feature value - confirm against compute-freqs output) and derive the
  # table with cumulative columns.
  cat ${frqfile} \
    | gawk '/./{print $1, $3;}' \
    | compute-cum-freqs \
    > ${fcmfile}
end

# Compute total counts and frequencies over all sections

# Totals are stored like one more subsection, under the name "tot".
set statdir = "stats-subsecs/${ftag}"
set frqfile = "${statdir}/tot.frq"
set fcmfile = "${statdir}/tot.fcm"
echo "${fcmfile}"

# Scratch list of the per-subsection .frq file names, one per line,
# relative to the statistics directory.
sed -e 's/$/.frq/' text-subsecs/all.names > ${tmp}.ifiles

# Concatenate all per-subsection tables, keep fields 1 and 3 of each
# non-empty line, merge the counts of equal values with combine-counts,
# then order by decreasing count and recompute the frequencies.
( cd ${statdir} && cat `cat ${tmp}.ifiles` ) \
  | gawk '/./{print $1, $3;}' \
  | combine-counts \
  | sort -b -k1nr \
  | compute-freqs \
  > ${frqfile}

# Same table with the extra cumulative columns.
gawk '/./{print $1, $3;}' ${frqfile} \
  | compute-cum-freqs \
  > ${fcmfile}

# The plot runs in the background; the script does not wait for it.
plot-histogram ${fcmfile} &

# Assemble comparative tables

# Scratch list with the names of all subsections (no suffix this time).
cat text-subsecs/all.names > ${tmp}.ifiles

# Side-by-side tables for the totals followed by every subsection.
tabulate-frequencies -dir stats-subsecs/${ftag} -out all \
  -title "feature" -maxLines 999999 \
  tot `cat ${tmp}.ifiles`

# Fixed selection of subsections for the smaller comparative table.
set picked = ( pha.2 hea.1 cos.2 zod.1 heb.1 str.2 bio.1 )

# Scratch list with one selected name per line.
echo "${picked}" | tr ' ' '\012' > ${tmp}.sfiles

# Side-by-side tables for the totals followed by the selection only.
tabulate-frequencies -dir stats-subsecs/${ftag} -out some \
  -title "feature" -maxLines 999999 \
  tot `cat ${tmp}.sfiles`

# Remove every scratch file created by this run.
/bin/rm ${tmp}.*