#! /bin/csh -f
# Last edited on 2000-01-22 21:08:59 by stolfi

# Usage text printed when the argument count is wrong:
# this script's invocation name followed by the single FTAG argument.
set usage = "${0} FTAG"

# Tabulates the features found by "analyze-features"
# sorted into categories.  Assumes available the following files:
#
#   text-subsecs/all.names
#     a list of names of subsections
#
#   text-subsecs/XXX.evt
#     the text of subsection XXX (where XXX is found in all.names),
#     in EVMT format.
#
#   classify-FTAG-strings
#     a filter that takes a stream of Voynichese words, one per line,
#     with counts as in "uniq -c"; and outputs the same file,
#     sorted by arbitrary categories.  Also adds, at the end
#     of each category, a line of the form "# NNNNN TOTAL".
#   
#   stats-subsecs/FTAG/XXX.frq
#     counts and frequencies of the FTAG values in section XXX,
#     for each XXX.  Also the overall counts and frequencies
#     when XXX="tot".
#
# Creates the following files:
#
#   stats-subsecs/FTAG/XXX-cls.frq
#     same as XXX.frq, with entries sorted and subtotaled
#     by category, through classify-FTAG-strings.
#

# Exactly one command-line argument (the feature tag FTAG) is required.
if ( $#argv != 1 ) then
  echo "usage: ${usage}"
  exit 1
endif

# Take FTAG from the argument list and consume it.
set ftag = "$argv[1]"
shift

# The per-tag statistics directory holds every file used below,
# so give up right away when it does not exist.
set statdir = "stats-subsecs/${ftag}"
if ( ! -d ${statdir} ) then
  echo "directory stats-subsecs/${ftag} not found."
  exit 1
endif

# Names of the files involved in classifying the overall ("tot") table:
#   cscript - the FTAG-specific classifier filter (looked up in the
#             current directory by the check below)
#   frqfile - input: overall counts and frequencies of the FTAG values
#   clsfile - output: the same entries grouped by category
# Both the input table and the classifier are mandatory; the script
# aborts with exit status 1 when either one is missing.

set cscript = "classify-${ftag}-strings"
set frqfile = "stats-subsecs/${ftag}/tot.frq"
set clsfile = "stats-subsecs/${ftag}/tot-cls.frq"

# The input table must be readable.
if ( ! -r ${frqfile} ) then
  echo "file ${frqfile} not found."
  exit 1
endif

# The classifier must be executable.
if ( ! -x ${cscript} ) then
  echo "script ${cscript} not found."
  exit 1
endif

# Build ${clsfile} from ${frqfile}:
#   1. gawk keeps fields 1 and 3 of every non-empty input line
#      (presumably the count and the FTAG value, skipping the frequency
#      column -- confirm against the .frq format), producing the
#      "count word" layout that the classifier expects;
#   2. the classifier groups the entries by category;
#   3. compute-freqs post-processes the result (presumably recomputing
#      the frequency column -- confirm against that script).
#
# The classifier is run as ./${cscript} so that the file executed is
# exactly the one tested with "-x ${cscript}" earlier in this script.
# A bare command name would go through the PATH search instead, which
# fails when "." is not in PATH and may pick up a different program.
echo "${frqfile} -> ${clsfile}"
gawk '/./{printf "%7d %s\n", $1, $3;}' < "${frqfile}" \
  | "./${cscript}" \
  | compute-freqs \
  > "${clsfile}"