#! /bin/csh -f
# Last edited on 2000-06-08 04:11:03 by stolfi

set usage = "$0 CTAG"

# Collects statistics on the components of Voynichese words selected
# by the tag CTAG (the single command-line argument).
#
# Input files (assumed to exist already):
#
#   data/{words,labels}/all.names
#     the list of the names of all subsections.
#
#   data/{words,labels}/XXX.wsp
#     the words/labels of subsection XXX (one file for each XXX listed
#     in all.names), in EVA encoding, factored into QOKOKOKOKO
#     elements, with mantle and core set off with "()" and "<>".
#
# Output files:
#
#   stats/{words,labels}/CTAG/XXX.frq
#     counts and frequencies of the CTAG components in section XXX,
#     for each XXX.  The file with XXX="tot" holds the overall counts
#     and frequencies.
#
#   stats/{words,labels}/CTAG/XXX.fcm
#     same as XXX.frq, with extra columns for cumulative counts
#     and frequencies.
#

# Exactly one argument (the component tag) is required.
if ( $#argv != 1 ) then
  echo "usage: ${usage}"
  exit 1
endif

# The component tag; it selects the components and names the output
# directories under stats/.
set ctag = "$argv[1]"
shift

# Pass 1: per-section statistics, for labels and then for words.

foreach f ( labels words )

  # Start from an empty output directory: create it if missing,
  # otherwise remove the results of any previous run.
  if ( ! ( -d stats/${f}/${ctag} ) ) then
    mkdir -p stats/${f}/${ctag}
  else
    echo "cleaning out stats/${f}/${ctag}"
    ( cd stats/${f}/${ctag}/ && /bin/rm -f *.frq *.fcm all-cmp.* )
  endif

  # Gather statistics by section

  # Section names for this data set (split on whitespace).
  set secs = ( `cat data/${f}/all.names` )

  foreach sec ( ${secs} )
    set wspfile = "data/${f}/${sec}.wsp"
    set frqfile = "stats/${f}/${ctag}/${sec}.frq"
    set fcmfile = "stats/${f}/${ctag}/${sec}.fcm"

    # Extract the selected components, drop empty lines, count the
    # distinct values, and list them by decreasing count.  The sort
    # key is written as "-k 1,1nr" because the old "+0 -1nr" form is
    # taken as a file name by current versions of sort.
    echo "${wspfile} -> ${frqfile}"
    cat ${wspfile} \
      | extract-components \
          -f get-components.gawk \
          -v select=${ctag} \
      | egrep '.' \
      | sort | uniq -c | expand \
      | sort -b -k 1,1nr \
      | compute-freqs \
      > ${frqfile}

    # Feed fields 1 and 3 of the .frq file (presumably count and
    # component -- confirm against compute-freqs) to the cumulative
    # frequency tool.
    echo "${frqfile} -> ${fcmfile}"
    cat ${frqfile} \
      | gawk '/./{print $1, $3;}' \
      | compute-cum-freqs \
      > ${fcmfile}
  end
end

# Pass 2: compute total counts and frequencies over all sections

foreach f ( labels words )

  set frqfile = "stats/${f}/${ctag}/tot.frq"
  set fcmfile = "stats/${f}/${ctag}/tot.fcm"

  # Re-read the section list of THIS data set.  The value of "secs"
  # left over from pass 1 belongs to the last data set processed there,
  # so it must not be reused for the other one.
  set secs = ( `cat data/${f}/all.names` )

  # Names of the per-section .frq files, relative to the stats
  # directory.  Kept in a shell list, so no temporary file is needed.
  set ifiles = ( )
  foreach sec ( ${secs} )
    set ifiles = ( ${ifiles} ${sec}.frq )
  end

  # Merge the per-section counts and recompute the frequencies.
  # Standard input is /dev/null so that "cat" cannot wait on the
  # terminal if the section list turns out to be empty.
  echo "${frqfile}"
  ( cd stats/${f}/${ctag} && cat ${ifiles} < /dev/null ) \
    | gawk '/./{print $1, $3;}' \
    | combine-counts \
    | sort -b -k 1,1nr \
    | compute-freqs \
    > ${frqfile}

  echo "${fcmfile}"
  cat ${frqfile} \
    | gawk '/./{print $1, $3;}' \
    | compute-cum-freqs \
    > ${fcmfile}
end