#! /bin/csh -f
# Last edited on 2000-06-08 04:16:19 by stolfi

# Usage line shown when the argument count is wrong; csh expands $0 to
# the name of the script file being read.
set usage = "$0 PTAG"

# Gathers statistics about pairs of elements in Voynichese words
# (identified by the tag PTAG, of the form LTAG-RTAG).  
# Assumes available the following files:
#
#   data/{words,labels}/all.names
#     a list of names of subsections
#
#   data/{words,labels}/XXX.wsp
#     the words of subsection XXX (where XXX is found in all.names),
#     in EVA encoding, factored into QOKOKOKOKO elements, 
#     with mantle and core set off with "()" and "<>".
#
# Creates the following files:
#
#   stats/{words,labels}/PTAG/XXX.frq
#   stats/{words,labels}/PTAG/XXX-L.frq
#   stats/{words,labels}/PTAG/XXX-R.frq
#     counts and frequencies of the PTAG pairs in section XXX, and
#     their left and right members, for each XXX. Also the overall
#     counts and frequencies when XXX="tot".
#
#   stats/{words,labels}/PTAG/XXX.fcm
#   stats/{words,labels}/PTAG/XXX-L.fcm
#   stats/{words,labels}/PTAG/XXX-R.fcm
#     same as XXX.frq, with extra columns for cumulative counts 
#     and frequencies.
#
#   stats/{words,labels}/PTAG/XXX.mtx
#     same as XXX.frq, restricted to the 10 most common 
#     prefixes and suffixes, in tabular format.
#

# Exactly one argument (the pair tag PTAG) is accepted; any other count
# prints the usage line and exits with status 1.
if ( $#argv != 1 ) then
  echo "usage: ${usage}"
  exit 1
endif

# The pair tag selects which component pairs are extracted and names
# the stats subdirectory.
set ptag = "$argv[1]"
shift

# Prefix for scratch file names; $$ is the process number of this shell.
set tmp = "/tmp/$$"

# Decide whether complex or simple words are allowed.  The answer
# depends only on PTAG, so it is settled once, before the loop.

if ( "$ptag" =~ k-* ) then
  set complex = 1
else
  set complex = 0
endif

# Process the two data sets (running-text words and labels) in turn.
# For each one: prepare the output directory, compute per-section
# frequency tables, merge them into the "tot" tables, and finally
# build the comparative tables.

foreach f ( words labels )

  set sdir = "stats/${f}/${ptag}"

  # Create the output directory (and any missing parents), or clear
  # out stale results left there by a previous run.

  if ( ! ( -d ${sdir} ) ) then
    mkdir -p ${sdir}
  else
    echo "cleaning out ${sdir}"
    ( cd ${sdir}/ && /bin/rm -f *.frq *.fcm all-cmp.* )
  endif

  # Gather statistics by section

  set secs = ( `cat data/${f}/all.names` )

  # Names of the per-section .frq files, relative to ${sdir}; kept in
  # a shell list so that no scratch file in /tmp is needed.
  set ifiles = ( )

  foreach sec ( ${secs} )
    set wspfile = "data/${f}/${sec}.wsp"
    set frqfile = "${sdir}/${sec}.frq"
    set fcmfile = "${sdir}/${sec}.fcm"

    # Select the words, extract the PTAG components, drop braces and
    # empty lines, count distinct items, and order by decreasing count.
    # The sort key is written with -k because the older "+0 -1" form
    # is rejected by current sort implementations.
    echo "${wspfile} -> ${frqfile}"
    cat ${wspfile} \
      | select-simple-words \
          -v complex=${complex} \
      | extract-components \
          -f get-components.gawk \
          -v select=${ptag} \
      | tr -d '{}' \
      | grep '.' \
      | sort | uniq -c | expand \
      | sort -b -k1,1nr \
      | compute-freqs \
      > ${frqfile}

    # Feed fields 1 and 3 of the frequency table to compute-cum-freqs
    # to obtain the cumulative table.
    echo "${frqfile} -> ${fcmfile}"
    cat ${frqfile} \
      | gawk '/./{print $1, $3;}' \
      | compute-cum-freqs \
      > ${fcmfile}

    set ifiles = ( ${ifiles} ${sec}.frq )
  end

  # Compute total counts and frequencies over all sections

  set frqfile = "${sdir}/tot.frq"
  set fcmfile = "${sdir}/tot.fcm"

  # /dev/null keeps cat from reading standard input when the section
  # list happens to be empty.
  echo "${frqfile}"
  ( cd ${sdir} && cat /dev/null ${ifiles} ) \
    | gawk '/./{print $1, $3;}' \
    | combine-counts \
    | sort -b -k1,1nr \
    | compute-freqs \
    > ${frqfile}

  echo "${fcmfile}"
  cat ${frqfile} \
    | gawk '/./{print $1, $3;}' \
    | compute-cum-freqs \
    > ${fcmfile}

  # Assemble comparative tables
  # NOTE(review): tabulate-pairs is not told which data set (${f}) is
  # current; confirm that it handles both "words" and "labels" itself.
  # The two trailing numbers are presumably table size limits -- confirm.

  foreach sec ( tot ${secs} )
    tabulate-pairs \
      ${ptag} ${sec} 24 24
  end

end

# The loop above creates no scratch files, so there is nothing to remove.