#! /bin/csh -f 
# Last edited on 2000-06-08 04:22:03 by stolfi

# Synopsis line; echoed after "usage: " when the argument count is wrong.
set usage = "$0 PTAG SEC NL NR"

# Tabulates the pairs of prefixes and suffixes from a frequency file
# of word feature PTAG for subsection SEC. Assumes that the following
# files are available:
#
#   stats/{words,labels}/PTAG/SEC.frq 
#     counts and frequencies of the PTAG pairs for section SEC.
#
# This script considers only words of the frequency file that contain
# exactly one instance of "-", and separates those words into a "left"
# and a "right" component at that character. Outputs the following
# files:
#
#   stats/{words,labels}/PTAG/SEC-L.frq 
#   stats/{words,labels}/PTAG/SEC-R.frq 
#     counts and frequencies for the left and right components
#     of those pairs.
#
#   stats/{words,labels}/PTAG/SEC-L.dic 
#     The NL most common left components.
#
#   stats/{words,labels}/PTAG/SEC-R.dic 
#     The NR most common right components.
#
#   stats/{words,labels}/PTAG/SEC.mtx 
#     Counts of PTAG pairs whose prefixes are listed in SEC-L.dic and
#     whose suffixes are listed in SEC-R.dic, in tabular format.
#

# Exactly four positional arguments are required: PTAG SEC NL NR.
# Anything else prints the synopsis and terminates with status 1.
if ( $#argv == 4 ) then
  # Take the arguments in order, consuming each one after it is saved.
  set ptag = "$1"
  shift
  set sec = "$1"
  shift
  set nLeft = "$1"
  shift
  set nRight = "$1"
  shift
else
  echo "usage: ${usage}"
  exit 1
endif

# Scratch path prefix built from the PID of this shell.
# NOTE(review): not referenced in the visible part of this script.
set tmp = "/tmp/$$"

# Process each of the two statistics trees (stats/words and stats/labels)
# in turn.  For each one: split the hyphenated entries of SEC.frq into
# left and right components, build the -L/-R frequency, cumulative
# frequency and dictionary files, and finally tabulate the pair counts
# into SEC.mtx.  Any missing input or empty dictionary aborts the whole
# script with status 1.
foreach f ( words labels )

  set ifile = "stats/${f}/${ptag}/${sec}.frq"
  set lfile = "stats/${f}/${ptag}/${sec}-L.frq"
  set rfile = "stats/${f}/${ptag}/${sec}-R.frq"

  if ( ! ( -r ${ifile} ) ) then
    echo "${ifile} not found"; exit 1
  endif

  # Extract left and right components.
  #
  # Only records whose third field contains exactly one "-" are used, as
  # stated in the header of this script.  The pattern is anchored so
  # that words with two or more "-" are skipped; otherwise the left
  # step (cut at the first "-"), the right step (cut at the last "-")
  # and the matrix step (split at every "-") would disagree about them.
  #
  # The "-" itself is kept on both components ("abc-" and "-xyz").
  # Field 1 of each record is used as the count.
  #
  # Sort keys use the standard "-k" form: descending numeric count,
  # then the component itself.

  echo "${lfile}"
  cat ${ifile} \
    | gawk '($3 ~ /^[^-]*[-][^-]*$/) { gsub(/[-].*$/, "-", $3); printf "%7d %s\n", $1, $3; }' \
    | combine-counts \
    | sort -b -k 1,1nr -k 2,2 \
    | compute-freqs \
    > ${lfile}

  echo "${rfile}"
  cat ${ifile} \
    | gawk '($3 ~ /^[^-]*[-][^-]*$/) { gsub(/^.*[-]/, "-", $3); printf "%7d %s\n", $1, $3; }' \
    | combine-counts \
    | sort -b -k 1,1nr -k 2,2 \
    | compute-freqs \
    > ${rfile}

  # Compute cumulative frequencies, and get the most common ones.
  #
  # Each loop word packs a side letter and its entry limit as
  # "SIDE.LIMIT"; the :r modifier yields the part before the last "."
  # and :e the part after it.

  foreach sn ( L.${nLeft} R.${nRight} )
    set side = "${sn:r}"
    set num = "${sn:e}"
    set frqfile = "stats/${f}/${ptag}/${sec}-${side}.frq"
    set fcmfile = "stats/${f}/${ptag}/${sec}-${side}.fcm"
    set dicfile = "stats/${f}/${ptag}/${sec}-${side}.dic"

    # Feed fields 1 and 3 of every non-empty line to compute-cum-freqs.
    echo "${fcmfile}"
    cat ${frqfile} \
      | gawk '/./{print $1, $3;}' \
      | compute-cum-freqs \
      > ${fcmfile}

    # Keep at most ${num} leading entries, then drop those whose second
    # field is below 0.001 and print the fifth field.
    # NOTE(review): presumably field 2 is the relative frequency and
    # field 5 the component in compute-cum-freqs output -- confirm.
    echo "${dicfile}"
    cat ${fcmfile} \
      | head -n ${num} \
      | gawk '($2 >= 0.001){print $5;}' \
      > ${dicfile}
  end

  # Now tabulate the corresponding pairs.  Both dictionaries must be
  # non-empty.

  set prefs = "stats/${f}/${ptag}/${sec}-L.dic"
  if ( -z ${prefs} ) then
    echo "${prefs} is empty"; exit 1
  endif
  set suffs = "stats/${f}/${ptag}/${sec}-R.dic"
  if ( -z ${suffs} ) then
    echo "${suffs} is empty"; exit 1
  endif

  set mfile = "stats/${f}/${ptag}/${sec}.mtx"

  # Rewrite each single-hyphen word "abc-xyz" as the two tokens
  # "abc- -xyz", matching the component spelling used in the .dic files,
  # and hand the result to count-diword-freqs with the prefix dictionary
  # as rows and the suffix dictionary as columns.
  echo ${mfile}
  cat ${ifile} \
    | gawk '($3 ~ /^[^-]*[-][^-]*$/) {n=$1;w =$3; gsub(/[-]/, "- -", w); printf "%7d %s\n", n,w; }' \
    | count-diword-freqs \
        -f factor-table.gawk \
        -v counted=1 \
        -v digits=5 \
        -v rows=${prefs} \
        -v cols=${suffs} \
    > ${mfile}
end