#! /bin/csh -f
# Last edited on 2000-06-14 01:42:12 by stolfi

set usage = "$0 SECTION TAGPAIR SELTYPE PART"

# Extracts word component statistics from the frequency files prepared
# in Note 057.
#
# Input: stats/words/TAGPAIR/SECTION.frq, except that SECTION = "txt.n"
# reads stats/words/TAGPAIR/tot.frq and SECTION = "lab.n" reads
# stats/labels/TAGPAIR/tot.frq. Each entry should have the format
#
#   COUNT FREQ TYPE-VALUE
#
# where TYPE is a component type tag, as used in Note 057 (e.g. "pm"
# for crust+mantle prefix), and VALUE is the component itself.
#
# Output: the entries with TYPE = SELTYPE are selected; their COUNT and
# VALUE (the "TYPE-" prefix is removed, and an empty VALUE is written
# as ".") are sorted by decreasing COUNT and piped through compute-freqs
# into prob/obs/SECTION/PART.frq. The FREQ column of the input is not
# passed along; presumably compute-freqs supplies it again (to be
# confirmed against that program).
#
# As a special case, SELTYPE = "w" means "take all entries".

# Exactly four positional arguments are required.
if ( $#argv != 4 ) then
  echo "usage: ${usage}"
  exit 1
endif

# Name the positional arguments.
set sec     = "$argv[1]"
set tagpair = "$argv[2]"
set seltype = "$argv[3]"
set part    = "$argv[4]"

# SELTYPE = "w" selects every entry: replace it by a regular expression
# that matches any type tag. The variables are quoted inside the
# expressions so that an empty argument, or one containing blanks, is
# compared as a single word instead of breaking the csh expression.
if ( "${seltype}" == "w" ) set seltype = '.*'

# Choose the input frequency file. The pseudo-sections "txt.n" and
# "lab.n" read the tot.frq file of the words and labels trees,
# respectively; any other SECTION reads its own file in the words tree.
# (wdtype is set here but is not referenced in the visible part of
# this script.)
if ( "${sec}" == "txt.n" ) then
  set wdtype = "words"
  set infile = "stats/words/${tagpair}/tot.frq"
else if ( "${sec}" == "lab.n" ) then
  set wdtype = "labels"
  set infile = "stats/labels/${tagpair}/tot.frq"
else
  set wdtype = "words"
  set infile = "stats/words/${tagpair}/${sec}.frq"
endif

# Output goes to prob/obs/SECTION/PART.frq.
set obsdir = "prob/obs/${sec}"
set frfile = "${obsdir}/${part}.frq"

# Create the output directory if needed. The -p option also creates
# missing parent directories (e.g. prob/obs), where a plain mkdir
# would fail.
if ( ! ( -d "${obsdir}" ) ) mkdir -p "${obsdir}"

# Report which file is being converted into which.
echo "${infile} -> ${frfile}"

# Pipeline:
#   1. gawk keeps the entries whose third field starts with "SELTYPE-",
#      strips that prefix (an empty remainder becomes "."), and prints
#      COUNT and VALUE. The pattern is handed over with -v instead of
#      being spliced into the program text, so a "/" in SELTYPE cannot
#      break the program.
#   2. sort orders by decreasing numeric COUNT, then by the second
#      field. The keys use the -k form; the old "+0 -1nr +1 -2"
#      notation is read as file names by current versions of sort.
#      The global -b -n flags are kept so that the second key
#      inherits them as before.
#   3. compute-freqs (external program) post-processes the result.
#
# NOTE(review): with SELTYPE = "w" the pattern is "^.*-", which is
# greedy, so everything up to the LAST "-" of the third field is
# removed. Confirm that no VALUE contains a "-".
gawk -v pat="^${seltype}-" \
      ' ($3 ~ pat){ \
          c = $1; w = $3; \
          sub(pat, "", w); if (w == ""){ w = "."; } \
          printf "%7d %s\n", c, w; next; \
        } \
      ' < "${infile}" \
  | sort -b -n -k1,1nr -k2,2 \
  | compute-freqs \
  > "${frfile}"