#! /bin/csh -f 
# Last edited on 2025-05-01 18:48:26 by stolfi

# Usage string, echoed below when the argument count is not exactly 2.
set usage = "$0 FTAG SPLITCHAR"

# Extracts prefixes and suffixes from a frequency file of 
# word feature FTAG.  Assumes that the following files are available:
#
#   stats-subsecs/FTAG/tot.frq 
#     counts and frequencies of the FTAG values over all sections,
#
#   smash-FTAG-letters
#     a script that reads a file of counts and FTAGs,
#     and collapses each `letter' of the FTAG
#     to a single "X" (preserving the SPLITCHAR if any).
#
# This script considers only words of the frequency file that contain
# exactly one instance of the given SPLITCHAR, and separates those
# words into a "prefix" and a "suffix" at that character. Outputs
# files 
#
#    stats-subsecs/FTAG/tot-{pref,suff}.frq 
#      counts and freqs of prefixes and suffixes, separately.
#
#    stats-subsecs/FTAG/tot-{pref,suff}-len.frq 
#      counts and freqs of prefix and suffix lengths, separately.

# Validate the command line: exactly two arguments are required.
if ( $#argv != 2 ) then
  echo "usage: ${usage}"; exit 1
endif

# FTAG selects the "stats-subsecs/FTAG/" directory; SPLITCHAR is the
# character at which each word is split into a prefix and a suffix.
set ftag = "$1"; shift;
set splitchar = "$1"; shift;

# Input table for the requested FTAG.
set ifile = "stats-subsecs/${ftag}/tot.frq"

# Bail out early if the input is not readable.  The name is quoted so
# that an FTAG containing blanks cannot break the test expression.
if ( ! ( -r "${ifile}" ) ) then
  echo "${ifile} not found"; exit 1
endif

# Outer loop: each item is "COMMAND.TAGSUFFIX".  The root (:r) is the
# filter applied to the (count, word) pairs and the extension (:e) is
# appended to the output file tag:
#   "cat."                     -> words passed through unchanged, tag "".
#   "smash-FTAG-letters.-len"  -> words filtered by smash-FTAG-letters,
#                                 tag "-len" (the length tables).
# Inner loop: each item is "FIELD.TAG".  After the sed step below a line
# reads "COUNT PREFIX+SPLITCHAR SPLITCHAR+SUFFIX", so field 2 yields the
# prefix ("pref") and field 3 the suffix ("suff").
foreach cp ( cat.  smash-${ftag}-letters.-len )
  set ccmd = "${cp:r}"
  set ctag = "${cp:e}"
  foreach ap ( 2.pref 3.suff )
    set atag = "${ap:e}"
    set afld = "${ap:r}"
    # Pipeline stages, in order:
    #   1. keep fields 1 and 3 of each non-empty input line
    #      (presumably the count and the word -- confirm against tot.frq);
    #   2. apply the per-mode filter ${ccmd};
    #   3. keep lines with at least one SPLITCHAR ...
    #   4. ... and drop those with two or more, leaving exactly one;
    #   5. duplicate the SPLITCHAR with a blank in between, so that the
    #      word becomes two blank-separated fields;
    #   6. keep the count and the selected half;
    #   7. merge, sort and add cumulative frequencies using the external
    #      tools combine-counts and compute-cum-freqs (behavior assumed
    #      from their names -- not visible here).
    cat "${ifile}" \
      | gawk '/./{print $1, $3;}' \
      | ${ccmd} \
      | grep -E '['"${splitchar}"']' \
      | grep -E -v '['"${splitchar}"'].*['"${splitchar}"']' \
      | sed -e 's/['"${splitchar}"']/'"${splitchar}"' '"${splitchar}"'/g' \
      | gawk -v fld="${afld}" '/./{print $1, $(fld);}' \
      | combine-counts \
      | sort -b -k1nr \
      | compute-cum-freqs \
      > "stats-subsecs/${ftag}/tot-${atag}${ctag}.frq"
  end
end