#! /bin/csh -f

# Synopsis line echoed (prefixed with "usage: ") whenever the command
# line is rejected.
set usage = "$0 [ -recode FILTER ] [ -chars CHARS ] INFILE OUTPREFIX"

# Extracts a list of words from a single-version transcription file in EVT format,
# arbitrary alphabet.
#
# First, removes all comments, location codes, the fillers "!" "%",
# and any leading and trailing blanks (spaces, dots, and commas).
# Replaces embedded strings of blanks by a single ".".
#
# The comment-stripped text is piped through the specified filter, if given.
#
# Then inserts a newline after each "=" and "/", if there is none.
# Inserts a "/" before each newline, if there is none.
# Maps "," and " " to ".".
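# Replaces each string of ".-/=" characters by a single copy of the
# highest-ranking character it contains, where the ranking from lowest
# to highest is "." "-" "/" "=".
# Deletes any leading "." or "-".
# Deletes any leading "/" unless it is the only char in the line.
# Replaces each "*" by "?" and shortens any run of four or more "?" to "???".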
#
# What results is the "base text".
#
# Writes a bunch of files with names beginning in OUTPREFIX:
#
#    OUTPREFIX.txt      
#      A readable version of the base text, with "." and "-" replaced by " ",
#      "/" omitted (implied by newlines), "=" replaced by " =".
#
#    OUTPREFIX.wds      
#      List of all words, one per line, in original order.
#      The codes "-" "=" "/" are treated as separate words.
#      The code "." is treated as a word separator but omitted.
#      
#    OUTPREFIX.dic      
#      The set of all words, including "/" "=" "-", sorted and uniquified.
#
#    OUTPREFIX.frq      
#      Frequency counts for all words, in decreasing freq order.
#
#    OUTPREFIX-gut.wds  
#    OUTPREFIX-gut.dic
#    OUTPREFIX-gut.frq
#      Same as OUTPREFIX.wds, OUTPREFIX.dic, and OUTPREFIX.frq,
#      but including only the "good" words (those made entirely of CHARS).
#
#    OUTPREFIX-bad.wds
#    OUTPREFIX-bad.dic
#    OUTPREFIX-bad.frq
#      Ditto, but including only the "bad" words (those that contain
#      one or more "?"s).
#
#    OUTPREFIX-fun.wds
#    OUTPREFIX-fun.dic
#    OUTPREFIX-fun.frq
#      Ditto, but including only the "funny" words (those
#      that do not contain "?" but contain some non-CHARS).
#
# The CHARS argument must be a fully expanded string (no ranges,
# wildcards, etc).  It may include "=" "-" "/" but not "?" or ".".

# Defaults.  "recode" and "chars" can be overridden with -recode / -chars;
# "wc" is the word-count helper used for the final size report.
set recode = "/bin/cat"
set chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
set wc = "${STOLFIHOME}/bin/dicio-wc"

# Consume leading "-option VALUE" pairs.  Parsing stops at the first
# argument that does not start with "-".
while ( ( $#argv > 0 ) && ( "x$1" =~ x-* ) )
  # Every recognized option takes exactly one value.
  if ( $#argv < 2 ) then
    echo "usage: ${usage}"; exit 1
  endif
  if ( "x$1" == "x-recode" ) then
    set recode = "$2"
  else if ( "x$1" == "x-chars" ) then
    set chars = "$2"
  else
    # Unknown option.
    echo "usage: ${usage}"; exit 1
  endif
  shift; shift
end

# After the options, exactly two operands must remain:
# INFILE (transcription to read) and OUTPREFIX (stem of every output file).
if ( $#argv != 2 ) then
  echo "usage: ${usage}"; exit 1
endif

set infile = "$1"
set prefix = "$2"
shift; shift

# Scratch file that holds the "base text" between the extraction pipeline
# and the stages that read it back.  It is created with mktemp so that the
# name is unpredictable and the file is created atomically; the former
# /tmp/<pid>.txt name could be guessed and pre-created (e.g. as a symlink)
# by another user of the machine.
set temp = `mktemp /tmp/evt-words.XXXXXX`
if ( "x${temp}" == "x" ) then
  # mktemp prints its own diagnostic; just stop before anything is written.
  echo "$0: cannot create temporary file"; exit 1
endif

# Build the "base text" from ${infile} and store it in ${temp}.
# Comments cannot be interleaved with the continuation lines of the
# pipeline, so its four stages are described here:
#
#  1. First sed: deletes lines that start with "#"; deletes every "{...}"
#     group; deletes a leading "<...;Code>" locator (a ";" followed by a
#     code that starts with a capital letter and continues with letters or
#     digits) together with the spaces after it; deletes the filler
#     characters matched by [\!%]; strips leading and trailing runs of ".",
#     "," and " "; collapses every remaining run of those to a single ".".
#
#  2. ${recode}: the recoding filter (set earlier in the script).
#
#  3. Second sed + tr: collapses runs of ".", "," and " " to "." once more,
#     then puts a space after every "=" or "/" that is followed by another
#     character, and finally turns every space into a newline (octal 012).
#
#  4. Third sed: appends "/" to every line; collapses each run of "." and
#     "-" that contains a "-" to "-", each run of ".", "-", "/" that
#     contains a "/" to "/", and each run of ".", "-", "/", "=" that
#     contains a "=" to "="; deletes leading "." and "-"; deletes a leading
#     "/" when another character follows it; maps "*" to "?"; shortens any
#     run of four or more "?" to "???".
cat ${infile} \
  | sed \
      -e '/^#/d' \
      -e 's/{[^}]*}//g' \
      -e 's/^<[^>]*;[A-Z][A-Za-z0-9]*> *//g' \
      -e 's/[\!%]//g' \
      -e 's/^[., ][., ]*//g' \
      -e 's/[., ][., ]*$//g' \
      -e 's/[., ][., ]*/./g' \
  | ${recode} \
  | sed \
      -e 's/[., ][., ]*/./g' \
      -e 's:=\(.\):= \1:g' \
      -e 's:/\(.\):/ \1:g' \
  | tr ' ' '\012' \
  | sed \
      -e 's:$:/:g' \
      -e 's/[.]*-[-.]*/-/g' \
      -e 's:[-.]*/[-./]*:/:g' \
      -e 's:[-./]*=[-./=]*:=:g' \
      -e 's/^[-.]*//g' \
      -e 's:^/\(.\):\1:g' \
      -e 's/[*]/?/g' \
      -e 's/?????*/???/g' \
  > ${temp}

# OUTPREFIX.txt: readable rendering of the base text.
# The sed step turns "." and "-" into spaces, drops every "/", trims
# leading and trailing spaces, and puts a space before any "=" that
# follows another character.  The final grep discards empty lines.
# Plain "grep" is used instead of the obsolescent "egrep" (which recent
# GNU grep releases warn about); the pattern "." means the same in both.
cat ${temp} \
  | sed \
     -e 's/[-.]/ /g' \
     -e 's:/::g' \
     -e 's:^  *::g' \
     -e 's:  *$::g' \
     -e 's/\(.\)=/\1 =/g' \
  | grep '.' \
  > ${prefix}.txt
  
# OUTPREFIX.wds: one word per line, in original order.
# Each "-", "=" and "/" is first surrounded by spaces so that it becomes a
# word of its own; then "." and " " are both turned into newlines (octal
# 012) and the resulting empty lines are discarded.
# "tr" is found through the search path, as in the base-text pipeline,
# rather than through a hard-coded /bin/tr that not every system provides;
# plain "grep" replaces the obsolescent "egrep".
cat ${temp} \
  | sed -e 's:\([-=/]\): \1 :g' \
  | tr '. ' '\012\012' \
  | grep '.' \
  > ${prefix}.wds

# Split OUTPREFIX.wds into three word classes:
#   OUTPREFIX-gut.wds  "good" words: every character is a member of CHARS;
#   OUTPREFIX-fun.wds  "funny" words: no "?", but at least one character
#                      that is not a member of CHARS;
#   OUTPREFIX-bad.wds  "bad" words: contain at least one "?".
#
# Membership in CHARS is tested literally with awk's index() instead of
# pasting CHARS into a grep bracket expression.  Inside "[...]" a "-" that
# is not the last character forms a range (or an "Invalid range end"
# error), and "]" or "^" change the meaning of the whole expression, so a
# CHARS value such as "abc-=/" could not be used reliably.
# NOTE: awk processes backslash escapes in -v assignments, so a CHARS value
# containing "\" is not taken verbatim.
cat ${prefix}.wds \
  | awk -v chars="${chars}" \
      '{ ok = 1; n = length($0); for (i = 1; i <= n; i++) if (index(chars, substr($0, i, 1)) == 0) ok = 0; if (ok == 1) print }' \
  > ${prefix}-gut.wds

cat ${prefix}.wds \
  | grep -v '?' \
  | awk -v chars="${chars}" \
      '{ ok = 1; n = length($0); for (i = 1; i <= n; i++) if (index(chars, substr($0, i, 1)) == 0) ok = 0; if (ok == 0) print }' \
  > ${prefix}-fun.wds

cat ${prefix}.wds \
  | grep '?' \
  > ${prefix}-bad.wds

# For the full word list and for each of the three classes, derive:
#   .dic  the distinct words, sorted;
#   .frq  "count word" lines, most frequent first.
foreach mod ( '' '-gut' '-fun' '-bad' )
  cat ${prefix}${mod}.wds \
    | sort | uniq \
    > ${prefix}${mod}.dic

  # "uniq -c" prefixes each distinct word with its count and "expand"
  # turns any tabs into spaces.  The count (field 1) is then sorted
  # numerically in reverse.  The key is written as "-k1,1nr": the obsolete
  # "+0 -1nr" form is no longer accepted by POSIX-conforming sort, which
  # takes "+0" for a file name.
  cat ${prefix}${mod}.wds \
    | sort | uniq -c | expand \
    | sort -k1,1nr \
    > ${prefix}${mod}.frq
end

# Size report: run the word-count helper over the readable text and over
# the .wds and .dic files of the full list and of each word class.
${wc} ${prefix}.txt ${prefix}{,-gut,-fun,-bad}.{wds,dic}

# Show the first eight lines of the readable text, indented by two
# spaces and framed by two lines that hold only two spaces.
echo "Sample from ${prefix}.txt:"
echo "  "
head -8 ${prefix}.txt | sed -e 's/^/  /'
echo "  "

# Feed the base text to count-digraph-freqs on its standard input.  The
# character set handed to it is CHARS extended with ".", "/", "=" in front
# and "?", "-" behind; "/" is given as the pad character.
count-digraph-freqs \
    -v pad="/" \
    -v showentropy=1 \
    -v chars="./=${chars}?-" \
  < ${temp}

# The scratch file is no longer needed.
/bin/rm ${temp}