#! /bin/csh -f
# Command-line synopsis; printed after "usage: " whenever the arguments
# are rejected.
set usage = "$0 [ -recode FILTER ] [ -chars CHARS ] INFILE OUTPREFIX"
# Extracts a list of words from a single-version transcription file in EVT format,
# arbitrary alphabet.
#
# First, removes all comments, location codes, the fillers "!" "%",
# and any leading and trailing blanks (spaces, dots, and commas).
# Replaces embedded strings of blanks by a single ".".
#
# The comment-stripped text is piped through the specified filter, if given.
#
# Then inserts a newline after each "=" and "/", if there is none.
# Inserts a "/" before each newline, if there is none.
# Maps "," and " " to ".".
# Replaces strings of ".-/=" characters by a single copy of the
# highest character of the string, in that order.
# Deletes any leading "." or "-".
# Deletes any leading "/" unless it is the only char in the line.
# Maps "*" to "?", and shortens any run of four or more "?" to "???".
#
# What results is the "base text".
#
# Writes a bunch of files with names beginning in OUTPREFIX:
#
# OUTPREFIX.txt
# A readable version of the base text, with "." and "-" replaced by " ",
# "/" omitted (implied by newlines), "=" replaced by " =".
#
# OUTPREFIX.wds
# List of all words, one per line, in original order.
# The codes "-" "=" "/" are treated as separate words.
# The code "." is treated as a word separator but omitted.
#
# OUTPREFIX.dic
# The set of all words, including "/" "=" "-", sorted and uniquified.
#
# OUTPREFIX.frq
# Frequency counts for all words, in decreasing freq order.
#
# OUTPREFIX-gut.wds
# OUTPREFIX-gut.dic
# OUTPREFIX-gut.frq
# Same as OUTPREFIX.wds, OUTPREFIX.dic, and OUTPREFIX.frq,
# but including only the "good" words (those made entirely of CHARS).
#
# OUTPREFIX-bad.wds
# OUTPREFIX-bad.dic
# OUTPREFIX-bad.frq
# Ditto, but including only the "bad" words (those that contain
# one or more "?"s).
#
# OUTPREFIX-fun.wds
# OUTPREFIX-fun.dic
# OUTPREFIX-fun.frq
# Ditto, but including only the "funny" words (those
# that do not contain "?" but contain some non-CHARS).
#
# The CHARS argument must be a fully expanded string (no ranges,
# wildcards, etc). It may include "=" "-" "/" but not "?" or ".".
# Built-in settings. The -recode and -chars options replace the first two.

# Filter applied to the comment-stripped text; "cat" leaves it unchanged.
set recode = "/bin/cat"

# Characters a word may consist of and still be classified as "good".
set chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

# External reporting program (dicio-wc) that is run on the output files.
# Requires the STOLFIHOME environment variable to be set.
set wc = "${STOLFIHOME}/bin/dicio-wc"
# Consume leading options. Each recognized option takes exactly one value;
# any other word starting with "-", or an option lacking its value, is
# answered with the usage message and exit status 1.
while ( ( $#argv > 0 ) && ( "x$1" =~ x-* ) )
    if ( ( "x$1" == "x-recode" ) && ( $#argv >= 2 ) ) then
        set recode = "$2"
        shift
        shift
    else if ( ( "x$1" == "x-chars" ) && ( $#argv >= 2 ) ) then
        set chars = "$2"
        shift
        shift
    else
        echo "usage: ${usage}"
        exit 1
    endif
end

# Exactly two positional arguments must remain: INFILE and OUTPREFIX.
if ( $#argv != 2 ) then
    echo "usage: ${usage}"
    exit 1
endif
set infile = "$1"
set prefix = "$2"
shift
shift
# Build the "base text" and store it in a scratch file named by ${temp}.
#
# The scratch file is created by mktemp instead of being named after the
# process id, so its name cannot be predicted by other users of /tmp.
set temp = `mktemp /tmp/evt-words.XXXXXX`
if ( "${temp}" == "" ) then
    echo "error: cannot create a temporary file in /tmp"; exit 1
endif

# Pipeline stages:
#   1. sed: drop "#" lines, "{...}" comments, the leading "<...;X...>"
#      location code, and the fillers "!" and "%"; strip leading and
#      trailing runs of ". ," and squeeze embedded runs to a single ".".
#   2. The recoding filter. ${recode} stays unquoted, as before, so that
#      csh can split a filter given together with arguments into words.
#   3. sed + tr: squeeze blank runs again, then break the line after every
#      "=" or "/" that is followed by another character.
#   4. sed: append "/" to every line; reduce each run of ".-/=" to its
#      highest-ranking character; strip leading "." and "-", and a leading
#      "/" unless it stands alone; map "*" to "?" and shorten runs of four
#      or more "?" to "???".
#
# ${infile} is quoted so that a file name containing blanks or wildcard
# characters is passed to cat as a single, literal argument.
cat "${infile}" \
  | sed \
      -e '/^#/d' \
      -e 's/{[^}]*}//g' \
      -e 's/^<[^>]*;[A-Z][A-Za-z0-9]*> *//g' \
      -e 's/[\!%]//g' \
      -e 's/^[., ][., ]*//g' \
      -e 's/[., ][., ]*$//g' \
      -e 's/[., ][., ]*/./g' \
  | ${recode} \
  | sed \
      -e 's/[., ][., ]*/./g' \
      -e 's:=\(.\):= \1:g' \
      -e 's:/\(.\):/ \1:g' \
  | tr ' ' '\012' \
  | sed \
      -e 's:$:/:g' \
      -e 's/[.]*-[-.]*/-/g' \
      -e 's:[-.]*/[-./]*:/:g' \
      -e 's:[-./]*=[-./=]*:=:g' \
      -e 's/^[-.]*//g' \
      -e 's:^/\(.\):\1:g' \
      -e 's/[*]/?/g' \
      -e 's/?????*/???/g' \
  > ${temp}
# OUTPREFIX.txt: readable rendering of the base text. "." and "-" become
# blanks, "/" is dropped, leading and trailing blanks are trimmed, and a
# blank is inserted before any "=" that follows another character.
# Empty lines are discarded by the final grep.
#
# Plain grep is used rather than the obsolescent egrep (recent GNU grep
# prints a warning for it); the pattern '.' means the same in both syntaxes.
cat ${temp} \
  | sed \
      -e 's/[-.]/ /g' \
      -e 's:/::g' \
      -e 's:^ *::g' \
      -e 's: *$::g' \
      -e 's/\(.\)=/\1 =/g' \
  | grep '.' \
  > ${prefix}.txt
# OUTPREFIX.wds: one word per line, in original order. The codes "-", "="
# and "/" are set off by blanks so that they become words of their own;
# "." and blank act as separators and empty lines are discarded.
#
# tr is located through the search path, like the tr call used while
# building the base text, because it does not live in /bin on every system.
# Plain grep replaces the obsolescent egrep; '.' means the same in both.
cat ${temp} \
  | sed -e 's:\([-=/]\): \1 :g' \
  | tr '. ' '\012\012' \
  | grep '.' \
  > ${prefix}.wds
# Split the word list into three disjoint classes.
#
# The patterns are bracket expressions with "*" and anchors only, which
# mean the same in basic and extended syntax, so plain grep is used
# throughout instead of the obsolescent egrep.

# "Good" words: made entirely of characters from ${chars}.
cat ${prefix}.wds \
  | grep '^['"${chars}"']*$' \
  > ${prefix}-gut.wds

# "Funny" words: no "?", but at least one character outside ${chars}.
cat ${prefix}.wds \
  | grep -v '?' \
  | grep '[^'"${chars}"']' \
  > ${prefix}-fun.wds

# "Bad" words: contain one or more "?".
cat ${prefix}.wds \
  | grep '?' \
  > ${prefix}-bad.wds
# For the full word list and for each of the three classes, derive the
# dictionary (sorted, duplicates removed) and the frequency table
# (occurrence count first, highest count first).
foreach mod ( '' '-gut' '-fun' '-bad' )
    set stem = ${prefix}${mod}

    sort ${stem}.wds \
      | uniq \
      > ${stem}.dic

    sort ${stem}.wds \
      | uniq -c \
      | expand \
      | sort -k1nr \
      > ${stem}.frq
end
# Run the ${wc} reporting program on the readable text and on the word
# list and dictionary of every class.
${wc} ${prefix}.txt ${prefix}{,-gut,-fun,-bad}.{wds,dic}

# Print the first eight lines of the readable text, each preceded by a
# blank, framed by two blank-only lines.
echo "Sample from ${prefix}.txt:"
echo " "
head -8 ${prefix}.txt | sed -e 's/^/ /'
echo " "
# Feed the base text to count-digraph-freqs on its standard input.
# NOTE(review): the "-v name=value" options look like awk variable
# assignments, so the tool is presumably an awk script -- confirm.
# The character set passed to it is ${chars} plus the codes "." "/" "="
# "?" and "-".
count-digraph-freqs \
    -v pad="/" \
    -v showentropy=1 \
    -v chars="./=${chars}?-" \
  < ${temp}

# The scratch copy of the base text is no longer needed.
/bin/rm ${temp}