#! /bin/gawk -f
# Last edited on 2025-04-29 20:45:02 by stolfi
BEGIN {
    # Pending exit status: negative means "no error so far"; the
    # main rules and END exit with this value once it is >= 0.
    abort = -1;

    # Command-line synopsis, printed by arg_error().
    usage = "cat RAW.evt \\\n";
    usage = usage " | evt-to-wds \\\n";
    usage = usage " | fix-raw-words \\\n";
    usage = usage " -f FIXFNS.gawk \\\n";
    usage = usage " -v sample=SAMPLE \\\n";
    usage = usage " [ -v field=FIELD ] \\\n";
    usage = usage " [ -v table=TABLE ] \\\n";
    usage = usage " > GUD.wds";

    # Purpose: apply whatever adjustments the words extracted from
    # text sample SAMPLE need before statistical analysis. Such
    # adjustments may be a change of encoding, removal of
    # capitalization, elimination of selected words, and so on.
    #
    # The result is the "raw" word list: every pronounceable word,
    # numeral and symbol is still expected to be present, while
    # punctuation and other silent marks are expected to be gone.

    nread = 0;   # Words read so far.
    nwrite = 0;  # Words written so far.

    # The sample name is mandatory.
    if (sample == "") { arg_error("must define \"sample\""); }

    # Without an explicit field, operate on the whole record ($0).
    if (field == "") { field = 0; }

    if (table != "") {
        # Load the optional word-remapping table into "wmap".
        # A word is dropped from the "raw" subset by mapping it
        # to the string "*DELETE*".
        load_remapping_table(table);
    } else {
        # No table given: start with an empty "wmap" array.
        split("", wmap);
    }
}
# Stop processing input as soon as an error status has been recorded.
abort >= 0 { exit abort; }
# Ignore records that are empty, all spaces, or whose first non-space
# character is "#".
$0 ~ /^[ ]*([#]|$)/ { next; }
/./ {
    # One input word per non-empty record; the word lives in $(field).
    nread++;
    word = $(field);

    # First pass: explicit remapping table. A table entry equal to
    # the delete marker drops the record without further processing.
    if (word in wmap) {
        word = wmap[word];
        if ((word == "*DELETE*") || (word == "*delete*")) { next; }
    }

    # Second pass: sample-specific fix-up. fix_raw_word() is not
    # defined in this file (presumably supplied by FIXFNS.gawk).
    word = fix_raw_word(word);

    if ((word != "*DELETE*") && (word != "*delete*")) {
        # The fixed word may expand into several newline-separated
        # words; emit one output record for each of them.
        nwds = split(word, wds, "\n");
        i = 1;
        while (i <= nwds) {
            $(field) = wds[i];
            print;
            nwrite++;
            i++;
        }
    }
    next;
}
END {
    # On a clean run report the word counts on stderr; otherwise
    # propagate the recorded error status.
    if (abort < 0) {
        printf "%s: %7d words read, %7d written\n", sample, nread, nwrite > "/dev/stderr";
    } else {
        exit abort;
    }
}
function load_remapping_table(file,   nMap,lin,fld,nfld,nlin)
{
    # Reads a word mapping table from "file", containing pairs
    # of the form ORIGINAL NEW, one pair per line.
    # Stores the table in the global array "wmap[ORIGINAL] = NEW",
    # discarding any previous contents of "wmap".
    #
    # Leading spaces, blank lines, lines starting with "#" and
    # trailing "#" comments are ignored. A line that does not have
    # exactly two fields, or that repeats a key, is a fatal error
    # reported through tbl_error(). A read failure is reported
    # through arg_error(). A message with the number of pairs loaded
    # (or a warning if there were none) is written to stderr.
    #
    # Arguments after "file" are local variables.
    nMap = 0;
    nlin = 0;  # Physical line number within "file".
    split("", wmap);
    ERRNO = "";
    while ((getline lin < file) > 0) {
        nlin++;
        gsub(/^[ ]*/, "", lin);
        if (! match(lin, /^([#]|$)/)) {
            gsub(/[ ]*[#].*$/, "", lin);
            nfld = split(lin, fld, " ");
            # "getline var < file" does not update FNR, yet
            # tbl_error() reports FNR as the line number. Set FNR to
            # the table line just before the fatal report; this is
            # harmless because tbl_error() terminates the program.
            if (nfld != 2) {
                FNR = nlin;
                tbl_error(file, ("bad table entry = \"" lin "\""));
            }
            if (fld[1] in wmap) {
                FNR = nlin;
                tbl_error(file, ("repeated key = \"" lin "\""));
            }
            wmap[fld[1]] = fld[2];
            nMap++;
        }
    }
    # A failed getline leaves a description of the error in ERRNO.
    if (ERRNO != "") { arg_error((file ": " ERRNO)); }
    close(file);
    if (nMap == 0) {
        # Pass the file name as an argument, not as part of the
        # format, so that a "%" in the name is printed literally.
        printf "warning: file \"%s\" empty or missing\n", file > "/dev/stderr";
    } else {
        printf "loaded %6d map pairs\n", nMap > "/dev/stderr";
    }
}
function arg_error(msg)
{
    # Reports a command-line/argument problem: writes "msg" and the
    # usage synopsis (global "usage") to stderr, records the error
    # status in the global "abort", and terminates with status 1.
    printf("%s\n", msg) > "/dev/stderr";
    printf("usage: %s\n", usage) > "/dev/stderr";
    abort = 1;
    exit abort;
}
function data_error(msg)
{
    # Reports a problem with the input data: writes "msg" to stderr
    # prefixed by the current value of FNR, records the error status
    # in the global "abort", and terminates with status 1.
    printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
    abort = 1;
    exit abort;
}
function tbl_error(file, msg)
{
    # Reports a problem in an auxiliary table: writes "msg" to stderr
    # prefixed by the name "file" and the current value of FNR,
    # records the error status in the global "abort", and terminates
    # with status 1.
    # NOTE(review): the line number shown is whatever FNR holds at
    # the time of the call; gawk's "getline var < file" does not
    # advance FNR, so a caller reading with that form must make sure
    # FNR reflects the table line.
    printf("file %s, line %s: %s\n", file, FNR, msg) > "/dev/stderr";
    abort = 1;
    exit abort;
}