#! /usr/bin/gawk -f
# Last edited on 2025-09-24 16:52:14 by stolfi
# ** MUST BE SAVED IN ISO-LATIN-1 **
BEGIN {
abort = -1;
usage = ( ARGV[0] "\\\n" \
" -f FUNCS.gawk \\\n" \
" -v smp=SMP \\\n" \
" -v sec=SEC \\\n" \
" [ -v table=TABLE.tbl ] \\\n" \
" [ -v maxAlpha=NUM ] \\\n" \
" < INFILE > OUTFILE " \
);
# Converts a Langbank token file ("main.wds") into a "raw.alw" or "raw.tlw"
# language sample file for VMS comparative analysis studies.
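#
# Example invocation (file and sample names are hypothetical):
#
#   gawk -f main-wds-to-raw.gawk -f bible-funcs.gawk \
#     -v smp=vul -v sec=gen.1 \
#     < main.wds > raw.alw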
#
# Each input record must have the format "{type} {token}", where
# {type} is one of [#@$aspbn] and defines the token type, as follows:
#
# "#" = {token} is a #-comment (may include blanks).
# "$" = start of the section whose full ID is {token}, e.g. "{GEN}{c3}{v14}".
# "@" = start of line number {token} in the original text.
# "a" = {token} is an alpha token.
# "s" = {token} is a symbol-like token (numeral, math symbol, etc.).
# "p" = {token} is a punctuation-like token.
# "b" = {token} is a blank-like token (should not occur).
# "n" = {token} is a null token (should not occur).
#
# Output records will have the format "{type} {loc} {token}" where
# {token} is a text token, {type} is "a", "s", or "p", and {loc} is
# the token's location in the original book. The location {loc}
# consists of the full book section ID {curSec}, concatenated with
# the book line number {curLin} enclosed in "{" and "}".
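#
# For example, the token "principio" of the hypothetical excerpt
# above would be written as
#
#   a {GEN}{c1}{v1}{15} principio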
#
# This script first recomputes the type of each input record of type
# "a", "p", or "s" by calling a procedure from the user-specified
# library "FUNCS.gawk":
#
# smp_reclassify_token(smp, sec, curSec, curLin, type, token)
#
# where {token} is the {body} of the input record; {type} is its
# single-character type tag, as above; {smp} and {sec} are the
# desired sample and sub-sample tags, specified by the user; and
# {curSec} and {curLin} are the full section ID and line number
# containing this occurrence of {token} in the original book. The
# procedure must return a new type for {token}, according to the
# table above; or "x", meaning that the record lies in an unwanted
# intrusion within the selected section (foreign phrase, table, poem,
# etc.) that should be treated like a symbol.
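#
# A minimal sketch of such a procedure (the section pattern and the
# digit test are hypothetical examples; real libraries will be more
# elaborate):
#
#   function smp_reclassify_token(smp, sec, curSec, curLin, type, token)
#   {
#     # Discard anything outside the requested section:
#     if (curSec !~ /^[{]GEN[}]/) { return "n"; }
#     # Demote all-digit "alpha" tokens to symbols:
#     if ((type == "a") && (token ~ /^[0-9]+$/)) { return "s"; }
#     return type;
#   }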
#
# The values of {curSec} and {curLin} are obtained from the last "$"
# and "@" records, respectively, before the current record. Note
# that the format of {sec} may be very different from that of
# {curSec}; e.g. {sec="gen.1"} could mean "take the whole text from
# Genesis", i.e. {curSec ~ /{GEN}{c[0-9]+}{v[0-9]+}/}.
#
# After calling {smp_reclassify_token}, the script silently discards
# the record if its new type is "#", "n", or "b"; any new type other
# than those and "a", "p", "s", or "x" is a fatal error.
#
# The remaining "a", "p", "s", or "x" records have their {token}
# adjusted to be suitable for statistical analysis. This may include
# change of encoding, de-capitalization, compound splitting,
# suppression of some letters or tokens, etc.
#
# The script then applies the function
#
# smp_fix_token(smp, sec, type, token)
#
# where {smp} and {sec} are the desired sample and section,
# {type} is the new type ("a", "p", "s", or "x"), and {token} is
# the input token. The procedure should return a cleaned
# copy of {token}, e.g. without capitalization or undesired
# markings. It may split {token} by inserting blanks and/or
# newlines.
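#
# A minimal sketch (the specific cleanups are hypothetical examples):
#
#   function smp_fix_token(smp, sec, type, token)
#   {
#     if (type == "a")
#       { token = tolower(token);  # De-capitalize.
#         gsub(/-/, " ", token);   # Split compounds at hyphens.
#       }
#     return token;
#   }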
#
# A token that gets remapped to "*DELETE*", "*delete*", or all blanks
# will be discarded. The result of {smp_fix_token} is split at blanks
# and newlines, and each field is processed as if it were a separate
# input record, with the same {type} and {loc}.
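#
# For example, if {smp_fix_token} returns "anno domini" for the token
# "Anno-Domini", the fields "anno" and "domini" are processed as two
# separate records, both with the original {type} and {loc}.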
#
# The individual "a" records are then tested with
# {smp_is_good_token(smp, sec, type, token)}, and the {type} is
# changed to "s" if the result is FALSE. The "a", "p", and "s"
# records are then written to the output.
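#
# A minimal sketch (the lowercase-letter test is a hypothetical
# example):
#
#   function smp_is_good_token(smp, sec, type, token)
#   {
#     # Accept only nonempty strings of lowercase letters:
#     return (token ~ /^[a-z]+$/);
#   }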
#
# As for "x"-records, runs of three or more are squeezed, leaving only
# the first and last records of the run. These have their token enclosed
# in braces and marked with "*", e.g. "finis" becomes "*{finis}",
# and their type gets replaced by "s".
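#
# For instance, a run of five "x" tokens "lorem ipsum dolor sit amet"
# would be output as just two "s" records, with tokens "*{lorem}" and
# "..*{amet}".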
#
# The resulting records are written to standard output.
#
# The output is truncated after {maxAlpha} "a"-type tokens have been
# written. The default is to process the whole input file.
#
# INITIALIZATION
#
# If the {table} argument is specified, it must name a file that
# contains pairs of words "{OLD} {NEW}", one pair per line. This
# script will read that file and create an array {wmap} with
# {wmap[OLD] = NEW}. This table may then be used by
# {smp_reclassify_token} and/or {smp_fix_token}.
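#
# For example, a table file might contain (hypothetical entries):
#
#   ihs     iesus    # expand abbreviation
#   vv      uu
#
# Blank lines and "#"-comments in the table file are ignored.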
#
# The "FUNCS.gawk" library must also define a function
#
# smp_define_patterns(smp, sec)
#
# that will be called by this script, after loading the {wmap} table
# (if any) but before processing the first input record. This
# procedure could, for instance, precompile any complicated patterns
# to be used by {smp_reclassify_token}.
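#
# A minimal sketch (the global {goodSecPat} is hypothetical):
#
#   function smp_define_patterns(smp, sec)
#   {
#     # Precompile the section pattern for {smp_reclassify_token}:
#     goodSecPat = ("^[{]" toupper(substr(sec, 1, 3)) "[}]");
#   }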
if (smp == "") { arg_error("must define \"smp\""); }
if (sec == "") { arg_error("must define \"sec\""); }
if (maxAlpha == "") { maxAlpha = -1; }
if (maxRead == "") { maxRead = -1; }
debug = 0;
# debug = 1; maxRead = 200;
s = "???"; n = "???";
curSec = "";
curLin = "";
nExRun = 0; # Number of consecutive "x"-type records generated so far
# Data of last "x"-type output record (not written), if {nExRun >= 2}:
lastExSec = ""; # Location.
lastExLin = ""; # Original line number.
lastExToken = ""; # Token.
nRead = 0; # Total records read.
nWritten = 0; # Total records written to output.
nAlpha = 0; # Total "a"-type records written.
nSymbol = 0; # Total "s"-type records written.
nPunct = 0; # Total "p"-type records written.
nIntrude = 0; # Total "x"-type records written (as "s"-type).
split("", wmap);
if (table != "")
{ # Read word-remapping table, if present.
load_remapping_table(table);
}
if (field == "") { field = 0; }
smp_define_patterns(smp, sec);
}
(abort >= 0) { exit abort; }
/^ *$/ { next; }
# Stop if enough:
((maxRead >= 0) && (nRead >= maxRead)) {
exit 0;
}
($1 ~ /^.$/) {
nRead++;
# Get type tag {type} and body {token}:
type = $1;
if (type == "#")
{ token = substr($0, 3); }
else
{ if (NF != 2) { data_error("bad input format"); }
token = $2;
}
# Dispose accordingly:
if (type == "$")
{ curSec = token; }
else if (type == "@")
{ curLin = token; }
else if (type ~ /^[#nb]/)
{ } # Discard comment, null, and blank-like records.
else if (type ~ /^[aspx]/)
{ if (debug) { printf "@@ %s:[%s]\n", type, token > "/dev/stderr"; }
# Reclassify token and check whether it is inside the desired section:
otype = type;
type = smp_reclassify_token(smp, sec, curSec, curLin, type, token);
if (debug) { printf " -r-> %s:[%s]\n", type, token > "/dev/stderr"; }
if (type ~ /[apsx]/)
{
# Apply sample-specific adjustments:
otoken = token;
token = smp_fix_token(smp, sec, type, token);
if (debug) { printf " -f-> %s:[%s] -> %s:%s\n", type,otoken, type,token > "/dev/stderr"; }
if ((token == "*DELETE*") || (token == "*delete*")) { token = ""; }
# Split into separate tokens at blanks and newlines:
nwds = split(token, wds, /[ \012]+/);
# Write each token separately
for(i = 1; i <= nwds; i++)
{ wdi = wds[i]; tpi = type;
if (wdi != "")
{ if ( tpi == "a" )
{ # Re-classify each piece:
tpi = ( smp_is_good_token(smp, sec, tpi, wdi) ? type : "s" );
}
else
{ tpi = type; }
if (debug) { printf " -g-> %s:[%s]\n", tpi, wdi > "/dev/stderr"; }
output_token(tpi, curSec, curLin, wdi);
}
}
next;
}
else if (type !~ /[#nb]/)
{ data_error(("invalid new type tag \"" type "\"")); }
}
else
{ data_error(("unknown input type tag \"" type "\"")); }
next;
}
// { data_error(("invalid input type tag \"" $1 "\"")); }
END {
if (abort >= 0) { exit abort; }
flush_x_record();
printf " %d records read, %d written (%d alpha, %d symbol, %d punct, %d intrusions)\n", \
nRead, nWritten, nAlpha, nSymbol, nPunct, nIntrude > "/dev/stderr";
}
function output_token(type,aSec,aLin,token, gud)
{
# Outputs token {token} of type {type}, squeezing long runs of "x" tokens.
# Updates {nAlpha}, {nSymbol}, {nPunct}, {nIntrude}, {nWritten}.
# Manages {nExRun}, {lastExSec}, {lastExLin}, {lastExToken}.
# Exits the script (with 0) after writing {maxAlpha} "a"-records.
if (type == "x")
{
# Squeeze runs of "x" records, mark them as bad (type "s"):
token = ("*{" token "}");
type = "s";
if (nExRun == 0)
{ # First in a run of "x"-records, print it:
print fmt_token(type,aSec,aLin,token);
nWritten++; nIntrude++;
nExRun = 1;
}
else
{ # Non-first in a run of "x"-records, save it for now:
lastExSec = aSec; lastExLin = aLin; lastExToken = token;
nExRun++;
}
}
else
{ # Do we have any "x"-record waiting to be printed?
flush_x_record();
# Print the current record:
print fmt_token(type,aSec,aLin,token);
nWritten++;
if (type == "a")
{ nAlpha++;
# Have we written enough stuff:
if ((maxAlpha >= 0) && (nAlpha >= maxAlpha)) { exit 0; }
}
else if (type == "s")
{ nSymbol++; }
else if (type == "p")
{ nPunct++; }
}
}
function flush_x_record()
{
# If there is any "x"-record suspended, print it:
if (nExRun >= 2)
{ if (nExRun >= 3) { lastExToken = ( ".." lastExToken ); }
# Flush last "x"-record:
print fmt_token("s",lastExSec,lastExLin,lastExToken);
nWritten++; nIntrude++;
}
nExRun = 0;
}
function fmt_token(type,aSec, aLin, token)
{
# Formats a token {token} of type {type} for output,
# including {aSec} and {aLin}:
return sprintf("%s %s{%s} %s", type, aSec, aLin, token);
}
function load_remapping_table(file, nMap,lin,fld,nfld,nLin)
{
# Reads a word mapping table from "file", containing pairs
# of the form "ORIGINAL NEW", one pair per line.
# Stores the table in "wmap[ORIGINAL] = NEW".
nMap = 0; nLin = 0;
split("", wmap);
ERRNO = "";
while((getline lin < file) > 0) {
nLin++;
gsub(/^[ ]*/, "", lin);
if (! match(lin, /^([#]|$)/))
{ gsub(/[ ]*[#].*$/, "", lin);
nfld = split(lin, fld, " ");
if (nfld != 2) tbl_error(file, nLin, ("bad table entry = \"" lin "\""));
if (fld[1] in wmap) tbl_error(file, nLin, ("repeated key = \"" lin "\""));
wmap[fld[1]] = fld[2];
nMap++;
}
}
if (ERRNO != "") { arg_error((file ": ERRNO = " ERRNO)); }
close (file);
printf " loaded %6d map pairs\n", nMap > "/dev/stderr";
}
function arg_error(msg)
{
printf "%s\n", msg > "/dev/stderr";
printf "usage: %s\n", usage > "/dev/stderr";
abort = 1;
exit 1;
}
function data_error(msg)
{
printf "line %d: %s\n", FNR, msg > "/dev/stderr";
abort = 1; exit 1;
}
function tbl_error(file, lnum, msg)
{
printf "file %s, line %d: %s\n", file, lnum, msg > "/dev/stderr";
abort = 1; exit 1;
}