#! /bin/gawk -f
# Last edited on 2025-04-29 20:36:27 by stolfi
BEGIN {
  # A negative {abort} means "no error so far"; the error functions set it to
  # the desired exit status before calling {exit}.
  abort = -1;
  usage = ( \
    "cat INFILE.txt \\\n" \
    " | roman-code-words -f roman-code-fns.gawk \\\n" \
    " [ -v table=TBLNAME ] \\\n" \
    " [ -v honorCase=BOOL ] \\\n" \
    " [ -v maxNumWords=NUMWDS [ -v maxCode=MAXCODE ] ] \\\n" \
    " > OUTFILE.rcd" \
  );

  # Each line of the input must contain zero or more words separated
  # by spaces. Maps each plain-word in the input to a numeric code,
  # then encodes the latter with a Roman-like number system, and
  # writes the resulting code-word.
  #
  # The file TBLNAME, which defaults to empty, should contain
  # a list of PLAINWORD CODEWORD pairs, one per line; it is used to map
  # plain-words to code-words directly. In any case, each input word
  # that is not found in the table gets assigned the next available
  # code number, and this assignment is entered in the table.
  #
  # By default, the valid numeric codes are sequential, starting from 1.
  # However, if "maxNumWords" is specified, the valid numeric codes
  # are spaced further apart so that NUMWDS of them will span the
  # range [1..MAXNUMCODE] of valid numeric codes. The value
  # of MAXNUMCODE defaults to the maximum allowed by the encoding,
  # but may be set explicitly by giving the "maxCode" parameter
  # (an *encoded* number). NUMWDS must be a positive number.
  #
  # If {honorCase} is set (default), the script will attempt to copy
  # the capitalization of the input words into the code. I.e. if the
  # input word is "Mars", and "foo" is the next available code word,
  # then "foo" is assigned to "mars" and "Foo" to "Mars". This rule
  # also applies to pairs pre-defined in the initial table. Otherwise
  # "Mars" and "mars" are considered distinct words and get distinct
  # codes.
  #
  # In any case, the completed encoding table will be written at the end
  # of the output text, as a bunch of "#"-comments, bracketed by
  # the heading "# BEGIN DICTIONARY" and "# END DICTIONARY"

  if (honorCase == "") { honorCase = 1; }

  # {renc_init}, {renc_decode}, {renc_max_code} and {renc_max_num} are not
  # defined in this file; presumably they come from "roman-code-fns.gawk"
  # (see {usage}) -- TODO confirm.
  renc_init();
  printf "maximum code = %s (%d)\n", \
    renc_max_code, renc_max_num > "/dev/stderr";

  if (maxNumWords == "")
    { code_increment = 1; }
  else
    { # Reject zero, negative and non-numeric values up front: they would
      # cause a division by zero (or a meaningless increment) below.
      if ((maxNumWords + 0) < 1)
        { arg_error(("given \"maxNumWords\" must be a positive number")); }
      printf "assuming max %d distinct words\n", maxNumWords > "/dev/stderr";
      if (maxCode == "")
        { maxNumCode = renc_max_num; }
      else
        { maxNumCode = renc_decode(maxCode);
          if (maxNumCode > renc_max_num)
            { arg_error(("given \"maxCode\" is too big")); }
        }
      printf "adjusting increment for max code = %s (%d)\n", maxCode, maxNumCode > "/dev/stderr";
      if (maxNumWords > maxNumCode)
        { arg_error(("given \"maxNumWords\" is too big")); }
      # May be fractional; {encode_word} truncates after adding it.
      code_increment = maxNumCode/maxNumWords;
    }
  printf "code increment = %d\n", code_increment > "/dev/stderr";

  # Start with an empty dictionary and no codes assigned.
  split("", dic);
  num_distinct_words = 0;
  max_assigned_num_code = 0;
  max_assigned_code = "";

  if (table != "") { load_table(table); }
}
# Once {abort} is non-negative, stop processing input and exit with that status.
abort >= 0 {
  exit abort;
}
# Lines that are empty, all spaces, or whose first non-space character is
# "#" or "@" are copied to the output unchanged.
/^ *([#@]|$)/ {
  print $0;
  next;
}
# Every remaining line: replace each field by its code word and print the
# result. Assigning to a field makes awk rebuild the record from its fields.
{
  i = 1;
  while (i <= NF)
    { wd = $(i);
      $(i) = encode_word(wd);
      i++;
    }
  print $0;
  next;
}
# The following (disabled) alternative is OK for texts in the Latin alphabet:
#
# // {
# lin = $0; res = ""; wd = "";
# m = length(lin);
# for (i = 1; i <= m; i++)
# { c = substr(lin,i,1);
# if (is_alpha(c))
# { w = (w c); }
# else
# { if (w != "") { res = (res encode_word(w)); w = ""; }
# res = (res c);
# }
# }
# if (w != "") { res = (res encode_word(w)); w = ""; }
# print res;
# }
#
# function is_alpha(c)
# {
# c = tolower(c);
# return \
# ((c >= "a") && (c <= "z")) || \
# ((c >= "ß") && (c <= "ö")) || \
# ((c >= "ø") && (c <= "ÿ"));
# }
END {
  # In awk, an {exit} executed in BEGIN or in a main rule still runs the END
  # rule. If an error function has set {abort}, leave at once with that status
  # instead of printing statistics and a (possibly partial) dictionary.
  if (abort >= 0) { exit abort; }

  # Summary statistics go to the error output.
  printf "%d distinct words seen\n", num_distinct_words > "/dev/stderr";
  printf "maximum code used = %s (%d)\n", max_assigned_code, max_assigned_num_code > "/dev/stderr";

  # Dump the complete table to the standard output as "#"-comments.
  # The traversal order of {dic} is whatever awk chooses.
  printf "# BEGIN DICTIONARY\n";
  for (w in dic)
    { printf "# %s %s\n", w, dic[w]; }
  printf "# END DICTIONARY\n";
}
function encode_word(word,   w,num,code,pfx,sfx)
{
  # Returns the code word for the plain-word {word}, assigning a new
  # code (and recording it in the global table {dic}) if {word} has not
  # been seen before. An empty {word} is returned unchanged.
  #
  # When {honorCase} is true, the table is keyed by the lowercase form
  # of {word}, and the capitalization of {word} is copied onto the code.
  #
  # Updates the globals {dic}, {num_distinct_words},
  # {max_assigned_num_code} and {max_assigned_code}.

  # Empty is not a word:
  if (word == "") { return word; }

  # Lookup word in table:
  w = (honorCase ? tolower(word) : word);
  if (w in dic)
    { code = dic[w]; }
  else
    { num_distinct_words++;
      # {code_increment} may be fractional; truncate to get an integer code.
      num = int(max_assigned_num_code + code_increment);
      if (num <= max_assigned_num_code) { prog_error(("num code bug")); }
      code = renc_encode(num);
      max_assigned_num_code = num;
      max_assigned_code = code;
      dic[w] = code;
    }

  if (honorCase && (word != w))
    { # The original {word} was not in lowercase.
      # Try to preserve its capitalization in the code, if possible.
      # Note that some codes have non-alpha prefixes.
      if (word ~ /[A-Z].*[A-Z]/)
        { # Two or more capitals: assume all-caps.
          code = toupper(code);
        }
      else
        { # Assume initial-caps: capitalize the first lowercase letter
          # of the code, leaving any prefix before it untouched.
          if (match(code, /[a-z]/))
            { pfx = substr(code,1,RSTART-1);
              sfx = substr(code,RSTART);
              code = (pfx toupper(substr(sfx,1,1)) substr(sfx,2));
            }
        }
      # We assign the modified code to {word} too, for
      # the benefit of dumber programs who may want to use our table
      dic[word] = code;
    }
  return code;
}
function load_table(fname,   ntbl,nbadtbl,nlin,lin,fld,nfld,word,code,num)
{
  # Loads the mapping table {dic} from file {fname}.
  #
  # Each line that is not blank and not a "#"-comment must contain
  # exactly two fields, PLAINWORD and CODEWORD, optionally followed by a
  # "#"-comment. When {honorCase} is true, both fields are folded to
  # lowercase before being stored.
  #
  # Also updates {max_assigned_num_code} and {max_assigned_code} to the
  # largest code found in the table, so that the two stay consistent.
  # Codes for which {renc_decode} returns -1 are kept in {dic} but only
  # counted and reported as invalid.
  #
  # Fails through {tbl_error} on a malformed or repeated entry or on a
  # read error, and through {arg_error} if no line could be read.

  max_assigned_num_code = 0;
  max_assigned_code = "";
  ntbl = 0;
  nbadtbl = 0;
  nlin = 0;
  # Cleared so that a non-empty value after the loop means a read failure.
  ERRNO = "";
  while ((getline lin < fname) > 0)
    { nlin++;
      # Skip blank lines and lines that are entirely a comment:
      if (! match(lin, /^[ \011]*([#]|$)/))
        { # Strip any trailing comment before splitting into fields:
          gsub(/[#].*$/, "", lin);
          nfld = split(lin, fld, " ");
          if (nfld != 2)
            { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
          word = fld[1]; code = fld[2];
          if (honorCase) { word = tolower(word); code = tolower(code); }
          if (word in dic)
            { tbl_error(fname, nlin, ("repeated plainword = \"" lin "\"")); }
          dic[word] = code;
          num = renc_decode(code,-1);
          if (num == -1)
            { nbadtbl++; }
          else if (num > max_assigned_num_code)
            { max_assigned_num_code = num;
              max_assigned_code = code;
            }
          ntbl++;
        }
    }
  if (ERRNO != "") { tbl_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "loaded %6d pairs from %s\n", ntbl, fname > "/dev/stderr";
  if (nbadtbl > 0)
    { printf "!!! warning - file %s contains %d invalid word codes\n", \
        fname, nbadtbl > "/dev/stderr";
    }
}
function arg_error(msg)
{
  # Reports a command-line argument problem: writes {msg} and the
  # {usage} text to the error output, then exits with status 1.
  abort = 1;
  printf("%s\nusage: %s\n", msg, usage) > "/dev/stderr";
  exit abort;
}
function tbl_error(f,n,msg)
{
  # Reports a problem at line {n} of table file {f}: writes {msg},
  # tagged with the file name and line number, to the error output,
  # then exits with status 1.
  abort = 1;
  printf("file %s, line %d: %s\n", f, n, msg) > "/dev/stderr";
  exit abort;
}
function data_error(msg)
{
  # Reports a problem in the input data: writes {msg}, tagged with the
  # current value of FNR, to the error output, then exits with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}