#! /usr/bin/gawk -f
# Last edited on 2023-05-10 15:51:27 by stolfi
BEGIN {
abort = -1;
usage = ( ARGV[0] "\\\n" \
" -f LIBRARY.gawk \\\n" \
" -v old=OLD.dic \\\n" \
" -v new=NEW.dic \\\n" \
" -v order=NUM \\\n" \
" OUTPUT.tbl " \
);
  # Builds a word-substitution table from two wordlists, the files
  # {old} and {new}, by pairing their entries position by position,
  # in the order read.  Neither list may contain repeated words.
#
# If the new list is too long, the excess words are not used. If it
# is too short, an attempt is made to fabricate additional distinct
# words by combining pieces of the given ones.
#
  # The user must provide a word-synthesizing "monkey function"
#
# {mky_make_word(wd,pr,nw,ord,avglen,maxlen)}
#
# where {wd[0..nw-1]} are the sample words, {pr[0..nw-1]} are their
# relative probability weights, {ord} is the Markov chain order,
# {avglen} the desired average word length, {maxlen} the maximum
# word length.
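  #
  # Example invocation (the script, library, and ".dic" file names
  # below are hypothetical):
  #
  #   subst-tbl.gawk \
  #     -f my-monkey-lib.gawk \
  #     -v old=plain.dic -v new=fancy.dic -v order=2 \
  #     > plain-to-fancy.tbl
  #
  # A commented, illustrative sketch of one possible {mky_make_word}
  # (under the hypothetical name {mky_make_word_sketch}) is appended
  # at the end of this file; the working definition must come from
  # the "-f LIBRARY.gawk" library.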
if (old == "") { arg_error("must define \"old\""); }
if (new == "") { arg_error("must define \"new\""); }
if (order == "") { arg_error("must define \"order\""); }
split("", oldwd); split("", oldnm);
nold = read_dict(old, oldwd, oldnm);
split("", newwd); split("", newnm);
nnew = read_dict(new, newwd, newnm);
# Make sure that we have enough new words:
if (nnew < nold) { extend_new_word_list(); }
# Write the table:
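  # (each output line is "OLDWORD NEWWORD", in the order of the old list)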
for (i = 0; i < nold; i++)
{ print oldwd[i], newwd[i]; }
}
function extend_new_word_list( i,wd,avglen,maxlen,totwt,newpr,mi,wi,norig)
{
# Extend the `new' wordlist {newwd,newnm,nnew}
# until its size is at least {nold}.
# Get the average and maximum length of new words:
avglen = 0; maxlen = 0; totwt = 0;
for (i = 0; i < nnew; i++)
{ mi = length(newwd[i]);
      wi = 1.0/(1+i);  # Assume Zipf-like probabilities
if (mi > maxlen) { maxlen = mi; }
avglen += mi * wi;
totwt += wi;
}
avglen /= totwt;
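  # {avglen} is now the probability-weighted mean length of the samples.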
# Create a Zipf-like distribution:
split("", newpr);
for (i = 0; i < nnew; i++) { newpr[i] = 1.0/(1 + i); }
# Fabricate additional words:
  norig = nnew;  # use only the genuine new words as the monkey samples
  while (nnew < nold)
    { wd = fabricate_word(newwd, newpr, norig, newnm, order, avglen, maxlen);
      newwd[nnew] = wd; newnm[wd] = nnew;
      nnew++;
    }
}
function fabricate_word(wd,pr,n,seen,order,avglen,maxlen, word,try)
{
  # Tries to make up a new word {word} with a Shannon monkey of order
  # {order}, using only the sample words {wd[0..n-1]} with weights
  # {pr[0..n-1]}.  Retries (up to 20 times) until {seen[word]} is
  # undefined.  Returns that word, or aborts with an error.
try = 0;
while (try < 20)
{ word = mky_make_word(wd,pr,n,order,avglen,maxlen);
try++;
if ((word != "**FAIL**") && (! (word in seen))) { return word; }
}
data_error(("failed to generate a new word in " try " trials"));
}
function read_dict(file,dict,wnum, n,lin,fld,nfld,r)
{
# Reads a list of words from "file", one per line. Stores the words
# in {dict[0..N-1]}, in the order read, and returns {N}. Also defines
  # {wnum[]} so that {dict[wnum[wd]] = wd} for every word {wd}.  Blank
  # lines and '#'-comments are ignored.  Fails on duplicate words.
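  #
  # A valid input file might look like this (hypothetical contents):
  #
  #   # one word per line; '#' starts a comment
  #   alpha
  #   beta   # trailing comments and surrounding blanks are stripped
  #   gamma
  #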
n=0;
  while((r = (getline lin < file)) > 0) {
gsub(/^[ ]*/, "", lin);
if (! match(lin, /^([#]|$)/))
{ gsub(/[ ]*[#].*$/, "", lin);
nfld = split(lin, fld, " ");
if (nfld != 1) tbl_error(file, ("bad wordlist entry = \"" lin "\""));
if (fld[1] in dict) tbl_error(file, ("repeated key = \"" lin "\""));
dict[n] = fld[1];
wnum[fld[1]] = n;
n++;
}
}
if (ERRNO != "0") { arg_error((file ": " ERRNO)); }
close (file);
if (n == 0)
{ printf "warning: file %s empty or missing\n", file > "/dev/stderr"; }
else
{ printf "%s: %6d words\n", file, n > "/dev/stderr"; }
return n;
}
function arg_error(msg)
{
printf "%s\n", msg > "/dev/stderr";
printf "usage: %s\n", usage > "/dev/stderr";
abort = 1;
exit 1;
}
function data_error(msg)
{
printf "line %d: %s\n", FNR, msg > "/dev/stderr";
abort = 1; exit 1;
}
function tbl_error(file, msg)
{
printf "file %s, line %s: %s\n", file, FNR, msg > "/dev/stderr";
abort = 1; exit 1;
}