#! /usr/bin/gawk -f
# Last edited on 2023-05-10 15:51:27 by stolfi


BEGIN {
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    "  -f LIBRARY.gawk \\\n" \
    "  -v old=OLD.dic \\\n" \
    "  -v new=NEW.dic \\\n" \
    "  -v order=NUM \\\n" \
    "  OUTPUT.tbl " \
  );
  
  # Builds a word-sustitution table from two wordlists, "{old}.dic"
  # and "{new}.dic", by pairing them in the given order. Neither list
  # may contain repetitions. 
  # 
  # If the new list is too long, the excess words are not used. If it
  # is too short, an attempt is made to fabricate additional distinct
  # words by combining pieces of the given ones.
  # 
  # User must provide a word-synthesisizing "monkey function"
  # 
  #  {mky_make_word(wd,pr,nw,ord,avglen,maxlen)}
  # 
  # where {wd[0..nw-1]} are the sample words, {pr[0..nw-1]} are their
  # relative probability weights, {ord} is the Markov chain order, 
  # {avglen} the desired average word length, {maxlen} the maximum
  # word length.
  
  if (old == "")      { arg_error("must define \"old\""); }
  if (new == "")      { arg_error("must define \"new\""); }
  if (order == "")    { arg_error("must define \"order\""); }

  split("", oldwd); split("", oldnm);
  nold = read_dict(old, oldwd, oldnm);
  
  split("", newwd); split("", newnm);
  nnew = read_dict(new, newwd, newnm);

  # Make sure that we have enough new words:
  if (nnew < nold) { extend_new_word_list(); }
  
  # Write the table:
  for (i = 0; i < nold; i++)
    { print oldwd[i], newwd[i]; }
}

function extend_new_word_list(     i,avglen,maxlen,totwt,newpr,mi,wi,norig)
{
  # Extend the `new' wordlist {newwd,newnm,nnew} 
  # until its size is at least {nold}.
  
  # Get the average and maximum length of new words:
  avglen = 0; maxlen = 0; totwt = 0;
  for (i = 0; i < nnew; i++)
    { mi = length(newwd[i]);
      wi = 1.0/(1+i) # Assume Zipf-like probabilities
      if (mi > maxlen) { maxlen = mi; }
      avglen += mi * wi;
      totwt += wi;
    }
  avglen /= totwt;
  
  # Create a Zipf-like distribution:
  split("", newpr);
  for (i = 0; i < nnew; i++) { newpr[i] = 1.0/(1 + i); }

  # Fabricate additional words: 
  norig = nnew;
  for (k = nnew; k < nold; k++) 
    { wd = fabricate_word(newwd, newpr, norig, newnm, order, avglen, maxlen);
      newwd[nnew] = wd; newnm[wd] = nnew;
      nnew++;
    }
}

function fabricate_word(wd,pr,n,seen,order,avglen,maxlen,   word,try)
{
  # Tries to make up a new word {word} by a Shannon monkey of order {order}.
  # Uses only words {wd[0..n-1]} for the process.
  # Retries until {seen[word]} is undefined.
  # Returns word.
  
  try = 0; 
  while (try < 20)
    { word = mky_make_word(wd,pr,n,order,avglen,maxlen);
      try++;
      if ((word != "**FAIL**") && (! (word in seen))) { return word; }
    }
  data_error(("failed to generate a new word in " try " trials"));
}

function read_dict(file,dict,wnum,    n,lin,fld,nfld)
{
  # Reads a list of words from "file", one per line. Stores the words
  # in {dict[0..N-1]}, in the order read, and returns {N}. Also defines
  # {wnum[]} so that {dict[wnum[wd]] = wd} for every word {wd} Fails
  # if there are any duplicate words.
  
  n=0;
  while((getline lin < file) > 0) { 
    gsub(/^[ ]*/, "", lin);
    if (! match(lin, /^([#]|$)/))
      { gsub(/[ ]*[#].*$/, "", lin);
        nfld = split(lin, fld, " ");
        if (nfld != 1) tbl_error(file, ("bad wordlist entry = \"" lin "\""));
        if (fld[1] in dict) tbl_error(file, ("repeated key = \"" lin "\""));
        dict[n] = fld[1];
        wnum[fld[1]] = n;
        n++;
      }
  }
  if (ERRNO != "0") { arg_error((file ": " ERRNO)); }
  close (file);
  if (n == 0) 
    { printf "warning: file %s empty or missing\n", file > "/dev/stderr"; }
  else
    { printf "%s: %6d words\n", file, n > "/dev/stderr"; }
  return n;
}

function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}

function tbl_error(file, msg)
{
  printf "file %s, line %s: %s\n", file, FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}