#! /usr/bin/gawk -f
# Last edited on 2004-02-29 07:01:25 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    " -f LIBRARY.gawk \\\n" \
    " -v old=OLD.dic \\\n" \
    " -v new=NEW.dic \\\n" \
    " -v order=NUM \\\n" \
    " OUTPUT.tbl " \
  );

  # Builds a word-substitution table from two wordlists, "{old}.dic"
  # and "{new}.dic", by pairing them in the given order. Neither list
  # may contain repetitions.
  #
  # If the new list is too long, the excess words are not used. If it
  # is too short, an attempt is made to fabricate additional distinct
  # words by combining pieces of the given ones.
  #
  # User must provide a word-synthesizing "monkey function"
  #
  #   {mky_make_word(wd,pr,nw,ord,avglen,maxlen)}
  #
  # where {wd[0..nw-1]} are the sample words, {pr[0..nw-1]} are their
  # relative probability weights, {ord} is the Markov chain order,
  # {avglen} the desired average word length, {maxlen} the maximum
  # word length.

  if (old == "") { arg_error("must define \"old\""); }
  if (new == "") { arg_error("must define \"new\""); }
  if (order == "") { arg_error("must define \"order\""); }

  split("", oldwd); split("", oldnm);
  nold = read_dict(old, oldwd, oldnm);

  split("", newwd); split("", newnm);
  nnew = read_dict(new, newwd, newnm);

  # Make sure that we have enough new words:
  if (nnew < nold) { extend_new_word_list(); }

  # Write the table:
  for (i = 0; i < nold; i++) { print oldwd[i], newwd[i]; }
}

function extend_new_word_list(  i,k,wd,avglen,maxlen,totwt,newpr,mi,wi,norig)
{
  # Extends the `new' wordlist {newwd,newnm,nnew} in place until its
  # size is at least {nold}, fabricating fresh words with the
  # user-supplied monkey function.
  # ({k} and {wd} are declared as locals here; the original left them
  # global, clobbering any like-named globals.)

  # Get the weighted average and the maximum length of the new words,
  # weighting word {i} by the Zipf-like probability 1/(1+i):
  avglen = 0; maxlen = 0; totwt = 0;
  for (i = 0; i < nnew; i++)
    { mi = length(newwd[i]);
      wi = 1.0/(1+i);  # Assume Zipf-like probabilities.
      if (mi > maxlen) { maxlen = mi; }
      avglen += mi * wi; totwt += wi;
    }
  avglen /= totwt;

  # Create a Zipf-like distribution over the original words:
  split("", newpr);
  for (i = 0; i < nnew; i++) { newpr[i] = 1.0/(1 + i); }

  # Fabricate additional words, sampling only from the {norig}
  # original words (not from previously fabricated ones):
  norig = nnew;
  for (k = nnew; k < nold; k++)
    { wd = fabricate_word(newwd, newpr, norig, newnm, order, avglen, maxlen);
      newwd[nnew] = wd; newnm[wd] = nnew; nnew++;
    }
}

function fabricate_word(wd,pr,n,seen,order,avglen,maxlen,  word,try)
{
  # Tries to make up a new word {word} by a Shannon monkey of order {order}.
  # Uses only words {wd[0..n-1]} for the process.
  # Retries until {seen[word]} is undefined.
  # Returns the word; aborts after 20 failed attempts.
  try = 0;
  while (try < 20)
    { word = mky_make_word(wd,pr,n,order,avglen,maxlen);
      try++;
      if ((word != "**FAIL**") && (! (word in seen))) { return word; }
    }
  data_error(("failed to generate a new word in " try " trials"));
}

function read_dict(file,dict,wnum,  n,lin,fld,nfld)
{
  # Reads a list of words from "file", one per line, ignoring blank
  # lines and "#" comments. Stores the words in {dict[0..N-1]}, in the
  # order read, and returns {N}. Also defines {wnum[]} so that
  # {dict[wnum[wd]] = wd} for every word {wd}. Fails if there are any
  # duplicate words.
  n = 0;
  while ((getline lin < file) > 0)
    { gsub(/^[ ]*/, "", lin);
      if (! match(lin, /^([\#]|$)/))
        { gsub(/[ ]*[\#].*$/, "", lin);
          nfld = split(lin, fld, " ");
          if (nfld != 1) tbl_error(file, ("bad wordlist entry = \"" lin "\""));
          # Duplicates must be checked against the word-keyed map
          # {wnum}, not {dict}, whose keys are the numeric indices:
          if (fld[1] in wnum) tbl_error(file, ("repeated key = \"" lin "\""));
          dict[n] = fld[1]; wnum[fld[1]] = n; n++;
        }
    }
  # Modern gawk leaves {ERRNO} as "" on success, older versions as 0;
  # treat both as success so a clean read is not mistaken for an error:
  if ((ERRNO != "") && (ERRNO != "0")) { arg_error((file ": " ERRNO)); }
  close (file);
  if (n == 0)
    { printf "warning: file %s empty or missing\n", file > "/dev/stderr"; }
  else
    { printf "%s: %6d words\n", file, n > "/dev/stderr"; }
  return n;
}

function arg_error(msg)
{
  # Prints {msg} and the usage summary to stderr, then aborts.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1;
}

function data_error(msg)
{
  # Prints {msg}, tagged with the current input line number, to
  # stderr, then aborts.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}

function tbl_error(file, msg)
{
  # Prints {msg}, tagged with {file} and the current line number, to
  # stderr, then aborts.
  printf "file %s, line %s: %s\n", file, FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}