#! /usr/bin/gawk -f
# Last edited on 2012-05-05 20:08:36 by stolfilocal

BEGIN \
  {
    abort = -1;
    usage = ( ARGV[0] "\\\n" \
      "  -v num=NUM \\\n" \
      "  < INPUT.tks > OUTPUT.wfr " \
    );

    # Reads a stream of tokens, one per line.  Writes a new stream
    # where some consecutive word pairs have been combined into
    # hyphenated prefix-suffix compounds so as to make {num} distinct
    # words in total.  Tries to achieve a Zipfian distribution of the
    # compounds.  (If the input stream already has {num} or more
    # distinct words, the same mechanism is used in reverse, merging
    # words until only {num} distinct words remain.)
    #
    # The hyphenated compounds are such that they can be readily
    # recognised even if the output text is scrambled and the hyphens
    # are deleted.  That is ensured by classifying each input word as
    # either "isolated", "prefix", or "suffix", and using only
    # compounds {u-v} where
    #
    #   {u} is a prefix and {v} is not a prefix, or
    #   {u} is not a suffix and {v} is a suffix.
    #
    # Then, given any text formed from the new lexicon and
    # de-hyphenated, we can recover the hyphens by simply locating
    # adjacent pairs {u v} where the above conditions hold.  Note that
    # scrambling the words before removing the hyphens cannot create
    # such a pair.
    #
    # The choice of prefixes and suffixes is constrained by the
    # requirement that the input text remain a legal text for the new
    # lexicon; i.e.,
    #
    #   if {u v} occurs in the input, then {u} and {v} cannot be
    #   both prefixes or both suffixes;
    #
    #   if {u v w} occurs in the input, then either {u} is not a
    #   prefix, or {w} is not a suffix.
    #
    # These conditions are the "coding invariant".  The input file is
    # considered circular for the purposes of these invariants.
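    #
    # A hypothetical illustration (the words here are invented, not
    # taken from any input): suppose "qo" has been made a prefix, so
    # that it occurs in the output only inside compounds such as
    # "qo-kedy" or "qo-tol", never alone.  After scrambling and hyphen
    # removal, every bare "qo" must therefore be the first half of
    # some compound, and the hyphen can be re-inserted between "qo"
    # and the word that follows it; scrambling whole words cannot
    # manufacture a spurious "qo"-plus-word pair, precisely because
    # "qo" never stands alone.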

    if (num == "") { arg_error(("must define \"num\"")); }
    if (num < 1) { arg_error(("invalid num = \"" num "\"")); }

    split("", tk);        # Input token stream.
    ntk = 0;              # Number of input tokens.

    split("", wd);        # The distinct words.
    split("", wx);        # {wd[wx[w]] = w} for every word {w}.
    nwd = 0;              # Number of distinct words.

    # Word and pair counts:
    split("", wct);       # {wct[w]} is the count of word {w}.
    split("", pct);       # {pct[a,b]} is the count of pair {.. a b ..}.

    u = "";               # Prev-previous word in input stream, or "" if none.
    v = "";               # Previous word in input stream, or "" if none.

    # Predecessors and successors:
    split("", npred);     # {npred[w]} is the number of distinct predecessors of {w}.
    split("", pred);      # {pred[w,k]} is predecessor {k} of word {w}.
    split("", nsucc);     # {nsucc[w]} is the number of distinct successors of {w}.
    split("", succ);      # {succ[w,k]} is successor {k} of word {w}.

    # Potential prefix/suffix conflicts:
    split("", incps);     # {incps[u,w]} is set if {u v w} occurs in the input for
                          # some {v}.  (Recorded but not currently consulted.)
  }

(abort >= 0) { exit abort; }

# Discard comments and blank lines:
/^ *([\#]|$)/ { next; }

# Process one more word:
(NF == 1) \
  {
    w = $1;
    # Save token:
    tk[ntk] = w; ntk++;
    # Save/count word:
    if (w in wct)
      { wct[w]++; }
    else
      { wct[w] = 1;
        wd[nwd] = w; wx[w] = nwd; nwd++;
        npred[w] = 0; nsucc[w] = 0;
      }
    # Count the {v w} pair and the {u v w} triple:
    if (v != "") { process_pair(v, w); }
    if (u != "") { process_triple(u, w); }
    u = v; v = w;
    next;
  }

# Bad words:
// { data_error(("bad word = [" $0 "]")); }

END \
  {
    if (abort >= 0) { exit abort; }
    printf "read %d tokens, %d words\n", ntk, nwd > "/dev/stderr";
    # Close the circular pair and triple counts:
    if (ntk >= 2)
      { process_pair(v, tk[0]);
        process_triple(u, tk[0]);
        process_triple(v, tk[1]);
      }
    else if (ntk >= 1)
      { process_pair(v, v);
        process_triple(v, v);
      }

    # Word classification:
    split("", type);       # ">" = prefix, "<" = suffix, "-" = compound, undefined otherwise.

    # Working markers:
    split("", used_pref);  # {used_pref[w]} set means {w} is prefix in some compound.
    split("", used_suff);  # {used_suff[w]} set means {w} is suffix in some compound.

    if (nwd < num)
      { compound_words(+1); }
    else
      { compound_words(-1); }
    write_hyphenated_stream();
  }

function process_pair(v, w)
  {
    if ((v,w) in pct)
      { pct[v,w]++; }
    else
      { pct[v,w] = 1;
        # Save new successor of {v} and new predecessor of {w}:
        succ[v,nsucc[v]] = w; nsucc[v]++;
        pred[w,npred[w]] = v; npred[w]++;
      }
  }

function process_triple(u, w)
  {
    incps[u,w] = 1;
  }

function compound_words(goal)
  {
    # Tries to form new compounds that increase (if {goal > 0}) or
    # decrease (if {goal < 0}) the size of the lexicon.
    #
    # We use a simple heuristic: repeatedly select a word {w} and try
    # "fixating" it.  If the attempt succeeds, update the {wd} and
    # {wct} vectors.  Repeat until reaching the desired number of
    # words, or until giving up.
    #
    # "Fixating" a word {w} means making it into an obligatory prefix
    # ("prefixating") or an obligatory suffix ("suffixating").
    # Because of the coding invariant, we can prefixate {w} only if
    #
    #   0. {w} has not been fixated already, i.e. {!(w in type)}.
    #   1. {w} has not been used as a suffix already, i.e. {!(w in used_suff)}.
    #   2. {w w} does not occur in the input, i.e. {pct[w,w] == 0}.
    #   3. For every successor {v} of {w} in the input,
    #      3.1. {v} has not been used as a prefix, i.e. {!(v in used_pref)}.
    #
    # During this process, {wct[w]} is meaningful only if {w} has not
    # yet been prefixated or suffixated; in that case it is the number
    # of occurrences of {w} in the input that are still not part of
    # compounds.  (On the other hand, {pct[v,w]} is always the number
    # of {v w} pairs in the original input sequence.)
    #
    # The effect of prefixating a word {w} is therefore:
    #
    #   * The word {w} disappears from the lexicon.
    #
    #   * For each successor {v} of {w} that has not been suffixated,
    #     the count {wct[v]} gets decremented by {pct[w,v]}.  If the
    #     result is zero, the word {v} is deleted from the lexicon.
    #
    #   * The new words {w-v} are added to the lexicon, with counts
    #     {pct[w,v]}.
    #
    # Moreover, the fixating is an improvement only if these changes
    # add up to a net increase (if {goal > 0}) or decrease (if
    # {goal < 0}) in the size of the lexicon.
    #
    # When scanning a word {w}, the heuristic's main loop must
    # therefore try several possibilities: forward or backward
    # fixating of {w} itself, forward fixating of any of {w}'s
    # predecessors, or backward fixating of any of {w}'s successors.
    # If several of these apply, the procedure should choose one of
    # them somehow.

    # Main loop.  Note that {nwd} may increase or decrease over time:
    while (goal*(num - nwd) > 0) { fixate_some_word(goal); }
  }
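
# A hypothetical illustration of the net effect computed by
# {fixating_effect} below (the words and counts here are invented):
# prefixating a word {w} whose distinct successors are {a} and {b}
# removes {w} itself (delta = -1) and creates the compounds "w-a" and
# "w-b" (delta += 2), for a net gain of one word, provided {a} and
# {b} still occur outside those compounds.  If instead every
# occurrence of {a} follows {w} (that is, {wct[a] <= pct[w,a]}), then
# {a} disappears as well (delta -= 1) and the net gain drops to zero.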

function fixate_some_word(goal,   cdw,dir,eff,nc,w,ew,k,ic,tef,pef)
  {
    # Tries to combine some word with one of its neighbors so as to
    # move the lexicon size toward {num}.  If it succeeds, updates
    # {wct}, {wd}, {wx}, and the marker tables; if no candidate is
    # left, aborts with an error.
    printf " %d words", nwd > "/dev/stderr";
    # Gather candidate words and directions {(cdw[k],dir[k])}, {k = 0..nc-1}:
    split("", cdw);
    split("", dir);
    split("", eff);
    nc = 0;
    tef = 0;   # Sum of {eff[ic]}.
    for (k = 0; k < nwd; k++)
      { w = wd[k];
        if (!(w in type))
          { ew = goal*fixating_effect(w, ">");
            if (ew > 0) { cdw[nc] = w; dir[nc] = ">"; eff[nc] = ew; tef += ew; nc++; }
            ew = goal*fixating_effect(w, "<");
            if (ew > 0) { cdw[nc] = w; dir[nc] = "<"; eff[nc] = ew; tef += ew; nc++; }
          }
      }
    printf " %d candidates", nc > "/dev/stderr";
    if (nc == 0)
      { prog_error("no candidates left"); }
    else
      { # Select a random candidate, with probability proportional to
        # its effect {eff[ic]} (a linear roulette-wheel scan):
        pef = rand()*tef;
        ic = 0;
        while ((ic < nc) && (pef > eff[ic])) { pef -= eff[ic]; ic++; }
        if (ic >= nc) { ic = nc-1; }
        printf " choose cdw[%d] = %s (%s)\n", ic, cdw[ic], dir[ic] > "/dev/stderr";
        fixate_word(cdw[ic], dir[ic]);
      }
  }

function fixating_effect(w,dir,   k,delta,nn,vk,nvk,usk)
  {
    # Returns the change in lexicon size that would result from
    # fixating {w} in direction {dir}, i.e. prefixating {w} if
    # {dir == ">"}, suffixating it if {dir == "<"}.  If the fixating
    # is not possible, returns 0.

    # We cannot do it if {w} has already been fixated in either direction:
    if (w in type) { return 0; }
    # We cannot do it if {w} follows itself:
    if ((w,w) in pct) { return 0; }
    # We cannot do it if {w} has been used the wrong way:
    if ((dir == ">") && (w in used_suff)) { return 0; }
    if ((dir == "<") && (w in used_pref)) { return 0; }
    # If fixating is possible, this word will disappear from the lexicon:
    delta = -1;
    # Check whether the neighbors of {w} are available for fixating:
    nn = (dir == ">" ? nsucc[w] : npred[w]);
    for (k = 0; k < nn; k++)
      { vk = (dir == ">" ? succ[w,k] : pred[w,k]);
        # We cannot fixate {w} if {vk} has been combined in that direction too:
        usk = (dir == ">" ? (vk in used_pref) : (vk in used_suff));
        if (usk) { return 0; }
        # Fixating will affect only neighbors that aren't fixated already:
        if (vk in type)
          { if (type[vk] == dir) { prog_error("inconsistent tables 1"); } }
        else
          { nvk = (dir == ">" ? pct[w,vk] : pct[vk,w]);
            if (nvk > 0) { delta++; }         # The compound of {w} and {vk} is new.
            if (wct[vk] <= nvk) { delta--; }  # All occurrences of {vk} get absorbed.
          }
      }
    return delta;
  }

function fixate_word(w,dir,   k,nn,vk,nvk,wvk)
  {
    # Fixates {w} in direction {dir}, i.e. prefixates {w} if
    # {dir == ">"}, suffixates it if {dir == "<"}.  Assumes that the
    # fixating is possible.
    type[w] = dir;
    # Record the use of {w}:
    if (dir == ">") { used_pref[w] = 1; }
    if (dir == "<") { used_suff[w] = 1; }
    # Remove the word from the lexicon:
    delete_word(w);
    # Join {w} with its neighbors that haven't been fixated yet:
    nn = (dir == ">" ? nsucc[w] : npred[w]);
    for (k = 0; k < nn; k++)
      { vk = (dir == ">" ? succ[w,k] : pred[w,k]);
        # Record the use of {vk}:
        if (dir == ">") { used_suff[vk] = 1; }
        if (dir == "<") { used_pref[vk] = 1; }
        # Combine with neighbors that haven't been fixated already:
        if (vk in type)
          { if (type[vk] == dir) { prog_error("inconsistent tables 2"); } }
        else
          { nvk = (dir == ">" ? pct[w,vk] : pct[vk,w]);
            if (nvk > 0)
              { wvk = (dir == ">" ? w "-" vk : vk "-" w);
                wd[nwd] = wvk; wx[wvk] = nwd; nwd++;
                wct[wvk] = nvk;
                wct[vk] -= nvk;
                if (wct[vk] == 0) { delete_word(vk); }
              }
          }
      }
  }
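
# The deletion scheme below keeps {wd[0..nwd-1]} dense by moving the
# last word into the slot of the deleted one, so each deletion costs
# O(1) table operations.  For example (indices invented): deleting
# the word at {wd[2]} when {nwd == 5} copies {wd[4]} into slot 2,
# fixes {wx} for the moved word, and decrements {nwd} to 4.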

function delete_word(w)
  {
    # Removes {w} from the lexicon tables {wd}, {wx}, {wct}.
    wct[w] = 0;
    nwd--;
    if (wx[w] < nwd)
      { wd[wx[w]] = wd[nwd]; wx[wd[nwd]] = wx[w]; }
    delete wx[w];
  }

function write_hyphenated_stream(   k,v,tv,vjoin,w,tw,wjoin)
  {
    if (ntk == 0) { return; }
    v = tk[0]; tv = type[v];
    if (ntk == 1)
      { if ((tv == ">") || (tv == "<")) { prog_error("compounded token with self"); }
        print v;
        return;
      }
    # Does the first token join (circularly) with the last one?
    vjoin = ((tv == "<") || (type[tk[ntk-1]] == ">"));
    for (k = 1; k <= ntk; k++)
      { w = tk[k % ntk]; tw = type[w];
        if ((tv == ">") || (tw == "<"))
          { # {v} and {w} form a compound:
            if (vjoin) { prog_error("conflict in compounding"); }
            print (v "-" w);
            wjoin = 1;
          }
        else
          { # {v} stands alone, unless it was already printed as part
            # of the previous compound:
            if (! vjoin) { print v; }
            wjoin = 0;
          }
        v = w; tv = tw; vjoin = wjoin;
      }
  }

function arg_error(msg)
  {
    printf "%s\n", msg > "/dev/stderr";
    printf "usage: %s\n", usage > "/dev/stderr";
    abort = 1; exit 1;
  }

function prog_error(msg)
  {
    printf "%s\n", msg > "/dev/stderr";
    abort = 1; exit 1;
  }
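
# The "bad words" rule above calls {data_error}, which is not defined
# elsewhere in this listing.  A minimal sketch, modeled on
# {prog_error} with the offending input line number added:
function data_error(msg)
  {
    printf "line %d: %s\n", FNR, msg > "/dev/stderr";
    abort = 1; exit 1;
  }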