#! /usr/bin/gawk -f
# Last edited on 2023-05-10 15:48:22 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    "  -v num=NUM \\\n" \
    "  < INPUT.tks > OUTPUT.wfr " \
  );
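
  # Example invocation (the file names are hypothetical):
  #
  #   gawk -f this-script.gawk -v num=5000 < story.tks > story.wfr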
  
  # Reads a stream of tokens, one per line. Writes a new stream where
  # some consecutive word pairs have been combined into hyphenated
  # prefix-suffix compounds so as to make {num} distinct words in
  # total. Tries to achieve a Zipfian distribution of the compounds.
  # (If the input stream already has {num} or more distinct words, the
  # output is equal to the input, truncated just before the {num+1}th
  # new word.)
  # 
  # The hyphenated compounds are such that they can be readily
  # recognised even if the output text is scrambled and the hyphens
  # are deleted. That is ensured by classifying each input word 
  # as either "isolated", "prefix", or "suffix",
  # and using only compounds {u-v} where 
  # 
  #   {u} is a prefix and {v} is not a prefix, or
  #   {u} is not a suffix and {v} is a suffix.
  #   
  # Then, given any text formed from the new lexicon and
  # de-hyphenated, we can recover the hyphens by simply 
  # locating adjacent pairs {u v} where the above conditions
  # hold. Note that scrambling the words before removing the 
  # hyphens cannot create such a pair. 
  # 
  # The choice of prefixes and suffixes is constrained by the 
  # desire that the input text remains a legal text for the new
  # lexicon; i.e., 
  #   
  #    if {u v} occurs in the input, then {u} and {v} cannot
  #    be both prefixes or both suffixes;
  #    
  #    if {u v w} occurs in the input, then either {u} is not
  #    prefix, or {w} is not suffix. 
  #
  # These conditions are the "coding invariant". The input file 
  # is considered circular for the purposes of these invariants.
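  #
  # For illustration (hypothetical words): suppose {qo} is a prefix
  # and {dy} is a suffix, so that {qo-kee} and {chol-dy} are valid
  # compounds. Scrambling them to "chol-dy qo-kee" and deleting the
  # hyphens gives "chol dy qo kee"; the only adjacent pairs that
  # satisfy the conditions above are {chol dy} and {qo kee}, so
  # exactly the original compounds are recovered.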
  
  if (num == "") { arg_error(("must define \"num\"")); }
  if (num < 1) { arg_error(("invalid num = \"" num "\"")); }
  
  split("", tk); # Input token stream
  ntk = 0;       # Number of input tokens.
  
  split("", wd); # The distinct words
  split("", wx); # {wd[wx[w]] = w} for every {w}
  nwd = 0;
  
  # Word and pair counts:
  split("", wct); # {wct[w]} is the count of word {w}.
  split("", pct); # {pct[a,b]} is the count of pair {.. a b ..}.
  u = "";         # Prev-previous word in input stream, or "" if none.
  v = "";         # Previous word in input stream, or "" if none.
  
  # Predecessors and successors:
  split("", npred); # {npred[w]} is the number of distinct predecessors of {w}.
  split("", pred);  # {pred[w,k]} is predecessor {k} of word {w}.
  
  split("", nsucc); # {nsucc[w]} is the number of distinct predecessors of {w}.
  split("", succ);  # {succ[w,k]} is successor {k} of word {w}.
  
  # Potential prefix/suffix conflicts:
  split("", incps); # {incps[u,w]} is set if {u v w} occurs in input for some {w}.
}

(abort >= 0) { exit abort; }

# Discard comments and blank lines:
/^ *([#]|$)/ { next; }
  
# Process one more word:
(NF == 1) {
  w = $1;
  
  # Save token
  tk[ntk] = w;
  ntk++;
  
  # Save/count word
  if (w in wct)
    { wct[w]++ }
  else
    { wct[w] = 1; 
      wd[nwd] = w; wx[w] = nwd; nwd++;
      npred[w] = 0; nsucc[w] = 0;
    }
    
  # Count the {v w} pair and record the {u v w} triple:
  if (v != "") { process_pair(v, w); }
  if (u != "") { process_triple(u, w); }

  u = v; v = w;
  next;
}

# Bad words:
// { data_error(("bad word = [" $0 "]")); }

END {
  if (abort >= 0) { exit abort; }
  printf "read %d tokens, %d words\n", ntk, nwd > "/dev/stderr";
  
  # Close the circular stream: wrap pairs and triples around the end.
  if (ntk >= 2)
    { process_pair(v, tk[0]);
      process_triple(u, tk[0]);
      process_triple(v, tk[1]);
    }
  else if (ntk >= 1)
    { process_pair(v, v);
      process_triple(v, v);
    }
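
  # E.g. for the circular stream "t0 t1 t2 t3" the main rule records
  # the pairs {t0 t1}, {t1 t2}, {t2 t3} and the triples {t0 t1 t2},
  # {t1 t2 t3}; the wraparound above adds the pair {t3 t0} and the
  # triples {t2 t3 t0} and {t3 t0 t1}.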

  # Word classification
  split("", type);   # ">" = prefix, "<" = suffix, "-" = compound, undef ow.

  # Working markers
  split("", used_pref); # {used_pref[w]} set means {w} is prefix in some compound.
  split("", used_suff); # {used_suff[w]} set means {w} is suffix in some compound

  if (nwd < num)
    { compound_words(+1); } 
  else
    { compound_words(-1); }
  write_hyphenated_stream();
}

function process_pair(v, w)
  { if ((v,w) in pct)
      { pct[v,w]++; }
    else
      { pct[v,w] = 1;
        # Save new succ of {v} and new pred of {w}:
        succ[v,nsucc[v]] = w; nsucc[v]++;
        pred[w,npred[w]] = v; npred[w]++;
      }
  }

function process_triple(u, w)
  { incps[u,w] = 1; }

function compound_words(goal)
  {
    # Tries to form new compounds that increase (if {goal>0}) or 
    # decrease (if {goal < 0}) the size of the lexicon.
    # 
    # We use a simple heuristic: repeatedly select a word {w} and
    # try "fixating" it. If this attempt succeeds, update the {wd} and
    # {wct} vectors. Repeat until reaching the desired number of words,
    # or until giving up.
    # 
    # "Fixating" a word {w} means making it into an obbligatory prefix
    # ("prefixating"), or an obligatory suffix ("suffixating"). Because
    # of the coding invariant, we can prefixate {w} only if
    #
    #  0. {w} has not been prefixated already, i.e. {type[w] != ">"}.
    #  1. {w} has not been used as a suffix already, i.e. {!(w in used_suff)}.
    #  2. {w w} does not occur in the input, i.e. { pct[w,w] == 0}.
    #  3. For every successor {v} of {w} in the input,
    #      3.1 {v} has not been used as a prefix, i.e. {!(v in used_pref)}.
    # 
    # During this process, {wct[w]} is meaningful only if {w} has not
    # yet been prefixated or suffixated; and in that case it will be the
    # number of occurrences of {w} in the input that are still not part
    # of compounds. (On the other hand, {pct[v,w]} will always be the
    # number of {v w} pairs in the original input sequence.)
    # 
    # The effect of prefixating a word {w} is therefore
    #
    #   * The word {w} disappears from the lexicon.
    #     
    #   * For each successor {v} of {w} that has not been suffixated, the
    #     count {wct[v]} gets decremented by {pct[w,v]}. If the result is
    #     zero, the word {v} is deleted from the lexicon.
    #     
    #   * The new words {w-v} are added to the lexicon, with counts {pct[w,v]}.
    # 
    # Moreover, the fixating is an improvement only if these changes add
    # up to a net increase (if {goal>0}) or decrease (if {goal<0})
    # on the size of the lexicon.
    # 
    # When scanning a word {w}, the heuristic's main loop must therefore
    # try several possibilities: forward or backward fixating of {w}
    # itself, forward fixating of any of {w}'s predecessors, or backward
    # fixating of any of {w}'s successors. If several of these apply,
    # the procedure chooses among them at random, weighted by their
    # effect on the lexicon size (see {fixate_some_word} below).

    # Main loop. Note that {nwd} may increase or decrease with time.
    while (goal*(num - nwd) > 0) { fixate_some_word(goal); }
  }

function fixate_some_word(goal,   cdw,dir,eff,nc,w,ew,k,ic,tef,pef)
  { 
    # Tries to combine some word {w} with one of its neighbors in order
    # to achieve the stated {goal}, updating {wct}, {wd}, {wx}, and the
    # related tables. Aborts with an error if no candidate is left.
    
    printf "  %d words", nwd > "/dev/stderr";

    # Gather candidate words and directions {(cdw[k],dir[k]}, {k=0..nc-1}:
    split("", cdw); split("", dir); nc = 0;
    tef = 0; /* Sum of {eff[ic]} */
    for (k = 0; k < nwd; k++)
      { w = wd[k];
        if (! (w in type))
          { ew = goal*fixating_effect(w, ">");
            if (ew > 0)
              { cdw[nc] = w; dir[nc] = ">"; eff[nc] = ew; tef += ew; nc++; }
            ew = goal*fixating_effect(w, "<");
            if (ew > 0)
              { cdw[nc] = w; dir[nc] = "<"; eff[nc] = ew; tef += ew; nc++; }
          }
      }
    
    printf "  %d candidates", nc > "/dev/stderr";

    if (nc == 0) 
      { prog_error("no candidates left"); }
    else
      { # Select a random candidate with varied probability: 
        pef = rand()*tef;
        ic = 0; 
        while ((ic < nc) && (pef > eff[ic])) { pef -= eff[ic]; ic++; }
        if (ic >= nc) { ic = nc-1; }
        printf "  choose cdw[%d] = %s (%s)\n", ic, cdw[ic], dir[ic] > "/dev/stderr";
        fixate_word(cdw[ic], dir[ic]);
      }
  }
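
# A hypothetical numeric trace of the selection in {fixate_some_word}:
# with candidates {eff[0] = 3}, {eff[1] = 1}, {eff[2] = 2}, we get
# {tef = 6}. If {rand()} returns 0.70 then {pef = 4.2}; the loop skips
# candidate 0 ({pef} becomes 1.2) and candidate 1 ({pef} becomes 0.2),
# and selects {ic = 2}. Each candidate is thus chosen with probability
# proportional to its effect {eff[ic]}.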

function fixating_effect(w,dir,   k,delta,nn,vk,usk,nvk)
  {
    # Returns the change in the lexicon size that would result from
    # fixating {w} in direction {dir}, i.e. prefixating {w} if {dir ==
    # ">"}, suffixating it if {dir == "<"}.  If the fixating is
    # not possible, returns 0.

    # We cannot do it if {w} has already been fixated in either direction:
    if (w in type) { return 0; }

    # We cannot do it if {w} follows itself:
    if ((w,w) in pct) { return 0; }

    # We cannot do it if {w} has been used the wrong way:
    if ((dir == ">") && (w in used_suff)) { return 0; }
    if ((dir == "<") && (w in used_pref)) { return 0; }

    # If fixating is possible, this word will disappear from the lexicon:
    delta = -1;

    # Check whether the neighbors of {w} are available for fixating:
    nn = (dir == ">" ? nsucc[w] : npred[w]);
    for (k = 0; k < nn; k++)
      { vk = (dir == ">" ? succ[w,k] : pred[w,k]);
        # We cannot fixate {w} if {vk} has been combined in that direction too:
        usk = (dir == ">" ? (vk in used_pref) : (vk in used_suff));
        if (usk) { return 0; }

        # Fixating will affect only neighbors that aren't fixated already:
        if (vk in type) 
          { if (type[vk] == dir) 
              { prog_error("inconsistent tables 1"); }
          }
        else
          { nvk = (dir == ">" ? pct[w,vk] : pct[vk,w]);
            if (nvk > 0) { delta++; }
            if (wct[vk] <= nvk) { delta--; }
          }
      }
    return delta;
  }
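
# A hypothetical numeric check of {fixating_effect}: prefixating {w}
# with successors {v1} ({pct[w,v1] = 3}, {wct[v1] = 3}) and {v2}
# ({pct[w,v2] = 1}, {wct[v2] = 5}) gives {delta = -1} for the loss of
# {w}, {+1 - 1} for {v1} (the compound {w-v1} appears and {v1}
# disappears), and {+1} for {v2} (only {w-v2} appears): a net change
# of 0 words.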

function fixate_word(w,dir,   k,nn,vk,wvk,nvk)
  {
    # Fixates {w} in direction {dir}, i.e. prefixates {w} if {dir ==
    # ">"}, suffixates it if {dir == "<"}.  Assumes that the fixating
    # is possible.

    type[w] = dir;

    # Record use of {w}:
    if (dir == ">") { used_pref[w] = 1; }
    if (dir == "<") { used_suff[w] = 1; }

    # Remove word from lexicon:
    delete_word(w);

    # Join {w} with its neighbors that haven't been fixated yet:
    nn = (dir == ">" ? nsucc[w] : npred[w]);
    for (k = 0; k < nn; k++)
      { vk = (dir == ">" ? succ[w,k] : pred[w,k]);
        
        # Record use of {vk}:
        if (dir == ">") { used_suff[vk] = 1; }
        if (dir == "<") { used_pref[vk] = 1; }

        # Combine with neighbors that haven't been fixated already:
        if (vk in type) 
          { if (type[vk] == dir) 
              { prog_error("inconsistent tables 2"); }
          }
        else
          { nvk = (dir == ">" ? pct[w,vk] : pct[vk,w]);
            if (nvk > 0) 
              { wvk = ( dir == ">" ? w "-" vk : vk "-" w);
                wd[nwd] = wvk; wx[wvk] = nwd; nwd++;
                wct[wvk] = nvk;
                wct[vk] -= nvk;
                if (wct[vk] == 0) { delete_word(vk); }
              }
          }
      }
  }

function delete_word(w)
  { wct[w] = 0;
    nwd--; 
    if (wx[w] < nwd) { wd[wx[w]] = wd[nwd]; wx[wd[nwd]] = wx[w]; }
    delete wx[w];
  }   
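
# E.g. if {wd} holds the hypothetical words {a b c d} in slots 0-3 and
# we delete {b}, then {nwd} drops to 3, {d} moves into slot 1 with
# {wx["d"] = 1}, and slot 3 becomes garbage; the order of {wd} is not
# preserved.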

function write_hyphenated_stream(  k,v,tv,vjoin,w,tw,wjoin)
  {
    if (ntk == 0) { return; }
    v = tk[0]; tv = type[v]; 
    if (ntk == 1) 
      { if ((tv == ">") || (tv == "<"))
          { prog_error("compounded token with self"); }
        print v; 
        return;
      }
    vjoin = ((tv == "<") || (type[tk[ntk-1]] == ">"));
    for (k = 1; k <= ntk; k++)
      { w = tk[k % ntk]; tw = type[w]; 
        if ((tv == ">") || (tw == "<"))
          { if (vjoin) { prog_error("conflict in compounding"); }
            print (v "-" w); 
            wjoin = 1;
          }
        else
          { if (! vjoin) { print v; }
            wjoin = 0;
          } 
        v = w; tv = tw; vjoin = wjoin;
      }
  }
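
# For illustration (hypothetical words): given tokens "qo kee chol dy"
# with {type["qo"] = ">"} and {type["dy"] = "<"}, the loop prints
# "qo-kee" at {k = 1}, suppresses a solo "kee" at {k = 2} (since
# {vjoin} is set), prints "chol-dy" at {k = 3}, and prints nothing at
# the wraparound step {k = 4}.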
    
         
function arg_error(msg)
  { printf "%s\n", msg > "/dev/stderr";
    printf "usage: %s\n", usage > "/dev/stderr";
    abort = 1;
    exit 1;
  }

function data_error(msg)
  { printf "line %d: %s\n", FNR, msg > "/dev/stderr";
    abort = 1;
    exit 1;
  }

function prog_error(msg)
  { printf "%s\n", msg > "/dev/stderr";
    abort = 1;
    exit 1;
  }