#! /usr/bin/gawk -f
# Last edited on 2000-05-21 20:01:30 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] \
    "  -v delims=XY\\\n" \
    "  [ -v separator=Z ]\\\n" \
    "  [ -v axiom=AXIOM ]\\\n" \
    "  [ -v ignorecounts=BOOL ] \\\n" \
    "  [ -v eps=NUMBER ] \\\n" \
    "  [ -v truncate=NUMBER ] \\\n" \
    "  < GRAMMAR.grx > WFREQS \\\n" \
  );
  
  # Reads a finite grammar from stdin, writes the words and their
  # computed probabilities to stdout.
  #
  # The "delims" parameter should be a pair of characters X and Y to be used
  # as open and close delimiters in the derived strings (default "«»").
  #
  # The grammar is a set of rules of the form
  #
  #   NTSYMB:
  #      COUNT1 OTHER1... DEFN1
  #      COUNT2 OTHER2... DEFN2
  #      ...   ...      ...
  #
  # where NTSYMB is a non-terminal symbol, OTHER is zero or more optional 
  # numeric fields (ignored), and DEFNi is an alternative to NTSYMB.
  # 
  # The NTSYMB must be a string of [A-Za-z0-9()*+_] and must begin [A-Z(]
  # 
  # The counts COUNTi must be nonnegative; they are implicitly
  # normalized to probabilities that add to 1.
  # 
  # Each alternative DEFNi must be a sequence of symbols, separated by
  # the "separator" character (default "."), which must not contain
  # blanks or the two characters XY. Symbols in DEFNi that are not
  # defined by any rule are assumed to be terminal.  Use the 
  # separator alone to indicate the empty string as an alternative.
  # 
  # If the "axiom" (root symbol) is not specified, the left-hand side
  # of the first rule is used by default.
  #
  # The output is a list of lines of the form
  #
  #   PROB PHRASE
  #
  # where PHRASE is a terminal phrase derived from the root symbol by some
  # sequence of leftmost reductions, and PROB is the product of all
  # probabilities associated to the rules used in the derivation.
  #
  # The PHRASE and any substituted phrases inside it are
  # bracketed with the characters X and Y.
  #
  # If the grammar is structurally ambiguous, the same PHRASE may
  # appear more than once in the output (in spite of the bracketing).
  # 
  # If "truncate" is positive, any derivation tree whose probabilit
  # is less than "truncate" is omitted from the output. (Use with care,
  # since many small numbers may add up to a sizabel probablity.)
  # 
  # The script will loop forever if the grammar is recursive and
  # "truncate" is zero.
  # 
  
  # Global variables:
  #   "defn[firstrule[s]]" is the RHS of the last rule of non-terminal "s".
  #  
  #   "prob[k]" is the probability for "defn[k]".
  #  
  #   "defn[nextrule[k]]" is the next rule after "defn[k]" 
  #      of the same non-term symbol (or -1).
  
  split("", defn);
  split("", prob);
  split("", firstrule);
  split("", nextrule);
  nrules = 0;
  
  if (eps == "") { eps = 0; }
  if (truncate == "") { truncate = 0; }
  
  if (delims == "")
    { dop = "«"; dcl = "»"; delims = (dop dcl); }
  else if ( \
      (length(delims) != 2) ||
      match(delims, /[A-Za-z0-9_()*+\[\]\\ ]/) \
    )
    { arg_error("bad delims"); }
  else
    { dop = substr(delims,1,1); dcl = substr(delims,2,1); }
  dpat = ("[" delims "]");
  etypat = ("[" dop "][" dcl "]");
  
  if (separator == "")
    { separator = "."; }
  else 
    { if ( \
        (length(separator) != 1) || 
        (separator == dop) || (separator == dcl) || 
        match(separator, /[A-Za-z0-9_()*+\[\]\\ ]/) \
      ) { arg_error("bad symbol separator"); }
    }
  spat = ("[" separator "]");
}

(abort >= 0) { exit abort; }

/^ *$/ { next; }

/^ *[#]/ { next; }

/^[A-Z(][A-Za-z0-9_()*+]*[ ]*[:][ ]*$/ {
  s = $0; 
  gsub(/[ ]*[:][ ]*$/, "", s);
  if (s in firstrule) { grammar_error(("repeated symbol \"" s "\"")); }
  firstrule[s] = -1;
  if (axiom == "") { axiom = s; }
  cursymb = s;
  nsymb++;
  next;
}

/^ *[0-9.]/ {
  if (cursymb == "") { grammar_error("rule without head symbol"); }
  s = cursymb;
  if (NF < 2) { grammar_error("bad rule format"); }
  if (! match($1, /[0-9]*([0-9]|([0-9][.]|[.][0-9]))[0-9]*/))
    { grammar_error("bad rule count field"); }
  if (ignorecounts) { ct = 1; } else { ct = $1; }
  
  # Add delimiters around symbols in RHS:
  def = $(NF);
  if (def ~ dpat) { grammar_error("rule uses reserved delims"); }
  def = (dop def dcl);
  gsub(spat, (dcl dop), def);
  gsub(etypat, "", def);
  if (def == "") { def = ( dop dcl ); }
  # printf "%s -> %s\n", s, def > "/dev/stderr";
  
  # Store rule:
  nextrule[nrules] = firstrule[s];
  defn[nrules] = def;
  prob[nrules] = ct;
  firstrule[s] = nrules;
  nrules++;
  next;
}

/./ {
  grammar_error("bad line format"); 
}

END {
  if (abort >= 0) { exit abort; }
  if (axiom == "") { exit 0; }
  if (! (axiom in firstrule))
    { arg_error(("no rule for axiom «" axiom "»")); }
    
  # Normalize probabilities
  printf "normalizing..." > "/dev/stderr";
  for (s in firstrule)
    { k = firstrule[s]; sp = 0; nr = 0;
      while (k >= 0) { sp += prob[k]; nr++; k = nextrule[k]; }
      k = firstrule[s];
      while (k >= 0) { prob[k] = (sp == 0 ? 1.0/nr : prob[k]/sp); k = nextrule[k]; }
    }
  printf "done\n" > "/dev/stderr";

  # Enumerate language:
  top = 1;
  split("", hstack); hstack[top] = (dop axiom dcl);  # Incompletely expanded phrases.
  split("", pstack); pstack[top] = 1;                # Their probabilities.
  split("", xstack); xstack[top] = 2;                # Index of leftmost non-terminal.
  split("", zstack); zstack[top] = length(axiom);    # Its length.
  split("", astack); astack[top] = firstrule[axiom]; # Next rule that hasn't been used yet.
  
  while(top > 0)
    { # Enumerate all terminal phrases derived from "rstack[top]", times "pstack[top]".
      # Specifically, replace the non-terminal that starts at "xstack[top]" and has
      # length "zstack[top]" by each of its alternatives beginning with "defn[astack[top]]",
      # and recursively enumerate the sentences derived from that.
      # Convention: if "xstack[top] == 0", the phrase "hstack[top]" is terminal.
      
      h = hstack[top];
      p = pstack[top];
      x = xstack[top];
      z = zstack[top];
      a = astack[top];
      
      # printf "debug: h = [%s] p = %9.7f x = %d z = %d a = %d\n", h,p,x,z,a > "/dev/stderr";
      
      if (p < truncate) 
        { # probability too small, ignore.
          top--;
        }
      else if (x == 0)
        { # Sentence is terminal:
          printf "%9.7f %s\n", p, h;
          top--;
        }
      else if (a == -1)
        { # All rules have been tried:
          top--;
        }
      else
        { # Replace symbol and recurse:
          astack[top] = nextrule[a];
          h = (substr(h,1,x-1) defn[a] substr(h,x+z));
          p *= prob[a];
          top++;
          hstack[top] = h;
          pstack[top] = p;
          find_nt(h);
          xstack[top] = RSTART;
          zstack[top] = RLENGTH;
          astack[top] = RALT;
        }
    }
}

function find_nt(h,  x,y,z,s,n,a,b)
  {
    # Finds the first non-terminal in "h", sets RSTART, RLENGTH like "match"
    # If there is a non-terminal, also set RALT to the index of its first rule
    # in "defn[]" and "prob[]".
    
    x = 1;
    n = length(h);
    a = substr(h,x,1);
    while (x < n)
      { # At this point "a" must be an open delimiter:
        if (a != dop) { prog_error(("phrase delims h = [" h "]  x = " x)); }
        y = x+1;
        if (y > n)  { prog_error(("phrase trunc h = [" h "]  y = " y)); }
        b = substr(h,y,1);
        if (b == dop) 
          { x = y; a = b; }
        else
          { # Now "b" is a close delim or symbol letter:
            while ( b != dcl)
              { y++;
                if (y > n) { prog_error(("missing close h = [" h "]  x = " x " y = " y)); }
                b = substr(h,y,1);
                if (b == dop) { prog_error(("unexp open h = [" h "]  y = " y)); }
              }
            z = y - x - 1;
            if (z > 0)
              { # Found a symbol (non-empty innermost bracketed string)
                s = substr(h,x+1,z);
                if (s in firstrule)
                   { RSTART = x+1; RLENGTH = z; RALT = firstrule[s]; return; }
              }
            # This inner bracket was no good, find next open delim:
            x = y + 1;
            while ((x < n) && ((a = substr(h,x,1)) == dcl)) { x++; }
          }
      }
    RSTART = 0; RLENGTH = 0; RALT =-1;
  }

function arg_error(msg)
  {
    printf "%s\n", msg > "/dev/stderr";
    printf "usage: %s\n", usage > "/dev/stderr";
    abort = 1;
    exit abort;
  }

function grammar_error(msg)
  {
    printf "input grammar, line %d: %s\n", FNR, msg > "/dev/stderr";
    abort = 1;
    exit abort;
  }

function prog_error(msg)
  {
    printf "*** program error: %s\n", msg > "/dev/stderr";
    abort = 1;
    exit abort;
  }