#! /usr/bin/gawk -f
# Last edited on 2000-06-09 15:32:15 by stolfi

# NOTE(review): the line below is not a comment.  It parses as a
# pattern-only rule whose pattern is the concatenation of the two unset
# variables "unde" and "development", i.e. the empty string, which is
# always false, so it never prints anything.  Its presence does mean the
# program has a main-input rule.  Presumably this is a garbled
# "under development" marker -- TODO confirm and turn it into a comment.
unde development

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    "  [ -v freqs=BOOL ] \\\n" \
    "  [ -v prec=NUM ] \\\n" \
    "  [ -v titles=\"TITLE1 TITLE2 ... TITLEn\" ] \\\n" \
    "  [ -v terse=BOOL ] \\\n" \
    "  INFILE1.grx  INFILE2.grx ... INFILEn.grx \\\n" \
    "  > OUTFILE.grx" \
  );
  
  # Reads one or more counted grammar files INFILE.grx... which should
  # differ only on the counts associated to each alternative. Writes a
  # new grammar OUTFILE.grx where each alternative is labeled with the
  # corresponding counts of all input grammars.
  # 
  # The grammar files INFILE.grx... must contain one or
  # more rules of the form
  # 
  #  SYMBOL:
  #    COUNT1 OTHER1... PROD1
  #    COUNT2 OTHER2... PROD2
  #    ...       ...       ...
  # 
  # where SYMBOL is a non-terminal symbol, each COUNTi is an integer
  # or fractional count, the OTHERi fields are zero or more
  # fields (which will be ignored), and each PRODi is an alternative
  # for SYMBOL. Each definition PRODi must be a single string without
  # embedded blanks. (The definitions are not interpreted in any way.)
  # Blank lines and "#" comment lines are kept and re-emitted in front
  # of the symbol or alternative that follows them.
  # 
  # The "titles" string, if given, must be a list of N short names,
  # separated by blanks, one for each of the N input files.
  # 
  # The output OUTFILE.grx will be in the same format, except that
  # each rule will have the format
  #
  #  SYMBOL:
  #     CT[1.1]  CT[1.2]  CT[1.3] ... CT[1.n] DEF1
  #     CT[2.1]  CT[2.2]  CT[2.3] ... CT[2.n] DEF2
  #         ...      ...      ...
  #  #    -----    -----    ----- ...   -----
  #  #   title1   title2   title3 ...  titlen
  #
  # where CT[i.j] is the COUNT associated with the alternative DEFi
  # in the input grammar number j.  A count that is zero is printed
  # as ".".  The two trailing "#" lines (a row of dashes and the row
  # of titles) are written after the alternatives of every symbol,
  # and only when "titles" was given.
  # 
  # If "freqs" is set to 1, the COUNTs in each input grammar are
  # first converted to fractions relative to the respective total
  # COUNT for the non-terminal SYMBOL.
  # 
  # In any case, if "prec" is set to a positive number, the COUNT
  # fields of the output grammar will be printed as fractions with
  # that many decimal fraction digits (with leading zeros removed).
  # Otherwise the COUNT field will be rounded to the nearest integer.
  # When "prec" is not given it defaults to 5 if "freqs" is set and
  # to 0 otherwise.
  # 
  # If "terse" is set, the "reading grammar ..." progress messages
  # on the error output are suppressed.
  # 
  # "# Data-File:" lines, if any, are deleted from the input files.
  
  # Global variables:

  nfiles = ARGC-1;     # Number of input grammars.
  nsymb = 0;           # Number of non-terminal symbols.
  split("", grname);   # "grname[j]" = name of input grammar file "j".
  split("", symbol);   # "symbol[i]"  = the "i"th non-terminal symbol.
  split("", comment);  # "comment[s]" = the concatenated comments of symbol "s".
                       # "comment[s,k]" = the concatenated comments of rule "[s,k]".
  final_comment = "";  # Concat comments at the end of the grammar.
  split("", nprod);    # "nprod[s]" = the number of rules for symbol "s".
  split("", prod);     # "prod[s,k]" = definition "k" of symbol "s".
  split("", ct);       # "ct[s,k,j]" = count/prob of defn "k" of symbol "s" in file "j".

  # Arguments:

  if (freqs == "") { freqs = 0; }
  if (prec == "") { prec = (freqs ? 5 : 0); }
  if (nfiles < 1) { arg_error("must specify at least one input file"); }
  if (titles == "")
    { ntitles = 0; }
  else
    { ntitles = split(titles, tit);
      if (ntitles != nfiles) { arg_error("titles don't match files"); }
    }
  
  for (j = 1; j <= nfiles; j++) { grname[j] = ARGV[j]; }

  for (j = 1; j <= nfiles; j++) 
    { inhale_grammar(j);
      if (freqs) { normalize_counts(j); }
    }
  write_new_grammar();

  # Every input file has already been consumed above through "getline".
  # Leave explicitly so that gawk never goes on to scan the files named
  # in ARGV a second time as main input, which it does whenever the
  # program contains any rule other than BEGIN.
  exit 0;
}

function inhale_grammar(j,   fname,ns,nf,k,lin,fld,s,cmt,nlines,def,status)
{ 
  # Reads input grammar number "j" (file "grname[j]") into the global
  # tables.  File 1 defines "symbol", "nprod", "prod" and "nsymb"; every
  # later file must list the same symbols and definitions in the same
  # order, and only contributes its counts "ct[s,k,j]".  For comments,
  # the longest version seen in any file is the one that is kept.
  # Any inconsistency or malformed line is reported through
  # "grammar_error", which terminates the program.

  # Variables used while inhaling grammar:
  fname = grname[j]; # Grammar file name.
  cmt = "";          # Comments for next symbol or rule.
  s = "";            # Current non-terminal symbol
  ns = 0;            # Number of symbols seen so far in this file.
  k = 0;             # Number of rules seen so far for symbol "s".
  nlines = 0;        # Number of lines read from this file.
  
  if (! terse) { printf "reading grammar %d = %s ...\n", j, fname > "/dev/stderr"; }
  # "status" keeps the last result of getline: 1 = got a line,
  # 0 = end of file, negative = read/open error.
  while ((status = (getline lin < fname)) > 0)
    { 
      nlines++;
      if (lin ~ /^ *$/)
        { cmt = ( cmt "\n" ); }
      else if (lin ~ /^[#][ ]*Data-File[ ]*[:]/) 
        {  }
      else if (lin ~ /^[#]/) 
        { cmt = ( cmt lin "\n" ); }
      else if (lin ~ /^[A-Z(][A-Za-z0-9_()*+]*[ ]*[:][ ]*$/)
        { 
          # A new head symbol closes the previous one: a later file must
          # have supplied exactly as many rules for it as file 1 did.
          if ((j > 1) && (s != "") && (k != ((s in nprod) ? nprod[s] : 0)))
            { grammar_error(fname, nlines, ("wrong number of rules for symbol \"" s "\"")); }
          s = lin; 
          gsub(/[ ]*[:][ ]*$/, "", s);
          if (j == 1) 
            { if (s in nprod) 
                { grammar_error(fname, nlines, ("repeated symbol \"" s "\"")); }
              symbol[ns] = s; comment[s] = cmt;
            }
          else
            { if (s != symbol[ns]) 
                { grammar_error(fname, nlines, "inconsistent symbols"); }
              if (length(comment[s]) < length(cmt)) { comment[s] = cmt; }
            }
          cmt = "";
          k = 0;
          ns++;
        }
      else if (lin ~ /^ *[0-9.]/)
        {
          if (s == "") 
            { grammar_error(fname, nlines, "rule without head symbol"); }
          nf = split(lin, fld);
          if (nf < 2) 
            { grammar_error(fname, nlines, "bad rule format"); }
          # The whole field must be a number: digits with at most one
          # decimal point (and optionally an exponent).  The pattern is
          # anchored so that trailing garbage such as "12abc" is rejected.
          if (fld[1] !~ /^([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$/)
            { grammar_error(fname, nlines, "bad rule count field"); }
          def = fld[nf];
          if (j == 1) 
            { prod[s,k] = def; comment[s,k] = cmt; nprod[s]++; }
          else
            { if (prod[s,k] != def) 
                { grammar_error(fname, nlines, "inconsistent definitions"); }
              if (length(comment[s,k]) < length(cmt)) { comment[s,k] = cmt; }
            }
          ct[s,k,j] = fld[1];
          cmt = "";
          k++;
        }
      else
        { grammar_error(fname, nlines, "bad line format"); }
    }
  # Only a negative getline result means that something went wrong; the
  # value of ERRNO when there was no error is not relied upon.
  if (status < 0) { grammar_error(fname, nlines, ERRNO); }
  close (fname);
  if (nlines == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  if ((j > 1) && (s != "") && (k != ((s in nprod) ? nprod[s] : 0)))
    { grammar_error(fname, nlines, ("wrong number of rules for symbol \"" s "\"")); }
  
  # Comments at end of grammar:
  gsub(/[\n]*$/, "", cmt);
  if (j == 1) 
    { nsymb = ns; final_comment = cmt; }
  else
    { if (ns != nsymb) 
        { grammar_error(fname, nlines, "inconsistent symbol counts"); }
      if (length(final_comment) < length(cmt)) { final_comment = cmt; }
    }
  if (nsymb == 0) { grammar_error(fname, nlines, "empty grammar"); }
}

function normalize_counts(j,   idx,sym,nalt,a,sum)
{
  # Replaces the counts "ct[s,k,j]" of input grammar "j" by their
  # fractions of the total count of the respective symbol "s".
  # A symbol whose total is zero gets a warning on the error output
  # and has all its counts for this grammar set to zero.
  idx = 0;
  while (idx < nsymb)
    { sym = symbol[idx];
      nalt = nprod[sym];

      # Total count of all alternatives of "sym" in this grammar:
      sum = 0.000000;
      for (a = 0; a < nalt; a++) { sum += ct[sym,a,j]; }

      if (sum == 0)
        { printf "warning: zero total count for \"%s\" in file %s\n", 
            sym,grname[j] > "/dev/stderr";
          for (a = 0; a < nalt; a++) { ct[sym,a,j] = 0; }
        }
      else
        { for (a = 0; a < nalt; a++) { ct[sym,a,j] = ct[sym,a,j]/sum; } }

      idx++;
    }
}

function write_new_grammar(    si,sym,alt,g,nalt,val,wd,cell,row,dashes,names)
  {
    # Writes the merged grammar to the standard output: for every
    # symbol, its comments, its "SYMBOL:" header, and one line per
    # alternative carrying the count of each input grammar followed by
    # the definition.  When titles were given, a row of dashes and a
    # row of titles follow each symbol as "#" lines.  The comments
    # found at the end of the grammar close the output.

    # Width of each count column:
    if (prec == 0) 
      { if (freqs) { wd = 1; } else { wd = 5; } }
    else
      { wd = prec + (freqs ? 2 : 6); }

    for (si = 0; si < nsymb; si++)
      { sym = symbol[si];
        nalt = nprod[sym];
        printf "%s%s:\n", comment[sym], sym;

        for (alt = 0; alt < nalt; alt++)
          { # Assemble the count columns of this alternative:
            row = "    ";
            for (g = 1; g <= nfiles; g++)
              { val = ct[sym,alt,g];
                if (val == 0)
                  { cell = "."; }
                else if (prec > 0)
                  { # Fixed-point, without the leading zeros:
                    cell = sprintf("%.*f", prec, val);
                    gsub(/^[0]*/,"",cell);
                  }
                else
                  { cell = sprintf("%d", int(val+0.5)); }
                row = ( row sprintf(" %*s", wd, cell) );
              }
            printf "%s%s %s\n", comment[sym,alt], row, prod[sym,alt];
          }

        if (ntitles != 0) 
          { dashes = "#   ";
            names = "#   ";
            for (g = 1; g <= nfiles; g++)
              { dashes = ( dashes sprintf(" %*.*s", wd, wd, "-----") );
                names = ( names sprintf(" %*s", wd, tit[g]) );
              }
            printf "%s\n%s\n", dashes, names;
          }
      }
    printf "%s\n", final_comment;
    fflush("/dev/stdout");
  }

function arg_error(msg)
  {
    # Reports a command line problem: writes "msg" and the usage
    # summary to the error output, records the failure in "abort",
    # and terminates the program with status 1.
    printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
    exit (abort = 1);
  }

function grammar_error(fname, line, msg)
  {
    # Reports a problem found at line number "line" of grammar file
    # "fname": writes the location and "msg" to the error output,
    # records the failure in "abort", and terminates with status 1.
    printf "file %s, line %d: %s\n", fname, line, msg > "/dev/stderr";
    exit (abort = 1);
  }

function word_error(msg)
  {
    # Writes "msg" to the error output, prefixed with the values of the
    # globals "wordcounts" (as file name) and "nwords" (as line number),
    # records the failure in "abort", and terminates with status 1.
    # NOTE(review): nothing in the visible file sets "wordcounts" or
    # "nwords", and nothing calls this function -- presumably left over
    # from another script; confirm before relying on it.
    printf "file %s, line %d: %s\n", wordcounts, nwords, msg > "/dev/stderr";
    exit (abort = 1);
  }

function prog_error(msg)
  {
    # Reports an internal inconsistency: writes "msg" to the error
    # output with a "program error" banner, records the failure in
    # "abort", and terminates the program with status 1.
    printf "*** program error: %s\n", msg > "/dev/stderr";
    exit (abort = 1);
  }