#! /n/gnu/bin/gawk -f
# Last edited on 2000-02-04 16:36:01 by stolfi

BEGIN {
  # Reads from stdin a sequence of lines of the form
  #
  #   NT PROB DEFN
  #
  # where NT is a non-terminal symbol, PROB is a probability
  # (not necessarily normalized), and DEFN is a definition for NT.
  # Writes the same grammar to stdout in "grx" format.

  # {abort} stays negative until {error} records an exit status in it.
  abort = -1;

  # Non-terminal whose "NT:" header was printed most recently
  # (empty while no header has been printed yet).
  onsy = "";

  usage = ( ARGV[0] " < INFILE.fcm > OUTFILE.grm" );

  # The script takes no command-line arguments: all data comes from stdin.
  if (ARGC != 1)
    { error(("usage: " usage)); }
}

# Once an error has been flagged ({abort} is no longer negative), stop
# processing input and exit with the recorded status.
abort >= 0 {
  exit abort;
}

# Blank lines and "#" comment lines (optionally preceded by spaces)
# are copied to the output unchanged.
$0 ~ /^ *([#]|$)/ {
  print $0;
  next;
}

# Production line "NT PROB DEFN...": emits it in "grx" form.
#
#   $1      non-terminal symbol (letters, digits, "-", "_", ":")
#   $2      probability or count (digits with an optional decimal point)
#   $3..NF  the definition; its fields are joined with "."
#
# A header line "NT:" is printed whenever the non-terminal differs from
# the one on the previously emitted production, followed by an indented
# "PROB DEFN" line.  An empty definition is written as "=".
#
# The pattern accepts either a blank or end-of-line after PROB, so that
# a line "NT PROB" with an empty definition and no trailing blank
# reaches the empty-definition case below instead of being rejected
# as badly formatted.
/^[-_:A-Za-z0-9]+ +[0-9]+[.]?[0-9]*( |$)/ {
  nsy = $1; prb = $2;

  # Concatenate the definition fields, separated by ".".
  def = ""; sep = "";
  for (i = 3; i <= NF; i++)
    { def = (def sep $(i)); sep = "."; }

  # Empty right-hand side.
  if (def == "") { def = "="; }

  # Start a new group when the non-terminal changes.
  if (nsy != onsy)
    { printf "%s:\n", nsy;  onsy = nsy; }

  printf "  %7s %s\n", prb, def;
  next;
}

# Fallback for lines with at least three fields that were not consumed
# by the rules above.  The first field is taken as a count/probability
# and the last field as the definition; the line is attributed to {nsy},
# the non-terminal remembered from the most recent "NT PROB DEFN" line.
# The definition is printed with "-" and blanks removed and every "."
# turned into a blank.
NF >= 3 {
  frq = $1;
  def = $(NF);

  # Validate before producing any output: the count must be numeric ...
  if (match(frq, /^[0-9]+[.]?[0-9]*$/) == 0)
    { error(("line " FNR ": bad count/prob format \"" $0 "\"")); }

  # ... and a non-terminal must already be known.
  if (nsy == "")
    { error(("line " FNR ": missing non-terminal")); }

  gsub(/[- ]/, "", def);
  gsub(/[.]/, " ", def);

  printf "%-7s %7s %s\n", nsy, frq, def;
  next;
}

# Any line that reaches this point was accepted by none of the rules
# above (each of them ends with {next}): reject it.
{
  error(("line " FNR ": bad format \"" $0 "\""));
}

function error(msg)
{
  # Writes {msg}, followed by a newline, to the standard error stream
  # and terminates the script with exit status 1.  The status is also
  # stored in the global {abort}, which the main-loop guard rule tests
  # before handling each record.
  abort = 1;
  printf "%s\n", msg > "/dev/stderr";
  exit abort;
}