#! /usr/bin/gawk -f
# Last edited on 2004-10-12 19:36:46 by stolfi

BEGIN {
  # {abort} stays negative while no error has been detected. The error
  # routines set it to the exit status, and both the first main rule
  # and the END block check it before doing any work.
  abort = -1;
  usage = ( ARGV[0] " < NAME.gpr > NAME.dic" );
  
  # Reads a list of glossing pairs "{TOKEN} {GLOSS}". Outputs a file
  # with one line for each distinct {TOKEN}; the format is "{TOKEN}
  # {GLOSSES}", where {GLOSSES} is the concatenation of all the
  # distinct {GLOSS} fields associated with {TOKEN}, separated by "/".
  # A gloss that occurred more than once is followed by its
  # occurrence count in parentheses.
  
  # Lines that have {TOKEN == "×"} are assumed to be unpaired 
  # glosses; the {GLOSS} is then replaced with "×" too.
  # Lines of the form "# {jod:N:N.N}" or "# {sent:N:N.N}" are locator
  # lines: the locator is remembered and quoted in error messages.
  # Blank lines and all other lines starting with "#" are ignored.
  
  # The tables below are indexed by the left-hand word {w}. They are
  # named exactly as the main rules and the END block use them.
  split("", ntr); # {ntr[w]} is number of distinct glosses for word {w}.
  split("", tr);  # {tr[w,0..ntr[w]-1]} are those glosses.
  split("", ct);  # {ct[w,0..ntr[w]-1]} are the corresponding token counts.
  nglosses = 0; # Total glossing pairs read.
  nwords = 0;   # Number of distinct left-hand words.
  loc = "{}";   # Last locator seen.
}

# Once an error status has been recorded, stop consuming input; the
# END block sees the same status and exits with it.
abort >= 0 { exit abort; }

/^# *[{](jod|sent):[0-9]+:[0-9]+[.][0-9]+[}]/ {
  # Locator line: "#", optional spaces, then "{jod:N:N.N}" or
  # "{sent:N:N.N}". The braces are written as bracket expressions so
  # that they can never be read as interval operators.
  #
  # The locator is cut out of the line itself rather than taken from
  # field 2: the pattern allows zero spaces after the "#", in which
  # case the locator is part of field 1 and field 2 is something else.
  # The match below cannot fail and starts at the first "{", because
  # the rule pattern has already matched this line.
  match($0, /[{](jod|sent):[0-9]+:[0-9]+[.][0-9]+[}]/);
  loc = substr($0, RSTART, RLENGTH);
  next;
}

/^[ \t]*(#|$)/ {
  # Comment/blank line: optional leading blanks followed by "#" or by
  # the end of the line. Tabs are accepted as well as spaces, matching
  # the default field splitting; otherwise a line holding only tabs
  # would reach the data rule with NF == 0 and abort the whole run.
  next;
}

/./ {
  # Data line: must be exactly "{TOKEN} {GLOSS}". Any other field
  # count is a fatal input error.
  if (NF != 2) { data_error("bad NF"); }
  # Get lefthand token {o} and gloss {t}. The gloss is forced to a
  # plain string by appending "": a field that looks like a number
  # would otherwise be compared numerically in the search below, so
  # distinct glosses such as "1" and "1.0" would be merged.
  o = $1; t = $2 "";
  # Collapse unpaired glosses:
  if (o == "×") { t = "×"; }
  # If first occurrence of {o}, initialize {ntr[o]}.
  if (! (o in ntr)) { ntr[o] = 0; nwords++; }
  # Find index {k} of this gloss, or set {k=ntr[o]}:
  for (k = 0; k < ntr[o]; k++) { if (tr[o,k] == t) { break; } }
  # If new gloss, save it and initialize {ct[o,k]}:
  if (k >= ntr[o]) { ntr[o]++; tr[o,k] = t; ct[o,k] = 0; }
  # Count gloss instance:
  ct[o,k]++;
  nglosses++;
  next;
}

END {
  # After an error, write no dictionary; just propagate the status.
  if (abort >= 0) { exit(abort); }
  # Print stats (to stderr, so they do not mix with the dictionary):
  printf "%7d glossing pairs lines read\n", nglosses > "/dev/stderr";
  printf "%7d distinct left-hand words\n", nwords > "/dev/stderr";
  # Print words and their glosses, one word per line. The order of
  # the words is whatever order "for (o in ntr)" delivers.
  for (o in ntr)
    { printf "%-35s  ", o;
      for (k = 0; k < ntr[o]; k++)
        { # Get gloss and count:
          t = tr[o,k]; c = ct[o,k];
          # Glosses are separated by "/":
          printf "%s%s", (k > 0 ? "/" : ""), t;
          # The count is shown only for repeated glosses:
          if (c > 1) { printf "(%d)", c; }
        } 
      printf "\n"; 
    }
  # Flush the output. The argument is written as an explicit "";
  # the original passed a variable that is never assigned anywhere,
  # which evaluates to the same empty string.
  fflush("");
}

function arg_error(msg)
{
  # Fatal command-line error: record the failure status, print {msg}
  # and the usage line on stderr, then terminate with that status.
  abort = 1;
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  exit abort;
}

function data_error(msg)
{
  # Fatal input error: record the failure status, then report {msg}
  # on stderr prefixed by the current file name, the line number
  # within that file, and the last locator seen.
  abort = 1;
  printf "%s:%d: %s ** %s\n", FILENAME, FNR, loc, msg > "/dev/stderr";
  exit abort;
}