#! /usr/bin/gawk -f
# Last edited on 1999-01-05 01:34:36 by stolfi

BEGIN { 

  usage = ( "remove-redundant-roc-entries < INFILE > OUTFILE" );

  # Removes redundant entries from the raw concordance (7 fields per
  # record). An entry is redundant if it has the same location,
  # position, length, string, and context as a previous entry,
  # differing only in the transcriber code. For each group of such
  # entries a single record is written, whose second field is the
  # concatenation of the transcriber codes of the whole group.

  # Exit status requested by "error"; negative means "no error so far".
  abort = -1;

  # Data for the entry currently being accumulated:
  # "oloc" is the location code (field 1).
  # "olin" is the concatenation of fields 3-7, separated by blanks.
  # "otrc" is a transcriber code (field 2) kept for the sort-order check.
  # "otrcs" is the cat of the transcriber codes of all entries with
  #   the same "oloc" and "olin".
  # "ofnum" is the page's f-number extracted from "oloc" (the part
  #   before the first dot).

  # Global counters: records read, records written, records skipped.
  # All three are initialized explicitly so that the final summary
  # does not depend on awk's treatment of unset variables.
  nrd = 0; nwr = 0; nsk = 0;

  # Start with an empty entry and an empty page, so that the first
  # input record is seen as the start of both a new entry and a new page.
  start_entry("", "", "");
  start_page("");
}

# Safety net: once an abort status has been recorded (see "error"),
# stop processing input and leave with that status.
abort >= 0 {
  exit abort;
}

/./{
   # Processes one non-empty input record. Records are expected to have
   # exactly 7 fields: location, transcriber, and five more fields that
   # identify the entry. Consecutive records that agree on everything
   # except the transcriber are merged into a single output entry.

   if (NF != 7) { error("wrong number of fields"); }
   loc = $1;
   trc = $2;
   lin = ($3 " " $4 " " $5 " " $6 " " $7);
   if ((loc != oloc) || (lin != olin))
     { # New entry: flush the previous one and start accumulating this one.
       finish_entry(); start_entry(loc, trc, lin);

       # The page's f-number is the part of the locator before the first dot.
       if (! match(loc, /[.]/)) { error("no dot in locator"); }
       fnum = substr(loc, 1, RSTART-1);

       # Switch pages BEFORE counting this entry: "start_page" resets the
       # per-page counters, so counting first would credit the entry to
       # the page being closed and leave the last page one entry short.
       if (fnum != ofnum) { finish_page(); start_page(fnum); }
       nwr++; pgwr++;
     }
   else
     { # Redundant entry: only the transcriber code differs.
       if (trc < otrc) { error("not sorted by transcriber"); }
       # Remember this code so that the next record of the group is
       # checked against its immediate predecessor, not only against
       # the first record of the group.
       otrc = trc;
       otrcs = (otrcs trc); 
       nsk++;
     }
   # Count the record after any page switch, so that it is credited
   # to the page it belongs to.
   nrd++; pgrd++;
   next;
}

END {
  # If "error" has requested an abort, leave at once with that status
  # and without flushing or printing the summary.
  if (abort >= 0)
    { exit abort; }

  # Flush the entry and the page that were still open at end of input.
  finish_entry();
  finish_page();  

  # Global summary, on the error stream.
  printf("\n") > "/dev/stderr";
  printf("%7d records read\n", nrd) > "/dev/stderr";
  printf("%7d records ignored\n", nsk) > "/dev/stderr";
  printf("%7d records written\n", nwr) > "/dev/stderr";
}

function start_entry(loc, trc, lin)
{
  # Begins accumulating a new entry: records its location "loc", its
  # remaining fields "lin", and its transcriber code "trc", which is
  # also the initial value of the accumulated code list "otrcs".
  oloc = loc;
  olin = lin;
  otrc = trc;
  otrcs = trc;
}

function finish_entry()
{
  # Writes the accumulated entry to standard output as three
  # OFS-separated items: location, accumulated transcriber codes,
  # and the remaining fields. Does nothing if no entry was started yet
  # (empty location).
  if (oloc == "") { return; }
  print oloc, otrcs, olin;
}

function start_page(fnum)
{
  # Begins a new page: remembers its f-number and zeroes the
  # per-page counters of records read and records written.
  ofnum = fnum;
  pgrd = 0;
  pgwr = 0;
}

function finish_page()
{
  # Reports the per-page counts (records read, records written) and
  # the page's f-number on the error stream. Does nothing if no page
  # was started yet (empty f-number).
  if (ofnum == "") { return; }
  printf("%4d %4d %s\n", pgrd, pgwr, ofnum) > "/dev/stderr";
}

function error(msg)
{
  # Reports a fatal problem with the current input record: prints the
  # file name, the record number within the file, and "msg", followed
  # by the offending record itself, all on the error stream. Then
  # records exit status 1 in "abort" and terminates the program.
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  print > "/dev/stderr";
  abort = 1;
  exit abort;
}