#! /usr/bin/gawk -f
# Last edited on 1999-01-06 09:56:48 by stolfi

# Usage: "$0 OCCFILE < TEXT"

# Reads a text in EVT format and a file OCCFILE of word occurrences,
# and prints the latter interspersed with the former, properly indented.
#
# Each line of the OCCFILE represents one occurrence of some pattern 
# in the TEXT, and it must have the form
#   <LOCATION> LOCPOS GLOBPOS PAT OBS
# where <LOCATION> is a location code (e.g. <f100r.P.1> or <f116r2.P.27;S>),
# exactly as it appears in the TEXT; LOCPOS is the displacement of the pattern's occurrence 
# relative to the beginning of that text line (where LOCPOS=0 means column 20);
# GLOBPOS is the displacement since the beginning of the indexed text; PAT 
# is the pattern in question; and OBS is an optional comment (at most 17 bytes long).
#
# The <LOCATION>s found in the OCCFILE must occur in the same order
# as they appear in the TEXT. 
#
# The lines of TEXT are printed with no change. After each line
# of TEXT that contains a location code <...> starting at column 1,
# this program prints all lines of OCCFILE that have exactly the same 
# location code.  Each line contains the OBS field in column 3,
# and the PAT field in column 20+LOCPOS.

function getloc(lin, recno)
{
  # Returns the location code "<...>" found at the very start of "lin".
  # "recno" is a human-readable tag for the line (used only in the
  # error message).  If "lin" does not begin with "<...>", the problem
  # is reported through error() and "<?>" is the fallback value.

  if (match(lin, /^<[^>]*>/) > 0)
    { return substr(lin, RSTART, RLENGTH) }

  error((recno ": bad location = \"" substr(lin,1,19) "\""))
  return "<?>"
}

function readocc(    status)
{
  # Reads the next line of the OCCFILE (named by the global "wfile")
  # into "olin", splits it on whitespace into "ofld", and sets
  # "oloc" (location code) and "oobs" (field 5, or "" if absent).
  # Increments NO for every line successfully read.
  #
  # Sets olin = "" and oloc = "" upon end-of-file.
  #
  # A read or open failure is reported through error().  The outcome
  # is taken from the return value of getline (1 = line read, 0 = end
  # of file, -1 = error) rather than from comparing ERRNO with "0":
  # ERRNO holds a message string, not a numeric code, so that
  # comparison cannot reliably tell "no error" from "error".
  #
  # "status" is a local variable, not a caller-supplied argument.

  status = (getline olin < wfile)
  if (status > 0)
    { NO++;
      oloc = getloc(olin, ("oc line " NO));
    }
  else
    { olin = ""; oloc = "";
      if (status < 0) { error((wfile ": " ERRNO)); }
    }

  # Splitting an empty "olin" also empties "ofld".
  split(olin, ofld)
  if (5 in ofld)
    { oobs = ofld[5] }
  else
    { oobs = "" }
}

BEGIN {
  # Startup: validates the command line, remembers the OCCFILE name in
  # "wfile", and pre-loads the first occurrence record.
  abort = 0
  nRefs = 0
  usage = (ARGV[0] " OCCFILE < TEXT")

  if (ARGC != 2)
    error(("usage: " usage))

  # Take the OCCFILE name, then shrink ARGC so that the argument is
  # not also treated as a main input file; TEXT comes from stdin.
  wfile = ARGV[1]
  ARGC = 1

  readocc()
}

# Lines containing a "#" are echoed unchanged and get no occurrence
# listing.  NOTE(review): this matches "#" anywhere in the line, not
# only in column 1 -- confirm that this is intended.
index($0, "#") > 0 {
  if (abort) { exit }
  print $0
  next
}

# Any other non-empty line: echo it, then list every pending
# occurrence record whose location code equals this line's code.
/./ {
  if (abort) { exit }
  print $0
  tloc = getloc($0, ("tx line " NR))

  while (oloc != "")
    { if (oloc != tloc) { break }

      # Column (1-based) where the pattern must start.
      pos = 20 + ofld[2]

      # Two blanks, the OBS field, one blank; then pad with blanks
      # until the next character would land in column "pos".
      outlin = ("  " oobs " ")
      while (length(outlin) + 1 < pos) { outlin = (outlin " ") }

      printf "%s\n", (outlin ofld[4])
      readocc()
    }
  next
}

END {
  # Nothing more to do if an error was already reported.
  if (abort) { exit }

  # A record still pending here never matched any line of the TEXT.
  if (oloc != "")
    error(("oc line " NO ": bogus location = \"" oloc "\""))

  printf "listed %d occurrences\n", NO > "/dev/stderr"
}

function error(msg)
{ 
  # Reports a fatal error: writes "msg" (plus a newline) to stderr,
  # sets the global "abort" flag so that the main rules and the END
  # rule skip their normal work, and terminates the program.
  #
  # The exit status is 1, so that callers (shell scripts, make) can
  # detect the failure; a bare "exit" would report success.
  printf "%s\n", msg > "/dev/stderr"
  abort = 1
  exit 1
}