#! /usr/bin/gawk -f
# Last edited on 1999-01-06 14:43:36 by stolfi

# Reads a concordance file with 10 fields (see Note-037.txt)
# including STRING (field 6) and PATT (field 8).
# 
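# Appends an 11th field that is "+" for all entries of a given PATT
# that has at least two entries with distinct positions (location,
# offset, or length), or that has an entry whose STRING is a single
# word; and "-" for all entries of any other PATT.  Entries with the
# same PATT are assumed to be consecutive in the input.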

BEGIN { 
  # Sets up the global state shared by the main rule and by the
  # functions "start_pattern" and "flush_pattern".
  #
  # Invariants:
  #
  #   "cpat" is the pattern of the last entry seen, and "cpos" is 
  #   the position (location, offset and length, joined by ":") of the
  #   first entry seen with that pattern.
  #
  #   "s" is "+" or "-" depending on whether the lines with that
  #   pattern constitute an interesting pattern. That is, if "s" is
  #   "-" then all those lines have the same position "cpos" in the
  #   text, and none is a single word.
  #
  #   "n" is the number of lines seen so far with pattern "cpat".
  #
  #   If "flushed" is 0, then "clin[0..n-1]" contains all previous lines
  #   with that pattern.  Otherwise those lines have been written out.
  #
  #   "abort" is negative while no abort has been requested; once it
  #   is non-negative, the script exits with that value as its status.

  abort = -1;
  # Begin with an empty dummy pattern, so that the first input line
  # starts a new pattern (unless its PATT field is itself empty).
  start_pattern("", "");
}

# Once "abort" is non-negative, stop reading input and exit with that
# status ("exit" here still runs the END rule, which re-checks "abort").
(abort >= 0) { exit abort; }

/./{ 
  # Processes one non-empty concordance entry: either buffers it in
  # "clin" (while its pattern is still undecided) or prints it with
  # the current mark "s" appended as an extra field.
  loc = $1; beg = $3; len = $4;
  # Concatenating "" forces string type.  Input fields that look like
  # numbers are otherwise compared numerically, so that two distinct
  # patterns such as "10" and "1e1" would be taken as the same one.
  pat = ($8 ""); str = $6;
  pos = (loc ":" beg ":" len);
  if (pat != cpat) 
    { # A new pattern begins: write out whatever is still buffered for
      # the previous one (with its final mark) and reset the state.
      flush_pattern(); start_pattern(pat,pos);
    }
  # The pattern becomes interesting as soon as an entry occurs at a
  # position other than that of its first entry, or an entry's STRING
  # contains none of the characters "-", "/", "=", ".", "," and " ".
  # The buffered lines can then be written out with mark "+".
  if ((pos != cpos) || (str !~ /[-/=., ]/))
    { s = "+"; flush_pattern(); }
  if (flushed) 
    { # Mark already decided for this pattern: print right away.
      print $0, s; ct[s]++;
    }
  else
    { # Still undecided: hold the line until the mark is known.
      clin[n] = $0;
    }
  n++;
}

END {
  # Writes out any lines still buffered for the last pattern, then
  # reports on standard error how many records received each mark.
  if (abort >= 0) { exit abort; }
  # No argument: the only parameter of "flush_pattern" is its local
  # loop index, and the mark is taken from the global "s".
  flush_pattern();
  printf "%7d records marked \"+\"\n", ct["+"] > "/dev/stderr";
  printf "%7d records marked \"-\"\n", ct["-"] > "/dev/stderr";
}

function start_pattern(pat, pos)
{
  # Resets the global accumulation state for a new pattern "pat"
  # whose first entry is at position "pos".  The caller must already
  # have flushed the lines of the current pattern, because the
  # buffer "clin" is discarded here.

  # Empty the line buffer.
  split("", clin);
  n = 0;

  # Nothing written yet, and the pattern is uninteresting until
  # proven otherwise.
  flushed = 0;
  s = "-";

  # Remember the pattern and the position of its first entry.
  cpat = pat;
  cpos = pos;
}

function flush_pattern(i)
{
  # Writes out the buffered lines "clin[0..n-1]", each followed by
  # the current mark "s", counting them in "ct[s]"; then sets
  # "flushed".  Does nothing if the buffer was already written out.
  # The parameter "i" is only a local loop index.
  if (flushed) { return; }
  i = 0;
  while (i < n)
    { print clin[i], s;
      ct[s]++;
      i++;
    }
  flushed = 1;
}