#! /usr/bin/gawk -f
# Last edited on 1999-07-20 02:59:44 by stolfi

# Usage: 
#
#  cat INFILE \
#    | make-word-parag-map \
#        -v nParags=NPARAGS \
#        [-v totOnly={0|1}] \
#        [-v omitSingles={0|1}] \
#    > RAWMAP
#
# This script reads from standard input a list of word
# occurrence records of the form
#
#   PATT STRING TAG PARAG
#   1    3      2   4
#
# where 
#
#   PATT    is a string pattern (equivalence class), e.g. "oteedodo";
#
#   STRING  is the string as it occurred in the text, e.g. "qokeedy.dy";
#
#   TAG     is an occurrence type code, e.g. "+" for label, "-" for text;
#   
#   PARAG   is the sequential paragraph number, e.g. "125" (from 1 to NPARAGS).
#
# The file should be sorted by PATT, STRING, and TAG.
# Moreover, if two records have the same STRING they should 
# have the same PATT too.
#
# The script counts all occurrences of the same STRING with the same
# TAG, per paragraph, and prints for it a line of the form
#
#   TOTCT XXX...XXX PATT STRING TAG
#
# where 
#
#   TOTCT       is the total number of occurrences for STRING and TAG;
#
#   XXX...XXX   are the occurrence counts for each paragraph;
#
#   PATT        is the corresponding pattern from the input file;
#
#   STRING      is the string from the input file;
#
#   TAG         is the input TAG;
#
# Also, all occurrences with the same PATT are added and
# printed as a separate pattern-total line with TAG "=".
# The STRING is then the most popular STRING of this PATT,
# with "~" appended.
#
# If "totOnly" is 1 then only the pattern-total lines above are printed.
# If "totOnly" is 0 then a pattern-total line is printed only if it
# combines two or more different STRINGs.
#
# If "omitSingles" is 1 then the line of a STRING is not printed when
# its PATT has only one occurrence in total (so that TOTCT = 1).

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input record
  # number, then terminates the script with exit status 1.
  # The global "abort" is set to that status before exiting, so that any
  # rule that still runs afterwards can test it and bail out as well.
  abort = 1;
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  exit abort;
}

function print_line(pa, st, tg, tt, ct,    k, row)
{
  # Writes one output record: the total count "tt", the per-paragraph
  # counts "ct[1..nParags]", then pattern "pa", string "st" and tag "tg".
  # A blank line is written first whenever "pa" differs from the pattern
  # of the previously printed record, so records are grouped by pattern.
  # Updates the global "last_pat_printed".
  if ((last_pat_printed != "") && (last_pat_printed != pa)) printf "\n";

  # Assemble the count columns, then emit the whole record at once:
  row = sprintf("%6d", tt);
  for (k = 1; k <= nParags; k++) row = row sprintf(" %2d", ct[k]);
  printf "%s %s %s %s\n", row, pa, st, tg;

  last_pat_printed = pa;
}

# We have a buffer for the current string, and one for the current pattern.
# "patt" is the current pattern, "pat_XX" are its attributes.
# "strn" is the current string,  "str_XX" are its attributes.

function dump_str_buffer(   k)
{
  # Flushes the buffered string entry (if any), then zeroes the string
  # accumulators "strn", "str_tt" and "str_ct[1..nParags]".
  # Reads the global flag "pat_is_single", which the caller must set
  # beforehand.
  if (strn != "")
    {
      # Track the highest-count string seen so far for this pattern
      # (the first one wins ties):
      if (pat_max_tt < str_tt) { pat_max_tt = str_tt; pat_st = strn; }

      # The string line is suppressed in "totOnly" mode, and also when
      # "omitSingles" is on and the pattern was flagged as single:
      if (! (totOnly || (omitSingles && pat_is_single)))
        { print_line(patt, strn, str_tg, str_tt, str_ct); }
    }

  # Reset for the next string:
  str_tt = 0;
  for (k = 1; k <= nParags; ++k) str_ct[k] = 0;
  strn = "";
}

function dump_pat_buffer(   k)
{
  # Flushes the pattern-total line (if due), then zeroes the pattern
  # accumulators "patt", "pat_tt", "pat_ns", "pat_max_tt" and
  # "pat_ct[1..nParags]".
  # The line is written only when the pattern has a non-zero total and,
  # unless "totOnly" is set, only when it combined more than one string.
  # It is labelled with the most popular string plus "~" and has tag "=".
  if (pat_tt != 0)
    { if (totOnly || (pat_ns > 1))
        { print_line(patt, (pat_st "~"), "=", pat_tt, pat_ct); }
    }

  # Reset for the next pattern:
  pat_max_tt = 0;
  pat_ns = 0;
  pat_tt = 0;
  for (k = 1; k <= nParags; ++k) pat_ct[k] = 0;
  patt = "";
}

BEGIN { 
  # "abort" is negative while all is well; error() sets it to the
  # exit status, which the other rules test before doing any work.
  abort = -1;

  # "nParags" must be a positive integer.  A missing, zero, negative or
  # non-numeric value is rejected here: a non-numeric value would turn
  # the "i <= nParags" loop tests into string comparisons that never end.
  # Surrounding blanks and a leading "+" are tolerated.
  if ((nParags !~ /^[ \t]*[+]?[0-9]+[ \t]*$/) || (nParags + 0 < 1))
    { error("must specify \"-v nParags=NNNN\""); }
  # Force a numeric value so that all later comparisons are numeric:
  nParags += 0;

  # Current-string buffer:
  strn = ""; str_tg = "";
  split("", str_ct); str_tt = 0;

  # Current-pattern buffer:
  patt = ""; pat_st = "";
  split("", pat_ct); pat_tt = 0; pat_ns = 0;
  pat_max_tt = 0;
  pat_is_single = 0;

  # Pattern of the last record written by print_line():
  last_pat_printed = "";
}

/./ {
  # Processes one occurrence record "PATT STRING TAG PARAG" (empty lines
  # are skipped by the rule pattern).  When the record starts a new
  # (PATT, STRING, TAG) group, the buffered group is flushed first; then
  # the occurrence is added to the string and pattern accumulators.
  if (abort >= 0) exit abort;
  if (NF != 4) error("wrong number of fields");
  
  # At this point we still have "current" data that hasn't been printed.
  
  pa = $1;
  st = $2;
  tg = $3;
  pn = $4;
  
  # The paragraph number must consist of digits only (the pattern is
  # anchored at both ends, so that e.g. "12abc" is rejected) and must
  # lie in 1..nParags:
  if (pn !~ /^[0-9]+$/) { error("bad parag number = " pn); }
  pi = pn + 0;
  if ((pi <= 0) || (pi > nParags)) { error("bad parag number = " pn); }
  
  # Work out what changed BEFORE flushing, since flushing clears the
  # "patt" and "strn" buffers:
  new_patt = (pa != patt);
  new_strn = (new_patt || (st != strn));

  # A change of TAG alone also starts a new output line, so that counts
  # of the same STRING under different TAGs are not merged:
  if (new_strn || (tg != str_tg))
    { 
      # Decide whether the pattern being closed has a single occurrence:
      pat_is_single = ((pat_tt == 1) && new_patt);
      
      # Print string data, if any, and reset counters:
      dump_str_buffer();

      # If pattern changed, print pattern data too, and reset counters: 
      if (new_patt)
        { dump_pat_buffer();
          patt = pa;
        }
      
      strn = st;
      str_tg = tg;
      # "pat_ns" counts distinct STRINGs, so a TAG-only change of the
      # same STRING does not increment it:
      if (new_strn) pat_ns++;
    }
    
  # Tally this occurrence:
  str_ct[pi]++; str_tt++;
  pat_ct[pi]++; pat_tt++;
}

END {
  # Flushes the data still buffered for the last string and pattern,
  # unless an error was reported (then just propagate the exit status).
  if (abort >= 0) exit abort;

  # "pat_is_single" was last set when the final string started, and so
  # still describes the PREVIOUS pattern.  End of input closes the current
  # pattern, so recompute the flag for it before flushing; otherwise the
  # last string line could be wrongly omitted (or wrongly printed) when
  # "omitSingles" is on.
  pat_is_single = (pat_tt == 1);

  dump_str_buffer();
  dump_pat_buffer();
}