#! /usr/bin/gawk -f

# Usage: 
#
#  cat INFILE \
#    | make-word-location-map \
#        [-v nblocks=NBLOCKS] \
#        [-v totOnly={0|1}] \
#        [-v omitSingles={0|1}] \
#    > RAWMAP
#
# This script reads from standard input a list of phrase
# occurrence records of the form
#
#   PNUM FNUM UNIT LINE TRANS START LENGTH POS STRING OBS LANG BLOCK PATT TAG
#   1    2    3    4    5     6     7      8   9      10  11   12    13   14
#
# where 
#   
#   PNUM    is the sequential page number, eg "125";
#
#   FNUM    is the folio-based page number, eg "f86v2";
#
#   UNIT    is the text unit within the page, eg "S1" or "P";
# 
#   LINE    is the line number within the UNIT, eg "0a" or "23";
#
#   TRANS   is the transcriber's code, eg. "F" for Friedman;
#
#   START   is not used
# 
#   LENGTH  is not used
#              
#   POS     is not used
# 
#   STRING  is a string occurring in the text e.g. "qokeedy.dy"
#
#   OBS     is an arbitrary non-empty comment string (without embedded blanks).
#
#   LANG    is the language in Currier's sense ("A", "B", or "?");
#
#   BLOCK   is the index of a block (map column), counting from 0;
#
#   PATT    is an abstraction of STRING, e.g. "oteedodo";
#
#   TAG     is "+" for a "special" occurrence, "-" for an "ordinary" one
#
# The file should be sorted by PATT, TAG, and STRING.
# Moreover, if two records have the same STRING they should 
# have the same PATT too.
#
# The script prints a table of the form 
#
#  TOTCT XXX...XXX PATT STRING TAG PNUM LOC OBS LANG
#
# where 
#
#   TOTCT       is a total number of occurrences for this line;
#
#   XXX...XXX   are occurrence counts for each block;
#
#   PATT        is a string pattern from the input file;
#
#   STRING      is a string from the input file;
#
#   TAG         is the corresponding TAG, or "=" for total lines;
# 
#   PNUM        is the sequential page number, or "-".
#     
#   LOC         is the full location code "FNUM.UNIT.LINE;TRANS", or "-".
#
#   OBS         is the same as in the input file.
#
#   LANG        is the same as in the input file.
#
# The TOTCT field is printed in fixed format "%6d", each block
# count in fixed format " %2d", and the remaining fields in free
# format, separated by single spaces.
#
# The script prints one of these lines for each "special" occurrence
# (TAG != "-").  All ordinary occurrences (TAG == "-") of the same
# STRING are combined into one line, whose PNUM, LOC and OBS
# are taken from the first such line.  
#
# Also, all ordinary occurrences with the same PATT are added and
# printed as a separate pattern-total line (TAG "=", with "~" appended
# to STRING), whose STRING, PNUM, LOC, OBS and LANG are taken from the
# most popular STRING of that PATT.
#
# If "totOnly" is 1 then only the pattern-total lines above are printed.
# If "totOnly" is 0 then a pattern-total line is printed only if it
# combines two or more different STRINGs.
#
# If "omitSingles" is 1 then PATTs that occur only once as 
# ordinary strings are entirely omitted from the output.

function error(msg)
{ 
  # Reports a fatal problem and terminates the script.
  #   msg  - text of the diagnostic.
  # The message goes to "/dev/stderr", prefixed by the current input
  # record number NR (which is 0 when called from BEGIN).  The global
  # "abort" is set so that other rules can tell that the run failed,
  # and the script exits with status 1 so that callers see the failure
  # (a bare "exit" would report success).
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function print_line(pa, st, tg, tt, ct, pn, lc, ob, lg,    k, row)
{
  # Writes one output row to standard output:
  #   pa  - pattern             st  - string          tg  - tag
  #   tt  - total count         ct  - per-block counts, ct[0..nblocks-1]
  #   pn  - page number         lc  - location code
  #   ob  - obs comment         lg  - language
  # Empty "pn", "lc" and "ob" are shown as "-"; an empty "lg" as "?".
  # As a side effect, records "pa" in the global "last_pat_printed".

  # Fixed-width numeric part: total count, then one column per block.
  row = sprintf("%6d", tt);
  for (k = 0; k < nblocks; k++) { row = row sprintf(" %2d", ct[k]); }

  # Placeholders for missing free-format fields (the parameters are
  # scalars, so these assignments stay local to this call).
  if (pn == "") { pn = "-"; }
  if (lc == "") { lc = "-"; }
  if (ob == "") { ob = "-"; }
  if (lg == "") { lg = "?"; }

  printf "%s %s %s %s %s %s %s %s\n", row, pa, st, tg, pn, lc, ob, lg;
  last_pat_printed = pa;
}

# We have a buffer for the current string, and one for the current pattern.
# "patt" is the current pattern, "pat_XX" are its attributes.
# "strn" is the current string,  "str_XX" are its attributes.

function dump_str_buffer(   k, hidden)
{
  # Flushes the string buffer ("strn" and its "str_XX" attributes):
  # updates the pattern's record of its most popular ordinary string,
  # prints the string's row when the options allow it, and then
  # resets the string accumulators.  Does nothing but the reset when
  # no string is pending (strn == "").
  if (strn != "")
    { 
      # An ordinary string with a new highest count becomes the
      # representative of the current pattern:
      if (! str_is_special)
        { if (str_tt > pat_max_tt)
            { pat_max_tt = str_tt;
              pat_st = strn;
              pat_pn = str_pn;
              pat_lc = str_lc;
              pat_ob = str_ob;
              pat_lg = str_lg;
            }
        }

      # The row is suppressed in totals-only mode, and also when
      # singles are being omitted and this is an ordinary string of
      # a pattern flagged as single:
      hidden = (totOnly || (omitSingles && (! str_is_special) && pat_is_single));
      if (! hidden)
        { # A blank line separates the rows of different patterns:
          if ((last_pat_printed != "") && (last_pat_printed != patt)) { printf "\n"; }
          print_line(patt, strn, str_tg, str_tt, str_ct, str_pn, str_lc, str_ob, str_lg);
        }
    }

  # Reset the string accumulators:
  str_tt = 0;
  for (k = 0; k < nblocks; k++) { str_ct[k] = 0; }
  strn = "";
}

function dump_pat_buffer(   i)
{
  # Flushes the pattern buffer ("patt" and its "pat_XX" attributes):
  # prints the pattern-total row when appropriate, then resets the
  # pattern accumulators.
  #
  # The total row is printed only if the pattern has at least one
  # ordinary occurrence (pat_tt != 0) and either "totOnly" is set or
  # the pattern counted more than one string (pat_ns > 1).  When
  # "omitSingles" is set, a pattern with exactly one ordinary
  # occurrence gets no total row either, so that such patterns are
  # left out in "totOnly" mode as well.
  #
  # The row carries the most popular ordinary string with "~"
  # appended, and "=" as its tag.
  if ((pat_tt != 0) && (totOnly || (pat_ns > 1)) && (! (omitSingles && (pat_tt == 1))))
    { print_line(patt, (pat_st "~"), "=", pat_tt, pat_ct, pat_pn, pat_lc, pat_ob, pat_lg); }

  # Clear pattern accumulators:
  patt = "";
  for (i=0; i<nblocks; i++) pat_ct[i] = 0;
  pat_tt = 0;
  pat_ns = 0;
  pat_max_tt = 0;
}

BEGIN { 
  # Validates the command-line options and initializes the global
  # string buffer, pattern buffer and printing state.
  abort = 0;

  # "nblocks" must be a positive number.  Forcing it to a number
  # before the test also rejects an unset, negative or non-numeric
  # value, not just a literal 0.
  if ((nblocks + 0) < 1) error("must specify \"-v nblocks\""); 

  # String buffer: no current string, empty per-block counts.
  strn = ""; 
  split("", str_ct); str_tt = 0;

  # Pattern buffer: no current pattern, empty per-block counts,
  # no strings counted, no most-popular string yet.
  patt = "";
  split("", pat_ct); pat_tt = 0; pat_ns = 0;
  pat_max_tt = 0;

  # No row has been printed so far.
  last_pat_printed = "";
}

/./ {
  # Processes one non-empty input record: validates it, flushes the
  # string and pattern buffers when the record starts a new string or
  # pattern, and tallies the occurrence.

  # Defensive check; error() exits, so this is normally not reached
  # with "abort" set.
  if(abort) exit 1;
  if (NF != 14) error("wrong number of fields");
  
  # At this point we still have "current" data that hasn't been printed.
  # "str_is_special" says whether the current string data can be added with 
  # other data for the same string. 
  
  pn = $1;
  lc = ($2 "." $3 "." $4 ";" $5);
  st = $9;
  ob = $10;
  lg = $11;
  pa = $13;
  tg = $14;

  # BLOCK must be written as a plain non-negative integer.  The text
  # of the field is checked before conversion, because converting
  # first would silently turn a non-numeric field into block 0.
  if ($12 !~ /^[0-9]+$/) 
    { error("bad block number = " $12); }
  bn = ($12 + 0);
  if (bn >= nblocks) 
    { error("bad block number = " bn); }
  
  # A new buffer is started when the pattern or the string changes,
  # and also after every special occurrence, since those are never
  # combined with other records.
  if ((pa != patt) || (st != strn) || str_is_special)
    { 
      # Decide whether the current pattern has a single ordinary occurrence:
      pat_is_single = ((pat_tt == 1) && (patt != pa));
      
      # Print string data, if any, and reset counters:
      dump_str_buffer();

      # If pattern changed, print pattern data too, and reset counters: 
      if (pa != patt)
        { dump_pat_buffer();
          patt = pa;
        }
      
      # Start the buffer for the new string from this record:
      strn = st;
      str_tg = tg;
      str_pn = pn;
      str_lc = lc;
      str_ob = ob;
      str_lg = lg;
      str_is_special = (str_tg != "-");

      # Count the distinct ordinary strings of the pattern; special
      # occurrences do not take part in the pattern total.
      if (! str_is_special) { pat_ns++; }
    }
    
  # Tally this occurrence:
  str_ct[bn]++;  str_tt++;
  if (str_is_special) 
    { # Flag kept from the original code; nothing in this file reads it.
      patt_is_special = 1; }
  else
    { pat_ct[bn]++; pat_tt++; }
}

END {
  # Flushes whatever is still buffered once the input is exhausted.

  # After a reported error, do not print partial results; just make
  # sure the script ends with a failure status.
  if (abort) exit 1;

  # Every record of the last pattern has been tallied by now, so
  # decide here whether it has a single ordinary occurrence; the
  # value left over from the previous buffer switch would describe
  # an earlier pattern.
  pat_is_single = (pat_tt == 1);

  dump_str_buffer();
  dump_pat_buffer();
}