#! /usr/bin/gawk -f
# Last edited on 1999-07-20 04:47:42 by stolfi

# Usage: 
#
#  cat RAWMAP \
#    | format-label-parag-map \
#        -v nParags=NPARAGS \
#        [-v blockTable=BLOCKTABLE] \
#        [-v maxLen=MAXLEN] \
#        [-v countWidth=CTWD] \
#        [-v countSeparator=CHAR] \
#        [-v html={0|1}] \
#        [-v title=TITLE] \
#        [-v blockHeadings=TITLE] \
#        [-v totOnly={0|1}] \
#        [-v mostBlocks=FRACTION] \
#        [-v showPattern={0|1}] \
#        [-v showLineNumber={0|1}] \
#        [-v showAbsCounts={0|1}] \
#        [-v showRelCounts={0|1}] \
#        [-v showAvgPos={0|1}] \
#    > OUTFILE"
#
# The script reads a raw word location map, as produced by
# make-word-location-map, consisting of records of the form 
#
#  TOTCT COUNTS PATT STRING TAG
#  
# where the COUNTS are NPARAGS counts of occurrences of STRING or PATT
# per paragraph.
#
# This script reads a table that maps the paragraphs into "blocks",
# which could be pages, folios, bifolios, or other arbitrary units.
# The script condenses the COUNTS into ABSCTS of occurrences per block.
# The script prints for each line the following data, in
# order:
#
#   TAG LINE PATT STRING TOTCT BLTOT NMOST AVB ENT ABSCTS RELCTS
#       *    *                             *   *   *      *
#
# where TAG, PATT, STRING and TOTCT are as read from
# the input, and
#
#   LINE        is a sequential line number;
#
#   BLTOT       is the total occurrences in all listed blocks
#               which may be less than TOTCT (see below);
#
#   NMOST       is the minimum number of distinct blocks that account
#               for 80% or more of all occurrences in the listed blocks;
#
#   AVB         is the mean block index;
#
#   ENT         is the entropy of the word's distribution (bits);
#
#   ABSCTS      are the input COUNTS, condensed ber block;
#
#   RELCTS      are the ABSCTS divided by the line's TOTCT and
#               scaled to fit the output format.
#
# The output fields marked "*" are optional see below.
#
# If TAG = "=" the line is interpreted as a pattern total,
# and formatted specially.
#
# If "totOnly" is 1 then only lines with TAG = "=" are printed.
#
# Blank lines and #-lines in the input are ignored.
#
# Actually only a subset of the paragraph COUNTS is totalled and
# printed. The paragraphs to print and their order are read from the
# BLOCKTABLE. Each record of the BLOCKTABLE must contain fields
#
#    PARAG BLTAG
# 
# where PARAG is a paragraph number, as in the main input, and BLTAG
# is an arbitrary string identifying the block. The BLOCKTABLE must be
# sorted by BLTAG. Blank lines in this file generate a blank column in
# the corresponding place of the printout. #-lines are ignored. If the
# BLOCKTABLE is not specified, assumes each paragraph is a block by
# itself.
#
# The following options request specific fields to be printed:
#
#   showAvgPos=1      prints the fields AVG and DVP
#
#   showAbsCounts=1   prints the absolute counst per block XXX...XXX
#
#   showRelCounts=1   prints the relative counts per block YYY...YYY
#
#   showPattern=1     prints the PATT field in addition to the STRING.
#
#   showLineNumber=1  prints a sequential line number (starting from 0).
# 
# Each per-block count is printed with CTWD bytes. If CTWD > 1 then
# the maximum value printed MAXCT is 10^(CTWD-1)-1, with at least one
# leading blank; else MAXCT is 9.  The percentages are scaled from [0%
# _ 100%] to [0 _ MAXCT] and rounded.
#
#
#
#

function error(msg)
{ 
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function html_beg_line(tg)
{
  if (tg == "-")
    { printf "<font color=\"#000000\">"; }
  else if (tg == "+")
    { printf "<font color=\"#993300\">"; }
  else if (tg == "=")
    { printf "<font color=\"#003366\">"; }
  else if (tg == "h")
    { printf "<font color=\"#9900ff\">"; }
  else
    { printf "<font color=\"#ff3300\">"; }
}

function html_end_line(tg)
{
  printf "</font>";
}

function avp(c,   i, s, n)
{
  # Computes the average block index from the block count vector "c"
  s = 0.0;
  n = 0;
  for (i in c) { s += (i-0.5)*c[i]; n += c[i]; }
  return (n == 0 ? 0 : s/n);
}

function dev(c, a,    i, d, bias, slop, ss, n, q)
{
  # Computes the estimated standard deviation of the block index
  # from the per-block count vector "c" and average position "a"
  
  # The bias term tries to fix the deviation so that
  # rare strings do not come out looking localized.
  ss = 0.0;
  n = 0;
  for (i in c) 
    { d = (i-0.5) - a; ss += (d*d)*c[i]; n += c[i]; }
  slop = (nBlocks-1.0)/(n == 0 ? 1 : n);
  bias = (1.0 + slop*slop)/12.0;
  q = (n == 0 ? 0 : ss/n);
  return sqrt(q + bias);
}

function ent(c,    i,h,N,P,q)
{
  # Computes the entropy of the probability distribution 
  # implied by the histogram "c".
  
  h = 0.0;
  N = 0; P = 0;
  for (i in c) { P++; N += c[i]; }
  for (i in c) { q = c[i]; if(q > 1) { h += q*log(q); } }
  h /= (N*log(2));
  return h;
}

function nmost(c,tc,    i,j,b,nb,tb,sb,ci,tmp)
{
  # Computes the minimum number of entries in the given per-block
  # count vector "c" that add to at least "tc" occurrences.
  
  nb = 0;
  split("", b);  # "b[0..nb-1]" are the current block index set.
  tb = 0;        # Total of "c[b[0..nb-1]]". 
  sb = 0;        # "b[0..sb-1]" are sorted by c[b[i]], decreasing.
  
  for (i in c)
    { ci = c[i];
      if (ci > 0)
        { if (tb >= tc)
            { # Bubble the smallest entry in "b" to the last position:
              for (j=sb; j<nb; j++)
                { if (c[b[j-1]] < c[b[j]])
                    { tmp = b[j-1]; b[j-1] = b[j]; b[j] = tmp;
                      if ((sb == j) && (j > 1)) { sb = j-1; }
                    }
                }
              # Discard it: 
              nb--; tb -= c[b[nb]]; 
            }
          # Add this entry:
          b[nb] = i; tb += ci; nb++;
        }
    }
  if (tb < tc) { error("nmost: inconsistent tc"); }
  split("", b);
  return (nb);
}

function print_line(pa, st, tg, tt, ct,    tb,nm,av,dv,en,bn,pn,xc)
{
  # Prints a line for pattern "pa", string "st", tag "tg", total count "tt", 
  # per-block counts "ct[0..nBlocks-1]".
  # Also computes the number of blocks with most of the
  # occurrences, the average block index, and the entropy of the 
  # distribution.
  # Also increments the line counter.
  if (html) { html_beg_line(tg); } else { printf "%s ", tg; }
  if (showLineNumber) printf "%5d ", line_count;
  if (showPattern) printf "%-*s ", maxLen, pa;
  printf "%-*s ", maxLen, st;
  tb = 0; for(bn in ct) { tb += ct[bn]; }
  nm = nmost(ct, mostBlocks*tb);
  printf "%5d %5d %3d ", tt, tb, nm;
  if (showAvgPos)
    { av = avp(ct);
      # dv = dev(ct, av);
      en = ent(ct);
      printf "%5.1f %5.2f ", av, en;
    }
  if (showAbsCounts) 
    { printf " ";
      for (bn=0; bn<nBlocks; bn++) 
        { if (parags_in_block[bn] == 0)
            { xc = countSeparator; }
          else
            { if (ct[bn] == 0) { xc = "."; }
              else if (ct[bn] >= maxCount) { xc = maxCount; }
              else { xc = ct[bn]; }
            }
          printf "%*s", countWidth, xc;
        }
      printf " ";
    }
  if (showRelCounts) 
    { printf " ";
      for (bn=0; bn<nBlocks; bn++) 
        { if (parags_in_block[bn] == 0)
            { xc = countSeparator; }
          else
            { rct = int(((ct[bn]+1)*maxCount)/(tb+nBlocks) + 0.5);
              if (ct[bn] == 0) { xc = "."; }
              else if (rct >= maxCount) { xc = maxCount; }
              else { xc = rct; }
            }
          printf "%*s", countWidth, xc;
        }
      printf " ";
    }
  if (html) { html_end_line(tg); }
  printf "\n";
  line_count++;
}

function print_headings_major(    av,dv,bn)
{
  # Prints the major column headings.  
  # Must match print_line.
  if (html) { html_beg_line("h"); } else { printf "  "; }
  if (showLineNumber) printf "%5s ", " ";
  if (showPattern) printf "%-*s ", maxLen, " ";
  printf "%-*s ", maxLen, " ";
  printf "%5s %5s %3s ", " ", " ", " ";
  if (showAvgPos)
    { printf "%5s %5s ", " ", " "; }
  if (showAbsCounts)
    { printf " ";
      printf "%-*s", nBlocks*countWidth, "abs counts";
      printf " ";
    }
  if (showRelCounts) 
    { printf " ";
      printf "%-*s", nBlocks*countWidth, "rel counts";
      printf " ";
    }
  if (html) { html_end_line("h"); }
  printf "\n"
}

function print_headings_minor(    av,dv,bn)
{
  # Prints the minor column headings.  
  # Must match print_line.
  if (html) { html_beg_line("h"); } else { printf "t "; }
  if (showLineNumber) printf "%5s ", "line";
  if (showPattern) printf "%-*s ", maxLen, "pattern";
  printf "%-*s ", maxLen, "word(s)";
  printf "%5s %5s %3s ", "totct", "totbl", "mbl";
  if (showAvgPos)
    { printf "%5s %5s ", "av.bl", "en.bl"; }
  if (showAbsCounts)
    { printf " ";
      printf "%-*s", nBlocks*countWidth, "";
      printf " ";
    }
  if (showRelCounts) 
    { printf " ";
      printf "%-*s", nBlocks*countWidth, "";
      printf " ";
    }
  if (html) { html_end_line("h"); }
  printf "\n"
}

function print_dashes(      i,bn)
{
  # Prints dashes for all fields. 
  # Must match print_line.
  if (html) { html_beg_line("h"); } else { printf "- "; }
  if (showLineNumber) printf "----- ";
  if (showPattern) { for (i=0;i<maxLen;i++) printf "-"; printf " "; }
  for (i=0;i<maxLen;i++) printf "-"; printf " ";
  printf "----- ----- --- ";
  if (showAvgPos) { printf "----- ----- "; }
  if (showAbsCounts)
    { printf " ";
      for (bn=0; bn<nBlocks; bn++) 
        { if (parags_in_block[bn] == 0)
            { xc = countSeparator; }
          else if (countWidth == 1) 
            { xc = "-"; }
          else
            { xc = substr("---------------",1,countWidth-1); }
          printf "%*s", countWidth, xc;
        }
      printf " ";
    }
  if (showRelCounts) 
    { printf " "
      for (bn=0; bn<nBlocks; bn++) 
        { if (parags_in_block[bn] == 0)
            { xc = countSeparator; }
          else if (countWidth == 1) 
            { xc = "-"; }
          else
            { xc = substr("---------------",1,countWidth-1); }
          printf "%*s", countWidth, xc;
        }
      printf " ";
    }
  if (html) { html_end_line("h"); }
  printf "\n"
}

function compute_block_column(bn,    skip)
{
  # Returns the screen column of the first char in the count for block bnum.
  # Must match print_line.
  # Assumes absolute and relative counts are printed in the same format.
  skip = 0;
  if (! html) skip += 2;                # tag;
  if (showLineNumber) skip += 6;        # line number
  if (showPattern) skip += maxLen + 1;  # pattern
  skip += maxLen + 1;                   # string
  skip += 16;                           # total count, total shown, num most blocks.
  if (showAvgPos) skip += 12;           # average block index and spread
  skip += bn*countWidth;                      # previous blocks
  return 1 + skip;
}

function print_block_headings(file,   lin, col, i)
{
  # Prints headers (verticalized) 
  # over the per-block counts.
  if (showAbsCounts || showRelCounts)
    { col = compute_block_column(0);
      while ((getline lin < file) > 0)
        { if (html) { html_beg_line("h"); }
          for (i=1;i<col;i++) printf " ";
          if (showAbsCounts) { printf " "; printf "%s", lin; printf " "; }
          if (showRelCounts) { printf " "; printf "%s", lin; printf " "; }
          if (html) { html_end_line("h"); }
          printf "\n";
        }
      if (ERRNO != "0") { error((file ": " ERRNO)); }
    }
}
      
function print_head(title)
{
  if (html)
    { printf "<html>\n";
      printf "<head><title>Voynich Manuscript - %s</title></head>\n", title;
      printf "<body bgcolor=\"#ccffcc\">\n";
      printf "<h1>Voynich Manuscript</h1>\n";
      printf "<h2>%s</h2>\n", title;
      printf "<font size=1>\n";
      printf "<pre>\n";
    }
  else
    { printf "# %s\n", title;
      printf "\n";
    }
  print_headings_major();
  if (blockHeadings != "")
    { print_dashes(); print_block_headings(blockHeadings); print_dashes(); }
  print_headings_minor();
  print_dashes();
} 

function print_tail()
{
  if (html)
    { printf "</pre>\n";
      printf "</font>\n";
      printf "</body>\n";
      printf "</html>\n";
    }
  print_dashes();
} 

function load_block_table(file,  lin,pn,bt,bn,fld,nfld,bft)
{
  # Loads the (partial) mapping of paragraphs into blocks,
  # and the block order.  Defines "nBlocks", the tables
  # "block_from_parag" and "parags_in_block".
  nBlocks = 0;
  split("", parags_in_block);
  split("", block_from_parag);
  split("", bft);
  while (getline lin < file)
    { if (! match(lin, /^[#]/))
        { nfld = split(lin, fld);
          if (nfld == 0)
            { # dummy block
              bn = nBlocks; nBlocks++;
              parags_in_block[bn] = 0;
            }
          else if (nfld == 2)
            { pn = fld[1];
              bt = fld[2];
              if (bt in bft) 
                { bn = bft[bt]; }
              else
                { bn = nBlocks; nBlocks++;
                  bft[bt] = bn; parags_in_block[bn] = 0;
                }
              block_from_parag[pn] = bn;
              parags_in_block[bn]++;
            }
        }
    }
  if (ERRNO != "0") { error((file ": " ERRNO)); }
}

function make_default_block_table(   pn,bn,pi)
{
  # Creates the default list of the paragraphs that should be printed 
  # and their order.  Defines "nBlocks" and the 
  # tables "parags_in_block" and "block_from_parag".
  nBlocks = nParags;
  split("", parags_in_block);
  split("", block_from_parag);
  for (pi = 1; pi <= nParags; pi++)
    { pn = sprintf("%04d", pi);
      bn = pi-1; 
      block_from_parag[pn] = bn;
      parags_in_block[bn] = 1;
    }
}

BEGIN { 
  abort = -1;
  if (maxLen == 0) maxLen=16; 
  if (nParags == "") { error("must specify \"-v nParags\""); }
  if (title == "") { title = "Word-paragraph occurrence map"; }
  if (blockTable != "") 
    { load_block_table(blockTable); }
  else
    { make_default_block_table(); }
  if (countWidth == 0) { countWidth = 1; }
  if (countWidth == 1) 
    { maxCount = 9; }
  else
    { maxCount=1; 
      for (i=1;i<countWidth;i++) { maxCount = maxCount*10; }
      maxCount--;
    } 
  print_head(title);
  split("", blct);
  prev_patt = "";
}

/^[#]/ { next; }

/^ *$/ { next; }

/./ {
  if(abort >= 0) exit abort;
  if (NF != 4 + nParags) error("wrong number of fields");
  ttct = $1;
  patt = $(nParags+2);
  strn = $(nParags+3);
  tagg = $(nParags+4);
  if (totOnly && (tagg != "=")) next;
  for (bn=0; bn <= nBlocks; bn++) { blct[bn] = 0; }
  for (pi=1; pi <= nParags; pi++) 
    { pn = sprintf("%04d", pi);
      if (pn in block_from_parag)
        { bn = block_from_parag[pn];
          blct[bn] += $(pi+1);
        }
    }
  if ((patt != prev_patt) && (prev_patt != "") && (! totOnly)) printf "\n";
  print_line(patt, strn, tagg, ttct, blct);
  prev_patt = patt;
}

END { 
  if (abort >= 0) { exit abort; }
  print_tail();
}