#! /usr/bin/gawk -f
# Last edited on 1999-01-06 06:43:08 by stolfi

# Usage: 
#
#  cat RAWMAP \
#    | format-word-location-map \
#        [-v nblocks=NBLOCKS] \
#        [-v maxlen=MAXLEN] \
#        [-v ctwd=CTWD] \
#        [-v html={0|1}] \
#        [-v title=TITLE] \
#        [-v blockHeadings=FILE] \
#        [-v totOnly={0|1}] \
#        [-v showProps={0|1}] \
#        [-v showPattern={0|1}] \
#        [-v showLineNumber={0|1}] \
#        [-v showAbsCounts={0|1}] \
#        [-v showRelCounts={0|1}] \
#        [-v showAvgPos={0|1}] \
#    > OUTFILE
#
# The script reads a raw word location map, as produced by
# make-word-location-map, consisting of records of the form
#
#  TOTCT XXX...XXX PATTN STRING TAG PNUM LOC OBS LANG
#
# and formats it nicely, adding the following fields:
#
#   AVP and DVP the mean and estimated standard deviation
#               of the block number for this STRING;
#
#   YYY...YYY   the per-block occurrence counts,
#               divided by the line's TOTCT.
#
# The following options request specific fields to be printed:
#
#   showAvgPos=1      prints the fields AVP and DVP
#
#   showAbsCounts=1   prints the absolute counts per block XXX...XXX
#
#   showRelCounts=1   prints the relative counts per block YYY...YYY
#
#   showPattern=1     prints the PATTN field in addition to the STRING.
#
#   showLineNumber=1  prints a sequential line number (starting from 0).
#
#   showProps=1       prints the fields LANG PNUM LOC OBS
#
# If TAG = "=" the line is interpreted as a pattern total,
# and formatted specially.
#
# If "totOnly" is 1 then only lines with TAG = "=" are printed.
#
# Blank lines in the input produce blank lines in the output.
# 
# Each per-block count is printed with CTWD bytes. If CTWD > 1 then
# the maximum value printed MAXCT is 10^(CTWD-1)-1, with at least one
# leading blank; else MAXCT is 9.  The percentages are scaled from [0%
# _ 100%] to [0 _ MAXCT] and rounded.

function html_beg_line(tg)
{
  # Opens an HTML <font> element whose color depends on the line tag "tg":
  #   "-"  black (#000000)
  #   "+"  brown (#993300)
  #   "="  dark blue (#003366)
  #   "h"  purple (#9900ff), used for heading rows
  #   any other tag: red-orange (#ff3300)
  # Only the opening tag is written; no newline is emitted.
  if (tg == "-") { printf "%s", "<font color=#000000>"; return; }
  if (tg == "+") { printf "%s", "<font color=#993300>"; return; }
  if (tg == "=") { printf "%s", "<font color=#003366>"; return; }
  if (tg == "h") { printf "%s", "<font color=#9900ff>"; return; }
  printf "%s", "<font color=#ff3300>";
}

function html_end_line(tg)
{
  # Writes the closing </font> tag for the current output line.
  # The tag argument "tg" is accepted but not used.
  printf "%s", "</font>";
}

function avp(c,   i, s, n)
{
  # Returns the average string position for histogram "c": the mean of
  # (i - 0.5) over all indices "i" of "c", weighted by the count c[i].
  # The counts must not sum to zero, since the result is a quotient
  # by their total.
  n = 0
  s = 0.0
  for (i in c)
    { n = n + c[i]
      s = s + c[i]*(i - 0.5)
    }
  return s/n
}

function dev(c, a,    i, d, bias, slop, ss, n)
{
  # Returns the estimated standard deviation of the string position,
  # given the histogram "c" and its average position "a".
  #
  # A bias term is added to the variance before the square root is taken.
  # It grows with (nblocks-1)/n, where "n" is the total count, so that
  # rare strings do not come out looking localized.
  # Reads the global "nblocks"; the counts in "c" must not sum to zero.
  n = 0
  ss = 0.0
  for (i in c)
    { d = (i-0.5) - a
      n += c[i]
      ss += c[i]*(d*d)
    }
  slop = (nblocks-1.0)/n
  bias = (slop*slop + 1.0)/12.0
  return sqrt(bias + ss/n)
}

function print_line(pa, st, tg, tt, ct, pn, lc, ob, lg,    av,dv,bn,rct)
{
  # Prints one output line for pattern "pa", string "st", tag "tg",
  # total count "tt", per-block counts "ct[0..nblocks-1]", p-number "pn",
  # location "lc", obs "ob", language "lg".
  #
  # Which fields appear is controlled by the global options showLineNumber,
  # showPattern, showAvgPos, showAbsCounts, showRelCounts and showProps;
  # when "html" is set the line is wrapped in a <font> element chosen by "tg".
  # Also computes the average position and deviation (if showAvgPos),
  # and increments the global line counter "line_count".
  if (html) html_beg_line(tg);
  if (showLineNumber) printf "%5d ", line_count;
  if (showPattern) printf "%-*s ", maxlen, pa;
  printf "%-*s ", maxlen, st;
  if (showAvgPos)
    { # The statistics must be taken from the histogram passed in as "ct".
      # avp and dev divide by the sum of the counts, so "ct" must not be
      # all zeros.
      av = avp(ct);
      dv = dev(ct, av);
      printf "%5.1f %5.1f ", av, dv;
    }
  printf "%5d ", tt;
  if (showAbsCounts) 
    { printf " ";
      for (bn=0; bn<nblocks; bn++) 
        { # Zero counts show as "."; large counts are clipped to maxct.
          if (ct[bn] == 0) printf "%*s", ctwd, "."
          else if (ct[bn] >= maxct) printf "%*d", ctwd, maxct
          else printf "%*d", ctwd, ct[bn]
        }
      printf " ";
    }
  if (showRelCounts) 
    { printf " ";
      for (bn=0; bn<nblocks; bn++) 
        { # Relative count scaled to [0 _ maxct] and rounded; the "+1" and
          # "+nblocks" terms smooth the ratio ct[bn]/tt.
          rct = int(((ct[bn]+1)*maxct)/(tt+nblocks) + 0.5)
          if (ct[bn] == 0) printf "%*s", ctwd, "."
          else if (rct >= maxct) printf "%*d", ctwd, maxct
          else printf "%*d", ctwd, rct
        }
      printf " ";
    }
  if (showProps) 
    { # A "-" in a property field means "absent": the language then
      # prints as "?" and the other properties as empty strings.
      printf " %1s", (lg == "-" ? "?" : lg);
      printf " %-4s", (pn == "-" ? "" : ("p" pn));
      printf " %-15s", (lc == "-" ? "" : lc);
      printf " %s", (ob == "-" ? "" : ob);
    }
  if (html) html_end_line(tg);
  printf "\n";
  line_count++;
}

function print_headings_major(    av,dv,bn)
{
  # Emits the upper heading row.  Every column is padded with blanks
  # except the per-block count groups, which are captioned "abs counts"
  # and "rel counts".  Field widths must stay in step with print_line.
  if (html) { html_beg_line("h"); }
  if (showLineNumber) { printf "%5s ", " "; }
  if (showPattern) { printf "%-*s ", maxlen, " "; }
  printf "%-*s ", maxlen, " ";
  if (showAvgPos)
    { printf "%5s ", " ";
      printf "%5s ", " ";
    }
  printf "%5s ", " ";
  if (showAbsCounts) { printf " %-*s ", nblocks*ctwd, "abs counts"; }
  if (showRelCounts) { printf " %-*s ", nblocks*ctwd, "rel counts"; }
  if (showProps) { printf " %1s %-4s %-15s %s", " ", " ", " ", " "; }
  if (html) { html_end_line("h"); }
  printf "\n";
}

function print_headings_minor(    av,dv,bn)
{
  # Emits the lower heading row, naming each column; the per-block
  # count columns are labelled with the two-digit block number.
  # Field widths must stay in step with print_line.
  if (html) { html_beg_line("h"); }
  if (showLineNumber) { printf "%5s ", "line"; }
  if (showPattern) { printf "%-*s ", maxlen, "pattern"; }
  printf "%-*s ", maxlen, "word(s)";
  if (showAvgPos)
    { printf "%5s ", "av.bl";
      printf "%5s ", "dv.bl";
    }
  printf "%5s ", "totct";
  if (showAbsCounts)
    { printf " ";
      bn = 0;
      while (bn < nblocks)
        { printf "%*s", ctwd, sprintf("%02d", bn);
          bn++;
        }
      printf " ";
    }
  if (showRelCounts)
    { printf " ";
      bn = 0;
      while (bn < nblocks)
        { printf "%*s", ctwd, sprintf("%02d", bn);
          bn++;
        }
      printf " ";
    }
  if (showProps) { printf " %1s %-4s %-15s %s", "L", "page", "location", "notes"; }
  if (html) { html_end_line("h"); }
  printf "\n";
}

function print_dashes(      i,bn)
{
  # Emits a separator row with a run of dashes under each field that
  # print_line produces.  Field widths must stay in step with print_line.
  if (html) { html_beg_line("h"); }
  if (showLineNumber) { printf "----- "; }
  if (showPattern)
    { i = 0;
      while (i < maxlen) { printf "-"; i++; }
      printf " ";
    }
  i = 0;
  while (i < maxlen) { printf "-"; i++; }
  printf " ";
  if (showAvgPos)
    { printf "----- ";
      printf "----- ";
    }
  printf "----- ";
  if (showAbsCounts)
    { printf " ";
      bn = 0;
      while (bn < nblocks)
        { # A one-byte column is a single dash; wider columns keep a
          # leading blank followed by ctwd-1 dashes.
          if (ctwd != 1)
            { printf " ";
              i = 1;
              while (i < ctwd) { printf "-"; i++; }
            }
          else
            { printf "-"; }
          bn++;
        }
      printf " ";
    }
  if (showRelCounts)
    { printf " ";
      bn = 0;
      while (bn < nblocks)
        { if (ctwd != 1)
            { printf " ";
              i = 1;
              while (i < ctwd) { printf "-"; i++; }
            }
          else
            { printf "-"; }
          bn++;
        }
      printf " ";
    }
  if (showProps) { printf " - ---- --------------- ------"; }
  if (html) { html_end_line("h"); }
  printf "\n";
}

function compute_block_column(bn,    skip)
{
  # Returns the 1-based screen column at which the count group for
  # block "bn" starts, i.e. one more than the total width of the fields
  # that print_line emits before it.  Must match print_line.
  # Assumes absolute and relative counts are printed in the same format.
  skip = (showLineNumber ? 6 : 0);               # line number
  skip = skip + (showPattern ? maxlen + 1 : 0);  # pattern
  skip = skip + (maxlen + 1);                    # string
  skip = skip + (showAvgPos ? 12 : 0);           # average block index and spread
  skip = skip + 6;                               # total count
  skip = skip + bn*ctwd;                         # previous blocks
  return skip + 1;
}

function html_block_headings(file,   lin, col, i, status)
{
  # Prints the page numbers (verticalized) over the per-block counts.
  # Each line of "file" is copied once above the absolute counts and/or
  # once above the relative counts, indented to the column where the
  # count groups start.  Does nothing if neither group is being shown.
  if (showAbsCounts || showRelCounts)
    { col = compute_block_column(0);
      # getline yields 1 per line read, 0 at end of file and -1 on failure;
      # keep the result so a read error can be told apart from end of file.
      while ((status = (getline lin < file)) > 0)
        { html_beg_line("h");
          for (i=1;i<col;i++) printf " ";
          if (showAbsCounts) { printf " "; printf "%s", lin; printf " "; }
          if (showRelCounts) { printf " "; printf "%s", lin; printf " "; }
          html_end_line("h");
          printf "\n";
        }
      # Report only genuine failures.  Comparing ERRNO with "0" is not
      # reliable because its value when no error occurred is not "0"
      # in every gawk version.
      if (status < 0) { error((file ": " ERRNO)); }
      close(file);
    }
}
      
function html_head(title)
{
  # Writes the HTML preamble (document title, page headings, and the
  # opening <font> and <pre> tags), followed by the table heading rows.
  # When the global "blockHeadings" is non-empty, a dash row and the
  # block headings read from that file go between the two heading rows.
  printf "<html>\n<head><title>Voynich Manuscript - %s</title></head>\n", title;
  printf "<body bgcolor=#ccffcc>\n<h1>Voynich Manuscript</h1>\n";
  printf "<h2>%s</h2>\n<font size=1>\n<pre>\n", title;
  print_headings_major();
  if (blockHeadings != "")
    { print_dashes();
      html_block_headings(blockHeadings);
    }
  print_headings_minor();
  print_dashes();
}

function html_tail()
{
  # Closes the <pre>, <font>, <body> and <html> elements, in that order.
  printf "</pre>\n</font>\n";
  printf "</body>\n</html>\n";
}

BEGIN {
  # Sets defaults for options not given with "-v", derives "maxct" from
  # the column width "ctwd", and writes the HTML header if requested.
  # NOTE(review): error() is not defined in the visible part of this
  # script; confirm that it is provided.
  abort = 0;
  prev_patt = "";
  split("", blct);
  if (maxlen == 0) { maxlen = 16; }
  if (nblocks == 0) { error("must specify \"-v nblocks\""); }
  if (title == "") { title = "Word occurrence map"; }
  if (ctwd == 0) { ctwd = 1; }
  # Largest count that fits: 9 in a one-byte column, otherwise
  # 10^(ctwd-1) - 1, which leaves one leading blank.
  maxct = 9;
  if (ctwd != 1)
    { maxct = 1;
      i = 1;
      while (i < ctwd) { maxct = 10*maxct; i++; }
      maxct = maxct - 1;
    }
  if (html) { html_head(title); }
}

/^ *$/ {
  # Lines that are empty or hold only spaces are copied through unchanged.
  print $0;
  next;
}

/./ {
  # Parses one raw map record
  #   TOTCT XXX...XXX PATTN STRING TAG PNUM LOC OBS LANG
  # and prints it.  A blank line separates consecutive patterns,
  # unless only totals are being shown.
  if (abort) { exit 1; }
  if (NF != nblocks + 8) { error("wrong number of fields"); }
  ttct = $1;
  i = 0;
  while (i < nblocks)
    { blct[i] = $(i+2);
      i++;
    }
  patt = $(nblocks+2);
  strn = $(nblocks+3);
  tagg = $(nblocks+4);
  pnum = $(nblocks+5);
  locn = $(nblocks+6);
  obss = $(nblocks+7);
  lang = $(nblocks+8);
  if (tagg != "=")
    { if (totOnly) next; }
  if (! totOnly)
    { if (prev_patt != "")
        { if (patt != prev_patt) printf "\n"; }
    }
  print_line(patt, strn, tagg, ttct, blct, pnum, locn, obss, lang);
  prev_patt = patt;
}

END {
  # Write the HTML trailer when HTML output was requested.
  if (html) { html_tail(); }
}