#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:55:51 by stolfi

# Colorizes the output of "extract-signif-chars" by a tuple-indexed table.

BEGIN {
  usage = ( \
      "show-local-entropy \\\n" \
      "  -v order=ORDER \\\n" \
      "  -v place=PLACE \\\n" \
      "  -v table=TABLE \\\n" \
      "  [ -v filler=CHAR ] \\\n" \
      "  [ -v lowercase=BOOL ] \\\n" \
      "  -v hmin=HMIN \\\n" \
      "  -v hmax=HMAX \\\n" \
      "  [ -v ymin=YMIN ] \\\n" \
      "  [ -v ymax=YMAX ] \\\n" \
      "  [ -v colors=NCOLORS ] \\\n" \
      "  < SIGPAGE > COLPAGE" \
    );
  #
  # Colorizes letters in a text based on the local conditional entropy.
  #
  # The file TABLE must have entries VALUE WORD where VALUE is a real
  # value, and WORD a string of letters with length = ORDER.
  #
  # The input file SIGPAGE must have been produced by
  # extract-signif-chars.  
  # Only the "significant" letters (class 3) and breaks in SIGPAGE are
  # considered.  A word break (class 1)
  # is replaced by the FILLER character, and a paragraph break (class
  # 2) is replaced by ORDER-1 FILLERs.  If "lowercase"
  # is set, class 3 characters are downcased before comparisons
  # and tabulations.
  #  
  # Let "txt[0...N-1]" be the sequence of significant characters
  # (class 3) from SIGPAGE, plus the FILLERs resulting from break
  # expansion.  The program will compute from them a string of numbers
  # "val[0..N-1]", as follows: starting with all zero "val"s, whenever
  # some substring "txt[i..i+ORDER-1]" matches some WORD from the
  # TABLE file, this program will set "val[i+PLACE-1]" to the
  # corresponding VALUE.
  #
  # The final value of "val[i]" is then used to colorize the character
  # "txt[i]" in the standard output (using HTML formatting). Values in
  # the range HMIN to HMAX (clipped) are mapped to NCOLORS discrete
  # colors with varying hues and increasing intensities in the range
  # YMIN to YMAX (in [0 _ 1]) (plus two colors for out-of-range
  # values).
  #
  # Non-significant characters (class 0) are printed in the
  # appropriate places, with the default color, but are skipped over
  # when forming tuples.

  abort = -1;
  check_options();
  read_value_table(table, order);
  
  # Color table:
  make_color_table(colors, hmin, hmax, ymin, ymax);
  
  current_color = "";

  init_tup();
}
    
/^[0]/{
  if (abort >= 0) { exit(abort); }
  push_deco(decode(substr($0,2)));
  next;
}

/^[1]/{
  if (abort >= 0) { exit(abort); }
  push_char(filler, "_");
  push_deco(decode(substr($0,2)));
  next;
}

/^[2]/{
  if (abort >= 0) { exit(abort); }
  for (i=1;i<order;i++) { push_char(filler, "_"); }
  if (NR == 1) { push_deco("\n"); }
  push_deco(decode(substr($0,2)));
  next;
}

/^[3]/{
  c = substr($0,2,1); m = map[c];
  if (m == filler) { error(("\"filler\" character found on input")); }
  push_char(m, c);
  next;
}

END {
  if (abort >= 0) { exit(abort); }
  flush_tup();
  # close last <font> directive
  print_text("\n", ""); 

  print_color_table(colors, xcolor, tcolor);
}

function init_tup(   i)
{
  tup = "";
  for (i=1;i<order;i++) { tup = (tup " "); }
  split("", val); split("", ext); split("", glu);
  wait = order-1;
}

function push_char(m, c,   i,v,ic)
{
  # The argument "m" must be one character.  Appends the character "m"
  # to the "tup" buffer, and remembers its external representation is
  # "c".
  #
  # Specifically, sets "ext[order] = c, "glu[order] = "", val[order] =
  # 0".  Then fetches the "val" corresponding to "tup" from the table,
  # distributes it over "val[1..order]".  Finally outputs the first
  # char of "tup", "ext[1]", "glu[1]", and deletes them.
  
  # printf "push_char(\"%s\", \"%s\")\n", m,c > "/dev/stderr";
  if (m == "") { error("push_tup: empty m"); }
  
  # extend current tuple with new character:
  tup = (tup m); ext[order] = c; glu[order] = ""; val[order] = 0;
  if (wait == 0)
    { 
      # Now "tup" must have "order" characters. 
      # Find its value "v" and insert it in the "val" buffer:
      if (tup in vtable) { v = vtable[tup]; } else { v = hmax; }
      val[place] = v;
      # write out the first character of "tup":
      print_tup_head();
    }
  else
    { wait--; }
  # Shift buffer left by 1:
  pop_tup();
}

function push_deco(s)
{
  if (order > 1) 
    { glu[order-1] = (glu[order-1] s); }
  else
    { print_text(s, ""); }
}

function print_tup_head(  v)
{
  # Prints fist character in "tup" buffer, and associated glue:
  print_text(ext[1], xcolor_from_value(val[1], colors, hmin, hmax));
  print_text(glu[1], "");
}

function pop_tup(  i)
{
  # shift out first position of "tup" buffer and acessories:
  tup = substr(tup,2,order-1);
  for (i=1; i<order; i++) 
    { ext[i] = ext[i+1]; glu[i] = glu[i+1]; val[i] = val[i+1]; }
  delete ext[order]; delete glu[order]; delete val[order];
}

function flush_tup(   i)
{
  for (i=1;i<order;i++) 
    { if (substr(tup,1,1) != filler) { error(("internal error 1")); }
      print_tup_head();
      pop_tup();
    }
}

function decode(str)
{
  gsub(/\015/, "\n", str);
  return str;
}

function print_text(str, color)
{ 
  if (str != "") 
    { if (current_color != color)
        { if (current_color != "") { printf "</font>"; }
          if (color != "") { printf "<font color=#%s>", color; }
          current_color = color;
        }
      gsub(/[&]/, "\\&amp;", str);
      gsub(/[<]/, "\\&lt;", str);
      gsub(/[>]/, "\\&gt;", str);
      printf "%s", str;
    }
}

function check_options(   i,c,mk,ucs,lcs,uc,lc)
{
  # Analyzes/defaults the option variables, namely
  #
  #   "order" "place"
  #   "table"
  #   "filler" "lowercase"
  #   "hmin" "hmax"
  #   "ymin" "ymax" "colors"
  #
  # Defines the global variable "map" that maps characters to lowercase 
  # if so desired.
  
  if (order == "") { error("should define \"order\""); } 
  if ((order < 1) || (order > 20)) { error("funny \"order\""); } 
    
  if (place == "") { error("should define \"place\""); } 
  if ((place < 1) || (place > order)) { error("funny \"place\""); } 
    
  if (filler == "") { filler = "_"; }
  if (length(filler) != 1)
    { error(("the \"filler\" should be a single char")); }

  # --- lowercase mapping ----------------------------------------------
  split("", map);
  for (i=0;i<256;i++) { c = sprintf("%c", i); map[c] = c; }
  
  if (lowercase == "") { lowercase = 1; }
  if (lowercase > 0) 
    { ucs = "ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦШЩЪЫЬЭЮ";
      lcs = "abcdefghijklmnopqrstuvwxyzабвгдежзийклмнопрстуфхцшщъыьэю";
      for (i=1;i<=length(ucs);i++)
        { uc = substr(ucs,i,1); lc = substr(lcs,i,1);
          map[uc] = lc;
        }
    }

  if (table == "") { error("should define \"table\""); }  

  if (hmin == "") { error("should define \"hmin\""); } 
  if (hmax == "") { error("should define \"hmax\""); } 
  if (hmin >= hmax) { error("funny \"hmin\",\"hmax\""); } 
  
  if (ymin == "") { ymin = 0.25; } 
  if (ymax == "") { ymax = 1.00; } 

  if (colors == "") { colors = 9; } 
}

function read_value_table(table, order,   lin,fld,v,w)
{
  # Reads the table "vtable" that maps tuples to values.

  while ( getline lin < table )
    { split(lin, fld);
      if ((3 in fld) || !(2 in fld)) { error("file " table " line " NR ": bad format"); }
      v = fld[1];
      w = fld[2];

      if (length(w) != order) 
        { error("file " table " line " NR ": wrong word length"); } 

      if (v !~ /^[-+]?[0-9]+[.]?[0-9]*$/) 
        { error("file " table " line " NR ": bad value"); } 
      vtable[w] = v;
    }
}

function error(msg)
{ printf "%s\n", msg >>  "/dev/stderr";
  abort = 1; exit 0;
}

function xcolor_from_value(v, colors, hmin, hmax,   i)
{
  # Maps a letter value "v" in "[hmin _ hmax]" 
  # to an X color value:
  if (v < hmin)
    { return xcolor[-1]; }
  else if (v >= hmax)
    { return xcolor[colors]; }
  else
    { i = int(colors*(v - hmin)/(hmax - hmin)); 
      return xcolor[(i < 0 ? 0 : (i >= colors ? colors-1 : i))];
    }
}

function make_color_table(colors, hmin, hmax, ymin, ymax,   i,vlo,vhi)
{
  # Defines and prints the color table "xcolor[i]" used by 
  # "xcolor_from_value". Also sets "tcolor[i] to the 
  # correspoding legend.
  
  xcolor[-1] = xcolor_from_ratio(0, ymin, ymax);
  tcolor[-1] = sprintf(" 0.000 to %6.3f bits", hmin-0.001);
  for (i=0;i<colors;i++) 
    { vlo = hmin + (i/colors)*(hmax - hmin);
      vhi = hmin + ((i+1)/colors)*(hmax - hmin);
      xcolor[i] = xcolor_from_ratio((i+1)/(colors+1), ymin, ymax);
      tcolor[i] = sprintf("%6.3f to %6.3f bits", vlo, vhi-0.001);
    }
  xcolor[colors] = xcolor_from_ratio(1, ymin, ymax);
  tcolor[colors] = sprintf("%6.3f   or more bits", hmax);
}  

function print_color_table(colors, xcolor, tcolor,   i)
{
  printf "\n";
  printf "# Color key:\n#\n";
  for (i = colors; i >= -1; i--)
    { printf "#   <font color=#%s>%s</font>\n", xcolor[i], tcolor[i]; }
  printf "\n";
}

function xcolor_from_ratio(r, ymin, ymax,    y,rgb)
{
  # Maps the parameter "r" from "[0_1]" to 
  # a color with intensity in "[ymin _ ymax]".
  
  r = (r < 0 ? 0 : (r > 1 ? 1 : r));
  rgb_from_hue(rgb, 0.6667 - 1.25*r);
  # rgb_from_hue(rgb, r);
  y = ymin*exp((r)*log(ymax/ymin));
  rgb_fix_y(rgb, y);
  # printf "r = %7.3f  y = %7.5f  rgb = (%7.5f,%7.5f,%7.5f)\n", \
  #   r, y, rgb[0], rgb[1], rgb[2] > "/dev/stderr";
  return(xcolor_from_rgb(rgb));
}

function abs(x) { return (x >= 0 ? x : -x) }

function rgb_from_hue(rgb, h,   hf, hi)
{
  while (h >= 1) { h = h - 1; }
  while (h < 0) { h = h + 1; }
  h = 6*h;
  hi = int(h); hf = h - hi;
  if (hi == 0)
    { rgb[0] = 1;    rgb[1] = hf;   rgb[2] = 0;    }
  else if (hi == 1)
    { rgb[0] = 1-hf; rgb[1] = 1;    rgb[2] = 0;    }
  else if (hi == 2)
    { rgb[0] = 0;    rgb[1] = 1;    rgb[2] = hf;   }
  else if (hi == 3)
    { rgb[0] = 0;    rgb[1] = 1-hf; rgb[2] = 1;    }
  else if (hi == 4)
    { rgb[0] = hf;   rgb[1] = 0;    rgb[2] = 1;    }
  else if (hi == 5)
    { rgb[0] = 1;    rgb[1] = 0;    rgb[2] = 1-hf; }
}

function y_from_rgb(rgb)
{
  return 0.30*rgb[0] + 0.60*rgb[1] + 0.10*rgb[2];
}

function rgb_fix_y(rgb, y,   yy, ar, aw, ab)
{
  # mixes white or black into "rgb" so that its intensity is "y".
  yy = y_from_rgb(rgb);
  if (yy < y)
    { # mix white
      ar = (1-y)/(1-yy);
      aw = (y-yy)/(1-yy);
      rgb[0] = ar*rgb[0] + aw;
      rgb[1] = ar*rgb[1] + aw;
      rgb[2] = ar*rgb[2] + aw;
    }
  else if (yy > y)
    { # mix black
      ar = y/yy;
      rgb[0] = ar*rgb[0] + aw;
      rgb[1] = ar*rgb[1] + aw;
      rgb[2] = ar*rgb[2] + aw;
    }
}

function gamma(r)
{
  return (r <= 0 ? r : exp(log(r)/2.0))
}

function xcolor_from_rgb(rgb,   rr, gg, bb)
{ 
  rr = int(gamma(rgb[0])*255 + 0.5);
  gg = int(gamma(rgb[1])*255 + 0.5);
  bb = int(gamma(rgb[2])*255 + 0.5);
  return sprintf("%02x%02x%02x", rr, gg, bb);
}  

function xcolor_from_r_g_b_y(r,g,b,y,   rgb)
{ 
  split("", rgb); rgb[0] = r; rgb[1] = g; rgb[2] = b;
  rgb_fix_y(rgb, y);
  return(xcolor_from_rgb(rgb));
}