#! /usr/bin/gawk -f
# Last edited on 2000-07-10 08:12:35 by stolfi

function error(msg)
{
  # Reports "msg" on stderr, tagged with the current input file and line,
  # flags the run as failed (abort = 1) and stops processing.
  # The bare "exit" transfers control to the END rule, which is expected
  # to turn "abort" into the process exit status.
  abort = 1;
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit;
}

# Usage:
#   create-color-tables \
#    -v colorPatterns=FILE.dic \
#    -v outDir=ODIR \
#    [ -v minStrangeness=MINSTR ] \
#    [ -v minLum=MINLUM ] \
#    [ -v maxStrangeness=MAXSTR ] \
#    [ -v maxLum=MAXLUM ] \
#    [ -v uniqueColor=UNICOLOR ] \
#    < INFILE
#
# Input INFILE must have fields 
# 
#    PTOTCT PFREQ  STOTCT SFREQ  STRANG  FNUM KNUM PATTERN WORDLIST
# 
# where PTOTCT and STOTCT are the counts of the PATTERN in the page FNUM
# and in the whole section, respectively; PFREQ and SFREQ are the PATTERN's
# estimated frequencies in the page and section; STRANG is a measure
# of how anomalous the PTOTCT is; and WORDLIST is the comma-separated
# list of all WORD values associated to PATTERN in the page.
#
# Writes two separate files per page: ODIR/FNUM.clr (pattern-to-color table)
# and ODIR/FNUM.spw (special patterns table). 
#
# ODIR/FNUM.spw has fields 
# 
#    PTOTCT STOTCT STRANG COLOR  FNUM KNUM PATTERN WORDLIST
# 
# and contains only those patterns listed in the colorPatterns file
# that occur in the page.
#
# ODIR/FNUM.clr has fields
# 
#    PATWD COLOR 
#
# where PATWD is either a PATTERN or a WORD,
# and includes all words from patterns listed in the colorPatterns file,
# plus those with global STOTCT=1 (colored UNICOLOR).

BEGIN {
  # "abort" is negative while no error has occurred; error() sets it to 1.
  abort = -1;

  # The pattern-list file is the only mandatory parameter.
  if (colorPatterns == "") { error("must define \"colorPatterns\""); } 

  # Defaults for the optional command-line parameters.
  if (outDir == "")         { outDir = "."; }
  if (uniqueColor == "")    { uniqueColor = "5577ff"; }
  if (minLum == "")         { minLum = 0.30; }
  if (maxLum == "")         { maxLum = 0.70; }
  if (minStrangeness == "") { minStrangeness = 0.0; }
  # A default below minStrangeness makes the END rule substitute the
  # largest STRANG value seen in the input.
  if (maxStrangeness == "") { maxStrangeness = -1.0; }

  # Scratch color triple and miscellaneous global state.
  rgb[0] = 0;
  rgb[1] = 0;
  rgb[2] = 0;
  phi = (sqrt(5)-1)/2;
  nexthue = 0;
  nPatterns = 0;
  maxStrang = 0.0;

  # Make sure every table exists as an (empty) array.
  split("", whue);       # pattern -> hue
  split("", selected);   # set of patterns to be colored
  split("", pgtot_pf);   # (pattern,page) -> page count
  split("", sctot_p);    # pattern -> section count
  split("", str_pf);     # (pattern,page) -> strangeness
  split("", wds_pf);     # (pattern,page) -> word list
  split("", knum_f);     # page -> KNUM

  read_patterns();
}

# Once an error has been flagged, stop reading input and exit with that status.
abort >= 0 { exit abort; }

# Per-record rule (non-empty lines only): parses one input record
#
#    PTOTCT PFREQ  STOTCT SFREQ  STRANG  FNUM KNUM PATTERN WORDLIST
#
# and stores its data in the global tables, checking that KNUM is
# the same for every record of a page and that STOTCT is the same for
# every record of a pattern.  Also tracks the largest STRANG seen.
/./ {
  if (NF != 9) { error("bad format"); }
  pgCt = $1; pgFr = $2;
  scCt = $3; scFr = $4;
  strang = $5; fnum = $6; knum = $7; pat = $8; wdlist = $9; 
  pgtot_pf[pat,fnum] = pgCt;
  str_pf[pat,fnum] = strang;
  wds_pf[pat,fnum] = wdlist;
  if ((fnum in knum_f) && (knum != knum_f[fnum])) { error("inconsistent knum"); }
  knum_f[fnum] = knum;
  # The membership test must use "pat": testing an unset variable here
  # would be always false and silently disable the consistency check.
  if (pat in sctot_p)
    { if (scCt != sctot_p[pat]) { error (("inconsistent section count " pat)); } }
  else
    { sctot_p[pat] = scCt; }
  if (strang > maxStrang) { maxStrang = strang; }
  next;
}

END {
  # Propagate any error flagged by error() as the exit status.
  if (abort >= 0) { exit abort; }
  # If no usable upper bound was given, use the largest STRANG seen.
  if (maxStrangeness < minStrangeness) { maxStrangeness = maxStrang; }
  # Write the ".clr" and ".spw" files of every page.
  for (fnum in knum_f)
    { # Open the page's files even if nothing gets written to them.
      setfile(fnum);
      for (pat in sctot_p)
        { if ((pat,fnum) in pgtot_pf)
            { pgCt = pgtot_pf[pat,fnum];
              strang = str_pf[pat,fnum];
              wdlist = wds_pf[pat,fnum];
              scCt = sctot_p[pat];
              if (scCt == 1)
                { # Pattern occurs only once in the section: fixed color.
                  putcolor(fnum, pat, wdlist, uniqueColor);
                }
              else if (pat in selected)
                { # Selected pattern: hue from the colorPatterns file,
                  # luminance from the strangeness on this page.
                  hue = whue[pat];
                  rgb_from_hue(rgb, hue);
                  y = y_from_strangeness(strang);
                  rgb_fix_y(y, rgb);
                  color = xcolor_from_rgb(rgb);
                  putcolor(fnum, pat, wdlist, color);
                  # Use the KNUM recorded for this page; the global "knum"
                  # still holds the value from the last input record.
                  putdata(pgCt, scCt, strang, color, fnum, knum_f[fnum], pat, wdlist);
                }
            }
        }
    }
}

function abs(x)
{
  # Absolute value of "x".
  if (x >= 0) { return x; }
  return -x;
}

function y_from_rgb(rgb,   lum)
{
  # Luminance of the color rgb[0..2], as a weighted sum of its
  # components with weights 0.30, 0.60 and 0.10.
  lum = 0.30*rgb[0];
  lum = lum + 0.60*rgb[1];
  lum = lum + 0.10*rgb[2];
  return lum;
}

function rgb_fix_y(y, rgb,   yy, ar, aw, ab, k)
{
  # Mixes white or black into "rgb" (in place) so that its luminance,
  # as computed by y_from_rgb, becomes "y".  Each component is replaced
  # by ar*component + aw; "rgb" is left untouched if it already has
  # luminance "y".
  yy = y_from_rgb(rgb);
  if (yy < y)
    { # Too dark: blend towards white.
      ar = (1-y)/(1-yy);
      aw = (y-yy)/(1-yy);
    }
  else if (yy > y)
    { # Too bright: blend towards black, i.e. plain scaling.
      ar = y/yy;
      aw = 0;
    }
  else
    { return; }
  for (k = 0; k <= 2; k++) { rgb[k] = ar*rgb[k] + aw; }
}

function gamma(r)
{
  # Intensity-to-sample mapping applied to each color component before
  # quantization; currently the identity.
  return r;
}

function xcolor_from_rgb(rgb)
{
  # Formats the color rgb[0..2] as a six-digit lowercase hexadecimal
  # string "rrggbb".  Each component goes through gamma(), is scaled
  # by 255 and rounded by adding 0.5 and truncating.
  return sprintf("%02x%02x%02x", \
    int(gamma(rgb[0])*255 + 0.5), \
    int(gamma(rgb[1])*255 + 0.5), \
    int(gamma(rgb[2])*255 + 0.5));
}  

function rgb_from_hue(rgb, h,   hf, hi, up, down)
{
  # Stores in rgb[0..2] the fully saturated color with hue "h".
  # The hue is taken modulo 1 and the unit interval is divided into six
  # equal sectors; within each sector one component is 1, one is 0 and
  # the third ramps linearly up or down.

  # Reduce the hue to the range [0,1).
  while (h >= 1) { h = h - 1; }
  while (h < 0) { h = h + 1; }

  # Sector index and position inside the sector.
  h = 6*h;
  hi = int(h);
  hf = h - hi;
  up = hf;
  down = 1-hf;

  if (hi == 0)      { rgb[0] = 1;    rgb[1] = up;   rgb[2] = 0;    }
  else if (hi == 1) { rgb[0] = down; rgb[1] = 1;    rgb[2] = 0;    }
  else if (hi == 2) { rgb[0] = 0;    rgb[1] = 1;    rgb[2] = up;   }
  else if (hi == 3) { rgb[0] = 0;    rgb[1] = down; rgb[2] = 1;    }
  else if (hi == 4) { rgb[0] = up;   rgb[1] = 0;    rgb[2] = 1;    }
  else if (hi == 5) { rgb[0] = 1;    rgb[1] = 0;    rgb[2] = down; }
}

function setfile(fnum)
{
  # Makes page "fnum" the current output page.  If it differs from the
  # current one (global "ofnum"), closes the open ".clr" and ".spw"
  # files, reports the new page on stderr, and starts the new page's
  # files "cfile" and "dfile" with a "#" header line.
  if (fnum == ofnum) { return; }

  if (cfile != "") { close(cfile); }
  if (dfile != "") { close(dfile); }
  printf "%s...\n", fnum > "/dev/stderr";
  ofnum = fnum; 

  cfile = (outDir "/" fnum ".clr");
  printf "#\n" > cfile;

  dfile = (outDir "/" fnum ".spw");
  printf "#\n" > dfile;
}

function putcolor(fnum, pat, wdlist, color,   nw,w,i)
{
  # Writes to the ".clr" file of page "fnum" one "PATWD COLOR" line for
  # the pattern "pat", and one for each word of the comma-separated
  # list "wdlist" that is not identical to the pattern itself.
  setfile(fnum);
  print pat, color > cfile;
  nw = split(wdlist, w, ",");
  i = 0;
  while (++i <= nw)
    { if (w[i] == pat) { continue; }
      print w[i], color > cfile;
    }
}

function putdata(pgCt, scCt, strang, color, fnum, knum, pat, wdlist)
{
  # Writes to the ".spw" file of page "fnum" one record with the fields
  #
  #    PTOTCT STOTCT STRANG COLOR  FNUM KNUM PATTERN WORDLIST
  #
  # Exactly these eight fields are printed; no undefined variable may
  # appear in the list, since it would add a spurious empty field and
  # shift all the following ones.
  setfile(fnum);
  print pgCt, scCt, strang, color, fnum, knum, pat, wdlist > dfile;
}

function y_from_strangeness(strang,  y, range)
{
  # Maps the strangeness "strang" to a luminance: abs(strang) is rescaled
  # linearly so that minStrangeness gives 0 and maxStrangeness gives 1,
  # clipped to [0,1], and then mapped onto [minLum, maxLum].
  range = maxStrangeness - minStrangeness;
  if (range == 0)
    { # Degenerate scale (e.g. every STRANG in the input is 0 and the
      # bounds were left at their defaults).  Dividing would be a fatal
      # error in gawk, so use a step function instead.
      y = (abs(strang) > minStrangeness ? 1 : 0);
    }
  else
    { y = (abs(strang) - minStrangeness)/range; }
  if (y > 1) { y = 1; }
  if (y < 0) { y = 0; }
  return minLum + y * (maxLum - minLum);
}

function read_patterns(   lin,fld,nfld,pat,hue,rc)
{
  # Reads the list of colored patterns from the file "colorPatterns".
  # Lines starting with "#" are ignored; every other line must have
  # exactly two fields, "HUE PATTERN".  Fills the global tables
  # "whue" (pattern -> hue) and "selected" (set of patterns), and sets
  # "nPatterns" to the number of patterns read.  Duplicate patterns,
  # malformed lines and read failures are reported through error().
  split("", selected);
  split("", whue);
  nPatterns=0;
  # getline returns 1 per line read, 0 at end of file and -1 on error.
  while ((rc = (getline lin < colorPatterns)) > 0)
    { if (! match(lin, /^[#]/))
        { nfld = split(lin,fld);
          if (nfld != 2) { error("bad pattern format"); } 
          hue = fld[1]; pat = fld[2];
          if (pat in selected)
            { error("duplicate pattern in colorPattern file"); }
          whue[pat] = hue;
          selected[pat] = 1;
          nPatterns++;
        }
    }
  # Detect failure from getline's return value rather than from the
  # initial value of ERRNO, which differs between gawk versions (an
  # empty ERRNO would make a comparison against "0" always report an
  # error).  ERRNO is used only for the message text.
  if (rc < 0) { error((colorPatterns ": " ERRNO)); }
  close (colorPatterns);
  printf "read %d patterns to color\n", nPatterns > "/dev/stderr";   
}