#! /usr/bin/gawk -f
# Last edited on 2000-07-10 08:44:59 by stolfi

# Reads a file with entries for all patterns and all pages
# of one section, of the form
# 
#   PTOTCT PFREQ  STOTCT SFREQ  STRANG  FNUM KNUM PATTERN WORDLIST
# 
# where PTOTCT STOTCT are the counts of the PATTERN in the page FNUM
# (= page KNUM of section) and in the whole section; PFREQ and SFREQ
# are the PATTERN's estimated frequencies in the page and section;
# STRANG is a measure of how anomalous the PTOTCT is, and WORDLIST is
# the list of all WORD values associated to PATTERN in the page.
#
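# Outputs to stdout one line "HUE PATTERN" for each pattern chosen for
# coloring in this section; HUE is computed from the KNUM of the page
# where the pattern has its highest count.  Progress and per-page
# density diagnostics are written to stderr.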

# Set-up: define the usage text, check that the three required "-v"
# parameters were supplied, and create the (empty) global tables that
# the record rule fills in and the END rule consumes.
BEGIN {
  # A negative value means "no error so far"; error() sets it to 1.
  abort = -1;

  # Synopsis printed by error() after every diagnostic.
  usage = "choose-peculiar-words \\\n";
  usage = usage "    -v maxPatterns=NUM \\\n";
  usage = usage "    -v maxDensity=FRAC \\\n";
  usage = usage "    -v minStrangeness=NUM \\\n";
  usage = usage "  < PATTERNS.strx \\\n";
  usage = usage "  > WORDS.dic";

  # A parameter that is unset (or given as zero) is rejected.
  # error() exits, so only the first missing one is reported.
  if (maxPatterns == 0)
    { error("must specify \"-v maxPatterns=NUM\""); }
  else if (maxDensity == 0)
    { error("must specify \"-v maxDensity=FRAC\""); }
  else if (minStrangeness == 0)
    { error("must specify \"-v minStrangeness=NUM\""); }

  # Tables indexed by pattern and page:
  split("", tot_pf); # tot_pf[pat,fnum] = count of pattern pat in page fnum.
  split("", str_pf); # str_pf[pat,fnum] = strangeness of that count.

  # Tables indexed by pattern, page, or word:
  split("", tot_p);  # tot_p[pat] = count of pattern pat in the section.
  split("", tot_f);  # tot_f[fnum] = sum of all pattern counts in page fnum.
  split("", pat_w);  # pat_w[w] = the pattern that word w belongs to.

  # Page numbering, both directions:
  split("", knum_f); # knum_f[fnum] = knum of page fnum.
  split("", fnum_k); # fnum_k[knum] = fnum of the page with that knum.

  # Running statistics:
  kMax = 0;          # Largest knum seen.
  tot1 = 0;          # Total tokens, summed from the per-section counts.
  tot2 = 0;          # Total tokens, summed from the per-page counts.
  nPairs = 0;        # Number of (pattern, page) records loaded.
}

# Safety net: if an error has already been recorded, stop reading
# input and leave with the recorded status.
abort >= 0 {
  exit abort;
}

# Loads one non-blank input record of the form
#   PTOTCT PFREQ STOTCT SFREQ STRANG FNUM KNUM PATTERN WORDLIST
# into the global tables.  A malformed or inconsistent record is
# reported through error(), which stops the run.
/./ {
  if (NF != 9) { error("bad entry = \"" $0 "\""); }

  # Unpack the fields (pgFr and scFr are read but not used below).
  pgCt = $1;    pgFr = $2;
  scCt = $3;    scFr = $4;
  strang = $5;
  fnum = $6;    knum = $7;
  pat = $8;     wdlist = $9;

  # Each (pattern, page) pair may appear only once in the input.
  if ((pat,fnum) in tot_pf) { error("repeated key = \"" $0 "\""); }
  tot_pf[pat,fnum] = pgCt;

  # Section count of the pattern: remember it on first sight (and add
  # it to tot1); on later records it must agree with the stored value.
  if (! (pat in tot_p))
    { tot_p[pat] = scCt; tot1 += scCt; }
  else if (tot_p[pat] != scCt)
    { error(("inconsistent section totals")); }

  # Accumulate the page's token total.
  tot_f[fnum] += pgCt;

  # A given knum must always refer to the same page.
  if (knum in fnum_k)
    { if (fnum_k[knum] != fnum) { error("bad knum"); } }
  knum_f[fnum] = knum;
  fnum_k[knum] = fnum;
  if (knum > kMax) { kMax = knum; }

  str_pf[pat,fnum] = strang;

  # Every word in the comma-separated list must map to one pattern only.
  nWords = split(wdlist, words, ",");
  for (j = 1; j <= nWords; j++)
    { word = words[j];
      if (! (word in pat_w))
        { pat_w[word] = pat; }
      else if (pat != pat_w[word])
        { error(("inconsistent pattern " word " " pat)); }
    }

  tot2 += pgCt;
  nPairs++;
}

# After all records are loaded: greedily choose up to maxPatterns
# patterns to color.  Each round picks the (pattern, page) pair with the
# highest strangeness among pairs whose page count is at least 2 and
# whose page has not yet reached the maxDensity fraction of colored
# tokens.  Selection stops when the best strangeness drops below
# minStrangeness or no eligible pair is left.  For each chosen pattern
# a line "HUE PATTERN" goes to stdout; diagnostics go to stderr.
END {
  if (abort >= 0) { exit abort; }
  if (tot1 != tot2) { error(("inconsistent totals = " tot1 "," tot2)); }
    
  printf "loaded %6d patern-page counts (%d tokens)\n",
    nPairs, tot1 > "/dev/stderr";
    
  # chosen[pat] is defined if pattern pat has been chosen for coloring.
  split("", chosen);
  nPatChosen = 0;
  # nColored[fnum] is the number of tokens already colored in page fnum.
  split("", nColored);
  while (nPatChosen < maxPatterns)
    { # Choose another pattern to color:
      # pMax is the strangest pattern not yet chosen,
      # fMax is the fnum where it attains that strangeness,
      # sMax is that strangeness,
      # found tells whether any eligible pair exists at all.  Using an
      # explicit flag (instead of a sentinel sMax) keeps stale pMax/fMax
      # from a previous round from being re-used, and lets candidates
      # with arbitrarily low strangeness compete.
      found = 0; sMax = 0; pMax = ""; fMax = "";
      for (pf in tot_pf)
        { split(pf, pfx, SUBSEP);
          pat = pfx[1]; fnum = pfx[2];
          pfct = tot_pf[pat,fnum];
          maxColored = maxDensity * tot_f[fnum];
          if ((pfct >= 2) && (! (pat in chosen)) && (nColored[fnum] < maxColored))
            { strn = str_pf[pat,fnum];
              if ((! found) || (strn > sMax))
                { found = 1; sMax = strn; pMax = pat; fMax = fnum; }
            }
        }
      if (! found)
        { printf "  * no eligible pattern left\n" > "/dev/stderr";
          break;
        }
      printf "  * %-15s %-6s %8.4f\n", pMax, fMax, sMax > "/dev/stderr";
      if (sMax < minStrangeness) break;
      # Mark the pattern as chosen, add its counts to nColored for every
      # page where it occurs, and remove it from tot_pf.  Along the way
      # find kTop, the knum of the page where its count (nTop) is highest.
      chosen[pMax] = 1; nPatChosen++;
      kTop = 0; nTop = 0;
      for (fnum in tot_f)
        { if ((pMax,fnum) in tot_pf)
            { pgCt = tot_pf[pMax,fnum];
              nColored[fnum] += pgCt;
              if (pgCt > nTop) { nTop = pgCt; kTop = knum_f[fnum]; }
              delete tot_pf[pMax,fnum];
            }
        }
      # Compute hue from the knum of the pattern's top page:
      hue = (kTop + 0.5)/(kMax + 1);
      printf "%6.4f %s\n", hue,  pMax;
    }
  printf "%7d patterns chosen\n", nPatChosen > "/dev/stderr";
  printf "densities of colored words per page:\n" > "/dev/stderr";
  for (fnum in nColored)
    { # A page whose counts sum to zero has density 0; dividing by
      # tot_f[fnum] there would be a fatal division by zero.
      dens = (tot_f[fnum] > 0 ? nColored[fnum]/tot_f[fnum] : 0);
      printf "%-6s %7.5f %s\n", 
        fnum, dens, (dens > maxDensity ? "+" : "-") > "/dev/stderr"; 
    }
}

# Reports a fatal problem on stderr and terminates the script with a
# failure status.
#   msg  text describing the problem; it is prefixed with the current
#        input record number (NR) and followed by the usage synopsis.
# Sets the global abort to 1 so that the END rule, which still runs
# when exit is executed from BEGIN or a record rule, stops at once.
function error(msg)
{ 
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  # Pass the status explicitly: a bare "exit" executed from inside the
  # END rule would make the script return success (0).
  exit abort;
}