# Last edited on 2000-07-11 10:39:16 by stolfi

function strangeness(c_pf, pgSize, c_p, scSize,  f,g,p,q,pgEx,pr0,pr1)
{
  # Scores how unusual it is to see "c_pf" occurrences of a word in a
  # page of "pgSize" tokens, when the same word occurs "c_p" times in
  # a section of "scSize" tokens.
  #
  # The per-token probability is estimated as (c_p+1)/(scSize+1).
  # The expected page count p*pgSize is rounded toward "c_pf" (down
  # when "c_pf" lies below it, up otherwise) to obtain "pgEx".  The
  # result is the difference of two "log_binom" values, one taken at
  # "pgEx" and one at "c_pf"; the subtraction order is swapped between
  # the two cases.
  # NOTE(review): "log_binom" is defined elsewhere; presumably it
  # returns the log of a binomial probability term -- confirm.
  #
  # Locals: "g" holds the expected count, "f" is 1 when "c_pf" lies
  # below it.
  p = (c_p+1)/(scSize+1);
  q = 1-p;
  g = p*pgSize;
  f = (c_pf < g);

  if (f)
    { # Observed count is below expectation: round expectation down.
      pgEx = int(g);
      if (c_pf > pgEx) { error(("bad floor " p " " pgSize " " pgEx " " c_pf)); }
    }
  else
    { # Observed count is at or above expectation: round expectation up.
      pgEx = pgSize - int(pgSize - g);
      if (c_pf < pgEx) { error(("bad ceiling " p " " pgSize " " pgEx " " c_pf)); }
    }

  pr0 = log_binom(p, pgEx, q, pgSize - pgEx);
  pr1 = log_binom(p, c_pf, q, pgSize - c_pf);

  if (f) { return(pr1 - pr0); }
  return(pr0 - pr1);
}


function read_page_counts(  lin,fld,nfld,c,p,wds,w)
{
  # Reads the per-page word/pattern count table from the file named
  # by the global "pageCounts".
  #
  # Lines starting with "#" are skipped.  In every other line, ":" is
  # treated as a field separator in addition to blanks, and the line
  # must then have exactly three fields: COUNT WORD PATTERN, where
  # COUNT is a string of decimal digits.
  #
  # Globals set:
  #   pgtot_p[PATTERN]  sum of COUNT over all entries with that pattern
  #                     (cleared on entry).
  #   pglst_p[PATTERN]  the words first seen with that pattern, each
  #                     preceded by "," (cleared on entry).
  #   pat_w[WORD]       pattern associated with the word.  This table
  #                     is NOT cleared here; an entry already present
  #                     must agree with the pattern read, else error.
  #   nPatsP            number of entries read.
  #   pgSize            sum of all COUNTs.
  #
  # "w" is declared local so that calling this function does not
  # clobber a global variable of the same name.
  if (pageCounts == "") { error("must specify \"pageCounts\""); }
  split("", pgtot_p);
  split("", pglst_p);
  nPatsP = 0; pgSize = 0;
  while((getline lin < pageCounts) > 0) { 
    if (! match(lin, /^[#]/))
      { gsub(/[:]/, " ", lin);
        nfld = split(lin, fld);
        if (nfld != 3) { error("bad pageCounts entry = \"" lin "\""); }
        c = fld[1]; w = fld[2]; p = fld[3];
        if (c !~ /^[0-9]+$/) { error(("bad format:" lin)); }
        if (! (p in pgtot_p)) { pgtot_p[p] = 0; pglst_p[p] = ""; }
        pgtot_p[p] += c;
        if (w in pat_w)
          { # Word already known: its pattern must not change.
            if (p != pat_w[w]) { error(("inconsistent pattern :" p)); } }
        else
          { # New word: record it under its pattern.
            pglst_p[p] = (pglst_p[p] "," w); pat_w[w] = p; }
        nPatsP++; pgSize += c;
      }
  }
  close (pageCounts);
  printf "loaded %6d word/pattern pairs in page (%d tokens)\n",
    nPatsP, pgSize > "/dev/stderr";
}

function read_sec_counts(  lin,fld,nfld,c,p)
{
  # Reads the per-section word/pattern count table from the file
  # named by the global "secCounts".
  #
  # Lines starting with "#" are skipped.  In every other line, ":" is
  # treated as a field separator in addition to blanks, and the line
  # must then have exactly three fields: COUNT WORD PATTERN, where
  # COUNT is a string of decimal digits.  The WORD field is required
  # but not otherwise used here.
  #
  # Globals set:
  #   sctot_p[PATTERN]  sum of COUNT over all entries with that pattern
  #                     (cleared on entry).
  #   nPatsS            number of entries read.
  #   scSize            sum of all COUNTs.
  if (secCounts == "") { error("must specify \"secCounts\""); }
  split("", sctot_p);
  nPatsS = 0; scSize = 0;
  while((getline lin < secCounts) > 0) { 
    if (! match(lin, /^[#]/))
      { gsub(/[:]/, " ", lin);
        nfld = split(lin, fld);
        if (nfld != 3) { error("bad secCounts entry = \"" lin "\""); }
        c = fld[1]; p = fld[3];
        if (c !~ /^[0-9]+$/) { error(("bad format:" lin)); }
        if (! (p in sctot_p)) { sctot_p[p] = 0; }
        sctot_p[p] += c;
        nPatsS++; scSize += c;
      }
  }
  # Close the file that was actually read, so that the stream is
  # released and a later call re-reads it from the beginning.
  close (secCounts);
  printf "loaded %6d word/pattern pairs in section (%d tokens)\n",
    nPatsS, scSize > "/dev/stderr";
}




function compute_hues(   k,pat,ksum,scCt,fnum,pgCt)
{
  # Sets whue[PAT] for every pattern PAT in the global "selected"
  # table whose section count sctot_p[PAT] exceeds 1.
  #
  # For such a pattern, the page counts pgtot_pf[PAT,fnum_k[k]] are
  # weighted by the index k (1..nfnums) and summed; the hue is
  #
  #   whue[PAT] = (SUM_k k*pgtot_pf[PAT,fnum_k[k]] / sctot_p[PAT] - 0.5) / nfnums
  #
  # Missing (PAT,fnum) entries contribute nothing.  Patterns with a
  # section count of 1 or less get no whue entry.
  #
  # Globals read: selected, sctot_p, nfnums, fnum_k, pgtot_pf.
  # Globals set:  whue.
  for (pat in selected)
    { scCt = sctot_p[pat];
      if (scCt > 1)
        { # The weighted sum must start from zero for each pattern;
          # otherwise it would carry over from patterns seen earlier.
          ksum = 0;
          for (k = 1; k <= nfnums; k++) 
            { fnum = fnum_k[k]; 
              if ((pat,fnum) in pgtot_pf)
                { pgCt = pgtot_pf[pat,fnum];
                  ksum += pgCt*k;
                }
            }
          whue[pat] = (ksum/scCt - 0.5)/nfnums;
        }
    }
}