#! /usr/bin/gawk -f
# Last edited on 2000-07-11 10:43:18 by stolfi

# usage 
#   compute-strangeness \
#     -v secCounts=SECFILE.pwct \
#     -v pageCounts=PAGEFILE.pwct \
#     < INFILE.pwct \
#     > OUTFILE.strf
#
# The input file INFILE.pwct must have records with fields
# 
#   COUNT FNUM KNUM WORD PATTERN
# 
# where COUNT is the number of occurrences of WORD (which belongs to 
# class PATTERN) in the page FNUM (whose index in the section is
# KNUM).
# 
# The output file OUTFILE.strf has one line per PATTERN and page, with fields 
# 
#    PCOUNT PFREQ  SCOUNT SFREQ  LUMPY STRANG FNUM KNUM PATTERN WORDLIST
# 
# where PCOUNT SCOUNT are the counts of the PATTERN in the page FNUM
# and in the whole section, PFREQ and SFREQ are the PATTERN's
# estimated frequencies in the page and section, LUMPY is a measure of
# the lumpiness of the PATTERN's distribution over the pages of this
# section, STRANG is the contribution of this page to LUMPY and
# WORDLIST is the list of all WORD values associated to PATTERN in the
# page.
# 
# Each WORD must be associated to only one PATTERN.

BEGIN {
  # Negative means "no error so far"; error() sets it to the exit status.
  abort = -1;

  # Scalar totals.
  tot = 0;  # Grand total of token counts.
  np = 0;   # Number of distinct patterns.
  nf = 0;   # Number of distinct pages.

  # Page numbering tables (split("", x) makes x an empty array).
  split("", fnum_k); # k-number (index in section) -> f-number (page label).
  split("", knum_f); # f-number (page label) -> k-number (index in section).

  # Word tables.
  split("", pat_w);  # Word -> the pattern it belongs to.
  split("", lst_pf); # [pattern,page] -> comma-prefixed list of matching words.

  # Token counts.
  split("", tot_f);  # Page -> token count.
  split("", tot_p);  # Pattern -> token count.
  split("", tot_pf); # [pattern,page] -> token count.

  # Occurrence tallies.
  split("", nf_p);   # Pattern -> number of pages where it appears.
  split("", np_f);   # Page -> number of patterns occurring in it.
}

# Skip comment lines (first non-blank character is "#") and blank lines.
/^ *[#]/ { next; }
/^ *$/ { next; }

# Data record: validate the five fields and accumulate all the tallies.
# NOTE(review): the pattern is read from field 4 and the word from field 5,
# whereas the usage header lists the fields as "WORD PATTERN" -- confirm
# which order the input files actually use.
/./ {
  if (NF != 5) { error(("bad input format: " $0)); }
  c = $1; fnum = $2; knum = $3; pat = $4; w = $5;
  if (c !~ /^[0-9]+$/) { error(("bad count: " $0)); }
  # Each word may be associated with only one pattern.
  if ((w in pat_w) && (pat != pat_w[w])) { error(("inconsistent word/pattern :" $0)); }
  # The k-number <-> f-number correspondence must be one-to-one.
  if (knum in fnum_k) 
    { if (fnum != fnum_k[knum]) { error(("inconsistent knum/fnum :" $0)); } }
  else
    { fnum_k[knum] = fnum; }
  if (fnum in knum_f)
    { if (knum != knum_f[fnum]) { error(("inconsistent fnum/knum :" $0)); } }
  else
    { knum_f[fnum] = knum; }
  # Word lists are stored with a leading comma, stripped at output time.
  lst_pf[pat,fnum] = (lst_pf[pat,fnum] "," w); pat_w[w] = pat;
  # Count each new page exactly once (here only, not also when its
  # k-number is first seen).
  if (! (fnum in tot_f)) { nf++; }
  tot_f[fnum] += c;
  if (! (pat in tot_p)) { np++; }
  tot_p[pat] += c;
  # First occurrence of this pattern in this page.
  if (! ((pat,fnum) in tot_pf)) { tot_pf[pat,fnum] = 0; np_f[fnum] ++; nf_p[pat]++; }
  tot_pf[pat,fnum] += c;
  tot += c;
}

# Stop reading input as soon as an error status has been recorded.
(abort >= 0) { exit abort; }

# After all input: compute and print the table, unless an error occurred,
# in which case the recorded status is propagated as the exit code.
END {
  if (abort < 0)
    { compute_strangs();
      output_strangs();
      exit 0;
    }
  exit abort;
}

function compute_strangs(    pat,fnum,c_pf,r_pf, e_fp,r_fp, s_p,r_p,c_p,e_p, c_f,m_p,npages)
{
  # Computes, from the global tallies {tot}, {np}, {tot_p}, {tot_f},
  # {tot_pf} and the page table {knum_f}, the global arrays
  #
  #   prob_fp[fnum,pat]    Share of pattern {pat} attributed to page {fnum},
  #                        corrected for page size; sums to 1 over the pages.
  #   strang_fp[fnum,pat]  Per-page indicator; currently the same value
  #                        as {prob_fp[fnum,pat]}.
  #   lump_p[pat]          Lumpiness of {pat}: log(number of pages) minus
  #                        the entropy of {prob_fp[*,pat]}, clipped at 0.
  #
  # All names after the extra spaces in the parameter list are locals.

  split("", lump_p);    # Lumpiness indicator of the page distr of each pattern.
  split("", strang_fp); # Frequency enhancement indicator of each pattern in each page.
  split("", prob_fp);   # Prob. of page for each pattern, corrected for page size.

  # The entropy of a distribution over the pages is at most log(#pages),
  # so the page count (not the pattern count) is the reference value.
  npages = 0;
  for (fnum in knum_f) { npages++; }
  if (npages == 0) { return; }

  for (pat in tot_p)
    { c_p = tot_p[pat];
      # Smoothed overall frequency of the pattern.
      r_p = (c_p + 1)/(tot + np);
      s_p = 0.0;
      for (fnum in knum_f)
        { c_f = tot_f[fnum];
          c_pf = tot_pf[pat,fnum]; 
          # Smoothed frequency of the pattern within this page; always
          # positive because {r_p} is positive.
          r_pf = (c_pf + r_p)/(c_f + 1);
          prob_fp[fnum,pat] = r_pf;
          s_p += r_pf;
        }
      # Normalize to unit sum over the pages.
      if (s_p+0 != 0) { for (fnum in knum_f) { prob_fp[fnum,pat] /= s_p; } }
      # Entropy of the normalized distribution.
      e_p = 0;
      for (fnum in knum_f) 
        { r_fp = prob_fp[fnum,pat];
          e_fp = - r_fp*log(r_fp);
          strang_fp[fnum,pat] = r_fp;
          e_p += e_fp;
        }
      m_p = log(npages) - e_p; 
      # Guard against slightly negative values due to roundoff.
      if (m_p < 0) { m_p = 0; }
      lump_p[pat] = m_p;
    }
}

function output_strangs(   pat,knum,fnum,c_pf,c_p,wds,str,lmp)
{
  # Writes to standard output one line for each pattern and each page,
  # with the fields
  #
  #   count of pattern in page, count of pattern overall,
  #   lumpiness of pattern, strangeness of pattern in page,
  #   FNUM (followed by a comma), KNUM (followed by a comma),
  #   PATTERN, WORDLIST
  #
  # using the globals {tot_p}, {tot_pf}, {knum_f}, {lst_pf} and the
  # arrays {lump_p}, {strang_fp} set by {compute_strangs}.
  #
  # NOTE(review): the usage header also lists PFREQ and SFREQ columns,
  # which this format does not emit -- confirm which layout the
  # downstream tools expect before changing either one.

  for (pat in tot_p)
    { c_p = tot_p[pat];
      lmp = lump_p[pat];
      for (fnum in knum_f)
        { knum = knum_f[fnum];
          # An absent entry reads as "" and prints as 0 under %d.
          c_pf = tot_pf[pat,fnum];
          # Drop the leading comma of the stored word list.
          wds = substr(lst_pf[pat,fnum],2);
          str = strang_fp[fnum,pat];
          printf "%7d %7d  %6.4f %6.4f %s, %02d, %s %s\n", 
            c_pf, c_p, lmp, str, fnum, knum, pat, wds;
        }
    }
  fflush("/dev/stdout");
}

function error(msg)
{
  # Reports {msg} on standard error, prefixed with the current input
  # record number, records exit status 1 in the global {abort}, and
  # stops processing with a bare "exit"; the END rule then returns
  # {abort} as the exit status.
  abort = 1;
  printf("line %d: %s\n", NR, msg) > "/dev/stderr";
  exit;
}