#! /usr/bin/gawk -f
# Last edited on 1999-01-18 23:20:30 by stolfi

BEGIN {
  # Input:  records of the form COUNT PAGE WORD, where COUNT is the
  #         number of occurrences of WORD on page PAGE.
  # Output: records in the same format, in which the per-PAGE and
  #         per-WORD totals are correct but the individual pair
  #         counts have been randomized.

  # Error flag: any value >= 0 means "stop", and is used as the exit status.
  abort = -1;

  # Counters of distinct keys seen so far.
  npages = 0;
  nwords = 0;

  # Tables filled in while reading (split on "" just creates empty arrays).
  split("", pct);   # pct[PAGE]       = total count for that page
  split("", wct);   # wct[WORD]       = total count for that word
  split("", pwct);  # pwct[PAGE,WORD] = count for that pair
}

# Once an error has been flagged, stop with the recorded status.
abort >= 0 { exit abort }

# Every record must consist of exactly three fields (COUNT PAGE WORD).
# Blank lines have zero fields, so they are rejected here as well.
NF != 3 { file_error("wrong num of fields") }

# Tally one COUNT PAGE WORD record into the pair, word and page tables.
# NOTE(review): blank lines never get here, because the (NF != 3) rule
# earlier in the file has already aborted on them.
/./ { 
  n = $1; p = $2; w = $3; 
  # COUNT must be a plain non-negative integer.  A negative or fractional
  # value would make the expanded word sample and the per-page demand
  # disagree in END, silently dropping or inventing occurrences.
  if (n !~ /^[+]?[0-9]+$/) { file_error("count is not a non-negative integer"); }
  # Each (PAGE, WORD) pair may appear only once in the input.
  if ((p,w) in pwct) { file_error("repeated word/page pair"); }
  n += 0;   # force numeric from here on
  pwct[p,w] += n;
  # Count distinct words and pages the first time each one shows up.
  if (! (w in wct)) { nwords ++; }
  wct[w] += n;
  if (! (p in pct)) { npages ++; }
  pct[p] += n;
  # Grand total of all occurrences.
  ct += n;
}

END {
  # An error flagged while reading takes precedence over any output.
  if (abort >= 0) { exit abort; }

  # The input pair counts are not needed any more; empty the table now so
  # that it can be refilled with the randomized counts below.
  split("", pwct);
  
  # Build tx[0..ntx-1], containing wct[w] copies of every word w.
  printf "expanding word sample...\n" > "/dev/stderr";
  split("", tx);
  ntx = 0;
  for (w in wct)
    { m = wct[w];
      for (i = 0; i < m; i++)
        { tx[ntx] = w; ntx++; }
    }
    
  # Shuffle tx in place, from the last slot down, swapping slot i with a
  # randomly chosen slot j in 0..i.  The small negative offset presumably
  # guards against rand() returning exactly 1 -- TODO confirm.
  # srand() is not called anywhere in this file, so rand() runs from awk's
  # default seed.
  printf "randomizing word sample...\n" > "/dev/stderr";
  for (i = ntx-1; i >= 1; i--)
    { j = int((i+1)*rand()-0.000001);
      if (j != i) { tmp = tx[i]; tx[i] = tx[j]; tx[j] = tmp; }
    }
    
  # Deal the shuffled words out to the pages: page p takes the next pct[p]
  # entries of tx, and each one bumps the new count of its (p,w) pair.
  printf "collecting into pages...\n" > "/dev/stderr";
  k = 0;
  for (p in pct)
    { m = pct[p];
      for (i = 0; i < m; i++) { w = tx[k]; k++; pwct[p,w]++; }
    }

  # The pages must consume exactly the words that were expanded; otherwise
  # the totals in the output would be wrong, so fail instead of printing.
  if (k != ntx)
    { printf "word total (%d) and page total (%d) disagree\n", ntx, k > "/dev/stderr";
      exit 1;
    }
    
  # Print out every pair that received at least one occurrence.
  # The membership test (rather than reading pwct[p,w]) avoids creating an
  # empty element for each of the page x word combinations that is absent.
  for (w in wct)
    { for (p in pct)
        { if ((p,w) in pwct) { printf "%7d %s %s\n", pwct[p,w], p, w; } }
    }
}
  
# Reports msg on stderr, tagged with the current input file name and line
# number, flags the error in the global `abort`, and terminates with
# exit status 1.
function file_error(msg)
{
  abort = 1;
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  exit abort;
}