#! /usr/bin/gawk -f
# Last edited on 1999-01-18 23:20:30 by stolfi
#
# Reads a file with records "COUNT PAGE WORD", where COUNT is the
# number of occurrences of WORD on page PAGE.
#
# Outputs a file in the same format where the per-PAGE and per-WORD
# totals are the same as in the input, but the individual pair counts
# have been randomized: every word occurrence is turned into a token,
# the tokens are shuffled, and the shuffled tokens are dealt back to
# the pages, each page receiving as many tokens as it had originally.
#
# Input requirements (violations are reported on stderr, exit status 1):
#   - every line has exactly three fields (blank lines are rejected too);
#   - COUNT is a non-negative decimal integer;
#   - each PAGE/WORD pair appears at most once.
#
# Options:
#   -v seed=N   seed the random generator with N.  When not given,
#               srand() is never called, so rand() runs from awk's
#               default seed.
#
# Progress messages are written to /dev/stderr.

BEGIN {
  abort = -1;          # >= 0 means "terminate with this exit status"
  split("", pwct);     # pwct[page,word] = count for that pair
  split("", wct);      # wct[word] = total count of word over all pages
  split("", pct);      # pct[page] = total count of all words on page
  if (seed != "") { srand(seed); }
}

(abort >= 0) { exit abort; }

(NF != 3) { file_error("wrong num of fields"); }

/./ {
  # A negative or fractional count would make the token list and the
  # page slots disagree in size, producing records with an empty word.
  if ($1 !~ /^[0-9]+$/) { file_error("count is not a non-negative integer"); }
  n = $1; p = $2; w = $3;
  if ((p,w) in pwct) { file_error("repeated word/page pair"); }
  pwct[p,w] += n;
  wct[w] += n;
  pct[p] += n;
}

END {
  # file_error() calls exit, which still runs END; do not emit output then.
  if (abort >= 0) { exit abort; }

  # The input pair counts are no longer needed; only the marginal
  # totals wct[] and pct[] are used from here on.
  split("", pwct);

  printf "expanding word sample...\n" > "/dev/stderr";
  # tx[0..ntx-1] holds one entry per word occurrence.
  split("", tx);
  ntx = 0;
  for (w in wct) {
    m = wct[w];
    for (i = 0; i < m; i++) { tx[ntx] = w; ntx++; }
  }

  printf "randomizing word sample...\n" > "/dev/stderr";
  # Shuffle by swapping each position i with a random position j in 0..i.
  # NOTE(review): the small subtraction presumably guards against rand()
  # returning exactly 1; it is kept unchanged so that results for the
  # default seed stay the same.
  for (i = ntx - 1; i >= 1; i--) {
    j = int((i+1)*rand()-0.000001);
    if (j != i) { tmp = tx[i]; tx[i] = tx[j]; tx[j] = tmp; }
  }

  printf "collecting into pages...\n" > "/dev/stderr";
  # Deal consecutive shuffled tokens to each page, pct[p] tokens per page.
  k = 0;
  for (p in pct) {
    m = pct[p];
    for (i = 0; i < m; i++) { w = tx[k]; k++; pwct[p,w]++; }
  }

  # Print out.  The membership test (rather than reading pwct[p,w])
  # avoids creating an empty element for every page/word combination;
  # every element that exists was incremented at least once.
  for (w in wct) {
    for (p in pct) {
      if ((p,w) in pwct) { printf "%7d %s %s\n", pwct[p,w], p, w; }
    }
  }
}

# Reports MSG on stderr, prefixed with the current file name and line
# number, records exit status 1 in `abort`, and terminates the run.
function file_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}