#! /usr/bin/gawk -f
# Last edited on 1999-01-18 22:34:17 by stolfi

# Reads a file with records COUNT PAGE WORD where
# COUNT is the number of occurrences of WORD on page PAGE.
#
# Computes the histogram of each word over all pages.
#
# Writes a file with records of the form
#
#   WORD TOTCT NPAGES NMISS SHAPE
#
# where TOTCT is the total occurrence count of the string, NPAGES is
# the number of pages where the word occurs, NMISS is the number of
# pages where the word doesn't occur, and SHAPE is the shape of the
# word's per-page distribution, defined here as the multiset of the
# nonzero per-page counts of that word, sorted in decreasing order
# and written as a parenthesized, comma-separated list, e.g. "(5,2,1)".
#
# Every input record must have exactly three fields; any other record
# (including a blank line) is reported on stderr and aborts the run
# with exit status 1.  The order of the output records is whatever
# order awk's "for (w in wct)" traversal yields.

BEGIN {
  # abort < 0 means "no error so far"; file_error() sets it to the
  # exit status that the END rule must propagate.
  abort = -1;

  split("", pwct);   # pwct[p,w] = count of word w on page p
  split("", wct);    # wct[w]    = total count of word w over all pages
  split("", pct);    # pct[p]    = total count of all words on page p
  nwords = 0;        # number of distinct words seen
  npages = 0;        # number of distinct pages seen
  ct = 0;            # grand total of all counts
}

# Defensive: stop consuming input once an error has been recorded.
(abort >= 0) { exit abort; }

(NF != 3) { file_error("wrong num of fields"); }

/./ {
  n = $1; p = $2; w = $3;
  if ((p,w) in pwct) { file_error("repeated word/page pair"); }
  pwct[p,w] += n;
  if (! (w in wct)) { nwords++; }
  wct[w] += n;
  if (! (p in pct)) { npages++; }
  pct[p] += n;
  ct += n;
}

END {
  # "exit" inside a rule or function still runs END, so re-check here
  # and leave with the recorded status without printing anything.
  if (abort >= 0) { exit abort; }

  for (w in wct) {
    # Collect the nonzero per-page counts of this word.
    split("", shape);
    ns = 0;
    for (p in pct) {
      # Test membership first: a bare reference to pwct[p,w] would
      # create an empty element for every (page, word) combination.
      if (((p,w) in pwct) && (pwct[p,w] != 0)) {
        shape[ns] = pwct[p,w];
        ns++;
      }
    }

    # Insertion sort into decreasing order.
    for (i = 1; i < ns; i++) {
      t = shape[i];
      for (j = i - 1; (j >= 0) && (shape[j] < t); j--) {
        shape[j+1] = shape[j];
      }
      shape[j+1] = t;
    }

    # Join the sorted counts into "(c0,c1,...)"; an empty shape
    # (word seen only with zero counts) yields "()".
    ss = "(";
    for (i = 0; i < ns; i++) {
      ss = (ss (i == 0 ? "" : ",") shape[i]);
    }
    ss = (ss ")");

    printf "%s %d %d %d %s\n", w, wct[w], ns, npages - ns, ss;
  }
}

# Reports an input error for the current record on stderr, records
# exit status 1 in the global "abort", and terminates input processing.
function file_error(msg) {
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}