#! /usr/bin/gawk -f
# Last edited on 2002-02-23 16:18:20 by stolfi
#
# Counts tokens and distinct words per recipe.
#
# Input:
#   - A header line of the form "## SEC-NUM" (digits, dash, digits) starts
#     a new recipe; SEC (the digits before the dash) is its section.
#   - Blank lines and lines whose first non-blank character is "#" are
#     skipped.
#   - Every other line is a data line: a leading run of letters, digits,
#     "-" and "." followed by a blank is dropped (presumably a line
#     locator -- confirm against the data), isolated "-" and "=" are
#     dropped, and the remaining blank-separated fields are the tokens.
#     A token containing "*" is counted as bad and is not entered in the
#     word tables.
#
# Output (stdout): one line per recipe,
#     "<recipe index> <SEC-NUM> <tokens> <distinct good words>",
#   with a "000 00-00 00 00" separator line whenever the section changes.
# Output (stderr): a one-line summary over the whole input.

BEGIN {
  np = 0;               # Number of recipe headers seen so far.
  xp = "";              # "SEC-NUM" label of the current recipe.
  osec = "";            # Section of the previous recipe ("" before the first).
  ntk = 0;              # Tokens (good and bad) in the current recipe.
  nwd = 0;              # Distinct good words in the current recipe.
  tottk = 0;            # Tokens (good and bad) in the whole input.
  badtk = 0;            # Tokens containing "*" in the whole input.
  totwd = 0;            # Distinct good words in the whole input.
  split("", seen);      # Good words seen in the current recipe.
  split("", totseen);   # Good words seen in the whole input.
}

# Recipe header: flush the previous recipe and start a new one.
/^[#][#][ ]+[0-9]+[-][0-9]+ *$/ {
  print_it();
  xp = $2;
  # The section is whatever precedes the dash; the rule's pattern
  # guarantees that this match succeeds.
  match(xp, /^[0-9]+[-]/);
  sec = substr(xp, 1, RLENGTH-1);
  if ((sec != osec) && (osec != "")) {
    # Section changed: emit a separator record.
    printf "000 00-00 00 00\n";
  }
  osec = sec;
  np++;
  # Reset the per-recipe counters. (The collapsed original lacked the
  # ";" after "ntk = 0", which turned the two statements into a string
  # concatenation.)
  ntk = 0;
  split("", seen);
  nwd = 0;
  next;
}

# Comment or blank line: ignore.
/^[ ]*([#]|$)/ {
  next;
}

# Data line: count its tokens. Lines that precede the first header still
# contribute to the grand totals, but are never reported per recipe.
{
  # Trailing blank so that a final "-" or "=" is also blank-delimited.
  $0 = ($0 " ");
  gsub(/^[-.0-9A-Za-z]+[ ]/, " ", $0);
  gsub(/[ ][-=][ ]/, " ", $0);
  for (i = 1; i <= NF; i++) {
    w = $(i);
    ntk++;
    tottk++;
    if (w !~ /[*]/) {
      if (! (w in seen)) { nwd++; seen[w] = 1; }
      if (! (w in totseen)) { totwd++; totseen[w] = 1; }
    } else {
      badtk++;
    }
  }
}

END {
  print_it();
  # Guard the average: with no recipe headers np is 0 and the division
  # would be a fatal error in gawk.
  avgtk = (np > 0 ? tottk/np : 0);
  printf "total %3d recipes, %5d tokens (%4d bad), %5.2f tokens/recipe, %4d good words\n", np, tottk, badtk, avgtk, totwd > "/dev/stderr";
}

function print_it()
{
  # Prints the record of the current recipe, if any has been started.
  if (xp != "") {
    printf "%03d %s %4d %4d\n", np, xp, ntk, nwd;
  }
}