#! /usr/bin/gawk -f 
# Last edited on 2004-10-03 20:46:51 by stolfi

BEGIN {
  # Purpose: read a text with one token per line and print a table of
  # character counts, one line per distinct character, in the format
  #   "{COUNT} {FREQ} {CHAR}"
  # where {FREQ} is {COUNT} divided by the total number of characters
  # tallied.  Each token contributes one extra "_" that stands for the
  # word break following it, so tokens must not contain spaces or "_".
  # The table is meant to be ordered by decreasing {COUNT}; blank
  # lines are meant to be ignored.

  # Running total of all tallied characters, word breaks included.
  totct = 0;

  # Per-character tallies; splitting an empty string makes "c" an
  # empty array.
  split("",c);

  # Usage synopsis.  It is only stored here, never printed by this file.
  # NOTE(review): under gawk ARGV[0] is normally the interpreter name
  # ("gawk" or "awk"), not the script file name - confirm before relying
  # on this text in a message.
  usage = ( ARGV[0] " < SAMPLE.tks > SAMPLE.lfr" );
}

# Tally every character of the token in field 1, plus one trailing "_"
# that stands for the word break after the token.
# The guard is NF > 0 rather than /./ so that a whitespace-only line,
# whose $1 is empty, is skipped like an empty line instead of being
# counted as a lone word break.
NF > 0 {
  w = ($1 "_"); m = length(w);
  for (i = 1; i <= m; i++)
    { c[substr(w,i,1)]++;   # tally for this character
      totct++;              # grand total, denominator of the frequencies
    }
}

END {
  # Print one "{COUNT} {FREQ} {CHAR}" line per distinct character, by
  # decreasing {COUNT}.  A plain "for (x in c)" visits elements in an
  # unspecified order, so ask gawk (4.0 or later) to traverse the array by
  # numeric value, largest first.  Older gawk versions ignore this setting
  # and emit the table unsorted.
  PROCINFO["sorted_in"] = "@val_num_desc";

  # With no input tokens "c" is empty, the loop body never runs, and the
  # division by totct is never evaluated.
  for (x in c)
    { printf "%7d %8.6f %s\n", c[x], c[x]/totct, x; }
}