#! /usr/bin/gawk -f
# Last edited on 2000-02-02 21:24:42 by stolfi

BEGIN {
  # Input: a file of word counts, as produced by "uniq -c", split into
  # sections by blank lines and/or #-comments.
  #
  # Output: a similar file, one record per input record, in the format
  #
  #   COUNT  SCUMCT SFRAC SCUMFR  FCUMCT FFRAC FCUMFR  WORD...
  #
  # where
  #   COUNT WORD...   is the original record;
  #   SCUMCT, FCUMCT  are the cumulative counts within the section and file;
  #   SFRAC, FFRAC    are COUNT divided by the section and file totals;
  #   SCUMFR, FCUMFR  are the matching cumulative fractions.
  #
  # Blank lines and comments are copied through unchanged.

  # Exit status requested by error(); a negative value means "no error".
  abort = -1;

  # Separator used when a record is rebuilt after a field is modified.
  OFS = " ";

  # Buffer for the whole input: ct[k] is the count of stored line k
  # (or "#" for a blank/comment line), w[k] is the rest of that line,
  # and nr is the number of lines stored so far.
  nr = 0;
  split("", ct);
  split("", w);
}

# Slurp input file:

# Once an error status has been recorded, stop consuming input.
abort >= 0 { exit abort }

# Blank line or #-comment: keep the text verbatim and tag it with "#"
# in ct[] so that the END rule can tell it apart from a data line.
/^ *([#]|$)/ {
  w[nr] = $0;
  ct[nr] = "#";
  nr++;
  next;
}

# Data line: must consist of exactly two fields, COUNT and WORD.
# Stores the count in ct[] and the remainder of the record in w[].
{
  # Report the position of the offending line within its input file (FNR);
  # NF is the number of fields, not a line number.
  if (NF != 2) { error(("line " FNR ": bad format")); }
  # Blanking $1 rebuilds $0 using OFS, so w[nr] begins with a separator
  # followed by the word; the END rule prints it right after the last number.
  ct[nr] = $1; $1 = ""; w[nr] = $0;
  nr++; next;
}

# Print it out: 

END {
  # An `exit` executed in a main rule (as error() does) still runs END.
  # Bail out here so that no partial table is printed after an error and
  # the recorded status is the one returned.
  if (abort >= 0) { exit abort; }

  # Grand total of all counts in the file; forced to 1 when zero so the
  # divisions below are always defined.
  totF = 0;
  for (i = 0; i < nr; i++) { if (ct[i] != "#") { totF += ct[i]; } }
  if (totF == 0) { totF = 1; }

  # Walk the stored lines; cumF accumulates counts across the whole file.
  i = 0; cumF = 0;
  while (i < nr) {
    if (ct[i] == "#") {
      # Blank or comment line: echo it unchanged.
      print w[i]; i++;
    } else {
      # A section is a maximal run of data lines [i, j); total its counts.
      j = i; totS = 0;
      while ((j < nr) && (ct[j] != "#")) { totS += ct[j]; j++; }
      if (totS == 0) { totS = 1; }
      # Print each record of the section with its cumulative counts and
      # fractions; cumS restarts at every section.
      cumS = 0;
      while (i < j) {
        V = ct[i]; cumS += V; cumF += V;
        printf "%7d %7d %6.4f %6.4f %7d %6.4f %6.4f%s\n",
          V, cumS, (V/totS), (cumS/totS), cumF, (V/totF), (cumF/totF), w[i];
        i++;
      }
    }
  }
}
    
# Writes "*** MSG" to standard error, records status 1 in the global
# `abort`, and exits with that status.
function error(msg) {
  abort = 1;
  printf("*** %s\n", msg) > "/dev/stderr";
  exit abort;
}