#! /usr/bin/gawk -f
# Last edited on 1998-08-02 19:52:06 by stolfi

BEGIN {
  # "abort" holds the pending exit status; a negative value means "no error".
  # The other rules and END test it so that an "exit" issued here (which
  # still runs END) terminates the program with the right status.
  abort = -1;
  usage = "compute-int-freqs [ -v cum=BOOL | -v rem=BOOL ] < INFILE > OUTFILE";
  #
  # Reads a file of COUNT ITEM pairs, as produced by "uniq -c".
  # Lines that are blank or begin with "#" are copied through unchanged.
  # Outputs a similar file with FREQ ITEM lines, where FREQ is
  # the fraction of COUNT relative to the total of all COUNTs, scaled
  # by 999.999 and truncated to an integer in [0..999].  A fraction that
  # is exactly zero is printed as "." instead of a number.
  #
  # If "cum" is set, FREQ is computed from the sum of all COUNTs up to
  # this one (inclusive).  If "rem" is set, FREQ is computed from the
  # sum of all counts following this one (exclusive).

  # Options not given on the command line default to false.
  if (cum == "") { cum = 0; }
  if (rem == "") { rem = 0; }

  # The two modes are mutually exclusive.  Report the problem directly on
  # the standard error stream, so that this block does not depend on a
  # helper function being defined elsewhere.
  if (cum && rem)
    { printf "%s\n", "compute-int-freqs: you may specify only one of \"cum\" and \"rem\"" > "/dev/stderr";
      printf "usage: %s\n", usage > "/dev/stderr";
      abort = 1;
      exit abort;
    }

  # Accumulators: sum of all counts, number of stored lines, number of data lines.
  total = 0;
  k = 0; nitems = 0;
}

/^([#]|[ ]*$)/ {
  # Comment lines (leading "#") and lines made only of spaces are kept
  # verbatim; the "#" marker in ct[] tells the END block to echo them as is.
  if (abort >= 0)
    exit abort;
  it[k] = $0;
  ct[k] = "#";
  k += 1;
  next;
}

// {
  # Data line: expected to be exactly "COUNT ITEM".
  if (abort >= 0) { exit abort; }
  if (NF != 2)
    { # Report the offending record number (NR), not the field count (NF),
      # and write straight to the standard error stream so that this rule
      # does not depend on a helper function being defined elsewhere.
      printf "line %d: bad input format\n", NR > "/dev/stderr";
      abort = 1;
      exit abort;
    }
  # Accumulate the grand total and remember the pair for the END block.
  total += $1; nitems++;
  ct[k] = $1; it[k] = $2;
  k++;
  next;
}

END {
  # An error recorded earlier takes precedence over producing any output.
  if (abort >= 0)
    exit abort;

  # Denominator of every fraction is the grand total of the counts.
  denom = total;

  # Starting value of the running numerator: zero when accumulating
  # upward ("cum"), the full total when counting down ("rem").
  if (cum)
    numer = 0;
  else if (rem)
    numer = denom;

  # Replay the stored lines in their original order.
  idx = 0;
  while (idx < k) {
    if (ct[idx] == "#") {
      # Comment or blank line: echo it untouched.
      print it[idx];
      idx++;
      continue;
    }

    cnt = ct[idx];
    if (cum) {
      numer += cnt;
    } else if (rem) {
      numer -= cnt;
    } else {
      numer = cnt;
    }

    if (numer == 0) {
      # An exactly-zero numerator is shown as a dot rather than a number.
      printf "  . %s\n", it[idx];
    } else {
      # Scale by 999.999 and truncate, so a fraction of 1 prints as 999.
      freq = int(numer*999.999/denom);
      printf "%3d %s\n", freq, it[idx];
    }
    idx++;
  }
}