#! /usr/bin/gawk -f
# Last edited on 2008-06-15 21:02:36 by stolfi

BEGIN {
  USAGE = ( \
    "format-multi-counts \\\n" \
    "  [ -v titles=\"{T[1]} {T[2]} ... {T[n]} {TW}\" ] \\\n" \
    "  [ -v totals={BOOL} ] \\\n" \
    "  [ -v freqs={BOOL} | -v cumFreqs={BOOL} | -v remFreqs={BOOL} ] \\\n" \
    "  [ -v colSep={STRING} ] \\\n" \
    "  [ -v width={NUM} ] \\\n" \
    "  [ -v widths=\"{W1} {W2} ... {Wn}\" ] \\\n" \
    "  [ -v maxLines={NUM} ] \\\n" \
    "  < {INFILE} > {OUTFILE}" \
  );

  # The {INFILE} should be a file with records in the format 
  # "{F[1]} {F[2]} ... {F[n]} {WORD}" where each {F[i]} is a number
  # (except that "." is accepted as meaning 0).
  # 
  # Outputs a fixed-format listing " {F[1]} {F[2]} ... {F[n]} {WORD}"
  # with each field {F[i]} right-justified and {WORD} left-justified.
  #
  # Blank lines and '#'-comments are discarded from the input.
  # 
  # If the string {titles} is specified and non-empty, prints its
  # words as column headers; the last word is the header of the
  # {WORD} column. In that case every input line must have exactly
  # as many fields as there are title words. If {titles} is omitted,
  # no header is printed and the number of fields is taken from the
  # first data line.
  # 
  # If {widths} is specified, prints each field {F[i]} with {W[i]}
  # columns, right-justified. If {W[i]} is not specified for some
  # column, it defaults to {width}, which in turn defaults to 7.
  # Columns are widened as needed to fit their title and contents.
  # The {WORD} is always printed left-justified.
  #
  # The column separator is the {colSep} parameter (default "  ").
  #
  # A line with {WORD="TOTAL"} is printed if {totals} is set.
  # If {freqs} is set, prints the ratio of each {F[i]} to the column
  # total instead of {F[i]}, as an integer between 0 and 999. 
  # If {cumFreqs} is set, the ratio is computed from the sum of the
  # entries of column {i} in all rows up to this one (inclusive). 
  # If {remFreqs} is set, the ratio is computed from the sum of the
  # entries of column {i} in all rows following this one (exclusive). 
  # 
  # For {totals}, {freqs}, {cumFreqs}, and {remFreqs}, the 
  # fields {F[i]} had better be numeric.  On output, 
  # any {F[i]} that is 0 is printed as ".".
  # 
  # If {maxLines} is set, tabulates only the first {maxLines} rows
  # of {INFILE}, and replaces any additional lines by a line
  # with {WORD=="..."} and every {F[i]=="."}.  However, the 
  # totals and frequencies are always computed relative to 
  # the whole input file.
  #
  # WARNING: the {WORD} field should not contain any spaces.

  # Negative {abort} means "no error so far"; the error functions
  # set it to the exit status before calling {exit}.
  abort = -1;
  
  # Check the arguments and provide defaults:
  if (width == "") { width = 7; } else { width += 0; }
  if (colSep == "") { colSep = "  "; }
  if (maxLines == "") { maxLines = -1; } else { maxLines += 0; }
  if (freqs == "") { freqs = 0; } else { freqs += 0; }
  if (cumFreqs == "") { cumFreqs = 0; } else { cumFreqs += 0; }
  if (remFreqs == "") { remFreqs = 0; } else { remFreqs += 0; }
  if (cumFreqs || remFreqs) { freqs = 1; }
  if (cumFreqs && remFreqs) 
    { arg_error(("options {cumFreqs} and {remFreqs} are exclusive")); }

  # Parse the {widths} string, set {wd[1..nw]}.
  if (widths !~ /^[0-9 ]*$/) { arg_error(("bad {widths} argument")); }
  nw = split(widths, wd);

  # Parse the {titles} string, set {nh} and {hd[1..nh]}.
  # The titles are optional: {nh == 0} means "no header", and then
  # {nf} is left at 0 so that the first data line defines it.
  nh = split(titles, hd);

  # Expand {wd[1..nh-1]} to account for title widths:
  for (i = 1; i <= nh-1; i++) 
    { # Make sure {wd[i]} exists (with the default width) before
      # comparing it with the title length:
      while (nw < i) { nw++; wd[nw] = width; }
      wdi = length(hd[i]);
      if (wdi > wd[i]) { wd[i] = wdi; }
    }

  split("", tot);  # {tot[i]} is the sum of column {i} over all lines.
  split("", ct);   # {ct[k,i]} is field {i} of saved line {k}.
  split("", key);  # {key[k]} is the {WORD} of line {k}.

  nl = 0;  # Number of lines in input file.
  nf = nh; # Number of fields in output (including the {WORD}).
}

# Stop reading input as soon as an error status has been recorded in {abort}.
abort >= 0 { exit abort; }

# Normalize every input line before it is examined by the data rule:
# turn tabs, form feeds and carriage returns into blanks, remove
# '#'-comments, and trim leading and trailing blanks.
# The rule is unconditional on purpose: a comment line such as "#foo",
# or a line made only of tabs, contains no blank and would otherwise
# escape the clean-up and be parsed as data.
{ 
  gsub(/[\011\014\015]/, " ", $0);
  gsub(/[\#].*$/, "", $0);
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
} 

/./ {
  # Non-empty line (after normalization): one data record.
  # The first record fixes the field count {nf} when it is not yet
  # known; every other record must have exactly {nf} fields.
  if (nf <= 0)
    { nf = NF; }
  else if (NF != nf)
    { data_error(("wrong num of fields nh = " nh " NF = " NF "\n" $0)); }

  nl++;

  # Rows are saved only up to {maxLines} (no limit if it is negative),
  # but the column totals always include every row.
  keep = ((maxLines < 0) || (nl <= maxLines));
  for (col = 1; col < nf; col++)
    { val = $(col);
      tot[col] += (val == "." ? 0 : val);
      if (! keep) { continue; }
      # Save the field:
      ct[nl,col] = val;
      # Make sure {wd[col]} exists, then widen it to fit the field:
      while (nw < col) { nw++; wd[nw] = width; }
      len = length(val);
      if (len > wd[col]) { wd[col] = len; }
    }

  # The last field is the {WORD}:
  key[nl] = $(nf);
  next;
}

END {
  # An error recorded earlier suppresses all output:
  if (abort >= 0) { exit abort; }

  if (freqs)
    { # {den[c]} is the denominator of the frequencies of column {c}
      # (the column total). For the running variants, {num[c]} is the
      # initial numerator: the whole total for {remFreqs}, zero for
      # {cumFreqs}.
      split("", den);
      split("", num);
      for (c = 1; c < nf; c++)
        { den[c] = tot[c];
          if (remFreqs)
            { num[c] = den[c]; }
          else if (cumFreqs)
            { num[c] = 0; }
        }
    }

  if (totals)
    { # Widen the columns so that the TOTAL row fits:
      for (c = 1; c < nf; c++)
        { need = (freqs ? length("999") : length(tot[c]));
          if (wd[c] < need) { wd[c] = need; }
        }
    }

  if (nh > 0)
    { # Title line:
      for (c = 1; c < nf; c++) { pr(c, (c < nh ? hd[c] : "?")); }
      printf "%s\n", hd[nh];
      # Line of dashes under the titles:
      for (c = 1; c < nf; c++) { pd(c); }
      dashes = "";
      for (j = length(hd[nh]); j > 0; j--) { dashes = dashes "-"; }
      printf "%s\n", dashes;
    }

  # Decide the number of lines to print. A negative {maxLines} means
  # "no limit"; when the input is truncated, one extra row is printed
  # for the "..." line.
  if (maxLines < 0) { maxLines = nl+1; }
  if (nl > maxLines) { ml = maxLines+1; } else { ml = nl; }

  for (k = 1; k <= ml; k++)
    { for (c = 1; c < nf; c++)
        { shown = ct[k,c];                          # Text to print.
          val = (shown == "." ? 0 : shown + 0);     # Numeric count.
          if (freqs)
            { # Update the numerator according to the frequency mode:
              if (cumFreqs)
                { num[c] += val; }
              else if (remFreqs)
                { num[c] -= val; }
              else
                { num[c] = val; }
              # Zero prints as "."; otherwise scale the ratio to 0..999:
              shown = (num[c] == 0 ? "." : int(num[c]*999.999/den[c]));
            }
          else if (val == 0)
            { shown = "."; }

          if (k <= maxLines)
            { pr(c, shown); }
          else if (k == maxLines+1)
            { pr(c, "."); }
        }
      if (k <= maxLines)
        { printf "%s\n", key[k]; }
      else if (k == maxLines+1)
        { printf "%s\n", "..."; }
    }

  if (totals)
    { # TOTAL row: column sums, or "999" when printing frequencies.
      for (c = 1; c < nf; c++) { pr(c, (freqs ? "999" : tot[c])); }
      printf "%s\n", "TOTAL";
    }
}

function pr(i, str,     w)
{
  # Writes {str} right-justified in column {i}, then the column
  # separator {colSep}. Columns beyond {wd[1..nw]} use the default
  # {width}.
  if (i <= nw)
    { w = wd[i]; }
  else
    { w = width; }
  printf("%*s%s", w, str, colSep);
}

function pd(i,     w,j)
{
  # Writes a run of dashes as wide as column {i}, then the column
  # separator {colSep}. Columns beyond {wd[1..nw]} use the default
  # {width}.
  if (i <= nw)
    { w = wd[i]; }
  else
    { w = width; }
  j = 0;
  while (j < w) { printf "-"; j++; }
  printf("%s", colSep);
}

function arg_error(msg)
{
  # Reports a bad command-line argument: writes {msg} and the usage
  # text to standard error, records the failure in {abort}, and exits
  # with status 1.
  abort = 1;
  printf("format-multi-counts: **%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", USAGE) > "/dev/stderr";
  exit 1;
}

function data_error(msg)
{
  # Reports a bad input record: writes the file name, the line number
  # and {msg} to standard error, followed by the offending line quoted
  # between guillemets; then records the failure in {abort} and exits
  # with status 1.
  # The quote marks were mis-encoded in the source (Latin-1 guillemets
  # read as another 8-bit charset); they are restored here.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  «%s»\n", $0 > "/dev/stderr";
  abort = 1; exit 1;
}