#! /usr/bin/gawk -f
# Last edited on 2008-06-28 14:25:20 by stolfi

BEGIN {
  abort = -1;
  usage = ( "tot-worksheet\\\n" \
    "  < INFILE > OUTFILE" \
  );
  
  # {indent[i]} is the indentation of line {i} (a string of blanks).
  # {tag[r]} is column 1 of line {r}, for {r} in {1..nlines}.
  # {data[r,i]} is field {i} of input line {r}.
  # {val[r,i]} is numeric value of {data[r,i]}, "*" if non-numeric.
  # If {tag[r] == ""} then line is blank and {data[r,*]} is irrelevant.
  # If {tag[r] == "#" or " "} then {data[r,1]} is rest of line unparsed.
  # If {tag[r] == "+"} then line {r} is a header line.
  # If {tag[r] == "|"} then line {r} is a data line.
  split("", indent);
  split("", tag);
  split("", data);
  split("", val);
  nlines = 0;
  
  # These are derived from all data and header lines:
  # {mrg_L[i]} is the leftmost charpos of field {i} in any line.
  # {mrg_R[i]} is the rightmost charpos of field {i} in any line.
  # {width[i]} is the maximum width of field {i} in any lines.
  # {align_L[i]} is 1 if field {i} is always left-aligned.
  # {align_R[i]} is 1 if field {i} is always right-aligned.
  # Character positions are counted after stripping the indentation.
  # There is no check for consistent indentation.
  split("", mrg_L);
  split("", mrg_R);
  split("", width);
  split("", align_L);
  split("", align_R);

  # These are derived only from data lines:
  # {prec[i]} is max fraction digits seen in field {i} of all data lines.
  split("", prec);

  # These are derived only from non-TOTAL data lines:
  # {has_numbers[i]} is 1 if field {i} of some line could be numeric.
  # {has_alphas[i]} is 1 if field {i} of some line is definitely non-numeric.
  # {tot[i,lev]} is the current level {lev} (sub)total of field {i} of lines.
  split("", has_numbers);
  split("", has_alphas);
  split("", tot);
  
  # Number of data columns seen:
  ncols = -1;
  
  # Number of (sub)total levels seen; {lev} ranges from 1 to {nlevels}:
  nlevels = 1;
}

// { 
  nlines++; r = nlines;
  $0 = untabify($0);
}

// { 
  nlines++; r = nlines;
  # Get the indentation string {indent[r]}:
  if (match(/^[ ]*[+|\#]/, $0))
    { indent[r] = substr($0,1,RLENGTH-1);
      $0 = substr($0,RLENGTH);
    }
  else if (match(/^[ ]+/, $0))
    { indent[r] = substr($0,1,RLENGTH-1);
      $0 = substr($0,RLENGTH);
    }
  else
    { indent[r] = ""; }
}

/^[ ]*$/ {
  tag[r] = "";
  next;
}

/^[ #]/ {
  tag[r] = substr($0,1,1);
  data[r,1] = substr($0,2);
  next;
}

/^[+|][ ]/ {
  tag[r] = substr($0, 1,1); 
  if (ncols == -1) { ncols = NF-1; init_cols(); }
  if (ncols != NF-1) { data_error(("bad num cols = " NF-1)); }
  get_fields();
  if (tag[r] == "|")
    { get_numeric_values();
      lev = line_total_level();
      if(lev > 0)
        { set_totals(lev); }
      else
        { acc_totals(); }
    }
  next;
}

// { 
  data_error(("bad tag = \"" substr($0,1,1) "\""));
}

function init_cols(  i,j)
{
  for (i = 1; i <= ncols; i++) 
    { width[i] = 0; 
      for(j = 1; j <= nlevels; j++) { tot[i,j] = 0; }
      has_numbers[i] = 0; has_alphas[i] = 0;
      mrg_L[i] = 9999; mrg_R[i] = 0;
      align_L[i] = 1; align_R[i] = 1;
    }
}

function get_fields(   i,lin,xv,ich)
{
  # sets {data[r,i], mrg_L[i], mrg_R[i], width[i]} for fields of line {r}
  lin = substr($0,2);
  ich = 2;
  for (i = 1; i <= ncols; i++)
    { match(lin, /^[ ]*/);
    
      ich += RLENGTH; lin = substr(lin, RLENGTH+1);
      # Field {i} of this line starts on character column {ich}
      if (mrg_L[i] > ich)
        { mrg_L[i] = ich; }
      else if ((mrg_L[i] != ich) && (mrg_L[i] != 9999))
        { align_L[i] = 0; }
      
      match(lin, /^[^ ]*/);
      xv = substr(lin, 1, RLENGTH); 
     
      ich += RLENGTH; lin = substr(lin, RLENGTH+1);
      # Field {i} on this line ends on character column {ich-1}
      if (mrg_R[i] < ich-1) 
        { mrg_R[i] = ich-1; }
      else if ((mrg_R[i] != ich-1) && (mrg_R[i] != 0) && (xv !~ /^[,.]$/))
        { align_R[i] = 0; } 
      
      data[r,i] = xv;
      val[r,i] = "*";
      w = length(xv);
      if (w > width[i]) { width[i] = w; }
      # printf "xv = (%s)  width[i] = %d  mrg = %d %d  align = %d %d\n", \
      #   xv, width[i], mrg_L[i], mrg_R[i], align_L[i], align_R[i] > "/dev/stderr";
    }
  if (! match(lin, /^[ ]*$/))
    { data_error(("unexpected leftovers = \"" lin "\"")); }
}

function get_numeric_values(   i,xv,pr)
{
  # sets {val[r,i], prec[i]} for fields of line {r}
  for (i = 1; i <= ncols; i++)
    { xv = data[r,i];
      if (looks_numeric(xv)) 
        { xv = english_format(xv);
          if (match(xv, /[.][0-9]*$/))
            { pr = RLENGTH - 1; 
              if (pr > prec[i]) {  prec[i] = pr; }
            }
          val[r,i] = xv + 0;
        }
    }
}

function line_total_level(   i,k,lev)
{
  lev = 0;
  for(i = 1; i <= ncols; i++)
    { k = field_total_level(data[r,i]);
      if (k > 0) 
        { if((lev > 0) && (lev != k)) 
            { data_error("bad (SUB)TOTAL line"); }
          lev = k;
        }
    }
  return lev;
}

function field_total_level(x,  k)
{
  # 1 for "TOTAL", 2 for "SUBTOTAL", 3 for "SUBSUBTOTAL", etc.
  k = 1;
  while (x ~ /^SUB/) { k++; x = substr(x,4); }
  return (x == "TOTAL" ? k : 0);
}

function acc_totals(    i,j,v)
{
  # Accumulates numeric fields of line {r} into the respective totals.
  # Also sets {has_numbers[i]} for columns which look numeric,
  # and {has_alphas[i]} for cols that are non-numeric.
  for(i = 1; i <= ncols; i++)
    { v = val[r,i];
      if (v != "*") 
        { for (j = 1; j <= nlevels; j++) { tot[i,j] += v; }
          has_numbers[i] = 1;
        }
      else
        { has_alphas[i] = 1; }
    }
}

function set_totals(lev,     i,j,v,xv)
{
  # Replaces numeric fields of line {r} by the {lev} totals and 
  # resets totals of that and higher levels.
  # Also sets {has_numbers[i]} for those cols.
  # However does NOT set {has_alphas[i]} for other cols.

  # Replicate totals, if needed, until we have enough levels:
  while (nlevels < lev) 
    { for(i = 1; i <= ncols; i++) 
        { tot[i,nlevels+1] = tot[i,nlevels]; }
      nlevels++;
    }
    
  # Now set and clear totals:
  for(i = 1; i <= ncols; i++)
    { v = val[r,i];
      if (v != "*")
        { val[r,i] = tot[i,lev]; 
          for (j = lev; j <= nlevels; j++) { tot[i,j] = 0; }
          has_numbers[i] = 1;
        }
    }
}

END {
  # Decide whether columns are left- or right-aligned:
  for (i = 1; i <= ncols; i++)
    { if (align_R[i] + align_L[i] != 1)
        { # Original algnment ambiguous; decide by field type
          if (has_alphas[i]) 
            { align_L[i] = 1; align_R[i] = 0; }
          else
            { align_R[i] = 1; align_L[i] = 0; }
        }
    }

  # Adjust field margins to ensure minimum spacing: 
  ich = 3;
  for (i = 1; i <= ncols; i++)
    { if (align_R[i]) 
        { mrg_L[i] = mrg_R[i] - width[i] + 1; }
      else
        { mrg_R[i] = mrg_L[i] + width[i] - 1; }
      if (mrg_L[i] < ich) 
        { mrg_L[i] = ich; mrg_R[i] = ich + width[i] - 1; }
      ich = mrg_R[i] + 2;
    }

  # Print all lines
  for (r = 1; r <= nlines; r++)
    { print_line(r); }
}

function print_line(r,    i,ich,v,xv)
{
  if (tag[r] == "")
    { printf "%s%s", indent[r], tag[r];
      printf "\n";
    }
  else if ((tag[r] == " ") || (tag[r] == "#"))
    { printf "%s%s", indent[r], tag[r];
      printf "%s", data[r,1];
      printf "\n";
    }
  else
    { print_data_line(r); }
}


function print_data_line(r,   i,ich,v,xv,)
{
  # Requires {indent[r],tag[r]}.
  # Requires {data[r,1..ncols],val[r,1..ncols]}.
  # Requires {mrg_L[1..ncols],prec[1..ncols],width[1..ncols],align_R[1..ncols]}.
  printf "%s%s", indent[r], tag[r];
  ich = 2;
  for (i = 1; i <= ncols; i++)
    { if (mrg_L[i] > ich) { printf "%*s", mrg_L[i]-ich, ""; }
      ich = mrg_L[i];
      v = val[r,i];
      if (v == "*")
        { xv = data[r,i]; }
      else
        { xv = format_number(v,prec[i],0,0); }
      if (length(xv) > width[i]) 
        { data_error(("field overflow - width = " w " val = \"" xv "\"")); }
      if (align_R[i])
        { printf "%*s", width[i], xv; }
      else
        { printf "%*s", -width[i], xv; }
      ich += width[i];
    }
  printf "\n";
}