#! /usr/bin/gawk -f
# Last edited on 2008-06-28 17:31:47 by stolfi

BEGIN {
  abort = -1;
  usage = ( "tot-worksheet\\\n" \
    "  [-v altfmt=BOOL] \\\n" \
    "  < INFILE > OUTFILE" \
  );
  
  # Simple-minded ASCII spreadsheets.
  # 
  # Input lines are either blank or of three types, determined by
  # their first non-blank character (the /tag/):
  # 
  #   "#" - comment line
  #   "|" - data line
  #   "+" - field definition line
  # 
  # All non-blank lines must have the tag on the same column.
  # Before any processing, ASCII tab (HT, octal '\011') is expanded by
  # the usual rules, and ISO-Latin-1 non-breaking space (NBSP, '\240')
  # is converted to ordinary blanks.
  # 
  # Comment lines may contain anything; they are copied to the
  # output but are otherwise ignored.
  # 
  # After the tag, a "+" line must have only "-" and spaces. Each run
  # of "-"s defines the character position and length of one column of
  # the table. There must be at least one "+" line before the first
  # data line. If there is more than one "+" line, they must all
  # define exactly the same columns.
  # 
  # Each data line is broken into fixed-width fields according to the
  # format specified by the "+" lines. The characters before, between,
  # and after these fields must be blank. If a line is too short, it
  # is implicitly padded with blanks. Each field is classified as
  # blank, numeric or alphabetic; for each numeric field, the script
  # extracts, the number of decimal fraction digits, whether there are
  # any thousands-separators (TPs), and the numeric value (after
  # removing the TPs).
  # 
  # After reading the data, the scripts totalizes the worksheet. The
  # fields are scanned line by line, top down, left to right. If the
  # content of a field is a single "=", the total of all entries above
  # it is assigned the field immediately at its right (which must be a
  # numeric field). If there is more than one "=" in the same column,
  # each total after the first one totalizes only the entries up to
  # the preceding "=". 
  # 
  # A column is considered numeric if all fields in that column are
  # right-justified and either numeric, blank, "-" (which is
  # equivalent to 0), or "="; and there is at least one entry that is
  # numeric or "=". The "=" may only appear in numeric columns.
  # 
  # After totalizing, the worksheet is reformatted. Formatting is
  # applied to all numeric fields in numeric columns,
  # and to total fields that were set by "=" in the adjacent column.
  # Other fields are left alone, even if they are numeric.
  # Reformatting means typesetting the field's numeric value with N
  # decimal fraction digits, where N is the maximum number of such
  # digits among all the numeric entries in that column. Also, if any
  # of the original fields in the column had TPs, then TPs are
  # inserted in the field, too, to separate the integer part in groups
  # of three digits. The re-formatted field is then inserted back into
  # its slot in the worksheet, right-justified; the script aborts if
  # it does not fit.
  # 
  # Finally, the worksheet is printed, with all the original blank,
  # "+", and "#" lines inserted among the new "|" lines, in the
  # original order.
  #  
  # Note that totalization may insert in some column a field value
  # with more decimal fraction digits than any original entry in that
  # column. In that case the extra digits will be supressed in the
  # printout; however, they will still be considered if that field is
  # itself totalized.
  # 
  # A field is considered numeric if it contains at least one digit
  # and fits the regular expression 
  #   /^[-+]?([0-9]+[,][0-9,]*[0-9]|[0-9]*)([.][0-9]*|)$/
  # Note that the number may start or end with ".", but 
  # "." alone (with or without sign) is not accepted.
  # Note also that numbers in scientific notation are not 
  # recognized. The commas are considered thousands-separators;
  # they do not affect the numeric value, and their position
  # is irrelevant, but their presence in any numeric field of a 
  # column triggers the insertion of TPs between digits, in 
  # groups of 3.
  # 
  # If the parameter "altfmt" is set, the roles of "." and ","
  # are reversed, in input and output.

  if (altfmt == "") { altfmt = 0; }
  
  # Number of lines in worksheet, including blanks, "#", and "+"s:
  nlines = 0;
  
  # Number of columns, or {-1} before first "+"-line:
  ncols = -1;
  
  # Nominal width of data lines, after tag, or {-1} before "+"-line:
  nbody = -1;
  
  # Column of tag, or {-1} if not yet known:
  tagcol = -1;

  # The worksheet data:
  # {tag[r]} is the tag character of line {r}, for {r in [1..nlines]}.
  # {data[r,c]} is field {c} of input line {r}.
  # If {tag[r] == ""} then line is blank and {data[r,*]} is irrelevant.
  # If {tag[r] == "#"} then {data[r,1]} is rest of line unparsed.
  # If {tag[r] == "+"} then line {r} is a header line.
  # If {tag[r] == "|"} then line {r} is a data line.
  split("", tag);
  split("", data);

  # These are only defined for data lines:
  # {val[r,c]} is numeric value of {data[r,c]}, "*" if non-numeric.
  split("", val);
  
  # Column attributes, derived from all data and header lines:
  # {cskip[c]} is the num of blanks between column {c-1} (or tag) and column {c}.
  # {cwidth[c]} is the width of field {c} in any lines.
  # {n_num[c]} is the number of numeric or "=" fields in column {c}.
  # {n_alf[c]} is the number of non-numeric fields in column {c}, except blank, "-", or "=".
  # {rjust[c]} is TRUE iff all entries in column {c} are right-justified.
  # {prec[c]} is max frac digits of all numeric fields of column {c}, or -1 if all int.
  # {tsep[c]} is true iff any numeric field in column {c} has thousands-separators.
  split("", cwidth);
  split("", cskip);
  split("", n_num);
  split("", n_alf);
  split("", rjust);
  split("", prec);
  split("", thsep);
}

/* READING THE WORKSHEET */

(abort >= 0) { exit abort; }

// { 
  nlines++; r = nlines;
  $0 = untabify($0);
}

/^[ ]*$/ {
  tag[r] = "";
  next;
}

// { 
  # Non-blank line - define or check tag column:
  if (! match($0, /[^ ]/)) { prog_error(r, "duh?"); }
  if (tagcol == -1)
    { tagcol = RSTART; }
  else if (tagcol != RSTART)
    { data_error(r, ("tag not aligned  tagcol = " tagcol "  RSTART = " RSTART)); }
  tg = substr($0,tagcol,1);
  body = substr($0,tagcol+1);
  tag[r] = tg;
  if (tg == "#")
    { data[r,1] = body;
    }
  else if (tg == "+")
    { parse_field_def_line(r,body); }
  else if (tg == "|")
    { if (ncols == -1) { data_error(r, ("no \"+\"-line")); }
      parse_data_line(body,r);
    }
  else 
    { data_error(r, ("bad tag \"" tg "\"")); }
  next;
}

function parse_field_def_line(r,body,   len,eaten,c,fskip,fwidth)
{ 
  eaten = 0; c = 0;
  while(match(body, /[^ ]/))
    { # One more column in data:
      c++;

      # Skip blanks, remember column start:
      fskip = RSTART-1;
      body = substr(body,fskip+1);
      eaten += fskip;

      # Count "-"s
      if (! match(body, /^[-]+/))
        { data_error(r, ("invalid char \"" substr(body,1,1) "\" in \"+\"-line")); }

      # Save "-"s in {data} array:
      data[r,c] = substr(body, RLENGTH);

      # Skip "-", remember column width:
      fwidth = RLENGTH;
      body = substr(body,fwidth+1);
      eaten += fwidth;
      
      # printf "skip = %d width = %d\n", fskip, fwidth > "/dev/stderr";

      # Set/confirm column attributes:
      if (ncols == -1)
        { # Save column position in line:
          cskip[c] = fskip; 
          cwidth[c] = fwidth;
          # Initialize column attributes:
          n_num[c] = 0;
          n_alf[c] = 0;
          rjust[c] = 1;
          prec[c] = -1;
          thsep[c] = 0;
        }
      else
        { if (fskip != cskip[c]) 
            { data_error(r, ("field " c " is misaligned")); }
          if (fwidth != cwidth[c]) 
            { data_error(r, ("field " c " has inconsistent width")); }
        }
    }

  # Set/confirm number of columns and nominal body width:
  if (ncols == -1)
    { nbody = eaten; ncols = c; }
  else
    { if (eaten != nbody) 
        { data_error(r, ("inconsistent line length")); }
      if (c != ncols) 
        { data_error(r, ("inconsistent number of columns")); }
    }
}

function parse_data_line(body,r,  c,sk,fw,fx,fv,pt,ts)
{
  # Requires {cskip[1..ncols],cwidth[1..ncols]}
  # Sets {data[r,1..ncols]} to the raw field texts. 
  # Sets {val[r,1..ncols]} to their numeric values.
  # Updates {rjust[1..ncols],n_alf[1..ncols],n_num[1..ncols],prec[1..ncols],thsep[1..ncols]}.
  
  for (c = 1; c <= ncols; c++)
    { 
      # Check and skip inter-column space:
      sk = cskip[c];
      x = substr(body,1,sk);
      if (x !~ /^[ ]*$/) { data_error(r, ("non-blank item \"" x "\" between cols")); } 
      body = substr(body,sk+1);

      # Get field content and value:
      fw = cwidth[c]
      fx = substr(body,1,fw);
      data[r,c] = fx;

      fv = field_value2(fx,altsep);
      val[r,c] = fv;

      # Update {rjust[c]}:
      if ((length(x) < fw) || (substr(x,fw,1) == " ")) { rjust[c] = 0; }

      if (fv == "*")
        { # Non-numeric field:
          if ((fx !~ /^[ ]*[-=][ ]*$/) && (fx != "")) 
            { n_alf[c]++; }
        }
      else 
        { # Numeric field:
          n_num[c]++;

          # Update {prec[c]}:
          gsub(/[ ]+$/,"",x);
          p = index(fx,(altfmt ? "," : "."));
          if (p > 0) 
            { p = length(x)-p; 
              if (p > prec[c]) { prec[c] = p; }
            }

          # Update {thsep[c]}: 
          if (index(fx,(altfmt ? "." : ",")) > 0) { thsep[c] = 1; }
        }

      # Skip field:
      body = substr(body,fw+1);
    }

  # Check for non-blank residue:
  if (body !~ /^[ ]*$/) { data_error(r, ("non-blank item \"" body "\" after last col")); }
}

/* PROCESSING AND PRINTOUT */

END {
  if (abort >= 0) { exit abort; }

  # Decide which columns are numeric:
  split("", cnum);
  for (c = 1; c <= ncols; c++)
    { cnum[c] = ((n_num[c] > 0) && (n_alf[c] == 0) && rjust[c]); }
    
  process_and_print_worksheet();
}

function process_and_print_worksheet(   r,tot)
{
  # Totalize the numeric columns, reformat numeric fields in numeric
  # columns and computed totals, and print worksheet.
  
  # {tot[c]} total of entries in column {c} since last "=".
  split("", tot);
  for (c = 1; c <= ncols; c++) { tot[c] = 0; }
  
  # Flag {next_fmt} means that the field in the next column must be reformatted:
  next_fmt = 0;
  for (r = 1; r <= nlines; r++)
    { this_fmt = next_fmt; next_fmt = 0;
      printf "%*s%s", tagcol-1, "", tag[r];
      if (tag[r] == "|")
        { process_and_print_data_line(r,tot); }
      else if (tag[r] == "+")
        { print_field_def_line(r); }
      else
        { printf "%s", data[r,1]; }
      printf "\n";
    }
}

function print_field_def_line(r,tot, c,fx)
{
  for (c = 1; c <= ncols; c++)
    { # Print intercolumn space and field:
      printf "%*s%s",  cskip[c], "", data[r,c];
    }
  printf "\n";
}