#! /usr/bin/gawk -f # Last edited on 2008-06-28 14:25:20 by stolfi BEGIN { abort = -1; usage = ( "tot-worksheet\\\n" \ " < INFILE > OUTFILE" \ ); # {indent[i]} is the indentation of line {i} (a string of blanks). # {tag[r]} is column 1 of line {r}, for {r} in {1..nlines}. # {data[r,i]} is field {i} of input line {r}. # {val[r,i]} is numeric value of {data[r,i]}, "*" if non-numeric. # If {tag[r] == ""} then line is blank and {data[r,*]} is irrelevant. # If {tag[r] == "#" or " "} then {data[r,1]} is rest of line unparsed. # If {tag[r] == "+"} then line {r} is a header line. # If {tag[r] == "|"} then line {r} is a data line. split("", indent); split("", tag); split("", data); split("", val); nlines = 0; # These are derived from all data and header lines: # {mrg_L[i]} is the leftmost charpos of field {i} in any line. # {mrg_R[i]} is the rightmost charpos of field {i} in any line. # {width[i]} is the maximum width of field {i} in any lines. # {align_L[i]} is 1 if field {i} is always left-aligned. # {align_R[i]} is 1 if field {i} is always right-aligned. # Character positions are counted after stripping the indentation. # There is no check for consistent indentation. split("", mrg_L); split("", mrg_R); split("", width); split("", align_L); split("", align_R); # These are derived only from data lines: # {prec[i]} is max fraction digits seen in field {i} of all data lines. split("", prec); # These are derived only from non-TOTAL data lines: # {has_numbers[i]} is 1 if field {i} of some line could be numeric. # {has_alphas[i]} is 1 if field {i} of some line is definitely non-numeric. # {tot[i,lev]} is the current level {lev} (sub)total of field {i} of lines. split("", has_numbers); split("", has_alphas); split("", tot); # Number of data columns seen: ncols = -1; # Number of (sub)total levels seen; {lev} ranges from 1 to {nlevels}: nlevels = 1; } // { nlines++; r = nlines; $0 = untabify($0); } // { nlines++; r = nlines; # Get the indentation string {indent[r]}: if (match(/^[ ]*[+|\#]/, $0)) { indent[r] = substr($0,1,RLENGTH-1); $0 = substr($0,RLENGTH); } else if (match(/^[ ]+/, $0)) { indent[r] = substr($0,1,RLENGTH-1); $0 = substr($0,RLENGTH); } else { indent[r] = ""; } } /^[ ]*$/ { tag[r] = ""; next; } /^[ #]/ { tag[r] = substr($0,1,1); data[r,1] = substr($0,2); next; } /^[+|][ ]/ { tag[r] = substr($0, 1,1); if (ncols == -1) { ncols = NF-1; init_cols(); } if (ncols != NF-1) { data_error(("bad num cols = " NF-1)); } get_fields(); if (tag[r] == "|") { get_numeric_values(); lev = line_total_level(); if(lev > 0) { set_totals(lev); } else { acc_totals(); } } next; } // { data_error(("bad tag = \"" substr($0,1,1) "\"")); } function init_cols( i,j) { for (i = 1; i <= ncols; i++) { width[i] = 0; for(j = 1; j <= nlevels; j++) { tot[i,j] = 0; } has_numbers[i] = 0; has_alphas[i] = 0; mrg_L[i] = 9999; mrg_R[i] = 0; align_L[i] = 1; align_R[i] = 1; } } function get_fields( i,lin,xv,ich) { # sets {data[r,i], mrg_L[i], mrg_R[i], width[i]} for fields of line {r} lin = substr($0,2); ich = 2; for (i = 1; i <= ncols; i++) { match(lin, /^[ ]*/); ich += RLENGTH; lin = substr(lin, RLENGTH+1); # Field {i} of this line starts on character column {ich} if (mrg_L[i] > ich) { mrg_L[i] = ich; } else if ((mrg_L[i] != ich) && (mrg_L[i] != 9999)) { align_L[i] = 0; } match(lin, /^[^ ]*/); xv = substr(lin, 1, RLENGTH); ich += RLENGTH; lin = substr(lin, RLENGTH+1); # Field {i} on this line ends on character column {ich-1} if (mrg_R[i] < ich-1) { mrg_R[i] = ich-1; } else if ((mrg_R[i] != ich-1) && (mrg_R[i] != 0) && (xv !~ /^[,.]$/)) { align_R[i] = 0; } data[r,i] = xv; val[r,i] = "*"; w = length(xv); if (w > width[i]) { width[i] = w; } # printf "xv = (%s) width[i] = %d mrg = %d %d align = %d %d\n", \ # xv, width[i], mrg_L[i], mrg_R[i], align_L[i], align_R[i] > "/dev/stderr"; } if (! match(lin, /^[ ]*$/)) { data_error(("unexpected leftovers = \"" lin "\"")); } } function get_numeric_values( i,xv,pr) { # sets {val[r,i], prec[i]} for fields of line {r} for (i = 1; i <= ncols; i++) { xv = data[r,i]; if (looks_numeric(xv)) { xv = english_format(xv); if (match(xv, /[.][0-9]*$/)) { pr = RLENGTH - 1; if (pr > prec[i]) { prec[i] = pr; } } val[r,i] = xv + 0; } } } function line_total_level( i,k,lev) { lev = 0; for(i = 1; i <= ncols; i++) { k = field_total_level(data[r,i]); if (k > 0) { if((lev > 0) && (lev != k)) { data_error("bad (SUB)TOTAL line"); } lev = k; } } return lev; } function field_total_level(x, k) { # 1 for "TOTAL", 2 for "SUBTOTAL", 3 for "SUBSUBTOTAL", etc. k = 1; while (x ~ /^SUB/) { k++; x = substr(x,4); } return (x == "TOTAL" ? k : 0); } function acc_totals( i,j,v) { # Accumulates numeric fields of line {r} into the respective totals. # Also sets {has_numbers[i]} for columns which look numeric, # and {has_alphas[i]} for cols that are non-numeric. for(i = 1; i <= ncols; i++) { v = val[r,i]; if (v != "*") { for (j = 1; j <= nlevels; j++) { tot[i,j] += v; } has_numbers[i] = 1; } else { has_alphas[i] = 1; } } } function set_totals(lev, i,j,v,xv) { # Replaces numeric fields of line {r} by the {lev} totals and # resets totals of that and higher levels. # Also sets {has_numbers[i]} for those cols. # However does NOT set {has_alphas[i]} for other cols. # Replicate totals, if needed, until we have enough levels: while (nlevels < lev) { for(i = 1; i <= ncols; i++) { tot[i,nlevels+1] = tot[i,nlevels]; } nlevels++; } # Now set and clear totals: for(i = 1; i <= ncols; i++) { v = val[r,i]; if (v != "*") { val[r,i] = tot[i,lev]; for (j = lev; j <= nlevels; j++) { tot[i,j] = 0; } has_numbers[i] = 1; } } } END { # Decide whether columns are left- or right-aligned: for (i = 1; i <= ncols; i++) { if (align_R[i] + align_L[i] != 1) { # Original algnment ambiguous; decide by field type if (has_alphas[i]) { align_L[i] = 1; align_R[i] = 0; } else { align_R[i] = 1; align_L[i] = 0; } } } # Adjust field margins to ensure minimum spacing: ich = 3; for (i = 1; i <= ncols; i++) { if (align_R[i]) { mrg_L[i] = mrg_R[i] - width[i] + 1; } else { mrg_R[i] = mrg_L[i] + width[i] - 1; } if (mrg_L[i] < ich) { mrg_L[i] = ich; mrg_R[i] = ich + width[i] - 1; } ich = mrg_R[i] + 2; } # Print all lines for (r = 1; r <= nlines; r++) { print_line(r); } } function print_line(r, i,ich,v,xv) { if (tag[r] == "") { printf "%s%s", indent[r], tag[r]; printf "\n"; } else if ((tag[r] == " ") || (tag[r] == "#")) { printf "%s%s", indent[r], tag[r]; printf "%s", data[r,1]; printf "\n"; } else { print_data_line(r); } } function print_data_line(r, i,ich,v,xv,) { # Requires {indent[r],tag[r]}. # Requires {data[r,1..ncols],val[r,1..ncols]}. # Requires {mrg_L[1..ncols],prec[1..ncols],width[1..ncols],align_R[1..ncols]}. printf "%s%s", indent[r], tag[r]; ich = 2; for (i = 1; i <= ncols; i++) { if (mrg_L[i] > ich) { printf "%*s", mrg_L[i]-ich, ""; } ich = mrg_L[i]; v = val[r,i]; if (v == "*") { xv = data[r,i]; } else { xv = format_number(v,prec[i],0,0); } if (length(xv) > width[i]) { data_error(("field overflow - width = " w " val = \"" xv "\"")); } if (align_R[i]) { printf "%*s", width[i], xv; } else { printf "%*s", -width[i], xv; } ich += width[i]; } printf "\n"; }