#! /usr/bin/gawk -f # Last edited on 2008-06-28 17:31:47 by stolfi BEGIN { abort = -1; usage = ( "tot-worksheet\\\n" \ " [-v altfmt=BOOL] \\\n" \ " < INFILE > OUTFILE" \ ); # Simple-minded ASCII spreadsheets. # # Input lines are either blank or of three types, determined by # their first non-blank character (the /tag/): # # "#" - comment line # "|" - data line # "+" - field definition line # # All non-blank lines must have the tag on the same column. # Before any processing, ASCII tab (HT, octal '\011') is expanded by # the usual rules, and ISO-Latin-1 non-breaking space (NBSP, '\240') # is converted to ordinary blanks. # # Comment lines may contain anything; they are copied to the # output but are otherwise ignored. # # After the tag, a "+" line must have only "-" and spaces. Each run # of "-"s defines the character position and length of one column of # the table. There must be at least one "+" line before the first # data line. If there is more than one "+" line, they must all # define exactly the same columns. # # Each data line is broken into fixed-width fields according to the # format specified by the "+" lines. The characters before, between, # and after these fields must be blank. If a line is too short, it # is implicitly padded with blanks. Each field is classified as # blank, numeric or alphabetic; for each numeric field, the script # extracts, the number of decimal fraction digits, whether there are # any thousands-separators (TPs), and the numeric value (after # removing the TPs). # # After reading the data, the scripts totalizes the worksheet. The # fields are scanned line by line, top down, left to right. If the # content of a field is a single "=", the total of all entries above # it is assigned the field immediately at its right (which must be a # numeric field). If there is more than one "=" in the same column, # each total after the first one totalizes only the entries up to # the preceding "=". # # A column is considered numeric if all fields in that column are # right-justified and either numeric, blank, "-" (which is # equivalent to 0), or "="; and there is at least one entry that is # numeric or "=". The "=" may only appear in numeric columns. # # After totalizing, the worksheet is reformatted. Formatting is # applied to all numeric fields in numeric columns, # and to total fields that were set by "=" in the adjacent column. # Other fields are left alone, even if they are numeric. # Reformatting means typesetting the field's numeric value with N # decimal fraction digits, where N is the maximum number of such # digits among all the numeric entries in that column. Also, if any # of the original fields in the column had TPs, then TPs are # inserted in the field, too, to separate the integer part in groups # of three digits. The re-formatted field is then inserted back into # its slot in the worksheet, right-justified; the script aborts if # it does not fit. # # Finally, the worksheet is printed, with all the original blank, # "+", and "#" lines inserted among the new "|" lines, in the # original order. # # Note that totalization may insert in some column a field value # with more decimal fraction digits than any original entry in that # column. In that case the extra digits will be supressed in the # printout; however, they will still be considered if that field is # itself totalized. # # A field is considered numeric if it contains at least one digit # and fits the regular expression # /^[-+]?([0-9]+[,][0-9,]*[0-9]|[0-9]*)([.][0-9]*|)$/ # Note that the number may start or end with ".", but # "." alone (with or without sign) is not accepted. # Note also that numbers in scientific notation are not # recognized. The commas are considered thousands-separators; # they do not affect the numeric value, and their position # is irrelevant, but their presence in any numeric field of a # column triggers the insertion of TPs between digits, in # groups of 3. # # If the parameter "altfmt" is set, the roles of "." and "," # are reversed, in input and output. if (altfmt == "") { altfmt = 0; } # Number of lines in worksheet, including blanks, "#", and "+"s: nlines = 0; # Number of columns, or {-1} before first "+"-line: ncols = -1; # Nominal width of data lines, after tag, or {-1} before "+"-line: nbody = -1; # Column of tag, or {-1} if not yet known: tagcol = -1; # The worksheet data: # {tag[r]} is the tag character of line {r}, for {r in [1..nlines]}. # {data[r,c]} is field {c} of input line {r}. # If {tag[r] == ""} then line is blank and {data[r,*]} is irrelevant. # If {tag[r] == "#"} then {data[r,1]} is rest of line unparsed. # If {tag[r] == "+"} then line {r} is a header line. # If {tag[r] == "|"} then line {r} is a data line. split("", tag); split("", data); # These are only defined for data lines: # {val[r,c]} is numeric value of {data[r,c]}, "*" if non-numeric. split("", val); # Column attributes, derived from all data and header lines: # {cskip[c]} is the num of blanks between column {c-1} (or tag) and column {c}. # {cwidth[c]} is the width of field {c} in any lines. # {n_num[c]} is the number of numeric or "=" fields in column {c}. # {n_alf[c]} is the number of non-numeric fields in column {c}, except blank, "-", or "=". # {rjust[c]} is TRUE iff all entries in column {c} are right-justified. # {prec[c]} is max frac digits of all numeric fields of column {c}, or -1 if all int. # {tsep[c]} is true iff any numeric field in column {c} has thousands-separators. split("", cwidth); split("", cskip); split("", n_num); split("", n_alf); split("", rjust); split("", prec); split("", thsep); } /* READING THE WORKSHEET */ (abort >= 0) { exit abort; } // { nlines++; r = nlines; $0 = untabify($0); } /^[ ]*$/ { tag[r] = ""; next; } // { # Non-blank line - define or check tag column: if (! match($0, /[^ ]/)) { prog_error(r, "duh?"); } if (tagcol == -1) { tagcol = RSTART; } else if (tagcol != RSTART) { data_error(r, ("tag not aligned tagcol = " tagcol " RSTART = " RSTART)); } tg = substr($0,tagcol,1); body = substr($0,tagcol+1); tag[r] = tg; if (tg == "#") { data[r,1] = body; } else if (tg == "+") { parse_field_def_line(r,body); } else if (tg == "|") { if (ncols == -1) { data_error(r, ("no \"+\"-line")); } parse_data_line(body,r); } else { data_error(r, ("bad tag \"" tg "\"")); } next; } function parse_field_def_line(r,body, len,eaten,c,fskip,fwidth) { eaten = 0; c = 0; while(match(body, /[^ ]/)) { # One more column in data: c++; # Skip blanks, remember column start: fskip = RSTART-1; body = substr(body,fskip+1); eaten += fskip; # Count "-"s if (! match(body, /^[-]+/)) { data_error(r, ("invalid char \"" substr(body,1,1) "\" in \"+\"-line")); } # Save "-"s in {data} array: data[r,c] = substr(body, RLENGTH); # Skip "-", remember column width: fwidth = RLENGTH; body = substr(body,fwidth+1); eaten += fwidth; # printf "skip = %d width = %d\n", fskip, fwidth > "/dev/stderr"; # Set/confirm column attributes: if (ncols == -1) { # Save column position in line: cskip[c] = fskip; cwidth[c] = fwidth; # Initialize column attributes: n_num[c] = 0; n_alf[c] = 0; rjust[c] = 1; prec[c] = -1; thsep[c] = 0; } else { if (fskip != cskip[c]) { data_error(r, ("field " c " is misaligned")); } if (fwidth != cwidth[c]) { data_error(r, ("field " c " has inconsistent width")); } } } # Set/confirm number of columns and nominal body width: if (ncols == -1) { nbody = eaten; ncols = c; } else { if (eaten != nbody) { data_error(r, ("inconsistent line length")); } if (c != ncols) { data_error(r, ("inconsistent number of columns")); } } } function parse_data_line(body,r, c,sk,fw,fx,fv,pt,ts) { # Requires {cskip[1..ncols],cwidth[1..ncols]} # Sets {data[r,1..ncols]} to the raw field texts. # Sets {val[r,1..ncols]} to their numeric values. # Updates {rjust[1..ncols],n_alf[1..ncols],n_num[1..ncols],prec[1..ncols],thsep[1..ncols]}. for (c = 1; c <= ncols; c++) { # Check and skip inter-column space: sk = cskip[c]; x = substr(body,1,sk); if (x !~ /^[ ]*$/) { data_error(r, ("non-blank item \"" x "\" between cols")); } body = substr(body,sk+1); # Get field content and value: fw = cwidth[c] fx = substr(body,1,fw); data[r,c] = fx; fv = field_value2(fx,altsep); val[r,c] = fv; # Update {rjust[c]}: if ((length(x) < fw) || (substr(x,fw,1) == " ")) { rjust[c] = 0; } if (fv == "*") { # Non-numeric field: if ((fx !~ /^[ ]*[-=][ ]*$/) && (fx != "")) { n_alf[c]++; } } else { # Numeric field: n_num[c]++; # Update {prec[c]}: gsub(/[ ]+$/,"",x); p = index(fx,(altfmt ? "," : ".")); if (p > 0) { p = length(x)-p; if (p > prec[c]) { prec[c] = p; } } # Update {thsep[c]}: if (index(fx,(altfmt ? "." : ",")) > 0) { thsep[c] = 1; } } # Skip field: body = substr(body,fw+1); } # Check for non-blank residue: if (body !~ /^[ ]*$/) { data_error(r, ("non-blank item \"" body "\" after last col")); } } /* PROCESSING AND PRINTOUT */ END { if (abort >= 0) { exit abort; } # Decide which columns are numeric: split("", cnum); for (c = 1; c <= ncols; c++) { cnum[c] = ((n_num[c] > 0) && (n_alf[c] == 0) && rjust[c]); } process_and_print_worksheet(); } function process_and_print_worksheet( r,tot) { # Totalize the numeric columns, reformat numeric fields in numeric # columns and computed totals, and print worksheet. # {tot[c]} total of entries in column {c} since last "=". split("", tot); for (c = 1; c <= ncols; c++) { tot[c] = 0; } # Flag {next_fmt} means that the field in the next column must be reformatted: next_fmt = 0; for (r = 1; r <= nlines; r++) { this_fmt = next_fmt; next_fmt = 0; printf "%*s%s", tagcol-1, "", tag[r]; if (tag[r] == "|") { process_and_print_data_line(r,tot); } else if (tag[r] == "+") { print_field_def_line(r); } else { printf "%s", data[r,1]; } printf "\n"; } } function print_field_def_line(r,tot, c,fx) { for (c = 1; c <= ncols; c++) { # Print intercolumn space and field: printf "%*s%s", cskip[c], "", data[r,c]; } printf "\n"; }