#! /usr/bin/python -t
# _*_ coding: iso-8859-1 _*_
# Last edited on 2023-01-19 00:19:32 by stolfi

MODULE_NAME = "txtable"
MODULE_DESC = "Tools for handling plain-text tables"
MODULE_VERS = "1.0"

# !!! TO DO : accept {infoColSep,headerColSep,ruleColSep,commentSep} as parameters.
# !!! TO DO : extend empty rule fields to blanks, non-empty ones to dashes.
# !!! TO DO : write the {MODULE_INFO} string.

# Non-ASCII characters in string literals are written as escapes (\xa9 is the
# copyright sign, \xab and \xbb are the angle quotation marks) so that the
# source text does not depend on the encoding the file is saved in.
MODULE_COPYRIGHT = "Copyright \xa9 2008 State University of Campinas"

MODULE_INFO = "!!! MODULE_INFO to be written"

import sys
import re
import string  # No longer used here; kept because clients may rely on it.
from decimal import *  # Star import kept so the module namespace is unchanged.

# MODULE FUNCTIONS

def parse_row(lin):
    """Split the raw text line {lin} into its main parts.

    TABs, form feeds, non-breaking spaces and carriage returns are first
    cleaned up in a way that preserves the visual appearance of the line,
    and trailing blanks are removed.  Returns the tuple
    {(tag, ind, fld, cmt, err)} where

      {tag} is the line tag (one of ' ', '+', '!', '|');
      {ind} is the line's indentation (an integer);
      {fld} is the list of the body's fields (blanks are NOT stripped);
      {cmt} is the trailing comment, including the blanks that precede it;
      {err} is an error message (a string), or {None} if all is well.

    A line with an empty body is a blank line: its indentation is reported
    as 0 and any leading blanks are counted as part of the comment.
    """
    # Being optimistic:
    err = None

    # Remove tabs and other crud from the line, preserving its visual appearance:
    lin = re.sub(r"[\240\014]", " ", lin)
    lin = re.sub(r"[\015]", "", lin)
    lin = lin.expandtabs()

    # Remove trailing spaces:
    lin = re.sub(r"[ ]+$", "", lin)

    # Split out the comment {cmt}, together with the blanks that precede it:
    m = re.search(r"^(.*?)([ ]*([#].*|))$", lin)
    if m is None:
        prog_error("duh?")
    lin = m.group(1)
    cmt = m.group(2)

    if lin == "":
        # Blank line:
        return " ", 0, [], cmt, err

    # Non-blank line.  Determine the indentation {ind}:
    m = re.search(r"^([ ]*)(.*)$", lin)
    if m is None:
        prog_error("deh?")
    ind = m.end(1)
    lin = m.group(2)
    if lin == "":
        prog_error("boh?")

    if re.match(r"^[-+ ]*$", lin) is not None:
        # Assume that it is a rule line:
        tag = "+"
    elif "!" in lin:
        # Assume that it is a header line:
        if "|" in lin:
            err = "ambiguous header/info row"
        tag = "!"
    else:
        # Assume that it is an info line:
        tag = "|"

    # Break the body into fields at each occurrence of the {tag} character:
    fld = lin.split(tag)
    if len(fld) < 1:
        prog_error("beh?")
    return tag, ind, fld, cmt, err

# ----------------------------------------------------------------------

def field_tot_rank(xv):
    """Return the totalling-rank of the string {xv}, or {None}.

    If {xv} (ignoring leading blanks and trailing blanks) is 'total',
    'subtotal', 'subsubtotal', etc., in lower case, upper case, or
    capitalized, with an optional '-' or '_' after each 'sub', returns the
    number of 'sub' prefixes.  Otherwise returns {None}.
    """
    # Strip any leading blanks:
    xv = re.sub(r"^[ ]+", "", xv)

    # Count "sub" prefixes into {k}:
    k = 0
    while True:
        yv = re.sub(r"^(SUB|Sub|sub)[-_]?", "", xv, 1)
        if yv == xv:
            break
        k += 1
        xv = yv

    if re.match(r"(TOTAL|Total|total)[ ]*$", xv):
        return k
    return None

# ----------------------------------------------------------------------

def numeric_value(xv, frSep, thSep, altZero):
    """Return the value of the field {xv} as a {Decimal}, or {None}.

    {frSep} is the fraction separator and {thSep} the thousands separator
    (single characters, or {None} if not allowed); {altZero} is the
    alternate spelling of zero (or {None}).  Returns {None} when {xv} is
    not a well-formed number.  Assumes that {xv} is non-blank.
    """
    # Strip leading and trailing blanks:
    xv = re.sub(r"^[ ]+", "", xv)
    xv = re.sub(r"[ ]+$", "", xv)

    if (altZero != "" and xv == altZero) or xv == frSep:
        # Field is a variant of zero:
        return Decimal(0)
    if not re.search(r"[0-9]", xv):
        # Field contains no digits:
        return None

    # May be a fractional number, possibly with {thSep,frSep}.
    xv = strip_thsep(xv, thSep)
    if frSep is None:
        # There must be no fraction point:
        if "." in xv:
            return None
    elif frSep != ".":
        # A literal "." cannot be part of the number when it is not {frSep}:
        if "." in xv:
            return None
        # Change the {frSep}, if present, to ".":
        xv = xv.replace(frSep, ".")

    # Now parse as a US-style number.  Accepted shapes are: sign, nonempty
    # integer part, optional "." and fraction; or sign, ".", nonempty fraction.
    if not (re.match(r"^[-+]?[0-9]+([.][0-9]*|)$", xv)
            or re.match(r"^[-+]?[.][0-9]+$", xv)):
        return None
    v = Decimal(xv)

    # Normalize minus-zero (and zeros with exponents) to ordinary zero:
    if v == Decimal(0):
        v = Decimal(0)
    return v

# ----------------------------------------------------------------------

def format_value(v, pr, th, ps, frSep, thSep, altZero):
    """Typeset the decimal value {v} with the given format parameters.

      {pr}      number of digits after fraction point; {None} or negative for integer.
      {th}      if true, inserts thousands-separators every three digits from point.
      {ps}      if true, inserts an explicit '+' before positive values.
      {frSep}   character to use as fraction separator, or {None}.
      {thSep}   character to use as thousands-separator, or {None}.
      {altZero} special representation for value 0, or {None}.

    Returns the formatted string, without any column padding.
    """
    if v == Decimal(0):
        xv = format_zero_value(v, pr, frSep, altZero)
    else:
        xv = format_nonzero_value(v, pr, ps, frSep)
    # Insert the thousands separator if requested:
    if th:
        xv = insert_thsep(xv, pr, thSep)
    return xv

# ----------------------------------------------------------------------

def format_zero_value(v, pr, frSep, altZero):
    """Format a zero value.

    Namely, (1) if {altZero} is not {None}, uses {altZero}; else (2) if
    {pr} is a non-negative integer, uses the fraction separator {frSep};
    else (3) uses a single '0'.  In cases (1) and (2), if {pr} is positive,
    appends {pr} blanks to the result to maintain numeric alignment.
    """
    if altZero is not None:
        # Use {altZero}:
        xv = altZero
    elif pr is None or pr < 0:
        # Integer column, use just "0":
        return "0"
    else:
        # Fractional column, use {frSep} with {pr} blanks:
        if frSep is None or len(frSep) != 1:
            prog_error("wee!")
        xv = frSep
    if pr is not None and pr > 0:
        xv = xv + " " * pr
    return xv

# ----------------------------------------------------------------------

def format_nonzero_value(v, pr, ps, frSep):
    """Format a nonzero value.

    If {pr} is {None} or negative, rounds {v} to an integer and formats it
    as such.  Otherwise rounds it to {pr} fraction digits and formats it
    as a decimal fraction with {frSep} as the fraction separator (a bare
    trailing separator when {pr} is 0).  Rounding is half-up.  In any case,
    if the rounded value is positive and {ps} is true, inserts an explicit
    '+' sign.
    """
    if pr is None or pr < 0:
        # Format the number in "d" format.
        v = v.to_integral(rounding=ROUND_HALF_UP)
        xv = "%s" % v
    else:
        if frSep is None or len(frSep) != 1:
            prog_error("woo!")
        # Format the number in "f" format with {pr} decimals.  Only the
        # exponent of the {quantize} argument matters:
        v = v.quantize(Decimal((0, (1,), -pr)), rounding=ROUND_HALF_UP)
        xv = "%s" % v
        if pr == 0:
            xv = xv + "."
    if v > 0 and ps:
        xv = "+" + xv

    # Sanity check of the string round trip.  NOTE(review): {v} here is the
    # value AFTER rounding, so this does not detect loss of digits relative
    # to the caller's value; confirm whether that was the intent.
    if Decimal(xv) != v:
        prog_error("unexpected rounding of \xab%s\xbb to \xab%s\xbb" % (v, xv))

    # Replace the "." by {frSep} if given:
    if pr is not None and pr >= 0 and frSep is not None and frSep != ".":
        xv = xv.replace(".", frSep, 1)
    return xv

# ----------------------------------------------------------------------

def insert_thsep(xv, pr, ch):
    """Insert the thousands-separator {ch} in the formatted number {xv}.

    Assumes that {xv} is a number with precision {pr}, with some
    single-character fraction separator and no thousands-separators.
    Inserts the single character {ch} (which must be a non-digit) between
    the integer digits and between the fraction digits of {xv}, at every
    three digits counting from the fraction separator.  If {pr} is {None}
    or negative, {xv} is treated as an integer.
    """
    if ch is None or len(ch) != 1:
        prog_error("ulp!")
    if pr is None or pr < 0:
        return insert_thsep_int(xv, ch)
    # Split into integer part, separator character, and {pr} fraction chars:
    n = len(xv)
    ip = xv[0:n - pr - 1]
    pt = xv[n - pr - 1:n - pr]
    fp = xv[n - pr:]
    return insert_thsep_int(ip, ch) + pt + insert_thsep_frac(fp, ch)

# ----------------------------------------------------------------------

def insert_thsep_int(ip, ch):
    """Insert the thousands-separator {ch} in the integer part {ip} of a number.

    Requires {ch} to be non-digit and non-empty.
    """
    while True:
        # Find the rightmost group of three digits that still has digits
        # directly on its left:
        m = re.match(r"^([-+]?[0-9]+)([0-9][0-9][0-9])(|[^0-9].*)$", ip)
        if not m:
            break
        ip = m.group(1) + ch + m.group(2) + m.group(3)
    return ip

# ----------------------------------------------------------------------

def insert_thsep_frac(fp, ch):
    """Insert the thousands-separator {ch} in the fraction part {fp} of a number.

    Requires {ch} to be non-digit and non-empty.  For example, '1234567'
    with {ch} = ',' becomes '123,456,7'.
    """
    while True:
        # Find a trailing run of four or more digits that begins at the start
        # of {fp} or just after a non-digit.  A search (not an anchored match)
        # is needed so that groups after the first one are found too.
        m = re.search(r"(^|[^0-9])([0-9][0-9][0-9])([0-9]+)$", fp)
        if not m:
            break
        k = m.end(2)
        fp = fp[:k] + ch + fp[k:]
    return fp

# ----------------------------------------------------------------------

def strip_thsep(xv, thSep):
    """Remove the thousands-separators {thSep} from {xv}.

    Only occurrences that lie between two digits are removed.  If {thSep}
    is {None}, returns {xv} unchanged; otherwise {thSep} must be a single
    character.
    """
    if thSep is None:
        return xv
    if len(thSep) != 1:
        prog_error("eek!")
    # Escape the separator so that characters such as '^', ']' or a
    # backslash cannot corrupt the pattern:
    pat = r"^(.*[0-9])" + re.escape(thSep) + r"([0-9].*)$"
    while True:
        m = re.search(pat, xv)
        if not m:
            break
        xv = m.group(1) + m.group(2)
    return xv

# ----------------------------------------------------------------------

def get_precision(xv, frSep):
    """Return the precision (number of fraction digits) of the number {xv}.

    Assumes that any thousands-separators have been removed, and that
    {frSep} is {None} or a single character that delimits the fraction
    part.  Counts the characters after the first {frSep}, ignoring trailing
    blanks.  If {xv} has no {frSep}, or {frSep} is {None}, returns -1.
    """
    if frSep is None:
        return -1
    if len(frSep) != 1:
        prog_error("ook!")
    # Strip any trailing blanks:
    xv = re.sub(r"[ ]+$", "", xv)
    # Locate {frSep} and count chars after it:
    k = xv.find(frSep)
    if k < 0:
        return -1
    return len(xv) - 1 - k

# ----------------------------------------------------------------------

def error(msg):
    "Prints the error message {msg} to {stderr}, prefixed by the module name, and exits with status 1."
    sys.stderr.write("%s: %s\n" % (MODULE_NAME, msg))
    sys.exit(1)

# ----------------------------------------------------------------------
# THE TABLE CLASS

class TxTable:
    "A table of numeric and alphabetic items."

    def __init__(tbl, frSep, thSep, altZero):
        """Create an empty table.

        {frSep} is the fraction separator and {thSep} the thousands
        separator (each {None} or a string), and {altZero} the preferred
        representation for zero (or {None}).  Calls {error} (which exits)
        if a separator starts with a reserved character, if the two
        separators are equal, or if {altZero} contains a reserved character.
        """
        tbl.debug = 0

        if frSep is not None:
            if re.match(r"[-+|!0-9 ]", frSep):
                error("invalid fraction separator")
        if thSep is not None:
            if re.match(r"[-+|!0-9]", thSep):
                error("invalid thousands separator")
            if thSep == frSep:
                error("fraction and thousands separators must be distinct")
        if altZero is not None:
            # Reserved characters are forbidden anywhere in {altZero}, not
            # just in its first position:
            if re.search(r"[+|!1-9]", altZero):
                error("invalid alternate zero format")

        tbl.frSep = frSep      # Fraction separator (one char, or {None} if no fracs allowed).
        tbl.thSep = thSep      # Thousands separator (one char, or {None} if not allowed).
        tbl.altZero = altZero  # Preferred representation for zero, or {None} if none.

        tbl.ind = -1    # Indentation of table, or -1 if not known.
        tbl.ncols = -1  # Number of columns in table, or -1 if not known.
        tbl.nrows = 0   # Number of rows in table.

        # These items are indexed by {[i]} where {i} is a row in {0..nrows-1}:
        tbl.tag = []  # Tag of row {i} (a single char, " ", "!", "+", or "|").
        tbl.cmt = []  # Comment of row {i}.
        tbl.trk = []  # Total-rank of row {i}, or {None} if it is not a total row.

        # These items are indexed by {[j]} where {j} is a column in {0..ncols-1}:
        tbl.nump = []  # Tells whether column is numeric (0 or 1, or {None} if unknown).
        tbl.prec = []  # Number of digits after fraction sep or -1 if all ints (int, or {None}).
        tbl.thfg = []  # Tells whether to use thousands-separators in this column (0 or 1, or {None}).
        tbl.psfg = []  # Tells whether nonzero values should have explicit sign (0 or 1, or {None}).

        # These items are indexed by {[i][j]} where {i} is row and {j} is column.
        # For a blank row, {fld[i]} is the empty list and {val[i]} is {None}.
        tbl.fld = []  # Table entry in row {i} and column {j} (printable string).
        tbl.val = []  # Table values (decimal numbers or {None}).

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def add_row(tbl, tag, ind, fld, cmt):
        """Append a new row to the table.

        The row's components {tag,ind,fld,cmt} are as returned by
        {parse_row}.  If the row is not blank, sets or checks {tbl.ncols}
        from {len(fld)} and lowers {tbl.ind} to {ind} if needed.  For info
        rows, strips the blanks around every field (in place, in the list
        {fld} itself), sets the numeric values {tbl.val[i][j]} using the
        table's {frSep,thSep,altZero}, and sets {tbl.trk[i]} if some field
        is a (sub)total marker.  The values are {None} for header and rule
        rows and for fields that are not valid numbers.  Calls {error} if
        the number of fields differs from that of the previous rows.
        """
        # Get the index of this new row:
        i = tbl.nrows

        # Extend row lists and save row's attributes:
        tbl.tag[i:i] = [tag]
        tbl.cmt[i:i] = [cmt]
        tbl.fld[i:i] = [fld]
        tbl.val[i:i] = [None]
        tbl.trk[i:i] = [None]

        # Dispatch on {tag}:
        if tag == " ":
            # Blank row, must have zero fields and zero indent:
            if len(fld) != 0:
                prog_error("xii!")
            if ind != 0:
                prog_error("xoo!")
        elif tag == "+" or tag == "|" or tag == "!":
            # Non-blank row, must have at least one field:
            if len(fld) <= 0:
                prog_error("xee!")

            # Check/set {tbl.ncols}, update {tbl.ind}:
            if tbl.ncols == -1:
                # First non-blank row:
                if tbl.ind >= 0:
                    prog_error("eta!")
                tbl.ncols = len(fld)
                tbl.ind = ind
                # Allocate the column format attribute lists:
                tbl.nump = [None] * tbl.ncols
                tbl.prec = [None] * tbl.ncols
                tbl.thfg = [None] * tbl.ncols
                tbl.psfg = [None] * tbl.ncols
            else:
                if tbl.ind < 0:
                    prog_error("eca!")
                if ind < tbl.ind:
                    tbl.ind = ind
                if tbl.ncols != len(fld):
                    error("row %d: inconsistent number of columns = %d %d\n %s" % (i+1, tbl.ncols, len(fld), fld[0]))

            # Obtain the numeric values of fields, and set {tbl.trk[i]}:
            tbl.val[i] = [None] * tbl.ncols
            if tag == "|":
                for j in range(tbl.ncols):
                    # Strip leading and trailing blanks from field:
                    xvij = tbl.fld[i][j]
                    xvij = re.sub(r"[ ]+$", "", xvij)
                    xvij = re.sub(r"^[ ]+", "", xvij)
                    tbl.fld[i][j] = xvij
                    tbl.val[i][j] = numeric_value(xvij, tbl.frSep, tbl.thSep, tbl.altZero)
                    trk = field_tot_rank(xvij)
                    if trk is not None:
                        tbl.trk[i] = trk
        else:
            # Unexpected tag:
            prog_error("noo! tag = \xab%s\xbb" % tag)

        # One more row:
        tbl.nrows += 1

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def printout(tbl):
        """Print the table in its current state to {sys.stdout}.

        The data for each row {i} is taken from
        {tbl.tag[i],tbl.fld[i],tbl.cmt[i]}.  Every field is extended to the
        width of its column as given by {tbl.column_widths()}: rule fields
        become strings of '-', fields in columns marked numeric in
        {tbl.nump} are padded at left, and other fields are padded at
        right.  Non-blank rows are indented by {tbl.ind}.
        """
        # Determine the column widths:
        colw = tbl.column_widths()

        # Print row by row:
        for i in range(tbl.nrows):
            if tbl.tag[i] != " ":
                # Non-blank row.  Print the table's indentation:
                if tbl.ind > 0:
                    sys.stdout.write(" " * tbl.ind)
                # Print the table fields:
                for j in range(tbl.ncols):
                    # Print the separator between columns:
                    if j > 0:
                        sys.stdout.write("%s" % tbl.tag[i])
                    # Print the field:
                    xvij = tbl.fld[i][j]
                    wdij = colw[j]
                    if tbl.tag[i] == "+":
                        # Replace field by a string of '-'s:
                        sys.stdout.write("-" * wdij)
                    elif (j < len(tbl.nump)) and tbl.nump[j]:
                        # Pad at left, even the header:
                        sys.stdout.write("%*s" % (wdij, xvij))
                    else:
                        # Pad at right:
                        sys.stdout.write("%*s" % (-wdij, xvij))
            # Print the row's comment (blank rows have one too):
            sys.stdout.write(tbl.cmt[i])
            sys.stdout.write("\n")

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def column_widths(tbl):
        """Return the list of column widths based on the current fields.

        The width of a column is the maximum length of {tbl.fld[i][j]} over
        all non-blank rows {i}.  Returns the empty list if the number of
        columns is not yet known.
        """
        if tbl.ncols <= 0:
            return []
        colw = [0] * tbl.ncols
        for i in range(tbl.nrows):
            if tbl.tag[i] != " ":
                # Non-blank row.
                for j in range(tbl.ncols):
                    n = len(tbl.fld[i][j])
                    if n > colw[j]:
                        colw[j] = n
        return colw

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def choose_column_formats(tbl):
        """Determine the formatting attributes of each column.

        The formatting attributes of column {j} are
        {tbl.nump[j],tbl.prec[j],tbl.thfg[j],tbl.psfg[j]}, where the last
        three are set only if {tbl.nump[j]} is true.  Uses {tbl.val[i][j]}
        only for info rows {i}.
        """
        for j in range(tbl.ncols):
            tbl.nump[j] = tbl.are_column_entries_numeric(j)
            if tbl.nump[j]:
                # Determine {tbl.prec[j],tbl.thfg[j],tbl.psfg[j]}:
                tbl.prec[j], tbl.thfg[j], tbl.psfg[j] = tbl.get_column_format_params(j)
            if tbl.debug:
                # Debugging printouts:
                sys.stderr.write("column %2d" % j)
                sys.stderr.write(" nump = %d" % tbl.nump[j])
                if tbl.nump[j]:
                    sys.stderr.write(" prec = %2d" % tbl.prec[j])
                    sys.stderr.write(" thfg = %d" % tbl.thfg[j])
                    sys.stderr.write(" psfg = %d" % tbl.psfg[j])
                sys.stderr.write("\n")

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def are_column_entries_numeric(tbl, j):
        """Tell whether column {j} is numeric, from the looks of its info fields.

        Namely, returns true iff there is at least one non-empty info field
        with a valid numeric value, and no non-empty info field without one.
        """
        # Count numeric and non-numeric info fields in column:
        nnum = 0
        nalf = 0
        for i in range(tbl.nrows):
            if tbl.tag[i] == "|" and not re.match(r"^[ ]*$", tbl.fld[i][j]):
                # Non-empty info field; tally it as numeric or alphabetic:
                if tbl.val[i][j] is not None:
                    nnum += 1
                else:
                    nalf += 1
        # Decide whether column is numeric or alphabetic:
        if tbl.debug:
            sys.stderr.write("#! column %d nnum = %d nalf = %d\n" % (j, nnum, nalf))
        return nnum > 0 and nalf == 0

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def get_column_format_params(tbl, j):
        """Determine the formatting parameters of column {j}.

        Returns {(prec, thfg, psfg)} determined from the textual forms
        {tbl.fld[i][j]} of the info fields in column {j} and their numeric
        values {tbl.val[i][j]}: the largest precision of any field (-1 if
        none has a fraction separator), whether any field has
        thousands-separators, and whether any nonzero field has a leading
        '+'.  Fields equal to {tbl.altZero} are ignored.
        """
        prec = -1
        thfg = 0
        psfg = 0
        for i in range(tbl.nrows):
            if tbl.tag[i] == "|":
                # Info line:
                xvij = tbl.fld[i][j]
                vij = tbl.val[i][j]
                if xvij != tbl.altZero:
                    # Remove the thousands separators, if any:
                    yvij = strip_thsep(xvij, tbl.thSep)
                    # If there were any, mark the column as needing them:
                    thfg = thfg or (yvij != xvij)
                    # Get the precision of this field, or -1 if integer:
                    pri = get_precision(yvij, tbl.frSep)
                    prec = max(prec, pri)
                    if tbl.debug:
                        sys.stderr.write(" row %d xvij = \xab%s\xbb yvij = \xab%s\xbb prec = %d\n" % (i, xvij, yvij, pri))
                    # Check whether there is a leading '+' on a nonzero value:
                    psfg = psfg or (vij != 0 and re.match(r"[ ]*[+]", yvij) is not None)
        return prec, thfg, psfg

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def recompute_and_reformat_values(tbl):
        """Reformat all numeric info fields of {tbl}, recomputing (sub)totals.

        This method (1) strips leading and trailing blanks from all info
        and header fields, (2) recomputes the numeric value of any
        non-empty fields in (sub)total rows and numeric columns, (3)
        reconstructs all non-empty info fields in each numeric column from
        their numeric values, according to the column's common format, and
        (4) adds one blank around all non-empty header and info fields
        (except at table edges).  It does not affect rule rows.

        The method uses the numeric values {tbl.val[i][j]}, the column
        format parameters {tbl.nump[j],tbl.prec[j],tbl.thfg[j],tbl.psfg[j]},
        and the row total-ranks {tbl.trk[i]}.
        """
        for i in range(tbl.nrows):
            if tbl.tag[i] == "!" or tbl.tag[i] == "|":
                # Header or info row -- reformat all fields (recomputing totals, if any):
                for j in range(tbl.ncols):
                    xvij = tbl.fld[i][j]
                    vij = tbl.val[i][j]
                    if tbl.debug:
                        sys.stderr.write("\xab%s\xbb = %s" % (xvij, vij))
                    # Reformat the element, without any padding:
                    if tbl.nump[j] and tbl.tag[i] == "|":
                        # Numeric column in info row.
                        if re.match(r"^[ ]*$", xvij):
                            # Empty field; its value must be {None} so that it stays empty:
                            if vij is not None:
                                prog_error("ixe!")
                            xvij = ""
                        else:
                            if tbl.trk[i] is not None:
                                # (Sub)total row, must recompute the value:
                                vij = tbl.recompute_total(i, j)
                            # Reformat the value:
                            pr = tbl.prec[j]
                            th = tbl.thfg[j]
                            ps = tbl.psfg[j]
                            xvij = format_value(vij, pr, th, ps, tbl.frSep, tbl.thSep, tbl.altZero)
                    else:
                        # Non-numeric column, or header row; just strip surrounding blanks:
                        xvij = re.sub(r"^[ ]+", "", xvij)
                        xvij = re.sub(r"[ ]+$", "", xvij)
                    # Add padding where needed:
                    if xvij != "":
                        # Non-empty field, add padding blanks except at table edges:
                        if j > 0:
                            xvij = " " + xvij
                        if j < tbl.ncols - 1:
                            xvij = xvij + " "
                    # Store the recomputed/reformatted field:
                    tbl.fld[i][j] = xvij
                    tbl.val[i][j] = vij
                    if tbl.debug:
                        sys.stderr.write(" --> \xab%s\xbb = %s\n" % (xvij, vij))

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def recompute_total(tbl, i, j):
        """Compute the correct value of the (sub)total in row {i}, column {j}.

        Namely, returns the sum of the numeric values of all elements in
        column {j} that are in non-total info rows above row {i}, up to the
        first total row with rank less than or equal to that of row {i}.
        Requires row {i} to be a total row and column {j} to be numeric.
        """
        if tbl.trk[i] is None:
            prog_error("ahh!")
        if not tbl.nump[j]:
            prog_error("fee!")
        total = Decimal(0)
        k = i - 1
        while k >= 0:
            if tbl.tag[k] == "|":
                # Info row:
                if tbl.trk[k] is None:
                    # Ordinary line, accumulate it:
                    if tbl.val[k][j] is not None:
                        total += tbl.val[k][j]
                elif tbl.trk[k] <= tbl.trk[i]:
                    # (Sub)total line of same or lower rank, stop:
                    break
                # Otherwise it is a (sub)total line of higher rank; ignore it.
            # Header, rule, and blank rows are skipped.
            k -= 1
        return total

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def prog_error(msg):
    "Prints the error message {msg} about a program error to {stderr}, and exits with status 1."
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)

# DOCUMENTATION

format_INFO = (
    " Each input line is parsed into three parts: the /margin/, the"
    " /body/, and the /comment/.\n"
    "\n"
    " If the line contains a '#' character, the comment consists of"
    " all blanks that precede the '#', the '#' itself, and everything that"
    " follows the '#'. Otherwise the comment part consists of"
    " all trailing blanks in the line.\n"
    "\n"
    " The margin then consists of the leading blanks that are not part of"
    " the comment. The body is anything that is not margin or comment.\n"
    "\n"
    "ROW TYPES\n"
    " The body of each input line is classified into four line"
    " types, each identified by a one-character /tag/:\n"
    "\n"
    " a /blank line/, if the body is empty (tag = ' ').\n"
    "\n"
    " an /info line/, if it contains at least one '|' (tag = '|');\n"
    "\n"
    " a /header line/, if it contains at least one '!' and no '|' (tag = '!')\n"
    "\n"
    " a /rule line/, if it contains at least one '+' and only blanks, '-' or '+' (tag = '+');\n"
    "\n"
    " Also, any non-blank line that does not contain any explicit column separator"
    " ('+', '!', or '|') is assumed to be an info line (tag '|'). Ditto for"
    " a line that contains both '|' and '!'.\n"
    "\n"
    "TABLE FIELDS\n"
    " Each line of the input contains zero or more /fields/. A blank"
    " line has zero fields; otherwise, the occurrences of the tag in the"
    " line's body split it into one or more fields.\n"
    "\n"
    " Fields are numbered from left to right, starting at 1.\n"
    "\n"
    " Note that if the body begins (resp. ends) with the tag character,"
    " the first (resp. last) field of that line will be empty. Note also"
    " that one cannot have a header line with exactly one field, because"
    " it will be parsed as an info line.\n"
    "\n"
    " TABLE ROWS AND COLUMNS\n"
    " A table is set of lines (/rows/) such that all non-blank"
    " lines have the same number of fields.\n"
    "\n"
    " Thus, each table either has only blank lines, or has a well-defined"
    " and positive number of columns, and a positive number of non-blank"
    " rows. The columns are numbered from left to right, starting at 1.\n"
    "\n"
    " COLUMN NAMES\n"
    " If the table has a header row, each field of that row is taken"
    " as the /name/ of that column, provided it is non-empty and"
    " distinct from all previous headers. Otherwise the column remains"
    " nameless. (Column names are relevant only for some programs, e.g."
    " {table-join}.)\n"
    "\n"
    "NUMERIC AND ALPHABETIC FIELDS\n"
    " A field of an info row is considered /numeric/ if it is non-empty"
    " and consists of an optional sign, followed by one or more decimal"
    " digits, possibly with {frSep} (decimal point) and/or {thSep}"
    " (thousands separator) characters. The {thSep} character may appear"
    " only between two digits. The /numeric value/ of such a field is"
    " the decimal integer or fraction obtained by removing any {thSep}"
    " characters and replacing the {frSep} character by '.'. The {altZero}"
    " string, if not empty, and a single {frSep} are also accepted as"
    " numeric fields, with numeric value 0. A field is considered"
    " alphabetic if it is non-empty and not numeric.\n"
    "\n"
    "NUMERIC AND ALPHABETIC COLUMNS\n"
    " Each column of a table then classified as /numeric/ or /alphabetic/."
    " A column is assumed to be numeric if its info rows contain"
    " at least one numeric field and no alphabetic fields. Note that"
    " any non-empty, non-numeric field in an info row marks the"
    " the whole column as alphabetic.\n"
    "\n"
    " For each numeric column, the program also defines the /precision/"
    " as being the maximum number of digits after the decimal point in"
    " any info field, or -1 if no info field has an explicit decimal point."
    " It also defines a /thousands-flag/, which is true if"
    " and only if {thSep} is not empty and the column contains an"
    " info field with thousands-separators. It also sets"
    " the /plus-flag/ if any nonzero info field in the column has an"
    " explicit '+' sign.\n"
    "\n"
    "(SUB)TOTAL ROWS\n"
    " A /total row/ is an info row that contains the string 'TOTAL',"
    " 'total', or 'Total' as one of its fields, prefixed by zero or more"
    " instances of 'SUB', 'Sub', or 'sub' (with or without joining"
    " hyphens). The number of such prefixes is the /rank/ of that total"
    " row.\n"
    "\n"
    " A total row is /consistent/ if the numeric value of every field of"
    " that row that belongs to a numeric column is equal to the sum of"
    " the values of all fields in that column that are not total rows and"
    " lie strictly between that row and the previous total row with the"
    " same or lower rank (or the top of the table if there is no such"
    " previous total row).\n"
    "\n"
    "CANONICAL FIELD FORMAT\n"
    " To print a table in its /canonical format/, the leading and"
    " trailing blanks in each field are discarded (so a field which is all"
    " blanks is assumed to be empty). Then, every non-empty field"
    " in an info row and a numeric column is replaced by its numeric value,"
    " converted to a string according to the column's format (the column's consensus precision,"
    " thousands-flag, and plus-sign-flag attributes). Then, every non-empty field in an"
    " info or header row is padded with an extra blank at the left (except"
    " for the first field of the row) and one extra blank at the right"
    " (except for the last field).\n"
    "\n"
    "NUMERIC FIELD FORMATTING\n"
    " When converting the numeric value 0 to its canonical representation,"
    " if the {altZero} parameter string is not empty, the result is the"
    " {altZero} string followed by {max(0,prec)} blanks, where {prec} is the"
    " column's precision; otherwise, if {prec} is not -1, it is a single"
    " {frSep} followed by {prec} blanks; otherwise it is just '0'.\n"
    "\n"
    " When converting a nonzero numeric value to its canonical"
    " representation, the value is first printed with {sprintf} using '%d'"
    " or '%+d' format if {prec} is -1, or '%#.{prec}f' format or"
    " '#+.{prec}f' format if {prec} is non-negative; where the '+' form is"
    " used iff the column's plus-flag is set. Then, if {thSep} is"
    " non-empty and the column's thousands-flag is set, the character"
    " {thSep} is inserted to separate the digits of the integer and fraction"
    " parts in groups of three, starting at the fraction point.\n"
    "\n"
    "COLUMN WIDTHS\n"
    " The /column width/ of each column is defined as the maximum"
    " length of any of its fields, including info, header, and rule"
    " rows. The /table margin/ is also defined as the shortest margin"
    " of any non-blank row in the table.\n"
    "\n"
    "COLUMN WIDTH REGULARIZATION\n"
    " To /regularize/ a column, every field in it extended so as to match"
    " the column width {wd}. In a rule row, that means replacing the"
    " field by a string of {wd} '-'s. In a a header or info row, that"
    " means extending with blanks until its length is {wd}; the blanks are"
    " added at the left in numeric columns, and at the right in"
    " non-numeric columns.\n"
    "\n"
    "CUSTOMIZATION\n"
    " Clients can change the fraction separator {frSep}, the thousands separator {thSep}, the alternate zero representation {altZero}, the field separators {infoColSep}, {ruleColSep} and {headerColSep}, and the comment-lead character {commentSep}, at table initialization time.\n"
    "\n"
    " There are some constraints on those parameters, necessary to ensure unambiguous parsing and avoid misleading results. The {frSep} and {thSep} parameters must be the {None} value or"
    " distinct single characters, not in [-+0123456789]. The {frSep}"
    " character must be non-blank. If {frSep} is {None}, numeric fields"
    " may not have fractional parts. If {thSep} is {None}, numeric fields"
    " may not have thousands-separators. The {altZero} string,"
    " if not {None}, must not contain [+123456789] (but may contain '-' and/or '0',"
    " or embedded blanks).\n"
    "\n"
    " The column separator {infoColSep} cannot be {None}; it must be a single characters -- non-blank, not in [-+0123456789], distinct from {frSep} and {thSep} (if they are not {None}) and not present in {altZero} (if it is not {None}).\n"
    "\n"
    " If the {haderColSep} is {None}, header lines are not allowed. Otherwise, {haderColSep} must be a single character -- non-blank, not in [-+0123456789], distinct from {infoColSep}, {frSep} and {thSep} (if they are not {None}) and not present in {altZero} (if it is not {None}).\n"
    "\n"
    " If the {ruleColSep} is {None}, rule lines are not allowed. Otherwise, {ruleColSep} must be a single character -- non-blank, not in [-0123456789] (but may be '+'), distinct from {infoColSep}, {headerColSep}, {frSep} and {thSep} (if they are not {None}) and not present in {altZero} (if it is not {None}).\n"
    "\n"
    " If {commentSep} is {None}, comments are not allowed. Otwerwise the {commentSep} must be a single character -- non-blank, not in [-+0123456789], and distinct from {infoColSep}, {haderColSep}, {ruleColSep}, {frSep}, and {thSep} (if they are not {None}). "
)

def parse_args(pp):
    """Parse the command line arguments that pertain to text tables.

    Expects {pp} to be an {ArgParser} instance containing the arguments,
    still unparsed.  Returns {(tblSep, frSep, thSep, altZero, err)}, where
    {err} is an error message (a string), or {None}.  The defaults are an
    'END_TABLE' line pattern for {tblSep}, '.' for {frSep}, ',' for
    {thSep}, and {None} for {altZero}.
    """
    # Being optimistic:
    err = None

    # NOTE(review): {options_HELP} and {options_INFO} document this option as
    # '-fracSep', and also list '-infoColSep', '-headerColSep', '-ruleColSep'
    # and '-commentSep', none of which is parsed here -- confirm which
    # spelling clients actually use before reconciling.
    if pp.keyword_present("-tblSep"):
        tblSep = pp.get_next()
    else:
        tblSep = r"^[ ]*END_TABLE[ ]*$"

    if pp.keyword_present("-frSep"):
        frSep = pp.get_next_char(mayBeNone = 1)
    else:
        frSep = "."

    if pp.keyword_present("-thSep"):
        thSep = pp.get_next_char(mayBeNone = 1)
    else:
        thSep = ","

    if pp.keyword_present("-altZero"):
        altZero = pp.get_next(mayBeNone = 1)
    else:
        altZero = None

    # Validation still to be ported (pseudo-code, not Python):
    # !!! if ((len(frSep) > 1) or (frSep ~ /[-+0-9 |!]/))
    # !!!   { arg_error(("invalid parameter {frSep} = \"" frSep "\"")); }
    # !!! if ((len(thSep) > 1) or (thSep ~ /[-+0-9 |!]/) or (thSep == frSep))
    # !!!   { arg_error(("invalid parameter {thSep} = \"" thSep "\"")); }
    # !!! if ((altZero ~ /[+1-9|!]/) or (altZero ~ /^[ ]/) or (altZero ~ /[ ]$/))
    # !!!   { arg_error(("invalid parameter {altZero} = \"" altZero "\"")); }
    # !!!

    return tblSep, frSep, thSep, altZero, err

# ----------------------------------------------------------------------

options_HELP = (
    " [ -fracSep {FR_SEP_CHAR} ] \\\n"
    " [ -thSep {TH_SEP_CHAR} ] \\\n"
    " [ -altZero {ALT_Z_STRING} ] \\\n"
    " [ -infoColSep {INFO_SEP_CHAR} ] \\\n"
    " [ -headerColSep {HEADER_SEP_CHAR} ] \\\n"
    " [ -ruleColSep {RULE_SEP_CHAR} ] \\\n"
    " [ -commentSep {CMT_SEP_CHAR} ]"
)

# NOTE(review): the text this was restored from ended with a dangling
# line-continuation after the last piece below; confirm that no further
# pieces of {options_INFO} were meant to follow.
options_INFO = (
    " -fracSep {FR_SEP_CHAR}\n"
    " Defines the character to use as a fraction separator. The default is '.'.\n"
    "\n"
    " -thSep {TH_SEP_CHAR}\n"
    " Defines the character to use as a thousands separator. The default is ','.\n"
    "\n"
    " -altZero {ALT_Z_STRING}\n"
    " Defines a preferred representation for zero in numeric fields. The"
    " default is '0' in all-integer columns, or a single {FR_SEP_CHAR} in"
    " columns that contain fractional numbers.\n"
    "\n"
    " -infoColSep {INFO_SEP_CHAR}\n"
    " -headerColSep {HEADER_SEP_CHAR}\n"
    " -ruleColSep {RULE_SEP_CHAR}\n"
    " These parameters define the characters to use as column separators"
    " in normal (data) rows, in header rows, and in rule rows. They"
    " default to '|', '!', and '+', respectively.\n"
    "\n"
    " -commentSep {CMT_SEP_CHAR}\n"
    " Defines the comment-introducing character. The default is '#'.\n"
)