#! /bin/gawk -f
# Last edited on 2009-02-14 16:07:27 by stolfi

BEGIN {
  abort = -1;
  USAGE = ( \
    "format-table \\\n" \
    " [ -v colsep={SEP} ] [ -v outsep={SEP} ] \\\n" \
    " < INFILE \\\n" \
    " > OUTFILE" \
  );

  # OBSOLETE -- see {txtable-format}.

  # Reads a text file {INFILE} and formats the tables contained therein.
  # A /table/ consists of zero or more consecutive /table lines/, all
  # with the same number of fields.
  #
  # A table line can be an /info line/ or a /rule line/.  An info
  # line is a line that contains one or more occurrences of the {SEP}
  # string or regular expression (which defaults to '|').  A rule
  # line is a line that contains only spaces (or tabs) or characters
  # in '[-=+]', with at least one '+'.  The fields are the
  # substrings of the line delimited by the occurrences of {SEP}
  # or '+', respectively.
  # NOTE(review): the rule-line pattern below accepts only '-', '+',
  # blanks and tabs; a line containing '=' is NOT recognized as a rule
  # line.  Confirm which of the two (pattern or description) is intended.
  #
  # The raw length of the first field of the table defines the
  # minimum width of the first column.  Otherwise, the
  # leading and trailing blanks in each field are stripped off.
  # That done, each non-empty field in a rule line is turned into
  # a string of '-'s of the same length.
  #
  # On output, each field in the table gets extended
  # so that all fields in the same table column have the same
  # width:
  #
  #   * In an info line, each field gets padded with exactly one blank on
  #     the right and one blank on the left.  Then further blanks are
  #     added to the left or to the right, depending on whether the
  #     column is assumed to be numeric or alphabetic.
  #
  #   * In a rule line, each field gets extended with blanks if
  #     it is blank or empty, otherwise it is replicated and trimmed
  #     to fill the whole column width.
  #
  # A column is assumed to be numeric or alphabetic if the last
  # definite info field in it is numeric or alphabetic, respectively.
  # An info field is considered numeric iff it contains a digit not
  # followed by a letter.  An info field is considered to be alphabetic
  # if it contains a letter not followed by a digit.  Otherwise the
  # field is indefinite (neither numeric nor alphabetic).  If the
  # column contains no definite field, it is assumed to be alphabetic.
  #
  # On output, the fields of info lines are separated by the {outsep}
  # string (which defaults to {colsep} if {colsep} is a single character,
  # else to "|").  The fields of rule lines are separated with '+'s,
  # padded with '-'s to the same length as {outsep}.

  # Set {insep} to {colsep}, or to the default if not specified:
  insep = colsep;
  if (insep == "") { insep = "[|]"; }
  if (insep ~ /[+]/) { arg_error(("\"insep\" should not contain '+'")); }

  # Set {outsep} to the default if not specified:
  if (outsep == "") { outsep = (length(colsep) == 1 ? colsep : "|"); }
  if (outsep == "") { arg_error(("\"outsep\" must be nonempty")); }

  # Pad the output rule separator {outrulesep} to match {outsep}.
  # Each pass adds one '-' on each side, so the result may overshoot
  # by one character; the {substr} below trims it to the exact length.
  outrulesep = "+";
  while (length(outrulesep) < length(outsep)) {
    outrulesep = ("-" outrulesep "-");
  }
  outrulesep = substr(outrulesep, 1, length(outsep));

  # Set the field separator for $0 to {insep}:
  FS = insep;

  # Regular expressions matching numeric and alphabetic fields:
  letters = "a-zA-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞàáâãäåæçèéêëìíîïðñòóôõößøùúûüýþÿ";
  numeric_re = ("[0-9][^" letters "]*$");
  alphabetic_re = ("[" letters "][^0-9]*$");

  clear_table();
}

# Bail out as soon as any error handler has set {abort}:
(abort >= 0) { exit abort; }

(match($0, insep) != 0) {
  # Info line -- split by {insep}:
  save_table_line(0, $0, insep);
  next;
}

/^[- \011]*[+][-+ \011]*$/ {
  # Rule line -- split by '+':
  save_table_line(1, $0, "[+]");
  next;
}

// {
  # Non-table line: flush the pending table, then copy the line through.
  output_table();
  clear_table();
  print;
  next;
}

END {
  if (abort >= 0) { exit abort; }
  # Flush the table that was still being collected at end of input:
  output_table();
}

function clear_table()
{
  # Sets the current table to empty.
  # Does not disturb the current line $0.
  ntlins = 0;      # Number of table lines in current table.
  minwd1 = 0;      # Minimum width of first column (currently unused).
  ncols = 0;       # Number of columns in table.
  split("", tbl);  # The fields of the current table are {tbl[0..ntlins-1,1..ncols]}.
  split("", typ);  # The type of line {i} is {typ[i]} (0 = info, 1 = rule).
  split("", cwd);  # The min column widths are {cwd[1..ncols]}.
  split("", ckd);  # The kind of column {j} is {ckd[j]} ("N" = numeric, "A" = alpha).
}

function save_table_line(t,lin,sep,   nf,fld,j,fj,wdj)
{
  # Splits {lin} at separator {sep} into fields {fld[1..nf]}.
  # If the current table is not empty but the
  # number of fields {nf} is different from {ncols},
  # prints the current table and clears it.
  #
  # In any case, if the table is empty, sets {ncols} to {nf} and
  # initializes the column widths.
  #
  # Then strips blanks around each field {fld[1..nf]} and appends them
  # as a new line of the table {tbl}, with type {t}
  # (0 = info line, 1 = rule line).

  nf = split(lin, fld, sep);
  if (nf <= 1) { prog_error(("qua?")); }
  if (nf != ncols) {
    output_table();
    clear_table();
  }
  if (ntlins == 0) {
    # Initialize {ncols} and {cwd[1..ncols]}.  The first column starts
    # at the raw (unstripped) width of the first field of the table:
    ncols = nf;
    cwd[1] = length(fld[1]);
    for (j = 2; j <= ncols; j++) { cwd[j] = 0; }
  } else {
    # Check {ncols}:
    if (ncols != nf) { prog_error(("duh?")); }
  }
  for (j = 1; j <= ncols; j++) {
    # Strip leading and trailing blanks from field {fld[j]}:
    fj = fld[j];
    gsub(/^[ \011]+/, "", fj);
    gsub(/[ \011]+$/, "", fj);
    if (t == 0) {
      # Info field, add minimum padding:
      fj = (" " fj " ");
      # Update the column kind; an indefinite field leaves it unchanged,
      # so the last definite field in the column wins:
      if (is_numeric(fj)) { ckd[j] = "N"; }
      if (is_alphabetic(fj)) { ckd[j] = "A"; }
    } else {
      # Rule field, make sure that it has only '-'s:
      gsub(/./, "-", fj);
    }
    # Save field in table:
    tbl[ntlins,j] = fj;
    # Update the column width:
    wdj = length(fj);
    if (wdj > cwd[j]) { cwd[j] = wdj; }
  }
  # Save type and increment line counts:
  typ[ntlins] = t;
  ntlins++;
}

function output_table(   i,j,fj)
{
  # Outputs the current table.  Does not disturb the
  # current line $0.  Does nothing if the table is empty.

  if (ntlins == 0) { return; }

  # Print each line:
  for (i = 0; i < ntlins; i++) {
    for (j = 1; j <= ncols; j++) {
      if (j > 1) {
        # Print separator:
        printf "%s", (typ[i] == 0 ? outsep : outrulesep);
      }
      # Extend field to width {cwd[j]}:
      fj = tbl[i,j];
      if (typ[i] == 0) {
        # Info line field -- extend according to column kind:
        if ((! (j in ckd)) || (ckd[j] == "A")) {
          # Assume alphabetic, pad with blanks at right:
          fj = sprintf("%-*s", cwd[j], fj);
        } else if (ckd[j] == "N") {
          # Assume numeric, pad with blanks at left:
          fj = sprintf("%*s", cwd[j], fj);
        } else {
          prog_error(("was?"));
        }
      } else {
        # Rule line field:
        if (fj == "") {
          # Pad with blanks:
          fj = sprintf("%*s", cwd[j], fj);
        } else {
          # Replicate {fj} (by doubling) to fill the column width,
          # then trim the excess:
          while (length(fj) < cwd[j]) { fj = (fj fj); }
          fj = substr(fj, 1, cwd[j]);
        }
      }
      printf "%s", fj;
    }
    printf "\n";
  }
}

function is_numeric(x)
{
  # True iff {x} contains a digit that is not followed by any letter.
  return (match(x, numeric_re) != 0);
}

function is_alphabetic(x)
{
  # True iff {x} contains a letter that is not followed by any digit.
  return (match(x, alphabetic_re) != 0);
}

function arg_error(msg)
{
  # Reports a command-line argument error {msg} plus the usage text
  # on stderr, then exits with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "usage:\n %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports an input data error {msg} and the offending line $0
  # on stderr, then exits with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "%s:%d: «%s»\n", FILENAME, FNR, $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg)
{
  # Reports an internal consistency error {msg} on stderr,
  # then exits with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}