#! /usr/bin/python3
# Last edited on 2025-07-31 11:57:23 by stolfi

from math import fabs, exp, log, inf, isfinite, floor
import sys, re
import html
from ivtff_format import line_loop, strip_comments
from process_frac_words import enum_words_in_text
import argparser

PROG_NAME = "tabulate_frac_counts"

PROG_COPYRIGHT = "Copyright © 2025 by the State University of Campinas"

# Command line synopsis.  The "-language" alternatives must agree with
# the values accepted by {parse_options}.
PROG_HELP = (
    "  " + PROG_NAME + " \\" +
    """
    [ -inDir IN_DIR ] \\
    [ -type { raw | relative | rank } DIGITS[.PREC] ] \\
    [ -sortBy { item | IXSORT } [ increasing | decreasing ] ] \\
    [ -language { plain | html | tex } ] \\
    [ -maxLines MAX_LINES ] \\""" +
    "\n" +
    argparser.help_info_HELP +
    """
    FNAME[1] FNAME[2] ... FNAME[NF]"""
)

# Full manual page, shown by the documentation options of {argparser}.
PROG_INFO = (
    "SYNOPSIS\n" +
    PROG_HELP + "\n" +
    "\n" +
    "DESCRIPTION" +
    """
  Reads a bunch of files {F[f]}="{IN_DIR}/{FNAME[f]}.fct" with
  fractional counts of arbitrary items where each line has two fields
  "COUNT ITEM", and writes a table of those counts to {stdout}.

  The {COUNT} is any non-negative float value, and the {ITEM} is any
  string of one or more non-blank characters.

  The output will have {NT*NF+1} columns, where {NT} is the number of
  times the "-type" option was specified (or 1 if it was never
  specified). Each group {G[f]} of {NT} columns {f*NT..f*NT+NT-1}, for
  {f} in {0..NF-1}, are the {COUNT}s read from file {F[f+1]},
  transformed in {NT} ways. The last column is an {ITEM} read from
  those files.

  Each occurrence of the "-type" keyword is assigned a successive
  column in each group {G[f]}. That column will contain the {COUNT}
  from file {F[f]} transformed as specified by the argument of that
  "-type" keyword.

  All values in each output line will refer to the same {ITEM} shown
  on the last column. The inputs are implicitly joined by the {ITEM}
  column. That is, every item that appears in one or more of the input
  files will be shown in the output. If an {ITEM} does not appear in
  some file, it is assumed to have zero {COUNT} in that file.\n""" +
    "\n" +
    "OPTIONS" +
    """
  -inDir IN_DIR
    This optional keyword specifies the directory where the input
    files reside. If not specified, "./" is assumed.

  -type { raw | relative | rank } DIGITS[.PREC]
    This optional keyword specifies how the {COUNT} read from each
    file should be transformed before being written in the column. If
    the first argument is "raw", the raw {COUNT} from the file will be
    shown. If it is "relative", the {COUNT} will be divided by the sum
    of all {COUNT}s from that file. If it is "rank", the values shown
    will be the rank (starting from 0) of the {COUNT} among all
    {COUNT}s in the file.

    The value will be printed with format "%{DIGITS}.{PREC}f" or
    "%{DIGITS}d" if the ".{PREC}" part is omitted. However {DIGITS}
    will be implicitly increased if needed to accommodate the max
    number of integer digits present in the input.

    If the ".{PREC}" part is given, trailing zeros in the value will
    be replaced by spaces. In this case, if a "raw" or "relative"
    value is exactly zero, only a period "." will be printed,
    otherwise there will be at least a leading "0.". If the ".{PREC}"
    part is omitted, and a "raw" or "relative" value is exactly zero,
    the field will be left blank instead of showing "0".

    This option can be repeated; for instance, "-type raw 6 -type
    relative 8.6" will show two columns of numbers for each file, one
    with the raw counts in format "%6d" and one with the relative ones
    in format "%8.6f". If no "-type" option is specified, the program
    assumes one "-type raw 8.6".

  -sortBy { item | IXSORT } [ increasing | decreasing ]
    This optional keyword specifies how the output lines should be
    sorted. If the first argument is "item" or {NF+1}, the lines will
    be sorted alphabetically by the {ITEM} column. If the first
    argument is an integer in {1..NF}, the lines will be sorted by the
    value of {COUNT} in {F[IXSORT]} (breaking ties by increasing
    {ITEM}). The second argument specifies the sorting direction; if
    omitted, "increasing" is assumed for the {ITEM} column, and
    "decreasing" for any {COUNT} column. If no "-sortBy" is specified,
    "-sortBy 1 decreasing" is assumed.

  -language { plain | html | tex }
    This optional keyword specifies the output table language. If not
    specified, the program assumes "-language plain".

    The "plain" option will print values and items using spaces for
    alignment.

    The "html" option will write each line as a "<tr>...</tr>"
    construct, suitable for inclusion in a "<table>...</table>"
    entity. Non-breaking spaces "&nbsp;" will be used where needed to
    ensure proper alignment.

    The "tex" option will print each line in a format suitable for
    inclusion in a TeX/LaTeX "\\begin{table}...\\end{table}"
    construct. Namely, columns will be separated by " & ", the line
    will end with "\\\\", and each field will be formatted as the
    argument of a macro: "\\iv{...}" for integer values, "\\fv{...}"
    for float values, and "\\ev{...}" for the items. Trailing zeros in
    float values will be replaced by "~".

  -maxLines MAX_LINES
    This optional keyword instructs the program to truncate the output
    after {MAX_LINES}. If omitted, all lines will be written.\n""" +
    "\n" +
    "DOCUMENTATION OPTIONS\n" +
    argparser.help_info_INFO + "\n" +
    "\n" +
    "SEE ALSO\n" +
    "  Look deep into your soul; what do you see?\n" +
    "\n" +
    "AUTHOR\n" +
    "  Created 2025-07-30 by Jorge Stolfi, IC-UNICAMP.\n" +
    "\n" +
    "MODIFICATION HISTORY\n" +
    "  All entries by the author above unless indicated otherwise.\n" +
    "  2025-07-30 Created.\n" +
    "\n" +
    "WARRANTY\n" +
    "  " + argparser.help_info_NO_WARRANTY + "\n" +
    "\n" +
    "RIGHTS\n" +
    "  " + PROG_COPYRIGHT + ".\n" +
    "\n" +
    "  " + argparser.help_info_STANDARD_RIGHTS
)

def main():
    # Program entry point: parses the command line, reads and joins the
    # count files, sorts the rows, derives the requested columns,
    # and writes the table to {stdout}.  Returns 0.
    o, types, files = parse_options()
    NF = len(files)
    tb = read_raw_counts(o['inDir'], files)
    tb = sort_table(tb, o['sortBy'], o['sortBy_dir'])
    # The derived columns ("relative", "rank") must be computed from
    # ALL the counts of each file, so expand first and truncate after:
    tbf = make_full_table(tb, types)
    tbf = tbf[0: o['maxLines']]
    types = fix_col_widths(types, NF, tbf)
    write_table(tbf, NF, types, o['language'])
    sys.stdout.flush()
    return 0
    # ----------------------------------------------------------------------

def fix_col_widths(types, NF, tbf):
    # Increases the {size} format parameter of each entry of {types}
    # as needed to fit the largest values in the expanded table {tbf}
    # (which must have {1 + NF*len(types)} columns).
    #
    # Returns a new list of {(ty, size, prec)} tuples; the {types}
    # argument is not modified.  Writes a warning to {stderr} for each
    # type whose width had to be increased.
    NT = len(types)
    types_adj = [ ] # Adjusted {types}.
    for t, (ty, size_req, prec) in enumerate(types):
        size = size_req
        if prec > 0:
            # Need room for at least one integer digit and the period:
            size = max(size, prec + 2)
        for f in range(NF):
            j = 1 + f*NT + t
            for row in tbf:
                val = row[j]
                assert val >= 0, "prog error: negative val"
                # The width of the num should be the same in all languages:
                vsize = len(format_num(val, ty, size, prec, " "))
                size = max(size, vsize)
        if size > size_req:
            sys.stderr.write(f"!! width of {ty} columns increased to {size}\n")
        types_adj.append((ty, size, prec))
    return types_adj
    # ----------------------------------------------------------------------

def sort_table(tb, sortBy, sdir):
    # Sorts the rows of the table {tb}, in place, as specified
    # by {sortBy} and {sdir}, and returns {tb}.
    #
    # If {sortBy} is 0, sorts by the item field (column 0).
    # There should be no ties there.
    #
    # Otherwise, sorts by column {sortBy}, breaking ties
    # by increasing column 0.
    #
    # The {sdir} must be {-1} for decreasing order,
    # {+1} for increasing order.
    #
    if len(tb) == 0: return tb
    NF = len(tb[0]) - 1
    assert NF >= 1, "prog error: NF"
    assert sdir == -1 or sdir == +1, "prog error: sdir"
    if sortBy == 0:
        # Sort the rows by item, in requested order:
        tb.sort(reverse = (sdir < 0), key = lambda row: row[0])
    else:
        assert sortBy >= 1 and sortBy <= NF, "prog error: sortBy"
        # Sort the rows alphabetically by item, always increasing, for tie-breaking:
        tb.sort(key = lambda row: row[0])
        # Sort the rows by the requested count, in proper order.
        # The sort is stable, so ties keep the item order set above:
        tb.sort(key = lambda row: row[sortBy]*sdir)
    return tb
    # ----------------------------------------------------------------------

def make_full_table(tb, types):
    # Returns a table {tbf} that is {tb} expanded by turning each counts
    # column into {NT} columns as specified by {types}. Thus {tb} has
    # {NF+1} columns, {tbf} has {NF*NT+1}. Element {tbf[r][1+f*NT+t]} is
    # the derived value {t} from the raw count from file {f}, for item
    # {tbf[r][0]}.
    #
    # If {tb} is empty, returns {tb} itself.
    #
    NR = len(tb) # Num of rows (items).
    if NR == 0: return tb
    NF = len(tb[0]) - 1
    NT = len(types)

    # Start with a table of zeros, with the item in column 0:
    tbf = [ [ row[0] ] + [ 0 ]*(NT*NF) for row in tb ]

    # Now fill it by columns:
    for f in range(NF):
        # Extract the raw counts from file {f}
        raw = [ row[f+1] for row in tb ]
        # Transform and insert in {tbf}:
        for t in range(NT):
            j = 1 + f*NT + t # Index of col to insert.
            ty = types[t][0]
            if ty == "raw":
                col = raw
            elif ty == "relative":
                col = make_counts_relative(raw)
            elif ty == "rank":
                col = ranks_from_counts(raw)
            else:
                assert False, "prog error: column type"
            for r in range(NR):
                tbf[r][j] = col[r]
    return tbf
    # ----------------------------------------------------------------------

def make_counts_relative(raw):
    # Returns a copy of {raw} scaled so that the sum
    # of all entries is 1.  If the sum is zero, the
    # entries are returned unscaled (as floats).
    total = sum(raw)
    if total == 0: total = 1
    return [ count/total for count in raw ]
    # ----------------------------------------------------------------------

def ranks_from_counts(raw):
    # Converts the vector of floats {raw} into
    # a vector of ranks {rank} where {rank[i]}
    # is the number of elements of {raw} which are
    # strictly GREATER than {raw[i]}, plus half the
    # number of those which are equal to {raw[i]},
    # except {raw[i]} itself.  Thus the largest count
    # gets rank 0, and tied counts share their mean rank.
    # The ranks are floats.
    #
    NR = len(raw)
    # Indices of {raw} in order of decreasing count:
    rix = sorted(range(NR), key = lambda i: raw[i], reverse = True)
    rank = [ None ]*NR
    # Handle repeats:
    j = 0 # First element such that {raw[rix[j]] == raw[rix[i-1]]}.
    for i in range(1, NR + 1):
        if i == NR or raw[rix[j]] != raw[rix[i]]:
            # Elements {raw[rix[j..i-1]]} must be tied.
            jrank = (j + i - 1)/2
            for k in range(j, i):
                rank[rix[k]] = jrank
            j = i
    return rank
    # ----------------------------------------------------------------------

def write_table(tbf, NF, types, language):
    # Writes the (expanded) table {tbf} to {stdout}, one row per line,
    # in the given {language} ("plain", "tex", or "html").
    # See the {PROG_INFO} for the meaning of the parameters.
    #
    if len(tbf) == 0: return
    if language == "plain":
        write_row = write_table_row_plain
    elif language == "tex":
        write_row = write_table_row_tex
    elif language == "html":
        write_row = write_table_row_html
    else:
        assert False, "prog error: language"
    for row in tbf:
        write_row(row, NF, types)
    sys.stdout.flush()
    return
    # ----------------------------------------------------------------------

def write_table_row_plain(row, NF, types):
    # The "plain" format has numbers in plain ascii,
    # items printed as-is, fields of the same file separated by ' | ',
    # and file groups (and the item) separated by '||'.
    NT = len(types)
    sys.stdout.write(" ")
    for f in range(NF):
        if f > 0: sys.stdout.write(" ||")
        for t in range(NT):
            ty, size, prec = types[t]
            j = 1 + f*NT + t # Index of {tbf} col to print.
            vstr = format_num(row[j], ty, size, prec, " ")
            if t > 0: sys.stdout.write(" | ")
            sys.stdout.write(vstr)
    sys.stdout.write(f" || {row[0]}\n")
    return
    # ----------------------------------------------------------------------

def write_table_row_html(row, NF, types):
    # The "html" format prints the row as an HTML table row
    # "<tr>...</tr>", one "<td>" cell per value plus one for the item.
    # All blanks in the numbers (leading padding and the blanks that
    # replace trailing zeros) are written as "&nbsp;" so that
    # the alignment survives HTML whitespace collapsing.  The item
    # is HTML-escaped.
    #
    NT = len(types)
    sys.stdout.write("  <tr>")
    for f in range(NF):
        for t in range(NT):
            ty, size, prec = types[t]
            j = 1 + f*NT + t # Index of {tbf} col to print.
            vstr = format_num(row[j], ty, size, prec, "&nbsp;")
            vstr = vstr.replace(" ", "&nbsp;")
            sys.stdout.write(f"<td>{vstr}</td>")
    sys.stdout.write(f"<td>{html.escape(row[0])}</td>")
    sys.stdout.write("</tr>\n")
    return
    # ----------------------------------------------------------------------

def write_table_row_tex(row, NF, types):
    # The "tex" format prints the fields separated by " & " and ends
    # the row with "\\".  Each value is the argument of "\fv" (if its
    # type has a fraction part) or "\iv" (otherwise), with trailing
    # zeros replaced by "~"; the item, minus any braces, is the
    # argument of "\ev".
    NT = len(types)
    for f in range(NF):
        for t in range(NT):
            ty, size, prec = types[t]
            j = 1 + f*NT + t # Index of {tbf} col to print.
            vstr = format_num(row[j], ty, size, prec, "~")
            mac = r"\fv" if prec > 0 else r"\iv"
            if j > 1: sys.stdout.write(" & ")
            sys.stdout.write(f"{mac}{{{vstr}}}")
    sys.stdout.write(r" & \ev")
    # Braces in the item would unbalance the macro argument:
    item = re.sub(r"[{}]", "", row[0])
    sys.stdout.write(f"{{{item}}}")
    sys.stdout.write(r" \\")
    sys.stdout.write("\n")
    return
    # ----------------------------------------------------------------------

def format_num(val, ty, size, prec, pad):
    # Formats {val} with {format_rank} if {ty} is "rank",
    # else with {format_count}.
    if ty == "rank":
        return format_rank(val, ty, size, prec, pad)
    else:
        return format_count(val, ty, size, prec, pad)
    # ----------------------------------------------------------------------

def format_count(val, ty, size, prec, pad):
    # Formats the float {val} as "%{size}d" if {prec} is zero,
    # else as "%{size}.{prec}f".
    #
    # In the second case, if the error between the formatted string and
    # the given {val} is less than 1% of the last printed digit, replaces
    # each trailing zero by a copy of {pad}. If the whole fraction part
    # gets replaced, also replaces the period '.'.
    #
    # In particular, if {val} is zero, prec is zero, the result is {size}
    # blanks. If {val} is zero and prec is positive, the result is
    # {size-prec-1} blanks, then '.' then {prec} copies of {pad}.
    #
    assert ty == "raw" or ty == "relative", "prog_error: ty"
    if val == 0:
        if prec == 0:
            # Leave entry blank:
            vstr = " " * size
        else:
            # Print only the period:
            assert size >= prec+2, "prog error: size"
            vstr = (" " * (size-prec-1)) + "." + (pad * prec)
    elif prec == 0:
        # Integer format:
        vstr = "%*d" % (size, val)
    else:
        # Fraction format:
        vstr = "%*.*f" % (size, prec, val)
        if vstr[-1] == "0":
            # See if it is OK to remove trailing zeros (and maybe period):
            rel_err = fabs((val - float(vstr))/val)
            rel_tol = 0.1**(prec + 2)
            if rel_err < rel_tol:
                # OK to remove:
                vstr = remove_trailing_zeros(vstr, pad)
    return vstr
    # ----------------------------------------------------------------------

def format_rank(val, ty, size, prec, pad):
    # Formats the float {val} as "%{size}d" if {prec} is zero,
    # else as "%{size}.{prec}f". Assumes that {val} is a
    # rank.
    #
    # In the second case, replaces trailing zeros by copies of {pad},
    # even if the value of the resulting string differs from {val}
    # just beyond the last digit printed. If the whole fraction part is
    # replaced, also replaces the period '.'.
    assert ty == "rank", "prog_error: ty"
    if prec == 0:
        # Integer format:
        vstr = "%*d" % (size, val)
    else:
        # Fraction format:
        vstr = "%*.*f" % (size, prec, val)
        if vstr[-1] == "0":
            # Remove trailing zeros (and maybe period):
            vstr = remove_trailing_zeros(vstr, pad)
    return vstr
    # ----------------------------------------------------------------------

def remove_trailing_zeros(vstr, pad):
    # Replaces trailing zeros in {vstr} by copies of {pad}. If the whole
    # fraction part is replaced, also replaces the period '.'.
    # Returns {vstr} unchanged if it does not end with '0'.
    m = re.search(r"([.]|)[0]+$", vstr)
    if m is not None:
        vstr = vstr[0:m.start(0)] + (pad * (m.end(0)-m.start(0)))
    return vstr
    # ----------------------------------------------------------------------

def read_raw_counts(inDir, files):
    # Reads files {files[0..NF-1]} in folder {inDir} and makes a 2d table
    # {tb} out of their contents.
    #
    # The table {tb} is a list of lists. Entry {tb[r][0]} is some item
    # read from some input file, and {tb[r][f+1]} is the raw frac count
    # read from {files[f]} for that item; or zero if that item does not
    # occur in {files[f]}.
    #
    # NOTE(review): {PROG_INFO} says the files are "{FNAME}.fct", but
    # the name is used here as given, without appending ".fct" --
    # confirm which one is intended.
    #
    tb = [ ]
    item_to_row = {} # A dict that maps each item to its row index in the tables {tb}.
    NF = len(files)
    for f in range(NF):
        item_to_count = read_count_file(inDir + "/" + files[f])
        for itm, count in item_to_count.items():
            if itm in item_to_row:
                r = item_to_row[itm]
            else:
                r = len(tb)
                item_to_row[itm] = r
                tb.append([ itm ] + [0.0] * NF)
            tb[r][f+1] += count
    return tb
    # ----------------------------------------------------------------------

def read_count_file(fname):
    # Reads a file {fname} of fractional item counts and returns a dict
    # {item_to_count} that maps each item to its count.
    #
    # Each line of {fname} must have two fields "{COUNT} {ITEM}" where
    # {COUNT} is a non-negative float value and {ITEM} is a non-empty
    # string that does not contain any spaces (blanks, tabs, CR, etc).
    #
    # There must be at least one blank between the two. Leading and
    # trailing blanks are ignored, as well as blank lines and
    # '#'-comments.
    #
    # If an item occurs two or more times, its counts are added together.
    # User beware of underflow, overflow, and roundoff errors.
    #
    # Calls {data_error} (which does not return) on a malformed line
    # or a negative or non-finite count.
    #
    pat_count = r"[-+]?[0-9]+(|[.][0-9]*)(|[Ee][-+]?[0-9]+)"
    pat_item = "[^ \000-\037]+"
    # Groups: 1 = whole count, 2 = fraction, 3 = exponent, 4 = item.
    pat = re.compile(f"({pat_count})[ ]+({pat_item})")
    assert pat.groups == 4, "prog error (pattern)"

    item_to_count = {}
    with open(fname, "r") as rd:
        for nread, line in enumerate(rd, start = 1):
            line = re.sub(r"[#].*$", "", line)
            line = line.strip()
            if line == "": continue
            m = pat.fullmatch(line)
            if m is None:
                data_error(fname, nread, "bad line format", line)
            count = float(m.group(1))
            if count < 0:
                data_error(fname, nread, "negative count", line)
            if not isfinite(count):
                # E.g. an exponent so large that the value overflows:
                data_error(fname, nread, "invalid count", line)
            item = m.group(4)
            item_to_count[item] = item_to_count.get(item, 0.0) + count
    return item_to_count
# ----------------------------------------------------------------------

def data_error(fname, nread, msg, line):
    # Reports to {stderr} the error {msg} found on line number {nread}
    # of file {fname}, whose contents are {line}, and aborts the
    # program with exit status 1.  Never returns.
    #
    # Uses {sys.exit} rather than {assert False} so that the
    # program still stops when run with "python -O".
    sys.stderr.write(f"{fname}:{nread}: ** {msg}\n")
    sys.stderr.write(f"  [[{line}]]\n")
    sys.exit(1)
    # ----------------------------------------------------------------------

def parse_options():
    # Parses the command line arguments in {sys.argv}, as described in
    # {PROG_INFO}.  Returns a tuple {(o, types, files)} where
    #
    #   {o} is a dict with keys 'inDir', 'maxLines', 'sortBy',
    #   'sortBy_dir', and 'language';
    #
    #   {types} is a list of tuples {(ty, size, prec)}, one for each
    #   "-type" option (or the single default if there was none);
    #
    #   {files} is the list of file name arguments.
    #
    # On return, {o['sortBy']} is 0 to sort by item, or a file
    # index in {1..NF}; and {o['sortBy_dir']} is {+1} or {-1}.
    #
    pp = argparser.ArgParser(sys.argv, sys.stderr, PROG_HELP, PROG_INFO)

    # Defaults:
    inDir = "."
    sortBy = 1
    sortBy_dir = -1
    language = "plain"
    maxLines = 999999999
    types = []
    files = []

    if pp.keyword_present("-inDir"):
        inDir = pp.get_next(mayBeKeyword = False)

    if pp.keyword_present("-maxLines"):
        maxLines = pp.get_next_int(1, 999999999)
        assert isinstance(maxLines, int)
        if maxLines < 1: pp.error(f"invalid -maxLines {maxLines}")

    if pp.keyword_present("-language"):
        language = pp.get_next(mayBeKeyword = False)
        if language != "plain" and language != "tex" and language != "html":
            pp.error(f"invalid -language '{language}'")

    if pp.keyword_present("-sortBy"):
        if pp.keyword_present_next("item"):
            sortBy = 0
        else:
            # The upper limit {NF+1} is checked later, once {NF} is known:
            sortBy = pp.get_next_int(1, 999999999)
            if sortBy < 1: pp.error(f"invalid sort column {sortBy}")
        if pp.keyword_present_next("increasing"):
            sortBy_dir = +1
        elif pp.keyword_present_next("decreasing"):
            sortBy_dir = -1
        else:
            # Default is increasing for "item", decreasing for count:
            sortBy_dir = +1 if sortBy == 0 else -1

    while pp.keyword_present("-type"):
        ty = pp.get_next(mayBeKeyword = False)
        if ty != "raw" and ty != "relative" and ty != "rank":
            pp.error(f"invalid column type '{ty}'")
        fmt = pp.get_next(mayBeKeyword = False)
        # The format is "{DIGITS}" or "{DIGITS}.{PREC}":
        m = re.fullmatch(r"([0-9]+)(|[.][0-9]+)", fmt)
        if m is None:
            pp.error(f"invalid column format '{fmt}'")
        else:
            size = max(1, int(m.group(1)))
            prec = m.group(2)
            prec = 0 if prec == "" else int(prec[1:])
            types.append((ty,size,prec))
    if len(types) == 0:
        types = [ ( 'raw', 8, 6 ) ]

    pp.skip_parsed()

    # All remaining arguments are file names:
    assert pp.next <= len(sys.argv), "prog error: argc"
    NF = len(sys.argv) - pp.next
    for f in range(NF):
        files.append(pp.get_next(mayBeKeyword = False))
    if NF == 0: pp.error("no input files?")

    # Finish checking range of {sortBy}:
    if sortBy == NF+1:
        # Column {NF+1} is the item column:
        sortBy = 0
    elif sortBy > NF:
        pp.error(f"invalid sorting file index {sortBy}")

    o = dict\
        ( inDir = inDir,
          maxLines = maxLines,
          sortBy = sortBy,
          sortBy_dir = sortBy_dir,
          language = language
        )
    return o, types, files
    # ----------------------------------------------------------------------

if __name__ == "__main__":
    sys.exit(main())