#! /usr/bin/python3
# Last edited on 2025-07-31 11:57:23 by stolfi
from math import fabs, exp, log, inf, isfinite, floor
import sys, re
from ivtff_format import line_loop, strip_comments
from process_frac_words import enum_words_in_text
import argparser
# Program name, as shown in the synopsis.
PROG_NAME = "tabulate_frac_counts"

# Copyright notice shown in the "RIGHTS" section of the manual.
PROG_COPYRIGHT = "Copyright © 2025 by the State University of Campinas"

# Command line synopsis.  The "-language" alternatives listed here must
# be the ones accepted by {parse_options}, namely "plain", "html", "tex".
PROG_HELP = \
    " " + PROG_NAME + "\\" + \
    """
[ -inDir IN_DIR ] \\
[ -type { raw | relative | rank } DIGITS[.PREC] ] \\
[ -sortBy { item | IXSORT } [ increasing | decreasing ] ] \\
[ -language { plain | html | tex } ] \\
[ -maxLines MAX_LINES ] \\""" + \
    "\n" + \
    argparser.help_info_HELP + \
    """
FNAME[1] FNAME[2] ... FNAME[NF]"""
# Full manual page text: synopsis, description, options, and
# boilerplate sections.  Keep the option descriptions in agreement
# with {parse_options} and with the formatting functions.
PROG_INFO = \
    "SYNOPSIS\n" + \
    PROG_HELP + "\n" + \
    "\n" + \
    "DESCRIPTION" + \
    """
Reads a bunch of files {F[f]}="{IN_DIR}/{FNAME[f]}.fct" with fractional
counts of arbitrary items where each line has two fields "COUNT ITEM",
and writes a table of those counts to {stdout}.
The {COUNT} is any non-negative float value, and the {ITEM}
is any string of one or more non-blank characters.
The output will have {NT*NF+1} columns, where {NT} is the number of
times the "-type" option was specified (or 1 if it was never
specified). Each group {G[f]} of {NT} columns {f*NT..f*NT+NT-1}, for
{f} in {0..NF-1}, are the {COUNT}s read from file {F[f+1]},
transformed in {NT} ways. The last column is an {ITEM} read from those
files.
Each occurrence of the "-type" keyword is assigned a successive column
in each group {G[f]}. That column will contain the {COUNT} from file
{F[f]} transformed as specified by the argument of that "-type" keyword.
All values in each output line will refer to the same {ITEM} shown on
the last column.
The inputs are implicitly joined by the {ITEM} column. That is, every
item that appears in one or more of the input files will be shown in
the output. If an {ITEM} does not appear in some file, it is assumed
to have zero {COUNT} in that file.\n""" + \
    "\n" + \
    "OPTIONS" + \
    """
-inDir IN_DIR
This optional keyword specifies the directory where the input files
reside. If not specified, "./" is assumed.
-type { raw | relative | rank } DIGITS[.PREC]
This optional keyword specifies how the {COUNT} read from each file
should be transformed before being written in the column. If the
first argument is "raw", the raw {COUNT} from the file will be
shown. If it is "relative", the {COUNT} will be divided by the sum
of all {COUNT}s from that file. If it is "rank", the values shown
will be the rank (starting from 0) of the {COUNT} among all {COUNT}s
in the file.
The value will be printed with format "%{DIGITS}.{PREC}f" or
"%{DIGITS}d" if the ".{PREC}" part is omitted. However {DIGITS} will
be implicitly increased if needed to accommodate the max number of
integer digits present in the input. If the ".{PREC}" part is given,
trailing zeros in the value will be replaced by spaces. In this
case, if the value is exactly zero, only a period "." will be
printed, otherwise there will be at least a leading "0.". If the
".{PREC}" part is omitted, and a "raw" or "relative" value is exactly
zero, the field will be left blank instead of showing "0".
This option can be repeated; for instance, "-type raw 6 -type
relative 8.6" will show two columns of numbers for each file, one
with the raw counts in format "%6d" and one with the relative ones
in format "%8.6f". If no "-type" option is specified, the program
assumes one "-type raw 8.6".
-sortBy { item | IXSORT } [ increasing | decreasing ]
This optional keyword specifies how the output lines should be
sorted. If the first argument is "item" or {NF+1}, the lines will be
sorted alphabetically by the {ITEM} column. If the first argument is
an integer in {1..NF}, the lines will be sorted by the value of
{COUNT} in {F[IXSORT]} (breaking ties by increasing {ITEM}). The
second argument specifies the sorting direction; if omitted,
"increasing" is assumed for the {ITEM} column, and "decreasing" for
any {COUNT} column. If no "-sortBy" is specified, "-sortBy 1
decreasing" is assumed.
-language { plain | html | tex }
This optional keyword specifies the output table language.
If not specified, the program assumes "-language plain".
The "plain" option will print values and items using spaces
for alignment.
The "html" option will write each line as a "<tr>...</tr>"
construct, suitable for inclusion in a "<table>" entity.
Non-breaking spaces "&nbsp;" will be used where needed to ensure
proper alignment.
The "tex" option will print each line in a format suitable for
inclusion in a TeX/LaTeX "\\begin{table}...\\end{table}" construct.
Namely, columns will be separated by " & ", the line will end with
"\\\\", and each field will be formatted as the argument of a macro:
"\\iv{...}" for integer values, "\\fv{...}" for float values, and
"\\ev{...}" for the items. Trailing zeros in float values will be
replaced by "~".
-maxLines MAX_LINES
This optional keyword instructs the program to truncate
the output after {MAX_LINES}. If omitted, all lines will be
written.\n""" + \
    "\n" + \
    "DOCUMENTATION OPTIONS\n" + \
    argparser.help_info_INFO + "\n" + \
    "\n" + \
    "SEE ALSO\n" + \
    " Look deep into your soul; what do you see?\n" + \
    "\n" + \
    "AUTHOR\n" + \
    " Created 2025-07-30 by Jorge Stolfi, IC-UNICAMP.\n" + \
    "\n" + \
    "MODIFICATION HISTORY\n" + \
    " All entries by the author above unless indicated otherwise.\n" + \
    " 2025-07-30 Created.\n" + \
    "\n" + \
    "WARRANTY\n" + \
    " " + argparser.help_info_NO_WARRANTY + "\n" + \
    "\n" + \
    "RIGHTS\n" + \
    " " + PROG_COPYRIGHT + ".\n" + \
    "\n" + \
    " " + argparser.help_info_STANDARD_RIGHTS
def main():
    # Program entry point: parses the command line, reads and joins the
    # count files, sorts the rows, derives the requested columns, and
    # writes the table to {stdout}.  Returns 0.
    #
    o, types, files = parse_options()
    NF = len(files)
    tb = read_raw_counts(o['inDir'], files)
    tb = sort_table(tb, o['sortBy'], o['sortBy_dir'])
    # The "relative" and "rank" columns are defined with respect to ALL
    # counts of each file, so they must be computed from the complete
    # table; only then the output is truncated to {maxLines} rows:
    tbf = make_full_table(tb, types)
    tbf = tbf[0:o['maxLines']]
    # Column widths only need to fit the rows that will be printed:
    types = fix_col_widths(types, NF, tbf)
    write_table(tbf, NF, types, o['language'])
    sys.stdout.flush()
    return 0
# ----------------------------------------------------------------------
def fix_col_widths(types, NF, tbf):
    # Increases the {size} format parameter of each column type as
    # needed to fit the largest values in the expanded table {tbf}.
    #
    # Each element of {types} is a triple {(ty, size, prec)}.  Returns
    # a new list with the same {ty} and {prec} but possibly larger
    # {size}.  A warning is written to {stderr} for each type whose
    # {size} had to be increased.
    #
    # Works also when {tbf} is empty; in that case {size} is only
    # raised to {prec+2} (room for a digit and the period) if needed.
    #
    NT = len(types)
    NR = len(tbf)
    types_adj = []  # Adjusted {types}.
    for t in range(NT):
        ty, size_given, prec = types[t]
        size = size_given
        if prec > 0: size = max(size, prec + 2)
        for f in range(NF):
            j = 1 + f*NT + t  # Column of {tbf} with type {t} of file {f}.
            for r in range(NR):
                val = tbf[r][j]
                assert val >= 0, "prog error: negative val"
                # The width of the num should be the same in all languages:
                vsize = len(format_num(val, ty, size, prec, " "))
                size = max(size, vsize)
        if size > size_given:
            # Report the final {size}, which is defined even if no
            # value was formatted:
            sys.stderr.write(f"!! width of {ty} columns increased to {size}\n")
        types_adj.append((ty, size, prec))
    return types_adj
# ----------------------------------------------------------------------
def sort_table(tb, sortBy, sdir):
    # Sorts the rows of the table {tb} in place, as specified by
    # {sortBy} and {sdir}, and returns {tb} itself.
    #
    # Each row is a list whose element 0 is the item and whose
    # elements {1..NF} are the counts from each file.
    #
    # If {sortBy} is 0, the rows are ordered by item; {sdir} is {+1}
    # for increasing or {-1} for decreasing order.
    #
    # Otherwise {sortBy} must be in {1..NF} and the rows are ordered
    # by that count column, in the direction {sdir}, with ties always
    # broken by increasing item.
    #
    if not tb: return tb
    num_counts = len(tb[0]) - 1
    assert num_counts >= 1, "prog error: NF"
    assert sdir in (-1, +1), "prog error: sdir"
    if sortBy == 0:
        tb.sort(key = lambda entry: entry[0], reverse = (sdir < 0))
        return tb
    assert 1 <= sortBy <= num_counts, "prog error: sortBy"
    # Primary key is the signed count, secondary key is the item:
    tb.sort(key = lambda entry: (entry[sortBy]*sdir, entry[0]))
    return tb
# ----------------------------------------------------------------------
def make_full_table(tb, types):
    # Expands the raw table {tb}, which has one item column and {NF}
    # count columns, into a table {tbf} with one item column and
    # {NF*NT} value columns, where {NT} is {len(types)}.
    #
    # Element {tbf[r][1+f*NT+t]} is the raw count {tb[r][f+1]}
    # transformed according to {types[t][0]}, which may be "raw",
    # "relative", or "rank".  Element {tbf[r][0]} is the item {tb[r][0]}.
    #
    # If {tb} has no rows, returns {tb} itself.
    #
    if len(tb) == 0: return tb
    num_files = len(tb[0]) - 1
    num_types = len(types)
    # Start with the item column followed by zeros:
    tbf = [ [ row[0] ] + [ 0 ]*(num_types*num_files) for row in tb ]
    for f in range(num_files):
        # Raw counts that came from file {f}:
        raw = [ row[f+1] for row in tb ]
        for t, spec in enumerate(types):
            kind = spec[0]
            if kind == "raw":
                col = raw
            elif kind == "relative":
                col = make_counts_relative(raw)
            elif kind == "rank":
                col = ranks_from_counts(raw)
            j = 1 + f*num_types + t  # Destination column in {tbf}.
            for out_row, value in zip(tbf, col): out_row[j] = value
    return tbf
# ----------------------------------------------------------------------
def make_counts_relative(raw):
    # Returns a new list with the elements of {raw} divided by their
    # total, so that they add to 1.  If the total is zero, the
    # elements are divided by 1 instead.
    total = 0.0
    for count in raw: total = total + count
    divisor = total if total != 0 else 1
    return [ count/divisor for count in raw ]
# ----------------------------------------------------------------------
def ranks_from_counts(raw):
    # Converts the list of numbers {raw} into a list {rank} of the
    # same length, where {rank[i]} is the position of {raw[i]} when
    # the elements are arranged in DECREASING order, starting from 0.
    #
    # Equal elements all get the same rank, namely the average of the
    # positions they occupy.  That is, {rank[i]} is the number of
    # elements strictly greater than {raw[i]} plus half the number of
    # other elements equal to {raw[i]}.  The ranks are floats.
    #
    n = len(raw)
    order = sorted(range(n), key = lambda i: raw[i], reverse = True)
    rank = [ None ]*n
    start = 0  # First position of the current run of tied elements.
    while start < n:
        # Find the end {stop} of the run of values equal to the one at {start}:
        stop = start + 1
        while stop < n and raw[order[start]] == raw[order[stop]]:
            stop += 1
        # Positions {start..stop-1} are tied; use their mean position:
        tied_rank = (start + stop - 1)/2
        for k in range(start, stop): rank[order[k]] = tied_rank
        start = stop
    return rank
# ----------------------------------------------------------------------
def write_table(tbf, NF, types, language):
    # Writes the expanded table {tbf} to {stdout}, one row at a time,
    # in the format selected by {language}, which must be "plain",
    # "tex", or "html".  Does nothing if {tbf} has no rows.
    #
    # The parameters {NF} and {types} are passed on to the row writers.
    #
    if len(tbf) == 0: return
    # Choose the row writer for the requested language:
    if language == "plain":
        write_row = write_table_row_plain
    elif language == "tex":
        write_row = write_table_row_tex
    elif language == "html":
        write_row = write_table_row_html
    else:
        assert False, "prog error: language"
    for row in tbf: write_row(row, NF, types)
    sys.stdout.flush()
    return
# ----------------------------------------------------------------------
def write_table_row_plain(row, NF, types):
    # Writes one row of the expanded table to {stdout} in the "plain"
    # format: numbers in plain ASCII padded with blanks, values of the
    # same file separated by " | ", groups of different files
    # separated by " ||", and the item, as-is, after a final " || ".
    #
    NT = len(types)
    pieces = [ " " ]
    for f in range(NF):
        if f > 0: pieces.append(" ||")
        for t, (ty, size, prec) in enumerate(types):
            if t > 0: pieces.append(" | ")
            j = 1 + f*NT + t  # Column of {row} to print.
            pieces.append(format_num(row[j], ty, size, prec, " "))
    pieces.append(f" || {row[0]}\n")
    sys.stdout.write("".join(pieces))
    return
# ----------------------------------------------------------------------
def write_table_row_html(row, NF, types):
    # Writes one row of the expanded table to {stdout} in the "html"
    # format, namely as a single line "<tr><td>...</td>...</tr>" with
    # one "<td>" cell per value and a last cell with the item.
    #
    # Trailing zeros of fractions are replaced by "&nbsp;" so that
    # they keep their width.  The item is HTML-escaped, since it may
    # contain "<", ">", or "&".
    #
    # NOTE(review): the markup in the original text of this function
    # was lost; it was reconstructed from the description of the
    # "html" option in {PROG_INFO} -- confirm tag attributes, if any.
    #
    import html  # Local import; needed only for this output language.
    NT = len(types)
    cells = [ ]
    for f in range(NF):
        for t in range(NT):
            ty, size, prec = types[t]
            j = 1 + f*NT + t  # Column of {row} to print.
            vstr = format_num(row[j], ty, size, prec, "&nbsp;")
            cells.append(f"<td>{vstr}</td>")
    cells.append(f"<td>{html.escape(row[0])}</td>")
    sys.stdout.write("  <tr>" + "".join(cells) + "</tr>\n")
    return
# ----------------------------------------------------------------------
def write_table_row_tex(row, NF, types):
    # Writes one row of the expanded table to {stdout} in the "tex"
    # format: fields separated by " & " and the line ended by " \\".
    # Each value is the argument of "\fv" if its type has a positive
    # precision, else of "\iv", with trailing zeros replaced by "~".
    # The item is the argument of "\ev", with any braces removed.
    #
    NT = len(types)
    for f in range(NF):
        for t, (ty, size, prec) in enumerate(types):
            col = 1 + f*NT + t  # Column of {row} to print.
            text = format_num(row[col], ty, size, prec, "~")
            macro = r"\iv" if prec <= 0 else r"\fv"
            sep = " & " if col > 1 else ""
            sys.stdout.write(f"{sep}{macro}{{{text}}}")
    # Braces in the item would unbalance the macro argument:
    item = re.sub(r"[{}]", "", row[0])
    sys.stdout.write(r" & \ev" + f"{{{item}}}" + r" \\" + "\n")
    return
# ----------------------------------------------------------------------
def format_num(val, ty, size, prec, pad):
    # Formats the number {val} of a column of type {ty} with width
    # {size} and precision {prec}, using {pad} in place of removed
    # trailing zeros.  Ranks are handled by {format_rank}, any other
    # type by {format_count}.
    formatter = format_rank if ty == "rank" else format_count
    return formatter(val, ty, size, prec, pad)
# ----------------------------------------------------------------------
def format_count(val, ty, size, prec, pad):
    # Formats the count {val} of a "raw" or "relative" column as
    # "%{size}d" if {prec} is zero, else as "%{size}.{prec}f".
    #
    # A zero {val} is special: if {prec} is zero the result is {size}
    # blanks; otherwise it is {size-prec-1} blanks, a period '.', and
    # {prec} copies of {pad}.
    #
    # In the fraction format, if the formatted string ends with "0"
    # and its relative difference from {val} is less than
    # {0.1**(prec+2)}, the trailing zeros (and the period, if the
    # whole fraction goes away) are replaced by copies of {pad}.
    #
    assert ty in ("raw", "relative"), "prog_error: ty"
    if val == 0:
        if prec == 0:
            # Leave the entry blank:
            return " " * size
        # Print only the period:
        assert size >= prec+2, "prog error: size"
        return (" " * (size-prec-1)) + "." + (pad * prec)
    if prec == 0:
        # Integer format (the "%d" conversion truncates a float):
        return "%*d" % (size, val)
    # Fraction format:
    vstr = "%*.*f" % (size, prec, val)
    if vstr.endswith("0"):
        # Strip the zeros only if they are not hiding significant digits:
        rel_err = fabs((val - float(vstr))/val)
        if rel_err < 0.1**(prec + 2):
            vstr = remove_trailing_zeros(vstr, pad)
    return vstr
# ----------------------------------------------------------------------
def format_rank(val, ty, size, prec, pad):
    # Formats the rank {val} as "%{size}d" if {prec} is zero, else as
    # "%{size}.{prec}f".  The type {ty} must be "rank".
    #
    # In the fraction format, trailing zeros are always replaced by
    # copies of {pad}, without checking how close the printed value
    # is to {val}.  If the whole fraction part is replaced, the
    # period '.' is replaced too.
    assert ty == "rank", "prog_error: ty"
    if prec == 0:
        # Integer format (the "%d" conversion truncates a float):
        return "%*d" % (size, val)
    # Fraction format:
    vstr = "%*.*f" % (size, prec, val)
    if vstr.endswith("0"):
        vstr = remove_trailing_zeros(vstr, pad)
    return vstr
# ----------------------------------------------------------------------
def remove_trailing_zeros(vstr, pad):
    # Returns {vstr} with its final run of '0' characters replaced by
    # one copy of {pad} for each.  A period '.' immediately before
    # that run is replaced by {pad} too.  If {vstr} does not end with
    # a zero, returns it unchanged.
    found = re.search(r"([.]|)[0]+$", vstr)
    if found is None:
        return vstr
    start, end = found.span(0)
    return vstr[:start] + pad * (end - start)
# ----------------------------------------------------------------------
def read_raw_counts(inDir, files):
    # Reads the files {inDir + "/" + files[f]}, for {f} in {0..NF-1},
    # with {read_count_file}, and joins their contents by item into a
    # table {tb}, which is a list of lists.
    #
    # Element {tb[r][0]} is an item that occurs in at least one file,
    # and {tb[r][f+1]} is its count in {files[f]}, or {0.0} if it
    # does not occur there.  Rows are in order of first appearance.
    #
    # NOTE(review): {PROG_INFO} describes the inputs as
    # "{IN_DIR}/{FNAME[f]}.fct", but no ".fct" extension is appended
    # here -- confirm which one is intended.
    #
    num_files = len(files)
    tb = [ ]
    row_of = {}  # Maps each item to its row index in {tb}.
    for f, fname in enumerate(files):
        counts = read_count_file(inDir + "/" + fname)
        for itm, count in counts.items():
            r = row_of.get(itm)
            if r is None:
                # First time this item is seen; start a new row of zeros:
                r = len(tb)
                row_of[itm] = r
                tb.append([ itm ] + [0.0] * num_files)
            tb[r][f+1] += count
    return tb
# ----------------------------------------------------------------------
def read_count_file(fname):
    # Reads the file {fname} of fractional item counts and returns a
    # dict {item_to_count} that maps each item to its count.
    #
    # Each data line of {fname} must have two fields "{COUNT} {ITEM}"
    # where {COUNT} is a non-negative finite decimal number, with
    # optional fraction and exponent, and {ITEM} is a non-empty string
    # without spaces or control characters (tabs, CR, etc).
    #
    # There must be at least one blank between the two.  Leading and
    # trailing blanks are ignored, as well as blank lines and
    # '#'-comments (so an {ITEM} cannot contain '#').
    #
    # If an item occurs two or more times, its counts are added
    # together.  User beware of underflow, overflow, and roundoff
    # errors.
    #
    # A malformed line, a negative count, or a non-finite count is
    # reported through {data_error}.  The file is closed even if an
    # error occurs.
    #
    pat_count = r"[-+]?[0-9]+(|[.][0-9]*)(|[Ee][-+]?[0-9]+)"
    pat_item = "[^ \000-\037]+"
    # Compile once; groups 1 and 4 are the count and the item:
    pat = re.compile(f"({pat_count})[ ]+({pat_item})")
    assert pat.groups == 4, "prog error (pattern)"
    item_to_count = {}
    with open(fname, "r") as rd:
        for nread, line in enumerate(rd, start = 1):
            line = re.sub(r"[#].*$", "", line)
            line = line.strip()
            if line == "": continue
            m = pat.fullmatch(line)
            if m is None: data_error(fname, nread, "bad line format", line)
            count = float(m.group(1))
            if count < 0: data_error(fname, nread, "negative count", line)
            # A huge exponent may overflow to infinity:
            if not isfinite(count): data_error(fname, nread, "invalid count", line)
            item = m.group(4)
            if item in item_to_count:
                item_to_count[item] += count
            else:
                item_to_count[item] = count
    return item_to_count
# ----------------------------------------------------------------------
def data_error(fname, nread, msg, line):
    # Reports an error found on line number {nread} of the input file
    # {fname}: writes the message {msg} and the offending {line} to
    # {stderr}, then raises {AssertionError}.  Never returns.
    #
    # The exception is raised explicitly, rather than with an
    # {assert} statement, so that bad data still stops the program
    # when Python runs with "-O".
    sys.stderr.write(f"{fname}:{nread}: ** {msg}\n")
    sys.stderr.write(f" [[{line}]]\n")
    raise AssertionError(f"{fname}:{nread}: {msg}")
# ----------------------------------------------------------------------
def parse_options():
    # Parses the command line arguments in {sys.argv} as described in
    # {PROG_HELP} and {PROG_INFO}.
    #
    # Returns a tuple {(o, types, files)} where
    #
    #   {o} is a dict with keys 'inDir', 'maxLines', 'sortBy',
    #   'sortBy_dir', and 'language';
    #
    #   {types} is a list of triples {(ty, size, prec)}, one for each
    #   "-type" option, or {[('raw', 8, 6)]} if there was none;
    #
    #   {files} is the list of file name arguments.
    #
    # In {o}, 'sortBy' is 0 to sort by item, or the 1-based index of
    # the file whose counts are the sort key; 'sortBy_dir' is {+1}
    # for increasing and {-1} for decreasing order.
    #
    # Errors are reported with {pp.error}, which is presumed not to
    # return -- TODO confirm against the {argparser} module.
    #
    pp = argparser.ArgParser(sys.argv, sys.stderr, PROG_HELP, PROG_INFO)

    # Defaults:
    inDir = "."
    sortBy = 1
    sortBy_dir = None  # Not given; resolved once {sortBy} is final.
    language = "plain"
    maxLines = 999999999
    types = []
    files = []

    if pp.keyword_present("-inDir"):
        inDir = pp.get_next(mayBeKeyword = False)

    if pp.keyword_present("-maxLines"):
        maxLines = pp.get_next_int(1, 999999999)
    assert type(maxLines) is int
    if maxLines < 1:
        pp.error(f"invalid -maxLines {maxLines}")

    if pp.keyword_present("-language"):
        language = pp.get_next(mayBeKeyword = False)
    if language != "plain" and language != "tex" and language != "html":
        pp.error(f"invalid -language '{language}'")

    if pp.keyword_present("-sortBy"):
        if pp.keyword_present_next("item"):
            sortBy = 0
        else:
            sortBy = pp.get_next_int(1, 999999999)
            if sortBy < 1:
                pp.error(f"invalid sort column {sortBy}")
        if pp.keyword_present_next("increasing"):
            sortBy_dir = +1
        elif pp.keyword_present_next("decreasing"):
            sortBy_dir = -1

    while pp.keyword_present("-type"):
        ty = pp.get_next(mayBeKeyword = False)
        if ty != "raw" and ty != "relative" and ty != "rank":
            pp.error(f"invalid column type '{ty}'")
        fmt = pp.get_next(mayBeKeyword = False)
        m = re.fullmatch(r"([0-9]+)(|[.][0-9]+)", fmt)
        if m is None:
            pp.error(f"invalid column format '{fmt}'")
        else:
            size = max(1, int(m.group(1)))
            prec = m.group(2)
            prec = 0 if prec == "" else int(prec[1:])
            types.append((ty, size, prec))
    if len(types) == 0:
        types = [ ( 'raw', 8, 6 ) ]

    pp.skip_parsed()

    # All remaining arguments are file names:
    assert pp.next <= len(sys.argv), "prog error: argc"
    NF = len(sys.argv) - pp.next
    for f in range(NF):
        files.append(pp.get_next(mayBeKeyword = False))
    if NF == 0:
        pp.error("no input files?")

    # Finish checking range of {sortBy}; index {NF+1} means the item column:
    if sortBy == NF+1:
        sortBy = 0
    elif sortBy > NF:
        pp.error(f"invalid sorting file index {sortBy}")

    # The default direction depends on the final sort column, so that
    # "-sortBy {NF+1}" defaults to increasing just like "-sortBy item":
    if sortBy_dir is None:
        sortBy_dir = +1 if sortBy == 0 else -1

    o = dict\
        ( inDir = inDir, maxLines = maxLines, sortBy = sortBy,
          sortBy_dir = sortBy_dir, language = language )
    return o, types, files
# ----------------------------------------------------------------------
# Run only when executed as a script, not when imported as a module,
# and pass the result of {main} to the shell as the exit status.
if __name__ == "__main__":
    sys.exit(main())