#! /usr/bin/python3
# Last edited on 2025-07-31 11:57:23 by stolfi

from math import fabs, exp, log, inf, isfinite, floor
import sys, re
from ivtff_format import line_loop, strip_comments
from process_frac_words import enum_words_in_text
import argparser

PROG_NAME = "tabulate_frac_counts"
PROG_COPYRIGHT = "Copyright © 2025 by the State University of Campinas"

# Command-line synopsis. It is embedded in {PROG_INFO} and handed to
# the argument parser by {parse_options}. The "-language" choices
# listed here must match the ones accepted by {parse_options}
# ("plain", "html", "tex").
PROG_HELP = \
  "  " + PROG_NAME + "\\" + \
  """
    [ -inDir IN_DIR ] \\
    [ -type { raw | relative | rank } DIGITS[.PREC] ] \\
    [ -sortBy { item | IXSORT } [ increasing | decreasing ] ] \\
    [ -language { plain | html | tex } ] \\
    [ -maxLines MAX_LINES ] \\""" + \
  "\n" + \
  argparser.help_info_HELP + \
  """
    FNAME[1] FNAME[2] ... FNAME[NF]"""

# Full manual text of the program. It is handed to the argument parser
# by {parse_options}, together with {PROG_HELP}. The statements about
# defaults and zero values below are meant to describe what
# {parse_options} and {format_count} actually do.
PROG_INFO = \
  "SYNOPSIS\n" + \
  PROG_HELP + "\n" + \
  "\n" + \
  "DESCRIPTION" + \
  """
  Reads a bunch of files {F[f]}="{IN_DIR}/{FNAME[f]}.fct" with fractional
  counts of arbitrary items where each line has two fields "COUNT ITEM",
  and writes a table of those counts to {stdout}.

  The {COUNT} is any non-negative float value, and the {ITEM}
  is any string of one or more non-blank characters.

  The output will have {NT*NF+1} columns, where {NT} is the number of
  times the "-type" option was specified (or 1 if it was never
  specified). Each group {G[f]} of {NT} columns {f*NT..f*NT+NT-1}, for
  {f} in {0..NF-1}, are the {COUNT}s read from file {F[f+1]},
  transformed in {NT} ways. The last column is an {ITEM} read from those
  files.

  Each occurrence of the "-type" keyword is assigned a successive column
  in each group {G[f]}. That column will contain the {COUNT} from file
  {F[f]} transformed as specified by the argument of that "-type" keyword.

  All values in each output line will refer to the same {ITEM} shown on
  the last column.

  The inputs are implicitly joined by the {ITEM} column. That is, every
  item that appears in one or more of the input files will be shown in
  the output. If an {ITEM} does not appear in some file, it is assumed
  to have zero {COUNT} in that file.\n""" + \
  "\n" + \
  "OPTIONS" + \
  """ 
  -inDir IN_DIR
    This optional keyword specifies the directory where the input files
    reside. If not specified, "./" is assumed.
    
  -type { raw | relative | rank } DIGITS[.PREC] 
    This optional keyword specifies how the {COUNT} read from each file
    should be transformed before being written in the column. If the
    first argument is "raw", the raw {COUNT} from the file will be
    shown. If it is "relative", the {COUNT} will be divided by the sum
    of all {COUNT}s from that file. If it is "rank", the values shown
    will be the rank (starting from 0) of the {COUNT} among all {COUNT}s
    in the file. 
    
    The value will be printed with format "%{DIGITS}.{PREC}f" or
    "%{DIGITS}d" if the ".{PREC}" part is omitted. However {DIGITS} will
    be implicitly increased if needed to accommodate the max number of
    integer digits present in the input. If the ".{PREC}" part is given,
    trailing zeros in the value will be replaced by spaces. In this
    case, if the value is exactly zero, only a period "." will be
    printed, otherwise there will be at least a leading "0.". If the
    ".{PREC}" part is omitted, and a "raw" or "relative" value is
    exactly zero, the field will be left blank instead of showing "0".
    
    This option can be repeated; for instance, "-type raw 6 -type
    relative 8.6" will show two columns of numbers for each file, one
    with the raw counts in format "%6d" and one with the relative ones
    in format "%8.6f". If no "-type" option is specified, the program
    assumes one "-type raw 8.6".
    
  -sortBy { item | IXSORT } [ increasing | decreasing ] 
    This optional keyword specifies how the output lines should be
    sorted. If the first argument is "item" or {NF+1}, the lines will be
    sorted alphabetically by the {ITEM} column. If the first argument is
    an integer in {1..NF}, the lines will be sorted by the value of
    {COUNT} in {F[IXSORT]} (breaking ties by increasing {ITEM}). The
    second argument specifies the sorting direction; if omitted,
    "increasing" is assumed for the {ITEM} column, and "decreasing" for
    any {COUNT} column. If no "-sortBy" is specified, "-sortBy 1
    decreasing" is assumed.
    
  -language { plain | html | tex }
    This optional keyword specifies the output table language. 
    If not specified, the program assumes "-language plain".
    
    The "plain" option will print values and items using spaces 
    for alignment. 
    
    The "html" option will write each line as a "<tr>...</tr>"
    construct, suitable for inclusion in a "<table>...</table>" entity.
    Non-breaking spaces "&nbsp;" will be used where needed to ensure
    proper alignment.
    
    The "tex" option will print each line in a format suitable for
    inclusion in a TeX/LaTeX "\\begin{table}...\\end{table}" construct.
    Namely, columns will be separated by " & ", the line will end with
    "\\\\", and each field will be formatted as the argument of a macro:
    "\\iv{...}" for integer values, "\\fv{...}" for float values, and
    "\\ev{...}" for the items. Trailing zeros in float values will be
    replaced by "~".
    
  -maxLines MAX_LINES
    This optional keyword instructs the program to truncate 
    the output after {MAX_LINES}. If omitted, all lines will be
    written.\n""" + \
  "\n" + \
  "DOCUMENTATION OPTIONS\n" + \
  argparser.help_info_INFO +  "\n" + \
  "\n" + \
  "SEE ALSO\n" + \
  "  Look deep into your soul; what do you see?\n" + \
  "\n" + \
  "AUTHOR\n" + \
  "  Created 2025-07-30 by Jorge Stolfi, IC-UNICAMP.\n" + \
  "\n" + \
  "MODIFICATION HISTORY\n" + \
  "  All entries by the author above unless indicated otherwise.\n" + \
  "  2025-07-30 Created.\n" + \
  "\n" + \
  "WARRANTY\n" + \
  "  " + argparser.help_info_NO_WARRANTY +  "\n" + \
  "\n" + \
  "RIGHTS\n" + \
  "  "  + PROG_COPYRIGHT +  ".\n" + \
  "\n" + \
  "  "  + argparser.help_info_STANDARD_RIGHTS

def main():
  # Program entry point. Parses the command line, reads and joins the
  # input count files, sorts the joined table, expands each count
  # column into the requested derived columns, and writes the first
  # {maxLines} rows of the result to {stdout}. Returns 0.
  #
  o, types, files = parse_options()
  NF = len(files)
  tb = read_raw_counts(o['inDir'], files)
  tb = sort_table(tb, o['sortBy'], o['sortBy_dir'])
  # The "relative" and "rank" columns must be derived from ALL the
  # counts of each file, so the table is expanded first and only then
  # truncated to the requested number of output lines:
  tbf = make_full_table(tb, types)
  tbf = tbf[0: o['maxLines']]
  # Column widths only need to fit the rows that will be printed:
  types = fix_col_widths(types, NF, tbf)
  write_table(tbf, NF, types, o['language'])
  sys.stdout.flush()
  return 0
  # ----------------------------------------------------------------------

def fix_col_widths(types, NF, tbf):
  # Returns a copy of the list {types} of triples {(ty, size, prec)}
  # where each {size} has been increased as needed to fit the widest
  # formatted value of the corresponding columns of the expanded
  # table {tbf}, in all {NF} file groups.
  #
  # If {prec} is positive, {size} is first raised to at least {prec+2}
  # (one integer digit, the period, and {prec} fraction digits).
  #
  # Writes a warning to {stderr} for each entry whose {size} had
  # to be increased. The input list {types} is not modified.
  #
  NT = len(types)
  types_adj = [ ] # Adjusted {types}.
  for t in range(NT):
    ty, size, prec = types[t]
    if prec > 0: size = max(size, prec+2)
    for f in range(NF):
      j = 1 + f*NT + t # Index of the {tbf} column for file {f}, type {t}.
      for row in tbf:
        val = row[j]
        assert val >= 0, "prog error: negative val"
        # The width of the num should be the same in all languages:
        vsize = len(format_num(val, ty, size, prec, " "))
        size = max(size, vsize)
    if size > types[t][1]:
      # Report the final width, not the width of the last value seen:
      sys.stderr.write(f"!! width of {ty} columns increased to {size}\n")
    types_adj.append((ty, size, prec))
  return types_adj
  # ----------------------------------------------------------------------

def sort_table(tb, sortBy, sdir):
  # Sorts the rows of {tb} in place and returns {tb} itself.
  #
  # Each row is {[item, count[1], .., count[NF]]}. When {sortBy} is 0
  # the rows are ordered by the item (column 0), which is expected to
  # be unique. When {sortBy} is in {1..NF} the rows are ordered by
  # that count column, and rows with equal counts are ordered by
  # increasing item.
  #
  # The direction {sdir} must be {+1} (increasing) or {-1}
  # (decreasing); it applies to the primary sort column only.
  #
  if not tb: return tb
  ncounts = len(tb[0]) - 1
  assert ncounts >= 1, "prog error: NF"
  assert sdir in (-1, +1), "prog error: sdir"
  if sortBy == 0:
    tb.sort(key = lambda row: row[0], reverse = (sdir < 0))
    return tb
  assert 1 <= sortBy <= ncounts, "prog error: sortBy"
  # One pass with a compound key: signed count first, item as tie-breaker.
  tb.sort(key = lambda row: (row[sortBy]*sdir, row[0]))
  return tb
  # ----------------------------------------------------------------------
  
def make_full_table(tb, types):
  # Expands the joined table {tb} (rows {[item, count[0..NF-1]]})
  # into a table {tbf} with {NT = len(types)} derived columns per
  # file, so that {tbf} has {NF*NT+1} columns. Entry {tbf[r][0]} is
  # the item, and {tbf[r][1+f*NT+t]} is the count from file {f}
  # transformed as requested by {types[t][0]} ("raw", "relative",
  # or "rank").
  #
  # An empty {tb} is returned as is.
  #
  nrows = len(tb)
  if nrows == 0: return tb
  nfiles = len(tb[0]) - 1
  ntypes = len(types)

  # Every row starts as the item followed by zeros:
  tbf = [ [ row[0] ] + [ 0 ]*(ntypes*nfiles) for row in tb ]

  for f in range(nfiles):
    # Column of raw counts from file {f}:
    raw = [ row[f+1] for row in tb ]
    for t in range(ntypes):
      kind = types[t][0]
      if kind == "raw":
        col = raw
      elif kind == "relative":
        col = make_counts_relative(raw)
      elif kind == "rank":
        col = ranks_from_counts(raw)
      j = 1 + f*ntypes + t # Destination column in {tbf}.
      for r in range(nrows): tbf[r][j] = col[r]
  return tbf
  # ----------------------------------------------------------------------
  
def make_counts_relative(raw):
  # Returns a new list with the entries of {raw} divided by their
  # total, so that the result adds up to 1. If the total is zero
  # the entries are divided by 1 instead (so they remain zero).
  total = 0.0
  for count in raw:
    total = total + count
  if total == 0:
    total = 1
  return [ count/total for count in raw ]
  # ----------------------------------------------------------------------

def ranks_from_counts(raw):
  # Converts the list of floats {raw} into a list {rank} of the
  # same length, where {rank[i]} is the position of {raw[i]} when
  # the entries are listed in DECREASING order, starting from 0.
  #
  # Tied entries all get the average of the positions they occupy;
  # that is, {rank[i]} is the number of entries strictly greater
  # than {raw[i]} plus half the number of other entries equal to
  # {raw[i]}. Thus ranks are floats and may be half-integers.
  #
  n = len(raw)
  # Indices of {raw} in order of decreasing value:
  order = sorted(range(n), key = lambda i: raw[i], reverse = True)
  rank = [ None ]*n
  start = 0 # First position of the current run of tied values.
  while start < n:
    # Find the end {stop} of the run of values equal to {raw[order[start]]}:
    stop = start + 1
    while stop < n and raw[order[stop]] == raw[order[start]]:
      stop += 1
    # Positions {start..stop-1} share their mean position:
    tied_rank = (start + stop - 1)/2
    for k in range(start, stop):
      rank[order[k]] = tied_rank
    start = stop
  return rank
  # ----------------------------------------------------------------------
  
def write_table(tbf, NF, types, language):
  # Writes the rows of the expanded table {tbf} to {stdout}, one
  # output line per row, in the format selected by {language}
  # ("plain", "tex", or "html"). Does nothing if {tbf} is empty.
  # See {PROG_INFO} for the meaning of the parameters.
  #
  if len(tbf) == 0: return
  for row in tbf:
    if language == "plain":
      write_table_row_plain(row, NF, types)
    elif language == "tex":
      write_table_row_tex(row, NF, types)
    elif language == "html":
      write_table_row_html(row, NF, types)
    else:
      assert False, "prog error: language"
  sys.stdout.flush()
  return
  # ----------------------------------------------------------------------
  
def write_table_row_plain(row, NF, types):
  # Writes one row of the expanded table to {stdout} in the "plain"
  # format: a leading blank, then the numbers of each file group
  # separated by " | ", with " ||" between consecutive groups, and
  # finally " || " and the item as is.
  #
  NT = len(types)
  groups = [ ]
  for f in range(NF):
    cells = [ ]
    for t in range(NT):
      ty, size, prec = types[t]
      j = 1 + f*NT + t # Index of {row} entry to print.
      cells.append(format_num(row[j], ty, size, prec, " "))
    groups.append(" | ".join(cells))
  sys.stdout.write(" " + " ||".join(groups) + f" || {row[0]}\n")
  return
  # ----------------------------------------------------------------------
  
def write_table_row_html(row, NF, types):
  # Writes one row of the expanded table to {stdout} as an HTML table
  # row "<tr><td>..</td>..</tr>": one cell per number, then one cell
  # with the item.
  #
  # Numbers are formatted by {format_num} with "&nbsp;" as the padding
  # string. The item comes verbatim from the input files, so its
  # characters '&', '<' and '>' are escaped to keep the markup valid.
  #
  from html import escape
  NT = len(types)
  sys.stdout.write("  <tr>")
  for f in range(NF):
    for t in range(NT):
      ty, size, prec = types[t]
      j = 1 + f*NT + t # Index of {row} entry to print.
      vstr = format_num(row[j], ty, size, prec, "&nbsp;")
      # {vstr} may contain "&nbsp;" entities, so it must NOT be escaped:
      sys.stdout.write(f"<td>{vstr}</td>")
  sys.stdout.write(f"<td>{escape(row[0], quote = False)}</td>")
  sys.stdout.write("</tr>\n")
  return 
  # ----------------------------------------------------------------------
  
def write_table_row_tex(row, NF, types):
  # Writes one row of the expanded table to {stdout} as a TeX/LaTeX
  # table line: the cells are separated by " & " and the line ends
  # with " \\". Each number is wrapped in "\fv{..}" if its format
  # has a positive precision, else in "\iv{..}", with "~" as the
  # padding string. The item goes last, wrapped in "\ev{..}", with
  # any braces '{' and '}' deleted from it.
  #
  NT = len(types)
  cells = [ ]
  for f in range(NF):
    for t in range(NT):
      ty, size, prec = types[t]
      j = 1 + f*NT + t # Index of {row} entry to print.
      vstr = format_num(row[j], ty, size, prec, "~")
      mac = r"\fv" if prec > 0 else r"\iv"
      cells.append(f"{mac}{{{vstr}}}")
  item = re.sub(r"[{}]", "", row[0])
  line = " & ".join(cells) + r" & \ev" + f"{{{item}}}" + r" \\" + "\n"
  sys.stdout.write(line)
  return
  # ----------------------------------------------------------------------

def format_num(val, ty, size, prec, pad):
  # Formats the number {val} of a column of type {ty} with width
  # {size} and precision {prec}, using {pad} as the filler string.
  # Delegates to {format_rank} when {ty} is "rank", and to
  # {format_count} for any other {ty}.
  formatter = format_rank if ty == "rank" else format_count
  return formatter(val, ty, size, prec, pad)
  # ----------------------------------------------------------------------
  
def format_count(val, ty, size, prec, pad):
  # Formats a "raw" or "relative" count {val}.
  #
  # A zero {val} is shown as {size} blanks if {prec} is zero;
  # otherwise as {size-prec-1} blanks, a period '.', and {prec}
  # copies of {pad}.
  #
  # A nonzero {val} is formatted as "%{size}d" if {prec} is zero, else
  # as "%{size}.{prec}f". In the latter case, if the string ends with
  # '0' and the relative difference between its value and {val} is
  # below {0.1**(prec+2)}, the trailing zeros (and the period, if the
  # whole fraction goes away) are each replaced by a copy of {pad}.
  #
  assert ty == "raw" or ty == "relative", "prog_error: ty"

  if val == 0 and prec == 0:
    # Leave entry blank:
    return " " * size

  if val == 0:
    # Print only the period:
    assert size >= prec+2, "prog error: size"
    return (" " * (size-prec-1)) + "." + (pad * prec)

  if prec == 0:
    # Integer format:
    return "%*d" % (size, val)

  # Fraction format:
  vstr = "%*.*f" % (size, prec, val)
  if vstr.endswith("0"):
    # Drop the trailing zeros only if they carry no information:
    rel_err = fabs((val - float(vstr))/val)
    rel_tol = 0.1**(prec + 2)
    if rel_err < rel_tol:
      vstr = remove_trailing_zeros(vstr, pad)
  return vstr
  # ----------------------------------------------------------------------

def format_rank(val, ty, size, prec, pad):
  # Formats a rank {val} as "%{size}d" if {prec} is zero, else as
  # "%{size}.{prec}f".
  #
  # In the second case any trailing zeros are unconditionally replaced
  # by copies of {pad} (unlike {format_count}, no check is made of the
  # rounding error). If the whole fraction part is replaced, the
  # period '.' is replaced too.
  assert ty == "rank", "prog_error: ty"
  if prec == 0:
    # Integer format:
    return "%*d" % (size, val)
  # Fraction format:
  vstr = "%*.*f" % (size, prec, val)
  if vstr.endswith("0"):
    vstr = remove_trailing_zeros(vstr, pad)
  return vstr
  # ----------------------------------------------------------------------
  
def remove_trailing_zeros(vstr, pad):
  # Returns {vstr} with each character of its final run of '0's
  # replaced by one copy of {pad}. A period '.' immediately before
  # that run is replaced too. Returns {vstr} unchanged if it does
  # not end with a '0'.
  tail = re.search(r"([.]|)[0]+$", vstr)
  if tail is None:
    return vstr
  first, last = tail.span(0)
  return vstr[:first] + pad * (last - first)
  # ----------------------------------------------------------------------

def read_raw_counts(inDir, files):
  # Reads the count files {inDir + "/" + files[f]}, for {f} in
  # {0..NF-1}, and joins them by item into a table {tb}.
  #
  # The table is a list of rows, one per distinct item, in order of
  # first appearance. Entry {tb[r][0]} is the item, and {tb[r][f+1]}
  # is its count in {files[f]}, or {0.0} if it does not occur there.
  #
  nfiles = len(files)
  table = [ ]
  row_of = {} # Maps each item seen so far to its row index in {table}.
  for col, name in enumerate(files, start = 1):
    counts = read_count_file(inDir + "/" + name)
    for item, count in counts.items():
      r = row_of.get(item)
      if r is None:
        # First time this item is seen: add an all-zeros row for it.
        r = len(table)
        row_of[item] = r
        table.append([ item ] + [0.0] * nfiles)
      table[r][col] += count
  return table
  # ----------------------------------------------------------------------

def read_count_file(fname):
  # Reads the file {fname} of fractional item counts and returns a
  # dict that maps each item to its count.
  #
  # Each data line must have two fields "{COUNT} {ITEM}" where {COUNT}
  # is a non-negative finite decimal number (optional sign, digits,
  # optional fraction and exponent) and {ITEM} is a non-empty string
  # without blanks or ASCII control characters. The fields must be
  # separated by one or more blanks. Leading and trailing whitespace
  # is ignored, as are blank lines and '#'-comments.
  #
  # If an item occurs two or more times, its counts are added together.
  # User beware of underflow, overflow, and roundoff errors.
  #
  # Malformed lines, negative counts, and non-finite counts are
  # reported through {data_error}, with the file name and line number.
  # The file is closed even when an error is raised.
  #
  pat_count = r"[-+]?[0-9]+(|[.][0-9]*)(|[Ee][-+]?[0-9]+)"
  pat_item = "[^ \000-\037]+"
  # Groups: 1 = whole count, 2 = fraction, 3 = exponent, 4 = item.
  pat = f"({pat_count})[ ]+({pat_item})"
  item_to_count = {}
  with open(fname, "r") as rd:
    for nread, line in enumerate(rd, start = 1):
      line = re.sub(r"[#].*$", "", line)
      line = line.strip()
      if line == "": continue
      m = re.fullmatch(pat, line)
      if m is None: data_error(fname, nread, "bad line format", line)
      assert len(m.groups()) == 4, "prog error (pattern)"
      count = float(m.group(1))
      if count < 0: data_error(fname, nread, "negative count", line)
      if not isfinite(count): data_error(fname, nread, "invalid count", line)
      item = m.group(4)
      if item in item_to_count:
        item_to_count[item] += count
      else:
        item_to_count[item] = count
  return item_to_count
  # ----------------------------------------------------------------------
      
def data_error(fname, nread, msg, line):
  # Reports an error in line number {nread} of file {fname} to
  # {stderr}, as "{fname}:{nread}: ** {msg}" followed by the
  # offending {line} between "[[" and "]]", then fails an assertion.
  report = f"{fname}:{nread}: ** {msg}\n" + f"  [[{line}]]\n"
  sys.stderr.write(report)
  assert False
  # ----------------------------------------------------------------------

def parse_options():
  # Parses the command line arguments in {sys.argv}; see {PROG_INFO}.
  #
  # Returns a triple {(o, types, files)} where
  #
  #   {o} is a dict with keys 'inDir', 'maxLines', 'sortBy',
  #   'sortBy_dir', and 'language';
  #
  #   {types} is a list of triples {(ty, size, prec)}, one for each
  #   "-type" option given, or {[('raw', 8, 6)]} if there was none;
  #
  #   {files} is the list of the file name arguments that follow
  #   the options.
  #
  # The value of {o['sortBy']} is 0 to sort by item, or the 1-based
  # index of the file whose counts are the sort key. The value of
  # {o['sortBy_dir']} is {+1} for increasing, {-1} for decreasing.
  #
  pp = argparser.ArgParser(sys.argv, sys.stderr, PROG_HELP, PROG_INFO)

  # Defaults:
  inDir = "."
  sortBy = 1
  sortBy_dir = -1
  language = "plain"
  maxLines = 999999999
  types = []
  files = []

  # Same accessor as used for the other string arguments below:
  if pp.keyword_present("-inDir"): inDir = pp.get_next(mayBeKeyword = False)
  
  if pp.keyword_present("-maxLines"): maxLines = pp.get_next_int(1, 999999999)
  assert type(maxLines) is int
  if maxLines < 1:
    pp.error(f"invald -maxLines {maxLines}")
  
  if pp.keyword_present("-language"): language = pp.get_next(mayBeKeyword = False)
  if language != "plain" and language != "tex" and language != "html":
    pp.error(f"invalid -language '{language}'")
                   
  if pp.keyword_present("-sortBy"): 
    if pp.keyword_present_next("item"):
      sortBy = 0
    else:
      sortBy = pp.get_next_int(1, 999999999)
      if sortBy < 1: 
        pp.error(f"invalid sort column {sortBy}")
    if pp.keyword_present_next("increasing"):
      sortBy_dir = +1
    elif pp.keyword_present_next("decreasing"):
      sortBy_dir = -1
    else:
      # The default depends on whether the key is the item column, and
      # that is only known after {NF} is known (since {NF+1} also
      # means "item"). Decide below:
      sortBy_dir = None
    
  while pp.keyword_present("-type"):
    ty = pp.get_next(mayBeKeyword = False)
    if ty != "raw" and ty != "relative" and ty != "rank":
      pp.error(f"invalid column type '{ty}'")
    fmt = pp.get_next(mayBeKeyword = False)
    m = re.fullmatch(r"([0-9]+)(|[.][0-9]+)", fmt)
    if m is None:
      pp.error(f"invalid column format '{fmt}'")
    else:
      size = max(1, int(m.group(1)))
      prec = m.group(2)
      prec = 0 if prec == "" else int(prec[1:])
    types.append((ty,size,prec))
  if len(types) == 0:
    types = [ ( 'raw', 8, 6 ) ]

  pp.skip_parsed()
  
  assert pp.next <= len(sys.argv), "prog error: argc"
  NF = len(sys.argv) - pp.next
  for f in range(NF):
    files.append(pp.get_next(mayBeKeyword = False))
  if NF == 0:
    pp.error("no input files?")
    
  # Finish checking range of {sortBy}:
  if sortBy == NF+1: 
    sortBy = 0
  elif sortBy > NF: 
    pp.error(f"invalid sorting file index {sortBy}") 

  if sortBy_dir is None:
    # Default is increasing for "item", decreasing for count:
    sortBy_dir = +1 if sortBy == 0 else -1

  o = dict\
    ( inDir = inDir, maxLines = maxLines, sortBy = sortBy, 
      sortBy_dir = sortBy_dir, language = language )
  return o, types, files
  # ----------------------------------------------------------------------
  
# Run the program. NOTE: there is no {__name__ == "__main__"} guard,
# so this call also happens if the file is imported as a module.
main()