#! /usr/bin/python3
# Last edited on 2026-01-16 14:40:58 by stolfi

import sys, os, re
from sys import stdout as out, stderr as err
from process_funcs import basic_line_loop
from error_funcs import file_line_error as fdata_error

def main():
  # Prints the element counts and/or frequencies of elements in
  # specified sections and text types.
  
  # Command line arguments must be 
  # 
  #  {CT_PREC} {FR_PREC}  {SEC[1]} {TXTY[1]} {HEAD[1]} .. {SEC[N]} {TXTY[N]} {HEAD[N]}  
  #
  # where 
  #
  #  {CT_PREC} is the formatting precision for counts.
  #  {FR_PREC} the same for frequencies.
  #  {SEC[1]} .. {SEC[N]} are section tags, like "hea", "zod", or "tot" for all.
  #  {TXTY[1]} .. {TXTY[N]} are text types, like "parags" or "labels".
  #  {HEAD[1]} .. {HEAD[N]} are headers for the data columns.
  #
  # See "../Notes/092" for the valid section tags and text types.
  #
  # The data for each section {SEC[j]} and text type {TXTY[j]} are read
  # from the file "st_parsed/{SEC[j]}-{TXTY[j]}.elf". Assumes that each
  # line in that file has the format "{COUNT} {FREQ} {ELEM}" where
  # {ELEM} is an element enclosed in braces '{}', {COUNT} is the number
  # of occurrences of that element in the specified subset of the text,
  # and {FREQ} is its relative frequency ({COUNT} over sum of all
  # {COUNT}s in that file). The {COUNT} may be fractional, accounting
  # for dubious spaces.
  # 
  # The data for each {ELEM} is printed on a separate line. The data
  # ({COUNT} and/or {FREQ}) for each {SEC[j],TXTY[j]} pair is printed as
  # one or two columns on that line, in order.
  # 
  # The order of {ELEM}s in the printout is designed to be more
  # convenient for reference to the CMC model (see Notes/093),
  # irrespective of the order in the data files. Elements that are
  # not in that reference order are printed at the end, sorted
  # lexicographically so that the output is reproducible.
  # 
  # If {FR_PREC} is positive prints the {FREQ} with that many decimal
  # fraction digits. If it is negative or {None}, omits the {FREQ}. The
  # {FR_PREC} cannot be zero.
  #
  # If {CT_PREC} is positive prints the {COUNT} with that many
  # decimal fraction digits. If it is negative or {None}, omits the
  # {COUNT}. If it is zero, prints {COUNT} as integer.
  # 
  # NOTE: the {HEAD[j]} arguments are parsed but are not currently
  # printed anywhere by this procedure.
  # 
  
  ct_prec, fr_prec, stys = parse_args()
  # The {stys} should be a list of triples {(SEC[j],TXTY[j],HEAD[j])}
  N = len(stys)
  
  # Determine widths {ct_wd,fr_wd} of each data column, and the
  # placeholders {ct_zero,fr_zero} to print for missing data:
  if ct_prec is None or ct_prec < 0:
    # Do not print counts:
    ct_prec = None; ct_wd = None; ct_zero = None
  else:
    # Print counts:
    ct_wd = 5 if ct_prec == 0 else 5 + 1 + ct_prec
    ct_zero = "-" if ct_prec == 0 else "." + (" " * ct_prec)

  if fr_prec is None or fr_prec < 0: 
    # Do not print freqs:
    fr_prec = None; fr_wd = None; fr_zero = None
  else:
    # Print freqs:
    assert fr_prec > 0, f"invalid zero {{fr_prec}}"
    fr_wd = 1 + 1 + fr_prec
    fr_zero = "." + (" " * fr_prec)
  
  def print_row(cts, frs, elem):
    # Prints one output line for element {elem}. The lists {cts} and
    # {frs} must have {N} entries each (count and frequency for each
    # section/type pair); a {None} entry is printed as the
    # placeholder {ct_zero} or {fr_zero}, respectively.
    assert isinstance(cts, list) and isinstance(frs, list)
    for ist in range(N):
      if ist > 0: out.write(" ")
      if ct_prec is not None:
        ct = cts[ist]
        if ct is None:
          out.write(" %*s" % (ct_wd, ct_zero))
        else:
          out.write(" %*.*f" % (ct_wd, ct_prec, ct))
      if fr_prec is not None:
        fr = frs[ist]
        if fr is None:
          out.write(" %*s" % (fr_wd, fr_zero))
        else:
          out.write(" %*.*f" % (fr_wd, fr_prec, fr))
    out.write(f"  {elem}\n")
    return 
    # ....................................................................

  # The data is stored in dicts {count} and {freq} with each {ELEM} as key. 
  # Each value is a list of {N} elements. Missing data is stored as {None}.
  count = dict()
  freq = dict()

  # Read the files:
  for ist, (sec, txty, _head) in enumerate(stys):
    fname = f"st_parsed/{sec}-{txty}.elf"
    read_data_file(fname, count, freq, ist, N)
 
  # Reference order of the elements in the printout.  An empty string
  # requests a blank line between groups:
  main_elems = \
    ( "{a}", "{o}", "{y}", "",
      "{q}", "",
      "{d}", "{l}", "{r}", "{s}", "",
      "{ch}", "{che}", "",
      "{ee}", "{eee}", "",
      "{sh}", "{she}", "",
      "{ih}", "{ihe}", "",
      "{k}", "{ke}", "",
      "{t}", "{te}", "",
      "{p}", "{pe}", "",
      "{f}", "{fe}", "",
      "{ckh}", "{ckhe}", "{ckhh}", "",
      "{ikh}", "{ikhe}", "{ikhh}", "",
      "{cth}", "{cthe}", "{cthh}", "",
      "{ith}", "{ithe}", "{ithh}", "",
      "{cph}", "{cphe}", "{cphh}", "",
      "{iph}", "{iphe}", "{iphh}", "",
      "{cfh}", "{cfhe}", "{cfhh}", "",
      "{ifh}", "{ifhe}", "{ifhh}", "",
      "{id}", "{iid}", "{iiid}", "",
      "{il}", "{iil}", "{iiil}", "",
      "{n}", "{in}", "{iin}", "{iiin}", "",
      "{m}", "{im}", "{iim}", "{iiim}", "",
      "{ir}", "{iir}", "{iiir}", "",
      "{is}", "{iis}", "{iiis}", 
    )
  todo = set(count.keys()) # Unprinted elements for which we have data.
  last_was_blank = True  # Previous printed line was a blank line.
  # Print the "main" elements:
  for elem in main_elems:
    if elem == "":
      # Group separator; avoid consecutive (or leading) blank lines:
      if not last_was_blank: out.write("\n")
      last_was_blank = True 
    elif elem in todo:
      assert elem in count and elem in freq, "bug 2"
      print_row(count[elem], freq[elem], elem)
      last_was_blank = False
      todo.remove(elem)
    else:
      # No data for this element:
      pass
  # Print any leftover elements not in the main set:
  if len(todo) > 0:
    if not last_was_blank: out.write("\n")
    # Sorted, because the iteration order of a {set} of strings 
    # may change from one run to the next:
    for elem in sorted(todo):
      print_row(count[elem], freq[elem], elem)
  return
  # ----------------------------------------------------------------------
            
def read_data_file(fname, count, freq, ist, N):
  # Reads from {fname}. Inserts the data of each {elem} in slot {ist}
  # of the lists {count[elem]} and {freq[elem]}, assuming that they have {N}
  # slots. Provides such lists (filled with {None}) if they are missing.
  #
  # Each data line must have the format "{COUNT} {FREQ} {ELEM}" where
  # {COUNT} and {FREQ} are unsigned decimal numbers, with optional
  # fraction part, and {ELEM} is either one of the single characters
  # '!', '¡', '?', '¿' or a non-empty string enclosed in braces '{}'.
  # A line that does not match is reported through {fdata_error}.
  #
  # If the same {ELEM} occurs more than once in the file, the last
  # occurrence prevails.
  #
  # The file is opened with the default text encoding of the platform.
  
  fnum_pat = r"[0-9]+[.][0-9]*|[0-9]+"
  elem_pat = r"[!¡?¿]|[{][^{}]+[}]"
  # Compiled once, since it is applied to every line of the file:
  line_re = re.compile(f"[ ]*({fnum_pat})[ ]+({fnum_pat})[ ]+({elem_pat})[ ]*")

  def parse_line(nlin, line):
    # Parses the data line {line}, which is line number {nlin} of the
    # file, and stores its count and frequency in slot {ist} of
    # {count[elem]} and {freq[elem]}.
    line = line.strip()
    m = line_re.fullmatch(line)
    # Presumably {fdata_error} does not return -- TODO confirm:
    if m is None: fdata_error(fname, nlin, "bad format", line)
    assert m.lastindex == 3, f"bug 4 {m.lastindex}"
    ct = float(m.group(1))
    fr = float(m.group(2))
    elem = m.group(3)
    if elem not in count:
      # First time we see this element in any file:
      assert elem not in freq, "bug 5"
      count[elem] = [ None ]*N
      freq[elem] = [ None ]*N
    assert elem in count and elem in freq, "bug 6"
    count[elem][ist] = ct
    freq[elem][ist] = fr
    return
    # ....................................................................
     
  # The {with} ensures that the file gets closed even if a line is bad:
  with open(fname, "r") as rd:
    basic_line_loop(rd, parse_line)
  return 
  # ----------------------------------------------------------------------
  
def parse_args():
  # Parses the command line arguments in {sys.argv}, which must be
  #
  #   {CT_PREC} {FR_PREC} {SEC[1]} {TXTY[1]} {HEAD[1]} .. {SEC[N]} {TXTY[N]} {HEAD[N]}
  #
  # Returns {ct_prec, fr_prec, stys} where {ct_prec} and {fr_prec} are
  # the integers {CT_PREC} and {FR_PREC}, and {stys} is a list of the
  # {N} triples {(SEC[j],TXTY[j],HEAD[j])}, as strings.  The list may
  # be empty.
  #
  # If the arguments are malformed (too few of them, a precision that
  # is not an integer, or an incomplete triple), writes a message to
  # {sys.stderr} and exits with status 1.
  
  def arg_error(msg):
    # Reports a command line error and aborts the program.
    sys.stderr.write(f"{sys.argv[0]}: ** {msg}\n")
    sys.stderr.write("usage: CT_PREC FR_PREC { SEC TXTY HEAD }..\n")
    sys.exit(1)
    # ....................................................................

  def parse_prec(name, arg):
    # Converts the precision argument {arg} to an integer.
    try:
      return int(arg)
    except ValueError:
      arg_error(f"invalid {name} = '{arg}', should be an integer")
    # ....................................................................

  args = sys.argv[1:]
  if len(args) < 2:
    arg_error("missing {CT_PREC} and/or {FR_PREC}")
  ct_prec = parse_prec("CT_PREC", args[0])
  fr_prec = parse_prec("FR_PREC", args[1])

  rest = args[2:]
  if len(rest) % 3 != 0:
    arg_error("incomplete {SEC} {TXTY} {HEAD} triple")
  # Group the remaining arguments three by three:
  stys = [ (rest[k], rest[k+1], rest[k+2],) for k in range(0, len(rest), 3) ]
    
  return ct_prec, fr_prec, stys
  # ----------------------------------------------------------------------

# Run only when executed as a script, not when imported as a module:
if __name__ == "__main__":
  main()