#! /usr/bin/python3
# Last edited on 2026-01-16 14:40:58 by stolfi

import sys
import os
import re
from sys import stdout as out, stderr as err
from process_funcs import basic_line_loop
from error_funcs import file_line_error as fdata_error


def main():
    # Prints the element counts and/or frequencies of elements in
    # specified sections and text types.
    #
    # Command line arguments must be
    #
    #   {CT_PREC} {FR_PREC} {SEC[1]} {TXTY[1]} {HEAD[1]} .. {SEC[N]} {TXTY[N]} {HEAD[N]}
    #
    # where
    #
    #   {CT_PREC} is the formatting precision for counts.
    #   {FR_PREC} the same for frequencies.
    #   {SEC[1]} .. {SEC[N]} are section tags, like "hea", "zod", or "tot" for all.
    #   {TXTY[1]} .. {TXTY[N]} are text types, like "parags" or "labels".
    #   {HEAD[1]} .. {HEAD[N]} are headers for the data columns.
    #
    # See "../Notes/092" for the valid section tags and text types.
    #
    # The data for each section {SEC[j]} and text type {TXTY[j]} are read
    # from the file "st_parsed/{SEC[j]}-{TXTY[j]}.elf". Assumes that each
    # line in that file has the format "{COUNT} {FREQ} {ELEM}" where
    # {ELEM} is an element enclosed in braces '{}', {COUNT} is the number
    # of occurrences of that element in the specified subset of the text,
    # and {FREQ} is its relative frequency ({COUNT} over sum of all
    # {COUNT}s in that file). The {COUNT} may be fractional, accounting
    # for dubious spaces.
    #
    # The data for each {ELEM} is printed on a separate line. The data
    # ({COUNT} and/or {FREQ}) for each {SEC[j],TXTY[j]} pair is printed as
    # one or two columns on that line, in order.
    #
    # The order of {ELEM}s in the printout is designed to be more
    # convenient for reference to the CMC model (see Notes/093),
    # irrespective of the order in the data files. Elements that are
    # not in that fixed list are printed at the end, in sorted order.
    #
    # If {FR_PREC} is positive prints the {FREQ} with that many decimal
    # fraction digits. If it is negative or {None}, omits the {FREQ}. The
    # {FR_PREC} cannot be zero.
    #
    # If {CT_PREC} is positive prints the {COUNT} with that many
    # decimal fraction digits. If it is negative or {None}, omits the
    # {COUNT}. If it is zero, prints {COUNT} as integer.
    #
    ct_prec, fr_prec, stys = parse_args()
    # The {stys} should be a list of triples {(SEC[j],TXTY[j],HEAD[j])}
    N = len(stys)

    # Determine the widths {ct_wd,fr_wd} of each data column, and the
    # placeholders {ct_zero,fr_zero} printed when a datum is missing:
    if ct_prec is None or ct_prec < 0:
        # Do not print counts:
        ct_prec = None
        ct_wd = None
        ct_zero = None
    else:
        # Print counts:
        ct_wd = 5 if ct_prec == 0 else 5 + 1 + ct_prec
        ct_zero = "-" if ct_prec == 0 else "." + (" " * ct_prec)

    if fr_prec is None or fr_prec < 0:
        # Do not print freqs:
        fr_prec = None
        fr_wd = None
        fr_zero = None
    else:
        # Print freqs:
        assert fr_prec > 0, f"invalid zero {{fr_prec}}"
        fr_wd = 1 + 1 + fr_prec
        fr_zero = "." + (" " * fr_prec)

    def print_row(cts, frs, elem):
        # Writes to {out} one line with the counts {cts[0..N-1]} and/or
        # frequencies {frs[0..N-1]} of element {elem}, followed by {elem}
        # itself. A {None} entry is printed as the placeholder {ct_zero}
        # or {fr_zero}, right-aligned in the column.
        assert isinstance(cts, list) and isinstance(frs, list)
        for ist in range(N):
            if ist > 0:
                out.write(" ")
            if ct_prec is not None:
                ct = cts[ist]
                if ct is None:
                    out.write(" %*s" % (ct_wd, ct_zero))
                else:
                    out.write(" %*.*f" % (ct_wd, ct_prec, ct))
            if fr_prec is not None:
                fr = frs[ist]
                if fr is None:
                    out.write(" %*s" % (fr_wd, fr_zero))
                else:
                    out.write(" %*.*f" % (fr_wd, fr_prec, fr))
        out.write(f" {elem}\n")
        return
        # ....................................................................

    # The data is stored in dicts {count} and {freq} with each {ELEM} as key.
    # Each value is a list of {N} elements. Missing data is stored as {None}.
    count = dict()
    freq = dict()

    # Read the files. The header {HEAD[j]} is parsed but not used here:
    for ist, (sec, txty, _head) in enumerate(stys):
        fname = f"st_parsed/{sec}-{txty}.elf"
        read_data_file(fname, count, freq, ist, N)

    # Prints the elements. An empty string requests a blank separator line:
    main_elems = (
        "{a}", "{o}", "{y}", "",
        "{q}", "",
        "{d}", "{l}", "{r}", "{s}", "",
        "{ch}", "{che}", "",
        "{ee}", "{eee}", "",
        "{sh}", "{she}", "",
        "{ih}", "{ihe}", "",
        "{k}", "{ke}", "",
        "{t}", "{te}", "",
        "{p}", "{pe}", "",
        "{f}", "{fe}", "",
        "{ckh}", "{ckhe}", "{ckhh}", "",
        "{ikh}", "{ikhe}", "{ikhh}", "",
        "{cth}", "{cthe}", "{cthh}", "",
        "{ith}", "{ithe}", "{ithh}", "",
        "{cph}", "{cphe}", "{cphh}", "",
        "{iph}", "{iphe}", "{iphh}", "",
        "{cfh}", "{cfhe}", "{cfhh}", "",
        "{ifh}", "{ifhe}", "{ifhh}", "",
        "{id}", "{iid}", "{iiid}", "",
        "{il}", "{iil}", "{iiil}", "",
        "{n}", "{in}", "{iin}", "{iiin}", "",
        "{m}", "{im}", "{iim}", "{iiim}", "",
        "{ir}", "{iir}", "{iiir}", "",
        "{is}", "{iis}", "{iiis}",
    )
    todo = set(count.keys())  # Unprinted elements for which we have data.
    last_was_blank = True  # Previous printed line was a blank line.

    # Print the "main" elements:
    for elem in main_elems:
        if elem == "":
            # Separator: avoid leading or consecutive blank lines.
            if not last_was_blank:
                out.write("\n")
                last_was_blank = True
        elif elem in todo:
            assert elem in count and elem in freq, "bug 2"
            print_row(count[elem], freq[elem], elem)
            last_was_blank = False
            todo.remove(elem)
        else:
            # No data for this element:
            pass

    # Print any leftover elements not in the main set. They are sorted
    # so that the output does not depend on the set iteration order:
    if len(todo) > 0:
        out.write("\n")
        for elem in sorted(todo):
            print_row(count[elem], freq[elem], elem)
    return
    # ----------------------------------------------------------------------


def read_data_file(fname, count, freq, ist, N):
    # Reads from {fname}. Inserts the data of each {elem} in slot {ist}
    # of the lists {count[elem]} and {freq[elem]}, assuming that they have {N}
    # slots. Provides such lists (filled with {None}) if they are missing.
    #
    # Each line, after stripping, must be "{COUNT} {FREQ} {ELEM}" where
    # {COUNT} and {FREQ} are unsigned decimal numbers and {ELEM} is either
    # one of the characters '!¡?¿' or a non-empty string in braces.
    # Any other line is reported through {fdata_error}.
    fnum_pat = r"[0-9]+[.][0-9]*|[0-9]+"
    elem_pat = r"[!¡?¿]|[{][^{}]+[}]"
    # Compiled once here instead of being rebuilt for every line:
    line_re = re.compile(f"[ ]*({fnum_pat})[ ]+({fnum_pat})[ ]+({elem_pat})[ ]*")

    def parse_line(nlin, line):
        # Parses one data {line}, whose number in the file is {nlin},
        # and stores its count and frequency in slot {ist}.
        line = line.strip()
        m = line_re.fullmatch(line)
        if m is None:
            # NOTE(review): {fdata_error} is presumably fatal; if it ever
            # returns, the next statement fails on {m.lastindex} -- confirm.
            fdata_error(fname, nlin, "bad format", line)
        assert m.lastindex == 3, f"bug 4 {m.lastindex}"
        ct = float(m.group(1))
        fr = float(m.group(2))
        elem = m.group(3)
        if elem not in count:
            assert elem not in freq, "bug 5"
            count[elem] = [None] * N
            freq[elem] = [None] * N
        assert elem in count and elem in freq, "bug 6"
        count[elem][ist] = ct
        freq[elem][ist] = fr
        return
        # ....................................................................

    # The {with} ensures that the file is closed even if parsing fails:
    with open(fname, "r") as rd:
        basic_line_loop(rd, parse_line)
    return
    # ----------------------------------------------------------------------


def parse_args():
    # Parses the command line arguments {sys.argv[1..]}. Returns the
    # integers {ct_prec} and {fr_prec} and the list {stys} of triples
    # {(SEC[j],TXTY[j],HEAD[j])}, which may be empty.
    #
    # Writes a usage message to {err} and exits with status 1 if the
    # number of arguments is not 2 plus a multiple of 3. A precision that
    # is not a valid integer still raises {ValueError}.
    args = sys.argv[1:]
    if len(args) < 2 or (len(args) - 2) % 3 != 0:
        err.write(f"usage: {sys.argv[0]} CT_PREC FR_PREC [ SEC TXTY HEAD ]...\n")
        sys.exit(1)
    ct_prec = int(args[0])
    fr_prec = int(args[1])
    stys = [tuple(args[iarg:iarg + 3]) for iarg in range(2, len(args), 3)]
    return ct_prec, fr_prec, stys
    # ----------------------------------------------------------------------


main()