#! /usr/bin/python3
# Last edited on 2025-12-06 03:54:12 by stolfi

from math import exp, log, inf
import sys, re
from sys import stdout as out, stderr as err
from ivtff_format import line_loop, strip_comments
from process_frac_words import enum_words_in_text

# Reads from stdin a fragment of an IVTFF-like file, e.g. a single page.
#
# Writes to stdout the list of all words in that file, with their
# fractional counts. These consider that each comma may be a word
# space or not with equal probabilities. The output is sorted by
# decreasing frequency.


def main():
    """Tabulate the fractional word counts of the IVTFF text on stdin.

    The input is scanned with {line_loop}; every data line is stripped of
    comments and handed to {enum_words_in_text}, which reports words and
    their weights through the {accum_word} callback.  The accumulated
    table is written to stdout, one "{count} {word}" line per word, in
    order of decreasing count.  Progress and summary messages go to
    stderr.

    Returns 0.
    """
    verbose = True
    prc = 0.50  # Prob that ',' is a word space.

    # {this_cts[wd]} is float freq of EVA word {wd} in input file.
    this_cts = dict()

    def accum_word(prb, wd):
        """Add the weight {prb} to the running count of word {wd}."""
        # err.write(f" accum {prb:14.6e} {wd}\n")
        this_cts[wd] = this_cts.get(wd, 0) + prb
        # ......................................................................

    def proc_line(nread, ndata, npage, line, page, lseq, posty, trans, text):
        """Per-line callback given to {line_loop}.

        Only {page}, {lseq} and {text} are used here; the remaining
        parameters are accepted because {line_loop} supplies them.
        """
        if lseq is not None:
            # Data line: count the words of its comment-free text.
            # NOTE(review): presumably {enum_words_in_text} calls
            # {accum_word(prb, wd)} once per candidate word -- confirm
            # against {process_frac_words}.
            text = strip_comments(text)
            enum_words_in_text(text, prc, accum_word)
        elif page is not None:
            # Must be page header line:
            err.write(f". page {page}\n")
        # ......................................................................

    def data_error(nread, msg, line):
        """Error callback given to {line_loop}: report and abort.

        Writes the message and the offending line to stderr, then raises
        {AssertionError}.  The exception is raised explicitly, rather than
        through an {assert} statement, so that the abort still happens
        when Python runs with {-O} (which removes {assert} statements).
        """
        err.write(f"-:{nread}: ** {msg}\n")
        err.write(f" [[{line}]]\n")
        raise AssertionError(msg)
        # ......................................................................

    nread, ndata, npage = line_loop(sys.stdin, proc_line, data_error)

    if verbose:
        err.write(f"{nread:5d} file lines read\n")
        err.write(f"{ndata:5d} text lines processed\n")
        err.write(f"{npage:5d} page headers seen\n")

    # Stable sort by decreasing count; words with equal counts keep the
    # order in which they were first seen.
    tbl = sorted(this_cts.items(), key=lambda p: p[1], reverse=True)

    for wd, frt in tbl:
        # Internal sanity check: a word is only in the table if some
        # weight was added to it.
        assert frt > 0, "bug {frt = 0}"
        out.write(f"{frt:12.6f} {wd}\n")
    nwrit = len(tbl)
    out.flush()

    if verbose:
        err.write(f"{nwrit:5d} word frequencies written\n")
        err.write("\n")

    return 0
    # ----------------------------------------------------------------------


main()