#! /usr/bin/python3
# Last edited on 2025-07-31 14:56:54 by stolfi

from math import exp, log, inf
import sys
import re

from ivtff_format import line_loop, strip_comments
from process_frac_words import enum_words_in_text

# Reads from stdin a fragment of an IVTFF-like file, e.g. a single page.
#
# Writes to stdout the list of all words in that file, with their
# fractional counts. These consider that each comma may be a word
# space or not with equal probabilities. The output is sorted by
# decreasing frequency.


def main():
    """Tabulate fractional word counts of the IVTFF text read from stdin.

    Feeds {sys.stdin} to {line_loop}, accumulating a fractional count
    for every word reported by {enum_words_in_text}, then writes one
    line "{count} {word}" per distinct word to stdout, sorted by
    decreasing count.  Progress and summary messages go to stderr.

    Returns 0.  Raises {AssertionError} if {line_loop} reports a
    malformed input line, or if some accumulated count is not positive.
    """
    verbose = True
    prc = 0.50  # Prob that ',' is a word space.

    # {this_cts[wd]} is float freq of EVA word {wd} in input file.
    this_cts = dict()

    def accum_word(prb, wd):
        """Add the fractional count {prb} to the running total of word {wd}."""
        # The dict is only mutated, never rebound, so no {nonlocal} is needed.
        this_cts[wd] = this_cts.get(wd, 0) + prb
        # ......................................................................

    def proc_line(nread, ndata, npage, line, page, lseq, posty, trans, text):
        """Per-line callback handed to {line_loop}.

        When {lseq} is not {None} the line is treated as a data line: its
        {text} is stripped of comments and its words are tallied through
        {accum_word}.  Otherwise, when {page} is not {None}, the line is
        taken to be a page header and the page name is logged to stderr.
        The other parameters are part of the callback signature but are
        not used here.
        """
        if lseq is not None:
            # Data line.
            # NOTE(review): presumably {enum_words_in_text} calls
            # {accum_word(prb, wd)} once per candidate word, with {prc} as
            # the comma-is-space probability -- confirm in
            # {process_frac_words}.
            text = strip_comments(text)
            enum_words_in_text(text, prc, accum_word)
        elif page is not None:
            # Must be page header line:
            sys.stderr.write(f". page {page}\n")
        # ......................................................................

    def data_error(nread, msg, line):
        """Error callback handed to {line_loop}: report a bad line and abort.

        Writes the message {msg} and the offending {line}, tagged with
        the line number {nread}, to stderr, then raises {AssertionError}.
        """
        sys.stderr.write("%s:%d: ** %s\n" % ("-", nread, msg))
        sys.stderr.write(" [[%s]]\n" % line)
        # Raise explicitly instead of {assert False}: assert statements
        # are removed under {python -O}, which would let processing
        # continue past a malformed line.
        raise AssertionError(f"invalid input at line {nread}: {msg}")
        # ......................................................................

    nread, ndata, npage = line_loop(sys.stdin, proc_line, data_error)

    if verbose:
        sys.stderr.write(f"{nread} file lines read\n")
        sys.stderr.write(f"{ndata} text lines processed\n")
        sys.stderr.write(f"{npage} page headers seen\n")
        sys.stderr.write("\n")

    # Sort by decreasing count; the sort is stable, so words with equal
    # counts keep their order of first appearance in the input.
    tbl = sorted(this_cts.items(), key=lambda p: p[1], reverse=True)
    for wd, frt in tbl:
        # Internal sanity check: every tallied word should have a
        # positive total.
        assert frt > 0, f"bug: frt = {frt} for word {wd!r}"
        sys.stdout.write(f"{frt:12.6f} {wd}\n")
    sys.stdout.flush()
    return 0
    # ----------------------------------------------------------------------


# NOTE: runs unconditionally, so importing this file also executes it;
# the file is meant to be run as a script.
main()