#! /usr/bin/python3
# Last edited on 2026-03-03 15:07:54 by stolfi

# Takes as arguments the names of two files, each containing lines of the form
# "{COUNT} {FREQ} {ITEM}", sorted by decreasing count. Merges the two files
# by approximate frequency match. Writes the result to standard output
# where each line is "{ACOUNT} {AFREQ} {AWORD} {BCOUNT} {BFREQ} {BWORD}"
#
# The files must be in Unicode UTF-8 encoding.
# Ignores #-comments and blank lines.
#
# If the first argument is "test", runs a small built-in test instead,
# writing the merged table to standard error.

import sys
import os
import re
from sys import stderr as err, stdout as out

from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error


def main(a_fname, b_fname):
    # Reads the count-freq-item files {a_fname} and {b_fname}, pairs up
    # their entries by approximate frequency, and writes the merged
    # table to standard output. Progress messages go to standard error.
    a_counts, a_freqs, a_items = read_counts_and_freqs_file(a_fname)
    b_counts, b_freqs, b_items = read_counts_and_freqs_file(b_fname)
    sextets = pair_up_by_freqs(a_counts, a_freqs, a_items, b_counts, b_freqs, b_items)
    err.write(f"{len(sextets):5d} merged count-freq-item lines\n")
    write_merged_counts_and_freqs(out, sextets)
    return
    # ----------------------------------------------------------------------


def write_merged_counts_and_freqs(wr, sextets):
    # Writes to the text stream {wr} a coding header line followed by one
    # line for each tuple {(act,afr,ait,bct,bfr,bit)} in {sextets}.
    # A half whose count is zero has its count and frequency shown as dots.
    # Also writes the number of lines written to standard error.

    # Streams that cannot be reconfigured (e.g. in-memory buffers)
    # are used with whatever encoding they already have:
    if hasattr(wr, "reconfigure"):
        wr.reconfigure(encoding='utf-8')
    wr.flush()
    wr.write("# -*- coding: utf-8 -*-\n")

    def format_count_and_freq(ct, fr):
        # Returns the printable forms of count {ct} and frequency {fr}.
        # A zero count marks a missing half, shown as dots.
        if ct == 0:
            ctx = " . "; frx = " . "
        else:
            ctx = f"{ct:14.6f}"; frx = f"{fr:6.4f}"
        return ctx, frx
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    nwrit = 0
    for act, afr, ait, bct, bfr, bit in sextets:
        actx, afrx = format_count_and_freq(act, afr)
        bctx, bfrx = format_count_and_freq(bct, bfr)
        wr.write(f"{actx:14s} {afrx:6s} {ait:<20s} {bctx:14s} {bfrx:6s} {bit:<20s}\n")
        nwrit += 1
    wr.flush()
    assert nwrit == len(sextets)
    err.write(f"{nwrit:5d} lines written\n")
    return
    # ----------------------------------------------------------------------


def pair_up_by_freqs(acts, afrs, aits, bcts, bfrs, bits):
    # Given lists {acts,bcts} of counts, {afrs,bfrs} of frequencies,
    # their respective items {aits,bits},
    # merges them by approx equality of the {freq} values.
    # Returns a list of sextuples {(act,afr,ait,bct,bfr,bit)},
    # where either of the two halves may be {(0,0,'-')}
    #
    # The counts in each list must be in non-increasing order; this is
    # checked (with {assert}) only for the entries that are examined
    # while searching for matching pairs.

    na = len(afrs)
    nb = len(bfrs)

    sextets = []  # The result.

    def find_stable_pair(ra, sa, rb, sb):
        # Assumes both ranges {ra..sa} and {rb..sb} are non-empty.
        # Starting at {(ra,rb)}, advances either index by one while that
        # strictly reduces the frequency difference. Returns the pair
        # {(ja,jb)} where neither single step (within range) improves it.
        ja = ra; jb = rb
        while True:
            if ja < sa: assert acts[ja] >= acts[ja+1], "file 0 is out of order"
            if jb < sb: assert bcts[jb] >= bcts[jb+1], "file 1 is out of order"
            if ja < sa and abs(bfrs[jb] - afrs[ja]) > abs(bfrs[jb] - afrs[ja+1]):
                ja += 1; continue
            if jb < sb and abs(afrs[ja] - bfrs[jb]) > abs(afrs[ja] - bfrs[jb+1]):
                jb += 1; continue
            break
        assert ja <= sa and jb <= sb
        return ja, jb
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # The merge is a divide-and-conquer: find a stable pair, merge what
    # precedes it, emit the pair, merge what follows it. It is done with
    # an explicit stack of pending tasks instead of recursion, because
    # the nesting depth can grow with the number of matched pairs and
    # would exceed the interpreter's recursion limit on long lists.
    #
    # A task is either {("merge",ra,sa,rb,sb)}, meaning merge entries
    # {ra..sa} with {rb..sb}, or {("pair",ja,jb)}, meaning emit that pair.
    tasks = [("merge", 0, na-1, 0, nb-1)]
    while tasks:
        task = tasks.pop()
        if task[0] == "pair":
            _, ja, jb = task
            sextets.append((acts[ja], afrs[ja], aits[ja], bcts[jb], bfrs[jb], bits[jb],))
            continue

        _, ra, sa, rb, sb = task
        assert ra >= 0 and sa < na, "bug ra,sa"
        assert rb >= 0 and sb < nb, "bug rb,sb"
        if ra > sa:
            # Dump all {bfrs} entries, if any:
            for jb in range(rb, sb+1):
                sextets.append((0, 0, '-', bcts[jb], bfrs[jb], bits[jb],))
        elif rb > sb:
            # Dump all {afrs} entries:
            for ja in range(ra, sa+1):
                sextets.append((acts[ja], afrs[ja], aits[ja], 0, 0, '-',))
        else:
            # Find a stable matching pair:
            ja, jb = find_stable_pair(ra, sa, rb, sb)
            # The stack is last-in-first-out, so push in reverse order
            # of processing: entries after the pair, the pair itself,
            # and then the entries before the pair.
            tasks.append(("merge", ja+1, sa, jb+1, sb))
            tasks.append(("pair", ja, jb))
            tasks.append(("merge", ra, ja-1, rb, jb-1))

    return sextets
    # ----------------------------------------------------------------------


def read_counts_and_freqs_file(fname):
    # Reads the file {fname}, which must be in UTF-8 encoding and contain
    # lines of the form "{COUNT} {FREQ} {ITEM}". Blank lines and
    # #-comments are ignored.
    #
    # Returns three parallel lists {counts,freqs,items}, in file order.
    # The counts and frequencies are converted to {float}.
    # Reports malformed lines through {file_line_error}.

    # A number is digits with an optional fraction, or a fraction alone.
    # At least one digit is required, so that {float} can always parse it:
    pat_num = r"(?:[0-9]+[.]?[0-9]*|[.][0-9]+)"
    pat_line = f"[ ]*({pat_num})[ ]+({pat_num})[ ]+([^ ]+)"

    ndata = 0    # Count of data lines.
    counts = []  # List of counts.
    freqs = []   # List of freqs.
    items = []   # List of items.

    def process_input_line(nline, line):
        nonlocal ndata
        #
        # Parses a line {line} assuming it is line {nline} of the file.
        # The {line} is always a string (never {None}), but may be "" if the line
        # is empty.
        #
        # Ignores the line if it is a blank or #-comment.
        #
        # Otherwise the line must be a data line, matching {pat_line}. In
        # that case, increments {ndata}, extracts {count}, {freq} and {item} from
        # the line, and appends them to {counts}, {freqs}, and {items}.

        # Should we debug the line?
        debug = False

        def data_error(msg):
            # Reports {msg} for the current line. The {assert} documents
            # that {file_line_error} is not expected to return.
            file_line_error(fname, nline, msg, line)
            assert False
            # ----------------------------------------------------------------------

        assert line is not None, "The {line} arg must not be {None}"
        line = line.rstrip()

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line): return

        ndata += 1

        m = re.fullmatch(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error(f"invalid line format for {pat_line = !r}")

        # Parse the line into count, frequency, and item:
        assert m.lastindex == 3, f"bug {m.lastindex = }"
        count = float(m.group(1))
        freq = float(m.group(2))
        item = m.group(3)

        if count < 0: data_error(f"invalid {count = :24.16e}")
        if freq < 0 or freq > 1: data_error(f"invalid {freq = :24.16e}")

        counts.append(count)
        freqs.append(freq)
        items.append(item)

        if debug: err.write(f"!~ {count = :.8f} {freq = :.8f} {item = !r}\n")
        return
        # ......................................................................

    # The {with} statement closes the file even if a line is rejected:
    with open(fname, "r", encoding="utf-8") as rd:
        err.write(f"reading file '{fname}' ...\n")
        # Presumably {basic_line_loop} calls {process_input_line(nline, line)}
        # for each line of {rd} and returns the number of lines read.
        nread = basic_line_loop(rd, process_input_line)

    err.write(f"{nread:5d} total lines read\n")
    err.write(f"{ndata:5d} data lines found\n")

    return counts, freqs, items
    # ----------------------------------------------------------------------


def test_stuff():
    # Runs {pair_up_by_freqs} on two small hand-made lists and writes the
    # input frequencies and the merged table to standard error.
    acts = [ 9002,8002, 7002, 5002,4902, 4202,3002,1202, ]
    afrs = [ 0.90,0.80, 0.70, 0.50,0.49, 0.42,0.30,0.12, ]
    aits = [ f"a{i:02d}" for i in range(len(afrs)) ]

    bcts = [ 903, 733, 713, 693, 683, 483, 473, 303, 103, 93, ]
    bfrs = [ 0.90, 0.73,0.71, 0.69,0.68, 0.48,0.47, 0.30, 0.10,0.09, ]
    bits = [ f"b{i:02d}" for i in range(len(bfrs)) ]

    err.write(f"{afrs = }\n")
    err.write(f"{bfrs = }\n")

    sextets = pair_up_by_freqs(acts, afrs, aits, bcts, bfrs, bits)
    write_merged_counts_and_freqs(err, sextets)
    return
    # ----------------------------------------------------------------------


if __name__ == "__main__":
    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        test_stuff()
    elif len(sys.argv) >= 3:
        a_fname = sys.argv[1]
        b_fname = sys.argv[2]
        main(a_fname, b_fname)
    else:
        # Too few arguments: explain the usage instead of failing
        # with an {IndexError}.
        err.write(f"usage: {sys.argv[0]} test\n")
        err.write(f"   or: {sys.argv[0]} {{A_FILE}} {{B_FILE}}\n")
        sys.exit(1)