#! /usr/bin/python3
# Last edited on 2026-03-03 15:07:54 by stolfi

# Takes as arguments the names of two files, each containing lines of the form
# "{COUNT} {FREQ} {ITEM}", sorted by decreasing count.  Merges the two files
# by approximate frequency match. Writes the result to standard output
# where each line is "{ACOUNT} {AFREQ} {AWORD}  {BCOUNT} {BFREQ} {BWORD}"
# 
# The files must be in Unicode UTF-8 encoding.
# Ignores #-comments and blank lines.

import sys, os, re
from sys import stderr as err, stdout as out
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error

def main(a_fname, b_fname):
  # Reads the count/freq/item tables from files {a_fname} and {b_fname},
  # pairs up their entries by approximate frequency, reports the number
  # of merged lines to {stderr}, and writes the merged table to {stdout}.

  a_table = read_counts_and_freqs_file(a_fname)
  b_table = read_counts_and_freqs_file(b_fname)

  # Each table is the triple {(counts, freqs, items)}:
  merged = pair_up_by_freqs(*a_table, *b_table)
  n_merged = len(merged)
  err.write(f"{n_merged:5d} merged count-freq-item lines\n")

  write_merged_counts_and_freqs(out, merged)
  return
  # ----------------------------------------------------------------------

def write_merged_counts_and_freqs(wr, sextets):
  # Writes to the text stream {wr} a coding header line followed by one
  # line per element of {sextets}. Each element must be a tuple
  # {(act,afr,ait,bct,bfr,bit)}; the line has the six fields in that
  # order, in fixed-width columns separated by single blanks.
  #
  # A half whose count is zero has its count and freq shown as ".".
  # The stream {wr} is switched to UTF-8 encoding before writing.
  # Also reports the number of lines written to {stderr}.

  wr.reconfigure(encoding='utf-8')

  wr.flush()
  wr.write("# -*- coding: utf-8 -*-\n")

  def count_and_freq_columns(ct, fr):
    # Returns the count and freq column texts for one half of a line.
    if ct != 0:
      return f"{ct:14.6f}", f"{fr:6.4f}"
    # A zero count marks a missing half:
    return "       .      ", " .    "
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  nwrit = 0
  for act, afr, ait, bct, bfr, bit in sextets:
    a_ct_col, a_fr_col = count_and_freq_columns(act, afr)
    b_ct_col, b_fr_col = count_and_freq_columns(bct, bfr)
    columns = (
      f"{a_ct_col:14s}", f"{a_fr_col:6s}", f"{ait:<20s}",
      f"{b_ct_col:14s}", f"{b_fr_col:6s}", f"{bit:<20s}",
    )
    wr.write(" ".join(columns) + "\n")
    nwrit = nwrit + 1
  wr.flush()
  assert nwrit == len(sextets)
  err.write(f"{nwrit:5d} lines written\n")
  return
  # ----------------------------------------------------------------------

def pair_up_by_freqs(acts, afrs, aits, bcts, bfrs, bits):
  # Given lists {acts,bcts} of counts, {afrs,bfrs} of frequencies,
  # and their respective items {aits,bits}, merges them by approximate
  # equality of the frequency values.
  #
  # Returns a list of sextuples {(act,afr,ait,bct,bfr,bit)}, where
  # either of the two halves may be the placeholder {(0,0,'-')},
  # meaning that the other half was left unpaired.
  #
  # The counts of each list are asserted to be in non-increasing order
  # (only for the consecutive entries that the matching loop examines).
  #
  # The merge is a divide-and-conquer procedure: in a pair of index
  # ranges it finds a "stable" pair {(ja,jb)}, merges the entries before
  # it, emits the pair, then merges the entries after it. It is driven
  # by an explicit work stack instead of recursion, because a recursive
  # formulation goes one level deeper per matched pair and overflows
  # the Python call stack on inputs with more than ~1000 lines.

  na = len(afrs)
  nb = len(bfrs)
  sextets = [] # The result.

  def find_stable_pair(ra, sa, rb, sb):
    # Given non-empty ranges {ra..sa} and {rb..sb}, returns indices
    # {(ja,jb)} such that neither the next {a} entry is strictly closer
    # to {bfrs[jb]} than {afrs[ja]}, nor the next {b} entry is strictly
    # closer to {afrs[ja]} than {bfrs[jb]}.
    ja = ra; jb = rb
    while True:
      if ja < sa: assert acts[ja] >= acts[ja+1], "file 0 is out of order"
      if jb < sb: assert bcts[jb] >= bcts[jb+1], "file 1 is out of order"
      if ja < sa and abs(bfrs[jb] - afrs[ja]) > abs(bfrs[jb] - afrs[ja+1]):
        ja += 1; continue
      if jb < sb and abs(afrs[ja] - bfrs[jb]) > abs(afrs[ja] - bfrs[jb+1]):
        jb += 1; continue
      break
    assert ja <= sa and jb <= sb
    return ja, jb
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  # Pending work, last entry first. A 4-tuple {(ra,sa,rb,sb)} asks to
  # merge entries {ra..sa} with {rb..sb}; a 2-tuple {(ja,jb)} asks to
  # emit the matched pair of entries {ja} and {jb}.
  pending = [ (0, na-1, 0, nb-1) ]
  while pending:
    task = pending.pop()
    if len(task) == 2:
      ja, jb = task
      sextets.append((acts[ja], afrs[ja], aits[ja], bcts[jb], bfrs[jb], bits[jb],))
      continue

    ra, sa, rb, sb = task
    assert ra >= 0 and sa < na, "bug ra,sa"
    assert rb >= 0 and sb < nb, "bug rb,sb"
    if ra > sa:
      # Dump all {bfrs} entries, if any:
      for jb in range(rb,sb+1):
        sextets.append(( 0, 0, '-', bcts[jb], bfrs[jb], bits[jb], ))
    elif rb > sb:
      # Dump all {afrs} entries:
      for ja in range(ra,sa+1):
        sextets.append(( acts[ja], afrs[ja], aits[ja], 0, 0, '-', ))
    else:
      ja, jb = find_stable_pair(ra, sa, rb, sb)
      # Pushed in reverse, so that the entries before the pair are
      # processed first, then the pair itself, then the entries after:
      pending.append((ja+1, sa, jb+1, sb))
      pending.append((ja, jb))
      pending.append((ra, ja-1, rb, jb-1))

  return sextets
  # ----------------------------------------------------------------------

def read_counts_and_freqs_file(fname):
  # Reads the file {fname}, assumed to be in UTF-8 encoding, whose data
  # lines have the form "{COUNT} {FREQ} {ITEM}". Blank lines and
  # #-comments are ignored.
  #
  # Returns three parallel lists {counts,freqs,items}, with one element
  # per data line, in file order. The counts and freqs are floats, the
  # items are strings.
  #
  # Reports the file name and the line tallies to {stderr}.

  # An unsigned decimal number with at least one digit. (A lone "." is
  # excluded on purpose: it would pass a laxer pattern and then crash
  # {float} instead of being reported as a bad line.)
  pat_num = r"(?:[0-9]+[.]?[0-9]*|[.][0-9]+)"
  pat_line = f"[ ]*({pat_num})[ ]+({pat_num})[ ]+([^ ]+)"

  ndata = 0 # Count of data lines.

  counts = []  # List of counts.
  freqs = []   # List of freqs.
  items = []   # List of items.

  def process_input_line(nline, line):
    nonlocal ndata
    #
    # Parses a line {line} assuming it is line {nline} of the file.
    # The {line} is always a string (never {None}), but may be "" if the line
    # is empty.
    # 
    # Ignores the line if it is a blank or #-comment.
    # 
    # Otherwise the line must be a data line, matching {pat_line}. In
    # that case, increments {ndata}, extracts {count}, {freq} and {item} from
    # the line, and appends them to {counts}, {freqs}, and {items}.
    
    # Should we debug the line?
    debug = False
    
    def data_error(msg):
      # Reports {msg} for the current line through {file_line_error}.
      # NOTE(review): {file_line_error} presumably does not return; the
      # {assert} below is a safeguard in case it does -- confirm.
      file_line_error(fname, nline, msg, line)
      assert False
      # ----------------------------------------------------------------------

    assert line is not None, "The {line} arg must not be {None}" 
    
    line = line.rstrip()

    # Ignore comments and blank lines:
    if re.match(r" *([#]|$)", line): return

    ndata += 1

    m = re.fullmatch(pat_line, line)
    if m is None: 
      # Invalid line format.
      data_error(f"invalid line format for {pat_line = !r}")

    # Parse the line into count, freq, and item:
    assert m.lastindex == 3, f"bug {m.lastindex = }"
    count = float(m.group(1))
    freq = float(m.group(2))
    item = m.group(3) 
    if count < 0: data_error(f"invalid {count = :24.16e}")
    if freq < 0 or freq > 1: data_error(f"invalid {freq = :24.16e}")
    
    counts.append(count)
    freqs.append(freq)
    items.append(item)
    if debug: err.write(f"!~ {count = :.8f} {freq = :.8f} {item = !r}\n")
    return
    # ......................................................................

  # The {with} ensures that the file is closed even if a line is rejected.
  # NOTE(review): {basic_line_loop} presumably calls
  # {process_input_line(nline, line)} for each line of {rd} and returns
  # the number of lines read -- confirm against {process_funcs}.
  with open(fname, "r", encoding='utf-8') as rd:
    err.write(f"reading file '{fname}' ...\n")
    nread = basic_line_loop(rd, process_input_line)

  err.write(f"{nread:5d} total lines read\n")
  err.write(f"{ndata:5d} data lines found\n")

  return counts, freqs, items

  # ----------------------------------------------------------------------

def test_stuff():
  # Self-test: pairs up two small hand-made tables with
  # {pair_up_by_freqs} and prints the inputs and the merged table
  # to {stderr} for visual inspection.

  # Each entry is a {(count, freq)} pair, by decreasing count:
  a_data = [
    (9002, 0.90), (8002, 0.80), (7002, 0.70), (5002, 0.50),
    (4902, 0.49), (4202, 0.42), (3002, 0.30), (1202, 0.12),
  ]
  b_data = [
    (903, 0.90), (733, 0.73), (713, 0.71), (693, 0.69), (683, 0.68),
    (483, 0.48), (473, 0.47), (303, 0.30), (103, 0.10), (93, 0.09),
  ]

  acts = [ ct for ct, _ in a_data ]
  afrs = [ fr for _, fr in a_data ]
  aits = [ f"a{k:02d}" for k, _ in enumerate(a_data) ]

  bcts = [ ct for ct, _ in b_data ]
  bfrs = [ fr for _, fr in b_data ]
  bits = [ f"b{k:02d}" for k, _ in enumerate(b_data) ]

  err.write(f"{afrs = }\n")
  err.write(f"{bfrs = }\n")

  merged = pair_up_by_freqs(acts, afrs, aits, bcts, bfrs, bits)
  write_merged_counts_and_freqs(err, merged)
  return
  # ----------------------------------------------------------------------

# Command-line dispatch: the single argument "test" runs the built-in
# self-test; otherwise the first two arguments are the names of the
# files to merge. Missing arguments produce a usage message and exit
# status 1 instead of an {IndexError} traceback. Extra arguments are
# ignored.
if len(sys.argv) < 2 or (sys.argv[1] != "test" and len(sys.argv) < 3):
  err.write(f"usage: {sys.argv[0]} test\n")
  err.write(f"       {sys.argv[0]} A_FILE B_FILE\n")
  sys.exit(1)
elif sys.argv[1] == "test":
  test_stuff()
else:
  a_fname = sys.argv[1]
  b_fname = sys.argv[2]
  main(a_fname, b_fname)