#! /usr/bin/python3
# Last edited on 2026-01-12 16:57:43 by stolfi

from math import sqrt, log, exp, sin, cos, hypot;
import sys, re;
from error_funcs import arg_error, file_line_error

# Reads from {stdin} a table that maps old-style locators "<{PAGE}.{UNIT}.{OSEQ}>" (field 1)
# to new-style ones "<{PAGE}.{NSEQ}>" (field 2), where {PAGE} is the f-number of a logical page
# like "f85v2" and {UNIT} is a text unit, like "P1" or "L2".  Outputs a file with one line for each
# page and unit combination "{PAGE}.{UNIT}" with the {NSEQ}s of the lines that comprise
# that unit, in condensed format. E.g.  "f85v2.N2 12-25,28,31-33"
#
# The input file may also have blank lines and lines that begin with '#', 
# which are ignored.

def main():
  # Script entry point.  Reads the old-to-new locator table from {stdin}
  # (decoded as ISO-8859-1), groups its entries by page and text unit,
  # and writes to {stdout} a preamble, one units table per page, and a
  # final line with the file totals.  Returns 0.

  # Manual self-test of the sorting and condensing helpers (kept disabled):
  # tlist = [ "11", "17", "13", "14", "12", "15", "0", "16", "13b", "13cc", "13a","13cb", "18", "19"]
  # print(tlist)
  # tlist.sort(key=seq_to_numeric);
  # print(tlist)
  # print(condense_seqs(tlist))
  # return 0;

  sys.stdin.reconfigure(encoding='iso-8859-1')
  write_output_preamble()
  st = initial_state()
  # Loop on input lines:
  for line in sys.stdin:
    st['nread'] += 1
    process_file_line(st, line)
  # The "no page seen yet" marker set by {initial_state} is {None}, not "".
  # Comparing against "" was always true, so an input without any data
  # line produced a bogus (empty) table for a page called "None".
  if st['page'] is not None: finish_page(st)
  write_output_postamble(st)
  return 0
  # ......................................................................
 
def initial_state():
  # Builds and returns the parsing state dict, as it should be before
  # any line of the input file has been read.

  def zeroed_counts():
    # A fresh counts dict with every counter set to zero.
    ct = {}
    clear_counts(ct)
    return ct

  # Pieces of the pattern that matches a complete (non-comment) table entry:
  old_locus = r"<(f[0-9]+[rv][0-9]?)[.]([A-Z][0-9]*)[.]([0-9]+[a-z]?)>"  # Old-style locus ID.
  new_locus = r"<(f[0-9]+[rv][0-9]?)[.]([0-9]+)>"  # New-style locus ID.
  trailer = " *([#].*)?"  # Optional '#'-comment after the two IDs.

  return {
    'nread': 0,                   # Number of file lines read.
    'page': None,                 # Current page f-number; {None} before the first page.
    'unit_oseqs_tb': {},          # Maps unit name to list of {OSEQ}s (current page).
    'unit_nseqs_tb': {},          # Maps unit name to list of {NSEQ}s (current page).
    'ct_file': zeroed_counts(),   # Counts over the whole file.
    'ct_page': zeroed_counts(),   # Counts over the current page.
    'entry_pat': re.compile(old_locus + " +" + new_locus + trailer),
  }
  # ......................................................................
  
def process_file_line(st, line):
  # Handles one raw line {line} of the input file.  Tabs are turned into
  # blanks and trailing whitespace (including the newline) is removed;
  # the cleaned line is saved in {st['line']} for use in error messages.
  # Blank lines and lines whose first non-blank character is '#' are
  # ignored.  Lines that start like a locator "<f...." are handed to
  # {process_data_line}.  Anything else is reported through {data_error}.

  # {re.sub} returns the new string; its result must be kept.  It used to
  # be discarded, so tabs were never actually replaced.
  line = re.sub(r"[\011]", " ", line)
  line = line.rstrip()
  st['line'] = line
  if re.match(r" *([\#]|$)", line):
    # Comment or blank line:
    return
  if re.match(r"<f[0-9]*[rv][0-9]?[.]", line):
    process_data_line(st, line)
    return
  data_error(st, "unrecognized line format")
  # ......................................................................

def process_data_line(st, line):
  # Processes one data line {line} (an old-to-new table entry), already
  # stripped of trailing whitespace.  Parses the old-style and new-style
  # locators, checks that both refer to the same page, switches the
  # current page if needed (finishing the previous one first), and
  # appends the {OSEQ} and {NSEQ} to the lists of the entry's unit.
  # Malformed lines are reported through {data_error}.

  m = st['entry_pat'].fullmatch(line)
  if m is None: data_error(st, "invalid data line format")
  # The pattern has six groups: three from the old locator, two from the
  # new one, and the optional trailing comment.  Count them with
  # {m.groups()}: {m.lastindex} is the index of the last group that
  # actually matched, which is only 5 when the entry has no comment.
  assert len(m.groups()) == 6, f"bad num fields = {len(m.groups())}"
  page, unit, oseq, npage, nseq = m.group(1, 2, 3, 4, 5)
  if page != npage: data_error(st, f"mismatched pages in entry {page} {npage}")
  if page != st['page']:
    if st['page'] is not None: finish_page(st)
    start_page(st, page)
  # Count the locus only after the page switch.  Counting it before would
  # credit it to the previous page's table, and {start_page} would then
  # erase it from the counts of the page it really belongs to.
  bump_counts(st, 'loci', 1)
  if unit not in st['unit_oseqs_tb']:
    # First locus of this unit on this page:
    st['unit_oseqs_tb'][unit] = []
    st['unit_nseqs_tb'][unit] = []
    bump_counts(st, 'units', 1)
  st['unit_oseqs_tb'][unit].append(oseq)
  st['unit_nseqs_tb'][unit].append(nseq)
  return
  # ......................................................................

def start_page(st, page):
  # Makes {page} the current page of the state {st}: zeroes the per-page
  # counts and replaces both per-unit tables with fresh empty dicts.
  st['page'] = page
  clear_counts(st['ct_page'])
  for tbname in ('unit_oseqs_tb', 'unit_nseqs_tb'):
    st[tbname] = {}
  # ......................................................................

def clear_counts(ct):
  # Resets to zero, in place, the counters of the dict {ct} (which may
  # hold either the whole-file counts or the counts of the current page):
  #   'loci'   counts the data lines (locus ID pairs);
  #   'units'  counts the distinct text units.
  # Any other entries of {ct} are left alone.
  for ctname in ('loci', 'units'):
    ct[ctname] = 0
  # ......................................................................
 
def bump_counts(st, ctname, amt):
  # Adds {amt} to the counter called {ctname} in both the whole-file
  # counts and the current-page counts of the state {st}.
  for scope in ('ct_file', 'ct_page'):
    st[scope][ctname] += amt
  # ......................................................................
    
def finish_page(st):
  # Called when all entries of the current page have been read: writes
  # out the units table of that page from the data collected in {st}.
  page = st['page']
  counts = st['ct_page']
  write_page_units_table(page, counts, st['unit_oseqs_tb'], st['unit_nseqs_tb'])
  # ......................................................................

def write_output_preamble( ):
  # Writes to {stdout} the '#'-comment header of the output file, with a
  # legend of the columns of the per-page tables.  The column names here
  # must agree with the titles printed by {write_units_table_header};
  # the count column is titled 'NLOCS' there, so it is called that here
  # too (the legend used to say 'LINES').
  out("# Created by {compute_text_units_table.py}\n")
  out("# \n")
  out("# Columns:\n")
  out("# \n")
  out("#   UNIT   Code of text unit in old-style locus IDs\n")
  out("#   NLOCS  Number of text lines in this unit\n")
  out("#   NEW    Line numbers of this unit in new-style locus IDs\n")
  out("#   OLD    Line numbers of this unit in old-style locus IDs\n")
  out("# \n")
  return
  # ......................................................................

def write_output_postamble(st):
  # Writes to {stdout} a blank line followed by the closing line of the
  # output, with the whole-file totals of loci and units from {st}.
  out("\n")
  totals = st['ct_file']
  nloci = totals['loci']
  nunits = totals['units']
  out("TOTAL %d loci and %d units\n" % (nloci, nunits))
 
def write_page_units_table(page, ct, oseqs_tb, nseqs_tb):
  # Writes to {stdout} the units table of page {page}: a header, one
  # line per unit (in the order the units were entered in {oseqs_tb}),
  # and a footer.  {ct} holds the counts of the page; {oseqs_tb} and
  # {nseqs_tb} map each unit name to its lists of {OSEQ}s and {NSEQ}s.
  # The two tables must have the same keys, and {ct['units']} must be
  # the number of those keys.
  assert nseqs_tb.keys() == oseqs_tb.keys(), "units names mismatch"
  nunits = len(oseqs_tb)
  assert ct['units'] == nunits, "units count mismatch"
  write_units_table_header(page, ct, nunits)
  for unit, oseqs in oseqs_tb.items():
    write_units_table_line(unit, oseqs, nseqs_tb[unit])
  write_units_table_footer(page, ct, nunits)
  # ......................................................................
  
def write_units_table_header(page, ct, nunits):
  # Writes to {stdout} the heading of the table of page {page}: a summary
  # line with the loci and units counts taken from {ct}, a blank line,
  # and the line of column titles.  The {nunits} parameter is not used.
  out("page %-6s: %d loci and %d units\n" % (page, ct['loci'], ct['units']))
  out("\n")
  # Format and title of each column, left to right:
  columns = (
    (" | %5s",   'UNIT'),
    (" | %5s",   'NLOCS'),
    (" | %-30s", 'NEW'),
    (" | %-30s", 'OLD'),
  )
  for fmt, title in columns:
    out(fmt % title)
  out("\n")
  # ......................................................................
    
def write_units_table_line(unit, oseqs, nseqs):
  # Writes to {stdout} the table line of text unit {unit}: its name, its
  # number of loci, and the condensed lists of its new-style and
  # old-style sequence numbers, in that order.  The lists {oseqs} and
  # {nseqs} must have the same length.
  nlocs = len(oseqs)
  assert nlocs == len(nseqs), "unit loci count mismatch"
  out(" | %5s" % unit)
  out(" | %5d" % nlocs)
  # New-style numbers come first, then the old-style ones:
  for seqs in (nseqs, oseqs):
    out(" | %-30s" % condense_seqs(seqs))
  out("\n")
  # ......................................................................
  
def write_units_table_footer(page, ct, nunits):
  # Closes the table of a page by writing one blank line to {stdout}.
  # None of the parameters is currently used.
  out("\n")
  # ......................................................................

def condense_seqs(seqs):
  # Given a list {seqs} of locus sequence numbers of one unit (strings
  # like "23" or "12b"), returns a string with the condensed list, like
  # "1-30,30a,31,32,32b,33-40".  The numbers are taken in the order
  # defined by {seq_to_numeric}.  Runs of three or more consecutive plain
  # integers become "LO-HI"; a run of two is written "LO,HI"; numbers
  # with a letter suffix are always listed individually.
  #
  # The list {seqs} itself is not modified (it used to be sorted in
  # place, as a side effect).  Fails with an assertion error if some
  # element is malformed or if a plain integer is repeated.

  ordered = sorted(seqs, key=seq_to_numeric)
  items = []   # The `closed' items of the condensed list so far.
  lo = None    # Low end of the pending run of plain integers, or {None}.
  hi = None    # High end of the pending run of plain integers, or {None}.

  # Invariant: either {lo} and {hi} are both {None} and {items} is
  # complete so far, or both are integers with {lo <= hi} and every
  # integer in {lo..hi} still has to be added to {items}.

  def flush_run():
    # Moves the pending run, if any, to {items} and clears it.
    nonlocal lo, hi
    if hi is None: return
    assert lo is not None, "buf error"
    if lo == hi:
      items.append(str(lo))
    elif hi == lo + 1:
      # A run of just two numbers reads better as "LO,HI" than "LO-HI".
      items.append(str(lo))
      items.append(str(hi))
    else:
      items.append(str(lo) + '-' + str(hi))
    lo = None; hi = None
    # .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .

  for seq in ordered:
    m = re.fullmatch(r"([0-9]+)([a-z]*)", seq)
    assert m is not None, f"invalid loc seq number '{seq}'"
    if m.group(2) != "":
      # Has a letter suffix: never merged into a run.
      flush_run()
      items.append(seq)
    else:
      nseq = int(m.group(1))
      if hi is not None and nseq != hi + 1:
        # Cannot extend the pending run; since the input is sorted, the
        # only legitimate reason is a gap.
        assert nseq > hi, f"seq nums repeated or out of order {hi} {nseq}"
        flush_run()
      if lo is None: lo = nseq
      hi = nseq
  flush_run()
  return ",".join(items)
  # ......................................................................

def seq_to_numeric(seq):
  # Given a locus sequence number {seq} like "23" or "12b", returns a
  # float that can be used as its sorting key.  The integer part of the
  # result is the numeric prefix of {seq}.  The letter suffix, if any,
  # is read as a fraction in base 27 with digits 'a' = 1 through
  # 'z' = 26, so that "12" < "12a" < "12ab" < "12b" < "13".
  #
  # Fails with an assertion error if {seq} is not digits followed by
  # lowercase letters, or if the digits have a superfluous leading zero.
  m = re.fullmatch(r"([0-9]+)([a-z]*)", seq)
  assert m is not None, f"invalid locus seq format '{seq}'"
  digits, letters = m.group(1), m.group(2)
  assert digits == "0" or digits[0] != "0", f"leading zero in seq '{seq}'"
  value = float(digits)
  weight = 1.0  # Weight of the previous base-27 digit.
  for ch in letters:
    weight = weight/27.0
    value += (ord(ch) - ord('a') + 1)*weight
  return value
  # ......................................................................

def out(str):
  # Writes the string {str} to the standard output exactly as given
  # (no newline is added).
  sys.stdout.write(str)
  # ......................................................................
   
def err(str):
  # Writes the string {str} to the standard error output exactly as
  # given (no newline is added).
  sys.stderr.write(str)
  # ......................................................................

def data_error(st, msg):
  # Reports the error {msg} about the current input line, using the line
  # number {st['nread']} and the line text {st['line']}, then flushes
  # {stdout} and {stderr} and aborts with an {AssertionError}.  Never
  # returns normally; the callers rely on that.
  # NOTE(review): {file_line_error} comes from {error_funcs}; presumably
  # it just prints the message -- confirm that it does not exit by itself.
  file_line_error("-", st['nread'], msg, st['line'])
  sys.stdout.flush()
  sys.stderr.flush()
  # Raise explicitly instead of {assert False}: assert statements are
  # removed when Python runs with {-O}, which would let processing
  # continue after a bad line.
  raise AssertionError(msg)
  # ......................................................................

# Run only when executed as a script, so that the module can be imported
# (e.g. to exercise {condense_seqs}) without consuming {stdin}.
if __name__ == "__main__":
  main()