#! /usr/bin/python3
# Last edited on 2026-01-12 16:57:43 by stolfi

from math import sqrt, log, exp, sin, cos, hypot
import sys, re
from error_funcs import arg_error, file_line_error

# Reads from {stdin} a table that maps old-style locators "<{PAGE}.{UNIT}.{OSEQ}>" (field 1)
# to new-style ones "<{PAGE}.{NSEQ}>" (field 2), where {PAGE} is the f-number of a logical page
# like "f85v2" and {UNIT} is a text unit, like "P1" or "L2". Outputs a file with one line for each
# page and unit combination "{PAGE}.{UNIT}" with the {NSEQ}s of the lines that comprise
# that unit, in condensed format. E.g. "f85v2.N2 12-25,28,31-33"
#
# The input file may also have blank lines and lines that begin with '#',
# which are ignored.
#
# NOTE(review): this file was restored from a copy whose line breaks had
# been collapsed and whose middle part was lost. The definitions of
# {write_output_preamble}, {write_output_postamble}, {finish_page} and
# {clear_counts}, which are called below, are NOT present here; see the
# marked gap after {process_file_line}.

def main():
  # Reads the table from {stdin} (decoded as ISO-8859-1), feeds each line
  # to {process_file_line}, closes the last page if there was one, and
  # writes the output postamble. Returns 0.

  # Ad-hoc test of the sorting and condensing routines, kept for reference:
  # tlist = [ "11", "17", "13", "14", "12", "15", "0", "16", "13b", "13cc", "13a","13cb", "18", "19"]
  # print(tlist)
  # tlist.sort(key=seq_to_numeric);
  # print(tlist)
  # print(condense_seqs(tlist))
  # return 0;

  sys.stdin.reconfigure(encoding='iso-8859-1')

  write_output_preamble()
  st = initial_state()

  # Loop on input lines:
  for line in sys.stdin:
    st['nread'] += 1
    process_file_line(st, line)

  # {initial_state} sets the page to {None}, so the former test
  # {st['page'] != ""} was always true, even when no page had been seen.
  # Only close a page if one was actually started.
  if st['page']:
    finish_page(st)

  write_output_postamble(st)
  return 0
  # ......................................................................

def initial_state():
  # Sets up the parsing state as of the start of the file.
  # Returns a dict with the fields described below.

  st = {}
  st['nread'] = 0          # Number of file lines read.
  st['page'] = None        # Current page f-number, or {None} before the first page.
  st['unit_oseqs_tb'] = {} # Keys are unit names and values are lists of {OSEQ}s.
  st['unit_nseqs_tb'] = {} # Keys are unit names and values are lists of {NSEQ}s.

  st['ct_file'] = {}       # Various global counts.
  clear_counts(st['ct_file'])

  st['ct_page'] = {}       # Various counts per page.
  clear_counts(st['ct_page']) # Just in case.

  # Pattern to match a full non-comment table entry line:
  oloc_pat = r"<(f[0-9]+[rv][0-9]?)[.]([A-Z][0-9]*)[.]([0-9]+[a-z]?)>" # Old-style locus ID.
  nloc_pat = r"<(f[0-9]+[rv][0-9]?)[.]([0-9]+)>" # New-style locus ID.
  entry_pat = oloc_pat + " +" + nloc_pat + " *([#].*)?"
  st['entry_pat'] = re.compile(entry_pat)

  return st
  # ......................................................................

def process_file_line(st, line):
  # Processes one raw input {line}, updating the state {st}.
  # Tabs are turned into blanks and trailing whitespace is removed; the
  # cleaned line is saved in {st['line']} (it is what {data_error} shows).
  # Blank lines and lines whose first non-blank character is '#' are ignored.

  # The result of {re.sub} must be assigned: strings are immutable, and
  # the original call discarded it, so tabs were never replaced.
  line = re.sub(r"[\011]", " ", line)
  line = line.rstrip()
  st['line'] = line
  if re.match(r" *([\#]|$)", line):
    return

  # NOTE(review): TEXT LOST HERE. The rest of this function and every
  # definition between it and the tail of the sequence-condensing routine
  # (presumably {condense_seqs}, with its helpers {flush_bufs} and
  # {append_seq}) are missing from the copy this file was restored from.
  # That includes {finish_page}, {write_output_preamble},
  # {write_output_postamble} and {clear_counts}, all called elsewhere in
  # this file. The surviving residue is kept verbatim below; restore the
  # real code from the original file instead of guessing at it.
  #
  #   if re.match(r" buf1, f"seq nums repeated or out of order {buf1} {nseq}"
  #   flush_bufs();
  #   if buf0 == None: buf0 = nseq
  #   buf1 = nseq;
  #   return;
  #   # . . . . . . . . . . . . . . . . . . . . . . . .
  #   for seq in seqs: append_seq(seq)
  #   flush_bufs();
  #   return con;
  #
  # Fail loudly rather than silently dropping table entries.
  raise NotImplementedError("process_file_line: entry handling code is missing from this copy of the file")
  # ......................................................................

def seq_to_numeric(seq):
  # Given a locus sequence number like "23" or "12b", returns a float
  # that can be used to sort it. Namely, if there is no letter suffix,
  # the result is the integer value of {seq} as a float. Each suffix
  # letter adds one base-27 fractional digit ('a' = 1, ..., 'z' = 26),
  # so that "13" < "13a" < "13b" < "13cb" < "13cc" < "14".
  #
  # Fails with {AssertionError} if {seq} is not digits followed by
  # lowercase letters, or if the digits have a leading zero (the plain
  # number "0" is allowed).

  m = re.fullmatch(r"([0-9]+)([a-z]*)", seq)
  assert m is not None, f"invalid locus seq format '{seq}'"
  sint = m.group(1)
  if sint != "0":
    assert sint[0] != "0", f"leading zero in seq '{seq}'"
  res = float(sint)

  ufrc = 1.0 # Weight of the current fractional digit.
  for ch in m.group(2):
    ufrc = ufrc/27.0
    res += (ord(ch) - ord('a') + 1)*ufrc
  return res
  # ......................................................................

def out(str):
  # Writes the string {str} to {stdout}, with no added newline.
  sys.stdout.write(str)
  return
  # ......................................................................

def err(str):
  # Writes the string {str} to {stderr}, with no added newline.
  sys.stderr.write(str)
  return
  # ......................................................................

def data_error(st, msg):
  # Reports the error {msg} about the current input line, through
  # {file_line_error} with file name "-", line number {st['nread']} and
  # line text {st['line']}; then flushes {stdout} and {stderr} and aborts.
  #
  # NOTE(review): whether {file_line_error} itself returns is not visible
  # from this file. The abort is an explicit {raise} because the former
  # {assert False} is removed under {python -O}, which would let
  # processing continue after a data error.
  file_line_error("-", st['nread'], msg, st['line'])
  sys.stdout.flush()
  sys.stderr.flush()
  raise AssertionError(msg)
  # ......................................................................

if __name__ == "__main__":
  main()