#! /usr/bin/python3
# Last edited on 2026-01-22 11:42:30 by stolfi

# Library function that reads the parags property table of the SPS.
# To be imported by python programs.

from process_funcs import basic_line_loop
import re
import sys
from sys import stderr as err


def read_table(fname):
    # Reads the contents of the parags property table from file {fname}.
    #
    # Blank lines and #-comments in file {fname} are ignored.
    #
    # For each page there should be one line starting with "PLOC " that
    # has the header for the parag table of that page.
    #
    # Then for each parag in that page there must be a line
    # describing that parag, which must have the format
    #
    #   "{hloc} | P{nn} | {props}"
    #
    # where
    #
    #   {hloc} is "{fnum}.{hlseq}".
    #   {fnum} is the page's f-number.
    #   {hlseq} is the line seq number {lseq} of the parag's head line.
    #   {nn} is the seq number {pseq} of the parag in the page, as 2 digits.
    #   {props} are the parag properties.
    #
    # The parags of a page must be contiguous in the file and numbered
    # consecutively starting from "P01".
    #
    # There should be one line starting with "PAGE " that has the header
    # for the page summary.
    #
    # Then there must be a page summary line with various counts
    # for that page, such as number of lines, stars, parags, perfect
    # parags, etc.  This line must have the format
    #
    #   "{fnum} | TOT | {stats}"
    #
    # where {fnum} is the f-number of the page whose parags were read
    # last, and {stats} are the page statistics, in any format.
    #
    # At the end of the file there should be a file summary line that is
    # like the page summary line but has "TOTAL" instead of the {fnum},
    # and whose statistics are the totals for all pages.  No parag lines
    # or summary lines may follow it.
    #
    # Returns, in this order,
    #
    #   {hdr_page}  the last "PAGE " header line seen.
    #   {tbl_page}  table of page summary lines, indexed by {[fnum]},
    #               including the entry {["TOTAL"]}.
    #   {hdr_parag} the last "PLOC " header line seen.
    #   {tbl_parag} table of parag lines, indexed by {[fnum][pseq-1]}.
    #
    # For now, each table entry is the whole input line (stripped of
    # leading and trailing blanks), not just its {stats} or {props} part.
    #
    # On any format error, or if the file has no page summaries, no
    # "TOTAL" summary, or no parag entries, writes a message to {stderr}
    # and terminates the program with {sys.exit(1)}.  Also writes a
    # "!! start of page" notice to {stderr} for each new page.

    page_tbl = dict()
    page_hdr = None
    fnum_cur = None   # Page of the latest parag line, or "TOTAL" after the file summary.
    parag_hdr = None
    parag_tbl = dict()
    last_pseq = None  # Seq number of the latest parag of page {fnum_cur}.

    # Patterns are compiled once here instead of on every input line.
    blank_pat = re.compile(r"^[ \011]*([#]|$)")
    parag_pat = re.compile(r"(f[0-9a-z]+[.][0-9]+)[ ]*[|][ ]*(P[0-9][0-9])[ ]*[|][ ](.*)")
    # NOTE(review): this pattern ends with a blank after "(.*)".  Since the
    # line is stripped, it only matches if there is some blank after the
    # first "| ".  It also does not check for the "TOT" field.  Confirm intent.
    summary_pat = re.compile(r"(TOTAL|f[0-9a-z]+)[ ]*[|][ ](.*) ")

    def fail(nread, msg, line=None):
        # Reports error {msg} at line {nread} of {fname} and aborts the program.
        err.write(f"{fname}:{nread}: ** {msg}\n")
        if line is not None:
            err.write(f" [[{line}]]\n")
        sys.exit(1)

    def process_line(nread, line):
        # Callback for {basic_line_loop}: classifies and records one input line.
        nonlocal page_hdr, fnum_cur, parag_hdr, last_pseq

        line = line.strip()
        if blank_pat.match(line):
            return

        # Parse parag properties header:
        if line.startswith("PLOC "):
            parag_hdr = line
            return

        # Parse parag properties line:
        m = parag_pat.fullmatch(line)
        if m is not None:
            locid, Pnn = m.group(1, 2)
            # {locid} has exactly one "."; the f-number is the part before it.
            fnum = locid.partition(".")[0]
            if fnum_cur == "TOTAL":
                fail(nread, "parags after TOTAL", line)
            if fnum != fnum_cur:
                err.write(f"!! start of page {fnum}\n")
                if fnum in parag_tbl:
                    fail(nread, f"repeated page = {fnum}", line)
                parag_tbl[fnum] = []
                fnum_cur = fnum
                last_pseq = 0
            pseq = int(Pnn[1:])
            if pseq != last_pseq + 1:
                fail(nread, f"unexpected parag number = {Pnn} {last_pseq = }", line)
            parag_tbl[fnum].append(line)
            last_pseq = pseq
            return

        # Parse page summary header:
        if line.startswith("PAGE "):
            page_hdr = line
            return

        # Parse page summary line:
        m = summary_pat.match(line)
        if m is not None:
            fnum = m.group(1)
            if fnum_cur == "TOTAL":
                fail(nread, "page summary after TOTAL", line)
            if fnum != "TOTAL" and fnum != fnum_cur:
                fail(nread, f"summary for wrong page {fnum} {fnum_cur = }", line)
            if fnum in page_tbl:
                fail(nread, f"repeated page summary {fnum}", line)
            page_tbl[fnum] = line
            if fnum == "TOTAL":
                # Remember that the file summary was seen, so that the
                # "after TOTAL" checks above can actually trigger.
                fnum_cur = "TOTAL"
            return

        fail(nread, "invalid line format", line)

    # ....................................................................

    # The {with} ensures the file is closed even when a format error
    # aborts the reading through {sys.exit}.
    # Presumably {basic_line_loop} calls {process_line(nread, line)} for
    # each line of {rd} and returns the number of lines read -- confirm
    # in {process_funcs}.
    with open(fname, "r") as rd:
        nread = basic_line_loop(rd, process_line)

    if not page_tbl or page_hdr is None:
        fail(nread, f"no page summaries in table file {fname}")
    if "TOTAL" not in page_tbl:
        fail(nread, f"no TOTAL summary in table file {fname}")
    if not parag_tbl or parag_hdr is None:
        fail(nread, f"no parag entries in table file {fname}")

    return page_hdr, page_tbl, parag_hdr, parag_tbl
    # ----------------------------------------------------------------------