#! /usr/bin/python3
# Last edited on 2025-07-08 13:21:29 by stolfi

from math import sqrt, log, exp, sin, cos, hypot;
import sys, re;
from error_funcs import arg_error, prog_error, file_line_error

# Reads from {stdin} a VMS transcription file with main lines in the
# format
#
#   "{LOC} {HEADMK}{STAR}{LIND}{WORDS}{RIND}{TAILMK}"
#
# where
#
#   {LOC} is "<{anything}>".
#
#   {HEADMK} is empty, or "<%>" to mark line {LOC} as the head of an assumed parag.
#
#   {STAR} is either empty or "<!{STID}>" where {STID} is "S{NN}",
#   and {NN} is the sequential number of a star within the page,
#   the starlet assigned to the line {LOC}. The parser also accepts
#   "<!NoS>", which marks a parag head that has no starlet.
#
#   {LIND} is either '=' to mean that the text of line {LOC} starts on the left rail
#   (left text margin), or '»' to mean it starts to the right of that
#   rail.
#
#   {WORDS} is the text of line {LOC}, being one or more IVTFF/EVA words separated
#   by '.' or ','.
#
#   {RIND} is either '=' to mean that the text of line {LOC} ends at or past the right
#   rail (right text margin), or '«' to mean it ends to the left
#   of that rail (a short line).
#
#   {TAILMK} is empty, or "<$>" to mark the line {LOC} as the tail of an assumed parag.
#
# The input file must also have a line before each page with format "<{PAGE}> ...".
# The rest of the line is ignored.
#
# The input file may also have blank lines and lines that begin with '#',
# which are ignored.
#
# Writes to {stdout} one line for each page, containing
# statistics of that page, such as total lines, lines with stars, lines
# with puffs, short lines, and assumed parag breaks.
#
# See the file "star-props.txt" for the identification and properties
# of the star specified by {STID}.

def main():
    # Reads the transcription from {stdin}, line by line, and writes the
    # statistics table to {stdout}: a preamble, the lines produced while
    # processing the pages, and a final "TOTAL" line with the file counts.
    # Returns 0.

    # The input is decoded as ISO-8859-1.
    sys.stdin.reconfigure(encoding='iso-8859-1')
    write_table_preamble()
    st = initial_state()

    # Loop on input lines:
    for line in sys.stdin:
        st['nread'] += 1
        line = line.rstrip()
        st['line'] = line  # Kept in the state for error messages.
        process_file_line(st, line)

    # Close the last page, if there was any page at all:
    if st['page'] != "":
        finish_page(st)
    write_table_line("TOTAL", st['ct_file'])
    return 0

# ......................................................................

def initial_state():
    # Returns a new dict with the parsing state as of the start of the file.
    st = {}
    st['nread'] = 0              # Number of file lines read.
    st['page'] = ""              # Current page f-number, or "" before the first page.
    st['last_was_tail'] = True   # Last line was a parag tail (or start of page).
    st['last_was_short'] = True  # Last line was a short line (or start of page).
    st['ct_file'] = {}           # Counts for the file.
    st['ct_page'] = {}           # Counts for the current page.
    clear_counts(st['ct_file'])
    clear_counts(st['ct_page'])
    st['stars_used'] = [False]*100  # Set to {True} for stars used in parags.
    # {process_data_line} and {write_parag_summary} use the key
    # 'parag_head' for the head line number, so it is initialized here too.
    # The key 'parag_lseq' is kept because the original code set it; it is
    # not read by any code visible in this file.
    st['parag_head'] = 0         # Line number of current parag's head.
    st['parag_lseq'] = 0
    st['parag_flaws'] = []       # Flaws of current parag, or [] if perfect.
    st['parag_nlin'] = 0         # Number of data lines in current parag.
    return st

# ......................................................................

def process_file_line(st, line):
    # Processes one input line {line} (already stripped of trailing
    # blanks): ignores blank lines and '#' comments, hands page header
    # lines to {process_page_header_line}, and everything else to
    # {process_data_line}.
    #
    # NOTE(review): the last part of this function was garbled in the
    # source; the dispatch to {process_data_line} is a reconstruction.

    # Tabs count as blanks. The result of {re.sub} must be assigned:
    # strings are immutable, so the call alone changes nothing.
    line = re.sub(r"[\011]", " ", line)
    if re.match(r" *([\#]|$)", line):
        return
    if re.match(r"[<>]f[0-9]+[rv][0-9]?[>]", line):
        process_page_header_line(st, line)
        return
    # Anything else must be a data line; {process_data_line} reports
    # lines that do not have the proper format.
    process_data_line(st, line)
    return

# ......................................................................

def process_page_header_line(st, line):
    # Processes a page header line "<{PAGE}> ...": closes the previous
    # page, if any, and starts the page {PAGE}. The rest of the line is
    # ignored.
    #
    # NOTE(review): the first part of this function was garbled in the
    # source. The pattern below and the {finish_page} call are a
    # reconstruction; without that call only the last page of the file
    # would ever be checked and written. Confirm against the original.
    m = re.match(r"<(f[0-9]+[rv][0-9]?)> *", line)
    if m is None:
        data_error(st, f"invalid page header '{line}'")
    page = m.group(1)
    if st['page'] != "":
        finish_page(st)
    start_page(st, page)
    return

# ......................................................................

def process_data_line(st, line):
    # Processes one data (VMS text) line {line}: parses it, checks it
    # against the parsing state {st}, and updates the file and page counts
    # and the state of the current parag.
    #
    # The line must have the form
    # "<{PAGE}.{LSEQ}[;{TRANS}]> {HEADMK}{STAR}{LIND}{WORDS}{RIND}{TAILMK}".
    # Format errors and inconsistencies are reported with {data_error}.
    # When the line is a parag tail, also writes the parag summary.
    bump_counts(st, 'text_lines', 1)
    m = re.fullmatch(r"<(f[0-9]+[rv][0-9]?)[.]([0-9]+)([;][A-Za-z0-9]+|)> *(<[%]>|)(<[!]S[0-9][0-9]>|<[!]NoS>|)([=»]?)([^ ].*[^ ])([=«])(<[$]>|) *", line)
    if m is None:
        data_error(st, "invalid line format")
    if m.lastindex != 9:
        prog_error("num fields = %d" % m.lastindex)
    page = m.group(1)
    lseq = int(m.group(2))
    # Group 3 is the transcriber code, currently ignored.
    headmk = m.group(4)
    stid = m.group(5)
    lind = m.group(6)
    text = m.group(7)
    rind = m.group(8)
    tailmk = m.group(9)

    if st['page'] == "":
        data_error(st, "missing page start line")
    if page != st['page']:
        data_error(st, f"page '{page}' in locator should be '{st['page']}'")

    # Reduce "<!S{NN}>" or "<!NoS>" to "S{NN}" or "NoS":
    if stid != "":
        stid = re.sub(r"[<>!]", "", stid)
    if stid != "" and stid != "NoS":
        # A real starlet; each star may be assigned only once per page.
        bump_counts(st, 'starlets', 1)
        stnum = int(stid[1:])
        if stnum < 1 or stnum > 99:
            data_error(st, f"invalid star number '{stid}'")
        if st['stars_used'][stnum]:
            data_error(st, f"star '{stid}' assigned twice")
        st['stars_used'][stnum] = True

    if lind == "»":
        data_error(st, "indented line")

    puffed = has_puffs(st, text)
    if puffed:
        bump_counts(st, 'puffed', 1)

    if headmk != "":
        # Line is marked as a parag head.
        bump_counts(st, 'heads', 1)
        st['parag_flaws'] = []
        st['parag_nlin'] = 1
        st['parag_head'] = lseq
        if not st['last_was_tail']:
            data_error(st, "parag head does not follow a tail")
        if stid == "":
            data_error(st, "parag head has no star indicator")
        if stid == "NoS":
            st['parag_flaws'].append("no starlet")
        # Here {st['last_was_short']} still refers to the previous line:
        if not st['last_was_short']:
            st['parag_flaws'].append("prev is long")
    else:
        # Line is not parag head.
        if st['last_was_tail']:
            data_error(st, "non-head follows a tail")
        if puffed:
            bump_counts(st, 'puffed_not_head', 1)
            st['parag_flaws'].append("internal puffs")
        if stid != "":
            data_error(st, f"non-head line has a star indicator '{stid}'")
        st['parag_nlin'] += 1

    if rind == "«":
        # Line is short.
        bump_counts(st, 'shorts', 1)
        if tailmk == "":
            data_error(st, "short line is not parag tail")
        st['last_was_short'] = True
    else:
        st['last_was_short'] = False

    if tailmk != "":
        # Line is marked as parag tail.
        if rind != "«":
            st['parag_flaws'].append("tail is long")
        write_parag_summary(page, lseq, st)
        st['last_was_tail'] = True
        if not st['parag_flaws']:
            bump_counts(st, 'perfect_parags', 1)
            bump_counts(st, 'perfect_lines', st['parag_nlin'])
    else:
        st['last_was_tail'] = False
    return

# ......................................................................

def start_page(st, page):
    # Resets the per-page part of the state {st} for the start of page
    # {page}: the page counts, the table of stars used, and the parag
    # state. The file counts are not changed.
    st['page'] = page
    clear_counts(st['ct_page'])
    st['stars_used'] = [False]*100
    st['last_was_short'] = True
    st['last_was_tail'] = True
    # Just in case:
    # The head line number is kept under the key 'parag_head' by
    # {process_data_line}, so that is the key that must be reset. The key
    # 'parag_lseq' is kept because the original code set it.
    st['parag_head'] = 0
    st['parag_lseq'] = 0
    st['parag_flaws'] = []
    st['parag_nlin'] = 0
    return

# ......................................................................

def clear_counts(ct):
    # Assumes that {ct} is a dict whose fields are counts of various things,
    # either global or for the current page. Sets all those counts to zero.
    ct['starlets'] = 0         # Number of assigned stars.
    ct['text_lines'] = 0       # Number of data (text) lines.
    ct['shorts'] = 0           # Number of short lines.
    ct['heads'] = 0            # Number of head lines (with "<%>").
    ct['puffed'] = 0           # Number of text lines with puffs (head or not).
    ct['puffed_not_head'] = 0  # Number of lines with puffs that are not head lines.
    ct['perfect_parags'] = 0   # Number of perfect parags.
    ct['perfect_lines'] = 0    # Number of lines in perfect parags.
    return

# ......................................................................

def bump_counts(st, cname, amt):
    # Increments the file and page counters with name {cname} by {amt}.
    st['ct_file'][cname] += amt
    st['ct_page'][cname] += amt
    return

# ......................................................................

def finish_page(st):
    # Called at the end of the current page: checks that the page ended
    # with a parag tail, writes the table line with the page counts, and
    # checks that the stars assigned in the page are exactly S01 to S{NN}
    # where {NN} is the number of starlets counted in the page.
    if not st['last_was_tail']:
        data_error(st, "page does not end with parag tail")
    write_table_line(st['page'], st['ct_page'])

    # Check if the starlet assignment is injective:
    # Find the highest star number used in the page (0 if none):
    stmax = 99
    while stmax > 0 and not st['stars_used'][stmax]:
        stmax = stmax - 1
    stct = st['ct_page']['starlets']
    # The messages below must be f-strings, otherwise the braces are
    # printed literally instead of the values.
    if stmax != stct:
        data_error(st, f"star count:maxindex mismatch {stct} 'S{stmax:02d}'")
    for sti in range(1, stmax+1):
        if not st['stars_used'][sti]:
            data_error(st, f"star 'S{sti:02d}' not assigned")
    return

# ......................................................................

def has_puffs(st, text):
    # Tests whether {text} has any [pfwz] letters outside comments.
    # Returns a boolean.
    #
    # The markers "<->", "<%>", "<$>" and inline comments "<!...>" are
    # removed first. Any '<' or '>' that remains after that is reported
    # with {data_error}.
    tc = re.sub(r"[<]([-%$]|[!][^<>]*)[>]", "", text)
    if re.search(r"[<>]", tc):
        data_error(st, "mismatched [<>] or bad comment in text")
    return re.search(r"[pfwz]", tc) is not None

# ......................................................................

def write_table_preamble():
    # Writes to {stdout} the comment lines that explain the table
    # columns, followed by the column headings.
    out("# Created by {compute_parag_stats.py}\n")
    out("# \n")
    out("# Columns:\n")
    out("# \n")
    out("# STR Number of lines with starlets assigned\n")
    out("# SHL Number of short lines\n")
    out("# PUF Number of lines with puffs\n")
    out("# XPF Number of non-head text lines with puffs\n")
    out("# HED Number of lines marked parag heads\n")
    out("# PRP Number of perfect parags\n")
    out("# TXL Number of text lines (excl. titles)\n")
    out("# PRL Number of lines in perfect parags\n")
    out("# IPL Number of lines in not in perfect parags\n")
    out("# \n")
    write_table_headings()
    return

# ......................................................................

def write_table_headings():
    # Writes to {stdout} the line with the column headings, with the
    # same column widths used by {write_table_line}.
    out("%-6s" % "page")
    for head in ('STR', 'SHL', 'PUF', 'XPF', 'HED', 'PRP'):
        out(" | %3s" % head)
    for head in ('TXL', 'PRL', 'IPL'):
        out(" | %4s" % head)
    out("\n")
    return

# ......................................................................

def write_parag_summary(page,lseq,st):
    # Called at the end of parag, on line {f"<{page}.{lseq}>"}.
    # Writes to {stdout} a '#' comment line with the locator of the
    # parag's head line, the number of lines, the range of line numbers,
    # and either "PERFECT" or the list of flaws of the parag.
    head = st['parag_head']
    out("# <%s.%d> %d lines %d-%d " % (page, head, st['parag_nlin'], head, lseq))
    if not st['parag_flaws']:
        out("PERFECT")
    else:
        out("not perfect: ")
        out(','.join(st['parag_flaws']))
    out(".\n")
    return

# ......................................................................

def write_table_line(page, ct):
    # Writes to {stdout} one table line labeled {page} with the counts
    # from the dict {ct}. The last column is the number of text lines
    # that are not in perfect parags.
    out("%-6s" % page)
    for cname in ('starlets', 'shorts', 'puffed', 'puffed_not_head', 'heads', 'perfect_parags'):
        out(" | %3d" % ct[cname])
    out(" | %4d" % ct['text_lines'])
    out(" | %4d" % ct['perfect_lines'])
    out(" | %4d" % (ct['text_lines'] - ct['perfect_lines']))
    out("\n")
    return

# ......................................................................

def out(str):
    # Writes the string {str} to {stdout}.
    sys.stdout.write(str)
    return

# ......................................................................

def err(str):
    # Writes the string {str} to {stderr}.
    sys.stderr.write(str)
    return

# ......................................................................

def data_error(st, msg):
    # Reports the error {msg} about the current input line, through
    # {file_line_error}, and then aborts with {AssertionError}. Never
    # returns.
    file_line_error("-", st['nread'], msg, st['line'])
    # Whether {file_line_error} returns is not visible in this file. An
    # explicit {raise} is used instead of {assert False} because asserts
    # are removed when Python runs with "-O", and the callers rely on
    # this function not returning.
    raise AssertionError(msg)

# ......................................................................

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()