#! /usr/bin/python3
# Last edited on 2026-01-20 10:55:41 by stolfi

import sys
import os
import re
from sys import stdout as out, stderr as err

from process_funcs import basic_line_loop
import error_funcs as ef

# Reads from standard input a file produced by {words_from_ivt.py} from
# some ".ivt" transcription file.
#
# Collects all the tokens of each parag into a single list, then looks
# for /q-tokens/ -- tokens that start with 'qo' or 'oqo'.
#
# Prints to stdout one or more entries for each parag. Each entry is
# "{TYPE} {COUNT}" where {TYPE} is one of 'BE', 'BQ', 'QQ', or 'QE'.
#
# A 'BQ' entry says that there are {COUNT} tokens between the start of
# the parag and the first q-token in it. In particular, {COUNT} is zero
# if the first token of the parag is a q-token.
#
# A 'QE' entry says that there are {COUNT} tokens between the last
# q-token in the parag and its end. In particular, {COUNT} is zero if
# the last token of the parag is a q-token.
#
# A 'QQ' entry is printed for every pair of successive q-tokens,
# to say that there were {COUNT} tokens between them.
# Two consecutive tokens that are both q-tokens generate
# an entry with zero count.
#
# Finally a 'BE' entry is produced if the parag has {COUNT} tokens but
# not a single one of them is a q-token.


def main():
    """Read token lines from stdin, report q-token gaps, print statistics.

    Each non-blank, non-comment input line must have the five fields
    "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}".  Consecutive lines with the
    same positive {IPAR} are gathered into one parag, which is handed to
    {list_qq_gaps} when {IPAR} changes or the input ends.  Lines whose
    {IPAR} is zero are reported on stderr and ignored.

    Summary counts are written to stderr at the end.
    """
    sys.stdout.reconfigure(encoding='iso-8859-1')
    sys.stdin.reconfigure(encoding='iso-8859-1')

    toks = []          # Tokens of the parag being collected.
    locid_head = None  # Locus ID of the first line of that parag (recorded only).
    cur_ipar = -1      # Parag index of that parag; -1 before the first data line.

    npar_tot = 0  # Number of parags processed.
    ntok_tot = 0  # Total number of tokens in parags.
    nqtk_tot = 0  # Number of q-tokens in parags.
    nbad_tot = 0  # Number of bad tokens in parags.
    ngap_tot = 0  # Number of gaps reported.

    # The line pattern is the same for every line, so build it only once.
    locid_pat = r"([a-zA-Z0-9.,;]+)"
    ct_pat = r"([0-9]+)"
    tok_pat = r"([a-z?]+)"
    line_re = re.compile(
        f"{locid_pat}[ ]+{ct_pat}[ ]+{ct_pat}[ ]+{ct_pat}[ ]+{tok_pat}"
    )

    def flush_parag():
        # Reports the gaps of the parag in {toks} and updates the totals.
        nonlocal npar_tot, ntok_tot, nqtk_tot, nbad_tot, ngap_tot
        nq, nb, ng = list_qq_gaps(toks)
        npar_tot += 1
        ntok_tot += len(toks)
        nqtk_tot += nq
        nbad_tot += nb
        ngap_tot += ng

    def process_line(nread, line):
        # Handles one input {line}; {nread} is used as its line number
        # in diagnostics.
        nonlocal toks, cur_ipar, locid_head

        def data_error(msg):
            ef.file_line_error("-", nread, msg, line)

        line = line.strip()
        if re.match(r"[ ]*([#]|$)", line):
            # Blank line or comment.
            return

        m = line_re.fullmatch(line)
        if m is None:
            data_error("bad format")
            # NOTE(review): {ef.file_line_error} presumably aborts; if it
            # ever returns, skip the line instead of crashing on {m.group}.
            return
        locid = m.group(1)
        ipar = int(m.group(2))
        # Groups 3 and 4 (line and token indices) are checked by the
        # pattern but are not needed here.
        token = m.group(5)

        if ipar != cur_ipar:
            if cur_ipar > 0:
                # Process current parag:
                flush_parag()
            else:
                assert len(toks) == 0
            # Reset parag data:
            toks = []
            cur_ipar = ipar
            locid_head = locid

        assert ipar == cur_ipar
        if ipar > 0:
            # Append token to current parag:
            toks.append(token)
        else:
            # Line not in parag -- ignore:
            err.write(f"-:{nread}: {locid} not in parag\n")

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # NOTE(review): {basic_line_loop} is assumed to call
    # {process_line(nread, line)} for each line of stdin -- confirm
    # against {process_funcs}.
    basic_line_loop(sys.stdin, process_line)

    if toks:
        # The last parag is still pending when the input ends.
        flush_parag()

    err.write(f"{npar_tot:5d} total parags processed\n")
    err.write(f"{ntok_tot:5d} total tokens parsed\n")
    err.write(f"{nbad_tot:5d} bad tokens rejected\n")
    err.write(f"{nqtk_tot:5d} q-tokens seen\n")
    err.write(f"{ngap_tot:5d} valid q-token gaps counted\n")

    if ntok_tot > 0:
        pq = nqtk_tot/ntok_tot
        pb = nbad_tot/ntok_tot
        pn = (ntok_tot - nbad_tot - nqtk_tot)/ntok_tot
        err.write(f"prob(q-token) = {pq:6.4f}\n")
        err.write(f"prob(bad token) = {pb:6.4f}\n")
        err.write(f"prob(normal) = {pn:6.4f}\n")
    else:
        # Avoid a division by zero when the input had no parag tokens.
        err.write("no tokens in parags -- probabilities undefined\n")
    return
    # ----------------------------------------------------------------------
def list_qq_gaps(toks):
    """Write to stdout the q-token gaps of one parag.

    Receives a non-empty list {toks} with the tokens of the parag, in
    order.  A token is a q-token if it is 'qo' or 'oqo' followed by any
    characters other than 'q'.  Any other token that contains 'q', or
    that starts with '?' or 'o?', is a bad token.

    Each gap is written as a line "{TYPE} {COUNT}", where {TYPE} is
    'BQ', 'QQ', 'QE', or 'BE' and {COUNT} is the number of tokens
    strictly inside the gap.  A gap that contains a bad token is not
    written.  Diagnostics go to stderr.

    Returns the counts:
      {nq} of q-tokens in the parag,
      {nb} of bad tokens,
      {ng} of good gaps written out.
    """
    ntok = len(toks)
    assert ntok > 0  # Callers are expected to pass only non-empty parags.

    # Trace the first and last two tokens of the parag on stderr.
    phead = ".".join(toks[0:2])
    err.write(f"¤¤ .{phead}\n")
    ptail = ".".join(toks[-2:])
    err.write(f"++ .{ptail}\n")

    q_token_re = re.compile(r"[o]?qo[^q]*")
    stray_q_re = re.compile(r"q")
    quest_head_re = re.compile(r"[o]?[?]")

    itok = -1        # Index of the q-token that opened the current gap; -1 = parag start.
    bad_gap = False  # True if the current gap contains a bad token.
    nq = 0           # Number of q-tokens in parag.
    ng = 0           # Number of good gaps reported.
    nb = 0           # Number of bad tokens in parag.

    # Index {ntok} is a virtual end-of-parag marker that closes the last gap.
    for jtok in range(ntok + 1):
        if jtok == ntok:
            # End of parag:
            gap_end = True
            gap_type = 'BE' if itok == -1 else 'QE'
        elif q_token_re.fullmatch(toks[jtok]):
            nq += 1
            gap_end = True
            gap_type = 'BQ' if itok == -1 else 'QQ'
        else:
            gap_end = False
            if stray_q_re.search(toks[jtok]):
                err.write(f"!! token @'{toks[jtok]}' has bad @q\n")
                bad_gap = True
                nb += 1
            elif quest_head_re.match(toks[jtok]):
                err.write(f"!! token @'{toks[jtok]}' starts with @?\n")
                bad_gap = True
                nb += 1

        if gap_end:
            if not bad_gap:
                # Tokens strictly between positions {itok} and {jtok}.
                gap_length = jtok - 1 - itok
                out.write(f"{gap_type} {gap_length:5d}\n")
                ng += 1
            # The next gap starts at this q-token with a clean slate.
            itok = jtok
            bad_gap = False

    return nq, nb, ng
    # ----------------------------------------------------------------------


if __name__ == "__main__":
    main()