#! /usr/bin/python3
# Last edited on 2026-01-20 10:55:41 by stolfi

import sys, os, re
from sys import stdout as out, stderr as err
from process_funcs import basic_line_loop
import error_funcs as ef

# Reads a file produced by {words_from_ivt.py} from some ".ivt"
# transcription file.
# 
# Collects all the tokens of each parag into a single list, then looks
# for /q-tokens/ -- tokens that start with 'qo' or 'oqo' and contain
# no other 'q'.
#
# Prints to stdout one or more entries for each parag. Each entry is
# "{TYPE} {COUNT}" where {TYPE} is one of 'BE', 'BQ', 'QQ', or 'QE'.
#
# A 'BQ' entry says that there are {COUNT} tokens between the start of
# the parag and the first q-token in it. In particular, {COUNT} is zero
# if the first token of the parag is a q-token.
#
# A 'QE' entry says that there are {COUNT} tokens between the last
# q-token in the parag and its end. In particular, {COUNT} is zero if
# the last token of the parag is a q-token.
#
# A 'QQ' entry is printed for every pair of successive q-tokens, 
# to say that there were {COUNT} tokens between them. 
# Two consecutive tokens that are both q-tokens generate 
# an entry with zero count.
#
# Finally a 'BE' entry is produced if the parag has {COUNT} tokens but
# not a single one of them is a q-token.

def main():
  # Reads token records from {stdin}, one per line, in the format
  # "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}".  Blank lines and lines
  # starting with '#' are skipped.  Consecutive records with the same
  # positive {IPAR} are collected into one parag, and {list_qq_gaps} is
  # called on the token list of each parag.  Records with {IPAR} zero
  # are reported to {stderr} and otherwise ignored.
  #
  # At the end, writes the total counts to {stderr}, followed by the
  # fractions of q-tokens, bad tokens, and normal tokens among all
  # parag tokens (omitted if there were no parag tokens at all).

  sys.stdout.reconfigure(encoding='iso-8859-1')
  sys.stdin.reconfigure(encoding='iso-8859-1')

  toks = []          # Tokens of the parag being collected.
  locid_head = None  # {LOCID} of the first record of that parag (recorded only).
  cur_ipar = -1      # {IPAR} of the latest record; -1 before the first one.

  npar_tot = 0 # Number of parags processed.
  ntok_tot = 0 # Total number of tokens in parags.
  nqtk_tot = 0 # Number of q-tokens in parags.
  nbad_tot = 0 # Number of bad tokens in parags.
  ngap_tot = 0 # Number of gaps reported.

  # The record pattern is the same for every line, so compile it once:
  locid_pat = r"([a-zA-Z0-9.,;]+)"
  ct_pat = r"([0-9]+)"
  tok_pat = r"([a-z?]+)"
  line_re = re.compile(f"{locid_pat}[ ]+{ct_pat}[ ]+{ct_pat}[ ]+{ct_pat}[ ]+{tok_pat}")

  def flush_parag():
    # Reports the gaps of the parag in {toks} and updates the totals.
    nonlocal npar_tot, ntok_tot, nqtk_tot, nbad_tot, ngap_tot
    nq, nb, ng = list_qq_gaps(toks)
    npar_tot += 1
    ntok_tot += len(toks)
    nqtk_tot += nq; nbad_tot += nb; ngap_tot += ng
    # ..................................................................

  def process_line(nread, line):
    # Handles input line number {nread} whose contents is {line}.
    nonlocal toks, cur_ipar, locid_head

    def data_error(msg):
      ef.file_line_error("-",nread,msg,line)
      # ..................................................................

    line = line.strip()
    if re.match(r"[ ]*([#]|$)", line): return
    m = line_re.fullmatch(line)
    if m is None:
      # Presumably {file_line_error} aborts the program (TODO confirm);
      # in case it returns, skip the line instead of failing on {m.group}:
      data_error("bad format")
      return
    locid = m.group(1)
    ipar = int(m.group(2))
    token = m.group(5)
    if ipar != cur_ipar:
      if cur_ipar > 0:
        # Process current parag:
        flush_parag()
      else:
        # No parag was open, so nothing can have been collected:
        assert len(toks) == 0
      # Reset parag data:
      toks = []
      cur_ipar = ipar
      locid_head = locid
    if ipar > 0:
      # Append token to current parag:
      toks.append(token)
    else:
      # Line not in parag -- ignore:
      err.write(f"-:{nread}: {locid} not in parag\n")
    return
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  basic_line_loop(sys.stdin, process_line)
  if len(toks) != 0:
    # The last parag is not followed by a change of {IPAR}:
    flush_parag()

  err.write(f"{npar_tot:5d} total parags processed\n")
  err.write(f"{ntok_tot:5d} total tokens parsed\n")
  err.write(f"{nbad_tot:5d} bad tokens rejected\n")
  err.write(f"{nqtk_tot:5d} q-tokens seen\n")
  err.write(f"{ngap_tot:5d} valid q-token gaps counted\n")
  if ntok_tot > 0:
    pq = nqtk_tot/ntok_tot
    pb = nbad_tot/ntok_tot
    pn = (ntok_tot - nbad_tot - nqtk_tot)/ntok_tot
    err.write(f"prob(q-token)   = {pq:6.4f}\n")
    err.write(f"prob(bad token) = {pb:6.4f}\n")
    err.write(f"prob(normal)    = {pn:6.4f}\n")
  else:
    # The fractions are undefined (0/0) when the input has no parag tokens:
    err.write("no parag tokens found; probabilities undefined\n")
  return
  # ----------------------------------------------------------------------

def list_qq_gaps(toks):
  # Receives a non-empty list {toks} with the tokens of one parag.
  # Writes the q-token gaps of the parag to {stdout}, one per line, as
  # "{TYPE} {COUNT}", where {TYPE} is 'BQ', 'QQ', 'QE', or 'BE' and
  # {COUNT} is the number of tokens strictly inside the gap.
  #
  # A /q-token/ is one that consists of an optional 'o', then 'qo',
  # then any characters other than 'q'.  A /bad token/ is a token that
  # is not a q-token and either contains a 'q' or starts with '?' or
  # 'o?'.  A gap that contains any bad token is not written out and is
  # not counted.
  #
  # Also writes to {stderr} the first two and last two tokens of the
  # parag, and a warning for each bad token.
  #
  # Returns the counts:
  #   {nq} of q-tokens in the parag,
  #   {nb} of bad tokens
  #   {ng} of good gaps written out.
  ntok = len(toks)
  assert ntok > 0
  phead = ".".join(toks[0:2])
  err.write(f"¤¤ .{phead}\n")
  ptail = ".".join(toks[ntok-2:])
  err.write(f"++ .{ptail}\n")

  # Compile the token patterns once, outside the loop:
  qtok_re = re.compile(r"[o]?qo[^q]*")
  unsure_re = re.compile(r"[o]?[?]")

  itok = -1 # Index of the token that started the current gap; -1 is parag start.
  nq = 0 # Number of q-tokens in parag.
  ng = 0 # Number of good gaps reported.
  nb = 0 # Number of bad tokens in parag.
  bad_gap = False # Set to true if there are bad tokens in the current gap.
  for jtok in range(ntok + 1):
    # Index {ntok} is a sentinel for the end of the parag:
    tok = toks[jtok] if jtok < ntok else None
    if tok is None:
      # End of parag:
      gap_type = 'BE' if itok == -1 else 'QE'
    elif qtok_re.fullmatch(tok):
      nq += 1
      gap_type = 'BQ' if itok == -1 else 'QQ'
    else:
      # Ordinary or bad token; the current gap continues:
      gap_type = None
      if "q" in tok:
        err.write(f"!! token @'{tok}' has bad @q\n")
        bad_gap = True; nb += 1
      elif unsure_re.match(tok):
        err.write(f"!! token @'{tok}' starts with @?\n")
        bad_gap = True; nb += 1
    if gap_type is not None:
      # The gap from {itok} to {jtok} is complete:
      if not bad_gap:
        gap_length = jtok - 1 - itok
        out.write(f"{gap_type} {gap_length:5d}\n"); ng += 1
      itok = jtok
      bad_gap = False # Hopefully from now on...

  return nq, nb, ng
  # ----------------------------------------------------------------------

# Script entry point: {main} is called unconditionally, so it also
# runs if this file is imported as a module.
main()