#! /usr/bin/python3
# Last edited on 2026-02-22 18:19:15 by stolfi

# The command line arguments are a {book} ("bencao" or "starps"), the
# tag {bsub} of a subset of the same ("fu" for full, "gd" for good,
# etc), a text unit specifier {unit} ("ch", "ps", "ec", "wp", "wc", etc),
# and a positive integer {tsize}.

# Reads a file "res/{book}-{bsub}-{enc}.ivp" where {enc} is determined
# by the {unit} ("chu" for unit "ch", "pys" for "ps", and "eva" for
# "ec", "wc", and "wp"). The file must be in pseudo-IVTFF format, where
# each line is "<{LOC}> {TEXT}".
# 
# Extracts and writes all tuples of {tsize} consecutive units of the
# specified type. Writes them to a file
# "res/{book}-{bsub}-{unit}-{tsize}.tup" where each line has the format
# "{LOC} {PSIZE} {TPOS} {TSIZE} {TUPLE}" where {TUPLE} is a tuple of
# {TSIZE=tsize} units, {PSIZE} is the total number of units in the parag
# {LOC}, {TPOS} is the number of units in the parag before that tuple.
#
# Before the tuples are extracted, the {TEXT} is cleaned
# as per {spf.split_text_into_units}.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats
import size_position_funcs as spf

from note_077_funcs import enc_from_unit

def main(book, bsub, unit, tsize):
  # Extracts all tuples of {tsize} consecutive units of type {unit} from
  # the file "res/{book}-{bsub}-{enc}.ivp", where {enc} is the result of
  # {enc_from_unit(unit)}, and writes them, one per line, to the file
  # "res/{book}-{bsub}-{unit}-{tsize}.tup" (see {output_tuple} for the
  # line format). Prints summary counts to {stderr}.
  #
  # The {tsize} must be in {1..20}; otherwise {arg_error} is called.
  
  if tsize < 1 or tsize > 20: arg_error (f"invalid {tsize}")

  enc = enc_from_unit(unit)

  ivp_file = f"res/{book}-{bsub}-{enc}.ivp"
  tup_file = f"res/{book}-{bsub}-{unit}-{tsize}.tup"

  pat_line, pat_unit, pat_junk, pat_sepa, clean_sepa = spf.get_parsing_patterns(enc, unit)
  
  tot_para = 0 # Count of data lines.
  tot_sepa = 0 # Count of separators in original texts.
  tot_unit = 0 # Total count of units in input file.
  tot_wtup = 0 # Total count of tuples written.
  
  def process_input_line(nline, line):
    nonlocal tot_para, tot_unit, tot_sepa, tot_wtup
    # 
    # Processes a line {line} assuming it is line {nline} of the file.
    # The {line} must be a string (never {None}), but may be "" if the
    # line is empty.
    # 
    # Ignores the line if it is blank, a #-comment, or looks like an
    # IVTFF page header.
    # 
    # Otherwise the line must be a data line, matching {pat_line};
    # {tot_para} is incremented for each such line.
    # 
    # Splits the text into units with {spf.split_text_into_units},
    # adding the unit and separator counts to {tot_unit} and {tot_sepa}.
    # Then writes to {tup_wr} every tuple of {tsize} consecutive units,
    # joined with {clean_sepa}, and adds their count to {tot_wtup}.
    # A parag with fewer than {tsize} units yields no tuples.

    # Should we debug the parag?
    debug = False
    
    def data_error(msg):
      # Reports a problem with the current input line. The {assert}
      # is a safeguard in case {file_line_error} returns.
      file_line_error(ivp_file, nline, msg, line)
      assert False
      # ----------------------------------------------------------------------

    assert line is not None, "The {line} arg must not be {None}" 

    # Ignore comments and blank lines:
    if re.match(r" *([#]|$)", line): return

    # Just in case, ignore IVTFF page headers:
    if re.match(r"<f[0-9]+[rv][0-9]*>", line): return
    
    tot_para += 1

    m = re.match(pat_line, line)
    if m is None: 
      # Invalid line format.
      data_error("invalid line format")

    # Parse the line into locus ID and text:
    assert m.lastindex == 2, f"bug {m.lastindex = }"
    loc = m.group(1)
    text = m.group(2) 
    
    # Cleanup text and split it into units:
    units, ct_sepa = \
      spf.split_text_into_units(text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error)
    psize = len(units)
    tot_unit += psize
    tot_sepa += ct_sepa
    
    # A parag with {psize} units has {psize - tsize + 1} tuples of
    # {tsize} consecutive units, or none if it is too short:
    ct_wtup = max(0, psize - tsize + 1)
    for iu in range(ct_wtup):
      wtup = clean_sepa.join(units[iu:iu+tsize])
      output_tuple(tup_wr, loc, psize, iu, wtup)
    tot_wtup += ct_wtup
    if debug: err.write(f"!~ {loc:<12s} psize = {psize} wtups = {ct_wtup}\n")

    return      
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  # Both files are UTF-8; the {with} closes them even if a line fails.
  with open(ivp_file, "r", encoding="utf-8") as ivp_rd, \
       open(tup_file, "w", encoding="utf-8") as tup_wr:
    tup_wr.write("# -*- coding: utf-8 -*-\n")
    err.write(f"reading file '{ivp_file}' ...\n")
    nread = basic_line_loop(ivp_rd, process_input_line)

  # Averages are reported as zero when the input has no data lines:
  avg_unit = tot_unit/tot_para if tot_para > 0 else 0.0
  avg_wtup = tot_wtup/tot_para if tot_para > 0 else 0.0

  err.write(f"{nread:6d} lines read\n")
  err.write(f"{tot_para:6d} parags found\n")
  err.write(f"{tot_sepa:6d} total separator chars\n")
  err.write(f"{tot_unit:5d} total units\n")
  err.write(f"{avg_unit:8.2f} avg units/parag\n")
  err.write(f"{tot_wtup:5d} total tuples written\n")
  err.write(f"{avg_wtup:8.2f} avg tuples/parag\n")

  return
  # ----------------------------------------------------------------------

def output_tuple(tup_wr, loc, psize, iu, wtup):
  # Writes to {tup_wr} one line with the fields {loc} (left-aligned,
  # padded to 12 columns), {psize} and {iu} (each right-aligned in 5
  # columns), and the tuple {wtup}, separated by single blanks.
  #
  # The {wtup} must be non-empty and must not contain blanks or ASCII
  # control characters, since those would break the one-line format.
  #
  # NOTE(review): the file header describes a {TSIZE} field between
  # {TPOS} and {TUPLE}; it is not written here -- confirm which is right.

  # Sanity checks:
  assert not (wtup == ""), "empty tuple"
  bad_char = re.compile(r"[ \000-\037]").search(wtup)
  assert not bad_char, "blanks in tuple"
  
  out_line = "{:<12s} {:5d} {:5d} {}\n".format(loc, psize, iu, wtup)
  tup_wr.write(out_line)
  # ----------------------------------------------------------------------

def test_stuff():
  # Placeholder for the self-tests selected by the "test" command line
  # argument. There are none yet, so it only reports that fact
  # through {arg_error}.
  msg = "no tests yet\n"
  arg_error(msg)
  # ----------------------------------------------------------------------

# Command line entry point: either the single argument "test", or the
# four arguments {book} {bsub} {unit} {tsize}. Extra arguments are
# ignored, as before.
if len(sys.argv) >= 2 and sys.argv[1] == "test":
  test_stuff()
elif len(sys.argv) < 5:
  # Missing arguments used to fail with a bare {IndexError}.
  arg_error(f"usage: {sys.argv[0]} {{book}} {{bsub}} {{unit}} {{tsize}}")
  # In case {arg_error} returns, still stop with a failure status:
  sys.exit(1)
else:
  book = sys.argv[1]
  bsub = sys.argv[2]
  unit = sys.argv[3]
  try:
    tsize = int(sys.argv[4])
  except ValueError:
    arg_error(f"invalid tsize '{sys.argv[4]}': must be an integer")
    # In case {arg_error} returns, fail as the original code did:
    raise
  main(book, bsub, unit, tsize)