#! /usr/bin/python3
# Last edited on 2026-03-06 08:08:22 by stolfi

# The command line arguments are the name {ivt_name} of an input file,
# a text unit type {utype} ("ch", "ps", "ec", "wp", "wc", etc), and a
# positive integer {tsize}. Optionally there may also be a keyword pattern
# {kword} (which may be "NONE") and a filename-safe version {ktag} of
# the same.
# 
# Reads a file "res/{ivt_name}.ivt". The file must be in pseudo-IVTFF
# format, where each line is "<{LOC}> {TEXT}", where {LOC} is a locus ID
# like "f105v.20" or "b2.1.050", and {TEXT} is a line of the transcribed
# text.
# 
# The unit type {utype} specifies the nature of the {TEXT} and of the
# units used when measuring line sizes. In particular, this script
# assumes that the {TEXT} has been cleaned-up according to the {utype}. See
# {clean_up_raw_text} in {size_position_funcs.py} for details.
# 
# In any case the input file is assumed to be in Unicode UTF-8 encoding,
# and so will be the output file.
#
# This script parses the {TEXT} into units of the specified {utype}
# with {spf.split_text_into_units}. Then it extracts and writes all strings
# that consist of {tsize} consecutive units. If the {kword} is not
# "NONE", considers only tuples that begin with a unit that matches the
# pattern {kword}.
#
# The {kword} pattern may include '^' and/or '$' and/or '\b' to specify
# that it must match a prefix, suffix, or whole unit.  This option
# is obviously not available (or meaningful) if {utype} is "ch" or "ec".
# In any case the {kword} will be tested only against the leading unit.
# It will never match two or more units.
#
# Writes the extracted tuples to a file
# "res/{ivt_name}-{tsize}-with-{ktag}.tup". If {kword} is "NONE" omits
# the "-with-{ktag}" part.
#
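# Each extracted tuple is written as a separate line with the format
# "{LOC} {PSIZE} {TPOS} {TSIZE} {TUPLE}" where {LOC} is the locus ID of
# the input file line, {PSIZE} is the total number of units in that
# line, {TSIZE} is the given {tsize}, {TUPLE} is the extracted tuple,
# and {TPOS} is the number of units in the input {TEXT} before that
# tuple.
#
# !!! NOTE: {output_tuple} below currently writes only the fields
# "{LOC} {PSIZE} {TPOS} {TUPLE}"; the {TSIZE} field is not written.
# Either this description or {output_tuple} should be fixed.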
#
# The {TUPLE} field of the output is the concatenation of the selected
# {tsize} units, with the {clean_sepa} separator defined by
# {spf.get_parsing_patterns(utype)}.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats
import size_position_funcs as spf

def main(ivt_name, utype, tsize, kword, ktag):
  # Extracts tuples of {tsize} consecutive units of type {utype} from
  # the pseudo-IVTFF file "res/{ivt_name}.ivt" and writes them, one per
  # line, to "res/{ivt_name}-{tsize}.tup", or to
  # "res/{ivt_name}-{tsize}-with-{ktag}.tup" when {kword} is given.
  #
  # Parameters:
  #   {ivt_name}  base name of the input file (no folder, no extension).
  #   {utype}     unit type, passed to {spf.get_parsing_patterns} and
  #               {spf.split_text_into_units}.
  #   {tsize}     number of units per tuple; must be in {1..20}.
  #   {kword}     regex that the first unit of a tuple must match (tested
  #               with {re.search} semantics), or {None} to accept all.
  #   {ktag}      file-name tag for {kword}; must be {None} if and only
  #               if {kword} is {None}.
  #
  # Argument errors are reported with {arg_error} and malformed input
  # lines with {file_line_error}; both are assumed not to return.
  # Summary counts and per-line statistics are written to {stderr}.
  
  if tsize < 1 or tsize > 20: arg_error(f"invalid {tsize = }")

  # Validate the {kword}/{ktag} pairing before touching any file:
  out_name = f"{ivt_name}-{tsize}"
  if kword is not None: 
    if ktag is None: arg_error(f"the {ktag = !r} argument cannot be omitted")
    out_name += f"-with-{ktag}"
  else:
    if ktag is not None: arg_error(f"spurious {ktag = !r} argument")

  src_file = f"res/{ivt_name}.ivt"
  tup_file = f"res/{out_name}.tup"

  # Compile the keyword pattern once; {None} means "accept every tuple".
  kword_pat = re.compile(kword) if kword is not None else None

  pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)
  
  tot_data = 0 # Count of data lines.
  tot_sepa = 0 # Count of separators in original texts.
  tot_unit = 0 # Total count of units in input file.
  tot_wtup = 0 # Total number of tuples extracted.
  
  psizes = []   # Number of units in each input line.
  ntupses = []  # Number of tuples in each input line.
  
  def process_input_line(nline, line):
    nonlocal tot_data, tot_unit, tot_sepa, tot_wtup
    # 
    # Processes a line {line} assuming it is line {nline} of the file.
    # The {line} is always a string (never {None}), but may be "" if the line
    # is empty.
    # 
    # Ignores the line if it is a blank, a #-comment, or an IVTFF page header.
    # 
    # Otherwise the line must be a data line, matching {pat_line};
    # increments {tot_data} and fails with {data_error} if it does not match.
    # 
    # Splits the text of the line into {psize} units of type {utype}
    # and writes with {output_tuple} every tuple of {tsize} consecutive
    # units whose first unit matches {kword} (all of them if {kword}
    # is {None}). Updates {tot_sepa,tot_unit,tot_wtup} accordingly.
    #
    # For each data line, appends its counts to {psizes,ntupses}.

    # Should we debug the input line?
    debug = False
    
    def data_error(msg):
      # Reports {msg} for the current input line. Must never return,
      # since callers (including {spf.split_text_into_units}) are
      # given this function as their error handler.
      file_line_error(src_file, nline, msg, line)
      raise AssertionError("{file_line_error} returned")
      # ----------------------------------------------------------------------

    assert line is not None, "The {line} arg must not be {None}" 
    
    line = line.rstrip()

    # Ignore comments and blank lines:
    if re.match(r" *([#]|$)", line): return

    # Just in case, ignore IVTFF page headers:
    if re.match(r"<f[0-9]+[rv][0-9]*>", line): return
    
    tot_data += 1

    m = re.match(pat_line, line)
    if m is None: 
      # Invalid line format.
      data_error("invalid line format")

    # Parse the line into locus ID and text:
    assert m.lastindex == 2, f"bug {m.lastindex = }"
    loc = m.group(1)
    text = m.group(2) 
    
    # Split the text into units and get the count of separators:
    units, ct_sepa = \
      spf.split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
    psize = len(units)
    tot_unit += psize
    tot_sepa += ct_sepa
    
    # Enumerate the tuples. A line with {psize} units has
    # {psize - tsize + 1} windows of {tsize} consecutive units
    # (none if the line is shorter than {tsize}):
    ct_wtup = max(0, psize - tsize + 1)
    ntups = 0 # Tuples found in this line.
    for iu in range(ct_wtup):
      if kword_pat is None or kword_pat.search(units[iu]):
        wtup = clean_sepa.join(units[iu:iu+tsize])
        output_tuple(tup_wr, loc, psize, iu, wtup)
        ntups += 1
    if debug: err.write(f"!~ {loc:<12s} units = {psize} tuples = {ntups}\n")

    tot_wtup += ntups
    psizes.append(psize)
    ntupses.append(ntups)

    return      
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  # Both files are UTF-8; the {with} closes them even if a line fails.
  with open(src_file, "r", encoding="utf-8") as src_rd, \
       open(tup_file, "w", encoding="utf-8") as tup_wr:
    tup_wr.write("# -*- coding: utf-8 -*-\n")
    err.write(f"reading file '{src_file}' ...\n")
    nread = basic_line_loop(src_rd, process_input_line)

  err.write(f"{nread:6d} lines read\n")
  err.write(f"{tot_data:6d} data lines found\n")
  err.write(f"{tot_sepa:6d} total separator chars\n")
  err.write(f"{tot_unit:5d} total units\n")
  # Averages are undefined (division by zero) if there were no data lines:
  if tot_data > 0:
    err.write(f"{tot_unit/tot_data:8.2f} avg units per input line\n")
  err.write(f"{tot_wtup:5d} total tuples written\n")
  if tot_data > 0:
    err.write(f"{tot_wtup/tot_data:8.2f} avg tuples per input line\n")
  
  # Per-line statistics make no sense without data lines either:
  if tot_data == 0: return
  
  vnum, vtot, *_ = compute_and_print_stats("units per data linee", psizes)
  assert vnum == tot_data
  assert vtot == tot_unit
  
  vnum, vtot, *_ = compute_and_print_stats("tuples per data linee", ntupses)
  assert vnum == tot_data
  assert vtot == tot_wtup

  return
  # ----------------------------------------------------------------------

def output_tuple(tup_wr, loc, psize, iu, wtup):
  # Appends to {tup_wr} one record for the tuple {wtup}, which was
  # taken from the input line with locus ID {loc}, that has {psize}
  # units in total, starting at index {iu} of its unit list.
  #
  # The record is "{loc} {psize} {iu} {wtup}" plus a newline, with
  # {loc} left-justified in 12 columns and the two counts
  # right-justified in 5 columns each.

  # Sanity checks: the tuple must be non-empty and must not contain
  # blanks or ASCII control characters, which would break the record:
  assert wtup != "", "empty tuple"
  bad_char = re.search(r"[ \000-\037]", wtup)
  assert not bad_char, "blanks in tuple"
  
  record = f"{loc:<12s} {psize:5d} {iu:5d} {wtup}\n"
  tup_wr.write(record)
  # ----------------------------------------------------------------------

def test_stuff():
  # Placeholder for the self-test run by the "test" command line
  # argument. There are no tests yet, so it just reports that
  # through {arg_error}.
  msg = "no tests yet\n"
  arg_error(msg)
  # ----------------------------------------------------------------------

if __name__ == "__main__":
  # Command line driver. Usage is either the single argument "test",
  # or "{ivt_name} {utype} {tsize} [ {kword} [ {ktag} ] ]", where
  # {kword} and {ktag} may each be "NONE", "None", or "" to mean
  # "not given". Consistency of {kword} and {ktag} is checked by {main}.
  #
  # Usage errors are reported with {arg_error}, which is assumed not
  # to return.
  narg = len(sys.argv)
  if narg >= 2 and sys.argv[1] == "test":
    test_stuff()
  else:
    if narg < 4:
      arg_error("expected arguments: {ivt_name} {utype} {tsize} [ {kword} [ {ktag} ] ]")
    iarg = 1
    ivt_name = sys.argv[iarg]; iarg += 1
    utype = sys.argv[iarg]; iarg += 1
    try:
      tsize = int(sys.argv[iarg])
    except ValueError:
      arg_error(f"invalid tuple size {sys.argv[iarg]!r}")
    iarg += 1
    kword = None
    ktag = None
    if iarg < narg:
      kword = sys.argv[iarg]; iarg += 1
      if kword in ("NONE", "None", ""): kword = None
    # A missing {ktag} is left as {None} so that {main} can report it
    # properly instead of this code failing with an {IndexError}:
    if iarg < narg:
      ktag = sys.argv[iarg]; iarg += 1
      if ktag in ("NONE", "None", ""): ktag = None
    if iarg != narg: arg_error(f"spurious arguments {sys.argv[iarg:]!r}")
    main(ivt_name, utype, tsize, kword, ktag)