#! /usr/bin/python3
# Last edited on 2026-03-06 07:48:47 by stolfi

# Takes as arguments the name {ivt_name} of a transcription file,
# without the folder or ".ivt" extension, and a text unit type {utype}
# ("ch", "wc", etc.).
# 
# Reads the transcription file "res/{ivt_name}.ivt" in pseudo-IVTFF
# format. Writes a file "res/{ivt_name}.upp" with the raw size (number
# of units) of each paragraph.
# 
# Ignores #-comments and blank lines. 
#
# Each non-ignored input line must have the format "<{LOC}> {TEXT}"
# where {LOC} is a locus ID like "f105v.20" or "b2.1.050", and {TEXT} is
# a line of the transcribed text.
#
# The unit type {utype} specifies the nature of the {TEXT} and of the
# units used when measuring line sizes. In particular, this script
# assumes that the {TEXT} has been cleaned-up according to the {utype}.
# See {clean_up_raw_text} in {size_position_funcs.py} for details.
# 
# In any case the input file is assumed to be in Unicode UTF-8 encoding,
# and so will be the output file.
#
# Each output line has the format "{LOC} {PSIZE}" where
# {PSIZE} is the number of units of a specified type (hanzi characters,
# EVA characters, EVA words, pinyin syllables, ...) in the paragraph.
#

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats, name_for_tex_macro
import size_position_funcs as spf

def main(ivt_name, utype):
  # Reads the transcription file "res/{ivt_name}.ivt" and writes to
  # "res/{ivt_name}.upp" one line "{LOC} {PSIZE}" for each input data
  # line, where {PSIZE} is the number of units of type {utype} found in
  # the text of that line. Then prints the totals to {stderr} and calls
  # {write_TeX_parms_file} with the collected paragraph sizes.
  #
  # Both files are read/written as UTF-8, and both are closed even if
  # the processing fails midway. If the input has no data lines, the
  # average paragraph size is undefined; in that case the procedure
  # prints a message to {stderr} and exits with status 1.

  ivt_file = f"res/{ivt_name}.ivt"
  upp_file = f"res/{ivt_name}.upp"

  # Obtained before the files are opened, so that a failure here
  # does not leave behind a truncated output file:
  pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)

  tot_para = 0 # Count of data lines.
  tot_unit = 0 # Total count of units in input file.
  tot_sepa = 0 # Total chars removed in cleanup.

  psizes = []   # List of raw sizes of parags, for statistics.

  def process_input_line(nline, line):
    nonlocal tot_para, tot_unit, tot_sepa
    #
    # Parses a line {line} assuming it is line {nline} of the file.
    # The {line} is always a string (never {None}), but may be "" if the line
    # is empty.
    # 
    # Ignores the line if it is a blank or #-comment, or looks like
    # an IVTFF page header.
    # 
    # Otherwise the line must be a data line, matching {pat_line}.
    # 
    # Increments {tot_para} for each data line.
    # 
    # Splits the {text} into units according to {utype}, updating
    # {tot_sepa}, {tot_unit}. The number of units is the raw size
    # {psize_raw} of the paragraph.
    # 
    # Writes to the output file {upp_wr} one line for each input 
    # data line, and appends {psize_raw} to {psizes}.
    
    # Should we debug the parag?
    debug = False
    
    def data_error(msg):
      # Reports {msg} as an error on the current input line.
      # The {file_line_error} call is not expected to return.
      file_line_error(ivt_file, nline, msg, line)
      assert False
      # ----------------------------------------------------------------------

    assert line is not None, "The {line} arg must not be {None}" 

    # Ignore comments and blank lines:
    if re.match(r" *([#]|$)", line): return

    # Just in case, ignore IVTFF page headers:
    if re.match(r"<f[0-9]+[rv][0-9]*>", line): return

    tot_para += 1
    line = line.strip()

    m = re.fullmatch(pat_line, line)
    if m is None: 
      # Invalid line format.
      data_error(f"invalid line format for {pat_line = !r}")

    # Parse the line into locus ID and text:
    assert m.lastindex == 2, f"bug {m.lastindex = }"
    loc = m.group(1)
    text = m.group(2) 
    
    units, ct_sepa = spf.split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
    ct_unit = len(units)
    tot_unit += ct_unit
    tot_sepa += ct_sepa

    psize_raw = ct_unit

    psizes.append(psize_raw)
    if debug: err.write(f"!~ {loc:<12s} {ct_unit = :5d} {ct_sepa = :5d}\n")

    upp_wr.write(f"{loc:<12s} {psize_raw:5d}\n")
    return      
    # ......................................................................

  # The {with} statement guarantees that both files get closed, 
  # even when a data error aborts the loop:
  with open(ivt_file, "r", encoding='utf-8') as ivt_rd, \
       open(upp_file, "w", encoding='utf-8') as upp_wr:
    upp_wr.write("# -*- coding: utf-8 -*-\n")
    err.write(f"reading file '{ivt_file}' ...\n")
    nread = basic_line_loop(ivt_rd, process_input_line)

  err.write(f"{nread:5d} total lines\n")
  err.write(f"{tot_para:5d} parags found\n")
  err.write(f"{tot_sepa:6d} total separator chars\n")
  err.write(f"{tot_unit:5d} total units\n")

  if tot_para == 0:
    # No average can be computed, and there are no statistics to write.
    err.write(f"** no data lines found in '{ivt_file}'\n")
    sys.exit(1)

  err.write(f"{tot_unit/tot_para:8.2f} avg units/parag\n")
  
  write_TeX_parms_file(ivt_name, utype, psizes, tot_para, tot_unit)
  return
  # ----------------------------------------------------------------------

def write_TeX_parms_file(ivt_name, utype, psizes, tot_para, tot_unit):
  # Writes the file "res/{ivt_name}-upp-parms.tex" with 
  # parameter definitions for LaTeX.
  #
  # The list {psizes} must have the raw size of each paragraph;
  # {tot_para} must be the number of paragraphs and {tot_unit} the
  # total number of units, and both are checked against the statistics
  # returned by {compute_and_print_stats}. The {utype} argument is
  # currently not used.
  #
  # Each definition has the form "\def\{PREF}{NAME}{{VALUE}}" where
  # {PREF} is {name_for_tex_macro(ivt_name)}. Also prints to {stderr}
  # the max and min paragraph sizes relative to the average size.
  #
  # Fails with a division error if the average size is zero. All
  # values are computed before the file is opened, so such a failure
  # does not leave a partial file.
  
  sznum, sztot, szmin, szsin, szmax, szsax, szavg, szdev = \
    compute_and_print_stats("parag sizes", psizes)

  assert sznum == tot_para  
  assert sztot == tot_unit  

  szmax_rel = szmax/szavg
  err.write(f"{szmax_rel:6.2f} max relative parag size\n")

  szmin_rel = szmin/szavg
  err.write(f"{szmin_rel:6.2f} min relative parag size\n")

  txpref = name_for_tex_macro(ivt_name)
  
  tex_file = f"res/{ivt_name}-upp-parms.tex"

  # Explicit UTF-8, like the other files of this script; the {with}
  # statement closes the file even if a write fails:
  with open(tex_file, "w", encoding='utf-8') as tex_wr:
    tex_wr.write(f"\\def\\{txpref}NumParags{{{tot_para}}}\n")
    tex_wr.write(f"\\def\\{txpref}TotUnits{{{tot_unit}}}\n")
    tex_wr.write(f"\n") 
    tex_wr.write(f"\\def\\{txpref}PerParagMinUnits{{{szmin}}}\n")
    tex_wr.write(f"\\def\\{txpref}PerParagMaxUnits{{{szmax}}}\n")
    tex_wr.write(f"\\def\\{txpref}PerParagAvgUnits{{{szavg:.2f}}}\n")
    tex_wr.write(f"\\def\\{txpref}PerParagDevUnits{{{szdev:.2f}}}\n")
    tex_wr.write(f"\n") 
    # NOTE(review): {szsin,szsax} are presumably the second smallest
    # and second largest sizes -- confirm in {compute_and_print_stats}.
    tex_wr.write(f"\\def\\{txpref}PerParagSecMinUnits{{{szsin}}}\n")
    tex_wr.write(f"\\def\\{txpref}PerParagSecMaxUnits{{{szsax}}}\n")
    tex_wr.write(f"\n") 

    tex_wr.write(f"\\def\\{txpref}PerParagMinRelSize{{{szmin_rel:.2f}}}\n")
    tex_wr.write(f"\\def\\{txpref}PerParagMaxRelSize{{{szmax_rel:.2f}}}\n")

  return
  # ----------------------------------------------------------------------

def test_stuff():
  # Entry point for the "test" command line option. There are no
  # self-tests yet, so it just says so through {arg_error}.
  arg_error("no tests yet\n")
  # ----------------------------------------------------------------------

# Command line dispatch: the single argument "test" runs the
# self-tests; otherwise the arguments must be {ivt_name} and {utype}.
# Missing arguments are reported through {arg_error} instead of
# failing with an {IndexError}. Extra arguments are ignored.
if len(sys.argv) < 2:
  arg_error("missing arguments: expected 'test' or {ivt_name} {utype}\n")
elif sys.argv[1] == "test":
  test_stuff()
elif len(sys.argv) < 3:
  arg_error("missing the {utype} argument\n")
else:
  ivt_name = sys.argv[1]
  utype = sys.argv[2]
  main(ivt_name, utype)