#! /usr/bin/python3
# Last edited on 2026-03-07 09:17:04 by stolfi

# The command line arguments are the name {ivt_name} of an input file,
# a text unit type {utype} ("ch", "ps", "ec", "wp", "wc", etc.), a
# keyword pattern {kword}, and a filename-safe version {ktag} of the
# same.
#
# Reads the file "res/{ivt_name}.ivt".  The file should be in an
# IVTFF-like format, with lines of the form "<{LOC}> {TEXT}".  Finds
# all occurrences of a given string or RE pattern {kword} in the
# {TEXT} of each line, assuming that its nature is that described by
# {utype}.
#
# Writes to "res/{ivt_name}-{ktag}.wpo" a line for each such
# occurrence.  Each line has the format "{LOC} {PSIZE} {WPOS}", where
# {LOC} is the input line's locus ID; {PSIZE} is the length of the
# {TEXT} in that line; and {WPOS} is the position of the occurrence of
# the word, namely the length of the part of the cleaned input {TEXT}
# that precedes that occurrence.
#
# The unit type {utype} specifies the nature of the {TEXT} and of the
# units used when measuring line sizes and match positions.  In
# particular, this script assumes that the {TEXT} has been cleaned up
# according to the {utype}.  See {clean_up_raw_text} in
# {size_position_funcs.py} for details.
#
# In any case the input file is assumed to be in Unicode UTF-8
# encoding, and so will be the output file.
#
# If {utype} is "ch" or "ec", the {kword} pattern is searched for in
# the {TEXT} as a substring.  In the "ec" case, {kword} must match
# only the characters '[a-z?]'; in the "ch" case it must match only
# simplified hanzi characters.  In either case it must not contain any
# punctuation or the special patterns '^', '$', and '\b'.  Raw parag
# sizes and positions are measured in EVA or hanzi characters.
#
# If {utype} is "ps", the {kword} pattern must match one or more
# pinyin syllables separated by single blanks.  The pattern will be
# modified so that it matches only one or more whole syllables.  The
# line size and word positions are counted in syllables.
# If {utype} is "wc" or "wp", the {kword} must match EVA characters
# [a-z?.].  It may use '.' or '\b' to force the matching of whole
# units (Voynichese tokens); otherwise the matched string may begin
# with a proper suffix of a unit, and/or may end with a proper prefix
# of a unit.
#
# In any case, the {PSIZE} is the total count of units in the {TEXT},
# and {WPOS} is the count of units of {TEXT} that precede the matched
# substring.  If {kword} matches a proper suffix of a unit, the
# unmatched part counts as half a unit in the raw position,
# irrespective of its length.

import sys, os, re
from sys import stderr as err

from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats, name_for_tex_macro
import size_position_funcs as spf

def main(ivt_name, utype, kword, ktag):
    # Main program.  Arguments:
    #
    #   {ivt_name} Name of input ".ivt" file without extension or folder.
    #   {utype}    Unit for raw parag lengths and word positions:
    #              "ch", "ps", "ec", "wc", "wp".
    #   {kword}    String or RE pattern to search for.
    #   {ktag}     String that identifies {kword} for output file names.

    in_file = f"res/{ivt_name}.ivt"
    out_file = f"res/{ivt_name}-{ktag}.wpo"

    # Unit-parsing patterns appropriate for this {utype}:
    pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)

    tot_line = 0  # Count of data lines.
    tot_unit = 0  # Total count of units in input file.
    tot_sepa = 0  # Total chars removed in cleanup.
    tot_wocc = 0  # Total occurrences of {kword} found.

    loc_list = []        # Locus IDs of the input lines, without [<>].
    psize_list = []      # Unit counts of the input lines.
    wpos_list_list = []  # Lists of raw word positions, one list per input line.

    def process_input_line(nline, line):
        # Parses {line} assuming it is line {nline} of the input file.
        # The {line} is always a string (never {None}), but may be ""
        # if the line is empty.
        #
        # Ignores the line if it is blank, a '#'-comment, or an
        # IVTFF-like page header.  Otherwise the line must be a data
        # line matching {pat_line}.  Increments {tot_line}, parses the
        # text into units as specified by {utype} (updating
        # {tot_sepa}, {tot_unit}), computes the list {wpos_list} of
        # raw positions of occurrences of {kword} (updating
        # {tot_wocc}), and appends the line's data to {loc_list},
        # {psize_list}, {wpos_list_list}.
        nonlocal tot_line, tot_unit, tot_sepa, tot_wocc

        debug = False  # Should we print this line's data to stderr?

        def data_error(msg):
            # Reports a format error in the current input line; does not return.
            file_line_error(in_file, nline, msg, line)
            assert False, "{file_line_error} should not return"
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

        assert line is not None, "The {line} arg must not be {None}"

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line):
            return

        # Just in case, ignore IVTFF page headers, whose locus has no '.'
        # (e.g. "<f1r>").  NOTE(review): the original tested
        # {re.match(r"", line)}, which matches EVERY string and thus
        # silently discarded all input lines; the pattern below assumes
        # the IVTFF convention that data-line loci contain a '.' --
        # confirm against the transliteration file format.
        if re.match(r" *[<][^.>]*[>]", line):
            return

        tot_line += 1

        m = re.match(pat_line, line)
        if m is None:
            # Invalid line format; {data_error} aborts the run.
            data_error("invalid line format")

        # Parse the line into locus ID and text:
        assert m.lastindex == 2, f"bug {m.lastindex = }"
        loc = m.group(1)
        text = m.group(2)

        # Split text into units:
        units, ct_sepa = spf.split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
        ct_unit = len(units)
        tot_unit += ct_unit
        tot_sepa += ct_sepa

        # Compute raw line size and raw occurrences of {kword}:
        psize = ct_unit
        wpos_list = spf.list_occurrences(kword, units, clean_sepa, utype, data_error)
        tot_wocc += len(wpos_list)

        # Store for processing at end:
        loc_list.append(loc)
        psize_list.append(psize)
        wpos_list_list.append(wpos_list)

        if debug:
            err.write(f"!~ {loc:<12s} units = {psize} occs = {wpos_list}\n")
        return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    err.write(f"reading file '{in_file}' ...\n")
    with open(in_file, "r", encoding="utf-8") as rd:
        nread = basic_line_loop(rd, process_input_line)

    assert len(loc_list) == tot_line
    assert len(wpos_list_list) == tot_line
    # Guard against an input with no data lines at all (the original
    # divided by zero here):
    avg_wocc = tot_wocc/tot_line if tot_line > 0 else 0.0

    err.write(f"{nread:6d} lines read\n")
    err.write(f"{tot_line:6d} data lines found\n")
    err.write(f"{tot_sepa:6d} total separator chars\n")
    err.write(f"{tot_unit:6d} total units\n")
    err.write(f"{tot_wocc:6d} total occurrences of {kword}\n")
    err.write(f"{avg_wocc:8.2f} avg occurrences/line\n")

    err.write(f"writing {out_file} with occurrences of '{kword}' ...\n")
    with open(out_file, "w", encoding="utf-8") as wr:
        wr.write("# -*- coding: utf-8 -*-\n")
        output_data(wr, loc_list, psize_list, wpos_list_list)

    write_TeX_parms_file(ivt_name, utype, ktag, psize_list, wpos_list_list, tot_line, tot_unit, tot_wocc)
    return
# ----------------------------------------------------------------------

def output_data(wr, loc_list, psize_list, wpos_list_list):
    # Writes to {wr} one line "{LOC} {PSIZE} {WPOS}" for each
    # occurrence in {wpos_list_list}, followed by a final blank line.
    # Lines with no occurrences produce no output.
    debug = False
    for loc, psize, wpos_list in zip(loc_list, psize_list, wpos_list_list):
        for wpos in wpos_list:
            if debug:
                err.write(f"!~ {loc:<12s} {psize = :6.2f} {wpos = :6.2f}\n")
            wr.write("%-12s %6.2f %6.2f\n" % (loc, psize, wpos))
    wr.write("\n")
    wr.flush()
    return
# ----------------------------------------------------------------------

def write_TeX_parms_file(ivt_name, utype, ktag, psize_list, wpos_list_list, tot_line, tot_unit, tot_wocc):
    # Writes the file "res/{ivt_name}-{ktag}-wpos-parms.tex" with
    # parameter definitions for LaTeX.

    # List and analyze the COUNT of occurrences per input file line:
    npl_list = [ len(wpos_list) for wpos_list in wpos_list_list ]
    nplnum, npltot, nplmin, nplsin, nplmax, nplsax, nplavg, npldev = \
        compute_and_print_stats("occurrences of the word per line", npl_list)
    assert nplnum == tot_line
    assert npltot == tot_wocc
    err.write("\n")

    # List and analyze all POSITIONS of the occurrences:
    wpo_list = [ wpo for sub in wpos_list_list for wpo in sub ]
    wponum, wpotot, wpomin, wposin, wpomax, wposax, wpoavg, wpodev = \
        compute_and_print_stats("Positions of word per line", wpo_list)
    assert wponum == tot_wocc

    # Count lines with and without the word:
    tot_line_with_word = sum(1 for sub in wpos_list_list if len(sub) > 0)
    tot_line_sans_word = tot_line - tot_line_with_word
    err.write(f"lines with word = {tot_line_with_word} without = {tot_line_sans_word}\n")

    unit_size = spf.hanzi_per_unit(utype)  # Nominal avg num of Chinese chars per unit.
    err.write(f"assumed avg hanzi per unit = {unit_size:8.2f}\n")

    txpref = name_for_tex_macro(f"{ivt_name}-{ktag}-word-pos")
    tex_file = f"res/{ivt_name}-{ktag}-wpos-parms.tex"
    with open(tex_file, "w", encoding="utf-8") as tex_wr:
        # These should match the defs from other TeX parms files:
        tex_wr.write(f"\\def\\{txpref}NumLines{{{tot_line}}}\n")
        tex_wr.write(f"\\def\\{txpref}TotUnits{{{tot_unit}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}LinesWith{{{tot_line_with_word}}}\n")
        tex_wr.write(f"\\def\\{txpref}LinesSans{{{tot_line_sans_word}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}HanziPerUnit{{{unit_size:.3f}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}Min{{{wpomin:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}Max{{{wpomax:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}Avg{{{wpoavg:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}Dev{{{wpodev:.2f}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}TotCt{{{tot_wocc}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}PerLineMinCt{{{nplmin}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineMaxCt{{{nplmax}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineAvgCt{{{nplavg:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineDevCt{{{npldev:.2f}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}PerLineSecMinCt{{{nplsin}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineSecMaxCt{{{nplsax}}}\n")
        tex_wr.write(f"\n")
    return
# ----------------------------------------------------------------------

def test_stuff():
    # Placeholder for unit tests.
    arg_error("no tests yet\n")
    return
# ----------------------------------------------------------------------

if __name__ == "__main__":
    # Guarded so that importing this module does not run the CLI.  The
    # original indexed {sys.argv[1]} unconditionally, crashing with
    # {IndexError} when run with no arguments.
    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        test_stuff()
    else:
        if len(sys.argv) != 5:
            arg_error(f"expected 4 arguments {{ivt_name}} {{utype}} {{kword}} {{ktag}}, got {len(sys.argv)-1}\n")
        ivt_name, utype, kword, ktag = sys.argv[1:5]
        main(ivt_name, utype, kword, ktag)