#! /usr/bin/python3
# Last edited on 2026-04-30 22:46:30 by stolfi

# Original matching sections for the report -- using manual selection of 
# strict/liberal patterns, no penalty for liberal ones.

import sys, re, string
from sys import stderr as err
import html_gen as h
import html_report_funcs as hr
import match_multi_funcs as mmf
import size_position_funcs as spf
import analyze_starps_parag_funcs as anf
import bimatching_eval_funcs as bef
from math import sqrt, hypot, exp, log, floor, ceil, isfinite, isnan, inf, nan
  
def analyze_starps_parags_org(code_ch, segs_ch, kwords_ec):
  # Evaluates the good SPS parags as candidates for the SPS version of
  # one SBJ entry, using the original scoring (no keyword penalty).
  # 
  # Arguments:
  # 
  #   {code_ch}       four-letter code of the SBJ entry (not used in the body).
  #   {segs_ch}       macro-parsing of the SBJ entry into gaps and hits.
  #   {kwords_ec}     list of EVA RE pattern(s) for keywords in EVA text.
  #
  # The parameter {kwords_ec} must be a list (not a single string) of
  # {nh} RE patterns that are supposed to match in {cleantx_ec} the
  # Voynichese EVA translations of certain hanzi keywords.
  #
  # Let {nh} be the number of patterns {kwords_ec}, and {ng} be {nh+1}.
  # The macro-parsing {segs_ch} must have exactly {ns = ng + nh}
  # elements; the procedure fails with {AssertionError} otherwise.
  #
  # The procedure calls {anf.analyze_starps_parags} on the file of good
  # SPS parags {ivt_file}, giving it {nh} and the local {match_func}
  # that evaluates each parag for how well it matches the SBJ entry.
  #
  # For each parag considered, {match_func} is expected to receive a
  # version {cleantx_ec} of its text that has only lowercase EVA letters
  # [a-z?]. It then uses {mmf.match_multi_pattern} with the pattern list
  # {kwords_ec} to get a macro-parsing of {cleantx_ec} that is most
  # compatible with the macro-parsing {segs_ch} of the SBJ entry.
  #
  # The result of this procedure is whatever {anf.analyze_starps_parags}
  # returns: a list {parevs} of parag evaluation tuples (/parevs/) and a
  # dictionary {data} with various counts of the operation. There will
  # be a parev for each parag that may possibly match.
  
  ivt_file = "res/starps-gd-ec-par.ivt"

  # Check the type before the sizes, so that an old-style single-string
  # argument is reported by this assertion instead of by the size check:
  assert not isinstance(kwords_ec, str) # To catch old uses.

  ns = len(segs_ch)
  nh = len(kwords_ec); ng = nh + 1
  assert ns == ng + nh 

  def match_func(cleantx_ec):
    # SBJ-SPS entry matching function.
    #
    # The function splits the EVA text {cleantx_ec} at the occurrences
    # of the patterns {kwords_ec[0..nh-1]}, using {mmf.match_multi_pattern},
    # obtaining a macro-parsing {segs_ec} for it, consisting of {nh}
    # "hits" matched by the patterns and {ng} "gaps" between those hits.
    #
    # The function then computes a badness {score} for the parag as a
    # candidate for the SPS version of the SBJ entry. The sizes of the
    # gaps (even-indexed elements) in the SPS parag macro-parsing
    # {segs_ec[0..ns-1]} are compared with the sizes of the
    # corresponding gaps in the macro-parsing {segs_ch[0..ns-1]} of
    # the SBJ entry, individually and as total, taking into account the
    # average hanzi-to-EVA conversion factor and roundoff errors due to
    # the sizes being integers.
    #
    # The returned result is the {score}, the macro-parsings
    # {segs_ch[0..ns-1]}, {segs_ec[0..ns-1]}, and a {key_penalty} that
    # is always zero in this version. The parsing {segs_ec} may be
    # {None} (presumably when no acceptable parsing was found).

    def eval_segs_ec(segs_ec):
      # Scores one candidate macro-parsing {segs_ec} of the parag
      # against the macro-parsing {segs_ch} of the SBJ entry.
      assert len(segs_ec) == ns
      key_penalty = 0 # We don't have this for the original version.
      score = bef.compute_full_score_from_macro_parsings \
        (segs_ch, "ch", segs_ec, "ec", key_penalty)
      return score
      # ..................................................................

    # Search for {kwords_ec} in the {cleantx_ec}:
    segs_ec, score = mmf.match_multi_pattern(cleantx_ec, kwords_ec, eval_segs_ec)
    key_penalty = 0 # We don't have this for the original version.
    if segs_ec is not None: assert len(segs_ec) == ns
    return score, segs_ch, segs_ec, key_penalty
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  parevs, data = anf.analyze_starps_parags(ivt_file, nh, match_func)
  return parevs, data
  # ----------------------------------------------------------------------
  
def add_keywords_table(st, kwords_en, kwords_ch, kwords_ec):
  # Appends to {st} the heading "Keyword patterns:" (via {h.parags})
  # followed by a table (via {h.table}) with one row per keyword.
  #
  # Each row has five cells: the English keyword, a "||" separator,
  # the hanzi pattern, another "||" separator, and the EVA pattern.
  # The rows are taken in parallel from {kwords_en}, {kwords_ch},
  # and {kwords_ec}.

  h.parags(st, "Keyword patterns:")

  # Bold double-bar cell placed between the data columns:
  sep = '&nbsp;<b>||</b>&nbsp;'
  rows = [
    (kw_en, sep, kw_ch, sep, kw_ec)
    for kw_en, kw_ch, kw_ec in zip(kwords_en, kwords_ch, kwords_ec)
  ]

  # Attributes for the five columns: the first one is padded, the 
  # other four are just left-aligned.
  padded_left = "style='padding-left:4ch; padding-right:4ch; text-align:left;'"
  col_mods = [ padded_left ] + [ "align=left" ] * 4

  h.table(st, rows, col_mods = col_mods, centered = False)
  # ----------------------------------------------------------------------

def add_bencao_parsing_section(st, loc_ch, kwords_ch, segs_ch):
  # Appends to {st} a section titled "SBJ entry parsing:" showing the
  # macro-parsing {segs_ch} of the SBJ entry at {loc_ch}, as formatted
  # by {anf.format_macro_parsing_ch}. The keywords to be highlighted
  # are described by a single pattern derived from the list {kwords_ch}.

  # The {4} is presumably the heading level -- see {h.section}.
  h.section(st, 4, "SBJ entry parsing:")

  # Combined pattern for all the hanzi keywords:
  highlight_pat = get_keyword_highlight_pattern_org(kwords_ch)

  # Format the parsing, then make it safe for inclusion in the HTML:
  parsing_text = anf.format_macro_parsing_ch(loc_ch, segs_ch, highlight_pat)
  parsing_text = h.protect_html(parsing_text)

  h.append_preformatted(st, parsing_text, ind = 4, centered = False)
  # ----------------------------------------------------------------------
   
def get_ch_en_keyword_matching_templates(kwords_en, strict):
  # Given a list {kwords_en} of English keyword types, returns two
  # lists {kwords_ch, kwords_ec} with the corresponding hanzi and EVA
  # patterns, in the same order as {kwords_en}.
  #
  # Each pair of patterns is obtained from
  # {get_ch_ec_keyword_matching_patterns}, which receives the
  # {strict} flag unchanged.

  pairs = [
    get_ch_ec_keyword_matching_patterns(kw_en, strict)
    for kw_en in kwords_en
  ]
  kwords_ch = [ kw_ch for kw_ch, _ in pairs ]
  kwords_ec = [ kw_ec for _, kw_ec in pairs ]
  return kwords_ch, kwords_ec
  # ----------------------------------------------------------------------
 
def get_keyword_highlight_pattern_org(kwords):
  # Given a list {kwords} of keyword-matching RE patterns, returns a
  # single RE pattern {hipat} useful for highlighting keywords in a
  # text.
  #
  # Each pattern in {kwords} is split at every '|', and the distinct
  # alternatives are joined back with '|' in order of first
  # appearance. The order is kept deterministic because in an RE
  # alternation the leftmost alternative that matches wins; e.g.
  # '主|主治' and '主治|主' highlight different things in '主治'.
  #
  # NOTE: the splitting is purely textual, so a pattern with a '|'
  # inside a group or character class would be broken up incorrectly.
  #
  # Fails with {AssertionError} if the resulting pattern matches the
  # empty string (e.g. if {kwords} is empty or has an empty alternative).

  pats = []
  for kw in kwords:
    pats.extend(kw.split('|'))
  # {dict.fromkeys} removes duplicates while preserving the order:
  hipat = "|".join(dict.fromkeys(pats))
  assert re.match(hipat, "") is None, "hipat matches empty"
  return hipat
  # ----------------------------------------------------------------------

def get_ch_ec_keyword_matching_patterns(kw_en, strict):
  # Returns the hanzi and EVA keyword patterns {kw_ch,kw_ec}
  # corresponding to the English keyword type {kw_en}.
  #
  # The valid values of {kw_en} are "MAINLY-FOR", "MAIN-USES",
  # "LONG-TAKE", "LONG", "TAKE", and "QI". Keywords of the same family
  # (the first two, or the next three) have different hanzi patterns
  # but share the same EVA pattern.
  #
  # If {strict} is true, {kw_ec} is the narrow version of the EVA
  # pattern; otherwise it is the liberal version, which accepts more
  # spelling variants.
  #
  # Raises {AssertionError} if {kw_en} is not a valid keyword type.

  hanzi = {
    "MAINLY-FOR": '主',
    "MAIN-USES":  '主治',
    "LONG-TAKE":  '久服',
    "LONG":       '久',
    "TAKE":       '服',
    "QI":         '气',
  }

  if kw_en in ("MAINLY-FOR", "MAIN-USES"):
    kw_ec = 'daiin|dair|laiin' if strict else '[dlkrs][ao]ii?n|[dlkrs][ao]ir'
  elif kw_en in ("LONG-TAKE", "LONG", "TAKE"):
    if strict:
      kw_okaiin = 'q?[aoy]kaiin'
      kw_okeedy = 'q?[aoy]keed[aoy]'
    else:
      kw_okaiin = 'q?[aoy][ktd][ao]ii?n'
      kw_okeedy = '[aoy][ktd]ee?[dk][aoy]'
    kw_ec = kw_okeedy + "|" + kw_okaiin
  elif kw_en == "QI":
    kw_ec = 'chedy' if strict else '[cs]he?[kd][aoy]'
  else:
    # Raised explicitly (instead of {assert False}) so that the check
    # is still made when Python runs with assertions disabled ({-O}).
    raise AssertionError(f"invalid English keyword {kw_en}")
  return hanzi[kw_en], kw_ec
  # ----------------------------------------------------------------------