#! /usr/bin/python3
# Last edited on 2026-05-18 07:54:51 by stolfi

import os, sys, re
from sys import stderr as err, stdout as out
import write_parsing_funcs as wpf
import size_position_funcs as spf
from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite

# The procedures of this module are meant to evaluate a bimatching
# between the clean text {text0} of type {utype0} and the clean text
# {text1} of type {utype1}. The evaluation is summarized in a
# non-negative /badness score/ for the bimatching. The best possible
# bimatching has badness zero.
#
# The most important case is when {text0} is the clean text (hanzi only)
# of an SBJ entry, with {utype0 = "ch"}; and {text1} is the clean text
# (EVA letters [a-z?] only) of an SPS parag, with {utype1 = "ec"}.
#
# A bimatching {bimatch}, as produced by {match_bitemplate} in
# {bitemplate_match_funcs}, is a list of a certain number {nh} of
# /rungs/, quintuples {(i0,j0,i1,j1,bpvar)}. It describes two
# corresponding macro-parsings {segs0} of {text0} and {segs1} of
# {text1}. Each macro-parsing is a partition of the corresponding text
# into an odd number {ns = 2*nh+1} of /segments/, alternating {ng =
# nh+1} /gaps/ and {nh} /hits/.
#
# The full score of a bimatching is the sum of 
#
#  * a /gap size score/ that depends on the sizes 
#    of the {ng} gaps of {segs0} and {segs1}.
#
#  * a /key penalty/ which is the sum of the {pena} field
#    in the {pvar} field of the rungs of the bimatching.
#
# Note that the score does not depend on the actual texts or the sizes
# of the hits.
#

def compute_full_score_from_bimatching(nt0, utype0, nt1, utype1, bimatch):
  # Returns the full badness score of the bimatching {bimatch} between
  # two texts with sizes {nt0,nt1} and types {utype0,utype1}.
  #
  # The gap sizes of the two texts and the total key penalty are
  # obtained from {bmf.get_gap_sizes_from_bimatching}, and then combined
  # by {compute_full_score_from_gap_sizes}.
  #
  # NOTE(review): the name {bmf} is not bound by any import visible in
  # this file, so this call would fail with a {NameError}. Presumably it
  # should be the module {bitemplate_match_funcs} mentioned in the
  # header comments -- confirm and add the import.

  nh = len(bimatch) # Number of rungs; not otherwise used here.
  gap_data = bmf.get_gap_sizes_from_bimatching(nt0, nt1, bimatch)
  gsizes0, gsizes1, key_penalty = gap_data
  return compute_full_score_from_gap_sizes(gsizes0, utype0, gsizes1, utype1, key_penalty)
  # ----------------------------------------------------------------------

def compute_full_score_from_macro_parsings(segs0, utype0, segs1, utype1, key_penalty):
  # Returns the full badness score of a bimatching between two texts of
  # types {utype0,utype1}, given the macro-parsings {segs0,segs1} of the
  # texts that the bimatching defines, and the total keyword penalty
  # {key_penalty} of its rungs.
  #
  # The two macro-parsings must have the same number of segments, and
  # that number must be odd ({nh+1} gaps alternating with {nh} hits).
  # Only the lengths of the gap segments affect the result.

  ns = len(segs0)
  assert len(segs1) == ns # Same number of segments in both parsings.
  assert ns % 2 == 1      # That is, {ns = 2*nh + 1} for some {nh}.

  gsizes0 = get_gap_sizes(segs0)
  gsizes1 = get_gap_sizes(segs1)
  return compute_full_score_from_gap_sizes(gsizes0, utype0, gsizes1, utype1, key_penalty)
  # ----------------------------------------------------------------------
 
def compute_full_score_from_gap_sizes(gsizes0, utype0, gsizes1, utype1, key_penalty):
  # Returns the full badness score of a bimatching between two texts of
  # types {utype0,utype1}, given the lists {gsizes0,gsizes1} of the gap
  # sizes implied by the bimatching on each text, and the total keyword
  # penalty {key_penalty} of its rungs.
  #
  # The two lists must have the same length {ng = nh+1}, where {nh} is
  # the number of rungs. The result is the gap size score computed by
  # {compute_gaps_score} plus {key_penalty}.

  debug = False

  ng = len(gsizes0)
  assert len(gsizes1) == ng
  nh = ng - 1

  if debug: err.write(f"!S {utype0 = !r} {utype1 = !r} {nh = }\n")

  # First term: the gap size score.
  gaps_score = compute_gaps_score(gsizes0, utype0, gsizes1, utype1)
  score = gaps_score
  if debug: err.write(f"!S gaps score = {gaps_score:.6f} accum {score:.6f}\n")

  # Second term: the key penalty.
  score = score + key_penalty
  if debug: err.write(f"!S key penalty = {key_penalty:.6f} accum {score:.6f}\n")

  return score
  # ----------------------------------------------------------------------

def compute_gaps_score(gsizes0, utype0, gsizes1, utype1):
  # Returns the gap size component of the badness score of a bimatching,
  # given the sizes {gsizes0[0..ng-1]} and {gsizes1[0..ng-1]} of the
  # gaps that the bimatching determines on the texts of types {utype0}
  # and {utype1}, respectively.
  #
  # The result is the sum, over all gaps, of the weighted score returned
  # by {compute_single_gap_score} for each pair of corresponding gaps.

  debug = False

  ng = len(gsizes0)
  assert ng == len(gsizes1)

  gaps_score = 0
  # Terms are added one at a time, in gap order:
  for ig, (gsz0, gsz1) in enumerate(zip(gsizes0, gsizes1)):
    gsci = compute_single_gap_score(gsz0, utype0, gsz1, utype1, ig, ng)
    gaps_score += gsci
    if debug: err.write(f"!G gap score = {gsci:.6f} accum = {gaps_score:.6f}\n")

  if debug: err.write(f"!G gap score final = {gaps_score:.6f}\n")
  return gaps_score
  # ----------------------------------------------------------------------
    
def compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng): 
  # Returns the contribution of a single gap to the badness score of a
  # bimatching. The gap has size {gsize0} in the text of type {utype0}
  # and size {gsize1} in the supposedly matching text of type {utype1}.
  #
  # The contribution is the mismatch score between {gsize1} and the size
  # expected from {gsize0}, multiplied by a weight that depends on the
  # index {ig} of the gap among the {ng} gaps: {0.5/ng} for the first
  # and last gaps, and {1/ng} for every other gap.

  debug = False

  if debug: err.write(f"\n!g {gsize0 = :3d} {gsize1 = :3d} {ig = :2d} {ng = :2d}")

  # Size range expected for the gap in text 1, given its size in text 0:
  exp_gsize1 = expected_size1_from_size0(gsize0, utype0, utype1)

  # The two end gaps count half as much as the interior ones:
  if ig == 0 or ig == ng - 1:
    wt = 0.50
  else:
    wt = 1.00
  wt = wt / ng

  score = wt * compute_single_size_score(gsize1, exp_gsize1)
  if debug: err.write(f" {wt = :.6f} {score = :7.3f}\n")
  return score
  # ----------------------------------------------------------------------

def compute_single_size_score(sz, esz):
  # Returns the mismatch score between a single size {sz} and the
  # expected size range {esz}: namely, {0.20} times the square of the
  # fractional error returned by {spf.compute_frac_error(sz, esz)}.
  #
  # The debug printout reads {esz[0]} and {esz[1]}, so {esz} is
  # presumably a pair of integer bounds -- confirm against {spf}.

  debug = False
  if debug: err.write(f"!s {sz = :4.1f} esz = {esz[0]:3d}..{esz[1]:3d}")

  frac = spf.compute_frac_error(sz, esz)
  frac_sqr = frac**2
  score = 0.20 * frac_sqr

  if debug: err.write(f" {frac = :.6f} {score = :.6f}\n")
  return score
  # ----------------------------------------------------------------------

def expected_size1_from_size0(size0, utype0, utype1):
  # Returns the expected size {exp_size1} of a text of type {utype1},
  # given the size {size0} of the supposedly corresponding text of type
  # {utype0}.
  #
  # The conversion factor is the ratio of the values returned by
  # {spf.hanzi_per_unit} for {utype0} and {utype1}. The result is
  # whatever {spf.scale_size_fuzzy} returns for that factor with
  # {eps = 0.0001}; per the original notes, a narrow range of sizes.
  #
  # NOTE(review): the meaning of the {True} argument of
  # {spf.hanzi_per_unit} is not visible here -- see {size_position_funcs}.

  eps = 0.0001

  hz_per_un0 = spf.hanzi_per_unit(utype0, True)
  hz_per_un1 = spf.hanzi_per_unit(utype1, True)
  un1_per_un0 = hz_per_un0/hz_per_un1

  return spf.scale_size_fuzzy(size0, un1_per_un0, eps)
  # ----------------------------------------------------------------------

def get_gap_sizes(segs):
  # Returns the list {sizes[0..ng-1]} of the lengths of the gaps of the
  # macro-parsing {segs}, skipping the hits.
  #
  # The macro-parsing must have an odd number {ns = 2*nh + 1} of
  # segments: {ng = nh+1} gaps at the even indices, alternating with
  # {nh} hits at the odd indices.

  ns = len(segs)
  assert ns % 2 == 1 # Gaps and hits alternate, with a gap at each end.

  gap_indices = range(0, ns, 2)
  return [ len(segs[k]) for k in gap_indices ]
  # ----------------------------------------------------------------------

def get_hit_sizes(segs):
  # Returns the list {sizes[0..nh-1]} of the lengths of the hits of the
  # macro-parsing {segs}, skipping the gaps.
  #
  # The macro-parsing must have an odd number {ns = 2*nh + 1} of
  # segments: {ng = nh+1} gaps at the even indices, alternating with
  # {nh} hits at the odd indices.

  ns = len(segs)
  assert ns % 2 == 1 # Gaps and hits alternate, with a gap at each end.

  hit_indices = range(1, ns, 2)
  return [ len(segs[k]) for k in hit_indices ]
  # ----------------------------------------------------------------------

def test_stuff():
  # Runs the self-tests of this module, one after the other.
  tests = \
    ( test_compute_single_gap_score,
      test_compute_full_score_from_macro_parsings_1,
      test_compute_full_score_from_macro_parsings_2,
    )
  for test in tests:
    test()
  return
  # ----------------------------------------------------------------------
  
def test_compute_single_gap_score():
  # Writes to {stderr} the score returned by {compute_single_gap_score}
  # for a gap of fixed size 5 in a text of type "ch" against gaps of
  # sizes 0 to 39 in a text of type "ec", taking the gap to be the one
  # with index 1 among 3 gaps.

  err.write("----------------------------------------\n")
  err.write("testing compute_single_gap_score ...\n")

  ig = 1; ng = 3
  gs_ch = 5
  max_gs_ec = 39
  for gs_ec in range(max_gs_ec + 1):
    sc = compute_single_gap_score(gs_ch, "ch", gs_ec, "ec", ig, ng)
    err.write(f"{gs_ch = :4d} {gs_ec = :4d} {sc = :7.3f}\n")
  return
  # ----------------------------------------------------------------------

def test_compute_full_score_from_macro_parsings_1():
  # Test case with macro-parsings of 11 segments each (6 gaps and 5
  # hits): the hanzi parsing {segs_ch} against the EVA parsing {segs_ec},
  # labeled {loc_ec}, with total key penalty {key_penalty}.

  loc_ec = "f106r.42"
  key_penalty = 1.620

  segs_ch = (
    "龙骨",                                         # Gap 0.
    "主治",                                         # Hit 0.
    "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热",   # Gap 1.
    "气",                                           # Hit 1.
    "惊痫齿",                                       # Gap 2.
    "主治",                                         # Hit 2.
    "惊痫癫疾狂走心下结",                           # Gap 3.
    "气",                                           # Hit 3.
    "不能喘息诸痉杀精物",                           # Gap 4.
    "久",                                           # Hit 4.
    "服轻身通神明延年",                             # Gap 5.
  )

  segs_ec = (
    "pcheodarshol", # Gap 0.
    "kain",         # Hit 0.
    "okshchedyqoteeyshotchyqotylpaiinshedylardaiiralsheoldaiinotedyqokainakarcheoraltaiinchekalotarard", # Gap 1.
    "shedy",        # Hit 1.
    "qoteeyotaiinchychealolchlch", # Gap 2.
    "daiin",        # Hit 2.
    "otyotairotaiikamysheedalokainakainotarkaiin", # Gap 3.
    "chda",         # Hit 3.
    "lkairolkaisalkeedyokalsotchdaiinshadaiinot?yqokeedyq", # Gap 4.
    "okaiin",       # Hit 4.
    "ykarqokaincheedylolycheoarokainqokaincharokycheokam", # Gap 5.
  )

  do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty)
  return
  # ----------------------------------------------------------------------
    
def test_compute_full_score_from_macro_parsings_2():
  # Test case with macro-parsings of 11 segments each (6 gaps and 5
  # hits): the hanzi parsing {segs_ch} against the EVA parsing {segs_ec},
  # labeled {loc_ec}, with total key penalty {key_penalty}.

  loc_ec = "f104v.1"
  key_penalty = 0.200

  segs_ch = (
    "龙骨",                                         # Gap 0.
    "主治",                                         # Hit 0.
    "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热",   # Gap 1.
    "气",                                           # Hit 1.
    "惊痫齿",                                       # Gap 2.
    "主治",                                         # Hit 2.
    "惊痫癫疾狂走心下结",                           # Gap 3.
    "气",                                           # Hit 3.
    "不能喘息诸痉杀精物",                           # Gap 4.
    "久服",                                         # Hit 4.
    "轻身通神明延年",                               # Gap 5.
  )

  segs_ec = (
    "pch",          # Gap 0.
    "daiin",        # Hit 0.
    "opcheedyoraroltcheeyopchedyolearaiiralycheodaiincheekaindamychedaiinqoteedchockhyotaiinydaiinqokamdyotararal", # Gap 1.
    "chedo",        # Hit 1.
    "tairoramshodchedyqotaiino", # Gap 2.
    "daiin",        # Hit 2.
    "okeolockhhycholqokeedyqotairoeedaiinoldlqoteedy", # Gap 3.
    "cheda",        # Hit 3.
    "iinchokarqotolqotchedcholcheyqolchedyqoeeeyq", # Gap 4.
    "okeedy",       # Hit 4.
    "dcheolchdeeyoeeodainsairolchedal", # Gap 5.
  )

  do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty)
  return
  # ----------------------------------------------------------------------
          
def do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty):
  # Calls {compute_full_score_from_macro_parsings} on the macro-parsing
  # {segs_ch}, taken as type "ch", and the macro-parsing {segs_ec},
  # taken as type "ec", with total key penalty {key_penalty}; then
  # writes the resulting score to {stderr}.
  #
  # The string {loc_ec} is only printed, as a label for the test case.

  err.write("----------------------------------------\n")
  err.write(f"=== testing compute_full_score_from_macro_parsings ===\n")
  err.write(f"{loc_ec = }\n")

  utype_ch = "ch"; utype_ec = "ec"
  score = compute_full_score_from_macro_parsings(segs_ch, utype_ch, segs_ec, utype_ec, key_penalty)

  err.write(f"{score = :.6f}\n")
  return
  # ----------------------------------------------------------------------

# Run the self-tests when the sole command line argument is "BEF.TEST":
if sys.argv[1:] == [ "BEF.TEST" ]:
  test_stuff()