#! /usr/bin/python3
# Last edited on 2026-05-01 15:42:43 by stolfi

import os, sys, re
from sys import stderr as err, stdout as out
import write_parsing_funcs as wpf
import size_position_funcs as spf
from math import sqrt, hypot, log, exp, floor, ceil, inf, nan, isfinite

# The procedures of this module are meant to evaluate a bimatching
# between the clean text {text0} of type {utype0} and the clean text
# {text1} of type {utype1}. The evaluation is summarized in a
# non-negative /badness score/ for the bimatching. The best possible
# bimatching has badness zero.
#
# The most important case is when {text0} is the clean text (hanzi only)
# of an SBJ entry, with {utype0 = "ch"}; and {text1} is the clean text
# (EVA letters [a-z?] only) of an SPS parag, with {utype1 = "ec"}.
#
# A bimatching {bimatch}, as produced by {match_bitemplate} in
# {bitemplate_match_funcs}, is a list of a certain number {nh} of
# /rungs/, quintuples {(i0,j0,i1,j1,bpvar)}. It describes two
# corresponding macro-parsings {segs0} of {text0} and {segs1} of
# {text1}. Each macro-parsing is a partition of the corresponding text
# into an odd number {ns = 2*nh+1} of /segments/, alternating {ng =
# nh+1} /gaps/ and {nh} /hits/.
#
# The full score of a bimatching is the sum of 
#
#  * a /total size score/ that depends only on the total sizes
#    of the two texts and the number of hits {nh}
#
#  * a /gap size score/ that depends on the sizes 
#    of the {ng} gaps of {segs0} and {segs1}.
#
#  * a /key penalty/ which is the sum of the {pena} fields
#    in the {bpvar} elements of the rungs of the bimatching.
#
# Note that the score does not depend on the actual texts or the sizes
# of the hits.
#
# Note also that the total size score does not depend on the bimatching
# except through the count {nh}.

def compute_full_score_from_bimatching(tsize0, utype0, tsize1, utype1, bimatch):
  # Returns the full badness score of the bimatching {bimatch} between
  # two texts, given the total sizes {tsize0,tsize1} of the texts and
  # their types {utype0,utype1}.
  #
  # The gap sizes of both texts and the total keyword penalty are
  # obtained from {bimatch} by {get_gap_sizes_from_bimatching}; the
  # score is then computed by {compute_full_score_from_gap_sizes}.
  #
  # NOTE(review): {get_gap_sizes_from_bimatching} is neither defined nor
  # imported in the visible part of this module -- confirm that it is
  # available at run time.

  gsizes0, gsizes1, key_penalty = get_gap_sizes_from_bimatching(bimatch)
  return compute_full_score_from_gap_sizes \
    ( tsize0, gsizes0, utype0, tsize1, gsizes1, utype1, key_penalty )
  # ----------------------------------------------------------------------

def compute_full_score_from_macro_parsings(segs0, utype0, segs1, utype1, key_penalty):
  # Returns the full badness score of a bimatching between two texts,
  # given the macro-parsings {segs0,segs1} that the bimatching defines on
  # them, the text types {utype0,utype1}, and the total keyword penalty
  # {key_penalty} of the rungs of the bimatching.
  #
  # Both macro-parsings must have the same odd number of segments.

  nseg = len(segs0)
  assert nseg == len(segs1)
  nhit = nseg//2
  ngap = nhit + 1
  assert nseg == ngap + nhit

  # The total text sizes count every segment, gaps and hits alike:
  tsize0 = sum(len(seg) for seg in segs0)
  tsize1 = sum(len(seg) for seg in segs1)

  # Only the individual gap sizes are needed; the gap totals are discarded:
  gsizes0, _ = get_gap_sizes(segs0)
  gsizes1, _ = get_gap_sizes(segs1)

  return compute_full_score_from_gap_sizes \
    ( tsize0, gsizes0, utype0, tsize1, gsizes1, utype1, key_penalty )
  # ----------------------------------------------------------------------
 
def compute_full_score_from_gap_sizes(tsize0, gsizes0, utype0, tsize1, gsizes1, utype1, key_penalty):
  # Returns the full badness score of a bimatching between two texts of
  # types {utype0,utype1} and total sizes {tsize0,tsize1}.
  #
  # The lists {gsizes0,gsizes1} hold the sizes of the gaps that the
  # bimatching defines on each text; they must have the same length
  # {ng}, and the number of rungs is taken to be {nh = ng-1}. The
  # parameter {key_penalty} is the total keyword penalty of the rungs.
  #
  # The result is the sum of the total size score, the gaps score, and
  # {key_penalty}. While {debug} is true, each term and the running
  # total are also written to {stderr}.

  debug = True

  ngaps = len(gsizes0)
  assert ngaps == len(gsizes1)
  nh = ngaps - 1

  if debug:
    err.write(f"!S {tsize0 = } {utype0 = !r} {tsize1 = } {utype1 = !r} {nh = }\n")

  accum = 0

  size_term = compute_total_size_score(tsize0, utype0, tsize1, utype1, nh)
  accum += size_term
  if debug:
    err.write(f"!S total size score = {size_term:.6f}\n")

  gaps_term = compute_gaps_score(gsizes0, utype0, gsizes1, utype1)
  accum += gaps_term
  if debug:
    err.write(f"!S gaps score = {gaps_term:.6f} accum {accum:.6f}\n")

  accum += key_penalty
  if debug:
    err.write(f"!S key penalty = {key_penalty:.6f} accum {accum:.6f}\n")

  return accum
  # ----------------------------------------------------------------------
  
def compute_total_size_score(tsize0, utype0, tsize1, utype1, nh):
  # Returns the component of the badness score of a bimatching that is
  # due to the discrepancy between the total sizes {tsize0,tsize1} of
  # the two texts, of types {utype0,utype1}, when the bimatching has
  # {nh} hits (rungs).
  #
  # The comparison is made on the estimated total gap sizes of the two
  # texts, as given by {estimate_total_gap_size}, rather than on the
  # raw total sizes.

  tsz_wt = 1.00 # Multiplier applied to the size mismatch score.

  est_gaps0 = estimate_total_gap_size(tsize0, utype0, nh)
  est_gaps1 = estimate_total_gap_size(tsize1, utype1, nh)

  # What the gaps of text 1 should add up to, judging from those of text 0:
  exp_gaps1 = expected_size1_from_size0(est_gaps0, utype0, utype1)

  return tsz_wt*compute_single_size_score(est_gaps1, exp_gaps1)
  # ----------------------------------------------------------------------

def estimate_total_gap_size(tsize, utype, nh):
  # Estimates the total size of the gaps in a text of type {utype} and
  # total size {tsize}, assuming that it contains {nh} hits and that
  # each hit corresponds to a fixed number of hanzi.
  #
  # The result is an integer, in units of type {utype}.

  hitsz_ch = 1.500        # Estimated average hit size in hanzi.
  min_avgsz_gap_un = 0.5  # Min assumed average gap size in {utype} units.

  # Estimated total size of the hits, converted to {utype} units:
  trimmed = True
  ch_per_un = spf.hanzi_per_unit(utype, trimmed)
  hits_un = nh*hitsz_ch/ch_per_un

  # Whatever is not hits is gaps, but never less than zero:
  gaps_un = max(0, tsize - hits_un)

  # Smoothly enforce the minimum total size of the {nh+1} gaps,
  # by combining the two values with {hypot} instead of a hard {max}:
  min_gaps_un = (nh + 1)*min_avgsz_gap_un
  return int(ceil(hypot(min_gaps_un, gaps_un)))
  # ----------------------------------------------------------------------

def compute_gaps_score(gsizes0, utype0, gsizes1, utype1):
  # Returns the gap sizes component of the badness score of a
  # bimatching, given the actual sizes {gsizes0[0..ng-1]} and
  # {gsizes1[0..ng-1]} of the gaps that it defines on the two texts,
  # whose types are {utype0,utype1}.
  #
  # The result is the sum of {compute_single_gap_score} over all pairs
  # of corresponding gaps.

  debug = False

  ngaps = len(gsizes0)
  assert len(gsizes1) == ngaps

  total = 0
  for ig, gsz0 in enumerate(gsizes0):
    term = compute_single_gap_score(gsz0, utype0, gsizes1[ig], utype1, ig, ngaps)
    total += term
    if debug:
      err.write(f"!G gap score = {term:.6f} accum = {total:.6f}\n")

  if debug:
    err.write(f"!G gap score final = {total:.6f}\n")
  return total
  # ----------------------------------------------------------------------
    
def compute_single_gap_score(gsize0, utype0, gsize1, utype1, ig, ng):
  # Returns the contribution to the badness score of a bimatching that
  # is due to a single gap, whose size is {gsize0} on the text of type
  # {utype0} and {gsize1} on the supposedly matching text of type
  # {utype1}.
  #
  # The contribution is weighted according to the index {ig} of the gap
  # among the {ng} gaps: the first and last gaps get weight 0.30, all
  # the others get weight 1.00.

  debug = False

  if debug:
    err.write(f"\n")

  # Size that the gap should have on text 1, judging from its size on text 0:
  exp_gsize1 = expected_size1_from_size0(gsize0, utype0, utype1)

  is_end_gap = (ig == 0) or (ig == ng-1)
  wt = 0.30 if is_end_gap else 1.00

  gap_score = wt * compute_single_size_score(gsize1, exp_gsize1)
  if debug:
    err.write(f"!g {gsize1 = } {exp_gsize1 = } {wt = :.6f}\n")
  return gap_score
  # ----------------------------------------------------------------------

def compute_single_size_score(sz, esz):
  # Returns the mismatch score between a single size {sz} and the
  # expected size (range) {esz}: the square of the fractional error
  # reported by {spf.compute_frac_error(sz, esz)}.

  debug = False

  frac = spf.compute_frac_error(sz, esz)
  score = frac**2
  if debug:
    err.write(f"!s {frac = :.6f} {score = :.6f}\n")
  return score
  # ----------------------------------------------------------------------

def expected_size1_from_size0(size0, utype0, utype1):
  # Returns the expected size of a text of type {utype1}, given the
  # size {size0} of the supposedly corresponding text of type {utype0}.
  #
  # The conversion factor is the ratio of the hanzi-per-unit values of
  # the two types, as given by {spf.hanzi_per_unit}. The result is
  # whatever {spf.scale_size_fuzzy} returns for that factor with
  # tolerance {eps}; presumably a size range -- see that function.

  trimmed = True
  ch_per_un0 = spf.hanzi_per_unit(utype0, trimmed)
  ch_per_un1 = spf.hanzi_per_unit(utype1, trimmed)
  un1_per_un0 = ch_per_un0/ch_per_un1

  eps = 0.0001
  return spf.scale_size_fuzzy(size0, un1_per_un0, eps)
  # ----------------------------------------------------------------------

def get_gap_sizes(segs):
  # Given a macro-parsing {segs} with {nh} hits and {ng = nh+1} gaps,
  # returns the list {sizes[0..ng-1]} of the lengths of the gaps
  # (skipping the hits) and the sum {tgsize} of those lengths.
  #
  # The gaps are the elements of {segs} with even indices. The number
  # of segments must be odd.

  nseg = len(segs)
  nhit = nseg//2
  ngap = nhit + 1
  assert nseg == ngap + nhit

  sizes = [ len(segs[k]) for k in range(0, 2*ngap, 2) ]
  tgsize = sum(sizes)
  return sizes, tgsize
  # ----------------------------------------------------------------------

def get_hit_sizes(segs):
  # Given a macro-parsing {segs} with {nh} hits and {ng = nh+1} gaps,
  # returns the list {sizes[0..nh-1]} of the lengths of the hits
  # (skipping the gaps) and the sum {thsize} of those lengths.
  #
  # The hits are the elements of {segs} with odd indices. The number
  # of segments must be odd.

  nseg = len(segs)
  nhit = nseg//2
  ngap = nhit + 1
  assert nseg == ngap + nhit

  sizes = [ len(segs[k]) for k in range(1, 2*nhit, 2) ]
  thsize = sum(sizes)
  return sizes, thsize
  # ----------------------------------------------------------------------


def test_stuff():
  # Runs all the self-tests of this module, in order.
  tests = \
    ( test_compute_full_score_from_macro_parsings_1,
      test_compute_full_score_from_macro_parsings_2,
    )
  for test in tests:
    test()
  # ----------------------------------------------------------------------
  
def test_compute_full_score_from_macro_parsings_1():
  # First test case for {compute_full_score_from_macro_parsings}:
  # a hanzi macro-parsing {segs_ch} against the EVA macro-parsing
  # {segs_ec} labeled {loc_ec}, each with 5 hits and 6 gaps.
  # The first element of each tuple is the initial gap; each
  # subsequent line is a hit followed by the gap after it.

  loc_ec = "f106r.42"
  key_penalty = 1.620

  segs_ch = (
    "龙骨",
    "主治", "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热",
    "气", "惊痫齿",
    "主治", "惊痫癫疾狂走心下结",
    "气", "不能喘息诸痉杀精物",
    "久", "服轻身通神明延年",
  )

  segs_ec = (
    "pcheodarshol",
    "kain", "okshchedyqoteeyshotchyqotylpaiinshedylardaiiralsheoldaiinotedyqokainakarcheoraltaiinchekalotarard",
    "shedy", "qoteeyotaiinchychealolchlch",
    "daiin", "otyotairotaiikamysheedalokainakainotarkaiin",
    "chda", "lkairolkaisalkeedyokalsotchdaiinshadaiinot?yqokeedyq",
    "okaiin", "ykarqokaincheedylolycheoarokainqokaincharokycheokam",
  )

  do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty)
  # ----------------------------------------------------------------------
    
def test_compute_full_score_from_macro_parsings_2():
  # Second test case for {compute_full_score_from_macro_parsings}:
  # a hanzi macro-parsing {segs_ch} against the EVA macro-parsing
  # {segs_ec} labeled {loc_ec}, each with 5 hits and 6 gaps.
  # The first element of each tuple is the initial gap; each
  # subsequent line is a hit followed by the gap after it.

  loc_ec = "f104v.1"
  key_penalty = 0.200

  segs_ch = (
    "龙骨",
    "主治", "心腹鬼注精物老魅咳逆泄利脓血漏下症瘕坚结热",
    "气", "惊痫齿",
    "主治", "惊痫癫疾狂走心下结",
    "气", "不能喘息诸痉杀精物",
    "久服", "轻身通神明延年",
  )

  segs_ec = (
    "pch",
    "daiin", "opcheedyoraroltcheeyopchedyolearaiiralycheodaiincheekaindamychedaiinqoteedchockhyotaiinydaiinqokamdyotararal",
    "chedo", "tairoramshodchedyqotaiino",
    "daiin", "okeolockhhycholqokeedyqotairoeedaiinoldlqoteedy",
    "cheda", "iinchokarqotolqotchedcholcheyqolchedyqoeeeyq",
    "okeedy", "dcheolchdeeyoeeodainsairolchedal",
  )

  do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty)
  # ----------------------------------------------------------------------
          
def do_test_compute_full_score_from_macro_parsings(segs_ch, loc_ec, segs_ec, key_penalty):
  # Calls {compute_full_score_from_macro_parsings} on the macro-parsing
  # {segs_ch}, taken as type "ch", and the macro-parsing {segs_ec},
  # taken as type "ec", with total keyword penalty {key_penalty}.
  # Writes a banner, the label {loc_ec}, and the resulting score to
  # {stderr}. Returns nothing.

  err.write(f"=== testing compute_full_score_from_macro_parsings ===\n")
  err.write(f"{loc_ec = }\n")

  utype_ch = "ch"
  utype_ec = "ec"
  score = compute_full_score_from_macro_parsings(segs_ch, utype_ch, segs_ec, utype_ec, key_penalty)

  err.write(f"{score = :.6f}\n")
  # ----------------------------------------------------------------------

# Run the self-tests when the sole command-line argument is "BEF.TEST".
if sys.argv[1:] == ["BEF.TEST"]:
  test_stuff()