#! /usr/bin/python3
# Last edited on 2026-02-27 05:43:10 by stolfi

import sys, os, re;
from sys import stderr as err
from error_funcs import arg_error, prog_error
from process_funcs import bash
from note_077_funcs import enc_from_unit
from math import fabs
  
def add_single_size_hist_plot_rules(pre, mak, tit, name, color, bin_size):
  # Adds to the rule tables {pre}, {mak}, {tit} an entry for the plot
  # file "{name}-upp-hist.png", the histogram of the parag sizes listed
  # in file "res/{name}.upp".
  #
  # Stores in {pre} the list of files the plot is built from, in {tit}
  # a short description of the step, and in {mak} a one-element tuple
  # with the command, which gets {name}, the quoted {color}, and
  # {bin_size} as arguments.  Returns the target name.

  plot_file = f"{name}-upp-hist.png"
  plotter = "plot_single_parag_size_histograms.sh"

  # The histogram plotting script, its imported modules, and the data file:
  deps = [
    plotter,
    "make_hist_of_units_per_parag.sh",
    "work/error_funcs.gawk",
    "work/make_histogram.gawk",
    "work/turn_histogram_into_polygonal_line.gawk",
    f"res/{name}.upp",
  ]

  pre[plot_file] = deps
  tit[plot_file] = f"plotting parag size histograms for {name}"
  mak[plot_file] = ( f"{plotter} {name} '{color}' {bin_size}", )
  return plot_file
  # ----------------------------------------------------------------------
  
def add_double_size_hist_plot_rules(pre, mak, tit, name0, color0, name1, color1, bin_size):
  # Adds to the rule tables {pre}, {mak}, {tit} an entry for the plot
  # file "{name0}-{name1}-upp-hist.png", which compares the histograms
  # of the parag sizes listed in files "res/{name0}.upp" and
  # "res/{name1}.upp".
  #
  # Stores in {pre} the list of files the plot is built from, in {tit}
  # a short description of the step, and in {mak} a one-element tuple
  # with the command, which gets each name followed by its quoted
  # color, then {bin_size}.  Returns the target name.

  plot_file = f"{name0}-{name1}-upp-hist.png"
  plotter = "plot_double_parag_size_histograms.sh"

  # The histogram plotting script and its imported modules ...
  deps = [
    plotter,
    "make_hist_of_units_per_parag.sh",
    "work/error_funcs.gawk",
    "work/make_histogram.gawk",
    "work/turn_histogram_into_polygonal_line.gawk",
  ]
  # ... followed by the two data files, in argument order:
  deps.extend(f"res/{which}.upp" for which in (name0, name1))

  pre[plot_file] = deps
  tit[plot_file] = f"plotting parag size histograms for {name0} and {name1}"
  mak[plot_file] = ( f"{plotter} {name0} '{color0}' {name1} '{color1}' {bin_size}", )
  return plot_file
  # ----------------------------------------------------------------------

def add_word_pos_file_rules(pre, mak, tit,  book, bsub, enc, iext, unit, word, tag):
  # Adds to the rule tables {pre}, {mak}, {tit} an entry for the file
  # "{book}-{bsub}-{unit}-{tag}.wpo" with the positions of {word} in
  # each parag of the input file "res/{book}-{bsub}-{enc}.{iext}".
  #
  # The encoding {enc} is only used to name the input file; the
  # original note says it is "eva" for "starps" and "chu" for "bencao".
  # The command passes {book}, {bsub}, {iext}, {unit}, the quoted
  # {word}, and {tag} to the listing script.  Returns the target name.

  lister = "list_word_positions_in_lines.py"
  text_file = f"res/{book}-{bsub}-{enc}.{iext}"
  wpo_file = f"{book}-{bsub}-{unit}-{tag}.wpo"

  # Input file, the listing script, and the Python libraries listed with it:
  pre[wpo_file] = [
    text_file,
    lister,
    "work/error_funcs.py",
    "work/process_funcs.py",
    "note_077_funcs.py",
    "work/chinese_funcs.py",
  ]
  tit[wpo_file] = f"making the word positions file {wpo_file}"
  mak[wpo_file] = ( f"{lister} {book} {bsub} {iext} {unit} '{word}' {tag}", )
  return wpo_file
  # ----------------------------------------------------------------------

def add_single_loc_word_pos_file_rules(pre, mak, tit,  book, bsub, unit, sloc, word, tag):
  # Adds to the rule tables {pre}, {mak}, {tit} an entry for the file
  # "res/{book}-{bsub}-{unit}-{sloc}-{tag}.wpo", which holds the lines
  # of "res/{book}-{bsub}-{unit}-{tag}.wpo" that start with {sloc}
  # followed by a blank.
  #
  # The parameter {word} is not used here.  Returns the target name
  # (sans "res/").

  all_locs_file = f"res/{book}-{bsub}-{unit}-{tag}.wpo"
  one_loc_file = f"{book}-{bsub}-{unit}-{sloc}-{tag}.wpo"

  # The final "cat" presumably keeps the pipeline from failing when
  # "egrep" finds no matching line -- TODO confirm.
  command = f"cat {all_locs_file} | egrep -e '^{sloc}[ ]' | cat > res/{one_loc_file}"

  pre[one_loc_file] = [ all_locs_file, ]
  tit[one_loc_file] = f"making the single-parag word positions file {one_loc_file}"
  mak[one_loc_file] = ( command, )
  return one_loc_file
  # ----------------------------------------------------------------------

def add_single_loc_word_pos_plot_rules(pre, mak, tit,  book, bsub, unit, sloc, word, tag):
  # Adds to the rule tables {pre}, {mak}, {tit} an entry for the plot
  # file "res/{book}-{bsub}-{unit}-{sloc}-{tag}-wpos.png", made from
  # the word positions listed in
  # "res/{book}-{bsub}-{unit}-{sloc}-{tag}.wpo".
  #
  # The plot color and the {shift} argument of the plotting script
  # depend on {book}, which must be "starps" or "bencao"; any other
  # value fails an assertion before the tables are touched.  The
  # parameter {word} is not used here; the plot is labeled with {tag}
  # and {sloc}.  Returns the target name (sans "res/").

  if book == "starps":
    color, shift = starps_hist_color(bsub, unit), 0.15
  elif book == "bencao":
    color, shift = bencao_hist_color(bsub, unit), 0.00
  else:
    assert False

  plotter = "plot_word_positions.sh"
  stem = f"{book}-{bsub}-{unit}-{sloc}-{tag}"
  wpo_file = f"res/{stem}.wpo"
  png_file = f"{stem}-wpos.png"

  # Positions file, the plotting script, and the gawk library listed with it:
  pre[png_file] = [ wpo_file, plotter, "work/error_funcs.gawk", ]
  tit[png_file] = f"making the word positions plot {png_file}"
  mak[png_file] = (
      f"cat {wpo_file} | {plotter} {shift} '{color}' '{tag} in {sloc}' > res/{png_file}",
    )
  return png_file
  # ----------------------------------------------------------------------
 
def add_many_word_pos_plot_rules(pre, mak, tit, book, bsub, iext, unit, pattags, slocs):
  # For each pair {(word, tag)} in {pattags}, adds to the rule tables
  # {pre}, {mak}, {tit} the rules for the word positions file
  # "{book}-{bsub}-{unit}-{tag}.wpo" (unless {pre} already has that
  # target) and, for each {sloc} in {slocs}, the rules for the
  # single-parag positions file and for its plot.
  #
  # Returns the list of the ".wpo" targets that were newly added and
  # of all the plot targets, in the order they were generated.  The
  # single-parag ".wpo" targets are not in the list; they are
  # prerequisites of the plots.

  enc = enc_from_unit(unit)
  new_targets = []
  for word, tag in pattags:
    wpo_target = f"{book}-{bsub}-{unit}-{tag}.wpo"
    if wpo_target not in pre:
      wpo_target_1 = add_word_pos_file_rules(pre, mak, tit, book, bsub, enc, iext, unit, word, tag)
      # The name computed here must agree with the one the rule uses:
      assert wpo_target == wpo_target_1, f" {wpo_target = !r} {wpo_target_1 = !r}"
      new_targets.append(wpo_target)
    for sloc in slocs:
      # Called only for its effect on the tables:
      add_single_loc_word_pos_file_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag)
      new_targets.append(
        add_single_loc_word_pos_plot_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag)
      )
  return new_targets
  # ----------------------------------------------------------------------

def add_word_tuple_file_rules(pre, mak, tit, book, bsub, unit, tsize):
  # Adds to the rule tables {pre}, {mak}, {tit} the rules to make
  # "res/{book}-{bsub}-{unit}-{tsize}.tfr", with counts and frequencies
  # of all tuples of {tsize} consecutive units of type {unit} from file
  # "res/{book}-{bsub}-{enc}.ivp", where {enc} is obtained from {unit}
  # with {enc_from_unit}.
  #
  # Two targets are defined: an intermediate ".tup" file with the
  # extracted tuples, and the final ".tfr" file.  The size {tsize} must
  # be in {1..99}.  Returns the name of the ".tfr" target (sans "res/").

  enc = enc_from_unit(unit)

  assert tsize > 0 and tsize <= 99, f"bad {tsize = }"

  stem = f"{book}-{bsub}-{unit}-{tsize}"

  # Step 1: the ".tup" file with the extracted tuples:
  extractor = "extract_word_tuples.py"
  tup_file = f"{stem}.tup"
  pre[tup_file] = [ f"res/{book}-{bsub}-{enc}.ivp", extractor ]
  tit[tup_file] = f"extracting tuples to file res/{tup_file} ..."
  mak[tup_file] = ( f"{extractor} {book} {bsub} {unit} {tsize}", )

  # Step 2: the ".tfr" file with counts and freqs of field 4 of the ".tup" file:
  counter = "compute_freqs_from_counts.py"
  counter_enc = "utf" if enc != "eva" else "bytes"
  tfr_file = f"{stem}.tfr"
  tup_path = f"res/{tup_file}"
  pipeline = [
    f"cat {tup_path}",
    r" | gawk '// { print $4 }'",
    r" | sort | uniq -c",
    f" | {counter} -encoding {counter_enc} -total 'TOTAL'",
    f" | sort -b -k1,1nr",
    f" > res/{tfr_file}",
  ]
  # Every line of the command but the last ends with a shell continuation:
  cont = ' \\'
  pre[tfr_file] = [ tup_path, counter ]
  tit[tfr_file] = f"computing tuple counts and freqs file res/{tfr_file} ..."
  mak[tfr_file] = tuple(stage + cont for stage in pipeline[:-1]) + ( pipeline[-1], )
  return tfr_file
  # ----------------------------------------------------------------------

def add_wpos_delta_file_rules(pre, mak, tit, book, bsub, unit, word, tag, parags):
  # Adds to the rule tables {pre}, {mak}, {tit} an entry for the file
  # "res/{book}-{bsub}-{unit}-{tag}.wpd", made by feeding to the script
  # "list_wpos_delta_pairs.py" the lines of
  # "res/{book}-{bsub}-{unit}-{tag}.wpo" that start with one of the
  # strings in {parags} followed by a blank.
  #
  # The strings in {parags} are joined with "|" into an "egrep"
  # alternation as they are, without escaping.  The parameter {word} is
  # not used here.  Returns the target name (sans "res/").

  stem = f"{book}-{bsub}-{unit}-{tag}"
  wpd_file = f"{stem}.wpd"
  wpo_path = f"res/{stem}.wpo"
  pairer = "list_wpos_delta_pairs.py"
  alternatives = '|'.join(parags)

  pre[wpd_file] = [ wpo_path, pairer ]
  tit[wpd_file] = f"creating word position deltas file res/{wpd_file} from {wpo_path}"
  mak[wpd_file] = (
    f"cat {wpo_path} | egrep -e '^({alternatives}) ' | {pairer} > res/{wpd_file}",
  )
  return wpd_file
  # ----------------------------------------------------------------------

def add_parag_parag_coin_image_rules(pre, mak, tit, name0, name1):
  # Meant to add rules and commands to create the target file
  # "res/{name0}-{name1}-coin-map.png", which shows coincidences of
  # sizes etc between the parags of "res/{name0}.ivp" and
  # "res/{name1}.ivp".  Returns the name of the target (sans "res/").
  #
  # NOTE: this is still a stub.  Nothing is stored in {pre}, {mak}, or
  # {tit}; only the target name is computed and returned.
  #
  # TODO: define the rules.  The intended script is
  # "create_parag_parag_coin_image.sh", and the draft command was
  #   f"{script} {name0} {name1}"
  # The prerequisites are still to be decided (the draft list was
  # copied from the histogram plotting rules and marked "???").

  return f"{name0}-{name1}-coin-map.png"
  # ----------------------------------------------------------------------

def starps_hist_color(bsub, unit):
  # Chooses the color for histograms of {unit} counts in SPS subset {bsub}.
  #
  # Each subset ("fu" or "gd") has a central hue and a hue increment;
  # each unit ("ec", "wp", or "wc") selects the perturbation handed to
  # {make_color}.  Any other {bsub} or {unit} is reported through
  # {prog_error}.  Returns whatever {make_color} returns.

  # Central hue and hue increment per subset:
  subset_hues = {
    "fu": ( ( 0.000, 0.600, 0.300, ), ( +0.200, 0.000, -0.200, ) ),
    "gd": ( ( 0.000, 0.300, 0.800, ), ( +0.200, -0.200, 0.000, ) ),
  }
  # Perturbation per unit type:
  unit_perts = { "ec": 00.0, "wp": +1.0, "wc": -1.0, }

  if bsub in subset_hues:
    chue, dhue = subset_hues[bsub]
  else:
    prog_error(f"bad {bsub = }")

  if unit in unit_perts:
    color = make_color(chue, dhue, unit_perts[unit])
  else:
    prog_error(f"bad {unit = }")
  return color
  # ----------------------------------------------------------------------

def bencao_hist_color(bsub, unit):
  # Chooses the color for histograms of {unit} counts in SBJ subset {bsub}.
  #
  # Only the subset "fu" is accepted; anything else fails an assertion.
  # Each unit ("ch", "ps", or "pj") selects the perturbation handed to
  # {make_color} with a fixed central hue and hue increment.  Any other
  # {unit} is reported through {prog_error}.  Returns whatever
  # {make_color} returns.

  assert bsub == "fu", f"invalid bencao subset {bsub = !r}"

  chue = ( 1.000, 0.150, 0.000, )
  dhue = ( 0.000, -0.100, +0.100, )

  # Perturbation per unit type:
  unit_perts = { "ch": 00.0, "ps": +1.0, "pj": -1.0, }

  if unit in unit_perts:
    color = make_color(chue, dhue, unit_perts[unit])
  else:
    prog_error(f"bad {unit = }")
  return color
  # ----------------------------------------------------------------------

def make_color(chue, dhue, pert):
  # Returns a color as a string "#rrggbb" with two lowercase hex digits
  # per channel.
  #
  # The triple {chue} is the central color and {dhue} the increment per
  # unit of {pert}; each channel of {chue + pert*dhue} is clipped to
  # {[0 _ 1]}.  The result is then desaturated towards its weighted
  # mean (weights 0.3, 0.6, 0.1) and darkened, both by the factor
  # {1 - 0.33*|pert|}.
  #
  # That factor is clipped at 0, and each final channel to {0..255},
  # so that a large {|pert|} yields a valid (black) color instead of a
  # malformed string with negative channel values.

  debug = False

  # Saturation and value use the same fall-off, never below zero:
  strength = max(0.0, 1 - 0.33*fabs(pert))
  sat = strength
  val = strength

  phue = [ min(1.0, max(0.0, chue[k] + pert*dhue[k])) for k in range(3) ]
  lum = 0.3*phue[0] + 0.6*phue[1] + 0.1*phue[2]

  # Blend each channel with {lum}, scale, and truncate to an integer:
  col = [ min(255, max(0, int(255*val*(sat*phue[k] + (1-sat)*lum)))) for k in range(3) ]

  # Named {hexcol} rather than {hex} so as not to hide the builtin:
  hexcol = "#" + "".join(f"{c:02x}" for c in col)
  if debug: err.write(f"^^ {col = } hex = {hexcol!r}\n")
  return hexcol
  # ----------------------------------------------------------------------

def cleanup(which):
  # Removes, through the shell, all files in "res/" whose names start
  # with {which}.  Missing files are not an error ("rm -f").
  #
  # NOTE(review): {which} goes into the shell command as is, so an
  # empty string makes the pattern "res/*" -- confirm that callers
  # never pass one.
  command = f"rm -f res/{which}*"
  bash(command)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~