#! /usr/bin/python3
# Last edited on 2026-03-11 10:04:38 by stolfi

import sys, os, re;
from sys import stderr as err
from error_funcs import arg_error, prog_error
from process_funcs import bash
from math import fabs
import size_position_funcs as spf

def add_rules_for_units_per_line_file(pre, mak, tit, ivt_name, utype):
  # Registers in {pre,mak,tit} the rule for the target "{ivt_name}.upp",
  # which is made from "res/{ivt_name}.ivt" by the command
  # "count_units_per_line.py {ivt_name} {utype}".
  #
  # The prerequisites are the source file, the counting script, and
  # the three Python library files listed below.
  #
  # Returns the target name (which carries no "res/" prefix).

  upp_target = f"{ivt_name}.upp"
  ivt_source = f"res/{ivt_name}.ivt"
  counter = "count_units_per_line.py"

  # Library files that the target is declared to depend on:
  py_libs = [
    "work/process_funcs.py",
    "note_077_funcs.py",
    "work/chinese_funcs.py",
  ]

  pre[upp_target] = [ ivt_source, counter ] + py_libs
  mak[upp_target] = ( f"{counter} {ivt_name} {utype}", )
  tit[upp_target] = f"making parag size file {upp_target} from {ivt_source}"
  return upp_target
  # ----------------------------------------------------------------------

def add_rules_for_single_size_hist_plot(pre, mak, tit, ivt_name, usize, color, bin_size):
  # Registers in {pre,mak,tit} the rule for the plot target
  # "{ivt_name}-upp-hist.png", a histogram of the parag sizes
  # listed in the file "res/{ivt_name}.upp".
  #
  # The values {usize}, {color} (wrapped in single quotes) and
  # {bin_size} are passed unchanged on the plotting script's
  # command line, after {ivt_name}.
  #
  # Returns the target name.

  png_target = f"{ivt_name}-upp-hist.png"
  plotter = "plot_single_parag_size_histograms.sh"

  # The data file comes first, then the plotting script and the
  # helper scripts that the target is declared to depend on:
  pre[png_target] = [
    f"res/{ivt_name}.upp",
    plotter,
    "make_hist_of_units_per_parag.sh",
    "work/error_funcs.gawk",
    "work/make_histogram.gawk",
    "work/turn_histogram_into_polygonal_line.gawk",
  ]
  tit[png_target] = f"plotting parag size histograms for {ivt_name}"
  mak[png_target] = ( f"{plotter} {ivt_name} {usize} '{color}' {bin_size}", )
  return png_target
  # ----------------------------------------------------------------------
  
def add_rules_for_double_size_hist_plot(pre, mak, tit, ivt_name0, usize0, color0, ivt_name1, usize1, color1, bin_size):
  # Registers in {pre,mak,tit} the rule for the plot target
  # "{ivt_name0}-{ivt_name1}-upp-hist.png", which compares the
  # histograms of the parag sizes listed in the files
  # "res/{ivt_name0}.upp" and "res/{ivt_name1}.upp".
  #
  # For each of the two data sets, the name, the size {usize0,usize1}
  # and the single-quoted color {color0,color1} are passed to the
  # plotting script, followed by the common {bin_size}.
  #
  # Returns the target name.

  png_target = f"{ivt_name0}-{ivt_name1}-upp-hist.png"
  plotter = "plot_double_parag_size_histograms.sh"

  # Scripts come first in the prerequisite list ...
  tools = [
    plotter,
    "make_hist_of_units_per_parag.sh",
    "work/error_funcs.gawk",
    "work/make_histogram.gawk",
    "work/turn_histogram_into_polygonal_line.gawk",
  ]
  # ... followed by one ".upp" data file per compared data set:
  upp_files = [ f"res/{name}.upp" for name in (ivt_name0, ivt_name1) ]

  pre[png_target] = tools + upp_files
  tit[png_target] = f"plotting parag size histograms for {ivt_name0} and {ivt_name1}"
  mak[png_target] = (
    f"{plotter} {ivt_name0} {usize0} '{color0}' {ivt_name1} {usize1} '{color1}' {bin_size}",
  )
  return png_target
  # ----------------------------------------------------------------------

def add_rules_for_word_pos_file(pre, mak, tit, ivt_name, utype, kword, ktag):
  # Registers in {pre,mak,tit} the rule for the target
  # "{ivt_name}-{ktag}.wpo", a file with the positions of the
  # word/pattern {kword} in each line of "res/{ivt_name}.ivt".
  # The {utype} is the type of unit to be searched.
  #
  # The pattern {kword} is wrapped in single quotes on the command
  # line; the other arguments are passed bare.
  #
  # Returns the target name (which carries no "res/" prefix).

  wpo_target = f"{ivt_name}-{ktag}.wpo"
  lister = "list_word_positions_in_lines.py"

  # Source file, the listing script, and the Python library files
  # that the target is declared to depend on:
  pre[wpo_target] = [
    f"res/{ivt_name}.ivt",
    lister,
    "work/error_funcs.py",
    "work/process_funcs.py",
    "note_077_funcs.py",
    "work/chinese_funcs.py",
  ]
  mak[wpo_target] = ( f"{lister} {ivt_name} {utype} '{kword}' {ktag}", )
  tit[wpo_target] = f"making the word positions file {wpo_target}"
  return wpo_target
  # ----------------------------------------------------------------------

def add_rules_for_single_loc_word_pos_plot(pre, mak, tit, ivt_name, utype, color, sloc, kword, ktag):
  # Registers in {pre,mak,tit} the rule for the plot target
  # "{ivt_name}-{ktag}-{sloc}-wpos.png", made by "plot_word_positions.sh"
  # from the word positions file "res/{ivt_name}-{ktag}.wpo" for
  # the word/pattern {kword} and the locus {sloc}.
  #
  # If {pre} has no entry yet for "{ivt_name}-{ktag}.wpo", the rule
  # for that file is registered too.
  #
  # The unit size given to the plot script is {spf.hanzi_per_unit(utype)}.
  # The script's standard output is redirected to "res/{target}".
  #
  # Returns the name of the plot target.

  # Looked up first, so that a failure here leaves the tables untouched:
  usize = spf.hanzi_per_unit(utype)

  wpo_name = f"{ivt_name}-{ktag}"
  wpo_target = wpo_name + ".wpo"

  # Make sure the ".wpo" file itself has a rule.  The local names
  # here are echoed in the assertion message, so they are kept as-is.
  if not (wpo_target in pre):
    wpo_target_1 = add_rules_for_word_pos_file(pre, mak, tit, ivt_name, utype, kword, ktag)
    assert wpo_target == wpo_target_1, f" {wpo_target = !r} {wpo_target_1 = !r}"

  plotter = "plot_word_positions.sh"
  png_target = f"{wpo_name}-{sloc}-wpos.png"

  pre[png_target] = [ f"res/{wpo_target}", plotter, "work/error_funcs.gawk", ]
  mak[png_target] = (
    # The trailing backslash continues the command on the next line:
    f"{plotter} {ivt_name} {usize} '{kword}' {ktag} {sloc} '{color}'\\",
    f"  > res/{png_target}",
  )
  tit[png_target] = f"making the word positions plot {png_target}"
  return png_target
  # ----------------------------------------------------------------------
 
def add_rules_for_many_word_pos_plots \
    ( pre, mak, tit, ivt_name, utype, color, kwt_pairs, sloc_list ):
  # Adds rules to create several plot files
  # "res/{ivt_name}-{ktag}-{sloc}-wpos.png", one for each pair
  # {kword,ktag} in {kwt_pairs} and each locus id {sloc} in {sloc_list}.
  # Each plot is registered by {add_rules_for_single_loc_word_pos_plot}
  # with the same {ivt_name}, {utype} and {color}.
  #
  # Returns the list of plot target names, with the pairs of {kwt_pairs}
  # varying slowest and the loci of {sloc_list} varying fastest.

  targets = []
  for kword, ktag in kwt_pairs:
    for sloc in sloc_list:
      plot_target = add_rules_for_single_loc_word_pos_plot(pre, mak, tit, ivt_name, utype, color, sloc, kword, ktag)
      targets.append(plot_target)
  return targets
  # ----------------------------------------------------------------------

def add_rules_for_double_word_delta_pairs_plot \
    ( pre, mak, tit,
      ivt_name0, utype0, pmag0, kword0, ktag0, sloc_list0, stag0, color0,
      ivt_name1, utype1, pmag1, kword1, ktag1, sloc_list1, stag1, color1
    ):
  # Registers in {pre,mak,tit} the rule for the plot target
  # "{ivt_name0}-{ktag0}-{stag0}-{ivt_name1}-{ktag1}-{stag1}-wdpairs.png"
  # showing two sets of word position delta pairs, for the loci
  # listed in {sloc_list0} and {sloc_list1}, respectively.  The plot
  # script's standard output is redirected to "res/{target}".
  #
  # The word positions come from the files "res/{ivt_name0}-{ktag0}.wpo"
  # and "res/{ivt_name1}-{ktag1}.wpo", which have the raw positions of
  # the key words or patterns {kword0} and {kword1} (of unit types
  # {utype0} and {utype1}) in the transcription files
  # "res/{ivt_name0}.ivt" and "res/{ivt_name1}.ivt".  The rules for
  # those two ".wpo" files are (re)registered here, unconditionally.
  #
  # The scale given to the plot script for each set is {pmag0} (resp.
  # {pmag1}) times {spf.hanzi_per_unit} of the respective unit type.
  # Those two parameters should be close to 1 and may be used to fine
  # tune the nominal unit sizes for particular parags.
  #
  # Each locus list is passed as its items joined by "|" and followed
  # by "[ ]".
  # NOTE(review): if the plot script uses that text as a regular
  # expression, the "[ ]" binds only to the last alternative when a
  # list has more than one locus -- confirm against the script.
  #
  # Returns the name of the plot target.

  # Rules for the two word position files, in the order set 0, set 1:
  wpo_name0 = f"{ivt_name0}-{ktag0}"
  wpo_target0 = add_rules_for_word_pos_file(pre, mak, tit, ivt_name0, utype0, kword0, ktag0)
  assert wpo_target0 == wpo_name0 + ".wpo"

  wpo_name1 = f"{ivt_name1}-{ktag1}"
  wpo_target1 = add_rules_for_word_pos_file(pre, mak, tit, ivt_name1, utype1, kword1, ktag1)
  assert wpo_target1 == wpo_name1 + ".wpo"

  png_target = f"{wpo_name0}-{stag0}-{wpo_name1}-{stag1}-wdpairs.png"

  # Data file, position scale, and plot legend for set 0:
  wpo_file0 = f"res/{wpo_target0}"
  uscale0 = pmag0 * spf.hanzi_per_unit(utype0)
  legend0 = f"{wpo_name0}-{stag0} × {uscale0:.4f}"

  # Same for set 1:
  wpo_file1 = f"res/{wpo_target1}"
  uscale1 = pmag1 * spf.hanzi_per_unit(utype1)
  legend1 = f"{wpo_name1}-{stag1} × {uscale1:.4f}"

  plotter = "plot_two_delta_pair_sets.sh"
  tit[png_target] = f"creating plot res/{png_target} from {wpo_file0}, {wpo_file1}"
  pre[png_target] = [ plotter, wpo_file0, wpo_file1, ]

  # Locus selectors, built only now, after the entries above are in place:
  loci0 = "|".join(sloc_list0)
  loci1 = "|".join(sloc_list1)
  mak[png_target] = (
    f"{plotter}  \\",
    f"    {wpo_file0} {uscale0} '{loci0}[ ]' '{legend0}' '{color0}' \\",
    f"    {wpo_file1} {uscale1} '{loci1}[ ]' '{legend1}' '{color1}' \\",
    f"  > res/{png_target}",
  )
  return png_target
  # ----------------------------------------------------------------------

def add_rules_for_lines_with_pattern_files(pre, mak, tit, ivt_name, utype, kword, ktag):
  # Registers in {pre,mak,tit} the rules for two targets:
  #
  #   "{ivt_name}-with-{ktag}.opa", that lists the lines of the
  #   transcription file "res/{ivt_name}.ivt" with occurrences of
  #   a determinate pattern {kword}; and
  #
  #   "{ivt_name}-with-{ktag}.opc", that counts the lines where the
  #   pattern occurs at position {p}, for each {p}.
  #
  # For SPS, the input should be a "wp" file, even if the {utype} is "ec".
  #
  # The output files will be "res/{ivt_name}-with-{ktag}.opa"
  # and "res/{ivt_name}-with-{ktag}.opc".
  #
  # Returns the name of the ".opc" target (without the "res/").

  stem = f"{ivt_name}-with-{ktag}"

  # Step 1: the ".opa" file, with the lines that have {kword}.
  ivt_file = f"res/{ivt_name}.ivt"
  opa_target = stem + ".opa"
  opa_file = "res/" + opa_target
  lister = "list_lines_with_pattern.py"

  tit[opa_target] = f"listing lines of {ivt_file} with '{kword}' ..."
  pre[opa_target] = [ ivt_file, lister, ]
  mak[opa_target] = ( f"{lister} {ivt_file} {utype} '{kword}' {opa_file}", )

  # Step 2: the ".opc" file, with counts of lines by pattern position,
  # computed from the ".opa" file of step 1.
  opc_target = stem + ".opc"
  opc_file = "res/" + opc_target
  counter = "count_lines_by_pattern_position.gawk"

  tit[opc_target] = f"computing line counts by pattern position res/{opc_target} ..."
  pre[opc_target] = [ opa_file, counter, ]
  mak[opc_target] = ( f"cat {opa_file} | {counter} > {opc_file}", )
  return opc_target
  # ----------------------------------------------------------------------

def add_rules_for_word_tuple_file(pre, mak, tit, ivt_name, utype, tsize, kword, ktag):
  # Adds rules to extract and count tuples of {tsize} consecutive units.
  # If {kword} is not {None}, takes only tuples that begin with a unit
  # that matches the RE pattern {kword} in whole or in part.
  #
  # The input file will be "res/{ivt_name}.ivt".
  #
  # The extracted tuples, with locus ID and position, will be written to
  # "res/{ivt_name}-{tsize}-with-{ktag}.tup".
  # The counts and frequencies of the tuples will be written to
  # "res/{ivt_name}-{tsize}-with-{ktag}.tfr".
  # The part "-with-{ktag}" is omitted from both names if {kword} is {None}.
  #
  # The tuple size must satisfy {1 <= tsize <= 99}; this is checked
  # with an assertion.
  #
  # Returns the name of the ".tfr" target (without the "res/").

  assert tsize > 0 and tsize <= 99, f"bad {tsize = }"

  tup_name = f"{ivt_name}-{tsize}"
  # Identity test: only the {None} singleton itself means "no key word".
  if kword is not None: tup_name += f"-with-{ktag}"

  # First make a ".tup" file with extracted tuples per parag.
  # NOTE(review): when {kword} is {None} the command line below still
  # carries the text 'None' and the {ktag}; presumably the script
  # takes that to mean "no key word" -- confirm against the script.
  tup_source = f"res/{ivt_name}.ivt"
  tup_target = f"{tup_name}.tup"
  tup_script = "extract_keyed_tuples.py"
  tit[tup_target] = f"extracting tuples to file res/{tup_target} ..."
  pre[tup_target] = [ tup_source, tup_script ]
  mak[tup_target] = (
    f"{tup_script} {ivt_name} {utype} {tsize} '{kword}' {ktag}",
  )

  # Now compute counts and freqs of tuples in the ".tup" file.
  # The pipeline takes field 4 of each line, counts the repeats of
  # each distinct value, hands the counts to the frequency script, and
  # sorts the result numerically on its first field, largest first.
  tfr_source = f"res/{tup_target}"
  tfr_target = f"{tup_name}.tfr"
  freq_script = "compute_freqs_from_counts.py"
  tit[tfr_target] = f"computing tuple counts and freqs file res/{tfr_target} ..."
  pre[tfr_target] = [ tfr_source, freq_script ]
  mak[tfr_target] = (
    f"cat {tfr_source}" + ' \\',
    r" | gawk '// { print $4 }'" + ' \\',
    r" | sort | uniq -c" + ' \\',
    f" | {freq_script} -encoding utf -total 'TOTAL'" + ' \\',
    f" | sort -b -k1,1nr" + ' \\',
    f" > res/{tfr_target}",
  )
  return tfr_target
  # ----------------------------------------------------------------------

def add_rules_for_parag_parag_coin_image(pre, mak, tit, ivt_name0, ivt_name1):
  # Returns the name "{ivt_name0}-{ivt_name1}-coin-map.png" (sans "res/")
  # of the target file that is meant to show coincidences of sizes etc
  # between the parags of "res/{ivt_name0}-par.ivt" and
  # "res/{ivt_name1}-par.ivt".
  #
  # NOTE: this is still a stub.  No entries are added to {pre,mak,tit};
  # only the target name is computed.
  #
  # TODO: add the actual rules.  The unfinished draft, kept for
  # reference (the "???" marks doubt about which helpers are needed):
  #
  #   script = "create_parag_parag_coin_image.sh"
  #   # ???
  #   # mkwdhist_script = "make_hist_of_units_per_parag.sh"
  #   # mkhist_script = "work/make_histogram.gawk"
  #   # h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
  #   # efn_gawk_lib = "work/error_funcs.gawk"
  #   #
  #   # pre[target] = [ script, mkwdhist_script, efn_gawk_lib, mkhist_script, h2poly_script ]
  #   # pre[target].append(f"{dir0}/{ivt_name}.upp")
  #   # pre[target].append(f"{dir1}/{ivt_name}.upp")
  #   # tit[target] = f"plotting parag size histograms for {ivt_name0} and {ivt_name1}"
  #   # mak[target] = (
  #   #     f"{script} {ivt_name0} {ivt_name1}",
  #   #   )

  return f"{ivt_name0}-{ivt_name1}-coin-map.png"
  # ----------------------------------------------------------------------

def hist_color(book, bsub, utype):
  # Chooses the color for histograms of counts of {utype} units in
  # the given {book} and {bsub}.  Returns whatever {make_color} returns.
  #
  # Each valid {book,bsub} combination selects a central RGB hue {chue}
  # and an RGB shift direction {dhue}.  The {utype} then selects how
  # far to shift: not at all for "ec"/"ch", by {+dhue} for "wp"/"ps",
  # and by {-dhue} for "wc"/"pj".
  #
  # The only subset accepted for "bencao" is "fu" (checked with an
  # assertion).  Any other invalid argument is reported through
  # {prog_error}, which presumably does not return.

  if book == "bencao":
    assert bsub == "fu", f"invalid bencao subset {bsub = !r}"
    chue = ( 1.000, 0.150, 0.000, )
    dhue = ( 0.000, -0.100, +0.100, )
  elif book == "starps" and bsub == "fu":
    chue = ( 0.000, 0.600, 0.300, )
    dhue = ( +0.200, 0.000, -0.200, )
  elif book == "starps" and bsub == "gd":
    chue = ( 0.000, 0.300, 0.800, )
    dhue = ( +0.200, -0.200, 0.000, )
  elif book == "starps":
    # Known book, unknown subset:
    prog_error(f"bad {bsub = }")
  else:
    prog_error(f"bad {book = }")

  # Signed amount of {dhue} to add, by unit type:
  if utype in ( "ec", "ch" ):
    pert = 0.0
  elif utype in ( "wp", "ps" ):
    pert = +1.0
  elif utype in ( "wc", "pj" ):
    pert = -1.0
  else:
    prog_error(f"bad {utype = }")

  return make_color(chue, dhue, pert)
  # ----------------------------------------------------------------------

def make_color(chue, dhue, pert):
  # Returns a color string of the form "#rrggbb" (lowercase hex),
  # computed from the RGB triple {chue} shifted by {pert} times the
  # RGB triple {dhue}.
  #
  # Each shifted channel is clamped to the range [0,1].  The color is
  # then mixed with its own gray level (weights 0.3, 0.6, 0.1 for the
  # three channels) and scaled down; the mixing and the scaling both
  # use the factor {1 - 0.33*|pert|}, so that {pert = 0} leaves the
  # clamped {chue} unchanged.  Channels are truncated, not rounded,
  # to integers in 0..255.
  debug = False

  # Saturation and brightness are the same function of {|pert|}:
  strength = 1 - 0.33*fabs(pert)
  sat = strength
  val = strength

  # Shifted hue, channel by channel, clamped to [0,1]:
  shifted = []
  for k in range(3):
    channel = chue[k] + pert*dhue[k]
    shifted.append(min(1.0, max(0.0, channel)))

  gray = 0.3*shifted[0] + 0.6*shifted[1] + 0.1*shifted[2]

  # Desaturate toward {gray}, darken, and quantize:
  rgb = [ int(255*val*(sat*c + (1-sat)*gray)) for c in shifted ]
  code = "#" + "".join(f"{c:02x}" for c in rgb[:3])
  if debug: err.write(f"^^ col = {rgb!r} hex = {code!r}\n")
  return code
  # ----------------------------------------------------------------------

def cleanup(which):
  # Runs the shell command "rm -f res/{which}*" through {bash}.
  # The text of {which} is pasted into the command as given, without
  # quoting.  Returns nothing.
  command = f"rm -f res/{which}*"
  bash(command)