#! /usr/bin/python3
# Last edited on 2026-02-27 05:43:10 by stolfi

import sys, os, re
from sys import stderr as err
from error_funcs import arg_error, prog_error
from process_funcs import bash
from note_077_funcs import enc_from_unit
from math import fabs

# Conventions used by the "add_..._rules" functions in this file:
#
#   {pre} is a dict that maps a target name to the list of its prerequisites;
#   {mak} is a dict that maps a target name to a tuple of shell command strings;
#   {tit} is a dict that maps a target name to a one-line title string.
#
# Target names are stored without the "res/" prefix, while prerequisites
# that are themselves generated files are listed with that prefix.
# Each function returns the name(s) of the target(s) it registered.

def add_single_size_hist_plot_rules(pre, mak, tit, name, color, bin_size):
    # Rules to create the plot file "{name}-upp-hist.png" with the
    # histogram of parag sizes listed in file "res/{name}.upp".
    #
    # The {color} is passed to the plot script inside single quotes;
    # {bin_size} is passed verbatim as the last argument.
    # Returns the target name "{name}-upp-hist.png".

    target = f"{name}-upp-hist.png"

    # Histogram plotting script and its imported modules:
    script = "plot_single_parag_size_histograms.sh"
    mkwdhist_script = "make_hist_of_units_per_parag.sh"
    mkhist_script = "work/make_histogram.gawk"
    h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
    erfn_gawk_lib = "work/error_funcs.gawk"

    pre[target] = [ script, mkwdhist_script, erfn_gawk_lib, mkhist_script, h2poly_script ]
    pre[target].append(f"res/{name}.upp")
    tit[target] = f"plotting parag size histograms for {name}"
    mak[target] = (
        f"{script} {name} '{color}' {bin_size}",
    )
    return target
    # ----------------------------------------------------------------------

def add_double_size_hist_plot_rules(pre, mak, tit, name0, color0, name1, color1, bin_size):
    # Rules to create the plot file "{name0}-{name1}-upp-hist.png"
    # that compares the histograms of parag sizes listed in files
    # "res/{name0}.upp" and "res/{name1}.upp".
    #
    # Each color is passed to the plot script inside single quotes,
    # right after the corresponding name.
    # Returns the target name "{name0}-{name1}-upp-hist.png".

    target = f"{name0}-{name1}-upp-hist.png"

    # Histogram plotting script and its imported modules:
    script = "plot_double_parag_size_histograms.sh"
    mkwdhist_script = "make_hist_of_units_per_parag.sh"
    mkhist_script = "work/make_histogram.gawk"
    h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
    erfn_gawk_lib = "work/error_funcs.gawk"

    pre[target] = [ script, mkwdhist_script, erfn_gawk_lib, mkhist_script, h2poly_script ]
    # Both data files are prerequisites:
    for name in (name0, name1):
        pre[target].append(f"res/{name}.upp")
    tit[target] = f"plotting parag size histograms for {name0} and {name1}"
    mak[target] = (
        f"{script} {name0} '{color0}' {name1} '{color1}' {bin_size}",
    )
    return target
    # ----------------------------------------------------------------------

def add_word_pos_file_rules(pre, mak, tit, book, bsub, enc, iext, unit, word, tag):
    # Adds rules and commands to create a file "res/{book}-{bsub}-{unit}-{tag}.wpo"
    # with the positions of {word} in each parag of file "res/{book}-{bsub}-{enc}.{iext}"
    # where {enc} is "eva" for "starps" and "chu" for "bencao" and "ch".
    #
    # The {enc} is used only to name the prerequisite file; it is not
    # passed to the script, which receives {book}, {bsub}, {iext}, {unit},
    # the quoted {word}, and {tag}.
    # Returns the target name "{book}-{bsub}-{unit}-{tag}.wpo".

    source = f"res/{book}-{bsub}-{enc}.{iext}"
    target = f"{book}-{bsub}-{unit}-{tag}.wpo"

    # Script that lists the positions, and the libraries listed with it:
    gwp_script = "list_word_positions_in_lines.py"
    erfn_py_lib = "work/error_funcs.py"
    prfn_py_lib = "work/process_funcs.py"
    chin_py_lib = "work/chinese_funcs.py"
    n077_py_lib = "note_077_funcs.py"

    tit[target] = f"making the word positions file {target}"
    pre[target] = [ source, gwp_script, erfn_py_lib, prfn_py_lib, n077_py_lib, chin_py_lib, ]
    mak[target] = (
        f"{gwp_script} {book} {bsub} {iext} {unit} '{word}' {tag}",
    )
    return target
    # ----------------------------------------------------------------------

def add_single_loc_word_pos_file_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag):
    # Adds rules and commands to create a file "res/{book}-{bsub}-{unit}-{sloc}-{tag}.wpo"
    # with the lines of "res/{book}-{bsub}-{unit}-{tag}.wpo" that begin
    # with {sloc} followed by a blank, that is, the word positions in parag {sloc}.
    #
    # The {word} parameter is not used here; the word is implied by {tag}.
    # Returns the target name "{book}-{bsub}-{unit}-{sloc}-{tag}.wpo".

    source = f"res/{book}-{bsub}-{unit}-{tag}.wpo"
    target = f"{book}-{bsub}-{unit}-{sloc}-{tag}.wpo"

    tit[target] = f"making the single-parag word positions file {target}"
    pre[target] = [ source, ]
    # NOTE(review): {sloc} is inserted unescaped into the regular expression.
    # NOTE(review): recent GNU grep reports "egrep" as obsolescent
    # in favor of "grep -E" -- confirm before changing the command.
    mak[target] = (
        f"cat {source} | egrep -e '^{sloc}[ ]' | cat > res/{target}",
    )
    return target
    # ----------------------------------------------------------------------

def add_single_loc_word_pos_plot_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag):
    # Adds rules and commands to create a file "res/{name}-{sloc}-{tag}-wpos.png"
    # that plots the word positions listed in "res/{name}-{sloc}-{tag}.wpo"
    # for parag {sloc}, where {name} is "{book}-{bsub}-{unit}".
    #
    # The plot color and the {shift} argument of the plot script depend
    # on {book}, which must be "starps" or "bencao".
    # The {word} parameter is not used here; the plot is labeled with {tag}.
    # Returns the target name "{name}-{sloc}-{tag}-wpos.png".

    plot_script = "plot_word_positions.sh"
    erfn_gawk_lib = "work/error_funcs.gawk"

    if book == "starps":
        color = starps_hist_color(bsub, unit)
        shift = 0.15
    elif book == "bencao":
        color = bencao_hist_color(bsub, unit)
        shift = 0.00
    else:
        assert False, f"bad {book = }"

    name = f"{book}-{bsub}-{unit}"
    source = f"res/{name}-{sloc}-{tag}.wpo"
    target = f"{name}-{sloc}-{tag}-wpos.png"

    tit[target] = f"making the word positions plot {target}"
    pre[target] = [ source, plot_script, erfn_gawk_lib, ]
    mak[target] = (
        f"cat {source} | {plot_script} {shift} '{color}' '{tag} in {sloc}' > res/{target}",
    )
    return target
    # ----------------------------------------------------------------------

def add_many_word_pos_plot_rules(pre, mak, tit, book, bsub, iext, unit, pattags, slocs):
    # Adds rules for the word position files and plots of several words
    # in several parags.
    #
    # The {pattags} must be a sequence of pairs {(word, tag)}, and {slocs}
    # a sequence of parag locators. For each pair, adds the rules for the
    # all-parags file "{book}-{bsub}-{unit}-{tag}.wpo", unless {pre} already
    # has that target; then, for each {sloc}, adds the rules for the
    # single-parag ".wpo" file and for its "-wpos.png" plot.
    #
    # Returns a list with the names of the ".wpo" targets that were
    # added by this call and of all the plot targets.

    targets = []
    enc = enc_from_unit(unit)
    for word, tag in pattags:
        wpo_target = f"{book}-{bsub}-{unit}-{tag}.wpo"
        if wpo_target not in pre:
            wpo_target_1 = add_word_pos_file_rules(pre, mak, tit, book, bsub, enc, iext, unit, word, tag)
            # The name predicted here must agree with the one actually registered:
            assert wpo_target == wpo_target_1, f" {wpo_target = !r} {wpo_target_1 = !r}"
            # NOTE(review): the indentation of the original was lost; this
            # assumes that the target is reported only when its rules are
            # created by this call -- confirm against the pristine file.
            targets.append(wpo_target)
        for sloc in slocs:
            # The single-parag file is only a prerequisite of the plot,
            # so its name is not included in {targets}:
            add_single_loc_word_pos_file_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag)
            plot_target = add_single_loc_word_pos_plot_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag)
            targets.append(plot_target)
    return targets
# ----------------------------------------------------------------------

def add_word_tuple_file_rules(pre, mak, tit, book, bsub, unit, tsize):
    # Adds rules to make "res/{book}-{bsub}-{unit}-{tsize}.tfr" that contains
    # counts and frequencies of all tuples of {tsize} consecutive units
    # of type {unit} from file "res/{book}-{bsub}-{enc}.ivp"
    # where {enc} is obtained from {unit} by {enc_from_unit}.
    #
    # The {pre}, {mak}, and {tit} are dicts keyed by target name (sans "res/"),
    # that receive the prerequisites list, the tuple of shell commands, and
    # the title of each target. The {tsize} must be in {1..99}.
    #
    # Two targets are registered: an intermediate ".tup" file and the
    # final ".tfr" file. Returns the name of the latter.

    enc = enc_from_unit(unit)
    assert tsize > 0 and tsize <= 99, f"bad {tsize = }"

    # First make a ".tup" file with extracted tuples per parag:
    tup_source = f"res/{book}-{bsub}-{enc}.ivp"
    tup_target = f"{book}-{bsub}-{unit}-{tsize}.tup"
    tup_script = "extract_word_tuples.py"

    tit[tup_target] = f"extracting tuples to file res/{tup_target} ..."
    pre[tup_target] = [ tup_source, tup_script ]
    mak[tup_target] = (
        f"{tup_script} {book} {bsub} {unit} {tsize}",
    )

    # Now compute counts and freqs of tuples in input file:
    tfr_source = f"res/{tup_target}"
    tfr_target = f"{book}-{bsub}-{unit}-{tsize}.tfr"
    freq_script = "compute_freqs_from_counts.py"
    freq_script_enc = "bytes" if enc == "eva" else "utf"

    tit[tfr_target] = f"computing tuple counts and freqs file res/{tfr_target} ..."
    pre[tfr_target] = [ tfr_source, freq_script ]
    # A single shell pipeline, split over several strings that each
    # end with a backslash continuation (except the last one):
    mak[tfr_target] = (
        f"cat {tfr_source}" + ' \\',
        r"  | gawk '// { print $4 }'" + ' \\',
        r"  | sort | uniq -c" + ' \\',
        f"  | {freq_script} -encoding {freq_script_enc} -total 'TOTAL'" + ' \\',
        f"  | sort -b -k1,1nr" + ' \\',
        f"  > res/{tfr_target}",
    )
    return tfr_target
    # ----------------------------------------------------------------------

def add_wpos_delta_file_rules(pre, mak, tit, book, bsub, unit, word, tag, parags):
    # Adds rules and commands to create a file "res/{book}-{bsub}-{unit}-{tag}.wpd"
    # by piping through "list_wpos_delta_pairs.py" the lines of
    # "res/{book}-{bsub}-{unit}-{tag}.wpo" that begin with one of the
    # strings in {parags} followed by a blank.
    #
    # The {parags} must be a sequence of strings; they are joined with "|"
    # into the regular expression, unescaped. The {word} parameter is not used.
    # Returns the target name "{book}-{bsub}-{unit}-{tag}.wpd".

    target = f"{book}-{bsub}-{unit}-{tag}.wpd"
    source = f"res/{book}-{bsub}-{unit}-{tag}.wpo"
    script = "list_wpos_delta_pairs.py"

    tit[target] = f"creating word position deltas file res/{target} from {source}"
    pre[target] = [ source, script ]
    # NOTE(review): recent GNU grep reports "egrep" as obsolescent
    # in favor of "grep -E" -- confirm before changing the command.
    mak[target] = (
        f"cat {source} | egrep -e '^({'|'.join(parags)}) ' | {script} > res/{target}",
    )
    return target
    # ----------------------------------------------------------------------

def add_parag_parag_coin_image_rules(pre, mak, tit, name0, name1):
    # Meant to add rules and commands to create a target file
    # "res/{name0}-{name1}-coin-map.png" that shows coincidences of sizes etc
    # between the parags of "res/{name0}.ivp" and "res/{name1}.ivp".
    # Returns the name of the target (sans "res/").
    #
    # NOTE: this function is unfinished. The code that would fill
    # {pre}, {mak}, and {tit} is commented out below, so at present
    # no rule is registered; only the target name is computed and returned.

    target = f"{name0}-{name1}-coin-map.png"

    # Image creation script (kept for the unfinished rules below):
    script = "create_parag_parag_coin_image.sh"

    # ???
    # mkwdhist_script = "make_hist_of_units_per_parag.sh"
    # mkhist_script = "work/make_histogram.gawk"
    # h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
    # erfn_gawk_lib = "work/error_funcs.gawk"
    #
    # pre[target] = [ script, mkwdhist_script, erfn_gawk_lib, mkhist_script, h2poly_script ]
    # pre[target].append(f"{dir0}/{name}.upp")
    # pre[target].append(f"{dir1}/{name}.upp")
    # tit[target] = f"plotting parag size histograms for {name0} and {name1}"
    # mak[target] = (
    #   f"{script} {name0} {name1}",
    # )
    return target
    # ----------------------------------------------------------------------

def starps_hist_color(bsub, unit):
    # Chooses the color for histograms of {unit} counts in SPS subset {bsub}.
    #
    # The {bsub} must be "fu" or "gd" and selects the central hue {chue}
    # and the hue variation {dhue}. The {unit} must be "ec", "wp", or "wc"
    # and selects the perturbation (0, +1, or -1) passed to {make_color}.
    # Any other value is reported through {prog_error}.
    # Returns the color as a string "#rrggbb".

    if bsub == "fu":
        chue = ( 0.000, 0.600, 0.300, )
        dhue = ( +0.200, 0.000, -0.200, )
    elif bsub == "gd":
        chue = ( 0.000, 0.300, 0.800, )
        dhue = ( +0.200, -0.200, 0.000, )
    else:
        prog_error(f"bad {bsub = }")

    if unit == "ec":
        color = make_color(chue, dhue, 0.0)
    elif unit == "wp":
        color = make_color(chue, dhue, +1.0)
    elif unit == "wc":
        color = make_color(chue, dhue, -1.0)
    else:
        prog_error(f"bad {unit = }")
    return color
    # ----------------------------------------------------------------------

def bencao_hist_color(bsub, unit):
    # Chooses the color for histograms of {unit} counts in SBJ subset {bsub}.
    #
    # The {bsub} must be "fu". The {unit} must be "ch", "ps", or "pj"
    # and selects the perturbation (0, +1, or -1) passed to {make_color};
    # any other value is reported through {prog_error}.
    # Returns the color as a string "#rrggbb".

    assert bsub == "fu", f"invalid bencao subset {bsub = !r}"
    chue = ( 1.000, 0.150, 0.000, )
    dhue = ( 0.000, -0.100, +0.100, )

    if unit == "ch":
        color = make_color(chue, dhue, 0.0)
    elif unit == "ps":
        color = make_color(chue, dhue, +1.0)
    elif unit == "pj":
        color = make_color(chue, dhue, -1.0)
    else:
        prog_error(f"bad {unit = }")
    return color
    # ----------------------------------------------------------------------

def make_color(chue, dhue, pert):
    # Returns a color string "#rrggbb" derived from the RGB triple {chue}
    # displaced by {pert} times the RGB triple {dhue}.
    #
    # Each displaced component is clipped to {[0 _ 1]}. The result is then
    # mixed with its own luminance and darkened, both by amounts that
    # grow with the absolute value of {pert}; with {pert = 0} the result
    # is just the clipped {chue}. Components are truncated to integers
    # in {0..255}.

    debug = False

    # Saturation and brightness factors, both 1 when {pert} is zero:
    sat = 1 - 0.33*fabs(pert)
    val = 1 - 0.33*fabs(pert)

    # Perturbed hue, clipped to the unit cube:
    phue = [ min(1.0, max(0.0, chue[k] + pert*dhue[k])) for k in range(3) ]

    # Weighted sum of the components, used as the gray level to mix in:
    lum = 0.3*phue[0] + 0.6*phue[1] + 0.1*phue[2]
    col = [ int(255*val*(sat*phue[k] + (1-sat)*lum)) for k in range(3) ]

    # The local is not called {hex} so as not to hide the builtin function:
    hex_color = f"#{col[0]:02x}{col[1]:02x}{col[2]:02x}"
    if debug: err.write(f"^^ {col = } hex = {hex_color!r}\n")
    return hex_color
    # ----------------------------------------------------------------------

def cleanup(which):
    # Removes all files in "res/" whose names begin with {which},
    # by running "rm -f" through {bash}.
    # NOTE(review): an empty {which} would match every file in "res/".
    bash(f"rm -f res/{which}*")
    return
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~