#! /usr/bin/python3
# Last edited on 2026-03-11 10:04:38 by stolfi

import os
import re
import sys
from math import fabs
from sys import stderr as err

import size_position_funcs as spf
from error_funcs import arg_error, prog_error
from process_funcs import bash

# Conventions used by all the {add_rules_for_...} functions below:
#
# The arguments {pre}, {mak}, and {tit} are dictionaries indexed by
# target name. For each target that it defines, a function sets
# {pre[target]} to the list of prerequisite file names, {mak[target]}
# to a tuple of shell command lines that build the target, and
# {tit[target]} to a one-line title for the rule. These tables are
# presumably consumed by a make-like driver defined elsewhere.
#
# Target names are written without the "res/" directory prefix, while
# prerequisites that are themselves built files carry that prefix.


def add_rules_for_units_per_line_file(pre, mak, tit, ivt_name, utype):
    # Adds rules to create the file "res/{ivt_name}.upp" from the
    # transcription file "res/{ivt_name}.ivt", by running the script
    # "count_units_per_line.py" with arguments {ivt_name} and {utype}.
    #
    # Returns the target name "{ivt_name}.upp" (sans "res/").

    pfn_py_lib = "work/process_funcs.py"
    cfn_py_lib = "work/chinese_funcs.py"
    nfn_py_lib = "note_077_funcs.py"

    target = f"{ivt_name}.upp"
    source = f"res/{ivt_name}.ivt"

    # The size counting script and its imported modules:
    script = "count_units_per_line.py"
    script_options = f"{ivt_name} {utype}"

    tit[target] = f"making parag size file {target} from {source}"
    pre[target] = [source, script, pfn_py_lib, nfn_py_lib, cfn_py_lib]
    mak[target] = (
        f"{script} {script_options}",
    )
    return target
    # ----------------------------------------------------------------------


def add_rules_for_single_size_hist_plot(pre, mak, tit, ivt_name, usize, color, bin_size):
    # Rules to create the plot file "{ivt_name}-upp-hist.png" with the
    # histogram of parag sizes listed in files "res/{ivt_name}.upp".
    #
    # The parameters {usize}, {color}, and {bin_size} are passed
    # unchanged to the plotting script; {color} is single-quoted in the
    # command.
    #
    # Returns the target name (sans "res/").

    target = f"{ivt_name}-upp-hist.png"

    # Units-per-parag source file:
    upp_source = f"res/{ivt_name}.upp"

    # Histogram plotting script and its imported modules:
    script = "plot_single_parag_size_histograms.sh"
    mkwdhist_script = "make_hist_of_units_per_parag.sh"
    mkhist_script = "work/make_histogram.gawk"
    h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
    efn_gawk_lib = "work/error_funcs.gawk"

    pre[target] = [
        upp_source, script, mkwdhist_script, efn_gawk_lib, mkhist_script, h2poly_script,
    ]
    tit[target] = f"plotting parag size histograms for {ivt_name}"
    mak[target] = (
        f"{script} {ivt_name} {usize} '{color}' {bin_size}",
    )
    return target
    # ----------------------------------------------------------------------


def add_rules_for_double_size_hist_plot(
    pre, mak, tit,
    ivt_name0, usize0, color0,
    ivt_name1, usize1, color1,
    bin_size,
):
    # Rules to create the plot file "{ivt_name0}-{ivt_name1}-upp-hist.png"
    # that compare the histograms of parag sizes listed in files
    # "res/{ivt_name0}.upp" and "res/{ivt_name1}.upp".
    #
    # The unit sizes {usize0,usize1} are passed to the script as given;
    # presumably they must be consistent with the unit types used to
    # build {ivt_name0,ivt_name1} (this function does not check that).
    #
    # Returns the target name (sans "res/").

    target = f"{ivt_name0}-{ivt_name1}-upp-hist.png"

    # Histogram plotting script and its imported modules:
    script = "plot_double_parag_size_histograms.sh"
    mkwdhist_script = "make_hist_of_units_per_parag.sh"
    mkhist_script = "work/make_histogram.gawk"
    h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
    efn_gawk_lib = "work/error_funcs.gawk"

    pre[target] = [script, mkwdhist_script, efn_gawk_lib, mkhist_script, h2poly_script]
    # The two ".upp" data files come after the scripts, in argument order:
    pre[target].extend(f"res/{ivt_name}.upp" for ivt_name in (ivt_name0, ivt_name1))

    tit[target] = f"plotting parag size histograms for {ivt_name0} and {ivt_name1}"
    mak[target] = (
        f"{script} {ivt_name0} {usize0} '{color0}' {ivt_name1} {usize1} '{color1}' {bin_size}",
    )
    return target
    # ----------------------------------------------------------------------


def add_rules_for_word_pos_file(pre, mak, tit, ivt_name, utype, kword, ktag):
    # Adds rules and commands to create a file "res/{ivt_name}-{ktag}.wpo"
    # with the positions of word/pattern {kword} in each line of file
    # "res/{ivt_name}.ivt". The {utype} is the type of unit to be searched.
    #
    # Returns the target name "{ivt_name}-{ktag}.wpo" (sans "res/").

    source = f"res/{ivt_name}.ivt"
    target = f"{ivt_name}-{ktag}.wpo"

    gwp_script = "list_word_positions_in_lines.py"
    efn_py_lib = "work/error_funcs.py"
    pfn_py_lib = "work/process_funcs.py"
    cfn_py_lib = "work/chinese_funcs.py"
    nfn_py_lib = "note_077_funcs.py"

    tit[target] = f"making the word positions file {target}"
    pre[target] = [source, gwp_script, efn_py_lib, pfn_py_lib, nfn_py_lib, cfn_py_lib]
    mak[target] = (
        f"{gwp_script} {ivt_name} {utype} '{kword}' {ktag}",
    )
    return target
    # ----------------------------------------------------------------------


def add_rules_for_single_loc_word_pos_plot(pre, mak, tit, ivt_name, utype, color, sloc, kword, ktag):
    # Adds rules to create a file "res/{ivt_name}-{ktag}-{sloc}-wpos.png"
    # that plots the word positions listed in "res/{ivt_name}-{ktag}.wpo"
    # for the given {kword} in parag {sloc}.
    #
    # If there is no rule yet for the ".wpo" file, also adds it, by
    # calling {add_rules_for_word_pos_file}.
    #
    # Returns the name of the plot target (sans "res/").

    usize = spf.hanzi_per_unit(utype)

    plot_script = "plot_word_positions.sh"
    efn_gawk_lib = "work/error_funcs.gawk"

    wpo_name = f"{ivt_name}-{ktag}"
    wpo_target = f"{wpo_name}.wpo"
    if wpo_target not in pre:
        # The ".wpo" rule is shared by all loci; create it only once.
        wpo_target_1 = add_rules_for_word_pos_file(pre, mak, tit, ivt_name, utype, kword, ktag)
        assert wpo_target == wpo_target_1, f" {wpo_target = !r} {wpo_target_1 = !r}"

    source = f"res/{wpo_target}"
    png_target = f"{wpo_name}-{sloc}-wpos.png"

    tit[png_target] = f"making the word positions plot {png_target}"
    pre[png_target] = [source, plot_script, efn_gawk_lib]
    # The trailing backslash joins the two command lines in the shell:
    mak[png_target] = (
        f"{plot_script} {ivt_name} {usize} '{kword}' {ktag} {sloc} '{color}'" + "\\",
        f" > res/{png_target}",
    )
    return png_target
    # ----------------------------------------------------------------------


def add_rules_for_many_word_pos_plots(
    pre, mak, tit,
    ivt_name, utype, color,
    kwt_pairs, sloc_list,
):
    # Adds rules to create several plot files
    # "res/{ivt_name}-{ktag}-{sloc}-wpos.png" that plot the word
    # positions listed in "res/{ivt_name}-{ktag}.wpo" for each pair
    # {kword,ktag} in {kwt_pairs} and each locus id {sloc} in {sloc_list}.
    #
    # Returns the list of plot target names, in the order created
    # (pairs in the outer loop, loci in the inner loop).

    targets = []
    for kword, ktag in kwt_pairs:
        for sloc in sloc_list:
            plot_target = add_rules_for_single_loc_word_pos_plot(
                pre, mak, tit, ivt_name, utype, color, sloc, kword, ktag
            )
            targets.append(plot_target)
    return targets
    # ----------------------------------------------------------------------


def add_rules_for_double_word_delta_pairs_plot(
    pre, mak, tit,
    ivt_name0, utype0, pmag0, kword0, ktag0, sloc_list0, stag0, color0,
    ivt_name1, utype1, pmag1, kword1, ktag1, sloc_list1, stag1, color1,
):
    # Adds rules to create the plot file
    # "res/{ivt_name0}-{ktag0}-{stag0}-{ivt_name1}-{ktag1}-{stag1}-wdpairs.png"
    # showing two sets of word position delta pairs
    # for the loci listed in {sloc_list0} and {sloc_list1}, respectively.
    #
    # The word positions are obtained from files
    # "res/{ivt_name0}-{ktag0}.wpo" and "res/{ivt_name1}-{ktag1}.wpo",
    # which have the raw positions of key words or patterns {kword0}
    # and {kword1} (of unit types {utype0} and {utype1}) in the
    # transcription files "res/{ivt_name0}.ivt" and "res/{ivt_name1}.ivt".
    #
    # The raw word positions are scaled by {pmag0} and {pmag1}
    # times the respective unit sizes as defined by {spf.hanzi_per_unit}.
    # Those two parameters should be close to 1 and may be
    # used to fine tune the nominal unit sizes for particular parags.
    #
    # The loci in each list are joined with "|" and followed by "[ ]"
    # to form the pattern given to the plotting script.
    #
    # This function also adds rules to make the necessary ".wpo" files
    # (unconditionally, replacing any rules already set for them).
    #
    # Returns the name of the plot target (sans "res/").

    wpo_name0 = f"{ivt_name0}-{ktag0}"
    wpo_target0 = add_rules_for_word_pos_file(pre, mak, tit, ivt_name0, utype0, kword0, ktag0)
    assert wpo_target0 == f"{wpo_name0}.wpo"

    wpo_name1 = f"{ivt_name1}-{ktag1}"
    wpo_target1 = add_rules_for_word_pos_file(pre, mak, tit, ivt_name1, utype1, kword1, ktag1)
    assert wpo_target1 == f"{wpo_name1}.wpo"

    target = f"{wpo_name0}-{stag0}-{wpo_name1}-{stag1}-wdpairs.png"

    source0 = f"res/{wpo_target0}"
    uscale0 = pmag0 * spf.hanzi_per_unit(utype0)
    title0 = f"{wpo_name0}-{stag0} × {uscale0:.4f}"

    source1 = f"res/{wpo_target1}"
    uscale1 = pmag1 * spf.hanzi_per_unit(utype1)
    title1 = f"{wpo_name1}-{stag1} × {uscale1:.4f}"

    script = "plot_two_delta_pair_sets.sh"

    tit[target] = f"creating plot res/{target} from {source0}, {source1}"
    pre[target] = [script, source0, source1]
    mak[target] = (
        f"{script} " + " \\",
        f" {source0} {uscale0} '{'|'.join(sloc_list0)}[ ]' '{title0}' '{color0}'" + " \\",
        f" {source1} {uscale1} '{'|'.join(sloc_list1)}[ ]' '{title1}' '{color1}'" + " \\",
        f" > res/{target}",
    )
    return target
    # ----------------------------------------------------------------------


def add_rules_for_lines_with_pattern_files(pre, mak, tit, ivt_name, utype, kword, ktag):
    # Adds rules to create ".opa" file that lists the lines of a
    # transcription file with occurrences of a determinate
    # pattern {kword}. Also for an ".opc" file that counts
    # lines where the pattern occurs at position {p}, for each {p}.
    #
    # The input file will be "res/{ivt_name}.ivt". For SPS,
    # this should be a "wp" file, even if the {utype} is "ec".
    #
    # The output files will be "res/{ivt_name}-with-{ktag}.opa"
    # and "res/{ivt_name}-with-{ktag}.opc".
    #
    # Returns the name of the ".opc" target (sans "res/").

    ivt_file = f"res/{ivt_name}.ivt"

    # First make a ".opa" file with the lines with {kword}:
    opa_name = f"{ivt_name}-with-{ktag}"
    opa_target = f"{opa_name}.opa"
    opa_file = f"res/{opa_target}"
    opa_source = ivt_file
    opa_script = "list_lines_with_pattern.py"

    tit[opa_target] = f"listing lines of {opa_source} with '{kword}' ..."
    pre[opa_target] = [opa_source, opa_script]
    mak[opa_target] = (
        f"{opa_script} {opa_source} {utype} '{kword}' {opa_file}",
    )

    # Now create ".opc" file with counts and freqs of lines by pattern pos:
    opc_target = f"{opa_name}.opc"
    opc_file = f"res/{opc_target}"
    opc_source = opa_file
    opc_script = "count_lines_by_pattern_position.gawk"

    tit[opc_target] = f"computing line counts by pattern position res/{opc_target} ..."
    pre[opc_target] = [opc_source, opc_script]
    mak[opc_target] = (
        f"cat {opc_source} | {opc_script} > {opc_file}",
    )
    return opc_target
    # ----------------------------------------------------------------------


def add_rules_for_word_tuple_file(pre, mak, tit, ivt_name, utype, tsize, kword, ktag):
    # Adds rules to extract and count tuples of consecutive {tsize} units.
    # If {kword} is not {None}, takes only tuples that begin with a unit
    # that matches the RE pattern {kword} in whole or in part.
    #
    # The input file will be "res/{ivt_name}.ivt".
    #
    # The extracted tuples, with locus ID and position, will be written to
    # "res/{ivt_name}-{tsize}-with-{ktag}.tup".
    # The counts and frequencies of the tuples will be written to
    # "res/{ivt_name}-{tsize}-with-{ktag}.tfr".
    # The part "-with-{ktag}" will be omitted if {kword} is {None}.
    #
    # Requires {tsize} in {1..99}. Returns the name of the ".tfr"
    # target (sans "res/").
    #
    # NOTE(review): when {kword} is {None} the extraction command still
    # receives the literal argument 'None' and the {ktag}; confirm that
    # "extract_keyed_tuples.py" interprets that as "no keyword".

    assert 0 < tsize <= 99, f"bad {tsize = }"

    tup_name = f"{ivt_name}-{tsize}"
    if kword is not None:
        tup_name += f"-with-{ktag}"

    # First make a ".tup" file with extracted tuples per parag:
    tup_source = f"res/{ivt_name}.ivt"
    tup_target = f"{tup_name}.tup"
    tup_script = "extract_keyed_tuples.py"

    tit[tup_target] = f"extracting tuples to file res/{tup_target} ..."
    pre[tup_target] = [tup_source, tup_script]
    mak[tup_target] = (
        f"{tup_script} {ivt_name} {utype} {tsize} '{kword}' {ktag}",
    )

    # Now compute counts and freqs of tuples in input file:
    tfr_source = f"res/{tup_target}"
    tfr_target = f"{tup_name}.tfr"
    freq_script = "compute_freqs_from_counts.py"

    tit[tfr_target] = f"computing tuple counts and freqs file res/{tfr_target} ..."
    pre[tfr_target] = [tfr_source, freq_script]
    # Pipeline: take field 4 of each ".tup" line, count the distinct
    # values, convert counts to frequencies, sort by decreasing count.
    mak[tfr_target] = (
        f"cat {tfr_source}" + ' \\',
        r" | gawk '// { print $4 }'" + ' \\',
        r" | sort | uniq -c" + ' \\',
        f" | {freq_script} -encoding utf -total 'TOTAL'" + ' \\',
        " | sort -b -k1,1nr" + ' \\',
        f" > res/{tfr_target}",
    )
    return tfr_target
    # ----------------------------------------------------------------------


def add_rules_for_parag_parag_coin_image(pre, mak, tit, ivt_name0, ivt_name1):
    # Meant to add rules and commands to create a target file
    # "res/{ivt_name0}-{ivt_name1}-coin-map.png" that shows coincidences
    # of sizes etc between the parags of "res/{ivt_name0}-par.ivt" and
    # "res/{ivt_name1}-par.ivt".
    #
    # TODO: this function is unfinished. It currently does NOT modify
    # {pre}, {mak}, or {tit}; it only computes and returns the target name.
    #
    # Returns the name of the target (sans "res/").

    target = f"{ivt_name0}-{ivt_name1}-coin-map.png"

    # Image creation script (not yet used in any rule):
    script = "create_parag_parag_coin_image.sh"

    # ???
    # mkwdhist_script = "make_hist_of_units_per_parag.sh"
    # mkhist_script = "work/make_histogram.gawk"
    # h2poly_script = "work/turn_histogram_into_polygonal_line.gawk"
    # efn_gawk_lib = "work/error_funcs.gawk"
    #
    # pre[target] = [ script, mkwdhist_script, efn_gawk_lib, mkhist_script, h2poly_script ]
    # pre[target].append(f"{dir0}/{ivt_name}.upp")
    # pre[target].append(f"{dir1}/{ivt_name}.upp")
    # tit[target] = f"plotting parag size histograms for {ivt_name0} and {ivt_name1}"
    # mak[target] = (
    #   f"{script} {ivt_name0} {ivt_name1}",
    # )
    return target
    # ----------------------------------------------------------------------


def hist_color(book, bsub, utype):
    # Chooses the color for histograms of counts of {utype} units in
    # the given {book} and {bsub}.
    #
    # Accepts {book} "bencao" (only with {bsub} "fu") or "starps" (with
    # {bsub} "fu" or "gd"). The {utype} selects a variant of the base
    # color: "ec"/"ch" the base hue itself, "wp"/"ps" the hue perturbed
    # by {+dhue}, "wc"/"pj" the hue perturbed by {-dhue}.
    #
    # Returns the color as a string "#rrggbb". Any other argument value
    # is reported through {prog_error}, which is presumed not to return.

    # Base RGB hue {chue} and RGB perturbation direction {dhue}:
    if book == "bencao":
        assert bsub == "fu", f"invalid bencao subset {bsub = !r}"
        chue = (1.000, 0.150, 0.000)
        dhue = (0.000, -0.100, +0.100)
    elif book == "starps":
        if bsub == "fu":
            chue = (0.000, 0.600, 0.300)
            dhue = (+0.200, 0.000, -0.200)
        elif bsub == "gd":
            chue = (0.000, 0.300, 0.800)
            dhue = (+0.200, -0.200, 0.000)
        else:
            prog_error(f"bad {bsub = }")
    else:
        prog_error(f"bad {book = }")

    if utype == "ec" or utype == "ch":
        color = make_color(chue, dhue, 0.0)
    elif utype == "wp" or utype == "ps":
        color = make_color(chue, dhue, +1.0)
    elif utype == "wc" or utype == "pj":
        color = make_color(chue, dhue, -1.0)
    else:
        prog_error(f"bad {utype = }")
    return color
    # ----------------------------------------------------------------------


def make_color(chue, dhue, pert):
    # Returns the color string "#rrggbb" obtained by adding {pert} times
    # {dhue} to the RGB triple {chue} (clipping each channel to [0 _ 1]),
    # then reducing the saturation and the brightness by the factor
    # {1 - 0.33*|pert|}. Only elements 0, 1, 2 of {chue,dhue} are used.

    debug = False

    sat = 1 - 0.33*fabs(pert)
    val = 1 - 0.33*fabs(pert)

    # Perturbed hue, clipped to the unit cube:
    phue = [min(1.0, max(0.0, chue[k] + pert*dhue[k])) for k in range(3)]

    # Weighted brightness of the perturbed hue, used to desaturate it:
    lum = 0.3*phue[0] + 0.6*phue[1] + 0.1*phue[2]

    # Mix with gray of the same {lum}, scale, and quantize (truncating):
    col = [int(255*val*(sat*phue[k] + (1 - sat)*lum)) for k in range(3)]

    hex_color = f"#{col[0]:02x}{col[1]:02x}{col[2]:02x}"
    if debug:
        err.write(f"^^ {col = } hex = {hex_color!r}\n")
    return hex_color
    # ----------------------------------------------------------------------


def cleanup(which):
    # Removes all files whose names match "res/{which}*", by running
    # "rm -f" through {bash}. Returns {None}.
    #
    # NOTE(review): {which} is inserted unquoted in the shell command;
    # an empty {which} removes every plain file in "res/".
    bash(f"rm -f res/{which}*")
    return