#! /bin/bash -eu
# Last edited on 2026-03-06 22:46:49 by stolfi

# Arguments are {ivt_name0} {usize0} {color0} {ivt_name1} {usize1} {color1}
# {bin_size}.
# 
# Reads two parag size data files "res/{ivt_name0}.upp"
# and "res/{ivt_name1}.upp". The {ltype} of both must be "par".
# 
# Each line of each input file must be "{LOC} {NUNITS}" where {LOC} is
# the locus ID of a parag (like "b.1.2.033" or "f103v.12") and {NUNITS}
# is an integer count of text units (chars, words, etc) in that parag.
# The unit counts {NUNITS} of each file
# will be multiplied by the respective nominal unit size, {usize0} or {usize1},
# to give a scaled parag size {PSIZE}.
# 
# Plots two histograms, 0 and 1 of these scaled parag sizes, on the same
# plot, using colors {color0} and {color1} and the given {bin_size}.
#
# On the plot, also scales the parag counts {P[kb]} in each bin of
# histogram 1 by {P1_mag = P0_num/P1_num}, where {P0_num} and {P1_num}
# are the number of parags in each dataset; except that {P1_mag} is set
# to exactly 1 (no scaling) when that ratio differs from 1 by less than 1%.

# Positional parameters, in the order they must be given:
ivt_name0="$1"   # Name of data file for histogram 0 (sans "res/" or ".upp").
usize0="$2"      # Nominal unit size for file 0.
color0="$3"      # Color for histogram 0.

ivt_name1="$4"   # Name of data file for histogram 1 (sans "res/" or ".upp").
usize1="$5"      # Nominal unit size for file 1.
color1="$6"      # Color for histogram 1.

bin_size="$7"    # Bin size.

# Discard the seven parameters consumed above.
shift 7

# Report the parameters on stderr.
{
  echo "=== $0 ==="
  echo "  ivt_name0 = '${ivt_name0}'  usize0 = ${usize0}  color0 = '${color0}'"
  echo "  ivt_name1 = '${ivt_name1}'  usize1 = ${usize1}  color1 = '${color1}'"
  echo "  bin_size = ${bin_size}"
} 1>&2

# Common prefix of every temporary file created by this script; the
# process ID keeps concurrent runs apart.
# NOTE(review): a "/tmp/$$" prefix is predictable; consider a "mktemp -d"
# directory if this is ever run on a shared machine.
temp="/tmp/$$"

# Dataset names indexed by histogram number (0 or 1). The expansions are
# quoted so that a name is never word-split or glob-expanded, and the
# array always has exactly two elements.
ivt_names=( "${ivt_name0}" "${ivt_name1}" )

# Input files with the units-per-parag data of each dataset.
upp_file0="res/${ivt_name0}.upp"
upp_file1="res/${ivt_name1}.upp"

# Per-dataset statistics, indexed by histogram number (0 or 1):
P_nums=( )  # Number of parags (data lines) in each file.
W_maxs=( )  # Not filled in by this loop.
U_maxs=( )  # Largest unit count {NUNITS} of any parag in each file.
for which in 0 1; do
  ivt_name="${ivt_names[${which}]}"
  upp_file="res/${ivt_name}.upp"
  echo "~~~ computing parag counts and max size in ${upp_file} ~~" 1>&2

  # Count the lines that start with a lowercase letter. The "+ 0" makes
  # gawk print "0" rather than an empty string when no line matches.
  # The file is redirected (not piped through "cat") so that a missing
  # file makes the command substitution fail instead of yielding zero.
  # NOTE(review): this pattern does not accept leading blanks, whereas
  # the one used for {U_max} below does -- confirm which is intended.
  P_num=$( gawk '/^[a-z]/{ np += 1 } END { print np + 0 }' < "${upp_file}" )

  # Largest value of field 2 among the data lines. The "+ 0" stores the
  # number rather than the raw text, so it is printed without leading zeros.
  U_max=$( gawk -v max=0 '/^ *[a-z]/{ if ($2+0 > max) { max = $2+0 } } END { print max }' < "${upp_file}" )

  # The name is passed as data, never as the printf format.
  printf '%s:\n' "${ivt_name}" 1>&2
  printf "  %7d parags\n" "${P_num}" 1>&2
  printf "  %7d max units per parag\n" "${U_max}" 1>&2

  # Quoted appends guarantee exactly one array entry per dataset.
  P_nums+=( "${P_num}" )
  U_maxs+=( "${U_max}" )
done

# Temporary files that hold the histogram of each dataset.
nwh_file0="${temp}-${ivt_name0}.nwh"
nwh_file1="${temp}-${ivt_name1}.nwh"

echo "=== computing scaling factors for ${ivt_name1} bin counts" 1>&2

# Parag counts of the two datasets; a missing entry counts as zero.
P0_num="${P_nums[0]:-0}"
P1_num="${P_nums[1]:-0}"

# {P1_mag} is the factor applied to the bin counts of histogram 1 so that
# both histograms represent the same total number of parags. It is the
# ratio P0_num/P1_num, snapped to exactly 1 when it lies strictly between
# 0.99 and 1.01.
if [[ "${P0_num}" =~ ^0*[1-9][0-9]*$ && "${P1_num}" =~ ^0*[1-9][0-9]*$ ]]; then
  P1_mag=$( echo "s=${P0_num}/${P1_num}; if ((s>0.99)&&(s<1.01)){ s=1.000; }; s" | bc -lq )
else
  # With a zero (or malformed) count the ratio is undefined or zero;
  # a zero factor would silently flatten histogram 1, so do not scale.
  echo "!! warning: parag counts '${P0_num}' and '${P1_num}' are not both positive; using P1_mag = 1" 1>&2
  P1_mag=1
fi

# Normalize to four decimal places.
P1_mag=$( printf "%.4f" "${P1_mag}" )
printf "P1_mag = %7.4f\n" "${P1_mag}" 1>&2

# Build the parag size histogram of each dataset. The helper script
# receives the nominal unit size and the bin size as arguments, reads the
# ".upp" file from stdin, and its stdout is saved in the ".nwh" file.
# All expansions are quoted so that each is passed as exactly one word.
echo "computing parag size histogram ${upp_file0} -> ${nwh_file0}" 1>&2
make_hist_of_units_per_parag.sh "${usize0}" "${bin_size}" < "${upp_file0}" > "${nwh_file0}"

echo "computing parag size histogram ${upp_file1} -> ${nwh_file1}" 1>&2
make_hist_of_units_per_parag.sh "${usize1}" "${bin_size}" < "${upp_file1}" > "${nwh_file1}"

# Convert each histogram file into a file of plot coordinates.
# {dat_files} collects the names of the resulting files, indexed by
# histogram number (0 or 1).
dat_files=()
for which in 0 1; do
  ivt_name="${ivt_names[${which}]}"
  nwh_file="${temp}-${ivt_name}.nwh"
  dat_file="${temp}-${ivt_name}.dat"
  echo "=== computing plot coords file ${nwh_file} -> ${dat_file}" 1>&2
  # The histogram is fed on stdin directly from the file.
  # NOTE(review): {num} and {which} presumably tell the gawk script how
  # many histograms share the plot and which one this is -- confirm
  # against turn_histogram_into_polygonal_line.gawk.
  turn_histogram_into_polygonal_line.gawk \
      -i error_funcs.gawk \
      -v num=2 -v which="${which}" \
    < "${nwh_file}" \
    > "${dat_file}"
  dat_files+=( "${dat_file}" )
done

# Directory where gnuplot is told to look for fonts.
export GDFONTPATH=ttf

# Draw both histograms on a single full-size PNG image {temp_plot}.
# The here-document is unquoted, so every ${...} below is expanded by the
# shell before gnuplot reads the script.
temp_plot="${temp}-big.png"
echo "=== creating plot of ${dat_files[0]} ${dat_files[1]}" 1>&2 
gnuplot <<EOF

usize0 = ${usize0}
usize1 = ${usize1}

U0_max = ${U_maxs[0]}
U1_max = ${U_maxs[1]}

# Largest scaled parag size of each dataset, and of both together.
W0_max = U0_max * usize0
W1_max = U1_max * usize1

W_max = (W0_max > W1_max ? W0_max : W1_max)

bin_size = ${bin_size}

P1_mag = ${P1_mag}

ivt_name0 = "${ivt_name0}"
ivt_name1 = "${ivt_name1}"
big_hsize = 2400
big_vsize = 800

set term pngcairo size (big_hsize),(big_vsize) font "arial,20" noenhanced
set output "${temp_plot}"

# Fixed horizontal range for small sizes; otherwise fit the data.
if (W_max < 99) {
  set xrange [-1.0:+99.0]
  set xtics 5
  set mxtics 5
} else {
  set xrange [-5.0:(1.1*W_max)]
  set xtics 10
  set mxtics 2
}

set yrange [-1.0:]
set ytics 5
set mytics 5
set xzeroaxis
set grid ytics
set grid xtics
set xlabel "Parag size (W)"
set ylabel "Number of parags (P)"

# Title pieces; each piece is empty when its factor is 1, and otherwise
# starts with a blank so that the pieces can be concatenated directly.
tS0 = (usize0 == 1 ? "" : " scaling:")
tW0 = (usize0 == 1 ? "" : " W = ${usize0}")

tS1 = (usize1 == 1 && P1_mag == 1 ? "" : " scaling:")
tW1 = (usize1 == 1 ? "" : " W = ${usize1}")
tP1 = (P1_mag == 1 ? "" : " P = ${P1_mag}")

tB = (bin_size == 1 ? "" : " bin size = ${bin_size}")

set title (ivt_name0 . tS0  . tW0 . "\n" . ivt_name1 . tS1 . tW1 . tP1 . tB)

# Histogram 0 is plotted as is; the counts of histogram 1 are multiplied by P1_mag.
plot \
  "${dat_files[0]}" using 1:2 title ivt_name0 with filledcurves lw 2 lc rgb '${color0}', \
  "${dat_files[1]}" using 1:(column(2)*P1_mag) title ivt_name1 with filledcurves lw 2 lc rgb '${color1}'

quit
EOF

# If gnuplot produced a non-empty image, shrink it to half size into the
# final file under "res/", show it, and remove the temporary files.
# Otherwise report the failure and exit with status 1, leaving the
# temporary files in place.
if [[ -s "${temp_plot}" ]]; then
  good_plot="res/${ivt_name0}-${ivt_name1}-upp-hist.png"
  convert "${temp_plot}" -resize '50%' "${good_plot}"
  display "${good_plot}"
  # "${temp:?}" aborts instead of expanding to a bare "-*" pattern should
  # the prefix ever be empty; "--" ends option parsing.
  rm -f -- "${temp:?}"-*
else
  echo "** ${temp_plot} not generated" 1>&2; exit 1
fi
