#! /bin/bash -eu
# Last edited on 2026-02-18 03:10:46 by stolfi

# Usage
#
#   plot_qq_gap_histogram.sh {TAG} {TYPE} {PNORM} < {HISTFILE} > {FIGURE}.png
#
# Reads from {stdin} a list of lines "{GTYP} {GLEN}" where {GTYP} is
# 'BE', 'BQ', 'QQ', or 'QE' and {GLEN} is the length of a q-token gap of
# that type.
# 
# Extracts the lines with {GTYP==TYPE} and plots the histogram of the
# lengths as a PNG file that is written to standard output.
#
# Also plots the ideal histogram that would be expected if the
# probability of a token being "normal" (not q, not bad) were {PNORM}.
#
# The {TAG} parameter should be a tag that will be used only in the
# title of the plot.  It could identify, for instance, which sections
# were considered in the analysis.

# Repeat the shebang options here so that they still apply when the
# script is started as "bash plot_qq_gap_histogram.sh ...".
set -eu

# Command-line arguments; all three are required.
if [[ $# -lt 3 ]]; then
  echo "usage: $0 {TAG} {TYPE} {PNORM} < {HISTFILE} > {FIGURE}.png" 1>&2
  exit 1
fi
tag="$1"; shift   # Tag for the plot title, e.g. 'hea+str'.
type="$1"; shift  # Gap type to select: 'BE', 'BQ', 'QQ', or 'QE'.
pnorm="$1"; shift # Probability of a token being normal.

# All scratch files are created inside a private directory made by
# {mktemp}, so that their names are not predictable and cannot collide
# with files of other processes.  The directory is removed on any exit,
# including error exits.  {tmp} is the common path prefix of the scratch
# files; the directory is created under "/tmp" so that the prefix never
# contains blanks.
tmp_dir="$( mktemp -d /tmp/plot_qq_gap_XXXXXX )"
trap 'rm -rf -- "${tmp_dir}"' EXIT
tmp="${tmp_dir}/tmp"

# Size in pixels of the image rendered by gnuplot.
tmphsize=1800
tmpvsize=800

# Select from {stdin} the lines that begin with the requested gap type.
echo "extracting ${type} gaps ..." 1>&2
tmp_gaps="${tmp}-gaps.txt"
# {grep} returns a non-zero status when no line matches, which under
# the {-e} shell option would terminate the script without any message.
# Test the status explicitly so that the user gets a diagnostic.
if ! grep -E -e "^${type}" > "${tmp_gaps}"; then
  echo "** no ${type} gaps found in the input (or grep failed)" 1>&2
  exit 1
fi
ngaps=$( wc -l < "${tmp_gaps}" )
echo "total gaps in file = ${ngaps}" 1>&2

# Add up the gap lengths (second field of every selected line).
echo "compute total number of tokens in the gaps ..." 1>&2
ntoks=$( gawk '{ n += $2 } END { print n }' "${tmp_gaps}" )
echo "total tokens in gaps = ${ntoks}" 1>&2

# Tabulate the gap lengths (field 2 of each selected line) with the
# external tool {make_histogram.gawk}.  The parameters {step=1} and
# {sync=0.5} presumably request bins of unit width; see that tool for
# their exact meaning.
echo "computing the histogram of the gaps ..." 1>&2
tmp_hist="${tmp}-hist.txt"
make_histogram.gawk -v col=2 -v step=1 -v sync=0.5 \
  < ${tmp_gaps} \
  > ${tmp_hist}

# Convert the histogram into polygon data for gnuplot: one rectangle
# per non-empty bin, written to {tmp_poly}.
echo "creating file for gnuplot ..." 1>&2
tmp_poly="${tmp}-his.txt"
gawk \
    ' # Each input line must have four fields, read here as
      # low edge, midpoint, high edge and count of one histogram bin.
      # For every bin with positive count, writes a blank line followed
      # by the four corners of a bar of width {barwd} centered on the
      # bin midpoint.

      function data_error(msg) {
        # Reports a malformed input line and aborts with status 1.
        printf "** line %d: %s\n  [[%s]]\n", FNR, msg, $0 > "/dev/stderr";
        failed = 1;
        exit(1);
      }

      function bug(msg) {
        # Reports an unexpected condition and aborts with status 1.
        printf "** bug: %s\n", msg > "/dev/stderr";
        failed = 1;
        exit(1);
      }

      BEGIN {
        ndat = 0;      # Number of histogram lines read.
        nbars = 0;     # Number of bars written.
        failed = 0;    # Set when an error was already reported.
        barwd = 0.30;  # Width of each bar, in gap-length units.
      }

      /^ *[-+0-9.]/ {
        if (NF != 4) { data_error("bad NF"); }
        ndat++;
        xmd = $2; ct = $4;
        if (ct > 0) {
          plo = xmd - barwd/2;
          phi = xmd + barwd/2;
          print "";
          print plo, 0; print plo, ct;
          print phi, ct; print phi, 0;
          nbars++;
        }
        next;
      }

      // { data_error("bad format"); }

      END {
        # An {exit} in a rule still runs {END}; do not report twice.
        if (failed) { exit(1); }
        printf "read %d histogram entries\n", ndat > "/dev/stderr";
        printf "generated %d non-zero histogram bars\n", nbars > "/dev/stderr";
        if (nbars == 0) { bug("no bins"); }
        print "";
      }
    ' \
  < "${tmp_hist}" \
  > "${tmp_poly}"

# Render the full-size plot with gnuplot: the observed histogram bars
# from {tmp_poly} and the expected counts evaluated at the second column
# of {tmp_hist}.
echo "creating plot ..." 1>&2
tmp_plot="${tmp}-big.png"
export GDFONTPATH="${HOME}/ttf"

# Choose the major Y tic spacing and the number of minor intervals
# according to the total number of gaps.
if (( ngaps <= 400 )); then
  ytic_step=5;    ytic_minor=5
elif (( ngaps <= 2000 )); then
  ytic_step=20;   ytic_minor=4
elif (( ngaps <= 4000 )); then
  ytic_step=50;   ytic_minor=5
elif (( ngaps <= 20000 )); then
  ytic_step=200;  ytic_minor=4
else
  ytic_step=5000; ytic_minor=5
fi

gnuplot <<EOF
  set term png size ${tmphsize},${tmpvsize} font "arial,20" noenhanced
  set output "${tmp_plot}"

  ngaps = ${ngaps}
  pnorm = ${pnorm}

  # Expected number of gaps with x tokens; undefined for negative x.
  idef(x) = (x >= 0 ? ngaps*pnorm**x*(1 - pnorm) : 0/0)

  set title "${tag} ${type}"
  set xlabel "Number of tokens in gap"
  set ylabel "Number of gaps"

  set xrange [-1.0:]
  set yrange [-1.0:]

  set xtics 5
  set mxtics 5
  set ytics ${ytic_step}
  set mytics ${ytic_minor}

  set grid ytics
  set grid xtics

  set xzeroaxis lt 1 lw 1.5 lc rgb "#2288ff"
  set yzeroaxis lt 1 lw 1.5 lc rgb "#2288ff"

  set key top right
  set key invert

  plot \
    "${tmp_hist}" using 2:(idef(column(2))) \
      title "expected" with linespoints lw 2 pt 7 ps 1 lc rgb '#225588', \
    "${tmp_poly}" using 1:2        \
      title "${type}" with filledcurves lw 2 lc rgb '#ff2200'
  quit
EOF

# Reduce the plot to half size, show it on screen, write it to {stdout},
# and remove the scratch files.  The scratch files are removed on the
# failure paths of this section too.
status=0
if [[ -s "${tmp_plot}" ]]; then
  echo "reducing plot ..." 1>&2
  tmp_pred="${tmp}-sma.png"
  if convert "${tmp_plot}" -resize '50%' "${tmp_pred}"; then
    # The on-screen preview is only a convenience.  If it fails (e.g.
    # because there is no X display), still deliver the PNG.
    display -title "${tag} ${type}" "${tmp_pred}" \
      || echo "!! could not display ${tmp_pred} (continuing)" 1>&2
    cat "${tmp_pred}"
  else
    echo "** could not reduce ${tmp_plot}" 1>&2
    status=1
  fi
else
  echo "** ${tmp_plot} not created" 1>&2
  status=1
fi

# Remove only files named "{tmp}-*.*".  The "-" after the prefix is
# essential: when the prefix ends with a process ID, the looser pattern
# "{tmp}*.*" would also match files of processes whose ID merely begins
# with the same digits.
rm -f "${tmp}"-*.*

if [[ ${status} -ne 0 ]]; then
  exit ${status}
fi


