#! /bin/bash -ue
# Last edited on 2025-12-24 04:13:28 by stolfi

# Name of this script, without the directory part, for messages:
cmd="${0##*/}"

# Usage synopsis shown in error messages. The line breaks are real
# newlines, each preceded by a literal backslash, so that the message
# prints as a multi-line command with shell continuation lines. (A
# "\n" inside double quotes is not a newline, and plain "echo" does
# not expand it either.)
usage="${cmd} [-noylabels] [-color] [-format {FORMAT}] [-const {CONST} ] [-show {BOOL}] \\
   {AFILE} {ACOLOR} {ATITLE} \\
   {BFILE} {BCOLOR} {BTITLE} \\
  ... \\
  {ONAME}"

# Generates a comparative Zipf plot for one or more languages.
# The "-const" option specifies the constant for the ideal Zipf plot.
# Input files {AFILE}, {BFILE}, etc. must have records of the form 
#   {count} {freq} {lexeme}
# and must be sorted by decreasing {count}. Uses the {freq} column to plot.
# The {ACOLOR}, {BCOLOR}, etc. must be color specs in the format accepted
# by gnuplot, e.g. 'red' or '#ff0000'.
# The output file is "{ONAME}.{FORMAT}", where {FORMAT} is "svg", "eps",
# or "png" (the default).

# Defaults of the command line options:
ylabelfmt="%g";  # Format of the Y axis labels; "" suppresses them.
color=1          # Nonzero to draw each dataset in its given color.
format="png"     # Output format: "svg", "eps", or "png".
const=""         # Constant of the ideal Zipf plot; "" = compute it.
show=0           # Nonzero to display the plot after writing it.

# Parse the leading options. An argument is taken as an option only if
# it BEGINS with "-"; an unanchored test would also catch file names
# that merely contain "/-", such as "data/-x.wfr". An option that is
# missing its value falls through to the "bad option" case.
while [[ ( $# -gt 0 ) && ( "$1" == -* ) ]]; do
  if [[ "$1" == "-noylabels" ]]; then
    ylabelfmt=""; shift
  elif [[ "$1" == "-color" ]]; then
    color=1; shift
  elif [[ ( $# -ge 2 ) && ( "$1" == "-const" ) ]]; then
    const="$2"; shift 2
  elif [[ ( $# -ge 2 ) && ( "$1" == "-format" ) ]]; then
    format="$2"; shift 2
  elif [[ ( $# -ge 2 ) && ( "$1" == "-show" ) ]]; then
    show="$2"; shift 2
  else
    echo "bad option \"$1\"" 1>&2;
    echo "usage: ${usage}" 1>&2; exit 1
  fi
done

# At least one {FILE} {COLOR} {TITLE} triple plus {ONAME} must remain.
# The message uses "${1-}" rather than "$1": when no arguments are left,
# "$1" is unset and, under the "-u" shell option, expanding it would
# abort the script before the usage text is printed.
if [[ $# -lt 4  ]]; then
  echo "insufficient args \"${1-}\" - usage: ${usage}" 1>&2; exit 1
fi

# Collect the {FILE} {COLOR} {TITLE} triples. The last argument is left
# in place; it is not part of any triple.
wfrFile=()  # Input word frequency files.
colors=()   # Respective color specs for the plot.
title=()    # Respective titles for plot key, from command line.
i=0
until [[ $# -le 1 ]]; do
  if [[ $# -lt 3 ]]; then
    echo "unpaired plot arg \"$1\" - usage: ${usage}" 1>&2; exit 1
  fi
  wfrFile+=("$1"); colors+=("$2"); title+=("$3")
  shift 3
  echo "  ${wfrFile[$i]} \"${title[$i]}\"" 1>&2

  # Every input file must be readable:
  [[ -r "${wfrFile[$i]}" ]] || {
    echo "** ${wfrFile[$i]} does not exist" 1>&2; exit 1
  }
  i=$(( i + 1 ))
done
nfiles=${i}
  
# Exactly one argument must remain: the output filename prefix {ONAME}.
if (( $# != 1 )); then
  echo "missing output prefix - usage: ${usage}" 1>&2
  exit 1
fi
oname="$1"
shift

# Prefix for the names of temporary files. The files live in a private
# directory created by "mktemp": a predictable name such as "/tmp/{PID}"
# would let another user pre-create or symlink the files. The directory,
# with anything left in it, is removed when the script exits.
tmpdir="$(mktemp -d /tmp/zipf-plot.XXXXXX)" || {
  echo "** cannot create temporary directory" 1>&2; exit 1
}
trap 'rm -rf -- "${tmpdir}"' EXIT
tmp="${tmpdir}/$$"

# Determine the max number of distinct words {Nmax} and the max title
# length {Lmax} over all input files:
Nmax=0 # Max number of lexemes (distinct words); one per input line.
Lmax=0 # Max title length.
for (( i = 0; i < nfiles; i++ )); do
  # The file is read through a quoted redirection, so that a name with
  # blanks or wildcards reaches "wc" intact (an unquoted name would be
  # split, and the count would silently come out as 0). The "$(( ))"
  # drops any padding that "wc" may put around the number.
  N=$(( $(wc -l < "${wfrFile[$i]}") ))
  if (( N > Nmax )); then Nmax=${N}; fi
  L="${#title[$i]}"
  if (( L > Lmax )); then Lmax=${L}; fi
done

# If no constant was given, use 1/(1 + 1/2 + ... + 1/Nmax), that is,
# the reciprocal of the sum of 1/k over the ranks 1..{Nmax}, rounded
# to four decimals:
if [[ -z "${const}" ]]; then
  echo "  computing {const} from {Nmax = ${Nmax}}" 1>&2
  bcprog="s=0; for (k=1; k <= ${Nmax}; k++) { s = s + 1/k; }; 1/s"
  const=$(bc -lq <<< "${bcprog}")
  const=$(printf "%.4f" "${const}")
fi
echo "  ideal plot constant = '${const}'" 1>&2

# Select the gnuplot terminal and the format-dependent plot parameters.
# Every valid format sets:
#   ptypes      point types of the successive datasets
#   pointsize   point size of the datasets
#   linewidth   line width of the datasets
#   dashtype    dash pattern of line style 20
#   keyspacing  "spacing" parameter of the plot key
#   keywidth    "width" parameter of the plot key
#   termspec    the "set term" command
#   sizespec    the "set size" command
#   marginspec  the margin-setting commands
case "${format}" in
  svg)
    ptypes=( 7 7 7 7 7 7 7 7 7 7 )
    if [[ ${color} -eq 0 ]]; then
      ptypes=( 4 2 3 1 6 7 8 1 2 4 )
    fi
    pointsize="0.30"
    linewidth="2.00"
    dashtype="(10,5)"
    keyspacing="0.8"
    # Gnuplot SVG computes the key width wrong. We must compensate,
    # by an amount that depends on the longest title:
    keywidth=$(bc -lq <<< "w = ${Lmax}/3.0; 2-w")
    keywidth=$(printf "%.1f" "${keywidth}")
    termspec="set term svg size 610,600 dynamic font 'Helvetica,18' noenhanced;"
    # sizespec="set size ratio -1;" # Does not work?
    sizespec="set size 1.05,1.00;"
    marginspec="set lmargin 6; set rmargin 4; set tmargin 0.5; set bmargin 2.0;"
    ;;
  eps)
    # Gnuplot's linetype -> color mapping (bleech) for color PS output
    #   0 = black
    #   1 = red
    #   2 = light (BAD!)
    #   3 = blue
    #   4 = magenta
    #   5 = cyan (BAD!)
    #   6 = yellow (BAD!)
    #   7 = black
    #   8 = orange
    #   9 = gray (BAD!)
    #   10,11,12,... = 1,2,3,...
    colorspec="color solid"
    ptypes=( 7 7 7 7 7 7 7 7 7 7 )
    if [[ ${color} -eq 0 ]]; then
      colorspec="mono"
      ptypes=( 4 2 3 1 6 7 8 1 2 4 )
    fi
    pointsize="1.0"
    linewidth="2.50"
    dashtype="(4,2)"
    keyspacing="1.1"
    keywidth="0.0"
    # The terminal spec is assembled from two halves:
    termspec1="postscript eps ${colorspec} font 'Helvetica,32'"
    termspec2="linewidth 1.5 background '#ffffff' noenhanced;"
    termspec="set term ${termspec1} ${termspec2};"
    # sizespec="set size ratio -1;" # Does not work.
    sizespec="set size 1.5,1.85;"
    marginspec="set lmargin 7; set rmargin 4; set tmargin 0.5; set bmargin 2.0;"
    ;;
  png)
    ptypes=( 7 7 7 7 7 7 7 7 7 7 )
    if [[ ${color} -eq 0 ]]; then
      ptypes=( 7 6 5 4 1 3 2 8 9 6 )
    fi
    pointsize="1.7"
    linewidth="1.5"
    dashtype="(4,2)"
    keyspacing="1.1"
    keywidth="0.0"
    termspec="set term png font arial 40 noenhanced linewidth 3.0 size 1950,1800;"
    # sizespec="set size ratio -1;" # Does not work.
    sizespec="set size 1.00,1.00;"
    marginspec="set lmargin 7; set rmargin 4; set tmargin 0.5; set bmargin 2.0;"
    ;;
  *)
    echo "invalid plot output format" 1>&2; exit 1
    ;;
esac

# Names of the gnuplot command file, of the plot file that gnuplot
# writes, and of the final output file:
gplFile="${tmp}.gnuplot"
tmpPlotFile="${tmp}.${format}"
outPlotFile="${oname}.${format}"

# Write the common part of the command file: terminal, output file,
# log-log axes, size and margins, the dashed line style 20, the key,
# the axis ranges, the Y label format, and the gnuplot variable
# {const}. (No "linespecs" are written here: that variable is not
# defined anywhere in this script, and expanding an undefined array
# aborts the script under "-u" on Bash older than 4.4.)
cat > "${gplFile}" <<EOF
${termspec}
set output "${tmpPlotFile}"
set logscale xy
# "set size ratio -1" does not work...
${sizespec}; # show size
${marginspec}; # show margin
set style line 20 dashtype ${dashtype}
set key noenhanced right top reverse Left spacing ${keyspacing} width ${keywidth}
unset title
set xrange [0.8:37000]
set yrange [0.00002:0.5]
set format y "${ylabelfmt}"
const = ${const}
EOF

# Append the "plot" command to the gnuplot command file. Its first item
# is the ideal Zipf curve {const/x}, undefined (0/0) for x below 0.9 or
# above 5000, drawn with line style 20. Then comes one "linespoints"
# item per input file, which plots column 2 of the file against the
# record number plus 1. The command file ends with "quit".
{
  printf 'plot (x<0.9 ? 0/0 : (x>5000 ? 0/0 : const/x)) notitle'
  printf ' with lines ls 20 lw 1.8 lc rgb "#885500" '

  # The list of point types is used cyclically, so that there may be
  # more input files than listed point types.
  npt=${#ptypes[@]}
  sep=","
  for (( i = 0; i < nfiles; i++ )); do
    # Each item starts by closing the previous one with a separator and
    # a gnuplot line continuation:
    printf '%s \\\n  "%s" using ($0+1):2 title "%s"' \
      "${sep}" "${wfrFile[$i]}" "${title[$i]}"
    printf ' \\\n     with linespoints ls %d lw %s pt %d ps %s' \
      "$(( i + 1 ))" "${linewidth}" "${ptypes[$(( i % npt ))]}" "${pointsize}"
    if [[ ${color} -ne 0 ]]; then
      printf " lc rgb '%s'" "${colors[$i]}"
    fi
  done
  printf '\n'
  printf 'quit\n'
} >> "${gplFile}"

# Run gnuplot on the command file. GDFONTPATH is exported for gnuplot;
# presumably it makes the png terminal look for its font in the
# current directory -- TODO confirm.
export GDFONTPATH=.
gnuplot < "${gplFile}"

# Deliver the plot as {outPlotFile}, display it if {show} is nonzero,
# and remove the temporary files. All file names are quoted, so that an
# output prefix with blanks or wildcards is handled as a single name.
case "${format}" in
  svg)
    mv -- "${tmpPlotFile}" "${outPlotFile}"
    if [[ ${show} -ne 0 ]]; then
      time inkview "${outPlotFile}"
    fi
    /bin/rm -f -- "${gplFile}"
    ;;
  eps)
    mv -- "${tmpPlotFile}" "${outPlotFile}"
    if [[ ${show} -ne 0 ]]; then
      evince "${outPlotFile}"
    fi
    /bin/rm -f -- "${gplFile}"
    ;;
  png)
    # The image written by gnuplot is scaled to a width of 600 pixels:
    convert "${tmpPlotFile}" -resize '600x' "${outPlotFile}"
    if [[ ${show} -ne 0 ]]; then
      display "${outPlotFile}"
    fi
    /bin/rm -f -- "${tmpPlotFile}" "${gplFile}"
    ;;
  *)
    # Not expected here, but if it happens it must not look like success:
    echo "invalid plot output format" 1>&2; exit 1
    ;;
esac
