#! /bin/bash
# Last edited on 2023-05-15 19:00:14 by stolfi

# Name of this script (basename of $0), shown in the usage message.
cmd="${0##*/}"
# One-line synopsis.  Lists every option that the option parser accepts,
# including "-const {CONST}".
usage="${cmd} [-noylabels] [-color] [-const {CONST}] [-format {FORMAT}] [-show {BOOL}] {ASMPSEC} {ATITLE}  {BSMPSEC} {BTITLE}  ...  {ONAME}"

# Exit on first error:
set -e

# Generates a comparative Zipf plot for one or more languages.
# The "-const" option specifies the constant for the ideal Zipf plot.
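# Input files must be named "dat/{lang}/{book}/{sec}/gud.wfr", where
# "{lang}/{book}/{sec}" is the {SMPSEC} argument, and must have records
# of the form
#   {count} {freq} {word}
# Uses the {freq} column to plot.
# The output file is "fig/{ONAME}.{FORMAT}", where {FORMAT} is "svg"
# (the default), "eps", or "png", as selected by the "-format" option.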
#
# Assumes that there are files "{dir}/{lang}/{book}/{sec}/whole.tlw" with the 
# full list of words of the original text.  See "../../Notes/101/Note-101.txt"
#
# Also writes "fig/{ONAME}.stats" with statistics of the files.
#
# Also writes "fig/{ONAME}.wik" with a description of the plot for Wikimedia.

# Option defaults:
ylabelfmt="%g"  # Gnuplot format for the Y tic labels ("-noylabels" sets it to "").
color=0         # Set to 1 by "-color": each dataset gets its own line color.
format="svg"    # Output format: "svg", "eps", or "png".
const=""        # Constant of the ideal Zipf plot; "" means compute it from the data.
show=0          # Nonzero to open the finished plot in a viewer.

# Parse the leading options.  Only an argument that STARTS with "-" is
# taken as an option, so a positional argument that merely contains "/-"
# somewhere inside it is left for the positional parser.
while [[ $# -gt 0 && "$1" == -* ]]; do
  case "$1" in
    -noylabels)
      ylabelfmt=""; shift
      ;;
    -color)
      color=1; shift
      ;;
    -const | -format | -show)
      # These options take one value each:
      if [[ $# -lt 2 ]]; then
        echo "missing value for option \"$1\"" 1>&2
        echo "usage: ${usage}" 1>&2; exit 1
      fi
      case "$1" in
        -const)  const="$2" ;;
        -format) format="$2" ;;
        -show)   show="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "bad option \"$1\"" 1>&2
      echo "usage: ${usage}" 1>&2; exit 1
      ;;
  esac
done

# At least one {SMPSEC} {TITLE} pair plus the {ONAME} must remain:
if [[ $# -lt 3 ]]; then
  echo "insufficient args \"$1\" - usage: ${usage}" 1>&2; exit 1
fi

# The remaining arguments are {SMPSEC} {TITLE} pairs followed by a single
# {ONAME}, so their count must be odd.  Check this up front: inside the
# loop below the count is always at least 2, so a per-iteration check
# could never fire.
if [[ $(( $# % 2 )) -eq 0 ]]; then
  echo "unpaired plot arg or missing output prefix - usage: ${usage}" 1>&2; exit 1
fi

# Parse file/title pairs:
smpsec=()   # {lang}/{book}/{sec} items, from command line.
title=()    # Respective titles for plot key, from command line.
wfrFile=()  # Respective plot input files, derived from {smpsec}.
i=0
while [[ $# -gt 1 ]]; do
  smpsec[$i]="$1"; shift
  title[$i]="$1"; shift
  echo "  ${smpsec[$i]} \"${title[$i]}\"" 1>&2

  # Each dataset must have a readable word frequency file:
  wfrFile[$i]="dat/${smpsec[$i]}/gud.wfr"
  if [[ ! -r "${wfrFile[$i]}" ]]; then
    echo "** ${wfrFile[$i]} does not exist or is not readable" 1>&2; exit 1
  fi
  i=$(( i + 1 ))
done
nfiles=${i}  # Number of datasets to plot.

# Parse output filename prefix (the single argument left over):
if [[ $# -ne 1 ]]; then
  echo "missing output prefix - usage: ${usage}" 1>&2; exit 1
fi
oname="$1"; shift

# Prefix for temporary file names.  The files live in a private directory
# created by mktemp (instead of a predictable "/tmp/{PID}" name), and the
# directory is removed when the script exits, whether it succeeds or fails.
# The template is under "/tmp" and has no blanks, so the prefix can be
# safely used unquoted.
tmpDir="$(mktemp -d /tmp/zipf-plot.XXXXXX)" || {
  echo "** cannot create temporary directory" 1>&2; exit 1
}
trap 'rm -rf -- "${tmpDir}"' EXIT
tmp="${tmpDir}/$$"

# Determine the max number of distinct words {Nmax} (the line count of the
# longest input file) and the max plot key title length {Lmax}:
Nmax=0 # Max number of distinct words.
Lmax=0 # Max title length.
for (( i = 0; i < nfiles; i++ )); do
  # One record per line, so the line count is the number of distinct words:
  N=$(wc -l < "${wfrFile[$i]}")
  if [[ ${N} -gt ${Nmax} ]]; then Nmax=${N}; fi
  L="${#title[$i]}"
  if [[ ${L} -gt ${Lmax} ]]; then Lmax=${L}; fi
done

# Compute the constant ${const} for the ideal Zipf law, unless it was
# given on the command line.  The value is 1/(1 + 1/2 + ... + 1/Nmax),
# which makes the terms const/k sum to 1 over k = 1..Nmax.
if [[ -z "${const}" ]]; then
  # With no words at all the harmonic sum is zero and 1/s is undefined:
  if [[ ${Nmax} -le 0 ]]; then
    echo "** cannot compute {const}: {Nmax = ${Nmax}}, input files are empty" 1>&2; exit 1
  fi
  echo "  computing {const} from {Nmax = ${Nmax}}" 1>&2
  const=$(echo "s=0; for (k=1; k <= ${Nmax}; k++) { s = s + 1/k; }; 1/s" | bc -lq)
  # echo "  const = '${const}' (2)" 1>&2
  # Round to four decimals:
  const=$(printf "%.4f" "${const}")
fi
echo "  ideal plot constant = '${const}'" 1>&2

# Generate the wikimedia description and statistics file.
# Both files are built by appending, so remove any old versions first.
# The names are quoted so that an {oname} with blanks does not break
# the "rm" or the redirections.
statFile="fig/${oname}.stats"
capFile="fig/${oname}.wik"
rm -fv "${statFile}" "${capFile}" 2>&1 | sed -e 's:^:  :g' 1>&2

# Opening paragraphs of the description file:
{
  printf "Zipf law plot (frequency as function of frequency rank) for various texts.\n\n"
  printf "The languages, texts and the word frequency files are:\n\n"
} >> "${capFile}"
# For each dataset, append one line to the statistics file and one entry
# to the Wikimedia description file.
prevsmpsec="NONE/NONE/NONE"  # The {smpsec} of the previous entry; placeholder for the first.
tfFlag=0 # Set to 1 if any files were truncated/filtered.
for (( i = 0; i < nfiles; i++ )); do
  # Compute good words statistics.  The helper script must print exactly
  # three whitespace-separated values; the unquoted expansion splits them.
  stats=( $(compute-smpsec-gud-stats.sh "${smpsec[$i]}") )
  if [[ ${#stats[@]} -ne 3 ]]; then echo "** bad stats = (${stats[*]})" 1>&2; exit 1; fi
  nTksWhole="${stats[0]}"
  nTksTrunc="${stats[1]}"
  nWdsTrunc="${stats[2]}"

  # Differing counts mean that this text was truncated/filtered:
  if [[ ${nTksWhole} -ne ${nTksTrunc} ]]; then tfFlag=1; fi

  # Append statistics to the plot statistics file:
  {
    printf "%s:" "${smpsec[$i]}"
    printf " whole %6d words, truncated to %6d words" "${nTksWhole}" "${nTksTrunc}"
    printf ", ''N'' = %6d distinct" "${nWdsTrunc}"
    printf " (%s)\n" "${title[$i]}"
  } >> "${statFile}"

  # Append this {smpsec} to Wikimedia description file
  create-wikimedia-description.sh \
      "${prevsmpsec}" \
      "${smpsec[$i]}" \
      "${nTksWhole}" "${nTksTrunc}" "${nWdsTrunc}" \
    >> "${capFile}"

  prevsmpsec="${smpsec[$i]}"
done

# Last line of the statistics file:
printf "Nmax = %d const = %.4f\n" "${Nmax}" "${const}" >> "${statFile}"

# Echo the statistics file to stderr.  The file name is passed as a printf
# argument, not as part of the format, so a "%" in it is printed literally.
printf "  --- %s ------------------------------\n" "${statFile}" 1>&2
sed -e 's:^:  :g' "${statFile}" 1>&2
printf "  ----------------------------------------------------\n" 1>&2

# Determine whether any files were truncated/filtered, and choose the
# wording of the closing paragraph accordingly:
tfNoteA=""
tfNoteB=" extracted"
if [[ ${tfFlag} -ne 0 ]]; then
  tfNoteA=", before truncation/filtering,"
  tfNoteB=" truncated/filtered"
fi

# Closing paragraph of the description file:
printf "The word frequency files '*/*/*/gud.wfr' are available at the [https://www.ic.unicamp.br/~stolfi/EXPORT/projects/voynich/Notes/tr-stats/dat/ UNICAMP website].  The original annotated full texts%s are in the companion files */*/org/main.src.  The%s texts -- one word per line, without punctuation -- are in */*/*/gud.tlw.\n" "${tfNoteA}" "${tfNoteB}" >> "${capFile}"

# Echo the description file to stderr:
printf "  --- %s ------------------------------\n" "${capFile}" 1>&2
sed -e 's:^:  :g' "${capFile}" 1>&2
printf "  ----------------------------------------------------\n" 1>&2

# Palette of line colors, as RGB hex strings without the leading "#".
# Entry {i} is the color of dataset {i} when plotting in color.
colors=(
  ff2200 0033ff 006633 ee8800 aa5500
  55aa00 008877 8800ff dd0044 ff55ff
  777777
)

# Give up if there are more datasets than palette entries:
if (( ${#smpsec[@]} > ${#colors[@]} )); then
  echo "** ran out of line colors" 1>&2
  exit 1
fi

# Choose the gnuplot settings that depend on the output format:
#   {ptypes}      point type of each dataset,
#   {pointsize}, {linewidth}, {dashtype}   drawing parameters,
#   {keyspacing}, {keywidth}               layout of the plot key,
#   {termspec}, {sizespec}, {marginspec}   complete gnuplot commands.
case "${format}" in
  svg)
    # In color all datasets use point type 7; in mono they differ by point type.
    ptypes=( 7 7 7 7 7 7 7 7 7 7 )
    if (( color == 0 )); then
      ptypes=( 4 2 3 1 6 7 8 1 2 4 )
    fi
    pointsize="0.30"
    linewidth="2.00"
    dashtype="(10,5)"
    keyspacing="0.8"
    # Gnuplot SVG computes the key width wrong. We must compensate,
    # using {2 - Lmax/3} rounded to one decimal:
    keywidth=$(printf "%.1f" "$(echo "w = ${Lmax}/3.0; 2-w" | bc -lq)")
    termspec="set term svg size 610,600 dynamic font 'Helvetica,18' noenhanced;"
    # sizespec="set size ratio -1;" # Does not work?
    sizespec="set size 1.05,1.00;"
    marginspec="set lmargin 6; set rmargin 4; set tmargin 0.5; set bmargin 2.0;"
    ;;
  eps)
    # Gnuplot's linetype -> color mapping (bleech) for color PS output
    #   0 = black
    #   1 = red
    #   2 = light (BAD!)
    #   3 = blue
    #   4 = magenta
    #   5 = cyan (BAD!)
    #   6 = yellow (BAD!)
    #   7 = black
    #   8 = orange
    #   9 = gray (BAD!)
    #   10,11,12,... = 1,2,3,...
    colorspec="color solid"
    ptypes=( 7 7 7 7 7 7 7 7 7 7 )
    if (( color == 0 )); then
      colorspec="mono"
      ptypes=( 4 2 3 1 6 7 8 1 2 4 )
    fi
    pointsize="1.0"
    linewidth="2.50"
    dashtype="(4,2)"
    keyspacing="1.1"
    keywidth="0.0"
    # The terminal command is assembled from two pieces:
    termspec1="postscript eps ${colorspec} font 'Helvetica,32'"
    termspec2="linewidth 1.5 background '#ffffff' noenhanced;"
    termspec="set term ${termspec1} ${termspec2};"
    # sizespec="set size ratio -1;" # Does not work.
    sizespec="set size 1.5,1.85;"
    marginspec="set lmargin 7; set rmargin 4; set tmargin 0.5; set bmargin 2.0;"
    ;;
  png)
    ptypes=( 7 7 7 7 7 7 7 7 7 7 )
    if (( color == 0 )); then
      ptypes=( 7 6 5 4 1 3 2 8 9 6 )
    fi
    pointsize="1.7"
    linewidth="1.5"
    dashtype="(4,2)"
    keyspacing="1.1"
    keywidth="0.0"
    termspec="set term png font arial 40 noenhanced linewidth 3.0 size 1950,1800;"
    marginspec="set lmargin 7; set rmargin 4; set tmargin 0.5; set bmargin 2.0;"
    # sizespec="set size ratio -1;" # Does not work.
    sizespec="set size 1.00,1.00;"
    ;;
  *)
    echo "invalid plot output format" 1>&2
    exit 1
    ;;
esac

# Names of the gnuplot command file, of the temporary plot file written
# by gnuplot, and of the final plot file:
gplFile="${tmp}.gnuplot"
tmpPlotFile="${tmp}.${format}"
outPlotFile="fig/${oname}.${format}"

# Common part of plot file.
# NOTE(review): {linespecs} is not assigned anywhere in the visible part
# of this script, so that line of the template presumably expands to an
# empty line -- confirm before removing it.
cat > "${gplFile}" <<EOF
${termspec}
set output "${tmpPlotFile}"
set logscale xy
# "set size ratio -1" does not work... 
${sizespec}; # show size
${marginspec}; # show margin
set style line 20 dashtype ${dashtype}
${linespecs[@]}
set key noenhanced right top reverse Left spacing ${keyspacing} width ${keywidth}
unset title
set xrange [0.8:37000]
set yrange [0.00002:0.5]
set format y "${ylabelfmt}"
const = ${const}
EOF

# The point type table has fewer entries than the color palette, so the
# point types are reused cyclically when there are more datasets than
# table entries, instead of passing an empty point type to gnuplot.
npt=${#ptypes[@]}
if [[ ${nfiles} -gt 0 && ${npt} -eq 0 ]]; then
  echo "** no point types defined" 1>&2; exit 1
fi

# Append the single "plot" command, built piecewise, and the final "quit":
{
  # The ideal Zipf curve {const/x}, drawn only for {x} in [0.9 _ 5000]:
  printf 'plot (x<0.9 ? 0/0 : (x>5000 ? 0/0 : const/x)) notitle'
  printf ' with lines ls 20 lw 1.8 lc rgb "#885500" '

  # One plot item per dataset: rank (record number + 1) versus column 2.
  for (( i = 0; i < nfiles; i++ )); do
    printf ', \\\n  "%s" using ($0+1):2 title "%s"' \
      "${wfrFile[$i]}" "${title[$i]}"
    printf ' \\\n     with linespoints ls %d lw %s pt %d ps %s' \
      "$(( i + 1 ))" "${linewidth}" "${ptypes[$(( i % npt ))]}" "${pointsize}"
    if [[ ${color} -ne 0 ]]; then
      printf " lc rgb '#%s'" "${colors[$i]}"
    fi
  done
  printf '\n'
  printf 'quit\n'
} >> "${gplFile}"

# Run gnuplot on the command file.  Extra command line options are read
# from the file "gnuplot-X11-options.sh" in the current directory; that
# expansion is deliberately left unquoted so that it splits into words.
export GDFONTPATH=.
gnuplot $(cat gnuplot-X11-options.sh) < "${gplFile}"

# Move or convert the temporary plot file to its final place, and
# optionally display it with a viewer suited to the format:
case "${format}" in
  svg)
    mv "${tmpPlotFile}" "${outPlotFile}"
    if [[ ${show} -ne 0 ]]; then
      time inkview "${outPlotFile}"
    fi
    ;;
  eps)
    mv "${tmpPlotFile}" "${outPlotFile}"
    if [[ ${show} -ne 0 ]]; then
      evince "${outPlotFile}"
    fi
    ;;
  png)
    # The PNG is rendered large and then reduced to 600 pixels wide:
    convert "${tmpPlotFile}" -resize '600x' "${outPlotFile}"
    if [[ ${show} -ne 0 ]]; then
      display "${outPlotFile}"
    fi
    /bin/rm -f "${tmpPlotFile}"
    ;;
  *)
    # Fail with a nonzero status, like the other error exits of this script:
    echo "invalid plot output format" 1>&2; exit 1
    ;;
esac

# The gnuplot command file is no longer needed:
/bin/rm -f "${gplFile}"