#! /bin/csh -f
# Last edited on 2004-01-31 06:38:49 by stolfi

set cmd = "$0"
set usage = "${cmd} [-keepShaddah|-expandShaddah] DIRA DIRB NULLS"

# Compares two versions of the same book, DIRA/main.evt and DIRB/main.evt.
# The "-keepShaddah" option (the default) preserves the doubling mark "ť";
# "-expandShaddah" replaces "ť" by a copy of the previous letter.
# NULLS is the set of characters that are deleted (with "tr -d") from
# the extracted words before the two versions are compared.

# Index of the "sed" back-reference that replaces the shaddah:
# 1 = replace shaddahs by doubling, 2 = preserve the shaddahs
set shaddah = 2

# Consume the leading options. The operands are prefixed with "/",
# presumably so that an argument starting with "-" is never mistaken
# for an operator inside the csh expression.
while ( ( $#argv >= 1 ) && ( "/$1" =~ /-* ) )
  if ( "/$1" == "/-keepShaddah" ) then
    set shaddah = 2; shift;
    echo 'preserving the shaddah "ť"'
  else if ( "/$1" == "/-expandShaddah" ) then
    set shaddah = 1; shift;
    echo 'replacing the shaddah "ť" by previous letter'
  else
    # Unknown option: say so, remind the user of the syntax, and quit.
    echo "invalid option $1"
    echo "usage: ${usage}"
    exit 1
  endif
end

# The three mandatory arguments (DIRA DIRB NULLS) must still be present
# at this point; otherwise "shift" would abort the script with a bare
# csh error instead of a usage message.
if ( $#argv < 3 ) then
  echo "missing arguments"
  echo "usage: ${usage}"
  exit 1
endif

set dira = "$1"; shift;
set dirb = "$1"; shift;
set nulls = "$1"; shift;

# Directory that holds the "evt-to-wds" filter used below.
set tooldir = "/home/staff/stolfi/bin"

# Show the set of null characters. ${nulls} is spliced into the gawk
# "printf" format string, so any backslash escapes or "%" that it
# contains are interpreted by gawk before being printed.
gawk 'BEGIN{ printf "nulls=('"${nulls}"')\n";}'

# For each version: extract a clean ".evt" file, turn it into a list of
# words (one per line) with verse markers, and tabulate verse and word
# counts per sura. Writes ${dir}/.xx.evt, ${dir}/.xx.wds, ${dir}/.xx.cts.
foreach dir ( $dira $dirb )

  # Preliminary filtering. Get a copy of each EVT text without chapter
  # titles ("T" units). Leave in the verse numbers as separate words.
  # Note that we must also leave the comments that define the
  # character sets, for the benefit of "evt-to-wds".

  echo 'extracting clean ".evt" for version "'"${dir}"'"...'
  cat ${dir}/main.evt \
    | grep -E '^ *([#]|[<][s][0-9]*[.][V][0-9]*[.][0-9]+[;A-Za-z]*[>])' \
    > ${dir}/.xx.evt

  # Extract the words, then handle the shaddah: the first "sed" captures
  # the preceding character as group 1 and the shaddah as group 2, and
  # rewrites the pair as "\1\${shaddah}", i.e. "\1\1" (doubled letter)
  # or "\1\2" (shaddah kept). After that, delete the null characters.
  # Also prefix "[${dir}]" to the file, and "${dir}." to the verse numbers,
  # in order that they show up in the "diff". Each verse number is
  # further prefixed with "===", which the final "tr" turns into three
  # newlines (as it does with any other "=" in the stream).
  
  echo 'extracting words of version "'"${dir}"'"...'
  echo "[${dir}]" > ${dir}/.xx.wds
  cat ${dir}/.xx.evt \
    | ${tooldir}/evt-to-wds \
        -v smashSymbols=0 \
        -v showPuncts=1 -v showBreaks=0 -v showParags=0 \
    | sed \
        -e 's/\(.\)\([ť]\)/\1\'"${shaddah}"'/g' \
    | tr -d "${nulls}" \
    | sed \
        -e '/^ *$/d' \
        -e 's/^\([0-9]*[.][0-9]*[.]\)$/==='"${dir}"'.\1/g' \
    | tr '=' '\012' \
    >> ${dir}/.xx.wds

  # Count verses (excluding verse 0) and words (including verse 0),
  # per sura and total. The first line of the ".xx.wds" file is the
  # "[${dir}]" header written above; it is skipped explicitly so that
  # it is not counted as a word in the grand total. Verse markers have
  # the form "${dir}.SURA.VERSE.", so ${dir} itself must not contain
  # dots for the field positions below to be right.
  
  echo 'counting words of version "'"${dir}"'"...'
  cat ${dir}/.xx.wds \
    | gawk \
       ' BEGIN{ s=""; nv=0; nw=0; tv=0; tw=0; } \
         NR == 1 { next; } \
         /^ *([\#@=÷]|$)/ { next; } \
         /[0-9][.]/ { \
           gsub(/[.]/, " "); xs = $2; xv = $3; \
           if (xs \!= s) { \
             if (s \!= "") { out(); } \
             nv = 0; nw = 0; s = xs; \
           } \
           if (xv \!= "0") { nv++; tv++; } \
           next; \
         } \
         /./ { nw++; tw++; } \
         END{ if(s \!= "") { out(); } \
           printf "\ntotal:  %4d verses %5d words\n", tv, tw; \
         } \
         function out() { \
           printf "sura %3s: %3d verses %4d words\n", s, nv, nw; \
         } \
       ' \
    > ${dir}/.xx.cts
  echo "=== ${dir} ==="
  cat ${dir}/.xx.cts
end

# Compare the two word lists side by side, keeping only the lines that
# differ, and save the result in ".diff" in the current directory.
# The stock "diff" does not count non-ascii characters and therefore
# gets the column alignment wrong, so a custom build is used instead.

echo 'comparing words of "'"${dira}"'" and "'"${dirb}"'"...'
~/pkg/diffutils-2.8.1-1/src/diff \
    --side-by-side \
    --width=60 \
    --suppress-common-lines \
    --ignore-all-space \
    --expand-tabs \
    --minimal \
    ${dira}/.xx.wds ${dirb}/.xx.wds \
  > .diff
  
# Now summarize the differences, per section (sura). Reads ".diff" and
# writes ".diffcts". Line classes, tested in this order:
#   blank lines, lines containing "#", lines containing "[" or "]":
#     skipped;
#   verse markers ("DIRA.SURA.VERSE. | DIRB.SURA.VERSE."): checked for
#     consistency between the two sides, then counted as a verse
#     (unless the verse number is 0);
#   lines with a diff marker ("|", "<" or ">"): counted as a difference;
#   anything else: reported as a format error.
# Problems are reported on stderr with the input line number, and
# processing continues.

echo 'summarizing the differences...'
cat .diff \
  | gawk -v dira="${dira}" -v dirb="${dirb}" \
      ' BEGIN{ s= ""; nv = 0; nd = 0; tv = 0; td = 0; } \
        /^ *$/{ next; } \
        /[\#]/{ next; } \
        /[\]\[]/{ next; } \
        /[.][0-9]*[.]/{ \
          gsub(/[.]/, " "); \
          xf = $1; xs = $2; xv = $3; yf = $5; ys = $6; yv = $7; \
          if ((xf \!= dira) || (yf \!= dirb)) { bug("file"); } ; \
          if (xs \!= ys) { bug("sura"); }; \
          if (xv \!= yv) { bug("verse"); }; \
          if (xs \!= s){ \
            if(s \!= ""){ out(); } \
            s = xs; nv = 0; nd = 0; \
          }; \
          if (xv \!= "0") { nv++; tv++; } \
          next; \
        } \
        /[|<>]/{ nd++; td++; next; } \
        //{ bug("format"); } \
        END{ if (s \!= "") { out(); } \
          printf "\ntotal:  %4d verses %5d differences\n", tv, td; \
        } \
        function out(){ \
          printf "sura %4s: %3d verses %4d differences\n", s,nv,nd; \
        } \
        function bug(msg){ \
          printf "%d: error: %s\n", FNR, msg > "/dev/stderr"; \
        } \
      ' \
  > .diffcts

# Print the per-sura difference counts, then the first 20 and the last
# 10 differing lines of ".diff". Lines containing "#", blank lines and
# verse-marker lines (recognized because they contain ${dira} or
# ${dirb}) are left out of the samples. Note that the directory names
# are spliced into the pattern unescaped, so they are interpreted as
# extended regular expressions.

echo " "
echo "=== ${dira} × ${dirb} ==="
cat .diffcts

echo "=== sample differences ==="
cat .diff | grep -E -v '[#]|^ *$|'"${dira}"'|'"${dirb}" | head -20
echo "..."
cat .diff | grep -E -v '[#]|^ *$|'"${dira}"'|'"${dirb}" | tail -10


# Delete big files:
# /bin/rm -f {${dira},${dirb}}/.xx.wds .diff