#! /bin/csh -f
# Last edited on 2004-01-31 06:38:49 by stolfi

set cmd = "$0"
set usage = "${cmd} [-keepShaddah|-expandShaddah] DIRA DIRB NULLS"

# Compares two versions of the same book, DIRA/main.evt and DIRB/main.evt.
# The "-keepShaddah" option preserves the doubling mark "»",
# "-expandShaddah" replaces "»" by a copy of the previous letter.
#
# Arguments:
#   DIRA, DIRB  directories, each containing a "main.evt" file.
#   NULLS       set of characters deleted from the word stream ("tr -d").
#
# Files written: DIR/.xx.evt, DIR/.xx.wds and DIR/.xx.cts for each of
# the two directories, plus ".diff" and ".diffcts" in the current directory.
#
# NOTE: DIRA and DIRB are interpolated into "sed" and "egrep" patterns
# further down, so they should be plain names (no "/" or regex characters).

# 1 = replace shaddahs by doubling, 2 = preserve the shaddahs
set shaddah = 2

# Parse the leading options. The "/" prefix in the comparisons keeps
# an argument that begins with "-" from being taken as an operator.
while ( ( $#argv >= 1 ) && ( "/$1" =~ /-* ) )
  if ( ( $#argv >= 1 ) && ( "/$1" == "/-keepShaddah" ) ) then
    set shaddah = 2; shift;
    echo 'preserving the shaddah "»"'
  else if ( ( $#argv >= 1 ) && ( "/$1" == "/-expandShaddah" ) ) then
    set shaddah = 1; shift;
    echo 'replacing the shaddah "»" by previous letter'
  else
    echo "invalid option $1"
    echo "usage: ${usage}"; exit 1
  endif
end

# All three positional arguments are required; without this check
# the "shift" commands below fail with an obscure message.
if ( $#argv < 3 ) then
  echo "usage: ${usage}"; exit 1
endif

set dira = "$1"; shift;
set dirb = "$1"; shift;
set nulls = "$1"; shift;

set tooldir = "${STOLFIHOME}/bin"

# Show the set of null characters (gawk expands any escapes in it):
gawk 'BEGIN{ printf "nulls=('"${nulls}"')\n";}'

foreach dir ( $dira $dirb )

  # Preliminary filtering.  Get a copy of each EVT text without chapter
  # titles ("T" units).  Leave in the verse numbers as separate words.
  # Note that we must also leave the comments that define the
  # character sets, for the benefit of "evt-to-words".

  echo 'extracting clean ".evt" for version "'"${dir}"'"...'
  cat ${dir}/main.evt \
    | egrep '^ *([#]|[<][s][0-9]*[.][V][0-9]*[.][0-9]+[;A-Za-z]*[>])' \
    > ${dir}/.xx.evt

  # Extract the words, delete the null characters, and either keep
  # the shaddah or replace it by a copy of the previous letter,
  # depending on ${shaddah}: the first "sed" rewrites each
  # (letter)(shaddah) pair as "\1\1" (doubling) or "\1\2" (unchanged).
  # Also prefix "[${dir}]" to the file, and "${dir}." to the verse numbers,
  # in order that they show up in the "diff".  Each verse number is
  # preceded by three "=", which the final "tr" turns into newlines.

  echo 'extracting words of version "'"${dir}"'"...'
  echo "[${dir}]" > ${dir}/.xx.wds
  cat ${dir}/.xx.evt \
    | ${tooldir}/evt-to-wds \
        -v smashSymbols=0 \
        -v showPuncts=1 -v showBreaks=0 -v showParags=0 \
    | sed \
        -e 's/\(.\)\([»]\)/\1\'"${shaddah}"'/g' \
    | tr -d "${nulls}" \
    | sed \
        -e '/^ *$/d' \
        -e 's/^\([0-9]*[.][0-9]*[.]\)$/==='"${dir}"'.\1/g' \
    | tr '=' '\012' \
    >> ${dir}/.xx.wds

  # Count verses (excluding verse 0) and words (including verse 0),
  # per sura and total.  The first line of the file is the "[${dir}]"
  # header written above, not a word, so it is skipped (NR == 1);
  # otherwise the total word count would come out one too high.

  echo 'counting words of version "'"${dir}"'"...'
  cat ${dir}/.xx.wds \
    | gawk \
        ' BEGIN{ s=""; nv=0; nw=0; tv=0; tw=0; } \
          NR == 1 { next; } \
          /^ *([\#@=÷]|$)/ { next; } \
          /[0-9][.]/ { \
            gsub(/[.]/, " "); xs = $2; xv = $3; \
            if (xs \!= s) { \
              if (s \!= "") { out(); } \
              nv = 0; nw = 0; s = xs; \
            } \
            if (xv \!= "0") { nv++; tv++; } \
            next; \
          } \
          /./ { nw++; tw++; } \
          END{ if(s \!= "") { out(); } \
            printf "\ntotal:    %4d verses %5d words\n", tv, tw; \
          } \
          function out() { \
            printf "sura %3s: %3d verses %4d words\n", s, nv, nw; \
          } \
        ' \
    > ${dir}/.xx.cts
  echo "=== ${dir} ==="
  cat ${dir}/.xx.cts
end

# Now run "diff" with side-by-side option.  The original "diff" does
# not count non-ascii characters and therefore gets the alignment
# wrong, so use a custom version.

echo 'comparing words of "'"${dira}"'" and "'"${dirb}"'"...'
~/pkg/diffutils-2.8.1-1/src/diff \
    --minimal \
    --expand-tabs \
    --ignore-all-space \
    --side-by-side --width=60 \
    --suppress-common-lines \
    ${dira}/.xx.wds ${dirb}/.xx.wds \
  > .diff

# Now summarize the differences, per section:

echo 'summarizing the differences...'
# Summarize the side-by-side ".diff" file, per sura.  Verse marker
# lines look like "DIRA.S.V. | DIRB.S.V." and become, once the dots
# are turned into blanks, the fields  DIRA S V | DIRB S V .  The two
# sides must name the expected versions and agree on sura and verse;
# any other line carrying a diff mark ("|", "<" or ">") counts as one
# difference.  Problems are reported on stderr with the ".diff" line
# number, and processing continues.
cat .diff \
  | gawk -v dira="${dira}" -v dirb="${dirb}" \
      ' BEGIN{ s= ""; nv = 0; nd = 0; tv = 0; td = 0; } \
        /^ *$/{ next; } \
        /[\#]/{ next; } \
        /[\]\[]/{ next; } \
        /[.][0-9]*[.]/{ \
          gsub(/[.]/, " "); \
          xf = $1; xs = $2; xv = $3; yf = $5; ys = $6; yv = $7; \
          if ((xf \!= dira) || (yf \!= dirb)) { bug("file"); } ; \
          if (xs \!= ys) { bug("sura"); }; \
          if (xv \!= yv) { bug("verse"); }; \
          if (xs \!= s){ \
            if(s \!= ""){ out(); } \
            s = xs; nv = 0; nd = 0; \
          }; \
          if (xv \!= "0") { nv++; tv++; } \
          next; \
        } \
        /[|<>]/{ nd++; td++; next; } \
        //{ bug("format"); } \
        END{ if (s \!= "") { out(); } \
          printf "\ntotal:     %4d verses %5d differences\n", tv, td; \
        } \
        function out(){ \
          printf "sura %4s: %3d verses %4d differences\n", s, nv, nd; \
        } \
        function bug(msg){ \
          printf "%d: error: %s\n", FNR, msg > "/dev/stderr"; \
        } \
      ' \
  > .diffcts

echo " "
echo "=== ${dira} × ${dirb} ==="
cat .diffcts

# Show the first and last few differing words, leaving out comment
# lines, blank lines, and the verse marker lines of either version.
echo "=== sample differences ==="
cat .diff | egrep -v '[#]|^ *$|'"${dira}"'|'"${dirb}" | head -20
echo "..."
cat .diff | egrep -v '[#]|^ *$|'"${dira}"'|'"${dirb}" | tail -10

# Delete big files:
# /bin/rm -f {${dira},${dirb}}/.xx.wds .diff