#! /bin/csh -f
# Last edited on 2004-01-31 06:38:49 by stolfi
set cmd = "$0"
set usage = "${cmd} [-keepShaddah|-expandShaddah] DIRA DIRB NULLS"

# Compares two versions of the same book, DIRA/main.evt and DIRB/main.evt.
# NULLS is the set of characters that is deleted (with "tr -d") from the
# extracted words before the two versions are compared.
#
# Options:
#   -keepShaddah    preserve the doubling mark "»" (this is the default);
#   -expandShaddah  replace each "»" by a copy of the previous letter.

# The shaddah mode is stored as the number of the "sed" group that gets
# substituted for the mark: 1 = the previous letter (i.e. expand the
# shaddah by doubling), 2 = the mark itself (i.e. preserve the shaddah).
set shaddah = 2

# Consume the leading options (any argument that starts with "-").
while ( ( $#argv >= 1 ) && ( "/$1" =~ /-* ) )
  if ( "/$1" == "/-keepShaddah" ) then
    set shaddah = 2; shift;
    echo 'preserving the shaddah "»"'
  else if ( "/$1" == "/-expandShaddah" ) then
    set shaddah = 1; shift;
    echo 'replacing the shaddah "»" by previous letter'
  else
    echo "invalid option $1"
    echo "usage: ${usage}"
    exit 1
  endif
end

# All three positional arguments are required; fail early with the
# usage line instead of an obscure error further down.
if ( $#argv < 3 ) then
  echo "missing arguments"
  echo "usage: ${usage}"
  exit 1
endif

set dira = "$1"; shift;
set dirb = "$1"; shift;
set nulls = "$1"; shift;
# Directory that holds the "evt-to-wds" filter.
set tooldir = "/home/staff/stolfi/bin"

# Show the NULLS argument. It is handed to gawk as a variable and printed
# with "%s", rather than being spliced into the program text, so that a
# "%" or a double quote in NULLS cannot corrupt the program or the format.
gawk -v nulls="${nulls}" 'BEGIN{ printf "nulls=(%s)\n", nulls; }'
foreach dir ( $dira $dirb )

  # Preliminary filtering. Get a copy of each EVT text without chapter
  # titles ("T" units). Leave in the verse numbers as separate words.
  # Note that we must also leave the comments that define the
  # character sets, for the benefit of "evt-to-wds".
  echo 'extracting clean ".evt" for version "'"${dir}"'"...'
  cat ${dir}/main.evt \
    | egrep '^ *([#]|[<][s][0-9]*[.][V][0-9]*[.][0-9]+[;A-Za-z]*[>])' \
    > ${dir}/.xx.evt

  # Extract the words, one per line. Then:
  #  - handle the shaddah: the first "sed" rewrites each letter-plus-"»"
  #    pair as the letter followed by group number ${shaddah}, i.e. by
  #    the letter again (1, expand) or by the "»" itself (2, preserve);
  #  - delete the null characters;
  #  - drop blank lines;
  #  - prefix "${dir}." to the verse numbers, in order that they show up
  #    in the "diff", and precede each verse number with "===", which
  #    the final "tr" turns into three line breaks.
  # The file starts with a "[${dir}]" header line.
  # Note that the final "tr" turns EVERY "=" into a line break.
  echo 'extracting words of version "'"${dir}"'"...'
  echo "[${dir}]" > ${dir}/.xx.wds
  cat ${dir}/.xx.evt \
    | ${tooldir}/evt-to-wds \
        -v smashSymbols=0 \
        -v showPuncts=1 -v showBreaks=0 -v showParags=0 \
    | sed \
        -e 's/\(.\)\([»]\)/\1\'"${shaddah}"'/g' \
    | tr -d "${nulls}" \
    | sed \
        -e '/^ *$/d' \
        -e 's/^\([0-9]*[.][0-9]*[.]\)$/==='"${dir}"'.\1/g' \
    | tr '=' '\012' \
    >> ${dir}/.xx.wds

  # Count verses (excluding verse 0) and words (including verse 0),
  # per sura and total. The first record is the "[${dir}]" header
  # written above; it is skipped so that it is not counted as a word.
  # Blank lines and lines that start with "#", "@", "=" or "÷" are
  # skipped too. A line containing a digit followed by "." is taken
  # to be a verse number of the form "${dir}.SURA.VERSE.".
  echo 'counting words of version "'"${dir}"'"...'
  cat ${dir}/.xx.wds \
    | gawk \
        ' BEGIN{ s=""; nv=0; nw=0; tv=0; tw=0; } \
          NR == 1 { next; } \
          /^ *([\#@=÷]|$)/ { next; } \
          /[0-9][.]/ { \
            gsub(/[.]/, " "); xs = $2; xv = $3; \
            if (xs \!= s) { \
              if (s \!= "") { out(); } \
              nv = 0; nw = 0; s = xs; \
            } \
            if (xv \!= "0") { nv++; tv++; } \
            next; \
          } \
          /./ { nw++; tw++; } \
          END{ if(s \!= "") { out(); } \
            printf "\ntotal: %4d verses %5d words\n", tv, tw; \
          } \
          function out() { \
            printf "sura %3s: %3d verses %4d words\n", s, nv, nw; \
          } \
        ' \
    > ${dir}/.xx.cts
  echo "=== ${dir} ==="
  cat ${dir}/.xx.cts
end
# Compare the two word lists side by side, keeping only the lines that
# differ. The stock "diff" does not count non-ascii characters, and so
# misaligns the two columns; a custom build is used instead.
echo 'comparing words of "'"${dira}"'" and "'"${dirb}"'"...'
~/pkg/diffutils-2.8.1-1/src/diff \
  --side-by-side --width=60 --suppress-common-lines \
  --minimal --expand-tabs --ignore-all-space \
  ${dira}/.xx.wds ${dirb}/.xx.wds \
  > .diff
# Now summarize the differences, per section (sura).
# Each line of ".diff" is classified as follows:
#  - blank lines, and lines containing "#", "[" or "]", are ignored;
#  - a line matching ".NUMBER." is taken to be a pair of verse markers,
#    "DIRA.SURA.VERSE." on the left and "DIRB.SURA.VERSE." on the right;
#    the two sides are checked for agreement, and the verse is counted
#    unless its number is 0;
#  - any other line containing "|", "<" or ">" counts as one difference;
#  - anything else is reported on the error output as a format error.
# A line "sura S: ..." is printed whenever the sura number changes, and
# once more at the end, followed by the grand totals.
echo 'summarizing the differences...'
cat .diff \
  | gawk -v dira="${dira}" -v dirb="${dirb}" \
      ' BEGIN{ s= ""; nv = 0; nd = 0; tv = 0; td = 0; } \
        /^ *$/{ next; } \
        /[\#]/{ next; } \
        /[\]\[]/{ next; } \
        /[.][0-9]*[.]/{ \
          gsub(/[.]/, " "); \
          xf = $1; xs = $2; xv = $3; yf = $5; ys = $6; yv = $7; \
          if ((xf \!= dira) || (yf \!= dirb)) { bug("file"); } ; \
          if (xs \!= ys) { bug("sura"); }; \
          if (xv \!= yv) { bug("verse"); }; \
          if (xs \!= s){ \
            if(s \!= ""){ out(); } \
            s = xs; nv = 0; nd = 0; \
          }; \
          if (xv \!= "0") { nv++; tv++; } \
          next; \
        } \
        /[|<>]/{ nd++; td++; next; } \
        //{ bug("format"); } \
        END{ if (s \!= "") { out(); } \
          printf "\ntotal: %4d verses %5d differences\n", tv, td; \
        } \
        function out(){ \
          printf "sura %4s: %3d verses %4d differences\n", s, nv, nd; \
        } \
        function bug(msg){ \
          printf "%d: error: %s\n", FNR, msg > "/dev/stderr"; \
        } \
      ' \
  > .diffcts
# Print the per-sura summary, then the first 20 and the last 10 actual
# differences. Lines containing "#", blank lines, and lines that mention
# either directory name (the verse markers) are left out of the samples.
echo " "
echo "=== ${dira} × ${dirb} ==="
cat .diffcts
echo "=== sample differences ==="
cat .diff | egrep -v '[#]|^ *$|'"${dira}"'|'"${dirb}" | head -20
echo "..."
cat .diff | egrep -v '[#]|^ *$|'"${dira}"'|'"${dirb}" | tail -10
# Delete big files:
# /bin/rm -f {${dira},${dirb}}/.xx.wds .diff