This script does it: cat base.elt \ | sed \ -e 's/{ee/{X}{/g' \ -e 's/{[ceh]}/{E}/g' \ -e 's/{\(..*\)e}/{\1}{E}/g' \ -e 's/{[csi][h]}/{X}/g' \ -e 's/{[ci][ktpf][h]}/{X}/g' \ -e 's/{[ci][ktpf]}/{X}/g' \ -e 's/{[ktpf]}/{X}/g' \ -e 's/{[rlsn]}/{R}/g' \ -e 's/{[mdgj]}/{R}/g' \ -e 's/{[aoy]}/{O}/g' \ -e 's/{[q]}/{Q}/g' \ -e 's/{[i][i]*}/{I}/g' \ -e 's/{[ceh]}/{E}/g' \ -e 's/\([A-Z]\)/{\1}/g' \ > base.clt but this splitting makes the counts smaller, hence more affected by error; and the tables get longer and harder to grok. Anyway, here is the script: cat base.txt \ | sed \ -e 's/ee/S/g' \ -e 's/[csi][h]/S/g' \ -e 's/[ci][ktpf][h]/G/g' \ -e 's/[ci][ktpf]/G/g' \ -e 's/[ktpf]/H/g' \ -e 's/[rlsn]/L/g' \ -e 's/[mdgj]/D/g' \ -e 's/[aoy]/O/g' \ -e 's/[q]/Q/g' \ -e 's/[i][i]*/I/g' \ -e 's/[ceh]/E/g' \ -e 's/\([A-Z]\)/{\1}/g' \ > base.flt Comparing pair frequencies: multicol -v titles="lin fig std non" {lin,fig,std,non}-${map}-${v}.frq \ > all-${map}-${v}.frq compare-freqs {lin,fig,std,non}-${map}-${v}.frq \ | tr ':' ' ' \ | sort +0.0 -0.1r +8b -9b +0b -1nr +4b -4nr +6b -7nr \ | gawk '/^[^#]/{if($9\!=c){print "";c=$9}} //{print}' \ > all-${map}-${v}.cmpfrq Relative frequency at breaks ---------------------------- The absolute frequency of each pair X-Y around line or word breaks is affected by the frequency of the constituent letters. So a more relevant quantity is the ratio (freq at line break)/(freq at word break). 
TO BE DONE foreach brk ( std lin ) cat ${brk}-elt.frq \ | gawk '//{$3=("{" $3 "}"); gsub(/:/, "}:{", $3); print $1,$3;}' \ | elt2slt \ | tr -d '{}' \ | combine-counts \ | sort -b +0 -1nr \ | compute-freqs \ > ${brk}-slt.frq end foreach brk ( std lin ) cat ${brk}-slt.frq \ | gawk '//{gsub(/^.*:/, "", $3); print $1,$3;}' \ | combine-counts \ | sort -b +0 -1nr \ | compute-freqs \ > ${brk}-aft-slt.frw cat ${brk}-slt.frq \ | gawk '//{gsub(/:.*$/, "", $3); print $1,$3;}' \ | combine-counts \ | sort -b +0 -1nr \ | compute-freqs \ > ${brk}-bef-slt.frw end foreach f ( sec-all{-bol,,-eol}.ect ) cat $f \ | gawk \ ' /../{ t += $1; print; } \ END { printf " %7s TOTAL\n", t; } \ ' \ > /tmp/$f end compare-counts /tmp/sec-all{-bol,,-eol}.ect \ | gawk \ ' /../{ \ M=($1+$3+2); B=($1+1)/M; X=M/($2+3); \ printf " %7s %7s %7s %7.4f %7.4f %s\n",$1,$2,$3,B,X,$4;} \ ' \ | sort -b +1 -2nr +3 -4nr \ > /tmp/cmp-counts.txt let's plot the bol/eol and end/mid ratios for the most significant entries: cat /tmp/cmp-counts.txt \ | gawk '($2 > 32){print;}' \ > /tmp/cmp-counts-sig.txt gnuplot <<EOF set terminal x11 plot "/tmp/cmp-counts-sig.txt" using 4:5 with points pause 300 EOF Elements sorted by which end of the line they prefer: printf " %7s %7s %7s %7s %7s %s\n" "bol" "all" "eol" "b/(b+e)" "(b+e)/a" "elem";\ echo " ------- ------- ------- ------- ------- ---------";\ cat /tmp/cmp-counts.txt \ | sort -b +3 -4 bol all eol b/(b+e) (b+e)/a elem ------- ------- ------- ------- ------- --------- . 4210 439 0.0023 0.1047 iin 2 945 642 0.0046 0.6814 m . 1209 114 0.0086 0.0957 in 11 5824 325 0.0355 0.0580 r . 49 23 0.0400 0.4808 im 1 116 39 0.0476 0.3529 n . 92 14 0.0625 0.1684 iiin . 497 13 0.0667 0.0300 ir . 13 10 0.0833 0.7500 iim 70 8985 421 0.1440 0.0549 l . 98 3 0.2000 0.0495 iir 562 15421 1519 0.2703 0.1350 y . 1 1 0.3333 0.7500 u . 3 1 0.3333 0.5000 iiil . 4 1 0.3333 0.4286 b 21 403 31 0.4074 0.1330 ? . 1 . 0.5000 0.5000 '? . 1 . 0.5000 0.5000 iiid . 3 . 0.5000 0.3333 iis . 4 . 
0.5000 0.2857 ith . 6 . 0.5000 0.2222 iid . 6 . 0.5000 0.2222 ikh . 7 . 0.5000 0.2000 id . 9 . 0.5000 0.1667 ct . 10 . 0.5000 0.1538 ck . 10 . 0.5000 0.1538 iil . 13 . 0.5000 0.1250 h? . 19 . 0.5000 0.0909 is . 28 . 0.5000 0.0645 il . 32 . 0.5000 0.0571 iiir . 73 . 0.5000 0.0263 i? . 246 . 0.5000 0.0080 ckhe . 314 . 0.5000 0.0063 eee 3908 126300 3908 0.5000 0.0619 TOTAL 14 12524 13 0.5172 0.0023 a 2 723 1 0.6000 0.0069 ckh 4 3803 2 0.6250 0.0021 ee 1 3 . 0.6667 0.5000 c? 1 22 . 0.6667 0.1200 x 1 37 . 0.6667 0.0750 de 382 2021 121 0.7584 0.2495 s 3 265 . 0.8000 0.0187 e? 4 1485 . 0.8333 0.0040 ke 597 11409 108 0.8458 0.0620 d 5 261 . 0.8571 0.0265 cthe 18 901 2 0.8636 0.0243 cth 505 21348 47 0.9134 0.0259 o 130 7534 8 0.9357 0.0186 k 25 814 . 0.9630 0.0330 te 107 6067 3 0.9643 0.0185 ch 30 3798 . 0.9688 0.0084 che 39 1891 . 0.9756 0.0216 she 147 2217 1 0.9867 0.0676 sh 724 5445 6 0.9904 0.1344 t 502 5080 . 0.9980 0.0992 q So we can split the elements manually into classes, as follows: bol all eol b/(b+e) (b+e)/a elem ------- ------- ------- ------- ------- --------- 3908 126300 3908 0.5000 0.0619 TOTAL 2 945 642 0.0046 0.6814 m . 49 23 0.0400 0.4808 im . 13 10 0.0833 0.7500 iim 1 116 39 0.0476 0.3529 n . 1209 114 0.0086 0.0957 in . 4210 439 0.0023 0.1047 iin . 92 14 0.0625 0.1684 iiin 11 5824 325 0.0355 0.0580 r . 497 13 0.0667 0.0300 ir . 98 3 0.2000 0.0495 iir 562 15421 1519 0.2703 0.1350 y 70 8985 421 0.1440 0.0549 l 14 12524 13 0.5172 0.0023 a . 246 . 0.5000 0.0080 ckhe . 314 . 0.5000 0.0063 eee 2 723 1 0.6000 0.0069 ckh 4 3803 2 0.6250 0.0021 ee 382 2021 121 0.7584 0.2495 s 4 1485 . 0.8333 0.0040 ke 597 11409 108 0.8458 0.0620 d 5 261 . 0.8571 0.0265 cthe 18 901 2 0.8636 0.0243 cth 505 21348 47 0.9134 0.0259 o 130 7534 8 0.9357 0.0186 k 25 814 . 0.9630 0.0330 te 107 6067 3 0.9643 0.0185 ch 30 3798 . 0.9688 0.0084 che 39 1891 . 0.9756 0.0216 she 147 2217 1 0.9867 0.0676 sh 724 5445 6 0.9904 0.1344 t 502 5080 . 0.9980 0.0992 q . 1 1 0.3333 0.7500 u 3 265 . 
0.8000 0.0187 e? 1 3 . 0.6667 0.5000 c? 1 22 . 0.6667 0.1200 x 1 37 . 0.6667 0.0750 de . 3 1 0.3333 0.5000 iiil . 4 1 0.3333 0.4286 b . 1 . 0.5000 0.5000 iiid . 3 . 0.5000 0.3333 iis . 4 . 0.5000 0.2857 ith . 6 . 0.5000 0.2222 iid . 6 . 0.5000 0.2222 ikh . 7 . 0.5000 0.2000 id . 9 . 0.5000 0.1667 ct . 10 . 0.5000 0.1538 ck . 10 . 0.5000 0.1538 iil . 13 . 0.5000 0.1250 h? . 19 . 0.5000 0.0909 is . 28 . 0.5000 0.0645 il . 32 . 0.5000 0.0571 iiir 21 403 31 0.4074 0.1330 ? . 1 . 0.5000 0.5000 '? . 73 . 0.5000 0.0263 i? Moderately final: bol all eol b/(b+e) (b+e)/a elem ------- ------- ------- ------- ------- --------- Basically indifferent: bol all eol b/(b+e) (b+e)/a elem ------- ------- ------- ------- ------- --------- . 1 . 0.5000 0.5000 iiid . 3 . 0.5000 0.3333 iis . 4 . 0.5000 0.2857 ith . 6 . 0.5000 0.2222 iid . 6 . 0.5000 0.2222 ikh . 7 . 0.5000 0.2000 id . 9 . 0.5000 0.1667 ct . 10 . 0.5000 0.1538 ck . 10 . 0.5000 0.1538 iil . 19 . 0.5000 0.0909 is . 28 . 0.5000 0.0645 il . 32 . 0.5000 0.0571 iiir . 246 . 0.5000 0.0080 ckhe . 314 . 0.5000 0.0063 eee 3908 126300 3908 0.5000 0.0619 TOTAL 14 12524 13 0.5172 0.0023 a 2 723 1 0.6000 0.0069 ckh 21 403 31 0.4074 0.1330 ? . 1 . 0.5000 0.5000 '? . 13 . 0.5000 0.1250 h? . 73 . 0.5000 0.0263 i? Too rare to tell: bol all eol b/(b+e) (b+e)/a elem ------- ------- ------- ------- ------- --------- . 1 1 0.3333 0.7500 u . 3 1 0.3333 0.5000 iiil . 4 1 0.3333 0.4286 b gnuplot <<EOF set terminal x11 plot "/tmp/cmp-counts.txt" using 2 with histeps pause 300 EOF /bin/rm /tmp/cmp-counts.txt Doing the same for labels: Comparing element frequencies (×10000): b-o-l, overall, e-o-l. 
foreach f ( sec-all-bol sec-all sec-all-eol ) cat $f.ect \ | est-probs \ > $f.epr cat $f.epr \ | gawk '/./{printf "%d %s\n", 10000*$2, $3;}' \ > /tmp/$f.eprx end JUNK--- Comparing element frequencies (×10000): b-o-l, overall, e-o-l cat `cat txt.pages | sed -e 's@^\(.*\)$@pages-evt/\1.els@'` \ | sed -e 's/^\({[^{}]*}\).*/\1/' \ | egrep '^{.*}$' \ | tr -d '{}' \ | egrep '.' \ | sort | uniq -c | expand \ | sort -b +0 -1nr \ > sec-all-bol.ect cat `cat txt.pages | sed -e 's@^\(.*\)$@pages-evt/\1.els@'` \ | sed -e 's/.*\({[^{}]*}\)$/\1/' \ | egrep '^{.*}$' \ | tr -d '{}' \ | egrep '.' \ | sort | uniq -c | expand \ | sort -b +0 -1nr \ > sec-all-eol.ect foreach f ( sec-all-bol sec-all sec-all-eol ) cat $f.ect \ | est-probs \ > $f.epr cat $f.epr \ | gawk '/./{printf "%d %s\n", 10000*$2, $3;}' \ > /tmp/$f.eprx end /usr/ucb/echo -n " "; \ echo "bol all eol b-d" | sed -e 's/ / /g' ;\ ( compare-counts /tmp/sec-all{-bol,,-eol}.eprx \ | gawk \ ' /--/{printf " %7s %7s %7s %7s %s\n",$1,$2,$3,$3,$4;next;} \ /../{printf " %7s %7s %7s %7.4f %s\n",$1,$2,$3,($1-$3)/(32*($2+1)),$4;} \ ' \ | sort -b +3 -4nr ) bol all eol b-d ------- ------- ------- ------- --------- 947 98 10 0.2958 p 5 . . 0.1562 c? 972 160 310 0.1285 s 1277 402 . 0.0990 q 896 333 10 0.0829 t 81 25 15 0.0793 f 376 176 5 0.0655 sh 5 2 . 0.0521 de 5 2 . 0.0521 x 1518 903 277 0.0429 d 15 11 . 0.0391 cph 66 64 . 0.0317 te 15 15 . 0.0293 cthe 1285 1689 122 0.0215 o 102 150 . 0.0211 she 274 480 10 0.0172 ch 36 60 8 0.0143 cth 10 21 . 0.0142 e? 254 571 10 0.0133 k 79 301 . 0.0082 che 13 118 . 0.0034 ke 8 53 5 0.0017 ckh 13 301 8 0.0005 ee 38 990 36 0.0001 a . . . 0.0000 '? . . . 0.0000 cf . . . 0.0000 cp . . . 0.0000 iiid . . . 0.0000 iis . . . 0.0000 ij . . . 0.0000 iph . . . 0.0000 ith . . . 0.0000 pe . 1 . 0.0000 cfhe . 1 . 0.0000 ck . 1 . 0.0000 ct . 1 . 0.0000 h? . 1 . 0.0000 id . 1 . 0.0000 iid . 1 . 0.0000 iil . 1 . 0.0000 ikh . 2 . 0.0000 iiir . 2 . 0.0000 il . 2 . 0.0000 is . 4 . 0.0000 cfh . 5 . 0.0000 cphe . 5 . 
0.0000 i? . 18 . 0.0000 ckhe . 25 . 0.0000 eee 56 32 81 -0.0237 ? . 39 36 -0.0281 ir . 8 10 -0.0347 iir 180 711 1071 -0.0391 l 30 461 828 -0.0540 r 1429 1220 3859 -0.0622 y . 95 292 -0.0951 in . 333 1117 -0.1045 iin . 7 38 -0.1484 iiin . . 5 -0.1562 b . . 5 -0.1562 iiil . . 5 -0.1562 u 5 2 28 -0.2396 j 5 9 102 -0.3031 n . 1 23 -0.3594 g . 4 61 -0.3812 im . 1 28 -0.4375 iim 5 72 1587 -0.6772 m