# Last edited on 2026-02-22 12:55:01 by stolfi

INTERESTING WORDS

  Word frequency tables:

    # Per-file token frequency table (csh syntax; ${f:r} is $f without its
    # extension).  Pipeline stages:
    #   1. gawk: drop lines that are blank or start with "#" (after optional
    #      blanks); on the others, replace the leading (possibly empty) run
    #      of [-.0-9a-zA-Z] characters by a blank, and each blank followed
    #      by one of "-", "=", "{", "}" by a single blank.
    #   2. tr / egrep: one token per line, empty lines removed.
    #   3. sort | uniq -c: count occurrences (field 1 = count, field 2 = token).
    #   4. map-field (three calls): project tool, not shown here -- presumably
    #      looks up field inField in the given table and stores the result
    #      as field outField; TODO confirm, including what forgiving=1 does.
    #   5. gawk: print the count, then field 3, followed by "=" and field 4
    #      unless fields 3 and 4 are equal, and by "=" and field 5 unless
    #      fields 5 and 3 are equal.
    #   6. sort: by count (numeric, descending), then by field 2 onward.
    #   7. compute_freqs_from_counts.py: project tool, not shown here --
    #      presumably adds the fraction column seen in the listings below;
    #      TODO confirm.
    # The table is written to ${f:r}.wfr and its first 100 lines are shown.
    foreach f ( bencao.big5 vstars.eva )
      echo " "; echo "=== ${f:r} ==="
      cat $f \
        | gawk \
            ' /^ *([#]|$)/{ next; } \
              //{ \
                gsub(/^[-.0-9a-zA-Z]*/, " ", $0); \
                gsub(/[ ][-={}]/, " ", $0); \
                print; \
              } ' \
        | tr ' ' '\012' \
        | egrep '.' \
        | sort | uniq -c | expand \
        | map-field \
            -v table=big5-to-html.tbl \
            -v inField=2 -v outField=3 -v forgiving=1 \
        | map-field \
            -v table=html-to-py.tbl \
            -v inField=3 -v outField=4 -v forgiving=1 \
        | map-field \
            -v table=html-to-meaning.tbl \
            -v inField=3 -v outField=5 -v forgiving=1 \
        | gawk '//{ print $1, ($3 ($3==$4 ? "" : ("=" $4)) ($5==$3 ? "" : ("=" $5))); }' \
        | sort -b -k1nr -k2 \
        | compute_freqs_from_counts.py -encoding bytes -total TOTAL \
        > ${f:r}.wfr
      head -100 ${f:r}.wfr
    end

    === bencao ===
    362 0.02823 生=(sheng1,5:sheng5)
    358 0.02791 味=(wei4)
    352 0.02745 治=(zhi4)
    313 0.02441 名=(ming2)
    308 0.02402 一=(yi1)
    299 0.02331 氣=(qi4)
    293 0.02285 寒=(han2)
    245 0.01910 谷=(gu3,yu4)
    198 0.01544 熱=(re4)
    168 0.01310 平=(ping2)
    161 0.01255 身=(shen1,juan1)
    154 0.01201 不=(bu4,5:bu5,bu2)
    149 0.01162 久=(jiu3)
    144 0.01123 中=(zhong1,zhong4)
    144 0.01123 川=(chuan1)
    143 0.01115 服=(fu2,fu4,5:fu5)
    136 0.01060 苦=(ku3)
    136 0.01060 輕=(qing1)
    132 0.01029 山=(shan1)
    129 0.01006 溫=(wen1)

    === vstars ===
    189 0.01802 aiin=aiin
    189 0.01802 chedy=chedy
    155 0.01477 qokeey=qokeey
    146 0.01392 ar=ar
    134 0.01277 qokeedy=qokeedy
    131 0.01249 al=al
    127 0.01211 daiin=daiin
    121 0.01153 chey=chey
    119 0.01134 qokaiin=qokaiin
    115 0.01096 shedy=shedy
    96 0.00915 okeey=okeey
    96 0.00915 ol=ol
    95 0.00906 okaiin=okaiin
    89 0.00848 qokain=qokain
    76 0.00724 otaiin=otaiin
    75 0.00715 cheey=cheey
    70 0.00667 shey=shey
    69 0.00658 okain=okain
    63 0.00601 chol=chol
    63 0.00601 oteey=oteey

  Extract list of kth word from each recipe, and their distributions:

    # For k = 1..4 and each file: take field k of the first data line (one
    # that is neither blank nor starts with "#") following each line that
    # starts with "##" -- presumably the recipe header; confirm against the
    # input files -- after the same two clean-up substitutions as in the
    # loop above.  "fst" is set by a "##" line and cleared once a data line
    # has been used, so at most one line per "##" line contributes.
    # The tokens are saved in ${f:r}-${k}.tks, then counted and annotated
    # into ${f:r}-${k}.wfr, whose first 5 lines are shown.
    # NOTE(review): unlike the first loop, the final gawk here compares
    # field 5 with field 4 (not field 3), and the counts go through
    # compute_freqs.gawk instead of compute_freqs_from_counts.py -- confirm
    # that both differences are intentional.
    foreach k ( 1 2 3 4 )
      foreach f ( bencao.big5 vstars.eva )
        printf "\n\n=== %s[%s] ===\n\n" "${f:r}" "$k"
        cat $f \
          | gawk -v which=${k} \
              ' /^[#][#]/{ fst = 1; next; } \
                /^ *([#]|$)/{ next; } \
                (fst){ \
                  gsub(/^[-.0-9a-zA-Z]*/, " ", $0); \
                  gsub(/[ ][-={}]/, " ", $0); \
                  print $(which); fst = 0; \
                } ' \
          | tr ' ' '\012' \
          | egrep '.' \
          > ${f:r}-${k}.tks
        cat ${f:r}-${k}.tks \
          | sort | uniq -c | expand \
          | map-field \
              -v table=big5-to-html.tbl \
              -v inField=2 -v outField=3 \
              -v forgiving=1 \
          | map-field \
              -v table=html-to-py.tbl \
              -v inField=3 -v outField=4 \
              -v forgiving=1 \
          | map-field \
              -v table=html-to-meaning.tbl \
              -v inField=3 -v outField=5 -v forgiving=1 \
          | gawk '//{ print $1, ($3 ($3==$4 ? "" : ("=" $4)) ($5==$4 ? "" : ("=" $5))); }' \
          | sort -b -k1nr -k2 \
          | compute_freqs.gawk \
          > ${f:r}-${k}.wfr
        head -5 ${f:r}-${k}.wfr
      end
    end

    === bencao[1] ===
    19 0.05322 白=(bai2,5:bai5)
    15 0.04202 石=(shi2,dan4)
    6 0.01681 紫=(zi3)
    5 0.01401 大=(da4,dai4)
    5 0.01401 水=(shui3)

    === vstars[1] ===
    6 0.01829 daiin=daiin
    5 0.01524 polaiin=polaiin
    5 0.01524 tchedy=tchedy
    4 0.01220 pchedal=pchedal
    4 0.01220 pcheor=pcheor

    === bencao[2] ===
    15 0.04202 實=(shi2)
    11 0.03081 石=(shi2,dan4)
    7 0.01961 草=(cao3)
    6 0.01681 參=(can1,cen1,shen1,san1)
    6 0.01681 芝=(zhi1)

    === vstars[2] ===
    7 0.02134 ar=ar
    6 0.01829 shedy=shedy
    5 0.01524 chey=chey
    5 0.01524 qokaiin=qokaiin
    4 0.01220 cheo=cheo

    === bencao[3] ===
    169 0.47339 一=(yi1)
    111 0.31092 味=(wei4)
    13 0.03641 子=(zi5,zi3,zi2)
    3 0.00840 實=(shi2)
    3 0.00840 草=(cao3)

    === vstars[3] ===
    9 0.02744 shedy=shedy
    7 0.02134 qokain=qokain
    5 0.01524 chedy=chedy
    5 0.01524 okain=okain
    5 0.01524 qokaiin=qokaiin

    === bencao[4] ===
    169 0.47339 名=(ming2)
    44 0.12325 苦=(ku3)
    36 0.10084 一=(yi1)
    32 0.08964 辛=(xin1)
    26 0.07283 味=(wei4)

    === vstars[4] ===
    9 0.02744 qokeey=qokeey
    7 0.02134 shedy=shedy
    6 0.01829 qokeedy=qokeedy
    5 0.01524 oteedy=oteedy
    4 0.01220 okeey=okeey