#! /usr/bin/gawk -f
# Last edited on 2004-02-02 02:56:38 by stolfi

# Extracts the pinyin reading of each GB character from the
# "main.raw" file.
#
# Input:  records of the form "@chinword{GB}{PINYIN}...".  Records that
#         do not start with "@chinword{" are ignored.
# Output: one line per distinct GB character, in the (unspecified) order
#         of "for (gb in npin)":
#             COUNT GBCHAR READING(N),READING(N),...
#         with the readings sorted by decreasing frequency N.
#
# Global tables, all indexed by the GB character {gb}:
#   gbcount[gb]   number of occurrences of {gb};
#   npin[gb]      number of distinct readings seen for {gb};
#   pinyin[gb,k]  the {k}-th distinct reading, for k in 0..npin[gb]-1;
#   pycount[gb,k] number of occurrences of that reading.
#
# NOTE(review): the patterns below match single high-bit bytes
# (e.g. [\241-\376]), so the script presumably has to run in a
# single-byte locale such as LC_ALL=C -- confirm against the caller.

BEGIN {
  # Exit status requested by data_error(); negative means "no error".
  abort = -1;
  split("", npin);
  split("", gbcount);
  split("", pinyin);
  split("", pycount);
}

/^[@]chinword[{]/ {
  # gensub() returns its target unchanged when the pattern does not
  # match, which would make the whole record pass for both fields.
  # Reject such records explicitly instead.
  if ($0 !~ /^[@]chinword[{].*[}][{].*[}].*$/) {
    data_warning(("malformed @chinword line \"" $0 "\""));
    next;
  }

  # First field: the GB text, split into one token per two-byte character.
  gb = gbspread(gensub(/^[@]chinword[{](.*)[}][{].*[}].*$/, "\\1", "g", $0));
  # Delete the byte pair 0xA3 0xAD (presumably the GB2312 full-width
  # hyphen -- confirm).  It is written in octal so that the pattern does
  # not depend on the encoding of this script file.
  gsub(/\243\255/, "", gb);
  ngb = split(gb, gbf);

  # Second field: the pinyin text, split into one token per syllable.
  py = pyspread(gensub(/^[@]chinword[{].*[}][{](.*)[}].*$/, "\\1", "g", $0));
  py = tolower(py);
  gsub(/[-\/]/, " ", py);
  npy = split(py, pyf);

  if (ngb != npy) {
    # Expect syllable count mismatch for acronyms, multidigit numbers,
    # percentages (0xA3 0xA5 is presumably the GB2312 full-width "%"):
    if ((gb != "*") && (gb !~ /[0-9][.]*[0-9]/) && (gb !~ /\243\245/)) {
      data_warning(("syllable mismatch gb = \"" gb "\" py = \"" py "\""));
    }
  } else {
    for (i = 1; i <= ngb; i++) {
      save_reading(gbf[i], pyf[i]);
    }
  }
  next;
}

END {
  # "exit" inside data_error() still runs this block; do not print a
  # report built from partial data in that case.
  if (abort >= 0) { exit abort; }
  for (gb in npin) {
    print_readings(gb);
  }
}

function print_readings(gb,   k) {
  # Prints the report line for character {gb}: its total count, the
  # character itself, and its readings by decreasing frequency.
  sort_readings(gb);
  printf "%7d %s ", gbcount[gb], gb;
  for (k = 0; k < npin[gb]; k++) {
    printf "%s(%d),", pinyin[gb,k], pycount[gb,k];
  }
  printf "\n";
}

function sort_readings(gb,   k, j, t) {
  # Sorts the readings of {gb} in place by decreasing frequency.
  # Entries 0..k-1 are already sorted when entry {k} is processed; the
  # swaps sift it into position and shift the smaller ones up.
  for (k = 1; k < npin[gb]; k++) {
    for (j = 0; j < k; j++) {
      if (pycount[gb,k] > pycount[gb,j]) {
        t = pinyin[gb,k];  pinyin[gb,k]  = pinyin[gb,j];  pinyin[gb,j]  = t;
        t = pycount[gb,k]; pycount[gb,k] = pycount[gb,j]; pycount[gb,j] = t;
      }
    }
  }
}

function save_reading(gb, py,   k) {
  # Records one occurrence of character {gb} with reading {py} in the
  # global tables, adding a new reading slot if {py} was not seen
  # before for {gb}.  {k} is a local variable, not a parameter.
  if (! (gb in npin)) {
    npin[gb] = 0;
    gbcount[gb] = 0;
  }
  gbcount[gb]++;
  for (k = 0; k < npin[gb]; k++) {
    if (py == pinyin[gb,k]) {
      pycount[gb,k]++;
      return;
    }
  }
  # First time this reading is seen for {gb}: append it.
  k = npin[gb];
  pinyin[gb,k] = py;
  pycount[gb,k] = 1;
  npin[gb]++;
  return;
}

function squeeze_blanks(s) {
  # Returns {s} with runs of spaces collapsed to one space and with
  # leading and trailing spaces removed.
  gsub(/[ ][ ]+/, " ", s);
  gsub(/^[ ]+/, "", s);
  gsub(/[ ]+$/, "", s);
  return s;
}

function gbspread(s) {
  # Returns {s} with every pair of bytes in the range 0xA1..0xFE set
  # off by single spaces, so that split() yields one token per pair.
  s = gensub(/([\241-\376][\241-\376])/, " \\1 ", "g", s);
  return squeeze_blanks(s);
}

function pyspread(s) {
  # Returns {s} with every pinyin syllable (a letter, then letters,
  # byte 0xFC or ":", then a tone digit 0-5) set off by single spaces.
  # NOTE(review): \374 is "u-umlaut" in ISO Latin-1; it is written in
  # octal on the assumption that the original literal was that byte.
  s = gensub(/([a-zA-Z][a-zA-Z\374:]*[0-5])/, " \\1 ", "g", s);
  return squeeze_blanks(s);
}

function data_error(msg) {
  # Reports a fatal input error at the current record and terminates
  # with status 1; {abort} tells the END block to skip the report.
  printf "%d: ** %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_warning(msg) {
  # Reports a non-fatal input problem at the current record.
  printf "%d: !! %s\n", FNR, msg > "/dev/stderr";
}