#! /usr/bin/gawk -f 
# Last edited on 2004-02-02 02:56:38 by stolfi

BEGIN {
  # Collects, for every GB character seen in the "@chinword" entries
  # of the input (the "main.raw" file), a tally of its pinyin readings.

  # Tables, emptied here so that each one is an array from the start:
  split("", gbcount);  # gbcount[gb]   = number of occurrences of character {gb}.
  split("", npin);     # npin[gb]      = number of distinct readings of {gb}.
  split("", pinyin);   # pinyin[gb,k]  = the {k}-th distinct reading of {gb}, from 0.
  split("", pycount);  # pycount[gb,k] = number of occurrences of that reading.

  # Negative means "no fatal error so far"; data_error() sets it to 1.
  abort = -1;
}

# Handles one "@chinword{GB}{PINYIN}..." entry: splits the GB field into
# characters and the pinyin field into syllables, and, when the two counts
# agree, tallies each (character, syllable) pair with save_reading().
/^[@]chinword[\{]/ {

  # Extract both brace-delimited fields with a single match.  A line that
  # starts like an entry but lacks the second "{...}" field used to be
  # returned whole by gensub() and fed into the splitting below; now it
  # is reported and skipped.  Braces are bracketed so that they can never
  # be taken for interval operators.
  if (! match($0, /^[@]chinword[{](.*)[}][{](.*)[}].*$/, cwf))
    { data_warning(("malformed chinword entry \"" $0 "\"")); next; }

  gb = gbspread(cwf[1]);
  # Drop a specific two-byte sequence before counting characters
  # (presumably a GB-encoded punctuation mark -- confirm against the data).
  gsub(/[£][­]/, "", gb);
  ngb = split(gb, gbf);

  py = pyspread(cwf[2]);
  py = tolower(py);
  # Hyphens and slashes only separate syllables:
  gsub(/[-\/]/, " ", py);
  npy = split(py, pyf);

  if (ngb != npy)
    { # Expect syllable count mismatch for acronyms, multidigit numbers, percentages:
      if ((gb != "*") && (gb !~ /[0-9][.]*[0-9]/) && (gb !~ /[£][¥]/))
        { data_warning(("syllable mismatch gb = \"" gb "\" py = \"" py "\"")); }
    }
  else
    { # Same count: the i-th character is read as the i-th syllable.
      for (i = 1; i <= ngb; i++)
        { save_reading(gbf[i], pyf[i]); }
    }
  next;
}

# Every record that is not a "@chinword" entry is ignored.
{ next; }

END {
  # An "exit" executed in a rule or function (as data_error() does) still
  # runs this END rule.  {abort} is -1 unless a fatal error was recorded,
  # so leave with that status instead of printing a partial table.
  if (abort >= 0) { exit abort; }

  # One output line per character:
  #   "{COUNT} {GB} {READING}({N}),{READING}({N}),..."
  # Characters come out in the (unspecified) order of the "for ... in" scan.
  for (gb in npin)
    { printf "%7d %s ", gbcount[gb], gb;
      # Sort readings by decreasing frequency.  Entries 0..k-1 are already
      # in order; entry k is swapped down the prefix until it fits.
      for (k = 1; k < npin[gb]; k++)
        { for (j = 0; j < k; j++)
            { if (pycount[gb,k] > pycount[gb,j])
                { t = pinyin[gb,k];  pinyin[gb,k]  = pinyin[gb,j];  pinyin[gb,j]  = t;
                  t = pycount[gb,k]; pycount[gb,k] = pycount[gb,j]; pycount[gb,j] = t;
                }
            }
        }
      # Each reading is followed by a comma, including the last one.
      for (k = 0; k < npin[gb]; k++)
        { printf "%s(%d),", pinyin[gb,k], pycount[gb,k]; }
      printf "\n";
    }
}

function save_reading(gb, py,    k)
{
  # Records one occurrence of character {gb} read as {py}.
  # Updates the global tables {gbcount}, {npin}, {pinyin}, {pycount};
  # {k} is a local variable.

  # Count the occurrence, creating the character's entry on first sight:
  if (gb in npin)
    { gbcount[gb] += 1; }
  else
    { npin[gb] = 0; gbcount[gb] = 1; }

  # Look for {py} among the readings already known for {gb}:
  k = 0;
  while ((k < npin[gb]) && (! (py == pinyin[gb,k]))) { k++; }

  if (k < npin[gb])
    { # Known reading: bump its tally.
      pycount[gb,k] += 1;
    }
  else
    { # New reading: append it in slot {k} (== npin[gb]).
      pinyin[gb,k] = py;
      pycount[gb,k] = 1;
      npin[gb] = k + 1;
    }
}

function gbspread(s) {
  # Returns {s} with every pair of consecutive bytes in the range
  # \241..\376 set off by blanks, runs of blanks squeezed to one,
  # and leading and trailing blanks removed.

  # Isolate each two-byte pair:
  s = gensub(/([\241-\376][\241-\376])/, " \\1 ", "g", s);

  # Normalize the blanks (the anchored patterns can match only once):
  gsub(/[ ][ ]+/, " ", s);
  sub(/^[ ]+/, "", s);
  sub(/[ ]+$/, "", s);

  return s;
}

function pyspread(s) {
  # Returns {s} with every pinyin syllable -- a letter, then any letters,
  # "ü" or ":", then a tone digit 0..5 -- set off by blanks, runs of
  # blanks squeezed to one, and leading and trailing blanks removed.

  # Isolate each syllable:
  s = gensub(/([a-zA-Z][a-zA-Zü:]*[0-5])/, " \\1 ", "g", s);

  # Normalize the blanks (the anchored patterns can match only once):
  gsub(/[ ][ ]+/, " ", s);
  sub(/^[ ]+/, "", s);
  sub(/[ ]+$/, "", s);

  return s;
}

function data_error(msg)
{
  # Reports a fatal problem with the current input record: writes
  # "{FNR}: ** {msg}" to standard error, records the failure in the
  # global {abort}, and terminates processing with exit status 1.
  abort = 1;
  printf("%d: ** %s\n", FNR, msg) > "/dev/stderr";
  exit 1;
}

function data_warning(msg)
{
  # Reports a non-fatal problem with the current input record: writes
  # "{FNR}: !! {msg}" to standard error and lets processing continue.
  printf("%d: !! %s\n", FNR, msg) > "/dev/stderr";
}