#! /usr/bin/gawk -f 
# Last edited on 2004-02-02 05:02:46 by stolfi

BEGIN {
  # Purpose of this script, as implemented by the rules below: for each
  # "@chinword{...}{...}" line, split the first brace group into
  # two-byte characters and print, per input entry, the number of
  # ordinary tokens found between successive quotation/bracket marks,
  # while checking that those marks are properly nested.

  # Error flag: stays negative until data_error() sets it to 1.
  abort = -1;

  # Per-entry scanning state.
  split("", stack);  # Opening marks that are still unclosed.
  level = 0;         # Number of valid entries in {stack}.
  dirty = 0;         # Nonzero once an output line has been started.
  wct = 0;           # Tokens seen since the last mark.

  # Maps each opening mark to its matching closing mark.
  # NOTE(review): the keys are meant to be the two-byte GB codes named
  # in the trailing comments -- confirm that the file is stored and
  # read with an encoding that preserves those bytes.
  split("", closer_of);
  closer_of["กฐ"] = "กฑ"; # {LEFT,RIGHT}_DOUBLE_QUOTATION_MARK
  closer_of["กถ"] = "กท"; # {LEFT,RIGHT}_DOUBLE_ANGLE_BRACKET
  closer_of["กธ"] = "กน"; # {LEFT,RIGHT}_CORNER_BRACKET
  closer_of["กบ"] = "กป"; # {LEFT,RIGHT}_WHITE_CORNER_BRACKET

  # Inverse table: maps each closing mark to its opening mark.
  split("", opener_of);
  for (opn in closer_of)
    { cls = closer_of[opn];
      opener_of[cls] = opn;
    }

  # Printable stand-in written to the output for each mark.
  split("", punct);
  punct["กฐ"] = "``";   # A1B0 201C LEFT_DOUBLE_QUOTATION_MARK
  punct["กฑ"] = "''";   # A1B1 201D RIGHT_DOUBLE_QUOTATION_MARK
  punct["กถ"] = "ซ";    # A1B6 300A LEFT_DOUBLE_ANGLE_BRACKET
  punct["กท"] = "ป";    # A1B7 300B RIGHT_DOUBLE_ANGLE_BRACKET
  punct["กธ"] = "[";    # A1B8 300C LEFT_CORNER_BRACKET
  punct["กน"] = "]";    # A1B9 300D RIGHT_CORNER_BRACKET
  punct["กบ"] = "{";    # A1BA 300E LEFT_WHITE_CORNER_BRACKET
  punct["กป"] = "}";    # A1BB 300F RIGHT_WHITE_CORNER_BRACKET
}

# Blank lines and lines whose first non-space character is "#" carry
# no data: skip them.
$0 ~ /^[ ]*($|[\#])/ {
  next;
}

# "@fix " lines are ignored; in particular they do not end the current
# entry the way other "@" lines do.
$0 ~ /^[@]fix / {
  next;
}

/^[@]chinword[\{]/ {
  # Handles an "@chinword{GB}{...}" line.  The text of the first brace
  # group is split by gbspread() into blank-separated tokens, and the
  # tokens are scanned in order:
  #   * an opening mark prints the token count accumulated so far and
  #     the mark's printable form, then is pushed onto {stack};
  #   * a closing mark prints the same, then is checked against the
  #     top of {stack};
  #   * any other token just increments {wct}.
  # The output line is started (with the current FNR) at the first mark
  # of an entry and is terminated later by the generic "@" rule.
  
  gb = gbspread(gensub(/^[@]chinword{(.*)}{.*}.*$/, "\\1", "g", $0));
  ngb = split(gb, gbf);

  for (i = 1; i <= ngb; i++) 
    { gbi = gbf[i];
      if (gbi in closer_of)
        { # Opening mark (the keys of {closer_of} are the openers).
          if (! dirty) { printf "%7d:", FNR; dirty = 1; }
          printf " %d %s", wct, punct[gbi]; wct = 0;
          stack[level] = gbi; 
          level++;
        }
      else if (gbi in opener_of)
        { # Closing mark (the keys of {opener_of} are the closers).
          # Start the output line here too: otherwise an entry whose
          # first mark is a stray closer would print without the FNR
          # prefix and would never get its terminating newline.
          if (! dirty) { printf "%7d:", FNR; dirty = 1; }
          printf " %d %s", wct, punct[gbi]; wct = 0;
          if (level == 0) 
            { printf " [** extra closer]"; }
          else 
            { level--;
              # No newline in this diagnostic, so that the entry's
              # record stays on a single output line, like the
              # "extra closer" diagnostic above.
              if (stack[level] != opener_of[gbi])
                { printf " [** mismatch]"; }
            }
        }
      else
        { wct++; }
    }
  next;
}

# Any other "@" line ends the current entry: report marks that are
# still open, terminate the pending output line with the final token
# count, and reset the scanning state.
$0 ~ /^[@]/ {
  if (level > 0)
    { printf(" [** end at level %d]", level); }
  if (dirty)
    { printf(" %d \n", wct); }
  level = 0;
  wct = 0;
  dirty = 0;
  next;
}

# Catch-all: any line not consumed by the rules above is malformed.
{
  data_error("bad format");
}

END {
  # END still runs after data_error() has called "exit"; in that case
  # the error was already reported, so just propagate the failure
  # status instead of printing more output.
  if (abort > 0) { exit abort; }

  # Flush the last entry exactly as the generic "@" rule does, since
  # the input may end without another "@" line after the last
  # "@chinword" line.
  if (level > 0) { printf " [** end at level %d]", level; }
  if (dirty) 
    { printf " %d \n", wct; }
  else if (level > 0) 
    { printf "\n"; }
}

function gbspread(s) {
  # Returns a copy of {s} in which every pair of consecutive bytes in
  # the range \241-\376 is set off by single blanks, runs of blanks are
  # collapsed to one blank, and leading/trailing blanks are removed.
  # NOTE(review): the byte-range bracket expression presumably needs a
  # single-byte locale (e.g. LC_ALL=C) to match as intended -- confirm
  # how the script is invoked.

  # Surround each two-byte character with blanks.
  s = gensub(/([\241-\376][\241-\376])/, " \\1 ", "g", s);

  # Squeeze blank runs, then trim both ends (each anchored pattern can
  # match at most once, so a single substitution suffices).
  gsub(/[ ][ ]+/, " ", s);
  sub(/[ ]+$/, "", s);
  sub(/^[ ]+/, "", s);
  return s;
}

function data_error(msg) 
{
  # Reports a fatal input error: writes "{FNR}: ** {msg}" to standard
  # error, sets the global {abort} flag to 1, and terminates the run
  # with exit status 1.
  abort = 1;
  printf("%d: ** %s\n", FNR, msg) > "/dev/stderr";
  exit 1;
}

function data_warning(msg) 
{
  # Reports a non-fatal input problem: writes "{FNR}: !! {msg}" to
  # standard error and returns, so processing continues.
  printf("%d: !! %s\n", FNR, msg) > "/dev/stderr";
}