#! /usr/bin/gawk -f
# Last edited on 2004-02-01 01:43:53 by stolfi

# Provides a Pinyin translation of GB punctuation in @chinword{}{}
# directives.
#
# Every input line of the form "@chinword{GB}{PY}..." is rewritten as
# "@chinword{GB}{PY}" (any text after the second brace group is dropped):
#
#   * If PY consists only of two or more "?" characters, it is replaced
#     by the table entry for GB; a GB code that is not in the table is
#     a fatal error.
#
#   * If GB is in the table but PY differs from the table entry, a
#     warning is printed and PY is replaced by the table entry.
#
#   * A line that starts with "@chinword{" but does not have the two
#     brace groups is a fatal error.
#
# All other lines are copied to the output unchanged.  Diagnostics go
# to "/dev/stderr", prefixed with the record number within the file.
#
# Note: we must not use characters beyond '\200' in the Pinyin
# text, because they may get confused with GB codes.

BEGIN {
  # Set to 1 by data_error() just before it exits with status 1.
  abort = -1;

  # Start with an empty table, indexed by the GB code of the symbol.
  split("", punct);

  # GB               GBhx Ucod Name
  # --               ---- ---- -------------------------------------
  punct["¡¡"] = " ";  # A1A1 3000 IDEOGRAPHIC_SPACE
  punct["¡¢"] = ",";  # A1A2 3001 IDEOGRAPHIC_COMMA
  punct["¡£"] = ".";  # A1A3 3002 IDEOGRAPHIC_FULL_STOP
  punct["¡°"] = "``"; # A1B0 201C LEFT_DOUBLE_QUOTATION_MARK
  punct["¡±"] = "''"; # A1B1 201D RIGHT_DOUBLE_QUOTATION_MARK
  punct["¡¶"] = "<<"; # A1B6 300A LEFT_DOUBLE_ANGLE_BRACKET
  punct["¡·"] = ">>"; # A1B7 300B RIGHT_DOUBLE_ANGLE_BRACKET
  punct["¡¸"] = "`";  # A1B8 300C LEFT_CORNER_BRACKET
  punct["¡¹"] = "'";  # A1B9 300D RIGHT_CORNER_BRACKET
  punct["¡º"] = "(("; # A1BA 300E LEFT_WHITE_CORNER_BRACKET
  punct["¡»"] = "))"; # A1BB 300F RIGHT_WHITE_CORNER_BRACKET
  punct["¡ø"] = "*";  # A1F8 25B2 BLACK_UP-POINTING_TRIANGLE
  punct["¡þ"] = "*";  # A1FE 3013 GETA_MARK
  punct["£¬"] = ",";  # A3AC FF0C FULLWIDTH_COMMA
  punct["£­"] = "-";  # A3AD FF0D FULLWIDTH_HYPHEN-MINUS
  punct["£º"] = ":";  # A3BA FF1A FULLWIDTH_COLON
  punct["£»"] = ";";  # A3BB FF1B FULLWIDTH_SEMICOLON
  punct["£¿"] = "?";  # A3BF FF1F FULLWIDTH_QUESTION_MARK
}

# Rewrite "@chinword{GB}{PY}" directives.
/^[@]chinword[{]/ {
  # Parse both brace groups in one pass.  Braces are written as bracket
  # expressions so they can never be read as interval expressions.
  # Without this check a malformed directive would leave the whole
  # line in both fields and be echoed back as a garbled directive.
  if (! match($0, /^[@]chinword[{](.*)[}][{](.*)[}].*$/, field)) {
    data_error(("malformed @chinword directive = \"" $0 "\""));
  }
  gb = field[1];
  py = field[2];

  if (py ~ /^[?][?]+$/) {
    # Placeholder Pinyin: the GB code must be a known punctuation symbol.
    if (! (gb in punct)) {
      data_error(("invalid punctuation code = \"" gb "\""));
    }
    py = punct[gb];
  } else if ((gb in punct) && (py != punct[gb])) {
    # Known punctuation with a different Pinyin: the table wins.
    data_warning(("discrepant punctuation gb = \"" gb "\" py = \"" py "\""));
    py = punct[gb];
  }

  printf "@chinword{%s}{%s}\n", gb, py;
  next;
}

# Any other line is copied through unchanged.
{ print; }

# Reports a fatal data error {msg} for the current record on
# "/dev/stderr" and terminates the program with exit status 1.
function data_error(msg)
{
  printf "%d: ** %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a non-fatal data warning {msg} for the current record on
# "/dev/stderr"; processing continues.
function data_warning(msg)
{
  printf "%d: !! %s\n", FNR, msg > "/dev/stderr";
}