#! /usr/bin/gawk -f
# Last edited on 2012-05-05 19:49:35 by stolfilocal

# Sampling functions for chin/red
# Chinese - Dream of the Red Mansion in GB code,
# converted to uniquified pinyin.

# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each GB
# byte-pair that corresponds to a Chinese character to a uniquified
# pinyin; GB punctuation codes to strings in guillemets, and symbols
# (stars, non-Chinese letters, etc.) to strings in parentheses.
#
# The functions {data_error} and {arg_error} and the table {wmap} are
# not defined in this file; they must be provided by the including program.

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec} and initializes the global string
  # {gb_bytes}, which {smp_is_GB_code} uses as its set of valid bytes.
  # Must therefore be called before any function that tests GB codes.
  # The parameter {smp} is not used here.
  #
  # Calls {data_error} if {sec} is not "tot.1" (the only section known
  # to this sample).

  if (sec != "tot.1") {
    data_error(("invalid section \"" sec "\""));
  }

  # All the characters that may appear as either half of a GB byte-pair.
  # Global on purpose: awk variables not listed as parameters are global.
  gb_bytes = ( \
    "¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" \
    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞ" \
    "ßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþ" \
  );

  # The dynamically built patterns below are kept for reference only.
  # This does not work, unfortunately:
  # # Pattern for Chinese GB codes (just in case):
  # # This does not work, unfortunately:
  # # gb_code_pat = ( "^[" gb_bytes "][" gb_bytes "]$" );
  #
  # # Patterns for lowercase pinyin syllable
  # # with numeric tone and disambiguating suffix:
  # pinyin_initial_pat = "([csz][h]|[bcdfghjklmnpqrstwxyz])?";
  # pinyin_middle_pat = "([aeiouüê]|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)";
  # pinyin_final_pat = "([n][g]?|[r])?";
  # pinyin_tone_pat = "[0-9]";
  # pinyin_dupl_pat = "([.][1-9][0-9]*)?";
  # pinyin_word_pat = ( \
  #   "^" \
  #   pinyin_initial_pat pinyin_middle_pat pinyin_final_pat \
  #   pinyin_tone_pat pinyin_dupl_pat \
  #   "$" \
  # );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns true (1) if {wd} is acceptable for this sample, false (0)
  # otherwise.  Accepts a pinyin word with numeric tone and
  # disambiguating suffix, or a raw GB code.
  # The parameters {smp}, {sec} and {type} are not used here.

  return smp_is_GB_code(wd) || smp_is_pinyin_word(wd);
}

function smp_is_pinyin_word(wd)
{
  # Returns true (1) if {wd} ends with a lowercase pinyin syllable:
  # optional initial, vowel group, optional final "n", "ng" or "r",
  # one tone digit, and an optional ".N" disambiguating suffix.
  #
  # NOTE(review): the pattern is anchored at the end ("$") but not at
  # the start, unlike the commented-out {pinyin_word_pat} in
  # {smp_define_patterns}, so any leading text before the syllable is
  # tolerated.  Confirm whether that is intended before adding "^".

  return (wd ~ /([csz][h]|[bcdfghjklmnpqrstwxyz])?([aeiou]|ü|ê|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)([n][g]?|[r])?[0-9]([.][1-9][0-9]*)?$/);
}

function smp_is_GB_code(wd)
{
  # Returns true (1) if {wd} has length 2 and both of its characters
  # occur in the global string {gb_bytes}; false (0) otherwise.
  # If {gb_bytes} has not been set by {smp_define_patterns}, it is the
  # empty string and this function returns false for every {wd}.

  return \
    (length(wd) == 2) &&
    (index(gb_bytes, substr(wd,1,1)) > 0) &&
    (index(gb_bytes, substr(wd,2,1)) > 0);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of word {wd}, whose provisional type is
  # {type} and which was found in the text section {cursec}.
  #
  # Returns "n" for words that are to be deleted (outside section {sec},
  # in titles, or punctuation other than "="), "x" for words in material
  # that is not prose text, and otherwise the type code, which may have
  # been changed from "a" to "p" or "s" after looking {wd} up in {wmap}.
  #
  # Calls {arg_error} if {sec} is not "tot.1", and {data_error} if a
  # type "a" word is not a GB byte-pair.
  # The parameters {smp} and {curlin} are not used here.  Assignments to
  # {type} and {wd} below are local: awk passes scalars by value.

  # Delete anything outside section {sec}:
  if (sec == "tot.1") {
    if (cursec !~ /^{b[12]}/) { return "n"; }
  } else {
    # Report the offending argument {sec} in the message.
    arg_error(("bad subsection \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not prose text:
  if (cursec !~ /{c[1-9][0-9]*}{tx}$/) { return "x"; }

  if (type == "a") {
    # The word should be actually a GB byte-pair.
    if (! smp_is_GB_code(wd)) {
      data_error(("malformed GB code \"" wd "\""));
    }
    if (wd in wmap) {
      # The shape of the mapped string tells what the code stands for:
      # «...» is punctuation, (...) is a symbol, anything else is alpha.
      wd = wmap[wd];
      if (wd ~ /^[«].+[»]$/) {
        type = "p";
      } else if (wd ~ /^[(].+[)]$/) {
        type = "s";
      }
    }
    # If the GB code is not in the table, assume it is alpha.
  }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the cleaned-up form of word {wd}: its {wmap} translation if
  # it has one, and, if the result starts with an ASCII letter, that
  # result lowercased with "u:" rewritten as "ü" and "e^" as "ê".
  # The parameters {smp}, {sec} and {type} are not used here.

  # Apply the word-map table
  if (wd in wmap) { wd = wmap[wd]; }

  if (wd ~ /^[a-zA-Z]/) {
    # Chinese character in pinyin.
    # reencode "u:" to "ü" and "e^" to "ê"
    wd = tolower(wd);
    gsub(/[u][:]/, "ü", wd);
    gsub(/[e][\^]/, "ê", wd);
  }
  return wd;
}