#! /usr/bin/gawk -f
# Last edited on 2012-05-05 19:50:09 by stolfilocal

# Sampling functions for chin/voa
# Chinese - Voice of America newscasts in pinyin.
# To be included in wds-to-tlw

# Validates the section name {sec}; the only section accepted here is
# "tot.1".  Any other value is reported through data_error (defined
# outside this file).  No patterns are actually defined: the attempt to
# build them as strings is kept below, commented out, for reference.
# The {smp} argument is not used.
function smp_define_patterns(smp,sec)
{
    if (sec != "tot.1") {
        data_error(("invalid section \"" sec "\""));
    }

    # This does not work, unfortunately:
    #   # Patterns for lowercase pinyin syllable
    #   # with numeric tone and disambiguating suffix:
    #   pinyin_initial_pat = "([csz][h]|[bcdfghjklmnpqrstwxyz])?";
    #   pinyin_middle_pat = "([aeiouüê]|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)";
    #   pinyin_final_pat = "([n][g]?|[r])?";
    #   pinyin_tone_pat = "[0-9]";
    #   pinyin_token_pat = ( \
    #     "^" \
    #     pinyin_initial_pat pinyin_middle_pat pinyin_final_pat \
    #     pinyin_tone_pat \
    #     "$" \
    #   );
}

# Returns 1 if {wd} should be counted as a good token, 0 otherwise.
# A token is good when smp_is_pinyin_token accepts it.  The {smp},
# {sec} and {type} arguments are not used.
function smp_is_good_token(smp,sec,type,wd)
{
    # Accept pinyin wd with numeric tone and disambiguating suffix:
    return smp_is_pinyin_token(wd);
}

# Returns 1 if {wd} ends with a pinyin syllable, 0 otherwise.  The
# syllable is: optional initial consonant(s), a vowel or vowel group,
# an optional final "n", "ng" or "r", one tone digit, and an optional
# disambiguating suffix of the form ".N" where N is a positive integer
# without leading zeros.
#
# NOTE(review): the pattern is anchored at the end ("$") but not at the
# beginning, so any {wd} whose *tail* is a valid syllable is accepted.
# The commented-out reference pattern in smp_define_patterns uses both
# "^" and "$".  Confirm whether the missing "^" is intentional before
# changing it; callers may depend on the current behavior.
function smp_is_pinyin_token(wd)
{
    return (wd ~ /([csz][h]|[bcdfghjklmnpqrstwxyz])?([aeiou]|ü|ê|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)([n][g]?|[r])?[0-9]([.][1-9][0-9]*)?$/);
}

# Decides the final type code of a token, given the section {sec} being
# sampled, the section tag {cursec} where the token occurs, its current
# {type} and its text {wd}.  Returns:
#   "n"    for tokens outside the "{sN}" sections, tokens in titles
#          (section tag ending in "{tt}" or "{cn}"), and punctuation
#          (type "p") other than "=";
#   "x"    for tokens whose section tag is not exactly "{sN}";
#   {type} unchanged, otherwise.
# An unsupported {sec} is reported through arg_error (defined outside
# this file).  The {smp} and {curlin} arguments are not used.
function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
    # Delete anything outside section {sec}:
    if (sec == "tot.1") {
        if (cursec !~ /^{s[1-9][0-9]*}/) { return "n"; }
    } else {
        # Report the offending section name.  (This used to reference an
        # unset variable, so the message always showed an empty name.)
        arg_error(("bad subsection \"" sec "\""));
    }

    # Delete titles:
    if (cursec ~ /{(tt|cn)}$/) { return "n"; }

    # Reject any material that is not prose text:
    if (cursec !~ /^{s[1-9][0-9]*}$/) { return "x"; }

    # Discard punctuation other than parag breaks:
    if ((type == "p") && (wd != "=")) { return "n"; }

    return type;
}

# Returns the cleaned-up form of token {wd}.  Tokens that begin with an
# ASCII letter are folded to lowercase, have "u:" rewritten as "ü" and
# "e^" as "ê", and have every hyphen replaced by a blank.  All other
# tokens are returned unchanged.  The {smp}, {sec} and {type} arguments
# are not used.
function smp_fix_token(smp,sec,type,wd)
{
    # Assume that the word table maps each GB byte-pair that corresponds to
    # a Chinese character to a uniquified pinyin; GB codes of punctuation
    # to strings in guillemets; and other GB codes
    # (symbols, non-Chinese letters, etc.) to strings in parentheses.

    if (wd ~ /^[a-zA-Z]/) {
        # Chinese character in pinyin.
        # Remap "u:" to "ü" and "e^" to "ê":
        wd = tolower(wd);
        gsub(/[u][:]/, "ü", wd);
        gsub(/[e][\^]/, "ê", wd);
        # Break at hyphens:
        gsub(/[-]/, " ", wd);
    }
    return wd;
}