#! /usr/bin/python3
# Last edited on 2026-02-10 06:53:04 by stolfi

# Reads the Shénnóng běncǎo jīng in UTF8 Chinese characters. Outputs
# counts of words per "recipe".
#
# Ignores Chinese spaces, punctuation, symbols, etc. Ignores #-comments.
# Ignores the introduction (section "s0").
#
# Each output line has the format "{loc} {size} {nchar}" where {nchar}
# is the total chars in the text of the line (after removing ASCII
# spaces) and {size} is the number of characters that stand for
# syllables (excluding punctuation, spaces, symbols, bullets, and
# Chinese blanks).

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp
from process_funcs import bash, basic_line_loop
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats

# Usage text printed by {arg_error}. The program takes no command line
# arguments; it reads {stdin} and writes {stdout}.
usage = "  (no arguments) reads the UTF-8 text from stdin, writes the counts to stdout\n"

def main():
  # Reads the data lines from {stdin}, writes to {stdout} one line
  # "{loc} {size} {nchar}" per data line outside the intro section,
  # and writes a summary of the counts to {stderr}.

  file_name = "stdin"

  inp.reconfigure(encoding='utf-8')
  # The output lines contain only the locus ID and decimal counts:
  sys.stdout.reconfigure(encoding='iso-8859-1')

  # Read tables of chinese character sets. A character that belongs
  # to any of these sets is not counted as a syllable:
  set_dir = "langbank/chin"
  set_names = ('invalid', 'bullets', 'symbol', 'punct', 'blank')
  charset = { k: read_chinese_char_set(f"{set_dir}/utf8-{k}.tbl") for k in set_names }

  # Count of ignored characters seen, per character set:
  count = dict.fromkeys(charset, 0)

  ndata = 0      # Count of data lines (including those of the intro).
  size_tot = 0   # Total count of syllabic chars.
  sizes = []     # List of recipe sizes, to compute statistics.

  # Patterns are compiled once here, not once per input line.
  pat_blank = re.compile(r" *([#]|$)")
  # Locus ID "<{sec}.{sub}.{lseq}>". The character positions must agree
  # with the slice {loc[1:3]} used in {process_bencao_line}:
  pat_loc = r"<s[0-9][.][0-9][.][0-9]{3}>"
  # The end-of-line is optional, so that a last line without it
  # (or a line that had it stripped) is still accepted:
  pat_line = re.compile(f"({pat_loc})[ ]+(.*)\n?$")

  def process_bencao_line(nline, line):
    nonlocal ndata, size_tot
    # Parses a line {line} assuming it is line {nline} of the file.
    # The {line} is always a string (never {None}), but may be "" if the line
    # is empty.
    #
    # Ignores the line if it is a blank or #-comment.
    #
    # Otherwise the line must be a data line, matching
    # "^<{sec}.{sub}.{lseq}> +{text}$"
    #
    # where
    #
    #   {sec} is a section number, "s0" to "s2";
    #
    #   {sub} is a one-digit subsection number;
    #
    #   {lseq} is a 3-digit integer sequential through the whole file, with gaps.
    #
    #   {text} is a string of Chinese characters in UTF-8.
    #
    # Ignores the intro section "s0".
    #
    # Increments {ndata} for each data line (even those of the intro).
    # Writes to stdout one output line for each data line that is not
    # in the intro. Aborts through {data_error} on a malformed line.

    assert line is not None, "The {line} arg must not be {None}"

    # Ignore comments and blank lines:
    if pat_blank.match(line): return

    m = pat_line.match(line)
    if m is None:
      # Invalid line format ({data_error} does not return):
      data_error(file_name, nline, "invalid line format", line)
      return

    ndata += 1

    # Parse the line into locus ID and text:
    loc = m.group(1)
    text = m.group(2)
    sec = loc[1:3]  # Section s-number, "s0" to "s2".

    # Ignore the introduction:
    if sec == "s0": return

    # Should we debug the parag?
    debug = False

    # Cleanup and count characters:
    text = text.replace(' ', '')
    nchar = len(text)
    size = 0 # Number of not-ignored chinese characters.
    for ch in text:
      ok = True
      if debug: err.write(f"!! ch = '{ch}'")
      # A char that is in two or more sets is counted once in each:
      for k, chars in charset.items():
        if ch in chars:
          if debug: err.write(f" {k}")
          count[k] += 1
          ok = False
      if ok: size += 1
      if debug: err.write("\n")

    sizes.append(size)
    size_tot += size
    if debug: err.write(f"!! total {size} syllables\n")
    out.write(f"{loc} {size:5d} {nchar:5d}\n")
    return
    # ......................................................................

  nread = basic_line_loop(inp, process_bencao_line)
  out.flush()

  err.write(f"{nread:5d} total lines\n")
  err.write(f"{ndata:5d} recipes\n")
  for k, nk in count.items():
    if nk != 0:
      err.write(f"{nk:5d} ignored chars of type '{k}'\n")
  err.write(f"{size_tot:5d} total syllables\n")
  # NOTE(review): {ndata} includes the lines of the intro "s0", which
  # contribute no syllables to {size_tot}; confirm that this is the
  # intended denominator.
  if ndata > 0:
    # Guarded so that an input with no data lines does not crash here:
    err.write(f"{size_tot/ndata:8.2f} avg syllables/recipe\n")
  compute_and_print_stats("parag sizes", sizes)
  return
  # ----------------------------------------------------------------------

def arg_error(msg):
  # Writes the message {msg} and the {usage} text to {stderr},
  # then terminates the program with status 1.
  err.write(f"** {msg}\n")
  err.write("usage:\n")
  err.write(usage)
  sys.exit(1)
  # ----------------------------------------------------------------------

def data_error(fname, nline, msg, line):
  # Reports to {stderr} the error {msg} found on line {nline} of file
  # {fname}, shows the offending {line}, and terminates the program
  # with status 1.
  err.write(f"{fname}:{nline}: ** {msg}\n")
  err.write(f"  [[{line}]]\n")
  sys.exit(1)
  # ----------------------------------------------------------------------

if __name__ == "__main__":
  main()