#! /usr/bin/gawk -f
# Last edited on 2025-07-03 10:41:32 by stolfi

# Reads a file with lines of the format "{LOC} {PTEXT}" where
# {LOC} is a line locator and {PTEXT} is a sequence of one or more words
# in UTF-8 (such as pinyin or EVA), each preceded and followed by at least
# one punctuation [- .,'=«»].
#
# The locator {LOC} should have the form "<{SEC}.{NLIN}>"
# where {SEC} is a section number (string of [a-z0-9]), {NLIN} is an entry
# number (a string of [A-Za-z0-9.]).  It must be the first field of the
# line and must be followed by a blank.
#
# Ignores all punctuation and reduces all words to lowercase.
# Ignores blank lines and '#'-comments.
# Ignores inline comments and markers '<%>', '<$>', '<->', '<~>'.
# NOTE(review): the inline comment delimiter was missing from this header
# (presumably '<!...>' -- TODO confirm).  The word cleanup itself is
# delegated to {extract_words}, which is defined outside this section.
#
# Outputs a file with one line per parag, in the format "{LOC} {NW}" where
# {LOC} is as above, and {NW} is the number of words in input line {LOC}.
#
# At the end, writes the number of data lines processed to standard error.

BEGIN {
  ndat = 0  # Num of data lines processed.
}

# Strip '#'-comments from every line.  Assigning to $0 makes awk
# re-split the fields, so the rules below see the cleaned line.
{
  gsub(/[#].*$/, "", $0);
}

# Skip lines that are blank (possibly after comment removal).
/^ *$/ {
  next;
}

# Data line.  The pattern is anchored at the start of the line so that
# the locator is guaranteed to be field $1; a locator-like string in the
# middle of a line no longer makes a malformed line look valid.
/^[ \t]*<[0-9a-z]+[.][0-9A-Za-z.]+>[ ]/ {
  ndat++;
  loc = $1;
  lin = extract_words($0);   # Cleaned-up words of the line (helper defined elsewhere).
  nwds = split(lin, wds);    # Count the blank-separated words.
  printf "%s %d\n", loc, nwds;
  next;
}

# Anything else is neither blank nor a valid data line.
{
  data_error("bad input format");
}

END {
  printf "%6d data lines read\n", ndat > "/dev/stderr"
}