#! /usr/bin/gawk -f
# Last edited on 2025-07-03 10:41:32 by stolfi

# Reads a file with lines of the format "{LOC} {PTEXT}", where
# {LOC} is a line locator and {PTEXT} is a sequence of one or more words in UTF-8
# (such as pinyin or EVA), each preceded and followed by at least one
# punctuation character from [- .,'=«»].
#
# The locator {LOC} should have the form "<{SEC}.{NLIN}>",
# where {SEC} is a section number (a string of [a-z0-9]) and {NLIN} is an
# entry number (a string of [A-Za-z0-9.]).

# Ignores all punctuation and reduces all words to lowercase.
# Ignores blank lines and '#'-comments.
# Ignores inline comments '<!...>' and markers '<%>', '<$>', '<->', '<~>'.

# Outputs a file with one line per parag, in the format "{LOC} {NW}" where 
# {LOC} is as above, and {NW} is the number of words in input line {LOC}.

BEGIN {
  # Running count of data lines handled so far; it is reported
  # on the standard error stream when the input is exhausted.
  ndat = 0;
}

# For every input line: delete a '#'-comment, i.e. everything from the
# first '#' through the end of the line.  A single substitution suffices
# because the match always extends to the end of the record.
{ sub(/[#].*$/, "", $0) }

# Discard lines that are empty or consist only of spaces (this includes
# lines that held nothing but a '#'-comment before it was stripped).
$0 ~ /^ *$/ { next }

# Data line: a locator "<{SEC}.{NLIN}>" at the start of the line (optional
# leading blanks allowed), followed by a space and the text.
# The pattern is anchored so that {loc = $1} is guaranteed to be the locator;
# a line with a locator-like token elsewhere falls through to the
# format-error rule instead of being reported under a bogus locator.
/^[ \t]*<[0-9a-z]+[.][0-9A-Za-z.]+>[ ]/ {
  ndat++;
  loc = $1;
  # {extract_words} is defined outside this part of the file; presumably it
  # returns the words of the line separated by blanks -- TODO confirm.
  lin = extract_words($0);

  # Count the blank-separated words; {split} returns the number of pieces.
  nwds = split(lin, wds);
  printf "%s %d\n", loc, nwds;
  # Done with this line; do not let it reach the format-error rule.
  next;
}

# Catch-all: any line that reaches this rule was neither blank nor accepted
# as a data line.  {data_error} is defined outside this part of the file;
# presumably it reports the message and aborts -- TODO confirm.
{ data_error("bad input format") }
  
END {
  # Summary for the user: number of data lines seen, written to stderr
  # so that it does not mix with the "{LOC} {NW}" output records.
  printf("%6d data lines read\n", ndat) > "/dev/stderr";
}