#! /usr/bin/gawk -f
# Last edited on 2025-06-16 05:44:02 by stolfi

# Reads a file with lines of the format "{LOC} {PTEXT}" where
# {LOC} is a line locator and {PTEXT} is a sequence of words in UTF-8
# (such as pinyin or EVA), each preceded and followed by at least one
# punctuation [ .,'=()].
#
# The locator {LOC} should have the form "<{SEC}.{NLIN}>"
# where {SEC} is a section number (string of [a-z0-9]), {NLIN} is an entry
# number (a string of [A-Za-z0-9.]).
#
# Anything from a '#' to the end of an input line is discarded, and
# lines that are blank after that are skipped.
#
# Ignores all punctuation and reduces all words to lowercase.
# Outputs a list of all tuples of {tsize} consecutive words,
# ignoring those that contain words that contain '*'.
#
# The tuple size {tsize} is an integer in 1..10. It is read in the BEGIN
# rule, so it must be set before the program starts (e.g. "-v tsize=3").
#
# For each tuple of {tsize} consecutive words in the same line of the
# input, writes one output line for every way of splitting the tuple
# into a left context, a non-empty middle part, and a right context
# (that is, {tsize*(tsize+1)/2} lines per tuple), with the format
#
#   "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}"
#
# where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL},
# {SM}, {SR} are the counts of words in each of these strings; {SEC} and
# {NLIN} specify the input line where the tuple occurs; and {KW} is the
# index (counting from 1) of the tuple's first word in the input line.
#
# The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input
# words, separated by '.'. The string {MIDDLE} has at least one word, but
# {LEFT} and {RIGHT} may be empty.
#
# Counts of data lines read and tuples written go to "/dev/stderr".

BEGIN {
  tsize = check_num_arg("tsize", tsize, 1, 10);
  # The window loops below assume a whole number of words per tuple.
  if (tsize != int(tsize)) { arg_error(("bad {tsize}")); }
  ndat = 0 # Num of data lines processed.
  ntup = 0 # Num of tuples written.
}

# Strip '#' comments from every line (this also re-splits the fields).
// {
  gsub(/[#].*$/, "", $0);
}

# Skip lines that are blank (possibly after comment removal).
/^ *$/ { next; }

# Data line: locator followed by the text.
/<[0-9a-z]+[.][0-9A-Za-z.]+>[ ]/ {
  ndat++;
  loc = $1;
  lin = $0;
  # Remove the locator, fold case, and turn punctuation into single blanks:
  gsub(/^[<>0-9A-Za-z.]+[ ]*/, "", lin);
  lin = tolower(lin);
  gsub(/[=.,' ()]+/, " ", lin);
  gsub(/^[ ]+/, "", lin);
  gsub(/[ ]+$/, "", lin);
  gsub(/[ ][ ]+/, " ", lin);
  nwds = split(lin, wds);
  # printf "%s %d words\n", loc, nwds > "/dev/stderr"
  # Slide a window of {tsize} words along the line:
  for (i = 1; i <= nwds - tsize + 1; i++) {
    write_tuples(loc, wds, i)
  }
  next;
}

# Anything else is malformed input.
// {
  data_error(("bad input format"));
}

END {
  printf "%6d data lines read\n", ndat > "/dev/stderr"
  printf "%6d tuples written\n", ntup > "/dev/stderr"
}

function write_tuples(loc,wds,i,   j,wdj,sm,sl,sr)
{
  # Writes the tuples consisting of {wds[i..i+tsize-1]}, one output
  # line for each split into {sl} left words, {sm >= 1} middle words,
  # and {sr} right words with {sl + sm + sr = tsize}.
  #
  # Writes nothing if any word of the window contains '*'. Fails with
  # a data error if a word contains a character of [-_ .,=!%<>{}&"].

  # printf " i = %d tsize = %d\n", i, tsize > "/dev/stderr"

  # Check for '*':
  for (j = 0; j < tsize; j++) {
    wdj = wds[i+j];
    if (match(wdj, /[*]/)) { return; }
    if (match(wdj, /[-_ .,=!%<>{}&"]/)) {
      data_error(("invalid char in word «" wdj "»"));
    }
  }

  # If passed tests, write the tuples:
  for (sm = 1; sm <= tsize; sm++) {
    # printf " sm = %d\n", sm > "/dev/stderr"
    for (sl = 0; sl <= tsize - sm; sl++) {
      sr = tsize - sm - sl;
      if (sr < 0) { prog_error(("bug: {sr}")); }
      # printf " sl = %d sr = %d\n", sl, sr > "/dev/stderr"
      write_one_tuple(loc, wds, i, sl, sm, sr)
    }
  }
}

function write_one_tuple(loc,wds,i,sl,sm,sr,   k)
{
  # Writes the tuple consisting of {wds[i..i+tsize-1]} with
  # the first {sl} words in left context, the next {sm} words
  # in the middle part, and the following {sr} words in the right
  # context. Increments the global counter {ntup}.

  if ((sl < 0) || (sr < 0) || (sm < 1)) { prog_error(("bug {sl.sm.sr}")); }
  ntup++;

  printf "«";
  for (k = 0; k < sl; k++) {
    if (k > 0) { printf "."; }
    printf "%s", wds[i+k];
  }
  printf "» «";
  for (k = 0; k < sm; k++) {
    if (k > 0) { printf "."; }
    printf "%s", wds[i+sl+k];
  }
  printf "» «";
  for (k = 0; k < sr; k++) {
    if (k > 0) { printf "."; }
    printf "%s", wds[i+sl+sm+k];
  }
  printf "»";
  printf " %s %d %d %d %d\n", loc, i, sl, sm, sr
}

function check_num_arg(name,x,xmin,xmax)
{
  # Returns {x} converted to a number. Fails with an argument error
  # if {x} is the empty string or lies outside {xmin..xmax}.
  if (x == "") { arg_error(("must define {" name "}")); }
  x += 0;
  if ((x < xmin) || (x > xmax)) { arg_error(("bad {" name "}")); }
  return x;
}

function arg_error(msg)
{
  # Reports a bad command-line parameter on stderr and exits with status 1.
  printf "** %s\n", msg > "/dev/stderr"
  exit 1
}

function prog_error(msg)
{
  # Reports an internal inconsistency on stderr and exits with status 1.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr"
  exit 1
}

function data_error(msg)
{
  # Reports a bad input line (file name, line number, message, and the
  # current record) on stderr and exits with status 1.
  printf "%s:%d ** %s\n", FILENAME, FNR, msg > "/dev/stderr"
  printf " [[%s]]\n", $0 > "/dev/stderr"
  exit 1
}