#! /usr/bin/gawk -f
# Last edited on 2025-06-16 05:44:02 by stolfi

# Reads a file with lines of the format "{LOC} {PTEXT}" where
# {LOC} is a line locator and {PTEXT} is a sequence of words in UTF-8
# (such as pinyin or EVA), each preceded and followed by at least one 
# punctuation [ .,'=()].
#
# The locator {LOC} should have the form "<{SEC}.{NLIN}>"
# where {SEC} is a section number (string of [a-z0-9]), {NLIN} is an entry
# number (a string of [A-Za-z0-9.])

# Ignores all punctuation and reduces all words to lowercase.

# Outputs a list of all tuples of {tsize} consecutive words,
# ignoring those tuples in which some word contains '*'.
#
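# For each tuple of {tsize} consecutive words in the same line of the input,
# writes {tsize*(tsize+1)/2} lines in the output (one for each way of splitting
# the tuple into left, middle, and right parts with a non-empty middle),
# each with the format 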
# 
#    "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}"
# 
# where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL}, {SM}, {SR}
# are the counts of words in each of these strings; {SEC} and {NLIN} specify the input line
# where the tuple occurs; and {KW} is the index of the tuple's first
# word in the input line.
#
# The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input
# words, separated by '.'. The string {MIDDLE} has at least one word, but 
# {LEFT} and {RIGHT} may be empty.

BEGIN {
  # The tuple size {tsize} must be set by the caller before the program
  # starts (e.g. with "-v tsize=N") and must lie in the range 1..10.
  # The first argument of {check_num_arg} is used only in error messages,
  # so it must be the actual name of the variable the user has to define.
  tsize = check_num_arg("tsize",tsize,1,10);
  ndat = 0 # Num of data lines processed.
  ntup = 0 # Num of tuples written.
}

# Remove a trailing '#'-comment from every record before any other
# rule looks at it.
{ sub(/[#].*$/, "", $0); }

# Discard records that are empty or contain only blanks.
$0 ~ /^ *$/ { next; }

/<[0-9a-z]+[.][0-9A-Za-z.]+>[ ]/ {
  # Data line: the first field is the locator, the rest is the text.
  ndat++;
  loc = $1;

  # Remove the locator, fold the text to lowercase, and replace every
  # run of punctuation by a single blank.
  lin = $0;
  sub(/^[<>0-9A-Za-z.]+[ ]*/, "", lin);
  lin = tolower(lin);
  gsub(/[=.,' ()]+/, " ", lin);

  # Trim and squeeze blanks, then break the text into words.
  sub(/^[ ]+/, "", lin);
  sub(/[ ]+$/, "", lin);
  gsub(/[ ][ ]+/, " ", lin);
  nwds = split(lin, wds);
  # printf "%s %d words\n", loc, nwds > "/dev/stderr"

  # Slide a window of {tsize} words over the line; {i} is the index
  # of the first word of the window.
  i = 1;
  while (i <= nwds - tsize + 1) {
    write_tuples(loc,wds,i);
    i++;
  }
  next;
}

# Any record that reaches this point is neither blank nor a data line.
{ data_error("bad input format"); }
  
END {
  # Report the line and tuple counts on stderr, keeping them out of
  # the tuple list written to stdout.
  printf("%6d data lines read\n", ndat) > "/dev/stderr";
  printf("%6d tuples written\n", ntup) > "/dev/stderr";
}

function write_tuples(loc,wds,i,   j,wdj,sm,sl,sr) {
  # Processes the window of {tsize} consecutive words {wds[i..i+tsize-1]}
  # taken from the input line whose locator is {loc}.
  #
  # Writes nothing if a word of the window contains '*'. Aborts with a
  # data error if a word contains one of the characters that are not
  # allowed inside a word. The words are examined in order, so a '*' in
  # an earlier word suppresses the error check on the later ones.
  #
  # Otherwise calls {write_one_tuple} once for each split of the window
  # into {sl} left words, {sm} middle words ({sm >= 1}), and {sr} right
  # words, with {sl + sm + sr = tsize}.
  #
  # Reads the global {tsize}. The names {j,wdj,sm,sl,sr} are all local;
  # in particular {sr} must be listed so it does not leak into the
  # global namespace.

  # printf "  i = %d tsize = %d\n", i, tsize > "/dev/stderr"
  # Check for '*' and for invalid characters:
  for (j = 0; j < tsize; j++) {
    wdj = wds[i+j];
    if (match(wdj, /[*]/)) { return; }
    if (match(wdj, /[-_ .,=!%<>{}&"]/)) { data_error(("invalid char in word «" wdj "»")); }
  }
  # If passed tests, write the tuples:
  for (sm = 1; sm <= tsize; sm++) {
    # printf "    sm = %d\n", sm > "/dev/stderr"
    for (sl = 0; sl <= tsize - sm; sl++) {
      sr = tsize - sm - sl;
      # Cannot happen given the loop bounds; kept as a sanity check.
      if (sr < 0) { prog_error(("bug: {sr}")); }
      # printf "      sl = %d sr = %d\n", sl, sr > "/dev/stderr"
      write_one_tuple(loc,wds,i,sl,sm,sr)
    }
  }
}

function write_one_tuple(loc,wds,i,sl,sm,sr,   k,left,mid,right) {
  # Writes one output line for the window of words starting at {wds[i]}:
  # the first {sl} words form the left context, the next {sm} words the
  # middle part, and the following {sr} words the right context. Words
  # inside each part are joined with '.'. The line ends with the
  # locator {loc}, the window start {i}, and the three counts.
  # Increments the global tuple counter {ntup}.
  if ((sl < 0) || (sr < 0) || (sm < 1)) {
    prog_error(("bug {sl.sm.sr}"));
  }
  ntup++;

  # Left context: words {i .. i+sl-1}.
  left = "";
  for (k = 0; k < sl; k++) {
    left = left (k > 0 ? "." : "") wds[i+k];
  }

  # Middle part: words {i+sl .. i+sl+sm-1}.
  mid = "";
  for (k = 0; k < sm; k++) {
    mid = mid (k > 0 ? "." : "") wds[i+sl+k];
  }

  # Right context: words {i+sl+sm .. i+sl+sm+sr-1}.
  right = "";
  for (k = 0; k < sr; k++) {
    right = right (k > 0 ? "." : "") wds[i+sl+sm+k];
  }

  printf "«%s» «%s» «%s» %s %d %d %d %d\n", left, mid, right, loc, i, sl, sm, sr
}
 
function check_num_arg(name,x,xmin,xmax) {
  # Checks the value {x} of the parameter called {name}: it must be
  # non-empty and, once converted to a number, lie in {xmin..xmax}.
  # Returns the numeric value; any failure is reported via {arg_error}.
  if (x == "") {
    arg_error(("must define {" name "}"));
  }
  x = x + 0;
  if (x < xmin) {
    arg_error(("bad {" name "}"));
  }
  if (x > xmax) {
    arg_error(("bad {" name "}"));
  }
  return x;
}
 
function arg_error(msg) {
  # Prints {msg} on stderr, prefixed with "** ", and terminates the
  # program with exit status 1.
  printf("** %s\n", msg) > "/dev/stderr";
  exit 1;
}
 
function prog_error(msg) {
  # Reports an internal inconsistency: prints {msg} on stderr, tagged
  # as a program error, and terminates with exit status 1.
  printf("** PROG ERROR: %s\n", msg) > "/dev/stderr";
  exit 1;
}
  
function data_error(msg) {
  # Reports a problem with the current input record: prints the file
  # name, the record number within that file, and {msg} on stderr,
  # followed by the current contents of {$0}; then terminates with
  # exit status 1.
  printf("%s:%d ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  printf("  [[%s]]\n", $0) > "/dev/stderr";
  exit 1;
}