#! /usr/bin/gawk -f
# Last edited on 2025-06-17 10:31:08 by stolfi

# User must specify (with "-v") the varaible {min_size} - the min interesting tuple length.
#
# The input file must have data lines with format "{LOC} {WORDS}" where
# {LOC} is is "<{anything}>" and {WORDS} is a sequence of one or more
# words in UTF-8 (such as pinyin or EVA), each preceded and followed by
# at least one punctuation [ .,'=()].
#
# Ignores any line that does not begin with '<'.
# 
# Then finds maximal repeated tuples.  Enumerates every pair of parags {P} and {Q},
# every word positions {ip} in {P} (starting from 1) and {iq} in {Q} (ditto). 
# If {P[ip] == Q[iq]} and {P[ip-1] != Q[iq-1]}, looks for the maximum {m}
# such that words {P[ip..ip+m-1} and {Q[iq..iq+m-1]}are equal.
#
# If {m >= min_size}, writes two records 
# 
#   "{PLOC} {ip} {m} {PHRASE}"
#   "{QLOC} {iq} {m} {PHRASE}"
#
# where {PLOC} and {QLOC} are the locators of {P} and {Q}, and {PHRASE}
# is the words {P[ip..ip+m-1]==Q[iq..iq+m-1]} separated by ".".

BEGIN {
  nloc = 0; # Count of data lines (lines that start with locator).
  nrep = 0; # Count of repetitions found.
  
  split("", locs);   # The locators that occur in the file, in sequence, are {locs[0..nloc-1]}
  split("", parags); # The texts of the parags, indexed {0..nloc-1]}.
  
  min_size += 0; # Min size of tuple to consider.
  if (min_size < 2) { arg_error(("invalid min_size = " min_size)); }
}

// { gsub(/[#].*$/, "", $0); }

/^ *$/ { next; }

/^[<][a-zA-Z0-9.]+[>]/ {
  loc = $1;
  locs[nloc] = loc;
  lin = extract_words($0);
  parags[nloc] = lin;
  nloc++;
  next;
}

END {
  printf "found %d paragraphs\n", nloc > "/dev/stderr"
  for (kp = 0; kp < nloc; kp++) {
    printf "%s ", locs[kp] > "/dev/stderr"
    np = split(parags[kp], P);
    for (kq = kp; kq < nloc; kq++) { 
      nq = split(parags[kq], Q);
      ip0 = 1;
      for (ip = ip0; ip <= np - min_size + 1; ip++) {
        iq0 = (kq == kp ? ip + 1 : 1);
        for (iq = iq0; iq <= nq - min_size + 1; iq++) {
          check_repeated_tuple(locs[kp],ip,P,np, locs[kq],iq,Q,nq);
        }
      }
    }
    printf "\n" > "/dev/stderr"
  }
  printf "found %d repetitions with %d or more words\n", ntup, min_size > "/dev/stderr"
}

function check_repeated_tuple(Ploc,ip,P,np,Qloc,iq,Q,nq,  jp,jq,m) { 
  if ((ip > 1) && (iq > 1) && (P[ip-1] == Q[iq-1])) { return; }
  m = 0; jp = ip; jq = iq;
  while ((jp <= np) && (jq <= nq) && (P[jp] == Q[jq])) { jp++; jq++; m++; }
  if (m < min_size) { return; }
  printf "!" > "/dev/stderr"
  print_tuple(Ploc,ip,P,m);
  print_tuple(Qloc,iq,Q,m);
  fflush();
  ntup++;
}
  
function print_tuple(Wloc,iw,W,m,   kw) {
  printf "%s %d %d ", Wloc, iw, m;
  for (kw = 0; kw < m; kw++) {
    if (kw > 0) { printf "."; }
    printf "%s", W[iw+kw];
  }
  printf "\n";
}