#! /usr/bin/gawk -f # Last edited on 2025-06-17 10:31:08 by stolfi # User must specify (with "-v") the varaible {min_size} - the min interesting tuple length. # # The input file must have data lines with format "{LOC} {WORDS}" where # {LOC} is is "<{anything}>" and {WORDS} is a sequence of one or more # words in UTF-8 (such as pinyin or EVA), each preceded and followed by # at least one punctuation [ .,'=()]. # # Ignores any line that does not begin with '<'. # # Then finds maximal repeated tuples. Enumerates every pair of parags {P} and {Q}, # every word positions {ip} in {P} (starting from 1) and {iq} in {Q} (ditto). # If {P[ip] == Q[iq]} and {P[ip-1] != Q[iq-1]}, looks for the maximum {m} # such that words {P[ip..ip+m-1} and {Q[iq..iq+m-1]}are equal. # # If {m >= min_size}, writes two records # # "{PLOC} {ip} {m} {PHRASE}" # "{QLOC} {iq} {m} {PHRASE}" # # where {PLOC} and {QLOC} are the locators of {P} and {Q}, and {PHRASE} # is the words {P[ip..ip+m-1]==Q[iq..iq+m-1]} separated by ".". BEGIN { nloc = 0; # Count of data lines (lines that start with locator). nrep = 0; # Count of repetitions found. split("", locs); # The locators that occur in the file, in sequence, are {locs[0..nloc-1]} split("", parags); # The texts of the parags, indexed {0..nloc-1]}. min_size += 0; # Min size of tuple to consider. if (min_size < 2) { arg_error(("invalid min_size = " min_size)); } } // { gsub(/[#].*$/, "", $0); } /^ *$/ { next; } /^[<][a-zA-Z0-9.]+[>]/ { loc = $1; locs[nloc] = loc; lin = extract_words($0); parags[nloc] = lin; nloc++; next; } END { printf "found %d paragraphs\n", nloc > "/dev/stderr" for (kp = 0; kp < nloc; kp++) { printf "%s ", locs[kp] > "/dev/stderr" np = split(parags[kp], P); for (kq = kp; kq < nloc; kq++) { nq = split(parags[kq], Q); ip0 = 1; for (ip = ip0; ip <= np - min_size + 1; ip++) { iq0 = (kq == kp ? ip + 1 : 1); for (iq = iq0; iq <= nq - min_size + 1; iq++) { check_repeated_tuple(locs[kp],ip,P,np, locs[kq],iq,Q,nq); } } } printf "\n" > "/dev/stderr" } printf "found %d repetitions with %d or more words\n", ntup, min_size > "/dev/stderr" } function check_repeated_tuple(Ploc,ip,P,np,Qloc,iq,Q,nq, jp,jq,m) { if ((ip > 1) && (iq > 1) && (P[ip-1] == Q[iq-1])) { return; } m = 0; jp = ip; jq = iq; while ((jp <= np) && (jq <= nq) && (P[jp] == Q[jq])) { jp++; jq++; m++; } if (m < min_size) { return; } printf "!" > "/dev/stderr" print_tuple(Ploc,ip,P,m); print_tuple(Qloc,iq,Q,m); fflush(); ntup++; } function print_tuple(Wloc,iw,W,m, kw) { printf "%s %d %d ", Wloc, iw, m; for (kw = 0; kw < m; kw++) { if (kw > 0) { printf "."; } printf "%s", W[iw+kw]; } printf "\n"; }