#! /bin/bash
# Last edited on 2026-02-09 15:50:58 by stolfi

# Options placed in the shebang line are lost when the script is run as
# "bash script.sh", so set the error-handling options here instead:
set -euo pipefail

# Usage: {this_script} {MIN_SIZE} {INP_FILE}
if (( $# < 2 )); then
  echo "usage: $0 {MIN_SIZE} {INP_FILE}" 1>&2
  exit 1
fi
min_size="$1"; shift
inp_file="$1"; shift

# The input file must have data lines with format "{LOC} {WORDS}" where
# {LOC} is "<{anything}>" and {WORDS} is a sequence of one or more
# words in UTF-8 (such as pinyin or EVA), each preceded and followed by
# at least one punctuation [ .,'=()].
#
# Ignores any line that does not begin with '<'.
#
# Writes a file "${name}/res.dup" with one line for each occurrence of
# each maximal phrase of at least {min_size} words that occur at least
# twice. Each line has format
#
#   "{LOC} {IW} {NW} {PHRASE}"
#
# where {LOC} is the locus ID of the input line where the phrase occurs
# {IW} is the index (from 1) of the first word of the phrase on that
# line, {NW} is the number of words in the phrase, and {PHRASE} is the
# words {P[ip..ip+m-1]==Q[iq..iq+m-1]} separated by ".".

[[ -r "${inp_file}" ]] || { echo "** cannot read '${inp_file}'" 1>&2; exit 1; }

name="${inp_file%%.*}"   # Input file name minus extension(s).
ext="${inp_file##*.}"    # Input file extension (unused here; kept for
                         # symmetry with the commented example below).

# Unpredictable private temp dir instead of "/tmp/$$"; removed on exit.
tmp_dir="$(mktemp -d)" || { echo "** mktemp failed" 1>&2; exit 1; }
trap 'rm -rf -- "${tmp_dir}"' EXIT
tmp_file="${tmp_dir}/res.tup"

echo "=== ${inp_file} -> ${tmp_file} ===" 1>&2
./list_repeated_tuples.gawk \
    -i error_funcs.gawk \
    -i common_funcs_076.gawk \
    -v min_size="${min_size}" \
    < "${inp_file}" \
  > "${tmp_file}"

# The result goes into a directory named after the input file; make
# sure it exists before redirecting into it.
mkdir -p -- "${name}"
dup_file="${name}/res.dup"
echo "=== ${tmp_file} -> ${dup_file} ===" 1>&2
# NOTE(review): keys like "-k3nr" run from field 3 to the END of the
# line; if single-field keys were intended they would be "-k3,3nr"
# "-k4,4" "-k1,1" "-k2,2".  Preserved as written -- confirm intent.
sort -k3nr -k4 -k1 -k2 < "${tmp_file}" \
  | uniq \
  | ./insert_blank_lines.gawk -v fields=4 \
  > "${dup_file}"
  
#   Checking for repeated patterns
# 
#     for tsize in 3 4; do
#       for ifile in bencao.pin starps.eva ; do
#         name="${ifile/.*/}"
#         ext="${ifile/*./}"
#         tfile="${name}-tup.${ext}"
#         rfile="${name}-pat.${ext}"
#         echo "=== ${tfile} -> ${rfile} ==="
#         cat ${ifile} \
#           | extract_word_tuples.gawk -v tsize=${tsize} \
#           | sort -k1 -k3 -k2 \
#           > ${tfile}
# 
#         cat ${tfile} \
#           | list_repeated_patterns.gawk \
#           > ${rfile}
#       done
#     done
# 
