#! /usr/bin/gawk -f
# Last edited on 2025-06-15 21:53:25 by stolfi

# Reads a list of tuples and prints the tuples whose context strings
# occur more than once.
#
# Each input line should have the format
#
#   "«{LEFT}» «{MIDDLE}» «{RIGHT}» {LOC} {KW} {SL} {SM} {SR}"
#
# where {LEFT} {MIDDLE} {RIGHT} are strings of zero or more whole input
# words, separated by '.'; {LOC} is the locator of the input line
# where the tuple occurs; {KW} is the index of the tuple's first
# word in the input line; and {SL}, {SM}, {SR} are the counts of words
# in {LEFT}, {MIDDLE}, and {RIGHT}, respectively.
#
# NOTE(review): the field order above is the one the code actually
# reads ({LOC} in field 4, {KW} in field 5, the counts in fields 6-8).
# An earlier version of this comment listed the fields as
# "{SL} {SM} {SR} {LOC} {KW}"; confirm against the program that
# produces the input.
#
# Anything from a '#' to the end of a line is discarded, and lines that
# are then empty (or only spaces) are ignored. Tuples whose {SL} or {SR}
# is zero (empty left or right context) are skipped.
#
# The input file must be sorted by {LEFT} and {RIGHT}: only consecutive
# lines are compared, so tuples with the same {(LEFT,RIGHT)} pair must
# be adjacent.
#
# Outputs those tuples such that the pair {(LEFT,RIGHT)} occurs more
# than once, as "{LEFT} {MIDDLE} {RIGHT} {LOC} {KW}". Each group of
# tuples with the same pair is preceded by a blank line.
#
# At the end, writes the tuple and pattern counts to standard error.
# A line that does not have the format above aborts the program with
# exit status 1.

BEGIN {
  octx = "???";  # Previous context string "{LEFT}||{RIGHT}".
  oout = "";     # Pending output line of the first tuple in the current group.
  ntup = 0;      # Count of input tuples with non-empty left and right context.
  npat = 0;      # Count of {(LEFT,RIGHT)} patterns that occur more than once.
  failed = 0;    # Set to 1 by the error functions; suppresses the summary in END.
}

# Remove '#'-comments from every line (this also re-splits the fields):
// {
  gsub(/[#].*$/, "", $0);
}

# Ignore blank lines:
/^ *$/ {
  next;
}

# Tuple lines:
/[«]/ {
  # A short line would otherwise be processed with empty fields:
  if (NF < 8) { data_error(("too few fields")); }

  left = $1; mid = $2; rite = $3;
  loc = $4; kw = $5;
  sl = $6; sr = $8;  # Field 7 ({SM}) is not needed.

  # Ignore tuples that lack a left or a right context:
  if ((sl == 0) || (sr == 0)) { next; }
  ntup++;

  ctx = (left "||" rite);
  if (ctx != octx) {
    # First tuple of a possible new group; hold it until a second one shows up:
    octx = ctx;
    oout = output_line(left,mid,rite,loc,kw);
  } else {
    # Another tuple with the same context as the previous one:
    if (oout != "") {
      # The group just became a repeated pattern; print its held first line:
      npat++;
      printf "\n";
      printf "%s\n", oout;
      oout = "";
    }
    printf "%s\n", output_line(left,mid,rite,loc,kw);
  }
  next;
}

# Anything else is malformed:
// {
  data_error(("invalid line format"));
}

END {
  # In awk, {exit} in a rule or function still runs END. Do not print
  # a summary of a partial run after a fatal error:
  if (failed) { exit 1; }
  printf "%6d input tuples with context\n", ntup > "/dev/stderr"
  printf "%6d repeated patterns found\n", npat > "/dev/stderr"
}

function output_line(left,mid,rite,loc,kw) {
  # Returns the output line for a tuple, without the final newline.
  # The {kw} argument is formatted as a decimal integer.
  return sprintf("%s %s %s %s %d", left,mid,rite,loc,kw);
}

function check_num_arg(name,x,xmin,xmax) {
  # Checks that the argument value {x}, called {name}, is not empty
  # and lies in {xmin..xmax}; aborts through {arg_error} otherwise.
  # Returns {x} converted to a number.
  if (x == "") { arg_error(("must define {" name "}")); }
  x += 0;
  if ((x < xmin) || (x > xmax)) { arg_error(("bad {" name "}")); }
  return x;
}

function arg_error(msg) {
  # Reports an invalid argument on standard error and exits with status 1.
  printf "** %s\n", msg > "/dev/stderr"
  failed = 1;
  exit 1
}

function prog_error(msg) {
  # Reports a program error on standard error and exits with status 1.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr"
  failed = 1;
  exit 1
}

function data_error(msg) {
  # Reports an error in the current input line, showing the file name,
  # the line number, and the line itself, and exits with status 1.
  printf "%s:%d ** %s\n", FILENAME, FNR, msg > "/dev/stderr"
  printf "  [[%s]]\n", $0 > "/dev/stderr"
  failed = 1;
  exit 1
}