#! /usr/bin/gawk -f
# Last edited on 2025-06-15 21:53:09 by stolfi

# Reads a list of tuples and prints the tuples with zero context that
# occur more than once.
#
# Each input line should have the format
#
#   "«{LEFT}» «{MIDDLE}» «{RIGHT}» {LOC} {KW} {SL} {SM} {SR}"
#
# where {LEFT} {MIDDLE} {RIGHT} are strings of zero or more whole input
# words, separated by '.'; {SL}, {SM}, {SR} are the counts of words in
# each of these strings; {LOC} is the locator of the input line where
# the tuple occurs; and {KW} is the index of the tuple's first word in
# the input line.
#
# Anything from the first '#' to the end of a line is discarded, and
# lines that are then blank (only spaces and/or tabs) are skipped.
# Any other line that does not contain '«', or that does not have
# exactly 8 blank-separated fields, is a fatal data error.
#
# The input file must be sorted by {MIDDLE}: a tuple is only compared
# with the previous zero-context tuple, so equal {MIDDLE} strings must
# be adjacent to be recognized as a group.
#
# Ignores tuples with non-empty {LEFT} or {RIGHT} (that is, with
# {SL} > 0 or {SR} > 0).  Then outputs the {MIDDLE} tuples that occur
# more than once.  Each output line has format
#
#   "{MIDDLE} {LOC} {KW} {SM}"
#
# where {MIDDLE} is the second input field, copied verbatim.
#
# Prints a blank line before each group (including the first one).
#
# On termination, writes the line and tuple counts to standard error.

BEGIN {
  omid = "???";  # "«{MIDDLE}»" string of the previous zero-context tuple.
  oout = "";     # Pending output line for the first tuple of the current
                 # group, or "" if it was already printed.
  nlin = 0;      # Count of input data lines.
  ntup = 0;      # Count of input data lines with empty context.
  npat = 0;      # Count of groups (distinct repeated {MIDDLE}) written.
}

# Strip '#'-comments from every line; awk re-splits the fields afterwards.
// {
  gsub(/[#].*$/, "", $0);
}

# Skip lines that are blank after comment removal.  Tabs are accepted
# too, so that a tab-indented comment line is not reported as invalid.
/^[ \t]*$/ {
  next;
}

# Data lines.
/[«]/ {
  nlin++;

  # A truncated line would otherwise pass as a zero-context tuple
  # (missing fields compare as 0), and a line with extra fields has
  # its columns shifted; reject both instead of printing garbage.
  if (NF != 8) {
    data_error(("expected 8 fields, found " NF));
  }

  mid = $2; loc = $4; kw = $5;
  sl = $6; sm = $7; sr = $8;

  # Only tuples with empty left and right context are of interest.
  # Skipped tuples do not update {omid}, so they do not split a group.
  if ((sl > 0) || (sr > 0)) { next; }
  ntup++;

  if (mid != omid) {
    # Starts a new candidate group; hold its line until a repeat shows up:
    omid = mid;
    oout = output_line(mid,loc,kw,sm);
  } else {
    # Repeat of the current group:
    if (oout != "") {
      # This is the first repeat: emit the group separator and the
      # held first line, and count the group.
      npat++;
      printf "\n";
      printf "%s\n", oout;
      oout = "";
    }
    printf "%s\n", output_line(mid,loc,kw,sm);
  }
  next;
}

# Anything else that survived the rules above is malformed.
// {
  data_error(("invalid line format"));
}

# Summary statistics.  Note that awk also runs END rules after an
# {exit} issued from a main rule, so these counts are printed after
# a data error as well (the exit status is still 1).
END {
  printf "%6d input data lines read\n", nlin > "/dev/stderr"
  printf "%6d tuples with empty context\n", ntup > "/dev/stderr"
  printf "%6d repeated tuples written\n", npat > "/dev/stderr"
}

function output_line(mid,loc,kw,sm)
{
  # Returns the output line (without newline) for one tuple:
  # {mid} and {loc} as strings, {kw} and {sm} as integers.
  return sprintf("%s %s %d %d", mid,loc,kw,sm);
}

function check_num_arg(name,x,xmin,xmax)
{
  # Checks that the argument {x} called {name} is defined (not the
  # empty string) and lies in {xmin..xmax}.  Returns {x} converted to
  # a number; aborts through {arg_error} otherwise.
  if (x == "") { arg_error(("must define {" name "}")); }
  x += 0;
  if ((x < xmin) || (x > xmax)) { arg_error(("bad {" name "}")); }
  return x;
}

function arg_error(msg)
{
  # Reports a bad argument on standard error and exits with status 1.
  printf "** %s\n", msg > "/dev/stderr"
  exit 1
}

function prog_error(msg)
{
  # Reports an internal program error on standard error and exits
  # with status 1.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr"
  exit 1
}

function data_error(msg)
{
  # Reports a bad input line on standard error, with file name, line
  # number in the file, and the (comment-stripped) line itself; then
  # exits with status 1.
  printf "%s:%d ** %s\n", FILENAME, FNR, msg > "/dev/stderr"
  printf " [[%s]]\n", $0 > "/dev/stderr"
  exit 1
}