#! /usr/bin/gawk -f
# Last edited on 2025-11-01 10:18:13 by stolfi

# Reads a ".wdp" file as produced by {extract_parag_words.py}.
# Outputs repeats across line breaks.
# If {trim} is set, removes the first glyph after the break.
# If {quasi} is set, accepts also quasi-repeats.
#
# Only two input fields are used: field 2 (the page) and field 6 (the
# word).  A word of "-" is treated as the line-break marker; "=" is the
# other break marker (presumably a paragraph break -- confirm against
# the producer script).
#
# {trim} and {quasi} are never assigned here, so they must be supplied
# by the caller (e.g. "-v trim=1 -v quasi=1"); unset means false.
#
# Output (stdout): one line per repeat found, formatted as
# "%20s : %-20s" with the word before the break and the (untrimmed)
# word after it.  Diagnostics and the final summary go to stderr.

BEGIN {
  # Sliding window over the last two records: {owd,opage} is the
  # previous record, {oowd,oopage} the one before that.
  oowd = ""; oopage = ""
  owd = ""; opage = ""
  npair = 0   # Number of word pairs straddling a line break.
  nrept = 0   # Number of those pairs reported as repeats.
}

// {
  page = $2; wd = $6

  # Two break markers in a row on the same page are malformed input.
  if ((wd == "-") || (wd == "=")) {
    if ((page == opage) && ((owd == "-") || (owd == "="))) {
      data_error(("bad break sequence"))
    }
  }

  # The previous record was a line break and the record before it is on
  # the same page as this one: {oowd} and {wd} straddle the break.
  if ((page == oopage) && (owd == "-")) {
    npair++
    if (oowd == "") { data_error(("missing parag break")) }
    if ((oowd == "-") || (oowd == "=")) { data_error(("invalid line break")) }
    awd = oowd
    if (trim) {
      # Prefix a sentinel "@" so that every pattern below can be
      # anchored on it.  Whichever pattern matches first removes the
      # sentinel together with the leading glyph, so the later patterns
      # can no longer match.
      bwd = ("@" wd)
      gsub(/^@[{][^{}]*[}]/, "", bwd)
      gsub(/^@[csi]h+/, "", bwd)
      gsub(/^@[ci][ktpf]h+/, "", bwd)
      gsub(/^@[&][0-9]+[;]/, "", bwd)
      gsub(/^@[a-z]/, "", bwd)
      # Sentinel still present: no pattern recognized the first glyph.
      if (substr(bwd,1,1) == "@") {
        data_error(("bad word initial " wd " -> " bwd))
      }
    } else {
      bwd = wd
    }
    # printf "%20s %s %-20s\n", awd, owd, bwd > "/dev/stderr"
    na = length(awd); nb = length(bwd)
    if ((na > 0) && (nb > 0)) {
      if (quasi) {
        # Quasi-repeat: the shorter word must be a prefix of {bwd}
        # (when {awd} is not longer) or a suffix of {awd} (otherwise).
        if (na <= nb) {
          ok = (awd == substr(bwd, 1, na))
        } else {
          ok = (substr(awd, 1 + na - nb, nb) == bwd)
        }
      } else {
        ok = (awd == bwd)
      }
      if (ok) {
        printf "%20s : %-20s\n", oowd, wd
        nrept++
      }
    }
  }

  # Shift the two-record window.
  oowd = owd; oopage = opage
  owd = wd; opage = page
}

END {
  printf "%d pairs checked, %d repeats found\n", npair, nrept > "/dev/stderr"
}

function data_error(msg) {
  # Reports {msg} with the current file name and line number on stderr,
  # keeping diagnostics out of the repeat list written to stdout, and
  # terminates with status 1.  Note that awk still runs the END rule
  # after {exit}, so the summary line is printed as well.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"
  exit 1
}