#! /usr/bin/gawk -f
# Last edited on 1999-01-04 17:59:07 by stolfi
# Creates a raw concordance for all words and short phrases in an
# interlinear file.
#
# Usage:
#
# cat TEXT \
# | enum-text-phrases -f eva2erg.gawk \
# [-v maxLength=MAXLEN] \
# [-v leftContext=LCONT] \
# [-v rightContext=RCONT] \
# > OCCS
#
# This script reads a single-version text from stdin (in plain or
# ".evt" interlinear format), and, for every occurrence of every word
# or sufficiently short phrase in it, writes to stdout a record of the
# form
#
# LOC TRANS START LENGTH LCTX PHRASE RCTX
# 1 2 3 4 5 6 7
#
# where
#
# LOC is a line locator, e.g. "f86v5.L5" or "f86v5.C2.12a"
#
# TRANS is a letter identifying a transcriber, e.g. "F" for Friedman
#
# START is the index of the phrase in the line (START=1 means col. 20).
#
# LENGTH is the original length of the phrase in the text,
# including fillers, {}-comments, spaces, etc..
#
# LCTX is a word or word sequence, the left context of PHRASE,
#
# PHRASE is the word or word sequence in question.
#
# RCTX is a word or word sequence, the right context of PHRASE
#
#
# The PHRASE does not contain any EVA fillers ([!% ]), newlines, or
# {}-comments.
#
# A word is a string of characters delimited by any of the EVA
# word-spaces [-=,.]. (Note that ASCII blanks are treated as fillers
# and not word breaks.)
#
# The PHRASE may extend across gaps and ordinary line breaks
# ("-") but not paragraph breaks ("=") or changes of textual unit. In
# that case the LENGTH field does not count the newline characters,
# columns 1-19, or any intervening "#"-comments.
#
# In both the phrase and the context strings, the uncertain spaces ","
# are replaced by ordinary spaces ".". The line and paragraph breaks
# are preserved, except that line-final "-" is changed to "/" to
# distinguish it from embedded "-" denoting gaps, vellum defects, or
# intruding figures.
#
# A multi-word PHRASE is considered sufficiently short if it contains
# at most MAXLEN EVA characters, not counting fillers and word spaces.
#
# The context fields LCTX and RCTX are written only if the parameters
# "leftContext" and "rightContext" are specified. If they are
# present, each of these two fields includes the word separator
# characters that delimit it on both sides. The left context includes
# as many whole words as needed to make "leftContext" characters
# (including all delims after the first); and symmetrically for
# "rightContext".
#
# If the file is in ".evt" format, this script will get the LOC
# and TRANS fields from columns 1-19. The TRANS code is
# optional, but the unit must be explicit (i.e. there should be no
# "anonymous" units). If the input file is not in ".evt" format, the
# script assumes the location code is f0.P.NNN;X where NNN is the
# input record number.
function gather_words(str, wp, wl, ws, \
  i, k, kb, m, n, b, c, bSep, cSep)
{
  # Splits "str" into its non-empty words.  For the ith word found,
  # stores in wp[i] the index of its first char (relative to the
  # original "str"), in wl[i] its length, and in ws[i] the separator
  # character that follows it ("." if for some reason the separator
  # is not found).  Returns the number of words found.
  # Assumes "str" has been cleaned of comments.
  # NOTE: "n" is now in the local-variable list above; it used to be
  # an accidental global.
  #
  # Turn line-final "-" into "/", preserving trailing fillers.
  # The regex is "$"-anchored so there is at most one match; "g" is
  # the documented form of gensub's "how" argument.
  str = gensub(/[-]([!%]*)$/, "/\\1", "g", str);
  # Replace uncertain spaces "," by ordinary spaces ".":
  gsub(/[,]/, ".", str);
  # Pad both ends with "." just for sentinels:
  str = ("." str ".");
  m = length(str);
  n = 0;
  b = substr(str,1,1);
  if (b != ".") { error("internal padding error"); exit; }
  bSep = 1;
  for(k=2; k<=m; k++)
    { c = substr(str,k,1);
      cSep = (match(c, /[-/.,=]/) != 0);
      # A word starts at a separator-to-nonseparator transition:
      if (bSep && (! cSep)) { kb = k; }
      # ... and ends at the opposite transition:
      if ((! bSep) && cSep)
        { n++;
          wp[n] = kb - 1;   # "- 1" undoes the sentinel padding offset.
          wl[n] = k-kb;
          ws[n] = c;
        }
      b = c; bSep = cSep;
    }
  if (c != ".") { error("internal padding error"); exit; }
  return n;
}
# === THE WORD BUFFER ===========================================
function clear_words()
{
  # Empties the word buffer and resets all of its parallel arrays.
  nbuf = 0;
  delete wbuf;   # wbuf[1..nbuf] are the saved words, squeezed.
  delete wlen;   # wlen[i] is length(wbuf[i]).
  delete wloc;   # wloc[i] is the loc and trcode of the line containing wbuf[i] (sep " ").
  delete wiof;   # wiof[i] is the index of wbuf[i]'s first char in the line.
  delete wfof;   # wfof[i] is the index of the first char after wbuf[i].
  delete wsep;   # wsep[i] is the word separator after wbuf[i].
  delete wskp;   # wskp[i] is the number of non-word bytes skipped after wbuf[i].
  wskp[0] = 0;   # Sentinel so append_filler works on an empty buffer.
}
function append_word(wd, sep, loc, iindex, findex)
{
  # Pushes onto the word buffer the word "wd", with its following
  # separator "sep", location "loc", and original index span
  # "[iindex..findex-1]" in the line's text.
  # Sanity checks on the index span:
  if (iindex < 1) { error("append_word: iindex error"); exit; }
  if (findex <= iindex) { error("append_word: findex error"); exit; }
  nbuf++;
  wbuf[nbuf] = wd;      wlen[nbuf] = length(wd);
  wloc[nbuf] = loc;     wsep[nbuf] = sep;
  wiof[nbuf] = iindex;  wfof[nbuf] = findex;
  wskp[nbuf] = 0;       # No fillers recorded after this word yet.
}
function append_filler(len)
{
  # Records that "len" filler bytes were skipped after the last
  # word in the buffer (after position 0 when the buffer is empty).
  if (len < 0) { error("append_filler: internal error"); exit; }
  wskp[nbuf] += len;
}
function dump_phrases( \
  i, j, k, len, off, locw, olen)
{
  # Writes to stdout one record for every word and every sufficiently
  # short phrase currently in the word buffer, then clears the buffer.
  # Also increments the global counters nWords and nPhrases.
  # NOTE: "off" is now a proper local (it used to leak as a global);
  # the unused locals "ctx" and "wtmp" were dropped.
  for (i=1; i<=nbuf; i++)
    { nWords++;
      j = i;
      len = 0;          # Packed length of wbuf[i..j], no fillers/separators.
      off = wiof[i];    # Start index of the phrase in its line.
      olen = 0;         # Original length in the text, including fillers.
      locw = wloc[i];
      # Extend the phrase one word at a time while it stays short enough
      # (a single word is always emitted, whatever its length):
      while((j <= nbuf) && ((j == i) || (len + wlen[j] <= maxLength)))
        { # Output one record for phrase "wbuf[i..j]":
          olen += wfof[j] - wiof[j];
          len += wlen[j];
          printf "%s %d %d", locw, off, olen;
          if (leftContext >= 0) { print_left_context(i, leftContext); }
          printf " %s", wbuf[i];
          for (k=i+1; k<=j; k++) { printf "%s%s", wsep[k-1],wbuf[k]; }
          if (rightContext >= 0) { print_right_context(j, rightContext); }
          printf "\n";
          # Fillers after wbuf[j] count toward the next, longer phrase:
          olen += wskp[j];
          j++;
          nPhrases++;
        }
    }
  clear_words();
}
function print_left_context(i, minlen, k, len)
{
  # Prints the words preceding "wbuf[i]", with their trailing
  # delimiters, totalling at least "minlen" word characters
  # (each delimiter counts as one character).
  len = 0;
  # Walk backwards until enough characters are gathered or the
  # buffer start is reached:
  for (k = i-1; (len < minlen) && (k >= 1); k--)
    { len += 1 + wlen[k]; }
  # Leading delimiter: "=" if the context hit the buffer start.
  printf " %s", (k < 1 ? "=" : wsep[k] );
  # Emit the gathered words in original order:
  for (k = k+1; k < i; k++)
    { printf "%s%s", wbuf[k], wsep[k]; }
}
function print_right_context(j, minlen, k, len)
{
  # Prints the delimiter after "wbuf[j]" and then the following words
  # with their delimiters, totalling at least "minlen" word characters
  # (each delimiter counts as one character).
  printf " %s", wsep[j];
  len = 0;
  for (k = j+1; (k <= nbuf) && (len < minlen); k++)
    { printf "%s%s", wbuf[k], wsep[k]; len += wlen[k] + 1; }
}
# === ACTIONS ===================================================
BEGIN {
  # Initialize global state.  maxLength, leftContext, and rightContext
  # may be set on the command line with "-v"; unset context widths
  # become -1, which suppresses the corresponding output field.
  abort = 0;
  if (maxLength == "") { maxLength = 0; }
  if (leftContext == "") { leftContext = -1; }
  if (rightContext == "") { rightContext = -1; }
  # Clears the word buffer:
  clear_words();
  # Count of words read from the input:
  nWords = 0;
  # Count of phrases written to stdout:
  nPhrases = 0;
  # Current page+unit, line, and transcriber code:
  cur_unit = "";
  cur_line = "";
  cur_trans = "";
  # Initialize both spellings explicitly: the transcriber-consistency
  # check has read "cur_trn" in some revisions of this script.
  cur_trn = "";
}
/^#/ {
  # "#"-comment lines in the input are ignored entirely.
  if (abort) exit;
  next;
}
/./ {
  # Main action: for each nonblank, non-comment input line, parse the
  # location code, clean the text, split it into words, and append
  # them to the word buffer.  Phrases are dumped whenever the textual
  # unit changes or a paragraph ends.
  if (abort) exit;
  if (match($0, /^<[fc][0-9]+[vr][0-9]?[.]([A-Za-z][A-Za-z0-9]?[.]|[A-Z]|)[0-9]+[a-z]?([;][A-Z]|)>/))
    { # ".evt" format: location code in columns 1-19.
      loc = substr($0,2,RLENGTH-2);
      skip = 19;
    }
  else if (substr($0,1,1) == "<")
    { error("bad location code"); }
  else
    { # Plain text: fabricate a location code from the record number.
      loc = ("f0.P." NR ";X");
      skip = 0;
    }
  if (skip >= length($0)) next;
  # Analyze and regularize location code:
  len = length(loc);
  if (substr(loc,len-1,1) != ";")
    { trans = "X";
      # error("semicolon?");
    }
  else
    { # Split off the ";T" transcriber suffix:
      trans = substr(loc,len,1);
      loc = substr(loc, 1, len-2);
    }
  if (! match(loc, /[0-9]+[a-z]?$/))
    { error("prog error: no line num"); }
  else
    { line = substr(loc, RSTART);
      unit = substr(loc, 1, RSTART-1);
    }
  # All input must come from a single transcriber.  (Fixed: this
  # check used the never-initialized "cur_trn" instead of the
  # "cur_trans" set in BEGIN.)
  if (trans == cur_trans)
    { }
  else if (cur_trans == "")
    { cur_trans = trans; }
  else
    { error("wrong transcriber code"); }
  loc = (loc " " trans);
  # Do not consider phrases that span more than one text unit:
  if (unit != cur_unit)
    { dump_phrases(); cur_unit = unit; }
  else if (line == cur_line)
    { error("repeated line"); }
  cur_line = line;
  # Get the text proper "txt", omitting skipped part,
  # and replace comments and fillers by "!";
  txt = erg_erase_comments(substr($0,1+skip));
  # Push fillers "!" out of "*" runs, then replace four or more
  # consecutive "*"s in the same word by "***" (padded with "!"):
  while(gsub(/[!][*]/, "*!", txt) > 0) { }
  while(sub(/[*][*][*][*]/, "!***", txt) > 0) { }
  # Extract the non-empty words from "txt":
  split("", wp); split("", wl); split("", ws);
  nw = gather_words(txt, wp, wl, ws);
  # Append the words to the word buffer, recording the filler bytes
  # skipped between consecutive words:
  findex = 1;
  for (i=1; i<=nw; i++)
    { wd = erg_pack(substr(txt, wp[i], wl[i]));
      if (wd != "")
        { iindex = wp[i];
          append_filler(iindex - findex);
          findex = wp[i] + wl[i];
          append_word(wd, ws[i], loc, iindex, findex);
        }
    }
  append_filler(length(txt) - findex);
  # Phrases should not span paragraph boundaries:
  if (match(txt, /[=]/))
    { # Only fillers and blanks may follow a "=" paragraph break:
      if (! match(substr(txt, RSTART+1), /[!% ]*$/))
        { error("embedded paragraph"); }
      dump_phrases();
    }
  next;
}
END {
  # On abort, error() already reported; skip flushing and the summary.
  if(abort) exit;
  # Flush any words/phrases still pending in the word buffer:
  dump_phrases();
  # Summary counts go to stderr so stdout stays a clean concordance:
  printf "read %6d words\n", nWords > "/dev/stderr";
  printf "wrote %6d phrases\n", nPhrases > "/dev/stderr";
}
function error(msg)
{
  # Reports "msg" on stderr, tagged with the current input line
  # number, then raises the global abort flag and stops reading.
  # The other rules and the END action test "abort" to suppress
  # further processing and the final summary.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit;
}