#! /usr/bin/gawk -f
# Last edited on 2002-03-05 00:09:58 by stolfi

# Reads a file containing lines of the form 
# 
#     SEC USEQ FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD
#     1   2    3    4    5    6    7    8    9     10    11
# 
# where: WORD is a word; SEC USEQ FNUM UNIT LINE TRAN is the 
# location of one occurrence of the word in the text; FPOS is the
# sequential number of the word in the line; RPOS is the same,
# counting backwards from the end of line; PFRST is a boolean (0 or 1)
# identifying the first token of a paragraph; and PLAST is analogous
# for the last token.
# 
# Writes a file with the "special" words, in the same format.

BEGIN {
  # Exit-status sentinel: negative means no error has been flagged yet.
  # data_error() overwrites it with the status the script should exit with.
  abort = -1;
}

# Once an error status has been recorded in `abort`, stop reading
# records and terminate with that status.
abort >= 0 {
  exit abort;
}

# Data records: exactly 11 fields, laid out as
#   SEC USEQ FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD
# Only FPOS ($7), PFRST ($9) and WORD ($11) are consulted here.
# The record is echoed unchanged when the word is the first of its line
# and passes the filters below.  Every 11-field record is consumed by
# this rule (it always ends in `next`), whether or not it is printed.
(NF == 11) {
  # Only the first word of a line (FPOS == 1) is a candidate.
  if ($7 != 1) { next; }

  pfrst = $9;
  word = $11;

  # Rejection filters: the word must contain at least one of the
  # letters k, t, p, f ...
  if (word !~ /[ktpf]/) { next; }
  # ... must not contain two or more `?` / `*` characters ...
  if (word ~ /[?*].*[?*]/) { next; }
  # ... and must be at least 4 characters long.
  if (length(word) < 4) { next; }

  # Acceptance tests (any one suffices):
  #   - first token of a paragraph, starting with k or t, optionally
  #     preceded by a single c;
  if (pfrst && (word ~ /^[c]?[kt]/)) { print; next; }
  #   - contains p or f anywhere;
  if (word ~ /[pf]/) { print; next; }
  #   - contains at least two of the letters k, t, p, f.
  if (word ~ /[ktpf].*[ktpf]/) { print; }
  next;
}

# Any other non-empty record is malformed: records with exactly 11
# fields never get here because the rule handling them always ends
# with `next`.  Empty lines match nothing and are silently skipped.
($0 ~ /./) {
  data_error("bad line type");
}

# Reports a problem with the current input record and terminates.
#   msg: text describing what is wrong with the record.
# Writes "*** line N: MSG" to /dev/stderr, where N is FNR (the record
# number within the current input file), records status 1 in the global
# `abort`, and exits with that status.  Does not return.
function data_error(msg)
{
  abort = 1;
  printf("*** line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}