#! /usr/bin/gawk -f
# Last edited on 2025-04-29 20:36:58 by stolfi

# Usage: $0 WORDFILE < INFILE > OUTFILE

# Reads a list of words WORDFILE and a 
# file in EVT format (location in columns 1-19, text
# starting on column 20). For each occurrence of a word W
# from WORDFILE in each line of INFILE, writes out a line
# containing the four fields L K N W, where L is the location and number
# of the line in evt format (e.g. <f101r2.C1.12a;C>), K is the
# number of text bytes of that line before the occurrence, and
# N is the total number of text bytes in the INFILE before 
# that occurrence.  Both counts refer to the text after cleanup.
#
# Ignores "#"-comments, "{}"-comments, blanks, and the EVT fillers "!"
# and "%" in the INFILE.  Lines of INFILE that do not begin with "#" or "<"
# are assumed to be entirely text, and are assigned location <f0.X.k;X>
# where k is the count of such lines seen so far.
#

# Start-up: checks the command line, loads the word list from WORDFILE
# into the table {dic} (indexed by word), and resets the counters
# {nAnon} and {nBytes} that are used by the main rule.
#
# NOTE(review): {error} is not defined in the visible part of this
# file; presumably it reports the message and aborts -- confirm.
BEGIN {
  usage = (ARGV[0] " WORDFILE < TEXT")
  if (ARGC != 2) { error(("usage: " usage)); }
  wfile = ARGV[1];
  split("", dic)
  # Clear {ERRNO} so that a non-empty value after the loop can only
  # come from reading {wfile}:
  ERRNO = ""
  while((getline w < wfile) > 0) 
    { # Blank lines must not become words: in gawk {index(s, "")} is 1
      # for any {s}, so an empty word would be "found" at every position
      # and the occurrence search would never terminate.
      if (w != "") { dic[w] = 1 }
    }
  if (ERRNO != "") { error((wfile ": " ERRNO)); }
  close (wfile);
  # Drop WORDFILE from the argument list, so that the main input
  # is read from standard input:
  ARGC = 1
  nAnon = 0
  nBytes = 0
}

# Lines starting with "#" are comments: they contribute no text and
# no output.  Processing stops here instead if {abort} is set.
/^#/ {
  if (abort) { exit }
  next
}

function cleanup(txt)
{
  # Returns a copy of {txt} reduced to its actual text contents.
  # The caller's string is not modified.

  # Inline "{...}" comments are dropped, braces included:
  gsub(/\{[^}]*\}/, "", txt);

  # The filler characters "%" and "!" are dropped, and so are blanks:
  gsub(/[% !]/, "", txt);

  # Alternative readings "[a|b|...]" are reduced to the first
  # reading "a": drop every "[", then everything from each "|" through
  # the next "]", then any "]" that is still left over.
  gsub(/\[/, "", txt);
  gsub(/\|[^\]]*\]/, "", txt);
  gsub(/\]/, "", txt);

  return txt
}

# Main rule: processes every non-empty INFILE line that was not
# consumed as a "#"-comment.  Determines the location {loc} of the
# line, removes the location field and the EVT crud from its text,
# and then prints one line
#
#   {loc} {offset in line} {offset in file} {word}
#
# for each occurrence (overlapping ones included) of each word of {dic}
# in the cleaned text.  Both offsets are zero-based and are measured on
# the cleaned text with {length}; {nBytes} holds the total cleaned
# length of all the text lines processed before this one.
/./ {
  if (abort) exit;
  if (substr($0,1,1) == "<") 
    { # Line with a location field in columns 1-19:
      skip = 19;
      loc = substr($0,1,19);
      gsub(/  *$/, "", loc);
      # A malformed location is only reported; the line is still processed.
      if ( loc !~ /^<f[0-9]+[vr]*[0-9]*\.[A-Za-z][A-Za-z0-9]*\.[0-9]+[a-z]*;[A-Z]>$/ )
        { printf "line %d, bad location \"%s\"\n", NR, loc > "/dev/stderr" }
    } 
  else 
    { # Line without location: it is all text, and gets a made-up location.
      skip = 0;
      nAnon++;
      loc = ("<f0.X." nAnon ";X>")
    }
  # Nothing to search if there is no text after the location field:
  if (skip >= length($0)) next;
  txt = cleanup(substr($0,1+skip))
  
  for(w in dic)
    { # In gawk {index(s, "")} is 1 for any {s}, so an empty word would
      # be "found" at every position and the loop below would never end.
      if (w == "") continue;
      i = index(txt, w);
      while (i != 0) 
        { 
          printf "%s %d %d %s\n", loc, i - 1, nBytes + i - 1, w;
          # Resume the search one character after the start of this
          # occurrence, so that overlapping occurrences are found too:
          k = index(substr(txt, i+1), w);
          i = (k == 0 ? 0 : i + k)
        }
    }
  nBytes += length(txt);
  next
}