#! /n/gnu/bin/gawk -f
# Must specify -f eva2erg.gawk

<!> BUGGY AND OBSOLETE NOW.
<!> THE PART THAT SHOWS OCCURRENCES MAY STILL BE USEFUL - 
<!> IT READS A CONCORDANCE FILE AND A TEXT FILE, AND PRINTS THEM 

# Finds exact or approximate word matches between two files.  Usage: 
#
#    cat INPUT \
#      | find-matches \
#          [ -v field=FLDNUM ] \
#          -v wordfile=WORDFILE \
#      > OCCS 
#
# This script reads a list of words from WORDFILE, and
# then reports on stdout every occurrence of one of those
# words found in the records read from stdin.
#
# Each record of the input files should begin with one blank-delimited
# word.  File FILEA must be sorted by that word.
#
# Two records are said to match if they begin with the same word.
# Whenever it finds one or more records from FILEA that match one
# or more from FILEB, this script 
#
#   WORD DATA 
#   
# where 
#   
#   WORD       is a word as read from the list.
#
#   MATCH      is the string from stdin that matched WORD.
#
# Fillers ([!% ]) and comments in the text are always ignored in
# the comparisons.  Normally the non-ignored characters are 
# compared for equality.  If "ignoreq=1" is specified, 
# ignores the EVA "q" characters. If "forgiving=1" is specified,
# the words are compared by a looser, error-tolerant criterion.
# In any case the OFFSET is the index of the first non-space
# character of the line that matched the first character of WORD.
#
# If the file is in ".evt" format, the transcriber code (";S") is optional. 
# If the input file is not in ".evt" format, the output <LOCATION> code
# is simply <f0.P.NNN> where NNN is the input record number.
#
# If "aswords=1" is given then EVA word spaces [-=,.] are 
# treated as significant characters; also the target words will 
# match only whole text words.  Otherwise the spaces are ignored
# and the patterns will match anywhere within a single line.
#
# The POS field is the number of "matching opportunities" preceding
# the occurrence, excluding comments and fillers: If aswords=0, it is
# the number of text characters compared: EVA characters if
# forgiving=ignoreq=0, reduced characters if forgiving=1 or ignoreq=1.
# If aswords=1, it is the number of words.
#
# The "show=1" option causes the script to print the input text 
# and all occurrences aligned with the text.
#

function error(msg)
{ 
  # Fatal-error handler: raises the global "abort" flag (tested by
  # every rule and by END so that they stop quietly), reports "msg"
  # on stderr prefixed with the current record number NR, and then
  # leaves the main processing via "exit".
  abort = 1
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  exit
}

function recode_text(str)
{
  # Re-encodes a text string (which must not contain embedded "{}"s)
  # according to the global option flags, preserving its length.
  # The erg_* helpers are defined outside this file; their exact
  # effect is presumed from their names and from the header comments
  # -- TODO confirm against eva2erg.gawk.
  #
  #   forgiving  -> apply erg_blur_details first
  #   ignoreq    -> then apply erg_erase_q
  #   aswords    -> finish with erg_unify_word_spaces,
  #                 otherwise with erg_erase_word_spaces
  if (forgiving) { str = erg_blur_details(str); }
  if (ignoreq) { str = erg_erase_q(str); }
  if (aswords) { return erg_unify_word_spaces(str); }
  return erg_erase_word_spaces(str);
}

function word_counts(txt, wct,   pos, nw, len, prev, cur)
{
  # Fills wct[1..length(txt)] with running word counts: wct[pos] is
  # the number of non-empty words completed at or before character
  # "pos" of "txt" (a word is completed where a non-"." character is
  # followed by a ".").  At a "." position this is the number of
  # words that begin before that position.
  #
  # Preconditions (checked): every word break in "txt" has been
  # reduced to "." and "txt" starts and ends with a "." pad.
  # A violation is reported through error().
  len = length(txt);
  prev = substr(txt, 1, 1);
  if (prev != ".") { error("internal padding error"); exit; }
  nw = 0;
  wct[1] = nw;
  pos = 2;
  while (pos <= len)
    { cur = substr(txt, pos, 1);
      # A word ends exactly where we step from a non-break onto a break.
      if ((cur == ".") && (prev != ".")) { nw++; }
      wct[pos] = nw;
      prev = cur;
      pos++;
    }
  if (cur != ".") { error("internal padding error"); exit; }
}

# === ACTIONS ===================================================

BEGIN {
  # Loads the word list named by the "wordfile" variable and prepares
  # the global tables used by the main rule:
  #
  #   dic[k]  the k-th word exactly as read from the file (k from 0)
  #   pat[k]  its search pattern: recoded by recode_text, packed by
  #           erg_pack, and surrounded by "." when aswords is set
  #   ctr[k]  number of occurrences found so far (reported in END)
  #
  # Also resets the global counters nWords, nOccs and nSites.
  abort = 0;
  if (wordfile == "") error("must specify \"-v wordfile=FILE\"\n");
  nWords = 0;
  nWordLines = 0;
  split("", dic);
  split("", pat);
  split("", ctr);
  # Keep the getline status: it is 0 at end of file but -1 when the
  # file cannot be read, and the two must not be confused.
  while((wordStatus = (getline w < wordfile)) > 0) { 
    nWordLines++;
    p = erg_pack(recode_text(w));
    if (p == "") {
      # An empty pattern must never reach the main rule: gawk's
      # index(s, "") returns 1, so the occurrence loop there would
      # never terminate.
      printf "%s: line %d: empty word ignored\n", wordfile, nWordLines > "/dev/stderr"
      continue;
    }
    dic[nWords] = w;
    pat[nWords] = (aswords ? ("." p ".") : p);
    ctr[nWords] = 0;
    nWords++;
  }
  close (wordfile);
  if (wordStatus < 0) error("cannot read word file \"" wordfile "\"");
  printf "loaded %6d words\n", nWords > "/dev/stderr"
  nOccs = 0;   # Occurrences found so far
  nSites = 0;  # Matching opportunities found so far.
}

/^#/ { 
  # Comment lines (starting with "#") are never searched; they are
  # only echoed when show is set, so the listing keeps its context.
  if (abort) { exit; }
  if (show) { print $0; }
  next;
}

/./ {
  # Main rule, run for every non-empty line that is not a "#" comment.
  # Searches the line for every loaded pattern pat[k] and, for each
  # occurrence, either prints the word aligned under the text (show=1)
  # or writes one output record "LOC OFFSET POS WORD OCC".
  # Updates the global counters nOccs, nSites and ctr[].
  if (abort) exit;
  if (show) print;
  if (substr($0,1,1) == "<") 
    { # Line carries its own "<...>" location code; the text proper is
      # taken to start after a fixed 19-character prefix.
      loc = substr($0,1,index($0,">"));
      skip = 19;
    }
  else 
    { # No location code: synthesize one from the record number.
      loc = ("<f0.P." NR ">"); 
      skip = 0;
    }
  if (skip >= length($0)) next;
  # lin = original line without skipped part and with comments mapped to "!"
  lin = erg_erase_comments(substr($0,1+skip));
  # txt = same as lin, with all ignorable details mapped to "!" 
  txt = recode_text(lin);
  if (show && (forgiving || ignoreq)) print (substr($0,1,skip) txt);
  # If matching words, we must pad "txt" with "." to get correct
  # matches at line extremities. (Note that each pattern has been padded
  # with "."s in this case)
  if (aswords) 
    { txt = ("." txt ".");
      split("", wct);
      word_counts(txt, wct);
    }
  # pck = txt with all "!"s squeezed out
  pck = erg_pack(txt);
  if (show) print (substr($0,1,skip) pck);
  # Compute offsets in txt for each character in pck:
  # (iof/fof are filled by erg_char_offsets, defined elsewhere; they
  # are used below as 0-based start/end offsets -- TODO confirm.)
  split("", iof);
  split("", fof);
  erg_char_offsets(txt, iof, fof);
  ops = 0;
  for(k=0;k<nWords;k++)
    { p = pat[k];
      i = index(pck, p);
      while (i != 0) 
        { nOccs++;
          ctr[k]++;
          if (aswords)
            { # Offset of occurrence in "lin" (discount padding "." in p and txt):
              offset = iof[i + 1] - 1;
              # Length of occurrence, ignoring final "." added to "p"
              occlen = fof[i + length(p) - 2] - iof[i + 1];
              # Position from beginning of text
              # NOTE(review): wct[] is indexed by positions in "txt" but "i" is
              # a position in "pck"; these differ when txt contains fillers -- verify.
              pos = nSites + wct[i]
            }
          else
            { # Offset of occurrence in "lin" (same as in txt):
              offset = iof[i];
              # Length of occurrence (trust that fof[length(pck)] is defined):
              occlen = fof[i + length(p) - 1] - iof[i];
              # Position from beginning of text
              pos = nSites + i - 1
            }
          if (show) 
            { for(j=1;j<=skip+offset;j++) printf " ";
              printf "%s [%d:%d]\n", dic[k], offset, pos;
            }
          else
            { # Extract the actual occurrence from the original line (minus comments).
              occ = substr(lin, 1 + offset, occlen);
              # Delete any fillers (e.g. from comments):
              occ = erg_pack(occ);
              # Diagnostic for an unexpectedly empty occurrence.  Reports the
              # start offset from "iof" (the array actually filled above).
              if (occ == "") { printf "occlen = %d iof[i]=%d\n", occlen, iof[i] > "/dev/stderr"; }
              print loc, offset, pos, dic[k], occ;
            }
          # Resume the search one character past the current match start,
          # so overlapping occurrences are also found.
          j = index(substr(pck, i+1), p);
          i = (j == 0 ? 0 : i + j)
        }
    }
  # Account for the matching opportunities offered by this line.
  # NOTE(review): same txt-vs-pck indexing concern for wct[] as above.
  nSites += ( aswords ? wct[length(pck)] : length(pck) );
  next
}

END {
  # Final report on stderr: totals of matching sites and occurrences,
  # followed by the list of words whose counter ctr[k] is still zero.
  # Nothing is printed if the run was aborted by error().
  if (abort) { exit; }
  printf "tested %6d potential matching sites\n", nSites > "/dev/stderr";
  printf "found  %6d occurrences\n", nOccs > "/dev/stderr";
  printf "words not found:\n" > "/dev/stderr";
  printf "\n" > "/dev/stderr";
  printf "  " > "/dev/stderr";
  for (k = 0; k < nWords; k++)
    { # Skip every word that was seen at least once.
      if (ctr[k] != 0) continue;
      printf "%s ", dic[k] > "/dev/stderr";
    }
  printf "\n" > "/dev/stderr";
}