#! /bin/gawk -f
# Last edited on 2002-01-04 13:58:54 by stolfi


BEGIN {
  # Start-up: checks the command-line variables, loads the word list
  # named by "words", and initializes the scanning state.
  #
  # For every word in the list, "times" is the initial count of
  # occurrences still to be reported (default 1).  Selected input lines
  # are printed prefixed by the unit name taken from the most recent
  # "@unit" line.

  abort = -1;

  # Usage text printed by "arg_error"; assembled one line at a time.
  usage = "cat main.org \\\n";
  usage = (usage "  | find-words-in-main-org \\\n");
  usage = (usage "      -v words=WORDFILE [ -v times=NUM ] \\\n");
  usage = (usage "  > occurrences.txt");

  # "words" is mandatory; "times" is optional.
  if (words == "")
    { arg_error(("must define \"words\"")); }
  if (times == "")
    { times = 1; }
  load_special_words(words, times);

  # Unit name used until the first "@unit" line is seen, and the
  # (initially empty) set of punctuation characters.
  curunit = "NIL:0:0";
  PUNCT = "";
}

# On each "@unit" line, remember its fourth field as the current unit
# name.  There is no "next" here: the line then falls through to the
# later rules.
/^[ ]*[@]unit/ {
  curunit = $(4);
}

/^ *[#] *PUNCT *[=] *["].*["] *$/ {
  # A '# PUNCT = "..."' line: append the quoted characters (already
  # escaped by "get_val") to the accumulated set, and rebuild the
  # bracket expression used to blank them out of data lines.
  PUNCT = (PUNCT get_val($0));
  if (PUNCT != "")
    { punct_pat = ("[" PUNCT "]"); }
}

function get_val(def)
{
  # Extracts the value from a definition line of the form
  #   # NAME = "value"
  # and returns it with the characters that are special inside a regex
  # bracket expression escaped (see "quote_special").
  #
  # The leading pattern allows blanks before the "#", matching the rule
  # that calls this function; otherwise an indented definition line
  # would keep its whole '# NAME = "' prefix as part of the value.
  # Both patterns are anchored, so a single "sub" suffices.

  sub(/^ *[#] *[A-Z]+ *= *"/, "", def);
  sub(/" *$/, "", def);
  return(quote_special(def));
}

function quote_special(chars)
{
  # Returns "chars" with a backslash inserted before every character
  # that is special inside a regex bracket expression: "\", "-", "]"
  # and "^".  The result can be placed between "[" and "]" to build a
  # dynamic regexp matching any of the original characters.
  #
  # Backslashes must be handled first, so that the backslashes added
  # by the later substitutions are not doubled again.
  #
  # The replacement "\\\\&" reaches gsub as the text \\& , i.e. a
  # literal backslash followed by the matched text.  The plain "\\\\"
  # form produces only ONE backslash under the POSIX sub/gsub rules
  # (the default in current gawk), which would leave backslashes
  # unescaped.

  gsub(/[\\]/, "\\\\&", chars);
  gsub(/[-]/, "\\-", chars);
  gsub(/[\]]/, "\\]", chars);
  gsub(/[\^]/, "\\^", chars);
  return chars;
}

# Lines that are empty or all blanks, or whose first non-blank character
# is "#" or "@", carry no text to search: discard them.
/^[ ]*([#@]|$)/ {
  next;
}

// {
  # Data line.  Keep the original text for output, then blank out the
  # punctuation characters in $0 so that the fields are bare words.
  raw = $0;
  if (PUNCT != "") { gsub(punct_pat, " ", $0); }

  # Look for the first field that is a listed word with occurrences
  # still to be reported; charge the occurrence to that word only.
  hit = 0;
  for (fno = 1; fno <= NF; fno++)
    { tok = $(fno);
      if ((tok in maxocs) && (maxocs[tok] > 0))
        { maxocs[tok]--;
          hit = 1;
          break;
        }
    }

  # Report the line (minus leading blanks), tagged with the current unit.
  if (hit)
    { sub(/^ +/, "", raw);
      printf "%-8s %s\n", curunit, raw;
    }
  next;
}

function load_special_words(file,moc,    nWords,lin,fld,nfld,nLines,status)
{
  # Reads a word list from "file".
  # For each word W, sets "maxocs[W] = moc".
  #
  # Leading blanks are ignored, lines whose first non-blank character
  # is "#" are skipped, and a trailing "#..." comment is stripped.
  # Every remaining line must contain exactly one word (so a blank line
  # is rejected as a bad entry), and no word may appear twice;
  # violations are fatal and are reported through "tbl_error".
  # A read error (e.g. a missing file) is fatal, through "arg_error".
  # Prints the number of words loaded, or a warning if none, to stderr.
  #
  # The global array "maxocs" is cleared first.  The names after the
  # extra spaces in the parameter list are local variables.

  nWords = 0;
  nLines = 0;
  split("", maxocs);

  # "getline" yields 1 per line read, 0 at end of file and -1 on
  # error.  The status is kept so that failure can be detected after
  # the loop without depending on the initial value of ERRNO, which is
  # not "0" in current gawk.
  while ((status = (getline lin < file)) > 0) {
    nLines++;
    sub(/^[ ]+/, "", lin);
    if (! match(lin, /^[#]/))
      { sub(/[ ]*[#].*$/, "", lin);
        nfld = split(lin, fld, " ");
        # "tbl_error" prints FNR, which "getline var < file" does not
        # maintain; set it to the table line just before the (fatal)
        # report so that the message points at the offending entry.
        if (nfld != 1)
          { FNR = nLines; tbl_error(file, ("bad table entry = \"" lin "\"")); }
        if (fld[1] in maxocs)
          { FNR = nLines; tbl_error(file, ("repeated key = \"" lin "\"")); }
        maxocs[fld[1]] = moc;
        nWords++;
      }
  }
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close (file);

  # The file name is passed as a "%s" argument rather than spliced into
  # the format, so that a "%" in the name is printed literally.
  if (nWords == 0)
    { printf "warning: file \"%s\" empty or missing\n", file > "/dev/stderr"; }
  else
    { printf "loaded %6d words\n", nWords > "/dev/stderr"; }
}

function arg_error(msg)
{
  # Fatal command-line error: writes "msg" and then the usage text (the
  # global "usage") to stderr, sets the global "abort" flag, and
  # terminates with exit status 1.

  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Fatal error in the main input: writes "msg" to stderr, prefixed by
  # the current record number FNR, sets the global "abort" flag, and
  # terminates with exit status 1.

  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(file, msg)
{
  # Fatal error in an auxiliary table file: writes "msg" to stderr,
  # prefixed by the file name and by the current value of FNR, sets the
  # global "abort" flag, and terminates with exit status 1.
  #
  # Note: the line number printed is whatever FNR holds at the time of
  # the call; reading a table with "getline var < file" does not
  # update FNR by itself.

  printf "file %s, line %s: %s\n", file, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}