#! /usr/bin/gawk -f
# Last edited on 2004-02-25 23:41:29 by stolfi

BEGIN {
  # {abort} holds the pending exit status; a negative value means
  # that no error has been detected so far.
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    "  -f FUNCS.gawk \\\n" \
    "  -v smp=SMP \\\n" \
    "  -v sec=SEC \\\n" \
    "  [ -v maxAlpha=NUM ] \\\n" \
    "  [ -v keepComments=BOOL ] \\\n" \
    "  [ -v keepSections=BOOL ] \\\n" \
    "  [ -v keepLineNums=BOOL ] \\\n" \
    "  [ -v keepOutside=BOOL ] \\\n" \
    "  [ -v keepSymbols=BOOL ] \\\n" \
    "  [ -v keepPunct=BOOL ] \\\n" \
    "  < main.wds > select.wds " \
  );
  # 
  # Selects words from "main.wds" based on their section and type.
  # 
  # Each input line must have the format "{TYPE} {WORD}", where {WORD}
  # is a token from the text, and {TYPE} is its type ("#"=comment,
  # "$"=section tag, "@"=line number, "a"=alpha, "b"=blank, "p"=punct,
  # "s"=symbol, "n"=null).
  # 
  # Records of type "#", "$" and "@" keep their input type. The type
  # of every other record is recomputed by calling a procedure from
  # the user-specified library "FUNCS.gawk":
  #
  #   smp_reclassify_word(smp, sec, cursec, curlin, type, wd)
  #     
  #     where {wd} is a word read from the input, {type} is its input
  #     type, {smp} and {sec} are user-specified strings, and {cursec}
  #     and {curlin} are the section tag and line number containing
  #     this occurrence of {wd} (saved from the last "$" and "@"
  #     records preceding this one). The procedure must return a new
  #     type for {wd}; or "x", meaning that the record should be
  #     excluded for being outside the region of interest.
  #
  # The script then disposes of the input record according to its
  # resulting type:
  #
  #   "#" written only if {keepComments=1}.
  #   "$" written only if {keepSections=1}.
  #   "@" written only if {keepLineNums=1}.
  #   "x" written only if {keepOutside=1}.
  #   "s" written only if {keepSymbols=1}.
  #   "p" written only if {keepPunct=1}.
  #   "a" always written.
  #   "n" always discarded.
  #   "b" always discarded.
  # 
  # All {keepXXX} options default to 0 when omitted.
  # 
  # The script stops after {maxAlpha} "a"-type words have been
  # written. The default is to process the whole input file.
  # 
  # User must also provide a function
  #
  #   smp_initialize(smp, sec) 
  #     
  # that is called before the first record. This procedure could, for
  # instance, precompile any complicated patterns to be used by
  # {smp_reclassify_word}.
  #     
  
  # Mandatory parameters.
  if (smp == "")      { arg_error("must define \"smp\""); }
  if (sec == "")      { arg_error("must define \"sec\""); }
  
  # A negative {maxAlpha} means "no limit".
  if (maxAlpha == "")     { maxAlpha = -1; }
  
  # Optional output switches, all off by default.
  if (keepComments == "") { keepComments = 0; }
  if (keepSections == "") { keepSections = 0; }
  if (keepLineNums == "") { keepLineNums = 0; }
  if (keepOutside == "")  { keepOutside = 0; }
  if (keepSymbols == "")  { keepSymbols = 0; }
  if (keepPunct == "")    { keepPunct = 0; }
  
  # Section tag and line number of the record being processed;
  # empty until the first "$" / "@" record is seen.
  curSec = "";
  curLin = "";
  smp_initialize(smp, sec);
  # Counters reported by the END rule.
  nAlpha = 0;
  nWritten = 0;
}

# Leave at once, with the recorded status, if an error is already pending.
abort >= 0 { exit abort; }

# Stop reading once {maxAlpha} alpha words have been written;
# a negative {maxAlpha} disables the limit.
maxAlpha >= 0 && nAlpha >= maxAlpha { exit 0; }

($1 ~ /^[\#$@anpbs]$/) { 
  # Handles one input record whose first field is a valid type tag:
  # extracts the word, updates the current section / line number,
  # reclassifies the word, and writes or discards it.
  type = $1;
  if (type == "#")
    { # Comment: keep everything from the third character of the
      # record on, since the text may itself contain blanks.
      wd = substr($0, 3); }
  else
    { if (NF != 2) { data_error("bad input format"); }
      wd = $2; 
      # Remember where we are, for {smp_reclassify_word}.
      if (type == "$") 
        { curSec = wd; }
      else if (type == "@") 
        { curLin = wd; }
    }
  # Comments, section tags and line numbers keep their input type;
  # everything else is reclassified by the user-supplied function.
  if (type !~ /^[\#@$]/)
    { type = smp_reclassify_word(smp, sec, curSec, curLin, type, wd); }
  # Dispose of the record according to its (possibly new) type.
  if (type == "#")
    { if (keepComments) { output_word(type, wd); } }
  else if (type == "$") 
    { if (keepSections) { output_word(type, wd); } }
  else if (type == "@") 
    { if (keepLineNums) { output_word(type, wd); } }
  else if (type == "x")
    { if (keepOutside) { output_word(type, wd); } }
  else if (type == "s")
    { if (keepSymbols) { output_word(type, wd); } }
  else if (type == "p") 
    { if (keepPunct) { output_word(type, wd); } }
  else if (type == "a") 
    { output_word(type, wd); }
  else if ((type == "n") || (type == "b"))
    { # Nulls and blanks are always dropped.
    }
  else
    { data_error(("invalid type tag = \"" type "\"")); }
  # printf "%s --> %s\n", $1, type > "/dev/stderr";
  next;
}

# Any record that reaches this point was not consumed by the rule
# above, so its first field is not a recognized type tag.
(1) { data_error("invalid input type tag \"" $1 "\""); }

END {
  # With an error pending, just propagate its status; otherwise
  # report the totals on the standard error stream.
  if (abort < 0)
    { printf("%d records (%d alpha)\n", nWritten, nAlpha) > "/dev/stderr"; }
  else
    { exit abort; }
}

function output_word(type, wd)
{
  # Writes the record "{type} {wd}" to standard output and updates
  # the global counters: {nWritten} counts every record written,
  # {nAlpha} counts only those of type "a".
  printf("%s %s\n", type, wd);
  nWritten += 1;
  nAlpha += (type == "a");
}

function arg_error(msg)
{
  # Reports the command-line error {msg}, followed by the usage
  # text, on standard error; records the failure in {abort} and
  # terminates with status 1.
  printf("%s\nusage: %s\n", msg, usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports the input-data error {msg} on standard error, prefixed
  # by the current record number {FNR}; records the failure in
  # {abort} and terminates with status 1.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}