#! /usr/bin/gawk -f
# Last edited on 2008-03-11 15:50:06 by stolfi

BEGIN {
  # Reads a list of image captions (such as produced by
  # collect-all-p-comments) from standard input, extracts the index
  # entries (words and phrases) and writes them to standard output.
  #
  # By default, the index entries are only those phrases delimited
  # by "{}", which should not span line boundaries, plus runs of
  # two or more "?" characters.
  #
  # If "unmarked" is set, isolated words outside "{}"s are also
  # taken as entries.  A word is a run of ASCII letters, digits and
  # ISO Latin-1 letters, as matched by the pattern in the "+ " rule.
  #
  # A run of two or more "?" is always normalized to "???".
  #
  # Each output line contains two fields: an index entry, and the name
  # of the corresponding image set.  Each entry has its superfluous
  # blanks removed, and its essential blanks replaced by "_".

  # Name of the image set that subsequent "+ " lines belong to;
  # empty until the first image-set line is read.
  img = "";

  # "unmarked" defaults to false when not given with "-v".
  if (unmarked == "") unmarked = 0;

  # Exit status requested by data_error; negative means "no error so far".
  abort = -1;

  # Command-line synopsis; not referenced elsewhere in the visible script.
  usage = ARGV[0] " [-v unmarked=BOOL] < CAPTIONSFILE > INDEXFILE";
}

# Once an error status has been recorded, stop with that status.
abort >= 0 {
  exit abort;
}

# Ignore blank lines and "#" comment lines (possibly indented with
# spaces or tabs).
/^[ \011]*([#]|$)/ {
  next;
}

# A line that starts with an ASCII letter or digit names an image set:
# its first field becomes the current image-set name.
/^[a-zA-Z0-9]/ {
  img = $1; next;
}

# Caption line ("+ " followed by text): emit one output line per index
# entry found in the text, paired with the current image-set name.
/^[+] /{
  # A caption is only meaningful after an image-set line has been seen.
  if (img == "") data_error("missing image directory line");

  # Drop the "+ " prefix, then repeatedly peel off the leftmost entry.
  rest = substr($0, 3);
  while (rest != "") {
    if (unmarked)
      # "?" runs, bare words, or brace groups (closing brace optional).
      # NOTE(review): the file header says a word has two or more
      # letters, but this pattern also accepts single characters and
      # digits; confirm which is intended.
      found = match(rest, /([?][?]+|[0-9a-zA-Z\300-\326\330-\366\370-\377]+|[{]+[^{}]*[}]*)/);
    else
      # Only "?" runs and properly closed "{...}" groups.
      found = match(rest, /([?][?]+|[{][^{}]*[}])/);

    # Nothing more to extract from this line.
    if (! found) break;

    entry = substr(rest, RSTART, RLENGTH);
    rest = substr(rest, RSTART + RLENGTH);

    # Normalize: blank runs become "_", underscores just inside the
    # braces are dropped, and any run of "?" becomes exactly "???".
    gsub(/[ ]+/, "_", entry);
    gsub(/[{][_]/, "{", entry);
    gsub(/[_][}]/, "}", entry);
    gsub(/[?][?]+/, "???", entry);

    print entry, img;
  }
  next;
}

# Any non-empty line not consumed by an earlier rule is malformed.
/./ {
  data_error("bad input line format");
}

# Reports a fatal input-data error and terminates the program.
#
#   msg   text describing the problem.
#
# The message, prefixed with the current record number (FNR), is
# written to standard error so that it cannot end up among the index
# entries on standard output.  Sets the global "abort" to 1 and exits
# with that status.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}