#! /usr/bin/gawk -f
# Last edited on 2008-03-11 15:50:06 by stolfi

# Given a list of image captions (such as produced by
# collect-all-p-comments), extracts all index entries (words and
# phrases) and writes them to standard output.
#
# Input format, as recognized by the rules below:
#   - blank lines and lines whose first non-blank character is "#"
#     are ignored;
#   - a line starting with a letter or digit names the current image
#     set (its first field is used);
#   - a line starting with "+ " is caption text for the current image
#     set;
#   - any other non-empty line is a fatal format error.
#
# By default, the index entries are the phrases delimited by "{}"
# (which should not span line boundaries), plus any run of two or
# more "?"s, even outside "{}"s.
#
# If "unmarked" is set to a true value, isolated words outside "{}"s
# are considered too.  As implemented, a word is any run of one or more
# ASCII letters, digits, or bytes in the octal ranges 300-326, 330-366,
# 370-377 (the ISO Latin-1 letters).  In this mode unbalanced braces are
# tolerated: an entry may begin with several "{"s and need not be closed.
# NOTE(review): the byte ranges presumably assume a single-byte
# (Latin-1) locale --- confirm before running under UTF-8.
#
# Any run of two or more "?"s is normalized to exactly "???".
#
# Each output line contains two fields: an index entry (braces
# included) and the name of the corresponding image set.  Each entry
# has its superfluous blanks removed, and its essential blanks
# replaced by "_".
#
# Error messages are written to standard error, so that they do not
# get mixed into the index file; the exit status is then 1.

BEGIN {
  usage = ( ARGV[0] " [-v unmarked=BOOL] < CAPTIONSFILE > INDEXFILE" );

  # Negative means "no error so far"; otherwise it is the exit status.
  abort = -1;

  # Default when "-v unmarked=..." was not given on the command line.
  if (unmarked == "") { unmarked = 0; }

  # Name of the current image set; empty until the first name line.
  img = "";
}

# Stop processing input once an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines (\011 is TAB).
/^[ \011]*([#]|$)/ { next; }

# Image set line: remember its name for the captions that follow.
/^[a-zA-Z0-9]/ { img = $1; next; }

# Caption line: extract and print every index entry found in it.
/^[+] / {
  if (img == "") { data_error(("missing image directory line")); }

  # Drop the leading "+ " marker.
  lin = substr($0, 3);

  while (lin != "") {
    # Locate the next entry; "match" sets RSTART and RLENGTH.
    if (unmarked) {
      s = match(lin, /([?][?]+|[0-9a-zA-Z\300-\326\330-\366\370-\377]+|[{]+[^{}]*[}]*)/);
    } else {
      s = match(lin, /([?][?]+|[{][^{}]*[}])/);
    }

    if (s) {
      key = substr(lin, RSTART, RLENGTH);
      # Continue scanning right after the entry just found.
      lin = substr(lin, RSTART + RLENGTH);

      # Collapse each run of blanks to a single "_" ...
      gsub(/[ ]+/, "_", key);
      # ... and drop the ones adjacent to the enclosing braces.
      gsub(/[{][_]/, "{", key);
      gsub(/[_][}]/, "}", key);
      # Normalize runs of "?"s to exactly three.
      gsub(/[?][?]+/, "???", key);

      print key, img;
    } else {
      # No more entries on this line.
      lin = "";
    }
  }
  next;
}

# Anything else that is not empty is malformed input.
/./ { data_error(("bad input line format")); }

# Reports the fatal input error "msg", prefixed with the current input
# line number, on standard error; then sets "abort" to 1 and exits with
# that status.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}