#! /usr/bin/gawk -f
# Last edited on 2007-08-12 23:15:34 by stolfi

BEGIN {
  USAGE = ( \
    "extract-bibtex-entries \\\n" \
    " [ -v query={QUERY} ] \\\n" \
    " [ -v invert={BOOL} ] \\\n" \
    " [ {BIBFILE}.. ]" \
  );

  # Reads one or more BIBTEX files called "{BIBFILE}.." and extracts
  # the entries selected by the {query} argument.
  # If there is no "{BIBFILE}" argument, reads standard input instead.
  #
  # The {query} argument should be a string of /terms/ separated by
  # semicolons, vertical bars, or newlines; or should have the
  # form "@{QUERYFILE}" where "{QUERYFILE}" is the name of a file that
  # contains one term per line (blank lines and lines starting
  # with "#" are ignored).
  #
  # Each term of the query should have the form
  #
  #   {FIELD} {OP} {VALUE}
  #
  # where {FIELD} is the name of a BIBTEX field ("author", "title", "year", etc.)
  # and {VALUE} is an arbitrary string of nonblank characters.
  # The special {FIELD}s "kind" and "key" mean the entry type ("article",
  # "inproceedings", etc.) and the citation key (first nameless field
  # of the bibtex entry).  If {VALUE} is omitted, it is understood
  # to be the empty string.
  #
  # Each term of the query specifies a test to be applied to
  # every entry in the input bibfiles.  The test is applied only to
  # the value of the entry's field named {FIELD}; unless {FIELD} is "*"
  # in which case the test is applied to every field of every entry.
  # The test performed depends on {OP}:
  #
  #   "="  The field's value must be identical to "{VALUE}".
  #   "~"  The field's value must contain the string "{VALUE}".
  #   "-"  The field must be missing in the entry.
  #
  # In all cases, the test is case-insensitive (both for the
  # field name and for the value).
  #
  # Note that if "{VALUE}" is empty and "{OP}" is "=", the test succeeds
  # only if that field is empty or missing in the entry.
  # If "{VALUE}" is empty and "{OP}" is "~", the test succeeds for
  # every entry that has the field.
  #
  # If {invert} is 0 (default), an entry is selected if ANY of the
  # tests succeeds for it.  In particular, if the {query} argument is
  # omitted or the query is empty, there are no tests, and
  # no entries are selected.
  #
  # If {invert} is 1, an entry is selected if NO tests succeed on it.
  # In particular, if the {query} argument is omitted or the query
  # is empty, there are no tests, and all entries are selected.
  #
  # In either case, all selected entries are written out to standard output.
  #
  # Comment lines in the input file are discarded.

  # {abort} is negative while all is well; the error functions set it
  # to the exit status, and the END rule checks it before doing any work.
  abort = -1;

  if (invert == "") { invert = 0; }
  if (invert !~ /^[01]$/) { arg_error(("invalid {invert} argument")); }
  invert = invert + 0; # Convert to numeric.

  nq = 0;         # Number of tests in the query.
  split("", qf);  # {qf[0..nq-1]} are the {FIELD}s to test.
  split("", qo);  # {qo[0..nq-1]} are the corresponding {OP}s.
  split("", qv);  # {qv[0..nq-1]} are the corresponding {VALUE}s.

  if (query ~ /^[@]/) {
    parse_query_file(substr(query,2));
  } else {
    parse_query_string(query);
  }
  # dump_query();

  ncmt = 0;        # Number of saved comment lines (currently unused).
  split("", cmt);  # Saved comment lines are {cmt[0..ncmt-1]} (currently unused).

  nread = 0;     # Number of BibTEX entries read.
  nwritten = 0;  # Number of BibTEX entries written.

  clear_current_entry();
}

function dump_query(   i) {
  # Prints out the query to standard error, for debugging.
  for (i = 0; i < nq; i++) {
    printf "%% %s %s %s\n", qf[i], qo[i], qv[i] > "/dev/stderr";
  }
}

function clear_current_entry(   ) {
  # Resets the global state that describes the entry being parsed.
  split("", ef);  # Clear all the entry's fields.
  eini = 0;       # Set to 1 when the entry's @-line has been detected.
  esel = 0;       # Set to 1 when the entry is selected.
  efin = 0;       # Set to 1 when the entry's end has been detected.
}

# Remove tabs, CR, FF (this rule has no {next}, so the record
# falls through to the rules below):
//{ gsub(/[\011\014\015]/, " ", $0); }

# Process blank lines:
/^[ ]*$/ { next; }

# Process nonblank comments:
/^[ ]*[%]/ { next; }

# Detect beginning of entry and save its kind and key into {ef}:
/^ *[@]/ {
  # The previous entry (if any) is complete; select and output it:
  flush_current_entry();
  nread++;
  clear_current_entry();
  # We found the entry's beginning:
  eini = 1;
  # Parse the line:
  lin = $0;
  gsub(/^ *[@] */, "", lin);
  if (! match(lin, /^[A-Za-z0-9]+ *([\{]|$)/)) {
    # {data_error} terminates the program.
    data_error(("malformed entry kind"));
  } else {
    # Grab and delete the entry's kind:
    tmp = substr(lin, RSTART, RLENGTH);
    lin = substr(lin, RSTART + RLENGTH);
    gsub(/[ {}]/, "", tmp);
    save_entry_field("kind", tmp);
    if (! match(lin, /^[-_:.\/A-Za-z0-9]+ */)) {
      data_error(("malformed key"));
    } else {
      # Grab and delete the entry's key:
      tmp = substr(lin, RSTART, RLENGTH);
      lin = substr(lin, RSTART + RLENGTH);
      gsub(/[ ]/, "", tmp);
      save_entry_field("key", tmp);
    }
  }
  # Parse the rest of the line; {lin} keeps any incomplete field:
  lin = process_line(lin);
  next;
}

// {
  # Continuation line (presumably); append it to the pending text:
  lin = process_line((lin " " $0));
  next;
}

function save_entry_field(fnam,fval) {
  # Stores the value {fval} as field {fnam} of the current entry {ef}.
  # The field name is folded to lowercase; the value keeps its case
  # but has leading, trailing, and repeated blanks removed.
  # A field that is already present in the entry is a fatal error.

  # printf "  save_entry_field(\"%s\", \"%s\")\n", fnam, fval > "/dev/stderr";
  # Standardize field names to lowercase:
  fnam = tolower(fnam);
  # Check for duplicated field in the same entry:
  if (fnam in ef) {
    data_error(("duplicated field \"" fnam "\" = {" ef[fnam] "} in entry"));
  }
  # Remove spurious blanks from value:
  gsub(/^[ ]+/, "", fval);
  gsub(/[ ]+$/, "", fval);
  gsub(/[ ][ ]+/, " ", fval);
  # Save field value in original case:
  ef[fnam] = fval;
}

function flush_current_entry() {
  # Finishes the entry being parsed (if any): checks that it was
  # properly terminated, evaluates the query on it, and writes it
  # to standard output if it was selected.
  if (! eini) { return; }
  if (lin != "") {
    data_error(("incomplete field in entry \"" ef["key"] "\""));
  }
  if (! efin) {
    data_error(("missing close brace in entry \"" ef["key"] "\""));
  }
  # Evaluate the query terms on the entry:
  esel = check_query();
  # Apply {invert} option:
  if (invert) { esel = (! esel); }
  # If the entry was selected, dump it:
  if (esel) { output_entry(); nwritten++; }
}

function check_query(   q,fnam) {
  # Returns 1 if any term of the query succeeds on the current
  # entry {ef}, 0 if none does (in particular, if the query is empty).
  for (q in qf) {
    if (qf[q] == "*") {
      # Apply test to all existing fields, take 'or':
      for (fnam in ef) {
        if (check_query_term(fnam, qo[q], qv[q])) { return 1; }
      }
    } else {
      fnam = qf[q];
      if (check_query_term(fnam, qo[q], qv[q])) { return 1; }
    }
  }
  # No tests succeeded:
  return 0;
}

function check_query_term(fnam,qop,qval,   fval) {
  # Checks whether field {fnam} of the current entry satisfies the
  # term {qop} {qval}.  Returns 1 if so, 0 otherwise.
  # The comparison ignores letter case.

  # printf "%% checking %s %s %s\n", fnam, qop, qval > "/dev/stderr";

  # Fold the query value too, otherwise a query with uppercase
  # letters could never match the lowercased field value:
  qval = tolower(qval);
  if (fnam in ef) {
    fval = tolower(ef[fnam]);
    if (qop == "~") {
      # An empty {qval} is contained in any value; test that explicitly
      # instead of depending on what {index(s,"")} returns.
      return ((qval == "") || (index(fval, qval) > 0));
    }
    if (qop == "=") { return (fval == qval); }
    # Operator "-" fails because the field exists:
    return 0;
  } else {
    # Field {fnam} is missing in entry:
    if (qop == "-") { return 1; }
    # As documented, "{FIELD} =" with empty value accepts a missing field:
    if ((qop == "=") && (qval == "")) { return 1; }
    return 0;
  }
}

function output_entry() {
  # Writes the current entry to standard output, with the most
  # common fields in a fixed order and any other fields after them.
  # Consumes (deletes) all fields of {ef}.
  printf "@%s{%s", ef["kind"], ef["key"];
  delete ef["kind"];
  delete ef["key"];
  output_field("author");
  output_field("title");
  output_field("journal");
  output_field("booktitle");
  output_field("school");
  output_field("institution");
  output_field("howpublished");
  output_field("location");
  output_field("editor");
  output_field("series");
  output_field("volume");
  output_field("number");
  output_field("pages");
  output_field("year");
  output_field("month");
  output_field("publisher");
  output_field("address");
  output_field("note");
  output_field("url");
  output_field("doi");
  output_field("issn");
  output_field("isbn");
  output_field("citations");
  output_field("comments");
  output_all_fields();
  printf "\n}\n\n";
}

function output_field(fnam,   fval) {
  # If field {fnam} of the current entry is defined,
  # prints it preceded by comma and newline, then
  # deletes it from the entry.
  if (fnam in ef) {
    fval = ef[fnam];
    printf ",\n ";
    print_field(fnam, fval);
    delete ef[fnam];
  }
}

function print_field(fnam,fval,   naked) {
  # Prints the string "{fnam} = {fval}", adding braces
  # around the {fval} if appropriate.

  # Decide whether to leave the value naked:
  naked = 0;
  if ((fnam == "year") && (fval ~ /^[12][0-9][0-9][0-9]$/)) { naked = 1; }
  if ((fnam == "month") && (fval ~ /^[a-z][a-z][a-z]$/)) { naked = 1; }
  if ((fnam == "language") && (fval ~ /^[a-z][a-z][a-z][a-z]$/)) { naked = 1; }
  if (naked) {
    printf "%s = %s", fnam, fval;
  } else {
    printf "%s = {%s}", fnam, fval;
  }
}

function output_all_fields(   fnam,fval) {
  # Prints all remaining fields of the current entry, each preceded
  # by comma and newline, in unspecified order; then deletes them.
  for (fnam in ef) {
    fval = ef[fnam];
    printf ",\n ";
    print_field(fnam, fval);
  }
  # Delete all fields:
  split("", ef);
}

function process_line(lin,   tmp) {
  # Processes the {lin} string.
  #
  # Apart from blanks, the line must begin with '}' (end of entry)
  # or with ',' followed by a pair "{FIELDNAME} = {VALUE}".
  # The value must be delimited by braces (not quotes)
  # and must be balanced with respect to braces;
  # or must be a single alphanumeric token.
  # Each such pair is saved with {save_entry_field}.
  #
  # Only complete field-value pairs are processed.
  # Any unparsed characters are returned as the
  # result of the call.

  # printf "  process_line(\"%s\")\n", lin > "/dev/stderr";
  while (1) {
    # Remove leading blanks:
    gsub(/^[ ]+/, "", lin);
    # Check for end-of-entry brace:
    if (eini && (! efin) && match(lin, /^[\}][ ]*/)) {
      efin = 1;
      lin = substr(lin, RSTART + RLENGTH);
    }
    if (lin == "") { return lin; }
    if (! eini) {
      data_error(("spurious material before entry's head: \"" lin "\""));
      lin = "";
      return lin;
    }
    if (efin) {
      data_error(("spurious material after end-of-entry: \"" lin "\""));
      lin = "";
      return lin;
    }
    if (match(lin, /^[,]/)) {
      tmp = process_next_field(lin);
      if (tmp == lin) {
        # Incomplete field; wait for more input:
        return lin;
      } else {
        # Parsed one field, try again:
        lin = tmp;
      }
    } else {
      data_error(("syntax error (missing comma?) at \"" lin "\""));
      lin = "";
      return lin;
    }
  }
}

function process_next_field(lin,   sav,fnam,fval,n,k,d,c) {
  # Processes the next field in the {lin} string.
  #
  # Apart from blanks, the line must begin with ','
  # followed by a pair "{FIELDNAME} = {VALUE}".
  # The value must be delimited by braces (not quotes)
  # and must be balanced with respect to braces,
  # or must be a single alphanumeric token.
  #
  # If the full field is present, saves it with
  # {save_entry_field}, and returns the rest of {lin}
  # as result.  If the field is incomplete,
  # returns {lin} unchanged.
  # If {lin} begins with something else,
  # prints an error and terminates the program.

  # printf "  process_next_field(\"%s\")\n", lin > "/dev/stderr";

  # Save {lin}, in case it is an incomplete field:
  sav = lin;

  # Parse leading comma:
  if (! match(lin, /^[,][ ]*/)) {
    data_error(("syntax error (missing comma?) at \"" lin "\""));
    lin = "";
    return lin;
  }
  lin = substr(lin, RSTART + RLENGTH);
  if (lin == "") {
    # The field is not complete:
    return sav;
  }

  # Parse field name {fnam}:
  if (! match(lin, /^[a-zA-Z0-9]+/)) {
    data_error(("syntax error (missing field name?) at \"" lin "\""));
    lin = "";
    return lin;
  }
  fnam = substr(lin, RSTART, RLENGTH);
  lin = substr(lin, RSTART + RLENGTH);
  if (lin == "") {
    # The field is not complete:
    return sav;
  }

  # Parse the "=":
  if (! match(lin, /^[ ]*[=][ ]*/)) {
    data_error(("syntax error (missing equal sign?) at \"" lin "\""));
    lin = "";
    return lin;
  }
  lin = substr(lin, RSTART + RLENGTH);
  if (lin == "") {
    # The field is not complete:
    return sav;
  }

  # Parse the opening brace of value field:
  if (match(lin, /^[\{][ ]*/)) {
    # The field's value is a brace-delimited string:
    lin = substr(lin, RSTART + RLENGTH);
    # Parse the field's value {fval}:
    n = length(lin);
    k = 1;  # Character index.
    d = 1;  # Brace depth (the opening brace was already consumed).
    while (k <= n) {
      c = substr(lin, k, 1);
      if (c == "{") {
        d++;
      } else if (c == "}") {
        d--;
      }
      if (d == 0) {
        # Field value is complete; character {k} is its closing brace:
        fval = substr(lin, 1, k-1);
        lin = substr(lin, k+1);
        save_entry_field(fnam, fval);
        return lin;
      }
      k++;
    }
    # The field's value is not complete:
    return sav;
  } else if (match(lin, /^[-_A-Za-z0-9]+/)) {
    # The field's value is a naked token:
    fval = substr(lin, RSTART, RLENGTH);
    lin = substr(lin, RSTART + RLENGTH);
    save_entry_field(fnam, fval);
    return lin;
  } else {
    data_error(("syntax error (missing open brace?) at \"" lin "\""));
    lin = "";
    return lin;
  }
}

END {
  # An {exit} executed elsewhere still runs this rule.  If an error
  # was reported, stop here instead of flushing a half-parsed entry
  # and printing statistics:
  if (abort >= 0) { exit abort; }
  flush_current_entry();
  printf "%% read %d entries\n", nread > "/dev/stderr";
  printf "%% wrote %d entries\n", nwritten > "/dev/stderr";
}

function data_error(msg) {
  # Reports an error in the input data, with file name and line
  # number, and terminates the program with status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function arg_error(msg) {
  # Reports an error in the command line arguments, prints the
  # usage message, and terminates the program with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  # The help text is in {USAGE} (awk variable names are case-sensitive):
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg) {
  # Reports an internal program error and terminates with status 1.
  printf "** program error: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function parse_query_string(qstring,   nterms,trm,i) {
  # Parses a list of query terms from {qstring}, and
  # appends the terms to the current query defined by the
  # global variables {nq,qf[0..nq-1],qo[0..nq-1],qv[0..nq-1]}.

  # Split string at semicolons, vertical bars, or newlines:
  nterms = split(qstring, trm, /[|;\012]/);
  # Process each term:
  for (i = 1; i <= nterms; i++) {
    parse_query_term("", i, trm[i]);
  }
  # Debugging and warnings:
  # printf "query string contains %d terms\n", nterms > "/dev/stderr"
  if (nterms == 0) {
    query_warning("", i, ("query string is empty"));
  }
}

function parse_query_file(fname,   nlin,lin,nterms,status) {
  # Reads a list of query terms from file {fname}, and
  # appends the terms to the current query defined by the
  # global variables {nq,qf[0..nq-1],qo[0..nq-1],qv[0..nq-1]}.
  # A file that cannot be read is a fatal error.

  # Read file, line by line:
  nlin = 0;
  nterms = 0;
  while ((status = (getline lin < fname)) > 0) {
    # One more line read:
    nlin++;
    if (! match(lin, /^[ \011]*([\#]|$)/)) {
      # Line is not a comment or blank, parse it as a term:
      parse_query_term(fname,nlin,lin);
      # Keep count of how many terms we got:
      nterms++;
    }
  }
  # {getline} returns 0 at end of file and -1 on error.  Test that
  # status rather than the contents of {ERRNO}, whose "no error"
  # value is not the same in all gawk versions:
  if (status < 0) {
    query_error(fname, nlin, (ERRNO != "" ? ERRNO : "cannot read query file"));
  }
  close(fname);
  if (nlin == 0) {
    query_warning(fname, nlin, ("query file is empty or missing"));
  }
  # printf "query file contains %d terms\n", nterms > "/dev/stderr"
}

function parse_query_term(fname,nlin,term,   fn,op,fv) {
  # Splits {term} into "{FIELD} {OP} {VALUE}", and appends it
  # to the query defined by the global variables {nq,qf,qo,qv}.
  # The {fname} and {nlin} arguments are used only in error messages.
  # The operator is the first '-', '=' or '~' in {term}.
  if (! match(term, /[-=~]/)) {
    query_error(fname,nlin, ("missing operator in query term \"" term "\""));
  } else {
    # Extract the term's operator and arguments:
    fn = substr(term, 1, RSTART-1);      gsub(/[ ]/, "", fn);
    op = substr(term, RSTART, RLENGTH);  gsub(/[ ]/, "", op);
    fv = substr(term, RSTART+RLENGTH);   gsub(/[ ]/, "", fv);
    # Entry field names are stored in lowercase, so fold the
    # query's field name too ("*" is unaffected):
    fn = tolower(fn);
    # Some syntax checks:
    if (fn == "") {
      query_error(fname, nlin, ("missing field name in query term \"" term "\""));
    }
    if (op !~ /^[-=~]$/) {
      query_error(fname, nlin, ("bad operator \"" op "\" in query term"));
    }
    # Append the test to the global query:
    qf[nq] = fn;
    qo[nq] = op;
    qv[nq] = fv;
    nq++;
  }
}

function query_error(f,n,msg) {
  # Reports a fatal error in the query and terminates with status 1.
  query_message(f,n,msg);
  abort = 1;
  exit 1
}

function query_warning(f,n,msg) {
  # Reports a non-fatal problem in the query.
  query_message(f,n, ("warning -- " msg))
}

function query_message(f,n,msg) {
  # Prints {msg} to standard error, preceded by the location of the
  # offending term: file {f} and line {n}, or, if {f} is empty,
  # the index {n} of the term in the query string.
  # The prefix goes to standard error too, so that it does not get
  # mixed with the BibTeX entries written to standard output.
  if (f != "") {
    printf "file %s, line %d: ", f, n > "/dev/stderr";
  } else {
    printf "query term %d: ", n > "/dev/stderr";
  }
  printf "%s\n", msg > "/dev/stderr";
}