#! /usr/bin/gawk -f
# Last edited on 2007-08-12 23:15:34 by stolfi

BEGIN {
  # Command-line synopsis:
  USAGE = \
    "extract-bibtex-entries \\\n" \
    "  [ -v query={QUERY} ] \\\n" \
    "  [ -v invert={BOOL} ] \\\n" \
    "  [ {BIBFILE}.. ]";

  # PURPOSE
  #
  # Reads the BIBTEX files "{BIBFILE}.." (or standard input, if no
  # "{BIBFILE}" is given) and writes to standard output the entries
  # selected by a query.
  #
  # THE QUERY
  #
  # The {query} variable is either a list of /terms/ separated by
  # semicolons, vertical bars, or newlines; or a string of the form
  # "@{QUERYFILE}", where "{QUERYFILE}" is the name of a file with one
  # term per line (blank lines and lines that begin with "#" are
  # skipped in that file).
  #
  # Each term has the form
  #
  #    {FIELD} {OP} {VALUE}
  #
  # where {FIELD} is the name of a BIBTEX field ("author", "title",
  # "year", etc.), {OP} is one of the characters "=", "~", "-", and
  # {VALUE} is a string.  Blanks are removed from all three parts, and
  # {VALUE} may be empty.  The special {FIELD}s "kind" and "key" stand
  # for the entry type ("article", "inproceedings", etc.) and for the
  # citation key.  If {FIELD} is "*", the test is applied to every
  # field that is present in the entry.
  #
  # The test performed depends on {OP}:
  #
  #   "=" The field's value must be identical to "{VALUE}".
  #   "~" The field's value must contain the string "{VALUE}".
  #   "-" The field must be missing in the entry.
  #
  # The entry's value is folded to lowercase before the "=" and "~"
  # comparisons; see {check_query_term} for the exact rules.
  #
  # SELECTION
  #
  # If {invert} is 0 (default), an entry is selected if ANY of the
  # tests succeeds for it.  In particular, if the query has no terms,
  # no entries are selected.
  #
  # If {invert} is 1, an entry is selected if NO test succeeds on it.
  # In particular, if the query has no terms, all entries are selected.
  #
  # In either case, all selected entries are written to standard
  # output.  Blank lines and "%" comment lines in the input are
  # discarded.

  abort = -1;  # Set to a non-negative exit status by the error handlers.

  # Validate the {invert} flag and make it numeric:
  if (invert == "") { invert = 0; }
  if (invert !~ /^[01]$/) { arg_error("invalid {invert} argument"); }
  invert += 0;

  # The query is kept in three parallel tables:
  nq = 0;        # Number of terms in the query.
  split("", qf); # {qf[0..nq-1]} are the {FIELD}s to test.
  split("", qo); # {qo[0..nq-1]} are the corresponding {OP}s.
  split("", qv); # {qv[0..nq-1]} are the corresponding {VALUE}s.
  if (query !~ /^[@]/)
    { parse_query_string(query); }
  else
    { parse_query_file(substr(query, 2)); }
  # dump_query();

  ncmt = 0;       # Number of saved comment lines.
  split("", cmt); # Saved comment lines are {cmt[0..ncmt-1]}.

  nread = 0;     # Number of BibTEX entries read.
  nwritten = 0;  # Number of BibTEX entries written.

  clear_current_entry();
}

function dump_query(i)
{
  # Debugging aid: lists the parsed query terms {qf,qo,qv} on
  # standard error, one term per line.  {i} is a local variable.
  i = 0;
  while (i < nq)
    { printf "%% %s %s %s\n", qf[i], qo[i], qv[i] > "/dev/stderr";
      i++;
    }
}

function clear_current_entry(   )
{
  # Resets the per-entry parsing state, ready for a new entry.
  eini = 0;      # Becomes 1 once the entry's @-line has been seen.
  efin = 0;      # Becomes 1 once the entry's closing brace has been seen.
  esel = 0;      # Becomes 1 if the entry is selected for output.
  split("", ef); # Forget every field of the previous entry.
}

# Every line: turn tabs, form feeds and carriage returns into blanks,
# so that the rules below only need to deal with plain spaces.
{ gsub(/[\011\014\015]/, " ", $0); }

# Blank lines are ignored:
/^[ ]*$/ { next; }

# Comment lines (first nonblank character is "%") are ignored:
/^[ ]*[%]/ { next; }

# A line starting with "@" opens a new entry: finish the previous
# entry, then store the new entry's kind and key in {ef}.
/^ *[@]/ {
  flush_current_entry();
  nread++;
  clear_current_entry();
  eini = 1; # The entry's head has been seen.

  # Drop the "@" and the blanks around it:
  lin = $0;
  sub(/^ *[@] */, "", lin);

  if (match(lin, /^[A-Za-z0-9]+ *([\{]|$)/))
    { # Split off the entry's kind, without blanks and braces:
      tmp = substr(lin, RSTART, RLENGTH);
      lin = substr(lin, RSTART + RLENGTH);
      gsub(/[ {}]/, "", tmp);
      save_entry_field("kind", tmp);

      if (match(lin, /^[-_:.\/A-Za-z0-9]+ */))
        { # Split off the entry's key, without blanks:
          tmp = substr(lin, RSTART, RLENGTH);
          lin = substr(lin, RSTART + RLENGTH);
          gsub(/[ ]/, "", tmp);
          save_entry_field("key", tmp);
        }
      else
        { data_error(("malformed key")); curkey = ""; }
    }
  else
    { data_error(("malformed entry kind")); curkind = ""; curkey = ""; }

  # Whatever follows the key is parsed as fields; the unparsed
  # remainder is kept in {lin} for the next input line.
  lin = process_line(lin);
  next;
}

# Any other line is taken to continue the current entry: append it
# to the still-unparsed text {lin} and try to parse more fields.
{
  lin = process_line(lin " " $0);
  next;
}

function save_entry_field(fnam,fval)
{
  # Stores {fval} as the value of field {fnam} in the current entry {ef}.
  #
  # The field name is folded to lowercase.  The value keeps its
  # original case, but loses leading and trailing blanks and has each
  # run of blanks squeezed to a single blank.  A field that is already
  # present in the entry is reported through {data_error}.

  # printf "      save_entry_field(\"%s\", \"%s\")\n", fnam, fval > "/dev/stderr";
  fnam = tolower(fnam);
  if (fnam in ef)
    { data_error(("duplicated field \"" fnam "\" = {" ef[fnam] "} in entry")); }

  # Tidy up the blanks in the value:
  sub(/^[ ]+/, "", fval);
  sub(/[ ]+$/, "", fval);
  gsub(/[ ][ ]+/, " ", fval);

  ef[fnam] = fval;
}

function flush_current_entry()
{
  # Finishes the entry that is being parsed, if any: checks that it
  # was complete, evaluates the query on it, and writes it out (and
  # counts it in {nwritten}) if it turns out to be selected.
  if (eini)
    { # The entry must have no pending text and must have been closed:
      if (lin != "")
        { data_error(("incomplete field in entry \"" ef["key"] "\"")); }
      if (! efin)
        { data_error(("missing close brace in entry \"" ef["key"] "\"")); }

      # The {invert} option flips the outcome of the query:
      esel = (invert ? (! check_query()) : check_query());

      if (esel) { output_entry(); nwritten++; }
    }
}

function check_query(   q,fnam)
{
  # Returns 1 if at least one term of the query {qf,qo,qv} succeeds
  # on the current entry {ef}; returns 0 otherwise (in particular,
  # when the query has no terms).
  for (q = 0; q < nq; q++)
    { if (qf[q] != "*")
        { # Ordinary term: test only the named field.
          if (check_query_term(qf[q], qo[q], qv[q])) { return 1; }
        }
      else
        { # Wildcard term: succeeds if any existing field passes.
          for (fnam in ef)
            { if (check_query_term(fnam, qo[q], qv[q])) { return 1; } }
        }
    }
  return 0;
}

function check_query_term(fnam,qop,qval,  fval)
{
  # Tests field {fnam} of the current entry {ef} against the query
  # term "{qop} {qval}".  Returns 1 if the test succeeds, 0 if not.
  #
  #   "=" succeeds if the field's value is identical to {qval}; a
  #       missing field counts as an empty value, so it matches an
  #       empty {qval}.
  #   "~" succeeds if the field is present and its value contains
  #       {qval}; an empty {qval} is contained in any value.
  #   "-" succeeds if the field is missing from the entry.
  #
  # Field names and values are compared without regard to case.
  # Any other {qop} makes the test fail.

  # printf "%% checking %s %s %s\n", fnam, qop, qval > "/dev/stderr";

  # Field names are stored in lowercase by {save_entry_field} and the
  # entry's value is lowercased below, so the query side must be
  # folded too; otherwise a query written with capitals never matches.
  fnam = tolower(fnam);
  qval = tolower(qval);

  if (! (fnam in ef))
    { # Field {fnam} is missing in entry:
      if (qop == "-") { return 1; }
      if ((qop == "=") && (qval == "")) { return 1; }
      return 0;
    }

  fval = tolower(ef[fnam]);
  if (qop == "~")
    { # The empty needle is tested explicitly, so that the result
      # does not hinge on what index() returns for an empty string.
      if (qval == "") { return 1; }
      return (index(fval, qval) > 0);
    }
  if (qop == "=")
    { return (fval == qval); }
  return 0;
}

function output_entry(   order,n,i)
{
  # Writes the current entry {ef} to standard output, as
  #
  #   @{kind}{{key},
  #     {field} = {value},
  #     ...
  #   }
  #
  # followed by a blank line.  The well-known fields come first, in
  # the fixed order listed below; any other fields follow, printed by
  # {output_all_fields}.  The entry {ef} is emptied in the process.
  # {order}, {n} and {i} are local variables.

  printf "@%s{%s", ef["kind"], ef["key"];
  # The kind and key are not ordinary fields; remove them so that
  # they are not printed again with the leftovers.
  delete ef["kind"];
  delete ef["key"];

  # Preferred order of the well-known fields:
  n = split( \
    "author title journal booktitle school institution howpublished " \
    "location editor series volume number pages year month publisher " \
    "address note url doi issn isbn citations comments", \
    order, " ");
  for (i = 1; i <= n; i++) { output_field(order[i]); }

  output_all_fields();
  printf "\n}\n\n";
}

function output_field(fnam,  fval)
{
  # Prints field {fnam} of the current entry, if it is defined,
  # preceded by a comma and a newline; then removes it from {ef} so
  # that it is not printed a second time.  Does nothing if the entry
  # has no such field.
  if (! (fnam in ef)) { return; }
  fval = ef[fnam];
  delete ef[fnam];
  printf ",\n  ";
  print_field(fnam, fval);
}

function print_field(fnam,fval,  naked)
{
  # Prints "{fnam} = {fval}" to standard output, with no newline.
  # The value is enclosed in braces, except in these cases, where it
  # is printed bare:
  #
  #   a "year" of four digits that starts with 1 or 2;
  #   a "month" of exactly three lowercase letters;
  #   a "language" of exactly four lowercase letters.

  naked = (((fnam == "year") && (fval ~ /^[12][0-9][0-9][0-9]$/)) ||
           ((fnam == "month") && (fval ~ /^[a-z][a-z][a-z]$/)) ||
           ((fnam == "language") && (fval ~ /^[a-z][a-z][a-z][a-z]$/)));

  if (! naked)
    { printf "%s = {%s}", fnam, fval; }
  else
    { printf "%s = %s", fnam, fval; }
}

function output_all_fields(   names,n,i)
{
  # Prints every field still present in the current entry {ef}, each
  # preceded by a comma and a newline; then deletes all fields.
  #
  # The fields are printed in sorted order of their names, so that
  # the output does not depend on the arbitrary order in which
  # "for (x in array)" visits the elements.  {names}, {n} and {i}
  # are local variables, so no global is modified besides {ef}.

  # {asorti} puts the sorted indices of {ef} in {names[1..n]}:
  n = asorti(ef, names);
  for (i = 1; i <= n; i++)
    { printf ",\n  ";
      print_field(names[i], ef[names[i]]);
    }
  # Delete all fields:
  split("", ef);
}

function process_line(lin,   tmp)
{
  # Processes the {lin} string, which is the not yet parsed text of
  # the current entry.
  #
  # Apart from blanks, the text must begin with '}' (end of entry),
  # optionally preceded by a comma (BibTeX allows a trailing comma
  # after the last field); or with ',' followed by a pair
  # "{FIELDNAME} = {VALUE}".  The value must be delimited by braces
  # (not quotes) and must be balanced with respect to braces; or must
  # be a single alphanumeric token.  Each such pair is saved with
  # {save_entry_field}.
  #
  # Only complete field-value pairs are processed.  Any unparsed
  # characters are returned as the result of the call, so that the
  # caller can prepend them to the next input line.  Syntax errors
  # are reported through {data_error}.

  # printf "  process_line(\"%s\")\n", lin > "/dev/stderr";
  while (1)
    { # Remove leading blanks:
      gsub(/^[ ]+/, "", lin);

      # Check for the end-of-entry brace, possibly after a trailing comma:
      if (eini && (! efin) && match(lin, /^([,][ ]*)?[\}][ ]*/))
        { efin = 1;
          lin = substr(lin, RSTART + RLENGTH);
        }
      if (lin == "") { return lin; }
      if (! eini)
        { data_error(("spurious material before entry's head: \"" lin "\""));
          lin = ""; return lin;
        }
      if (efin)
        { data_error(("spurious material after end-of-entry: \"" lin "\""));
          lin = ""; return lin;
        }
      if (match(lin, /^[,]/))
        { tmp = process_next_field(lin);
          if (tmp == lin)
            { # Incomplete field; wait for more input:
              return lin;
            }
          else
            { # Parsed one field, try again:
              lin = tmp;
            }
        }
      else
        { data_error(("syntax error (missing comma?) at \"" lin "\""));
          lin = ""; return lin;
        }
    }
}

function process_next_field(lin,  sav,fnam,fval,n,k,d,c)
{
  # Parses one field from the front of {lin}, which must consist of
  # a comma, a field name, an equal sign, and a value.  The value is
  # either a brace-delimited string with balanced inner braces
  # (quotes are not supported) or a single naked token made of
  # letters, digits, "-" and "_".
  #
  # If the whole field is there, it is stored with {save_entry_field}
  # and the text that follows it is returned.  If {lin} ends before
  # the field is complete, {lin} is returned unchanged, so that the
  # caller can retry after appending more input.  Anything else is
  # reported through {data_error}, and the result is the empty string.

  # printf "    process_next_field(\"%s\")\n", lin > "/dev/stderr";

  # Remember the original text, to return it if the field is incomplete:
  sav = lin;

  # Step 1: the leading comma.
  if (! match(lin, /^[,][ ]*/))
    { data_error(("syntax error (missing comma?) at \"" lin "\""));
      return "";
    }
  lin = substr(lin, RSTART + RLENGTH);
  if (lin == "") { return sav; }

  # Step 2: the field name {fnam}.
  if (! match(lin, /^[a-zA-Z0-9]+/))
    { data_error(("syntax error (missing field name?) at \"" lin "\""));
      return "";
    }
  fnam = substr(lin, RSTART, RLENGTH);
  lin = substr(lin, RSTART + RLENGTH);
  if (lin == "") { return sav; }

  # Step 3: the "=" sign, with optional blanks around it.
  if (! match(lin, /^[ ]*[=][ ]*/))
    { data_error(("syntax error (missing equal sign?) at \"" lin "\""));
      return "";
    }
  lin = substr(lin, RSTART + RLENGTH);
  if (lin == "") { return sav; }

  # Step 4: the value {fval}.
  if (match(lin, /^[\{][ ]*/))
    { # Brace-delimited value: scan for the matching close brace.
      lin = substr(lin, RSTART + RLENGTH);
      n = length(lin);
      d = 1; # Current brace depth; the opening brace is already counted.
      for (k = 1; k <= n; k++)
        { c = substr(lin, k, 1);
          if (c == "{")
            { d++; }
          else if (c == "}")
            { d--;
              if (d == 0)
                { # Character {k} closes the value:
                  fval = substr(lin, 1, k-1);
                  save_entry_field(fnam, fval);
                  return substr(lin, k+1);
                }
            }
        }
      # Ran out of text before the closing brace:
      return sav;
    }

  if (match(lin, /^[-_A-Za-z0-9]+/))
    { # Naked token:
      fval = substr(lin, RSTART, RLENGTH);
      save_entry_field(fnam, fval);
      return substr(lin, RSTART + RLENGTH);
    }

  data_error(("syntax error (missing open brace?) at \"" lin "\""));
  return "";
}

END {
  # An "exit" statement executed in BEGIN or in a main rule does not
  # skip the END rule.  The error handlers set {abort} to the exit
  # status before exiting; in that case quit right away, instead of
  # flushing a half-parsed entry and printing the statistics.
  if (abort >= 0) { exit abort; }

  # Normal end of input: finish the last entry and report the counts.
  flush_current_entry();
  printf "%% read %d entries\n", nread > "/dev/stderr";
  printf "%% wrote %d entries\n", nwritten > "/dev/stderr";
}

function data_error(msg)
{
  # Reports an error in the input data: prints {msg} on standard
  # error, prefixed with the current file name and line number, then
  # requests program exit with status 1 (also recorded in {abort}).
  abort = 1;
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  exit abort;
}

function arg_error(msg)
{
  # Reports an error in the command-line arguments: prints {msg} and
  # the command synopsis on standard error, then requests program
  # exit with status 1 (also recorded in {abort}).
  printf "** %s\n", msg > "/dev/stderr";
  # The synopsis is stored in {USAGE}; awk variable names are
  # case-sensitive, so {usage} would be an unrelated, empty variable.
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1; exit abort;
}

function prog_error(msg)
{
  # Reports an internal inconsistency of the program: prints {msg}
  # on standard error, then requests program exit with status 1
  # (also recorded in {abort}).
  abort = 1;
  printf "** program error: %s\n", msg > "/dev/stderr";
  exit abort;
}

function parse_query_string(qstring,    nterms,trm,i)
{
  # Breaks {qstring} into terms at semicolons, vertical bars and
  # newlines, and appends each term, through {parse_query_term}, to
  # the query held in the global variables
  # {nq,qf[0..nq-1],qo[0..nq-1],qv[0..nq-1]}.
  # An empty {qstring} yields no terms and only a warning.

  nterms = split(qstring, trm, /[|;\012]/);
  # printf "query string contains %d terms\n", nterms > "/dev/stderr"
  if (nterms == 0)
    { query_warning("", 1, ("query string is empty"));
      return;
    }

  # Terms are numbered from 1 in error messages:
  i = 1;
  while (i <= nterms)
    { parse_query_term("", i, trm[i]);
      i++;
    }
}

function parse_query_file(fname,    nlin,lin,nterms,rc)
{
  # Reads a list of query terms from file {fname}, one term per line,
  # and appends the terms to the current query defined by the
  # global variables {nq,qf[0..nq-1],qo[0..nq-1],qv[0..nq-1]}.
  # Blank lines and lines whose first nonblank character is "#"
  # are skipped.
  #
  # A file that cannot be read is reported through {query_error};
  # a file with no lines at all only causes a warning.
  # {nlin}, {lin}, {nterms} and {rc} are local variables.

  nlin = 0;
  nterms = 0;
  # {getline} returns 1 for each line read, 0 at end of file, and
  # -1 on error; keep the result to tell the last two apart.
  while ((rc = (getline lin < fname)) > 0)
    { # One more line read:
      nlin++;
      if (lin !~ /^[ \011]*([\#]|$)/)
        { # Line is not a comment or blank, parse it as a term:
          parse_query_term(fname, nlin, lin);
          # Keep count of how many terms we got:
          nterms++;
        }
    }
  # Rely on the return value of {getline}, not on the contents of
  # {ERRNO}, to detect a failure: {ERRNO} is not "0" after a clean read.
  if (rc < 0)
    { query_error(fname, nlin, (ERRNO != "" ? ERRNO : "cannot read query file")); }
  close(fname);
  if (nlin == 0) { query_warning(fname, nlin, ("query file is empty or missing")); }
  # printf "query file contains %d terms\n", nterms > "/dev/stderr"
}

function parse_query_term(fname,nlin,term,  fn,op,fv)
{
  # Parses {term} as "{FIELD} {OP} {VALUE}" and appends the result
  # to the query held in the global variables {nq,qf,qo,qv}.
  #
  # The operator is the first "-", "=" or "~" character found in
  # {term}; the text before it is the field name and the text after
  # it is the value.  All blanks are removed from the three parts.
  # {fname} and {nlin} say where the term came from, and are used
  # only in error messages.

  if (! match(term, /[-=~]/))
    { query_error(fname,nlin, ("missing operator in query term \"" term "\""));
      return;
    }

  # Cut the term at the operator found by {match}:
  fn = substr(term, 1, RSTART-1);
  op = substr(term, RSTART, RLENGTH);
  fv = substr(term, RSTART+RLENGTH);
  gsub(/[ ]/, "", fn);
  gsub(/[ ]/, "", op);
  gsub(/[ ]/, "", fv);

  # Some syntax checks:
  if (fn == "") { query_error(fname, nlin, ("missing field name in query term \"" term "\"")); }
  if (op !~ /^[-=~]$/) { query_error(fname, nlin, ("bad operator \"" op "\" in query term")); }

  # Store the test as term number {nq} of the query:
  qf[nq] = fn; qo[nq] = op; qv[nq] = fv;
  nq++;
}

function query_error(f,n,msg)
{
  # Reports a fatal error in the query: prints {msg} through
  # {query_message}, with the location given by {f} and {n}, then
  # requests program exit with status 1 (also recorded in {abort}).
  abort = 1;
  query_message(f, n, msg);
  exit 1
}

function query_warning(f,n,msg)
{
  # Reports a non-fatal problem in the query: prints {msg}, tagged
  # with "warning -- ", through {query_message}.  Does not exit.
  msg = ("warning -- " msg);
  query_message(f, n, msg)
}

function query_message(f,n,msg)
{
  # Prints the message {msg} about the query on standard error.
  # If {f} is not empty, the message is located as line {n} of the
  # query file {f}; otherwise, as term number {n} of the query string.
  #
  # The location prefix and the text are written together to standard
  # error, so that no part of the message ends up in the BibTeX
  # output on standard output.
  if (f != "")
    { printf "file %s, line %d: %s\n", f, n, msg > "/dev/stderr"; }
  else
    { printf "query term %d: %s\n", n, msg > "/dev/stderr"; }
}