#! /usr/bin/gawk -f
# Last edited on 2003-07-04 02:59:46 by stolfi

BEGIN {

  usage = ( "cat nec-hits.html | extract-papers-from-nec-hits > hits-raw.bib" );
  
  # 
  # Reads an HTML page returned by NEC CiteSeer in response to a
  # [Citations search] query.  Extracts the paper entries in a format
  # vaguely reminiscent of BibTeX.
  # 
  # A typical entry looks like this:
  #
  #   <br><a href="http://citeseer.nj.nec.com/context/22340/0">Context</a>
  #   &nbsp; <span class=i>Doc</span> &nbsp; &nbsp; 
  #   <i><b>219.7</b> 209 (6): </i> &nbsp; L. Guibas and <b>J</b>. <b>Stolfi</b>.
  #   <i>Primitives for the manipulation of general subdivisions and the
  #   computation of Voronoi diagrams</i>. ACM Trans. on Graphics,
  #   4(2):74--123, April 1985.
  # 
  # except that it is all on a single line.  The string
  #
  #   <span class=i>Doc</span>
  #
  # may be replaced by a link like
  #
  #   <a href="http://citeseer.nj.nec.com/snoeyink93objects.html">Doc</a>
  #
  # if the paper has an entry in the database. In this case there is a 
  # newline after the </a>. 
  #
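  # For the sample entry above, the output record should look roughly
  # like this (the "??" after "@necitem{" is left as a placeholder for a
  # hand-assigned key; "docurl" and "necmisc" are omitted since the entry
  # has no "Doc" link and no unparsed material):
  #
  #   @necitem{??
  #     neckey = {22340-0}
  #     necauthor = {L. Guibas and J. Stolfi}
  #     nectitle = {Primitives for the manipulation of general subdivisions and the computation of Voronoi diagrams.}
  #     necwhere = {ACM Trans. on Graphics, 4(2):74--123, April 1985.}
  #     citations = {NEC: 219.7 209 (6)}
  #     ctxurl = {{\url{http://citeseer.nj.nec.com/context/22340/0}}}
  #   }
  #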
  split("", fld);
  
  printf "%% Created by extract-papers-from-nec-hits on files:\n"; 
  for (i = 1; i < ARGC; i++) { printf "%%   %s\n", ARGV[i]; }
  printf "\n"; 
  
}

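# Main rule: each input line that contains both a "Context" link and a
# "Doc" marker is one search hit; parse it and emit one "@necitem" record.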
/>Context<.*>Doc</ { 
  lin = cleanup_html_crud($0);
  
  # If "Doc" is a link, the rest of the entry is on the next input line; join them:
  if (match(lin, />Doc<[\/]a>/))
    { if ((getline) > 0) { lin = ( lin " " cleanup_html_crud($0) ); } }
  
  # The entry should now look like this:
  # 
  #   <a href="http://citeseer.nj.nec.com/context/22340/0">Context</a>
  #   <span class=i>Doc</span> <i>219.7 209 (6): </i> L. Guibas and J.
  #   Stolfi. <i>Primitives for the manipulation of general
  #   subdivisions and the computation of Voronoi diagrams</i>. ACM
  #   Trans. on Graphics, 4(2):74--123, April 1985.
  
  # Placeholder for any unparsed material left in the line:
  misc = "???";
  
  # Extract the URL of the citations-in-context for this paper: 
  ctxurl = "???"
  if (match(lin, /^[ ]*<a[ ]*href=[\"]([^\"]*)[\"]>Context<[\/]a>/,fld))
    { ctxurl = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_error(("cannot find Context link")); }
    
  # Remove "Doc" link (or placeholder): 
  docurl = "???";
  if (match(lin, /^[ ]*<a[ ]*href=[\"]([^\"]*)[\"]>Doc<[\/]a>/, fld))
    { docurl = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else if (match(lin, /^[ ]*<span[^<>]*>Doc<[\/]span>/))
    { lin = substr(lin,RSTART+RLENGTH); }
  else
    { data_error(("cannot find Doc link")); }
    
  # Extract the number of citations:
  ncites = "???"
  if (match(lin, /^[ ]*<i>[ ]*([.0-9 ]*[ ]+[.0-9 ]*[(][.0-9 ]*[)])[ ]*[:][ ]*<[\/]i>/, fld))
    { ncites = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_error(("cannot find citation count")); }
    
  # Extract the authors' names:
  auths = "???"
  if (match(lin, /^ *([^<>]*)<i>/, fld))
    { auths = fld[1];
      # Remove final punctuation (unless abbrev dot): 
      auths = gensub(/([a-zA-Z][a-zA-Z])[ .]+$/, "\\1", "s", auths);
      auths = normalize_spaces(auths);
      lin = substr(lin,RSTART+RLENGTH-3);
    }
  else
    { data_warning(("cannot find authors"));
      misc = lin; lin = ""; 
    }
    
  # Extract the paper title:
  title = "???"
  if (match(lin, /^[ ]*<i>([^<>]*)<[\/]i>([. ]*)/, fld))
    { title = normalize_spaces((fld[1] fls[2]));
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_warning(("cannot find title"));
      misc = lin; lin = ""; 
    }
    
  # Extract the bibliography data:
  where = "???"
  if (match(lin, /^[ ]*([^<>]*)$/, fld))
    { where = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_error(("cannot find location"));
      misc = lin; lin = ""; 
    }
    
  # Extract a "NEC ID number" from the context link:
  if (ctxurl == "???")
    { key = "??"; }
  else if (match(ctxurl, /[\/]context[\/]([0-9\/]+)$/, fld))
    { key = fld[1]; gsub(/[\/]/, "-", key); }
  else
    { data_warning(("weird ctxurl")); }
    
  # Output entry:
  printf "@necitem{??\n";
  printf "  neckey = {%s}\n", key;
  printf "  necauthor = {%s}\n", auths;
  printf "  nectitle = {%s}\n", title;
  if (where != "???") { printf "  necwhere = {%s}\n", where; }
  if (misc != "???") { printf "  necmisc = {%s}\n", misc; }
  printf "  citations = {NEC: %s}\n", ncites;
  if (docurl != "???") { printf "  docurl = {{\\url{%s}}}\n", docurl; }
  printf "  ctxurl = {{\\url{%s}}}\n", ctxurl;
  printf "}\n"
  printf "\n"

  next;
}

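# Ignore all other input lines: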
// { next; }

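# Removes HTML clutter ("&nbsp;", "<br>", and bold tags) from {lin}
# and returns the cleaned-up string: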
function cleanup_html_crud(lin)
{
  # Remove funny spaces and line breaks:
  gsub(/[&]nbsp[;]/, " ", lin);
  gsub(/<br>/, " ", lin);
  
  # Remove boldface marks (seem to be superfluous for parsing):
  gsub(/<[\/]*b>/, "", lin);
  
  return lin;
}

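# Collapses runs of blanks in {str}, removes blanks before punctuation,
# and trims stray punctuation and blanks at the ends; returns the result: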
function normalize_spaces(str)
{
  gsub(/^[ ,;:.]+/, "", str);
  gsub(/[ ,;:]+[.]/, ".", str);
  gsub(/[ ,;]+[:]/, ":", str);
  gsub(/[ ,]+[;]/, ";", str);
  gsub(/[ ]+[,]/, ",", str);
  gsub(/[ ]+/, " ", str);
  gsub(/[ ,;:]+$/, "", str);
  return str;
}

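# Reports a recoverable parsing problem on stderr, together with the
# partially parsed line {lin} and the raw input record; processing continues: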
function data_warning(msg)
{
  printf "%s:%d: ++ Warning: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "   lin = «%s»\n", lin > "/dev/stderr";
  printf "   $0 =  «%s»\n", $0 > "/dev/stderr";
}

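# Reports a fatal parsing problem on stderr, together with the partially
# parsed line {lin} and the raw input record, and aborts the run: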
function data_error(msg)
{
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "   lin = «%s»\n", lin > "/dev/stderr";
  printf "   $0 =  «%s»\n", $0 > "/dev/stderr";
  abort = -1;
  exit abort;
}