#! /usr/bin/gawk -f
# Last edited on 2003-07-04 02:59:46 by stolfi

BEGIN {
  usage = ( "cat nec-hits.html | extract-papers-from-nec-hits > hits-raw.bib" );
  #
  # Reads an HTML page returned by NEC Citeseer in response to a
  # [Citations search] query.  Extracts the papers in a format vaguely
  # reminiscent of Bibtex.
  #
  # The typical entry looks like this
  #
  #   <br><a href="http://citeseer.nj.nec.com/context/22340/0">Context</a>
  #   <span class=i>Doc</span>
  #   <i><b>219.7</b> 209 (6): </i> L. Guibas and <b>J</b>. <b>Stolfi</b>.
  #   <i>Primitives for the manipulation of general subdivisions and the
  #   computation of Voronoi diagrams</i>. ACM Trans. on Graphics,
  #   4(2):74--123, April 1985.
  #
  # except that it is all in a single line.  The string
  #
  #   <span class=i>Doc</span>
  #
  # may be replaced by a link like
  #
  #   <a href="http://citeseer.nj.nec.com/snoeyink93objects.html">Doc</a>
  #
  # if the paper has an entry in the database.  In this case there is a
  # newline after the </a>.
  #
  # Diagnostics go to /dev/stderr.  A data error terminates the program
  # with a nonzero exit status; a data warning lets processing continue.

  # Make sure {fld} is known to be an array before it is passed to match():
  split("", fld);

  printf "%% Created by extract-papers-from-nec-hits on files:\n";
  for (i = 1; i < ARGC; i++) { printf "%%   %s\n", ARGV[i]; }
  printf "\n";
}

# Main rule: one citation entry per line that has both a "Context" link
# and a "Doc" link (or placeholder).
/>Context<.*>Doc</ {
  lin = cleanup_html_crud($0);

  # Join lines if Doc has a link (the page breaks the line after </a>):
  if (match(lin, />Doc<[\/]a>/)) {
    # getline leaves $0 untouched at end of input or on a read error, so
    # only append when a new line was actually read.
    if ((getline) > 0) {
      lin = ( lin " " cleanup_html_crud($0) );
    } else {
      data_warning(("missing continuation line after Doc link"));
    }
  }

  # The entry should now look like this:
  #
  #   <a href="http://citeseer.nj.nec.com/context/22340/0">Context</a>
  #   <span class=i>Doc</span> <i>219.7 209 (6): </i> L. Guibas and J.
  #   Stolfi. <i>Primitives for the manipulation of general
  #   subdivisions and the computation of Voronoi diagrams</i>. ACM
  #   Trans. on Graphics, 4(2):74--123, April 1985.

  # Unparsed stuff in line ("???" means "nothing left unparsed"):
  misc = "???";

  # Extract the URL of the citations-in-context for this paper:
  ctxurl = "???";
  if (match(lin, /^[ ]*<a[ ]*href=[\"]([^\"]*)[\"]>Context<[\/]a>/, fld)) {
    ctxurl = normalize_spaces(fld[1]);
    lin = substr(lin, RSTART+RLENGTH);
  } else {
    data_error(("cannot find Context link"));
  }

  # Remove "Doc" link (or placeholder):
  docurl = "???";
  if (match(lin, /^[ ]*<a[ ]*href=[\"]([^\"]*)[\"]>Doc<[\/]a>/, fld)) {
    docurl = normalize_spaces(fld[1]);
    lin = substr(lin, RSTART+RLENGTH);
  } else if (match(lin, /^[ ]*<span[^<>]*>Doc<[\/]span>/)) {
    lin = substr(lin, RSTART+RLENGTH);
  } else {
    data_error(("cannot find Doc link"));
  }

  # Extract the number of citations:
  ncites = "???";
  if (match(lin, /^[ ]*<i>[ ]*([.0-9 ]*[ ]+[.0-9 ]*[(][.0-9 ]*[)])[ ]*[:][ ]*<[\/]i>/, fld)) {
    ncites = normalize_spaces(fld[1]);
    lin = substr(lin, RSTART+RLENGTH);
  } else {
    data_error(("cannot find citation count"));
  }

  # Extract the authors' names:
  auths = "???";
  if (match(lin, /^ *([^<>]*)<i>/, fld)) {
    auths = fld[1];
    # Remove final punctuation (unless abbrev dot).  The pattern is
    # anchored at the end, so replacing the first match is sufficient:
    auths = gensub(/([a-zA-Z][a-zA-Z])[ .]+$/, "\\1", 1, auths);
    auths = normalize_spaces(auths);
    # Back up 3 characters so that the "<i>" that opens the title stays:
    lin = substr(lin, RSTART+RLENGTH-3);
  } else {
    data_warning(("cannot find authors"));
    # Keep the text of the first failure only; later stages see an empty
    # {lin} and must not wipe it out.
    if (misc == "???") { misc = lin; }
    lin = "";
  }

  # Extract the paper title, including its trailing punctuation:
  title = "???";
  if (match(lin, /^[ ]*<i>([^<>]*)<[\/]i>([. ]*)/, fld)) {
    title = normalize_spaces((fld[1] fld[2]));
    lin = substr(lin, RSTART+RLENGTH);
  } else {
    data_warning(("cannot find title"));
    if (misc == "???") { misc = lin; }
    lin = "";
  }

  # Extract the bibliography data (everything left, if free of tags):
  where = "???";
  if (match(lin, /^[ ]*([^<>]*)$/, fld)) {
    where = normalize_spaces(fld[1]);
    lin = substr(lin, RSTART+RLENGTH);
  } else {
    # data_error exits the program, so nothing else is needed here.
    data_error(("cannot find location"));
  }

  # Extract a "NEC ID number" from the context link:
  key = context_key(ctxurl);

  # Output entry:
  printf "@necitem{??\n";
  printf "  neckey = {%s}\n", key;
  printf "  necauthor = {%s}\n", auths;
  printf "  nectitle = {%s}\n", title;
  if (where != "???") { printf "  necwhere = {%s}\n", where; }
  if (misc != "???") { printf "  necmisc = {%s}\n", misc; }
  printf "  citations = {NEC: %s}\n", ncites;
  if (docurl != "???") { printf "  docurl = {{\\url{%s}}}\n", docurl; }
  printf "  ctxurl = {{\\url{%s}}}\n", ctxurl;
  printf "}\n";
  printf "\n";
  next;
}

# Any other line is ignored.
// { next; }

function context_key(url,    parts, k)
{
  # Derives the entry key from the Context URL {url}: the digits and
  # slashes that follow "/context/" at the end of the URL, with every
  # "/" turned into "-".  Returns "??" when {url} is the "???"
  # placeholder, and also (after a warning) when {url} does not have the
  # expected form, so the caller never sees a key left over from a
  # previous entry.
  if (url == "???") { return "??"; }
  if (match(url, /[\/]context[\/]([0-9\/]+)$/, parts)) {
    k = parts[1];
    gsub(/[\/]/, "-", k);
    return k;
  }
  data_warning(("weird ctxurl"));
  return "??";
}

function cleanup_html_crud(lin)
{
  # Returns {lin} with "&nbsp;" and "<br>" turned into blanks and the
  # boldface tags deleted.

  # Remove funny spaces and line breaks:
  gsub(/[&]nbsp[;]/, " ", lin);
  gsub(/<br>/, " ", lin);
  # Remove boldface marks (seem to be superfluous for parsing):
  gsub(/<[\/]*b>/, "", lin);
  return lin;
}

function normalize_spaces(str)
{
  # Returns {str} with leading blanks/punctuation removed, blanks (and
  # weaker punctuation) before ".", ":", ";" and "," removed, runs of
  # blanks squeezed to one, and trailing blanks, commas, semicolons and
  # colons removed.  A trailing period is kept.
  gsub(/^[ ,;:.]+/, "", str);
  gsub(/[ ,;:]+[.]/, ".", str);
  gsub(/[ ,;]+[:]/, ":", str);
  gsub(/[ ,]+[;]/, ";", str);
  gsub(/[ ]+[,]/, ",", str);
  gsub(/[ ]+/, " ", str);
  gsub(/[ ,;:]+$/, "", str);
  return str;
}

function data_warning(msg)
{
  # Reports {msg} on /dev/stderr together with the current input
  # position, the global work string {lin} and the current record.
  # Processing continues.
  printf "%s:%d: ++ Warning: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  lin = «%s»\n", lin > "/dev/stderr";
  printf "  $0 = «%s»\n", $0 > "/dev/stderr";
}

function data_error(msg)
{
  # Reports {msg} on /dev/stderr like data_warning, then terminates the
  # program with exit code {abort} (-1).  Does not return.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  lin = «%s»\n", lin > "/dev/stderr";
  printf "  $0 = «%s»\n", $0 > "/dev/stderr";
  abort = -1;
  exit abort;
}