#! /usr/bin/python3.5 # Last edited on 2017-06-14 00:37:27 by stolfilocal # Copyright (c) 2007, Peter Corke - See note at end of file # pcbib_db entry class # - holds all information about one bibliographic item # - provides methods for manipulating/setting/representing that information # # TODO: # __repr__ method needs to do a better job depending on the reference type, similar # logic is required in bib2html (but it's not their either...) # import sys; import string; import re; #BadValue = "Bad value"; #BadField = "Bad field"; #BadEntryType = "Bad entry type"; class pcbib_entry: fieldDict = {}; verbose = 0; bibliography = {}; #############################################################3 # initialization #############################################################3 def __init__(self, citeKey, bib): "New entry initialization, with given access {citeKey} and a link to the bibliography {bib}." self.citeKey = citeKey; self.fieldDict = {}; self.bibliography = bib; if pcbib_entry.verbose: print >> sys.stderr, "New entry ", citeKey; #############################################################3 # validation methods #############################################################3 def isEntryType(self, rt): return self.getEntryType().lower() == rt.lower(); def isFieldValid(self, field): " Returns {True} iff the field with name {field} is allowed for entry {self}." et = self.getEntryType(); for tb in (requiredFieldNames, optionalFieldNames): if et in tb: row = tb[et]; if field in row: return True; return False # ---------------------------------------------------------------------- #############################################################3 # get methods #############################################################3 def getCiteKey(self): return self.citeKey; def getEntryType(self): return self.entryType; def getField(self, field): #print >> sys.stderr, field #print >> sys.stderr, self.fieldDict[field] field = field.lower(); if field in self.fieldDict: return self.fieldDict[field] else: return None; def getTitle(self): if 'title' in self.fieldDict: title = self.fieldDict['title']; title = re.sub(r"""[{}]""", "", title); title = title.strip('.,\'"'); return title; else: return ""; def getURL(self): if 'url' in self.fieldDict: url = self.fieldDict['url']; return url; else: return ""; def getAuthorList(self): if 'author' in self.fieldDict: return self.fieldDict['author']; else: return []; def getAuthors(self): if 'author' in self.fieldDict: l = self.fieldDict['author']; if len(l) == 1: return l[0]; elif len(l) == 2: return l[0] + " and " + l[1]; elif len(l) > 2: return string.join(l[:-1], ", ") + " and " + l[-1]; else: return ""; def surname(self, author): # remove LaTeX accents def chg(mo): return mo.group(mo.lastindex); re_accent = re.compile(r'''\\[.'`^"~=uvHcdb]\{(.)\}|\t\{(..)\}'''); author = re_accent.sub(chg, author) # "surname, first names" m = re.search(r"""^([^,]*),(.*)""", author); if m: #print >> sys.stderr, m.group(1), m.group(2) #return m.group(1) + "," + m.group(2).lstrip()[0]; return [m.group(1), m.group(2).lstrip()[0]]; #return m.group(1); # "first names surname" # take the last component after dot or space #m = re.search(r"""([a-zA-Z][a-zA-Z-]*)$""", author); m = re.search(r"""(.*?)([^\. \t]*)$""", author); if m: #print >> sys.stderr, author, ":", m.group(2), "|", m.group(1) return [m.group(2), m.group(1)[0]]; #return m.group(2) + "," + m.group(1)[0]; return ""; def getAuthorsSurnameList(self): if 'author' in self.fieldDict: l = self.fieldDict['author']; return map(self.surname, l); def getAuthorsSurname(self): l = self.getAuthorsSurnameList(); try: l = map(lambda x: x[0], l); if len(l) == 1: return l[0]; elif len(l) == 2: return l[0] + " and " + l[1]; elif len(l) > 2: return string.join(l[:-1], ", ") + " and " + l[-1]; else: return ""; except: return ""; # return initial dot sunrname def getAuthorsNames(self): l = self.getAuthorsSurnameList(); l = map(lambda x: x[1] + ". " + x[0], l); if len(l) == 1: return l[0]; elif len(l) == 2: return l[0] + " and " + l[1]; elif len(l) > 2: return string.join(l[:-1], ", ") + " and " + l[-1]; else: return ""; # return initial dot sunrname def getEditorsSurnameList(self): if 'editor' in self.fieldDict: l = self.fieldDict['editor']; return map(self.surname, l); def getEditorsNames(self): l = self.getEditorsSurnameList(); if not l: return None; l = map(lambda x: x[1] + ". " + x[0], l); if len(l) == 1: return l[0]; elif len(l) == 2: return l[0] + " and " + l[1]; elif len(l) > 2: return string.join(l[:-1], ", ") + " and " + l[-1]; else: return ""; def getBooktitle(self): if 'booktitle' in self.fieldDict: return self.fieldDict['booktitle']; else: return ""; def getVolume(self): if 'volume' in self.fieldDict: return self.fieldDict['volume']; else: return -1; def getNumber(self): if 'number' in self.fieldDict: return self.fieldDict['number']; else: return -1; def getPage(self): if 'pages' in self.fieldDict: return self.fieldDict['pages']; else: return ""; def afterDate(self, date): '''True if the entry occurs after the specified date''' if not date: return True; elif len(date) == 1: # simple case, year only return self.getYearNumeric() >= date[0]; elif len(date) == 2: # complex case, [month year] if self.getYearNumeric() > date[1]: return True; elif (date[1] == self.getYearNumeric()) and (self.getMonthNumeric() >= date[0]): return True; else: return False; def beforeDate(self, date): '''True if the entry occurs before the specified date''' if not date: return True; elif len(date) == 1: # simple case, year only return self.getYearNumeric() < date[0]; elif len(date) == 2: # complex case, [month year] if self.getYearNumeric() < date[1]: return True; elif (date[1] == self.getYearNumeric()) and (self.getMonthNumeric() < date[0]): return True; else: return False; def getYearNumeric(self): if 'year_numeric' in self.fieldDict: return self.fieldDict['year_numeric']; else: return -1; # return month ordinal in range 1 to 12 def getMonthNumeric(self): if 'month_numeric' in self.fieldDict: return self.fieldDict['month_numeric']; else: return -1; monthdict = { 'january' : 1, 'february' : 2, 'march' : 3, 'april' : 4, 'may' : 5, 'june' : 6, 'july' : 7, 'august' : 8, 'september' : 9, 'october' : 10, 'november' : 11, 'december' : 12 }; def getMonthName(self): monthNames = ( 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december' ); m = self.getMonthNumeric(); if m > 0: return string.capitalize(monthNames[m-1]); else: return ""; #############################################################3 # set methods #############################################################3 def setEntryType(self, value): "Sets the entry type as the {self.entryType} and also as the 'entry_type' bibfield." ck = self.getCiteKey(); value = string.lower(value); if not (value in allEntryTypes): raise AttributeError, "[%s] bad entry type '%s'" % (ck, value); self.entryType = value; self.fieldDict['entry_type'] = value; def setField(self, field, value): ck = self.getCiteKey(); field = field.lower(); if not self.isFieldValid(field): et = self.getEntryType(); raise AttributeError, "[%s] bad field name '%s' for entry type '%s' " % (ck, field, et); if field == 'year': self.fieldDict[field] = value; # remove all text like "to appear", just leave the digits year = filter(lambda c : c.isdigit(), value); try: self.fieldDict['year_numeric'] = int(year); except: if value.find('appear') > -1: sys.stderr.write("[%s] no year specified, continuing\n" % ck); self.fieldDict['year_numeric'] = 0; else: self.fieldDict['year_numeric'] = -1; raise AttributeError, "[%s] bad year '%s'" % (ck, value); elif field == 'month': # the Month entry has the original string from the file if it is of # nonstandard form, else is None. # the hidden entry month_numeric has the ordinal number self.fieldDict[field] = value; #print >> sys.stderr, "Month = '%s'" % value; month = mogrify(value); for monthname in self.monthdict: # handle month abbreviations, eg. nov in november if monthname.find(month) >= 0: self.fieldDict['month_numeric'] = self.monthdict[monthname]; #print >> sys.stderr, "month_numeric 1 %d" % self.monthdict[monthname]; self.fieldDict[field] = None; return; # handle extraneous like november in 'november 12-13' if month.find(monthname) >= 0: self.fieldDict['month_numeric'] = self.monthdict[monthname]; #print >> sys.stderr, "month_numeric 2 %d" % self.monthdict[monthname]; return; raise AttributeError, "[%s] bad month '%s'" % (ck, value); else: self.fieldDict[field] = value; #print >> sys.stderr, "'%s' := '%s'\n" % (field, value) #############################################################3 # matching methods #############################################################3 def search(self, field, str, caseSens=0): field = string.lower(field); if field.lower() == 'all': for be in self: for k in self.fieldDict: if k[0] == '_': continue; s = self.fieldDict[k]; if isinstance(s, list): s = ' '.join(s); if s: if caseSens == 0: m = re.search(str, s, re.IGNORECASE); else: m = re.search(str, s); if m: return True; else: # silently ignore search field if not present if not(field in self.fieldDict): return False; s = self.fieldDict[field]; if isinstance(s, list): s = ' '.join(s); if s: if caseSens == 0: m = re.search(str, s, re.IGNORECASE); else: m = re.search(str, s); if m: return True; return 0; def matchAuthorList(self, be): def split(a): return re.findall(r"""([a-zA-Z][a-zA-Z-]*[.]?)""", a); def matchfrag(s, f): sdot = s[-1:] == '.'; fdot = f[-1:] == '.'; if (sdot == 0) and (fdot == 0): return s == f; elif (sdot == 0) and (fdot == 1): matchstr = f + '*'; m = re.match(matchstr, s); if m: return m.group(0) == s; else: return 0; elif (sdot == 1) and (fdot == 0): matchstr = s + '*'; m = re.match(matchstr, f); if m: return m.group(0) == f; else: return 0; elif (sdot == 1) and (fdot == 1): return s == f; def matchAuthor(a1, a2): l1 = split(a1); l2 = split(a2); count = 0; for p1 in l1: for p2 in l2: if matchfrag(p1,p2): count += 1; return count; # check if each article has the same number of authors l1 = self.getAuthorList(); l2 = be.getAuthorList(); if len(l1) != len(l2): return 0; # now check the authors match, in order for i in range( len(l1) ): if matchAuthor(l1[i], l2[i]) < 2: return 0; return 1; def matchTitle(self, be, dthresh): # Levenstein distance between two strings def distance(a,b): c = {} n = len(a); m = len(b) for i in range(0,n+1): c[i,0] = i for j in range(0,m+1): c[0,j] = j for i in range(1,n+1): for j in range(1,m+1): x = c[i-1,j]+1 y = c[i,j-1]+1 if a[i-1] == b[j-1]: z = c[i-1,j-1] else: z = c[i-1,j-1]+1 c[i,j] = min(x,y,z) return c[n,m] d = distance( mogrify(self.getTitle()), mogrify(be.getTitle()) ); return d <= dthresh; def matchEntryType(self, be): return self.getEntryType() == be.getEntryType(); def matchYear(self, be): return fmatch(self.getYearNumeric(), be.getYearNumeric()); def matchMonth(self, be): return fmatch(self.getMonthNumeric(), be.getMonthNumeric()); def matchVolumeNumber(self, be): if not fmatch(self.getVolume(), be.getVolume()): return 0; if not fmatch(self.getNumber(), be.getNumber()): return 0; return 1; def matchPage(self, be): p1 = self.getPage(); p2 = be.getPage(); if p1 and p2: # both not null p1 = re.findall("([0-9.]+)", p1); p2 = re.findall("([0-9.]+)", p2); if (len(p1) > 0) and (len(p2) > 0): # optionally compare starting page numbers if p1[0] != p2[0]: return 0; if (len(p1) > 1) and (len(p2) > 1): # optionally compare ending page numbers if p1[1] != p2[1]: return 0; return 1; else: return 1; # see if two bibentries match def match(self, be, dthresh=2): # we do the cheapest comparisons first... if not self.matchEntryType(be): return 0; if not self.matchYear(be): return 0; if not self.matchMonth(be): return 0; if self.isEntryType('article'): if not self.matchVolumeNumber(be): return 0; if not self.matchPage(be): return 0; if not self.matchAuthorList(be): return 0; if not self.matchTitle(be, dthresh): return 0; return 1; #############################################################3 # debugging #############################################################3 def display(self, fp=sys.stdout): print >> fp, "%12s: %s" % ("citeKey", self.citeKey) for k in self.fieldDict: if (k == 'year_numeric') or (k == 'month_numeric'): continue; if k == 'author': print >> fp, "%12s: %s" % (k, self.getAuthors()) else: print >> fp, "%12s: %s" % (k, self.fieldDict[k]) def __getitem__(self, i): if type(i) is str: return self.fieldDict[i]; elif type(i) is int: return self.fieldDict.keys()[i]; else: raise; def check(self): "Check if the entry type is valid and has all required fields. Teturns a list of all missing ones." ck = self.getCiteKey(); fields = self.fieldDict.keys(); missing = []; entryType = self.getEntryType(); if not (entryType in allEntryTypes): raise AttributeError, "[%s] bad entry type '%s'" % (ck, entryType); for fld in requiredFieldNames[self.getEntryType()]: if not (string.lower(fld) in fields): missing.append(fld); return missing; def __repr__(self): "Returns a string with the main fields - title, authors, etc." str = '"' + self.getTitle() + '"; '; try: str = str + self.getAuthorsNames(); except: try: str = str + "eds. " + self.getEditorsNames(); except: pass; month = self.getMonthName(); year = self.getYearNumeric(); book = self.getBooktitle(); if book: str += ", " + book; if month: str += ", " + month; if year > 0: str += " " + `year`; else: if year > 0: str += ", " + `year`; str += '.'; return str; def brief(self, fp=sys.stdout): print >> fp, self; # ---------------------------------------------------------------------- #############################################################3 # syntax extension: #############################################################3 def addNewFields(fieldNameTable, newFields): "Adds to the {fieldNameTable} the fields in the" \ " list {newFields}. Each entry of the latter is" \ " either a field name, or a string '{entryType}.{fieldName}' if" \ " the action applies only to that entry type. Returns 0 if success, else" \ " returns the number of bugs found." nbg = 0; if newFields != None: for ef in newFields: entryType = "*" i = ef.find('.'); if (i == 0) or (i == len(ef)-1): print >> sys.stderr, "** bad field spec '%s'\n" % ef; nbg += 1; continue; elif i < 0: field = ef; for entryType in fieldNameTable: # print >> sys.stderr, "entryType = %s" % entryType; nbg += addNewField(fieldNameTable[entryType], field); else: entryType = ef[0:i]; field = ef[i+1:] if not (entryType in fieldNameTable): print >> sys.stderr, "** table has no row '%s'\n" % entryType; nbg += 1 else: row = fieldNameTable[entryType]; nbg += addNewField(row, field); return nbg # ---------------------------------------------------------------------- def addNewField(flist, field): " Adds the field name {field} to the list {flist}. Fails if" \ " the name is invalid (must be just lower case or hyphens" \ " and cannot be just one letter). Does nothing if the" \ " name is already there. Returns 0 if success, 1 if failure." if not re.match(r"^[a-z][-a-z]*[a-z]$", field): print >> sys.stderr, "** bad field name '%s'\n" % field; return 1 if field in flist: return 0 flist.append(field) return 0 # ---------------------------------------------------------------------- # we adopt the convention that a numeric value of -1 means not provided, # so here we match two quantites where either or both is not provided. Only # return false if both numbers are provided, and they are not equal, otherwise # give the benefit of the doubt and return true. def fmatch(n1, n2): if (n1 > 0) and (n2 > 0): return n1 == n2; else: return 1; # remove all punctuation marks and white space that people # might get wrong def mogrify(s): s = string.lower(s); s = re.sub(r"""[#{}:;,&$ -]""", "", s); return s; # List of all entry types allEntryTypes = ('article', 'book', 'booklet', 'inbook', 'incollection', 'inproceedings', 'manual', 'mastersthesis', 'misc', 'phdthesis', 'proceedings', 'techreport', 'unpublished'); # List of additional fields, ignored by the standard BibTeX styles ignoredFieldNames = ('crossref', 'code', 'url', 'annote', 'abstract'); # Lists of required fields for each entry type requiredFieldNames = { 'article' : ['author', 'title', 'journal', 'year'], 'book' : ['author', 'title', 'publisher', 'year'], 'booklet' : ['title'], 'inbook' : ['author', 'title', 'chapter', 'pages', 'publisher', 'year'], 'incollection' : ['author', 'title', 'booktitle', 'publisher', 'year'], 'inproceedings' : ['author', 'title', 'booktitle', 'year'], 'manual' : ['title'], 'misc' : [], 'mastersthesis' : ['author', 'title', 'school', 'year'], 'phdthesis' : ['author', 'title', 'school', 'year'], 'proceedings' : ['title', 'year'], 'techreport' : ['author', 'title', 'institution', 'year'], 'unpublished' : ['author', 'title', 'note'] }; # List of optional fields for each entry type optionalFieldNames = { 'article' : ['volume', 'number', 'pages', 'month', 'note'], 'book' : ['editor', 'volume', 'number', 'series', 'address', 'edition', 'month', 'note'], 'booklet' : ['author', 'howpublished', 'address', 'month', 'year', 'note'], 'inbook' : ['editor', 'volume', 'series', 'type', 'address', 'edition', 'month', 'note'], 'incollection' : ['editor', 'volume', 'number', 'series', 'type', 'chapter' 'pages', 'address', 'edition', 'month', 'note'], 'inproceedings' : ['editor', 'pages', 'organization', 'publisher', 'address', 'month', 'note'], 'manual' : ['author', 'organization', 'address', 'edition', 'month', 'year', 'note'], 'misc' : ['title', 'author', 'howpublished', 'month', 'year', 'note'], 'mastersthesis' : ['address', 'month', 'note'], 'phdthesis' : ['address', 'month', 'note'], 'proceedings' : ['editor', 'publisher', 'organization', 'address', 'month', 'note'], 'techreport' : ['type', 'number', 'address', 'month', 'note'], 'unpublished' : ['month', 'year'] }; # Copyright (c) 2007, Peter Corke # # All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * The name of the copyright holder may not be used to endorse or # promote products derived from this software without specific prior # written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE.