#! /usr/bin/python3.5
# Last edited on 2017-06-14 00:37:27 by stolfilocal
# Copyright (c) 2007, Peter Corke - See note at end of file

# pcbib_db entry class
#   - holds all information about one bibliographic item
#   - provides methods for manipulating/setting/representing that information
#
# TODO:
#  __repr__ method needs to do a better job depending on the reference type; similar
#  logic is required in bib2html (but it's not there either...)
#

import sys;
import string;
import re;

#BadValue = "Bad value";
#BadField = "Bad field";
#BadEntryType = "Bad entry type";

class pcbib_entry:
  """One bibliographic item of a pcbib_db bibliography.

  Field values are kept in {self.fieldDict}, keyed by lower-case field
  name.  Besides the fields given by the client, the dictionary may hold
  the derived entries 'entry_type', 'year_numeric' and 'month_numeric',
  which are written by {setEntryType} and {setField}.

  The code is written to run unchanged under Python 2 and Python 3.
  """

  # Class-level defaults; {fieldDict} and {bibliography} are rebound for
  # every instance by {__init__}.
  fieldDict = {}
  # When true, {__init__} reports every new entry on stderr.
  verbose = 0
  bibliography = {}

  # Month names in calendar order; {monthdict} maps each name to 1..12.
  _monthNames = (
    'january', 'february', 'march', 'april', 'may', 'june', 'july',
    'august', 'september', 'october', 'november', 'december')
  monthdict = dict(zip(_monthNames, range(1, 13)))

  # LaTeX accent commands: one-letter accents such as \"{o} (group 1) and
  # the two-letter tie \t{oo} (group 2).  The tie needs an escaped
  # backslash; a bare "\t" in a regex means a TAB character.
  _re_accent = re.compile(r'''\\[.'`^"~=uvHcdb]\{(.)\}|\\t\{(..)\}''')

  #############################################################3
  # initialization
  #############################################################3

  def __init__(self, citeKey, bib):
    "New entry initialization, with given access {citeKey} and a link to the bibliography {bib}."
    self.citeKey = citeKey
    self.fieldDict = {}
    self.bibliography = bib
    if pcbib_entry.verbose:
      sys.stderr.write("New entry  %s\n" % (citeKey,))

  #############################################################3
  # private helpers
  #############################################################3

  @staticmethod
  def _joinNames(names):
    "Formats {names} as 'A', 'A and B' or 'A, B and C'; returns '' if {names} is empty."
    if not names:
      return ""
    if len(names) == 1:
      return names[0]
    return ", ".join(names[:-1]) + " and " + names[-1]

  @staticmethod
  def _formatName(pair):
    "Formats a [surname, initial] pair as 'I. Surname', or just 'Surname' if the initial is empty."
    if pair[1]:
      return pair[1] + ". " + pair[0]
    return pair[0]

  @staticmethod
  def _editDistance(a, b):
    "Levenshtein distance between the strings {a} and {b}."
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
      cur = [i]
      for j, cb in enumerate(b, 1):
        subst = prev[j-1] if ca == cb else prev[j-1] + 1
        cur.append(min(prev[j] + 1, cur[j-1] + 1, subst))
      prev = cur
    return prev[-1]

  #############################################################3
  # validation methods
  #############################################################3

  def isEntryType(self, rt):
    "Returns {True} iff the entry type equals {rt}, ignoring case."
    return self.getEntryType().lower() == rt.lower()

  def isFieldValid(self, field):
    " Returns {True} iff the field with name {field} is allowed for entry {self}."
    et = self.getEntryType()
    for table in (requiredFieldNames, optionalFieldNames):
      if field in table.get(et, ()):
        return True
    return False

  #############################################################3
  # get methods
  #############################################################3

  def getCiteKey(self):
    "Returns the citation key given at construction."
    return self.citeKey

  def getEntryType(self):
    "Returns the entry type; fails with AttributeError if {setEntryType} was never called."
    return self.entryType

  def getField(self, field):
    "Returns the stored value of {field} (case-insensitive name), or {None} if absent."
    return self.fieldDict.get(field.lower())

  def getTitle(self):
    "Returns the title without braces and without leading/trailing .,'\" characters; '' if absent."
    if 'title' not in self.fieldDict:
      return ""
    title = re.sub(r"""[{}]""", "", self.fieldDict['title'])
    return title.strip('.,\'"')

  def getURL(self):
    "Returns the 'url' field, or '' if absent."
    return self.fieldDict.get('url', "")

  def getAuthorList(self):
    "Returns the stored 'author' value (a list of names), or [] if absent."
    return self.fieldDict.get('author', [])

  def getAuthors(self):
    "Returns the authors as 'A', 'A and B' or 'A, B and C'; '' if there are none."
    return self._joinNames(self.fieldDict.get('author', []))

  def surname(self, author):
    """Splits one author name into the list [surname, first initial].

    Accepts both 'Surname, First names' and 'First names Surname'.  LaTeX
    accent commands are replaced by the plain letter(s) first.  The
    initial is '' when no first name is given.
    """
    # remove LaTeX accents, keeping only the accented letter(s)
    author = self._re_accent.sub(lambda mo: mo.group(mo.lastindex), author)

    # "surname, first names"
    m = re.search(r"""^([^,]*),(.*)""", author)
    if m:
      return [m.group(1), m.group(2).lstrip()[:1]]

    # "first names surname": the surname is the last component after a
    # dot or blank.  Every group is optional, so this search always succeeds.
    m = re.search(r"""(.*?)([^\. \t]*)$""", author)
    return [m.group(2), m.group(1).lstrip()[:1]]

  def getAuthorsSurnameList(self):
    "Returns a list of [surname, initial] pairs, one per author, or {None} if there is no 'author' field."
    if 'author' not in self.fieldDict:
      return None
    return [self.surname(a) for a in self.fieldDict['author']]

  def getAuthorsSurname(self):
    "Returns the authors' surnames joined with commas and 'and'; '<NO AUTHOR>' if there is no 'author' field."
    pairs = self.getAuthorsSurnameList()
    if pairs is None:
      return "<NO AUTHOR>"
    return self._joinNames([p[0] for p in pairs])

  def getAuthorsNames(self):
    """Returns the authors as 'I. Surname' items joined with commas and 'and'.

    Raises TypeError if the entry has no 'author' field.  That exception
    type is kept for backward compatibility: {__repr__} uses it to fall
    back to the editors.
    """
    pairs = self.getAuthorsSurnameList()
    if pairs is None:
      raise TypeError("[%s] entry has no 'author' field" % (self.citeKey,))
    return self._joinNames([self._formatName(p) for p in pairs])

  def getEditorsSurnameList(self):
    "Returns a list of [surname, initial] pairs, one per editor, or {None} if there is no 'editor' field."
    if 'editor' not in self.fieldDict:
      return None
    return [self.surname(e) for e in self.fieldDict['editor']]

  def getEditorsNames(self):
    "Returns the editors as 'I. Surname' items joined with commas and 'and', or {None} if there are none."
    pairs = self.getEditorsSurnameList()
    if not pairs:
      return None
    return self._joinNames([self._formatName(p) for p in pairs])

  def getBooktitle(self):
    "Returns the 'booktitle' field, or '' if absent."
    return self.fieldDict.get('booktitle', "")

  def getVolume(self):
    "Returns the 'volume' field as stored, or -1 if absent."
    return self.fieldDict.get('volume', -1)

  def getNumber(self):
    "Returns the 'number' field as stored, or -1 if absent."
    return self.fieldDict.get('number', -1)

  def getPage(self):
    "Returns the 'pages' field, or '' if absent."
    return self.fieldDict.get('pages', "")

  def afterDate(self, date):
    """True if the entry is dated on or after {date}.

    {date} is empty/None (always true), [year], or [month, year].
    Returns {None} for any other length.
    """
    if not date:
      return True
    year = self.getYearNumeric()
    if len(date) == 1:
      # simple case, year only
      return year >= date[0]
    if len(date) == 2:
      # complex case, [month year]
      if year != date[1]:
        return year > date[1]
      return self.getMonthNumeric() >= date[0]
    return None

  def beforeDate(self, date):
    """True if the entry is dated strictly before {date}.

    {date} is empty/None (always true), [year], or [month, year].
    Returns {None} for any other length.
    """
    if not date:
      return True
    year = self.getYearNumeric()
    if len(date) == 1:
      # simple case, year only
      return year < date[0]
    if len(date) == 2:
      # complex case, [month year]
      if year != date[1]:
        return year < date[1]
      return self.getMonthNumeric() < date[0]
    return None

  def getYearNumeric(self):
    "Returns the derived 'year_numeric' value, or -1 if no year was set."
    return self.fieldDict.get('year_numeric', -1)

  def getMonthNumeric(self):
    "Returns the month ordinal in range 1 to 12, or -1 if no month was set."
    return self.fieldDict.get('month_numeric', -1)

  def getMonthName(self):
    "Returns the capitalized English month name, or '' if no month was set."
    m = self.getMonthNumeric()
    if m > 0:
      return self._monthNames[m-1].capitalize()
    return ""

  #############################################################3
  # set methods
  #############################################################3

  def setEntryType(self, value):
    "Sets the entry type as the {self.entryType} and also as the 'entry_type' bibfield."
    ck = self.getCiteKey()
    value = value.lower()
    if value not in allEntryTypes:
      raise AttributeError("[%s] bad entry type '%s'" % (ck, value))
    self.entryType = value
    self.fieldDict['entry_type'] = value

  def setField(self, field, value):
    """Stores {value} under the (case-insensitive) name {field}.

    Raises AttributeError if {field} is not allowed for the entry type,
    or if a 'year' or 'month' value cannot be interpreted.

    For 'year', the digits of {value} are also stored as the integer
    'year_numeric' (0 when the text has no digits but contains 'appear').
    For 'month', the ordinal is stored as 'month_numeric'; the 'month'
    entry itself keeps the original string only when it has extra text
    (like 'november 12-13'), and is {None} for a plain name or abbreviation.
    """
    ck = self.getCiteKey()
    field = field.lower()
    if not self.isFieldValid(field):
      et = self.getEntryType()
      raise AttributeError("[%s] bad field name '%s' for entry type '%s' " % (ck, field, et))

    if field == 'year':
      self.fieldDict[field] = value
      # remove all text like "to appear", just leave the digits
      digits = "".join(c for c in value if c.isdigit())
      try:
        self.fieldDict['year_numeric'] = int(digits)
      except ValueError:
        if 'appear' in value:
          sys.stderr.write("[%s] no year specified, continuing\n" % ck)
          self.fieldDict['year_numeric'] = 0
        else:
          self.fieldDict['year_numeric'] = -1
          raise AttributeError("[%s] bad year '%s'" % (ck, value))

    elif field == 'month':
      self.fieldDict[field] = value
      month = mogrify(value)
      if not month:
        # an empty string is a substring of every month name
        raise AttributeError("[%s] bad month '%s'" % (ck, value))
      # scan in calendar order so that an ambiguous abbreviation always
      # resolves to the same (earliest) month
      for monthname in self._monthNames:
        # handle month abbreviations, eg. nov in november
        if month in monthname:
          self.fieldDict['month_numeric'] = self.monthdict[monthname]
          self.fieldDict[field] = None
          return
        # handle extraneous like november in 'november 12-13'
        if monthname in month:
          self.fieldDict['month_numeric'] = self.monthdict[monthname]
          return
      raise AttributeError("[%s] bad month '%s'" % (ck, value))

    else:
      self.fieldDict[field] = value

  #############################################################3
  # matching methods
  #############################################################3

  def search(self, field, str, caseSens=0):
    """Returns {True} iff the regular expression {str} occurs in {field}.

    {field} is a field name, or 'all' to try every field whose name does
    not start with '_'.  List values are joined with blanks first.  Empty
    and numeric values (such as 'year_numeric') never match.  A field
    that is absent yields {False}.  Matching ignores case unless
    {caseSens} is nonzero.
    """
    # NOTE: the parameter name {str} shadows the builtin; it is kept
    # because it is part of the public signature.
    field = field.lower()
    flags = re.IGNORECASE if caseSens == 0 else 0

    if field == 'all':
      values = [v for k, v in self.fieldDict.items() if not k.startswith('_')]
    elif field in self.fieldDict:
      values = [self.fieldDict[field]]
    else:
      # silently ignore search field if not present
      return False

    for v in values:
      if isinstance(v, list):
        v = ' '.join(v)
      if not v or isinstance(v, (int, float)):
        continue
      if re.search(str, v, flags):
        return True
    return False

  def matchAuthorList(self, be):
    """True iff {self} and {be} have the same number of authors and they match in order.

    Each author name is split into fragments (words or initials).  Two
    authors match when at least two fragment pairs agree; an initial
    such as 'P.' agrees with any word that starts with 'P'.
    """

    def split(a):
      return re.findall(r"""[a-zA-Z][a-zA-Z-]*[.]?""", a)

    def matchfrag(s, f):
      sdot = s.endswith('.')
      fdot = f.endswith('.')
      if sdot == fdot:
        return s == f
      if fdot:
        # {f} is an abbreviation of {s}
        return s.startswith(f[:-1])
      # {s} is an abbreviation of {f}
      return f.startswith(s[:-1])

    def matchAuthor(a1, a2):
      frags2 = split(a2)
      return sum(1 for p1 in split(a1) for p2 in frags2 if matchfrag(p1, p2))

    # check if each article has the same number of authors
    mine = self.getAuthorList()
    theirs = be.getAuthorList()
    if len(mine) != len(theirs):
      return False

    # now check the authors match, in order
    for a1, a2 in zip(mine, theirs):
      if matchAuthor(a1, a2) < 2:
        return False
    return True

  def matchTitle(self, be, dthresh):
    "True iff the Levenshtein distance between the two mogrified titles is at most {dthresh}."
    d = self._editDistance(mogrify(self.getTitle()), mogrify(be.getTitle()))
    return d <= dthresh

  def matchEntryType(self, be):
    "True iff both entries have the same entry type."
    return self.getEntryType() == be.getEntryType()

  def matchYear(self, be):
    "True unless both entries have a year and the years differ."
    return fmatch(self.getYearNumeric(), be.getYearNumeric())

  def matchMonth(self, be):
    "True unless both entries have a month and the months differ."
    return fmatch(self.getMonthNumeric(), be.getMonthNumeric())

  def matchVolumeNumber(self, be):
    "True unless the volumes or the numbers are both given and differ."
    if not fmatch(self.getVolume(), be.getVolume()):
      return False
    if not fmatch(self.getNumber(), be.getNumber()):
      return False
    return True

  def matchPage(self, be):
    "True unless both entries give pages and their first (or second) page numbers differ."
    p1 = self.getPage()
    p2 = be.getPage()
    if not (p1 and p2):
      return True
    # both not null
    n1 = re.findall("([0-9.]+)", p1)
    n2 = re.findall("([0-9.]+)", p2)
    # compare starting and then ending page numbers, as far as both
    # entries provide them
    for k in (0, 1):
      if (len(n1) > k) and (len(n2) > k) and (n1[k] != n2[k]):
        return False
    return True

  def match(self, be, dthresh=2):
    "True iff {self} and {be} seem to describe the same item; titles may differ by edit distance {dthresh}."
    # we do the cheapest comparisons first...
    if not self.matchEntryType(be):
      return False
    if not self.matchYear(be):
      return False
    if not self.matchMonth(be):
      return False
    if self.isEntryType('article') and not self.matchVolumeNumber(be):
      return False
    if not self.matchPage(be):
      return False
    if not self.matchAuthorList(be):
      return False
    return bool(self.matchTitle(be, dthresh))

  #############################################################3
  # debugging
  #############################################################3

  def display(self, fp=sys.stdout):
    "Writes the cite key and every field except the derived numeric ones to {fp}, one per line."
    fp.write("%12s: %s\n" % ("citeKey", self.citeKey))
    for k in self.fieldDict:
      if k in ('year_numeric', 'month_numeric'):
        continue
      if k == 'author':
        fp.write("%12s: %s\n" % (k, self.getAuthors()))
      else:
        fp.write("%12s: %s\n" % (k, self.fieldDict[k]))

  def __getitem__(self, i):
    """Returns the value of field {i} if {i} is a string, or the {i}-th field name if {i} is an integer.

    Raises KeyError for an unknown field name, IndexError for an integer
    out of range, and TypeError for any other index type.
    """
    if isinstance(i, str):
      return self.fieldDict[i]
    if isinstance(i, int):
      return list(self.fieldDict)[i]
    raise TypeError("index must be a field name or an integer, not %s" % type(i).__name__)

  def check(self):
    "Check if the entry type is valid and has all required fields. Returns a list of all missing ones."
    ck = self.getCiteKey()
    entryType = self.getEntryType()
    if entryType not in allEntryTypes:
      raise AttributeError("[%s] bad entry type '%s'" % (ck, entryType))
    return [fld for fld in requiredFieldNames[entryType]
            if fld.lower() not in self.fieldDict]

  def __repr__(self):
    "Returns a string with the main fields - title, authors, etc."
    text = '"' + self.getTitle() + '"; '
    try:
      text += self.getAuthorsNames()
    except TypeError:
      # no usable 'author' field: fall back to the editors, if any
      editors = self.getEditorsNames()
      if editors:
        text += "eds. " + editors

    book = self.getBooktitle()
    if book:
      text += ", " + book

    month = self.getMonthName()
    year = self.getYearNumeric()
    if month:
      text += ", " + month
      if year > 0:
        text += " " + repr(year)
    elif year > 0:
      text += ", " + repr(year)
    return text + '.'

  def brief(self, fp=sys.stdout):
    "Writes the one-line summary of the entry (see {__repr__}) to {fp}."
    fp.write(str(self) + "\n")

  # ----------------------------------------------------------------------

#############################################################3
# syntax extension:
#############################################################3

def addNewFields(fieldNameTable, newFields):
  """Adds to the {fieldNameTable} the fields in the list {newFields}.

  {fieldNameTable} maps each entry type to its list of field names.
  Each item of {newFields} is either a plain field name, which is added
  to the row of every entry type, or a string '{entryType}.{fieldName}',
  which is added only to the row of that entry type.  {newFields} may be
  {None}, meaning no fields.

  Problems are reported on stderr.  Returns 0 if success, else returns
  the number of bugs found.
  """
  nbg = 0
  if newFields is None:
    return nbg

  for ef in newFields:
    i = ef.find('.')
    if (i == 0) or (i == len(ef)-1):
      # leading or trailing dot (this test also rejects the empty string);
      # the doubled newline reproduces the historical message layout
      sys.stderr.write("** bad field spec '%s'\n\n" % ef)
      nbg += 1
    elif i < 0:
      # plain field name: applies to every entry type
      for entryType in fieldNameTable:
        nbg += addNewField(fieldNameTable[entryType], ef)
    else:
      # '{entryType}.{fieldName}': split at the first dot
      entryType = ef[:i]
      field = ef[i+1:]
      if entryType not in fieldNameTable:
        sys.stderr.write("** table has no row '%s'\n\n" % entryType)
        nbg += 1
      else:
        nbg += addNewField(fieldNameTable[entryType], field)
  return nbg
  # ----------------------------------------------------------------------
    
def addNewField(flist, field):
  """Adds the field name {field} to the list {flist}, in place.

  Fails, with a message on stderr, if the name is invalid: it must
  consist of lower case letters and hyphens only, begin and end with a
  letter, and have at least two characters.  Does nothing if the name is
  already there.  Returns 0 if success, 1 if failure.
  """
  if not re.match(r"^[a-z][-a-z]*[a-z]$", field):
    # the doubled newline reproduces the historical message layout
    sys.stderr.write("** bad field name '%s'\n\n" % field)
    return 1
  if field not in flist:
    flist.append(field)
  return 0
  # ----------------------------------------------------------------------

# We adopt the convention that a numeric value of -1 means not provided,
# so here we match two quantities where either or both is not provided.  Only
# return false if both values are provided, and they are not equal, otherwise
# give the benefit of the doubt and return true.
def fmatch(n1, n2):
  """Returns false only if {n1} and {n2} are both provided and differ.

  A number counts as provided when it is positive.  {None} counts as not
  provided.  Any value that cannot be ordered against 0 (for instance a
  volume kept as the string '12') counts as provided, which is the result
  the comparison gave under Python 2.
  """
  def provided(n):
    if n is None:
      return False
    try:
      return n > 0
    except TypeError:
      # not comparable with a number under Python 3
      return True

  if provided(n1) and provided(n2):
    return n1 == n2
  return True

# Remove the punctuation marks and white space that people might get
# wrong, and fold to lower case, so that strings can be compared loosely.
def mogrify(s):
  "Returns {s} in lower case with every one of the characters '#{}:;,&$-' and blank removed."
  # str.lower() replaces string.lower(), which no longer exists in Python 3
  return re.sub(r"""[#{}:;,&$ -]""", "", s.lower())

# All entry types accepted by {pcbib_entry.setEntryType}
allEntryTypes = (
  "article",
  "book",
  "booklet",
  "inbook",
  "incollection",
  "inproceedings",
  "manual",
  "mastersthesis",
  "misc",
  "phdthesis",
  "proceedings",
  "techreport",
  "unpublished",
)

# Additional field names, ignored by the standard BibTeX styles
ignoredFieldNames = (
  "crossref",
  "code",
  "url",
  "annote",
  "abstract",
)

# Required field names, per entry type.  Every row is its own list
# object, because {addNewField} extends rows in place.
requiredFieldNames = {
  "article":       ["author", "title", "journal", "year"],
  "book":          ["author", "title", "publisher", "year"],
  "booklet":       ["title"],
  "inbook":        ["author", "title", "chapter", "pages", "publisher", "year"],
  "incollection":  ["author", "title", "booktitle", "publisher", "year"],
  "inproceedings": ["author", "title", "booktitle", "year"],
  "manual":        ["title"],
  "misc":          [],
  "mastersthesis": ["author", "title", "school", "year"],
  "phdthesis":     ["author", "title", "school", "year"],
  "proceedings":   ["title", "year"],
  "techreport":    ["author", "title", "institution", "year"],
  "unpublished":   ["author", "title", "note"],
}

# Optional field names, per entry type.  Every row is its own list
# object, because {addNewField} extends rows in place.
optionalFieldNames = {
  'article' :       ['volume', 'number', 'pages', 'month', 'note'],
  'book' :          ['editor', 'volume', 'number', 'series', 'address', 'edition', 'month', 'note'],
  'booklet' :       ['author', 'howpublished', 'address', 'month', 'year', 'note'],
  'inbook' :        ['editor', 'volume', 'series', 'type', 'address', 'edition', 'month', 'note'],
  # 'chapter' and 'pages' are two separate fields; without the comma
  # between them Python silently joins the literals into 'chapterpages'
  'incollection' :  ['editor', 'volume', 'number', 'series', 'type', 'chapter', 'pages', 'address', 'edition', 'month', 'note'],
  'inproceedings' : ['editor', 'pages', 'organization', 'publisher', 'address', 'month', 'note'],
  'manual' :        ['author', 'organization', 'address', 'edition', 'month', 'year', 'note'],
  'misc' :          ['title', 'author', 'howpublished', 'month', 'year', 'note'],
  'mastersthesis' : ['address', 'month', 'note'],
  'phdthesis' :     ['address', 'month', 'note'],
  'proceedings' :   ['editor', 'publisher', 'organization', 'address', 'month', 'note'],
  'techreport' :    ['type', 'number', 'address', 'month', 'note'],
  'unpublished' :   ['month', 'year']
}

# Copyright (c) 2007, Peter Corke
#
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The name of the copyright holder may not be used to endorse or 
#       promote products derived from this software without specific prior 
#       written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.