#!/usr/bin/python

import sys, string, os, re, StringIO

# Required fields for each supported bib entry type.
# A name prefixed with '#' is one half of an either/or pair: '#' names are
# paired up in the order they appear, and at least one member of each pair
# must be present (e.g. a book needs an author or an editor).
reqfields = {
'article' : ['author','title','journal','year'],
'book' : ['#author','#editor','title','publisher','year'],
'booklet' : ['title'],
'conference' : ['author','title','booktitle','year'],
'inproceedings' : ['author','title','booktitle','year'],
'inbook' : ['#author','#editor','title','#chapter','#pages','publisher','year'],
# was the single fused name 'bookpublisher', which no entry could satisfy
'incollection' : ['author','title','booktitle','publisher','year'],
'manual' : ['title'],
'mastersthesis' : ['author','title','school','year'],
'misc' : [],
'phdthesis' : ['author','title','school','year'],
'proceedings' : ['title','year'],
'techreport' : ['author','title','institution','year'],
# accepted by the entry pattern in LoadBibFile, so it needs a row here too
'unpublished' : ['author','title','note']
}

def CheckRequiredFields( etype, nams, ignore=1 ):
    """
    Verify that a bib entry carries every field its entry type requires.

    etype  -- entry type, used as a key into the module-level reqfields table
    nams   -- list of the field names found in the entry; they are compared
              case-insensitively
    ignore -- when true (the default) the check is skipped entirely

    Returns None. Raises BibEntryException('reqnamefail') when a mandatory
    field is missing, or when neither member of an alternate ('#'-marked)
    pair is present. Raises KeyError if the check runs and etype is not in
    reqfields.
    """
    if ignore: return

    # Field names are lower-cased when stored in a BibEntry, so compare them
    # the same way here; 'Author' and 'author' are the same field.
    present = set([name.lower() for name in nams])

    mandatory = []
    alternates = []
    for field in reqfields[etype]:
        if field.startswith('#'):
            alternates.append(field[1:])
        else:
            mandatory.append(field)

    # all the mandatory fields *must* be in our name list
    for field in mandatory:
        if field not in present:
            raise BibEntryException('reqnamefail')

    # alternates are paired in order (1st with 2nd, 3rd with 4th, ...); an
    # unpaired trailing name is dropped. One member of each pair *must* be
    # present in the name list.
    for first, second in zip(alternates[0::2], alternates[1::2]):
        if first not in present and second not in present:
            raise BibEntryException('reqnamefail')

def LoadBibFile( filename ):
    """
    Takes the filename for a bib file and reads in each bib entry as
    a BibEntry object.

    Returns a dictionary of BibEntry objects keyed on the bib entry key.
    When the same key occurs more than once a notice is printed and the
    last entry wins. An entry for which BibEntryFactory raises
    BibEntryException is left out and its key is printed instead.

    CAVEAT: It requires that the last field in a bib entry *does not* have a comma
    at the end of the line. So the ending of a bib entry should look like

    volume = {4} }

    or 

    volume = {4}
    }

    or whatever

    UPDATE: The above caveat seems to have been fixed!
    """
    # Close the file as soon as it is read. The lines are still joined with
    # a single space (the historical string.join default): the whitespace
    # clean-up applied to multi-line field values downstream relies on it.
    with open(filename,'r') as handle:
        bibdata = ' '.join(handle.readlines())

    reflags = re.MULTILINE | re.DOTALL | re.IGNORECASE

    # the re to capture any type of bibitem. I think its robust
    re_alltype = re.compile(r"""(@(?:Book|Article|InBook|booklet|conference|incollection|inproceedings|manual|mastersthesis|misc|phdthesis|proceedings|techreport|unpublished)\{.*?\}[\s,]*})""", reflags)

    # the re to get the key for a given bibitem
    re_key = re.compile(r"""@.+?\{(.+?),""", reflags)

    # the re to get the various fields of a bibitem
    re_fields = re.compile(r"""(\w*)\s*=\s*{(.*?)}""", reflags)

    # the re to get the entry type; compiled once rather than per entry
    re_type = re.compile(r"""@(\w*){""", reflags)

    bibdict = {}
    for article in re_alltype.findall(bibdata):
        key = re_key.findall(article)[0].strip()
        fields = re_fields.findall(article)
        etype = re_type.findall(article)[0].lower()

        try:
            bio = BibEntryFactory( etype,  key, fields )
        except BibEntryException:
            # entry rejected by validation: report its key and move on
            print(key)
            continue

        if key in bibdict:
            print("""
                Looks like the key /%s/ was loaded multiple times. The last
                entry will be the one saved
                """ % (key))
        bibdict[key] = bio
    return bibdict
   
def BibEntryFactory( etype, key, namvals ):
    """
    Build a BibEntry after checking its field names.

    etype   -- entry type string
    key     -- citation key of the entry
    namvals -- list of (name, value) pairs for the entry's fields

    Returns the new BibEntry. A BibEntryException raised by
    CheckRequiredFields propagates unchanged to the caller.
    """
    # The entry type is deliberately not looked up in reqfields here: the
    # result was never used, and the lookup alone made the factory fail
    # with KeyError for any type missing from that table.
    CheckRequiredFields(etype, [name for name, value in namvals])
    return BibEntry( etype, key, namvals )

class BibEntryException(Exception):
    """
    Error raised when a bib entry fails validation.

    code -- short identifier of the failure. It is translated to a readable
            message through the errmsg table; a code that is not in the
            table is used as the message itself.

    The message is available as self.msg and through str(). The class
    derives from Exception so that it can be raised and caught on Python 3
    as well, where only exception types may be raised.
    """
    errmsg = {
    'reqnamefail' : 'Required field absent'
    }

    def __init__(self,code):
        self.code = code
        self.msg = self.errmsg.get(code, str(code))
        Exception.__init__(self, self.msg)
    def __str__(self):
        return(self.msg)

class BibEntry(dict):
    """
    A single bibliography entry: its type, its citation key and its fields.

    Fields are kept in the self.data dictionary, keyed by lower-cased field
    name, and are read and written with entry[name]. Although the class
    derives from dict, the inherited dict storage is never filled, so
    len(entry), "name in entry" and entry.keys() do not reflect the fields;
    use entry.data for those.
    """

    # separator between authors: the word "and" in any case, surrounded by
    # whitespace
    _author_sep = re.compile(r"""\sand\s""", re.IGNORECASE)

    def __init__(self, etype='article', key='junk', namvals=()):
        """
        etype   -- entry type string
        key     -- citation key
        namvals -- iterable of (name, value) string pairs
        """
        self.etype = etype
        self.key = key
        self.data = {}

        for name, val in namvals:
            # Names starting with OPT or ALT (upper case, checked on the raw
            # name) are skipped -- presumably the placeholder prefixes some
            # editors put on unfilled optional/alternative fields.
            if name.startswith('OPT') or name.startswith('ALT'):
                continue
            # A line break inside a value becomes a space so that the words
            # on either side are not fused, then runs of whitespace collapse
            # to a single space.
            val = val.strip().replace('\n', ' ')
            val = re.sub(r'\s{2,}', ' ', val)
            self.data[name.lower()] = val

    def __getitem__(self,key):
        """Return the value of field key; raises KeyError if it is absent."""
        return self.data[key]
    def __setitem__(self,key,item):
        """Store item as the value of field key (stored as given)."""
        self.data[key] = item
    def get_year(self):
        """Return the 'year' field, or '' when the entry has none."""
        return self.data.get('year', '')
    def get_title(self):
        """Return the 'title' field, or '' when the entry has none."""
        return self.data.get('title', '')
    def get_first_author(self):
        """
        Return the text of the 'author' field up to the first "and"
        separator (any case), the whole field when there is no separator,
        or '' when the entry has no author.
        """
        author = self.data.get('author')
        if author is None:
            return ''
        return self._author_sep.split(author, 1)[0]

    def __str__(self):
        """
        Render the entry as 'etype { key', one tab-indented 'name = value'
        line per field in sorted name order (comma-separated), and a
        closing '}'.
        """
        fields = ['\t%s = %s' % (name, self.data[name])
                  for name in sorted(self.data)]
        lines = ['%s { %s' % (self.etype, self.key)]
        # only add the field section when there is one; trimming a fixed
        # number of characters would eat into the key of a field-less entry
        if fields:
            lines.append(',\n'.join(fields))
        return '\n'.join(lines) + '\n}'
        
        
if __name__ == '__main__':
    # Smoke test: load the bib file named on the command line, print how
    # many entries were read, and print the first author of one sample key
    # when the file happens to contain it.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <bibfile>\n' % sys.argv[0])
        sys.exit(1)

    bibdict = LoadBibFile(sys.argv[1])
    print(len(bibdict))

    samplekey = 'talanta:2000:51:455'
    if samplekey in bibdict:
        print(bibdict[samplekey].get_first_author())

