import xml.sax
import sys, StringIO
from UserDict import UserDict

# CML 1.01 Parser
#
# Rajarshi Guha 
# <rajarshi@presidency.com> Oct, 2002

class Atom(UserDict):
    """
    One atom of a molecule, stored as a plain key/value mapping.

    An Atom starts out empty. Whoever builds it adds one entry per
    piece of data found for that atom in the CML file, using the CML
    'builtin' name as the key.
    """
    def __init__(self):
        # Always start empty: no initial mapping is accepted.
        UserDict.__init__(self, None)
    
class Bond(UserDict):
    """
    One bond of a molecule, stored as a plain key/value mapping.

    A Bond starts out empty. Whoever builds it adds one entry per
    piece of data found for that bond in the CML file, using the CML
    'builtin' name as the key.
    """
    def __init__(self):
        # Always start empty: no initial mapping is accepted.
        UserDict.__init__(self, None)
        

class Molecule:
    """
    Holds the data collected for a single molecule, i.e. what was found
    between one pair of <molecule></molecule> tags in a CML file.

    Instance variables:

    attrs    -- the mapping given to the constructor, or None when no
                mapping (or an empty one) was given
    numatom  -- atom count; 0 until the code filling the molecule sets it
    numbond  -- bond count; 0 until the code filling the molecule sets it
    atomlist -- list of the molecule's atoms, initially empty
    bondlist -- list of the molecule's bonds, initially empty
    """
    def __init__(self, d = None):
        """
        Create an empty molecule.

        d -- optional mapping of attribute names to string values. An
             empty mapping is treated like no mapping at all and
             leaves attrs set to None.
        """
        self.attrs = d or None
        self.numatom = 0
        self.numbond = 0
        self.atomlist = []
        self.bondlist = []

    def display(self):
        """
        Print the molecule to standard output: one 'key = value' line
        per attribute (none when attrs is None), then the atoms under
        an 'Atoms' heading and the bonds under a 'Bonds' heading, one
        item per line.
        """
        # attrs is None for a molecule created without attributes, so
        # there is nothing to list in that case.
        if self.attrs is not None:
            for key in self.attrs.keys():
                print(key + ' = ' + self.attrs[key])

        print("Atoms")
        for atom in self.atomlist:
            print(atom)

        print('Bonds')
        for bond in self.bondlist:
            print(bond)
            
        
class CMLContentHandler(xml.sax.ContentHandler):
    """
    SAX content handler that turns CML 1.01 documents (DTD found at
    http://www.xml-cml.org/) into Molecule instances.

    The handler is given a list. Every time a </molecule> tag is
    reached, the Molecule built for it is appended to that list. The
    same handler, and therefore the same list, can be used to parse
    several files one after another.

    Elements handled: <molecule>, <atomArray>, <bondArray>, <atom>,
    <bond>, <string> and <float>. Any other element (including the
    <array> element mentioned by the CML 1.01 DTD) is ignored.

    How data is stored:

    - an <atom> inside an <atomArray> becomes an Atom whose 'atomid'
      entry is the element's 'id' attribute (None if it has none)
    - a <bond> inside a <bondArray> becomes a Bond whose 'bondid'
      entry is the element's 'id' attribute (None if it has none)
    - a <string> or <float> inside an atom or bond is stored under the
      name given by its 'builtin' attribute; <float> text is converted
      with float(). Elements without a 'builtin' attribute are skipped.
    - inside a bond, every <string builtin="atomRef"> is appended to
      the list stored under 'atomRef', so a bond keeps all of its atom
      references
    """
    def __init__(self, m):
        """
        m -- list that receives one Molecule per <molecule> element
        """
        xml.sax.ContentHandler.__init__(self)
        self.m = m

        self.currattr = None   # attributes of the most recently opened element
        self.tmpstr = ''       # text collected for the open <string>/<float>
        self.tmpmol = None     # molecule being built
        self.tmpatom = None    # atom being built
        self.tmpbond = None    # bond being built

        # 1 while the parser is inside the corresponding element, else 0
        self.inFloat = 0
        self.inString = 0
        self.inAtom = 0
        self.inBond = 0
        self.inAtomArray = 0
        self.inBondArray = 0
        self.inMolecule = 0

    def startDocument(self):
        print('Doc handling started')

    def endDocument(self):
        print('Doc handling ended')

    def startElement(self, name, attrs):
        """
        Record the attributes of the element and open whatever object
        (molecule, atom, bond, text buffer) the element stands for.
        """
        # The parser may reuse the attrs object once this call returns,
        # so keep a private copy for endElement() and for the Molecule.
        self.currattr = attrs.copy()

        if name == 'molecule':
            self.inMolecule = 1
            self.tmpmol = Molecule(self.currattr)

        elif name == 'string':
            self.tmpstr = ''
            self.inString = 1
            # Create the atomRef list only once per bond; recreating it
            # here would throw away the references already collected.
            if self.inBond and self.currattr.get('builtin') == 'atomRef':
                self.tmpbond.setdefault('atomRef', [])

        elif name == 'float':
            self.tmpstr = ''
            self.inFloat = 1

        # An array outside a molecule has nothing to attach to.
        elif name == 'atomArray' and self.inMolecule:
            self.inAtomArray = 1
            self.tmpmol.atomlist = []

        elif name == 'bondArray' and self.inMolecule:
            self.inBondArray = 1
            self.tmpmol.bondlist = []

        elif name == 'atom' and self.inAtomArray:
            self.tmpatom = Atom()
            self.inAtom = 1
            self.tmpatom['atomid'] = self.currattr.get('id')

        elif name == 'bond' and self.inBondArray:
            self.tmpbond = Bond()
            self.inBond = 1
            self.tmpbond['bondid'] = self.currattr.get('id')

    def characters(self, ch):
        # Only text inside <string>/<float> is ever used, so the
        # whitespace between other elements is not collected.
        if self.inString or self.inFloat:
            self.tmpstr += ch

    def endElement(self, name):
        """
        Close the element: store finished <string>/<float> values,
        attach finished atoms and bonds to the molecule, and hand the
        finished molecule over to the result list.
        """
        if name == 'molecule':
            self.tmpmol.numatom = len(self.tmpmol.atomlist)
            self.tmpmol.numbond = len(self.tmpmol.bondlist)
            self.m.append(self.tmpmol)
            self.inMolecule = 0

        elif name == 'string':
            self._storeBuiltin(0)
            self.inString = 0

        elif name == 'float':
            self._storeBuiltin(1)
            self.inFloat = 0

        # OK, we have all the data for this bond, append it to the
        # molecule bond list (only if a bond was actually opened)
        elif name == 'bond':
            if self.inBond:
                self.tmpmol.bondlist.append(self.tmpbond)
            self.inBond = 0

        # OK, we have all the data for this atom, append it to the
        # molecule atom list (only if an atom was actually opened)
        elif name == 'atom':
            if self.inAtom:
                self.tmpmol.atomlist.append(self.tmpatom)
            self.inAtom = 0

        elif name == 'bondArray':
            self.inBondArray = 0

        elif name == 'atomArray':
            self.inAtomArray = 0

    def _storeBuiltin(self, isFloat):
        """
        Store the collected text on the open atom and/or bond under
        the 'builtin' name of the element that just ended. The text is
        converted with float() when isFloat is true.
        """
        if not (self.inAtom or self.inBond):
            return
        key = self.currattr.get('builtin')
        if key is None:
            # Nothing to file the value under.
            return

        if isFloat:
            value = float(self.tmpstr)
        else:
            value = self.tmpstr

        if self.inAtom:
            self.tmpatom[key] = value
        if self.inBond:
            if key == 'atomRef' and not isFloat:
                self.tmpbond.setdefault('atomRef', []).append(value)
            else:
                self.tmpbond[key] = value

    def skippedEntity(self, name):
        pass

def _main(argv):
    """
    Command line entry point: parse every CML file named in argv[1:]
    and print how many molecules were found, followed by the atom and
    bond counts of each one.

    When no file is given, the usage text is printed and the program
    exits with status 1 instead of going on to parse nothing.
    """
    if len(argv) == 1:
        print("""
        Usage: cml101.py CML_FILE ...

        Wil parse a list of CML files (which can containm one or more
        molecule definitions) and stores the information in a Molecule
        class. See the source to get more info about Molecule, Atom &
        Bond classes.

        No error handling yet, so you need to supply a well formed CML
        file.
        """)
        sys.exit(1)

    molecules = []

    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    parser.setContentHandler(CMLContentHandler(molecules))

    # One handler, and so one result list, is shared by all files.
    for filename in argv[1:]:
        parser.parse(filename)

    print('Parsed %s molecules' % len(molecules))
    for mol in molecules:
        print('Number of atoms = %d, Number of bonds = %d' % (mol.numatom, mol.numbond))


if __name__ == '__main__':
    _main(sys.argv)