import xml.sax
import xml.sax.handler
import sys

# The original module targets Python 2 (StringIO / UserDict modules).  The
# fallbacks below keep those imports intact while letting the module load on
# Python 3 as well.
try:
    import StringIO
except ImportError:  # Python 3: the StringIO module was folded into io
    import io as StringIO

try:
    from UserDict import UserDict
except ImportError:  # Python 3: UserDict lives in collections
    from collections import UserDict

# CML 1.01 Parser
#
# Rajarshi Guha
# Oct, 2002


class Atom(UserDict):
    """
    A single atom, stored as a dictionary.

    The handler stores the atom's ``id`` attribute under the key 'atomid';
    every other key is the ``builtin`` attribute of a <string> or <float>
    child element, added as it occurs in the CML file.
    """

    def __init__(self):
        UserDict.__init__(self)


class Bond(UserDict):
    """
    A single bond, stored as a dictionary.

    The handler stores the bond's ``id`` attribute under the key 'bondid'
    and collects all 'atomRef' strings into a list under the key 'atomRef';
    every other key is the ``builtin`` attribute of a <string> or <float>
    child element, added as it occurs in the CML file.
    """

    def __init__(self):
        UserDict.__init__(self)


class Molecule:
    """
    A single molecule read from a CML file.

    Instance variables:

    attrs    -- the attribute mapping passed to the constructor (the
                handler passes the SAX attributes of the <molecule> tag),
                or None when no (or an empty) mapping was given
    numatom  -- number of atoms, filled in by the handler
    numbond  -- number of bonds, filled in by the handler
    atomlist -- list of Atom instances
    bondlist -- list of Bond instances
    """

    def __init__(self, d=None):
        if d:
            self.attrs = d
        else:
            self.attrs = None
        self.numatom = 0
        self.numbond = 0
        self.atomlist = []
        self.bondlist = []

    def display(self):
        """Print the molecule attributes, then every atom and bond."""
        # attrs is None when the molecule had no attributes; treat that as
        # "nothing to print" instead of failing on None.keys().
        attrs = self.attrs or {}
        for key in attrs.keys():
            print('%s = %s' % (key, attrs[key]))

        # Each element of atomlist is an Atom, each element of bondlist a
        # Bond; both print as plain dictionaries.
        print("Atoms")
        for atom in self.atomlist:
            print(atom)

        print('Bonds')
        for bond in self.bondlist:
            print(bond)


class CMLContentHandler(xml.sax.ContentHandler):
    """
    SAX content handler for CML 1.01 documents.

    The handler is given a list, to which it appends one Molecule instance
    for every <molecule> element it sees.  Because the list belongs to the
    caller, the same handler can be used to parse several files (each
    containing any number of molecules) into one list.

    The elements handled are <molecule>, <atomArray>, <bondArray>, <atom>,
    <bond>, <string> and <float>.  <string> and <float> values are only
    recorded when they occur inside an <atom> or a <bond> and carry a
    ``builtin`` attribute; all other elements are ignored.
    """

    def __init__(self, m):
        self.m = m
        self.currattr = None
        self.tmpstr = ''
        self.tmpmol = None
        self.tmpatom = None
        self.tmpbond = None

        self.inFloat = 0
        self.inString = 0
        self.inAtom = 0
        self.inBond = 0
        self.inAtomArray = 0
        self.inBondArray = 0
        self.inMolecule = 0

    def startDocument(self):
        print('Doc handling started')

    def endDocument(self):
        print('Doc handling ended')

    def startElement(self, name, attrs):
        """Open a new element and set up the matching temporary state."""
        self.currattr = attrs

        if name == 'molecule':
            self.inMolecule = 1
            self.tmpmol = Molecule(attrs)
        elif name == 'string':
            self.tmpstr = ''
            self.inString = 1
            # Create the atomRef list only once per bond: a bond holds
            # several atomRef strings and all of them must be kept.
            if self.inBond and attrs.get('builtin') == 'atomRef':
                self.tmpbond.setdefault('atomRef', [])
        elif name == 'float':
            self.tmpstr = ''
            self.inFloat = 1
        elif name == 'atomArray':
            self.inAtomArray = 1
            self.tmpmol.atomlist = []
        elif name == 'bondArray':
            self.inBondArray = 1
            self.tmpmol.bondlist = []
        elif name == 'atom' and self.inAtomArray:
            self.tmpatom = Atom()
            self.inAtom = 1
            self.tmpatom['atomid'] = attrs.get('id')
        elif name == 'bond' and self.inBondArray:
            self.tmpbond = Bond()
            self.inBond = 1
            self.tmpbond['bondid'] = attrs.get('id')

    def characters(self, ch):
        # Text is only meaningful inside <string>/<float>; skipping the
        # rest keeps inter-element whitespace from piling up in tmpstr.
        if self.inString or self.inFloat:
            self.tmpstr += ch

    def endElement(self, name):
        """Close an element and commit the data gathered for it."""
        if name == 'molecule':
            self.tmpmol.numatom = len(self.tmpmol.atomlist)
            self.tmpmol.numbond = len(self.tmpmol.bondlist)
            self.m.append(self.tmpmol)
            self.inMolecule = 0
        elif name == 'string' or name == 'float':
            self._store_value(name)
        elif name == 'bond':
            # OK, we have all the data for this bond, append it to the
            # molecule bond list (only if the bond was actually opened).
            if self.inBond:
                self.tmpmol.bondlist.append(self.tmpbond)
            self.inBond = 0
        elif name == 'atom':
            # OK, we have all the data for this atom, append it to the
            # molecule atom list (only if the atom was actually opened).
            if self.inAtom:
                self.tmpmol.atomlist.append(self.tmpatom)
            self.inAtom = 0
        elif name == 'bondArray':
            self.inBondArray = 0
        elif name == 'atomArray':
            self.inAtomArray = 0

    def _store_value(self, name):
        """Store the text of a finished <string>/<float> on the open atom or bond."""
        is_float = (name == 'float')
        if is_float:
            self.inFloat = 0
        else:
            self.inString = 0

        if not (self.inAtom or self.inBond):
            return

        # <string> and <float> have no child elements here, so currattr
        # still holds the attributes of the element being closed.
        builtin = self.currattr.get('builtin')
        if builtin is None:
            return

        if is_float:
            value = float(self.tmpstr)
        else:
            value = self.tmpstr

        if self.inAtom:
            self.tmpatom[builtin] = value
        if self.inBond:
            if builtin == 'atomRef' and not is_float:
                self.tmpbond.setdefault('atomRef', []).append(value)
            else:
                self.tmpbond[builtin] = value

    def skippedEntity(self, name):
        pass


USAGE = """
    Usage: cml101.py CML_FILE ...

    Wil parse a list of CML files (which can containm one or more
    molecule definitions) and stores the information in a Molecule class.
    See the source to get more info about Molecule, Atom & Bond classes.

    No error handling yet, so you need to supply a well formed CML file.
    """


def main(argv=None):
    """Parse every CML file named in argv[1:] and print a short summary."""
    if argv is None:
        argv = sys.argv

    if len(argv) == 1:
        print(USAGE)
        return

    m = []
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    ch = CMLContentHandler(m)
    parser.setContentHandler(ch)

    for filename in argv[1:]:
        parser.parse(filename)

    print('Parsed %s molecules' % (len(m)))
    for mol in m:
        print('Number of atoms = %d, Number of bonds = %d'
              % (mol.numatom, mol.numbond))


if __name__ == '__main__':
    main()