# # Rajarshi Guha # 04/03/2004 # import sys, string, copy, getopt import HinFile class Sorter : # Initialize self.__attribute . It will be abused later. def __init__ (self) : self.__attribute = None # Notice how __compare is dependant upon self.__attribute def __compare (self, x, y) : return cmp(x[self.__attribute], y[self.__attribute]) # Pass the sort function the __compare function defined above. def __call__ (self, data, attribute = None) : if attribute == None : data.sort() else : self.__attribute = attribute data.sort(self.__compare) # This is useful if you want to inline things. # I don't know how much innefficiency it introduces. # Is this just a new reference? (that would be cool.) return data def usage(): print """ Usage: morgan.py OPTIONS -h,--help This messsage -i,--input FILE Input file (HIN format) -v,--verbose Verbose output This script converts the serial numbers of the atoms in the supplied HIN file to the canonical form using the basic Morgan algorithm. Currently if two or more atoms have the same EC values, they are simply numbered serially (ie, ordering via atom type or bond type is ignored). Furthermore, oscillating EC values and atoms with isospectral points are not considered. Furthermore it operates on the full molecule (ie includes hydrogens) The program simply outputs the new and old serial numbers. A new HIN file is not written out. Its mainly a proof of concept code. The algorithm was implemented from the description in Cheminformatics, A Textbook by Gasteiger & Engel (Eds.), page 60. """ def num_unique( v ): l = [] for i in v: if i not in l: l.append(i) return(len(l)) if __name__ == '__main__': if len(sys.argv) < 3: usage() sys.exit(0) verbose = 0 hinfilename = '' try: opt,args = getopt.getopt(sys.argv[1:], 'i:hv',\ ['input=','help','verbose']) except getopt.GetoptError: usage() sys.exit(0) for o,a in opt: if o in ('-h','--help'): usage() sys.exit(0) if o in ('-v','--verbose'): verbose = 1 if o in ('-i','--input'): hinfilename = a sorter = Sorter() hinfile = HinFile.HinFileReader( hinfilename ) # Get the serial numbers that were read in serial = [] for atom in hinfile.atomlist: serial.append( atom['serial'] ) # get initial EC values oldec = [] for ser in serial: oldec.append( hinfile.get_numbond(ser) ) nclass = num_unique( oldec ) # start expanding the sphere currec = [] while 1: for ser in serial: # get the serial number of the atoms connected to the current atom con = hinfile.get_connected_serials(ser) s = 0 # loop over each connected atom and find its EC value and sum them for cs in con: idx = serial.index(cs) s = s + oldec[idx] # s thus has the EC value of the current atom in this sphere currec.append( s ) # see if the number of EC's is <= number of EC's in the previous iter if num_unique( currec ) <= nclass: break else: nclass = num_unique( currec ) oldec = copy.deepcopy( currec ) currec = [] if verbose: print 'Obtained EC values' # OK, num unique EC's converged # Since we use the iteration where the highest number of EC's showed up # means we use oldec rather than currec (since we exited when # currec <= oldec) sec = zip(serial, oldec) sorter(sec,1).reverse() newserial = {} currserial = 1 for s,e in sec: #set the new serial for the serial corresponding to this ec if verbose: print 'Renumbering serial %d' % (s) if str(s) not in newserial.keys(): newserial[ str(s) ] = currserial currserial += 1 else: if verbose: print ' Ignoring serial %d' %(s) # lets number the neighbors of this serial number cons = hinfile.get_connected_serials( s ) if verbose: print ' Numbering nbrs: ' + str(cons) # make a list of ec values for these connected serials consec = [] for c in cons: for ss,ee in sec: if ss == c: consec.append(ee) break consec = zip(cons, consec) sorter(consec,1).reverse() for ss, ee in consec: if str(ss) not in newserial.keys(): newserial[ str(ss) ] = currserial currserial += 1 else: if verbose: print ' Ignoring serial %d' % (ss) newkeys = [int(x) for x in newserial.keys()] newkeys.sort() print 'old : new' for key in newkeys: print '%d : %d' % (key, newserial[str(key)])