#!/usr/bin/env python

# Rajarshi Guha <rguha@indiana.edu>
# 08/26/07

import string, sys, getopt
from openeye.oechem import *

max_path_len = 9
verbose = False
scale_type = 'raw'
error_file = 'error.txt'

# Schneiders SMARTS's patterns
#cats_smarts = {
#    'D' : ['[#6H]', '[#7H,#7H2]'],
#    'A' : ['[#6]', '[#7H0]'],
#    'P' : ['[*+]', '[#7H2]'],
#    'N' : ['[*-]', '[C&$(C(=O)O),P&$(P(=O)),S&$(S(=O)O)]'],
#    'L' : ['[Cl,Br,I]', '[S;D2;$(S(C)(C))]']
#}

# donor and acceptor SMARTS taken from 
# http://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html#H_BOND
#
# I think the L patterns for the carbon adjacent only to carbons covers
# all possible cases. I have not taken into account primary carbon
# atoms (i.e., methyl groups)
cats_smarts = {
    'D' : ['[!$([#6,H0,-,-2,-3])]'],
    'A' : ['[!$([#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]'],
    'P' : ['[*+]', '[#7H2]'],
    'N' : ['[*-]', '[C&$(C(=O)O),P&$(P(=O)),S&$(S(=O)O)]'],
    'L' : ['[Cl,Br,I]', '[S;D2;$(S(C)(C))]',
	   '[C;D2;$(C(=C)(=C))]', '[C;D3;$(C(=C)(C)(C))]', '[C;D4;$(C(C)(C)(C)(C))]',
	   '[C;D3;H1;$(C(C)(C)(C))]', '[C;D2;H2;$(C(C)(C))]',]
}
    
cats_desc = ['DD','AD', 'DP', 'DN', 'DL',
	     'AA', 'AP', 'AN', 'AL',
	     'PP', 'NP', 'LP',
	     'NN', 'LN', 'LL']


def getPcoreGroups(mol, smarts):
    """
    Given a molecule it assigns PPP's to individual atoms

    The return value is a list of length number of atoms in
    the input molecule. The i'th element of the list contains
    a list of PPP labels that were identified for the i'th atom
    """
    matcher = OESubSearch()
    ret = ['' for x in range(0,mol.NumAtoms())]

    labels = smarts.keys()
    for label in labels:
	patterns = smarts[label]
	for pattern in patterns:
	    matcher.Init(pattern)
	    matched = False
	    for matchbase in matcher.Match(mol,1):
		for pair in matchbase.GetAtoms():
		    idx = pair.target.GetIdx()
		    if ret[idx] == '': ret[idx] = [label]
		    else: 
			tmp = ret[idx]
			tmp.append(label)
			ret[idx] = tmp
		matched = True
	    if matched: break
    return ret

def _getZeroMatrix(r,c):
    return [[0 for x in range(0,c)] for x in range(0,r)]

def getAdjacencyMatrix(mol):
    """
    Generates an adjacency matrix for a molecule. Note that this
    will be a matrix with 0 along the diagonals
    """
    n = mol.NumAtoms()
    admat = _getZeroMatrix(n,n)
    for bond in mol.GetBonds():
	bgn_idx = bond.GetBgnIdx()
	end_idx = bond.GetEndIdx()
	admat[bgn_idx][end_idx] = 1
	admat[end_idx][bgn_idx] = 1
    return admat

def getTopologicalDistanceMatrix(admat):
    """
    Generates the topological distance matrix given
    an adjacency matrix
    """
    n = len(admat)
    d = _getZeroMatrix(n,n)
    for i in range(0,n):
	for j in range(0,n):
	    if admat[i][j] == 0: d[i][j] = 99999999;
	    else: d[i][j] = 1
    for i in range(0,n): d[i][i] = 0
    
    for k in range(0,n):
	for i in range(0,n):
	    for j in range(0,n):
		if d[i][k]+d[k][j] < d[i][j]: d[i][j] = d[i][k]+d[k][j]
    return d

def getPPPMatrix(admat, ppp_labels):
    pppm = {}
    n = len(admat)
    for i in range(0,n):
	ppp_i = ppp_labels[i]
	if ppp_i == '': continue
	for j in range(0,n):
	    ppp_j = ppp_labels[j]
	    if ppp_j == '': continue
	    pairs = []
	    for x in ppp_i:
		for y in ppp_j:
		    if (x,y) not in pairs and (y,x) not in pairs:
			## make sure to add the labels in increasing
			## lexicographical order
			if x < y: tmp = (x,y)
			else: tmp = (y,x)
			pairs.append(tmp)
	    pppm[i,j] = pairs
    return pppm
	    
def usage():
    print """
    cats2d.py [options]

    Generates CATS2D descriptors which are topological pharmacophore
    fingerprints.

    -e, --error    Error log file. Default is error.txt
    -h, --help     Print this message
    -i, --in       Name of the input SMILES file
    -l, --len      Maximum path length (default is 9)
    -o, --out      Name of the output file (default is cats2d.txt)
    -s, --scale    What type of scaling to use. The possible values
                   are: raw, num and occ. The default is raw and occ
                   is not implemented at the moment
    -v, --verbose  Verbose output
    """

if __name__ == '__main__':
    
    ifilename = None
    ofilename = 'cats2d.txt'

    if len(sys.argv) == 1:
	usage()
	sys.exit(-1)

    try:
	opt,args = getopt.getopt(sys.argv[1:], 'o:i:l:s:e:hv', \
				 ['help', 'in=', 'out=', 'verbose', 
				  'len=', 'scale=', 'error='])
    except getopt.GetoptError:
	usage()
	sys.exit(-1)

    for o,a in opt:
	if o in ('-e', '--error'):
	    error_file = a
	if o in ('-s', '--scale'):
	    if a not in ['raw', 'num', 'occ']:
		usage()
		sys.exit(-1)
	    else: scale_type = a
	if o in ('-h', '--help'):
	    usage()
	    sys.exit(-1)
	if o in ('-i', '--in'):
	    ifilename = a
	if o in ('-o', '--out'):
	    ofilename = a
	if o in ('-v', '--verbose'):
	    verbose = True
	if o in ('-l', '--len'):
	    max_path_len = int(a)
    
    if max_path_len < 1:
	print 'Using max_path_len = 9'
	max_path_len = 9

    if not ifilename:
	print 'Must specify an input file'
	usage()
	sys.exit(-1)

    ofile = open(ofilename, 'w')
    ofile.write("Title ")
    for label in cats_desc:
	for i in range(0, max_path_len+1):
	    ofile.write('%s.%d ' % (label,i))
    ofile.write('\n')

    ## Lets start looping over the SMILES file
    sw = OEStopwatch()

    ifs = oemolistream(ifilename)
    nmol = 0
    for mol in ifs.GetOEMols():
	natom = mol.NumAtoms()
	ppp_labels = getPcoreGroups(mol, cats_smarts)
	admat = getAdjacencyMatrix(mol)
	tdistmat = getTopologicalDistanceMatrix(admat)
	pppmat = getPPPMatrix(tdistmat, ppp_labels)

	# get the occurence of each of the PPP's
	ppp_count = dict(zip(['D','N', 'A', 'P', 'L'], [0]*5))
	for label in ppp_labels:
	    for ppp in label:
		ppp_count[ppp] = ppp_count[ppp] + 1

	# lets calculate the CATS2D raw descriptor
	desc = [ [0 for x in range(0,max_path_len+1)] for x in range(0,15)]
	for x,y in pppmat.keys():
	    labels = pppmat[x,y]
	    dist = tdistmat[x][y]
	    if dist > max_path_len: continue
	    for pair in labels:
		id = '%s%s' % (pair[0],pair[1])
		idx = cats_desc.index(id)
		vals = desc[idx]
		vals[dist] += 1
		desc[idx] = vals
	
	if scale_type == 'num':
	    for row in range(0, len(desc)):
		for col in range(0, len(desc[0])):
		    desc[row][col] = float(desc[row][col]) / natom
	elif scale_type == 'occ':
	    #  get the scaling factors
	    facs = [0]*len(cats_desc)
	    count = 0
	    for ppp in cats_desc:
		facs[count] = ppp_count[ppp[0]] + ppp_count[ppp[1]]
		count += 1

	    # each row in desc corresponds to a PPP pair
	    # so the scale factor is constant over cols of a row
	    count = 0
	    for i in range(0, len(desc)):
		if facs[i] == 0: continue
		for j in range(0, len(desc[0])):
		    desc[i][j] = desc[i][j] / float(facs[i])

	ofile.write('%s ' % (mol.GetTitle()))
	for row in desc:
	    for col in row: ofile.write('%3.5f ' % (col))
	ofile.write('\n')

	nmol += 1
	if nmol % 100 == 0 and verbose:
	    sys.stdout.write('\rProcessed %d molecules' % (nmol))
	    sys.stdout.flush()


    ofile.close()
    ifs.close()

    if verbose: 
	print '\rProcessed %d molecules [%1.3f sec/mol]' % (nmol, sw.Elapsed()/nmol)