#!/usr/local/bin/py
import sys
import os
import string
import math
import random

import dan
import setbin

def stripreturn(l):
    """Return `l` without its final newline character, if it ends with one.

    At most one trailing newline is removed; any other trailing characters
    are left alone.  An empty value is returned unchanged (the previous
    implementation raised IndexError when indexing its last character).
    """
    # The truthiness guard makes the last-element lookup safe on ''.
    if l and l[-1] == '\n':
        return l[:-1]
    return l

def scramblelist(list, iter = 10):
    """Shuffle `list` in place `iter` times and hand the same object back.

    list -- mutable sequence to shuffle (modified in place)
    iter -- number of random.shuffle passes to apply (default 10)

    The parameter names shadow builtins; they are kept as-is because they
    are part of the signature callers may use by keyword.
    """
    for _ in range(iter):
        random.shuffle(list)
    return list
    
##############################################################

if len(sys.argv) != 3:
    # Usage errors go to stderr so they never mix with the report on stdout.
    sys.stderr.write('Must supply the pos.txt & depv.txt file\n')
    sys.exit(-1)
print('')


def _read_percentage(prompt):
    """Prompt with `prompt` and return the reply as a float in [0, 100].

    Exits with status -1 and a message on stderr when the reply is not a
    number or falls outside that range, instead of dying with a traceback
    or carrying a meaningless value into the set-size arithmetic.
    """
    reply = raw_input(prompt)
    try:
        value = float(reply)
    except ValueError:
        sys.stderr.write('Error: %r is not a number\n' % reply)
        sys.exit(-1)
    # The chained comparison is also False for NaN, which rejects it.
    if not 0 <= value <= 100:
        sys.stderr.write('Error: percentage must be between 0 and 100, got %r\n' % reply)
        sys.exit(-1)
    return value


tsetp = _read_percentage("Enter tset percentage: ")
psetp = _read_percentage("Enter pset percentage: ")
csetp = _read_percentage("Enter cvset percentage: ")

# The CV set is sized further down as whatever is left after the TSET and
# PSET counts are taken, so those two together must not exceed 100% or the
# CV count would go negative.
if tsetp + psetp > 100:
    sys.stderr.write('Error: tset and pset percentages add up to more than 100\n')
    sys.exit(-1)


# Read in all dans and their associated class.
#
# sys.argv[1] is parsed line by line: the text before the first '(' is the
# dan identifier and the text after the following ')' is the integer class.
# Lines containing 'Molecule' or 'Total' are skipped.  For every molecule
# line, one line is consumed from sys.argv[2] and parsed as a float.
# Each entry of danlist is the tuple (dan id as str, class, value).
danlist = []
f1 = open(sys.argv[2], 'r')
try:
    f = open(sys.argv[1], 'r')
    try:
        for line in f:
            # Header / summary lines carry no molecule data.
            if 'Molecule' in line or 'Total' in line:
                continue
            # A blank line (e.g. at the end of the file) is not a molecule
            # and must not consume a value from the depv file.
            if not line.strip():
                continue

            fields = line.split('(')
            d = fields[0].strip()  # kept as a string here
            fields = fields[1].split(')')
            c = int(fields[1])

            depline = f1.readline()
            if not depline.strip():
                # readline() returns '' at EOF; report that clearly rather
                # than letting float('') raise an opaque ValueError.
                sys.stderr.write('Error: ' + sys.argv[2] +
                                 ' has no value for molecule ' + d + '\n')
                sys.exit(-1)
            v = float(depline)

            danlist.append( (d,c,v) )
    finally:
        f.close()
finally:
    # Runs on every exit path, including a failed open of sys.argv[1].
    f1.close()

# Nothing to split if no molecule lines were read; stop before the class
# percentage below divides by zero.
if not danlist:
    sys.stderr.write('Error: no molecules were read from ' + sys.argv[1] + '\n')
    sys.exit(-1)

# Now make a translation table from the dans and
# from here on only use the translated forms (ie integers)
old2new, new2old = dan.dan_translation([d for d,c,v in danlist])

# Now separate the dans into 2 classes, using *new* dan numbers.
# Class 1 is exactly c == 1; every other class value lands in c2.
c1 = [old2new[d] for d,c,v in danlist if c == 1]
c2 = [old2new[d] for d,c,v in danlist if c != 1]

# Whole-number percentage of molecules in class 1 (rounded down); class 2
# gets the remainder so the two always add up to 100.
percentc1 = math.floor(float(len(c1)) / (len(c2) + len(c1)) * 100)
percentc2 = 100 - percentc1

# NOTE: if the hin files are in danXXX.hin format, then we need
# to extract dan numbers from them
dandir = raw_input('Enter the directory where all the hin files are: ')
try:
    danflist = os.listdir(dandir)
except OSError:
    print('Error: Directory '+dandir+' does not exist')
    # danflist was never bound, so continuing would only fail later with a
    # NameError; stop here with the same exit status as the usage check.
    sys.exit(-1)

#
# From here on, work with internal numberings
#
# Overall set sizes: TSET and PSET are rounded up from their percentages of
# the number of directory entries, CVSET takes whatever is left.
nummol = len(danflist)
numt, nump = [int(math.ceil(pct/100 * nummol)) for pct in (tsetp, psetp)]
numc = nummol - (numt + nump)


# Get number of (*)set in KSOM class1 and class2
fracc1 = percentc1/100
numtc1 = int(math.ceil(fracc1 * numt))
numpc1 = int(math.ceil(fracc1 * nump))
numcc1 = int(math.floor(fracc1 * numc))

# Class 2 receives the remainder of each set.
numtc2, numpc2, numcc2 = numt - numtc1, nump - numpc1, numc - numcc1

# take into account possible rounding errors: whatever separates the
# per-class totals from the real class sizes is pushed into the TSET counts
# (adding a zero difference is a no-op, so no special case is needed)
diff1 = len(c1) - (numtc1 + numpc1 + numcc1)
diff2 = len(c2) - (numtc2 + numpc2 + numcc2)
numtc1 += diff1
numtc2 += diff2

# Summary report: assemble every line first, then emit them in one go.
# Empty entries stand for the blank separator lines.
summary = [
    '',
    'Total Number of molecules = '+str(nummol),
    'KSOM:: Class 1 = '+str(percentc1)+'% ('+str(len(c1))+') Class 2 = '+str(percentc2)+'% ('+str(len(c2))+')',
    'TSET = '+str(numt)+' PSET = '+str(nump)+' CVSET = '+str(numc),
    '',
    'Breakup by KSOM class:',
    '        Class 1    Class 2',
    # One row per set: class 1 count, class 2 count, row total.
    'TSET    '+str(numtc1)+'        '+str(numtc2)+'  = '+str(numtc1+numtc2),
    'PSET    '+str(numpc1)+'        '+str(numpc2)+'    = '+str(numpc1+numpc2),
    'CSET    '+str(numcc1)+'        '+str(numcc2)+'    = '+str(numcc1+numcc2),
    '        ----------------',
    # Column totals per class.
    '        '+str(numtc1+numpc1+numcc1)+'      '+str(numtc2+numpc2+numcc2),
]
print('\n'.join(summary))

# Get the tset, pset & cset for each KSOM class.
# Basically send the whole class list to setbin, with the proper
# numbers for tset, pset & cset.

# Index the dependent values by *new* dan number once, instead of
# rescanning danlist for every class member (which was quadratic).
# Values are kept in lists, in danlist order, so a dan that occurs more
# than once still contributes every one of its values, as before.
# NOTE(review): assumes the old2new values are hashable (the translation
# comment earlier in this file calls them integers) -- confirm against dan.
depv_by_new = {}
for d,c,v in danlist:
    depv_by_new.setdefault(old2new[d], []).append(v)


def _depv_for(members):
    """Return the dependent values for `members`, following member order.

    Members with no entry in the index contribute nothing.
    """
    values = []
    for m in members:
        values.extend(depv_by_new.get(m, []))
    return values


# KSOM class 1
depv = _depv_for(c1)
bins,probs = setbin.makebin(c1, depv, 10)
tset1, pset1, cset1 = setbin.generate_sets(bins, probs, numtc1, numpc1, numcc1, c1, depv)

# Now do it for KSOM class 2
depv = _depv_for(c2)
bins,probs = setbin.makebin(c2, depv, 10)
tset2, pset2, cset2 = setbin.generate_sets(bins, probs, numtc2, numpc2, numcc2, c2, depv)

# Fold the class 2 sets into the corresponding class 1 lists (in place).
for merged, extra in ((tset1, tset2), (pset1, pset2), (cset1, cset2)):
    merged.extend(extra)


def _sorted_ints(items):
    """Return a new ascending list holding int(x) for every x in `items`."""
    numbers = [int(x) for x in items]
    numbers.sort()
    return numbers


# The original note here says the dans are str's, hence the int()
# conversion before sorting - just sugar :)
tset1 = _sorted_ints(tset1)
pset1 = _sorted_ints(pset1)
cset1 = _sorted_ints(cset1)

# Now write them out!!
setbin.print_set(tset1, pset1, cset1, 'tsets.in',1)