#!/usr/local/bin/py
"""Split molecules into training, prediction and cross-validation sets.

Usage: <script> pos.txt depv.txt

The pos file supplies one molecule identifier ("dan") and class per data
line; the depv file supplies one floating point value per data line, in the
same order.  The set percentages and the directory holding the hin files
are asked for interactively.
"""

import sys
import os
import string
import math
import random

import dan
import setbin


def stripreturn(l):
    """Return l without its final newline character, if it ends with one.

    An empty input is returned unchanged.
    """
    if l and l[-1] == '\n':
        return l[:-1]
    return l


def scramblelist(list, iter=10):
    """Shuffle list in place iter times and return the same list object.

    The parameter names shadow builtins but are kept because callers may
    pass them by keyword.
    """
    for _ in range(iter):
        random.shuffle(list)
    return list


##############################################################

if len(sys.argv) != 3:
    print('Must supply the pos.txt & depv.txt file')
    sys.exit(-1)

# raw_input only exists on Python 2; fall back to input elsewhere so the
# prompts below keep returning the raw text the user typed.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

print('')
tsetp = float(_read_line("Enter tset percentage: "))
psetp = float(_read_line("Enter pset percentage: "))
# csetp is asked for but is not used in the sizing below: the
# cross-validation set receives whatever remains after tset and pset.
csetp = float(_read_line("Enter cvset percentage: "))

# Read in all dans with their associated class and dependent value.
# Every data line of the pos file consumes exactly one line of the depv file.
danlist = []
with open(sys.argv[2], 'r') as f1:
    with open(sys.argv[1], 'r') as f:
        for line in f:
            # Skip the header / summary lines of the pos file.
            if 'Molecule' in line or 'Total' in line:
                continue
            fields = line.split('(')
            # The dan is kept as a string (it is expected to look like an
            # integer).
            d = fields[0].strip()
            # The class is the integer that follows the closing parenthesis.
            fields = fields[1].split(')')
            c = int(fields[1])
            raw_value = f1.readline()
            if not raw_value:
                # The depv file ran out before the pos file did.
                print('Error: ' + sys.argv[2] + ' has fewer values than '
                      + sys.argv[1] + ' has molecules')
                sys.exit(-1)
            v = float(raw_value)
            danlist.append((d, c, v))

if not danlist:
    # Without any molecule the class percentages below would divide by zero.
    print('Error: no molecules found in ' + sys.argv[1])
    sys.exit(-1)

# Now make a translation table from the dans and
# from here on only use the translated forms (ie integers)
old2new, new2old = dan.dan_translation([d for d, c, v in danlist])

# Now separate the dans into 2 classes, using *new* dan numbers:
# class 1 goes to c1, every other class value goes to c2.
c2 = []
c1 = []
for d, c, v in danlist:
    if c == 1:
        c1.append(old2new[d])
    else:
        c2.append(old2new[d])

# Whole-number percentage of class 1; class 2 takes the remainder.
# float() keeps the value (and its printed form) a float on every Python.
percentc1 = float(math.floor(float(len(c1)) / (len(c2) + len(c1)) * 100))
percentc2 = 100 - percentc1

# NOTE: if the hin files are in danXXX.hin format, then we need
# to extract dan numbers from them
dandir = _read_line('Enter the directory where all the hin files are: ')
try:
    danflist = os.listdir(dandir)
except OSError:
    print('Error: Directory '+dandir+' does not exist')
    # Nothing below can work without the file listing, so stop here.
    sys.exit(-1)

#
# From here on, work with internal numberings
#
# The total is the number of entries in the hin directory; the
# cross-validation set gets what the other two sets leave over.
nummol = len(danflist)
numt = int(math.ceil(tsetp/100 * nummol))
nump = int(math.ceil(psetp/100 * nummol))
numc = nummol - numt - nump

# Get number of (*)set in KSOM class1 and class2
numtc1 = int(math.ceil(percentc1/100 * numt))
numpc1 = int(math.ceil(percentc1/100 * nump))
numcc1 = int(math.floor(percentc1/100 * numc))
numtc2 = numt - numtc1
numpc2 = nump - numpc1
numcc2 = numc - numcc1

# Take into account possible rounding errors: whatever separates the real
# class sizes from the counts allocated above is folded into the training
# set counts (adding a zero difference changes nothing).
diff1 = len(c1) - (numtc1 + numpc1 + numcc1)
diff2 = len(c2) - (numtc2 + numpc2 + numcc2)
numtc1 += diff1
numtc2 += diff2

print('')
print('Total Number of molecules = '+str(nummol))
print('KSOM:: Class 1 = '+str(percentc1)+'% ('+str(len(c1))+') Class 2 = '+str(percentc2)+'% ('+str(len(c2))+')')
print('TSET = '+str(numt)+' PSET = '+str(nump)+' CVSET = '+str(numc))
print('')
print('Breakup by KSOM class:')
print(' Class 1 Class 2')
print('TSET '+str(numtc1)+' '+str(numtc2)+' = '+str(numtc1+numtc2))
print('PSET '+str(numpc1)+' '+str(numpc2)+' = '+str(numpc1+numpc2))
print('CSET '+str(numcc1)+' '+str(numcc2)+' = '+str(numcc1+numcc2))
print(' ----------------')
print(' '+str(numtc1+numpc1+numcc1)+' '+str(numtc2+numpc2+numcc2))


def _group_depv_by_new_dan(records, translation):
    """Map each translated dan number to its dependent values.

    records is a sequence of (old dan, class, value) tuples and translation
    maps old dans to new numbers.  Values are kept in record order.
    """
    grouped = {}
    for old_dan, _cls, value in records:
        grouped.setdefault(translation[old_dan], []).append(value)
    return grouped


def _depv_for(members, grouped):
    """Return the dependent values of members, in member order.

    Every value recorded for a member is emitted each time that member
    occurs, which is what a full scan of the records per member would give.
    """
    values = []
    for member in members:
        values.extend(grouped.get(member, []))
    return values


# One pass over the records replaces a rescan of danlist per class member.
depv_by_dan = _group_depv_by_new_dan(danlist, old2new)

# Get the tset, pset & cset from KSOM class 1
# Basically send the whole class 1 list to setbin, with the proper
# numbers for tset, pset & cset
depv = _depv_for(c1, depv_by_dan)
bins, probs = setbin.makebin(c1, depv, 10)
tset1, pset1, cset1 = setbin.generate_sets(bins, probs, numtc1, numpc1, numcc1, c1, depv)

# Now do it for KSOM class 2
depv = _depv_for(c2, depv_by_dan)
bins, probs = setbin.makebin(c2, depv, 10)
tset2, pset2, cset2 = setbin.generate_sets(bins, probs, numtc2, numpc2, numcc2, c2, depv)

# Now just join the respective sets
tset1.extend(tset2)
pset1.extend(pset2)
cset1.extend(cset2)

# Convert the set members to ints and sort - just sugar :)
tset1 = sorted([int(x) for x in tset1])
pset1 = sorted([int(x) for x in pset1])
cset1 = sorted([int(x) for x in cset1])

# Now write them out!!  (handed to setbin.print_set with the name 'tsets.in')
setbin.print_set(tset1, pset1, cset1, 'tsets.in', 1)