import math
import random
import sys


def makebin(danlist, depvlist, numbin=10):
    """Group dans into equal-width bins of their dependent-variable value.

    The output of this function is the input expected by generate_sets().

    Input:  danlist  - list of dans (observation identifiers)
            depvlist - list of depv values, parallel to danlist
            numbin   - number of bins to create (default is 10)
    Output: (bins, probs)
            bins  - list of numbin lists; each holds the dans of one bin
            probs - list of numbin (lower, upper) tuples of cumulative
                    counts; a random integer r with lower <= r < upper
                    selects the bin at the same index

    The bin ranges are made by splitting [min(depvlist), max(depvlist)]
    into numbin equal intervals (*NO* normalization is carried out).
    Empty bins are moved to the front of both lists and get the empty
    range (0, 0); populated bins keep their increasing-depv order.

    Raises ValueError if numbin < 1 or depvlist is empty.
    """
    if numbin < 1:
        raise ValueError("numbin must be at least 1")

    maxdepv = max(depvlist)
    mindepv = min(depvlist)

    # float() keeps the width exact under Python 2, where int / int
    # truncates and would leave observations outside every bin.
    bininc = float(maxdepv - mindepv) / numbin

    # Half-open ranges [low, up) covering min .. max.
    binrange = []
    low = mindepv
    for _ in range(numbin):
        up = low + bininc
        binrange.append((low, up))
        low = up

    bins = [[] for _ in range(numbin)]
    for i, value in enumerate(depvlist):
        # Anything not caught by a half-open range belongs to the highest
        # bin: that is the maximum value itself (== upper bound of the
        # last bin) and every value when all depvs are equal (width 0).
        target = numbin - 1
        for j, (low, up) in enumerate(binrange):
            if low <= value < up:
                target = j
                break
        bins[target].append(danlist[i])

    # Empty bins first, populated bins after them in their original order.
    # This is what sorting on the cumulative counts produces, since empty
    # bins count as 0 and populated bins have strictly increasing counts.
    populated = [members for members in bins if members]
    ordered = [[] for _ in range(numbin - len(populated))] + populated

    # Turn the cumulative counts into (lower, upper) ranges, one per bin.
    probs = []
    low = 0
    for members in ordered:
        if members:
            up = low + len(members)
            probs.append((low, up))
            low = up
        else:
            probs.append((0, 0))

    return (ordered, probs)


###########################################################

def _draw_from_bins(bins, probs, count):
    """Remove and return count dans picked at random from bins."""
    drawn = []
    for _ in range(count):
        # Only bins that still hold members compete for the draw; each is
        # weighted by the width of its range in probs (its original
        # population), which is what retrying on an empty bin amounts to.
        live = [(idx, upper - lower)
                for idx, (lower, upper) in enumerate(probs)
                if upper > lower and bins[idx]]
        if not live:
            raise ValueError("no populated bin left to draw from")

        pick = random.randrange(sum(width for _, width in live))
        for idx, width in live:
            if pick < width:
                break
            pick -= width

        members = bins[idx]
        drawn.append(members.pop(random.randrange(len(members))))
    return drawn


def _extreme_dans(danl, depv):
    """Return the dans of the 2 highest and 2 lowest depv values."""
    # Work on copies so the caller's lists are left untouched.
    values = list(depv)
    dans = list(danl)
    picked = []
    for chooser in (max, max, min, min):
        if not values:
            break
        idx = values.index(chooser(values))
        values.pop(idx)
        picked.append(dans.pop(idx))
    return picked


def generate_sets(bins, probs, numt, nump, numc, danl, depv, debug=0):
    """Generate TSET, PSET & CVSET using binned values of the depv variable.

    Input:  bins, probs - output of makebin()
            numt, nump  - number of members wanted in tset and pset
            numc        - only used to detect the "nothing requested"
                          case; the cvset always receives the remainder
            danl, depv  - parallel lists of all dans and their depv values
            debug       - if true, print a progress line per set
    Output: 3 lists of dans -> tset, pset, cset
            (None if numt, nump and numc are all 0)

    For each member of the tset and then the pset, a bin is chosen at
    random with a probability proportional to its original population,
    then a dan is chosen at random inside that bin and removed from it
    (bins is modified in place). Whatever is left in the bins afterwards
    becomes the cvset.

    Finally the dans of the 2 highest and 2 lowest depv values are forced
    into the tset by trading each one against a tset member that is not
    itself one of those four, so the set sizes do not change. If the tset
    has no such member to trade, that dan stays where it is.

    Raises ValueError if numt + nump exceeds the number of binned dans.
    """
    if numt == 0 and nump == 0 and numc == 0:
        return None

    available = sum(len(members) for members in bins)
    if numt + nump > available:
        raise ValueError(
            "requested %d dans for tset and pset but the bins hold only %d"
            % (numt + nump, available))

    if debug:
        print('Doing tset')
    tset = _draw_from_bins(bins, probs, numt)

    if debug:
        print('Doing pset')
    pset = _draw_from_bins(bins, probs, nump)

    # CSET - no need to do this randomly, just take the remainder!!
    if debug:
        print('Doing cset')
    cset = []
    for members in bins:
        cset.extend(members)

    # Now make sure that the dans of the 2 highest and 2 lowest depv
    # values are in the training set.
    must_train = _extreme_dans(danl, depv)
    for wanted in must_train:
        if wanted in tset:
            continue
        if wanted in pset:
            home = pset
        elif wanted in cset:
            home = cset
        else:
            continue

        donors = [dan for dan in tset if dan not in must_train]
        if not donors:
            continue
        home.remove(wanted)
        home.append(donors[0])
        tset.remove(donors[0])
        tset.append(wanted)

    return tset, pset, cset


##########################################

def _write_members(handle, members, linelen):
    """Write members separated by spaces, linelen of them per line."""
    c = 0
    for member in members:
        c += 1
        handle.write(str(member) + ' ')
        if c == linelen:
            handle.write('\n')
            c = 0
    # Terminate a partially filled last line.
    if c != 0:
        handle.write('\n')


def print_set(tset, pset, cset, fname, style=1):
    """Write the three sets to the file fname.

    Four sections are written, in this order: tset + cset combined, pset,
    tset, cset. Each section lists its members in sorted order, 10 per
    line, followed by a blank line.

    style == 1  : sections are wrapped in 'add' / 'stor N' / 'wipe'
                  command lines and the file ends with 'done'
    otherwise   : each section is preceded by a title line that includes
                  its member count

    Side effect: tset, pset and cset are sorted in place.
    If the file cannot be opened a message is printed and nothing is
    written.
    """
    try:
        f = open(fname, 'w')
    except IOError:
        print("Cound'nt open " + fname + ". print_set() failed")
        return

    linelen = 10

    tcset = list(tset) + list(cset)
    tcset.sort()
    pset.sort()
    cset.sort()
    tset.sort()

    sections = [
        ('Training Set with Cross Validation Set', tcset),
        ('Prediction Set', pset),
        ('Training Set without Prediction & Cross Validation Sets', tset),
        ('Cross Validation Set', cset),
    ]

    # The with-block closes the file even if a write fails.
    with f:
        for number, (title, members) in enumerate(sections):
            if style != 1:
                f.write(title + ' .. ' + str(len(members)) + ' members\n')
            elif number == 0:
                f.write('add\n')
            else:
                # Store the previous section before starting this one.
                f.write('stor ' + str(number) + '\nwipe\nadd\n')
            _write_members(f, members, linelen)
            f.write('\n')
        if style == 1:
            f.write('stor 4\ndone\n')