import math
import random
import sys

def makebin(danlist, depvlist, numbin = 10):
    """Bin dans by their depv value and build cumulative-count ranges.

    The interval [min(depvlist), max(depvlist)] is cut into `numbin`
    equal-width bins (no normalization of the depv values is done) and each
    dan is placed in the bin its depv value falls into.  The result is the
    input expected by generate_sets().

    Input:
        danlist  - list of dans; danlist[i] belongs to depvlist[i]
        depvlist - list of numeric depv values (must not be empty)
        numbin   - number of bins, at least 1 (default is 10)

    Output: the tuple (bins, probs)
        bins  - list of `numbin` lists, each holding the dans of one bin
        probs - list of `numbin` tuples (lower, upper) of cumulative counts.
                A random integer r with lower <= r < upper selects the bin
                at the same index.  Empty bins get the zero-width range
                (0, 0) and can therefore never be selected.

    Both lists are ordered by increasing cumulative count, which places all
    empty bins first, followed by the populated bins in order of increasing
    depv range.

    If every depv value is identical the range has zero width; all dans are
    then kept together in a single bin instead of being dropped.

    Raises ValueError if numbin < 1 or depvlist is empty.
    """
    if numbin < 1:
        raise ValueError('numbin must be at least 1')

    mindepv = min(depvlist)
    maxdepv = max(depvlist)

    # float() keeps the bin width exact for integer depvs under Python 2,
    # where int / int would truncate and leave high values unbinned.
    bininc = float(maxdepv - mindepv) / numbin

    bins = [[] for _ in range(numbin)]

    if bininc == 0:
        # All depv values are equal: every half-open range would be empty,
        # so keep the observations together in one bin.
        bins[0] = [danlist[i] for i in range(len(depvlist))]
    else:
        # Contiguous half-open ranges [low, up) starting at the minimum.
        binrange = []
        low = mindepv
        for _ in range(numbin):
            up = low + bininc
            binrange.append((low, up))
            low = up

        # The maximum value equals the upper edge of the highest bin and
        # would fail the "< up" test, so widen that last bin.
        low, up = binrange[-1]
        binrange[-1] = (low, up + bininc)

        for i, depv in enumerate(depvlist):
            for j, (low, up) in enumerate(binrange):
                if low <= depv < up:
                    bins[j].append(danlist[i])
                    break  # ranges do not overlap, so stop at first match

    # Cumulative population per bin; empty bins are marked with 0.
    probrange = []
    running = 0
    for members in bins:
        if members:
            running += len(members)
            probrange.append(running)
        else:
            probrange.append(0)

    # Order bins by increasing cumulative count.  The sort is stable, so
    # populated bins keep their relative order and empty bins move to the
    # front.
    order = sorted(range(numbin), key=lambda k: probrange[k])
    bins = [bins[k] for k in order]
    probrange = [probrange[k] for k in order]

    # Turn the cumulative counts into (lower, upper) ranges per bin, which
    # is easier to test a random number against later.
    probs = []
    low = 0
    for upper in probrange:
        probs.append((low, upper))
        low = upper

    return (bins, probs)
        
###########################################################

def _draw_from_bins(bins, probs, maxprob):
    """Remove and return one randomly chosen dan from bins, weighted by probs."""
    # Guard against spinning forever when no selectable bin has members left.
    if not any(u > l and members for (l, u), members in zip(probs, bins)):
        raise ValueError('no populated bin is selectable through probs')

    while True:
        # r1 lies in [0, maxprob), so it always falls inside some range when
        # the ranges are contiguous (as produced by makebin).
        r1 = random.randrange(0, maxprob)
        binnum = None
        for j, (l, u) in enumerate(probs):
            if l <= r1 < u:
                binnum = j
                break
        if binnum is None or not bins[binnum]:
            # Gap in probs or an already emptied bin: draw again.
            continue
        blist = bins[binnum]
        return blist.pop(random.randrange(0, len(blist)))


def _extreme_dans(danl, depv):
    """Return the dans of the two highest, then the two lowest, depv values."""
    # Work on copies so the caller's lists are left untouched.
    dans = list(danl)
    vals = list(depv)
    picked = []
    for chooser in (max, max, min, min):
        if not vals:
            break  # fewer than four observations available
        idx = vals.index(chooser(vals))
        picked.append(dans[idx])
        vals.pop(idx)
        dans.pop(idx)
    return picked


def generate_sets(bins, probs, numt, nump, numc, danl, depv, debug = 0):
    """Generate TSET, PSET & CVSET using binned values of the depv variable.

    makebin() must be called first to obtain `bins` and `probs`.

    Input:
        bins  - list of bins (lists of dans) as returned by makebin.  The
                chosen dans are removed from these lists in place.
        probs - list of (lower, upper) cumulative count ranges, one per bin
        numt  - number of members of the training set
        nump  - number of members of the prediction set
        numc  - requested size of the cross validation set.  It is only used
                for the "nothing requested" check; the cross validation set
                always consists of whatever is left in the bins.
        danl  - list of all dans (not modified)
        depv  - list of depv values, depv[i] belonging to danl[i]
                (not modified)
        debug - if true, print a progress message for each set

    Output: the tuple (tset, pset, cset) of lists of dans, or None when
    numt, nump and numc are all 0.

    For each member of the tset (and then of the pset) a random integer
    between 0 and the highest cumulative count (exclusive) is drawn and
    compared with each range in probs; the matching index selects the bin.
    A second random number picks a dan from that bin, which is removed from
    the bin.  If the selected bin is already empty the draw is repeated.

    Finally the dans of the two highest and the two lowest depv values are
    forced into the tset: each one found in the pset or cset is exchanged
    with a tset member that is not itself one of these extremes, so the set
    sizes do not change.  If the tset has no such member (possible only
    when numt < 4) that extreme is left where it is.

    Raises ValueError if numt + nump exceeds the number of dans in bins, or
    if probs cannot select any populated bin.
    """
    if numt == 0 and nump == 0 and numc == 0:
        return None

    available = sum(len(members) for members in bins)
    if numt + nump > available:
        raise ValueError('numt + nump (' + str(numt + nump) + ') exceeds the '
                         + str(available) + ' dans available in bins')

    # Random numbers are drawn between 0 and the largest cumulative count.
    maxprob = -999999
    for l, u in probs:
        if u > maxprob:
            maxprob = u

    # Select the sets one by one; each chosen dan is deleted from its bin.
    if debug: print('Doing tset')
    tset = [_draw_from_bins(bins, probs, maxprob) for _ in range(numt)]

    if debug: print('Doing pset')
    pset = [_draw_from_bins(bins, probs, maxprob) for _ in range(nump)]

    # CSET - no need to do this randomly, just take the remainder.
    if debug: print('Doing cset')
    cset = []
    for members in bins:
        cset.extend(members)

    # Make sure the dans of the 2 highest and 2 lowest depv values are in
    # the training set.
    ml = _extreme_dans(danl, depv)
    for newval in ml:
        if newval in tset:
            continue
        if newval in pset:
            source = pset
        elif newval in cset:
            source = cset
        else:
            continue
        # Trade newval against the first tset member that is not an extreme.
        for member in tset:
            if member not in ml:
                source.remove(newval)
                source.append(member)
                tset.remove(member)
                tset.append(newval)
                break

    return tset, pset, cset
    
##########################################

def _write_members(f, members, linelen):
    """Write members space separated, linelen per line, then a blank line."""
    c = 0
    for member in members:
        c += 1
        f.write(str(member) + ' ')
        if c == linelen:
            f.write('\n')
            c = 0
    # Terminate a partially filled last line before the separating blank line.
    if c != 0:
        f.write('\n')
    f.write('\n')


def print_set(tset, pset, cset, fname, style = 1):
    """Write the training, prediction and cross validation sets to a file.

    Input:
        tset, pset, cset - lists of dans (not modified)
        fname - name of the output file; it is overwritten
        style - 1 (default) writes 'add' / 'stor N' / 'wipe' / 'done'
                command lines around each set; any other value writes a
                descriptive heading with the member count instead

    Four sections are written in this order: tset and cset combined, pset,
    tset, cset.  Each section lists its members in sorted order, 10 per
    line, followed by a blank line.

    If the file cannot be opened a message is printed and nothing is
    written.  Returns None.
    """
    try:
        f = open(fname,'w')
    except IOError:
        print("Cound'nt open "+fname+". print_set() failed")
        return

    linelen = 10

    # Sorted copies, so the caller's lists keep their order.
    tcset = sorted(list(tset) + list(cset))
    sorted_pset = sorted(pset)
    sorted_tset = sorted(tset)
    sorted_cset = sorted(cset)

    # The with block guarantees the file is flushed and closed on exit.
    with f:
        if style == 1:
            f.write('add\n')
        else:
            f.write('Training Set with Cross Validation Set .. '+str(len(tcset))+' members\n')
        _write_members(f, tcset, linelen)

        if style == 1:
            f.write('stor 1\nwipe\nadd\n')
        else:
            f.write('Prediction Set .. '+str(len(sorted_pset))+' members\n')
        _write_members(f, sorted_pset, linelen)

        if style == 1:
            f.write('stor 2\nwipe\nadd\n')
        else:
            f.write('Training Set without Prediction & Cross Validation Sets .. '+str(len(sorted_tset))+' members\n')
        _write_members(f, sorted_tset, linelen)

        if style == 1:
            f.write('stor 3\nwipe\nadd\n')
        else:
            f.write('Cross Validation Set .. '+str(len(sorted_cset))+' members\n')
        _write_members(f, sorted_cset, linelen)

        if style == 1:
            f.write('stor 4\ndone\n')