#include #include #include #include #include #include #define METRIC_EUCLIDEAN 1 #define METRIC_COSINE 2 int verbose, metric; double **spe(double**,double,int,int,int,double,double,int,int); double eval_stress(double**,double**,int,int); double ed(double*,double*,int); double cd(double*,double*,int); double ed(double *v1, double *v2, int dim) { int i; register double d = 0; for (i = 0; i < dim; i++) d += (v1[i] - v2[i]) * (v1[i] - v2[i]); return(sqrt(d)); } double cd(double *v1, double *v2, int dim) { /* evaluates the cosine distance metric, the angle * between the supplied vectors */ double s1,s2,s3; int i; s1 = 0; s2 = 0; s3 = 0; for (i = 0; i < dim ;i++) { s1 += v1[i]*v2[i]; s2 += v1[i]*v1[i]; s3 += v2[i]*v2[i]; } return( acos(s1/sqrt(s2*s3)) ); } double eval_stress(double **x, double **sim, int edim, int nobs) { int samplesize = 1e6; int i; long double denom = 0.0; long double numer = 0.0; for (i =0; i < samplesize; i++) { register int a,b; double dab = 0; a = (int)(rand() * (float)(nobs-1) / (RAND_MAX+1.0)); while(1) { b = (int)(rand() * (float)(nobs-1) / (RAND_MAX+1.0)); if (b == a) continue; else break; } switch(metric) { case METRIC_EUCLIDEAN: dab = ed(x[a],x[b],edim); break; case METRIC_COSINE: dab = cd(x[a],x[b],edim); break; } denom += sim[a][b]; numer += (dab - sim[a][b]) * (dab - sim[a][b]) / sim[a][b]; } return( (double)(numer / denom) ); } double **spe(double **sim, double rcutpercent, int nobs, int ndim, int edim, double lambda0, double lambda1, int nstep, int ncycle) { FILE *fp1,*fp2; char t[64]; int i,j,k; double epsilon = 1e-8; double **x; double lambda = lambda0; double *tx1, *tx2; double rcut; /* storage for stress values - sstress is a nobs x 3 matrix with the columns * being ncycle, nstep, stress */ double **sstress, **cstress; /* get the value of rcut - its rcutpercent% of the maximum proximity * value */ double maxprox = 1e-37; for (i = 0; i < nobs-1; i++) { for (j = i+1; j < nobs; j++) if (sim[i][j] > maxprox) maxprox = sim[i][j]; } rcut = maxprox * rcutpercent; if (verbose) fprintf(stderr,"Obtained rcut\n"); /* alloc space */ x = malloc(sizeof(double*) * nobs); sstress = malloc(sizeof(double*) * nobs); for (i = 0; i < nobs; i++) { x[i] = malloc(sizeof(double) * edim); sstress[i] = malloc(sizeof(double) * 3); } tx1 = malloc(sizeof(double) * edim); tx2 = malloc(sizeof(double) * edim); /* random init (and dump the initial random config) */ fp1 = fopen("origx.txt","w"); for (i = 0; i < nobs; i++) { for (j = 0; j < edim; j++) { x[i][j] = (double) rand() / RAND_MAX; fprintf(fp1,"%f ",x[i][j]); } fprintf(fp1,"\n"); } fclose(fp1); /* start self organization */ fp1 = fopen("sstress.txt","w"); fp2 = fopen("cstress.txt","w"); for (i = 0; i < ncycle; i++) { if (ncycle >= 10 && i % (int)(.1*ncycle) == 0) { fprintf(stderr,"Cycle %d lambda = %f\n",i,lambda); /* should calculate stress at this point */ //fprintf(stderr,"\tEvaling stress\n"); //double stress = eval_stress(x, sim, edim, nobs); //fprintf(fp2,"%d %f\n",i,stress); } for (j = 0; j < nstep; j++) { int a,b; double dab; a = (int)(rand() * (float)(nobs-1) / (RAND_MAX+1.0)); while(1) { b = 1+ (int)(rand() * (float)(nobs-1) / (RAND_MAX+1.0)); if (b == a) continue; else break; } switch(metric) { case METRIC_EUCLIDEAN: dab = ed(x[a],x[b],edim); break; case METRIC_COSINE: dab = cd(x[a],x[b],edim); break; } if (sim[a][b] > rcut && dab >= sim[a][b]) continue; else { double t1; t1 = lambda * 0.5 * (sim[a][b] - dab) / (dab +epsilon); for (k = 0; k < edim; k++) { tx1[k] = t1 * (x[a][k] - x[b][k]); tx2[k] = t1 * (x[b][k] - x[a][k]); } for (k = 0; k < edim; k++) { x[a][k] += tx1[k]; x[b][k] += tx2[k]; } } } lambda -= ((lambda0 - lambda1) / (double)(ncycle - 1)); if (lambda < lambda1) break; } fclose(fp1); fclose(fp2); /* cleanup */ for (i = 0; i < nobs; i++) free(sstress[i]); free(sstress); free(tx1); free(tx2); return(x); } void usage(void) { printf("\nUsage: spe [OPTIONS]\n" "\n" "Options are:\n\n" "\t--help\t\tThis message\n" "\t--dump\t\tDump proximity data\n" "\t--nobs N\tThe number of observations in the dataset (N>0)\n" "\t--ndim N\tThe number of input dimensions (N>0)\n" "\t--edim N\tThe dimensions to embed in (N>0)\n" "\t--nstep N\tNumber of steps (default N=1e6)\n" "\t--ncycle N\tNumber of cycles (default N=100)\n" "\t--lambda0 N\tStarting lambda (default = 2.0)\n" "\t--lambda1 N\tEnding lambda (default = 0.1)\n" "\t--rcutp N\trcut percentage (default = 100). Set it to values\n" "\t \tabove hundred to get rcut = infinity\n" "\t--input FILE\tName of file containing input values\n" "\t--metric N\tThe distance metric to use. Options are:\n" "\t\t\t1 - Euclidean (default)\n" "\t\t\t2 - Cosine\n" "\t--prox FILE\tName of the file containing proximity data. This\n" "\t \tis optional and if supplied the input file is\n" "\t \tignored\n" "\tOutput files are:\n" "\t origx.txt - the initial random coordinates (edim dimension)\n" "\t x.txt - the final self organized coordinates (edim dimension)\n" "\t cstress.txt - stress at each cycle\n" "\t sstress.txt - stress at every 100 steps\n" "\t prox.txt - proximity data (optional)\n" ); return; } int main(int argc, char **argv) { FILE *fp; double **coord; double **dist; double d = 0; int i,j,k; /*********************** * config variables */ int dump = 0; int nobs = 0; int ndim = 0; int edim = 0; int ncycle = 100; int nstep = 1000000; double lambda0 = 2.0; double lambda1 = 0.1; char *inputdat = NULL; char *proxdat = NULL; double rcutp = 1; verbose = 0; metric = METRIC_EUCLIDEAN; while(1) { int c = 0; int option_index = 0; static struct option lopt[] = { {"help", no_argument, 0, 'h'}, {"nobs", required_argument, 0, 'o'}, {"ndim", required_argument, 0, 'n'}, {"edim", required_argument, 0, 'e'}, {"ncycle", required_argument, 0, 'c'}, {"nstep", required_argument, 0, 's'}, {"input", required_argument, 0, 'p'}, {"prox", required_argument, 0, 'b'}, {"lambda0", required_argument, 0, 'z'}, {"lambda1", required_argument, 0, 'x'}, {"rcutp", required_argument, 0, 'r'}, {"metric", required_argument, 0, 'm'}, {"verbose", no_argument, 0,255}, {"dump", no_argument, 0,254}, {0,0,0,0} }; c = getopt_long_only(argc, argv, "m:o:n:e:c:s:p:z:x:b:hv", lopt, &option_index); if (c == -1) break; switch(c) { case 'm': metric = atoi(optarg); if (metric < 1 || metric > 2) metric = METRIC_EUCLIDEAN; break; case 'r': rcutp = atof(optarg); if (rcutp < 0) rcutp = 0.1; if (rcutp > 100) rcutp = 1e6; rcutp = rcutp / 100; break; case 254: dump = 1; break; case 255: verbose = 1; break; case 'h': usage(); exit(0); break; case 'o': nobs = atoi(optarg); break; case 'n': ndim = atoi(optarg); break; case 'e': edim = atoi(optarg); break; case 'c': ncycle = atoi(optarg); break; case 's': nstep = atoi(optarg); break; case 'p': inputdat = strdup(optarg); break; case 'b': proxdat = strdup(optarg); break; case 'z': lambda0 = atof(optarg); break; case 'x': lambda1 = atof(optarg); break; } } if (nobs <= 0 || ndim <= 0 || edim <= 0 || ncycle <= 0 || nstep <= 0 || lambda0 <= 0 || lambda1 <= 0 || lambda0 < lambda1 || (!inputdat && !proxdat)) { usage(); exit(0); } if (verbose) { fprintf(stderr,"\n\nConfig: nobs: %d\n", nobs); fprintf(stderr," ndim: %d\n", ndim); fprintf(stderr," edim: %d\n", edim); fprintf(stderr," nstep: %d\n", nstep); fprintf(stderr," ncycle: %d\n", ncycle); fprintf(stderr," lambda0: %f\n", lambda0); fprintf(stderr," lambda1: %f\n", lambda1); fprintf(stderr," rcutp: %f\n", rcutp); fprintf(stderr," inputfile: %s\n", inputdat); switch(metric) { case METRIC_EUCLIDEAN: fprintf(stderr," metric: EUCLIDEAN\n"); break; case METRIC_COSINE: fprintf(stderr," metric: COSINE\n"); break; } fprintf(stderr,"\n\n"); } dist = (double**)calloc(nobs, sizeof(double*)); coord = (double**)calloc(nobs, sizeof(double*)); for (i = 0; i < nobs; i++) { dist[i] = (double*)calloc(nobs, sizeof(double)); coord[i] = (double*)calloc(ndim, sizeof(double)); } if (proxdat) { /* read in proximity data, ignore input coords and distance calculation */ fprintf(stderr,"Reading proximities "); fp = fopen(proxdat,"r"); for (i = 0; i < nobs-1; i++) { if (i % 200 == 0) fprintf(stderr,"."); dist[i][i] = 0.0; for (j = i+1; j < nobs; j++) { float t = 0; fscanf(fp,"%f", &t); dist[i][j] = t; dist[j][i] = t; } } fclose(fp); fprintf(stderr,"done\n"); } else { fp = fopen(inputdat,"r"); for (i = 0; i < nobs; i++) { for (j = 0; j < ndim; j++) { float t = 0; fscanf(fp, "%f", &t); coord[i][j] = (double)t; } } fclose(fp); fprintf(stderr,"Read in coordinates\n"); /* get proximity data */ for (i = 0; i < nobs-1; i++) { for (j = i+1; j < nobs; j++) { double dab; switch(metric) { case METRIC_EUCLIDEAN: dab = ed(coord[i],coord[j],ndim); break; case METRIC_COSINE: dab = cd(coord[i],coord[j],ndim); break; } dist[i][j] = dab; dist[j][i] = dab; } } fprintf(stderr,"Calculated distances\n"); } // dump distances if (dump) { /* just dump the upper triangle */ double maxd = -1e37; fp = fopen("prox.txt","w"); for (i = 0; i < nobs; i++) { for (j = 0; j < nobs; j++) { fprintf(fp,"%f ", dist[i][j]); if (dist[i][j] > maxd) maxd = dist[i][j]; } fprintf(fp,"\n"); } fclose(fp); fprintf(stderr,"Saved distances\n"); printf("maxd = %f\n", maxd); } fprintf(stderr,"Evaluating SPE\n"); /**************************************************** * OK, everything is prep'ed. Lets do SPE! ****************************************************/ double **x = spe(dist, rcutp, nobs, ndim, edim, lambda0, lambda1, nstep, ncycle); double stress = eval_stress(x,dist,edim,nobs); fprintf(stdout,"final stress = %f\n", stress); /* dump the final x[][] */ fp = fopen("x.txt","w"); for (i = 0; i < nobs; i++) { for (j = 0; j < edim; j++) { fprintf(fp,"%f ", x[i][j]); } fprintf(fp,"\n"); } fclose(fp); /* cleanup */ for (i = 0; i < nobs; i++) { free(coord[i]); free(dist[i]); } free(coord); free(dist); if (inputdat) free(inputdat); if (proxdat) free(proxdat); return(1); }