├── .gitignore ├── fasttsne ├── orig-lvdm │ ├── compile_linux │ ├── compile_mac │ ├── Readme.txt │ ├── tsne.h │ ├── quadtree.h │ ├── fast_tsne.m │ ├── vptree.h │ ├── quadtree.cpp │ └── tsne.cpp ├── Makefile ├── fasttsne.pyx ├── setup.py └── __init__.py ├── test.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | */*.so 2 | */build/ 3 | fasttsne/fasttsne.cpp 4 | *.pkl.gz 5 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/compile_linux: -------------------------------------------------------------------------------- 1 | echo "Make sure to change the path to CBLAS in this file before running it!" 2 | g++ quadtree.cpp tsne.cpp -o bh_tsne -O3 -I./CBLAS/include -L./ -lcblas -------------------------------------------------------------------------------- /fasttsne/Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | python setup.py build_ext --inplace 3 | 4 | install: 5 | python setup.py build_ext --inplace 6 | python setup.py install 7 | 8 | clean : 9 | rm -rf *.pyc *.so build/ fasttsne.cpp 10 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/compile_mac: -------------------------------------------------------------------------------- 1 | echo "Make sure to change the path to CBLAS in this file before running it!" 
cdef class _TSNE:
    """Python-visible owner of a single C++ ``TSNE`` object.

    The C++ instance is created in ``__cinit__`` and destroyed in
    ``__dealloc__``; ``run`` forwards its arguments to ``TSNE::run``.
    """
    cdef TSNE* thisptr  # hold a C++ instance

    def __cinit__(self):
        self.thisptr = new TSNE()

    def __dealloc__(self):
        del self.thisptr

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def run(self, X, N, D, d, perplexity, theta):
        """Run the wrapped C++ t-SNE on ``X`` and return the output array.

        ``X`` must be a 2-dimensional, C-contiguous float64 array (enforced
        by the typed buffer assignment below) whose shape equals ``(N, D)``.
        A new ``(N, d)`` float64 array is allocated, its buffer is handed to
        ``TSNE::run`` as the output argument, and the array is returned.

        Raises ValueError if ``N``, ``D`` or ``d`` is smaller than 1, or if
        ``X.shape`` differs from ``(N, D)``.
        """
        cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _X = X
        cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Y

        # Bounds checking is disabled for this method and the C++ side only
        # sees raw pointers, so every size has to be validated here: taking
        # &_X[0,0] / &Y[0,0] of an empty array, or passing an N/D that does
        # not match X, would touch memory outside the buffers.
        if N < 1 or D < 1 or d < 1:
            raise ValueError("N, D and d must all be >= 1 (got N=%r, D=%r, d=%r)"
                             % (N, D, d))
        if _X.shape[0] != N or _X.shape[1] != D:
            raise ValueError("X has shape (%d, %d) but N=%r and D=%r were given"
                             % (_X.shape[0], _X.shape[1], N, D))

        Y = np.zeros((N, d), dtype=np.float64)
        self.thisptr.run(&_X[0,0], N, D, &Y[0,0], d, perplexity, theta)
        return Y
"""Build script for the ``fasttsne`` Cython extension.

Compiles the C++ sources in ``orig-lvdm/`` together with ``fasttsne.pyx``.
Only the BLAS-related include/link settings differ between platforms.
"""
import sys

import numpy
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext


# Sources are the same on every platform.
sources = ["orig-lvdm/quadtree.cpp", "orig-lvdm/tsne.cpp", "fasttsne.pyx"]

if sys.platform == 'darwin':
    # OSX: link against the Accelerate framework.
    platform_options = dict(
        include_dirs=[numpy.get_include(), "orig-lvdm/"],
        extra_compile_args=['-faltivec', '-I/System/Library/Frameworks/vecLib.framework/Headers'],
        extra_link_args=["-Wl,-framework", "-Wl,Accelerate", "-lcblas"],
    )
else:
    # Everything else: link against OpenBLAS installed under /usr/local.
    platform_options = dict(
        include_dirs=[numpy.get_include(), "/usr/local/include", "orig-lvdm/"],
        library_dirs=["/usr/local/lib"],
        extra_compile_args=['-msse2', '-O3', '-fPIC', '-w'],
        extra_link_args=["-lopenblas"],
    )

ext_modules = [
    Extension(name="fasttsne", sources=sources, language="c++", **platform_options),
]

setup(
    name="fasttsne",
    cmdclass={"build_ext": build_ext},
    ext_modules=ext_modules,
)
def fast_tsne(data, pca_d=None, d=2, perplexity=30., theta=0.5):
    """
    Run Barnes-Hut T-SNE on _data_.

    @param data         The data: a 2-dimensional array(-like) with one
                        sample per row. It is converted to float64 if
                        necessary and is never modified.

    @param pca_d        The dimensionality of data is reduced via PCA
                        to this dimensionality. If None, no PCA is done.

    @param d            The embedding dimensionality. Must be fixed to
                        2.

    @param perplexity   The perplexity controls the effective number of
                        neighbors.

    @param theta        If set to 0, exact t-SNE is run, which takes
                        very long for dataset > 5000 samples.

    @return             The array returned by the extension's run method
                        for the (optionally PCA-reduced) data.

    @raise ValueError   If data is not 2-dimensional or contains no
                        samples.
    """
    # No copy is made when data already is a float64 ndarray.
    data = np.asarray(data, dtype=np.float64)
    if data.ndim != 2:
        raise ValueError("data must be 2-dimensional (samples x features), "
                         "got %d dimension(s)" % data.ndim)
    N = data.shape[0]
    if N == 0:
        raise ValueError("data must contain at least one sample")

    if pca_d is None:
        X = data
    else:
        # do PCA on a centered copy, so the caller's array is left untouched
        # (this costs one extra array of the size of data).
        centered = data - data.mean(axis=0)

        # working with covariance + (svd on cov.) is
        # much faster than svd on data directly.
        cov = np.dot(centered.T, centered) / N
        u = la.svd(cov, full_matrices=False)[0]
        X = np.dot(centered, u[:, :pca_d])

    # The extension only accepts a C-contiguous float64 buffer; this is a
    # no-op when X already is one (e.g. the result of np.dot above).
    X = np.ascontiguousarray(X, dtype=np.float64)

    tsne = TSNE()
    return tsne.run(X, N, X.shape[1], d, perplexity, theta)
16 | 17 | 18 | Building 19 | -------- 20 | In the subdirectory(!) ```fasttsne/```, run ```make```. Make sure that your openblas library is available. Any other BLAS library works too, but then ```fasttsne/setup.py``` has to be adapted (change ```include_dirs```, ```library_dirs``` and/or ```extra_link_args```). 21 | 22 | 23 | Testing 24 | ------- 25 | For testing the algorithm, add ```fasttsne/``` to your ```PYTHONPATH``` and run ```python test.py``` after a successful build. Note that the file ```mnist.pkl.gz``` has to be in the main directory. You can download it from [here](http://deeplearning.net/data/mnist/mnist.pkl.gz). 26 | 27 | 28 | More Information 29 | ---------------- 30 | See *Barnes-Hut-SNE*, L.J.P. van der Maaten. It is available on [arxiv](http://arxiv.org/abs/1301.3342). 31 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/Readme.txt: -------------------------------------------------------------------------------- 1 | Barnes-Hut-SNE v0.1 2 | ------------------------------------------- 3 | © Laurens van der Maaten 4 | Delft University of Technology, 2012 5 | =========================================== 6 | 7 | 8 | DESCRIPTION 9 | 10 | This code contains a C++ implementation of Barnes-Hut-SNE as described in the corresponding paper. Please cite this paper whenever you use this code. 11 | 12 | The code also contains a Matlab wrapper for the C++ code (fast_tsne.m). Please refer to the help text in fast_tsne.m for more information on the input format for the data. Based on the code in fast_tsne.m, it is straightforward to develop wrappers in other programming languages. 13 | 14 | 15 | COMPILATION 16 | 17 | Compilation of the files is relatively straightforward, but requires a working installation of CBLAS. Please refer to the compile_mac and compile_linux shell scripts to see the required compilation command. Note that paths may be different on your machine, so they may have to be changed in order for the files to compile.
18 | 19 | 20 | LEGAL 21 | 22 | You are free to use, modify, or redistribute this software in any way you want, but only for non-commercial purposes. The use of the software is at your own risk; the authors are not responsible for any damage as a result from errors in the software. 23 | 24 | 25 | CONTACT 26 | If you encounter problems with the implementations or have questions about Barnes-Hut-SNE, make sure you read the paper and the online FAQ first! If your question is not answered afterwards, feel free to send me an email at: lvdmaaten@gmail.com 27 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/tsne.h: -------------------------------------------------------------------------------- 1 | /* 2 | * tsne.h 3 | * Header file for t-SNE. 4 | * 5 | * Created by Laurens van der Maaten. 6 | * Copyright 2012, Delft University of Technology. All rights reserved. 7 | * 8 | */ 9 | 10 | 11 | #ifndef TSNE_H 12 | #define TSNE_H 13 | 14 | 15 | static inline double sign(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); } 16 | 17 | 18 | class TSNE 19 | { 20 | public: 21 | void run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta); 22 | bool load_data(double** data, int* n, int* d, double* theta, double* perplexity); 23 | void save_data(double* data, int* landmarks, double* costs, int n, int d); 24 | 25 | void symmetrizeMatrix(int** row_P, int** col_P, double** val_P, int N); // should be static?! 
26 | 27 | 28 | private: 29 | void computeGradient(double* P, int* inp_row_P, int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta); 30 | void computeExactGradient(double* P, double* Y, int N, int D, double* dC); 31 | double evaluateError(double* P, double* Y, int N); 32 | double evaluateError(int* row_P, int* col_P, double* val_P, double* Y, int N, double theta); 33 | void zeroMean(double* X, int N, int D); 34 | void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity); 35 | void computeGaussianPerplexity(double* X, int N, int D, int** _row_P, int** _col_P, double** _val_P, double perplexity, int K); 36 | void computeGaussianPerplexity(double* X, int N, int D, int** _row_P, int** _col_P, double** _val_P, double perplexity, double threshold); 37 | void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD); 38 | double randn(); 39 | }; 40 | 41 | #endif 42 | 43 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/quadtree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * quadtree.h 3 | * Header file for a quadtree. 4 | * 5 | * Created by Laurens van der Maaten. 6 | * Copyright 2012, Delft University of Technology. All rights reserved. 7 | * 8 | */ 9 | 10 | #ifndef QUADTREE_H 11 | #define QUADTREE_H 12 | 13 | using namespace std; 14 | 15 | static inline double min(double x, double y) { return (x <= y ? x : y); } 16 | static inline double max(double x, double y) { return (x <= y ? y : x); } 17 | static inline double abs(double x) { return (x < .0 ? 
-x : x); } 18 | 19 | class Cell { 20 | 21 | public: 22 | double x; 23 | double y; 24 | double hw; 25 | double hh; 26 | bool containsPoint(double point[]); 27 | }; 28 | 29 | 30 | class QuadTree 31 | { 32 | 33 | // Fixed constants 34 | static const int QT_NO_DIMS = 2; 35 | static const int QT_NODE_CAPACITY = 1; 36 | 37 | // A buffer we use when doing force computations 38 | double buff[QT_NO_DIMS]; 39 | 40 | // Properties of this node in the tree 41 | QuadTree* parent; 42 | bool is_leaf; 43 | int size; 44 | int cum_size; 45 | 46 | // Axis-aligned bounding box stored as a center with half-dimensions to represent the boundaries of this quad tree 47 | Cell boundary; 48 | 49 | // Indices in this quad tree node, corresponding center-of-mass, and list of all children 50 | double* data; 51 | double center_of_mass[QT_NO_DIMS]; 52 | int index[QT_NODE_CAPACITY]; 53 | 54 | // Children 55 | QuadTree* northWest; 56 | QuadTree* northEast; 57 | QuadTree* southWest; 58 | QuadTree* southEast; 59 | 60 | public: 61 | QuadTree(double* inp_data, int N); 62 | QuadTree(double* inp_data, double inp_x, double inp_y, double inp_hw, double inp_hh); 63 | QuadTree(double* inp_data, int N, double inp_x, double inp_y, double inp_hw, double inp_hh); 64 | QuadTree(QuadTree* inp_parent, double* inp_data, int N, double inp_x, double inp_y, double inp_hw, double inp_hh); 65 | QuadTree(QuadTree* inp_parent, double* inp_data, double inp_x, double inp_y, double inp_hw, double inp_hh); 66 | ~QuadTree(); 67 | void setData(double* inp_data); 68 | QuadTree* getParent(); 69 | void construct(Cell boundary); 70 | bool insert(int new_index); 71 | void subdivide(); 72 | bool isCorrect(); 73 | void rebuildTree(); 74 | void getAllIndices(int* indices); 75 | int getDepth(); 76 | void computeNonEdgeForces(int point_index, double theta, double neg_f[], double* sum_Q); 77 | void computeEdgeForces(int* row_P, int* col_P, double* val_P, int N, double* pos_f); 78 | void print(); 79 | 80 | private: 81 | void init(QuadTree* 
function mappedX = fast_tsne(X, initial_dims, perplexity, theta)
%FAST_TSNE Runs the (landmark) C++ implementation of t-SNE
%
%   mappedX = fast_tsne(X, initial_dims, perplexity, theta)
%
% Runs the C++ implementation of Barnes-Hut-SNE. The high-dimensional
% datapoints are specified in the NxD matrix X. The dimensionality of the
% datapoints is reduced to initial_dims dimensions using PCA (default = 50)
% before t-SNE is performed. Next, t-SNE reduces the points to two
% dimensions. The perplexity of the input similarities may be specified
% through the perplexity variable (default = 30). The variable theta sets
% the trade-off parameter between speed and accuracy: theta = 0 corresponds
% to standard, slow t-SNE, while theta = 1 makes very crude approximations.
% Appropriate values for theta are between 0.1 and 0.7 (default = 0.5).
% The function returns the two-dimensional data points in mappedX.
%
% Omitted or empty ([]) optional arguments take their default value. An
% error is raised when the ./bh_tsne executable exits with a non-zero
% status.
%
% NOTE: The function is designed to run on large (N > 5000) data sets. It
% may give poor performance on very small data sets (it is better to use a
% standard t-SNE implementation on such data).
%
%
% (C) Laurens van der Maaten
% Delft University of Technology, 2012


    % Defaults apply to missing as well as empty arguments
    if ~exist('initial_dims', 'var') || isempty(initial_dims)
        initial_dims = 50;
    end
    if ~exist('perplexity', 'var') || isempty(perplexity)
        perplexity = 30;
    end
    if ~exist('theta', 'var') || isempty(theta)
        theta = 0.5;
    end

    % Perform the initial dimensionality reduction using PCA
    X = double(X);
    X = bsxfun(@minus, X, mean(X, 1));
    covX = X' * X;
    [M, lambda] = eig(covX);
    [~, ind] = sort(diag(lambda), 'descend');
    if initial_dims > size(M, 2)
        initial_dims = size(M, 2);
    end
    M = M(:,ind(1:initial_dims));
    X = X * M;
    clear covX M lambda

    % Run the fast diffusion SNE implementation
    write_data(X, theta, perplexity);
    tic, status = system('./bh_tsne'); toc
    if status ~= 0
        % Fail here with a clear message instead of later inside read_data
        error('fast_tsne:executableFailed', ...
              './bh_tsne exited with status %d', status);
    end

    % read_data already converts the landmark indices to Matlab indexing,
    % so they must not be incremented a second time here; only the mapped
    % points are needed.
    mappedX = read_data;
    delete('data.dat');
    delete('result.dat');
end


% Writes the datafile for the fast t-SNE implementation
function write_data(X, theta, perplexity)
    [n, d] = size(X);
    h = fopen('data.dat', 'wb');
    fwrite(h, n, 'integer*4');
    fwrite(h, d, 'integer*4');
    fwrite(h, theta, 'double');
    fwrite(h, perplexity, 'double');
    fwrite(h, X', 'double');    % transposed: one data point after the other
    fclose(h);
end


% Reads the result file from the fast t-SNE implementation
function [X, landmarks, costs] = read_data
    h = fopen('result.dat', 'rb');
    n = fread(h, 1, 'integer*4');
    d = fread(h, 1, 'integer*4');
    X = fread(h, n * d, 'double');
    landmarks = fread(h, n, 'integer*4');
    landmarks = landmarks + 1;          % correct for Matlab indexing
    costs = fread(h, n, 'double');      % this vector contains only zeros
    X = reshape(X, [d n])';
    fclose(h);
end
// A point of the data set: its index plus a privately owned copy of its
// coordinates. Copy construction and assignment make deep copies.
class DataPoint
{
    int _D;      // value reported by dimensionality()
    int _ind;    // value reported by index(); -1 for a default-constructed point
    double* _x;  // malloc'ed coordinate buffer owned by this object, or NULL

    // Returns a freshly malloc'ed copy of the first D values of src, or NULL
    // when there is nothing to copy. A default-constructed point reports
    // dimensionality 1 but has no buffer, so the source must be checked
    // before it is read.
    static double* copyCoordinates(const double* src, int D) {
        if(src == NULL || D <= 0) return NULL;
        double* dst = (double*) malloc(D * sizeof(double));
        for(int d = 0; d < D; d++) dst[d] = src[d];
        return dst;
    }

public:
    // Empty point: no coordinates, index -1
    DataPoint() {
        _D = 1;
        _ind = -1;
        _x = NULL;
    }

    // Point with index ind holding a copy of the D values starting at x
    DataPoint(int D, int ind, double* x) {
        _D = D;
        _ind = ind;
        _x = copyCoordinates(x, D);
    }

    // Deep copy -- a new object has nothing to free
    DataPoint(const DataPoint& other) {
        _D = other._D;
        _ind = other._ind;
        _x = copyCoordinates(other._x, other._D);
    }

    ~DataPoint() { if(_x != NULL) free(_x); }

    // Deep-copying assignment; the old buffer is released
    DataPoint& operator= (const DataPoint& other) {
        if(this != &other) {
            double* fresh = copyCoordinates(other._x, other._D);
            if(_x != NULL) free(_x);
            _D = other._D;
            _ind = other._ind;
            _x = fresh;
        }
        return *this;
    }

    int index() const { return _ind; }
    int dimensionality() const { return _D; }
    double x(int d) const { return _x[d]; }
};
delete _root; 83 | } 84 | 85 | // Function to create a new VpTree from data 86 | void create(const std::vector& items) { 87 | delete _root; 88 | _items = items; 89 | _root = buildFromPoints(0, items.size()); 90 | } 91 | 92 | // Function that uses the tree to find the k nearest neighbors of target 93 | void search(const T& target, int k, std::vector* results, std::vector* distances) 94 | { 95 | 96 | // Use a priority queue to store intermediate results on 97 | std::priority_queue heap; 98 | 99 | // Variable that tracks the distance to the farthest point in our results 100 | _tau = DBL_MAX; 101 | 102 | // Perform the searcg 103 | search(_root, target, k, heap); 104 | 105 | // Gather final results 106 | results->clear(); distances->clear(); 107 | while(!heap.empty()) { 108 | results->push_back(_items[heap.top().index]); 109 | distances->push_back(heap.top().dist); 110 | heap.pop(); 111 | } 112 | 113 | // Results are in reverse order 114 | std::reverse(results->begin(), results->end()); 115 | std::reverse(distances->begin(), distances->end()); 116 | } 117 | 118 | private: 119 | std::vector _items; 120 | double _tau; 121 | 122 | // Single node of a VP tree (has a point and radius; left children are closer to point than the radius) 123 | struct Node 124 | { 125 | int index; // index of point in node 126 | double threshold; // radius(?) 
127 | Node* left; // points closer by than threshold 128 | Node* right; // points farther away than threshold 129 | 130 | Node() : 131 | index(0), threshold(0.), left(0), right(0) {} 132 | 133 | ~Node() { // destructor 134 | delete left; 135 | delete right; 136 | } 137 | }* _root; 138 | 139 | 140 | // An item on the intermediate result queue 141 | struct HeapItem { 142 | HeapItem( int index, double dist) : 143 | index(index), dist(dist) {} 144 | int index; 145 | double dist; 146 | bool operator<(const HeapItem& o) const { 147 | return dist < o.dist; 148 | } 149 | }; 150 | 151 | // Distance comparator for use in std::nth_element 152 | struct DistanceComparator 153 | { 154 | const T& item; 155 | DistanceComparator(const T& item) : item(item) {} 156 | bool operator()(const T& a, const T& b) { 157 | return distance(item, a) < distance(item, b); 158 | } 159 | }; 160 | 161 | // Function that (recursively) fills the tree 162 | Node* buildFromPoints( int lower, int upper ) 163 | { 164 | if (upper == lower) { // indicates that we're done here! 
165 | return NULL; 166 | } 167 | 168 | // Lower index is center of current node 169 | Node* node = new Node(); 170 | node->index = lower; 171 | 172 | if (upper - lower > 1) { // if we did not arrive at leaf yet 173 | 174 | // Choose an arbitrary point and move it to the start 175 | int i = (int) ((double)rand() / RAND_MAX * (upper - lower - 1)) + lower; 176 | std::swap(_items[lower], _items[i]); 177 | 178 | // Partition around the median distance 179 | int median = (upper + lower) / 2; 180 | std::nth_element(_items.begin() + lower + 1, 181 | _items.begin() + median, 182 | _items.begin() + upper, 183 | DistanceComparator(_items[lower])); 184 | 185 | // Threshold of the new node will be the distance to the median 186 | node->threshold = distance(_items[lower], _items[median]); 187 | 188 | // Recursively build tree 189 | node->index = lower; 190 | node->left = buildFromPoints(lower + 1, median); 191 | node->right = buildFromPoints(median, upper); 192 | } 193 | 194 | // Return result 195 | return node; 196 | } 197 | 198 | // Helper function that searches the tree 199 | void search(Node* node, const T& target, int k, std::priority_queue& heap) 200 | { 201 | if(node == NULL) return; // indicates that we're done here 202 | 203 | // Compute distance between target and current node 204 | double dist = distance(_items[node->index], target); 205 | 206 | // If current node within radius tau 207 | if(dist < _tau) { 208 | if(heap.size() == k) heap.pop(); // remove furthest node from result list (if we already have k results) 209 | heap.push(HeapItem(node->index, dist)); // add current node to result list 210 | if(heap.size() == k) _tau = heap.top().dist; // update value of tau (farthest point in result list) 211 | } 212 | 213 | // Return if we arrived at a leaf 214 | if(node->left == NULL && node->right == NULL) { 215 | return; 216 | } 217 | 218 | // If the target lies within the radius of ball 219 | if(dist < node->threshold) { 220 | if(dist - _tau <= node->threshold) { // if 
there can still be neighbors inside the ball, recursively search left child first 221 | search(node->left, target, k, heap); 222 | } 223 | 224 | if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child 225 | search(node->right, target, k, heap); 226 | } 227 | 228 | // If the target lies outsize the radius of the ball 229 | } else { 230 | if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child first 231 | search(node->right, target, k, heap); 232 | } 233 | 234 | if (dist - _tau <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child 235 | search(node->left, target, k, heap); 236 | } 237 | } 238 | } 239 | }; 240 | 241 | #endif 242 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/quadtree.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * quadtree.cpp 3 | * Implementation of a quadtree in two dimensions + Barnes-Hut algorithm for t-SNE. 4 | * 5 | * Created by Laurens van der Maaten. 6 | * Copyright 2012, Delft University of Technology. All rights reserved. 7 | * 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "quadtree.h" 16 | 17 | 18 | 19 | // Checks whether a point lies in a cell 20 | bool Cell::containsPoint(double point[]) 21 | { 22 | if(x - hw > point[0]) return false; 23 | if(x + hw < point[0]) return false; 24 | if(y - hh > point[1]) return false; 25 | if(y + hh < point[1]) return false; 26 | return true; 27 | } 28 | 29 | 30 | // Default constructor for quadtree -- build tree, too! 
31 | QuadTree::QuadTree(double* inp_data, int N) 32 | { 33 | 34 | // Compute mean, width, and height of current map (boundaries of quadtree) 35 | double* mean_Y = new double[QT_NO_DIMS]; for(int d = 0; d < QT_NO_DIMS; d++) mean_Y[d] = .0; 36 | double* min_Y = new double[QT_NO_DIMS]; for(int d = 0; d < QT_NO_DIMS; d++) min_Y[d] = DBL_MAX; 37 | double* max_Y = new double[QT_NO_DIMS]; for(int d = 0; d < QT_NO_DIMS; d++) max_Y[d] = -DBL_MAX; 38 | for(int n = 0; n < N; n++) { 39 | for(int d = 0; d < QT_NO_DIMS; d++) { 40 | mean_Y[d] += inp_data[n * QT_NO_DIMS + d]; 41 | if(inp_data[n * QT_NO_DIMS + d] < min_Y[d]) min_Y[d] = inp_data[n * QT_NO_DIMS + d]; 42 | if(inp_data[n * QT_NO_DIMS + d] > max_Y[d]) max_Y[d] = inp_data[n * QT_NO_DIMS + d]; 43 | } 44 | } 45 | for(int d = 0; d < QT_NO_DIMS; d++) mean_Y[d] /= (double) N; 46 | 47 | // Construct quadtree 48 | init(NULL, inp_data, mean_Y[0], mean_Y[1], max(max_Y[0] - mean_Y[0], mean_Y[0] - min_Y[0]) + 1e-5, 49 | max(max_Y[1] - mean_Y[1], mean_Y[1] - min_Y[1]) + 1e-5); 50 | fill(N); 51 | delete[] mean_Y; delete[] max_Y; delete[] min_Y; 52 | } 53 | 54 | 55 | // Constructor for quadtree with particular size and parent -- build the tree, too! 56 | QuadTree::QuadTree(double* inp_data, int N, double inp_x, double inp_y, double inp_hw, double inp_hh) 57 | { 58 | init(NULL, inp_data, inp_x, inp_y, inp_hw, inp_hh); 59 | fill(N); 60 | } 61 | 62 | // Constructor for quadtree with particular size and parent -- build the tree, too! 
63 | QuadTree::QuadTree(QuadTree* inp_parent, double* inp_data, int N, double inp_x, double inp_y, double inp_hw, double inp_hh) 64 | { 65 | init(inp_parent, inp_data, inp_x, inp_y, inp_hw, inp_hh); 66 | fill(N); 67 | } 68 | 69 | 70 | // Constructor for quadtree with particular size (do not fill the tree) 71 | QuadTree::QuadTree(double* inp_data, double inp_x, double inp_y, double inp_hw, double inp_hh) 72 | { 73 | init(NULL, inp_data, inp_x, inp_y, inp_hw, inp_hh); 74 | } 75 | 76 | 77 | // Constructor for quadtree with particular size and parent (do not fill the tree) 78 | QuadTree::QuadTree(QuadTree* inp_parent, double* inp_data, double inp_x, double inp_y, double inp_hw, double inp_hh) 79 | { 80 | init(inp_parent, inp_data, inp_x, inp_y, inp_hw, inp_hh); 81 | } 82 | 83 | 84 | // Main initialization function 85 | void QuadTree::init(QuadTree* inp_parent, double* inp_data, double inp_x, double inp_y, double inp_hw, double inp_hh) 86 | { 87 | parent = inp_parent; 88 | data = inp_data; 89 | is_leaf = true; 90 | size = 0; 91 | cum_size = 0; 92 | boundary.x = inp_x; 93 | boundary.y = inp_y; 94 | boundary.hw = inp_hw; 95 | boundary.hh = inp_hh; 96 | northWest = NULL; 97 | northEast = NULL; 98 | southWest = NULL; 99 | southEast = NULL; 100 | for(int i = 0; i < QT_NO_DIMS; i++) center_of_mass[i] = .0; 101 | } 102 | 103 | 104 | // Destructor for quadtree 105 | QuadTree::~QuadTree() 106 | { 107 | delete northWest; 108 | delete northEast; 109 | delete southWest; 110 | delete southEast; 111 | } 112 | 113 | 114 | // Update the data underlying this tree 115 | void QuadTree::setData(double* inp_data) 116 | { 117 | data = inp_data; 118 | } 119 | 120 | 121 | // Get the parent of the current tree 122 | QuadTree* QuadTree::getParent() 123 | { 124 | return parent; 125 | } 126 | 127 | 128 | // Insert a point into the QuadTree 129 | bool QuadTree::insert(int new_index) 130 | { 131 | // Ignore objects which do not belong in this quad tree 132 | double* point = data + new_index * 
QT_NO_DIMS; 133 | if(!boundary.containsPoint(point)) 134 | return false; 135 | 136 | // Online update of cumulative size and center-of-mass 137 | cum_size++; 138 | double mult1 = (double) (cum_size - 1) / (double) cum_size; 139 | double mult2 = 1.0 / (double) cum_size; 140 | for(int d = 0; d < QT_NO_DIMS; d++) center_of_mass[d] *= mult1; 141 | for(int d = 0; d < QT_NO_DIMS; d++) center_of_mass[d] += mult2 * point[d]; 142 | 143 | // If there is space in this quad tree and it is a leaf, add the object here 144 | if(is_leaf && size < QT_NODE_CAPACITY) { 145 | index[size] = new_index; 146 | size++; 147 | return true; 148 | } 149 | 150 | // Don't add duplicates for now (this is not very nice) 151 | bool any_duplicate = false; 152 | for(int n = 0; n < size; n++) { 153 | bool duplicate = true; 154 | for(int d = 0; d < QT_NO_DIMS; d++) { 155 | if(point[d] != data[index[n] * QT_NO_DIMS + d]) { duplicate = false; break; } 156 | } 157 | any_duplicate = any_duplicate | duplicate; 158 | } 159 | if(any_duplicate) return true; 160 | 161 | // Otherwise, we need to subdivide the current cell 162 | if(is_leaf) subdivide(); 163 | 164 | // Find out where the point can be inserted 165 | if(northWest->insert(new_index)) return true; 166 | if(northEast->insert(new_index)) return true; 167 | if(southWest->insert(new_index)) return true; 168 | if(southEast->insert(new_index)) return true; 169 | 170 | // Otherwise, the point cannot be inserted (this should never happen) 171 | return false; 172 | } 173 | 174 | 175 | // Create four children which fully divide this cell into four quads of equal area 176 | void QuadTree::subdivide() { 177 | 178 | // Create four children 179 | northWest = new QuadTree(this, data, boundary.x - .5 * boundary.hw, boundary.y - .5 * boundary.hh, .5 * boundary.hw, .5 * boundary.hh); 180 | northEast = new QuadTree(this, data, boundary.x + .5 * boundary.hw, boundary.y - .5 * boundary.hh, .5 * boundary.hw, .5 * boundary.hh); 181 | southWest = new QuadTree(this, data, 
boundary.x - .5 * boundary.hw, boundary.y + .5 * boundary.hh, .5 * boundary.hw, .5 * boundary.hh); 182 | southEast = new QuadTree(this, data, boundary.x + .5 * boundary.hw, boundary.y + .5 * boundary.hh, .5 * boundary.hw, .5 * boundary.hh); 183 | 184 | // Move existing points to correct children 185 | for(int i = 0; i < size; i++) { 186 | bool success = false; 187 | if(!success) success = northWest->insert(index[i]); 188 | if(!success) success = northEast->insert(index[i]); 189 | if(!success) success = southWest->insert(index[i]); 190 | if(!success) success = southEast->insert(index[i]); 191 | index[i] = -1; 192 | } 193 | 194 | // Empty parent node 195 | size = 0; 196 | is_leaf = false; 197 | } 198 | 199 | 200 | // Build quadtree on dataset 201 | void QuadTree::fill(int N) 202 | { 203 | for(int i = 0; i < N; i++) insert(i); 204 | } 205 | 206 | 207 | // Checks whether the specified tree is correct 208 | bool QuadTree::isCorrect() 209 | { 210 | for(int n = 0; n < size; n++) { 211 | double* point = data + index[n] * QT_NO_DIMS; 212 | if(!boundary.containsPoint(point)) return false; 213 | } 214 | if(!is_leaf) return northWest->isCorrect() && 215 | northEast->isCorrect() && 216 | southWest->isCorrect() && 217 | southEast->isCorrect(); 218 | else return true; 219 | } 220 | 221 | 222 | // Rebuilds a possibly incorrect tree (LAURENS: This function is not tested yet!) 
223 | void QuadTree::rebuildTree() 224 | { 225 | for(int n = 0; n < size; n++) { 226 | 227 | // Check whether point is erroneous 228 | double* point = data + index[n] * QT_NO_DIMS; 229 | if(!boundary.containsPoint(point)) { 230 | 231 | // Remove erroneous point 232 | int rem_index = index[n]; 233 | for(int m = n + 1; m < size; m++) index[m - 1] = index[m]; 234 | index[size - 1] = -1; 235 | size--; 236 | 237 | // Update center-of-mass and counter in all parents 238 | bool done = false; 239 | QuadTree* node = this; 240 | while(!done) { 241 | for(int d = 0; d < QT_NO_DIMS; d++) { 242 | node->center_of_mass[d] = ((double) node->cum_size * node->center_of_mass[d] - point[d]) / (double) (node->cum_size - 1); 243 | } 244 | node->cum_size--; 245 | if(node->getParent() == NULL) done = true; 246 | else node = node->getParent(); 247 | } 248 | 249 | // Reinsert point in the root tree 250 | node->insert(rem_index); 251 | } 252 | } 253 | 254 | // Rebuild lower parts of the tree 255 | northWest->rebuildTree(); 256 | northEast->rebuildTree(); 257 | southWest->rebuildTree(); 258 | southEast->rebuildTree(); 259 | } 260 | 261 | 262 | // Build a list of all indices in quadtree 263 | void QuadTree::getAllIndices(int* indices) 264 | { 265 | getAllIndices(indices, 0); 266 | } 267 | 268 | 269 | // Build a list of all indices in quadtree 270 | int QuadTree::getAllIndices(int* indices, int loc) 271 | { 272 | 273 | // Gather indices in current quadrant 274 | for(int i = 0; i < size; i++) indices[loc + i] = index[i]; 275 | loc += size; 276 | 277 | // Gather indices in children 278 | if(!is_leaf) { 279 | loc = northWest->getAllIndices(indices, loc); 280 | loc = northEast->getAllIndices(indices, loc); 281 | loc = southWest->getAllIndices(indices, loc); 282 | loc = southEast->getAllIndices(indices, loc); 283 | } 284 | return loc; 285 | } 286 | 287 | 288 | int QuadTree::getDepth() { 289 | if(is_leaf) return 1; 290 | return 1 + max(max(northWest->getDepth(), 291 | northEast->getDepth()), 292 | 
max(southWest->getDepth(), 293 | southEast->getDepth())); 294 | 295 | } 296 | 297 | 298 | // Compute non-edge forces using Barnes-Hut algorithm 299 | void QuadTree::computeNonEdgeForces(int point_index, double theta, double neg_f[], double* sum_Q) 300 | { 301 | 302 | // Make sure that we spend no time on empty nodes or self-interactions 303 | if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return; 304 | 305 | // Compute distance between point and center-of-mass 306 | double D = .0; 307 | int ind = point_index * QT_NO_DIMS; 308 | for(int d = 0; d < QT_NO_DIMS; d++) buff[d] = data[ind + d]; 309 | for(int d = 0; d < QT_NO_DIMS; d++) buff[d] -= center_of_mass[d]; 310 | for(int d = 0; d < QT_NO_DIMS; d++) D += buff[d] * buff[d]; 311 | 312 | // Check whether we can use this node as a "summary" 313 | if(is_leaf || max(boundary.hh, boundary.hw) / sqrt(D) < theta) { 314 | 315 | // Compute and add t-SNE force between point and current node 316 | double Q = 1.0 / (1.0 + D); 317 | *sum_Q += cum_size * Q; 318 | double mult = cum_size * Q * Q; 319 | for(int d = 0; d < QT_NO_DIMS; d++) neg_f[d] += mult * buff[d]; 320 | } 321 | else { 322 | 323 | // Recursively apply Barnes-Hut to children 324 | northWest->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); 325 | northEast->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); 326 | southWest->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); 327 | southEast->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); 328 | } 329 | } 330 | 331 | 332 | // Computes edge forces 333 | void QuadTree::computeEdgeForces(int* row_P, int* col_P, double* val_P, int N, double* pos_f) 334 | { 335 | 336 | // Loop over all edges in the graph 337 | int ind1, ind2; 338 | double D; 339 | for(int n = 0; n < N; n++) { 340 | ind1 = n * QT_NO_DIMS; 341 | for(int i = row_P[n]; i < row_P[n + 1]; i++) { 342 | 343 | // Compute pairwise distance and Q-value 344 | D = .0; 345 | ind2 = col_P[i] * QT_NO_DIMS; 346 | for(int d = 
0; d < QT_NO_DIMS; d++) buff[d] = data[ind1 + d]; 347 | for(int d = 0; d < QT_NO_DIMS; d++) buff[d] -= data[ind2 + d]; 348 | for(int d = 0; d < QT_NO_DIMS; d++) D += buff[d] * buff[d]; 349 | D = val_P[i] / (1.0 + D); 350 | 351 | // Sum positive force 352 | for(int d = 0; d < QT_NO_DIMS; d++) pos_f[ind1 + d] += D * buff[d]; 353 | } 354 | } 355 | } 356 | 357 | 358 | // Print out tree 359 | void QuadTree::print() 360 | { 361 | if(cum_size == 0) { 362 | printf("Empty node\n"); 363 | return; 364 | } 365 | 366 | if(is_leaf) { 367 | printf("Leaf node; data = ["); 368 | for(int i = 0; i < size; i++) { 369 | double* point = data + index[i] * QT_NO_DIMS; 370 | for(int d = 0; d < QT_NO_DIMS; d++) printf("%f, ", point[d]); 371 | printf(" (index = %d)", index[i]); 372 | if(i < size - 1) printf("\n"); 373 | else printf("]\n"); 374 | } 375 | } 376 | else { 377 | printf("Intersection node with center-of-mass = ["); 378 | for(int d = 0; d < QT_NO_DIMS; d++) printf("%f, ", center_of_mass[d]); 379 | printf("]; children are:\n"); 380 | northEast->print(); 381 | northWest->print(); 382 | southEast->print(); 383 | southWest->print(); 384 | } 385 | } 386 | 387 | -------------------------------------------------------------------------------- /fasttsne/orig-lvdm/tsne.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * tsne.cpp 3 | * Implementation of both standard and Barnes-Hut-SNE. 4 | * 5 | * Created by Laurens van der Maaten. 6 | * Copyright 2012, Delft University of Technology. All rights reserved. 
7 | * 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "quadtree.h" 17 | #include "vptree.h" 18 | #include "tsne.h" 19 | 20 | extern "C" { 21 | #include 22 | } 23 | 24 | 25 | using namespace std; 26 | 27 | // Perform t-SNE 28 | void TSNE::run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta) { 29 | 30 | // Determine whether we are using an exact algorithm 31 | if(N - 1 < 3 * perplexity) { printf("Perplexity too large for the number of data points!\n"); exit(1); } 32 | printf("Using no_dims = %d, perplexity = %f, and theta = %f\n", no_dims, perplexity, theta); 33 | bool exact = (theta == .0) ? true : false; 34 | 35 | // Set learning parameters 36 | float total_time = .0; 37 | clock_t start, end; 38 | int max_iter = 1000, stop_lying_iter = 250, mom_switch_iter = 250; 39 | double momentum = .5, final_momentum = .8; 40 | double eta = 200.0; 41 | 42 | // Allocate some memory 43 | double* dY = (double*) malloc(N * no_dims * sizeof(double)); 44 | double* uY = (double*) malloc(N * no_dims * sizeof(double)); 45 | double* gains = (double*) malloc(N * no_dims * sizeof(double)); 46 | if(dY == NULL || uY == NULL || gains == NULL) { printf("Memory allocation failed!\n"); exit(1); } 47 | for(int i = 0; i < N * no_dims; i++) uY[i] = .0; 48 | for(int i = 0; i < N * no_dims; i++) gains[i] = 1.0; 49 | 50 | // Normalize input data (to prevent numerical problems) 51 | printf("Computing input similarities...\n"); 52 | start = clock(); 53 | zeroMean(X, N, D); 54 | double max_X = .0; 55 | for(int i = 0; i < N * D; i++) { 56 | if(X[i] > max_X) max_X = X[i]; 57 | } 58 | for(int i = 0; i < N * D; i++) X[i] /= max_X; 59 | 60 | // Compute input similarities for exact t-SNE 61 | double* P; int* row_P; int* col_P; double* val_P; 62 | if(exact) { 63 | 64 | // Compute similarities 65 | P = (double*) malloc(N * N * sizeof(double)); 66 | if(P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 67 | 
computeGaussianPerplexity(X, N, D, P, perplexity); 68 | 69 | // Symmetrize input similarities 70 | printf("Symmetrizing...\n"); 71 | for(int n = 0; n < N; n++) { 72 | for(int m = n + 1; m < N; m++) { 73 | P[n * N + m] += P[m * N + n]; 74 | P[m * N + n] = P[n * N + m]; 75 | } 76 | } 77 | double sum_P = .0; 78 | for(int i = 0; i < N * N; i++) sum_P += P[i]; 79 | for(int i = 0; i < N * N; i++) P[i] /= sum_P; 80 | } 81 | 82 | // Compute input similarities for approximate t-SNE 83 | else { 84 | 85 | // Compute asymmetric pairwise input similarities 86 | computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, perplexity, (int) (3 * perplexity)); 87 | 88 | // Symmetrize input similarities 89 | symmetrizeMatrix(&row_P, &col_P, &val_P, N); 90 | double sum_P = .0; 91 | for(int i = 0; i < row_P[N]; i++) sum_P += val_P[i]; 92 | for(int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P; 93 | } 94 | end = clock(); 95 | 96 | // Lie about the P-values 97 | if(exact) { for(int i = 0; i < N * N; i++) P[i] *= 12.0; } 98 | else { for(int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; } 99 | 100 | // Initialize solution (randomly) 101 | for(int i = 0; i < N * no_dims; i++) Y[i] = randn() * .0001; 102 | 103 | // Perform main training loop 104 | if(exact) printf("Done in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC); 105 | else printf("Done in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N)); 106 | start = clock(); 107 | for(int iter = 0; iter < max_iter; iter++) { 108 | 109 | // Compute (approximate) gradient 110 | if(exact) computeExactGradient(P, Y, N, no_dims, dY); 111 | else computeGradient(P, row_P, col_P, val_P, Y, N, no_dims, dY, theta); 112 | 113 | // Update gains 114 | for(int i = 0; i < N * no_dims; i++) gains[i] = (sign(dY[i]) != sign(uY[i])) ? 
(gains[i] + .2) : (gains[i] * .8); 115 | for(int i = 0; i < N * no_dims; i++) if(gains[i] < .01) gains[i] = .01; 116 | 117 | // Perform gradient update (with momentum and gains) 118 | for(int i = 0; i < N * no_dims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i]; 119 | for(int i = 0; i < N * no_dims; i++) Y[i] = Y[i] + uY[i]; 120 | 121 | // Make solution zero-mean 122 | zeroMean(Y, N, no_dims); 123 | 124 | // Stop lying about the P-values after a while, and switch momentum 125 | if(iter == stop_lying_iter) { 126 | if(exact) { for(int i = 0; i < N * N; i++) P[i] /= 12.0; } 127 | else { for(int i = 0; i < row_P[N]; i++) val_P[i] /= 12.0; } 128 | } 129 | if(iter == mom_switch_iter) momentum = final_momentum; 130 | 131 | // Print out progress 132 | if(iter > 0 && iter % 50 == 0 || iter == max_iter - 1) { 133 | end = clock(); 134 | double C = .0; 135 | if(exact) C = evaluateError(P, Y, N); 136 | else C = evaluateError(row_P, col_P, val_P, Y, N, theta); // doing approximate computation here! 
137 | if(iter == 0) 138 | printf("Iteration %d: error is %f\n", iter + 1, C); 139 | else { 140 | total_time += (float) (end - start) / CLOCKS_PER_SEC; 141 | printf("Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter, C, (float) (end - start) / CLOCKS_PER_SEC); 142 | } 143 | start = clock(); 144 | } 145 | } 146 | end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC; 147 | 148 | // Clean up memory 149 | free(dY); 150 | free(uY); 151 | free(gains); 152 | if(exact) free(P); 153 | else { 154 | free(row_P); row_P = NULL; 155 | free(col_P); col_P = NULL; 156 | free(val_P); val_P = NULL; 157 | } 158 | printf("Fitting performed in %4.2f seconds.\n", total_time); 159 | } 160 | 161 | 162 | // Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm) 163 | void TSNE::computeGradient(double* P, int* inp_row_P, int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta) 164 | { 165 | 166 | // Construct quadtree on current map 167 | QuadTree* tree = new QuadTree(Y, N); 168 | 169 | // Compute all terms required for t-SNE gradient 170 | double sum_Q = .0; 171 | double* pos_f = (double*) calloc(N * D, sizeof(double)); 172 | double* neg_f = (double*) calloc(N * D, sizeof(double)); 173 | if(pos_f == NULL || neg_f == NULL) { printf("Memory allocation failed!\n"); exit(1); } 174 | tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f); 175 | for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q); 176 | 177 | // Compute final t-SNE gradient 178 | for(int i = 0; i < N * D; i++) { 179 | dC[i] = pos_f[i] - (neg_f[i] / sum_Q); 180 | } 181 | free(pos_f); 182 | free(neg_f); 183 | delete tree; 184 | } 185 | 186 | // Compute gradient of the t-SNE cost function (exact) 187 | void TSNE::computeExactGradient(double* P, double* Y, int N, int D, double* dC) { 188 | 189 | // Make sure the current gradient contains zeros 190 | for(int i = 0; i < N * D; i++) dC[i] = 0.0; 191 | 192 | // 
Compute the squared Euclidean distance matrix 193 | double* DD = (double*) malloc(N * N * sizeof(double)); 194 | if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); } 195 | computeSquaredEuclideanDistance(Y, N, D, DD); 196 | 197 | // Compute Q-matrix and normalization sum 198 | double* Q = (double*) malloc(N * N * sizeof(double)); 199 | if(Q == NULL) { printf("Memory allocation failed!\n"); exit(1); } 200 | double sum_Q = .0; 201 | for(int n = 0; n < N; n++) { 202 | for(int m = 0; m < N; m++) { 203 | if(n != m) { 204 | Q[n * N + m] = 1 / (1 + DD[n * N + m]); 205 | sum_Q += Q[n * N + m]; 206 | } 207 | } 208 | } 209 | 210 | // Perform the computation of the gradient 211 | for(int n = 0; n < N; n++) { 212 | for(int m = 0; m < N; m++) { 213 | if(n != m) { 214 | double mult = (P[n * N + m] - (Q[n * N + m] / sum_Q)) * Q[n * N + m]; 215 | for(int d = 0; d < D; d++) { 216 | dC[n * D + d] += (Y[n * D + d] - Y[m * D + d]) * mult; 217 | } 218 | } 219 | } 220 | } 221 | 222 | // Free memory 223 | free(DD); DD = NULL; 224 | free(Q); Q = NULL; 225 | } 226 | 227 | 228 | // Evaluate t-SNE cost function (exactly) 229 | double TSNE::evaluateError(double* P, double* Y, int N) { 230 | 231 | // Compute the squared Euclidean distance matrix 232 | double* DD = (double*) malloc(N * N * sizeof(double)); 233 | double* Q = (double*) malloc(N * N * sizeof(double)); 234 | if(DD == NULL || Q == NULL) { printf("Memory allocation failed!\n"); exit(1); } 235 | computeSquaredEuclideanDistance(Y, N, 2, DD); 236 | 237 | // Compute Q-matrix and normalization sum 238 | double sum_Q = DBL_MIN; 239 | for(int n = 0; n < N; n++) { 240 | for(int m = 0; m < N; m++) { 241 | if(n != m) { 242 | Q[n * N + m] = 1 / (1 + DD[n * N + m]); 243 | sum_Q += Q[n * N + m]; 244 | } 245 | else Q[n * N + m] = DBL_MIN; 246 | } 247 | } 248 | for(int i = 0; i < N * N; i++) Q[i] /= sum_Q; 249 | 250 | // Sum t-SNE error 251 | double C = .0; 252 | for(int n = 0; n < N; n++) { 253 | for(int m = 0; m < N; m++) { 254 | C 
+= P[n * N + m] * log((P[n * N + m] + 1e-9) / (Q[n * N + m] + 1e-9)); 255 | } 256 | } 257 | 258 | // Clean up memory 259 | free(DD); 260 | free(Q); 261 | return C; 262 | } 263 | 264 | // Evaluate t-SNE cost function (approximately) 265 | double TSNE::evaluateError(int* row_P, int* col_P, double* val_P, double* Y, int N, double theta) 266 | { 267 | 268 | // Get estimate of normalization term 269 | const int QT_NO_DIMS = 2; 270 | QuadTree* tree = new QuadTree(Y, N); 271 | double buff[QT_NO_DIMS] = {.0, .0}; 272 | double sum_Q = .0; 273 | for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q); 274 | 275 | // Loop over all edges to compute t-SNE error 276 | int ind1, ind2; 277 | double C = .0, Q; 278 | for(int n = 0; n < N; n++) { 279 | ind1 = n * QT_NO_DIMS; 280 | for(int i = row_P[n]; i < row_P[n + 1]; i++) { 281 | Q = .0; 282 | ind2 = col_P[i] * QT_NO_DIMS; 283 | for(int d = 0; d < QT_NO_DIMS; d++) buff[d] = Y[ind1 + d]; 284 | for(int d = 0; d < QT_NO_DIMS; d++) buff[d] -= Y[ind2 + d]; 285 | for(int d = 0; d < QT_NO_DIMS; d++) Q += buff[d] * buff[d]; 286 | Q = (1.0 / (1.0 + Q)) / sum_Q; 287 | C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN)); 288 | } 289 | } 290 | return C; 291 | } 292 | 293 | 294 | // Compute input similarities with a fixed perplexity 295 | void TSNE::computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity) { 296 | 297 | // Compute the squared Euclidean distance matrix 298 | double* DD = (double*) malloc(N * N * sizeof(double)); 299 | if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); } 300 | computeSquaredEuclideanDistance(X, N, D, DD); 301 | 302 | // Compute the Gaussian kernel row by row 303 | for(int n = 0; n < N; n++) { 304 | 305 | // Initialize some variables 306 | bool found = false; 307 | double beta = 1.0; 308 | double min_beta = -DBL_MAX; 309 | double max_beta = DBL_MAX; 310 | double tol = 1e-5; 311 | double sum_P; 312 | 313 | // Iterate until we found a good perplexity 
314 | int iter = 0; 315 | while(!found && iter < 200) { 316 | 317 | // Compute Gaussian kernel row 318 | for(int m = 0; m < N; m++) P[n * N + m] = exp(-beta * DD[n * N + m]); 319 | P[n * N + n] = DBL_MIN; 320 | 321 | // Compute entropy of current row 322 | sum_P = DBL_MIN; 323 | for(int m = 0; m < N; m++) sum_P += P[n * N + m]; 324 | double H = 0.0; 325 | for(int m = 0; m < N; m++) H += beta * (DD[n * N + m] * P[n * N + m]); 326 | H = (H / sum_P) + log(sum_P); 327 | 328 | // Evaluate whether the entropy is within the tolerance level 329 | double Hdiff = H - log(perplexity); 330 | if(Hdiff < tol && -Hdiff < tol) { 331 | found = true; 332 | } 333 | else { 334 | if(Hdiff > 0) { 335 | min_beta = beta; 336 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 337 | beta *= 2.0; 338 | else 339 | beta = (beta + max_beta) / 2.0; 340 | } 341 | else { 342 | max_beta = beta; 343 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX) 344 | beta /= 2.0; 345 | else 346 | beta = (beta + min_beta) / 2.0; 347 | } 348 | } 349 | 350 | // Update iteration counter 351 | iter++; 352 | } 353 | 354 | // Row normalize P 355 | for(int m = 0; m < N; m++) P[n * N + m] /= sum_P; 356 | } 357 | 358 | // Clean up memory 359 | free(DD); DD = NULL; 360 | } 361 | 362 | 363 | // Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free) 364 | void TSNE::computeGaussianPerplexity(double* X, int N, int D, int** _row_P, int** _col_P, double** _val_P, double perplexity, int K) { 365 | 366 | if(perplexity > K) printf("Perplexity should be lower than K!\n"); 367 | 368 | // Allocate the memory we need 369 | *_row_P = (int*) malloc((N + 1) * sizeof(int)); 370 | *_col_P = (int*) calloc(N * K, sizeof(int)); 371 | *_val_P = (double*) calloc(N * K, sizeof(double)); 372 | if(*_row_P == NULL || *_col_P == NULL || *_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 373 | int* row_P = *_row_P; 374 | int* col_P = *_col_P; 375 | double* 
val_P = *_val_P; 376 | double* cur_P = (double*) malloc((N - 1) * sizeof(double)); 377 | if(cur_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 378 | row_P[0] = 0; 379 | for(int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + K; 380 | 381 | // Build ball tree on data set 382 | VpTree* tree = new VpTree(); 383 | vector obj_X(N, DataPoint(D, -1, X)); 384 | for(int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D); 385 | tree->create(obj_X); 386 | 387 | // Loop over all points to find nearest neighbors 388 | printf("Building tree...\n"); 389 | vector indices; 390 | vector distances; 391 | for(int n = 0; n < N; n++) { 392 | 393 | if(n % 10000 == 0) printf(" - point %d of %d\n", n, N); 394 | 395 | // Find nearest neighbors 396 | indices.clear(); 397 | distances.clear(); 398 | tree->search(obj_X[n], K + 1, &indices, &distances); 399 | 400 | // Initialize some variables for binary search 401 | bool found = false; 402 | double beta = 1.0; 403 | double min_beta = -DBL_MAX; 404 | double max_beta = DBL_MAX; 405 | double tol = 1e-5; 406 | 407 | // Iterate until we found a good perplexity 408 | int iter = 0; double sum_P; 409 | while(!found && iter < 200) { 410 | 411 | // Compute Gaussian kernel row 412 | for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m + 1]); 413 | 414 | // Compute entropy of current row 415 | sum_P = DBL_MIN; 416 | for(int m = 0; m < K; m++) sum_P += cur_P[m]; 417 | double H = .0; 418 | for(int m = 0; m < K; m++) H += beta * (distances[m + 1] * cur_P[m]); 419 | H = (H / sum_P) + log(sum_P); 420 | 421 | // Evaluate whether the entropy is within the tolerance level 422 | double Hdiff = H - log(perplexity); 423 | if(Hdiff < tol && -Hdiff < tol) { 424 | found = true; 425 | } 426 | else { 427 | if(Hdiff > 0) { 428 | min_beta = beta; 429 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 430 | beta *= 2.0; 431 | else 432 | beta = (beta + max_beta) / 2.0; 433 | } 434 | else { 435 | max_beta = beta; 436 | if(min_beta == -DBL_MAX || 
min_beta == DBL_MAX) 437 | beta /= 2.0; 438 | else 439 | beta = (beta + min_beta) / 2.0; 440 | } 441 | } 442 | 443 | // Update iteration counter 444 | iter++; 445 | } 446 | 447 | // Row-normalize current row of P and store in matrix 448 | for(int m = 0; m < K; m++) cur_P[m] /= sum_P; 449 | for(int m = 0; m < K; m++) { 450 | col_P[row_P[n] + m] = indices[m + 1].index(); 451 | val_P[row_P[n] + m] = cur_P[m]; 452 | } 453 | } 454 | 455 | // Clean up memory 456 | obj_X.clear(); 457 | free(cur_P); 458 | delete tree; 459 | } 460 | 461 | 462 | // Compute input similarities with a fixed perplexity (this function allocates memory another function should free) 463 | void TSNE::computeGaussianPerplexity(double* X, int N, int D, int** _row_P, int** _col_P, double** _val_P, double perplexity, double threshold) { 464 | 465 | // Allocate some memory we need for computations 466 | double* buff = (double*) malloc(D * sizeof(double)); 467 | double* DD = (double*) malloc(N * sizeof(double)); 468 | double* cur_P = (double*) malloc(N * sizeof(double)); 469 | if(buff == NULL || DD == NULL || cur_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 470 | 471 | // Compute the Gaussian kernel row by row (to find number of elements in sparse P) 472 | int total_count = 0; 473 | for(int n = 0; n < N; n++) { 474 | 475 | // Compute the squared Euclidean distance matrix 476 | for(int m = 0; m < N; m++) { 477 | for(int d = 0; d < D; d++) buff[d] = X[n * D + d]; 478 | for(int d = 0; d < D; d++) buff[d] -= X[m * D + d]; 479 | DD[m] = .0; 480 | for(int d = 0; d < D; d++) DD[m] += buff[d] * buff[d]; 481 | } 482 | 483 | // Initialize some variables 484 | bool found = false; 485 | double beta = 1.0; 486 | double min_beta = -DBL_MAX; 487 | double max_beta = DBL_MAX; 488 | double tol = 1e-5; 489 | 490 | // Iterate until we found a good perplexity 491 | int iter = 0; double sum_P; 492 | while(!found && iter < 200) { 493 | 494 | // Compute Gaussian kernel row 495 | for(int m = 0; m < N; m++) 
cur_P[m] = exp(-beta * DD[m]); 496 | cur_P[n] = DBL_MIN; 497 | 498 | // Compute entropy of current row 499 | sum_P = DBL_MIN; 500 | for(int m = 0; m < N; m++) sum_P += cur_P[m]; 501 | double H = 0.0; 502 | for(int m = 0; m < N; m++) H += beta * (DD[m] * cur_P[m]); 503 | H = (H / sum_P) + log(sum_P); 504 | 505 | // Evaluate whether the entropy is within the tolerance level 506 | double Hdiff = H - log(perplexity); 507 | if(Hdiff < tol && -Hdiff < tol) { 508 | found = true; 509 | } 510 | else { 511 | if(Hdiff > 0) { 512 | min_beta = beta; 513 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 514 | beta *= 2.0; 515 | else 516 | beta = (beta + max_beta) / 2.0; 517 | } 518 | else { 519 | max_beta = beta; 520 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX) 521 | beta /= 2.0; 522 | else 523 | beta = (beta + min_beta) / 2.0; 524 | } 525 | } 526 | 527 | // Update iteration counter 528 | iter++; 529 | } 530 | 531 | // Row-normalize and threshold current row of P 532 | for(int m = 0; m < N; m++) cur_P[m] /= sum_P; 533 | for(int m = 0; m < N; m++) { 534 | if(cur_P[m] > threshold / (double) N) total_count++; 535 | } 536 | } 537 | 538 | // Allocate the memory we need 539 | *_row_P = (int*) malloc((N + 1) * sizeof(int)); 540 | *_col_P = (int*) malloc(total_count * sizeof(int)); 541 | *_val_P = (double*) malloc(total_count * sizeof(double)); 542 | int* row_P = *_row_P; 543 | int* col_P = *_col_P; 544 | double* val_P = *_val_P; 545 | if(row_P == NULL || col_P == NULL || val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 546 | row_P[0] = 0; 547 | 548 | // Compute the Gaussian kernel row by row (this time, store the results) 549 | int count = 0; 550 | for(int n = 0; n < N; n++) { 551 | 552 | // Compute the squared Euclidean distance matrix 553 | for(int m = 0; m < N; m++) { 554 | for(int d = 0; d < D; d++) buff[d] = X[n * D + d]; 555 | for(int d = 0; d < D; d++) buff[d] -= X[m * D + d]; 556 | DD[m] = .0; 557 | for(int d = 0; d < D; d++) DD[m] += buff[d] * buff[d]; 
558 | } 559 | 560 | // Initialize some variables 561 | bool found = false; 562 | double beta = 1.0; 563 | double min_beta = -DBL_MAX; 564 | double max_beta = DBL_MAX; 565 | double tol = 1e-5; 566 | 567 | // Iterate until we found a good perplexity 568 | int iter = 0; double sum_P; 569 | while(!found && iter < 200) { 570 | 571 | // Compute Gaussian kernel row 572 | for(int m = 0; m < N; m++) cur_P[m] = exp(-beta * DD[m]); 573 | cur_P[n] = DBL_MIN; 574 | 575 | // Compute entropy of current row 576 | sum_P = DBL_MIN; 577 | for(int m = 0; m < N; m++) sum_P += cur_P[m]; 578 | double H = 0.0; 579 | for(int m = 0; m < N; m++) H += beta * (DD[m] * cur_P[m]); 580 | H = (H / sum_P) + log(sum_P); 581 | 582 | // Evaluate whether the entropy is within the tolerance level 583 | double Hdiff = H - log(perplexity); 584 | if(Hdiff < tol && -Hdiff < tol) { 585 | found = true; 586 | } 587 | else { 588 | if(Hdiff > 0) { 589 | min_beta = beta; 590 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 591 | beta *= 2.0; 592 | else 593 | beta = (beta + max_beta) / 2.0; 594 | } 595 | else { 596 | max_beta = beta; 597 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX) 598 | beta /= 2.0; 599 | else 600 | beta = (beta + min_beta) / 2.0; 601 | } 602 | } 603 | 604 | // Update iteration counter 605 | iter++; 606 | } 607 | 608 | // Row-normalize and threshold current row of P 609 | for(int m = 0; m < N; m++) cur_P[m] /= sum_P; 610 | for(int m = 0; m < N; m++) { 611 | if(cur_P[m] > threshold / (double) N) { 612 | col_P[count] = m; 613 | val_P[count] = cur_P[m]; 614 | count++; 615 | } 616 | } 617 | row_P[n + 1] = count; 618 | } 619 | 620 | // Clean up memory 621 | free(DD); DD = NULL; 622 | free(buff); buff = NULL; 623 | free(cur_P); cur_P = NULL; 624 | } 625 | 626 | 627 | void TSNE::symmetrizeMatrix(int** _row_P, int** _col_P, double** _val_P, int N) { 628 | 629 | // Get sparse matrix 630 | int* row_P = *_row_P; 631 | int* col_P = *_col_P; 632 | double* val_P = *_val_P; 633 | 634 | // Count number of 
elements and row counts of symmetric matrix 635 | int* row_counts = (int*) calloc(N, sizeof(int)); 636 | if(row_counts == NULL) { printf("Memory allocation failed!\n"); exit(1); } 637 | for(int n = 0; n < N; n++) { 638 | for(int i = row_P[n]; i < row_P[n + 1]; i++) { 639 | 640 | // Check whether element (col_P[i], n) is present 641 | bool present = false; 642 | for(int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { 643 | if(col_P[m] == n) present = true; 644 | } 645 | if(present) row_counts[n]++; 646 | else { 647 | row_counts[n]++; 648 | row_counts[col_P[i]]++; 649 | } 650 | } 651 | } 652 | int no_elem = 0; 653 | for(int n = 0; n < N; n++) no_elem += row_counts[n]; 654 | 655 | // Allocate memory for symmetrized matrix 656 | int* sym_row_P = (int*) malloc((N + 1) * sizeof(int)); 657 | int* sym_col_P = (int*) malloc(no_elem * sizeof(int)); 658 | double* sym_val_P = (double*) malloc(no_elem * sizeof(double)); 659 | if(sym_row_P == NULL || sym_col_P == NULL || sym_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 660 | 661 | // Construct new row indices for symmetric matrix 662 | sym_row_P[0] = 0; 663 | for(int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + row_counts[n]; 664 | 665 | // Fill the result matrix 666 | int* offset = (int*) calloc(N, sizeof(int)); 667 | if(offset == NULL) { printf("Memory allocation failed!\n"); exit(1); } 668 | for(int n = 0; n < N; n++) { 669 | for(int i = row_P[n]; i < row_P[n + 1]; i++) { // considering element(n, col_P[i]) 670 | 671 | // Check whether element (col_P[i], n) is present 672 | bool present = false; 673 | for(int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { 674 | if(col_P[m] == n) { 675 | present = true; 676 | if(n <= col_P[i]) { // make sure we do not add elements twice 677 | sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; 678 | sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; 679 | sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m]; 680 | sym_val_P[sym_row_P[col_P[i]] + 
offset[col_P[i]]] = val_P[i] + val_P[m]; 681 | } 682 | } 683 | } 684 | 685 | // If (col_P[i], n) is not present, there is no addition involved 686 | if(!present) { 687 | sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; 688 | sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; 689 | sym_val_P[sym_row_P[n] + offset[n]] = val_P[i]; 690 | sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i]; 691 | } 692 | 693 | // Update offsets 694 | if(!present || (present && n <= col_P[i])) { 695 | offset[n]++; 696 | if(col_P[i] != n) offset[col_P[i]]++; 697 | } 698 | } 699 | } 700 | 701 | // Divide the result by two 702 | for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0; 703 | 704 | // Return symmetrized matrices 705 | free(*_row_P); *_row_P = sym_row_P; 706 | free(*_col_P); *_col_P = sym_col_P; 707 | free(*_val_P); *_val_P = sym_val_P; 708 | 709 | // Free up some memery 710 | free(offset); offset = NULL; 711 | free(row_counts); row_counts = NULL; 712 | } 713 | 714 | // Compute squared Euclidean distance matrix (using BLAS) 715 | void TSNE::computeSquaredEuclideanDistance(double* X, int N, int D, double* DD) { 716 | double* dataSums = (double*) calloc(N, sizeof(double)); 717 | if(dataSums == NULL) { printf("Memory allocation failed!\n"); exit(1); } 718 | for(int n = 0; n < N; n++) { 719 | for(int d = 0; d < D; d++) { 720 | dataSums[n] += (X[n * D + d] * X[n * D + d]); 721 | } 722 | } 723 | for(int n = 0; n < N; n++) { 724 | for(int m = 0; m < N; m++) { 725 | DD[n * N + m] = dataSums[n] + dataSums[m]; 726 | } 727 | } 728 | cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, N, N, D, -2.0, X, D, X, D, 1.0, DD, N); 729 | free(dataSums); dataSums = NULL; 730 | } 731 | 732 | 733 | // Makes data zero-mean 734 | void TSNE::zeroMean(double* X, int N, int D) { 735 | 736 | // Compute data mean 737 | double* mean = (double*) calloc(D, sizeof(double)); 738 | if(mean == NULL) { printf("Memory allocation failed!\n"); exit(1); } 739 | for(int n = 0; n < N; n++) { 740 | for(int d = 
0; d < D; d++) { 741 | mean[d] += X[n * D + d]; 742 | } 743 | } 744 | for(int d = 0; d < D; d++) { 745 | mean[d] /= (double) N; 746 | } 747 | 748 | // Subtract data mean 749 | for(int n = 0; n < N; n++) { 750 | for(int d = 0; d < D; d++) { 751 | X[n * D + d] -= mean[d]; 752 | } 753 | } 754 | free(mean); mean = NULL; 755 | } 756 | 757 | 758 | // Generates a Gaussian random number 759 | double TSNE::randn() { 760 | double x, y, radius; 761 | do { 762 | x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; 763 | y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; 764 | radius = (x * x) + (y * y); 765 | } while((radius >= 1.0) || (radius == 0.0)); 766 | radius = sqrt(-2 * log(radius) / radius); 767 | x *= radius; 768 | y *= radius; 769 | return x; 770 | } 771 | 772 | // Function that loads data from a t-SNE file 773 | // Note: this function does a malloc that should be freed elsewhere 774 | bool TSNE::load_data(double** data, int* n, int* d, double* theta, double* perplexity) { 775 | 776 | // Open file, read first 2 integers, allocate memory, and read the data 777 | FILE *h; 778 | if((h = fopen("data.dat", "r+b")) == NULL) { 779 | printf("Error: could not open data file.\n"); 780 | return false; 781 | } 782 | fread(n, sizeof(int), 1, h); // number of datapoints 783 | fread(d, sizeof(int), 1, h); // original dimensionality 784 | fread(theta, sizeof(double), 1, h); // gradient accuracy 785 | fread(perplexity, sizeof(double), 1, h); // perplexity 786 | *data = (double*) calloc(*d * *n, sizeof(double)); 787 | if(*data == NULL) { printf("Memory allocation failed!\n"); exit(1); } 788 | fread(*data, sizeof(double), *n * *d, h); // the data 789 | fclose(h); 790 | printf("Read the %i x %i data matrix successfully!\n", *n, *d); 791 | return true; 792 | } 793 | 794 | // Function that saves map to a t-SNE file 795 | void TSNE::save_data(double* data, int* landmarks, double* costs, int n, int d) { 796 | 797 | // Open file, write first 2 integers and then the data 798 | FILE *h; 799 | if((h = 
fopen("result.dat", "w+b")) == NULL) { 800 | printf("Error: could not open data file.\n"); 801 | return; 802 | } 803 | fwrite(&n, sizeof(int), 1, h); 804 | fwrite(&d, sizeof(int), 1, h); 805 | fwrite(data, sizeof(double), n * d, h); 806 | fwrite(landmarks, sizeof(int), n, h); 807 | fwrite(costs, sizeof(double), n, h); 808 | fclose(h); 809 | printf("Wrote the %i x %i data matrix successfully!\n", n, d); 810 | } 811 | 812 | 813 | // Function that runs the Barnes-Hut implementation of t-SNE 814 | //int main() { 815 | // 816 | // // Define some variables 817 | // int origN, N, D, no_dims = 2, *landmarks; 818 | // double perc_landmarks; 819 | // double perplexity, theta, *data; 820 | // TSNE* tsne = new TSNE(); 821 | // 822 | // // Read the parameters and the dataset 823 | // if(tsne->load_data(&data, &origN, &D, &theta, &perplexity)) { 824 | // 825 | // // Make dummy landmarks 826 | // N = origN; 827 | // int* landmarks = (int*) malloc(N * sizeof(int)); 828 | // if(landmarks == NULL) { printf("Memory allocation failed!\n"); exit(1); } 829 | // for(int n = 0; n < N; n++) landmarks[n] = n; 830 | // 831 | // // Now fire up the SNE implementation 832 | // double* Y = (double*) malloc(N * no_dims * sizeof(double)); 833 | // double* costs = (double*) calloc(N, sizeof(double)); 834 | // if(Y == NULL || costs == NULL) { printf("Memory allocation failed!\n"); exit(1); } 835 | // tsne->run(data, N, D, Y, no_dims, perplexity, theta); 836 | // 837 | // // Save the results 838 | // tsne->save_data(Y, landmarks, costs, N, no_dims); 839 | // 840 | // // Clean up the memory 841 | // free(data); data = NULL; 842 | // free(Y); Y = NULL; 843 | // free(costs); costs = NULL; 844 | // free(landmarks); landmarks = NULL; 845 | // } 846 | // delete(tsne); 847 | //} 848 | 849 | --------------------------------------------------------------------------------