├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── Makefile.win ├── README.md ├── bhtsne.py ├── fast_tsne.m ├── sptree.cpp ├── sptree.h ├── tsne.cpp ├── tsne.h ├── tsne_main.cpp └── vptree.h /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.ipynb 3 | *.py[oc] 4 | 5 | # ignore the C++ binary 6 | bh_tsne 7 | 8 | # ignore data files 9 | *.txt 10 | *.tsv -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: cpp 3 | 4 | script: 5 | - g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 1. Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | 2. Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | 3. All advertising materials mentioning features or use of this software 12 | must display the following acknowledgement: 13 | This product includes software developed by the Delft University of Technology. 14 | 4. Neither the name of the Delft University of Technology nor the names of 15 | its contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 19 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 21 | EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 24 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 26 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 27 | OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile.win: -------------------------------------------------------------------------------- 1 | CXX = cl.exe 2 | CFLAGS = /nologo /O2 /EHsc /D "_CRT_SECURE_NO_DEPRECATE" /D "USEOMP" /openmp 3 | 4 | TARGET = windows 5 | 6 | all: $(TARGET) $(TARGET)\bh_tsne.exe 7 | 8 | $(TARGET)\bh_tsne.exe: tsne_main.obj tsne.obj sptree.obj 9 | $(CXX) $(CFLAGS) tsne_main.obj tsne.obj sptree.obj -Fe$(TARGET)\bh_tsne.exe 10 | 11 | sptree.obj: sptree.cpp sptree.h 12 | $(CXX) $(CFLAGS) -c sptree.cpp 13 | 14 | tsne.obj: tsne.cpp tsne.h sptree.h vptree.h 15 | $(CXX) $(CFLAGS) -c tsne.cpp 16 | 17 | tsne_main.obj: tsne_main.cpp tsne.h sptree.h vptree.h 18 | $(CXX) $(CFLAGS) -c tsne_main.cpp 19 | 20 | .PHONY: $(TARGET) 21 | $(TARGET): 22 | -mkdir $(TARGET) 23 | 24 | clean: 25 | -erase /Q *.obj *.exe $(TARGET)\. 
26 | -rd $(TARGET) 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Build Status](https://travis-ci.org/lvdmaaten/bhtsne.svg)](https://travis-ci.org/lvdmaaten/bhtsne) 3 | 4 | This software package contains a Barnes-Hut implementation of the t-SNE algorithm. The implementation is described in [this paper](http://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf). 5 | 6 | 7 | # Installation # 8 | 9 | On Linux or OS X, compile the source using the following command: 10 | 11 | ``` 12 | g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 13 | ``` 14 | 15 | The executable will be called `bh_tsne`. 16 | 17 | On Windows using Visual C++, do the following in your command line: 18 | 19 | - Find the `vcvars64.bat` file in your Visual C++ installation directory. This file may be named `vcvars64.bat` or something similar. For example: 20 | 21 | ``` 22 | // Visual Studio 12 23 | "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\amd64\vcvars64.bat" 24 | 25 | // Visual Studio 2013 Express: 26 | C:\VisualStudioExp2013\VC\bin\x86_amd64\vcvarsx86_amd64.bat 27 | ``` 28 | 29 | - From `cmd.exe`, go to the directory containing that .bat file and run it. 30 | 31 | - Go to `bhtsne` directory and run: 32 | 33 | ``` 34 | nmake -f Makefile.win all 35 | ``` 36 | 37 | The executable will be called `windows\bh_tsne.exe`. 38 | 39 | # Usage # 40 | 41 | The code comes with wrappers for Matlab and Python. These wrappers write your data to a file called `data.dat`, run the `bh_tsne` binary, and read the result file `result.dat` that the binary produces. There are also external wrappers available for [Torch](https://github.com/clementfarabet/manifold), [R](https://github.com/jkrijthe/Rtsne), and [Julia](https://github.com/zhmz90/BHTsne.jl). 
Writing your own wrapper should be straightforward; please refer to one of the existing wrappers for the format of the data and result files. 42 | 43 | Demonstration of usage in Matlab: 44 | 45 | ```matlab 46 | filename = websave('mnist_train.mat', 'https://github.com/awni/cs224n-pa4/blob/master/Simple_tSNE/mnist_train.mat?raw=true'); 47 | load(filename); 48 | numDims = 2; pcaDims = 50; perplexity = 50; theta = .5; alg = 'svd'; 49 | map = fast_tsne(digits', numDims, pcaDims, perplexity, theta, alg); 50 | gscatter(map(:,1), map(:,2), labels'); 51 | ``` 52 | 53 | Demonstration of usage in Python: 54 | 55 | ```python 56 | import numpy as np 57 | import bhtsne 58 | 59 | data = np.loadtxt("mnist2500_X.txt", skiprows=1) 60 | 61 | embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1]) 62 | ``` 63 | 64 | ### Python Wrapper 65 | 66 | Usage: 67 | 68 | ```bash 69 | python bhtsne.py [-h] [-d NO_DIMS] [-p PERPLEXITY] [-t THETA] 70 | [-r RANDSEED] [-n INITIAL_DIMS] [-v] [-i INPUT] 71 | [-o OUTPUT] [--use_pca] [--no_pca] [-m MAX_ITER] 72 | ``` 73 | 74 | Below are the various options the wrapper program `bhtsne.py` expects: 75 | 76 | - `-h, --help` show this help message and exit 77 | - `-d NO_DIMS, --no_dims` NO_DIMS 78 | - `-p PERPLEXITY, --perplexity` PERPLEXITY 79 | - `-t THETA, --theta` THETA 80 | - `-r RANDSEED, --randseed` RANDSEED 81 | - `-n INITIAL_DIMS, --initial_dims` INITIAL_DIMS 82 | - `-v, --verbose` 83 | - `-i INPUT, --input` INPUT: the input file, expects a TSV with the first row as the header. 84 | - `-o OUTPUT, --output` OUTPUT: A TSV file having each row as the `d` dimensional embedding. 
85 | - `--use_pca` 86 | - `--no_pca` 87 | - `-m MAX_ITER, --max_iter` MAX_ITER 88 | 89 | -------------------------------------------------------------------------------- /bhtsne.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | A simple Python wrapper for the bh_tsne binary that makes it easier to use it 5 | for TSV files in a pipeline without any shell script trickery. 6 | 7 | Note: The script does some minimal sanity checking of the input, but don't 8 | expect it to cover all cases. After all, it is a just a wrapper. 9 | 10 | Example: 11 | 12 | > echo -e '1.0\t0.0\n0.0\t1.0' | ./bhtsne.py -d 2 -p 0.1 13 | -2458.83181442 -6525.87718385 14 | 2458.83181442 6525.87718385 15 | 16 | The output will not be normalised, maybe the below one-liner is of interest?: 17 | 18 | python -c 'import numpy; from sys import stdin, stdout; 19 | d = numpy.loadtxt(stdin); d -= d.min(axis=0); d /= d.max(axis=0); 20 | numpy.savetxt(stdout, d, fmt="%.8f", delimiter="\t")' 21 | 22 | Authors: Pontus Stenetorp 23 | Philippe Remy 24 | Version: 2016-03-08 25 | ''' 26 | 27 | # Copyright (c) 2013, Pontus Stenetorp 28 | # 29 | # Permission to use, copy, modify, and/or distribute this software for any 30 | # purpose with or without fee is hereby granted, provided that the above 31 | # copyright notice and this permission notice appear in all copies. 32 | # 33 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 34 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 35 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 36 | # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 37 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 38 | # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 39 | # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
40 | 41 | from argparse import ArgumentParser, FileType 42 | from os.path import abspath, dirname, isfile, join as path_join 43 | from shutil import rmtree 44 | from struct import calcsize, pack, unpack 45 | from subprocess import Popen 46 | from sys import stderr, stdin, stdout 47 | from tempfile import mkdtemp 48 | from platform import system 49 | from os import devnull 50 | import numpy as np 51 | import os, sys 52 | import io 53 | 54 | ### Constants 55 | IS_WINDOWS = True if system() == 'Windows' else False 56 | BH_TSNE_BIN_PATH = path_join(dirname(__file__), 'windows', 'bh_tsne.exe') if IS_WINDOWS else path_join(dirname(__file__), 'bh_tsne') 57 | assert isfile(BH_TSNE_BIN_PATH), ('Unable to find the bh_tsne binary in the ' 58 | 'same directory as this script, have you forgotten to compile it?: {}' 59 | ).format(BH_TSNE_BIN_PATH) 60 | # Default hyper-parameter values from van der Maaten (2014) 61 | # https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf (Experimental Setup, page 13) 62 | DEFAULT_NO_DIMS = 2 63 | INITIAL_DIMENSIONS = 50 64 | DEFAULT_PERPLEXITY = 50 65 | DEFAULT_THETA = 0.5 66 | EMPTY_SEED = -1 67 | DEFAULT_USE_PCA = True 68 | DEFAULT_MAX_ITERATIONS = 1000 69 | 70 | ### 71 | 72 | def _argparse(): 73 | argparse = ArgumentParser('bh_tsne Python wrapper') 74 | argparse.add_argument('-d', '--no_dims', type=int, 75 | default=DEFAULT_NO_DIMS) 76 | argparse.add_argument('-p', '--perplexity', type=float, 77 | default=DEFAULT_PERPLEXITY) 78 | # 0.0 for theta is equivalent to vanilla t-SNE 79 | argparse.add_argument('-t', '--theta', type=float, default=DEFAULT_THETA) 80 | argparse.add_argument('-r', '--randseed', type=int, default=EMPTY_SEED) 81 | argparse.add_argument('-n', '--initial_dims', type=int, default=INITIAL_DIMENSIONS) 82 | argparse.add_argument('-v', '--verbose', action='store_true') 83 | argparse.add_argument('-i', '--input', type=FileType('r'), default=stdin) 84 | argparse.add_argument('-o', '--output', type=FileType('w'), 85 | 
default=stdout) 86 | argparse.add_argument('--use_pca', action='store_true') 87 | argparse.add_argument('--no_pca', dest='use_pca', action='store_false') 88 | argparse.set_defaults(use_pca=DEFAULT_USE_PCA) 89 | argparse.add_argument('-m', '--max_iter', type=int, default=DEFAULT_MAX_ITERATIONS) 90 | return argparse 91 | 92 | 93 | def _read_unpack(fmt, fh): 94 | return unpack(fmt, fh.read(calcsize(fmt))) 95 | 96 | 97 | def _is_filelike_object(f): 98 | try: 99 | return isinstance(f, (file, io.IOBase)) 100 | except NameError: 101 | # 'file' is not a class in python3 102 | return isinstance(f, io.IOBase) 103 | 104 | 105 | def init_bh_tsne(samples, workdir, no_dims=DEFAULT_NO_DIMS, initial_dims=INITIAL_DIMENSIONS, perplexity=DEFAULT_PERPLEXITY, 106 | theta=DEFAULT_THETA, randseed=EMPTY_SEED, verbose=False, use_pca=DEFAULT_USE_PCA, max_iter=DEFAULT_MAX_ITERATIONS): 107 | 108 | if use_pca: 109 | samples = samples - np.mean(samples, axis=0) 110 | cov_x = np.dot(np.transpose(samples), samples) 111 | [eig_val, eig_vec] = np.linalg.eig(cov_x) 112 | 113 | # sorting the eigen-values in the descending order 114 | eig_vec = eig_vec[:, eig_val.argsort()[::-1]] 115 | 116 | if initial_dims > len(eig_vec): 117 | initial_dims = len(eig_vec) 118 | 119 | # truncating the eigen-vectors matrix to keep the most important vectors 120 | eig_vec = np.real(eig_vec[:, :initial_dims]) 121 | samples = np.dot(samples, eig_vec) 122 | 123 | # Assume that the dimensionality of the first sample is representative for 124 | # the whole batch 125 | sample_dim = len(samples[0]) 126 | sample_count = len(samples) 127 | 128 | # Note: The binary format used by bh_tsne is roughly the same as for 129 | # vanilla tsne 130 | with open(path_join(workdir, 'data.dat'), 'wb') as data_file: 131 | # Write the bh_tsne header 132 | data_file.write(pack('iiddii', sample_count, sample_dim, theta, perplexity, no_dims, max_iter)) 133 | # Then write the data 134 | for sample in samples: 135 | 
data_file.write(pack('{}d'.format(len(sample)), *sample)) 136 | # Write random seed if specified 137 | if randseed != EMPTY_SEED: 138 | data_file.write(pack('i', randseed)) 139 | 140 | def load_data(input_file): 141 | # Read the data, using numpy's good judgement 142 | return np.loadtxt(input_file) 143 | 144 | def bh_tsne(workdir, verbose=False): 145 | 146 | # Call bh_tsne and let it do its thing 147 | with open(devnull, 'w') as dev_null: 148 | bh_tsne_p = Popen((abspath(BH_TSNE_BIN_PATH), ), cwd=workdir, 149 | # bh_tsne is very noisy on stdout, tell it to use stderr 150 | # if it is to print any output 151 | stdout=stderr if verbose else dev_null) 152 | bh_tsne_p.wait() 153 | assert not bh_tsne_p.returncode, ('ERROR: Call to bh_tsne exited ' 154 | 'with a non-zero return code exit status, please ' + 155 | ('enable verbose mode and ' if not verbose else '') + 156 | 'refer to the bh_tsne output for further details') 157 | 158 | # Read and pass on the results 159 | with open(path_join(workdir, 'result.dat'), 'rb') as output_file: 160 | # The first two integers are just the number of samples and the 161 | # dimensionality 162 | result_samples, result_dims = _read_unpack('ii', output_file) 163 | # Collect the results, but they may be out of order 164 | results = [_read_unpack('{}d'.format(result_dims), output_file) 165 | for _ in range(result_samples)] 166 | # Now collect the landmark data so that we can return the data in 167 | # the order it arrived 168 | results = [(_read_unpack('i', output_file), e) for e in results] 169 | # Put the results in order and yield it 170 | results.sort() 171 | for _, result in results: 172 | yield result 173 | # The last piece of data is the cost for each sample, we ignore it 174 | #read_unpack('{}d'.format(sample_count), output_file) 175 | 176 | def run_bh_tsne(data, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False, initial_dims=50, use_pca=True, max_iter=1000): 177 | ''' 178 | Run TSNE based on the Barnes-HT algorithm 
179 | 180 | Parameters: 181 | ---------- 182 | data: file or numpy.array 183 | The data used to run TSNE, one sample per row 184 | no_dims: int 185 | perplexity: int 186 | randseed: int 187 | theta: float 188 | initial_dims: int 189 | verbose: boolean 190 | use_pca: boolean 191 | max_iter: int 192 | ''' 193 | 194 | # bh_tsne works with fixed input and output paths, give it a temporary 195 | # directory to work in so we don't clutter the filesystem 196 | tmp_dir_path = mkdtemp() 197 | 198 | # Load data in forked process to free memory for actual bh_tsne calculation 199 | child_pid = os.fork() 200 | if child_pid == 0: 201 | if _is_filelike_object(data): 202 | data = load_data(data) 203 | 204 | init_bh_tsne(data, tmp_dir_path, no_dims=no_dims, perplexity=perplexity, theta=theta, randseed=randseed,verbose=verbose, initial_dims=initial_dims, use_pca=use_pca, max_iter=max_iter) 205 | os._exit(0) 206 | else: 207 | try: 208 | os.waitpid(child_pid, 0) 209 | except KeyboardInterrupt: 210 | print("Please run this program directly from python and not from ipython or jupyter.") 211 | print("This is an issue due to asynchronous error handling.") 212 | 213 | res = [] 214 | for result in bh_tsne(tmp_dir_path, verbose): 215 | sample_res = [] 216 | for r in result: 217 | sample_res.append(r) 218 | res.append(sample_res) 219 | rmtree(tmp_dir_path) 220 | return np.asarray(res, dtype='float64') 221 | 222 | 223 | def main(args): 224 | parser = _argparse() 225 | 226 | if len(args) <= 1: 227 | print(parser.print_help()) 228 | return 229 | 230 | argp = parser.parse_args(args[1:]) 231 | 232 | for result in run_bh_tsne(argp.input, no_dims=argp.no_dims, perplexity=argp.perplexity, theta=argp.theta, randseed=argp.randseed, 233 | verbose=argp.verbose, initial_dims=argp.initial_dims, use_pca=argp.use_pca, max_iter=argp.max_iter): 234 | fmt = '' 235 | for i in range(1, len(result)): 236 | fmt = fmt + '{}\t' 237 | fmt = fmt + '{}\n' 238 | argp.output.write(fmt.format(*result)) 239 | 240 | if 
__name__ == '__main__': 241 | from sys import argv 242 | exit(main(argv)) 243 | -------------------------------------------------------------------------------- /fast_tsne.m: -------------------------------------------------------------------------------- 1 | function mappedX = fast_tsne(X, no_dims, initial_dims, perplexity, theta, alg, max_iter) 2 | %FAST_TSNE Runs the C++ implementation of Barnes-Hut t-SNE 3 | % 4 | % mappedX = fast_tsne(X, no_dims, initial_dims, perplexity, theta, alg) 5 | % 6 | % Runs the C++ implementation of Barnes-Hut-SNE. The high-dimensional 7 | % datapoints are specified in the NxD matrix X. The dimensionality of the 8 | % datapoints is reduced to initial_dims dimensions using PCA (default = 50) 9 | % before t-SNE is performed. Next, t-SNE reduces the points to no_dims 10 | % dimensions. The perplexity of the input similarities may be specified 11 | % through the perplexity variable (default = 30). The variable theta sets 12 | % the trade-off parameter between speed and accuracy: theta = 0 corresponds 13 | % to standard, slow t-SNE, while theta = 1 makes very crude approximations. 14 | % Appropriate values for theta are between 0.1 and 0.7 (default = 0.5). 15 | % The variable alg determines the algorithm used for PCA. The default is set 16 | % to 'svd'. Other options are 'eig' or 'als' (see 'doc pca' for more details). 17 | % The function returns the two-dimensional data points in mappedX. 18 | % 19 | % NOTE: The function is designed to run on large (N > 5000) data sets. It 20 | % may give poor performance on very small data sets (it is better to use a 21 | % standard t-SNE implementation on such data). 22 | 23 | 24 | % Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 25 | % All rights reserved. 26 | % 27 | % Redistribution and use in source and binary forms, with or without 28 | % modification, are permitted provided that the following conditions are met: 29 | % 1. 
Redistributions of source code must retain the above copyright 30 | % notice, this list of conditions and the following disclaimer. 31 | % 2. Redistributions in binary form must reproduce the above copyright 32 | % notice, this list of conditions and the following disclaimer in the 33 | % documentation and/or other materials provided with the distribution. 34 | % 3. All advertising materials mentioning features or use of this software 35 | % must display the following acknowledgement: 36 | % This product includes software developed by the Delft University of Technology. 37 | % 4. Neither the name of the Delft University of Technology nor the names of 38 | % its contributors may be used to endorse or promote products derived from 39 | % this software without specific prior written permission. 40 | % 41 | % THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 42 | % OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 43 | % OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 44 | % EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 45 | % SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 46 | % PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 47 | % BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 48 | % CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 49 | % IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 50 | % OF SUCH DAMAGE. 
51 | 52 | 53 | if ~exist('no_dims', 'var') || isempty(no_dims) 54 | no_dims = 2; 55 | end 56 | if ~exist('initial_dims', 'var') || isempty(initial_dims) 57 | initial_dims = 50; 58 | end 59 | if ~exist('perplexity', 'var') || isempty(perplexity) 60 | perplexity = 30; 61 | end 62 | if ~exist('theta', 'var') || isempty(theta) 63 | theta = 0.5; 64 | end 65 | if ~exist('alg', 'var') || isempty(alg) 66 | alg = 'svd'; 67 | end 68 | if ~exist('max_iter', 'var') || isempty(max_iter) 69 | max_iter=1000; 70 | end 71 | 72 | % Perform the initial dimensionality reduction using PCA 73 | X = double(X); 74 | X = bsxfun(@minus, X, mean(X, 1)); 75 | M = pca(X,'NumComponents',initial_dims,'Algorithm',alg); 76 | X = X * M; 77 | 78 | tsne_path = which('fast_tsne'); 79 | tsne_path = fileparts(tsne_path); 80 | 81 | % Compile t-SNE C code 82 | if(~exist(fullfile(tsne_path,'./bh_tsne'),'file') && isunix) 83 | system(sprintf('g++ %s %s -o %s -O2',... 84 | fullfile(tsne_path,'./sptree.cpp'),... 85 | fullfile(tsne_path,'./tsne.cpp'),... 
86 | fullfile(tsne_path,'./bh_tsne'))); 87 | end 88 | 89 | % Run the fast diffusion SNE implementation 90 | write_data(X, no_dims, theta, perplexity, max_iter); 91 | tic 92 | [flag, cmdout] = system(['"' fullfile(tsne_path,'./bh_tsne') '"']); 93 | if(flag~=0) 94 | error(cmdout); 95 | end 96 | toc 97 | [mappedX, landmarks, costs] = read_data; 98 | landmarks = landmarks + 1; % correct for Matlab indexing 99 | delete('data.dat'); 100 | delete('result.dat'); 101 | end 102 | 103 | 104 | % Writes the datafile for the fast t-SNE implementation 105 | function write_data(X, no_dims, theta, perplexity, max_iter) 106 | [n, d] = size(X); 107 | h = fopen('data.dat', 'wb'); 108 | fwrite(h, n, 'integer*4'); 109 | fwrite(h, d, 'integer*4'); 110 | fwrite(h, theta, 'double'); 111 | fwrite(h, perplexity, 'double'); 112 | fwrite(h, no_dims, 'integer*4'); 113 | fwrite(h, max_iter, 'integer*4'); 114 | fwrite(h, X', 'double'); 115 | fclose(h); 116 | end 117 | 118 | 119 | % Reads the result file from the fast t-SNE implementation 120 | function [X, landmarks, costs] = read_data 121 | h = fopen('result.dat', 'rb'); 122 | n = fread(h, 1, 'integer*4'); 123 | d = fread(h, 1, 'integer*4'); 124 | X = fread(h, n * d, 'double'); 125 | landmarks = fread(h, n, 'integer*4'); 126 | costs = fread(h, n, 'double'); % this vector contains only zeros 127 | X = reshape(X, [d n])'; 128 | fclose(h); 129 | end 130 | -------------------------------------------------------------------------------- /sptree.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 
10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 
30 | * 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include "sptree.h" 39 | 40 | 41 | 42 | // Constructs cell 43 | Cell::Cell(unsigned int inp_dimension) { 44 | dimension = inp_dimension; 45 | corner = (double*) malloc(dimension * sizeof(double)); 46 | width = (double*) malloc(dimension * sizeof(double)); 47 | } 48 | 49 | Cell::Cell(unsigned int inp_dimension, double* inp_corner, double* inp_width) { 50 | dimension = inp_dimension; 51 | corner = (double*) malloc(dimension * sizeof(double)); 52 | width = (double*) malloc(dimension * sizeof(double)); 53 | for(int d = 0; d < dimension; d++) setCorner(d, inp_corner[d]); 54 | for(int d = 0; d < dimension; d++) setWidth( d, inp_width[d]); 55 | } 56 | 57 | // Destructs cell 58 | Cell::~Cell() { 59 | free(corner); 60 | free(width); 61 | } 62 | 63 | double Cell::getCorner(unsigned int d) { 64 | return corner[d]; 65 | } 66 | 67 | double Cell::getWidth(unsigned int d) { 68 | return width[d]; 69 | } 70 | 71 | void Cell::setCorner(unsigned int d, double val) { 72 | corner[d] = val; 73 | } 74 | 75 | void Cell::setWidth(unsigned int d, double val) { 76 | width[d] = val; 77 | } 78 | 79 | // Checks whether a point lies in a cell 80 | bool Cell::containsPoint(double point[]) 81 | { 82 | for(int d = 0; d < dimension; d++) { 83 | if(corner[d] - width[d] > point[d]) return false; 84 | if(corner[d] + width[d] < point[d]) return false; 85 | } 86 | return true; 87 | } 88 | 89 | 90 | // Default constructor for SPTree -- build tree, too! 
91 | SPTree::SPTree(unsigned int D, double* inp_data, unsigned int N) 92 | { 93 | 94 | // Compute mean, width, and height of current map (boundaries of SPTree) 95 | int nD = 0; 96 | double* mean_Y = (double*) calloc(D, sizeof(double)); 97 | double* min_Y = (double*) malloc(D * sizeof(double)); for(unsigned int d = 0; d < D; d++) min_Y[d] = DBL_MAX; 98 | double* max_Y = (double*) malloc(D * sizeof(double)); for(unsigned int d = 0; d < D; d++) max_Y[d] = -DBL_MAX; 99 | for(unsigned int n = 0; n < N; n++) { 100 | for(unsigned int d = 0; d < D; d++) { 101 | mean_Y[d] += inp_data[n * D + d]; 102 | if(inp_data[nD + d] < min_Y[d]) min_Y[d] = inp_data[nD + d]; 103 | if(inp_data[nD + d] > max_Y[d]) max_Y[d] = inp_data[nD + d]; 104 | } 105 | nD += D; 106 | } 107 | for(int d = 0; d < D; d++) mean_Y[d] /= (double) N; 108 | 109 | // Construct SPTree 110 | double* width = (double*) malloc(D * sizeof(double)); 111 | for(int d = 0; d < D; d++) width[d] = fmax(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5; 112 | init(NULL, D, inp_data, mean_Y, width); 113 | fill(N); 114 | 115 | // Clean up memory 116 | free(mean_Y); 117 | free(max_Y); 118 | free(min_Y); 119 | free(width); 120 | } 121 | 122 | 123 | // Constructor for SPTree with particular size and parent -- build the tree, too! 
124 | SPTree::SPTree(unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width) 125 | { 126 | init(NULL, D, inp_data, inp_corner, inp_width); 127 | fill(N); 128 | } 129 | 130 | 131 | // Constructor for SPTree with particular size (do not fill the tree) 132 | SPTree::SPTree(unsigned int D, double* inp_data, double* inp_corner, double* inp_width) 133 | { 134 | init(NULL, D, inp_data, inp_corner, inp_width); 135 | } 136 | 137 | 138 | // Constructor for SPTree with particular size and parent (do not fill tree) 139 | SPTree::SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width) { 140 | init(inp_parent, D, inp_data, inp_corner, inp_width); 141 | } 142 | 143 | 144 | // Constructor for SPTree with particular size and parent -- build the tree, too! 145 | SPTree::SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width) 146 | { 147 | init(inp_parent, D, inp_data, inp_corner, inp_width); 148 | fill(N); 149 | } 150 | 151 | 152 | // Main initialization function 153 | void SPTree::init(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width) 154 | { 155 | parent = inp_parent; 156 | dimension = D; 157 | no_children = 2; 158 | for(unsigned int d = 1; d < D; d++) no_children *= 2; 159 | data = inp_data; 160 | is_leaf = true; 161 | size = 0; 162 | cum_size = 0; 163 | 164 | boundary = new Cell(dimension); 165 | for(unsigned int d = 0; d < D; d++) boundary->setCorner(d, inp_corner[d]); 166 | for(unsigned int d = 0; d < D; d++) boundary->setWidth( d, inp_width[d]); 167 | 168 | children = (SPTree**) malloc(no_children * sizeof(SPTree*)); 169 | for(unsigned int i = 0; i < no_children; i++) children[i] = NULL; 170 | 171 | center_of_mass = (double*) malloc(D * sizeof(double)); 172 | for(unsigned int d = 0; d < D; d++) center_of_mass[d] = .0; 173 | 174 | buff = (double*) malloc(D * sizeof(double)); 175 | } 176 | 177 | 178 | 
// Destructor for SPTree 179 | SPTree::~SPTree() 180 | { 181 | for(unsigned int i = 0; i < no_children; i++) { 182 | if(children[i] != NULL) delete children[i]; 183 | } 184 | free(children); 185 | free(center_of_mass); 186 | free(buff); 187 | delete boundary; 188 | } 189 | 190 | 191 | // Update the data underlying this tree 192 | void SPTree::setData(double* inp_data) 193 | { 194 | data = inp_data; 195 | } 196 | 197 | 198 | // Get the parent of the current tree 199 | SPTree* SPTree::getParent() 200 | { 201 | return parent; 202 | } 203 | 204 | 205 | // Insert a point into the SPTree 206 | bool SPTree::insert(unsigned int new_index) 207 | { 208 | // Ignore objects which do not belong in this quad tree 209 | double* point = data + new_index * dimension; 210 | if(!boundary->containsPoint(point)) 211 | return false; 212 | 213 | // Online update of cumulative size and center-of-mass 214 | cum_size++; 215 | double mult1 = (double) (cum_size - 1) / (double) cum_size; 216 | double mult2 = 1.0 / (double) cum_size; 217 | for(unsigned int d = 0; d < dimension; d++) center_of_mass[d] *= mult1; 218 | for(unsigned int d = 0; d < dimension; d++) center_of_mass[d] += mult2 * point[d]; 219 | 220 | // If there is space in this quad tree and it is a leaf, add the object here 221 | if(is_leaf && size < QT_NODE_CAPACITY) { 222 | index[size] = new_index; 223 | size++; 224 | return true; 225 | } 226 | 227 | // Don't add duplicates for now (this is not very nice) 228 | bool any_duplicate = false; 229 | for(unsigned int n = 0; n < size; n++) { 230 | bool duplicate = true; 231 | for(unsigned int d = 0; d < dimension; d++) { 232 | if(point[d] != data[index[n] * dimension + d]) { duplicate = false; break; } 233 | } 234 | any_duplicate = any_duplicate | duplicate; 235 | } 236 | if(any_duplicate) return true; 237 | 238 | // Otherwise, we need to subdivide the current cell 239 | if(is_leaf) subdivide(); 240 | 241 | // Find out where the point can be inserted 242 | for(unsigned int i = 0; i < 
no_children; i++) { 243 | if(children[i]->insert(new_index)) return true; 244 | } 245 | 246 | // Otherwise, the point cannot be inserted (this should never happen) 247 | return false; 248 | } 249 | 250 | 251 | // Create four children which fully divide this cell into four quads of equal area 252 | void SPTree::subdivide() { 253 | 254 | // Create new children 255 | double* new_corner = (double*) malloc(dimension * sizeof(double)); 256 | double* new_width = (double*) malloc(dimension * sizeof(double)); 257 | for(unsigned int i = 0; i < no_children; i++) { 258 | unsigned int div = 1; 259 | for(unsigned int d = 0; d < dimension; d++) { 260 | new_width[d] = .5 * boundary->getWidth(d); 261 | if((i / div) % 2 == 1) new_corner[d] = boundary->getCorner(d) - .5 * boundary->getWidth(d); 262 | else new_corner[d] = boundary->getCorner(d) + .5 * boundary->getWidth(d); 263 | div *= 2; 264 | } 265 | children[i] = new SPTree(this, dimension, data, new_corner, new_width); 266 | } 267 | free(new_corner); 268 | free(new_width); 269 | 270 | // Move existing points to correct children 271 | for(unsigned int i = 0; i < size; i++) { 272 | bool success = false; 273 | for(unsigned int j = 0; j < no_children; j++) { 274 | if(!success) success = children[j]->insert(index[i]); 275 | } 276 | index[i] = -1; 277 | } 278 | 279 | // Empty parent node 280 | size = 0; 281 | is_leaf = false; 282 | } 283 | 284 | 285 | // Build SPTree on dataset 286 | void SPTree::fill(unsigned int N) 287 | { 288 | for(unsigned int i = 0; i < N; i++) insert(i); 289 | } 290 | 291 | 292 | // Checks whether the specified tree is correct 293 | bool SPTree::isCorrect() 294 | { 295 | for(unsigned int n = 0; n < size; n++) { 296 | double* point = data + index[n] * dimension; 297 | if(!boundary->containsPoint(point)) return false; 298 | } 299 | if(!is_leaf) { 300 | bool correct = true; 301 | for(int i = 0; i < no_children; i++) correct = correct && children[i]->isCorrect(); 302 | return correct; 303 | } 304 | else return true; 
305 | } 306 | 307 | 308 | 309 | // Build a list of all indices in SPTree 310 | void SPTree::getAllIndices(unsigned int* indices) 311 | { 312 | getAllIndices(indices, 0); 313 | } 314 | 315 | 316 | // Build a list of all indices in SPTree 317 | unsigned int SPTree::getAllIndices(unsigned int* indices, unsigned int loc) 318 | { 319 | 320 | // Gather indices in current quadrant 321 | for(unsigned int i = 0; i < size; i++) indices[loc + i] = index[i]; 322 | loc += size; 323 | 324 | // Gather indices in children 325 | if(!is_leaf) { 326 | for(int i = 0; i < no_children; i++) loc = children[i]->getAllIndices(indices, loc); 327 | } 328 | return loc; 329 | } 330 | 331 | 332 | unsigned int SPTree::getDepth() { 333 | if(is_leaf) return 1; 334 | int depth = 0; 335 | for(unsigned int i = 0; i < no_children; i++) depth = fmax(depth, children[i]->getDepth()); 336 | return 1 + depth; 337 | } 338 | 339 | 340 | // Compute non-edge forces using Barnes-Hut algorithm 341 | void SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q) 342 | { 343 | 344 | // Make sure that we spend no time on empty nodes or self-interactions 345 | if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return; 346 | 347 | // Compute distance between point and center-of-mass 348 | double D = .0; 349 | unsigned int ind = point_index * dimension; 350 | for(unsigned int d = 0; d < dimension; d++) buff[d] = data[ind + d] - center_of_mass[d]; 351 | for(unsigned int d = 0; d < dimension; d++) D += buff[d] * buff[d]; 352 | 353 | // Check whether we can use this node as a "summary" 354 | double max_width = 0.0; 355 | double cur_width; 356 | for(unsigned int d = 0; d < dimension; d++) { 357 | cur_width = boundary->getWidth(d); 358 | max_width = (max_width > cur_width) ? 
max_width : cur_width; 359 | } 360 | if(is_leaf || max_width / sqrt(D) < theta) { 361 | 362 | // Compute and add t-SNE force between point and current node 363 | D = 1.0 / (1.0 + D); 364 | double mult = cum_size * D; 365 | *sum_Q += mult; 366 | mult *= D; 367 | for(unsigned int d = 0; d < dimension; d++) neg_f[d] += mult * buff[d]; 368 | } 369 | else { 370 | 371 | // Recursively apply Barnes-Hut to children 372 | for(unsigned int i = 0; i < no_children; i++) children[i]->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); 373 | } 374 | } 375 | 376 | 377 | // Computes edge forces 378 | void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f) 379 | { 380 | 381 | // Loop over all edges in the graph 382 | unsigned int ind1 = 0; 383 | unsigned int ind2 = 0; 384 | double D; 385 | for(unsigned int n = 0; n < N; n++) { 386 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { 387 | 388 | // Compute pairwise distance and Q-value 389 | D = 1.0; 390 | ind2 = col_P[i] * dimension; 391 | for(unsigned int d = 0; d < dimension; d++) buff[d] = data[ind1 + d] - data[ind2 + d]; 392 | for(unsigned int d = 0; d < dimension; d++) D += buff[d] * buff[d]; 393 | D = val_P[i] / D; 394 | 395 | // Sum positive force 396 | for(unsigned int d = 0; d < dimension; d++) pos_f[ind1 + d] += D * buff[d]; 397 | } 398 | ind1 += dimension; 399 | } 400 | } 401 | 402 | 403 | // Print out tree 404 | void SPTree::print() 405 | { 406 | if(cum_size == 0) { 407 | printf("Empty node\n"); 408 | return; 409 | } 410 | 411 | if(is_leaf) { 412 | printf("Leaf node; data = ["); 413 | for(int i = 0; i < size; i++) { 414 | double* point = data + index[i] * dimension; 415 | for(int d = 0; d < dimension; d++) printf("%f, ", point[d]); 416 | printf(" (index = %d)", index[i]); 417 | if(i < size - 1) printf("\n"); 418 | else printf("]\n"); 419 | } 420 | } 421 | else { 422 | printf("Intersection node with center-of-mass = ["); 423 | for(int d = 0; d < dimension; 
d++) printf("%f, ", center_of_mass[d]); 424 | printf("]; children are:\n"); 425 | for(int i = 0; i < no_children; i++) children[i]->print(); 426 | } 427 | } 428 | 429 | -------------------------------------------------------------------------------- /sptree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 30 | * 31 | */


#ifndef SPTREE_H
#define SPTREE_H

// NOTE(review): a using-directive in a header leaks into every file that includes it.
// It is kept because other files in the project may rely on it -- confirm before removing.
using namespace std;


// One cell of the space partition: a corner value and a width value for each of `dimension` axes.
// NOTE(review): the member definitions are not part of this header; the class declares a destructor
// and holds raw pointers, so it is presumably unsafe to copy -- confirm against sptree.cpp.
class Cell {

    unsigned int dimension;
    double* corner;
    double* width;


public:
    Cell(unsigned int inp_dimension);
    Cell(unsigned int inp_dimension, double* inp_corner, double* inp_width);
    ~Cell();

    // Per-dimension accessors; d is the axis index
    double getCorner(unsigned int d);
    double getWidth(unsigned int d);
    void setCorner(unsigned int d, double val);
    void setWidth(unsigned int d, double val);

    // Tests whether a point (one coordinate per dimension) lies inside this cell
    bool containsPoint(double point[]);
};


// Node of the space-partitioning tree used for the Barnes-Hut approximation.
// A node does not own the point data it indexes (see `data`); it does own its children,
// its cell and its scratch buffers.
// NOTE(review): no copy constructor or assignment operator is declared although the node owns
// raw memory, so copying a node would presumably free the same memory twice -- only use it
// through pointers.
class SPTree
{

    // Fixed constants
    // Maximum number of point indices stored directly in one node
    static const unsigned int QT_NODE_CAPACITY = 1;

    // A buffer we use when doing force computations
    double* buff;

    // Properties of this node in the tree
    SPTree* parent;
    unsigned int dimension;
    bool is_leaf;
    unsigned int size;          // number of indices currently stored in `index`
    unsigned int cum_size;      // number of points inserted into this node's subtree

    // Axis-aligned bounding box stored as a center with half-dimensions to represent the boundaries of this quad tree
    Cell* boundary;

    // Indices in this space-partitioning tree node, corresponding center-of-mass, and list of all children
    double* data;               // point coordinates, `dimension` values per point (not owned)
    double* center_of_mass;
    unsigned int index[QT_NODE_CAPACITY];

    // Children
    SPTree** children;
    unsigned int no_children;

public:
    // The constructors that take N also insert the points 0 .. N-1; the others create an empty node
    SPTree(unsigned int D, double* inp_data, unsigned int N);
    SPTree(unsigned int D, double* inp_data, double* inp_corner, double* inp_width);
    SPTree(unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width);
    SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, unsigned int N, double* inp_corner, double* inp_width);
    SPTree(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width);
    ~SPTree();
    void setData(double* inp_data);
    SPTree* getParent();
    void construct(Cell boundary);      // NOTE(review): no definition is visible in this part of the project
    bool insert(unsigned int new_index);
    void subdivide();
    bool isCorrect();
    void rebuildTree();                 // NOTE(review): no definition is visible in this part of the project
    void getAllIndices(unsigned int* indices);
    unsigned int getDepth();
    void computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q);
    void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f);
    void print();

private:
    void init(SPTree* inp_parent, unsigned int D, double* inp_data, double* inp_corner, double* inp_width);
    void fill(unsigned int N);
    unsigned int getAllIndices(unsigned int* indices, unsigned int loc);
    bool isChild(unsigned int test_index, unsigned int start, unsigned int end);    // NOTE(review): no definition is visible in this part of the project
};

#endif

-------------------------------------------------------------------------------- /tsne.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. 
Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 30 | * 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include "vptree.h" 40 | #include "sptree.h" 41 | #include "tsne.h" 42 | 43 | 44 | using namespace std; 45 | 46 | static double sign(double x) { return (x == .0 ? .0 : (x < .0 ? 
-1.0 : 1.0)); } 47 | 48 | static void zeroMean(double* X, int N, int D); 49 | static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity); 50 | static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K); 51 | static double randn(); 52 | static void computeExactGradient(double* P, double* Y, int N, int D, double* dC); 53 | static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta); 54 | static double evaluateError(double* P, double* Y, int N, int D); 55 | static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta); 56 | static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD); 57 | static void symmetrizeMatrix(unsigned int** row_P, unsigned int** col_P, double** val_P, int N); 58 | 59 | // Perform t-SNE 60 | void TSNE::run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed, 61 | bool skip_random_init, int max_iter, int stop_lying_iter, int mom_switch_iter) { 62 | 63 | // Set random seed 64 | if (skip_random_init != true) { 65 | if(rand_seed >= 0) { 66 | printf("Using random seed: %d\n", rand_seed); 67 | srand((unsigned int) rand_seed); 68 | } else { 69 | printf("Using current time as random seed...\n"); 70 | srand(time(NULL)); 71 | } 72 | } 73 | 74 | // Determine whether we are using an exact algorithm 75 | if(N - 1 < 3 * perplexity) { printf("Perplexity too large for the number of data points!\n"); exit(1); } 76 | printf("Using no_dims = %d, perplexity = %f, and theta = %f\n", no_dims, perplexity, theta); 77 | bool exact = (theta == .0) ? 
true : false; 78 | 79 | // Set learning parameters 80 | float total_time = .0; 81 | clock_t start, end; 82 | double momentum = .5, final_momentum = .8; 83 | double eta = 200.0; 84 | 85 | // Allocate some memory 86 | double* dY = (double*) malloc(N * no_dims * sizeof(double)); 87 | double* uY = (double*) malloc(N * no_dims * sizeof(double)); 88 | double* gains = (double*) malloc(N * no_dims * sizeof(double)); 89 | if(dY == NULL || uY == NULL || gains == NULL) { printf("Memory allocation failed!\n"); exit(1); } 90 | for(int i = 0; i < N * no_dims; i++) uY[i] = .0; 91 | for(int i = 0; i < N * no_dims; i++) gains[i] = 1.0; 92 | 93 | // Normalize input data (to prevent numerical problems) 94 | printf("Computing input similarities...\n"); 95 | start = clock(); 96 | zeroMean(X, N, D); 97 | double max_X = .0; 98 | for(int i = 0; i < N * D; i++) { 99 | if(fabs(X[i]) > max_X) max_X = fabs(X[i]); 100 | } 101 | for(int i = 0; i < N * D; i++) X[i] /= max_X; 102 | 103 | // Compute input similarities for exact t-SNE 104 | double* P; unsigned int* row_P; unsigned int* col_P; double* val_P; 105 | if(exact) { 106 | 107 | // Compute similarities 108 | printf("Exact?"); 109 | P = (double*) malloc(N * N * sizeof(double)); 110 | if(P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 111 | computeGaussianPerplexity(X, N, D, P, perplexity); 112 | 113 | // Symmetrize input similarities 114 | printf("Symmetrizing...\n"); 115 | int nN = 0; 116 | for(int n = 0; n < N; n++) { 117 | int mN = (n + 1) * N; 118 | for(int m = n + 1; m < N; m++) { 119 | P[nN + m] += P[mN + n]; 120 | P[mN + n] = P[nN + m]; 121 | mN += N; 122 | } 123 | nN += N; 124 | } 125 | double sum_P = .0; 126 | for(int i = 0; i < N * N; i++) sum_P += P[i]; 127 | for(int i = 0; i < N * N; i++) P[i] /= sum_P; 128 | } 129 | 130 | // Compute input similarities for approximate t-SNE 131 | else { 132 | 133 | // Compute asymmetric pairwise input similarities 134 | computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, 
perplexity, (int) (3 * perplexity)); 135 | 136 | // Symmetrize input similarities 137 | symmetrizeMatrix(&row_P, &col_P, &val_P, N); 138 | double sum_P = .0; 139 | for(int i = 0; i < row_P[N]; i++) sum_P += val_P[i]; 140 | for(int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P; 141 | } 142 | end = clock(); 143 | 144 | // Lie about the P-values 145 | if(exact) { for(int i = 0; i < N * N; i++) P[i] *= 12.0; } 146 | else { for(int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; } 147 | 148 | // Initialize solution (randomly) 149 | if (skip_random_init != true) { 150 | for(int i = 0; i < N * no_dims; i++) Y[i] = randn() * .0001; 151 | } 152 | 153 | // Perform main training loop 154 | if(exact) printf("Input similarities computed in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC); 155 | else printf("Input similarities computed in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N)); 156 | start = clock(); 157 | 158 | for(int iter = 0; iter < max_iter; iter++) { 159 | 160 | // Compute (approximate) gradient 161 | if(exact) computeExactGradient(P, Y, N, no_dims, dY); 162 | else computeGradient(row_P, col_P, val_P, Y, N, no_dims, dY, theta); 163 | 164 | // Update gains 165 | for(int i = 0; i < N * no_dims; i++) gains[i] = (sign(dY[i]) != sign(uY[i])) ? 
(gains[i] + .2) : (gains[i] * .8); 166 | for(int i = 0; i < N * no_dims; i++) if(gains[i] < .01) gains[i] = .01; 167 | 168 | // Perform gradient update (with momentum and gains) 169 | for(int i = 0; i < N * no_dims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i]; 170 | for(int i = 0; i < N * no_dims; i++) Y[i] = Y[i] + uY[i]; 171 | 172 | // Make solution zero-mean 173 | zeroMean(Y, N, no_dims); 174 | 175 | // Stop lying about the P-values after a while, and switch momentum 176 | if(iter == stop_lying_iter) { 177 | if(exact) { for(int i = 0; i < N * N; i++) P[i] /= 12.0; } 178 | else { for(int i = 0; i < row_P[N]; i++) val_P[i] /= 12.0; } 179 | } 180 | if(iter == mom_switch_iter) momentum = final_momentum; 181 | 182 | // Print out progress 183 | if (iter > 0 && (iter % 50 == 0 || iter == max_iter - 1)) { 184 | end = clock(); 185 | double C = .0; 186 | if(exact) C = evaluateError(P, Y, N, no_dims); 187 | else C = evaluateError(row_P, col_P, val_P, Y, N, no_dims, theta); // doing approximate computation here! 
188 | if(iter == 0) 189 | printf("Iteration %d: error is %f\n", iter + 1, C); 190 | else { 191 | total_time += (float) (end - start) / CLOCKS_PER_SEC; 192 | printf("Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter, C, (float) (end - start) / CLOCKS_PER_SEC); 193 | } 194 | start = clock(); 195 | } 196 | } 197 | end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC; 198 | 199 | // Clean up memory 200 | free(dY); 201 | free(uY); 202 | free(gains); 203 | if(exact) free(P); 204 | else { 205 | free(row_P); row_P = NULL; 206 | free(col_P); col_P = NULL; 207 | free(val_P); val_P = NULL; 208 | } 209 | printf("Fitting performed in %4.2f seconds.\n", total_time); 210 | } 211 | 212 | 213 | // Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm) 214 | static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta) 215 | { 216 | 217 | // Construct space-partitioning tree on current map 218 | SPTree* tree = new SPTree(D, Y, N); 219 | 220 | // Compute all terms required for t-SNE gradient 221 | double sum_Q = .0; 222 | double* pos_f = (double*) calloc(N * D, sizeof(double)); 223 | double* neg_f = (double*) calloc(N * D, sizeof(double)); 224 | if(pos_f == NULL || neg_f == NULL) { printf("Memory allocation failed!\n"); exit(1); } 225 | tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f); 226 | for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q); 227 | 228 | // Compute final t-SNE gradient 229 | for(int i = 0; i < N * D; i++) { 230 | dC[i] = pos_f[i] - (neg_f[i] / sum_Q); 231 | } 232 | free(pos_f); 233 | free(neg_f); 234 | delete tree; 235 | } 236 | 237 | // Compute gradient of the t-SNE cost function (exact) 238 | static void computeExactGradient(double* P, double* Y, int N, int D, double* dC) { 239 | 240 | // Make sure the current gradient contains zeros 241 | for(int i = 0; i < N * D; i++) dC[i] = 
0.0; 242 | 243 | // Compute the squared Euclidean distance matrix 244 | double* DD = (double*) malloc(N * N * sizeof(double)); 245 | if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); } 246 | computeSquaredEuclideanDistance(Y, N, D, DD); 247 | 248 | // Compute Q-matrix and normalization sum 249 | double* Q = (double*) malloc(N * N * sizeof(double)); 250 | if(Q == NULL) { printf("Memory allocation failed!\n"); exit(1); } 251 | double sum_Q = .0; 252 | int nN = 0; 253 | for(int n = 0; n < N; n++) { 254 | for(int m = 0; m < N; m++) { 255 | if(n != m) { 256 | Q[nN + m] = 1 / (1 + DD[nN + m]); 257 | sum_Q += Q[nN + m]; 258 | } 259 | } 260 | nN += N; 261 | } 262 | 263 | // Perform the computation of the gradient 264 | nN = 0; 265 | int nD = 0; 266 | for(int n = 0; n < N; n++) { 267 | int mD = 0; 268 | for(int m = 0; m < N; m++) { 269 | if(n != m) { 270 | double mult = (P[nN + m] - (Q[nN + m] / sum_Q)) * Q[nN + m]; 271 | for(int d = 0; d < D; d++) { 272 | dC[nD + d] += (Y[nD + d] - Y[mD + d]) * mult; 273 | } 274 | } 275 | mD += D; 276 | } 277 | nN += N; 278 | nD += D; 279 | } 280 | 281 | // Free memory 282 | free(DD); DD = NULL; 283 | free(Q); Q = NULL; 284 | } 285 | 286 | 287 | // Evaluate t-SNE cost function (exactly) 288 | static double evaluateError(double* P, double* Y, int N, int D) { 289 | 290 | // Compute the squared Euclidean distance matrix 291 | double* DD = (double*) malloc(N * N * sizeof(double)); 292 | double* Q = (double*) malloc(N * N * sizeof(double)); 293 | if(DD == NULL || Q == NULL) { printf("Memory allocation failed!\n"); exit(1); } 294 | computeSquaredEuclideanDistance(Y, N, D, DD); 295 | 296 | // Compute Q-matrix and normalization sum 297 | int nN = 0; 298 | double sum_Q = DBL_MIN; 299 | for(int n = 0; n < N; n++) { 300 | for(int m = 0; m < N; m++) { 301 | if(n != m) { 302 | Q[nN + m] = 1 / (1 + DD[nN + m]); 303 | sum_Q += Q[nN + m]; 304 | } 305 | else Q[nN + m] = DBL_MIN; 306 | } 307 | nN += N; 308 | } 309 | for(int i = 0; i < N * N; 
i++) Q[i] /= sum_Q; 310 | 311 | // Sum t-SNE error 312 | double C = .0; 313 | for(int n = 0; n < N * N; n++) { 314 | C += P[n] * log((P[n] + FLT_MIN) / (Q[n] + FLT_MIN)); 315 | } 316 | 317 | // Clean up memory 318 | free(DD); 319 | free(Q); 320 | return C; 321 | } 322 | 323 | // Evaluate t-SNE cost function (approximately) 324 | static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta) 325 | { 326 | 327 | // Get estimate of normalization term 328 | SPTree* tree = new SPTree(D, Y, N); 329 | double* buff = (double*) calloc(D, sizeof(double)); 330 | double sum_Q = .0; 331 | for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q); 332 | 333 | // Loop over all edges to compute t-SNE error 334 | int ind1, ind2; 335 | double C = .0, Q; 336 | for(int n = 0; n < N; n++) { 337 | ind1 = n * D; 338 | for(int i = row_P[n]; i < row_P[n + 1]; i++) { 339 | Q = .0; 340 | ind2 = col_P[i] * D; 341 | for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d]; 342 | for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d]; 343 | for(int d = 0; d < D; d++) Q += buff[d] * buff[d]; 344 | Q = (1.0 / (1.0 + Q)) / sum_Q; 345 | C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN)); 346 | } 347 | } 348 | 349 | // Clean up memory 350 | free(buff); 351 | delete tree; 352 | return C; 353 | } 354 | 355 | 356 | // Compute input similarities with a fixed perplexity 357 | static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity) { 358 | 359 | // Compute the squared Euclidean distance matrix 360 | double* DD = (double*) malloc(N * N * sizeof(double)); 361 | if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); } 362 | computeSquaredEuclideanDistance(X, N, D, DD); 363 | 364 | // Compute the Gaussian kernel row by row 365 | int nN = 0; 366 | for(int n = 0; n < N; n++) { 367 | 368 | // Initialize some variables 369 | bool found = false; 370 | double beta = 1.0; 371 | double min_beta = 
-DBL_MAX; 372 | double max_beta = DBL_MAX; 373 | double tol = 1e-5; 374 | double sum_P; 375 | 376 | // Iterate until we found a good perplexity 377 | int iter = 0; 378 | while(!found && iter < 200) { 379 | 380 | // Compute Gaussian kernel row 381 | for(int m = 0; m < N; m++) P[nN + m] = exp(-beta * DD[nN + m]); 382 | P[nN + n] = DBL_MIN; 383 | 384 | // Compute entropy of current row 385 | sum_P = DBL_MIN; 386 | for(int m = 0; m < N; m++) sum_P += P[nN + m]; 387 | double H = 0.0; 388 | for(int m = 0; m < N; m++) H += beta * (DD[nN + m] * P[nN + m]); 389 | H = (H / sum_P) + log(sum_P); 390 | 391 | // Evaluate whether the entropy is within the tolerance level 392 | double Hdiff = H - log(perplexity); 393 | if(Hdiff < tol && -Hdiff < tol) { 394 | found = true; 395 | } 396 | else { 397 | if(Hdiff > 0) { 398 | min_beta = beta; 399 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 400 | beta *= 2.0; 401 | else 402 | beta = (beta + max_beta) / 2.0; 403 | } 404 | else { 405 | max_beta = beta; 406 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX){ 407 | if (beta < 0) { 408 | beta *= 2; 409 | } else { 410 | beta = beta <= 1.0 ? 
-0.5 : beta / 2.0; 411 | } 412 | } else { 413 | beta = (beta + min_beta) / 2.0; 414 | } 415 | } 416 | } 417 | 418 | // Update iteration counter 419 | iter++; 420 | } 421 | 422 | // Row normalize P 423 | for(int m = 0; m < N; m++) P[nN + m] /= sum_P; 424 | nN += N; 425 | } 426 | 427 | // Clean up memory 428 | free(DD); DD = NULL; 429 | } 430 | 431 | 432 | // Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free) 433 | static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K) { 434 | 435 | if(perplexity > K) printf("Perplexity should be lower than K!\n"); 436 | 437 | // Allocate the memory we need 438 | *_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int)); 439 | *_col_P = (unsigned int*) calloc(N * K, sizeof(unsigned int)); 440 | *_val_P = (double*) calloc(N * K, sizeof(double)); 441 | if(*_row_P == NULL || *_col_P == NULL || *_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 442 | unsigned int* row_P = *_row_P; 443 | unsigned int* col_P = *_col_P; 444 | double* val_P = *_val_P; 445 | double* cur_P = (double*) malloc((N - 1) * sizeof(double)); 446 | if(cur_P == NULL) { printf("Memory allocation failed!\n"); exit(1); } 447 | row_P[0] = 0; 448 | for(int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + (unsigned int) K; 449 | 450 | // Build ball tree on data set 451 | VpTree* tree = new VpTree(); 452 | vector obj_X(N, DataPoint(D, -1, X)); 453 | for(int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D); 454 | tree->create(obj_X); 455 | 456 | // Loop over all points to find nearest neighbors 457 | printf("Building tree...\n"); 458 | vector indices; 459 | vector distances; 460 | for(int n = 0; n < N; n++) { 461 | 462 | if(n % 10000 == 0) printf(" - point %d of %d\n", n, N); 463 | 464 | // Find nearest neighbors 465 | indices.clear(); 466 | distances.clear(); 467 | 
tree->search(obj_X[n], K + 1, &indices, &distances); 468 | 469 | // Initialize some variables for binary search 470 | bool found = false; 471 | double beta = 1.0; 472 | double min_beta = -DBL_MAX; 473 | double max_beta = DBL_MAX; 474 | double tol = 1e-5; 475 | 476 | // Iterate until we found a good perplexity 477 | int iter = 0; double sum_P; 478 | while(!found && iter < 200) { 479 | 480 | // Compute Gaussian kernel row 481 | for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]); 482 | 483 | // Compute entropy of current row 484 | sum_P = DBL_MIN; 485 | for(int m = 0; m < K; m++) sum_P += cur_P[m]; 486 | double H = .0; 487 | for(int m = 0; m < K; m++) H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]); 488 | H = (H / sum_P) + log(sum_P); 489 | 490 | // Evaluate whether the entropy is within the tolerance level 491 | double Hdiff = H - log(perplexity); 492 | if(Hdiff < tol && -Hdiff < tol) { 493 | found = true; 494 | } 495 | else { 496 | if(Hdiff > 0) { 497 | min_beta = beta; 498 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 499 | beta *= 2.0; 500 | else 501 | beta = (beta + max_beta) / 2.0; 502 | } 503 | else { 504 | max_beta = beta; 505 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX) 506 | beta /= 2.0; 507 | else 508 | beta = (beta + min_beta) / 2.0; 509 | } 510 | } 511 | 512 | // Update iteration counter 513 | iter++; 514 | } 515 | 516 | // Row-normalize current row of P and store in matrix 517 | for(unsigned int m = 0; m < K; m++) cur_P[m] /= sum_P; 518 | for(unsigned int m = 0; m < K; m++) { 519 | col_P[row_P[n] + m] = (unsigned int) indices[m + 1].index(); 520 | val_P[row_P[n] + m] = cur_P[m]; 521 | } 522 | } 523 | 524 | // Clean up memory 525 | obj_X.clear(); 526 | free(cur_P); 527 | delete tree; 528 | } 529 | 530 | 531 | // Symmetrizes a sparse matrix 532 | static void symmetrizeMatrix(unsigned int** _row_P, unsigned int** _col_P, double** _val_P, int N) { 533 | 534 | // Get sparse matrix 535 | unsigned int* 
// Looks for column `col` in row `row` of the compressed-row matrix (row_P, col_P).
// Returns true when the entry exists and stores the position of the last match in *pos.
static bool findSparseEntry(const unsigned int* row_P, const unsigned int* col_P, unsigned int row, unsigned int col, unsigned int* pos) {
    bool present = false;
    for(unsigned int m = row_P[row]; m < row_P[row + 1]; m++) {
        if(col_P[m] == col) { present = true; *pos = m; }
    }
    return present;
}


// Symmetrizes a sparse matrix
//
// Replaces the N x N compressed-row matrix P behind (*_row_P, *_col_P, *_val_P) by (P + P') / 2.
// The three input arrays are released with free() and the pointers are redirected to newly
// malloc'ed arrays, which the caller has to free() in turn. N <= 0 leaves the arguments untouched.
static void symmetrizeMatrix(unsigned int** _row_P, unsigned int** _col_P, double** _val_P, int N) {

    // Nothing to symmetrize (and calloc(0) below could legitimately return NULL)
    if(N <= 0) return;
    const unsigned int no_rows = (unsigned int) N;

    // Get sparse matrix
    unsigned int* row_P = *_row_P;
    unsigned int* col_P = *_col_P;
    double* val_P = *_val_P;

    // Count number of elements and row counts of symmetric matrix
    unsigned int* row_counts = (unsigned int*) calloc(no_rows, sizeof(unsigned int));
    if(row_counts == NULL) { printf("Memory allocation failed!\n"); exit(1); }
    unsigned int mirror = 0;
    for(unsigned int n = 0; n < no_rows; n++) {
        for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {

            // Element (n, col) always lands in row n; when (col, n) is missing from the input,
            // its mirror image additionally has to be created in row col
            const unsigned int col = col_P[i];
            row_counts[n]++;
            if(!findSparseEntry(row_P, col_P, col, n, &mirror)) row_counts[col]++;
        }
    }
    size_t no_elem = 0;
    for(unsigned int n = 0; n < no_rows; n++) no_elem += row_counts[n];

    // Allocate memory for symmetrized matrix (at least one element, so that an empty matrix
    // is not mistaken for an allocation failure)
    const size_t alloc_elem = no_elem > 0 ? no_elem : 1;
    unsigned int* sym_row_P = (unsigned int*) malloc(((size_t) no_rows + 1) * sizeof(unsigned int));
    unsigned int* sym_col_P = (unsigned int*) malloc(alloc_elem * sizeof(unsigned int));
    double* sym_val_P = (double*) malloc(alloc_elem * sizeof(double));
    if(sym_row_P == NULL || sym_col_P == NULL || sym_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }

    // Construct new row indices for symmetric matrix
    sym_row_P[0] = 0;
    for(unsigned int n = 0; n < no_rows; n++) sym_row_P[n + 1] = sym_row_P[n] + row_counts[n];

    // Fill the result matrix; offset[r] is the number of entries already written to row r
    unsigned int* offset = (unsigned int*) calloc(no_rows, sizeof(unsigned int));
    if(offset == NULL) { printf("Memory allocation failed!\n"); exit(1); }
    for(unsigned int n = 0; n < no_rows; n++) {
        for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {       // considering element (n, col)

            // Check whether element (col, n) is present
            const unsigned int col = col_P[i];
            const bool present = findSparseEntry(row_P, col_P, col, n, &mirror);

            // A pair that exists in both directions is written once, from its smaller row index
            if(present && n > col) continue;

            // If (col, n) is not present, there is no addition involved
            double value = val_P[i];
            if(present) value += val_P[mirror];
            value /= 2.0;

            // Write the entry and its mirror image
            sym_col_P[sym_row_P[n]   + offset[n]]   = col;
            sym_col_P[sym_row_P[col] + offset[col]] = n;
            sym_val_P[sym_row_P[n]   + offset[n]]   = value;
            sym_val_P[sym_row_P[col] + offset[col]] = value;

            // Update offsets (a diagonal element occupies a single slot)
            offset[n]++;
            if(col != n) offset[col]++;
        }
    }

    // Return symmetrized matrices
    free(*_row_P); *_row_P = sym_row_P;
    free(*_col_P); *_col_P = sym_col_P;
    free(*_val_P); *_val_P = sym_val_P;

    // Free up some memory
    free(offset); offset = NULL;
    free(row_counts); row_counts  = NULL;
}

// Compute squared Euclidean distance matrix
//
// X holds N points of D doubles each, stored point after point; DD receives the symmetric N x N
// matrix of squared distances (zero diagonal) and must provide room for N * N doubles.
static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD) {
    // size_t indices: n * N overflows an int once N exceeds roughly 46000
    const size_t no_points = N > 0 ? (size_t) N : 0;
    for(size_t n = 0; n < no_points; ++n) {
        const double* x_n = X + n * (size_t) D;
        DD[n * no_points + n] = 0.0;
        for(size_t m = n + 1; m < no_points; ++m) {
            const double* x_m = X + m * (size_t) D;
            double dist = 0.0;
            for(int d = 0; d < D; ++d) {
                dist += (x_n[d] - x_m[d]) * (x_n[d] - x_m[d]);
            }
            DD[n * no_points + m] = dist;
            DD[m * no_points + n] = dist;
        }
    }
}
// Makes data zero-mean
//
// X holds N points of D doubles each, stored point after point; the per-dimension mean over all
// points is subtracted in place. Empty input (N <= 0 or D <= 0) is left untouched.
static void zeroMean(double* X, int N, int D) {

    // Nothing to centre; this also avoids a 0 / 0 mean and a zero-sized allocation
    if(N <= 0 || D <= 0) return;

    // size_t offsets: N * D does not have to fit into an int
    const size_t dims  = (size_t) D;
    const size_t total = (size_t) N * dims;

    // Compute data mean
    double* mean = (double*) calloc(dims, sizeof(double));
    if(mean == NULL) { printf("Memory allocation failed!\n"); exit(1); }
    for(size_t nD = 0; nD < total; nD += dims) {
        for(size_t d = 0; d < dims; d++) {
            mean[d] += X[nD + d];
        }
    }
    for(size_t d = 0; d < dims; d++) {
        mean[d] /= (double) N;
    }

    // Subtract data mean
    for(size_t nD = 0; nD < total; nD += dims) {
        for(size_t d = 0; d < dims; d++) {
            X[nD + d] -= mean[d];
        }
    }
    free(mean); mean = NULL;
}


// Generates a Gaussian random number
//
// Draws pairs of rand() values scaled into [-1, 1) until the pair lies strictly inside the unit
// circle (origin excluded), then rescales the first coordinate by sqrt(-2 * log(r) / r), where r is
// the squared radius. The second coordinate only takes part in the acceptance test.
static double randn() {
    double x, y, radius;
    do {
        x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1;
        y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1;
        radius = (x * x) + (y * y);
    } while((radius >= 1.0) || (radius == 0.0));
    return x * sqrt(-2 * log(radius) / radius);
}
data 700 | if(!feof(h)) fread(rand_seed, sizeof(int), 1, h); // random seed 701 | fclose(h); 702 | printf("Read the %i x %i data matrix successfully!\n", *n, *d); 703 | return true; 704 | } 705 | 706 | // Function that saves map to a t-SNE file 707 | void TSNE::save_data(double* data, int* landmarks, double* costs, int n, int d) { 708 | 709 | // Open file, write first 2 integers and then the data 710 | FILE *h; 711 | if((h = fopen("result.dat", "w+b")) == NULL) { 712 | printf("Error: could not open data file.\n"); 713 | return; 714 | } 715 | fwrite(&n, sizeof(int), 1, h); 716 | fwrite(&d, sizeof(int), 1, h); 717 | fwrite(data, sizeof(double), n * d, h); 718 | fwrite(landmarks, sizeof(int), n, h); 719 | fwrite(costs, sizeof(double), n, h); 720 | fclose(h); 721 | printf("Wrote the %i x %i data matrix successfully!\n", n, d); 722 | } 723 | -------------------------------------------------------------------------------- /tsne.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. 
Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 
#ifndef TSNE_H
#define TSNE_H

#ifndef __cplusplus
#include <stdbool.h>    /* the declarations below use bool, which plain C only knows via this header */
#endif

#ifdef __cplusplus
extern "C" {
namespace TSNE {
#endif
    /*
     * Runs t-SNE on the N points of D doubles each stored point after point in X and writes the
     * embedding (N points of no_dims doubles each) to Y, which the caller allocates.
     * NOTE(review): the meaning of the remaining parameters is inferred from their names and from
     * the call in main() (rand_seed defaults to -1 there, stop_lying_iter and mom_switch_iter are
     * both 250) -- confirm against the definition in tsne.cpp.
     */
    void run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed,
             bool skip_random_init, int max_iter, int stop_lying_iter, int mom_switch_iter);

    /*
     * Reads the data set and the run parameters from the file "data.dat" in the current directory.
     * *data is malloc'ed by the function and has to be freed by the caller. *rand_seed is only
     * overwritten when the file carries a seed. Returns false when the file could not be opened.
     */
    bool load_data(double** data, int* n, int* d, int* no_dims, double* theta, double* perplexity, int* rand_seed, int* max_iter);

    /*
     * Writes n, d, the n * d doubles of data, n landmark indices and n costs to the file
     * "result.dat" in the current directory.
     */
    void save_data(double* data, int* landmarks, double* costs, int n, int d);
#ifdef __cplusplus
}   /* namespace TSNE */
}   /* extern "C" */
#endif

#endif
free(data); data = NULL; 37 | free(Y); Y = NULL; 38 | free(costs); costs = NULL; 39 | free(landmarks); landmarks = NULL; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /vptree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
// A point with an integer label that owns a heap copy of its D coordinates.
// Copies are deep; a default-constructed point has no coordinates (_x == NULL).
class DataPoint
{
    int _ind;

    // Returns a malloc'ed copy of the D coordinates at src, or NULL when there is nothing to copy
    static double* copyCoords(const double* src, int D) {
        if(src == NULL || D <= 0) return NULL;
        double* dst = (double*) malloc((size_t) D * sizeof(double));
        if(dst == NULL) { printf("Memory allocation failed!\n"); exit(1); }
        for(int d = 0; d < D; d++) dst[d] = src[d];
        return dst;
    }

public:
    double* _x;     // owned coordinate buffer (freed in the destructor), NULL when empty
    int _D;         // number of coordinates

    // Empty point: index -1, no coordinate buffer
    DataPoint() : _ind(-1), _x(NULL), _D(1) {}

    // Copies the first D values found at x; the caller keeps ownership of x
    DataPoint(int D, int ind, double* x) : _ind(ind), _x(copyCoords(x, D)), _D(D) {}

    // Deep copy; copying an empty point yields another empty point instead of reading through NULL
    DataPoint(const DataPoint& other) : _ind(other._ind), _x(copyCoords(other._x, other._D)), _D(other._D) {}

    ~DataPoint() { free(_x); }      // free(NULL) is a no-op

    // Assignment releases the old coordinates; the copy is made first so that the object
    // never holds a dangling pointer
    DataPoint& operator= (const DataPoint& other) {
        if(this != &other) {
            double* fresh = copyCoords(other._x, other._D);
            free(_x);
            _x = fresh;
            _D = other._D;
            _ind = other._ind;
        }
        return *this;
    }
    int index() const { return _ind; }
    int dimensionality() const { return _D; }
    double x(int d) const { return _x[d]; }
};

// Euclidean distance between two points; only the first t1._D coordinates of both are compared.
// inline: the function is defined in a header and would otherwise be emitted once per including
// translation unit.
inline double euclidean_distance(const DataPoint &t1, const DataPoint &t2) {
    double dd = .0;
    const double* x1 = t1._x;
    const double* x2 = t2._x;
    for(int d = 0; d < t1._D; d++) {
        const double diff = (x1[d] - x2[d]);
        dd += diff * diff;
    }
    return sqrt(dd);
}
158 | Node* left; // points closer by than threshold 159 | Node* right; // points farther away than threshold 160 | 161 | Node() : 162 | index(0), threshold(0.), left(0), right(0) {} 163 | 164 | ~Node() { // destructor 165 | delete left; 166 | delete right; 167 | } 168 | }* _root; 169 | 170 | 171 | // An item on the intermediate result queue 172 | struct HeapItem { 173 | HeapItem( int index, double dist) : 174 | index(index), dist(dist) {} 175 | int index; 176 | double dist; 177 | bool operator<(const HeapItem& o) const { 178 | return dist < o.dist; 179 | } 180 | }; 181 | 182 | // Distance comparator for use in std::nth_element 183 | struct DistanceComparator 184 | { 185 | const T& item; 186 | DistanceComparator(const T& item) : item(item) {} 187 | bool operator()(const T& a, const T& b) { 188 | return distance(item, a) < distance(item, b); 189 | } 190 | }; 191 | 192 | // Function that (recursively) fills the tree 193 | Node* buildFromPoints( int lower, int upper ) 194 | { 195 | if (upper == lower) { // indicates that we're done here! 
196 | return NULL; 197 | } 198 | 199 | // Lower index is center of current node 200 | Node* node = new Node(); 201 | node->index = lower; 202 | 203 | if (upper - lower > 1) { // if we did not arrive at leaf yet 204 | 205 | // Choose an arbitrary point and move it to the start 206 | int i = (int) ((double)rand() / RAND_MAX * (upper - lower - 1)) + lower; 207 | std::swap(_items[lower], _items[i]); 208 | 209 | // Partition around the median distance 210 | int median = (upper + lower) / 2; 211 | std::nth_element(_items.begin() + lower + 1, 212 | _items.begin() + median, 213 | _items.begin() + upper, 214 | DistanceComparator(_items[lower])); 215 | 216 | // Threshold of the new node will be the distance to the median 217 | node->threshold = distance(_items[lower], _items[median]); 218 | 219 | // Recursively build tree 220 | node->index = lower; 221 | node->left = buildFromPoints(lower + 1, median); 222 | node->right = buildFromPoints(median, upper); 223 | } 224 | 225 | // Return result 226 | return node; 227 | } 228 | 229 | // Helper function that searches the tree 230 | void search(Node* node, const T& target, int k, std::priority_queue& heap) 231 | { 232 | if(node == NULL) return; // indicates that we're done here 233 | 234 | // Compute distance between target and current node 235 | double dist = distance(_items[node->index], target); 236 | 237 | // If current node within radius tau 238 | if(dist < _tau) { 239 | if(heap.size() == k) heap.pop(); // remove furthest node from result list (if we already have k results) 240 | heap.push(HeapItem(node->index, dist)); // add current node to result list 241 | if(heap.size() == k) _tau = heap.top().dist; // update value of tau (farthest point in result list) 242 | } 243 | 244 | // Return if we arrived at a leaf 245 | if(node->left == NULL && node->right == NULL) { 246 | return; 247 | } 248 | 249 | // If the target lies within the radius of ball 250 | if(dist < node->threshold) { 251 | if(dist - _tau <= node->threshold) { // if 
there can still be neighbors inside the ball, recursively search left child first 252 | search(node->left, target, k, heap); 253 | } 254 | 255 | if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child 256 | search(node->right, target, k, heap); 257 | } 258 | 259 | // If the target lies outsize the radius of the ball 260 | } else { 261 | if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child first 262 | search(node->right, target, k, heap); 263 | } 264 | 265 | if (dist - _tau <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child 266 | search(node->left, target, k, heap); 267 | } 268 | } 269 | } 270 | }; 271 | 272 | #endif 273 | --------------------------------------------------------------------------------