├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── test-coverage.yaml ├── tests ├── testthat.R └── testthat │ ├── test_neighbors.R │ └── test_Rtsne.R ├── tools └── example-1.png ├── .gitignore ├── .Rbuildignore ├── cran-comments.md ├── .travis.yml ├── src ├── Makevars.win ├── Makevars ├── normalize_input.cpp ├── datapoint.h ├── tsne.h ├── sptree.h ├── Rtsne.cpp ├── RcppExports.cpp ├── vptree.h ├── sptree.cpp └── tsne.cpp ├── NAMESPACE ├── NEWS.md ├── DESCRIPTION ├── R ├── RcppExports.R ├── neighbors.R ├── utils.R └── Rtsne.R ├── man ├── normalize_input.Rd └── Rtsne.Rd ├── inst └── CITATION ├── README.md ├── README.Rmd └── LICENSE /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_check("Rtsne") -------------------------------------------------------------------------------- /tools/example-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkrijthe/Rtsne/HEAD/tools/example-1.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.o 3 | 4 | *.Rproj 5 | 6 | *.so 7 | 8 | 9 | .Rhistory 10 | .Rproj.user 11 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^\.travis\.yml$ 5 | ^README\.Rmd$ 6 | ^README-.*\.png$ 7 | ^cran-comments\.md$ 8 | ^\.github$ 9 | ^CRAN-SUBMISSION$ 10 | -------------------------------------------------------------------------------- /cran-comments.md: 
-------------------------------------------------------------------------------- 1 | The main change is the use of USE_FC_LEN_T for calling Fortran BLAS to prepare for R 4.2.0, among with some small documentation and typo fixes. 2 | 3 | ## Test environments 4 | * local OS X install 5 | * win-builder (devel and release) 6 | * ubuntu linux (devel, release and oldrelease) on Github Actions 7 | 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | sudo: false 3 | cache: packages 4 | 5 | matrix: 6 | include: 7 | - r: release 8 | r_github_packages: r-lib/covr 9 | after_success: Rscript -e 'covr::codecov()' 10 | - r: devel 11 | - r: oldrel 12 | 13 | 14 | notifications: 15 | email: 16 | on_success: change 17 | on_failure: change -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | ## Use the R_HOME indirection to support installations of multiple R version 3 | ## PKG_LIBS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 4 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) 5 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(Rtsne,data.frame) 4 | S3method(Rtsne,default) 5 | S3method(Rtsne,dist) 6 | export(Rtsne) 7 | export(Rtsne_neighbors) 8 | export(normalize_input) 9 | import(Rcpp) 10 | importFrom(stats,model.matrix) 11 | importFrom(stats,na.fail) 12 | importFrom(stats,prcomp) 13 | useDynLib(Rtsne, .registration = TRUE) 14 | 
-------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # Rtsne 0.15 2 | * Substantial speed increase by fixing the possible embedding dimensionalities to 1, 2 or 3 3 | * Clarification of the licensing of different parts of the package 4 | * Support for using, more efficient, partial PCA (by Daniel Wells) 5 | * Made the normalization optional and the normalization function used available (by Aaron Lun) 6 | * Support for using precomputed nearest neighbour matrices (by Aaron Lun) 7 | * Added OpenMP support 8 | * Default verbose value is now the global setting (by Richard Cotton) 9 | * Added a `NEWS.md` file to track changes to the package. 10 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: Rtsne 2 | Type: Package 3 | Title: T-Distributed Stochastic Neighbor Embedding using a Barnes-Hut Implementation 4 | Version: 0.17 5 | Authors@R: c( 6 | person("Jesse", "Krijthe", ,"jkrijthe@gmail.com", role = c("aut", "cre")), 7 | person("Laurens", "van der Maaten", role = c("cph"), comment = "Author of original C++ code") 8 | ) 9 | Description: An R wrapper around the fast T-distributed Stochastic 10 | Neighbor Embedding implementation by Van der Maaten (see for more information on the original implementation). 
11 | License: file LICENSE 12 | URL: https://github.com/jkrijthe/Rtsne 13 | Encoding: UTF-8 14 | Imports: 15 | Rcpp (>= 0.11.0), 16 | stats 17 | LinkingTo: Rcpp 18 | Suggests: 19 | irlba, 20 | testthat 21 | RoxygenNote: 7.2.3 22 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | Rtsne_cpp <- function(X, no_dims, perplexity, theta, verbose, max_iter, distance_precomputed, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads) { 5 | .Call(`_Rtsne_Rtsne_cpp`, X, no_dims, perplexity, theta, verbose, max_iter, distance_precomputed, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads) 6 | } 7 | 8 | Rtsne_nn_cpp <- function(nn_dex, nn_dist, no_dims, perplexity, theta, verbose, max_iter, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads) { 9 | .Call(`_Rtsne_Rtsne_nn_cpp`, nn_dex, nn_dist, no_dims, perplexity, theta, verbose, max_iter, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads) 10 | } 11 | 12 | normalize_input_cpp <- function(input) { 13 | .Call(`_Rtsne_normalize_input_cpp`, input) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | ## Use the R_HOME indirection to support installations of multiple R version 2 | ## PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) 4 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 
5 | 6 | ## As an alternative, one can also add this code in a file 'configure' 7 | ## 8 | ## PKG_LIBS=`${R_HOME}/bin/Rscript -e "Rcpp:::LdFlags()"` 9 | ## 10 | ## sed -e "s|@PKG_LIBS@|${PKG_LIBS}|" \ 11 | ## src/Makevars.in > src/Makevars 12 | ## 13 | ## which together with the following file 'src/Makevars.in' 14 | ## 15 | ## PKG_LIBS = @PKG_LIBS@ 16 | ## 17 | ## can be used to create src/Makevars dynamically. This scheme is more 18 | ## powerful and can be expanded to also check for and link with other 19 | ## libraries. It should be complemented by a file 'cleanup' 20 | ## 21 | ## rm src/Makevars 22 | ## 23 | ## which removes the autogenerated file src/Makevars. 24 | ## 25 | ## Of course, autoconf can also be used to write configure files. This is 26 | ## done by a number of packages, but recommended only for more advanced users 27 | ## comfortable with autoconf and its related tools. 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/normalize_input.cpp: -------------------------------------------------------------------------------- 1 | #include "Rcpp.h" 2 | #include 3 | 4 | // Function that performs the matrix normalization. 5 | // [[Rcpp::export]] 6 | Rcpp::NumericMatrix normalize_input_cpp(Rcpp::NumericMatrix input) { 7 | // Rows are observations, columns are variables. 8 | Rcpp::NumericMatrix output=Rcpp::clone(input); 9 | const int N=output.nrow(), D=output.ncol(); 10 | 11 | // Running through each column and centering it. 
12 | Rcpp::NumericMatrix::iterator oIt=output.begin(); 13 | for (int d=0; d max_X) max_X = tmp; 32 | } 33 | for (oIt=output.begin(); oIt!=output.end(); ++oIt) { 34 | *oIt /= max_X; 35 | } 36 | return output; 37 | } 38 | -------------------------------------------------------------------------------- /man/normalize_input.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{normalize_input} 4 | \alias{normalize_input} 5 | \title{Normalize input data matrix} 6 | \usage{ 7 | normalize_input(X) 8 | } 9 | \arguments{ 10 | \item{X}{matrix; Input data matrix with rows as observations and columns as variables/dimensions.} 11 | } 12 | \value{ 13 | A numeric matrix of the same dimensions as \code{X} but centred by column and scaled to have a maximum deviation of 1. 14 | } 15 | \description{ 16 | Mean centers each column of an input data matrix so that it has a mean of zero. 17 | Scales the entire matrix so that the largest absolute of the centered matrix is equal to unity. 18 | } 19 | \details{ 20 | Normalization avoids numerical problems when the coordinates (and thus the distances between observations) are very large. 21 | Directly computing distances on this scale may lead to underflow when computing the probabilities in the t-SNE algorithm. 22 | Rescaling the input values mitigates these problems to some extent. 
23 | } 24 | \examples{ 25 | iris_unique <- unique(iris) # Remove duplicates 26 | iris_matrix <- as.matrix(iris_unique[,1:4]) 27 | X <- normalize_input(iris_matrix) 28 | colMeans(X) 29 | range(X) 30 | } 31 | \author{ 32 | Aaron Lun 33 | } 34 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype="Manual", 3 | title = "{Rtsne}: T-Distributed Stochastic Neighbor Embedding using Barnes-Hut 4 | Implementation", 5 | author = "Jesse H. Krijthe", 6 | year = "2015", 7 | note = sprintf("R package version %s", meta$Version), 8 | url = "https://github.com/jkrijthe/Rtsne", 9 | textVersion="Jesse H. Krijthe (2015). Rtsne: T-Distributed Stochastic Neighbor Embedding using a Barnes-Hut Implementation, URL: https://github.com/jkrijthe/Rtsne" 10 | ) 11 | 12 | bibentry( 13 | bibtype="Article", 14 | title="Visualizing High-Dimensional Data Using t-SNE", 15 | volume= "9", 16 | pages="2579-2605", 17 | year="2008", 18 | author="L.J.P. van der Maaten and G.E. Hinton", 19 | journal="Journal of Machine Learning Research", 20 | textVersion="L.J.P. van der Maaten and G.E. Hinton. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research 9(Nov):2579-2605, 2008.") 21 | 22 | bibentry( 23 | bibtype="Article", 24 | title="Accelerating t-SNE using Tree-Based Algorithms", 25 | volume= "15", 26 | pages="3221-3245", 27 | year="2014", 28 | author="L.J.P. van der Maaten", 29 | journal="Journal of Machine Learning Research", 30 | textVersion="L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research 15(Oct):3221-3245, 2014." 
31 | ) 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/datapoint.h: -------------------------------------------------------------------------------- 1 | #ifndef DATAPOINT_H 2 | #define DATAPOINT_H 3 | 4 | class DataPoint 5 | { 6 | int _ind; 7 | 8 | public: 9 | double* _x; 10 | int _D; 11 | DataPoint() { 12 | _D = 1; 13 | _ind = -1; 14 | _x = NULL; 15 | } 16 | DataPoint(int D, int ind, double* x) { 17 | _D = D; 18 | _ind = ind; 19 | _x = (double*) malloc(_D * sizeof(double)); 20 | for(int d = 0; d < _D; d++) _x[d] = x[d]; 21 | } 22 | DataPoint(const DataPoint& other) { // this makes a deep copy -- should not free anything 23 | if(this != &other) { 24 | _D = other.dimensionality(); 25 | _ind = other.index(); 26 | _x = (double*) malloc(_D * sizeof(double)); 27 | for(int d = 0; d < _D; d++) _x[d] = other.x(d); 28 | } 29 | } 30 | ~DataPoint() { if(_x != NULL) free(_x); } 31 | DataPoint& operator= (const DataPoint& other) { // asignment should free old object 32 | if(this != &other) { 33 | if(_x != NULL) free(_x); 34 | _D = other.dimensionality(); 35 | _ind = other.index(); 36 | _x = (double*) malloc(_D * sizeof(double)); 37 | for(int d = 0; d < _D; d++) _x[d] = other.x(d); 38 | } 39 | return *this; 40 | } 41 | int index() const { return _ind; } 42 | int dimensionality() const { return _D; } 43 | double x(int d) const { return _x[d]; } 44 | }; 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /R/neighbors.R: -------------------------------------------------------------------------------- 1 | #' @rdname Rtsne 2 | #' @export 3 | Rtsne_neighbors <- function(index, distance, dims=2, perplexity=30, theta=0.5, 4 | max_iter=1000,verbose=getOption("verbose", FALSE), 5 | Y_init=NULL, 6 | stop_lying_iter=ifelse(is.null(Y_init),250L,0L), 7 | mom_switch_iter=ifelse(is.null(Y_init),250L,0L), 8 | momentum=0.5, final_momentum=0.8, 9 | eta=200.0, exaggeration_factor=12.0, 
num_threads=1, ...) { 10 | 11 | if (!is.matrix(index)) { stop("Input index is not a matrix") } 12 | if (!identical(dim(index), dim(distance))) { stop("index and distance matrices should have the same dimensions") } 13 | R <- range(index) 14 | if (any(R < 1 | R > nrow(index) | !is.finite(R))) { stop("invalid indices") } 15 | tsne.args <- .check_tsne_params(nrow(index), dims=dims, perplexity=perplexity, theta=theta, max_iter=max_iter, verbose=verbose, 16 | Y_init=Y_init, stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter, 17 | momentum=momentum, final_momentum=final_momentum, eta=eta, exaggeration_factor=exaggeration_factor) 18 | 19 | # Transposing is necessary for fast column-major access to each sample, -1 for zero-indexing. 20 | out <- do.call(Rtsne_nn_cpp, c(list(nn_dex=t(index - 1L), nn_dist=t(distance), num_threads=num_threads), tsne.args)) 21 | out$Y <- t(out$Y) # Transposing back. 22 | c(list(N=nrow(index)), out, .clear_unwanted_params(tsne.args)) 23 | } 24 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v5 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 52 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: test-coverage.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | test-coverage: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | steps: 19 | - uses: actions/checkout@v5 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr, any::xml2 28 | needs: coverage 29 | 30 | - name: Test coverage 31 | run: | 32 | cov <- covr::package_coverage( 33 | quiet = FALSE, 34 | clean = FALSE, 35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 36 | ) 37 | print(cov) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v5 42 | with: 43 | # Fail if error if not on PR, or if on PR and token is given 44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} 45 | files: ./cobertura.xml 46 | plugins: noop 47 | disable_search: true 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | 50 | - name: Show testthat output 51 | if: always() 52 | run: | 53 | ## -------------------------------------------------------------------- 54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 55 | shell: bash 56 | 57 | - name: Upload test results 58 | if: failure() 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: coverage-test-failures 62 | path: ${{ runner.temp }}/package 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [![CRAN 5 | version](http://www.r-pkg.org/badges/version/Rtsne)](https://cran.r-project.org/package=Rtsne/) 6 | 
[![R-CMD-check](https://github.com/jkrijthe/Rtsne/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/jkrijthe/Rtsne/actions/workflows/R-CMD-check.yaml) 7 | [![codecov.io](https://codecov.io/github/jkrijthe/Rtsne/coverage.svg?branch=master)](https://app.codecov.io/github/jkrijthe/Rtsne?branch=master) 8 | [![CRAN mirror 9 | downloads](http://cranlogs.r-pkg.org/badges/Rtsne)](https://cran.r-project.org/package=Rtsne/) 10 | 11 | # R wrapper for Van der Maaten’s Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding 12 | 13 | ## Installation 14 | 15 | To install from CRAN: 16 | 17 | ``` r 18 | install.packages("Rtsne") # Install Rtsne package from CRAN 19 | ``` 20 | 21 | To install the latest version from the github repository, use: 22 | 23 | ``` r 24 | if(!require(devtools)) install.packages("devtools") # If not already installed 25 | devtools::install_github("jkrijthe/Rtsne") 26 | ``` 27 | 28 | ## Usage 29 | 30 | After installing the package, use the following code to run a simple 31 | example (to install, see below). 32 | 33 | ``` r 34 | library(Rtsne) # Load package 35 | iris_unique <- unique(iris) # Remove duplicates 36 | set.seed(42) # Sets seed for reproducibility 37 | tsne_out <- Rtsne(as.matrix(iris_unique[,1:4])) # Run TSNE 38 | plot(tsne_out$Y,col=iris_unique$Species,asp=1) # Plot the result 39 | ``` 40 | 41 | ![](tools/example-1.png) 42 | 43 | # Details 44 | 45 | This R package offers a wrapper around the Barnes-Hut TSNE C++ 46 | implementation of \[2\] \[3\]. Changes were made to the original code to 47 | allow it to function as an R package and to add additional functionality 48 | and speed improvements. 49 | 50 | # References 51 | 52 | \[1\] L.J.P. van der Maaten and G.E. Hinton. “Visualizing 53 | High-Dimensional Data Using t-SNE.” Journal of Machine Learning Research 54 | 9(Nov):2579-2605, 2008. 55 | 56 | \[2\] L.J.P van der Maaten. 
“Accelerating t-SNE using tree-based 57 | algorithms.” Journal of Machine Learning Research 15.1:3221-3245, 2014. 58 | 59 | \[3\] 60 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | github_document: 4 | html_preview: false 5 | --- 6 | 7 | 8 | 9 | ```{r, echo = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "README-" 14 | ) 15 | ``` 16 | [![CRAN version](http://www.r-pkg.org/badges/version/Rtsne)](https://cran.r-project.org/package=Rtsne/) 17 | [![R-CMD-check](https://github.com/jkrijthe/Rtsne/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/jkrijthe/Rtsne/actions/workflows/R-CMD-check.yaml) 18 | [![codecov.io](https://codecov.io/github/jkrijthe/Rtsne/coverage.svg?branch=master)](https://app.codecov.io/github/jkrijthe/Rtsne?branch=master) 19 | [![CRAN mirror downloads](http://cranlogs.r-pkg.org/badges/Rtsne)](https://cran.r-project.org/package=Rtsne/) 20 | 21 | 22 | # R wrapper for Van der Maaten's Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding 23 | 24 | ## Installation 25 | 26 | To install from CRAN: 27 | ```{r, eval = FALSE} 28 | install.packages("Rtsne") # Install Rtsne package from CRAN 29 | ``` 30 | To install the latest version from the github repository, use: 31 | ```{r, eval = FALSE} 32 | if(!require(devtools)) install.packages("devtools") # If not already installed 33 | devtools::install_github("jkrijthe/Rtsne") 34 | ``` 35 | 36 | 37 | ## Usage 38 | After installing the package, use the following code to run a simple example (to install, see below). 
39 | ```{r example, fig.path="tools/"} 40 | library(Rtsne) # Load package 41 | iris_unique <- unique(iris) # Remove duplicates 42 | set.seed(42) # Sets seed for reproducibility 43 | tsne_out <- Rtsne(as.matrix(iris_unique[,1:4])) # Run TSNE 44 | plot(tsne_out$Y,col=iris_unique$Species,asp=1) # Plot the result 45 | ``` 46 | 47 | 48 | # Details 49 | This R package offers a wrapper around the Barnes-Hut TSNE C++ implementation of [2] [3]. Changes were made to the original code to allow it to function as an R package and to add additional functionality and speed improvements. 50 | 51 | # References 52 | [1] L.J.P. van der Maaten and G.E. Hinton. "Visualizing High-Dimensional Data Using t-SNE." Journal of Machine Learning Research 9(Nov):2579-2605, 2008. 53 | 54 | [2] L.J.P van der Maaten. "Accelerating t-SNE using tree-based algorithms." Journal of Machine Learning Research 15.1:3221-3245, 2014. 55 | 56 | [3] https://lvdmaaten.github.io/tsne/ 57 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' Normalize input data matrix 2 | #' 3 | #' Mean centers each column of an input data matrix so that it has a mean of zero. 4 | #' Scales the entire matrix so that the largest absolute of the centered matrix is equal to unity. 5 | #' 6 | #' @param X matrix; Input data matrix with rows as observations and columns as variables/dimensions. 7 | #' 8 | #' @details 9 | #' Normalization avoids numerical problems when the coordinates (and thus the distances between observations) are very large. 10 | #' Directly computing distances on this scale may lead to underflow when computing the probabilities in the t-SNE algorithm. 11 | #' Rescaling the input values mitigates these problems to some extent. 12 | #' 13 | #' @return A numeric matrix of the same dimensions as \code{X} but centred by column and scaled to have a maximum deviation of 1. 
14 | #' 15 | #' @author 16 | #' Aaron Lun 17 | #' 18 | #' @examples 19 | #' iris_unique <- unique(iris) # Remove duplicates 20 | #' iris_matrix <- as.matrix(iris_unique[,1:4]) 21 | #' X <- normalize_input(iris_matrix) 22 | #' colMeans(X) 23 | #' range(X) 24 | #' @export 25 | normalize_input <- function(X) { 26 | # Using the original C++ code from bhtsne to do mean centering, even though it would be simple to do with sweep(). 27 | # This is because R's sums are more numerically precise, so for consistency with the original code, we need to use the naive C++ version. 28 | normalize_input_cpp(X) 29 | } 30 | 31 | is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) 32 | # Checks if the input is a whole number. 33 | { 34 | abs(x - round(x)) < tol 35 | } 36 | 37 | .check_tsne_params <- function(nsamples, dims, perplexity, theta, max_iter, verbose, Y_init, stop_lying_iter, mom_switch_iter, 38 | momentum, final_momentum, eta, exaggeration_factor) 39 | # Checks parameters for the t-SNE algorithm that are independent of 40 | # the format of the input data (e.g., distance matrix or neighbors). 
41 | { 42 | if (!is.wholenumber(dims) || dims < 1 || dims > 3) { stop("dims should be either 1, 2 or 3") } 43 | if (!is.wholenumber(max_iter) || !(max_iter>0)) { stop("number of iterations should be a positive integer")} 44 | if (!is.null(Y_init) && (nsamples!=nrow(Y_init) || ncol(Y_init)!=dims)) { stop("incorrect format for Y_init") } 45 | 46 | if (!is.numeric(perplexity) || perplexity <= 0) { stop("perplexity should be a positive number") } 47 | if (!is.numeric(theta) || (theta<0.0) || (theta>1.0) ) { stop("theta should lie in [0, 1]")} 48 | if (!is.wholenumber(stop_lying_iter) || stop_lying_iter<0) { stop("stop_lying_iter should be a positive integer")} 49 | if (!is.wholenumber(mom_switch_iter) || mom_switch_iter<0) { stop("mom_switch_iter should be a positive integer")} 50 | if (!is.numeric(momentum) || momentum < 0) { stop("momentum should be a positive number") } 51 | if (!is.numeric(final_momentum) || final_momentum < 0) { stop("final momentum should be a positive number") } 52 | if (!is.numeric(eta) || eta <= 0) { stop("eta should be a positive number") } 53 | if (!is.numeric(exaggeration_factor) || exaggeration_factor <= 0) { stop("exaggeration_factor should be a positive number")} 54 | 55 | if (nsamples - 1 < 3 * perplexity) { stop("perplexity is too large for the number of samples")} 56 | 57 | init <- !is.null(Y_init) 58 | if (init) { 59 | Y_init <- t(Y_init) # transposing for rapid column-major access. 60 | } else { 61 | Y_init <- matrix() 62 | } 63 | 64 | list(no_dims=dims, perplexity=perplexity, theta=theta, max_iter=max_iter, verbose=verbose, 65 | init=init, Y_in=Y_init, 66 | stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter, 67 | momentum=momentum, final_momentum=final_momentum, 68 | eta=eta, exaggeration_factor=exaggeration_factor) 69 | } 70 | 71 | .clear_unwanted_params <- function(args) 72 | # Removing parameters that we do not want to store in the output. 
73 | { 74 | args$Y_in <- args$init <- args$no_dims <- args$verbose <- NULL 75 | args 76 | } 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | All the files in this package are covered by a BSD-style licence, but with 2 | different attribution requirements and conditions for: 3 | * The following files in the src directory: tsne.cpp, tsne.h, datapoint.h, sptree.cpp, sptree.h vptree.h 4 | * The rest of the files 5 | 6 | ================================================================= 7 | 8 | The following license applies to the following files in the src directory: tsne.cpp, tsne.h, datapoint.h, sptree.cpp, sptree.h vptree.h 9 | 10 | Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 11 | All rights reserved. 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 2. Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 20 | 3. All advertising materials mentioning features or use of this software 21 | must display the following acknowledgement: 22 | This product includes software developed by the Delft University of Technology. 23 | 4. Neither the name of the Delft University of Technology nor the names of 24 | its contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 
26 | 27 | THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 28 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 29 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 30 | EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 33 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 35 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 36 | OF SUCH DAMAGE. 37 | 38 | ================================================================= 39 | 40 | The following license applies to the rest of the files: 41 | 42 | Copyright (c) 2014, Jesse Krijthe 43 | 44 | Redistribution and use in source and binary forms, with or without 45 | modification, are permitted provided that the following conditions are 46 | met: 47 | 48 | Redistributions of source code must retain the above copyright 49 | notice, this list of conditions and the following disclaimer. 50 | 51 | Redistributions in binary form must reproduce the above copyright 52 | notice, this list of conditions and the following disclaimer in 53 | the documentation and/or other materials provided with the 54 | distribution. 55 | 56 | Neither the name of Delft University of Technology nor the names 57 | of its contributors may be used to endorse or promote products derived 58 | from this software without specific prior written permission. 59 | 60 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 61 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 62 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 63 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 64 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 65 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 66 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 67 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 68 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 69 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 70 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 71 | -------------------------------------------------------------------------------- /src/tsne.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 
19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 30 | * 31 | */ 32 | 33 | #include "datapoint.h" 34 | #include 35 | 36 | #ifndef TSNE_H 37 | #define TSNE_H 38 | 39 | 40 | static inline double sign_tsne(double x) { return (x == .0 ? .0 : (x < .0 ? 
-1.0 : 1.0)); } 41 | 42 | template 43 | class TSNE 44 | { 45 | public: 46 | TSNE(double perplexity, double theta, bool verbose, int max_iter, bool init, int stop_lying_iter, 47 | int mom_switch_iter, double momentum, double final_momentum, double eta, double exaggeration_factor,int num_threads); 48 | 49 | void run(double* X, unsigned int N, int D, double* Y, bool distance_precomputed, double* cost, double* itercost); 50 | void run(const int* nn_index, const double* nn_dist, unsigned int N, int K, double* Y, double* cost, double* itercost); 51 | 52 | private: 53 | void symmetrizeMatrix(unsigned int N); 54 | void trainIterations(unsigned int N, double* Y, double* cost, double* itercost); 55 | 56 | void computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, unsigned int N, int D, double* dC, double theta); 57 | void computeExactGradient(double* P, double* Y, unsigned int N, int D, double* dC); 58 | double evaluateError(double* P, double* Y, unsigned int N, int D); 59 | double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta); 60 | void getCost(double* P, double* Y, unsigned int N, int D, double* costs); 61 | void getCost(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta, double* costs); 62 | void zeroMean(double* X, unsigned int N, int D); 63 | 64 | void computeGaussianPerplexity(double* X, unsigned int N, int D, bool distance_precomputed); 65 | template 66 | void computeGaussianPerplexity(double* X, unsigned int N, int D, int K); 67 | void computeGaussianPerplexity(const int* nn_dex, const double* nn_dist, unsigned int N, int K); 68 | void setupApproximateMemory(unsigned int N, int K); 69 | 70 | void computeProbabilities(const double perplexity, const int K, const double* distances, double* cur_P); 71 | void computeSquaredEuclideanDistance(double* X, unsigned int N, int D, double* DD); 72 | void 
computeSquaredEuclideanDistanceDirect(double* X, unsigned int N, int D, double* DD); 73 | 74 | double randn(); 75 | 76 | // Member variables. 77 | double perplexity, theta, momentum, final_momentum, eta, exaggeration_factor; 78 | int max_iter, stop_lying_iter, mom_switch_iter, num_threads; 79 | bool verbose, init, exact; 80 | 81 | std::vector row_P, col_P; 82 | std::vector val_P, P; 83 | }; 84 | 85 | #endif 86 | 87 | -------------------------------------------------------------------------------- /tests/testthat/test_neighbors.R: -------------------------------------------------------------------------------- 1 | context("Rtsne neighbor input") 2 | 3 | set.seed(101) 4 | # The original iris_matrix has a few tied distances, which alters the order of nearest neighbours. 5 | # This then alters the order of addition when computing various statistics; 6 | # which results in small rounding errors that are amplified across t-SNE iterations. 7 | # Hence, I have to simulate a case where no ties are possible. 8 | simdata <- matrix(rnorm(234*5), nrow=234) 9 | 10 | NNFUN <- function(D, K) 11 | # A quick reference function for computing NNs, to avoid depending on other packages. 12 | { 13 | all.indices <- matrix(0L, nrow(D), K) 14 | all.distances <- matrix(0, nrow(D), K) 15 | for (i in seq_len(nrow(D))) { 16 | current <- D[i,] 17 | by.dist <- setdiff(order(current), i) 18 | all.indices[i,] <- head(by.dist, ncol(all.indices)) 19 | all.distances[i,] <- current[all.indices[i,]] 20 | } 21 | list(index=all.indices, distance=all.distances) 22 | } 23 | 24 | test_that("Rtsne with nearest-neighbor input compares to distance matrix input", { 25 | D <- as.matrix(dist(simdata)) 26 | out <- NNFUN(D, 90) # 3 * perplexity 27 | all.indices <- out$index 28 | all.distances <- out$distance 29 | 30 | # The vptree involves a few R::runif calls, which alters the seed in the precomputed distance case. 31 | # This results in different random initialization of Y. 
32 | # Thus, we need to supply a pre-computed Y as well. 33 | Y_in <- matrix(runif(nrow(simdata)*2), ncol=2) 34 | out <- Rtsne_neighbors(all.indices, all.distances, Y_init=Y_in, perplexity=30, max_iter=1) 35 | blah <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1) 36 | expect_equal(out$Y, blah$Y) 37 | 38 | # Trying again with a different number of neighbors. 39 | Y_in <- matrix(runif(nrow(simdata)*2), ncol=2) 40 | out <- Rtsne_neighbors(all.indices[,1:30], all.distances[,1:30], Y_init=Y_in, perplexity=10) 41 | blah <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=10) 42 | expect_equal(out$Y, blah$Y) 43 | }) 44 | 45 | test_that("Rtsne with nearest-neighbor input behaves upon normalization", { 46 | D <- as.matrix(dist(normalize_input(simdata))) 47 | out <- NNFUN(D, 90) # 3 * perplexity 48 | all.indices <- out$index 49 | all.distances <- out$distance 50 | 51 | # The vptree involves a few R::runif calls, which alters the seed in the precomputed distance case. 52 | # This results in different random initialization of Y. 53 | # Thus, we need to supply a pre-computed Y as well. 54 | Y_in <- matrix(runif(nrow(simdata)*2), ncol=2) 55 | out <- Rtsne_neighbors(all.indices, all.distances, Y_init=Y_in, perplexity=30, max_iter=1) 56 | blah <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1) 57 | expect_equal(out$Y, blah$Y) 58 | 59 | # Trying again with a different number of neighbors. 
60 | Y_in <- matrix(runif(nrow(simdata)*2), ncol=2) 61 | out <- Rtsne_neighbors(all.indices[,1:30], all.distances[,1:30], Y_init=Y_in, perplexity=10) 62 | blah <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=10) 63 | expect_equal(out$Y, blah$Y) 64 | }) 65 | 66 | test_that("Rtsne with nearest-neighbor input gives no errors for no_dims 1 and 3", { 67 | D <- as.matrix(dist(simdata)) 68 | out <- NNFUN(D, 90) # 3 * perplexity 69 | all.indices <- out$index 70 | all.distances <- out$distance 71 | 72 | # The vptree involves a few R::runif calls, which alters the seed in the precomputed distance case. 73 | # This results in different random initialization of Y. 74 | # Thus, we need to supply a pre-computed Y as well. 75 | Y_in <- matrix(runif(nrow(simdata)*1), ncol=1) 76 | out <- Rtsne_neighbors(all.indices, all.distances, Y_init=Y_in, perplexity=30, max_iter=1, dims = 1) 77 | blah <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1,dims = 1) 78 | expect_equal(out$Y, blah$Y) 79 | 80 | Y_in <- matrix(runif(nrow(simdata)*3), ncol=3) 81 | out <- Rtsne_neighbors(all.indices, all.distances, Y_init=Y_in, perplexity=30, max_iter=1, dims = 3) 82 | blah <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1,dims = 3) 83 | expect_equal(out$Y, blah$Y) 84 | }) 85 | 86 | test_that("error conditions are correctly explored", { 87 | expect_error(Rtsne_neighbors("yay", matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "matrix") 88 | expect_error(Rtsne_neighbors(matrix(0L, 50, 5), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "same dimensions") 89 | expect_error(Rtsne_neighbors(matrix(0L, 50, 10), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "invalid indices") 90 | expect_error(Rtsne_neighbors(matrix(51L, 50, 10), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "invalid indices") 91 | expect_error(Rtsne_neighbors(matrix(NA_real_, 50, 10), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "invalid indices") 92 | }) 93 | 
-------------------------------------------------------------------------------- /src/sptree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 30 | * 31 | */ 32 | 33 | #ifndef SPTREE_H 34 | #define SPTREE_H 35 | 36 | using namespace std; 37 | 38 | static inline double max_tsne(double x, double y) { return (x <= y ? y : x); } 39 | 40 | template 41 | class Cell { 42 | double corner[NDims]; 43 | double width[NDims]; 44 | 45 | 46 | public: 47 | Cell(); 48 | Cell(double* inp_corner, double* inp_width); 49 | ~Cell(); 50 | 51 | double getCorner(unsigned int d) const; 52 | double getWidth(unsigned int d) const; 53 | void setCorner(unsigned int d, double val); 54 | void setWidth(unsigned int d, double val); 55 | bool containsPoint(double point[]) const; 56 | }; 57 | 58 | template 59 | class SPTree 60 | { 61 | public: 62 | enum { no_children = 2 * SPTree::no_children }; 63 | 64 | private: 65 | // Fixed constants 66 | static const unsigned int QT_NODE_CAPACITY = 1; 67 | 68 | // Properties of this node in the tree 69 | SPTree* parent; 70 | unsigned int dimension; 71 | bool is_leaf; 72 | unsigned int size; 73 | unsigned int cum_size; 74 | int num_threads; 75 | 76 | // Axis-aligned bounding box stored as a center with half-dimensions to represent the boundaries of this quad tree 77 | Cell boundary; 78 | 79 | // Indices in this space-partitioning tree node, corresponding center-of-mass, and list of all children 80 | double* data; 81 | double center_of_mass[NDims]; 82 | unsigned int index[QT_NODE_CAPACITY]; 83 | 84 | // Children 85 | SPTree* children[no_children]; 86 | 87 | public: 
88 | SPTree(double* inp_data, unsigned int N); 89 | SPTree(double* inp_data, double* inp_corner, double* inp_width); 90 | SPTree(double* inp_data, unsigned int N, double* inp_corner, double* inp_width); 91 | SPTree(SPTree* inp_parent, double* inp_data, unsigned int N, double* inp_corner, double* inp_width); 92 | SPTree(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width); 93 | ~SPTree(); 94 | void setData(double* inp_data); 95 | SPTree* getParent(); 96 | void construct(Cell boundary); 97 | bool insert(unsigned int new_index); 98 | void subdivide(); 99 | bool isCorrect(); 100 | void rebuildTree(); 101 | void getAllIndices(unsigned int* indices); 102 | unsigned int getDepth(); 103 | double computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[]) const; 104 | void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, unsigned int N, double* pos_f, int num_threads) const; 105 | void print(); 106 | 107 | private: 108 | void init(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width); 109 | void fill(unsigned int N); 110 | unsigned int getAllIndices(unsigned int* indices, unsigned int loc); 111 | bool isChild(unsigned int test_index, unsigned int start, unsigned int end); 112 | }; 113 | 114 | template <> 115 | struct SPTree<0> 116 | { 117 | enum { no_children = 1 }; 118 | }; 119 | 120 | #endif -------------------------------------------------------------------------------- /src/Rtsne.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "tsne.h" 3 | using namespace Rcpp; 4 | 5 | Rcpp::List save_results(int N, int no_dims, const std::vector& Y, const std::vector& costs, const std::vector& itercosts, 6 | double theta, double perplexity, int D, int stop_lying_iter, int mom_switch_iter, 7 | double momentum, double final_momentum, double eta, double exaggeration_factor); 8 | 9 | // Function that runs the Barnes-Hut implementation of 
t-SNE 10 | // [[Rcpp::export]] 11 | Rcpp::List Rtsne_cpp(NumericMatrix X, int no_dims, double perplexity, 12 | double theta, bool verbose, int max_iter, 13 | bool distance_precomputed, NumericMatrix Y_in, bool init, 14 | int stop_lying_iter, int mom_switch_iter, 15 | double momentum, double final_momentum, 16 | double eta, double exaggeration_factor, unsigned int num_threads) { 17 | 18 | size_t N = X.ncol(), D = X.nrow(); 19 | double * data=X.begin(); 20 | 21 | if (verbose) Rprintf("Read the %lu x %lu data matrix successfully!\n", (unsigned long)N, (unsigned long)D); 22 | std::vector Y(N * no_dims), costs(N), itercosts(static_cast(std::ceil(max_iter/50.0))); 23 | 24 | // Providing user-supplied solution. 25 | if (init) { 26 | for (size_t i = 0; i < Y.size(); i++) Y[i] = Y_in[i]; 27 | if (verbose) Rprintf("Using user supplied starting positions\n"); 28 | } 29 | 30 | // Run tsne 31 | if (no_dims==1) { 32 | TSNE<1> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter, 33 | momentum, final_momentum, eta, exaggeration_factor, num_threads); 34 | tsne.run(data, N, D, Y.data(), distance_precomputed, costs.data(), itercosts.data()); 35 | } else if (no_dims==2) { 36 | TSNE<2> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter, 37 | momentum, final_momentum, eta, exaggeration_factor, num_threads); 38 | tsne.run(data, N, D, Y.data(), distance_precomputed, costs.data(), itercosts.data()); 39 | } else if (no_dims==3) { 40 | TSNE<3> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter, 41 | momentum, final_momentum, eta, exaggeration_factor, num_threads); 42 | tsne.run(data, N, D, Y.data(), distance_precomputed, costs.data(), itercosts.data()); 43 | } else { 44 | Rcpp::stop("Only 1, 2 or 3 dimensional output is suppported.\n"); 45 | } 46 | 47 | return Rcpp::List::create(Rcpp::_["Y"]=Rcpp::NumericMatrix(no_dims, N, Y.data()), 48 | Rcpp::_["costs"]=Rcpp::NumericVector(costs.begin(), 
costs.end()), 49 | Rcpp::_["itercosts"]=Rcpp::NumericVector(itercosts.begin(), itercosts.end())); 50 | } 51 | 52 | // Function that runs the Barnes-Hut implementation of t-SNE on nearest neighbor results. 53 | // [[Rcpp::export]] 54 | Rcpp::List Rtsne_nn_cpp(IntegerMatrix nn_dex, NumericMatrix nn_dist, 55 | int no_dims, double perplexity, 56 | double theta, bool verbose, int max_iter, 57 | NumericMatrix Y_in, bool init, 58 | int stop_lying_iter, int mom_switch_iter, 59 | double momentum, double final_momentum, 60 | double eta, double exaggeration_factor, unsigned int num_threads) { 61 | 62 | size_t N = nn_dex.ncol(), K=nn_dex.nrow(); // transposed - columns are points, rows are neighbors. 63 | if (verbose) Rprintf("Read the NN results for %lu points successfully!\n", (unsigned long)N); 64 | std::vector Y(N * no_dims), costs(N), itercosts(static_cast(std::ceil(max_iter/50.0))); 65 | 66 | // Providing user-supplied solution. 67 | if (init) { 68 | for (size_t i = 0; i < Y.size(); i++) Y[i] = Y_in[i]; 69 | if (verbose) Rprintf("Using user supplied starting positions\n"); 70 | } 71 | 72 | // Run tsne 73 | if (no_dims==1) { 74 | TSNE<1> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter, 75 | momentum, final_momentum, eta, exaggeration_factor, num_threads); 76 | tsne.run(nn_dex.begin(), nn_dist.begin(), N, K, Y.data(), costs.data(), itercosts.data()); 77 | } else if (no_dims==2) { 78 | TSNE<2> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter, 79 | momentum, final_momentum, eta, exaggeration_factor, num_threads); 80 | tsne.run(nn_dex.begin(), nn_dist.begin(), N, K, Y.data(), costs.data(), itercosts.data()); 81 | } else if (no_dims==3) { 82 | TSNE<3> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter, 83 | momentum, final_momentum, eta, exaggeration_factor, num_threads); 84 | tsne.run(nn_dex.begin(), nn_dist.begin(), N, K, Y.data(), costs.data(), itercosts.data()); 85 | } else { 
86 | Rcpp::stop("Only 1, 2 or 3 dimensional output is suppported.\n"); 87 | } 88 | 89 | return Rcpp::List::create(Rcpp::_["Y"]=Rcpp::NumericMatrix(no_dims, N, Y.data()), 90 | Rcpp::_["costs"]=Rcpp::NumericVector(costs.begin(), costs.end()), 91 | Rcpp::_["itercosts"]=Rcpp::NumericVector(itercosts.begin(), itercosts.end())); 92 | } 93 | 94 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // Rtsne_cpp 14 | Rcpp::List Rtsne_cpp(NumericMatrix X, int no_dims, double perplexity, double theta, bool verbose, int max_iter, bool distance_precomputed, NumericMatrix Y_in, bool init, int stop_lying_iter, int mom_switch_iter, double momentum, double final_momentum, double eta, double exaggeration_factor, unsigned int num_threads); 15 | RcppExport SEXP _Rtsne_Rtsne_cpp(SEXP XSEXP, SEXP no_dimsSEXP, SEXP perplexitySEXP, SEXP thetaSEXP, SEXP verboseSEXP, SEXP max_iterSEXP, SEXP distance_precomputedSEXP, SEXP Y_inSEXP, SEXP initSEXP, SEXP stop_lying_iterSEXP, SEXP mom_switch_iterSEXP, SEXP momentumSEXP, SEXP final_momentumSEXP, SEXP etaSEXP, SEXP exaggeration_factorSEXP, SEXP num_threadsSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< NumericMatrix >::type X(XSEXP); 20 | Rcpp::traits::input_parameter< int >::type no_dims(no_dimsSEXP); 21 | Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP); 22 | Rcpp::traits::input_parameter< double >::type theta(thetaSEXP); 23 | 
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 24 | Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP); 25 | Rcpp::traits::input_parameter< bool >::type distance_precomputed(distance_precomputedSEXP); 26 | Rcpp::traits::input_parameter< NumericMatrix >::type Y_in(Y_inSEXP); 27 | Rcpp::traits::input_parameter< bool >::type init(initSEXP); 28 | Rcpp::traits::input_parameter< int >::type stop_lying_iter(stop_lying_iterSEXP); 29 | Rcpp::traits::input_parameter< int >::type mom_switch_iter(mom_switch_iterSEXP); 30 | Rcpp::traits::input_parameter< double >::type momentum(momentumSEXP); 31 | Rcpp::traits::input_parameter< double >::type final_momentum(final_momentumSEXP); 32 | Rcpp::traits::input_parameter< double >::type eta(etaSEXP); 33 | Rcpp::traits::input_parameter< double >::type exaggeration_factor(exaggeration_factorSEXP); 34 | Rcpp::traits::input_parameter< unsigned int >::type num_threads(num_threadsSEXP); 35 | rcpp_result_gen = Rcpp::wrap(Rtsne_cpp(X, no_dims, perplexity, theta, verbose, max_iter, distance_precomputed, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads)); 36 | return rcpp_result_gen; 37 | END_RCPP 38 | } 39 | // Rtsne_nn_cpp 40 | Rcpp::List Rtsne_nn_cpp(IntegerMatrix nn_dex, NumericMatrix nn_dist, int no_dims, double perplexity, double theta, bool verbose, int max_iter, NumericMatrix Y_in, bool init, int stop_lying_iter, int mom_switch_iter, double momentum, double final_momentum, double eta, double exaggeration_factor, unsigned int num_threads); 41 | RcppExport SEXP _Rtsne_Rtsne_nn_cpp(SEXP nn_dexSEXP, SEXP nn_distSEXP, SEXP no_dimsSEXP, SEXP perplexitySEXP, SEXP thetaSEXP, SEXP verboseSEXP, SEXP max_iterSEXP, SEXP Y_inSEXP, SEXP initSEXP, SEXP stop_lying_iterSEXP, SEXP mom_switch_iterSEXP, SEXP momentumSEXP, SEXP final_momentumSEXP, SEXP etaSEXP, SEXP exaggeration_factorSEXP, SEXP num_threadsSEXP) { 42 | BEGIN_RCPP 43 | Rcpp::RObject rcpp_result_gen; 
44 | Rcpp::RNGScope rcpp_rngScope_gen; 45 | Rcpp::traits::input_parameter< IntegerMatrix >::type nn_dex(nn_dexSEXP); 46 | Rcpp::traits::input_parameter< NumericMatrix >::type nn_dist(nn_distSEXP); 47 | Rcpp::traits::input_parameter< int >::type no_dims(no_dimsSEXP); 48 | Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP); 49 | Rcpp::traits::input_parameter< double >::type theta(thetaSEXP); 50 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 51 | Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP); 52 | Rcpp::traits::input_parameter< NumericMatrix >::type Y_in(Y_inSEXP); 53 | Rcpp::traits::input_parameter< bool >::type init(initSEXP); 54 | Rcpp::traits::input_parameter< int >::type stop_lying_iter(stop_lying_iterSEXP); 55 | Rcpp::traits::input_parameter< int >::type mom_switch_iter(mom_switch_iterSEXP); 56 | Rcpp::traits::input_parameter< double >::type momentum(momentumSEXP); 57 | Rcpp::traits::input_parameter< double >::type final_momentum(final_momentumSEXP); 58 | Rcpp::traits::input_parameter< double >::type eta(etaSEXP); 59 | Rcpp::traits::input_parameter< double >::type exaggeration_factor(exaggeration_factorSEXP); 60 | Rcpp::traits::input_parameter< unsigned int >::type num_threads(num_threadsSEXP); 61 | rcpp_result_gen = Rcpp::wrap(Rtsne_nn_cpp(nn_dex, nn_dist, no_dims, perplexity, theta, verbose, max_iter, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads)); 62 | return rcpp_result_gen; 63 | END_RCPP 64 | } 65 | // normalize_input_cpp 66 | Rcpp::NumericMatrix normalize_input_cpp(Rcpp::NumericMatrix input); 67 | RcppExport SEXP _Rtsne_normalize_input_cpp(SEXP inputSEXP) { 68 | BEGIN_RCPP 69 | Rcpp::RObject rcpp_result_gen; 70 | Rcpp::RNGScope rcpp_rngScope_gen; 71 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type input(inputSEXP); 72 | rcpp_result_gen = Rcpp::wrap(normalize_input_cpp(input)); 73 | return rcpp_result_gen; 74 | END_RCPP 
75 | } 76 | 77 | static const R_CallMethodDef CallEntries[] = { 78 | {"_Rtsne_Rtsne_cpp", (DL_FUNC) &_Rtsne_Rtsne_cpp, 16}, 79 | {"_Rtsne_Rtsne_nn_cpp", (DL_FUNC) &_Rtsne_Rtsne_nn_cpp, 16}, 80 | {"_Rtsne_normalize_input_cpp", (DL_FUNC) &_Rtsne_normalize_input_cpp, 1}, 81 | {NULL, NULL, 0} 82 | }; 83 | 84 | RcppExport void R_init_Rtsne(DllInfo *dll) { 85 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 86 | R_useDynamicSymbols(dll, FALSE); 87 | } 88 | -------------------------------------------------------------------------------- /tests/testthat/test_Rtsne.R: -------------------------------------------------------------------------------- 1 | context("Rtsne main function") 2 | 3 | # Prepare iris dataset 4 | iris_unique <- unique(iris) # Remove duplicates 5 | Xscale<-normalize_input(as.matrix(iris_unique[,1:4])) 6 | distmat <- as.matrix(dist(Xscale)) 7 | 8 | # Run models to compare to 9 | iter_equal <- 500 10 | 11 | test_that("Scaling gives the expected result", { 12 | Xscale2 <- scale(as.matrix(iris_unique[,1:4]), 13 | center = TRUE, scale=FALSE) 14 | Xscale2 <- scale(Xscale, 15 | center=FALSE, 16 | scale=rep(max(abs(Xscale)),4)) 17 | expect_equivalent(Xscale,Xscale2) 18 | }) 19 | 20 | test_that("Manual distance calculation equals C++ distance calculation", { 21 | 22 | # Does not work on 32 bit windows 23 | skip_on_cran() 24 | 25 | # Exact 26 | set.seed(50) 27 | tsne_matrix <- Rtsne(as.matrix(iris_unique[,1:4]),verbose=FALSE, 28 | is_distance = FALSE,theta=0.0,max_iter=iter_equal, 29 | pca=FALSE, normalize=TRUE) 30 | set.seed(50) 31 | tsne_dist <- Rtsne(distmat, verbose=FALSE, is_distance = TRUE, 32 | theta = 0.0, max_iter=iter_equal) 33 | expect_equal(tsne_matrix$Y,tsne_dist$Y) 34 | 35 | # Inexact 36 | set.seed(50) 37 | tsne_matrix <- Rtsne(as.matrix(iris_unique[,1:4]),verbose=FALSE, is_distance = FALSE, 38 | theta=0.1,max_iter=iter_equal,pca=FALSE) 39 | set.seed(50) 40 | tsne_dist <- Rtsne(distmat, verbose=FALSE, is_distance = TRUE, theta = 0.1, 
max_iter=iter_equal, 41 | pca=FALSE) 42 | expect_equal(tsne_matrix$Y,tsne_dist$Y) 43 | }) 44 | 45 | test_that("Accepts dist", { 46 | 47 | # Exact 48 | set.seed(50) 49 | tsne_out_dist_matrix <- Rtsne(distmat, is_distance = TRUE, theta=0.0, max_iter=iter_equal) 50 | set.seed(50) 51 | tsne_out_dist <- Rtsne(dist(Xscale),theta=0.0,max_iter=iter_equal) 52 | expect_equal(tsne_out_dist$Y,tsne_out_dist_matrix$Y) 53 | 54 | # Inexact 55 | set.seed(50) 56 | tsne_out_dist_matrix <- Rtsne(distmat, is_distance = TRUE, theta=0.1, max_iter=iter_equal) 57 | set.seed(50) 58 | tsne_out_dist <- Rtsne(dist(Xscale), theta=0.1, max_iter=iter_equal) 59 | expect_equal(tsne_out_dist$Y,tsne_out_dist_matrix$Y) 60 | }) 61 | 62 | test_that("Accepts data.frame", { 63 | 64 | # Exact 65 | set.seed(50) 66 | tsne_out_matrix <- Rtsne(as.matrix(iris_unique[,1:4]),dims=1,verbose=FALSE, is_distance = FALSE,theta=0.0,max_iter=iter_equal,pca=FALSE) 67 | set.seed(50) 68 | tsne_out_df <- Rtsne(iris_unique[,1:4],dims=1,verbose=FALSE, is_distance = FALSE,theta=0.0,pca=FALSE,max_iter=iter_equal,num_threads=1) 69 | expect_equal(tsne_out_matrix$Y,tsne_out_df$Y) 70 | 71 | # Inexact 72 | set.seed(50) 73 | tsne_out_matrix_bh <- Rtsne(as.matrix(iris_unique[,1:4]),verbose=FALSE, is_distance = FALSE,theta=0.1,pca=FALSE,max_iter=iter_equal) 74 | set.seed(50) 75 | tsne_out_df <- Rtsne(iris_unique[,1:4],verbose=FALSE, is_distance = FALSE,theta=0.1,pca=FALSE,max_iter=iter_equal,num_threads=1) 76 | expect_equal(tsne_out_matrix_bh$Y,tsne_out_df$Y) 77 | }) 78 | 79 | test_that("OpenMP with different threads returns same result",{ 80 | 81 | # Does not work on windows 82 | skip_on_cran() 83 | skip_on_ci() 84 | 85 | set.seed(50) 86 | tsne_out_df1 <- Rtsne(iris_unique[,1:4],dims=3,verbose=FALSE, is_distance = FALSE, 87 | theta=0.1,pca=FALSE,max_iter=iter_equal,num_threads=1) 88 | set.seed(50) 89 | tsne_out_df2 <- Rtsne(iris_unique[,1:4],dims=3,verbose=FALSE, is_distance = FALSE, 90 | 
theta=0.1,pca=FALSE,max_iter=iter_equal,num_threads=2) 91 | set.seed(50) 92 | tsne_out_df3 <- Rtsne(iris_unique[,1:4],dims=3,verbose=FALSE, is_distance = FALSE, 93 | theta=0.1,pca=FALSE,max_iter=iter_equal,num_threads=3) 94 | expect_equal(tsne_out_df1$Y,tsne_out_df2$Y) 95 | expect_equal(tsne_out_df2$Y,tsne_out_df3$Y) 96 | }) 97 | 98 | test_that("Continuing from initialization gives approximately the same result as direct run", { 99 | 100 | # Does not work exactly due to resetting of "gains". 101 | iter_equal <- 1000 102 | extra_iter <- 200 103 | 104 | #Exact 105 | set.seed(50) 106 | tsne_out_full <- Rtsne(iris_unique[,1:4], 107 | perplexity=3,theta=0.0,pca=FALSE, 108 | max_iter=iter_equal,final_momentum = 0) 109 | set.seed(50) 110 | tsne_out_part1 <- Rtsne(iris_unique[,1:4], 111 | perplexity=3,theta=0.0,pca=FALSE, 112 | max_iter=iter_equal-extra_iter,final_momentum = 0) 113 | tsne_out_part2 <- Rtsne(iris_unique[,1:4], 114 | perplexity=3,theta=0.0,pca=FALSE, 115 | max_iter=extra_iter,Y_init=tsne_out_part1$Y,final_momentum = 0) 116 | expect_equivalent(dist(tsne_out_full$Y),dist(tsne_out_part2$Y),tolerance=0.01) 117 | 118 | #Inexact 119 | set.seed(50) 120 | tsne_out_full <- Rtsne(iris_unique[,1:4],final_momentum=0,theta=0.1,pca=FALSE,max_iter=iter_equal) 121 | set.seed(50) 122 | tsne_out_part1 <- Rtsne(iris_unique[,1:4],final_momentum=0,theta=0.1,pca=FALSE,max_iter=iter_equal-extra_iter) 123 | set.seed(50) 124 | tsne_out_part2 <- Rtsne(iris_unique[,1:4],final_momentum=0,theta=0.1,pca=FALSE,max_iter=extra_iter,Y_init=tsne_out_part1$Y) 125 | expect_equivalent(dist(tsne_out_full$Y),dist(tsne_out_part2$Y),tolerance=0.01) 126 | }) 127 | 128 | test_that("partial_pca FALSE and TRUE give similar results", { 129 | skip_if_not_installed("irlba") 130 | 131 | # Only first few iterations 132 | iter_equal <- 5 133 | 134 | set.seed(42) 135 | fat_data <- rbind(sapply(runif(200,-1,1), function(x) rnorm(200,x)), 136 | sapply(runif(200,-1,1), function(x) rnorm(200,x))) 137 | 138 | 
set.seed(42) 139 | tsne_out_prcomp <- Rtsne(fat_data, max_iter = iter_equal) 140 | 141 | set.seed(42) 142 | tsne_out_irlba <- Rtsne(fat_data, partial_pca = T, max_iter = iter_equal) 143 | 144 | # Sign of principal components are arbitrary so even with same seed tSNE coordinates are not the same 145 | expect_equal(tsne_out_prcomp$costs, tsne_out_irlba$costs, tolerance = .01, scale = 1) 146 | }) 147 | 148 | test_that("Error conditions", { 149 | expect_error(Rtsne("test", matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "matrix") 150 | expect_error(Rtsne(distmat,is_distance = 3),"logical") 151 | expect_error(Rtsne(matrix(0,2,3),is_distance = TRUE),"Input") 152 | expect_error(Rtsne(matrix(0,100,3)),"duplicates") 153 | expect_error(Rtsne(matrix(0,2,3),pca_center = 2),"TRUE") 154 | expect_error(Rtsne(matrix(0,2,3),initial_dims=1.3),"dimensionality") 155 | expect_error(Rtsne(matrix(0,2,3),dims=4),"dims") 156 | expect_error(Rtsne(matrix(0,2,3),max_iter=1.5),"should") 157 | 158 | expect_error(Rtsne(matrix(0,2,3),Y_init=matrix(0,2,1)),"incorrect format") 159 | expect_error(Rtsne(matrix(0,2,3),perplexity = 0),"positive") 160 | expect_error(Rtsne(matrix(0,2,3),theta = -0.1),"lie") 161 | expect_error(Rtsne(matrix(0,2,3),theta = 1.001),"lie") 162 | expect_error(Rtsne(matrix(0,2,3),stop_lying_iter = -1),"positive") 163 | expect_error(Rtsne(matrix(0,2,3),mom_switch_iter = -1),"positive") 164 | expect_error(Rtsne(matrix(0,2,3),momentum = -0.1),"positive") 165 | expect_error(Rtsne(matrix(0,2,3),final_momentum = -0.1),"positive") 166 | expect_error(Rtsne(matrix(0,2,3),eta = 0.0),"positive") 167 | expect_error(Rtsne(matrix(0,2,3),exaggeration_factor = 0.0),"positive") 168 | expect_error(Rtsne(matrix(0,2,3)),"perplexity is too large") 169 | }) 170 | 171 | test_that("Verbose option", { 172 | expect_output(Rtsne(iris_unique[,1:4],pca=TRUE,verbose=TRUE,max_iter=150),"Fitting performed") 173 | }) 174 | -------------------------------------------------------------------------------- 
/src/vptree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 
30 | * 31 | */ 32 | 33 | 34 | /* This code was adopted with minor modifications from Steve Hanov's great tutorial at http://stevehanov.ca/blog/index.php?id=130 */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include "datapoint.h" 45 | 46 | #ifndef VPTREE_H 47 | #define VPTREE_H 48 | 49 | double euclidean_distance(const DataPoint &t1, const DataPoint &t2) { 50 | double dd = .0; 51 | for(int d = 0; d < t1.dimensionality(); d++) dd += (t1.x(d) - t2.x(d)) * (t1.x(d) - t2.x(d)); 52 | return sqrt(dd); 53 | } 54 | 55 | double precomputed_distance(const DataPoint &t1, const DataPoint &t2) { 56 | double dd = .0; 57 | dd = t1.x(t2.index()); 58 | return dd; 59 | } 60 | 61 | template 62 | class VpTree 63 | { 64 | public: 65 | 66 | // Default constructor 67 | VpTree() : _root(0) {} 68 | 69 | // Destructor 70 | ~VpTree() { 71 | delete _root; 72 | } 73 | 74 | // Function to create a new VpTree from data 75 | void create(const std::vector& items) { 76 | delete _root; 77 | _items = items; 78 | _root = buildFromPoints(0, items.size()); 79 | } 80 | 81 | // Function that uses the tree to find the k nearest neighbors of target 82 | void search(const T& target, int k, std::vector* results, std::vector* distances) 83 | { 84 | 85 | // Use a priority queue to store intermediate results on 86 | std::priority_queue heap; 87 | 88 | // Variable that tracks the distance to the farthest point in our results 89 | double tau = DBL_MAX; 90 | 91 | // Perform the search 92 | search(_root, target, k, heap, &tau); 93 | 94 | // Gather final results 95 | results->clear(); distances->clear(); 96 | while(!heap.empty()) { 97 | results->push_back(_items[heap.top().index]); 98 | distances->push_back(heap.top().dist); 99 | heap.pop(); 100 | } 101 | 102 | // Results are in reverse order 103 | std::reverse(results->begin(), results->end()); 104 | std::reverse(distances->begin(), distances->end()); 105 | } 106 | 107 | private: 108 | 
std::vector _items; 109 | 110 | // Single node of a VP tree (has a point and radius; left children are closer to point than the radius) 111 | struct Node 112 | { 113 | int index; // index of point in node 114 | double threshold; // radius(?) 115 | Node* left; // points closer by than threshold 116 | Node* right; // points farther away than threshold 117 | 118 | Node() : 119 | index(0), threshold(0.), left(0), right(0) {} 120 | 121 | ~Node() { // destructor 122 | delete left; 123 | delete right; 124 | } 125 | }* _root; 126 | 127 | 128 | // An item on the intermediate result queue 129 | struct HeapItem { 130 | HeapItem( int index, double dist) : 131 | index(index), dist(dist) {} 132 | int index; 133 | double dist; 134 | bool operator<(const HeapItem& o) const { 135 | return dist < o.dist; 136 | } 137 | }; 138 | 139 | // Distance comparator for use in std::nth_element 140 | struct DistanceComparator 141 | { 142 | const T& item; 143 | DistanceComparator(const T& item) : item(item) {} 144 | bool operator()(const T& a, const T& b) { 145 | return distance(item, a) < distance(item, b); 146 | } 147 | }; 148 | 149 | // Function that (recursively) fills the tree 150 | Node* buildFromPoints( int lower, int upper ) 151 | { 152 | if (upper == lower) { // indicates that we're done here! 
153 | return NULL; 154 | } 155 | 156 | // Lower index is center of current node 157 | Node* node = new Node(); 158 | node->index = lower; 159 | 160 | if (upper - lower > 1) { // if we did not arrive at leaf yet 161 | 162 | // Choose an arbitrary point and move it to the start 163 | Rcpp::RNGScope scope; 164 | int i = (int) ((double)R::runif(0,1) * (upper - lower - 1)) + lower; 165 | std::swap(_items[lower], _items[i]); 166 | 167 | // Partition around the median distance 168 | int median = (upper + lower) / 2; 169 | std::nth_element(_items.begin() + lower + 1, 170 | _items.begin() + median, 171 | _items.begin() + upper, 172 | DistanceComparator(_items[lower])); 173 | 174 | // Threshold of the new node will be the distance to the median 175 | node->threshold = distance(_items[lower], _items[median]); 176 | 177 | // Recursively build tree 178 | node->index = lower; 179 | node->left = buildFromPoints(lower + 1, median); 180 | node->right = buildFromPoints(median, upper); 181 | } 182 | 183 | // Return result 184 | return node; 185 | } 186 | 187 | // Helper function that searches the tree 188 | void search(Node* node, const T& target, unsigned int k, std::priority_queue& heap, double* ptau) 189 | { 190 | if(node == NULL) return; // indicates that we're done here 191 | 192 | // Compute distance between target and current node 193 | double dist = distance(_items[node->index], target); 194 | 195 | // If current node within radius tau 196 | if(dist < (*ptau)) { 197 | if(heap.size() == k) heap.pop(); // remove furthest node from result list (if we already have k results) 198 | heap.push(HeapItem(node->index, dist)); // add current node to result list 199 | if(heap.size() == k) *ptau = heap.top().dist; // update value of tau (farthest point in result list) 200 | } 201 | 202 | // Return if we arrived at a leaf 203 | if(node->left == NULL && node->right == NULL) { 204 | return; 205 | } 206 | 207 | // If the target lies within the radius of ball 208 | if(dist < node->threshold) { 
209 | if(dist - (*ptau) <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child first 210 | search(node->left, target, k, heap, ptau); 211 | } 212 | 213 | if(dist + (*ptau) >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child 214 | search(node->right, target, k, heap, ptau); 215 | } 216 | 217 | // If the target lies outsize the radius of the ball 218 | } else { 219 | if(dist + (*ptau) >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child first 220 | search(node->right, target, k, heap, ptau); 221 | } 222 | 223 | if(dist - (*ptau) <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child 224 | search(node->left, target, k, heap, ptau); 225 | } 226 | } 227 | } 228 | }; 229 | 230 | #endif 231 | -------------------------------------------------------------------------------- /man/Rtsne.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Rtsne.R, R/neighbors.R 3 | \name{Rtsne} 4 | \alias{Rtsne} 5 | \alias{Rtsne.default} 6 | \alias{Rtsne.dist} 7 | \alias{Rtsne.data.frame} 8 | \alias{Rtsne_neighbors} 9 | \title{Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding} 10 | \usage{ 11 | Rtsne(X, ...) 
12 | 13 | \method{Rtsne}{default}( 14 | X, 15 | dims = 2, 16 | initial_dims = 50, 17 | perplexity = 30, 18 | theta = 0.5, 19 | check_duplicates = TRUE, 20 | pca = TRUE, 21 | partial_pca = FALSE, 22 | max_iter = 1000, 23 | verbose = getOption("verbose", FALSE), 24 | is_distance = FALSE, 25 | Y_init = NULL, 26 | pca_center = TRUE, 27 | pca_scale = FALSE, 28 | normalize = TRUE, 29 | stop_lying_iter = ifelse(is.null(Y_init), 250L, 0L), 30 | mom_switch_iter = ifelse(is.null(Y_init), 250L, 0L), 31 | momentum = 0.5, 32 | final_momentum = 0.8, 33 | eta = 200, 34 | exaggeration_factor = 12, 35 | num_threads = 1, 36 | ... 37 | ) 38 | 39 | \method{Rtsne}{dist}(X, ..., is_distance = TRUE) 40 | 41 | \method{Rtsne}{data.frame}(X, ...) 42 | 43 | Rtsne_neighbors( 44 | index, 45 | distance, 46 | dims = 2, 47 | perplexity = 30, 48 | theta = 0.5, 49 | max_iter = 1000, 50 | verbose = getOption("verbose", FALSE), 51 | Y_init = NULL, 52 | stop_lying_iter = ifelse(is.null(Y_init), 250L, 0L), 53 | mom_switch_iter = ifelse(is.null(Y_init), 250L, 0L), 54 | momentum = 0.5, 55 | final_momentum = 0.8, 56 | eta = 200, 57 | exaggeration_factor = 12, 58 | num_threads = 1, 59 | ... 60 | ) 61 | } 62 | \arguments{ 63 | \item{X}{matrix; Data matrix (each row is an observation, each column is a variable)} 64 | 65 | \item{...}{Other arguments that can be passed to Rtsne} 66 | 67 | \item{dims}{integer; Output dimensionality (default: 2)} 68 | 69 | \item{initial_dims}{integer; the number of dimensions that should be retained in the initial PCA step (default: 50)} 70 | 71 | \item{perplexity}{numeric; Perplexity parameter (should not be bigger than 3 * perplexity < nrow(X) - 1, see details for interpretation)} 72 | 73 | \item{theta}{numeric; Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact TSNE (default: 0.5)} 74 | 75 | \item{check_duplicates}{logical; Checks whether duplicates are present. 
It is best to make sure there are no duplicates present and set this option to FALSE, especially for large datasets (default: TRUE)} 76 | 77 | \item{pca}{logical; Whether an initial PCA step should be performed (default: TRUE)} 78 | 79 | \item{partial_pca}{logical; Whether truncated PCA should be used to calculate principal components (requires the irlba package). This is faster for large input matrices (default: FALSE)} 80 | 81 | \item{max_iter}{integer; Number of iterations (default: 1000)} 82 | 83 | \item{verbose}{logical; Whether progress updates should be printed (default: global "verbose" option, or FALSE if that is not set)} 84 | 85 | \item{is_distance}{logical; Indicate whether X is a distance matrix (default: FALSE)} 86 | 87 | \item{Y_init}{matrix; Initial locations of the objects. If NULL, random initialization will be used (default: NULL). Note that when using this, the initial stage with exaggerated perplexity values and a larger momentum term will be skipped.} 88 | 89 | \item{pca_center}{logical; Should data be centered before pca is applied? (default: TRUE)} 90 | 91 | \item{pca_scale}{logical; Should data be scaled before pca is applied? (default: FALSE)} 92 | 93 | \item{normalize}{logical; Should data be normalized internally prior to distance calculations with \code{\link{normalize_input}}? 
(default: TRUE)} 94 | 95 | \item{stop_lying_iter}{integer; Iteration after which the perplexities are no longer exaggerated (default: 250, except when Y_init is used, then 0)} 96 | 97 | \item{mom_switch_iter}{integer; Iteration after which the final momentum is used (default: 250, except when Y_init is used, then 0)} 98 | 99 | \item{momentum}{numeric; Momentum used in the first part of the optimization (default: 0.5)} 100 | 101 | \item{final_momentum}{numeric; Momentum used in the final part of the optimization (default: 0.8)} 102 | 103 | \item{eta}{numeric; Learning rate (default: 200.0)} 104 | 105 | \item{exaggeration_factor}{numeric; Exaggeration factor used to multiply the P matrix in the first part of the optimization (default: 12.0)} 106 | 107 | \item{num_threads}{integer; Number of threads to use when using OpenMP, default is 1. Setting to 0 corresponds to detecting and using all available cores} 108 | 109 | \item{index}{integer matrix; Each row contains the identity of the nearest neighbors for each observation} 110 | 111 | \item{distance}{numeric matrix; Each row contains the distance to the nearest neighbors in \code{index} for each observation} 112 | } 113 | \value{ 114 | List with the following elements: 115 | \item{Y}{Matrix containing the new representations for the objects} 116 | \item{N}{Number of objects} 117 | \item{origD}{Original Dimensionality before TSNE (only when \code{X} is a data matrix)} 118 | \item{perplexity}{See above} 119 | \item{theta}{See above} 120 | \item{costs}{The cost for every object after the final iteration} 121 | \item{itercosts}{The total costs (KL-divergence) for all objects in every 50th + the last iteration} 122 | \item{stop_lying_iter}{Iteration after which the perplexities are no longer exaggerated} 123 | \item{mom_switch_iter}{Iteration after which the final momentum is used} 124 | \item{momentum}{Momentum used in the first part of the optimization} 125 | \item{final_momentum}{Momentum used in the final part of the 
optimization} 126 | \item{eta}{Learning rate} 127 | \item{exaggeration_factor}{Exaggeration factor used to multiply the P matrix in the first part of the optimization} 128 | } 129 | \description{ 130 | Wrapper for the C++ implementation of Barnes-Hut t-Distributed Stochastic Neighbor Embedding. t-SNE is a method for constructing a low dimensional embedding of high-dimensional data, distances or similarities. Exact t-SNE can be computed by setting theta=0.0. 131 | } 132 | \details{ 133 | Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space: \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of \eqn{p_{j|i}} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space. 134 | For the low-dimensional space we use the Cauchy distribution (t-distribution with one degree of freedom) as the distribution of the distances to neighbouring objects: 135 | \deqn{ q_{i j} = \frac{(1+ \| y_i-y_j\|^2)^{-1}}{\sum_{k \neq l} 1+ \| y_k-y_l\|^2)^{-1}}.} 136 | By changing the location of the objects y in the embedding to minimize the Kullback-Leibler divergence between these two distributions \eqn{ q_{i j}} and \eqn{ p_{i j}}, we create a map that focusses on small-scale structure, due to the asymmetry of the KL-divergence. The t-distribution is chosen to avoid the crowding problem: in the original high dimensional space, there are potentially many equidistant objects with moderate distance from a particular object, more than can be accounted for in the low dimensional representation. 
The t-distribution makes sure that these objects are more spread out in the new representation. 137 | 138 | For larger datasets, a problem with the a simple gradient descent to minimize the Kullback-Leibler divergence is the computational complexity of each gradient step (which is \eqn{O(n^2)}). The Barnes-Hut implementation of the algorithm attempts to mitigate this problem using two tricks: (1) approximating small similarities by 0 in the \eqn{p_{ij}} distribution, where the non-zero entries are computed by finding 3*perplexity nearest neighbours using an efficient tree search. (2) Using the Barnes-Hut algorithm in the computation of the gradient which approximates large distance similarities using a quadtree. This approximation is controlled by the \code{theta} parameter, with smaller values leading to more exact approximations. When \code{theta=0.0}, the implementation uses a standard t-SNE implementation. The Barnes-Hut approximation leads to a \eqn{O(n log(n))} computational complexity for each iteration. 139 | 140 | During the minimization of the KL-divergence, the implementation uses a trick known as early exaggeration, which multiplies the \eqn{p_{ij}}'s by 12 during the first 250 iterations. This leads to tighter clustering and more distance between clusters of objects. This early exaggeration is not used when the user gives an initialization of the objects in the embedding by setting \code{Y_init}. During the early exaggeration phase, a momentum term of 0.5 is used while this is changed to 0.8 after the first 250 iterations. All these default parameters can be changed by the user. 141 | 142 | After checking the correctness of the input, the \code{Rtsne} function (optionally) does an initial reduction of the feature space using \code{\link{prcomp}}, before calling the C++ TSNE implementation. Since R's random number generator is used, use \code{\link{set.seed}} before the function call to get reproducible results. 
143 | 144 | If \code{X} is a data.frame, it is transformed into a matrix using \code{\link{model.matrix}}. If \code{X} is a \code{\link{dist}} object, it is currently first expanded into a full distance matrix. 145 | } 146 | \section{Methods (by class)}{ 147 | \itemize{ 148 | \item \code{Rtsne(default)}: Default Interface 149 | 150 | \item \code{Rtsne(dist)}: tsne on given dist object 151 | 152 | \item \code{Rtsne(data.frame)}: tsne on data.frame 153 | 154 | }} 155 | \section{Supplying precomputed distances}{ 156 | 157 | If a distance matrix is already available, this can be directly supplied to \code{Rtsne} by setting \code{is_distance=TRUE}. 158 | This improves efficiency by avoiding recalculation of distances, but requires some work to get the same results as running default \code{Rtsne} on a data matrix. 159 | Specifically, Euclidean distances should be computed from a normalized data matrix - see \code{\link{normalize_input}} for details. 160 | PCA arguments will also be ignored if \code{is_distance=TRUE}. 161 | 162 | NN search results can be directly supplied to \code{Rtsne_neighbors} to avoid repeating the (possibly time-consuming) search. 163 | To achieve the same results as \code{Rtsne} on the data matrix, the search should be conducted on the normalized data matrix. 164 | The number of nearest neighbors should also be equal to three-fold the \code{perplexity}, rounded down to the nearest integer. 165 | Note that pre-supplied NN results cannot be used when \code{theta=0} as they are only relevant for the approximate algorithm. 166 | 167 | Any kind of distance metric can be used as input. 168 | In contrast, running \code{Rtsne} on a data matrix will always use Euclidean distances. 
169 | } 170 | 171 | \examples{ 172 | iris_unique <- unique(iris) # Remove duplicates 173 | iris_matrix <- as.matrix(iris_unique[,1:4]) 174 | 175 | # Set a seed if you want reproducible results 176 | set.seed(42) 177 | tsne_out <- Rtsne(iris_matrix,pca=FALSE,perplexity=30,theta=0.0) # Run TSNE 178 | 179 | # Show the objects in the 2D tsne representation 180 | plot(tsne_out$Y,col=iris_unique$Species, asp=1) 181 | 182 | # data.frame as input 183 | tsne_out <- Rtsne(iris_unique,pca=FALSE, theta=0.0) 184 | 185 | # Using a dist object 186 | set.seed(42) 187 | tsne_out <- Rtsne(dist(normalize_input(iris_matrix)), theta=0.0) 188 | plot(tsne_out$Y,col=iris_unique$Species, asp=1) 189 | 190 | set.seed(42) 191 | tsne_out <- Rtsne(as.matrix(dist(normalize_input(iris_matrix))),theta=0.0) 192 | plot(tsne_out$Y,col=iris_unique$Species, asp=1) 193 | 194 | # Supplying starting positions (example: continue from earlier embedding) 195 | set.seed(42) 196 | tsne_part1 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=350) 197 | tsne_part2 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=650, Y_init=tsne_part1$Y) 198 | plot(tsne_part2$Y,col=iris_unique$Species, asp=1) 199 | \dontrun{ 200 | # Fast PCA and multicore 201 | 202 | tsne_out <- Rtsne(iris_matrix, theta=0.1, partial_pca = TRUE, initial_dims=3) 203 | tsne_out <- Rtsne(iris_matrix, theta=0.1, num_threads = 2) 204 | } 205 | } 206 | \references{ 207 | Maaten, L. Van Der, 2014. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research, 15, p.3221-3245. 208 | 209 | van der Maaten, L.J.P. & Hinton, G.E., 2008. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research, 9, pp.2579-2605. 
210 | } 211 | -------------------------------------------------------------------------------- /R/Rtsne.R: -------------------------------------------------------------------------------- 1 | #' Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding 2 | #' 3 | #' Wrapper for the C++ implementation of Barnes-Hut t-Distributed Stochastic Neighbor Embedding. t-SNE is a method for constructing a low dimensional embedding of high-dimensional data, distances or similarities. Exact t-SNE can be computed by setting theta=0.0. 4 | #' 5 | #' Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space: \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of \eqn{p_{j|i}} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space. 6 | #' For the low-dimensional space we use the Cauchy distribution (t-distribution with one degree of freedom) as the distribution of the distances to neighbouring objects: 7 | #' \deqn{ q_{i j} = \frac{(1+ \| y_i-y_j\|^2)^{-1}}{\sum_{k \neq l} 1+ \| y_k-y_l\|^2)^{-1}}.} 8 | #' By changing the location of the objects y in the embedding to minimize the Kullback-Leibler divergence between these two distributions \eqn{ q_{i j}} and \eqn{ p_{i j}}, we create a map that focusses on small-scale structure, due to the asymmetry of the KL-divergence. 
The t-distribution is chosen to avoid the crowding problem: in the original high dimensional space, there are potentially many equidistant objects with moderate distance from a particular object, more than can be accounted for in the low dimensional representation. The t-distribution makes sure that these objects are more spread out in the new representation. 9 | #' 10 | #' For larger datasets, a problem with the a simple gradient descent to minimize the Kullback-Leibler divergence is the computational complexity of each gradient step (which is \eqn{O(n^2)}). The Barnes-Hut implementation of the algorithm attempts to mitigate this problem using two tricks: (1) approximating small similarities by 0 in the \eqn{p_{ij}} distribution, where the non-zero entries are computed by finding 3*perplexity nearest neighbours using an efficient tree search. (2) Using the Barnes-Hut algorithm in the computation of the gradient which approximates large distance similarities using a quadtree. This approximation is controlled by the \code{theta} parameter, with smaller values leading to more exact approximations. When \code{theta=0.0}, the implementation uses a standard t-SNE implementation. The Barnes-Hut approximation leads to a \eqn{O(n log(n))} computational complexity for each iteration. 11 | #' 12 | #' During the minimization of the KL-divergence, the implementation uses a trick known as early exaggeration, which multiplies the \eqn{p_{ij}}'s by 12 during the first 250 iterations. This leads to tighter clustering and more distance between clusters of objects. This early exaggeration is not used when the user gives an initialization of the objects in the embedding by setting \code{Y_init}. During the early exaggeration phase, a momentum term of 0.5 is used while this is changed to 0.8 after the first 250 iterations. All these default parameters can be changed by the user. 
13 | #' 14 | #' After checking the correctness of the input, the \code{Rtsne} function (optionally) does an initial reduction of the feature space using \code{\link{prcomp}}, before calling the C++ TSNE implementation. Since R's random number generator is used, use \code{\link{set.seed}} before the function call to get reproducible results. 15 | #' 16 | #' If \code{X} is a data.frame, it is transformed into a matrix using \code{\link{model.matrix}}. If \code{X} is a \code{\link{dist}} object, it is currently first expanded into a full distance matrix. 17 | #' 18 | #' @param X matrix; Data matrix (each row is an observation, each column is a variable) 19 | #' @param index integer matrix; Each row contains the identity of the nearest neighbors for each observation 20 | #' @param distance numeric matrix; Each row contains the distance to the nearest neighbors in \code{index} for each observation 21 | #' @param dims integer; Output dimensionality (default: 2) 22 | #' @param initial_dims integer; the number of dimensions that should be retained in the initial PCA step (default: 50) 23 | #' @param perplexity numeric; Perplexity parameter (should not be bigger than 3 * perplexity < nrow(X) - 1, see details for interpretation) 24 | #' @param theta numeric; Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact TSNE (default: 0.5) 25 | #' @param check_duplicates logical; Checks whether duplicates are present. It is best to make sure there are no duplicates present and set this option to FALSE, especially for large datasets (default: TRUE) 26 | #' @param pca logical; Whether an initial PCA step should be performed (default: TRUE) 27 | #' @param partial_pca logical; Whether truncated PCA should be used to calculate principal components (requires the irlba package). 
This is faster for large input matrices (default: FALSE) 28 | #' @param max_iter integer; Number of iterations (default: 1000) 29 | #' @param verbose logical; Whether progress updates should be printed (default: global "verbose" option, or FALSE if that is not set) 30 | #' @param ... Other arguments that can be passed to Rtsne 31 | #' @param is_distance logical; Indicate whether X is a distance matrix (default: FALSE) 32 | #' @param Y_init matrix; Initial locations of the objects. If NULL, random initialization will be used (default: NULL). Note that when using this, the initial stage with exaggerated perplexity values and a larger momentum term will be skipped. 33 | #' @param pca_center logical; Should data be centered before pca is applied? (default: TRUE) 34 | #' @param pca_scale logical; Should data be scaled before pca is applied? (default: FALSE) 35 | #' @param normalize logical; Should data be normalized internally prior to distance calculations with \code{\link{normalize_input}}? (default: TRUE) 36 | #' @param stop_lying_iter integer; Iteration after which the perplexities are no longer exaggerated (default: 250, except when Y_init is used, then 0) 37 | #' @param mom_switch_iter integer; Iteration after which the final momentum is used (default: 250, except when Y_init is used, then 0) 38 | #' @param momentum numeric; Momentum used in the first part of the optimization (default: 0.5) 39 | #' @param final_momentum numeric; Momentum used in the final part of the optimization (default: 0.8) 40 | #' @param eta numeric; Learning rate (default: 200.0) 41 | #' @param exaggeration_factor numeric; Exaggeration factor used to multiply the P matrix in the first part of the optimization (default: 12.0) 42 | #' @param num_threads integer; Number of threads to use when using OpenMP, default is 1. 
Setting to 0 corresponds to detecting and using all available cores 43 | #' 44 | #' @return List with the following elements: 45 | #' \item{Y}{Matrix containing the new representations for the objects} 46 | #' \item{N}{Number of objects} 47 | #' \item{origD}{Original Dimensionality before TSNE (only when \code{X} is a data matrix)} 48 | #' \item{perplexity}{See above} 49 | #' \item{theta}{See above} 50 | #' \item{costs}{The cost for every object after the final iteration} 51 | #' \item{itercosts}{The total costs (KL-divergence) for all objects in every 50th + the last iteration} 52 | #' \item{stop_lying_iter}{Iteration after which the perplexities are no longer exaggerated} 53 | #' \item{mom_switch_iter}{Iteration after which the final momentum is used} 54 | #' \item{momentum}{Momentum used in the first part of the optimization} 55 | #' \item{final_momentum}{Momentum used in the final part of the optimization} 56 | #' \item{eta}{Learning rate} 57 | #' \item{exaggeration_factor}{Exaggeration factor used to multiply the P matrix in the first part of the optimization} 58 | #' 59 | #' @section Supplying precomputed distances: 60 | #' If a distance matrix is already available, this can be directly supplied to \code{Rtsne} by setting \code{is_distance=TRUE}. 61 | #' This improves efficiency by avoiding recalculation of distances, but requires some work to get the same results as running default \code{Rtsne} on a data matrix. 62 | #' Specifically, Euclidean distances should be computed from a normalized data matrix - see \code{\link{normalize_input}} for details. 63 | #' PCA arguments will also be ignored if \code{is_distance=TRUE}. 64 | #' 65 | #' NN search results can be directly supplied to \code{Rtsne_neighbors} to avoid repeating the (possibly time-consuming) search. 66 | #' To achieve the same results as \code{Rtsne} on the data matrix, the search should be conducted on the normalized data matrix. 
67 | #' The number of nearest neighbors should also be equal to three-fold the \code{perplexity}, rounded down to the nearest integer. 68 | #' Note that pre-supplied NN results cannot be used when \code{theta=0} as they are only relevant for the approximate algorithm. 69 | #' 70 | #' Any kind of distance metric can be used as input. 71 | #' In contrast, running \code{Rtsne} on a data matrix will always use Euclidean distances. 72 | #' 73 | #' @references Maaten, L. Van Der, 2014. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research, 15, p.3221-3245. 74 | #' @references van der Maaten, L.J.P. & Hinton, G.E., 2008. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research, 9, pp.2579-2605. 75 | #' 76 | #' @examples 77 | #' iris_unique <- unique(iris) # Remove duplicates 78 | #' iris_matrix <- as.matrix(iris_unique[,1:4]) 79 | #' 80 | #' # Set a seed if you want reproducible results 81 | #' set.seed(42) 82 | #' tsne_out <- Rtsne(iris_matrix,pca=FALSE,perplexity=30,theta=0.0) # Run TSNE 83 | #' 84 | #' # Show the objects in the 2D tsne representation 85 | #' plot(tsne_out$Y,col=iris_unique$Species, asp=1) 86 | #' 87 | #' # data.frame as input 88 | #' tsne_out <- Rtsne(iris_unique,pca=FALSE, theta=0.0) 89 | #' 90 | #' # Using a dist object 91 | #' set.seed(42) 92 | #' tsne_out <- Rtsne(dist(normalize_input(iris_matrix)), theta=0.0) 93 | #' plot(tsne_out$Y,col=iris_unique$Species, asp=1) 94 | #' 95 | #' set.seed(42) 96 | #' tsne_out <- Rtsne(as.matrix(dist(normalize_input(iris_matrix))),theta=0.0) 97 | #' plot(tsne_out$Y,col=iris_unique$Species, asp=1) 98 | #' 99 | #' # Supplying starting positions (example: continue from earlier embedding) 100 | #' set.seed(42) 101 | #' tsne_part1 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=350) 102 | #' tsne_part2 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=650, Y_init=tsne_part1$Y) 103 | #' plot(tsne_part2$Y,col=iris_unique$Species, asp=1) 104 | #' 
#' \dontrun{
#' # Fast PCA and multicore
#'
#' tsne_out <- Rtsne(iris_matrix, theta=0.1, partial_pca = TRUE, initial_dims=3)
#' tsne_out <- Rtsne(iris_matrix, theta=0.1, num_threads = 2)
#' }
#' @useDynLib Rtsne, .registration = TRUE
#' @import Rcpp
#' @importFrom stats model.matrix na.fail prcomp
#'
#' @export
Rtsne <- function (X, ...) {
  # S3 generic: dispatch on the class of X (matrix, dist, data.frame).
  UseMethod("Rtsne", X)
}

#' @describeIn Rtsne Default Interface
#' @export
Rtsne.default <- function(X, dims=2, initial_dims=50,
                          perplexity=30, theta=0.5,
                          check_duplicates=TRUE,
                          pca=TRUE, partial_pca=FALSE, max_iter=1000, verbose=getOption("verbose", FALSE),
                          is_distance=FALSE, Y_init=NULL,
                          pca_center=TRUE, pca_scale=FALSE, normalize=TRUE,
                          stop_lying_iter=ifelse(is.null(Y_init),250L,0L),
                          mom_switch_iter=ifelse(is.null(Y_init),250L,0L),
                          momentum=0.5, final_momentum=0.8,
                          eta=200.0, exaggeration_factor=12.0, num_threads=1, ...) {

  # ---- Input validation -----------------------------------------------------
  # Scalar conditions use && (short-circuit) rather than the vectorized &,
  # which evaluated both operands and could emit length warnings.
  if (!is.logical(is_distance)) { stop("is_distance should be a logical variable")}
  if (!is.matrix(X)) { stop("Input X is not a matrix")}
  if (is_distance && !(is.matrix(X) && (nrow(X)==ncol(X)))) { stop("Input is not an accepted distance matrix") }
  if (!(is.logical(pca_center) && is.logical(pca_scale)) ) { stop("pca_center and pca_scale should be TRUE or FALSE")}
  if (!is.wholenumber(initial_dims) || initial_dims<=0) { stop("Incorrect initial dimensionality.")}
  if (!is.wholenumber(num_threads) || num_threads<0) { stop("Incorrect number of threads.")}
  # Validate the remaining t-SNE parameters and collect them for the C++ call.
  tsne.args <- .check_tsne_params(nrow(X), dims=dims, perplexity=perplexity, theta=theta, max_iter=max_iter, verbose=verbose,
                                  Y_init=Y_init, stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter,
                                  momentum=momentum, final_momentum=final_momentum, eta=eta, exaggeration_factor=exaggeration_factor)

  # Check for missing values (errors out if any are present).
  X <- na.fail(X)

  # ---- Preprocessing of a data matrix ---------------------------------------
  if (!is_distance) {
    # Optional initial dimensionality reduction via (possibly truncated) PCA.
    if (pca) {
      if(verbose) cat("Performing PCA\n")
      if(partial_pca){
        if (!requireNamespace("irlba", quietly = TRUE)) {stop("Package \"irlba\" is required for partial PCA. Please install it.", call. = FALSE)}
        X <- irlba::prcomp_irlba(X, n = initial_dims, center = pca_center, scale. = pca_scale)$x
      }else{
        if(verbose && min(dim(X))>2500) cat("Consider setting partial_pca=TRUE for large matrices\n")
        X <- prcomp(X, retx=TRUE, center = pca_center, scale. = pca_scale, rank. = initial_dims)$x
      }
    }
    # Duplicate observations make perplexity calibration ill-defined.
    if (check_duplicates) {
      if (any(duplicated(X))) { stop("Remove duplicates before running TSNE.") }
    }
    # Center/scale so distances are comparable across runs (see normalize_input).
    if (normalize) {
      X <- normalize_input(X)
    }
    X <- t(X) # transposing for rapid column-major access.
  } else {
    # Exact t-SNE (theta == 0) expects squared distances on the C++ side.
    if (theta==0.0) {
      X <- X^2
    }
  }

  # ---- Run the C++ implementation and assemble the result -------------------
  out <- do.call(Rtsne_cpp, c(list(X=X, distance_precomputed=is_distance, num_threads=num_threads), tsne.args))
  out$Y <- t(out$Y) # Transposing back.
  info <- list(N=ncol(X))  # X is transposed here, so columns are observations
  if (!is_distance) { out$origD <- nrow(X) } # 'origD' is unknown for distance matrices.
  out <- c(info, out, .clear_unwanted_params(tsne.args))
  class(out) <- c("Rtsne","list")
  out
}

#' @describeIn Rtsne tsne on given dist object
#' @export
Rtsne.dist <- function(X,...,is_distance=TRUE) {
  # Expand the dist object into a full matrix and delegate to the default method.
  X <- as.matrix(na.fail(X))
  Rtsne(X, ..., is_distance=is_distance)
}

#' @describeIn Rtsne tsne on data.frame
#' @export
Rtsne.data.frame <- function(X,...) {
  # Convert to a numeric design matrix (no intercept column) before dispatch.
  X <- model.matrix(~.-1,na.fail(X))
  Rtsne(X, ...)
}
Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 
30 | * 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include "sptree.h" 41 | 42 | 43 | 44 | // Constructs cell 45 | template 46 | Cell::Cell() { 47 | } 48 | 49 | template 50 | Cell::Cell(double* inp_corner, double* inp_width) { 51 | for(int d = 0; d < NDims; d++) setCorner(d, inp_corner[d]); 52 | for(int d = 0; d < NDims; d++) setWidth( d, inp_width[d]); 53 | } 54 | 55 | // Destructs cell 56 | template 57 | Cell::~Cell() { 58 | } 59 | 60 | template 61 | double Cell::getCorner(unsigned int d) const { 62 | return corner[d]; 63 | } 64 | 65 | template 66 | double Cell::getWidth(unsigned int d) const { 67 | return width[d]; 68 | } 69 | 70 | template 71 | void Cell::setCorner(unsigned int d, double val) { 72 | corner[d] = val; 73 | } 74 | 75 | template 76 | void Cell::setWidth(unsigned int d, double val) { 77 | width[d] = val; 78 | } 79 | 80 | // Checks whether a point lies in a cell 81 | template 82 | bool Cell::containsPoint(double point[]) const 83 | { 84 | for(int d = 0; d < NDims; d++) { 85 | if(corner[d] - width[d] > point[d]) return false; 86 | if(corner[d] + width[d] < point[d]) return false; 87 | } 88 | return true; 89 | } 90 | 91 | 92 | // Default constructor for SPTree -- build tree, too! 
93 | template 94 | SPTree::SPTree(double* inp_data, unsigned int N) 95 | { 96 | 97 | // Compute mean, width, and height of current map (boundaries of SPTree) 98 | int nD = 0; 99 | double* mean_Y = (double*) calloc(NDims, sizeof(double)); 100 | double* min_Y = (double*) malloc(NDims * sizeof(double)); 101 | double* max_Y = (double*) malloc(NDims * sizeof(double)); 102 | 103 | for(unsigned int d = 0; d < NDims; d++) { 104 | min_Y[d] = DBL_MAX; 105 | max_Y[d] = -DBL_MAX; 106 | } 107 | 108 | for(unsigned int n = 0; n < N; n++) { 109 | for(unsigned int d = 0; d < NDims; d++) { 110 | mean_Y[d] += inp_data[n * NDims + d]; 111 | if(inp_data[nD + d] < min_Y[d]) min_Y[d] = inp_data[nD + d]; 112 | if(inp_data[nD + d] > max_Y[d]) max_Y[d] = inp_data[nD + d]; 113 | } 114 | nD += NDims; 115 | } 116 | 117 | for(int d = 0; d < NDims; d++) mean_Y[d] /= (double) N; 118 | 119 | // Construct SPTree 120 | double* width = (double*) malloc(NDims * sizeof(double)); 121 | for(int d = 0; d < NDims; d++) width[d] = max_tsne(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5; 122 | init(NULL, inp_data, mean_Y, width); 123 | fill(N); 124 | 125 | // Clean up memory 126 | free(mean_Y); 127 | free(max_Y); 128 | free(min_Y); 129 | free(width); 130 | } 131 | 132 | 133 | // Constructor for SPTree with particular size and parent -- build the tree, too! 
134 | template 135 | SPTree::SPTree(double* inp_data, unsigned int N, double* inp_corner, double* inp_width) 136 | { 137 | init(NULL, inp_data, inp_corner, inp_width); 138 | fill(N); 139 | } 140 | 141 | 142 | // Constructor for SPTree with particular size (do not fill the tree) 143 | template 144 | SPTree::SPTree(double* inp_data, double* inp_corner, double* inp_width) 145 | { 146 | init(NULL, inp_data, inp_corner, inp_width); 147 | } 148 | 149 | 150 | // Constructor for SPTree with particular size and parent (do not fill tree) 151 | template 152 | SPTree::SPTree(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width) { 153 | init(inp_parent, inp_data, inp_corner, inp_width); 154 | } 155 | 156 | 157 | // Constructor for SPTree with particular size and parent -- build the tree, too! 158 | template 159 | SPTree::SPTree(SPTree* inp_parent, double* inp_data, unsigned int N, double* inp_corner, double* inp_width) 160 | { 161 | init(inp_parent, inp_data, inp_corner, inp_width); 162 | fill(N); 163 | } 164 | 165 | 166 | // Main initialization function 167 | template 168 | void SPTree::init(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width) 169 | { 170 | parent = inp_parent; 171 | data = inp_data; 172 | is_leaf = true; 173 | size = 0; 174 | cum_size = 0; 175 | 176 | for(unsigned int d = 0; d < NDims; d++) boundary.setCorner(d, inp_corner[d]); 177 | for(unsigned int d = 0; d < NDims; d++) boundary.setWidth( d, inp_width[d]); 178 | 179 | for(unsigned int i = 0; i < no_children; i++) children[i] = NULL; 180 | for(unsigned int d = 0; d < NDims; d++) center_of_mass[d] = .0; 181 | } 182 | 183 | 184 | // Destructor for SPTree 185 | template 186 | SPTree::~SPTree() 187 | { 188 | for(unsigned int i = 0; i < no_children; i++) { 189 | if(children[i] != NULL) delete children[i]; 190 | } 191 | } 192 | 193 | 194 | // Update the data underlying this tree 195 | template 196 | void SPTree::setData(double* inp_data) 197 | { 198 | data = 
inp_data; 199 | } 200 | 201 | 202 | // Get the parent of the current tree 203 | template 204 | SPTree* SPTree::getParent() 205 | { 206 | return parent; 207 | } 208 | 209 | 210 | // Insert a point into the SPTree 211 | template 212 | bool SPTree::insert(unsigned int new_index) 213 | { 214 | // Ignore objects which do not belong in this quad tree 215 | double* point = data + new_index * NDims; 216 | if(!boundary.containsPoint(point)) 217 | return false; 218 | 219 | // Online update of cumulative size and center-of-mass 220 | cum_size++; 221 | double mult1 = (double) (cum_size - 1) / (double) cum_size; 222 | double mult2 = 1.0 / (double) cum_size; 223 | 224 | for(unsigned int d = 0; d < NDims; d++) { 225 | center_of_mass[d] = center_of_mass[d] * mult1 + mult2 * point[d]; 226 | } 227 | 228 | // If there is space in this quad tree and it is a leaf, add the object here 229 | if(is_leaf && size < QT_NODE_CAPACITY) { 230 | index[size] = new_index; 231 | size++; 232 | return true; 233 | } 234 | 235 | // Don't add duplicates for now (this is not very nice) 236 | bool any_duplicate = false; 237 | for(unsigned int n = 0; n < size; n++) { 238 | bool duplicate = true; 239 | for(unsigned int d = 0; d < NDims; d++) { 240 | if(point[d] != data[index[n] * NDims + d]) { duplicate = false; break; } 241 | } 242 | any_duplicate = any_duplicate | duplicate; 243 | } 244 | if(any_duplicate) return true; 245 | 246 | // Otherwise, we need to subdivide the current cell 247 | if(is_leaf) subdivide(); 248 | 249 | // Find out where the point can be inserted 250 | for(unsigned int i = 0; i < no_children; i++) { 251 | if(children[i]->insert(new_index)) return true; 252 | } 253 | 254 | // Otherwise, the point cannot be inserted (this should never happen) 255 | return false; 256 | } 257 | 258 | 259 | // Create four children which fully divide this cell into four quads of equal area 260 | template 261 | void SPTree::subdivide() { 262 | 263 | // Create new children 264 | double new_corner[NDims]; 265 
| double new_width[NDims]; 266 | for(unsigned int i = 0; i < no_children; i++) { 267 | unsigned int div = 1; 268 | for(unsigned int d = 0; d < NDims; d++) { 269 | new_width[d] = .5 * boundary.getWidth(d); 270 | if((i / div) % 2 == 1) new_corner[d] = boundary.getCorner(d) - .5 * boundary.getWidth(d); 271 | else new_corner[d] = boundary.getCorner(d) + .5 * boundary.getWidth(d); 272 | div *= 2; 273 | } 274 | children[i] = new SPTree(this, data, new_corner, new_width); 275 | } 276 | 277 | // Move existing points to correct children 278 | for(unsigned int i = 0; i < size; i++) { 279 | bool success = false; 280 | for(unsigned int j = 0; j < no_children; j++) { 281 | if(!success) success = children[j]->insert(index[i]); 282 | } 283 | index[i] = -1; 284 | } 285 | 286 | // Empty parent node 287 | size = 0; 288 | is_leaf = false; 289 | } 290 | 291 | 292 | // Build SPTree on dataset 293 | template 294 | void SPTree::fill(unsigned int N) 295 | { 296 | for(unsigned int i = 0; i < N; i++) insert(i); 297 | } 298 | 299 | 300 | // Checks whether the specified tree is correct 301 | template 302 | bool SPTree::isCorrect() 303 | { 304 | for(unsigned int n = 0; n < size; n++) { 305 | double* point = data + index[n] * NDims; 306 | if(!boundary.containsPoint(point)) return false; 307 | } 308 | if(!is_leaf) { 309 | bool correct = true; 310 | for(int i = 0; i < no_children; i++) correct = correct && children[i]->isCorrect(); 311 | return correct; 312 | } 313 | else return true; 314 | } 315 | 316 | 317 | 318 | // Build a list of all indices in SPTree 319 | template 320 | void SPTree::getAllIndices(unsigned int* indices) 321 | { 322 | getAllIndices(indices, 0); 323 | } 324 | 325 | 326 | // Build a list of all indices in SPTree 327 | template 328 | unsigned int SPTree::getAllIndices(unsigned int* indices, unsigned int loc) 329 | { 330 | 331 | // Gather indices in current quadrant 332 | for(unsigned int i = 0; i < size; i++) indices[loc + i] = index[i]; 333 | loc += size; 334 | 335 | // Gather 
indices in children 336 | if(!is_leaf) { 337 | for(int i = 0; i < no_children; i++) loc = children[i]->getAllIndices(indices, loc); 338 | } 339 | return loc; 340 | } 341 | 342 | template 343 | unsigned int SPTree::getDepth() { 344 | if(is_leaf) return 1; 345 | int depth = 0; 346 | for(unsigned int i = 0; i < no_children; i++) depth = max_tsne(depth, children[i]->getDepth()); 347 | return 1 + depth; 348 | } 349 | 350 | 351 | // Compute non-edge forces using Barnes-Hut algorithm 352 | template 353 | double SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[]) const 354 | { 355 | double resultSum = 0; 356 | double buff[NDims]; // make buff local for parallelization 357 | 358 | // Make sure that we spend no time on empty nodes or self-interactions 359 | if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return resultSum; 360 | 361 | // Compute distance between point and center-of-mass 362 | double sqdist = .0; 363 | unsigned int ind = point_index * NDims; 364 | 365 | for(unsigned int d = 0; d < NDims; d++) { 366 | buff[d] = data[ind + d] - center_of_mass[d]; 367 | sqdist += buff[d] * buff[d]; 368 | } 369 | 370 | // Check whether we can use this node as a "summary" 371 | double max_width = 0.0; 372 | double cur_width; 373 | for(unsigned int d = 0; d < NDims; d++) { 374 | cur_width = boundary.getWidth(d); 375 | max_width = (max_width > cur_width) ? 
max_width : cur_width; 376 | } 377 | if(is_leaf || max_width / sqrt(sqdist) < theta) { 378 | 379 | // Compute and add t-SNE force between point and current node 380 | sqdist = 1.0 / (1.0 + sqdist); 381 | double mult = cum_size * sqdist; 382 | resultSum += mult; 383 | mult *= sqdist; 384 | for(unsigned int d = 0; d < NDims; d++) neg_f[d] += mult * buff[d]; 385 | } 386 | else { 387 | 388 | // Recursively apply Barnes-Hut to children 389 | for(unsigned int i = 0; i < no_children; i++){ 390 | resultSum += children[i]->computeNonEdgeForces(point_index, theta, neg_f); 391 | } 392 | } 393 | return resultSum; 394 | } 395 | 396 | 397 | // Computes edge forces 398 | template 399 | void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, unsigned int N, double* pos_f, int num_threads) const 400 | { 401 | 402 | // Loop over all edges in the graph 403 | #pragma omp parallel for schedule(static) num_threads(num_threads) 404 | for(unsigned int n = 0; n < N; n++) { 405 | unsigned int ind1 = n * NDims; 406 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { 407 | 408 | double buff[NDims]; // make buff local for parallelization 409 | 410 | // Compute pairwise distance and Q-value 411 | double sqdist = 1.0; 412 | unsigned int ind2 = col_P[i] * NDims; 413 | 414 | for(unsigned int d = 0; d < NDims; d++) { 415 | buff[d] = data[ind1 + d] - data[ind2 + d]; 416 | sqdist += buff[d] * buff[d]; 417 | } 418 | 419 | sqdist = val_P[i] / sqdist; 420 | 421 | // Sum positive force 422 | for(unsigned int d = 0; d < NDims; d++) pos_f[ind1 + d] += sqdist * buff[d]; 423 | } 424 | } 425 | } 426 | 427 | 428 | // Print out tree 429 | template 430 | void SPTree::print() 431 | { 432 | if(cum_size == 0) { 433 | Rprintf("Empty node\n"); 434 | return; 435 | } 436 | 437 | if(is_leaf) { 438 | Rprintf("Leaf node; data = ["); 439 | for(unsigned int i = 0; i < size; i++) { 440 | double* point = data + index[i] * NDims; 441 | for(int d = 0; d < NDims; d++) Rprintf("%f, ", 
point[d]); 442 | Rprintf(" (index = %d)", index[i]); 443 | if(i < size - 1) Rprintf("\n"); 444 | else Rprintf("]\n"); 445 | } 446 | } 447 | else { 448 | Rprintf("Intersection node with center-of-mass = ["); 449 | for(int d = 0; d < NDims; d++) Rprintf("%f, ", center_of_mass[d]); 450 | Rprintf("]; children are:\n"); 451 | for(int i = 0; i < no_children; i++) children[i]->print(); 452 | } 453 | } 454 | 455 | // declare templates explicitly 456 | template class SPTree<1>; 457 | template class SPTree<2>; 458 | template class SPTree<3>; -------------------------------------------------------------------------------- /src/tsne.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 3. All advertising materials mentioning features or use of this software 14 | * must display the following acknowledgement: 15 | * This product includes software developed by the Delft University of Technology. 16 | * 4. Neither the name of the Delft University of Technology nor the names of 17 | * its contributors may be used to endorse or promote products derived from 18 | * this software without specific prior written permission. 
19 | * 20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS 21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 29 | * OF SUCH DAMAGE. 30 | * 31 | */ 32 | 33 | #define USE_FC_LEN_T 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include "sptree.h" 43 | #include "vptree.h" 44 | #include "tsne.h" 45 | 46 | 47 | #ifdef _OPENMP 48 | #include 49 | #endif 50 | 51 | extern "C" { 52 | #include 53 | #include 54 | } 55 | 56 | #ifndef FCONE 57 | # define FCONE 58 | #endif 59 | 60 | using namespace std; 61 | 62 | template 63 | TSNE::TSNE(double Perplexity, double Theta, bool Verbose, int Max_iter, bool Init, int Stop_lying_iter, 64 | int Mom_switch_iter, double Momentum, double Final_momentum, double Eta, double Exaggeration_factor, int Num_threads) : 65 | perplexity(Perplexity), theta(Theta), momentum(Momentum), final_momentum(Final_momentum), eta(Eta), exaggeration_factor(Exaggeration_factor), 66 | max_iter(Max_iter), stop_lying_iter(Stop_lying_iter), mom_switch_iter(Mom_switch_iter), num_threads(Num_threads), 67 | verbose(Verbose), init(Init), exact(theta==.0) { 68 | 69 | #ifdef _OPENMP 70 | int threads = num_threads; 71 | if (num_threads==0) { 72 | threads = omp_get_max_threads(); 73 | } 74 | 75 | // Print notice whether OpenMP is used 76 | if (verbose) 
Rprintf("OpenMP is working. %d threads.\n", threads); 77 | #endif 78 | 79 | return; 80 | } 81 | 82 | // Perform t-SNE 83 | template 84 | void TSNE::run(double* X, unsigned int N, int D, double* Y, bool distance_precomputed, double* cost, double* itercost) { 85 | if(N - 1 < 3 * perplexity) { Rcpp::stop("Perplexity too large for the number of data points!\n"); } 86 | if (verbose) Rprintf("Using no_dims = %d, perplexity = %f, and theta = %f\n", NDims, perplexity, theta); 87 | if (verbose) Rprintf("Computing input similarities...\n"); 88 | clock_t start = clock(); 89 | 90 | // Compute input similarities for exact t-SNE 91 | if(exact) { 92 | // Compute similarities 93 | computeGaussianPerplexity(X, N, D, distance_precomputed); 94 | 95 | // Symmetrize input similarities 96 | if (verbose) Rprintf("Symmetrizing...\n"); 97 | for(unsigned long n = 0; n < N; n++) { 98 | for(unsigned long m = n + 1; m < N; m++) { 99 | P[n * N + m] += P[m * N + n]; 100 | P[m * N + n] = P[n * N + m]; 101 | } 102 | } 103 | 104 | double sum_P = 0; 105 | for(size_t i = 0; i < P.size(); i++) sum_P += P[i]; 106 | for(size_t i = 0; i < P.size(); i++) P[i] /= sum_P; 107 | } 108 | 109 | // Compute input similarities for approximate t-SNE 110 | else { 111 | int K=3*perplexity; 112 | 113 | // Compute asymmetric pairwise input similarities 114 | if (distance_precomputed) { 115 | computeGaussianPerplexity(X, N, D, K); 116 | } else { 117 | computeGaussianPerplexity(X, N, D, K); 118 | } 119 | 120 | // Symmetrize input similarities 121 | symmetrizeMatrix(N); 122 | double sum_P = .0; 123 | for(unsigned int i = 0; i < row_P[N]; i++) sum_P += val_P[i]; 124 | for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P; 125 | } 126 | 127 | if (verbose) { 128 | clock_t end = clock(); 129 | if(exact) Rprintf("Done in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC); 130 | else Rprintf("Done in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / 
CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N)); 131 | } 132 | 133 | trainIterations(N, Y, cost, itercost); 134 | return; 135 | } 136 | 137 | // Perform t-SNE with nearest neighbor results. 138 | template 139 | void TSNE::run(const int* nn_index, const double* nn_dist, unsigned int N, int K, double* Y, double* cost, double* itercost) { 140 | if(N - 1 < 3 * perplexity) { Rcpp::stop("Perplexity too large for the number of data points!\n"); } 141 | if (verbose) Rprintf("Using no_dims = %d, perplexity = %f, and theta = %f\n", NDims, perplexity, theta); 142 | if (verbose) Rprintf("Computing input similarities...\n"); 143 | clock_t start = clock(); 144 | 145 | // Compute asymmetric pairwise input similarities 146 | computeGaussianPerplexity(nn_index, nn_dist, N, K); 147 | 148 | // Symmetrize input similarities 149 | symmetrizeMatrix(N); 150 | double sum_P = .0; 151 | for(unsigned int i = 0; i < row_P[N]; i++) sum_P += val_P[i]; 152 | for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P; 153 | 154 | if (verbose) { 155 | clock_t end = clock(); 156 | if(exact) Rprintf("Done in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC); 157 | else Rprintf("Done in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N)); 158 | } 159 | 160 | trainIterations(N, Y, cost, itercost); 161 | return; 162 | } 163 | 164 | // Perform main training loop 165 | template 166 | void TSNE::trainIterations(unsigned int N, double* Y, double* cost, double* itercost) { 167 | // Allocate some memory 168 | double* dY = (double*) malloc(N * NDims * sizeof(double)); 169 | double* uY = (double*) malloc(N * NDims * sizeof(double)); 170 | double* gains = (double*) malloc(N * NDims * sizeof(double)); 171 | if(dY == NULL || uY == NULL || gains == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 172 | for(unsigned int i = 0; i < N * NDims; i++) uY[i] = .0; 173 | 
for(unsigned int i = 0; i < N * NDims; i++) gains[i] = 1.0; 174 | 175 | // Lie about the P-values 176 | if(exact) { for(unsigned long i = 0; i < (unsigned long)N * N; i++) P[i] *= exaggeration_factor; } 177 | else { for(unsigned long i = 0; i < row_P[N]; i++) val_P[i] *= exaggeration_factor; } 178 | 179 | // Initialize solution (randomly), if not already done 180 | if (!init) { for(unsigned int i = 0; i < N * NDims; i++) Y[i] = randn() * .0001; } 181 | 182 | clock_t start = clock(), end; 183 | float total_time=0; 184 | int costi = 0; //iterator for saving the total costs for the iterations 185 | 186 | for(int iter = 0; iter < max_iter; iter++) { 187 | 188 | // Stop lying about the P-values after a while, and switch momentum 189 | if(iter == stop_lying_iter) { 190 | if(exact) { for(unsigned long i = 0; i < (unsigned long)N * N; i++) P[i] /= exaggeration_factor; } 191 | else { for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= exaggeration_factor; } 192 | } 193 | if(iter == mom_switch_iter) momentum = final_momentum; 194 | 195 | // Compute (approximate) gradient 196 | if(exact) computeExactGradient(P.data(), Y, N, NDims, dY); 197 | else computeGradient(P.data(), row_P.data(), col_P.data(), val_P.data(), Y, N, NDims, dY, theta); 198 | 199 | // Update gains 200 | for(unsigned int i = 0; i < N * NDims; i++) gains[i] = (sign_tsne(dY[i]) != sign_tsne(uY[i])) ? 
(gains[i] + .2) : (gains[i] * .8); 201 | for(unsigned int i = 0; i < N * NDims; i++) if(gains[i] < .01) gains[i] = .01; 202 | 203 | // Perform gradient update (with momentum and gains) 204 | for(unsigned int i = 0; i < N * NDims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i]; 205 | for(unsigned int i = 0; i < N * NDims; i++) Y[i] = Y[i] + uY[i]; 206 | 207 | // Make solution zero-mean 208 | zeroMean(Y, N, NDims); 209 | 210 | // Print out progress 211 | if((iter > 0 && (iter+1) % 50 == 0) || iter == max_iter - 1) { 212 | end = clock(); 213 | double C = .0; 214 | if(exact) C = evaluateError(P.data(), Y, N, NDims); 215 | else C = evaluateError(row_P.data(), col_P.data(), val_P.data(), Y, N, NDims, theta); // doing approximate computation here! 216 | if(iter == 0) { 217 | if (verbose) Rprintf("Iteration %d: error is %f\n", iter + 1, C); 218 | } 219 | else { 220 | total_time += (float) (end - start) / CLOCKS_PER_SEC; 221 | if (verbose) Rprintf("Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter+1, C, (float) (end - start) / CLOCKS_PER_SEC); 222 | } 223 | itercost[costi] = C; 224 | costi++; 225 | start = clock(); 226 | } 227 | } 228 | end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC; 229 | 230 | if(exact) getCost(P.data(), Y, N, NDims, cost); 231 | else getCost(row_P.data(), col_P.data(), val_P.data(), Y, N, NDims, theta, cost); // doing approximate computation here! 
232 | 233 | // Clean up memory 234 | free(dY); 235 | free(uY); 236 | free(gains); 237 | if (verbose) Rprintf("Fitting performed in %4.2f seconds.\n", total_time); 238 | return; 239 | } 240 | 241 | // Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm) 242 | template 243 | void TSNE::computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, unsigned int N, int D, double* dC, double theta) 244 | { 245 | // Construct space-partitioning tree on current map 246 | SPTree* tree = new SPTree(Y, N); 247 | 248 | // Compute all terms required for t-SNE gradient 249 | double* pos_f = (double*) calloc(N * D, sizeof(double)); 250 | double* neg_f = (double*) calloc(N * D, sizeof(double)); 251 | if(pos_f == NULL || neg_f == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 252 | tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f, num_threads); 253 | 254 | // Storing the output to sum in single-threaded mode; avoid randomness in rounding errors. 
255 | std::vector output(N); 256 | 257 | #pragma omp parallel for schedule(guided) num_threads(num_threads) 258 | for (unsigned int n = 0; n < N; n++) { 259 | output[n]=tree->computeNonEdgeForces(n, theta, neg_f + n * D); 260 | } 261 | 262 | double sum_Q = .0; 263 | for (unsigned int n=0; n 279 | void TSNE::computeExactGradient(double* P, double* Y, unsigned int N, int D, double* dC) { 280 | 281 | // Make sure the current gradient contains zeros 282 | for(unsigned int i = 0; i < N * D; i++) dC[i] = 0.0; 283 | 284 | // Compute the squared Euclidean distance matrix 285 | double* DD = (double*) malloc((unsigned long)N * N * sizeof(double)); 286 | if(DD == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 287 | computeSquaredEuclideanDistance(Y, N, D, DD); 288 | 289 | // Compute Q-matrix and normalization sum 290 | double* Q = (double*) malloc((unsigned long)N * N * sizeof(double)); 291 | if(Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 292 | double sum_Q = .0; 293 | for(unsigned long n = 0; n < N; n++) { 294 | for(unsigned long m = 0; m < N; m++) { 295 | if(n != m) { 296 | Q[n * N + m] = 1 / (1 + DD[n * N + m]); 297 | sum_Q += Q[n * N + m]; 298 | } 299 | } 300 | } 301 | 302 | // Perform the computation of the gradient 303 | for(unsigned long n = 0; n < N; n++) { 304 | for(unsigned long m = 0; m < N; m++) { 305 | if(n != m) { 306 | double mult = (P[n * N + m] - (Q[n * N + m] / sum_Q)) * Q[n * N + m]; 307 | for(int d = 0; d < D; d++) { 308 | dC[n * D + d] += (Y[n * D + d] - Y[m * D + d]) * mult; 309 | } 310 | } 311 | } 312 | } 313 | 314 | // Free memory 315 | free(DD); DD = NULL; 316 | free(Q); Q = NULL; 317 | } 318 | 319 | 320 | // Evaluate t-SNE cost function (exactly) 321 | template 322 | double TSNE::evaluateError(double* P, double* Y, unsigned int N, int D) { 323 | 324 | // Compute the squared Euclidean distance matrix 325 | double* DD = (double*) malloc((unsigned long)N * N * sizeof(double)); 326 | double* Q = (double*) malloc((unsigned long)N * N * 
sizeof(double)); 327 | if(DD == NULL || Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 328 | computeSquaredEuclideanDistance(Y, N, D, DD); 329 | 330 | // Compute Q-matrix and normalization sum 331 | double sum_Q = DBL_MIN; 332 | for(unsigned long n = 0; n < N; n++) { 333 | for(unsigned long m = 0; m < N; m++) { 334 | if(n != m) { 335 | Q[n * N + m] = 1 / (1 + DD[n * N + m]); 336 | sum_Q += Q[n * N + m]; 337 | } 338 | else Q[n * N + m] = DBL_MIN; 339 | } 340 | } 341 | for(unsigned long i = 0; i < (unsigned long)N * N; i++) Q[i] /= sum_Q; 342 | 343 | // Sum t-SNE error 344 | double C = .0; 345 | for(unsigned long n = 0; n < N; n++) { 346 | for(unsigned long m = 0; m < N; m++) { 347 | C += P[n * N + m] * log((P[n * N + m] + 1e-9) / (Q[n * N + m] + 1e-9)); 348 | } 349 | } 350 | 351 | // Clean up memory 352 | free(DD); 353 | free(Q); 354 | return C; 355 | } 356 | 357 | // Evaluate t-SNE cost function (approximately) 358 | template 359 | double TSNE::evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta) 360 | { 361 | 362 | // Get estimate of normalization term 363 | SPTree* tree = new SPTree(Y, N); 364 | double* buff = (double*) calloc(D, sizeof(double)); 365 | double sum_Q = .0; 366 | for(unsigned int n = 0; n < N; n++) sum_Q += tree->computeNonEdgeForces(n, theta, buff); 367 | 368 | // Loop over all edges to compute t-SNE error 369 | int ind1, ind2; 370 | double C = .0, Q; 371 | for(unsigned int n = 0; n < N; n++) { 372 | ind1 = n * D; 373 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { 374 | Q = .0; 375 | ind2 = col_P[i] * D; 376 | for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d]; 377 | for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d]; 378 | for(int d = 0; d < D; d++) Q += buff[d] * buff[d]; 379 | Q = (1.0 / (1.0 + Q)) / sum_Q; 380 | C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN)); 381 | } 382 | } 383 | 384 | // Clean up memory 385 | free(buff); 386 | delete tree; 387 | 
return C; 388 | } 389 | 390 | // Evaluate t-SNE cost function (exactly) 391 | template 392 | void TSNE::getCost(double* P, double* Y, unsigned int N, int D, double* costs) { 393 | 394 | // Compute the squared Euclidean distance matrix 395 | double* DD = (double*) malloc((unsigned long)N * N * sizeof(double)); 396 | double* Q = (double*) malloc((unsigned long)N * N * sizeof(double)); 397 | if(DD == NULL || Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 398 | computeSquaredEuclideanDistance(Y, N, D, DD); 399 | 400 | // Compute Q-matrix and normalization sum 401 | double sum_Q = DBL_MIN; 402 | for(unsigned long n = 0; n < N; n++) { 403 | for(unsigned long m = 0; m < N; m++) { 404 | if(n != m) { 405 | Q[n * N + m] = 1 / (1 + DD[n * N + m]); 406 | sum_Q += Q[n * N + m]; 407 | } 408 | else Q[n * N + m] = DBL_MIN; 409 | } 410 | } 411 | for(unsigned long i = 0; i < (unsigned long)N * N; i++) Q[i] /= sum_Q; 412 | 413 | // Sum t-SNE error 414 | for(unsigned long n = 0; n < N; n++) { 415 | costs[n] = 0.0; 416 | for(unsigned long m = 0; m < N; m++) { 417 | costs[n] += P[n * N + m] * log((P[n * N + m] + 1e-9) / (Q[n * N + m] + 1e-9)); 418 | } 419 | } 420 | 421 | // Clean up memory 422 | free(DD); 423 | free(Q); 424 | } 425 | 426 | // Evaluate t-SNE cost function (approximately) 427 | template 428 | void TSNE::getCost(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta, double* costs) 429 | { 430 | 431 | // Get estimate of normalization term 432 | SPTree* tree = new SPTree(Y, N); 433 | double* buff = (double*) calloc(D, sizeof(double)); 434 | double sum_Q = .0; 435 | for(unsigned int n = 0; n < N; n++) sum_Q += tree->computeNonEdgeForces(n, theta, buff); 436 | 437 | // Loop over all edges to compute t-SNE error 438 | int ind1, ind2; 439 | double Q; 440 | for(unsigned int n = 0; n < N; n++) { 441 | ind1 = n * D; 442 | costs[n] = 0.0; 443 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { 444 | Q = .0; 445 | 
ind2 = col_P[i] * D; 446 | for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d]; 447 | for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d]; 448 | for(int d = 0; d < D; d++) Q += buff[d] * buff[d]; 449 | Q = (1.0 / (1.0 + Q)) / sum_Q; 450 | costs[n] += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN)); 451 | } 452 | } 453 | 454 | // Clean up memory 455 | free(buff); 456 | delete tree; 457 | } 458 | 459 | 460 | // Compute input similarities with a fixed perplexity 461 | template 462 | void TSNE::computeGaussianPerplexity(double* X, unsigned int N, int D, bool distance_precomputed) { 463 | size_t N2=N; 464 | N2*=N; 465 | P.resize(N2); 466 | 467 | // Compute the squared Euclidean distance matrix 468 | double* DD = (double*) malloc(N2 * sizeof(double)); 469 | if(DD == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 470 | 471 | if (distance_precomputed) { 472 | DD = X; 473 | } else { 474 | computeSquaredEuclideanDistanceDirect(X, N, D, DD); 475 | 476 | // Needed to make sure the results are exactly equal to distance calculation in R 477 | for (size_t n=0; n 0) { 515 | min_beta = beta; 516 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 517 | beta *= 2.0; 518 | else 519 | beta = (beta + max_beta) / 2.0; 520 | } 521 | else { 522 | max_beta = beta; 523 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX) 524 | beta /= 2.0; 525 | else 526 | beta = (beta + min_beta) / 2.0; 527 | } 528 | } 529 | 530 | // Update iteration counter 531 | iter++; 532 | } 533 | 534 | // Row normalize P 535 | for(unsigned long m = 0; m < N; m++) P[n * N + m] /= sum_P; 536 | } 537 | 538 | // Clean up memory 539 | if (!distance_precomputed) { free(DD); } 540 | DD = NULL; 541 | } 542 | 543 | 544 | // Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free) 545 | template 546 | template 547 | void TSNE::computeGaussianPerplexity(double* X, unsigned int N, int D, int K) { 548 | 549 | if(perplexity > K) Rprintf("Perplexity should be 
lower than K!\n"); 550 | 551 | // Allocate the memory we need 552 | setupApproximateMemory(N, K); 553 | 554 | // Build ball tree on data set 555 | VpTree* tree = new VpTree(); 556 | vector obj_X(N, DataPoint(D, -1, X)); 557 | for(unsigned int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D); 558 | tree->create(obj_X); 559 | 560 | // Loop over all points to find nearest neighbors 561 | if (verbose) Rprintf("Building tree...\n"); 562 | 563 | int steps_completed = 0; 564 | #pragma omp parallel for schedule(guided) num_threads(num_threads) 565 | for(unsigned int n = 0; n < N; n++) { 566 | 567 | vector indices; 568 | vector distances; 569 | indices.reserve(K+1); 570 | distances.reserve(K+1); 571 | 572 | // Find nearest neighbors 573 | tree->search(obj_X[n], K + 1, &indices, &distances); 574 | 575 | double * cur_P = val_P.data() + row_P[n]; 576 | computeProbabilities(perplexity, K, distances.data()+1, cur_P); // +1 to avoid self. 577 | 578 | unsigned int * cur_col_P = col_P.data() + row_P[n]; 579 | for (int m=0; m 598 | void TSNE::computeGaussianPerplexity(const int* nn_idx, const double* nn_dist, unsigned int N, int K) { 599 | 600 | if(perplexity > K) Rprintf("Perplexity should be lower than K!\n"); 601 | 602 | // Allocate the memory we need 603 | setupApproximateMemory(N, K); 604 | 605 | // Loop over all points to find nearest neighbors 606 | int steps_completed = 0; 607 | #pragma omp parallel for schedule(guided) num_threads(num_threads) 608 | for(unsigned int n = 0; n < N; n++) { 609 | double * cur_P = val_P.data() + row_P[n]; 610 | computeProbabilities(perplexity, K, nn_dist + row_P[n], cur_P); 611 | 612 | const int * cur_idx = nn_idx + row_P[n]; 613 | unsigned int * cur_col_P = col_P.data() + row_P[n]; 614 | for (int m=0; m 628 | void TSNE::setupApproximateMemory(unsigned int N, int K) { 629 | row_P.resize(N+1); 630 | col_P.resize(N*K); 631 | val_P.resize(N*K); 632 | row_P[0] = 0; 633 | for(unsigned int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + K; 634 | 
return; 635 | } 636 | 637 | template 638 | void TSNE::computeProbabilities (const double perplexity, const int K, const double* distances, double* cur_P) { 639 | 640 | // Initialize some variables for binary search 641 | bool found = false; 642 | double beta = 1.0; 643 | double min_beta = -DBL_MAX; 644 | double max_beta = DBL_MAX; 645 | double tol = 1e-5; 646 | 647 | // Iterate until we found a good perplexity 648 | int iter = 0; double sum_P; 649 | while(!found && iter < 200) { 650 | 651 | // Compute Gaussian kernel row 652 | for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m] * distances[m]); 653 | 654 | // Compute entropy of current row 655 | sum_P = DBL_MIN; 656 | for(int m = 0; m < K; m++) sum_P += cur_P[m]; 657 | double H = .0; 658 | for(int m = 0; m < K; m++) H += beta * (distances[m] * distances[m] * cur_P[m]); 659 | H = (H / sum_P) + log(sum_P); 660 | 661 | // Evaluate whether the entropy is within the tolerance level 662 | double Hdiff = H - log(perplexity); 663 | if(Hdiff < tol && -Hdiff < tol) { 664 | found = true; 665 | } 666 | else { 667 | if(Hdiff > 0) { 668 | min_beta = beta; 669 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX) 670 | beta *= 2.0; 671 | else 672 | beta = (beta + max_beta) / 2.0; 673 | } 674 | else { 675 | max_beta = beta; 676 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX) 677 | beta /= 2.0; 678 | else 679 | beta = (beta + min_beta) / 2.0; 680 | } 681 | } 682 | 683 | // Update iteration counter 684 | iter++; 685 | } 686 | 687 | // Row-normalize current row of P. 
688 | for(int m = 0; m < K; m++) { 689 | cur_P[m] /= sum_P; 690 | } 691 | return; 692 | } 693 | 694 | template 695 | void TSNE::symmetrizeMatrix(unsigned int N) { 696 | // Count number of elements and row counts of symmetric matrix 697 | int* row_counts = (int*) calloc(N, sizeof(int)); 698 | if(row_counts == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 699 | for(unsigned int n = 0; n < N; n++) { 700 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { 701 | 702 | // Check whether element (col_P[i], n) is present 703 | bool present = false; 704 | for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { 705 | if(col_P[m] == n) present = true; 706 | } 707 | if(present) row_counts[n]++; 708 | else { 709 | row_counts[n]++; 710 | row_counts[col_P[i]]++; 711 | } 712 | } 713 | } 714 | int no_elem = 0; 715 | for(unsigned int n = 0; n < N; n++) no_elem += row_counts[n]; 716 | 717 | // Allocate memory for symmetrized matrix 718 | std::vector sym_row_P(N+1), sym_col_P(no_elem); 719 | std::vector sym_val_P(no_elem); 720 | 721 | // Construct new row indices for symmetric matrix 722 | sym_row_P[0] = 0; 723 | for(unsigned int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + row_counts[n]; 724 | 725 | // Fill the result matrix 726 | int* offset = (int*) calloc(N, sizeof(int)); 727 | if(offset == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 728 | for(unsigned int n = 0; n < N; n++) { 729 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { // considering element(n, col_P[i]) 730 | 731 | // Check whether element (col_P[i], n) is present 732 | bool present = false; 733 | for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { 734 | if(col_P[m] == n) { 735 | present = true; 736 | if(n <= col_P[i]) { // make sure we do not add elements twice 737 | sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; 738 | sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; 739 | sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m]; 740 | 
sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i] + val_P[m]; 741 | } 742 | } 743 | } 744 | 745 | // If (col_P[i], n) is not present, there is no addition involved 746 | if(!present) { 747 | sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; 748 | sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; 749 | sym_val_P[sym_row_P[n] + offset[n]] = val_P[i]; 750 | sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i]; 751 | } 752 | 753 | // Update offsets 754 | if(!present || (present && n <= col_P[i])) { 755 | offset[n]++; 756 | if(col_P[i] != n) offset[col_P[i]]++; 757 | } 758 | } 759 | } 760 | 761 | // Divide the result by two 762 | for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0; 763 | 764 | // Return symmetrized matrices 765 | row_P.swap(sym_row_P); 766 | col_P.swap(sym_col_P); 767 | val_P.swap(sym_val_P); 768 | 769 | // Free up some memery 770 | free(offset); offset = NULL; 771 | free(row_counts); row_counts = NULL; 772 | } 773 | 774 | // Compute squared Euclidean distance matrix (using BLAS) 775 | template 776 | void TSNE::computeSquaredEuclideanDistance(double* X, unsigned int N, int D, double* DD) { 777 | double* dataSums = (double*) calloc(N, sizeof(double)); 778 | if(dataSums == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 779 | for(unsigned int n = 0; n < N; n++) { 780 | for(int d = 0; d < D; d++) { 781 | dataSums[n] += (X[n * D + d] * X[n * D + d]); 782 | } 783 | } 784 | for(unsigned long n = 0; n < N; n++) { 785 | for(unsigned long m = 0; m < N; m++) { 786 | DD[n * N + m] = dataSums[n] + dataSums[m]; 787 | } 788 | } 789 | double a1 = -2.0; 790 | double a2 = 1.0; 791 | int Nsigned = N; 792 | dgemm_("T", "N", &Nsigned, &Nsigned, &D, &a1, X, &D, X, &D, &a2, DD, &Nsigned FCONE FCONE); 793 | free(dataSums); dataSums = NULL; 794 | } 795 | 796 | // Compute squared Euclidean distance matrix (No BLAS) 797 | template 798 | void TSNE::computeSquaredEuclideanDistanceDirect(double* X, unsigned int N, int D, double* DD) { 799 | const double* 
XnD = X; 800 | for(unsigned int n = 0; n < N; ++n, XnD += D) { 801 | const double* XmD = XnD + D; 802 | double* curr_elem = &DD[n*N + n]; 803 | *curr_elem = 0.0; 804 | double* curr_elem_sym = curr_elem + N; 805 | for(unsigned int m = n + 1; m < N; ++m, XmD+=D, curr_elem_sym+=N) { 806 | *(++curr_elem) = 0.0; 807 | for(int d = 0; d < D; ++d) { 808 | *curr_elem += (XnD[d] - XmD[d]) * (XnD[d] - XmD[d]); 809 | } 810 | *curr_elem_sym = *curr_elem; 811 | } 812 | } 813 | } 814 | 815 | 816 | // Makes data zero-mean 817 | template 818 | void TSNE::zeroMean(double* X, unsigned int N, int D) { 819 | 820 | // Compute data mean 821 | double* mean = (double*) calloc(D, sizeof(double)); 822 | if(mean == NULL) { Rcpp::stop("Memory allocation failed!\n"); } 823 | int nD = 0; 824 | for(unsigned int n = 0; n < N; n++) { 825 | for(int d = 0; d < D; d++) { 826 | mean[d] += X[nD + d]; 827 | } 828 | nD += D; 829 | } 830 | for(int d = 0; d < D; d++) { 831 | mean[d] /= (double) N; 832 | } 833 | 834 | // Subtract data mean 835 | nD = 0; 836 | for(unsigned int n = 0; n < N; n++) { 837 | for(int d = 0; d < D; d++) { 838 | X[nD + d] -= mean[d]; 839 | } 840 | nD += D; 841 | } 842 | free(mean); mean = NULL; 843 | } 844 | 845 | // Generates a Gaussian random number 846 | template 847 | double TSNE::randn() { 848 | Rcpp::RNGScope scope; 849 | double x, y, radius; 850 | do { 851 | x = 2 * (double)R::runif(0,1) - 1; 852 | y = 2 * (double)R::runif(0,1) - 1; 853 | radius = (x * x) + (y * y); 854 | } while((radius >= 1.0) || (radius == 0.0)); 855 | radius = sqrt(-2 * log(radius) / radius); 856 | x *= radius; 857 | y *= radius; 858 | return x; 859 | } 860 | 861 | // declare templates explicitly 862 | template class TSNE<1>; 863 | template class TSNE<2>; 864 | template class TSNE<3>; 865 | --------------------------------------------------------------------------------