├── .github
├── .gitignore
└── workflows
│ ├── R-CMD-check.yaml
│ └── test-coverage.yaml
├── tests
├── testthat.R
└── testthat
│ ├── test_neighbors.R
│ └── test_Rtsne.R
├── tools
└── example-1.png
├── .gitignore
├── .Rbuildignore
├── cran-comments.md
├── .travis.yml
├── src
├── Makevars.win
├── Makevars
├── normalize_input.cpp
├── datapoint.h
├── tsne.h
├── sptree.h
├── Rtsne.cpp
├── RcppExports.cpp
├── vptree.h
├── sptree.cpp
└── tsne.cpp
├── NAMESPACE
├── NEWS.md
├── DESCRIPTION
├── R
├── RcppExports.R
├── neighbors.R
├── utils.R
└── Rtsne.R
├── man
├── normalize_input.Rd
└── Rtsne.Rd
├── inst
└── CITATION
├── README.md
├── README.Rmd
└── LICENSE
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | test_check("Rtsne")
--------------------------------------------------------------------------------
/tools/example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkrijthe/Rtsne/HEAD/tools/example-1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | *.o
3 |
4 | *.Rproj
5 |
6 | *.so
7 |
8 |
9 | .Rhistory
10 | .Rproj.user
11 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^CRAN-RELEASE$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^\.travis\.yml$
5 | ^README\.Rmd$
6 | ^README-.*\.png$
7 | ^cran-comments\.md$
8 | ^\.github$
9 | ^CRAN-SUBMISSION$
10 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | The main change is the use of USE_FC_LEN_T for calling Fortran BLAS to prepare for R 4.2.0, along with some small documentation and typo fixes.
2 |
3 | ## Test environments
4 | * local OS X install
5 | * win-builder (devel and release)
6 | * ubuntu linux (devel, release and oldrelease) on Github Actions
7 |
8 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | sudo: false
3 | cache: packages
4 |
5 | matrix:
6 | include:
7 | - r: release
8 | r_github_packages: r-lib/covr
9 | after_success: Rscript -e 'covr::codecov()'
10 | - r: devel
11 | - r: oldrel
12 |
13 |
14 | notifications:
15 | email:
16 | on_success: change
17 | on_failure: change
--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 |
2 | ## Use the R_HOME indirection to support installations of multiple R version
3 | ## PKG_LIBS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
4 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS)
5 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(Rtsne,data.frame)
4 | S3method(Rtsne,default)
5 | S3method(Rtsne,dist)
6 | export(Rtsne)
7 | export(Rtsne_neighbors)
8 | export(normalize_input)
9 | import(Rcpp)
10 | importFrom(stats,model.matrix)
11 | importFrom(stats,na.fail)
12 | importFrom(stats,prcomp)
13 | useDynLib(Rtsne, .registration = TRUE)
14 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # Rtsne 0.15
2 | * Substantial speed increase by fixing the possible embedding dimensionalities to 1, 2 or 3
3 | * Clarification of the licensing of different parts of the package
4 | * Support for using, more efficient, partial PCA (by Daniel Wells)
5 | * Made the normalization optional and the normalization function used available (by Aaron Lun)
6 | * Support for using precomputed nearest neighbour matrices (by Aaron Lun)
7 | * Added OpenMP support
8 | * Default verbose value is now the global setting (by Richard Cotton)
9 | * Added a `NEWS.md` file to track changes to the package.
10 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: Rtsne
2 | Type: Package
3 | Title: T-Distributed Stochastic Neighbor Embedding using a Barnes-Hut Implementation
4 | Version: 0.17
5 | Authors@R: c(
6 | person("Jesse", "Krijthe", ,"jkrijthe@gmail.com", role = c("aut", "cre")),
7 | person("Laurens", "van der Maaten", role = c("cph"), comment = "Author of original C++ code")
8 | )
9 | Description: An R wrapper around the fast T-distributed Stochastic
10 |     Neighbor Embedding implementation by Van der Maaten (see <https://github.com/lvdmaaten/bhtsne/> for more information on the original implementation).
11 | License: file LICENSE
12 | URL: https://github.com/jkrijthe/Rtsne
13 | Encoding: UTF-8
14 | Imports:
15 | Rcpp (>= 0.11.0),
16 | stats
17 | LinkingTo: Rcpp
18 | Suggests:
19 | irlba,
20 | testthat
21 | RoxygenNote: 7.2.3
22 |
--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | Rtsne_cpp <- function(X, no_dims, perplexity, theta, verbose, max_iter, distance_precomputed, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads) {
5 | .Call(`_Rtsne_Rtsne_cpp`, X, no_dims, perplexity, theta, verbose, max_iter, distance_precomputed, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads)
6 | }
7 |
8 | Rtsne_nn_cpp <- function(nn_dex, nn_dist, no_dims, perplexity, theta, verbose, max_iter, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads) {
9 | .Call(`_Rtsne_Rtsne_nn_cpp`, nn_dex, nn_dist, no_dims, perplexity, theta, verbose, max_iter, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads)
10 | }
11 |
12 | normalize_input_cpp <- function(input) {
13 | .Call(`_Rtsne_normalize_input_cpp`, input)
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | ## Use the R_HOME indirection to support installations of multiple R versions
2 | ## PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
3 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS)
4 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
5 |
6 | ## As an alternative, one can also add this code in a file 'configure'
7 | ##
8 | ## PKG_LIBS=`${R_HOME}/bin/Rscript -e "Rcpp:::LdFlags()"`
9 | ##
10 | ## sed -e "s|@PKG_LIBS@|${PKG_LIBS}|" \
11 | ## src/Makevars.in > src/Makevars
12 | ##
13 | ## which together with the following file 'src/Makevars.in'
14 | ##
15 | ## PKG_LIBS = @PKG_LIBS@
16 | ##
17 | ## can be used to create src/Makevars dynamically. This scheme is more
18 | ## powerful and can be expanded to also check for and link with other
19 | ## libraries. It should be complemented by a file 'cleanup'
20 | ##
21 | ## rm src/Makevars
22 | ##
23 | ## which removes the autogenerated file src/Makevars.
24 | ##
25 | ## Of course, autoconf can also be used to write configure files. This is
26 | ## done by a number of packages, but recommended only for more advanced users
27 | ## comfortable with autoconf and its related tools.
28 |
29 |
30 |
--------------------------------------------------------------------------------
/src/normalize_input.cpp:
--------------------------------------------------------------------------------
1 | #include "Rcpp.h"
2 | #include <cmath>
3 |
4 | // Function that performs the matrix normalization.
5 | // [[Rcpp::export]]
6 | Rcpp::NumericMatrix normalize_input_cpp(Rcpp::NumericMatrix input) {
7 | // Rows are observations, columns are variables.
8 | Rcpp::NumericMatrix output=Rcpp::clone(input);
9 | const int N=output.nrow(), D=output.ncol();
10 |
11 | // Running through each column and centering it.
12 | Rcpp::NumericMatrix::iterator oIt=output.begin();
13 |   // NOTE(review): lines 13-31 of this file were garbled by angle-bracket
14 |   // stripping during extraction; reconstructed below -- verify against the
15 |   // upstream Rtsne sources.
16 |   for (int d=0; d<D; ++d) {
17 |     double mean=0;
18 |     for (int n=0; n<N; ++n, ++oIt) {
19 |       mean += *oIt;
20 |     }
21 |     mean /= N;
22 |     oIt -= N;
23 |     for (int n=0; n<N; ++n, ++oIt) {
24 |       *oIt -= mean;
25 |     }
26 |   }
27 |
28 |   // Finding the maximum absolute value so the whole matrix can be scaled
29 |   // to a maximum deviation of 1.
30 |   double max_X = 0;
31 |   for (oIt=output.begin(); oIt!=output.end(); ++oIt) { double tmp = fabs(*oIt); if (tmp > max_X) max_X = tmp;
32 | }
33 | for (oIt=output.begin(); oIt!=output.end(); ++oIt) {
34 | *oIt /= max_X;
35 | }
36 | return output;
37 | }
38 |
--------------------------------------------------------------------------------
/man/normalize_input.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{normalize_input}
4 | \alias{normalize_input}
5 | \title{Normalize input data matrix}
6 | \usage{
7 | normalize_input(X)
8 | }
9 | \arguments{
10 | \item{X}{matrix; Input data matrix with rows as observations and columns as variables/dimensions.}
11 | }
12 | \value{
13 | A numeric matrix of the same dimensions as \code{X} but centred by column and scaled to have a maximum deviation of 1.
14 | }
15 | \description{
16 | Mean centers each column of an input data matrix so that it has a mean of zero.
17 | Scales the entire matrix so that the largest absolute value of the centered matrix is equal to unity.
18 | }
19 | \details{
20 | Normalization avoids numerical problems when the coordinates (and thus the distances between observations) are very large.
21 | Directly computing distances on this scale may lead to underflow when computing the probabilities in the t-SNE algorithm.
22 | Rescaling the input values mitigates these problems to some extent.
23 | }
24 | \examples{
25 | iris_unique <- unique(iris) # Remove duplicates
26 | iris_matrix <- as.matrix(iris_unique[,1:4])
27 | X <- normalize_input(iris_matrix)
28 | colMeans(X)
29 | range(X)
30 | }
31 | \author{
32 | Aaron Lun
33 | }
34 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | bibentry(
2 | bibtype="Manual",
3 | title = "{Rtsne}: T-Distributed Stochastic Neighbor Embedding using Barnes-Hut
4 | Implementation",
5 | author = "Jesse H. Krijthe",
6 | year = "2015",
7 | note = sprintf("R package version %s", meta$Version),
8 | url = "https://github.com/jkrijthe/Rtsne",
9 | textVersion="Jesse H. Krijthe (2015). Rtsne: T-Distributed Stochastic Neighbor Embedding using a Barnes-Hut Implementation, URL: https://github.com/jkrijthe/Rtsne"
10 | )
11 |
12 | bibentry(
13 | bibtype="Article",
14 | title="Visualizing High-Dimensional Data Using t-SNE",
15 | volume= "9",
16 | pages="2579-2605",
17 | year="2008",
18 | author="L.J.P. van der Maaten and G.E. Hinton",
19 | journal="Journal of Machine Learning Research",
20 | textVersion="L.J.P. van der Maaten and G.E. Hinton. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research 9(Nov):2579-2605, 2008.")
21 |
22 | bibentry(
23 | bibtype="Article",
24 | title="Accelerating t-SNE using Tree-Based Algorithms",
25 | volume= "15",
26 | pages="3221-3245",
27 | year="2014",
28 | author="L.J.P. van der Maaten",
29 | journal="Journal of Machine Learning Research",
30 | textVersion="L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research 15(Oct):3221-3245, 2014."
31 | )
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/datapoint.h:
--------------------------------------------------------------------------------
1 | #ifndef DATAPOINT_H
2 | #define DATAPOINT_H
3 |
4 | class DataPoint
5 | {
6 | int _ind;
7 |
8 | public:
9 | double* _x;
10 | int _D;
11 | DataPoint() {
12 | _D = 1;
13 | _ind = -1;
14 | _x = NULL;
15 | }
16 | DataPoint(int D, int ind, double* x) {
17 | _D = D;
18 | _ind = ind;
19 | _x = (double*) malloc(_D * sizeof(double));
20 | for(int d = 0; d < _D; d++) _x[d] = x[d];
21 | }
22 | DataPoint(const DataPoint& other) { // this makes a deep copy -- should not free anything
23 | if(this != &other) {
24 | _D = other.dimensionality();
25 | _ind = other.index();
26 | _x = (double*) malloc(_D * sizeof(double));
27 | for(int d = 0; d < _D; d++) _x[d] = other.x(d);
28 | }
29 | }
30 | ~DataPoint() { if(_x != NULL) free(_x); }
31 | DataPoint& operator= (const DataPoint& other) { // asignment should free old object
32 | if(this != &other) {
33 | if(_x != NULL) free(_x);
34 | _D = other.dimensionality();
35 | _ind = other.index();
36 | _x = (double*) malloc(_D * sizeof(double));
37 | for(int d = 0; d < _D; d++) _x[d] = other.x(d);
38 | }
39 | return *this;
40 | }
41 | int index() const { return _ind; }
42 | int dimensionality() const { return _D; }
43 | double x(int d) const { return _x[d]; }
44 | };
45 |
46 | #endif
47 |
--------------------------------------------------------------------------------
/R/neighbors.R:
--------------------------------------------------------------------------------
1 | #' @rdname Rtsne
2 | #' @export
3 | Rtsne_neighbors <- function(index, distance, dims=2, perplexity=30, theta=0.5,
4 | max_iter=1000,verbose=getOption("verbose", FALSE),
5 | Y_init=NULL,
6 | stop_lying_iter=ifelse(is.null(Y_init),250L,0L),
7 | mom_switch_iter=ifelse(is.null(Y_init),250L,0L),
8 | momentum=0.5, final_momentum=0.8,
9 | eta=200.0, exaggeration_factor=12.0, num_threads=1, ...) {
10 |
11 | if (!is.matrix(index)) { stop("Input index is not a matrix") }
12 | if (!identical(dim(index), dim(distance))) { stop("index and distance matrices should have the same dimensions") }
13 | R <- range(index)
14 | if (any(R < 1 | R > nrow(index) | !is.finite(R))) { stop("invalid indices") }
15 | tsne.args <- .check_tsne_params(nrow(index), dims=dims, perplexity=perplexity, theta=theta, max_iter=max_iter, verbose=verbose,
16 | Y_init=Y_init, stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter,
17 | momentum=momentum, final_momentum=final_momentum, eta=eta, exaggeration_factor=exaggeration_factor)
18 |
19 | # Transposing is necessary for fast column-major access to each sample, -1 for zero-indexing.
20 | out <- do.call(Rtsne_nn_cpp, c(list(nn_dex=t(index - 1L), nn_dist=t(distance), num_threads=num_threads), tsne.args))
21 | out$Y <- t(out$Y) # Transposing back.
22 | c(list(N=nrow(index)), out, .clear_unwanted_params(tsne.args))
23 | }
24 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | name: R-CMD-check
10 | permissions: read-all
11 |
12 | jobs:
13 | R-CMD-check:
14 | runs-on: ${{ matrix.config.os }}
15 |
16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
17 |
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | config:
22 | - {os: macos-latest, r: 'release'}
23 | - {os: windows-latest, r: 'release'}
24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
25 | - {os: ubuntu-latest, r: 'release'}
26 | - {os: ubuntu-latest, r: 'oldrel-1'}
27 |
28 | env:
29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
30 | R_KEEP_PKG_SOURCE: yes
31 |
32 | steps:
33 | - uses: actions/checkout@v5
34 |
35 | - uses: r-lib/actions/setup-pandoc@v2
36 |
37 | - uses: r-lib/actions/setup-r@v2
38 | with:
39 | r-version: ${{ matrix.config.r }}
40 | http-user-agent: ${{ matrix.config.http-user-agent }}
41 | use-public-rspm: true
42 |
43 | - uses: r-lib/actions/setup-r-dependencies@v2
44 | with:
45 | extra-packages: any::rcmdcheck
46 | needs: check
47 |
48 | - uses: r-lib/actions/check-r-package@v2
49 | with:
50 | upload-snapshots: true
51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
52 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 |
8 | name: test-coverage.yaml
9 |
10 | permissions: read-all
11 |
12 | jobs:
13 | test-coverage:
14 | runs-on: ubuntu-latest
15 | env:
16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
17 |
18 | steps:
19 | - uses: actions/checkout@v5
20 |
21 | - uses: r-lib/actions/setup-r@v2
22 | with:
23 | use-public-rspm: true
24 |
25 | - uses: r-lib/actions/setup-r-dependencies@v2
26 | with:
27 | extra-packages: any::covr, any::xml2
28 | needs: coverage
29 |
30 | - name: Test coverage
31 | run: |
32 | cov <- covr::package_coverage(
33 | quiet = FALSE,
34 | clean = FALSE,
35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
36 | )
37 | print(cov)
38 | covr::to_cobertura(cov)
39 | shell: Rscript {0}
40 |
41 | - uses: codecov/codecov-action@v5
42 | with:
43 | # Fail if error if not on PR, or if on PR and token is given
44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
45 | files: ./cobertura.xml
46 | plugins: noop
47 | disable_search: true
48 | token: ${{ secrets.CODECOV_TOKEN }}
49 |
50 | - name: Show testthat output
51 | if: always()
52 | run: |
53 | ## --------------------------------------------------------------------
54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
55 | shell: bash
56 |
57 | - name: Upload test results
58 | if: failure()
59 | uses: actions/upload-artifact@v4
60 | with:
61 | name: coverage-test-failures
62 | path: ${{ runner.temp }}/package
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | [](https://cran.r-project.org/package=Rtsne/)
6 | [](https://github.com/jkrijthe/Rtsne/actions/workflows/R-CMD-check.yaml)
7 | [](https://app.codecov.io/github/jkrijthe/Rtsne?branch=master)
8 | [](https://cran.r-project.org/package=Rtsne/)
10 |
11 | # R wrapper for Van der Maaten’s Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding
12 |
13 | ## Installation
14 |
15 | To install from CRAN:
16 |
17 | ``` r
18 | install.packages("Rtsne") # Install Rtsne package from CRAN
19 | ```
20 |
21 | To install the latest version from the github repository, use:
22 |
23 | ``` r
24 | if(!require(devtools)) install.packages("devtools") # If not already installed
25 | devtools::install_github("jkrijthe/Rtsne")
26 | ```
27 |
28 | ## Usage
29 |
30 | After installing the package, use the following code to run a simple
31 | example (to install, see below).
32 |
33 | ``` r
34 | library(Rtsne) # Load package
35 | iris_unique <- unique(iris) # Remove duplicates
36 | set.seed(42) # Sets seed for reproducibility
37 | tsne_out <- Rtsne(as.matrix(iris_unique[,1:4])) # Run TSNE
38 | plot(tsne_out$Y,col=iris_unique$Species,asp=1) # Plot the result
39 | ```
40 |
41 | 
42 |
43 | # Details
44 |
45 | This R package offers a wrapper around the Barnes-Hut TSNE C++
46 | implementation of \[2\] \[3\]. Changes were made to the original code to
47 | allow it to function as an R package and to add additional functionality
48 | and speed improvements.
49 |
50 | # References
51 |
52 | \[1\] L.J.P. van der Maaten and G.E. Hinton. “Visualizing
53 | High-Dimensional Data Using t-SNE.” Journal of Machine Learning Research
54 | 9(Nov):2579-2605, 2008.
55 |
56 | \[2\] L.J.P van der Maaten. “Accelerating t-SNE using tree-based
57 | algorithms.” Journal of Machine Learning Research 15.1:3221-3245, 2014.
58 |
59 | \[3\] <https://lvdmaaten.github.io/tsne/>
60 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output:
3 | github_document:
4 | html_preview: false
5 | ---
6 |
7 |
8 |
9 | ```{r, echo = FALSE}
10 | knitr::opts_chunk$set(
11 | collapse = TRUE,
12 | comment = "#>",
13 | fig.path = "README-"
14 | )
15 | ```
16 | [](https://cran.r-project.org/package=Rtsne/)
17 | [](https://github.com/jkrijthe/Rtsne/actions/workflows/R-CMD-check.yaml)
18 | [](https://app.codecov.io/github/jkrijthe/Rtsne?branch=master)
19 | [](https://cran.r-project.org/package=Rtsne/)
20 |
21 |
22 | # R wrapper for Van der Maaten's Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding
23 |
24 | ## Installation
25 |
26 | To install from CRAN:
27 | ```{r, eval = FALSE}
28 | install.packages("Rtsne") # Install Rtsne package from CRAN
29 | ```
30 | To install the latest version from the github repository, use:
31 | ```{r, eval = FALSE}
32 | if(!require(devtools)) install.packages("devtools") # If not already installed
33 | devtools::install_github("jkrijthe/Rtsne")
34 | ```
35 |
36 |
37 | ## Usage
38 | After installing the package, use the following code to run a simple example (to install, see below).
39 | ```{r example, fig.path="tools/"}
40 | library(Rtsne) # Load package
41 | iris_unique <- unique(iris) # Remove duplicates
42 | set.seed(42) # Sets seed for reproducibility
43 | tsne_out <- Rtsne(as.matrix(iris_unique[,1:4])) # Run TSNE
44 | plot(tsne_out$Y,col=iris_unique$Species,asp=1) # Plot the result
45 | ```
46 |
47 |
48 | # Details
49 | This R package offers a wrapper around the Barnes-Hut TSNE C++ implementation of [2] [3]. Changes were made to the original code to allow it to function as an R package and to add additional functionality and speed improvements.
50 |
51 | # References
52 | [1] L.J.P. van der Maaten and G.E. Hinton. "Visualizing High-Dimensional Data Using t-SNE." Journal of Machine Learning Research 9(Nov):2579-2605, 2008.
53 |
54 | [2] L.J.P van der Maaten. "Accelerating t-SNE using tree-based algorithms." Journal of Machine Learning Research 15.1:3221-3245, 2014.
55 |
56 | [3] https://lvdmaaten.github.io/tsne/
57 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | #' Normalize input data matrix
2 | #'
3 | #' Mean centers each column of an input data matrix so that it has a mean of zero.
4 | #' Scales the entire matrix so that the largest absolute value of the centered matrix is equal to unity.
5 | #'
6 | #' @param X matrix; Input data matrix with rows as observations and columns as variables/dimensions.
7 | #'
8 | #' @details
9 | #' Normalization avoids numerical problems when the coordinates (and thus the distances between observations) are very large.
10 | #' Directly computing distances on this scale may lead to underflow when computing the probabilities in the t-SNE algorithm.
11 | #' Rescaling the input values mitigates these problems to some extent.
12 | #'
13 | #' @return A numeric matrix of the same dimensions as \code{X} but centred by column and scaled to have a maximum deviation of 1.
14 | #'
15 | #' @author
16 | #' Aaron Lun
17 | #'
18 | #' @examples
19 | #' iris_unique <- unique(iris) # Remove duplicates
20 | #' iris_matrix <- as.matrix(iris_unique[,1:4])
21 | #' X <- normalize_input(iris_matrix)
22 | #' colMeans(X)
23 | #' range(X)
24 | #' @export
25 | normalize_input <- function(X) {
26 | # Using the original C++ code from bhtsne to do mean centering, even though it would be simple to do with sweep().
27 | # This is because R's sums are more numerically precise, so for consistency with the original code, we need to use the naive C++ version.
28 | normalize_input_cpp(X)
29 | }
30 |
31 | is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)
32 | # Checks if the input is a whole number.
33 | {
34 | abs(x - round(x)) < tol
35 | }
36 |
37 | .check_tsne_params <- function(nsamples, dims, perplexity, theta, max_iter, verbose, Y_init, stop_lying_iter, mom_switch_iter,
38 | momentum, final_momentum, eta, exaggeration_factor)
39 | # Checks parameters for the t-SNE algorithm that are independent of
40 | # the format of the input data (e.g., distance matrix or neighbors).
41 | {
42 | if (!is.wholenumber(dims) || dims < 1 || dims > 3) { stop("dims should be either 1, 2 or 3") }
43 | if (!is.wholenumber(max_iter) || !(max_iter>0)) { stop("number of iterations should be a positive integer")}
44 | if (!is.null(Y_init) && (nsamples!=nrow(Y_init) || ncol(Y_init)!=dims)) { stop("incorrect format for Y_init") }
45 |
46 | if (!is.numeric(perplexity) || perplexity <= 0) { stop("perplexity should be a positive number") }
47 | if (!is.numeric(theta) || (theta<0.0) || (theta>1.0) ) { stop("theta should lie in [0, 1]")}
48 | if (!is.wholenumber(stop_lying_iter) || stop_lying_iter<0) { stop("stop_lying_iter should be a positive integer")}
49 | if (!is.wholenumber(mom_switch_iter) || mom_switch_iter<0) { stop("mom_switch_iter should be a positive integer")}
50 | if (!is.numeric(momentum) || momentum < 0) { stop("momentum should be a positive number") }
51 | if (!is.numeric(final_momentum) || final_momentum < 0) { stop("final momentum should be a positive number") }
52 | if (!is.numeric(eta) || eta <= 0) { stop("eta should be a positive number") }
53 | if (!is.numeric(exaggeration_factor) || exaggeration_factor <= 0) { stop("exaggeration_factor should be a positive number")}
54 |
55 | if (nsamples - 1 < 3 * perplexity) { stop("perplexity is too large for the number of samples")}
56 |
57 | init <- !is.null(Y_init)
58 | if (init) {
59 | Y_init <- t(Y_init) # transposing for rapid column-major access.
60 | } else {
61 | Y_init <- matrix()
62 | }
63 |
64 | list(no_dims=dims, perplexity=perplexity, theta=theta, max_iter=max_iter, verbose=verbose,
65 | init=init, Y_in=Y_init,
66 | stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter,
67 | momentum=momentum, final_momentum=final_momentum,
68 | eta=eta, exaggeration_factor=exaggeration_factor)
69 | }
70 |
71 | .clear_unwanted_params <- function(args)
72 | # Removing parameters that we do not want to store in the output.
73 | {
74 | args$Y_in <- args$init <- args$no_dims <- args$verbose <- NULL
75 | args
76 | }
77 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | All the files in this package are covered by a BSD-style licence, but with
2 | different attribution requirements and conditions for:
3 | * The following files in the src directory: tsne.cpp, tsne.h, datapoint.h, sptree.cpp, sptree.h vptree.h
4 | * The rest of the files
5 |
6 | =================================================================
7 |
8 | The following license applies to the following files in the src directory: tsne.cpp, tsne.h, datapoint.h, sptree.cpp, sptree.h vptree.h
9 |
10 | Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
11 | All rights reserved.
12 |
13 | Redistribution and use in source and binary forms, with or without
14 | modification, are permitted provided that the following conditions are met:
15 | 1. Redistributions of source code must retain the above copyright
16 | notice, this list of conditions and the following disclaimer.
17 | 2. Redistributions in binary form must reproduce the above copyright
18 | notice, this list of conditions and the following disclaimer in the
19 | documentation and/or other materials provided with the distribution.
20 | 3. All advertising materials mentioning features or use of this software
21 | must display the following acknowledgement:
22 | This product includes software developed by the Delft University of Technology.
23 | 4. Neither the name of the Delft University of Technology nor the names of
24 | its contributors may be used to endorse or promote products derived from
25 | this software without specific prior written permission.
26 |
27 | THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
28 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
29 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
30 | EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
33 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
35 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
36 | OF SUCH DAMAGE.
37 |
38 | =================================================================
39 |
40 | The following license applies to the rest of the files:
41 |
42 | Copyright (c) 2014, Jesse Krijthe
43 |
44 | Redistribution and use in source and binary forms, with or without
45 | modification, are permitted provided that the following conditions are
46 | met:
47 |
48 | Redistributions of source code must retain the above copyright
49 | notice, this list of conditions and the following disclaimer.
50 |
51 | Redistributions in binary form must reproduce the above copyright
52 | notice, this list of conditions and the following disclaimer in
53 | the documentation and/or other materials provided with the
54 | distribution.
55 |
56 | Neither the name of Delft University of Technology nor the names
57 | of its contributors may be used to endorse or promote products derived
58 | from this software without specific prior written permission.
59 |
60 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
61 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
62 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
63 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
64 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
65 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
66 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
67 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
68 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
69 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
70 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
71 |
--------------------------------------------------------------------------------
/src/tsne.h:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
4 | * All rights reserved.
5 | *
6 | * Redistribution and use in source and binary forms, with or without
7 | * modification, are permitted provided that the following conditions are met:
8 | * 1. Redistributions of source code must retain the above copyright
9 | * notice, this list of conditions and the following disclaimer.
10 | * 2. Redistributions in binary form must reproduce the above copyright
11 | * notice, this list of conditions and the following disclaimer in the
12 | * documentation and/or other materials provided with the distribution.
13 | * 3. All advertising materials mentioning features or use of this software
14 | * must display the following acknowledgement:
15 | * This product includes software developed by the Delft University of Technology.
16 | * 4. Neither the name of the Delft University of Technology nor the names of
17 | * its contributors may be used to endorse or promote products derived from
18 | * this software without specific prior written permission.
19 | *
20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
29 | * OF SUCH DAMAGE.
30 | *
31 | */
32 |
33 | #include "datapoint.h"
34 | #include
35 |
36 | #ifndef TSNE_H
37 | #define TSNE_H
38 |
39 |
40 | static inline double sign_tsne(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); }
41 |
42 | template
43 | class TSNE
44 | {
45 | public:
46 | TSNE(double perplexity, double theta, bool verbose, int max_iter, bool init, int stop_lying_iter,
47 | int mom_switch_iter, double momentum, double final_momentum, double eta, double exaggeration_factor,int num_threads);
48 |
49 | void run(double* X, unsigned int N, int D, double* Y, bool distance_precomputed, double* cost, double* itercost);
50 | void run(const int* nn_index, const double* nn_dist, unsigned int N, int K, double* Y, double* cost, double* itercost);
51 |
52 | private:
53 | void symmetrizeMatrix(unsigned int N);
54 | void trainIterations(unsigned int N, double* Y, double* cost, double* itercost);
55 |
56 | void computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, unsigned int N, int D, double* dC, double theta);
57 | void computeExactGradient(double* P, double* Y, unsigned int N, int D, double* dC);
58 | double evaluateError(double* P, double* Y, unsigned int N, int D);
59 | double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta);
60 | void getCost(double* P, double* Y, unsigned int N, int D, double* costs);
61 | void getCost(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta, double* costs);
62 | void zeroMean(double* X, unsigned int N, int D);
63 |
64 | void computeGaussianPerplexity(double* X, unsigned int N, int D, bool distance_precomputed);
65 | template
66 | void computeGaussianPerplexity(double* X, unsigned int N, int D, int K);
67 | void computeGaussianPerplexity(const int* nn_dex, const double* nn_dist, unsigned int N, int K);
68 | void setupApproximateMemory(unsigned int N, int K);
69 |
70 | void computeProbabilities(const double perplexity, const int K, const double* distances, double* cur_P);
71 | void computeSquaredEuclideanDistance(double* X, unsigned int N, int D, double* DD);
72 | void computeSquaredEuclideanDistanceDirect(double* X, unsigned int N, int D, double* DD);
73 |
74 | double randn();
75 |
76 | // Member variables.
77 | double perplexity, theta, momentum, final_momentum, eta, exaggeration_factor;
78 | int max_iter, stop_lying_iter, mom_switch_iter, num_threads;
79 | bool verbose, init, exact;
80 |
81 | std::vector row_P, col_P;
82 | std::vector val_P, P;
83 | };
84 |
85 | #endif
86 |
87 |
--------------------------------------------------------------------------------
/tests/testthat/test_neighbors.R:
--------------------------------------------------------------------------------
context("Rtsne neighbor input")

set.seed(101)
# The original iris_matrix has a few tied distances, which alters the order of nearest neighbours.
# This then alters the order of addition when computing various statistics;
# which results in small rounding errors that are amplified across t-SNE iterations.
# Hence, I have to simulate a case where no ties are possible.
simdata <- matrix(rnorm(234*5), nrow=234)

# Reference k-nearest-neighbour search over a full distance matrix.
# Kept dependency-free so the tests need no external NN package.
NNFUN <- function(D, K)
{
    n <- nrow(D)
    all.indices <- matrix(0L, n, K)
    all.distances <- matrix(0, n, K)
    for (i in seq_len(n)) {
        row.dist <- D[i,]
        ranked <- setdiff(order(row.dist), i)  # nearest first, self removed
        all.indices[i,] <- head(ranked, K)
        all.distances[i,] <- row.dist[all.indices[i,]]
    }
    list(index=all.indices, distance=all.distances)
}
23 |
test_that("Rtsne with nearest-neighbor input compares to distance matrix input", {
    D <- as.matrix(dist(simdata))
    nn <- NNFUN(D, 90) # 3 * perplexity

    # The vptree involves a few R::runif calls, which alters the seed in the
    # precomputed distance case and hence the random initialization of Y.
    # Thus, we need to supply a pre-computed Y as well.
    Y_in <- matrix(runif(nrow(simdata)*2), ncol=2)
    res_nn <- Rtsne_neighbors(nn$index, nn$distance, Y_init=Y_in, perplexity=30, max_iter=1)
    res_dist <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1)
    expect_equal(res_nn$Y, res_dist$Y)

    # Trying again with a different number of neighbors.
    Y_in <- matrix(runif(nrow(simdata)*2), ncol=2)
    res_nn <- Rtsne_neighbors(nn$index[,1:30], nn$distance[,1:30], Y_init=Y_in, perplexity=10)
    res_dist <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=10)
    expect_equal(res_nn$Y, res_dist$Y)
})
44 |
test_that("Rtsne with nearest-neighbor input behaves upon normalization", {
    D <- as.matrix(dist(normalize_input(simdata)))
    nn <- NNFUN(D, 90) # 3 * perplexity

    # The vptree involves a few R::runif calls, which alters the seed in the
    # precomputed distance case and hence the random initialization of Y.
    # Thus, we need to supply a pre-computed Y as well.
    Y_in <- matrix(runif(nrow(simdata)*2), ncol=2)
    res_nn <- Rtsne_neighbors(nn$index, nn$distance, Y_init=Y_in, perplexity=30, max_iter=1)
    res_dist <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1)
    expect_equal(res_nn$Y, res_dist$Y)

    # Trying again with a different number of neighbors.
    Y_in <- matrix(runif(nrow(simdata)*2), ncol=2)
    res_nn <- Rtsne_neighbors(nn$index[,1:30], nn$distance[,1:30], Y_init=Y_in, perplexity=10)
    res_dist <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=10)
    expect_equal(res_nn$Y, res_dist$Y)
})
65 |
test_that("Rtsne with nearest-neighbor input gives no errors for no_dims 1 and 3", {
    D <- as.matrix(dist(simdata))
    nn <- NNFUN(D, 90) # 3 * perplexity

    # The vptree involves a few R::runif calls, which alters the seed in the
    # precomputed distance case and hence the random initialization of Y.
    # Thus, we need to supply a pre-computed Y as well.
    # Exercise both the 1D and the 3D template instantiations.
    for (d in c(1, 3)) {
        Y_in <- matrix(runif(nrow(simdata)*d), ncol=d)
        res_nn <- Rtsne_neighbors(nn$index, nn$distance, Y_init=Y_in, perplexity=30, max_iter=1, dims=d)
        res_dist <- Rtsne(D, is_distance=TRUE, Y_init=Y_in, perplexity=30, max_iter=1, dims=d)
        expect_equal(res_nn$Y, res_dist$Y)
    }
})
85 |
test_that("error conditions are correctly explored", {
    # Y_in was previously referenced here without ever being defined; the
    # expectations only passed because R evaluates arguments lazily and the
    # input checks error out before Y_init is touched. Define it explicitly.
    Y_in <- matrix(0, 50, 2)
    expect_error(Rtsne_neighbors("yay", matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "matrix")
    expect_error(Rtsne_neighbors(matrix(0L, 50, 5), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "same dimensions")
    expect_error(Rtsne_neighbors(matrix(0L, 50, 10), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "invalid indices")
    expect_error(Rtsne_neighbors(matrix(51L, 50, 10), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "invalid indices")
    expect_error(Rtsne_neighbors(matrix(NA_real_, 50, 10), matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "invalid indices")
})
93 |
--------------------------------------------------------------------------------
/src/sptree.h:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
4 | * All rights reserved.
5 | *
6 | * Redistribution and use in source and binary forms, with or without
7 | * modification, are permitted provided that the following conditions are met:
8 | * 1. Redistributions of source code must retain the above copyright
9 | * notice, this list of conditions and the following disclaimer.
10 | * 2. Redistributions in binary form must reproduce the above copyright
11 | * notice, this list of conditions and the following disclaimer in the
12 | * documentation and/or other materials provided with the distribution.
13 | * 3. All advertising materials mentioning features or use of this software
14 | * must display the following acknowledgement:
15 | * This product includes software developed by the Delft University of Technology.
16 | * 4. Neither the name of the Delft University of Technology nor the names of
17 | * its contributors may be used to endorse or promote products derived from
18 | * this software without specific prior written permission.
19 | *
20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
29 | * OF SUCH DAMAGE.
30 | *
31 | */
32 |
#ifndef SPTREE_H
#define SPTREE_H

using namespace std;

static inline double max_tsne(double x, double y) { return (x <= y ? y : x); }

// Axis-aligned box in NDims dimensions, stored as a corner plus per-axis widths.
template <int NDims>
class Cell {
    double corner[NDims];
    double width[NDims];


public:
    Cell();
    Cell(double* inp_corner, double* inp_width);
    ~Cell();

    double getCorner(unsigned int d) const;
    double getWidth(unsigned int d) const;
    void setCorner(unsigned int d, double val);
    void setWidth(unsigned int d, double val);
    bool containsPoint(double point[]) const;
};

// Space-partitioning tree (quad-/oct-tree generalisation) used for the
// Barnes-Hut approximation of the t-SNE gradient.
template <int NDims>
class SPTree
{
public:
    // Each node splits into 2^NDims children; computed recursively so the
    // value is a compile-time constant (recursion ends at SPTree<0> below).
    enum { no_children = 2 * SPTree<NDims - 1>::no_children };

private:
    // Fixed constants
    static const unsigned int QT_NODE_CAPACITY = 1;

    // Properties of this node in the tree
    SPTree<NDims>* parent;
    unsigned int dimension;
    bool is_leaf;
    unsigned int size;
    unsigned int cum_size;
    int num_threads;

    // Axis-aligned bounding box stored as a center with half-dimensions to represent the boundaries of this quad tree
    Cell<NDims> boundary;

    // Indices in this space-partitioning tree node, corresponding center-of-mass, and list of all children
    double* data;
    double center_of_mass[NDims];
    unsigned int index[QT_NODE_CAPACITY];

    // Children
    SPTree<NDims>* children[no_children];

public:
    SPTree(double* inp_data, unsigned int N);
    SPTree(double* inp_data, double* inp_corner, double* inp_width);
    SPTree(double* inp_data, unsigned int N, double* inp_corner, double* inp_width);
    SPTree(SPTree<NDims>* inp_parent, double* inp_data, unsigned int N, double* inp_corner, double* inp_width);
    SPTree(SPTree<NDims>* inp_parent, double* inp_data, double* inp_corner, double* inp_width);
    ~SPTree();
    void setData(double* inp_data);
    SPTree<NDims>* getParent();
    void construct(Cell<NDims> boundary);
    bool insert(unsigned int new_index);
    void subdivide();
    bool isCorrect();
    void rebuildTree();
    void getAllIndices(unsigned int* indices);
    unsigned int getDepth();
    double computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[]) const;
    void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, unsigned int N, double* pos_f, int num_threads) const;
    void print();

private:
    void init(SPTree<NDims>* inp_parent, double* inp_data, double* inp_corner, double* inp_width);
    void fill(unsigned int N);
    unsigned int getAllIndices(unsigned int* indices, unsigned int loc);
    bool isChild(unsigned int test_index, unsigned int start, unsigned int end);
};

// Base case terminating the recursive child count above.
template <>
struct SPTree<0>
{
    enum { no_children = 1 };
};

#endif
--------------------------------------------------------------------------------
/src/Rtsne.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "tsne.h"
3 | using namespace Rcpp;
4 |
5 | Rcpp::List save_results(int N, int no_dims, const std::vector& Y, const std::vector& costs, const std::vector& itercosts,
6 | double theta, double perplexity, int D, int stop_lying_iter, int mom_switch_iter,
7 | double momentum, double final_momentum, double eta, double exaggeration_factor);
8 |
9 | // Function that runs the Barnes-Hut implementation of t-SNE
10 | // [[Rcpp::export]]
11 | Rcpp::List Rtsne_cpp(NumericMatrix X, int no_dims, double perplexity,
12 | double theta, bool verbose, int max_iter,
13 | bool distance_precomputed, NumericMatrix Y_in, bool init,
14 | int stop_lying_iter, int mom_switch_iter,
15 | double momentum, double final_momentum,
16 | double eta, double exaggeration_factor, unsigned int num_threads) {
17 |
18 | size_t N = X.ncol(), D = X.nrow();
19 | double * data=X.begin();
20 |
21 | if (verbose) Rprintf("Read the %lu x %lu data matrix successfully!\n", (unsigned long)N, (unsigned long)D);
22 | std::vector Y(N * no_dims), costs(N), itercosts(static_cast(std::ceil(max_iter/50.0)));
23 |
24 | // Providing user-supplied solution.
25 | if (init) {
26 | for (size_t i = 0; i < Y.size(); i++) Y[i] = Y_in[i];
27 | if (verbose) Rprintf("Using user supplied starting positions\n");
28 | }
29 |
30 | // Run tsne
31 | if (no_dims==1) {
32 | TSNE<1> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter,
33 | momentum, final_momentum, eta, exaggeration_factor, num_threads);
34 | tsne.run(data, N, D, Y.data(), distance_precomputed, costs.data(), itercosts.data());
35 | } else if (no_dims==2) {
36 | TSNE<2> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter,
37 | momentum, final_momentum, eta, exaggeration_factor, num_threads);
38 | tsne.run(data, N, D, Y.data(), distance_precomputed, costs.data(), itercosts.data());
39 | } else if (no_dims==3) {
40 | TSNE<3> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter,
41 | momentum, final_momentum, eta, exaggeration_factor, num_threads);
42 | tsne.run(data, N, D, Y.data(), distance_precomputed, costs.data(), itercosts.data());
43 | } else {
44 | Rcpp::stop("Only 1, 2 or 3 dimensional output is suppported.\n");
45 | }
46 |
47 | return Rcpp::List::create(Rcpp::_["Y"]=Rcpp::NumericMatrix(no_dims, N, Y.data()),
48 | Rcpp::_["costs"]=Rcpp::NumericVector(costs.begin(), costs.end()),
49 | Rcpp::_["itercosts"]=Rcpp::NumericVector(itercosts.begin(), itercosts.end()));
50 | }
51 |
52 | // Function that runs the Barnes-Hut implementation of t-SNE on nearest neighbor results.
53 | // [[Rcpp::export]]
54 | Rcpp::List Rtsne_nn_cpp(IntegerMatrix nn_dex, NumericMatrix nn_dist,
55 | int no_dims, double perplexity,
56 | double theta, bool verbose, int max_iter,
57 | NumericMatrix Y_in, bool init,
58 | int stop_lying_iter, int mom_switch_iter,
59 | double momentum, double final_momentum,
60 | double eta, double exaggeration_factor, unsigned int num_threads) {
61 |
62 | size_t N = nn_dex.ncol(), K=nn_dex.nrow(); // transposed - columns are points, rows are neighbors.
63 | if (verbose) Rprintf("Read the NN results for %lu points successfully!\n", (unsigned long)N);
64 | std::vector Y(N * no_dims), costs(N), itercosts(static_cast(std::ceil(max_iter/50.0)));
65 |
66 | // Providing user-supplied solution.
67 | if (init) {
68 | for (size_t i = 0; i < Y.size(); i++) Y[i] = Y_in[i];
69 | if (verbose) Rprintf("Using user supplied starting positions\n");
70 | }
71 |
72 | // Run tsne
73 | if (no_dims==1) {
74 | TSNE<1> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter,
75 | momentum, final_momentum, eta, exaggeration_factor, num_threads);
76 | tsne.run(nn_dex.begin(), nn_dist.begin(), N, K, Y.data(), costs.data(), itercosts.data());
77 | } else if (no_dims==2) {
78 | TSNE<2> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter,
79 | momentum, final_momentum, eta, exaggeration_factor, num_threads);
80 | tsne.run(nn_dex.begin(), nn_dist.begin(), N, K, Y.data(), costs.data(), itercosts.data());
81 | } else if (no_dims==3) {
82 | TSNE<3> tsne(perplexity, theta, verbose, max_iter, init, stop_lying_iter, mom_switch_iter,
83 | momentum, final_momentum, eta, exaggeration_factor, num_threads);
84 | tsne.run(nn_dex.begin(), nn_dist.begin(), N, K, Y.data(), costs.data(), itercosts.data());
85 | } else {
86 | Rcpp::stop("Only 1, 2 or 3 dimensional output is suppported.\n");
87 | }
88 |
89 | return Rcpp::List::create(Rcpp::_["Y"]=Rcpp::NumericMatrix(no_dims, N, Y.data()),
90 | Rcpp::_["costs"]=Rcpp::NumericVector(costs.begin(), costs.end()),
91 | Rcpp::_["itercosts"]=Rcpp::NumericVector(itercosts.begin(), itercosts.end()));
92 | }
93 |
94 |
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | #include
5 |
6 | using namespace Rcpp;
7 |
8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 |
13 | // Rtsne_cpp
14 | Rcpp::List Rtsne_cpp(NumericMatrix X, int no_dims, double perplexity, double theta, bool verbose, int max_iter, bool distance_precomputed, NumericMatrix Y_in, bool init, int stop_lying_iter, int mom_switch_iter, double momentum, double final_momentum, double eta, double exaggeration_factor, unsigned int num_threads);
15 | RcppExport SEXP _Rtsne_Rtsne_cpp(SEXP XSEXP, SEXP no_dimsSEXP, SEXP perplexitySEXP, SEXP thetaSEXP, SEXP verboseSEXP, SEXP max_iterSEXP, SEXP distance_precomputedSEXP, SEXP Y_inSEXP, SEXP initSEXP, SEXP stop_lying_iterSEXP, SEXP mom_switch_iterSEXP, SEXP momentumSEXP, SEXP final_momentumSEXP, SEXP etaSEXP, SEXP exaggeration_factorSEXP, SEXP num_threadsSEXP) {
16 | BEGIN_RCPP
17 | Rcpp::RObject rcpp_result_gen;
18 | Rcpp::RNGScope rcpp_rngScope_gen;
19 | Rcpp::traits::input_parameter< NumericMatrix >::type X(XSEXP);
20 | Rcpp::traits::input_parameter< int >::type no_dims(no_dimsSEXP);
21 | Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP);
22 | Rcpp::traits::input_parameter< double >::type theta(thetaSEXP);
23 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
24 | Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP);
25 | Rcpp::traits::input_parameter< bool >::type distance_precomputed(distance_precomputedSEXP);
26 | Rcpp::traits::input_parameter< NumericMatrix >::type Y_in(Y_inSEXP);
27 | Rcpp::traits::input_parameter< bool >::type init(initSEXP);
28 | Rcpp::traits::input_parameter< int >::type stop_lying_iter(stop_lying_iterSEXP);
29 | Rcpp::traits::input_parameter< int >::type mom_switch_iter(mom_switch_iterSEXP);
30 | Rcpp::traits::input_parameter< double >::type momentum(momentumSEXP);
31 | Rcpp::traits::input_parameter< double >::type final_momentum(final_momentumSEXP);
32 | Rcpp::traits::input_parameter< double >::type eta(etaSEXP);
33 | Rcpp::traits::input_parameter< double >::type exaggeration_factor(exaggeration_factorSEXP);
34 | Rcpp::traits::input_parameter< unsigned int >::type num_threads(num_threadsSEXP);
35 | rcpp_result_gen = Rcpp::wrap(Rtsne_cpp(X, no_dims, perplexity, theta, verbose, max_iter, distance_precomputed, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads));
36 | return rcpp_result_gen;
37 | END_RCPP
38 | }
39 | // Rtsne_nn_cpp
40 | Rcpp::List Rtsne_nn_cpp(IntegerMatrix nn_dex, NumericMatrix nn_dist, int no_dims, double perplexity, double theta, bool verbose, int max_iter, NumericMatrix Y_in, bool init, int stop_lying_iter, int mom_switch_iter, double momentum, double final_momentum, double eta, double exaggeration_factor, unsigned int num_threads);
41 | RcppExport SEXP _Rtsne_Rtsne_nn_cpp(SEXP nn_dexSEXP, SEXP nn_distSEXP, SEXP no_dimsSEXP, SEXP perplexitySEXP, SEXP thetaSEXP, SEXP verboseSEXP, SEXP max_iterSEXP, SEXP Y_inSEXP, SEXP initSEXP, SEXP stop_lying_iterSEXP, SEXP mom_switch_iterSEXP, SEXP momentumSEXP, SEXP final_momentumSEXP, SEXP etaSEXP, SEXP exaggeration_factorSEXP, SEXP num_threadsSEXP) {
42 | BEGIN_RCPP
43 | Rcpp::RObject rcpp_result_gen;
44 | Rcpp::RNGScope rcpp_rngScope_gen;
45 | Rcpp::traits::input_parameter< IntegerMatrix >::type nn_dex(nn_dexSEXP);
46 | Rcpp::traits::input_parameter< NumericMatrix >::type nn_dist(nn_distSEXP);
47 | Rcpp::traits::input_parameter< int >::type no_dims(no_dimsSEXP);
48 | Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP);
49 | Rcpp::traits::input_parameter< double >::type theta(thetaSEXP);
50 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
51 | Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP);
52 | Rcpp::traits::input_parameter< NumericMatrix >::type Y_in(Y_inSEXP);
53 | Rcpp::traits::input_parameter< bool >::type init(initSEXP);
54 | Rcpp::traits::input_parameter< int >::type stop_lying_iter(stop_lying_iterSEXP);
55 | Rcpp::traits::input_parameter< int >::type mom_switch_iter(mom_switch_iterSEXP);
56 | Rcpp::traits::input_parameter< double >::type momentum(momentumSEXP);
57 | Rcpp::traits::input_parameter< double >::type final_momentum(final_momentumSEXP);
58 | Rcpp::traits::input_parameter< double >::type eta(etaSEXP);
59 | Rcpp::traits::input_parameter< double >::type exaggeration_factor(exaggeration_factorSEXP);
60 | Rcpp::traits::input_parameter< unsigned int >::type num_threads(num_threadsSEXP);
61 | rcpp_result_gen = Rcpp::wrap(Rtsne_nn_cpp(nn_dex, nn_dist, no_dims, perplexity, theta, verbose, max_iter, Y_in, init, stop_lying_iter, mom_switch_iter, momentum, final_momentum, eta, exaggeration_factor, num_threads));
62 | return rcpp_result_gen;
63 | END_RCPP
64 | }
65 | // normalize_input_cpp
66 | Rcpp::NumericMatrix normalize_input_cpp(Rcpp::NumericMatrix input);
67 | RcppExport SEXP _Rtsne_normalize_input_cpp(SEXP inputSEXP) {
68 | BEGIN_RCPP
69 | Rcpp::RObject rcpp_result_gen;
70 | Rcpp::RNGScope rcpp_rngScope_gen;
71 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type input(inputSEXP);
72 | rcpp_result_gen = Rcpp::wrap(normalize_input_cpp(input));
73 | return rcpp_result_gen;
74 | END_RCPP
75 | }
76 |
77 | static const R_CallMethodDef CallEntries[] = {
78 | {"_Rtsne_Rtsne_cpp", (DL_FUNC) &_Rtsne_Rtsne_cpp, 16},
79 | {"_Rtsne_Rtsne_nn_cpp", (DL_FUNC) &_Rtsne_Rtsne_nn_cpp, 16},
80 | {"_Rtsne_normalize_input_cpp", (DL_FUNC) &_Rtsne_normalize_input_cpp, 1},
81 | {NULL, NULL, 0}
82 | };
83 |
84 | RcppExport void R_init_Rtsne(DllInfo *dll) {
85 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
86 | R_useDynamicSymbols(dll, FALSE);
87 | }
88 |
--------------------------------------------------------------------------------
/tests/testthat/test_Rtsne.R:
--------------------------------------------------------------------------------
context("Rtsne main function")

# Prepare iris dataset
iris_unique <- unique(iris) # Remove duplicates
Xscale <- normalize_input(as.matrix(iris_unique[,1:4]))
distmat <- as.matrix(dist(Xscale))

# Run models to compare to
iter_equal <- 500

test_that("Scaling gives the expected result", {
  # Re-derive the normalization by hand: center the data, then divide by the
  # largest absolute value. The second step previously operated on `Xscale`
  # itself (discarding the centered matrix and making the comparison trivial)
  # instead of the centered `Xscale2`.
  Xscale2 <- scale(as.matrix(iris_unique[,1:4]),
                   center = TRUE, scale=FALSE)
  Xscale2 <- scale(Xscale2,
                   center=FALSE,
                   scale=rep(max(abs(Xscale2)),4))
  expect_equivalent(Xscale,Xscale2)
})
19 |
test_that("Manual distance calculation equals C++ distance calculation", {

  # Does not work on 32 bit windows
  skip_on_cran()

  # Exact algorithm (theta = 0): raw matrix input versus precomputed distances.
  set.seed(50)
  from_matrix <- Rtsne(as.matrix(iris_unique[,1:4]), verbose=FALSE,
                       is_distance=FALSE, theta=0.0, max_iter=iter_equal,
                       pca=FALSE, normalize=TRUE)
  set.seed(50)
  from_dist <- Rtsne(distmat, verbose=FALSE, is_distance=TRUE,
                     theta=0.0, max_iter=iter_equal)
  expect_equal(from_matrix$Y, from_dist$Y)

  # Barnes-Hut approximation (theta > 0).
  set.seed(50)
  from_matrix <- Rtsne(as.matrix(iris_unique[,1:4]), verbose=FALSE, is_distance=FALSE,
                       theta=0.1, max_iter=iter_equal, pca=FALSE)
  set.seed(50)
  from_dist <- Rtsne(distmat, verbose=FALSE, is_distance=TRUE, theta=0.1, max_iter=iter_equal,
                     pca=FALSE)
  expect_equal(from_matrix$Y, from_dist$Y)
})
44 |
test_that("Accepts dist", {

  # A "dist" object and the equivalent distance matrix must give identical output.
  # Exact
  set.seed(50)
  via_matrix <- Rtsne(distmat, is_distance=TRUE, theta=0.0, max_iter=iter_equal)
  set.seed(50)
  via_dist <- Rtsne(dist(Xscale), theta=0.0, max_iter=iter_equal)
  expect_equal(via_dist$Y, via_matrix$Y)

  # Inexact
  set.seed(50)
  via_matrix <- Rtsne(distmat, is_distance=TRUE, theta=0.1, max_iter=iter_equal)
  set.seed(50)
  via_dist <- Rtsne(dist(Xscale), theta=0.1, max_iter=iter_equal)
  expect_equal(via_dist$Y, via_matrix$Y)
})
61 |
test_that("Accepts data.frame", {

  # A data.frame input must match the equivalent matrix input.
  # Exact
  set.seed(50)
  via_matrix <- Rtsne(as.matrix(iris_unique[,1:4]), dims=1, verbose=FALSE, is_distance=FALSE, theta=0.0, max_iter=iter_equal, pca=FALSE)
  set.seed(50)
  via_df <- Rtsne(iris_unique[,1:4], dims=1, verbose=FALSE, is_distance=FALSE, theta=0.0, pca=FALSE, max_iter=iter_equal, num_threads=1)
  expect_equal(via_matrix$Y, via_df$Y)

  # Inexact
  set.seed(50)
  via_matrix <- Rtsne(as.matrix(iris_unique[,1:4]), verbose=FALSE, is_distance=FALSE, theta=0.1, pca=FALSE, max_iter=iter_equal)
  set.seed(50)
  via_df <- Rtsne(iris_unique[,1:4], verbose=FALSE, is_distance=FALSE, theta=0.1, pca=FALSE, max_iter=iter_equal, num_threads=1)
  expect_equal(via_matrix$Y, via_df$Y)
})
78 |
test_that("OpenMP with different threads returns same result", {

  # Does not work on windows
  skip_on_cran()
  skip_on_ci()

  # Identical seeds with 1, 2 and 3 threads must give identical embeddings.
  runs <- lapply(1:3, function(nt) {
    set.seed(50)
    Rtsne(iris_unique[,1:4], dims=3, verbose=FALSE, is_distance=FALSE,
          theta=0.1, pca=FALSE, max_iter=iter_equal, num_threads=nt)
  })
  expect_equal(runs[[1]]$Y, runs[[2]]$Y)
  expect_equal(runs[[2]]$Y, runs[[3]]$Y)
})
97 |
test_that("Continuing from initialization gives approximately the same result as direct run", {

  # Does not work exactly due to resetting of "gains".
  iter_equal <- 1000
  extra_iter <- 200

  # Exact: one long run versus a shorter run continued via Y_init.
  set.seed(50)
  full_run <- Rtsne(iris_unique[,1:4],
                    perplexity=3, theta=0.0, pca=FALSE,
                    max_iter=iter_equal, final_momentum = 0)
  set.seed(50)
  first_part <- Rtsne(iris_unique[,1:4],
                      perplexity=3, theta=0.0, pca=FALSE,
                      max_iter=iter_equal-extra_iter, final_momentum = 0)
  second_part <- Rtsne(iris_unique[,1:4],
                       perplexity=3, theta=0.0, pca=FALSE,
                       max_iter=extra_iter, Y_init=first_part$Y, final_momentum = 0)
  expect_equivalent(dist(full_run$Y), dist(second_part$Y), tolerance=0.01)

  # Inexact (Barnes-Hut).
  set.seed(50)
  full_run <- Rtsne(iris_unique[,1:4], final_momentum=0, theta=0.1, pca=FALSE, max_iter=iter_equal)
  set.seed(50)
  first_part <- Rtsne(iris_unique[,1:4], final_momentum=0, theta=0.1, pca=FALSE, max_iter=iter_equal-extra_iter)
  set.seed(50)
  second_part <- Rtsne(iris_unique[,1:4], final_momentum=0, theta=0.1, pca=FALSE, max_iter=extra_iter, Y_init=first_part$Y)
  expect_equivalent(dist(full_run$Y), dist(second_part$Y), tolerance=0.01)
})
127 |
test_that("partial_pca FALSE and TRUE give similar results", {
  skip_if_not_installed("irlba")

  # Only first few iterations
  iter_equal <- 5

  # Wide data so the PCA step actually matters.
  set.seed(42)
  fat_data <- rbind(sapply(runif(200,-1,1), function(x) rnorm(200,x)),
                    sapply(runif(200,-1,1), function(x) rnorm(200,x)))

  set.seed(42)
  via_prcomp <- Rtsne(fat_data, max_iter = iter_equal)

  set.seed(42)
  via_irlba <- Rtsne(fat_data, partial_pca = TRUE, max_iter = iter_equal)

  # Sign of principal components are arbitrary so even with same seed tSNE coordinates are not the same
  expect_equal(via_prcomp$costs, via_irlba$costs, tolerance = .01, scale = 1)
})
147 |
test_that("Error conditions", {
  # `Y_in` used to be referenced below without ever being defined; the
  # expectation only passed because R evaluates arguments lazily and the input
  # checks error out before Y_init is touched. Define it explicitly.
  Y_in <- matrix(0, 50, 2)
  expect_error(Rtsne("test", matrix(0, 50, 10), Y_init=Y_in, perplexity=10), "matrix")
  expect_error(Rtsne(distmat,is_distance = 3),"logical")
  expect_error(Rtsne(matrix(0,2,3),is_distance = TRUE),"Input")
  expect_error(Rtsne(matrix(0,100,3)),"duplicates")
  expect_error(Rtsne(matrix(0,2,3),pca_center = 2),"TRUE")
  expect_error(Rtsne(matrix(0,2,3),initial_dims=1.3),"dimensionality")
  expect_error(Rtsne(matrix(0,2,3),dims=4),"dims")
  expect_error(Rtsne(matrix(0,2,3),max_iter=1.5),"should")

  expect_error(Rtsne(matrix(0,2,3),Y_init=matrix(0,2,1)),"incorrect format")
  expect_error(Rtsne(matrix(0,2,3),perplexity = 0),"positive")
  expect_error(Rtsne(matrix(0,2,3),theta = -0.1),"lie")
  expect_error(Rtsne(matrix(0,2,3),theta = 1.001),"lie")
  expect_error(Rtsne(matrix(0,2,3),stop_lying_iter = -1),"positive")
  expect_error(Rtsne(matrix(0,2,3),mom_switch_iter = -1),"positive")
  expect_error(Rtsne(matrix(0,2,3),momentum = -0.1),"positive")
  expect_error(Rtsne(matrix(0,2,3),final_momentum = -0.1),"positive")
  expect_error(Rtsne(matrix(0,2,3),eta = 0.0),"positive")
  expect_error(Rtsne(matrix(0,2,3),exaggeration_factor = 0.0),"positive")
  expect_error(Rtsne(matrix(0,2,3)),"perplexity is too large")
})
170 |
# Smoke test for verbose=TRUE: the run should print progress information
# ending with the "Fitting performed" summary line.
test_that("Verbose option", {
  expect_output(Rtsne(iris_unique[,1:4],pca=TRUE,verbose=TRUE,max_iter=150),"Fitting performed")
})
174 |
--------------------------------------------------------------------------------
/src/vptree.h:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
4 | * All rights reserved.
5 | *
6 | * Redistribution and use in source and binary forms, with or without
7 | * modification, are permitted provided that the following conditions are met:
8 | * 1. Redistributions of source code must retain the above copyright
9 | * notice, this list of conditions and the following disclaimer.
10 | * 2. Redistributions in binary form must reproduce the above copyright
11 | * notice, this list of conditions and the following disclaimer in the
12 | * documentation and/or other materials provided with the distribution.
13 | * 3. All advertising materials mentioning features or use of this software
14 | * must display the following acknowledgement:
15 | * This product includes software developed by the Delft University of Technology.
16 | * 4. Neither the name of the Delft University of Technology nor the names of
17 | * its contributors may be used to endorse or promote products derived from
18 | * this software without specific prior written permission.
19 | *
20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
29 | * OF SUCH DAMAGE.
30 | *
31 | */
32 |
33 |
34 | /* This code was adopted with minor modifications from Steve Hanov's great tutorial at http://stevehanov.ca/blog/index.php?id=130 */
35 |
// NOTE(review): the header names below were stripped in the checked-in text
// (likely an HTML/markdown export artifact); restored to match usage in this file.
#include <Rcpp.h>      // Rcpp::RNGScope, R::runif used in buildFromPoints()

#include <float.h>     // DBL_MAX
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>   // std::nth_element, std::reverse, std::swap
#include <cmath>       // sqrt
#include <limits>
#include <queue>       // std::priority_queue
#include <vector>

#include "datapoint.h"
45 |
46 | #ifndef VPTREE_H
47 | #define VPTREE_H
48 |
49 | double euclidean_distance(const DataPoint &t1, const DataPoint &t2) {
50 | double dd = .0;
51 | for(int d = 0; d < t1.dimensionality(); d++) dd += (t1.x(d) - t2.x(d)) * (t1.x(d) - t2.x(d));
52 | return sqrt(dd);
53 | }
54 |
55 | double precomputed_distance(const DataPoint &t1, const DataPoint &t2) {
56 | double dd = .0;
57 | dd = t1.x(t2.index());
58 | return dd;
59 | }
60 |
61 | template
62 | class VpTree
63 | {
64 | public:
65 |
66 | // Default constructor
67 | VpTree() : _root(0) {}
68 |
69 | // Destructor
70 | ~VpTree() {
71 | delete _root;
72 | }
73 |
74 | // Function to create a new VpTree from data
75 | void create(const std::vector& items) {
76 | delete _root;
77 | _items = items;
78 | _root = buildFromPoints(0, items.size());
79 | }
80 |
81 | // Function that uses the tree to find the k nearest neighbors of target
82 | void search(const T& target, int k, std::vector* results, std::vector* distances)
83 | {
84 |
85 | // Use a priority queue to store intermediate results on
86 | std::priority_queue heap;
87 |
88 | // Variable that tracks the distance to the farthest point in our results
89 | double tau = DBL_MAX;
90 |
91 | // Perform the search
92 | search(_root, target, k, heap, &tau);
93 |
94 | // Gather final results
95 | results->clear(); distances->clear();
96 | while(!heap.empty()) {
97 | results->push_back(_items[heap.top().index]);
98 | distances->push_back(heap.top().dist);
99 | heap.pop();
100 | }
101 |
102 | // Results are in reverse order
103 | std::reverse(results->begin(), results->end());
104 | std::reverse(distances->begin(), distances->end());
105 | }
106 |
107 | private:
108 | std::vector _items;
109 |
110 | // Single node of a VP tree (has a point and radius; left children are closer to point than the radius)
111 | struct Node
112 | {
113 | int index; // index of point in node
114 | double threshold; // radius(?)
115 | Node* left; // points closer by than threshold
116 | Node* right; // points farther away than threshold
117 |
118 | Node() :
119 | index(0), threshold(0.), left(0), right(0) {}
120 |
121 | ~Node() { // destructor
122 | delete left;
123 | delete right;
124 | }
125 | }* _root;
126 |
127 |
128 | // An item on the intermediate result queue
129 | struct HeapItem {
130 | HeapItem( int index, double dist) :
131 | index(index), dist(dist) {}
132 | int index;
133 | double dist;
134 | bool operator<(const HeapItem& o) const {
135 | return dist < o.dist;
136 | }
137 | };
138 |
139 | // Distance comparator for use in std::nth_element
140 | struct DistanceComparator
141 | {
142 | const T& item;
143 | DistanceComparator(const T& item) : item(item) {}
144 | bool operator()(const T& a, const T& b) {
145 | return distance(item, a) < distance(item, b);
146 | }
147 | };
148 |
149 | // Function that (recursively) fills the tree
150 | Node* buildFromPoints( int lower, int upper )
151 | {
152 | if (upper == lower) { // indicates that we're done here!
153 | return NULL;
154 | }
155 |
156 | // Lower index is center of current node
157 | Node* node = new Node();
158 | node->index = lower;
159 |
160 | if (upper - lower > 1) { // if we did not arrive at leaf yet
161 |
162 | // Choose an arbitrary point and move it to the start
163 | Rcpp::RNGScope scope;
164 | int i = (int) ((double)R::runif(0,1) * (upper - lower - 1)) + lower;
165 | std::swap(_items[lower], _items[i]);
166 |
167 | // Partition around the median distance
168 | int median = (upper + lower) / 2;
169 | std::nth_element(_items.begin() + lower + 1,
170 | _items.begin() + median,
171 | _items.begin() + upper,
172 | DistanceComparator(_items[lower]));
173 |
174 | // Threshold of the new node will be the distance to the median
175 | node->threshold = distance(_items[lower], _items[median]);
176 |
177 | // Recursively build tree
178 | node->index = lower;
179 | node->left = buildFromPoints(lower + 1, median);
180 | node->right = buildFromPoints(median, upper);
181 | }
182 |
183 | // Return result
184 | return node;
185 | }
186 |
187 | // Helper function that searches the tree
188 | void search(Node* node, const T& target, unsigned int k, std::priority_queue& heap, double* ptau)
189 | {
190 | if(node == NULL) return; // indicates that we're done here
191 |
192 | // Compute distance between target and current node
193 | double dist = distance(_items[node->index], target);
194 |
195 | // If current node within radius tau
196 | if(dist < (*ptau)) {
197 | if(heap.size() == k) heap.pop(); // remove furthest node from result list (if we already have k results)
198 | heap.push(HeapItem(node->index, dist)); // add current node to result list
199 | if(heap.size() == k) *ptau = heap.top().dist; // update value of tau (farthest point in result list)
200 | }
201 |
202 | // Return if we arrived at a leaf
203 | if(node->left == NULL && node->right == NULL) {
204 | return;
205 | }
206 |
207 | // If the target lies within the radius of ball
208 | if(dist < node->threshold) {
209 | if(dist - (*ptau) <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child first
210 | search(node->left, target, k, heap, ptau);
211 | }
212 |
213 | if(dist + (*ptau) >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child
214 | search(node->right, target, k, heap, ptau);
215 | }
216 |
217 | // If the target lies outsize the radius of the ball
218 | } else {
219 | if(dist + (*ptau) >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child first
220 | search(node->right, target, k, heap, ptau);
221 | }
222 |
223 | if(dist - (*ptau) <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child
224 | search(node->left, target, k, heap, ptau);
225 | }
226 | }
227 | }
228 | };
229 |
230 | #endif
231 |
--------------------------------------------------------------------------------
/man/Rtsne.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/Rtsne.R, R/neighbors.R
3 | \name{Rtsne}
4 | \alias{Rtsne}
5 | \alias{Rtsne.default}
6 | \alias{Rtsne.dist}
7 | \alias{Rtsne.data.frame}
8 | \alias{Rtsne_neighbors}
9 | \title{Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding}
10 | \usage{
11 | Rtsne(X, ...)
12 |
13 | \method{Rtsne}{default}(
14 | X,
15 | dims = 2,
16 | initial_dims = 50,
17 | perplexity = 30,
18 | theta = 0.5,
19 | check_duplicates = TRUE,
20 | pca = TRUE,
21 | partial_pca = FALSE,
22 | max_iter = 1000,
23 | verbose = getOption("verbose", FALSE),
24 | is_distance = FALSE,
25 | Y_init = NULL,
26 | pca_center = TRUE,
27 | pca_scale = FALSE,
28 | normalize = TRUE,
29 | stop_lying_iter = ifelse(is.null(Y_init), 250L, 0L),
30 | mom_switch_iter = ifelse(is.null(Y_init), 250L, 0L),
31 | momentum = 0.5,
32 | final_momentum = 0.8,
33 | eta = 200,
34 | exaggeration_factor = 12,
35 | num_threads = 1,
36 | ...
37 | )
38 |
39 | \method{Rtsne}{dist}(X, ..., is_distance = TRUE)
40 |
41 | \method{Rtsne}{data.frame}(X, ...)
42 |
43 | Rtsne_neighbors(
44 | index,
45 | distance,
46 | dims = 2,
47 | perplexity = 30,
48 | theta = 0.5,
49 | max_iter = 1000,
50 | verbose = getOption("verbose", FALSE),
51 | Y_init = NULL,
52 | stop_lying_iter = ifelse(is.null(Y_init), 250L, 0L),
53 | mom_switch_iter = ifelse(is.null(Y_init), 250L, 0L),
54 | momentum = 0.5,
55 | final_momentum = 0.8,
56 | eta = 200,
57 | exaggeration_factor = 12,
58 | num_threads = 1,
59 | ...
60 | )
61 | }
62 | \arguments{
63 | \item{X}{matrix; Data matrix (each row is an observation, each column is a variable)}
64 |
65 | \item{...}{Other arguments that can be passed to Rtsne}
66 |
67 | \item{dims}{integer; Output dimensionality (default: 2)}
68 |
69 | \item{initial_dims}{integer; the number of dimensions that should be retained in the initial PCA step (default: 50)}
70 |
\item{perplexity}{numeric; Perplexity parameter (should satisfy 3 * perplexity < nrow(X) - 1, see details for interpretation)}
72 |
73 | \item{theta}{numeric; Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact TSNE (default: 0.5)}
74 |
75 | \item{check_duplicates}{logical; Checks whether duplicates are present. It is best to make sure there are no duplicates present and set this option to FALSE, especially for large datasets (default: TRUE)}
76 |
77 | \item{pca}{logical; Whether an initial PCA step should be performed (default: TRUE)}
78 |
79 | \item{partial_pca}{logical; Whether truncated PCA should be used to calculate principal components (requires the irlba package). This is faster for large input matrices (default: FALSE)}
80 |
81 | \item{max_iter}{integer; Number of iterations (default: 1000)}
82 |
83 | \item{verbose}{logical; Whether progress updates should be printed (default: global "verbose" option, or FALSE if that is not set)}
84 |
85 | \item{is_distance}{logical; Indicate whether X is a distance matrix (default: FALSE)}
86 |
87 | \item{Y_init}{matrix; Initial locations of the objects. If NULL, random initialization will be used (default: NULL). Note that when using this, the initial stage with exaggerated perplexity values and a larger momentum term will be skipped.}
88 |
89 | \item{pca_center}{logical; Should data be centered before pca is applied? (default: TRUE)}
90 |
91 | \item{pca_scale}{logical; Should data be scaled before pca is applied? (default: FALSE)}
92 |
93 | \item{normalize}{logical; Should data be normalized internally prior to distance calculations with \code{\link{normalize_input}}? (default: TRUE)}
94 |
95 | \item{stop_lying_iter}{integer; Iteration after which the perplexities are no longer exaggerated (default: 250, except when Y_init is used, then 0)}
96 |
97 | \item{mom_switch_iter}{integer; Iteration after which the final momentum is used (default: 250, except when Y_init is used, then 0)}
98 |
99 | \item{momentum}{numeric; Momentum used in the first part of the optimization (default: 0.5)}
100 |
101 | \item{final_momentum}{numeric; Momentum used in the final part of the optimization (default: 0.8)}
102 |
103 | \item{eta}{numeric; Learning rate (default: 200.0)}
104 |
105 | \item{exaggeration_factor}{numeric; Exaggeration factor used to multiply the P matrix in the first part of the optimization (default: 12.0)}
106 |
107 | \item{num_threads}{integer; Number of threads to use when using OpenMP, default is 1. Setting to 0 corresponds to detecting and using all available cores}
108 |
109 | \item{index}{integer matrix; Each row contains the identity of the nearest neighbors for each observation}
110 |
111 | \item{distance}{numeric matrix; Each row contains the distance to the nearest neighbors in \code{index} for each observation}
112 | }
113 | \value{
114 | List with the following elements:
115 | \item{Y}{Matrix containing the new representations for the objects}
116 | \item{N}{Number of objects}
117 | \item{origD}{Original Dimensionality before TSNE (only when \code{X} is a data matrix)}
118 | \item{perplexity}{See above}
119 | \item{theta}{See above}
120 | \item{costs}{The cost for every object after the final iteration}
121 | \item{itercosts}{The total costs (KL-divergence) for all objects in every 50th + the last iteration}
122 | \item{stop_lying_iter}{Iteration after which the perplexities are no longer exaggerated}
123 | \item{mom_switch_iter}{Iteration after which the final momentum is used}
124 | \item{momentum}{Momentum used in the first part of the optimization}
125 | \item{final_momentum}{Momentum used in the final part of the optimization}
126 | \item{eta}{Learning rate}
127 | \item{exaggeration_factor}{Exaggeration factor used to multiply the P matrix in the first part of the optimization}
128 | }
129 | \description{
130 | Wrapper for the C++ implementation of Barnes-Hut t-Distributed Stochastic Neighbor Embedding. t-SNE is a method for constructing a low dimensional embedding of high-dimensional data, distances or similarities. Exact t-SNE can be computed by setting theta=0.0.
131 | }
132 | \details{
133 | Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space: \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of \eqn{p_{j|i}} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space.
134 | For the low-dimensional space we use the Cauchy distribution (t-distribution with one degree of freedom) as the distribution of the distances to neighbouring objects:
\deqn{ q_{i j} = \frac{(1+ \| y_i-y_j\|^2)^{-1}}{\sum_{k \neq l} (1+ \| y_k-y_l\|^2)^{-1}}.}
136 | By changing the location of the objects y in the embedding to minimize the Kullback-Leibler divergence between these two distributions \eqn{ q_{i j}} and \eqn{ p_{i j}}, we create a map that focusses on small-scale structure, due to the asymmetry of the KL-divergence. The t-distribution is chosen to avoid the crowding problem: in the original high dimensional space, there are potentially many equidistant objects with moderate distance from a particular object, more than can be accounted for in the low dimensional representation. The t-distribution makes sure that these objects are more spread out in the new representation.
137 |
For larger datasets, a problem with a simple gradient descent to minimize the Kullback-Leibler divergence is the computational complexity of each gradient step (which is \eqn{O(n^2)}). The Barnes-Hut implementation of the algorithm attempts to mitigate this problem using two tricks: (1) approximating small similarities by 0 in the \eqn{p_{ij}} distribution, where the non-zero entries are computed by finding 3*perplexity nearest neighbours using an efficient tree search. (2) Using the Barnes-Hut algorithm in the computation of the gradient which approximates large distance similarities using a quadtree. This approximation is controlled by the \code{theta} parameter, with smaller values leading to more exact approximations. When \code{theta=0.0}, the implementation uses a standard t-SNE implementation. The Barnes-Hut approximation leads to a \eqn{O(n log(n))} computational complexity for each iteration.
139 |
140 | During the minimization of the KL-divergence, the implementation uses a trick known as early exaggeration, which multiplies the \eqn{p_{ij}}'s by 12 during the first 250 iterations. This leads to tighter clustering and more distance between clusters of objects. This early exaggeration is not used when the user gives an initialization of the objects in the embedding by setting \code{Y_init}. During the early exaggeration phase, a momentum term of 0.5 is used while this is changed to 0.8 after the first 250 iterations. All these default parameters can be changed by the user.
141 |
142 | After checking the correctness of the input, the \code{Rtsne} function (optionally) does an initial reduction of the feature space using \code{\link{prcomp}}, before calling the C++ TSNE implementation. Since R's random number generator is used, use \code{\link{set.seed}} before the function call to get reproducible results.
143 |
144 | If \code{X} is a data.frame, it is transformed into a matrix using \code{\link{model.matrix}}. If \code{X} is a \code{\link{dist}} object, it is currently first expanded into a full distance matrix.
145 | }
146 | \section{Methods (by class)}{
147 | \itemize{
148 | \item \code{Rtsne(default)}: Default Interface
149 |
150 | \item \code{Rtsne(dist)}: tsne on given dist object
151 |
152 | \item \code{Rtsne(data.frame)}: tsne on data.frame
153 |
154 | }}
155 | \section{Supplying precomputed distances}{
156 |
157 | If a distance matrix is already available, this can be directly supplied to \code{Rtsne} by setting \code{is_distance=TRUE}.
158 | This improves efficiency by avoiding recalculation of distances, but requires some work to get the same results as running default \code{Rtsne} on a data matrix.
159 | Specifically, Euclidean distances should be computed from a normalized data matrix - see \code{\link{normalize_input}} for details.
160 | PCA arguments will also be ignored if \code{is_distance=TRUE}.
161 |
162 | NN search results can be directly supplied to \code{Rtsne_neighbors} to avoid repeating the (possibly time-consuming) search.
163 | To achieve the same results as \code{Rtsne} on the data matrix, the search should be conducted on the normalized data matrix.
164 | The number of nearest neighbors should also be equal to three-fold the \code{perplexity}, rounded down to the nearest integer.
165 | Note that pre-supplied NN results cannot be used when \code{theta=0} as they are only relevant for the approximate algorithm.
166 |
167 | Any kind of distance metric can be used as input.
168 | In contrast, running \code{Rtsne} on a data matrix will always use Euclidean distances.
169 | }
170 |
171 | \examples{
172 | iris_unique <- unique(iris) # Remove duplicates
173 | iris_matrix <- as.matrix(iris_unique[,1:4])
174 |
175 | # Set a seed if you want reproducible results
176 | set.seed(42)
177 | tsne_out <- Rtsne(iris_matrix,pca=FALSE,perplexity=30,theta=0.0) # Run TSNE
178 |
179 | # Show the objects in the 2D tsne representation
180 | plot(tsne_out$Y,col=iris_unique$Species, asp=1)
181 |
182 | # data.frame as input
183 | tsne_out <- Rtsne(iris_unique,pca=FALSE, theta=0.0)
184 |
185 | # Using a dist object
186 | set.seed(42)
187 | tsne_out <- Rtsne(dist(normalize_input(iris_matrix)), theta=0.0)
188 | plot(tsne_out$Y,col=iris_unique$Species, asp=1)
189 |
190 | set.seed(42)
191 | tsne_out <- Rtsne(as.matrix(dist(normalize_input(iris_matrix))),theta=0.0)
192 | plot(tsne_out$Y,col=iris_unique$Species, asp=1)
193 |
194 | # Supplying starting positions (example: continue from earlier embedding)
195 | set.seed(42)
196 | tsne_part1 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=350)
197 | tsne_part2 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=650, Y_init=tsne_part1$Y)
198 | plot(tsne_part2$Y,col=iris_unique$Species, asp=1)
199 | \dontrun{
200 | # Fast PCA and multicore
201 |
202 | tsne_out <- Rtsne(iris_matrix, theta=0.1, partial_pca = TRUE, initial_dims=3)
203 | tsne_out <- Rtsne(iris_matrix, theta=0.1, num_threads = 2)
204 | }
205 | }
206 | \references{
207 | Maaten, L. Van Der, 2014. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research, 15, p.3221-3245.
208 |
209 | van der Maaten, L.J.P. & Hinton, G.E., 2008. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research, 9, pp.2579-2605.
210 | }
211 |
--------------------------------------------------------------------------------
/R/Rtsne.R:
--------------------------------------------------------------------------------
1 | #' Barnes-Hut implementation of t-Distributed Stochastic Neighbor Embedding
2 | #'
3 | #' Wrapper for the C++ implementation of Barnes-Hut t-Distributed Stochastic Neighbor Embedding. t-SNE is a method for constructing a low dimensional embedding of high-dimensional data, distances or similarities. Exact t-SNE can be computed by setting theta=0.0.
4 | #'
5 | #' Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space: \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of \eqn{p_{j|i}} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space.
6 | #' For the low-dimensional space we use the Cauchy distribution (t-distribution with one degree of freedom) as the distribution of the distances to neighbouring objects:
#' \deqn{ q_{i j} = \frac{(1+ \| y_i-y_j\|^2)^{-1}}{\sum_{k \neq l} (1+ \| y_k-y_l\|^2)^{-1}}.}
8 | #' By changing the location of the objects y in the embedding to minimize the Kullback-Leibler divergence between these two distributions \eqn{ q_{i j}} and \eqn{ p_{i j}}, we create a map that focusses on small-scale structure, due to the asymmetry of the KL-divergence. The t-distribution is chosen to avoid the crowding problem: in the original high dimensional space, there are potentially many equidistant objects with moderate distance from a particular object, more than can be accounted for in the low dimensional representation. The t-distribution makes sure that these objects are more spread out in the new representation.
9 | #'
#' For larger datasets, a problem with a simple gradient descent to minimize the Kullback-Leibler divergence is the computational complexity of each gradient step (which is \eqn{O(n^2)}). The Barnes-Hut implementation of the algorithm attempts to mitigate this problem using two tricks: (1) approximating small similarities by 0 in the \eqn{p_{ij}} distribution, where the non-zero entries are computed by finding 3*perplexity nearest neighbours using an efficient tree search. (2) Using the Barnes-Hut algorithm in the computation of the gradient which approximates large distance similarities using a quadtree. This approximation is controlled by the \code{theta} parameter, with smaller values leading to more exact approximations. When \code{theta=0.0}, the implementation uses a standard t-SNE implementation. The Barnes-Hut approximation leads to a \eqn{O(n log(n))} computational complexity for each iteration.
11 | #'
12 | #' During the minimization of the KL-divergence, the implementation uses a trick known as early exaggeration, which multiplies the \eqn{p_{ij}}'s by 12 during the first 250 iterations. This leads to tighter clustering and more distance between clusters of objects. This early exaggeration is not used when the user gives an initialization of the objects in the embedding by setting \code{Y_init}. During the early exaggeration phase, a momentum term of 0.5 is used while this is changed to 0.8 after the first 250 iterations. All these default parameters can be changed by the user.
13 | #'
14 | #' After checking the correctness of the input, the \code{Rtsne} function (optionally) does an initial reduction of the feature space using \code{\link{prcomp}}, before calling the C++ TSNE implementation. Since R's random number generator is used, use \code{\link{set.seed}} before the function call to get reproducible results.
15 | #'
16 | #' If \code{X} is a data.frame, it is transformed into a matrix using \code{\link{model.matrix}}. If \code{X} is a \code{\link{dist}} object, it is currently first expanded into a full distance matrix.
17 | #'
18 | #' @param X matrix; Data matrix (each row is an observation, each column is a variable)
19 | #' @param index integer matrix; Each row contains the identity of the nearest neighbors for each observation
20 | #' @param distance numeric matrix; Each row contains the distance to the nearest neighbors in \code{index} for each observation
21 | #' @param dims integer; Output dimensionality (default: 2)
22 | #' @param initial_dims integer; the number of dimensions that should be retained in the initial PCA step (default: 50)
#' @param perplexity numeric; Perplexity parameter (should satisfy 3 * perplexity < nrow(X) - 1, see details for interpretation)
24 | #' @param theta numeric; Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact TSNE (default: 0.5)
25 | #' @param check_duplicates logical; Checks whether duplicates are present. It is best to make sure there are no duplicates present and set this option to FALSE, especially for large datasets (default: TRUE)
26 | #' @param pca logical; Whether an initial PCA step should be performed (default: TRUE)
27 | #' @param partial_pca logical; Whether truncated PCA should be used to calculate principal components (requires the irlba package). This is faster for large input matrices (default: FALSE)
28 | #' @param max_iter integer; Number of iterations (default: 1000)
29 | #' @param verbose logical; Whether progress updates should be printed (default: global "verbose" option, or FALSE if that is not set)
30 | #' @param ... Other arguments that can be passed to Rtsne
31 | #' @param is_distance logical; Indicate whether X is a distance matrix (default: FALSE)
32 | #' @param Y_init matrix; Initial locations of the objects. If NULL, random initialization will be used (default: NULL). Note that when using this, the initial stage with exaggerated perplexity values and a larger momentum term will be skipped.
33 | #' @param pca_center logical; Should data be centered before pca is applied? (default: TRUE)
34 | #' @param pca_scale logical; Should data be scaled before pca is applied? (default: FALSE)
35 | #' @param normalize logical; Should data be normalized internally prior to distance calculations with \code{\link{normalize_input}}? (default: TRUE)
36 | #' @param stop_lying_iter integer; Iteration after which the perplexities are no longer exaggerated (default: 250, except when Y_init is used, then 0)
37 | #' @param mom_switch_iter integer; Iteration after which the final momentum is used (default: 250, except when Y_init is used, then 0)
38 | #' @param momentum numeric; Momentum used in the first part of the optimization (default: 0.5)
39 | #' @param final_momentum numeric; Momentum used in the final part of the optimization (default: 0.8)
40 | #' @param eta numeric; Learning rate (default: 200.0)
41 | #' @param exaggeration_factor numeric; Exaggeration factor used to multiply the P matrix in the first part of the optimization (default: 12.0)
42 | #' @param num_threads integer; Number of threads to use when using OpenMP, default is 1. Setting to 0 corresponds to detecting and using all available cores
43 | #'
44 | #' @return List with the following elements:
45 | #' \item{Y}{Matrix containing the new representations for the objects}
46 | #' \item{N}{Number of objects}
47 | #' \item{origD}{Original Dimensionality before TSNE (only when \code{X} is a data matrix)}
48 | #' \item{perplexity}{See above}
49 | #' \item{theta}{See above}
50 | #' \item{costs}{The cost for every object after the final iteration}
51 | #' \item{itercosts}{The total costs (KL-divergence) for all objects in every 50th + the last iteration}
52 | #' \item{stop_lying_iter}{Iteration after which the perplexities are no longer exaggerated}
53 | #' \item{mom_switch_iter}{Iteration after which the final momentum is used}
54 | #' \item{momentum}{Momentum used in the first part of the optimization}
55 | #' \item{final_momentum}{Momentum used in the final part of the optimization}
56 | #' \item{eta}{Learning rate}
57 | #' \item{exaggeration_factor}{Exaggeration factor used to multiply the P matrix in the first part of the optimization}
58 | #'
59 | #' @section Supplying precomputed distances:
60 | #' If a distance matrix is already available, this can be directly supplied to \code{Rtsne} by setting \code{is_distance=TRUE}.
61 | #' This improves efficiency by avoiding recalculation of distances, but requires some work to get the same results as running default \code{Rtsne} on a data matrix.
62 | #' Specifically, Euclidean distances should be computed from a normalized data matrix - see \code{\link{normalize_input}} for details.
63 | #' PCA arguments will also be ignored if \code{is_distance=TRUE}.
64 | #'
65 | #' NN search results can be directly supplied to \code{Rtsne_neighbors} to avoid repeating the (possibly time-consuming) search.
66 | #' To achieve the same results as \code{Rtsne} on the data matrix, the search should be conducted on the normalized data matrix.
67 | #' The number of nearest neighbors should also be equal to three-fold the \code{perplexity}, rounded down to the nearest integer.
68 | #' Note that pre-supplied NN results cannot be used when \code{theta=0} as they are only relevant for the approximate algorithm.
69 | #'
70 | #' Any kind of distance metric can be used as input.
71 | #' In contrast, running \code{Rtsne} on a data matrix will always use Euclidean distances.
72 | #'
73 | #' @references Maaten, L. Van Der, 2014. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research, 15, p.3221-3245.
74 | #' @references van der Maaten, L.J.P. & Hinton, G.E., 2008. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research, 9, pp.2579-2605.
75 | #'
76 | #' @examples
77 | #' iris_unique <- unique(iris) # Remove duplicates
78 | #' iris_matrix <- as.matrix(iris_unique[,1:4])
79 | #'
80 | #' # Set a seed if you want reproducible results
81 | #' set.seed(42)
82 | #' tsne_out <- Rtsne(iris_matrix,pca=FALSE,perplexity=30,theta=0.0) # Run TSNE
83 | #'
84 | #' # Show the objects in the 2D tsne representation
85 | #' plot(tsne_out$Y,col=iris_unique$Species, asp=1)
86 | #'
87 | #' # data.frame as input
88 | #' tsne_out <- Rtsne(iris_unique,pca=FALSE, theta=0.0)
89 | #'
90 | #' # Using a dist object
91 | #' set.seed(42)
92 | #' tsne_out <- Rtsne(dist(normalize_input(iris_matrix)), theta=0.0)
93 | #' plot(tsne_out$Y,col=iris_unique$Species, asp=1)
94 | #'
95 | #' set.seed(42)
96 | #' tsne_out <- Rtsne(as.matrix(dist(normalize_input(iris_matrix))),theta=0.0)
97 | #' plot(tsne_out$Y,col=iris_unique$Species, asp=1)
98 | #'
99 | #' # Supplying starting positions (example: continue from earlier embedding)
100 | #' set.seed(42)
101 | #' tsne_part1 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=350)
102 | #' tsne_part2 <- Rtsne(iris_unique[,1:4], theta=0.0, pca=FALSE, max_iter=650, Y_init=tsne_part1$Y)
103 | #' plot(tsne_part2$Y,col=iris_unique$Species, asp=1)
104 | #' \dontrun{
105 | #' # Fast PCA and multicore
106 | #'
107 | #' tsne_out <- Rtsne(iris_matrix, theta=0.1, partial_pca = TRUE, initial_dims=3)
108 | #' tsne_out <- Rtsne(iris_matrix, theta=0.1, num_threads = 2)
109 | #' }
110 | #' @useDynLib Rtsne, .registration = TRUE
111 | #' @import Rcpp
112 | #' @importFrom stats model.matrix na.fail prcomp
113 | #'
114 | #' @export
Rtsne <- function (X, ...) {
  # S3 generic: dispatch on the class of X. Methods defined below handle
  # matrices (default), 'dist' objects, and data.frames.
  UseMethod("Rtsne", X)
}
118 |
119 | #' @describeIn Rtsne Default Interface
120 | #' @export
Rtsne.default <- function(X, dims=2, initial_dims=50,
                          perplexity=30, theta=0.5,
                          check_duplicates=TRUE,
                          pca=TRUE, partial_pca=FALSE, max_iter=1000, verbose=getOption("verbose", FALSE),
                          is_distance=FALSE, Y_init=NULL,
                          pca_center=TRUE, pca_scale=FALSE, normalize=TRUE,
                          stop_lying_iter=ifelse(is.null(Y_init),250L,0L),
                          mom_switch_iter=ifelse(is.null(Y_init),250L,0L),
                          momentum=0.5, final_momentum=0.8,
                          eta=200.0, exaggeration_factor=12.0, num_threads=1, ...) {

  # --- Validate arguments ---------------------------------------------------
  # Scalar conditions use && (short-circuit, scalar) rather than the
  # vectorized & so a non-length-1 operand errors instead of being truncated.
  if (!is.logical(is_distance)) { stop("is_distance should be a logical variable")}
  if (!is.matrix(X)) { stop("Input X is not a matrix")}
  if (is_distance && !(is.matrix(X) && (nrow(X)==ncol(X)))) { stop("Input is not an accepted distance matrix") }
  if (!(is.logical(pca_center) && is.logical(pca_scale)) ) { stop("pca_center and pca_scale should be TRUE or FALSE")}
  if (!is.wholenumber(initial_dims) || initial_dims<=0) { stop("Incorrect initial dimensionality.")}
  if (!is.wholenumber(num_threads) || num_threads<0) { stop("Incorrect number of threads.")}
  tsne.args <- .check_tsne_params(nrow(X), dims=dims, perplexity=perplexity, theta=theta, max_iter=max_iter, verbose=verbose,
                                  Y_init=Y_init, stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter,
                                  momentum=momentum, final_momentum=final_momentum, eta=eta, exaggeration_factor=exaggeration_factor)

  # Check for missing values
  X <- na.fail(X)

  # Apply PCA (only meaningful for a data matrix; skipped for distance input)
  if (!is_distance) {
    if (pca) {
      if(verbose) cat("Performing PCA\n")
      if(partial_pca){
        # Truncated PCA via irlba: much faster when only initial_dims PCs are needed.
        if (!requireNamespace("irlba", quietly = TRUE)) {stop("Package \"irlba\" is required for partial PCA. Please install it.", call. = FALSE)}
        X <- irlba::prcomp_irlba(X, n = initial_dims, center = pca_center, scale. = pca_scale)$x
      }else{
        if(verbose && min(dim(X))>2500) cat("Consider setting partial_pca=TRUE for large matrices\n")
        X <- prcomp(X, retx=TRUE, center = pca_center, scale. = pca_scale, rank. = initial_dims)$x
      }
    }
    if (check_duplicates) {
      # Duplicate rows break the C++ implementation's perplexity calibration.
      if (any(duplicated(X))) { stop("Remove duplicates before running TSNE.") }
    }
    if (normalize) {
      X <- normalize_input(X)
    }
    X <- t(X) # transposing for rapid column-major access.
  } else {
    # The C++ exact algorithm expects *squared* distances when precomputed.
    if (theta==0.0) {
      X <- X^2
    }
  }

  out <- do.call(Rtsne_cpp, c(list(X=X, distance_precomputed=is_distance, num_threads=num_threads), tsne.args))
  out$Y <- t(out$Y) # Transposing back.
  info <- list(N=ncol(X)) # columns index observations here (data matrix was transposed; distance matrix is square)
  if (!is_distance) { out$origD <- nrow(X) } # 'origD' is unknown for distance matrices.
  out <- c(info, out, .clear_unwanted_params(tsne.args))
  class(out) <- c("Rtsne","list")
  out
}
179 |
180 | #' @describeIn Rtsne tsne on given dist object
181 | #' @export
Rtsne.dist <- function(X,...,is_distance=TRUE) {
  # Fail early on missing values, then expand the lower-triangular 'dist'
  # object into a full square matrix before dispatching to the default method.
  dist_matrix <- as.matrix(na.fail(X))
  Rtsne(dist_matrix, ..., is_distance = is_distance)
}
186 |
187 | #' @describeIn Rtsne tsne on data.frame
188 | #' @export
Rtsne.data.frame <- function(X,...) {
  # Expand the data.frame into a numeric design matrix (factors become dummy
  # columns, no intercept term), refusing any rows with missing values.
  design <- model.matrix(~ . - 1, na.fail(X))
  Rtsne(design, ...)
}
193 |
--------------------------------------------------------------------------------
/src/sptree.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
4 | * All rights reserved.
5 | *
6 | * Redistribution and use in source and binary forms, with or without
7 | * modification, are permitted provided that the following conditions are met:
8 | * 1. Redistributions of source code must retain the above copyright
9 | * notice, this list of conditions and the following disclaimer.
10 | * 2. Redistributions in binary form must reproduce the above copyright
11 | * notice, this list of conditions and the following disclaimer in the
12 | * documentation and/or other materials provided with the distribution.
13 | * 3. All advertising materials mentioning features or use of this software
14 | * must display the following acknowledgement:
15 | * This product includes software developed by the Delft University of Technology.
16 | * 4. Neither the name of the Delft University of Technology nor the names of
17 | * its contributors may be used to endorse or promote products derived from
18 | * this software without specific prior written permission.
19 | *
20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
29 | * OF SUCH DAMAGE.
30 | *
31 | */
32 |
// NOTE(review): the angle-bracket header names below were lost during text
// extraction (only the bare `#include` tokens survived); they are reconstructed
// from what this translation unit uses (malloc/free, DBL_MAX, sqrt, Rprintf).
// Verify against the upstream Rtsne sources.
#include <math.h>
#include <float.h>
#include <stdlib.h>
#include <stdio.h>
#include <cmath>
#include <vector>
#include <Rcpp.h>
#include "sptree.h"
41 |
42 |
43 |
44 | // Constructs cell
45 | template
46 | Cell::Cell() {
47 | }
48 |
49 | template
50 | Cell::Cell(double* inp_corner, double* inp_width) {
51 | for(int d = 0; d < NDims; d++) setCorner(d, inp_corner[d]);
52 | for(int d = 0; d < NDims; d++) setWidth( d, inp_width[d]);
53 | }
54 |
55 | // Destructs cell
56 | template
57 | Cell::~Cell() {
58 | }
59 |
60 | template
61 | double Cell::getCorner(unsigned int d) const {
62 | return corner[d];
63 | }
64 |
65 | template
66 | double Cell::getWidth(unsigned int d) const {
67 | return width[d];
68 | }
69 |
70 | template
71 | void Cell::setCorner(unsigned int d, double val) {
72 | corner[d] = val;
73 | }
74 |
75 | template
76 | void Cell::setWidth(unsigned int d, double val) {
77 | width[d] = val;
78 | }
79 |
80 | // Checks whether a point lies in a cell
81 | template
82 | bool Cell::containsPoint(double point[]) const
83 | {
84 | for(int d = 0; d < NDims; d++) {
85 | if(corner[d] - width[d] > point[d]) return false;
86 | if(corner[d] + width[d] < point[d]) return false;
87 | }
88 | return true;
89 | }
90 |
91 |
92 | // Default constructor for SPTree -- build tree, too!
93 | template
94 | SPTree::SPTree(double* inp_data, unsigned int N)
95 | {
96 |
97 | // Compute mean, width, and height of current map (boundaries of SPTree)
98 | int nD = 0;
99 | double* mean_Y = (double*) calloc(NDims, sizeof(double));
100 | double* min_Y = (double*) malloc(NDims * sizeof(double));
101 | double* max_Y = (double*) malloc(NDims * sizeof(double));
102 |
103 | for(unsigned int d = 0; d < NDims; d++) {
104 | min_Y[d] = DBL_MAX;
105 | max_Y[d] = -DBL_MAX;
106 | }
107 |
108 | for(unsigned int n = 0; n < N; n++) {
109 | for(unsigned int d = 0; d < NDims; d++) {
110 | mean_Y[d] += inp_data[n * NDims + d];
111 | if(inp_data[nD + d] < min_Y[d]) min_Y[d] = inp_data[nD + d];
112 | if(inp_data[nD + d] > max_Y[d]) max_Y[d] = inp_data[nD + d];
113 | }
114 | nD += NDims;
115 | }
116 |
117 | for(int d = 0; d < NDims; d++) mean_Y[d] /= (double) N;
118 |
119 | // Construct SPTree
120 | double* width = (double*) malloc(NDims * sizeof(double));
121 | for(int d = 0; d < NDims; d++) width[d] = max_tsne(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5;
122 | init(NULL, inp_data, mean_Y, width);
123 | fill(N);
124 |
125 | // Clean up memory
126 | free(mean_Y);
127 | free(max_Y);
128 | free(min_Y);
129 | free(width);
130 | }
131 |
132 |
133 | // Constructor for SPTree with particular size and parent -- build the tree, too!
134 | template
135 | SPTree::SPTree(double* inp_data, unsigned int N, double* inp_corner, double* inp_width)
136 | {
137 | init(NULL, inp_data, inp_corner, inp_width);
138 | fill(N);
139 | }
140 |
141 |
142 | // Constructor for SPTree with particular size (do not fill the tree)
143 | template
144 | SPTree::SPTree(double* inp_data, double* inp_corner, double* inp_width)
145 | {
146 | init(NULL, inp_data, inp_corner, inp_width);
147 | }
148 |
149 |
150 | // Constructor for SPTree with particular size and parent (do not fill tree)
151 | template
152 | SPTree::SPTree(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width) {
153 | init(inp_parent, inp_data, inp_corner, inp_width);
154 | }
155 |
156 |
157 | // Constructor for SPTree with particular size and parent -- build the tree, too!
158 | template
159 | SPTree::SPTree(SPTree* inp_parent, double* inp_data, unsigned int N, double* inp_corner, double* inp_width)
160 | {
161 | init(inp_parent, inp_data, inp_corner, inp_width);
162 | fill(N);
163 | }
164 |
165 |
166 | // Main initialization function
167 | template
168 | void SPTree::init(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width)
169 | {
170 | parent = inp_parent;
171 | data = inp_data;
172 | is_leaf = true;
173 | size = 0;
174 | cum_size = 0;
175 |
176 | for(unsigned int d = 0; d < NDims; d++) boundary.setCorner(d, inp_corner[d]);
177 | for(unsigned int d = 0; d < NDims; d++) boundary.setWidth( d, inp_width[d]);
178 |
179 | for(unsigned int i = 0; i < no_children; i++) children[i] = NULL;
180 | for(unsigned int d = 0; d < NDims; d++) center_of_mass[d] = .0;
181 | }
182 |
183 |
184 | // Destructor for SPTree
185 | template
186 | SPTree::~SPTree()
187 | {
188 | for(unsigned int i = 0; i < no_children; i++) {
189 | if(children[i] != NULL) delete children[i];
190 | }
191 | }
192 |
193 |
194 | // Update the data underlying this tree
195 | template
196 | void SPTree::setData(double* inp_data)
197 | {
198 | data = inp_data;
199 | }
200 |
201 |
202 | // Get the parent of the current tree
203 | template
204 | SPTree* SPTree::getParent()
205 | {
206 | return parent;
207 | }
208 |
209 |
210 | // Insert a point into the SPTree
211 | template
212 | bool SPTree::insert(unsigned int new_index)
213 | {
214 | // Ignore objects which do not belong in this quad tree
215 | double* point = data + new_index * NDims;
216 | if(!boundary.containsPoint(point))
217 | return false;
218 |
219 | // Online update of cumulative size and center-of-mass
220 | cum_size++;
221 | double mult1 = (double) (cum_size - 1) / (double) cum_size;
222 | double mult2 = 1.0 / (double) cum_size;
223 |
224 | for(unsigned int d = 0; d < NDims; d++) {
225 | center_of_mass[d] = center_of_mass[d] * mult1 + mult2 * point[d];
226 | }
227 |
228 | // If there is space in this quad tree and it is a leaf, add the object here
229 | if(is_leaf && size < QT_NODE_CAPACITY) {
230 | index[size] = new_index;
231 | size++;
232 | return true;
233 | }
234 |
235 | // Don't add duplicates for now (this is not very nice)
236 | bool any_duplicate = false;
237 | for(unsigned int n = 0; n < size; n++) {
238 | bool duplicate = true;
239 | for(unsigned int d = 0; d < NDims; d++) {
240 | if(point[d] != data[index[n] * NDims + d]) { duplicate = false; break; }
241 | }
242 | any_duplicate = any_duplicate | duplicate;
243 | }
244 | if(any_duplicate) return true;
245 |
246 | // Otherwise, we need to subdivide the current cell
247 | if(is_leaf) subdivide();
248 |
249 | // Find out where the point can be inserted
250 | for(unsigned int i = 0; i < no_children; i++) {
251 | if(children[i]->insert(new_index)) return true;
252 | }
253 |
254 | // Otherwise, the point cannot be inserted (this should never happen)
255 | return false;
256 | }
257 |
258 |
259 | // Create four children which fully divide this cell into four quads of equal area
260 | template
261 | void SPTree::subdivide() {
262 |
263 | // Create new children
264 | double new_corner[NDims];
265 | double new_width[NDims];
266 | for(unsigned int i = 0; i < no_children; i++) {
267 | unsigned int div = 1;
268 | for(unsigned int d = 0; d < NDims; d++) {
269 | new_width[d] = .5 * boundary.getWidth(d);
270 | if((i / div) % 2 == 1) new_corner[d] = boundary.getCorner(d) - .5 * boundary.getWidth(d);
271 | else new_corner[d] = boundary.getCorner(d) + .5 * boundary.getWidth(d);
272 | div *= 2;
273 | }
274 | children[i] = new SPTree(this, data, new_corner, new_width);
275 | }
276 |
277 | // Move existing points to correct children
278 | for(unsigned int i = 0; i < size; i++) {
279 | bool success = false;
280 | for(unsigned int j = 0; j < no_children; j++) {
281 | if(!success) success = children[j]->insert(index[i]);
282 | }
283 | index[i] = -1;
284 | }
285 |
286 | // Empty parent node
287 | size = 0;
288 | is_leaf = false;
289 | }
290 |
291 |
292 | // Build SPTree on dataset
293 | template
294 | void SPTree::fill(unsigned int N)
295 | {
296 | for(unsigned int i = 0; i < N; i++) insert(i);
297 | }
298 |
299 |
300 | // Checks whether the specified tree is correct
301 | template
302 | bool SPTree::isCorrect()
303 | {
304 | for(unsigned int n = 0; n < size; n++) {
305 | double* point = data + index[n] * NDims;
306 | if(!boundary.containsPoint(point)) return false;
307 | }
308 | if(!is_leaf) {
309 | bool correct = true;
310 | for(int i = 0; i < no_children; i++) correct = correct && children[i]->isCorrect();
311 | return correct;
312 | }
313 | else return true;
314 | }
315 |
316 |
317 |
318 | // Build a list of all indices in SPTree
319 | template
320 | void SPTree::getAllIndices(unsigned int* indices)
321 | {
322 | getAllIndices(indices, 0);
323 | }
324 |
325 |
326 | // Build a list of all indices in SPTree
327 | template
328 | unsigned int SPTree::getAllIndices(unsigned int* indices, unsigned int loc)
329 | {
330 |
331 | // Gather indices in current quadrant
332 | for(unsigned int i = 0; i < size; i++) indices[loc + i] = index[i];
333 | loc += size;
334 |
335 | // Gather indices in children
336 | if(!is_leaf) {
337 | for(int i = 0; i < no_children; i++) loc = children[i]->getAllIndices(indices, loc);
338 | }
339 | return loc;
340 | }
341 |
342 | template
343 | unsigned int SPTree::getDepth() {
344 | if(is_leaf) return 1;
345 | int depth = 0;
346 | for(unsigned int i = 0; i < no_children; i++) depth = max_tsne(depth, children[i]->getDepth());
347 | return 1 + depth;
348 | }
349 |
350 |
351 | // Compute non-edge forces using Barnes-Hut algorithm
352 | template
353 | double SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[]) const
354 | {
355 | double resultSum = 0;
356 | double buff[NDims]; // make buff local for parallelization
357 |
358 | // Make sure that we spend no time on empty nodes or self-interactions
359 | if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return resultSum;
360 |
361 | // Compute distance between point and center-of-mass
362 | double sqdist = .0;
363 | unsigned int ind = point_index * NDims;
364 |
365 | for(unsigned int d = 0; d < NDims; d++) {
366 | buff[d] = data[ind + d] - center_of_mass[d];
367 | sqdist += buff[d] * buff[d];
368 | }
369 |
370 | // Check whether we can use this node as a "summary"
371 | double max_width = 0.0;
372 | double cur_width;
373 | for(unsigned int d = 0; d < NDims; d++) {
374 | cur_width = boundary.getWidth(d);
375 | max_width = (max_width > cur_width) ? max_width : cur_width;
376 | }
377 | if(is_leaf || max_width / sqrt(sqdist) < theta) {
378 |
379 | // Compute and add t-SNE force between point and current node
380 | sqdist = 1.0 / (1.0 + sqdist);
381 | double mult = cum_size * sqdist;
382 | resultSum += mult;
383 | mult *= sqdist;
384 | for(unsigned int d = 0; d < NDims; d++) neg_f[d] += mult * buff[d];
385 | }
386 | else {
387 |
388 | // Recursively apply Barnes-Hut to children
389 | for(unsigned int i = 0; i < no_children; i++){
390 | resultSum += children[i]->computeNonEdgeForces(point_index, theta, neg_f);
391 | }
392 | }
393 | return resultSum;
394 | }
395 |
396 |
397 | // Computes edge forces
398 | template
399 | void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, unsigned int N, double* pos_f, int num_threads) const
400 | {
401 |
402 | // Loop over all edges in the graph
403 | #pragma omp parallel for schedule(static) num_threads(num_threads)
404 | for(unsigned int n = 0; n < N; n++) {
405 | unsigned int ind1 = n * NDims;
406 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
407 |
408 | double buff[NDims]; // make buff local for parallelization
409 |
410 | // Compute pairwise distance and Q-value
411 | double sqdist = 1.0;
412 | unsigned int ind2 = col_P[i] * NDims;
413 |
414 | for(unsigned int d = 0; d < NDims; d++) {
415 | buff[d] = data[ind1 + d] - data[ind2 + d];
416 | sqdist += buff[d] * buff[d];
417 | }
418 |
419 | sqdist = val_P[i] / sqdist;
420 |
421 | // Sum positive force
422 | for(unsigned int d = 0; d < NDims; d++) pos_f[ind1 + d] += sqdist * buff[d];
423 | }
424 | }
425 | }
426 |
427 |
428 | // Print out tree
429 | template
430 | void SPTree::print()
431 | {
432 | if(cum_size == 0) {
433 | Rprintf("Empty node\n");
434 | return;
435 | }
436 |
437 | if(is_leaf) {
438 | Rprintf("Leaf node; data = [");
439 | for(unsigned int i = 0; i < size; i++) {
440 | double* point = data + index[i] * NDims;
441 | for(int d = 0; d < NDims; d++) Rprintf("%f, ", point[d]);
442 | Rprintf(" (index = %d)", index[i]);
443 | if(i < size - 1) Rprintf("\n");
444 | else Rprintf("]\n");
445 | }
446 | }
447 | else {
448 | Rprintf("Intersection node with center-of-mass = [");
449 | for(int d = 0; d < NDims; d++) Rprintf("%f, ", center_of_mass[d]);
450 | Rprintf("]; children are:\n");
451 | for(int i = 0; i < no_children; i++) children[i]->print();
452 | }
453 | }
454 |
455 | // declare templates explicitly
456 | template class SPTree<1>;
457 | template class SPTree<2>;
458 | template class SPTree<3>;
--------------------------------------------------------------------------------
/src/tsne.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
4 | * All rights reserved.
5 | *
6 | * Redistribution and use in source and binary forms, with or without
7 | * modification, are permitted provided that the following conditions are met:
8 | * 1. Redistributions of source code must retain the above copyright
9 | * notice, this list of conditions and the following disclaimer.
10 | * 2. Redistributions in binary form must reproduce the above copyright
11 | * notice, this list of conditions and the following disclaimer in the
12 | * documentation and/or other materials provided with the distribution.
13 | * 3. All advertising materials mentioning features or use of this software
14 | * must display the following acknowledgement:
15 | * This product includes software developed by the Delft University of Technology.
16 | * 4. Neither the name of the Delft University of Technology nor the names of
17 | * its contributors may be used to endorse or promote products derived from
18 | * this software without specific prior written permission.
19 | *
20 | * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
21 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
23 | * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
28 | * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
29 | * OF SUCH DAMAGE.
30 | *
31 | */
32 |
#define USE_FC_LEN_T

// NOTE(review): angle-bracket header names were lost during extraction; the
// list below is reconstructed from what this translation unit uses
// (malloc/free, DBL_MIN, sqrt/log, clock, std::vector, Rprintf/Rcpp::stop).
// Verify against the upstream Rtsne sources.
#include <math.h>
#include <float.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <vector>
#include <Rcpp.h>
#include "sptree.h"
#include "vptree.h"
#include "tsne.h"


#ifdef _OPENMP
#include <omp.h>
#endif

extern "C" {
// NOTE(review): stripped targets reconstructed — USE_FC_LEN_T/FCONE above
// imply R's Fortran BLAS/LAPACK headers; confirm against upstream.
#include <R_ext/BLAS.h>
#include <R_ext/Lapack.h>
}

#ifndef FCONE
# define FCONE
#endif
59 |
60 | using namespace std;
61 |
62 | template
63 | TSNE::TSNE(double Perplexity, double Theta, bool Verbose, int Max_iter, bool Init, int Stop_lying_iter,
64 | int Mom_switch_iter, double Momentum, double Final_momentum, double Eta, double Exaggeration_factor, int Num_threads) :
65 | perplexity(Perplexity), theta(Theta), momentum(Momentum), final_momentum(Final_momentum), eta(Eta), exaggeration_factor(Exaggeration_factor),
66 | max_iter(Max_iter), stop_lying_iter(Stop_lying_iter), mom_switch_iter(Mom_switch_iter), num_threads(Num_threads),
67 | verbose(Verbose), init(Init), exact(theta==.0) {
68 |
69 | #ifdef _OPENMP
70 | int threads = num_threads;
71 | if (num_threads==0) {
72 | threads = omp_get_max_threads();
73 | }
74 |
75 | // Print notice whether OpenMP is used
76 | if (verbose) Rprintf("OpenMP is working. %d threads.\n", threads);
77 | #endif
78 |
79 | return;
80 | }
81 |
82 | // Perform t-SNE
83 | template
84 | void TSNE::run(double* X, unsigned int N, int D, double* Y, bool distance_precomputed, double* cost, double* itercost) {
85 | if(N - 1 < 3 * perplexity) { Rcpp::stop("Perplexity too large for the number of data points!\n"); }
86 | if (verbose) Rprintf("Using no_dims = %d, perplexity = %f, and theta = %f\n", NDims, perplexity, theta);
87 | if (verbose) Rprintf("Computing input similarities...\n");
88 | clock_t start = clock();
89 |
90 | // Compute input similarities for exact t-SNE
91 | if(exact) {
92 | // Compute similarities
93 | computeGaussianPerplexity(X, N, D, distance_precomputed);
94 |
95 | // Symmetrize input similarities
96 | if (verbose) Rprintf("Symmetrizing...\n");
97 | for(unsigned long n = 0; n < N; n++) {
98 | for(unsigned long m = n + 1; m < N; m++) {
99 | P[n * N + m] += P[m * N + n];
100 | P[m * N + n] = P[n * N + m];
101 | }
102 | }
103 |
104 | double sum_P = 0;
105 | for(size_t i = 0; i < P.size(); i++) sum_P += P[i];
106 | for(size_t i = 0; i < P.size(); i++) P[i] /= sum_P;
107 | }
108 |
109 | // Compute input similarities for approximate t-SNE
110 | else {
111 | int K=3*perplexity;
112 |
113 | // Compute asymmetric pairwise input similarities
114 | if (distance_precomputed) {
115 | computeGaussianPerplexity(X, N, D, K);
116 | } else {
117 | computeGaussianPerplexity(X, N, D, K);
118 | }
119 |
120 | // Symmetrize input similarities
121 | symmetrizeMatrix(N);
122 | double sum_P = .0;
123 | for(unsigned int i = 0; i < row_P[N]; i++) sum_P += val_P[i];
124 | for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P;
125 | }
126 |
127 | if (verbose) {
128 | clock_t end = clock();
129 | if(exact) Rprintf("Done in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC);
130 | else Rprintf("Done in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N));
131 | }
132 |
133 | trainIterations(N, Y, cost, itercost);
134 | return;
135 | }
136 |
137 | // Perform t-SNE with nearest neighbor results.
138 | template
139 | void TSNE::run(const int* nn_index, const double* nn_dist, unsigned int N, int K, double* Y, double* cost, double* itercost) {
140 | if(N - 1 < 3 * perplexity) { Rcpp::stop("Perplexity too large for the number of data points!\n"); }
141 | if (verbose) Rprintf("Using no_dims = %d, perplexity = %f, and theta = %f\n", NDims, perplexity, theta);
142 | if (verbose) Rprintf("Computing input similarities...\n");
143 | clock_t start = clock();
144 |
145 | // Compute asymmetric pairwise input similarities
146 | computeGaussianPerplexity(nn_index, nn_dist, N, K);
147 |
148 | // Symmetrize input similarities
149 | symmetrizeMatrix(N);
150 | double sum_P = .0;
151 | for(unsigned int i = 0; i < row_P[N]; i++) sum_P += val_P[i];
152 | for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P;
153 |
154 | if (verbose) {
155 | clock_t end = clock();
156 | if(exact) Rprintf("Done in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC);
157 | else Rprintf("Done in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N));
158 | }
159 |
160 | trainIterations(N, Y, cost, itercost);
161 | return;
162 | }
163 |
164 | // Perform main training loop
165 | template
166 | void TSNE::trainIterations(unsigned int N, double* Y, double* cost, double* itercost) {
167 | // Allocate some memory
168 | double* dY = (double*) malloc(N * NDims * sizeof(double));
169 | double* uY = (double*) malloc(N * NDims * sizeof(double));
170 | double* gains = (double*) malloc(N * NDims * sizeof(double));
171 | if(dY == NULL || uY == NULL || gains == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
172 | for(unsigned int i = 0; i < N * NDims; i++) uY[i] = .0;
173 | for(unsigned int i = 0; i < N * NDims; i++) gains[i] = 1.0;
174 |
175 | // Lie about the P-values
176 | if(exact) { for(unsigned long i = 0; i < (unsigned long)N * N; i++) P[i] *= exaggeration_factor; }
177 | else { for(unsigned long i = 0; i < row_P[N]; i++) val_P[i] *= exaggeration_factor; }
178 |
179 | // Initialize solution (randomly), if not already done
180 | if (!init) { for(unsigned int i = 0; i < N * NDims; i++) Y[i] = randn() * .0001; }
181 |
182 | clock_t start = clock(), end;
183 | float total_time=0;
184 | int costi = 0; //iterator for saving the total costs for the iterations
185 |
186 | for(int iter = 0; iter < max_iter; iter++) {
187 |
188 | // Stop lying about the P-values after a while, and switch momentum
189 | if(iter == stop_lying_iter) {
190 | if(exact) { for(unsigned long i = 0; i < (unsigned long)N * N; i++) P[i] /= exaggeration_factor; }
191 | else { for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= exaggeration_factor; }
192 | }
193 | if(iter == mom_switch_iter) momentum = final_momentum;
194 |
195 | // Compute (approximate) gradient
196 | if(exact) computeExactGradient(P.data(), Y, N, NDims, dY);
197 | else computeGradient(P.data(), row_P.data(), col_P.data(), val_P.data(), Y, N, NDims, dY, theta);
198 |
199 | // Update gains
200 | for(unsigned int i = 0; i < N * NDims; i++) gains[i] = (sign_tsne(dY[i]) != sign_tsne(uY[i])) ? (gains[i] + .2) : (gains[i] * .8);
201 | for(unsigned int i = 0; i < N * NDims; i++) if(gains[i] < .01) gains[i] = .01;
202 |
203 | // Perform gradient update (with momentum and gains)
204 | for(unsigned int i = 0; i < N * NDims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i];
205 | for(unsigned int i = 0; i < N * NDims; i++) Y[i] = Y[i] + uY[i];
206 |
207 | // Make solution zero-mean
208 | zeroMean(Y, N, NDims);
209 |
210 | // Print out progress
211 | if((iter > 0 && (iter+1) % 50 == 0) || iter == max_iter - 1) {
212 | end = clock();
213 | double C = .0;
214 | if(exact) C = evaluateError(P.data(), Y, N, NDims);
215 | else C = evaluateError(row_P.data(), col_P.data(), val_P.data(), Y, N, NDims, theta); // doing approximate computation here!
216 | if(iter == 0) {
217 | if (verbose) Rprintf("Iteration %d: error is %f\n", iter + 1, C);
218 | }
219 | else {
220 | total_time += (float) (end - start) / CLOCKS_PER_SEC;
221 | if (verbose) Rprintf("Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter+1, C, (float) (end - start) / CLOCKS_PER_SEC);
222 | }
223 | itercost[costi] = C;
224 | costi++;
225 | start = clock();
226 | }
227 | }
228 | end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC;
229 |
230 | if(exact) getCost(P.data(), Y, N, NDims, cost);
231 | else getCost(row_P.data(), col_P.data(), val_P.data(), Y, N, NDims, theta, cost); // doing approximate computation here!
232 |
233 | // Clean up memory
234 | free(dY);
235 | free(uY);
236 | free(gains);
237 | if (verbose) Rprintf("Fitting performed in %4.2f seconds.\n", total_time);
238 | return;
239 | }
240 |
241 | // Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm)
242 | template
243 | void TSNE::computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, unsigned int N, int D, double* dC, double theta)
244 | {
245 | // Construct space-partitioning tree on current map
246 | SPTree* tree = new SPTree(Y, N);
247 |
248 | // Compute all terms required for t-SNE gradient
249 | double* pos_f = (double*) calloc(N * D, sizeof(double));
250 | double* neg_f = (double*) calloc(N * D, sizeof(double));
251 | if(pos_f == NULL || neg_f == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
252 | tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f, num_threads);
253 |
254 | // Storing the output to sum in single-threaded mode; avoid randomness in rounding errors.
255 | std::vector output(N);
256 |
257 | #pragma omp parallel for schedule(guided) num_threads(num_threads)
258 | for (unsigned int n = 0; n < N; n++) {
259 | output[n]=tree->computeNonEdgeForces(n, theta, neg_f + n * D);
260 | }
261 |
262 | double sum_Q = .0;
263 | for (unsigned int n=0; n
279 | void TSNE::computeExactGradient(double* P, double* Y, unsigned int N, int D, double* dC) {
280 |
281 | // Make sure the current gradient contains zeros
282 | for(unsigned int i = 0; i < N * D; i++) dC[i] = 0.0;
283 |
284 | // Compute the squared Euclidean distance matrix
285 | double* DD = (double*) malloc((unsigned long)N * N * sizeof(double));
286 | if(DD == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
287 | computeSquaredEuclideanDistance(Y, N, D, DD);
288 |
289 | // Compute Q-matrix and normalization sum
290 | double* Q = (double*) malloc((unsigned long)N * N * sizeof(double));
291 | if(Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
292 | double sum_Q = .0;
293 | for(unsigned long n = 0; n < N; n++) {
294 | for(unsigned long m = 0; m < N; m++) {
295 | if(n != m) {
296 | Q[n * N + m] = 1 / (1 + DD[n * N + m]);
297 | sum_Q += Q[n * N + m];
298 | }
299 | }
300 | }
301 |
302 | // Perform the computation of the gradient
303 | for(unsigned long n = 0; n < N; n++) {
304 | for(unsigned long m = 0; m < N; m++) {
305 | if(n != m) {
306 | double mult = (P[n * N + m] - (Q[n * N + m] / sum_Q)) * Q[n * N + m];
307 | for(int d = 0; d < D; d++) {
308 | dC[n * D + d] += (Y[n * D + d] - Y[m * D + d]) * mult;
309 | }
310 | }
311 | }
312 | }
313 |
314 | // Free memory
315 | free(DD); DD = NULL;
316 | free(Q); Q = NULL;
317 | }
318 |
319 |
320 | // Evaluate t-SNE cost function (exactly)
// Returns the KL divergence KL(P||Q) over all N*N pairs; O(N^2) time and memory.
// P: N*N input similarities; Y: N*D embedding. 1e-9 offsets guard log() against zeros.
// NOTE(review): the bare `template` line below lost its parameter list to text extraction
// (upstream has a template header here) — confirm against upstream Rtsne src/tsne.cpp.
321 | template
322 | double TSNE::evaluateError(double* P, double* Y, unsigned int N, int D) {
323 |
324 | // Compute the squared Euclidean distance matrix
325 | double* DD = (double*) malloc((unsigned long)N * N * sizeof(double));
326 | double* Q = (double*) malloc((unsigned long)N * N * sizeof(double));
327 | if(DD == NULL || Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
328 | computeSquaredEuclideanDistance(Y, N, D, DD);
329 |
330 | // Compute Q-matrix and normalization sum
// sum_Q starts at DBL_MIN (not 0) and diagonal entries are set to DBL_MIN so the later
// division and log never see an exact zero.
331 | double sum_Q = DBL_MIN;
332 | for(unsigned long n = 0; n < N; n++) {
333 | for(unsigned long m = 0; m < N; m++) {
334 | if(n != m) {
335 | Q[n * N + m] = 1 / (1 + DD[n * N + m]);
336 | sum_Q += Q[n * N + m];
337 | }
338 | else Q[n * N + m] = DBL_MIN;
339 | }
340 | }
341 | for(unsigned long i = 0; i < (unsigned long)N * N; i++) Q[i] /= sum_Q;
342 |
343 | // Sum t-SNE error
344 | double C = .0;
345 | for(unsigned long n = 0; n < N; n++) {
346 | for(unsigned long m = 0; m < N; m++) {
347 | C += P[n * N + m] * log((P[n * N + m] + 1e-9) / (Q[n * N + m] + 1e-9));
348 | }
349 | }
350 |
351 | // Clean up memory
352 | free(DD);
353 | free(Q);
354 | return C;
355 | }
356 |
357 | // Evaluate t-SNE cost function (approximately)
// Barnes-Hut variant: KL divergence restricted to the sparse edges in (row_P, col_P, val_P)
// (CSR layout), with the normalization term sum_Q estimated via an SPTree at accuracy theta.
// NOTE(review): template argument lists on `template`/`SPTree` appear stripped by text
// extraction — confirm against upstream Rtsne src/tsne.cpp.
358 | template
359 | double TSNE::evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta)
360 | {
361 |
362 | // Get estimate of normalization term
// NOTE(review): unlike sibling allocations, this calloc result is not NULL-checked.
363 | SPTree* tree = new SPTree(Y, N);
364 | double* buff = (double*) calloc(D, sizeof(double));
365 | double sum_Q = .0;
366 | for(unsigned int n = 0; n < N; n++) sum_Q += tree->computeNonEdgeForces(n, theta, buff);
367 |
368 | // Loop over all edges to compute t-SNE error
// NOTE(review): ind1/ind2 are signed int computed from n*D / col_P[i]*D — presumably safe for
// supported N, but could overflow for very large inputs; verify against callers' limits.
369 | int ind1, ind2;
370 | double C = .0, Q;
371 | for(unsigned int n = 0; n < N; n++) {
372 | ind1 = n * D;
373 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
374 | Q = .0;
375 | ind2 = col_P[i] * D;
376 | for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d];
377 | for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d];
378 | for(int d = 0; d < D; d++) Q += buff[d] * buff[d];
379 | Q = (1.0 / (1.0 + Q)) / sum_Q;
380 | C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
381 | }
382 | }
383 |
384 | // Clean up memory
385 | free(buff);
386 | delete tree;
387 | return C;
388 | }
389 |
390 | // Evaluate t-SNE cost function (exactly)
// Per-point variant of the exact error: costs[n] receives point n's contribution to KL(P||Q),
// so the sum over costs equals the scalar returned by the exact evaluateError.
// NOTE(review): Q-matrix construction is duplicated from evaluateError — keep the two in sync.
391 | template
392 | void TSNE::getCost(double* P, double* Y, unsigned int N, int D, double* costs) {
393 |
394 | // Compute the squared Euclidean distance matrix
395 | double* DD = (double*) malloc((unsigned long)N * N * sizeof(double));
396 | double* Q = (double*) malloc((unsigned long)N * N * sizeof(double));
397 | if(DD == NULL || Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
398 | computeSquaredEuclideanDistance(Y, N, D, DD);
399 |
400 | // Compute Q-matrix and normalization sum
// DBL_MIN seeding keeps the normalization and the log arguments strictly positive.
401 | double sum_Q = DBL_MIN;
402 | for(unsigned long n = 0; n < N; n++) {
403 | for(unsigned long m = 0; m < N; m++) {
404 | if(n != m) {
405 | Q[n * N + m] = 1 / (1 + DD[n * N + m]);
406 | sum_Q += Q[n * N + m];
407 | }
408 | else Q[n * N + m] = DBL_MIN;
409 | }
410 | }
411 | for(unsigned long i = 0; i < (unsigned long)N * N; i++) Q[i] /= sum_Q;
412 |
413 | // Sum t-SNE error
414 | for(unsigned long n = 0; n < N; n++) {
415 | costs[n] = 0.0;
416 | for(unsigned long m = 0; m < N; m++) {
417 | costs[n] += P[n * N + m] * log((P[n * N + m] + 1e-9) / (Q[n * N + m] + 1e-9));
418 | }
419 | }
420 |
421 | // Clean up memory
422 | free(DD);
423 | free(Q);
424 | }
425 |
426 | // Evaluate t-SNE cost function (approximately)
// Per-point Barnes-Hut error: costs[n] gets point n's KL contribution over its sparse
// neighbor edges (CSR: row_P/col_P/val_P); sum_Q is estimated with an SPTree at accuracy theta.
// Mirrors the scalar Barnes-Hut evaluateError — keep both loops consistent.
427 | template
428 | void TSNE::getCost(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, unsigned int N, int D, double theta, double* costs)
429 | {
430 |
431 | // Get estimate of normalization term
// NOTE(review): calloc result `buff` is not NULL-checked here, unlike sibling allocations.
432 | SPTree* tree = new SPTree(Y, N);
433 | double* buff = (double*) calloc(D, sizeof(double));
434 | double sum_Q = .0;
435 | for(unsigned int n = 0; n < N; n++) sum_Q += tree->computeNonEdgeForces(n, theta, buff);
436 |
437 | // Loop over all edges to compute t-SNE error
438 | int ind1, ind2;
439 | double Q;
440 | for(unsigned int n = 0; n < N; n++) {
441 | ind1 = n * D;
442 | costs[n] = 0.0;
443 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
444 | Q = .0;
445 | ind2 = col_P[i] * D;
446 | for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d];
447 | for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d];
448 | for(int d = 0; d < D; d++) Q += buff[d] * buff[d];
449 | Q = (1.0 / (1.0 + Q)) / sum_Q;
450 | costs[n] += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
451 | }
452 | }
453 |
454 | // Clean up memory
455 | free(buff);
456 | delete tree;
457 | }
458 |
459 |
460 | // Compute input similarities with a fixed perplexity
// Dense O(N^2) conditional Gaussian similarities: fills member P (row-major N*N) via a
// per-row binary search on the bandwidth beta so each row's entropy matches log(perplexity).
// If distance_precomputed, X is itself treated as the N*N distance matrix.
// NOTE(review): DD is malloc'd unconditionally, then the pointer is overwritten by `DD = X;`
// in the precomputed branch — the malloc'd buffer leaks in that case (the conditional free
// at the end only runs when !distance_precomputed and frees X's alias never). Consider
// allocating inside the else-branch instead.
// NOTE(review): file lines 478-514 (the beta search loop head) are missing from this dump —
// the merged line at file line 477 below swallowed them; confirm against upstream.
461 | template
462 | void TSNE::computeGaussianPerplexity(double* X, unsigned int N, int D, bool distance_precomputed) {
463 | size_t N2=N;
464 | N2*=N;
465 | P.resize(N2);
466 |
467 | // Compute the squared Euclidean distance matrix
468 | double* DD = (double*) malloc(N2 * sizeof(double));
469 | if(DD == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
470 |
471 | if (distance_precomputed) {
472 | DD = X;
473 | } else {
474 | computeSquaredEuclideanDistanceDirect(X, N, D, DD);
475 |
476 | // Needed to make sure the results are exactly equal to distance calculation in R
477 | for (size_t n=0; n 0) {
// (Bisection step: entropy too high -> raise beta; doubling until an upper bound exists.)
515 | min_beta = beta;
516 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
517 | beta *= 2.0;
518 | else
519 | beta = (beta + max_beta) / 2.0;
520 | }
521 | else {
522 | max_beta = beta;
523 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
524 | beta /= 2.0;
525 | else
526 | beta = (beta + min_beta) / 2.0;
527 | }
528 | }
529 |
530 | // Update iteration counter
531 | iter++;
532 | }
533 |
534 | // Row normalize P
535 | for(unsigned long m = 0; m < N; m++) P[n * N + m] /= sum_P;
536 | }
537 |
538 | // Clean up memory
539 | if (!distance_precomputed) { free(DD); }
540 | DD = NULL;
541 | }
542 |
543 |
544 | // Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free)
// Sparse similarities via VP-tree K-nearest-neighbor search: fills the member CSR arrays
// (row_P/col_P/val_P, sized by setupApproximateMemory) with per-row perplexity-calibrated
// probabilities. The KNN loop runs under OpenMP with num_threads workers.
// NOTE(review): template argument lists (`template`, `VpTree`, `vector`) are stripped by text
// extraction, and the function's tail (file lines 580-596) is truncated at the last line below —
// confirm both against upstream Rtsne src/tsne.cpp.
545 | template
546 | template
547 | void TSNE::computeGaussianPerplexity(double* X, unsigned int N, int D, int K) {
548 |
549 | if(perplexity > K) Rprintf("Perplexity should be lower than K!\n");
550 |
551 | // Allocate the memory we need
552 | setupApproximateMemory(N, K);
553 |
554 | // Build ball tree on data set
555 | VpTree* tree = new VpTree();
556 | vector obj_X(N, DataPoint(D, -1, X));
557 | for(unsigned int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D);
558 | tree->create(obj_X);
559 |
560 | // Loop over all points to find nearest neighbors
561 | if (verbose) Rprintf("Building tree...\n");
562 |
563 | int steps_completed = 0;
564 | #pragma omp parallel for schedule(guided) num_threads(num_threads)
565 | for(unsigned int n = 0; n < N; n++) {
566 |
567 | vector indices;
568 | vector distances;
569 | indices.reserve(K+1);
570 | distances.reserve(K+1);
571 |
572 | // Find nearest neighbors
// search returns K+1 hits because the query point matches itself at distance 0.
573 | tree->search(obj_X[n], K + 1, &indices, &distances);
574 |
575 | double * cur_P = val_P.data() + row_P[n];
576 | computeProbabilities(perplexity, K, distances.data()+1, cur_P); // +1 to avoid self.
577 |
578 | unsigned int * cur_col_P = col_P.data() + row_P[n];
579 | for (int m=0; m
// Sparse similarities from caller-supplied nearest neighbors: nn_idx/nn_dist hold each
// point's K precomputed neighbor indices and distances (row n starts at row_P[n] after
// setupApproximateMemory). Calibrates each row in parallel via computeProbabilities.
// NOTE(review): the preceding template header line and this function's tail (file lines
// 615-626) are missing from this dump — confirm against upstream Rtsne src/tsne.cpp.
598 | void TSNE::computeGaussianPerplexity(const int* nn_idx, const double* nn_dist, unsigned int N, int K) {
599 |
600 | if(perplexity > K) Rprintf("Perplexity should be lower than K!\n");
601 |
602 | // Allocate the memory we need
603 | setupApproximateMemory(N, K);
604 |
605 | // Loop over all points to find nearest neighbors
606 | int steps_completed = 0;
607 | #pragma omp parallel for schedule(guided) num_threads(num_threads)
608 | for(unsigned int n = 0; n < N; n++) {
609 | double * cur_P = val_P.data() + row_P[n];
610 | computeProbabilities(perplexity, K, nn_dist + row_P[n], cur_P);
611 |
612 | const int * cur_idx = nn_idx + row_P[n];
613 | unsigned int * cur_col_P = col_P.data() + row_P[n];
614 | for (int m=0; m
// Size the member CSR arrays for a fixed-K sparse similarity matrix: row_P gets N+1
// offsets at stride K (row n spans [n*K, (n+1)*K)), col_P/val_P get N*K slots.
// NOTE(review): the template header line preceding this definition is missing from the dump.
628 | void TSNE::setupApproximateMemory(unsigned int N, int K) {
629 | row_P.resize(N+1);
630 | col_P.resize(N*K);
631 | val_P.resize(N*K);
632 | row_P[0] = 0;
633 | for(unsigned int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + K;
634 | return;
635 | }
636 |
637 | template
// Calibrate one row of conditional Gaussian similarities: bisection on the precision beta
// (at most 200 iterations, tolerance 1e-5 on entropy) until the row's Shannon entropy H
// equals log(perplexity), then row-normalize the K probabilities in cur_P.
// distances: K neighbor distances (squared inside via d*d); cur_P: output buffer of size K.
// NOTE(review): `template` above lost its parameter list to text extraction.
638 | void TSNE::computeProbabilities (const double perplexity, const int K, const double* distances, double* cur_P) {
639 |
640 | // Initialize some variables for binary search
641 | bool found = false;
642 | double beta = 1.0;
643 | double min_beta = -DBL_MAX;
644 | double max_beta = DBL_MAX;
645 | double tol = 1e-5;
646 |
647 | // Iterate until we found a good perplexity
648 | int iter = 0; double sum_P;
649 | while(!found && iter < 200) {
650 |
651 | // Compute Gaussian kernel row
652 | for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m] * distances[m]);
653 |
654 | // Compute entropy of current row
// sum_P seeded with DBL_MIN so the division/log below never see zero.
655 | sum_P = DBL_MIN;
656 | for(int m = 0; m < K; m++) sum_P += cur_P[m];
657 | double H = .0;
658 | for(int m = 0; m < K; m++) H += beta * (distances[m] * distances[m] * cur_P[m]);
659 | H = (H / sum_P) + log(sum_P);
660 |
661 | // Evaluate whether the entropy is within the tolerance level
662 | double Hdiff = H - log(perplexity);
663 | if(Hdiff < tol && -Hdiff < tol) {
664 | found = true;
665 | }
666 | else {
// Entropy too high -> increase beta (narrower kernel); double until a bound exists,
// then bisect. Symmetric logic for entropy too low.
667 | if(Hdiff > 0) {
668 | min_beta = beta;
669 | if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
670 | beta *= 2.0;
671 | else
672 | beta = (beta + max_beta) / 2.0;
673 | }
674 | else {
675 | max_beta = beta;
676 | if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
677 | beta /= 2.0;
678 | else
679 | beta = (beta + min_beta) / 2.0;
680 | }
681 | }
682 |
683 | // Update iteration counter
684 | iter++;
685 | }
686 |
687 | // Row-normalize current row of P.
688 | for(int m = 0; m < K; m++) {
689 | cur_P[m] /= sum_P;
690 | }
691 | return;
692 | }
693 |
694 | template
// Symmetrize the member CSR similarity matrix in place: P <- (P + P^T) / 2, preserving
// sparsity. Two passes: first count each symmetric row's length, then fill the new arrays,
// merging (n, m) with (m, n) when both exist and adding the mirror entry when only one does.
// The presence check is a linear scan of the partner row, so cost is O(nnz * K) overall.
// NOTE(review): template argument lists on `template`/`std::vector` are stripped by text
// extraction — confirm against upstream Rtsne src/tsne.cpp.
695 | void TSNE::symmetrizeMatrix(unsigned int N) {
696 | // Count number of elements and row counts of symmetric matrix
697 | int* row_counts = (int*) calloc(N, sizeof(int));
698 | if(row_counts == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
699 | for(unsigned int n = 0; n < N; n++) {
700 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
701 |
702 | // Check whether element (col_P[i], n) is present
703 | bool present = false;
704 | for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
705 | if(col_P[m] == n) present = true;
706 | }
707 | if(present) row_counts[n]++;
708 | else {
709 | row_counts[n]++;
710 | row_counts[col_P[i]]++;
711 | }
712 | }
713 | }
714 | int no_elem = 0;
715 | for(unsigned int n = 0; n < N; n++) no_elem += row_counts[n];
716 |
717 | // Allocate memory for symmetrized matrix
718 | std::vector sym_row_P(N+1), sym_col_P(no_elem);
719 | std::vector sym_val_P(no_elem);
720 |
721 | // Construct new row indices for symmetric matrix
722 | sym_row_P[0] = 0;
723 | for(unsigned int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + row_counts[n];
724 |
725 | // Fill the result matrix
// offset[n] tracks how many slots of symmetric row n are filled so far.
726 | int* offset = (int*) calloc(N, sizeof(int));
727 | if(offset == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
728 | for(unsigned int n = 0; n < N; n++) {
729 | for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { // considering element(n, col_P[i])
730 |
731 | // Check whether element (col_P[i], n) is present
732 | bool present = false;
733 | for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
734 | if(col_P[m] == n) {
735 | present = true;
736 | if(n <= col_P[i]) { // make sure we do not add elements twice
737 | sym_col_P[sym_row_P[n] + offset[n]] = col_P[i];
738 | sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
739 | sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m];
740 | sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i] + val_P[m];
741 | }
742 | }
743 | }
744 |
745 | // If (col_P[i], n) is not present, there is no addition involved
746 | if(!present) {
747 | sym_col_P[sym_row_P[n] + offset[n]] = col_P[i];
748 | sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
749 | sym_val_P[sym_row_P[n] + offset[n]] = val_P[i];
750 | sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i];
751 | }
752 |
753 | // Update offsets
754 | if(!present || (present && n <= col_P[i])) {
755 | offset[n]++;
756 | if(col_P[i] != n) offset[col_P[i]]++;
757 | }
758 | }
759 | }
760 |
761 | // Divide the result by two
762 | for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0;
763 |
764 | // Return symmetrized matrices
// swap() hands the symmetric arrays to the members and lets the old ones die with the locals.
765 | row_P.swap(sym_row_P);
766 | col_P.swap(sym_col_P);
767 | val_P.swap(sym_val_P);
768 |
769 | // Free up some memery
770 | free(offset); offset = NULL;
771 | free(row_counts); row_counts = NULL;
772 | }
773 |
774 | // Compute squared Euclidean distance matrix (using BLAS)
// DD[n*N+m] = |x_n|^2 + |x_m|^2 - 2 x_n.x_m via the expansion trick: fill the norm sums
// first, then a single dgemm adds -2 * X^T X (X stored column-per-point, D x N for BLAS).
// N is copied into a signed int because the Fortran BLAS interface takes int*.
// NOTE(review): FCONE arguments are presumably R's hidden-Fortran-character-length macros
// from R_ext/BLAS.h — confirm against the file's includes (not visible in this chunk).
775 | template
776 | void TSNE::computeSquaredEuclideanDistance(double* X, unsigned int N, int D, double* DD) {
777 | double* dataSums = (double*) calloc(N, sizeof(double));
778 | if(dataSums == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
779 | for(unsigned int n = 0; n < N; n++) {
780 | for(int d = 0; d < D; d++) {
781 | dataSums[n] += (X[n * D + d] * X[n * D + d]);
782 | }
783 | }
784 | for(unsigned long n = 0; n < N; n++) {
785 | for(unsigned long m = 0; m < N; m++) {
786 | DD[n * N + m] = dataSums[n] + dataSums[m];
787 | }
788 | }
789 | double a1 = -2.0;
790 | double a2 = 1.0;
791 | int Nsigned = N;
792 | dgemm_("T", "N", &Nsigned, &Nsigned, &D, &a1, X, &D, X, &D, &a2, DD, &Nsigned FCONE FCONE);
793 | free(dataSums); dataSums = NULL;
794 | }
795 |
796 | // Compute squared Euclidean distance matrix (No BLAS)
// Straightforward pairwise computation exploiting symmetry: only the upper triangle
// (m > n) is computed; the diagonal is zeroed and each value is mirrored to DD[m*N+n]
// through curr_elem_sym. X is row-major N x D; DD is N x N.
797 | template
798 | void TSNE::computeSquaredEuclideanDistanceDirect(double* X, unsigned int N, int D, double* DD) {
799 | const double* XnD = X;
800 | for(unsigned int n = 0; n < N; ++n, XnD += D) {
801 | const double* XmD = XnD + D;
802 | double* curr_elem = &DD[n*N + n];
803 | *curr_elem = 0.0;
// curr_elem walks right along row n; curr_elem_sym walks down column n (stride N).
804 | double* curr_elem_sym = curr_elem + N;
805 | for(unsigned int m = n + 1; m < N; ++m, XmD+=D, curr_elem_sym+=N) {
806 | *(++curr_elem) = 0.0;
807 | for(int d = 0; d < D; ++d) {
808 | *curr_elem += (XnD[d] - XmD[d]) * (XnD[d] - XmD[d]);
809 | }
810 | *curr_elem_sym = *curr_elem;
811 | }
812 | }
813 | }
814 |
815 |
816 | // Makes data zero-mean
817 | template
818 | void TSNE::zeroMean(double* X, unsigned int N, int D) {
819 |
820 | // Compute data mean
821 | double* mean = (double*) calloc(D, sizeof(double));
822 | if(mean == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
823 | int nD = 0;
824 | for(unsigned int n = 0; n < N; n++) {
825 | for(int d = 0; d < D; d++) {
826 | mean[d] += X[nD + d];
827 | }
828 | nD += D;
829 | }
830 | for(int d = 0; d < D; d++) {
831 | mean[d] /= (double) N;
832 | }
833 |
834 | // Subtract data mean
835 | nD = 0;
836 | for(unsigned int n = 0; n < N; n++) {
837 | for(int d = 0; d < D; d++) {
838 | X[nD + d] -= mean[d];
839 | }
840 | nD += D;
841 | }
842 | free(mean); mean = NULL;
843 | }
844 |
845 | // Generates a Gaussian random number
// Standard-normal draw via the polar (Marsaglia) Box-Muller method, using R's uniform RNG
// (R::runif under an RNGScope) so results follow R's seed/set.seed state.
// The method produces a pair (x, y) of independent normals; only x is returned and y is
// discarded, so each call costs a full rejection loop.
846 | template
847 | double TSNE::randn() {
848 | Rcpp::RNGScope scope;
849 | double x, y, radius;
850 | do {
// Sample uniformly in [-1,1]^2 and reject points outside the unit disc (or at the origin,
// which would make log(radius) undefined below).
851 | x = 2 * (double)R::runif(0,1) - 1;
852 | y = 2 * (double)R::runif(0,1) - 1;
853 | radius = (x * x) + (y * y);
854 | } while((radius >= 1.0) || (radius == 0.0));
855 | radius = sqrt(-2 * log(radius) / radius);
856 | x *= radius;
857 | y *= radius;
858 | return x;
859 | }
860 |
861 | // declare templates explicitly
// Explicit instantiation of the TSNE class template for the supported output
// dimensionalities (1D, 2D, 3D embeddings), so its members are compiled into this TU.
862 | template class TSNE<1>;
863 | template class TSNE<2>;
864 | template class TSNE<3>;
865 |
--------------------------------------------------------------------------------