├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CONDUCT.md ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── classes.R ├── umap.R └── umap_app.R ├── README-NOT.md ├── README.Rmd ├── README.md ├── appveyor.yml ├── img ├── multiple_algorithms_bean.png ├── multiple_algorithms_cancer.png ├── multiple_algorithms_iris.png ├── multiple_algorithms_memory.png ├── multiple_algorithms_rna.png ├── multiple_algorithms_time.png ├── shiny.png ├── unnamed-chunk-3-1.png ├── unnamed-chunk-5-1.png ├── unnamed-chunk-6-1.png └── unnamed-chunk-7-1.png ├── inst └── ropensci_blog │ ├── img │ ├── multiple_algorithms_cancer.png │ ├── multiple_algorithms_memory.png │ ├── multiple_algorithms_time.png │ └── shiny.png │ └── working-on-the-umapr-package.md ├── man ├── make_umap_object.Rd ├── run_umap_shiny.Rd └── umap.Rd ├── tests ├── shinyTest │ └── test_umap_shiny.R ├── testthat.R ├── testthat │ └── test-umapr.R └── umap_output.txt ├── timings.R └── travis_setup.sh /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^CONDUCT\.md$ 4 | ^\.travis\.yml$ 5 | ^README\.Rmd$ 6 | ^README-.*\.png$ 7 | ^timings\.R$ 8 | ^img$ 9 | ^LICENSE\.md$ 10 | ^appveyor\.yml$ 11 | ^travis_setup.sh$ 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | umapr.Rproj 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | .Ruserdata 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: true 5 | cache: packages 6 | 7 | os: 8 | - linux 9 | #- os: osx 10 | #osx_image: xcode10.1 11 | #disable_homebrew: true 12 | #r_build_args: '--no-build-vignettes' 13 | #r_check_args: '--ignore-vignettes --no-examples' 14 | 15 | addons: 16 | apt: 17 | #sources: 18 | # - ubuntu-toolchain-r-test 19 | packages: 20 | - libpython-dev 21 | - libpython3-dev 22 | - texlive-full 23 | - cmake 24 | - python3 25 | #homebrew: 26 | #packages: 27 | #- libpython-dev 28 | #- libpython3-dev 29 | #- texlive-full 30 | #- cmake 31 | #- python3 32 | #update: true 33 | 34 | r: 35 | - release 36 | - devel 37 | 38 | r-packages: 39 | - reticulate 40 | - knitr 41 | - testthat 42 | - RColorBrewer 43 | 44 | env: 45 | global: 46 | - R_CHECK_ARGS="--no-build-vignettes --no-manual --ignore-vignettes --no-examples" 47 | - R_BUILD_ARGS="--no-build-vignettes" 48 | 49 | python: 50 | - "3.5" 51 | - "3.6" 52 | - "3.7" 53 | 54 | before_install: 55 | - chmod +x travis_setup.sh 56 | - ./travis_setup.sh 57 | - pip install --user conda 58 | - pip install --user umap-learn 59 | 60 | warnings_are_errors: false 61 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http:contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: umapr 2 | Title: Wraps UMAP Algorithm for Dimension Reduction 3 | Version: 0.0.0.9001 4 | Authors@R: c( 5 | person("Sean", "Hughes", role = c("aut", "cre"), email = "smhughes@uw.edu"), 6 | person("Ted", "Laderas", role = "aut", email="tedladeras@gmail.com"), 7 | person("Malisa", "Smith", role = "aut"), 8 | person("Ju Yeong", "Kim", role = "aut"), 9 | person("Angela", "Li", role = "aut") 10 | ) 11 | Description: Wraps the Python implementation of the UMAP dimension reductionality algorithm to use in `R`. Uniform Manifold Approximation and Projection (UMAP) is a non-linear dimensionality reduction algorithm that is computationally more efficient than t-SNE (McInnes and Healy, 2018) . This package allows the user to run UMAP from R, producing a data frame that can be plotted on a 2-D graph. 12 | Depends: R (>= 3.2.3) 13 | License: MIT + file LICENSE 14 | URL: https://github.com/ropenscilabs/umapr 15 | BugReports: https://github.com/ropenscilabs/umapr/issues 16 | Encoding: UTF-8 17 | LazyData: true 18 | Imports: 19 | reticulate, 20 | shiny, 21 | ggplot2, 22 | assertthat 23 | Suggests: 24 | testthat, 25 | tidyverse, 26 | knitr, 27 | rmarkdown 28 | VignetteBuilder: knitr 29 | RoxygenNote: 6.1.1 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Sean Hughes;Ted Laderas;Malisa Smith;Ju Yeong Kim;Angela Li 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2018 Sean Hughes;Ted Laderas;Malisa Smith;Ju Yeong Kim;Angela Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(make_umap_object) 4 | export(run_umap_shiny) 5 | export(umap) 6 | importFrom(assertthat,assert_that) 7 | importFrom(assertthat,is.count) 8 | importFrom(assertthat,is.flag) 9 | importFrom(reticulate,dict) 10 | importFrom(reticulate,import) 11 | importFrom(reticulate,py_available) 12 | importFrom(reticulate,py_install) 13 | importFrom(reticulate,py_module_available) 14 | importFrom(reticulate,r_to_py) 15 | importFrom(reticulate,use_condaenv) 16 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # umapr 0.0.0.9000 2 | 3 | * Added a `NEWS.md` file to track changes to the package. 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /R/classes.R: -------------------------------------------------------------------------------- 1 | library(R6) 2 | library(ggplot2) 3 | 4 | umap_obj <- R6Class("umap_obj", 5 | public = list( 6 | #inherit=base::data.frame, 7 | markers=NULL, 8 | umap_table=NULL, 9 | 10 | plot = function(marker){ 11 | markers <- self$markers 12 | if(!marker %in% markers){stop("marker not in list of markers")} 13 | ggplot2::ggplot(self$umap_table, ggplot2::aes_string(x = "UMAP1", y = "UMAP2", color=marker)) + 14 | ggplot2::geom_point() 15 | }, 16 | 17 | initialize = function(umap_table, annotation=NULL){ 18 | 19 | self$umap_table <- umap_table 20 | if(!is.null(annotation)){ 21 | self$annotation = annotation 22 | } 23 | markers <- colnames(umap_table)[!colnames(umap_table) %in% c("UMAP1","UMAP2")] 24 | 25 | self$markers <- markers 26 | invisible(self) 27 | }, 28 | 29 | explore = function(markers=NULL){ 30 | runUmapShiny(self) 31 | }, 32 | 33 | set_markers = function(markers=NULL){ 34 | self$markers 35 | invisible(self) 36 | }, 37 | 38 | returnData = function(){ 39 | return(self$umap_table) 40 | } 41 | )) 42 | 43 | #' Title 44 | #' 45 | #' @param umap_result - output of running 46 | #' @param annotation - optional annotation file 47 | #' 48 | #' @return - a umap object that includes plotting 49 | #' @export 50 | #' 51 | #' @examples 52 | #' \dontrun{ 53 | #' library(flowCore) 54 | #' 55 | #' umap_table <- umap() 56 | #' } 57 | make_umap_object <- function(umap_result, annotation=NULL){ 58 | umapobj <-umap_obj$new(umap_table=umap_result, annotation=annotation) 59 | return(umapobj) 60 | } 61 | -------------------------------------------------------------------------------- /R/umap.R: -------------------------------------------------------------------------------- 1 | #' umap 2 | #' 3 | #' @description Provides an interface to the UMAP algorithm implemented in Python. 4 | #' 5 | #' @references Leland McInnes and John Healy (2018). UMAP: Uniform Manifold 6 | #' Approximation and Projection for Dimension Reduction. 7 | #' ArXiv e-prints 1802.03426. 8 | #' 9 | #' @param data data frame or matrix. input data. 10 | #' @param include_input logical. Attach input data to UMAP embeddings if desired. 11 | #' @param n_neighbors integer. The size of local neighborhood 12 | #' (in terms of number of neighboring sample points) used for manifold 13 | #' approximation. Larger values result in more global views of the manifold, 14 | #' while smaller values result in more local data being preserved. In general 15 | #' values should be in the range 2 to 100. 16 | #' @param n_components integer The dimension of the space to embed into. This 17 | #' defaults to 2 to provide easy visualization, but can reasonably be set to 18 | #' any integer value in the range 2 to 100. 19 | #' @param metric character. The metric to use to compute distances in high 20 | #' dimensional space. If a string is passed it must match a valid predefined 21 | #' metric. If a general metric is required a function that takes two 1d arrays 22 | #' and returns a float can be provided. For performance purposes it is required 23 | #' that this be a numba jit'd function. Valid string metrics include: euclidean, 24 | #' manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis, 25 | #' wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard, 26 | #' dice, russelrao, kulsinski, rogerstanimoto, sokalmichener, sokalsneath, yule. 27 | #' Metrics that take arguments (such as minkowski, mahalanobis etc.) can have 28 | #' arguments passed via the metric_kwds dictionary. At this time care must be 29 | #' taken and dictionary elements must be ordered appropriately; this will 30 | #' hopefully be fixed in the future. 31 | #' @param n_epochs integer The number of training epochs to use in optimization. 32 | #' @param learning_rate numeric. The initial learning rate for the embedding optimization. 33 | #' @param alpha numeric. The initial learning rate for the embedding optimization. 34 | #' @param init character. How to initialize the low dimensional embedding. 35 | #' Options are: 'spectral' (use a spectral embedding of the fuzzy 1-skeleton), 36 | #' 'random' (assign initial embedding positions at random), 37 | #' * A numpy array of initial embedding positions. 38 | #' @param spread numeric. The effective scale of embedded points. 39 | #' In combination with ``min_dist`` this determines how clustered/clumped the 40 | #' embedded points are. 41 | #' @param min_dist numeric. The effective minimum distance between embedded 42 | #' points. Smaller values will result in a more clustered/clumped embedding 43 | #' where nearby points on the manifold are drawn closer together, while larger 44 | #' values will result on a more even dispersal of points. The value should be 45 | #' set relative to the ``spread`` value, which determines the scale at which 46 | #' embedded points will be spread out. 47 | #' @param set_op_mix_ratio numeric. Interpolate between (fuzzy) union and 48 | #' intersection as the set operation used to combine local fuzzy simplicial 49 | #' sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use 50 | #' the product t-norm. The value of this parameter should be between 0.0 and 51 | #' 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure 52 | #' fuzzy intersection. 53 | #' @param local_connectivity integer The local connectivity required -- i.e. 54 | #' the number of nearest neighbors that should be assumed to be connected at a 55 | #' local level. The higher this value the more connected the manifold becomes 56 | #' locally. In practice, this should be not more than the local intrinsic 57 | #' dimension of the manifold. 58 | #' @param repulsion_strength numeric. Weighting applied to negative samples in 59 | #' low dimensional embedding optimization. Values higher than one will result in 60 | #' greater weight being given to negative samples. 61 | #' @param bandwidth numeric. The effective bandwidth of the kernel if we view 62 | #' the algorithm as similar to Laplacian eigenmaps. Larger values induce more 63 | #' connectivity and a more global view of the data, smaller values concentrate 64 | #' more locally. 65 | #' @param gamma numeric. Weighting applied to negative samples in low 66 | #' dimensional embedding optimization. Values higher than one will result in 67 | #' greater weight being given to negative samples. 68 | #' @param negative_sample_rate numeric. The number of negative edge/1-simplex 69 | #' samples to use per positive edge/1-simplex sample in optimizing the low 70 | #' dimensional embedding. 71 | #' @param transform_queue_size numeric. For transform operations (embedding new points 72 | #' using a trained model_ this will control how aggressively to search for 73 | #' nearest neighbors. Larger values will result in slower performance but 74 | #' more accurate nearest neighbor evaluation. 75 | #' @param a numeric. More specific parameters controlling the embedding. 76 | #' If NULL, these values are set automatically as determined by ``min_dist`` 77 | #' and ``spread``. 78 | #' @param b numeric. More specific parameters controlling the embedding. 79 | #' If NULL, these values are set automatically as determined by ``min_dist`` 80 | #' and ``spread``. 81 | #' @param random_state integer. If integer, random_state is the seed used by the 82 | #' random number generator; If NULL, the random number generator is the 83 | #' RandomState instance used by `np.random`. 84 | #' @param metric_kwds reticulate dictionary. Arguments to pass on to the metric, 85 | #' such as the ``p`` value for Minkowski distance. 86 | #' @param angular_rp_forest logical. Whether to use an angular random projection 87 | #' forest to initialise the approximate nearest neighbor search. This can be 88 | #' faster, but is mostly on useful for metric that use an angular style distance 89 | #' such as cosine, correlation etc. In the case of those metrics angular forests 90 | #' will be chosen automatically. 91 | #' @param target_n_neighbors integer. The number of nearest neighbors to use to 92 | #' construct the target simplcial set. If set to -1 use the n_neighbors value. 93 | #' @param target_metric character or function. The metric used to measure distance 94 | #' for a target array is using supervised dimension reduction. By default this is 95 | #' ‘categorical’ which will measure distance in terms of whether categories match 96 | #' or are different. Furthermore, if semi-supervised is required target values of 97 | #' -1 will be trated as unlabelled under the ‘categorical’ metric. If the target 98 | #' array takes continuous values (e.g. for a regression problem) then metric of 99 | #' ‘l1’ or ‘l2’ is probably more appropriate. 100 | #' @param target_metric_kwds reticulate dictionary. Keyword argument to pass to 101 | #' the target metric when performing supervised dimension reduction. If None then 102 | #' no arguments are passed on. 103 | #' @param target_weight numeric. weighting factor between data topology and target 104 | #' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights 105 | #' entirely on target. The default of 0.5 balances the weighting equally between 106 | #' data and target. 107 | #' @param transform_seed integer. Random seed used for the stochastic aspects of 108 | #' the transform operation. This ensures consistency in transform operations. 109 | #' @param verbose logical. Controls verbosity of logging. 110 | #' 111 | #' @return matrix 112 | #' @export 113 | #' @importFrom assertthat assert_that is.count is.flag 114 | #' @importFrom reticulate dict r_to_py py_module_available py_install import use_condaenv py_available 115 | #' 116 | #' @examples 117 | #' #test only if umap python module 118 | #' if(reticulate::py_module_available("umap")){} 119 | #' 120 | #' #import umap library (and load python module) 121 | #' 122 | #' library("umapr") 123 | #' umap(as.matrix(iris[, 1:4])) 124 | #' umap(iris[, 1:4]) 125 | #' 126 | #' 127 | #' } 128 | umap <- function(data, 129 | include_input = TRUE, 130 | n_neighbors = 15L, 131 | n_components = 2L, 132 | metric = "euclidean", 133 | n_epochs = NULL, 134 | learning_rate = 1.0, 135 | alpha = 1.0, 136 | init = "spectral", 137 | spread = 1.0, 138 | min_dist = 0.1, 139 | set_op_mix_ratio = 1.0, 140 | local_connectivity = 1L, 141 | repulsion_strength = 1.0, 142 | bandwidth = 1.0, 143 | gamma = 1.0, 144 | negative_sample_rate = 5L, 145 | transform_queue_size = 4.0, 146 | a = NULL, 147 | b = NULL, 148 | random_state = NULL, 149 | metric_kwds = dict(), 150 | angular_rp_forest = FALSE, 151 | target_n_neighbors = -1L, 152 | target_metric = "categorical", 153 | target_metric_kwds = dict(), 154 | target_weight = 0.5, 155 | transform_seed = 42L, 156 | verbose = FALSE) { 157 | assert_that(is.matrix(data) | is.data.frame(data), msg = "Data must be a data frame or a matrix.") 158 | if (!all(unlist(lapply(data, is.numeric)))) stop("All columns should be numeric.") 159 | assert_that(is.logical(include_input)) 160 | assert_that(is.count(n_neighbors)) 161 | assert_that(is.count(n_components)) 162 | assert_that(is.character(metric), msg = "Valid string metrics include: euclidean, manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis, wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard, dice, russelrao, kulsinski, rogerstanimoto, sokalmichener, sokalsneath, yule.") 163 | assert_that(is.null(n_epochs) | is.count(n_epochs), msg = "n_epochs is not a count (a single positive integer)") 164 | assert_that(is.numeric(learning_rate)) 165 | assert_that(is.numeric(alpha)) 166 | assert_that(init %in% c("spectral", "random"), msg = "init must be one of 'spectral', 'random', or a numpy array of initial embedding positions") 167 | assert_that(is.numeric(spread)) 168 | assert_that(is.numeric(min_dist)) 169 | assert_that(is.numeric(set_op_mix_ratio)) 170 | assert_that(is.count(local_connectivity)) 171 | assert_that(is.numeric(repulsion_strength)) 172 | assert_that(is.numeric(bandwidth)) 173 | assert_that(is.numeric(gamma)) 174 | assert_that(is.count(negative_sample_rate)) 175 | assert_that(is.numeric(transform_queue_size)) 176 | assert_that(is.null(a) | is.numeric(a)) 177 | assert_that(is.null(b) | is.numeric(b)) 178 | assert_that(is.null(random_state) | is.count(random_state)) 179 | assert_that(is_dict(metric_kwds), msg = "metric_kwds must be a Python dictionary object, you can create it using 'reticulate::dict()'") 180 | assert_that(is.flag(angular_rp_forest)) 181 | assert_that(is.integer(target_n_neighbors)) 182 | assert_that(is.character(target_metric) | is.function(target_metric)) 183 | assert_that(is_dict(target_metric_kwds)) 184 | assert_that(is.numeric(target_weight)) 185 | assert_that(is.integer(transform_seed)) 186 | assert_that(is.flag(verbose)) 187 | 188 | # keyword "alpha" was renamed "initial_alpha" in a later version of the 189 | # python library, try running it both ways in case of failure 190 | 191 | 192 | modules <- py_module_available("umap") 193 | if(!modules){ 194 | install_python_modules <- function(method = "auto", conda = "auto") { 195 | py_install("umap-learn", method = method, conda = conda) 196 | } 197 | tryCatch(install_python_modules(), 198 | error = function(e) { 199 | modules <- FALSE 200 | }, 201 | finally = "umap-learn installed") 202 | modules <- py_module_available("umap") 203 | } else { 204 | print("umap-learn already installed") 205 | } 206 | 207 | umap_module <- import("umap") 208 | 209 | umap_vec <- tryCatch( 210 | umap_module$UMAP( 211 | n_neighbors = as.integer(n_neighbors), 212 | n_components = as.integer(n_components), 213 | metric = metric, 214 | n_epochs = n_epochs, 215 | alpha = alpha, 216 | init = init, 217 | spread = spread, 218 | min_dist = min_dist, 219 | set_op_mix_ratio = set_op_mix_ratio, 220 | local_connectivity = local_connectivity, 221 | bandwidth = bandwidth, 222 | gamma = r_to_py(gamma), 223 | negative_sample_rate = as.integer(negative_sample_rate), 224 | a = a, 225 | b = b, 226 | random_state = random_state, 227 | metric_kwds = metric_kwds, 228 | angular_rp_forest = angular_rp_forest, 229 | verbose = verbose 230 | )$fit_transform(r_to_py(as.matrix(data))), 231 | error = function(e) { 232 | if (grepl("alpha", e$message) || grepl("bandwidth", e$message)) { 233 | umap_module$UMAP( 234 | n_neighbors = r_to_py(as.integer(n_neighbors)), 235 | n_components = r_to_py(as.integer(n_components)), 236 | metric = r_to_py(metric), 237 | n_epochs = r_to_py(n_epochs), 238 | learning_rate = r_to_py(as.numeric(learning_rate)), 239 | init = r_to_py(init), 240 | min_dist = r_to_py(as.numeric(min_dist)), 241 | spread = r_to_py(as.numeric(spread)), 242 | set_op_mix_ratio = r_to_py(as.numeric(set_op_mix_ratio)), 243 | local_connectivity = r_to_py(as.integer(local_connectivity)), 244 | repulsion_strength = r_to_py(as.numeric(repulsion_strength)), 245 | negative_sample_rate = r_to_py(as.integer(negative_sample_rate)), 246 | transform_queue_size = r_to_py(as.numeric(transform_queue_size)), 247 | a = r_to_py(a), 248 | b = r_to_py(b), 249 | random_state = r_to_py(random_state), 250 | metric_kwds = r_to_py(metric_kwds), 251 | angular_rp_forest = r_to_py(angular_rp_forest), 252 | target_n_neighbors = as.integer(target_n_neighbors), 253 | target_metric = r_to_py(target_metric), 254 | target_metric_kwds = r_to_py(target_metric_kwds), 255 | target_weight = r_to_py(target_weight), 256 | transform_seed = r_to_py(as.integer(transform_seed)), 257 | verbose = r_to_py(verbose) 258 | )$fit_transform(r_to_py(as.matrix(data))) 259 | } else { 260 | stop(e) 261 | } 262 | } 263 | ) 264 | colnames(umap_vec) <- paste0("UMAP", seq_len(ncol(umap_vec))) 265 | 266 | # attach input data to UMAP embeddings if desired 267 | if (include_input) { 268 | output <- data.frame(cbind(data, umap_vec)) 269 | } else { 270 | output <- data.frame(umap_vec) 271 | } 272 | 273 | #make_umap_object(output) 274 | output 275 | } 276 | 277 | is_dict <- function(x) { 278 | inherits(x, "python.builtin.dict") 279 | } 280 | 281 | # global reference to umap (will be initialized in .onLoad) 282 | umap_module <<- NULL 283 | 284 | .onLoad <- function(libname, pkgname) { 285 | # use superassignment to update global reference to umap 286 | if(py_available()){ 287 | use_condaenv("r-reticulate") 288 | modules <- py_module_available("umap") 289 | if(!modules){ 290 | install_python_modules <- function(method = "auto", conda = "auto") { 291 | py_install("umap-learn", method = method, conda = conda) 292 | } 293 | tryCatch(install_python_modules(), 294 | error = function(e) { 295 | modules <- FALSE 296 | }) 297 | modules <- py_module_available("umap") 298 | } 299 | if (suppressWarnings(suppressMessages(requireNamespace("reticulate")))) { 300 | 301 | if (modules) { 302 | ## assignment in parent environment! 303 | umap_module <- import("umap", delay_load = TRUE) 304 | } else { 305 | install_python_modules() 306 | } 307 | } 308 | } 309 | } 310 | 311 | .onAttach <- function(libname, pkgname) { 312 | if(py_available()){ 313 | use_condaenv("r-reticulate") 314 | modules <- py_module_available("umap") 315 | if(!modules){ 316 | install_python_modules <- function(method = "auto", conda = "auto") { 317 | py_install("umap-learn", method = method, conda = conda) 318 | } 319 | tryCatch(install_python_modules(), 320 | error = function(e) { 321 | modules <- FALSE 322 | }, 323 | finally = "umap-learn installed") 324 | modules <- py_module_available("umap") 325 | } 326 | } else{ 327 | packageStartupMessage("Warning message: 328 | Python not installed 329 | Please install anaconda or miniconda 330 | https://conda.io/projects/conda/en/latest/user-guide/install/index.html") 331 | } 332 | if(py_available() && py_module_available("umap")){ 333 | umap_module <- import("umap") 334 | packageStartupMessage("umap-learn python module loaded successfully") 335 | } else { 336 | packageStartupMessage("Warning message: 337 | umap-learn module is not installed 338 | Please run one of the following: 339 | conda install -n r-reticulate -c conda-forge umap-learn 340 | conda activate r-reticulate; pip install umap-learn") 341 | } 342 | } 343 | -------------------------------------------------------------------------------- /R/umap_app.R: -------------------------------------------------------------------------------- 1 | #' Open a shiny app to explore the data in a UMAP embedding. 2 | #' 3 | #' @param umap output of a call to `umap` 4 | #' 5 | #' @return Open an interactive shiny app to explore the data. 6 | #' @export 7 | run_umap_shiny <- function(umap){ 8 | 9 | #umapobj <- deparse(substitute(umap_obj)) 10 | 11 | #umap <- umap_obj$umap_table 12 | #markers <- umap_obj 13 | markers <- colnames(umap)[!colnames(umap) %in% c("UMAP1", "UMAP2")] 14 | 15 | #if(is.null(markers)){ markers <- umap_obj$markers} 16 | 17 | # Define UI for application that draws a histogram 18 | ui <- shiny::fluidPage( 19 | 20 | # Application title 21 | shiny::titlePanel("UMAP Explorer"), 22 | 23 | # Sidebar with a slider input for number of bins 24 | shiny::sidebarLayout( 25 | shiny::sidebarPanel( 26 | shiny::selectInput("marker", label = "Select Variable to Color By", choices = markers, selected = markers[1]) 27 | ), 28 | 29 | # Show a plot of the generated distribution 30 | shiny::mainPanel( 31 | shiny::plotOutput("umapPlot") 32 | ) 33 | ) 34 | ) 35 | 36 | # Define server logic required to draw a histogram 37 | server <- function(input, output) { 38 | 39 | output$umapPlot <- shiny::renderPlot({ 40 | 41 | #umap$plot(input$marker) 42 | out_plot <- ggplot2::ggplot(umap, ggplot2::aes_string(x = "UMAP1", y = "UMAP2", color=input$marker)) + 43 | ggplot2::geom_point() 44 | 45 | #umap_obj$plot(input$marker) 46 | 47 | out_plot 48 | }) 49 | } 50 | 51 | # Run the application 52 | shiny::shinyApp(ui = ui, server = server) 53 | 54 | 55 | } 56 | 57 | -------------------------------------------------------------------------------- /README-NOT.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | umapr 4 | ===== 5 | 6 | [![Project Status: Abandoned – Initial development has started, but there has not yet been a stable, usable release; the project has been abandoned and the author(s) do not intend on continuing development.](https://www.repostatus.org/badges/latest/abandoned.svg)](https://www.repostatus.org/#abandoned) 7 | [![Travis-CI Build Status](https://travis-ci.org/ropenscilabs/umapr.svg?branch=master)](https://travis-ci.org/ropenscilabs/umapr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/juyeongkim/umapr?branch=master&svg=true)](https://ci.appveyor.com/project/juyeongkim/umapr) [![codecov](https://codecov.io/gh/ropenscilabs/umapr/branch/master/graph/badge.svg)](https://codecov.io/gh/ropenscilabs/umapr) 8 | 9 | `umapr` wraps the Python implementation of UMAP to make the algorithm accessible from within R. It uses the great [`reticulate`](https://cran.r-project.org/web/packages/reticulate/index.html) package. 10 | 11 | Uniform Manifold Approximation and Projection (UMAP) is a non-linear dimensionality reduction algorithm. It is similar to t-SNE but computationally more efficient. UMAP was created by Leland McInnes and John Healy ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). 12 | 13 | Recently, two new UMAP R packages have appeared. These new packages provide more features than `umapr` does and they are more actively developed. These packages are: 14 | 15 | - [umap](https://github.com/tkonopka/umap), which provides the same Python wrapping function as `umapr` and also an R implementation, removing the need for the Python version to be installed. It is available on [CRAN](https://cran.r-project.org/web/packages/umap/index.html). 16 | 17 | - [uwot](https://github.com/jlmelville/uwot), which also provides an R implementation, removing the need for the Python version to be installed. 18 | 19 | Contributors 20 | ------------ 21 | 22 | [Angela Li](https://github.com/angela-li), [Ju Kim](https://github.com/juyeongkim), [Malisa Smith](https://github.com/malisas), [Sean Hughes](https://github.com/seaaan), [Ted Laderas](https://github.com/laderast) 23 | 24 | `umapr` is a project that was first developed at [rOpenSci Unconf 2018](http://unconf18.ropensci.org). 25 | 26 | Installation 27 | ------------ 28 | 29 | **First**, you will need to install `Python` and the `UMAP` package. Instruction available [here](https://github.com/lmcinnes/umap#installing). 30 | 31 | Then, you can install the development version from [GitHub](https://github.com/) with: 32 | 33 | ``` r 34 | # install.packages("devtools") 35 | devtools::install_github("ropenscilabs/umapr") 36 | ``` 37 | 38 | Basic use 39 | --------- 40 | 41 | Here is an example of running UMAP on the `iris` data set. 42 | 43 | ``` r 44 | library(umapr) 45 | library(tidyverse) 46 | 47 | # select only numeric columns 48 | df <- as.matrix(iris[ , 1:4]) 49 | 50 | # run UMAP algorithm 51 | embedding <- umap(df) 52 | ``` 53 | 54 | `umap` returns a `data.frame` with two attached columns called "UMAP1" and "UMAP2". These columns represent the UMAP embeddings of the data, which are column-bound to the original data frame. 55 | 56 | ``` r 57 | # look at result 58 | head(embedding) 59 | #> Sepal.Length Sepal.Width Petal.Length Petal.Width UMAP1 UMAP2 60 | #> 1 5.1 3.5 1.4 0.2 5.647059 -6.666872 61 | #> 2 4.9 3.0 1.4 0.2 4.890193 -8.130815 62 | #> 3 4.7 3.2 1.3 0.2 4.397037 -7.546669 63 | #> 4 4.6 3.1 1.5 0.2 4.412886 -7.633424 64 | #> 5 5.0 3.6 1.4 0.2 5.707233 -6.863213 65 | #> 6 5.4 3.9 1.7 0.4 6.442851 -5.726554 66 | 67 | # plot the result 68 | embedding %>% 69 | mutate(Species = iris$Species) %>% 70 | ggplot(aes(UMAP1, UMAP2, color = Species)) + geom_point() 71 | ``` 72 | 73 | ![](img/unnamed-chunk-3-1.png) 74 | 75 | There is a function called `run_umap_shiny()` which will bring up a Shiny app for exploring different colors of the variables on the umap plots. 76 | 77 | ``` r 78 | run_umap_shiny(embedding) 79 | ``` 80 | 81 | ![Shiny App for Exploring Results](img/shiny.png) 82 | 83 | Function parameters 84 | ------------------- 85 | 86 | There are a few important parameters. These are fully described in the UMAP Python [documentation](https://github.com/lmcinnes/umap/blob/bf1c3e5c89ea393c9de10bd66c5e3d9bc30588ee/notebooks/UMAP%20usage%20and%20parameters.ipynb). 87 | 88 | The `n_neighbor` argument can range from 2 to n-1 where n is the number of rows in the data. 89 | 90 | ``` r 91 | neighbors <- c(4, 8, 16, 32, 64, 128) 92 | 93 | 94 | 95 | neighbors %>% 96 | map_df(~umap(as.matrix(iris[,1:4]), n_neighbors = .x) %>% 97 | mutate(Species = iris$Species, Neighbor = .x)) %>% 98 | mutate(Neighbor = as.integer(Neighbor)) %>% 99 | ggplot(aes(UMAP1, UMAP2, color = Species)) + 100 | geom_point() + 101 | facet_wrap(~ Neighbor, scales = "free") 102 | ``` 103 | 104 | ![](img/unnamed-chunk-5-1.png) 105 | 106 | The `min_dist` argument can range from 0 to 1. 107 | 108 | ``` r 109 | dists <- c(0.001, 0.01, 0.05, 0.1, 0.5, 0.99) 110 | 111 | dists %>% 112 | map_df(~umap(as.matrix(iris[,1:4]), min_dist = .x) %>% 113 | mutate(Species = iris$Species, Distance = .x)) %>% 114 | ggplot(aes(UMAP1, UMAP2, color = Species)) + 115 | geom_point() + 116 | facet_wrap(~ Distance, scales = "free") 117 | ``` 118 | 119 | ![](img/unnamed-chunk-6-1.png) 120 | 121 | The `distance` argument can be many different distance functions. 122 | 123 | ``` r 124 | dists <- c("euclidean", "manhattan", "canberra", "cosine", "hamming", "dice") 125 | 126 | dists %>% 127 | map_df(~umap(as.matrix(iris[,1:4]), metric = .x) %>% 128 | mutate(Species = iris$Species, Metric = .x)) %>% 129 | ggplot(aes(UMAP1, UMAP2, color = Species)) + 130 | geom_point() + 131 | facet_wrap(~ Metric, scales = "free") 132 | ``` 133 | 134 | ![](img/unnamed-chunk-7-1.png) 135 | 136 | Comparison to t-SNE and principal components analysis 137 | ----------------------------------------------------- 138 | 139 | t-SNE and UMAP are both non-linear dimensionality reduction methods, in contrast to PCA. Because t-SNE is relatively slow, PCA is sometimes run first to reduce the dimensions of the data. 140 | 141 | We compared UMAP to PCA and t-SNE alone, as well as to t-SNE run on data preprocessed with PCA. In each case, the data were subset to include only complete observations. The code to reproduce these findings are available in [`timings.R`](timings.R). 142 | 143 | The first data set is the same iris data set used above (149 observations of 4 variables): 144 | 145 | ![t-SNE, PCA, and UMAP on iris](img/multiple_algorithms_iris.png) 146 | 147 | Next we tried a cancer data set, made up of 699 observations of 10 variables: 148 | 149 | ![t-SNE, PCA, and UMAP on cancer](img/multiple_algorithms_cancer.png) 150 | 151 | Third we tried a soybean data set. It is made up of 531 observations and 35 variables: 152 | 153 | ![t-SNE, PCA, and UMAP on soybeans](img/multiple_algorithms_bean.png) 154 | 155 | Finally we used a large single-cell RNAsequencing data set, with 561 observations (cells) of 55186 variables (over 30 million elements)! 156 | 157 | ![t-SNE, PCA, and UMAP on rna](img/multiple_algorithms_rna.png) 158 | 159 | PCA is orders of magnitude faster than t-SNE or UMAP (not shown). UMAP, though, is a substantial improvement over t-SNE both in terms of memory and time taken to run. 160 | 161 | ![Time to run t-SNE vs UMAP](img/multiple_algorithms_time.png) 162 | 163 | ![Memory to run t-SNE vs UMAP](img/multiple_algorithms_memory.png) 164 | 165 | Related projects 166 | ---------------- 167 | 168 | - [`umap`](https://github.com/tkonopka/umap): R implementation of UMAP 169 | - [`seurat`](https://github.com/satijalab/seurat): R toolkit for single cell genomics 170 | - [`smallvis`](https://github.com/jlmelville/smallvis): R package for dimensionality reduction of small datasets 171 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "img/" 12 | ) 13 | library(bindrcpp) 14 | ``` 15 | 16 | # umapr 17 | 18 | [![Travis-CI Build Status](https://travis-ci.org/ropenscilabs/umapr.svg?branch=master)](https://travis-ci.org/ropenscilabs/umapr) 19 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/juyeongkim/umapr?branch=master&svg=true)](https://ci.appveyor.com/project/juyeongkim/umapr) 20 | [![codecov](https://codecov.io/gh/ropenscilabs/umapr/branch/master/graph/badge.svg)](https://codecov.io/gh/ropenscilabs/umapr) 21 | 22 | `umapr` wraps the Python implementation of UMAP to make the algorithm accessible from within R. It uses the great [`reticulate`](https://cran.r-project.org/web/packages/reticulate/index.html) package. 23 | 24 | Uniform Manifold Approximation and Projection (UMAP) is a non-linear dimensionality reduction algorithm. It is similar to t-SNE but computationally more efficient. UMAP was created by Leland McInnes and John Healy ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). 25 | 26 | Recently, two new UMAP R packages have appeared. These new packages provide more features than `umapr` does and they are more actively developed. These packages are: 27 | 28 | * [umap](https://github.com/tkonopka/umap), which provides the same Python wrapping function as `umapr` and also an R implementation, removing the need for the Python version to be installed. It is available on [CRAN](https://cran.r-project.org/web/packages/umap/index.html). 29 | 30 | * [uwot](https://github.com/jlmelville/uwot), which also provides an R implementation, removing the need for the Python version to be installed. 31 | 32 | ## Contributors 33 | 34 | [Angela Li](https://github.com/angela-li), [Ju Kim](https://github.com/juyeongkim), [Malisa Smith](https://github.com/malisas), [Sean Hughes](https://github.com/seaaan), [Ted Laderas](https://github.com/laderast) 35 | 36 | `umapr` is a project that was first developed at [rOpenSci Unconf 2018](http://unconf18.ropensci.org). 37 | 38 | ## Installation 39 | 40 | **First**, you will need to install `Python` and the `UMAP` package. Instruction available [here](https://github.com/lmcinnes/umap#installing). 41 | 42 | Then, you can install the development version from [GitHub](https://github.com/) with: 43 | 44 | ``` r 45 | # install.packages("devtools") 46 | devtools::install_github("ropenscilabs/umapr") 47 | ``` 48 | 49 | ## Basic use 50 | 51 | Here is an example of running UMAP on the `iris` data set. 52 | 53 | ```{r message=FALSE, warning=FALSE, fig.width=7} 54 | library(umapr) 55 | library(tidyverse) 56 | 57 | # select only numeric columns 58 | df <- as.matrix(iris[ , 1:4]) 59 | 60 | # run UMAP algorithm 61 | embedding <- umap(df) 62 | ``` 63 | 64 | `umap` returns a `data.frame` with two attached columns called "UMAP1" and "UMAP2". These columns represent the UMAP embeddings of the data, which are column-bound to the original data frame. 65 | 66 | ```{r} 67 | # look at result 68 | head(embedding) 69 | 70 | # plot the result 71 | embedding %>% 72 | mutate(Species = iris$Species) %>% 73 | ggplot(aes(UMAP1, UMAP2, color = Species)) + geom_point() 74 | ``` 75 | 76 | There is a function called `run_umap_shiny()` which will bring up a Shiny app for exploring different colors of the variables on the umap plots. 77 | 78 | ```{r eval=FALSE} 79 | run_umap_shiny(embedding) 80 | ``` 81 | 82 | ![Shiny App for Exploring Results](img/shiny.png) 83 | 84 | ## Function parameters 85 | 86 | There are a few important parameters. These are fully described in the UMAP Python [documentation](https://github.com/lmcinnes/umap/blob/bf1c3e5c89ea393c9de10bd66c5e3d9bc30588ee/notebooks/UMAP%20usage%20and%20parameters.ipynb). 87 | 88 | The `n_neighbor` argument can range from 2 to n-1 where n is the number of rows in the data. 89 | 90 | ```{r fig.width=7} 91 | neighbors <- c(4, 8, 16, 32, 64, 128) 92 | 93 | 94 | 95 | neighbors %>% 96 | map_df(~umap(as.matrix(iris[,1:4]), n_neighbors = .x) %>% 97 | mutate(Species = iris$Species, Neighbor = .x)) %>% 98 | mutate(Neighbor = as.integer(Neighbor)) %>% 99 | ggplot(aes(UMAP1, UMAP2, color = Species)) + 100 | geom_point() + 101 | facet_wrap(~ Neighbor, scales = "free") 102 | ``` 103 | 104 | The `min_dist` argument can range from 0 to 1. 105 | 106 | ```{r fig.width=7} 107 | dists <- c(0.001, 0.01, 0.05, 0.1, 0.5, 0.99) 108 | 109 | dists %>% 110 | map_df(~umap(as.matrix(iris[,1:4]), min_dist = .x) %>% 111 | mutate(Species = iris$Species, Distance = .x)) %>% 112 | ggplot(aes(UMAP1, UMAP2, color = Species)) + 113 | geom_point() + 114 | facet_wrap(~ Distance, scales = "free") 115 | ``` 116 | 117 | The `distance` argument can be many different distance functions. 118 | 119 | ```{r fig.width=7} 120 | dists <- c("euclidean", "manhattan", "canberra", "cosine", "hamming", "dice") 121 | 122 | dists %>% 123 | map_df(~umap(as.matrix(iris[,1:4]), metric = .x) %>% 124 | mutate(Species = iris$Species, Metric = .x)) %>% 125 | ggplot(aes(UMAP1, UMAP2, color = Species)) + 126 | geom_point() + 127 | facet_wrap(~ Metric, scales = "free") 128 | ``` 129 | 130 | ## Comparison to t-SNE and principal components analysis 131 | 132 | t-SNE and UMAP are both non-linear dimensionality reduction methods, in contrast to PCA. Because t-SNE is relatively slow, PCA is sometimes run first to reduce the dimensions of the data. 133 | 134 | We compared UMAP to PCA and t-SNE alone, as well as to t-SNE run on data preprocessed with PCA. In each case, the data were subset to include only complete observations. The code to reproduce these findings are available in [`timings.R`](timings.R). 135 | 136 | The first data set is the same iris data set used above (149 observations of 4 variables): 137 | 138 | ![t-SNE, PCA, and UMAP on iris](img/multiple_algorithms_iris.png) 139 | 140 | Next we tried a cancer data set, made up of 699 observations of 10 variables: 141 | 142 | ![t-SNE, PCA, and UMAP on cancer](img/multiple_algorithms_cancer.png) 143 | 144 | Third we tried a soybean data set. It is made up of 531 observations and 35 variables: 145 | 146 | ![t-SNE, PCA, and UMAP on soybeans](img/multiple_algorithms_bean.png) 147 | 148 | Finally we used a large single-cell RNAsequencing data set, with 561 observations (cells) of 55186 variables (over 30 million elements)! 149 | 150 | ![t-SNE, PCA, and UMAP on rna](img/multiple_algorithms_rna.png) 151 | 152 | PCA is orders of magnitude faster than t-SNE or UMAP (not shown). UMAP, though, is a substantial improvement over t-SNE both in terms of memory and time taken to run. 153 | 154 | ![Time to run t-SNE vs UMAP](img/multiple_algorithms_time.png) 155 | 156 | ![Memory to run t-SNE vs UMAP](img/multiple_algorithms_memory.png) 157 | 158 | ## Related projects 159 | 160 | * [`umap`](https://github.com/tkonopka/umap): R implementation of UMAP 161 | * [`seurat`](https://github.com/satijalab/seurat): R toolkit for single cell genomics 162 | * [`smallvis`](https://github.com/jlmelville/smallvis): R package for dimensionality reduction of small datasets 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # umapr 2 | 3 | [![Project Status: Abandoned](https://www.repostatus.org/badges/latest/abandoned.svg)](https://www.repostatus.org/#abandoned) 4 | 5 | This repository has been archived. The former README is now in [README-NOT.md](README-NOT.md). 6 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | image: 4 | #- Visual Studio 2015 5 | - Visual Studio 2017 6 | #- Ubuntu 7 | 8 | environment: 9 | global: 10 | PATH: C:\msys64\usr\bin;C:\msys64\mingw64\bin;C:\Windows;%PATH% 11 | #;C:\Windows\System32 12 | R_CHECK_ARGS: "--ignore-vignettes --no-examples --no-build-vignettes --no-manual" 13 | R_BUILD_ARGS: '--no-build-vignettes' 14 | NOT_CRAN: true 15 | USE_RTOOLS: true 16 | _R_CHECK_FORCE_SUGGESTS_: false 17 | matrix: 18 | - PYTHON: "C:\\Python36" 19 | RETICULATE_PYTHON: "C:\\Python36" 20 | MINICONDA: C:\Miniconda36-x64 21 | CONDA_INSTALL_LOCN: C:\Miniconda36-x64 22 | DISTUTILS_USE_SDK: "1" 23 | R_VERSION: devel 24 | R_ARCH: x64 25 | platform: x64 26 | PYTHON_ARCH: "64" 27 | GCC_PATH: mingw_64 28 | - PYTHON: "C:\\Python37" 29 | RETICULATE_PYTHON: "C:\\Python37" 30 | MINICONDA: C:\Miniconda37-x64 31 | CONDA_INSTALL_LOCN: C:\Miniconda37-x64 32 | DISTUTILS_USE_SDK: "1" 33 | R_VERSION: devel 34 | R_ARCH: x64 35 | platform: x64 36 | PYTHON_ARCH: "64" 37 | GCC_PATH: mingw_64 38 | - PYTHON: "C:\\Python35" 39 | RETICULATE_PYTHON: "C:\\Python35" 40 | MINICONDA: C:\Miniconda35-x64 41 | CONDA_INSTALL_LOCN: C:\Miniconda35-x64 42 | DISTUTILS_USE_SDK: "1" 43 | R_VERSION: devel 44 | R_ARCH: x64 45 | platform: x64 46 | PYTHON_ARCH: "64" 47 | GCC_PATH: mingw_64 48 | - PYTHON: "C:\\Python37" 49 | RETICULATE_PYTHON: "C:\\Python37" 50 | MINICONDA: C:\Miniconda37-x64 51 | CONDA_INSTALL_LOCN: C:\Miniconda37-x64 52 | DISTUTILS_USE_SDK: "1" 53 | R_VERSION: release 54 | R_ARCH: x64 55 | platform: x64 56 | PYTHON_ARCH: "64" 57 | - PYTHON: "C:\\Python36" 58 | RETICULATE_PYTHON: "C:\\Python36" 59 | MINICONDA: C:\Miniconda36-x64 60 | CONDA_INSTALL_LOCN: C:\Miniconda36-x64 61 | DISTUTILS_USE_SDK: "1" 62 | R_VERSION: release 63 | R_ARCH: x64 64 | platform: x64 65 | PYTHON_ARCH: "64" 66 | - PYTHON: "C:\\Python35" 67 | RETICULATE_PYTHON: "C:\\Python35" 68 | MINICONDA: C:\Miniconda35-x64 69 | CONDA_INSTALL_LOCN: C:\Miniconda35-x64 70 | DISTUTILS_USE_SDK: "1" 71 | R_VERSION: release 72 | R_ARCH: x64 73 | platform: x64 74 | PYTHON_ARCH: "64" 75 | - PYTHON: "C:\\Python37" 76 | RETICULATE_PYTHON: "C:\\Python37" 77 | MINICONDA: C:\Miniconda37-x64 78 | CONDA_INSTALL_LOCN: C:\Miniconda37-x64 79 | DISTUTILS_USE_SDK: "1" 80 | R_VERSION: stable 81 | R_ARCH: x64 82 | platform: x64 83 | PYTHON_ARCH: "64" 84 | - PYTHON: "C:\\Python36" 85 | RETICULATE_PYTHON: "C:\\Python36" 86 | MINICONDA: C:\Miniconda36-x64 87 | CONDA_INSTALL_LOCN: C:\Miniconda36-x64 88 | DISTUTILS_USE_SDK: "1" 89 | R_VERSION: stable 90 | R_ARCH: x64 91 | platform: x64 92 | PYTHON_ARCH: "64" 93 | - PYTHON: "C:\\Python35" 94 | RETICULATE_PYTHON: "C:\\Python35" 95 | MINICONDA: C:\Miniconda35-x64 96 | CONDA_INSTALL_LOCN: C:\Miniconda35-x64 97 | DISTUTILS_USE_SDK: "1" 98 | R_VERSION: stable 99 | R_ARCH: x64 100 | platform: x64 101 | PYTHON_ARCH: "64" 102 | 103 | 104 | matrix: 105 | fast_finish: true 106 | exclude: 107 | - platform: x64 108 | PYTHON_ARCH: "32" 109 | - platform: x86 110 | PYTHON_ARCH: "64" 111 | 112 | # Download script file from GitHub 113 | init: 114 | - ps: | 115 | $ErrorActionPreference = "Stop" 116 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 117 | Import-Module '..\appveyor-tool.ps1' 118 | - cmd: "ECHO %PYTHON_VERSION% %CONDA_INSTALL_LOCN%" 119 | - cmd: "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%MINICONDA%\\Library\\bin;%PATH%" 120 | 121 | install: 122 | # If there is a newer build queued for the same PR, cancel this one. 123 | # The AppVeyor 'rollout builds' option is supposed to serve the same 124 | # purpose but it is problematic because it tends to cancel builds pushed 125 | # directly to master instead of just PR builds (or the converse). 126 | # credits: JuliaLang developers. 127 | - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` 128 | https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` 129 | Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` 130 | throw "There are newer queued builds for this pull request, failing early." } 131 | - ECHO "Filesystem root:" 132 | - ps: "ls \"C:/\"" 133 | 134 | #- ECHO "Installed SDKs:" 135 | #- call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" 136 | #- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\"" 137 | 138 | #- travis-tool.sh install_github igraph/igraph 139 | #- travis-tool.sh install_github igraph/python-igraph 140 | # Install Python (from the official .msi of https://python.org) and pip when 141 | # not already installed. 142 | #- ps: if (-not(Test-Path($env:PYTHON))) { & appveyor\install.ps1 } 143 | 144 | # Prepend newly installed Python to the PATH of this build (this cannot be 145 | # done from inside the powershell script as it would require to restart 146 | # the parent CMD process). 147 | #- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 148 | 149 | # Check that we have the expected version and architecture for Python 150 | - "python --version" 151 | #- "python -c \"import struct; print(struct.calcsize('P') * 8)\"" 152 | 153 | # setup conda environment for building 154 | #- cmd: set "PATH=%CONDA_INSTALL_LOCN%;%CONDA_INSTALL_LOCN%\scripts;%PATH%" 155 | #- cmd: set PYTHONUNBUFFERED=1 156 | 157 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 158 | # Check that we have the expected version and architecture for Python 159 | - "python -c \"import sys; print(sys.version)\"" 160 | # Install the build and runtime dependencies of the project. 161 | - "conda update -q --yes conda" 162 | # Install constructor, take into account what vc version we target later in the build.. 163 | - "conda install -q --yes 'constructor>=2.0'" 164 | # list package versions 165 | - "conda list" 166 | # build installer 167 | #- "constructor --verbose --platform=%CONDA_PLATFORM% %OBSPY_VERSION%_%CONDA_PYSUFFIX%" 168 | 169 | # update mysy2 170 | #- C:\msys64\usr\bin\bash -lc "pacman --needed --noconfirm -Sy pacman-mirrors" 171 | #- C:\msys64\usr\bin\bash -lc "pacman --noconfirm -Sy" 172 | #- C:\msys64\usr\bin\bash -lc "pacman --noconfirm -S autoconf automake bison flex" 173 | #- C:\msys64\usr\bin\bash -lc "pacman --noconfirm -S libxml2-devel zip" 174 | 175 | - conda info --envs 176 | #- conda create -n env_name -y 177 | #- conda update -y -n base -c defaults conda 178 | #- "conda.bat activate" 179 | #- sudo conda init bash 180 | #- bash 181 | #- conda activate base 182 | #- echo ". C:\Miniconda37/etc/profile.d/conda.sh" >> ~/.bashrc 183 | #- conda activate env_name 184 | - conda config --set always_yes yes --set changeps1 no 185 | - conda config --add channels conda-forge 186 | - conda config --add channels vtraag 187 | - conda install -y -q conda pip 188 | #- conda update -q conda pip 189 | #- conda install -y -q numpy 190 | #- conda install -y -q -c r r-igraph 191 | #- conda install -y -q -c conda-forge libcxx 192 | #- conda install -y -q -c anaconda git 193 | 194 | #- git clone -q https://github.com/conda-forge/igraph-feedstock.git C:\projects\igraph-feedstock 195 | #- git fetch -q origin +refs/pull/7/merge: 196 | #- git checkout -qf HEAD 197 | #- powershell -Command "(New-Object Net.WebClient).DownloadFile('https://raw.githubusercontent.com/conda-forge/conda-forge-build-setup-feedstock/master/recipe/ff_ci_pr_build.py', 'ff_ci_pr_build.py')" 198 | #- ff_ci_pr_build -v --ci "appveyor" "%APPVEYOR_ACCOUNT_NAME%/%APPVEYOR_PROJECT_SLUG%" "%APPVEYOR_BUILD_NUMBER%" "%APPVEYOR_PULL_REQUEST_NUMBER%" 199 | #- del ff_ci_pr_build.py 200 | #- rmdir C:\cygwin /s /q 201 | #- call %CONDA_INSTALL_LOCN%\Scripts\activate.bat 202 | #- conda.exe update --yes --quiet conda 203 | #- set PYTHONUNBUFFERED=1 204 | #- conda.exe config --set show_channel_urls true 205 | #- conda.exe config --remove channels defaults 206 | #- conda.exe config --add channels defaults 207 | #- conda.exe config --add channels conda-forge 208 | #- conda.exe install -n env_name --quiet --yes conda-forge-build-setup 209 | #- run_conda_forge_build_setup 210 | #- conda.exe build recipe --quiet 211 | #- conda install -n env_name -y q igraph 212 | - conda install -y numpy scipy 213 | - conda install -y python-igraph 214 | - conda install -y -c conda-forge umap-learn 215 | #- conda install pip numpy scipy 216 | #- "python -m pip install --upgrade pip" 217 | #- "pip install -q pycairo" 218 | #- pip install wheel 219 | #- pip install python-igraph==0.7.1.post6 220 | #- "pip install git+git://github.com/igraph/python-igraph.git" 221 | #- "pip install -q leidenalg" 222 | #- "echo done" 223 | 224 | # Upgrade to the latest version of pip to avoid it displaying warnings 225 | # about it being out of date. 226 | #- "python -m pip install --upgrade pip" 227 | 228 | # Install the build dependencies of the project. If some dependencies contain 229 | # compiled extensions and are not provided as pre-built wheel packages, 230 | # pip will build them from source using the MSVC compiler matching the 231 | # target Python version and architecture 232 | #- "%CMD_IN_ENV% pip install leidenalg python-igraph numpy" 233 | 234 | 235 | #install: 236 | #install python libraries 237 | #- "pip install --upgrade pip" 238 | #- "pip install leidenalg" 239 | - ps: Bootstrap 240 | 241 | cache: 242 | - C:\RLibrary 243 | 244 | # Adapt as necessary starting from here 245 | 246 | build_script: 247 | - R -e 'install.packages("igraph")' 248 | - R -e 'install.packages("RColorBrewer")' 249 | - R -e 'install.packages("rmarkdown")' 250 | - R -e 'install.packages("knitr")' 251 | - travis-tool.sh install_deps 252 | #- "pip install -q graphviz" #includes python-igraph 253 | #- "pip install -q leidenalg" 254 | 255 | 256 | test_script: 257 | - travis-tool.sh run_tests 258 | 259 | on_failure: 260 | - 7z a failure.zip *.Rcheck\* 261 | - appveyor PushArtifact failure.zip 262 | 263 | artifacts: 264 | - path: '*.Rcheck\**\*.log' 265 | name: Logs 266 | 267 | - path: '*.Rcheck\**\*.out' 268 | name: Logs 269 | 270 | - path: '*.Rcheck\**\*.fail' 271 | name: Logs 272 | 273 | - path: '*.Rcheck\**\*.Rout' 274 | name: Logs 275 | 276 | - path: '\*_*.tar.gz' 277 | name: Bits 278 | 279 | - path: '\*_*.zip' 280 | name: Bits 281 | -------------------------------------------------------------------------------- /img/multiple_algorithms_bean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_bean.png -------------------------------------------------------------------------------- /img/multiple_algorithms_cancer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_cancer.png -------------------------------------------------------------------------------- /img/multiple_algorithms_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_iris.png -------------------------------------------------------------------------------- /img/multiple_algorithms_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_memory.png -------------------------------------------------------------------------------- /img/multiple_algorithms_rna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_rna.png -------------------------------------------------------------------------------- /img/multiple_algorithms_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_time.png -------------------------------------------------------------------------------- /img/shiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/shiny.png -------------------------------------------------------------------------------- /img/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /img/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /img/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /img/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /inst/ropensci_blog/img/multiple_algorithms_cancer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/multiple_algorithms_cancer.png -------------------------------------------------------------------------------- /inst/ropensci_blog/img/multiple_algorithms_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/multiple_algorithms_memory.png -------------------------------------------------------------------------------- /inst/ropensci_blog/img/multiple_algorithms_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/multiple_algorithms_time.png -------------------------------------------------------------------------------- /inst/ropensci_blog/img/shiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/shiny.png -------------------------------------------------------------------------------- /inst/ropensci_blog/working-on-the-umapr-package.md: -------------------------------------------------------------------------------- 1 | Working on the `umapr` package 2 | ================ 3 | Sean Hughes, Ju Kim, Malisa Smith, Angela Li, and Ted Laderas 4 | 5 | ![Comparing UMAP to other algorithms](img/multiple_algorithms_cancer.png) 6 | 7 | Motivation 8 | ---------- 9 | 10 | > Note: At the time of the unconference, we were unaware that a similar package called `umap` existed and has implemented the algorithm in R. It's in the process of being submitted to CRAN. We don't want to steal their thunder. 11 | 12 | A few weeks ago, as part of the [rOpenSci Unconference](http://unconf18.ropensci.org), a group of us (Sean Hughes, Malisa Smith, Angela Li, Ju Kim, and Ted Laderas) decided to work on making the UMAP algorithm accessible within R. UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique that allows the user to reduce high dimensional data (multiple columns) into a smaller number of columns for visualization purposes ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). It is similar to both Principal Components Analysis (PCA) and t-SNE, which are techniques often used in the single-cell omics world to visualize high dimensional data. t-SNE is actually quite a slow algorithm; one of the advantages of UMAP is that it runs faster than t-SNE. Because the `data.frames` that are typically run with these algorithms can run into millions of rows, efficiency is important. 13 | 14 | A few weeks ago, as part of the [rOpenSci Unconference](http://unconf18.ropensci.org), a group of us (Sean Hughes, Malisa Smith, Angela Li, Ju Kim, and Ted Laderas) decided to work on making the UMAP algorithm accessible within R. We had been introduced to each other before the unconference, and it turns out that we all work with flow cytometry data and that it would be fun to work on a project together. UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique that allows the user to reduce high dimensional data (multiple columns) into a smaller number of columns for visualization purposes ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). It is related to both Principle Component Analysis (PCA) and t-SNE, which are techniques often used to visualize high dimensional data, such as single cell sequencing or expression data. 15 | 16 | t-SNE is actually quite a slow algorithm; one of the advantages of UMAP is that it actually runs faster than t-SNE. Because the `data.frames` that are typically run with these algorithms can run into millions of rows, efficiency is important. 17 | 18 | We decided to start working on the `umapr` package to make this technique accessible within R. As with most rOpenSci Unconf projects, this started with an [issue entry in the rOpenSci unconf repo](https://github.com/ropensci/unconf18/issues/43): 19 | 20 | > I recently read about a new non-linear dimensionality reduction algorithm called UMAP, which is much faster than t-SNE, while producing two-dimensional visualizations that share many characteristics with t-SNE. I initially found out about it in the context of use on high-dimensional single-cell data in this paper. 21 | 22 | > .... 23 | 24 | > My thought is that the ideal would be a package focused on UMAP specifically, implemented in R or Rcpp. Unfortunately I am not at all an expert in this topic or familiar with the mathematics involved, so the best I would be able to do is try to translate the Python implementation into R. 25 | 26 | We all met at the unconference the first day and decided that this was a project worth working on. Since t-SNE is so used in the single cell and flow-cytometry community, we thought that having an alternative that was just as good, but faster to run would be helpful. 27 | 28 | Making a Development Plan 29 | ------------------------- 30 | 31 | Rather than recreate the UMAP code completely from scratch in R, we decided to use the `reticulate` package to call the implementation in Python from R. It was tempting to just wrap the function's arguments with `...` and let the user refer to the python documentation. However, we didn't really think that was in the spirit of the unconf. We wanted to make UMAP much more accessible. 32 | 33 | Learning about Package Building, Testing, and Documentation 34 | ----------------------------------------------------------- 35 | 36 | Although our package only really has one main function (`umap()`), we felt it was important to have good documentation and unit tests. We spent some time learning about `roxygen` for function documentation and `testthat` for unit testing, and setting up our package with Travis-CI for continuous integration testing. This included unit tests on each argument and including examples varying the essential parameters. 37 | 38 | We spent a lot of time learning more about the specifics of package building and vignette building in R. We were definitely excited by all of the available tools and built a vignette profiling the performance of the UMAP algorithm versus other dimensionality reduction techniques, such as TSNE. 39 | 40 | Profiling `umapr` using different datasets 41 | ------------------------------------------ 42 | 43 | ![Execution time of UMAP compared to other algorithms](img/multiple_algorithms_time.png) 44 | ![Memory usage of UMAP comapred to other algorithms](img/multiple_algorithms_memory.png) 45 | 46 | Part of the appeal of UMAP is that it is faster than tSNE. So we profiled the performance of UMAP on a number of different datasets: `iris` (of course!), the [`BreastCancer` dataset from the `mlbench` package](https://cran.r-project.org/web/packages/mlbench/index.html), a [`Soybean` dataset from `mlbench`](https://cran.r-project.org/web/packages/mlbench/index.html), and finally, a [single cell RNA dataset](https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/). You can [see our results in our readme file](https://github.com/ropenscilabs/umapr/blob/master/README.md). 47 | 48 | Thankfully, UMAP does run faster than tSNE on these datasets, showing an reduction of 66% compared to both versions of TSNE for the `Soybean` dataset, and reduced memory usage for all of the datasets, except for the single cell RNA dataset (see above figure). 49 | 50 | Exploring the Results with Shiny 51 | -------------------------------- 52 | 53 | ![Shiny App](img/shiny.png) 54 | 55 | We built a small Shiny app that lets people explore their embedding vectors (the dimensionally reduced vectors) and how they separate the data into different groupings in the 2D space. The app is simple, but allows users to immediately assess the results of the UMAP algorithm in differentiating groupings in the data by coloring the `umap` result by the different variables included in the analysis. 56 | 57 | Final Results: Get `umapr` 58 | -------------------------- 59 | 60 | `umapr` is currently available in the `ropenscilabs` organization, and can be installed with the following commands, [after the python modules are installed](https://github.com/lmcinnes/umap#installing). 61 | 62 | ``` 63 | install.packages("devtools") 64 | devtools::install_github("ropenscilabs/umapr") 65 | ``` 66 | 67 | As a group, we learned a lot by building the `umapr` package. More importantly, I think we'll work together on future projects. It was great to work together, and we are talking about having a hackathon between our multiple groups to improve some current open source flow cytometry tools. This was a really fun project and we're excited to do more! 68 | -------------------------------------------------------------------------------- /man/make_umap_object.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/classes.R 3 | \name{make_umap_object} 4 | \alias{make_umap_object} 5 | \title{Title} 6 | \usage{ 7 | make_umap_object(umap_result, annotation = NULL) 8 | } 9 | \arguments{ 10 | \item{umap_result}{- output of running} 11 | 12 | \item{annotation}{- optional annotation file} 13 | } 14 | \value{ 15 | - a umap object that includes plotting 16 | } 17 | \description{ 18 | Title 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(flowCore) 23 | 24 | umap_table <- umap() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/run_umap_shiny.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/umap_app.R 3 | \name{run_umap_shiny} 4 | \alias{run_umap_shiny} 5 | \title{Open a shiny app to explore the data in a UMAP embedding.} 6 | \usage{ 7 | run_umap_shiny(umap) 8 | } 9 | \arguments{ 10 | \item{umap}{output of a call to `umap`} 11 | } 12 | \value{ 13 | Open an interactive shiny app to explore the data. 14 | } 15 | \description{ 16 | Open a shiny app to explore the data in a UMAP embedding. 17 | } 18 | -------------------------------------------------------------------------------- /man/umap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/umap.R 3 | \name{umap} 4 | \alias{umap} 5 | \title{umap} 6 | \usage{ 7 | umap(data, include_input = TRUE, n_neighbors = 15L, 8 | n_components = 2L, metric = "euclidean", n_epochs = NULL, 9 | learning_rate = 1, alpha = 1, init = "spectral", spread = 1, 10 | min_dist = 0.1, set_op_mix_ratio = 1, local_connectivity = 1L, 11 | repulsion_strength = 1, bandwidth = 1, gamma = 1, 12 | negative_sample_rate = 5L, transform_queue_size = 4, a = NULL, 13 | b = NULL, random_state = NULL, metric_kwds = dict(), 14 | angular_rp_forest = FALSE, target_n_neighbors = -1L, 15 | target_metric = "categorical", target_metric_kwds = dict(), 16 | target_weight = 0.5, transform_seed = 42L, verbose = FALSE) 17 | } 18 | \arguments{ 19 | \item{data}{data frame or matrix. input data.} 20 | 21 | \item{include_input}{logical. Attach input data to UMAP embeddings if desired.} 22 | 23 | \item{n_neighbors}{integer. The size of local neighborhood 24 | (in terms of number of neighboring sample points) used for manifold 25 | approximation. Larger values result in more global views of the manifold, 26 | while smaller values result in more local data being preserved. In general 27 | values should be in the range 2 to 100.} 28 | 29 | \item{n_components}{integer The dimension of the space to embed into. This 30 | defaults to 2 to provide easy visualization, but can reasonably be set to 31 | any integer value in the range 2 to 100.} 32 | 33 | \item{metric}{character. The metric to use to compute distances in high 34 | dimensional space. If a string is passed it must match a valid predefined 35 | metric. If a general metric is required a function that takes two 1d arrays 36 | and returns a float can be provided. For performance purposes it is required 37 | that this be a numba jit'd function. Valid string metrics include: euclidean, 38 | manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis, 39 | wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard, 40 | dice, russelrao, kulsinski, rogerstanimoto, sokalmichener, sokalsneath, yule. 41 | Metrics that take arguments (such as minkowski, mahalanobis etc.) can have 42 | arguments passed via the metric_kwds dictionary. At this time care must be 43 | taken and dictionary elements must be ordered appropriately; this will 44 | hopefully be fixed in the future.} 45 | 46 | \item{n_epochs}{integer The number of training epochs to use in optimization.} 47 | 48 | \item{learning_rate}{numeric. The initial learning rate for the embedding optimization.} 49 | 50 | \item{alpha}{numeric. The initial learning rate for the embedding optimization.} 51 | 52 | \item{init}{character. How to initialize the low dimensional embedding. 53 | Options are: 'spectral' (use a spectral embedding of the fuzzy 1-skeleton), 54 | 'random' (assign initial embedding positions at random), 55 | * A numpy array of initial embedding positions.} 56 | 57 | \item{spread}{numeric. The effective scale of embedded points. 58 | In combination with ``min_dist`` this determines how clustered/clumped the 59 | embedded points are.} 60 | 61 | \item{min_dist}{numeric. The effective minimum distance between embedded 62 | points. Smaller values will result in a more clustered/clumped embedding 63 | where nearby points on the manifold are drawn closer together, while larger 64 | values will result on a more even dispersal of points. The value should be 65 | set relative to the ``spread`` value, which determines the scale at which 66 | embedded points will be spread out.} 67 | 68 | \item{set_op_mix_ratio}{numeric. Interpolate between (fuzzy) union and 69 | intersection as the set operation used to combine local fuzzy simplicial 70 | sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use 71 | the product t-norm. The value of this parameter should be between 0.0 and 72 | 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure 73 | fuzzy intersection.} 74 | 75 | \item{local_connectivity}{integer The local connectivity required -- i.e. 76 | the number of nearest neighbors that should be assumed to be connected at a 77 | local level. The higher this value the more connected the manifold becomes 78 | locally. In practice, this should be not more than the local intrinsic 79 | dimension of the manifold.} 80 | 81 | \item{repulsion_strength}{numeric. Weighting applied to negative samples in 82 | low dimensional embedding optimization. Values higher than one will result in 83 | greater weight being given to negative samples.} 84 | 85 | \item{bandwidth}{numeric. The effective bandwidth of the kernel if we view 86 | the algorithm as similar to Laplacian eigenmaps. Larger values induce more 87 | connectivity and a more global view of the data, smaller values concentrate 88 | more locally.} 89 | 90 | \item{gamma}{numeric. Weighting applied to negative samples in low 91 | dimensional embedding optimization. Values higher than one will result in 92 | greater weight being given to negative samples.} 93 | 94 | \item{negative_sample_rate}{numeric. The number of negative edge/1-simplex 95 | samples to use per positive edge/1-simplex sample in optimizing the low 96 | dimensional embedding.} 97 | 98 | \item{transform_queue_size}{numeric. For transform operations (embedding new points 99 | using a trained model_ this will control how aggressively to search for 100 | nearest neighbors. Larger values will result in slower performance but 101 | more accurate nearest neighbor evaluation.} 102 | 103 | \item{a}{numeric. More specific parameters controlling the embedding. 104 | If NULL, these values are set automatically as determined by ``min_dist`` 105 | and ``spread``.} 106 | 107 | \item{b}{numeric. More specific parameters controlling the embedding. 108 | If NULL, these values are set automatically as determined by ``min_dist`` 109 | and ``spread``.} 110 | 111 | \item{random_state}{integer. If integer, random_state is the seed used by the 112 | random number generator; If NULL, the random number generator is the 113 | RandomState instance used by `np.random`.} 114 | 115 | \item{metric_kwds}{reticulate dictionary. Arguments to pass on to the metric, 116 | such as the ``p`` value for Minkowski distance.} 117 | 118 | \item{angular_rp_forest}{logical. Whether to use an angular random projection 119 | forest to initialise the approximate nearest neighbor search. This can be 120 | faster, but is mostly on useful for metric that use an angular style distance 121 | such as cosine, correlation etc. In the case of those metrics angular forests 122 | will be chosen automatically.} 123 | 124 | \item{target_n_neighbors}{integer. The number of nearest neighbors to use to 125 | construct the target simplcial set. If set to -1 use the n_neighbors value.} 126 | 127 | \item{target_metric}{character or function. The metric used to measure distance 128 | for a target array is using supervised dimension reduction. By default this is 129 | ‘categorical’ which will measure distance in terms of whether categories match 130 | or are different. Furthermore, if semi-supervised is required target values of 131 | -1 will be trated as unlabelled under the ‘categorical’ metric. If the target 132 | array takes continuous values (e.g. for a regression problem) then metric of 133 | ‘l1’ or ‘l2’ is probably more appropriate.} 134 | 135 | \item{target_metric_kwds}{reticulate dictionary. Keyword argument to pass to 136 | the target metric when performing supervised dimension reduction. If None then 137 | no arguments are passed on.} 138 | 139 | \item{target_weight}{numeric. weighting factor between data topology and target 140 | topology. A value of 0.0 weights entirely on data, a value of 1.0 weights 141 | entirely on target. The default of 0.5 balances the weighting equally between 142 | data and target.} 143 | 144 | \item{transform_seed}{integer. Random seed used for the stochastic aspects of 145 | the transform operation. This ensures consistency in transform operations.} 146 | 147 | \item{verbose}{logical. Controls verbosity of logging.} 148 | } 149 | \value{ 150 | matrix 151 | } 152 | \description{ 153 | Provides an interface to the UMAP algorithm implemented in Python. 154 | } 155 | \examples{ 156 | #import umap library (and load python module) 157 | library("umapr") 158 | umap(as.matrix(iris[, 1:4])) 159 | umap(iris[, 1:4]) 160 | } 161 | \references{ 162 | Leland McInnes and John Healy (2018). UMAP: Uniform Manifold 163 | Approximation and Projection for Dimension Reduction. 164 | ArXiv e-prints 1802.03426. 165 | } 166 | -------------------------------------------------------------------------------- /tests/shinyTest/test_umap_shiny.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(tidyverse) 3 | 4 | umap <- import("umap") 5 | sklearn.datasets_module <- import("sklearn.datasets") 6 | 7 | digits <- sklearn.datasets_module$load_digits() 8 | 9 | umap_out <- umap$UMAP()$fit_transform(digits$data) 10 | colnames(umap_out) <- c("UMAP1","UMAP2") 11 | umap <- cbind(digits$data, umap_out) %>% data.frame() 12 | 13 | #runUmapShiny(umap) 14 | 15 | umapout <- make_umap_object(umap_result = umap) 16 | 17 | #umapout$plot("V4") 18 | 19 | runUmapShiny(umap) 20 | 21 | library(flowCore) 22 | data("GvHD") 23 | out <- fsApply(GvHD, exprs) 24 | 25 | 26 | out <- out[,-8] 27 | test <- umap(out) 28 | 29 | colnames(umap_out) <- c("UMAP1","UMAP2") 30 | umap <- cbind(digits$data, umap_out) %>% data.frame() 31 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(umapr) 3 | 4 | test_check("umapr") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-umapr.R: -------------------------------------------------------------------------------- 1 | context("UMAP wrapper tests") 2 | library("umapr") 3 | library("reticulate") 4 | 5 | # From https://cran.r-project.org/web/packages/reticulate/vignettes/package.html 6 | # helper function to skip tests if we don't have the 'foo' module 7 | skip_if_no_umap <- function() { 8 | have_umap <- py_module_available("umap") 9 | if (!have_umap) 10 | skip("umap not available for testing") 11 | } 12 | 13 | skip_if_no_sklearn.datasets <- function() { 14 | have_sklearn.datasets <- py_module_available("sklearn.datasets") 15 | if (!have_sklearn.datasets) 16 | skip("sklearn.datasets not available for testing") 17 | } 18 | 19 | # Here we perform the actual testing 20 | test_that("Things work as expected", { 21 | skip_if_no_umap() 22 | skip_if_no_sklearn.datasets() 23 | 24 | # Generate/Load some data 25 | set.seed(1) 26 | data = cbind(matrix(rexp( 100 * 10, runif(1, 1E-5, 1E-3) ), 100, 10)) 27 | 28 | # The function should check the input types to make sure they are correct 29 | expect_error(umap(data = "Not a matrix"), "Data must be a data frame or a matrix") 30 | expect_error(umap(data = "This is not a matrix or a data frame"), "Data must be a data frame or a matrix.") 31 | expect_error(umap(data = data, n_neighbors = "Not count"), "n_neighbors is not a count") 32 | expect_error(umap(data = data, n_components = "Not count"), "n_components is not a count") 33 | # metric must be one of the options listed here: https://github.com/lmcinnes/umap/blob/bf1c3e5c89ea393c9de10bd66c5e3d9bc30588ee/umap/umap_.py#L1211 34 | expect_error(umap(data = data, metric = "not a valid metric"), NULL) 35 | expect_error(umap(data = data, n_epochs = "Not count"), "n_epochs is not a count") 36 | expect_error(umap(data = data, alpha = "Not numeric"), "alpha is not a numeric") 37 | expect_error(umap(data = data, learning_rate = "Not numeric"), "learning_rate is not a numeric") 38 | expect_error(umap(data = data, init = "not a valid init"), "init must be one of 'spectral', 'random', or a numpy array of initial embedding positions") 39 | expect_error(umap(data = data, spread = "Not numeric"), "spread is not a numeric") 40 | expect_error(umap(data = data, min_dist = "Not numeric"), "min_dist is not a numeric") 41 | expect_error(umap(data = data, set_op_mix_ratio = "Not numeric"), "set_op_mix_ratio is not a numeric") 42 | expect_error(umap(data = data, local_connectivity = 2.4), "local_connectivity is not a count") 43 | expect_error(umap(data = data, bandwidth = "Not numeric"), "bandwidth is not a numeric") 44 | expect_error(umap(data = data, gamma = "Not numeric"), "gamma is not a numeric") 45 | expect_error(umap(data = data, negative_sample_rate = 2.4), "negative_sample_rate is not a count") 46 | expect_error(umap(data = data, a = "Not numeric"), "not TRUE") 47 | expect_error(umap(data = data, b = "Not numeric"), "not TRUE") 48 | expect_error(umap(data = data, random_state = 2.4), "not TRUE") 49 | expect_error(umap(data = data, metric_kwds = 2.4), "metric_kwds must be a Python dictionary object") 50 | expect_error(umap(data = data, angular_rp_forest = 2.4), "angular_rp_forest is not a flag") 51 | expect_error(umap(data = data, verbose = 2.4), "verbose is not a flag") 52 | 53 | # try running umap with the same seed twice, see if you get the same thing 54 | expect_true(identical(umap(data = data, random_state = 3L), 55 | umap(data = data, random_state = 3L))) 56 | }) 57 | 58 | # test_that("R6 tests", 59 | # { 60 | # 61 | # set.seed(1) 62 | # data = cbind(matrix(rexp( 100 * 10, runif(1, 1E-5, 1E-3) ), 100, 10)) 63 | # colnames(data) <- c(letters[1:10]) 64 | # out <- umap(data) 65 | # 66 | # expect_equal(class(out)[1], "umap_obj") 67 | # pl <- out$plot("a") 68 | # expect_equal(class(pl)[-1], "ggplot") 69 | # 70 | # 71 | # }) 72 | -------------------------------------------------------------------------------- /timings.R: -------------------------------------------------------------------------------- 1 | library(umapr) 2 | library(Rtsne) 3 | library(tidyverse) 4 | library(bench) 5 | 6 | # stuff to compare algorithms ------------------------------------------------- 7 | embed <- function(labels, d) { 8 | times <- mark( 9 | um <- umap(d), 10 | ts <- Rtsne(d)$Y, 11 | ts_no_pca <- Rtsne(d, pca = FALSE)$Y, 12 | check = FALSE) 13 | 14 | pca <- prcomp(d)$x[,1:2] 15 | 16 | times$expression <- c("UMAP", "PCA + t-SNE", "t-SNE") 17 | 18 | combo <- function(embedding, name) { 19 | colnames(embedding) <- c("V1", "V2") 20 | embedding %>% 21 | as.data.frame() %>% 22 | mutate(Algorithm = name, Class = labels) 23 | } 24 | 25 | list(times = times, 26 | results = bind_rows( 27 | combo(pca, "PCA"), 28 | mutate(um, Algorithm = "UMAP", Class = labels, V1 = UMAP1, V2 = UMAP2), 29 | combo(ts, "PCA + t-SNE"), 30 | combo(ts_no_pca, "t-SNE"))) 31 | } 32 | 33 | plot_embeddings <- function(embeddings, dataset) { 34 | ggplot(embeddings, aes(V1, V2, color = Class)) + 35 | geom_point() + facet_wrap(~ Algorithm, scales = "free") + 36 | ggtitle(dataset) 37 | } 38 | 39 | # iris ----------------------------------------------------------------------- 40 | d <- iris 41 | d <- d[!duplicated(d), ] 42 | with_labels <- d 43 | d <- as.matrix(d[ , 1:4]) 44 | 45 | iris_result <- embed(with_labels$Species, d) 46 | 47 | # cancer --------------------------------------------------------------------- 48 | library(mlbench) 49 | data("BreastCancer") 50 | d <- BreastCancer[ , 2:11] 51 | d <- d[!duplicated(d), ] 52 | d <- d[complete.cases(d), ] 53 | labels <- d$Class 54 | d <- as.matrix(d[ , 1:9]) 55 | d <- apply(d, 2, as.numeric) 56 | 57 | cancer_result <- embed(labels, d) 58 | 59 | # beans ----------------------------------------------------------- 60 | data(Soybean) 61 | d <- Soybean 62 | d <- d[!duplicated(d[,2:36]), ] 63 | d <- d[complete.cases(d[,2:36]), ] 64 | labels <- d$Class 65 | d <- as.matrix(d[ , 2:36]) 66 | d <- apply(d, 2, as.numeric) 67 | 68 | bean_result <- embed(labels, d) 69 | 70 | # some scRNAseq ------------------------------------------------------------- 71 | #https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/ 72 | library(SingleCellExperiment) 73 | x <- readRDS("~/Desktop/li.rds") 74 | y <- t(logcounts(x)) 75 | rm(x) 76 | 77 | labels <- str_extract(rownames(y), "[^_]*$") 78 | 79 | sc_rna_seq_result <- embed(labels, y) 80 | 81 | # display results ---------------------------------------------------------- 82 | plot_embeddings(iris_result$results, "iris") 83 | ggsave("img/multiple_algorithms_iris.png", width = 6, height = 5, dpi = 300) 84 | plot_embeddings(cancer_result$results, "cancer") 85 | ggsave("img/multiple_algorithms_cancer.png", width = 6, height = 5, dpi = 300) 86 | plot_embeddings(bean_result$results, "bean") 87 | ggsave("img/multiple_algorithms_bean.png", width = 6, height = 5, dpi = 300) 88 | plot_embeddings(sc_rna_seq_result$results, "scRNAseq") 89 | ggsave("img/multiple_algorithms_rna.png", width = 6, height = 5, dpi = 300) 90 | 91 | # times ------------------------------------------------------------------- 92 | combo_times <- function(times, dataset) { 93 | dplyr::select(times, expression, median, mem_alloc) %>% 94 | dplyr::mutate(Data = dataset) 95 | } 96 | 97 | times <- suppressWarnings(bind_rows(combo_times(iris_result$times, "iris"), 98 | combo_times(cancer_result$times, "cancer"), 99 | combo_times(bean_result$times, "bean"), 100 | combo_times(sc_rna_seq_result$times, "scRNAseq"))) 101 | 102 | ggplot(times, aes(x = expression, y = median)) + 103 | geom_col()+ facet_wrap(~ Data, scales = "free_y") + 104 | ylab("Time (s)") + xlab(NULL) + 105 | ggtitle("Time taken to run dimensionality reduction on dataset") 106 | 107 | ggsave("img/multiple_algorithms_time.png", width = 6, height = 5, dpi = 300) 108 | 109 | ggplot(times, aes(x = expression, y = mem_alloc)) + 110 | geom_col()+ facet_wrap(~ Data, scales = "free_y") + 111 | ylab("Memory (bytes)") + xlab(NULL) + 112 | ggtitle("Memory used to run dimensionality reduction on dataset") 113 | ggsave("img/multiple_algorithms_memory.png", width = 6, height = 5, dpi = 300) 114 | -------------------------------------------------------------------------------- /travis_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install python 4 | if [[ $TRAVIS_OS_NAME == "linux" ]]; then 5 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 6 | elif [[ $TRAVIS_OS_NAME == "osx" ]]; then 7 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh 8 | fi 9 | 10 | bash miniconda.sh -b -p $HOME/miniconda 11 | export PATH="$HOME/miniconda/bin:$PATH" 12 | export RETICULATE_PYTHON="$HOME/miniconda/bin/python" 13 | hash -r 14 | conda config --set always_yes yes --set changeps1 no 15 | conda update -q conda 16 | conda info -a 17 | pip install --upgrade pip 18 | pip install igraph leidenalg 19 | --------------------------------------------------------------------------------