├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CONDUCT.md
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
    ├── classes.R
    ├── umap.R
    └── umap_app.R
├── README-NOT.md
├── README.Rmd
├── README.md
├── appveyor.yml
├── img
    ├── multiple_algorithms_bean.png
    ├── multiple_algorithms_cancer.png
    ├── multiple_algorithms_iris.png
    ├── multiple_algorithms_memory.png
    ├── multiple_algorithms_rna.png
    ├── multiple_algorithms_time.png
    ├── shiny.png
    ├── unnamed-chunk-3-1.png
    ├── unnamed-chunk-5-1.png
    ├── unnamed-chunk-6-1.png
    └── unnamed-chunk-7-1.png
├── inst
    └── ropensci_blog
    │   ├── img
    │       ├── multiple_algorithms_cancer.png
    │       ├── multiple_algorithms_memory.png
    │       ├── multiple_algorithms_time.png
    │       └── shiny.png
    │   └── working-on-the-umapr-package.md
├── man
    ├── make_umap_object.Rd
    ├── run_umap_shiny.Rd
    └── umap.Rd
├── tests
    ├── shinyTest
    │   └── test_umap_shiny.R
    ├── testthat.R
    ├── testthat
    │   └── test-umapr.R
    └── umap_output.txt
├── timings.R
└── travis_setup.sh


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^CONDUCT\.md$
 4 | ^\.travis\.yml$
 5 | ^README\.Rmd$
 6 | ^README-.*\.png$
 7 | ^timings\.R$
 8 | ^img$
 9 | ^LICENSE\.md$
10 | ^appveyor\.yml$
11 | ^travis_setup\.sh$
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | umapr.Rproj
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
 2 | 
 3 | language: R
 4 | sudo: true
 5 | cache: packages
 6 | 
 7 | os:
 8 |   - linux
 9 |   #- os: osx
10 |     #osx_image: xcode10.1
11 |     #disable_homebrew: true
12 |     #r_build_args: '--no-build-vignettes'
13 |     #r_check_args: '--ignore-vignettes --no-examples'
14 | 
15 | addons:
16 |   apt:
17 |     #sources:
18 |     #  - ubuntu-toolchain-r-test
19 |     packages:
20 |       - libpython-dev
21 |       - libpython3-dev
22 |       - texlive-full
23 |       - cmake
24 |       - python3
25 |   #homebrew:
26 |     #packages:
27 |       #- libpython-dev
28 |       #- libpython3-dev
29 |       #- texlive-full
30 |       #- cmake
31 |       #- python3
32 |     #update: true
33 | 
34 | r:
35 |   - release
36 |   - devel
37 | 
38 | r-packages:
39 |   - reticulate
40 |   - knitr
41 |   - testthat
42 |   - RColorBrewer
43 | 
44 | env:
45 |   global:
46 |     - R_CHECK_ARGS="--no-build-vignettes --no-manual --ignore-vignettes --no-examples"
47 |     - R_BUILD_ARGS="--no-build-vignettes"
48 | 
49 | python:
50 |   - "3.5"
51 |   - "3.6"
52 |   - "3.7"
53 | 
54 | before_install:
55 |   - chmod +x travis_setup.sh
56 |   - ./travis_setup.sh
57 |   - pip install --user conda
58 |   - pip install --user umap-learn
59 | 
60 | warnings_are_errors: false
61 | 


--------------------------------------------------------------------------------
/CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Code of Conduct
 2 | 
 3 | As contributors and maintainers of this project, we pledge to respect all people who 
 4 | contribute through reporting issues, posting feature requests, updating documentation,
 5 | submitting pull requests or patches, and other activities.
 6 | 
 7 | We are committed to making participation in this project a harassment-free experience for
 8 | everyone, regardless of level of experience, gender, gender identity and expression,
 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 | 
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 | 
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 
18 | from the project team.
19 | 
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 
21 | opening an issue or contacting one or more of the project maintainers.
22 | 
23 | This Code of Conduct is adapted from the Contributor Covenant 
24 | (http://contributor-covenant.org), version 1.0.0, available at 
25 | http://contributor-covenant.org/version/1/0/0/
26 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: umapr
 2 | Title: Wraps UMAP Algorithm for Dimension Reduction
 3 | Version: 0.0.0.9001
 4 | Authors@R: c(
 5 |     person("Sean", "Hughes", role = c("aut", "cre"), email = "smhughes@uw.edu"),
 6 |     person("Ted", "Laderas", role = "aut", email="tedladeras@gmail.com"),
 7 |     person("Malisa", "Smith", role = "aut"),
 8 |     person("Ju Yeong", "Kim", role = "aut"),
 9 |     person("Angela", "Li", role = "aut")
10 |   )
11 | Description: Wraps the Python implementation of the UMAP dimensionality reduction algorithm for use in `R`. Uniform Manifold Approximation and Projection (UMAP) is a non-linear dimensionality reduction algorithm that is computationally more efficient than t-SNE (McInnes and Healy, 2018) <https://arxiv.org/abs/1802.03426>. This package allows the user to run UMAP from R, producing a data frame that can be plotted on a 2-D graph.
12 | Depends: R (>= 3.2.3)
13 | License: MIT + file LICENSE
14 | URL: https://github.com/ropenscilabs/umapr
15 | BugReports: https://github.com/ropenscilabs/umapr/issues
16 | Encoding: UTF-8
17 | LazyData: true
18 | Imports:
19 |     reticulate,
20 |     shiny,
21 |     ggplot2,
22 |     assertthat
23 | Suggests:
24 |     testthat,
25 |     tidyverse,
26 |     knitr,
27 |     rmarkdown
28 | VignetteBuilder: knitr
29 | RoxygenNote: 6.1.1
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Sean Hughes;Ted Laderas;Malisa Smith;Ju Yeong Kim;Angela Li
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2018 Sean Hughes;Ted Laderas;Malisa Smith;Ju Yeong Kim;Angela Li
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(make_umap_object)
 4 | export(run_umap_shiny)
 5 | export(umap)
 6 | importFrom(assertthat,assert_that)
 7 | importFrom(assertthat,is.count)
 8 | importFrom(assertthat,is.flag)
 9 | importFrom(reticulate,dict)
10 | importFrom(reticulate,import)
11 | importFrom(reticulate,py_available)
12 | importFrom(reticulate,py_install)
13 | importFrom(reticulate,py_module_available)
14 | importFrom(reticulate,r_to_py)
15 | importFrom(reticulate,use_condaenv)
16 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # umapr 0.0.0.9000
2 | 
3 | * Added a `NEWS.md` file to track changes to the package.
4 | 
5 | 
6 | 
7 | 


--------------------------------------------------------------------------------
/R/classes.R:
--------------------------------------------------------------------------------
 1 | library(R6)
 2 | library(ggplot2)
 3 | 
 4 | umap_obj <- R6Class("umap_obj",
 5 |                     public = list(
 6 |                       #inherit=base::data.frame,
 7 |                       markers=NULL,
 8 |                       umap_table=NULL,
 9 |                       
10 |                       plot = function(marker){
11 |                         markers <- self$markers
12 |                         if(!marker %in% markers){stop("marker not in list of markers")}
13 |                         ggplot2::ggplot(self$umap_table, ggplot2::aes_string(x = "UMAP1", y = "UMAP2", color=marker)) +
14 |                           ggplot2::geom_point()
15 |                       },
16 | 
17 |                       initialize = function(umap_table, annotation=NULL){
18 |                         
19 |                         self$umap_table <- umap_table
20 |                         if(!is.null(annotation)){
21 |                             self$annotation = annotation
22 |                         }
23 |                         markers <- colnames(umap_table)[!colnames(umap_table) %in% c("UMAP1","UMAP2")]
24 |                         
25 |                         self$markers <- markers
26 |                         invisible(self)
27 |                       },
28 |                       
29 |                       explore = function(markers=NULL){
30 |                         runUmapShiny(self)
31 |                       },
32 |                       
33 |                       set_markers = function(markers=NULL){
34 |                         self$markers 
35 |                         invisible(self)
36 |                       },
37 |                       
38 |                       returnData = function(){
39 |                         return(self$umap_table)
40 |                       }
41 |                     ))
42 | 
#' Wrap a UMAP result in a `umap_obj` object
#'
#' @param umap_result table of UMAP results; handed to the `umap_obj`
#'   constructor as its `umap_table` argument.
#' @param annotation optional annotation, handed to the constructor unchanged.
#'
#' @return a `umap_obj` object that includes plotting
#' @export
#'
#' @examples
#' \dontrun{
#' umap_table <- umap(iris[, 1:4])
#' umap_object <- make_umap_object(umap_table)
#' }
make_umap_object <- function(umap_result, annotation = NULL) {
  wrapped <- umap_obj$new(umap_table = umap_result, annotation = annotation)
  wrapped
}
61 | 


--------------------------------------------------------------------------------
/R/umap.R:
--------------------------------------------------------------------------------
  1 | #' umap
  2 | #'
  3 | #' @description Provides an interface to the UMAP algorithm implemented in Python.
  4 | #'
  5 | #' @references Leland McInnes and John Healy (2018). UMAP: Uniform Manifold
  6 | #' Approximation and Projection for Dimension Reduction.
  7 | #' ArXiv e-prints 1802.03426.
  8 | #'
  9 | #' @param data data frame or matrix. input data.
 10 | #' @param include_input logical. Attach input data to UMAP embeddings if desired.
 11 | #' @param n_neighbors integer. The size of local neighborhood
 12 | #' (in terms of number of neighboring sample points) used for manifold
 13 | #' approximation. Larger values result in more global views of the manifold,
 14 | #' while smaller values result in more local data being preserved. In general
 15 | #' values should be in the range 2 to 100.
 16 | #' @param n_components integer The dimension of the space to embed into. This
 17 | #' defaults to 2 to provide easy visualization, but can reasonably be set to
 18 | #' any integer value in the range 2 to 100.
 19 | #' @param metric character. The metric to use to compute distances in high
 20 | #' dimensional space. If a string is passed it must match a valid predefined
 21 | #' metric. If a general metric is required a function that takes two 1d arrays
 22 | #' and returns a float can be provided. For performance purposes it is required
 23 | #' that this be a numba jit'd function. Valid string metrics include: euclidean,
 24 | #' manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis,
 25 | #' wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard,
 26 | #' dice, russelrao, kulsinski, rogerstanimoto, sokalmichener, sokalsneath, yule.
 27 | #' Metrics that take arguments (such as minkowski, mahalanobis etc.) can have
 28 | #' arguments passed via the metric_kwds dictionary. At this time care must be
 29 | #' taken and dictionary elements must be ordered appropriately; this will
 30 | #' hopefully be fixed in the future.
 31 | #' @param n_epochs integer The number of training epochs to use in optimization.
 32 | #' @param learning_rate numeric. The initial learning rate for the embedding optimization.
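 33 | #' @param alpha numeric. The initial learning rate for the embedding optimization. Passed in the first call to the Python library; `learning_rate` is used instead when that call fails on `alpha`.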
 34 | #' @param init character. How to initialize the low dimensional embedding.
 35 | #' Options are: 'spectral' (use a spectral embedding of the fuzzy 1-skeleton),
 36 | #' 'random' (assign initial embedding positions at random),
 37 | #' * A numpy array of initial embedding positions.
 38 | #' @param spread numeric. The effective scale of embedded points.
 39 | #' In combination with ``min_dist`` this determines how clustered/clumped the
 40 | #' embedded points are.
 41 | #' @param min_dist numeric.  The effective minimum distance between embedded
 42 | #' points. Smaller values will result in a more clustered/clumped embedding
 43 | #' where nearby points on the manifold are drawn closer together, while larger
 44 | #' values will result on a more even dispersal of points. The value should be
 45 | #' set relative to the ``spread`` value, which determines the scale at which
 46 | #' embedded points will be spread out.
 47 | #' @param set_op_mix_ratio numeric. Interpolate between (fuzzy) union and
 48 | #' intersection as the set operation used to combine local fuzzy simplicial
 49 | #' sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use
 50 | #' the product t-norm. The value of this parameter should be between 0.0 and
 51 | #' 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure
 52 | #' fuzzy intersection.
 53 | #' @param local_connectivity integer The local connectivity required -- i.e.
 54 | #' the number of nearest neighbors that should be assumed to be connected at a
 55 | #' local level. The higher this value the more connected the manifold becomes
 56 | #' locally. In practice, this should be not more than the local intrinsic
 57 | #' dimension of the manifold.
 58 | #' @param repulsion_strength numeric. Weighting applied to negative samples in 
 59 | #' low dimensional embedding optimization. Values higher than one will result in
 60 | #'  greater weight being given to negative samples.
 61 | #' @param bandwidth numeric. The effective bandwidth of the kernel if we view
 62 | #' the algorithm as similar to Laplacian eigenmaps. Larger values induce more
 63 | #' connectivity and a more global view of the data, smaller values concentrate
 64 | #' more locally.
 65 | #' @param gamma numeric. Weighting applied to negative samples in low
 66 | #' dimensional embedding optimization. Values higher than one will result in
 67 | #' greater weight being given to negative samples.
 68 | #' @param negative_sample_rate numeric. The number of negative edge/1-simplex
 69 | #' samples to use per positive edge/1-simplex sample in optimizing the low
 70 | #' dimensional embedding.
 71 | #' @param transform_queue_size numeric. For transform operations (embedding new points
 72 | #'  using a trained model) this will control how aggressively to search for 
 73 | #'  nearest neighbors. Larger values will result in slower performance but
 74 | #'   more accurate nearest neighbor evaluation.
 75 | #' @param a numeric. More specific parameters controlling the embedding.
 76 | #' If NULL, these values are set automatically as determined by ``min_dist``
 77 | #' and ``spread``.
 78 | #' @param b numeric. More specific parameters controlling the embedding.
 79 | #' If NULL, these values are set automatically as determined by ``min_dist``
 80 | #' and ``spread``.
 81 | #' @param random_state integer. If integer, random_state is the seed used by the
 82 | #' random number generator; If NULL, the random number generator is the
 83 | #' RandomState instance used by `np.random`.
 84 | #' @param metric_kwds reticulate dictionary. Arguments to pass on to the metric,
 85 | #' such as the ``p`` value for Minkowski distance.
 86 | #' @param angular_rp_forest logical. Whether to use an angular random projection
 87 | #' forest to initialise the approximate nearest neighbor search. This can be
 88 | #' faster, but is mostly only useful for metrics that use an angular style distance
 89 | #' such as cosine, correlation etc. In the case of those metrics angular forests
 90 | #' will be chosen automatically.
 91 | #' @param target_n_neighbors integer. The number of nearest neighbors to use to 
 92 | #' construct the target simplicial set. If set to -1 use the n_neighbors value.
 93 | #' @param target_metric character or function. The metric used to measure distance
 94 | #' for a target array when using supervised dimension reduction. By default this is
 95 | #' ‘categorical’ which will measure distance in terms of whether categories match
 96 | #' or are different. Furthermore, if semi-supervised is required target values of 
 97 | #' -1 will be treated as unlabelled under the ‘categorical’ metric. If the target
 98 | #' array takes continuous values (e.g. for a regression problem) then metric of 
 99 | #' ‘l1’ or ‘l2’ is probably more appropriate.
100 | #' @param target_metric_kwds reticulate dictionary. Keyword argument to pass to 
101 | #' the target metric when performing supervised dimension reduction. If None then
102 | #' no arguments are passed on.
103 | #' @param target_weight numeric. weighting factor between data topology and target 
104 | #' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights 
105 | #' entirely on target. The default of 0.5 balances the weighting equally between 
106 | #' data and target.
107 | #' @param transform_seed integer. Random seed used for the stochastic aspects of 
108 | #' the transform operation. This ensures consistency in transform operations.
109 | #' @param verbose logical. Controls verbosity of logging.
110 | #'
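111 | #' @return data frame with one `UMAP` column per embedding dimension (`UMAP1`, `UMAP2`, ...), preceded by the columns of `data` when `include_input` is `TRUE`.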
112 | #' @export
113 | #' @importFrom assertthat assert_that is.count is.flag
114 | #' @importFrom reticulate dict r_to_py py_module_available py_install import use_condaenv py_available
115 | #'
116 | #' @examples
117 | #' # run the examples only if the umap python module is available
118 | #' if(reticulate::py_module_available("umap")){
119 | #' 
120 | #' #import umap library (and load python module)
121 | #' 
122 | #' library("umapr")
123 | #' umap(as.matrix(iris[, 1:4]))
124 | #' umap(iris[, 1:4])
125 | #' 
126 | #' 
127 | #' }
umap <- function(data,
                 include_input = TRUE,
                 n_neighbors = 15L,
                 n_components = 2L,
                 metric = "euclidean",
                 n_epochs = NULL,
                 learning_rate = 1.0,
                 alpha = 1.0,
                 init = "spectral",
                 spread = 1.0,
                 min_dist = 0.1,
                 set_op_mix_ratio = 1.0,
                 local_connectivity = 1L,
                 repulsion_strength = 1.0,
                 bandwidth = 1.0,
                 gamma = 1.0,
                 negative_sample_rate = 5L,
                 transform_queue_size = 4.0,
                 a = NULL,
                 b = NULL,
                 random_state = NULL,
                 metric_kwds = dict(),
                 angular_rp_forest = FALSE,
                 target_n_neighbors = -1L,
                 target_metric = "categorical",
                 target_metric_kwds = dict(),
                 target_weight =  0.5,
                 transform_seed = 42L,
                 verbose = FALSE) {
  # ---- argument validation -------------------------------------------------
  assert_that(is.matrix(data) || is.data.frame(data),
              msg = "Data must be a data frame or a matrix.")
  # A matrix has a single storage type, so it is tested as a whole; only a
  # data frame needs a per-column check.
  all_numeric <- if (is.data.frame(data)) {
    all(vapply(data, is.numeric, logical(1)))
  } else {
    is.numeric(data)
  }
  if (!all_numeric) stop("All columns should be numeric.")
  assert_that(is.logical(include_input))
  assert_that(is.count(n_neighbors))
  assert_that(is.count(n_components))
  assert_that(is.character(metric), msg = "Valid string metrics include: euclidean, manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis, wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard, dice, russelrao, kulsinski, rogerstanimoto, sokalmichener, sokalsneath, yule.")
  assert_that(is.null(n_epochs) || is.count(n_epochs), msg = "n_epochs is not a count (a single positive integer)")
  assert_that(is.numeric(learning_rate))
  assert_that(is.numeric(alpha))
  # Guard type and length first so a non-scalar `init` yields the intended
  # message instead of a length error from the assertion itself.
  assert_that(
    is.character(init) && length(init) == 1L &&
      init %in% c("spectral", "random"),
    msg = "init must be one of 'spectral', 'random', or a numpy array of initial embedding positions"
  )
  assert_that(is.numeric(spread))
  assert_that(is.numeric(min_dist))
  assert_that(is.numeric(set_op_mix_ratio))
  assert_that(is.count(local_connectivity))
  assert_that(is.numeric(repulsion_strength))
  assert_that(is.numeric(bandwidth))
  assert_that(is.numeric(gamma))
  assert_that(is.count(negative_sample_rate))
  assert_that(is.numeric(transform_queue_size))
  assert_that(is.null(a) || is.numeric(a))
  assert_that(is.null(b) || is.numeric(b))
  assert_that(is.null(random_state) || is.count(random_state))
  assert_that(is_dict(metric_kwds), msg = "metric_kwds must be a Python dictionary object, you can create it using 'reticulate::dict()'")
  assert_that(is.flag(angular_rp_forest))
  assert_that(is.integer(target_n_neighbors))
  assert_that(is.character(target_metric) || is.function(target_metric))
  assert_that(is_dict(target_metric_kwds))
  assert_that(is.numeric(target_weight))
  assert_that(is.integer(transform_seed))
  assert_that(is.flag(verbose))

  # ---- locate (or try to install) the Python module ------------------------
  if (py_module_available("umap")) {
    print("umap-learn already installed")
  } else {
    # Best-effort installation; a failure surfaces at the import below.
    tryCatch(
      py_install("umap-learn", method = "auto", conda = "auto"),
      error = function(e) NULL
    )
  }

  umap_module <- tryCatch(
    import("umap"),
    error = function(e) {
      stop("Could not import the 'umap' Python module (umap-learn): ",
           conditionMessage(e), call. = FALSE)
    }
  )

  # ---- run UMAP ------------------------------------------------------------
  # is.count() accepts whole-number doubles such as 200, which would reach
  # Python as floats; optional integer arguments are cast, keeping NULL as is.
  as_py_int <- function(x) if (is.null(x)) NULL else as.integer(x)
  py_data <- r_to_py(as.matrix(data))

  # The first call uses the keywords `alpha`, `bandwidth` and `gamma`. If the
  # Python library rejects it with an error mentioning `alpha` or `bandwidth`,
  # the fallback call uses `learning_rate` and `repulsion_strength` instead,
  # omits `bandwidth`, and adds the transform/target keywords.
  umap_vec <- tryCatch(
    umap_module$UMAP(
      n_neighbors = as.integer(n_neighbors),
      n_components = as.integer(n_components),
      metric = metric,
      n_epochs = as_py_int(n_epochs),
      alpha = alpha,
      init = init,
      spread = spread,
      min_dist = min_dist,
      set_op_mix_ratio = set_op_mix_ratio,
      local_connectivity = as.integer(local_connectivity),
      bandwidth = bandwidth,
      gamma = r_to_py(gamma),
      negative_sample_rate = as.integer(negative_sample_rate),
      a = a,
      b = b,
      random_state = as_py_int(random_state),
      metric_kwds = metric_kwds,
      angular_rp_forest = angular_rp_forest,
      verbose = verbose
    )$fit_transform(py_data),
    error = function(e) {
      failure <- conditionMessage(e)
      if (!(grepl("alpha", failure) || grepl("bandwidth", failure))) {
        # Unrelated failure: re-raise the original condition.
        stop(e)
      }
      umap_module$UMAP(
        n_neighbors = r_to_py(as.integer(n_neighbors)),
        n_components = r_to_py(as.integer(n_components)),
        metric = r_to_py(metric),
        n_epochs = r_to_py(as_py_int(n_epochs)),
        learning_rate = r_to_py(as.numeric(learning_rate)),
        init = r_to_py(init),
        min_dist = r_to_py(as.numeric(min_dist)),
        spread = r_to_py(as.numeric(spread)),
        set_op_mix_ratio = r_to_py(as.numeric(set_op_mix_ratio)),
        local_connectivity = r_to_py(as.integer(local_connectivity)),
        repulsion_strength = r_to_py(as.numeric(repulsion_strength)),
        negative_sample_rate = r_to_py(as.integer(negative_sample_rate)),
        transform_queue_size = r_to_py(as.numeric(transform_queue_size)),
        a = r_to_py(a),
        b = r_to_py(b),
        random_state = r_to_py(as_py_int(random_state)),
        metric_kwds = r_to_py(metric_kwds),
        angular_rp_forest = r_to_py(angular_rp_forest),
        target_n_neighbors = as.integer(target_n_neighbors),
        target_metric = r_to_py(target_metric),
        target_metric_kwds = r_to_py(target_metric_kwds),
        target_weight =  r_to_py(target_weight),
        transform_seed = r_to_py(as.integer(transform_seed)),
        verbose = r_to_py(verbose)
      )$fit_transform(py_data)
    }
  )
  colnames(umap_vec) <- paste0("UMAP", seq_len(ncol(umap_vec)))

  # ---- assemble the output -------------------------------------------------
  # attach input data to UMAP embeddings if desired
  if (include_input) {
    output <- data.frame(cbind(data, umap_vec))
  } else {
    output <- data.frame(umap_vec)
  }

  output
}
276 | 
# TRUE when `x` carries the class "python.builtin.dict", FALSE otherwise.
is_dict <- function(x) {
  dict_class <- "python.builtin.dict"
  inherits(x, what = dict_class)
}
280 | 
# Package-level reference to the Python umap module (set in .onLoad).
# Defined with `<-` so the binding lives in the package namespace; `.onLoad`
# then updates that binding with `<<-`.
umap_module <- NULL

# Load hook. When Python is available, select the "r-reticulate" conda
# environment, try to install umap-learn if the module is missing, and store
# a delay-loaded reference to the module. Does nothing without Python.
.onLoad <- function(libname, pkgname) {
  if (!py_available()) {
    return(invisible(NULL))
  }

  use_condaenv("r-reticulate")

  if (!py_module_available("umap")) {
    # Best effort only: an error raised here would abort loading the package,
    # so a failed installation is swallowed and detected by the check below.
    tryCatch(
      py_install("umap-learn", method = "auto", conda = "auto"),
      error = function(e) NULL
    )
  }

  if (py_module_available("umap")) {
    # Superassignment updates the package-level binding defined above.
    umap_module <<- import("umap", delay_load = TRUE)
  }

  invisible(NULL)
}
310 | 
# Attach hook. Reports through startup messages whether Python and the
# umap-learn module could be found, after trying to install the module into
# the "r-reticulate" conda environment when Python is present but the module
# is not.
.onAttach <- function(libname, pkgname) {
  if (!py_available()) {
    packageStartupMessage("Warning message:
Python not installed
Please install anaconda or miniconda
https://conda.io/projects/conda/en/latest/user-guide/install/index.html")
  } else {
    use_condaenv("r-reticulate")
    if (!py_module_available("umap")) {
      # Installation errors are ignored here; the availability check below
      # decides which message is shown.
      tryCatch(
        py_install("umap-learn", method = "auto", conda = "auto"),
        error = function(e) FALSE
      )
    }
  }

  umap_ready <- py_available() && py_module_available("umap")
  if (!umap_ready) {
    packageStartupMessage("Warning message:
umap-learn module is not installed
Please run one of the following:
conda install -n r-reticulate -c conda-forge umap-learn
conda activate r-reticulate; pip install umap-learn")
  } else {
    # Import the module now; the returned handle is not kept.
    import("umap")
    packageStartupMessage("umap-learn python module loaded successfully")
  }
}
343 | 


--------------------------------------------------------------------------------
/R/umap_app.R:
--------------------------------------------------------------------------------
 1 | #' Open a shiny app to explore the data in a UMAP embedding.
 2 | #'
 3 | #' @param umap output of a call to `umap`
 4 | #'
 5 | #' @return Open an interactive shiny app to explore the data.
 6 | #' @export
 7 | run_umap_shiny <- function(umap){
 8 | 
 9 |   #umapobj <- deparse(substitute(umap_obj))
10 | 
11 |   #umap <- umap_obj$umap_table
12 |   #markers <- umap_obj
13 |   markers <- colnames(umap)[!colnames(umap) %in% c("UMAP1", "UMAP2")]
14 | 
15 |   #if(is.null(markers)){  markers <- umap_obj$markers}
16 | 
17 |   # Define UI for application that draws a histogram
18 |   ui <- shiny::fluidPage(
19 | 
20 |     # Application title
21 |     shiny::titlePanel("UMAP Explorer"),
22 | 
23 |     # Sidebar with a slider input for number of bins
24 |     shiny::sidebarLayout(
25 |       shiny::sidebarPanel(
26 |         shiny::selectInput("marker", label = "Select Variable to Color By", choices = markers, selected = markers[1])
27 |       ),
28 | 
29 |       # Show a plot of the generated distribution
30 |       shiny::mainPanel(
31 |         shiny::plotOutput("umapPlot")
32 |       )
33 |     )
34 |   )
35 | 
36 |   # Define server logic required to draw a histogram
37 |   server <- function(input, output) {
38 | 
39 |     output$umapPlot <- shiny::renderPlot({
40 | 
41 |       #umap$plot(input$marker)
42 |        out_plot <- ggplot2::ggplot(umap, ggplot2::aes_string(x = "UMAP1", y = "UMAP2", color=input$marker)) +
43 |         ggplot2::geom_point()
44 | 
45 |       #umap_obj$plot(input$marker)
46 | 
47 |       out_plot
48 |     })
49 |   }
50 | 
51 |   # Run the application
52 |   shiny::shinyApp(ui = ui, server = server)
53 | 
54 | 
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/README-NOT.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | umapr
  4 | =====
  5 | 
  6 | [![Project Status: Abandoned – Initial development has started, but there has not yet been a stable, usable release; the project has been abandoned and the author(s) do not intend on continuing development.](https://www.repostatus.org/badges/latest/abandoned.svg)](https://www.repostatus.org/#abandoned)
  7 | [![Travis-CI Build Status](https://travis-ci.org/ropenscilabs/umapr.svg?branch=master)](https://travis-ci.org/ropenscilabs/umapr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/juyeongkim/umapr?branch=master&svg=true)](https://ci.appveyor.com/project/juyeongkim/umapr) [![codecov](https://codecov.io/gh/ropenscilabs/umapr/branch/master/graph/badge.svg)](https://codecov.io/gh/ropenscilabs/umapr)
  8 | 
  9 | `umapr` wraps the Python implementation of UMAP to make the algorithm accessible from within R. It uses the great [`reticulate`](https://cran.r-project.org/web/packages/reticulate/index.html) package.
 10 | 
 11 | Uniform Manifold Approximation and Projection (UMAP) is a non-linear dimensionality reduction algorithm. It is similar to t-SNE but computationally more efficient. UMAP was created by Leland McInnes and John Healy ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)).
 12 | 
 13 | Recently, two new UMAP R packages have appeared. These new packages provide more features than `umapr` does and they are more actively developed. These packages are:
 14 | 
 15 | -   [umap](https://github.com/tkonopka/umap), which provides the same Python wrapping function as `umapr` and also an R implementation, removing the need for the Python version to be installed. It is available on [CRAN](https://cran.r-project.org/web/packages/umap/index.html).
 16 | 
 17 | -   [uwot](https://github.com/jlmelville/uwot), which also provides an R implementation, removing the need for the Python version to be installed.
 18 | 
 19 | Contributors
 20 | ------------
 21 | 
 22 | [Angela Li](https://github.com/angela-li), [Ju Kim](https://github.com/juyeongkim), [Malisa Smith](https://github.com/malisas), [Sean Hughes](https://github.com/seaaan), [Ted Laderas](https://github.com/laderast)
 23 | 
 24 | `umapr` is a project that was first developed at [rOpenSci Unconf 2018](http://unconf18.ropensci.org).
 25 | 
 26 | Installation
 27 | ------------
 28 | 
 29 | **First**, you will need to install `Python` and the `UMAP` package. Instructions are available [here](https://github.com/lmcinnes/umap#installing).
 30 | 
 31 | Then, you can install the development version from [GitHub](https://github.com/) with:
 32 | 
 33 | ``` r
 34 | # install.packages("devtools")
 35 | devtools::install_github("ropenscilabs/umapr")
 36 | ```
 37 | 
 38 | Basic use
 39 | ---------
 40 | 
 41 | Here is an example of running UMAP on the `iris` data set.
 42 | 
 43 | ``` r
 44 | library(umapr)
 45 | library(tidyverse)
 46 | 
 47 | # select only numeric columns
 48 | df <- as.matrix(iris[ , 1:4])
 49 | 
 50 | # run UMAP algorithm
 51 | embedding <- umap(df)
 52 | ```
 53 | 
 54 | `umap` returns a `data.frame` with two attached columns called "UMAP1" and "UMAP2". These columns represent the UMAP embeddings of the data, which are column-bound to the original data frame.
 55 | 
 56 | ``` r
 57 | # look at result
 58 | head(embedding)
 59 | #>   Sepal.Length Sepal.Width Petal.Length Petal.Width    UMAP1     UMAP2
 60 | #> 1          5.1         3.5          1.4         0.2 5.647059 -6.666872
 61 | #> 2          4.9         3.0          1.4         0.2 4.890193 -8.130815
 62 | #> 3          4.7         3.2          1.3         0.2 4.397037 -7.546669
 63 | #> 4          4.6         3.1          1.5         0.2 4.412886 -7.633424
 64 | #> 5          5.0         3.6          1.4         0.2 5.707233 -6.863213
 65 | #> 6          5.4         3.9          1.7         0.4 6.442851 -5.726554
 66 | 
 67 | # plot the result
 68 | embedding %>% 
 69 |   mutate(Species = iris$Species) %>%
 70 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + geom_point()
 71 | ```
 72 | 
 73 | ![](img/unnamed-chunk-3-1.png)
 74 | 
 75 | There is a function called `run_umap_shiny()` which will bring up a Shiny app for exploring the UMAP plot colored by different variables.
 76 | 
 77 | ``` r
 78 | run_umap_shiny(embedding)
 79 | ```
 80 | 
 81 | ![Shiny App for Exploring Results](img/shiny.png)
 82 | 
 83 | Function parameters
 84 | -------------------
 85 | 
 86 | There are a few important parameters. These are fully described in the UMAP Python [documentation](https://github.com/lmcinnes/umap/blob/bf1c3e5c89ea393c9de10bd66c5e3d9bc30588ee/notebooks/UMAP%20usage%20and%20parameters.ipynb).
 87 | 
 88 | The `n_neighbors` argument can range from 2 to n-1 where n is the number of rows in the data.
 89 | 
 90 | ``` r
 91 | neighbors <- c(4, 8, 16, 32, 64, 128)
 92 | 
 93 | 
 94 | 
 95 | neighbors %>% 
 96 |   map_df(~umap(as.matrix(iris[,1:4]), n_neighbors = .x) %>% 
 97 |       mutate(Species = iris$Species, Neighbor = .x)) %>% 
 98 |   mutate(Neighbor = as.integer(Neighbor)) %>% 
 99 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + 
100 |     geom_point() + 
101 |     facet_wrap(~ Neighbor, scales = "free")
102 | ```
103 | 
104 | ![](img/unnamed-chunk-5-1.png)
105 | 
106 | The `min_dist` argument can range from 0 to 1.
107 | 
108 | ``` r
109 | dists <- c(0.001, 0.01, 0.05, 0.1, 0.5, 0.99)
110 | 
111 | dists %>% 
112 |   map_df(~umap(as.matrix(iris[,1:4]), min_dist = .x) %>% 
113 |       mutate(Species = iris$Species, Distance = .x)) %>% 
114 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + 
115 |     geom_point() + 
116 |     facet_wrap(~ Distance, scales = "free")
117 | ```
118 | 
119 | ![](img/unnamed-chunk-6-1.png)
120 | 
121 | The `metric` argument can be any of many different distance functions.
122 | 
123 | ``` r
124 | dists <- c("euclidean", "manhattan", "canberra", "cosine", "hamming", "dice")
125 | 
126 | dists %>% 
127 |   map_df(~umap(as.matrix(iris[,1:4]), metric = .x) %>% 
128 |       mutate(Species = iris$Species, Metric = .x)) %>% 
129 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + 
130 |     geom_point() + 
131 |     facet_wrap(~ Metric, scales = "free")
132 | ```
133 | 
134 | ![](img/unnamed-chunk-7-1.png)
135 | 
136 | Comparison to t-SNE and principal components analysis
137 | -----------------------------------------------------
138 | 
139 | t-SNE and UMAP are both non-linear dimensionality reduction methods, in contrast to PCA. Because t-SNE is relatively slow, PCA is sometimes run first to reduce the dimensions of the data.
140 | 
141 | We compared UMAP to PCA and t-SNE alone, as well as to t-SNE run on data preprocessed with PCA. In each case, the data were subset to include only complete observations. The code to reproduce these findings is available in [`timings.R`](timings.R).
142 | 
143 | The first data set is the same iris data set used above (149 observations of 4 variables):
144 | 
145 | ![t-SNE, PCA, and UMAP on iris](img/multiple_algorithms_iris.png)
146 | 
147 | Next we tried a cancer data set, made up of 699 observations of 10 variables:
148 | 
149 | ![t-SNE, PCA, and UMAP on cancer](img/multiple_algorithms_cancer.png)
150 | 
151 | Third we tried a soybean data set. It is made up of 531 observations and 35 variables:
152 | 
153 | ![t-SNE, PCA, and UMAP on soybeans](img/multiple_algorithms_bean.png)
154 | 
155 | Finally we used a large single-cell RNA sequencing data set, with 561 observations (cells) of 55186 variables (over 30 million elements)!
156 | 
157 | ![t-SNE, PCA, and UMAP on rna](img/multiple_algorithms_rna.png)
158 | 
159 | PCA is orders of magnitude faster than t-SNE or UMAP (not shown). UMAP, though, is a substantial improvement over t-SNE both in terms of memory and time taken to run.
160 | 
161 | ![Time to run t-SNE vs UMAP](img/multiple_algorithms_time.png)
162 | 
163 | ![Memory to run t-SNE vs UMAP](img/multiple_algorithms_memory.png)
164 | 
165 | Related projects
166 | ----------------
167 | 
168 | -   [`umap`](https://github.com/tkonopka/umap): R implementation of UMAP
169 | -   [`seurat`](https://github.com/satijalab/seurat): R toolkit for single cell genomics
170 | -   [`smallvis`](https://github.com/jlmelville/smallvis): R package for dimensionality reduction of small datasets
171 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, echo = FALSE}
  8 | knitr::opts_chunk$set(
  9 |   collapse = TRUE,
 10 |   comment = "#>",
 11 |   fig.path = "img/"
 12 | )
 13 | library(bindrcpp)
 14 | ```
 15 | 
 16 | # umapr
 17 | 
 18 | [![Travis-CI Build Status](https://travis-ci.org/ropenscilabs/umapr.svg?branch=master)](https://travis-ci.org/ropenscilabs/umapr)
 19 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/juyeongkim/umapr?branch=master&svg=true)](https://ci.appveyor.com/project/juyeongkim/umapr)
 20 | [![codecov](https://codecov.io/gh/ropenscilabs/umapr/branch/master/graph/badge.svg)](https://codecov.io/gh/ropenscilabs/umapr)
 21 | 
 22 | `umapr` wraps the Python implementation of UMAP to make the algorithm accessible from within R. It uses the great [`reticulate`](https://cran.r-project.org/web/packages/reticulate/index.html) package.
 23 | 
 24 | Uniform Manifold Approximation and Projection (UMAP) is a non-linear dimensionality reduction algorithm. It is similar to t-SNE but computationally more efficient. UMAP was created by Leland McInnes and John Healy ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). 
 25 | 
 26 | Recently, two new UMAP R packages have appeared. These new packages provide more features than `umapr` does and they are more actively developed. These packages are: 
 27 | 
 28 | * [umap](https://github.com/tkonopka/umap), which provides the same Python wrapping function as `umapr` and also an R implementation, removing the need for the Python version to be installed. It is available on [CRAN](https://cran.r-project.org/web/packages/umap/index.html).
 29 | 
 30 | * [uwot](https://github.com/jlmelville/uwot), which also provides an R implementation, removing the need for the Python version to be installed.
 31 | 
 32 | ## Contributors 
 33 | 
 34 | [Angela Li](https://github.com/angela-li), [Ju Kim](https://github.com/juyeongkim), [Malisa Smith](https://github.com/malisas), [Sean Hughes](https://github.com/seaaan), [Ted Laderas](https://github.com/laderast)
 35 | 
 36 | `umapr` is a project that was first developed at [rOpenSci Unconf 2018](http://unconf18.ropensci.org).
 37 | 
 38 | ## Installation
 39 | 
 40 | **First**, you will need to install `Python` and the `UMAP` package. Instructions are available [here](https://github.com/lmcinnes/umap#installing).
 41 | 
 42 | Then, you can install the development version from [GitHub](https://github.com/) with:
 43 | 
 44 | ``` r
 45 | # install.packages("devtools")
 46 | devtools::install_github("ropenscilabs/umapr")
 47 | ```
 48 | 
 49 | ## Basic use
 50 | 
 51 | Here is an example of running UMAP on the `iris` data set. 
 52 | 
 53 | ```{r message=FALSE, warning=FALSE, fig.width=7}
 54 | library(umapr)
 55 | library(tidyverse)
 56 | 
 57 | # select only numeric columns
 58 | df <- as.matrix(iris[ , 1:4])
 59 | 
 60 | # run UMAP algorithm
 61 | embedding <- umap(df)
 62 | ```
 63 | 
 64 | `umap` returns a `data.frame` with two attached columns called "UMAP1" and "UMAP2". These columns represent the UMAP embeddings of the data, which are column-bound to the original data frame. 
 65 | 
 66 | ```{r}
 67 | # look at result
 68 | head(embedding)
 69 | 
 70 | # plot the result
 71 | embedding %>% 
 72 |   mutate(Species = iris$Species) %>%
 73 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + geom_point()
 74 | ```
 75 | 
 76 | There is a function called `run_umap_shiny()` which will bring up a Shiny app for exploring the UMAP plot colored by different variables.
 77 | 
 78 | ```{r eval=FALSE}
 79 | run_umap_shiny(embedding)
 80 | ```
 81 | 
 82 | ![Shiny App for Exploring Results](img/shiny.png)
 83 | 
 84 | ## Function parameters
 85 | 
 86 | There are a few important parameters. These are fully described in the UMAP Python [documentation](https://github.com/lmcinnes/umap/blob/bf1c3e5c89ea393c9de10bd66c5e3d9bc30588ee/notebooks/UMAP%20usage%20and%20parameters.ipynb). 
 87 | 
 88 | The `n_neighbors` argument can range from 2 to n-1 where n is the number of rows in the data.
 89 |  
 90 | ```{r fig.width=7}
 91 | neighbors <- c(4, 8, 16, 32, 64, 128)
 92 | 
 93 | 
 94 | 
 95 | neighbors %>% 
 96 |   map_df(~umap(as.matrix(iris[,1:4]), n_neighbors = .x) %>% 
 97 |       mutate(Species = iris$Species, Neighbor = .x)) %>% 
 98 |   mutate(Neighbor = as.integer(Neighbor)) %>% 
 99 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + 
100 |     geom_point() + 
101 |     facet_wrap(~ Neighbor, scales = "free")
102 | ```
103 | 
104 | The `min_dist` argument can range from 0 to 1. 
105 | 
106 | ```{r fig.width=7}
107 | dists <- c(0.001, 0.01, 0.05, 0.1, 0.5, 0.99)
108 | 
109 | dists %>% 
110 |   map_df(~umap(as.matrix(iris[,1:4]), min_dist = .x) %>% 
111 |       mutate(Species = iris$Species, Distance = .x)) %>% 
112 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + 
113 |     geom_point() + 
114 |     facet_wrap(~ Distance, scales = "free")
115 | ```
116 | 
117 | The `metric` argument can be any of many different distance functions.
118 | 
119 | ```{r fig.width=7}
120 | dists <- c("euclidean", "manhattan", "canberra", "cosine", "hamming", "dice")
121 | 
122 | dists %>% 
123 |   map_df(~umap(as.matrix(iris[,1:4]), metric = .x) %>% 
124 |       mutate(Species = iris$Species, Metric = .x)) %>% 
125 |   ggplot(aes(UMAP1, UMAP2, color = Species)) + 
126 |     geom_point() + 
127 |     facet_wrap(~ Metric, scales = "free")
128 | ```
129 | 
130 | ## Comparison to t-SNE and principal components analysis
131 | 
132 | t-SNE and UMAP are both non-linear dimensionality reduction methods, in contrast to PCA. Because t-SNE is relatively slow, PCA is sometimes run first to reduce the dimensions of the data. 
133 | 
134 | We compared UMAP to PCA and t-SNE alone, as well as to t-SNE run on data preprocessed with PCA. In each case, the data were subset to include only complete observations. The code to reproduce these findings is available in [`timings.R`](timings.R).
135 | 
136 | The first data set is the same iris data set used above (149 observations of 4 variables): 
137 | 
138 | ![t-SNE, PCA, and UMAP on iris](img/multiple_algorithms_iris.png)
139 | 
140 | Next we tried a cancer data set, made up of 699 observations of 10 variables: 
141 | 
142 | ![t-SNE, PCA, and UMAP on cancer](img/multiple_algorithms_cancer.png)
143 | 
144 | Third we tried a soybean data set. It is made up of 531 observations and 35 variables: 
145 | 
146 | ![t-SNE, PCA, and UMAP on soybeans](img/multiple_algorithms_bean.png)
147 | 
148 | Finally we used a large single-cell RNA sequencing data set, with 561 observations (cells) of 55186 variables (over 30 million elements)!
149 | 
150 | ![t-SNE, PCA, and UMAP on rna](img/multiple_algorithms_rna.png)
151 | 
152 | PCA is orders of magnitude faster than t-SNE or UMAP (not shown). UMAP, though, is a substantial improvement over t-SNE both in terms of memory and time taken to run. 
153 | 
154 | ![Time to run t-SNE vs UMAP](img/multiple_algorithms_time.png)
155 | 
156 | ![Memory to run t-SNE vs UMAP](img/multiple_algorithms_memory.png)
157 | 
158 | ## Related projects
159 | 
160 | * [`umap`](https://github.com/tkonopka/umap): R implementation of UMAP
161 | * [`seurat`](https://github.com/satijalab/seurat): R toolkit for single cell genomics
162 | * [`smallvis`](https://github.com/jlmelville/smallvis): R package for dimensionality reduction of small datasets
163 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # umapr
2 | 
3 | [![Project Status: Abandoned](https://www.repostatus.org/badges/latest/abandoned.svg)](https://www.repostatus.org/#abandoned)
4 | 
5 | This repository has been archived. The former README is now in [README-NOT.md](README-NOT.md).
6 | 


--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
  1 | # DO NOT CHANGE the "init" and "install" sections below
  2 | 
  3 | image:
  4 | #- Visual Studio 2015
  5 | - Visual Studio 2017
  6 | #- Ubuntu
  7 | 
  8 | environment:
  9 |   global:
 10 |     PATH: C:\msys64\usr\bin;C:\msys64\mingw64\bin;C:\Windows;%PATH%
 11 |     #;C:\Windows\System32
 12 |     R_CHECK_ARGS: "--ignore-vignettes --no-examples --no-build-vignettes --no-manual"
 13 |     R_BUILD_ARGS: '--no-build-vignettes'
 14 |     NOT_CRAN: true
 15 |     USE_RTOOLS: true
 16 |     _R_CHECK_FORCE_SUGGESTS_: false
 17 |   matrix:
 18 |     - PYTHON: "C:\\Python36"
 19 |       RETICULATE_PYTHON: "C:\\Python36"
 20 |       MINICONDA: C:\Miniconda36-x64
 21 |       CONDA_INSTALL_LOCN: C:\Miniconda36-x64
 22 |       DISTUTILS_USE_SDK: "1"
 23 |       R_VERSION: devel
 24 |       R_ARCH: x64
 25 |       platform: x64
 26 |       PYTHON_ARCH: "64"
 27 |       GCC_PATH: mingw_64
 28 |     - PYTHON: "C:\\Python37"
 29 |       RETICULATE_PYTHON: "C:\\Python37"
 30 |       MINICONDA: C:\Miniconda37-x64
 31 |       CONDA_INSTALL_LOCN: C:\Miniconda37-x64
 32 |       DISTUTILS_USE_SDK: "1"
 33 |       R_VERSION: devel
 34 |       R_ARCH: x64
 35 |       platform: x64
 36 |       PYTHON_ARCH: "64"
 37 |       GCC_PATH: mingw_64
 38 |     - PYTHON: "C:\\Python35"
 39 |       RETICULATE_PYTHON: "C:\\Python35"
 40 |       MINICONDA: C:\Miniconda35-x64
 41 |       CONDA_INSTALL_LOCN: C:\Miniconda35-x64
 42 |       DISTUTILS_USE_SDK: "1"
 43 |       R_VERSION: devel
 44 |       R_ARCH: x64
 45 |       platform: x64
 46 |       PYTHON_ARCH: "64"
 47 |       GCC_PATH: mingw_64
 48 |     - PYTHON: "C:\\Python37"
 49 |       RETICULATE_PYTHON: "C:\\Python37"
 50 |       MINICONDA: C:\Miniconda37-x64
 51 |       CONDA_INSTALL_LOCN: C:\Miniconda37-x64
 52 |       DISTUTILS_USE_SDK: "1"
 53 |       R_VERSION: release
 54 |       R_ARCH: x64
 55 |       platform: x64
 56 |       PYTHON_ARCH: "64"
 57 |     - PYTHON: "C:\\Python36"
 58 |       RETICULATE_PYTHON: "C:\\Python36"
 59 |       MINICONDA: C:\Miniconda36-x64
 60 |       CONDA_INSTALL_LOCN: C:\Miniconda36-x64
 61 |       DISTUTILS_USE_SDK: "1"
 62 |       R_VERSION: release
 63 |       R_ARCH: x64
 64 |       platform: x64
 65 |       PYTHON_ARCH: "64"
 66 |     - PYTHON: "C:\\Python35"
 67 |       RETICULATE_PYTHON: "C:\\Python35"
 68 |       MINICONDA: C:\Miniconda35-x64
 69 |       CONDA_INSTALL_LOCN: C:\Miniconda35-x64
 70 |       DISTUTILS_USE_SDK: "1"
 71 |       R_VERSION: release
 72 |       R_ARCH: x64
 73 |       platform: x64
 74 |       PYTHON_ARCH: "64"
 75 |     - PYTHON: "C:\\Python37"
 76 |       RETICULATE_PYTHON: "C:\\Python37"
 77 |       MINICONDA: C:\Miniconda37-x64
 78 |       CONDA_INSTALL_LOCN: C:\Miniconda37-x64
 79 |       DISTUTILS_USE_SDK: "1"
 80 |       R_VERSION: stable
 81 |       R_ARCH: x64
 82 |       platform: x64
 83 |       PYTHON_ARCH: "64"
 84 |     - PYTHON: "C:\\Python36"
 85 |       RETICULATE_PYTHON: "C:\\Python36"
 86 |       MINICONDA: C:\Miniconda36-x64
 87 |       CONDA_INSTALL_LOCN: C:\Miniconda36-x64
 88 |       DISTUTILS_USE_SDK: "1"
 89 |       R_VERSION: stable
 90 |       R_ARCH: x64
 91 |       platform: x64
 92 |       PYTHON_ARCH: "64"
 93 |     - PYTHON: "C:\\Python35"
 94 |       RETICULATE_PYTHON: "C:\\Python35"
 95 |       MINICONDA: C:\Miniconda35-x64
 96 |       CONDA_INSTALL_LOCN: C:\Miniconda35-x64
 97 |       DISTUTILS_USE_SDK: "1"
 98 |       R_VERSION: stable
 99 |       R_ARCH: x64
100 |       platform: x64
101 |       PYTHON_ARCH: "64"
102 | 
103 | 
104 | matrix:
105 |   fast_finish: true
106 |   exclude:
107 |     - platform: x64
108 |       PYTHON_ARCH: "32"
109 |     - platform: x86
110 |       PYTHON_ARCH: "64"
111 | 
112 | # Download script file from GitHub
113 | init:
114 |   - ps: |
115 |         $ErrorActionPreference = "Stop"
116 |         Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1"
117 |         Import-Module '..\appveyor-tool.ps1'
118 |   - cmd: "ECHO %PYTHON_VERSION% %CONDA_INSTALL_LOCN%"
119 |   - cmd: "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%MINICONDA%\\Library\\bin;%PATH%"
120 | 
121 | install:
122 |   # If there is a newer build queued for the same PR, cancel this one.
123 |   # The AppVeyor 'rollout builds' option is supposed to serve the same
124 |   # purpose but it is problematic because it tends to cancel builds pushed
125 |   # directly to master instead of just PR builds (or the converse).
126 |   # credits: JuliaLang developers.
127 |   - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
128 |         https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | `
129 |         Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { `
130 |           throw "There are newer queued builds for this pull request, failing early." }
131 |   - ECHO "Filesystem root:"
132 |   - ps: "ls \"C:/\""
133 | 
134 |   #- ECHO "Installed SDKs:"
135 |   #- call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
136 |   #- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
137 | 
138 |   #- travis-tool.sh install_github igraph/igraph
139 |   #- travis-tool.sh install_github igraph/python-igraph
140 |   # Install Python (from the official .msi of https://python.org) and pip when
141 |   # not already installed.
142 |   #- ps: if (-not(Test-Path($env:PYTHON))) { & appveyor\install.ps1 }
143 | 
144 |   # Prepend newly installed Python to the PATH of this build (this cannot be
145 |   # done from inside the powershell script as it would require to restart
146 |   # the parent CMD process).
147 |   #- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
148 | 
149 |   # Check that we have the expected version and architecture for Python
150 |   - "python --version"
151 |   #- "python -c \"import struct; print(struct.calcsize('P') * 8)\""
152 | 
153 |   # setup conda environment for building
154 |   #- cmd: set "PATH=%CONDA_INSTALL_LOCN%;%CONDA_INSTALL_LOCN%\scripts;%PATH%"
155 |   #- cmd: set PYTHONUNBUFFERED=1
156 | 
157 |   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
158 |   # Check that we have the expected version and architecture for Python
159 |   - "python -c \"import sys; print(sys.version)\""
160 |   # Install the build and runtime dependencies of the project.
161 |   - "conda update -q --yes conda"
162 |   # Install constructor, taking into account which VC version we target later in the build.
163 |   - "conda install -q --yes 'constructor>=2.0'"
164 |   # list package versions
165 |   - "conda list"
166 |   # build installer
167 |   #- "constructor --verbose --platform=%CONDA_PLATFORM% %OBSPY_VERSION%_%CONDA_PYSUFFIX%"
168 | 
169 |     # update msys2
170 |   #- C:\msys64\usr\bin\bash -lc "pacman --needed --noconfirm -Sy pacman-mirrors"
171 |   #- C:\msys64\usr\bin\bash -lc "pacman --noconfirm -Sy"
172 |   #- C:\msys64\usr\bin\bash -lc "pacman --noconfirm -S autoconf automake bison flex"
173 |   #- C:\msys64\usr\bin\bash -lc "pacman --noconfirm -S libxml2-devel zip"
174 | 
175 |   - conda info --envs
176 |   #- conda create -n env_name -y
177 |   #- conda update -y -n base -c defaults conda
178 |   #- "conda.bat activate"
179 |   #- sudo conda init bash
180 |   #- bash
181 |   #- conda activate base
182 |   #- echo ". C:\Miniconda37/etc/profile.d/conda.sh" >> ~/.bashrc
183 |   #- conda activate env_name
184 |   - conda config --set always_yes yes --set changeps1 no
185 |   - conda config --add channels conda-forge
186 |   - conda config --add channels vtraag
187 |   - conda install -y -q conda pip
188 |   #- conda update -q conda pip
189 |   #- conda install -y -q numpy
190 |   #- conda install -y -q -c r r-igraph
191 |   #- conda install -y -q -c conda-forge libcxx
192 |   #- conda install -y -q -c anaconda git
193 | 
194 |   #- git clone -q https://github.com/conda-forge/igraph-feedstock.git C:\projects\igraph-feedstock
195 |   #- git fetch -q origin +refs/pull/7/merge:
196 |   #- git checkout -qf HEAD
197 |   #- powershell -Command "(New-Object Net.WebClient).DownloadFile('https://raw.githubusercontent.com/conda-forge/conda-forge-build-setup-feedstock/master/recipe/ff_ci_pr_build.py', 'ff_ci_pr_build.py')"
198 |   #- ff_ci_pr_build -v --ci "appveyor" "%APPVEYOR_ACCOUNT_NAME%/%APPVEYOR_PROJECT_SLUG%" "%APPVEYOR_BUILD_NUMBER%" "%APPVEYOR_PULL_REQUEST_NUMBER%"
199 |   #- del ff_ci_pr_build.py
200 |   #- rmdir C:\cygwin /s /q
201 |   #- call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
202 |   #- conda.exe update --yes --quiet conda
203 |   #- set PYTHONUNBUFFERED=1
204 |   #- conda.exe config --set show_channel_urls true
205 |   #- conda.exe config --remove channels defaults
206 |   #- conda.exe config --add channels defaults
207 |   #- conda.exe config --add channels conda-forge
208 |   #- conda.exe install -n env_name --quiet --yes conda-forge-build-setup
209 |   #- run_conda_forge_build_setup
210 |   #- conda.exe build recipe --quiet
211 |   #- conda install -n env_name -y q igraph
212 |   - conda install -y numpy scipy
213 |   - conda install -y python-igraph
214 |   - conda install -y -c conda-forge umap-learn
215 |   #- conda install pip numpy scipy
216 |   #- "python -m pip install --upgrade pip"
217 |   #- "pip install -q pycairo"
218 |   #- pip install wheel
219 |   #- pip install python-igraph==0.7.1.post6
220 |   #- "pip install git+git://github.com/igraph/python-igraph.git"
221 |   #- "pip install -q leidenalg"
222 |   #- "echo done"
223 | 
224 |   # Upgrade to the latest version of pip to avoid it displaying warnings
225 |   # about it being out of date.
226 |   #- "python -m pip install --upgrade pip"
227 | 
228 |   # Install the build dependencies of the project. If some dependencies contain
229 |   # compiled extensions and are not provided as pre-built wheel packages,
230 |   # pip will build them from source using the MSVC compiler matching the
231 |   # target Python version and architecture
232 |   #- "%CMD_IN_ENV% pip install leidenalg python-igraph numpy"
233 | 
234 | 
235 | #install:
236 |   #install python libraries
237 |   #- "pip install --upgrade pip"
238 |   #- "pip install leidenalg"
239 |   - ps: Bootstrap
240 | 
241 | cache:
242 |   - C:\RLibrary
243 | 
244 | # Adapt as necessary starting from here
245 | 
246 | build_script:
247 |   - R -e 'install.packages("igraph")'
248 |   - R -e 'install.packages("RColorBrewer")'
249 |   - R -e 'install.packages("rmarkdown")'
250 |   - R -e 'install.packages("knitr")'
251 |   - travis-tool.sh install_deps
252 |   #- "pip install -q graphviz" #includes python-igraph
253 |   #- "pip install -q leidenalg"
254 | 
255 | 
256 | test_script:
257 |   - travis-tool.sh run_tests
258 | 
259 | on_failure:
260 |   - 7z a failure.zip *.Rcheck\*
261 |   - appveyor PushArtifact failure.zip
262 | 
263 | artifacts:
264 |   - path: '*.Rcheck\**\*.log'
265 |     name: Logs
266 | 
267 |   - path: '*.Rcheck\**\*.out'
268 |     name: Logs
269 | 
270 |   - path: '*.Rcheck\**\*.fail'
271 |     name: Logs
272 | 
273 |   - path: '*.Rcheck\**\*.Rout'
274 |     name: Logs
275 | 
276 |   - path: '\*_*.tar.gz'
277 |     name: Bits
278 | 
279 |   - path: '\*_*.zip'
280 |     name: Bits
281 | 


--------------------------------------------------------------------------------
/img/multiple_algorithms_bean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_bean.png


--------------------------------------------------------------------------------
/img/multiple_algorithms_cancer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_cancer.png


--------------------------------------------------------------------------------
/img/multiple_algorithms_iris.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_iris.png


--------------------------------------------------------------------------------
/img/multiple_algorithms_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_memory.png


--------------------------------------------------------------------------------
/img/multiple_algorithms_rna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_rna.png


--------------------------------------------------------------------------------
/img/multiple_algorithms_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/multiple_algorithms_time.png


--------------------------------------------------------------------------------
/img/shiny.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/shiny.png


--------------------------------------------------------------------------------
/img/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/img/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/img/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/img/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/img/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/inst/ropensci_blog/img/multiple_algorithms_cancer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/multiple_algorithms_cancer.png


--------------------------------------------------------------------------------
/inst/ropensci_blog/img/multiple_algorithms_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/multiple_algorithms_memory.png


--------------------------------------------------------------------------------
/inst/ropensci_blog/img/multiple_algorithms_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/multiple_algorithms_time.png


--------------------------------------------------------------------------------
/inst/ropensci_blog/img/shiny.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ropensci-archive/umapr/b3246f13e542d155a9616b842f4995e0c79a785a/inst/ropensci_blog/img/shiny.png


--------------------------------------------------------------------------------
/inst/ropensci_blog/working-on-the-umapr-package.md:
--------------------------------------------------------------------------------
 1 | Working on the `umapr` package
 2 | ================
 3 | Sean Hughes, Ju Kim, Malisa Smith, Angela Li, and Ted Laderas
 4 | 
 5 | ![Comparing UMAP to other algorithms](img/multiple_algorithms_cancer.png)
 6 | 
 7 | Motivation
 8 | ----------
 9 | 
10 | > Note: At the time of the unconference, we were unaware that a similar package called `umap` existed and has implemented the algorithm in R. It's in the process of being submitted to CRAN. We don't want to steal their thunder.
11 | 
12 | A few weeks ago, as part of the [rOpenSci Unconference](http://unconf18.ropensci.org), a group of us (Sean Hughes, Malisa Smith, Angela Li, Ju Kim, and Ted Laderas) decided to work on making the UMAP algorithm accessible within R. UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique that allows the user to reduce high dimensional data (multiple columns) into a smaller number of columns for visualization purposes ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). It is similar to both Principal Components Analysis (PCA) and t-SNE, which are techniques often used in the single-cell omics world to visualize high dimensional data. t-SNE is actually quite a slow algorithm; one of the advantages of UMAP is that it runs faster than t-SNE. Because the `data.frames` that are typically run with these algorithms can run into millions of rows, efficiency is important.
13 | 
14 | A few weeks ago, as part of the [rOpenSci Unconference](http://unconf18.ropensci.org), a group of us (Sean Hughes, Malisa Smith, Angela Li, Ju Kim, and Ted Laderas) decided to work on making the UMAP algorithm accessible within R. We had been introduced to each other before the unconference, and it turns out that we all work with flow cytometry data and that it would be fun to work on a project together. UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique that allows the user to reduce high dimensional data (multiple columns) into a smaller number of columns for visualization purposes ([github](https://github.com/lmcinnes/umap), [arxiv](https://arxiv.org/abs/1802.03426)). It is related to both Principal Component Analysis (PCA) and t-SNE, which are techniques often used to visualize high dimensional data, such as single cell sequencing or expression data.
15 | 
16 | t-SNE is actually quite a slow algorithm; one of the advantages of UMAP is that it actually runs faster than t-SNE. Because the `data.frames` that are typically run with these algorithms can run into millions of rows, efficiency is important.
17 | 
18 | We decided to start working on the `umapr` package to make this technique accessible within R.  As with most rOpenSci Unconf projects, this started with an [issue entry in the rOpenSci unconf repo](https://github.com/ropensci/unconf18/issues/43):
19 | 
20 | > I recently read about a new non-linear dimensionality reduction algorithm called UMAP, which is much faster than t-SNE, while producing two-dimensional visualizations that share many characteristics with t-SNE. I initially found out about it in the context of use on high-dimensional single-cell data in this paper.
21 | 
22 | > ....
23 | 
24 | > My thought is that the ideal would be a package focused on UMAP specifically, implemented in R or Rcpp. Unfortunately I am not at all an expert in this topic or familiar with the mathematics involved, so the best I would be able to do is try to translate the Python implementation into R.
25 | 
26 | We all met at the unconference the first day and decided that this was a project worth working on. Since t-SNE is so widely used in the single-cell and flow cytometry communities, we thought that having an alternative that was just as good, but faster to run, would be helpful.
27 | 
28 | Making a Development Plan
29 | -------------------------
30 | 
31 | Rather than recreate the UMAP code completely from scratch in R, we decided to use the `reticulate` package to call the implementation in Python from R. It was tempting to just wrap the function's arguments with `...` and let the user refer to the python documentation. However, we didn't really think that was in the spirit of the unconf. We wanted to make UMAP much more accessible.
32 | 
33 | Learning about Package Building, Testing, and Documentation
34 | -----------------------------------------------------------
35 | 
36 | Although our package only really has one main function (`umap()`), we felt it was important to have good documentation and unit tests. We spent some time learning about `roxygen` for function documentation and `testthat` for unit testing, and setting up our package with Travis-CI for continuous integration testing. This included unit tests on each argument and including examples varying the essential parameters.
37 | 
38 | We spent a lot of time learning more about the specifics of package building and vignette building in R. We were definitely excited by all of the available tools and built a vignette profiling the performance of the UMAP algorithm versus other dimensionality reduction techniques, such as t-SNE.
39 | 
40 | Profiling `umapr` using different datasets
41 | ------------------------------------------
42 | 
43 | ![Execution time of UMAP compared to other algorithms](img/multiple_algorithms_time.png)
44 | ![Memory usage of UMAP compared to other algorithms](img/multiple_algorithms_memory.png)
45 | 
46 | Part of the appeal of UMAP is that it is faster than t-SNE. So we profiled the performance of UMAP on a number of different datasets: `iris` (of course!), the [`BreastCancer` dataset from the `mlbench` package](https://cran.r-project.org/web/packages/mlbench/index.html), a [`Soybean` dataset from `mlbench`](https://cran.r-project.org/web/packages/mlbench/index.html), and finally, a [single cell RNA dataset](https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/). You can [see our results in our readme file](https://github.com/ropenscilabs/umapr/blob/master/README.md).
47 | 
48 | Thankfully, UMAP does run faster than t-SNE on these datasets, showing a 66% reduction in execution time compared to both versions of t-SNE for the `Soybean` dataset, and reduced memory usage for all of the datasets except for the single cell RNA dataset (see the figures above).
49 | 
50 | Exploring the Results with Shiny
51 | --------------------------------
52 | 
53 | ![Shiny App](img/shiny.png)
54 | 
55 | We built a small Shiny app that lets people explore their embedding vectors (the dimensionally reduced vectors) and how they separate the data into different groupings in the 2D space. The app is simple, but allows users to immediately assess the results of the UMAP algorithm in differentiating groupings in the data by coloring the `umap` result by the different variables included in the analysis.
56 | 
57 | Final Results: Get `umapr`
58 | --------------------------
59 | 
60 | `umapr` is currently available in the `ropenscilabs` organization, and can be installed with the following commands, [after the python modules are installed](https://github.com/lmcinnes/umap#installing).
61 | 
62 | ```
63 | install.packages("devtools") 
64 | devtools::install_github("ropenscilabs/umapr")
65 | ```
66 | 
67 | As a group, we learned a lot by building the `umapr` package. More importantly, I think we'll work together on future projects. It was great to work together, and we are talking about having a hackathon between our multiple groups to improve some current open source flow cytometry tools. This was a really fun project and we're excited to do more!
68 | 


--------------------------------------------------------------------------------
/man/make_umap_object.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/classes.R
 3 | \name{make_umap_object}
 4 | \alias{make_umap_object}
 5 | \title{Title}
 6 | \usage{
 7 | make_umap_object(umap_result, annotation = NULL)
 8 | }
 9 | \arguments{
10 | \item{umap_result}{- output of running \code{umap()}}
11 | 
12 | \item{annotation}{- optional annotation file}
13 | }
14 | \value{
15 | - a umap object that includes plotting
16 | }
17 | \description{
18 | Title
19 | }
20 | \examples{
21 | \dontrun{
22 | library(flowCore)
23 | 
24 | umap_table <- umap()
25 | }
26 | }
27 | 


--------------------------------------------------------------------------------
/man/run_umap_shiny.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/umap_app.R
 3 | \name{run_umap_shiny}
 4 | \alias{run_umap_shiny}
 5 | \title{Open a shiny app to explore the data in a UMAP embedding.}
 6 | \usage{
 7 | run_umap_shiny(umap)
 8 | }
 9 | \arguments{
10 | \item{umap}{output of a call to `umap`}
11 | }
12 | \value{
13 | Open an interactive shiny app to explore the data.
14 | }
15 | \description{
16 | Open a shiny app to explore the data in a UMAP embedding.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/umap.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/umap.R
  3 | \name{umap}
  4 | \alias{umap}
  5 | \title{umap}
  6 | \usage{
  7 | umap(data, include_input = TRUE, n_neighbors = 15L,
  8 |   n_components = 2L, metric = "euclidean", n_epochs = NULL,
  9 |   learning_rate = 1, alpha = 1, init = "spectral", spread = 1,
 10 |   min_dist = 0.1, set_op_mix_ratio = 1, local_connectivity = 1L,
 11 |   repulsion_strength = 1, bandwidth = 1, gamma = 1,
 12 |   negative_sample_rate = 5L, transform_queue_size = 4, a = NULL,
 13 |   b = NULL, random_state = NULL, metric_kwds = dict(),
 14 |   angular_rp_forest = FALSE, target_n_neighbors = -1L,
 15 |   target_metric = "categorical", target_metric_kwds = dict(),
 16 |   target_weight = 0.5, transform_seed = 42L, verbose = FALSE)
 17 | }
 18 | \arguments{
 19 | \item{data}{data frame or matrix. input data.}
 20 | 
 21 | \item{include_input}{logical. Attach input data to UMAP embeddings if desired.}
 22 | 
 23 | \item{n_neighbors}{integer. The size of local neighborhood
 24 | (in terms of number of neighboring sample points) used for manifold
 25 | approximation. Larger values result in more global views of the manifold,
 26 | while smaller values result in more local data being preserved. In general
 27 | values should be in the range 2 to 100.}
 28 | 
 29 | \item{n_components}{integer The dimension of the space to embed into. This
 30 | defaults to 2 to provide easy visualization, but can reasonably be set to
 31 | any integer value in the range 2 to 100.}
 32 | 
 33 | \item{metric}{character. The metric to use to compute distances in high
 34 | dimensional space. If a string is passed it must match a valid predefined
 35 | metric. If a general metric is required a function that takes two 1d arrays
 36 | and returns a float can be provided. For performance purposes it is required
 37 | that this be a numba jit'd function. Valid string metrics include: euclidean,
 38 | manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis,
 39 | wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard,
 40 | dice, russelrao, kulsinski, rogerstanimoto, sokalmichener, sokalsneath, yule.
 41 | Metrics that take arguments (such as minkowski, mahalanobis etc.) can have
 42 | arguments passed via the metric_kwds dictionary. At this time care must be
 43 | taken and dictionary elements must be ordered appropriately; this will
 44 | hopefully be fixed in the future.}
 45 | 
 46 | \item{n_epochs}{integer The number of training epochs to use in optimization.}
 47 | 
 48 | \item{learning_rate}{numeric. The initial learning rate for the embedding optimization.}
 49 | 
 50 | \item{alpha}{numeric. The initial learning rate for the embedding optimization.}
 51 | 
 52 | \item{init}{character. How to initialize the low dimensional embedding.
 53 | Options are: 'spectral' (use a spectral embedding of the fuzzy 1-skeleton),
 54 | 'random' (assign initial embedding positions at random),
 55 | or a numpy array of initial embedding positions.}
 56 | 
 57 | \item{spread}{numeric. The effective scale of embedded points.
 58 | In combination with ``min_dist`` this determines how clustered/clumped the
 59 | embedded points are.}
 60 | 
 61 | \item{min_dist}{numeric.  The effective minimum distance between embedded
 62 | points. Smaller values will result in a more clustered/clumped embedding
 63 | where nearby points on the manifold are drawn closer together, while larger
 64 | values will result in a more even dispersal of points. The value should be
 65 | set relative to the ``spread`` value, which determines the scale at which
 66 | embedded points will be spread out.}
 67 | 
 68 | \item{set_op_mix_ratio}{numeric. Interpolate between (fuzzy) union and
 69 | intersection as the set operation used to combine local fuzzy simplicial
 70 | sets to obtain a global fuzzy simplicial set. Both fuzzy set operations use
 71 | the product t-norm. The value of this parameter should be between 0.0 and
 72 | 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure
 73 | fuzzy intersection.}
 74 | 
 75 | \item{local_connectivity}{integer The local connectivity required -- i.e.
 76 | the number of nearest neighbors that should be assumed to be connected at a
 77 | local level. The higher this value the more connected the manifold becomes
 78 | locally. In practice, this should be not more than the local intrinsic
 79 | dimension of the manifold.}
 80 | 
 81 | \item{repulsion_strength}{numeric. Weighting applied to negative samples in 
 82 | low dimensional embedding optimization. Values higher than one will result in
 83 |  greater weight being given to negative samples.}
 84 | 
 85 | \item{bandwidth}{numeric. The effective bandwidth of the kernel if we view
 86 | the algorithm as similar to Laplacian eigenmaps. Larger values induce more
 87 | connectivity and a more global view of the data, smaller values concentrate
 88 | more locally.}
 89 | 
 90 | \item{gamma}{numeric. Weighting applied to negative samples in low
 91 | dimensional embedding optimization. Values higher than one will result in
 92 | greater weight being given to negative samples.}
 93 | 
 94 | \item{negative_sample_rate}{integer. The number of negative edge/1-simplex
 95 | samples to use per positive edge/1-simplex sample in optimizing the low
 96 | dimensional embedding.}
 97 | 
 98 | \item{transform_queue_size}{numeric. For transform operations (embedding new points
 99 | using a trained model) this will control how aggressively to search for
100 | nearest neighbors. Larger values will result in slower performance but
101 |  more accurate nearest neighbor evaluation.}
102 | 
103 | \item{a}{numeric. More specific parameters controlling the embedding.
104 | If NULL, these values are set automatically as determined by ``min_dist``
105 | and ``spread``.}
106 | 
107 | \item{b}{numeric. More specific parameters controlling the embedding.
108 | If NULL, these values are set automatically as determined by ``min_dist``
109 | and ``spread``.}
110 | 
111 | \item{random_state}{integer. If integer, random_state is the seed used by the
112 | random number generator; If NULL, the random number generator is the
113 | RandomState instance used by `np.random`.}
114 | 
115 | \item{metric_kwds}{reticulate dictionary. Arguments to pass on to the metric,
116 | such as the ``p`` value for Minkowski distance.}
117 | 
118 | \item{angular_rp_forest}{logical. Whether to use an angular random projection
119 | forest to initialise the approximate nearest neighbor search. This can be
120 | faster, but is mostly only useful for metrics that use an angular style distance
121 | such as cosine, correlation etc. In the case of those metrics angular forests
122 | will be chosen automatically.}
123 | 
124 | \item{target_n_neighbors}{integer. The number of nearest neighbors to use to 
125 | construct the target simplicial set. If set to -1 use the n_neighbors value.}
126 | 
127 | \item{target_metric}{character or function. The metric used to measure distance
128 | for a target array when using supervised dimension reduction. By default this is
129 | ‘categorical’ which will measure distance in terms of whether categories match
130 | or are different. Furthermore, if semi-supervised is required target values of 
131 | -1 will be treated as unlabelled under the ‘categorical’ metric. If the target
132 | array takes continuous values (e.g. for a regression problem) then metric of 
133 | ‘l1’ or ‘l2’ is probably more appropriate.}
134 | 
135 | \item{target_metric_kwds}{reticulate dictionary. Keyword argument to pass to 
136 | the target metric when performing supervised dimension reduction. If None then
137 | no arguments are passed on.}
138 | 
139 | \item{target_weight}{numeric. weighting factor between data topology and target 
140 | topology. A value of 0.0 weights entirely on data, a value of 1.0 weights 
141 | entirely on target. The default of 0.5 balances the weighting equally between 
142 | data and target.}
143 | 
144 | \item{transform_seed}{integer. Random seed used for the stochastic aspects of 
145 | the transform operation. This ensures consistency in transform operations.}
146 | 
147 | \item{verbose}{logical. Controls verbosity of logging.}
148 | }
149 | \value{
150 | matrix
151 | }
152 | \description{
153 | Provides an interface to the UMAP algorithm implemented in Python.
154 | }
155 | \examples{
156 | #import umap library (and load python module)
157 | library("umapr")
158 | umap(as.matrix(iris[, 1:4]))
159 | umap(iris[, 1:4])
160 | }
161 | \references{
162 | Leland McInnes and John Healy (2018). UMAP: Uniform Manifold
163 | Approximation and Projection for Dimension Reduction.
164 | ArXiv e-prints 1802.03426.
165 | }
166 | 


--------------------------------------------------------------------------------
/tests/shinyTest/test_umap_shiny.R:
--------------------------------------------------------------------------------
 1 | # Manual smoke test for the umapr shiny explorer. It needs a Python
 2 | # installation that provides the 'umap' and 'sklearn.datasets' modules, and
 3 | # the flowCore package for the second example.
 4 | library(reticulate)
 5 | library(tidyverse)
 6 | library(umapr)
 7 | 
 8 | # Keep the Python module under its own name so it is never confused with
 9 | # umapr::umap() or with the data frame built below.
10 | umap_module <- import("umap")
11 | sklearn.datasets_module <- import("sklearn.datasets")
12 | 
13 | digits <- sklearn.datasets_module$load_digits()
14 | 
15 | umap_out <- umap_module$UMAP()$fit_transform(digits$data)
16 | colnames(umap_out) <- c("UMAP1","UMAP2")
17 | umap_df <- cbind(digits$data, umap_out) %>% data.frame()
18 | 
19 | umapout <- make_umap_object(umap_result = umap_df)
20 | 
21 | #umapout$plot("V4")
22 | 
23 | # man/run_umap_shiny.Rd documents run_umap_shiny(); the earlier runUmapShiny()
24 | # spelling is presumably no longer defined -- TODO confirm in R/umap_app.R.
25 | run_umap_shiny(umap_df)
26 | 
27 | library(flowCore)
28 | data("GvHD")
29 | out <- fsApply(GvHD, exprs)
30 | 
31 | 
32 | out <- out[,-8]
33 | test <- umap(out)
34 | 
35 | colnames(umap_out) <- c("UMAP1","UMAP2")
36 | umap_df <- cbind(digits$data, umap_out) %>% data.frame()


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(umapr)
3 | 
4 | test_check("umapr")  # run the umapr test suite through testthat
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-umapr.R:
--------------------------------------------------------------------------------
 1 | context("UMAP wrapper tests")
 2 | library("umapr")
 3 | library("reticulate")
 4 | 
 5 | # Adapted from https://cran.r-project.org/web/packages/reticulate/vignettes/package.html
 6 | # Skip the calling test when the Python 'umap' module is not available.
 7 | skip_if_no_umap <- function() {
 8 |   if (!py_module_available("umap")) {
 9 |     skip("umap not available for testing")
10 |   }
11 | }
12 | 
13 | skip_if_no_sklearn.datasets <- function() {  # skip when sklearn.datasets is missing
14 |   if (!py_module_available("sklearn.datasets")) {
15 |     skip("sklearn.datasets not available for testing")
16 |   }
17 | }
18 | 
19 | # Argument validation and reproducibility checks for umap()
20 | test_that("Things work as expected", {
21 |   skip_if_no_umap()
22 |   skip_if_no_sklearn.datasets()
23 | 
24 |   # Generate a reproducible 100 x 10 matrix of exponential draws
25 |   set.seed(1)
26 |   data <- matrix(rexp(100 * 10, runif(1, 1E-5, 1E-3)), 100, 10)
27 | 
28 |   # The function should check the input types to make sure they are correct
29 |   expect_error(umap(data = "Not a matrix"), "Data must be a data frame or a matrix")
30 |   expect_error(umap(data = "This is not a matrix or a data frame"), "Data must be a data frame or a matrix.")
31 |   expect_error(umap(data = data, n_neighbors = "Not count"), "n_neighbors is not a count")
32 |   expect_error(umap(data = data, n_components = "Not count"), "n_components is not a count")
33 |   # metric must be one of the options listed here: https://github.com/lmcinnes/umap/blob/bf1c3e5c89ea393c9de10bd66c5e3d9bc30588ee/umap/umap_.py#L1211
34 |   expect_error(umap(data = data, metric = "not a valid metric"), NULL)
35 |   expect_error(umap(data = data, n_epochs = "Not count"), "n_epochs is not a count")
36 |   expect_error(umap(data = data, alpha = "Not numeric"), "alpha is not a numeric")
37 |   expect_error(umap(data = data, learning_rate = "Not numeric"), "learning_rate is not a numeric")
38 |   expect_error(umap(data = data, init = "not a valid init"), "init must be one of 'spectral', 'random', or a numpy array of initial embedding positions")
39 |   expect_error(umap(data = data, spread = "Not numeric"), "spread is not a numeric")
40 |   expect_error(umap(data = data, min_dist = "Not numeric"), "min_dist is not a numeric")
41 |   expect_error(umap(data = data, set_op_mix_ratio = "Not numeric"), "set_op_mix_ratio is not a numeric")
42 |   expect_error(umap(data = data, local_connectivity = 2.4), "local_connectivity is not a count")
43 |   expect_error(umap(data = data, bandwidth = "Not numeric"), "bandwidth is not a numeric")
44 |   expect_error(umap(data = data, gamma = "Not numeric"), "gamma is not a numeric")
45 |   expect_error(umap(data = data, negative_sample_rate = 2.4), "negative_sample_rate is not a count")
46 |   expect_error(umap(data = data, a = "Not numeric"), "not TRUE")
47 |   expect_error(umap(data = data, b = "Not numeric"), "not TRUE")
48 |   expect_error(umap(data = data, random_state = 2.4), "not TRUE")
49 |   expect_error(umap(data = data, metric_kwds = 2.4), "metric_kwds must be a Python dictionary object")
50 |   expect_error(umap(data = data, angular_rp_forest = 2.4), "angular_rp_forest is not a flag")
51 |   expect_error(umap(data = data, verbose = 2.4), "verbose is not a flag")
52 | 
53 |   # Two runs with the same random_state must give exactly the same result
54 |   expect_identical(umap(data = data, random_state = 3L),
55 |                    umap(data = data, random_state = 3L))
56 | })
57 | 
58 | # test_that("R6 tests",
59 | #           {
60 | #
61 | #               set.seed(1)
62 | #               data = cbind(matrix(rexp( 100 * 10, runif(1, 1E-5, 1E-3) ), 100, 10))
63 | #               colnames(data) <- c(letters[1:10])
64 | #               out <- umap(data)
65 | #
66 | #               expect_equal(class(out)[1], "umap_obj")
67 | #               pl <- out$plot("a")
68 | #               expect_equal(class(pl)[-1], "ggplot")
69 | #
70 | #
71 | #           })
72 | 


--------------------------------------------------------------------------------
/timings.R:
--------------------------------------------------------------------------------
  1 | library(umapr)
  2 | library(Rtsne)
  3 | library(tidyverse)
  4 | library(bench)
  5 | 
  6 | # stuff to compare algorithms -------------------------------------------------
  7 | embed <- function(labels, d) {
  8 |   # Benchmark UMAP and two t-SNE runs on d, then stack all embeddings (plus PCA).
  9 |   # Helper: name the two embedding columns, add algorithm and class labels.
 10 |   tag_embedding <- function(embedding, name) {
 11 |     colnames(embedding) <- c("V1", "V2")
 12 |     mutate(as.data.frame(embedding), Algorithm = name, Class = labels)
 13 |   }
 14 | 
 15 |   # The assignments inside mark() create um, ts and ts_no_pca, reused below.
 16 |   timings <- mark(
 17 |     um <- umap(d),
 18 |     ts <- Rtsne(d)$Y,
 19 |     ts_no_pca <- Rtsne(d, pca = FALSE)$Y,
 20 |     check = FALSE)
 21 |   timings$expression <- c("UMAP", "PCA + t-SNE", "t-SNE")
 22 | 
 23 |   pca_scores <- prcomp(d)$x[,1:2]
 24 |   umap_rows <- mutate(um, Algorithm = "UMAP", Class = labels, V1 = UMAP1, V2 = UMAP2)
 25 |   list(times = timings,
 26 |     results = bind_rows(
 27 |       tag_embedding(pca_scores, "PCA"),
 28 |       umap_rows,
 29 |       tag_embedding(ts, "PCA + t-SNE"),
 30 |       tag_embedding(ts_no_pca, "t-SNE")))
 31 | }
 32 | 
 33 | plot_embeddings <- function(embeddings, dataset) {
 34 |   # Scatter of V1 vs V2 coloured by Class, one free-scaled panel per Algorithm.
 35 |   scatter <- ggplot(embeddings, aes(V1, V2, color = Class)) + geom_point()
 36 |   scatter + facet_wrap(~ Algorithm, scales = "free") + ggtitle(dataset)
 37 | }
 38 | 
 39 | # iris -----------------------------------------------------------------------
 40 | d <- iris
 41 | d <- d[!duplicated(d), ]  # drop exact duplicate rows
 42 | with_labels <- d  # keep a copy that still carries the Species column
 43 | d <- as.matrix(d[ , 1:4])  # first four columns only
 44 | 
 45 | iris_result <- embed(with_labels$Species, d)
 46 | 
 47 | # cancer ---------------------------------------------------------------------
 48 | library(mlbench)
 49 | data("BreastCancer")
 50 | d <- BreastCancer[ , 2:11]  # drop the first column
 51 | d <- d[!duplicated(d), ]
 52 | d <- d[complete.cases(d), ]  # drop rows with missing values
 53 | labels <- d$Class
 54 | d <- as.matrix(d[ , 1:9])
 55 | d <- apply(d, 2, as.numeric)  # coerce every column to numeric
 56 | 
 57 | cancer_result <- embed(labels, d)
 58 | 
 59 | # beans -----------------------------------------------------------
 60 | data(Soybean)
 61 | d <- Soybean
 62 | d <- d[!duplicated(d[,2:36]), ]  # duplicates judged on columns 2:36 only
 63 | d <- d[complete.cases(d[,2:36]), ]
 64 | labels <- d$Class
 65 | d <- as.matrix(d[ , 2:36])
 66 | d <- apply(d, 2, as.numeric)
 67 | 
 68 | bean_result <- embed(labels, d)
 69 | 
 70 | # some scRNAseq -------------------------------------------------------------
 71 | #https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/
 72 | library(SingleCellExperiment)
 73 | x <- readRDS("~/Desktop/li.rds")  # NOTE(review): machine-specific path; file must be downloaded first
 74 | y <- t(logcounts(x))
 75 | rm(x)
 76 | 
 77 | labels <- str_extract(rownames(y), "[^_]*$")  # text after the last underscore
 78 | 
 79 | sc_rna_seq_result <- embed(labels, y)
 80 | 
 81 | # display results ----------------------------------------------------------
 82 | plot_embeddings(iris_result$results, "iris")
 83 | ggsave("img/multiple_algorithms_iris.png", width = 6, height = 5, dpi = 300)  # no plot argument: relies on ggsave()'s default plot
 84 | plot_embeddings(cancer_result$results, "cancer")
 85 | ggsave("img/multiple_algorithms_cancer.png", width = 6, height = 5, dpi = 300)
 86 | plot_embeddings(bean_result$results, "bean")
 87 | ggsave("img/multiple_algorithms_bean.png", width = 6, height = 5, dpi = 300)
 88 | plot_embeddings(sc_rna_seq_result$results, "scRNAseq")
 89 | ggsave("img/multiple_algorithms_rna.png", width = 6, height = 5, dpi = 300)
 90 | 
 91 | # times -------------------------------------------------------------------
 92 | combo_times <- function(times, dataset) {
 93 |   kept <- dplyr::select(times, expression, median, mem_alloc)  # timing summary columns
 94 |   dplyr::mutate(kept, Data = dataset)  # tag rows with the dataset name
 95 | }
 96 | 
 97 | times <- suppressWarnings(bind_rows(combo_times(iris_result$times, "iris"),  # stack the timing tables of all four datasets
 98 |   combo_times(cancer_result$times, "cancer"),
 99 |   combo_times(bean_result$times, "bean"),
100 |   combo_times(sc_rna_seq_result$times, "scRNAseq")))
101 | 
102 | ggplot(times, aes(x = expression, y = median)) +  # median run time per algorithm
103 |   geom_col()+ facet_wrap(~ Data, scales = "free_y") +
104 |   ylab("Time (s)") + xlab(NULL) +
105 |   ggtitle("Time taken to run dimensionality reduction on dataset")
106 | 
107 | ggsave("img/multiple_algorithms_time.png", width = 6, height = 5, dpi = 300)
108 | 
109 | ggplot(times, aes(x = expression, y = mem_alloc)) +  # memory allocated per algorithm
110 |   geom_col()+ facet_wrap(~ Data, scales = "free_y") +
111 |   ylab("Memory (bytes)") + xlab(NULL) +
112 |   ggtitle("Memory used to run dimensionality reduction on dataset")
113 | ggsave("img/multiple_algorithms_memory.png", width = 6, height = 5, dpi = 300)
114 | 


--------------------------------------------------------------------------------
/travis_setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Travis CI helper: install Miniconda 3 and point reticulate at its Python.
 3 | 
 4 | # install python
 5 | if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
 6 |   wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 7 | elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
 8 |   wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
 9 | fi
10 | 
11 | # Quote the prefix so a home directory containing spaces is not word-split.
12 | bash miniconda.sh -b -p "$HOME/miniconda"
13 | export PATH="$HOME/miniconda/bin:$PATH"
14 | export RETICULATE_PYTHON="$HOME/miniconda/bin/python"
15 | hash -r  # forget cached command locations so the new PATH takes effect
16 | conda config --set always_yes yes --set changeps1 no
17 | conda update -q conda
18 | conda info -a
19 | pip install --upgrade pip
20 | # NOTE(review): no UMAP Python module is installed here -- confirm that
21 | # .travis.yml (or another step) installs it before the tests run.
22 | pip install igraph leidenalg


--------------------------------------------------------------------------------