├── .gitattributes ├── .gitignore ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml └── umap-nan.iml ├── .pep8speaks.yml ├── .readthedocs.yaml ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── Makefile ├── README.rst ├── appveyor.yml ├── azure-pipelines.yml ├── ci_scripts ├── install.sh ├── success.sh └── test.sh ├── doc ├── .gitignore ├── Makefile ├── _static │ └── .gitkeep ├── aligned_umap_basic_usage.rst ├── aligned_umap_plotly_plot.html ├── aligned_umap_politics_demo.rst ├── api.rst ├── basic_usage.rst ├── basic_usage_bokeh_example.html ├── benchmarking.rst ├── bokeh_digits_plot.py ├── clustering.rst ├── composing_models.rst ├── conf.py ├── densmap_demo.rst ├── doc_requirements.txt ├── document_embedding.rst ├── embedding_space.rst ├── exploratory_analysis.rst ├── faq.rst ├── how_umap_works.rst ├── images │ ├── 20newsgroups_hellinger_counts.png │ ├── 20newsgroups_hellinger_tfidf.png │ ├── BasicUsage_13_1.png │ ├── BasicUsage_18_0.png │ ├── BasicUsage_20_1.png │ ├── BasicUsage_26_1.png │ ├── BasicUsage_6_1.png │ ├── ESM_metagenomic_atlas.png │ ├── Hyperbolic_tiling.png │ ├── SupervisedUMAP_10_1.png │ ├── SupervisedUMAP_15_1.png │ ├── SupervisedUMAP_22_1.png │ ├── SupervisedUMAP_31_0.png │ ├── SupervisedUMAP_33_0.png │ ├── UMAPTransform_15_0.png │ ├── UMAPTransform_21_0.png │ ├── UMAP_zoo.png │ ├── activation_atlas.png │ ├── aligned_umap_basic_usage_15_0.png │ ├── aligned_umap_basic_usage_22_0.png │ ├── aligned_umap_basic_usage_29_0.png │ ├── aligned_umap_basic_usage_29_1.png │ ├── aligned_umap_basic_usage_5_1.png │ ├── aligned_umap_pendigits_3d.png │ ├── aligned_umap_pendigits_3d_1.png │ ├── aligned_umap_pendigits_anim.gif │ ├── aligned_umap_politics_demo_31_0.png │ ├── aligned_umap_politics_demo_spaghetti.png │ ├── audio_explorer.png │ ├── basic_usage_17_1.png │ ├── basic_usage_22_0.png │ ├── basic_usage_24_2.png │ ├── basic_usage_30_1.png │ ├── basic_usage_8_1.png │ ├── bert_embedding.png │ ├── 
c_elegans_3d.jpg │ ├── clustering_10_1.png │ ├── clustering_16_1.png │ ├── clustering_27_1.png │ ├── clustering_31_1.png │ ├── clustering_6_1.png │ ├── composing_models_11_1.png │ ├── composing_models_14_1.png │ ├── composing_models_18_1.png │ ├── composing_models_20_1.png │ ├── composing_models_23_1.png │ ├── composing_models_36_1.png │ ├── composing_models_38_1.png │ ├── composing_models_42_1.png │ ├── composing_models_44_1.png │ ├── composing_models_47_1.png │ ├── composing_models_50_1.png │ ├── composing_models_6_1.png │ ├── densmap_demo_10_1.png │ ├── densmap_demo_13_1.png │ ├── densmap_demo_16_1.png │ ├── densmap_demo_19_1.png │ ├── densmap_demo_21_1.png │ ├── densmap_demo_24_1.png │ ├── densmap_demo_6_1.png │ ├── embedding_projector.png │ ├── embedding_space_11_1.png │ ├── embedding_space_15_1.png │ ├── embedding_space_18_1.png │ ├── embedding_space_26_0.png │ ├── embedding_space_29_1.png │ ├── embedding_space_33_1.png │ ├── embedding_space_37_1.png │ ├── embedding_space_39_1.png │ ├── embedding_space_47_1.png │ ├── embedding_space_52_1.png │ ├── embedding_space_55_0.png │ ├── embedding_space_59_0.png │ ├── embedding_space_7_1.png │ ├── exploring_fashion_mnist.png │ ├── galaxy10_2D_densmap.svg │ ├── galaxy10_2D_densmap_supervised.svg │ ├── galaxy10_2D_densmap_supervised_prediction.svg │ ├── galaxy10_2D_umap.svg │ ├── galaxy10_2D_umap_supervised.svg │ ├── galaxy10_2D_umap_supervised_prediction.svg │ ├── galaxy10_subset.svg │ ├── how_umap_works_basic_graph.png │ ├── how_umap_works_fuzzy_open_cover.png │ ├── how_umap_works_local_metric_open_cover.png │ ├── how_umap_works_open_cover.png │ ├── how_umap_works_raw_data.png │ ├── how_umap_works_raw_graph.png │ ├── how_umap_works_umap_graph.png │ ├── how_umap_works_umap_graph_layout.png │ ├── how_umap_works_umap_layout.png │ ├── how_umap_works_umap_open_cover.png │ ├── how_umap_works_uniform_distribution_cover.png │ ├── inverse_transform_13_0.png │ ├── inverse_transform_20_1.png │ ├── inverse_transform_26_0.png │ ├── 
inverse_transform_7_1.png │ ├── mutual_nn_umap_20ngc.png │ ├── mutual_nn_umap_FMNIST.png │ ├── mutual_nn_umap_MNIST.png │ ├── mutual_nn_umap_connectivity.png │ ├── mutual_nn_umap_lc.png │ ├── mutual_nn_umap_results.png │ ├── organogenesis_paper.png │ ├── orion_particles.png │ ├── outliers_10_0.png │ ├── outliers_12_2.png │ ├── outliers_13_2.png │ ├── outliers_15_0.png │ ├── outliers_19_0.png │ ├── outliers_22_2.png │ ├── outliers_27_0.png │ ├── outliers_5_0.png │ ├── outliers_7_2.png │ ├── outliers_9_0.png │ ├── parameters_13_1.png │ ├── parameters_13_2.png │ ├── parameters_13_3.png │ ├── parameters_13_4.png │ ├── parameters_13_5.png │ ├── parameters_13_6.png │ ├── parameters_13_7.png │ ├── parameters_16_1.png │ ├── parameters_16_2.png │ ├── parameters_16_3.png │ ├── parameters_16_4.png │ ├── parameters_16_5.png │ ├── parameters_16_6.png │ ├── parameters_19_1.png │ ├── parameters_21_1.png │ ├── parameters_32_1.png │ ├── parameters_32_2.png │ ├── parameters_32_3.png │ ├── parameters_32_4.png │ ├── parameters_32_5.png │ ├── parameters_8_1.png │ ├── performance_14_1.png │ ├── performance_15_1.png │ ├── performance_17_1.png │ ├── performance_18_1.png │ ├── performance_20_1.png │ ├── performance_21_1.png │ ├── pixplot.png │ ├── plotting_10_1.png │ ├── plotting_12_1.png │ ├── plotting_14_1.png │ ├── plotting_19_2.png │ ├── plotting_21_2.png │ ├── plotting_32_2.png │ ├── plotting_34_2.png │ ├── plotting_38_1.png │ ├── plotting_40_1.png │ ├── plotting_42_0.png │ ├── plotting_44_1.png │ ├── plotting_8_2.png │ ├── population_umap.jpg │ ├── precomputed_k-nn11.png │ ├── precomputed_k-nn13.png │ ├── precomputed_k-nn17.png │ ├── precomputed_k-nn6.png │ ├── pumap-only.png │ ├── reproducibility_10_1.png │ ├── reproducibility_14_1.png │ ├── reproducibility_18_1.png │ ├── reproducibility_6_1.png │ ├── retrain_pumap_emb_x1.png │ ├── retrain_pumap_emb_x2.png │ ├── retrain_pumap_history.png │ ├── retrain_pumap_p_emb_x1.png │ ├── retrain_pumap_p_emb_x2.png │ ├── 
retrain_pumap_summary_2_removed.png │ ├── simplices.png │ ├── single_cell_umap.jpg │ ├── sparse_11_1.png │ ├── sparse_18_0.png │ ├── sparse_31_1.png │ ├── sparse_35_0.png │ ├── structure_recent_phil.png │ ├── syllabus_galaxy.png │ ├── time_cluster.png │ ├── umap-loss.png │ ├── umap-only.png │ ├── umap_explorer.png │ ├── umap_primes.png │ ├── umap_surrey.png │ └── umap_vae_pca.png ├── index.rst ├── interactive_viz.rst ├── inverse_transform.rst ├── logo.png ├── logo_large.png ├── make.bat ├── mutual_nn_umap.rst ├── nomic_atlas_umap_of_text_embeddings.rst ├── nomic_atlas_visualizing_mnist_training_dynamics.rst ├── outliers.rst ├── parameters.rst ├── parametric_umap.rst ├── performance.rst ├── plotting.rst ├── plotting_example_interactive.py ├── plotting_example_nomic_atlas.py ├── plotting_interactive_example.html ├── precomputed_k-nn.rst ├── release_notes.rst ├── reproducibility.rst ├── scientific_papers.rst ├── sparse.rst ├── supervised.rst ├── transform.rst └── transform_landmarked_pumap.rst ├── docs_requirements.txt ├── examples ├── README.txt ├── digits │ ├── digits.html │ └── digits.py ├── galaxy10sdss.py ├── inverse_transform_example.py ├── iris │ ├── iris.html │ └── iris.py ├── mnist_torus_sphere_example.py ├── mnist_transform_new_data.py ├── plot_algorithm_comparison.py ├── plot_fashion-mnist_example.py ├── plot_feature_extraction_classification.py └── plot_mnist_example.py ├── images ├── densmap_example_mnist.png ├── iris.png ├── mnist_digits.png ├── sklearn_digits.png ├── umap_example_fashion_mnist1.png ├── umap_example_mnist1.png └── umap_example_shuttle.png ├── notebooks ├── AnimatingUMAP.ipynb ├── Document embedding using UMAP.ipynb ├── MNIST_Landmarks.ipynb ├── Parametric_UMAP │ ├── 01.0-parametric-umap-mnist-embedding-basic.ipynb │ ├── 02.0-parametric-umap-mnist-embedding-convnet.ipynb │ ├── 03.0-parametric-umap-mnist-embedding-convnet-with-reconstruction.ipynb │ ├── 04.0-parametric-umap-mnist-embedding-convnet-with-autoencoder-loss.ipynb │ ├── 
05.0-parametric-umap-with-callback.ipynb │ ├── 06.0-nonparametric-umap.ipynb │ └── 07.0-parametric-umap-global-loss.ipynb ├── UMAP usage and parameters.ipynb ├── nomic-atlas-umap-of-text-embeddings.ipynb └── nomic-atlas-visualizing-mnist-training-dynamics.ipynb ├── paper.bib ├── paper.md ├── setup.py └── umap ├── __init__.py ├── aligned_umap.py ├── distances.py ├── layouts.py ├── parametric_umap.py ├── plot.py ├── sparse.py ├── spectral.py ├── tests ├── __init__.py ├── conftest.py ├── digits_embedding_42.npy ├── test_aligned_umap.py ├── test_chunked_parallel_spatial_metric.py ├── test_composite_models.py ├── test_data_input.py ├── test_densmap.py ├── test_parametric_umap.py ├── test_plot.py ├── test_spectral.py ├── test_umap.py ├── test_umap_get_feature_names_out.py ├── test_umap_metrics.py ├── test_umap_nn.py ├── test_umap_on_iris.py ├── test_umap_ops.py ├── test_umap_repeated_data.py ├── test_umap_trustworthiness.py └── test_umap_validation_params.py ├── umap_.py ├── utils.py └── validation.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # virtual environment 2 | venv 3 | 4 | # non-stylistic pycharm configs 5 | .idea/misc.xml 6 | .idea/modules.xml 7 | .idea/umap.iml 8 | .idea/vcs.xml 9 | .idea/workspace.xml 10 | .idea/dictionaries 11 | .idea/other.xml 12 | 13 | # Mac Finder layout 14 | .DS_Store 15 | 16 | # IPython/Jupyter notebook checkpoints 17 | *.ipynb_checkpoints 18 | 19 | # Python 2.x & 3.x bytecode cache 20 | *.pyc 21 | *__pycache__ 22 | 23 | # metadata from pip-installing repo 24 | umap_learn.egg-info 25 | 26 | # docs 27 | doc/auto_examples 28 | doc/_build -------------------------------------------------------------------------------- /.idea/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/umap-nan.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.pep8speaks.yml: -------------------------------------------------------------------------------- 1 | pycodestyle: # Same as scanner.linter value. Other option is flake8 2 | max-line-length: 88 # Default is 79 in PEP 8 3 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: doc/conf.py 17 | 18 | # Optional but recommended, declare the Python requirements required 19 | # to build your documentation 20 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 21 | python: 22 | install: 23 | - requirements: docs_requirements.txt 24 | - method: pip 25 | path: . 
26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: 4 | apt: true 5 | # We use three different cache directory 6 | # to work around a Travis bug with multi-platform cache 7 | directories: 8 | - $HOME/.cache/pip 9 | - $HOME/download 10 | env: 11 | global: 12 | # Directory where tests are run from 13 | - TEST_DIR=/tmp/test_dir/ 14 | - MODULE=umap 15 | 16 | matrix: 17 | include: 18 | - python: 3.6 19 | os: linux 20 | - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="1.17" SCIPY_VERSION="1.3.1" 21 | os: linux 22 | - env: DISTRIB="conda" PYTHON_VERSION="3.8" NUMPY_VERSION="1.20.0" SCIPY_VERSION="1.6.0" 23 | os: linux 24 | - env: DISTRIB="conda" PYTHON_VERSION="3.8" COVERAGE="true" NUMPY_VERSION="1.20.0" SCIPY_VERSION="1.6.0" 25 | os: linux 26 | # - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMBA_VERSION="0.51.2" 27 | # os: osx 28 | # language: generic 29 | # - env: DISTRIB="conda" PYTHON_VERSION="3.8" NUMBA_VERSION="0.51.2" 30 | # os: osx 31 | # language: generic 32 | 33 | install: source ci_scripts/install.sh 34 | script: travis_wait 90 bash ci_scripts/test.sh 35 | after_success: source ci_scripts/success.sh 36 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at leland.mcinnes@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions of all kinds are welcome. In particular pull requests are appreciated. 4 | The authors will endeavour to help walk you through any issues in the pull request 5 | discussion, so please feel free to open a pull request even if you are new to such things. 6 | 7 | ## Issues 8 | 9 | The easiest contribution to make is to [file an issue](https://github.com/lmcinnes/umap/issues/new). 10 | It is beneficial if you check the [FAQ](https://umap-learn.readthedocs.io/en/latest/faq.html), 11 | and do a cursory search of [existing issues](https://github.com/lmcinnes/umap/issues?utf8=%E2%9C%93&q=is%3Aissue). 12 | It is also helpful, but not necessary, if you can provide clear instruction for 13 | how to reproduce a problem. 
If you have resolved an issue yourself please consider 14 | contributing to the FAQ to add your problem, and its resolution, so others can 15 | benefit from your work. 16 | 17 | ## Documentation 18 | 19 | Contributing to documentation is the easiest way to get started. Providing simple 20 | clear or helpful documentation for new users is critical. Anything that *you* as 21 | a new user found hard to understand, or difficult to work out, are excellent places 22 | to begin. Contributions to more detailed and descriptive error messages is 23 | especially appreciated. To contribute to the documentation please 24 | [fork the project](https://github.com/lmcinnes/umap/issues#fork-destination-box) 25 | into your own repository, make changes there, and then submit a pull request. 26 | 27 | ### Building the Documentation Locally 28 | 29 | To build the docs locally, install the documentation tools requirements: 30 | 31 | ```bash 32 | pip install -r docs_requirements.txt 33 | ``` 34 | 35 | Then run: 36 | 37 | ```bash 38 | sphinx-build -b html doc doc/_build 39 | ``` 40 | 41 | This will build the documentation in HTML format. You will be able to find the output 42 | in the `doc/_build` folder. 43 | 44 | ## Code 45 | 46 | Code contributions are always welcome, from simple bug fixes, to new features. To 47 | contribute code please 48 | [fork the project](https://github.com/lmcinnes/umap/issues#fork-destination-box) 49 | into your own repository, make changes there, and then submit a pull request. If 50 | you are fixing a known issue please add the issue number to the PR message. If you 51 | are fixing a new issue feel free to file an issue and then reference it in the PR. 52 | You can [browse open issues](https://github.com/lmcinnes/umap/issues), 53 | or consult the [project roadmap](https://github.com/lmcinnes/umap/issues/15), for potential code 54 | contributions. Fixes for issues tagged with 'help wanted' are especially appreciated. 
55 | 56 | ### Code formatting 57 | 58 | If possible, install the [black code formatter](https://github.com/python/black) (e.g. 59 | `pip install black`) and run it before submitting a pull request. This helps maintain consistency 60 | across the code, but also there is a check in the Travis-CI continuous integration system which 61 | will show up as a failure in the pull request if `black` detects that it hasn't been run. 62 | 63 | Formatting is as simple as running: 64 | 65 | ```bash 66 | black . 67 | ``` 68 | 69 | in the root of the project. 70 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Leland McInnes 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # make gh-pages in repo base directory to automatically build and deploy documents to github 2 | 3 | gh-pages: 4 | echo "Make gh-pages" 5 | cd doc; make html 6 | git checkout gh-pages 7 | rm -rf _sources _static _modules _downloads _images auto_examples 8 | mv -fv doc/_build/html/* . 
9 | rm -rf doc 10 | git add -A 11 | git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master 12 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | build: "off" 2 | 3 | environment: 4 | matrix: 5 | - PYTHON_VERSION: "3.7" 6 | MINICONDA: C:\Miniconda3-x64 7 | - PYTHON_VERSION: "3.8" 8 | MINICONDA: C:\Miniconda3-x64 9 | 10 | init: 11 | - "ECHO %PYTHON_VERSION% %MINICONDA%" 12 | 13 | install: 14 | - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" 15 | - conda config --set always_yes yes --set changeps1 no 16 | - conda update -q conda 17 | - conda info -a 18 | - "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy scikit-learn numba pandas bokeh holoviews datashader scikit-image pytest" 19 | - activate test-environment 20 | - pip install "tensorflow>=2.1" 21 | - pip install pytest-benchmark 22 | - pip install -e . 
23 | 24 | test_script: 25 | - pytest --show-capture=no -v --disable-warnings 26 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Trigger a build when there is a push to the main branch or a tag starts with release- 2 | trigger: 3 | branches: 4 | include: 5 | - master 6 | tags: 7 | include: 8 | - release-* 9 | 10 | # Trigger a build when there is a pull request to the main branch 11 | # Ignore PRs that are just updating the docs 12 | pr: 13 | branches: 14 | include: 15 | - master 16 | exclude: 17 | - doc/* 18 | - README.rst 19 | 20 | parameters: 21 | - name: includeReleaseCandidates 22 | displayName: "Allow pre-release dependencies" 23 | type: boolean 24 | default: false 25 | 26 | 27 | variables: 28 | triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')] 29 | 30 | stages: 31 | - stage: RunAllTests 32 | displayName: Run test suite 33 | jobs: 34 | - job: run_platform_tests 35 | strategy: 36 | matrix: 37 | mac_py39: 38 | imageName: 'macOS-latest' 39 | python.version: '3.9' 40 | linux_py39: 41 | imageName: 'ubuntu-latest' 42 | python.version: '3.9' 43 | windows_py39: 44 | imageName: 'windows-latest' 45 | python.version: '3.9' 46 | mac_py310: 47 | imageName: 'macOS-latest' 48 | python.version: '3.10' 49 | linux_py310: 50 | imageName: 'ubuntu-latest' 51 | python.version: '3.10' 52 | windows_py310: 53 | imageName: 'windows-latest' 54 | python.version: '3.10' 55 | mac_py311: 56 | imageName: 'macOS-latest' 57 | python.version: '3.11' 58 | linux_py311: 59 | imageName: 'ubuntu-latest' 60 | python.version: '3.11' 61 | windows_py311: 62 | imageName: 'windows-latest' 63 | python.version: '3.11' 64 | mac_py312: 65 | imageName: 'macOS-latest' 66 | python.version: '3.12' 67 | linux_py312: 68 | imageName: 'ubuntu-latest' 69 | python.version: '3.12' 70 | windows_py312: 71 | imageName: 'windows-latest' 72 | python.version: 
'3.12' 73 | 74 | pool: 75 | vmImage: $(imageName) 76 | 77 | steps: 78 | - task: UsePythonVersion@0 79 | inputs: 80 | versionSpec: '$(python.version)' 81 | displayName: 'Use Python $(python.version)' 82 | 83 | - script: | 84 | python -m pip install --upgrade pip 85 | displayName: 'Upgrade pip' 86 | 87 | - script: | 88 | pip install -e . 89 | pip install .[plot] 90 | pip install .[parametric_umap] 91 | displayName: 'Install dependencies' 92 | condition: ${{ eq(parameters.includeReleaseCandidates, false) }} 93 | 94 | - script: | 95 | pip install --pre -e . 96 | pip install --pre .[plot] 97 | pip install --pre .[parametric_umap] 98 | displayName: 'Install dependencies (allow pre-releases)' 99 | condition: ${{ eq(parameters.includeReleaseCandidates, true) }} 100 | 101 | - script: | 102 | pip install pytest pytest-azurepipelines pytest-cov pytest-benchmark coveralls 103 | displayName: 'Install pytest' 104 | 105 | - script: | 106 | # export NUMBA_DISABLE_JIT=1 # Disable numba coverage so tests run on time for now. 
107 | pytest umap/tests --show-capture=no -v --disable-warnings --junitxml=junit/test-results.xml --cov=umap/ --cov-report=xml --cov-report=html 108 | displayName: 'Run tests' 109 | 110 | - bash: | 111 | coveralls 112 | displayName: 'Publish to coveralls' 113 | condition: and(succeeded(), eq(variables.triggeredByPullRequest, false)) # Don't run this for PRs because they can't access pipeline secrets 114 | env: 115 | COVERALLS_REPO_TOKEN: $(COVERALLS_TOKEN) 116 | 117 | - task: PublishTestResults@2 118 | inputs: 119 | testResultsFiles: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 120 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' 121 | condition: succeededOrFailed() 122 | 123 | - stage: BuildPublishArtifact 124 | dependsOn: RunAllTests 125 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/'), eq(variables.triggeredByPullRequest, false)) 126 | jobs: 127 | - job: BuildArtifacts 128 | displayName: Build source dists and wheels 129 | pool: 130 | vmImage: 'ubuntu-latest' 131 | steps: 132 | - task: UsePythonVersion@0 133 | inputs: 134 | versionSpec: '3.10' 135 | displayName: 'Use Python 3.10' 136 | 137 | - script: | 138 | python -m pip install --upgrade pip 139 | pip install wheel 140 | pip install -e . 
141 | displayName: 'Install package locally' 142 | 143 | - bash: | 144 | python setup.py sdist bdist_wheel 145 | ls -l dist/ 146 | displayName: 'Build package' 147 | 148 | - bash: | 149 | export PACKAGE_VERSION="$(python setup.py --version)" 150 | echo "Package Version: ${PACKAGE_VERSION}" 151 | echo "##vso[task.setvariable variable=packageVersionFormatted;]release-${PACKAGE_VERSION}" 152 | displayName: 'Get package version' 153 | 154 | - script: | 155 | echo "Version in git tag $(Build.SourceBranchName) does not match version derived from setup.py $(packageVersionFormatted)" 156 | exit 1 157 | displayName: Raise error if version doesnt match tag 158 | condition: and(succeeded(), ne(variables['Build.SourceBranchName'], variables['packageVersionFormatted'])) 159 | 160 | - task: DownloadSecureFile@1 161 | name: PYPIRC_CONFIG 162 | displayName: 'Download pypirc' 163 | inputs: 164 | secureFile: 'pypirc' 165 | 166 | - script: | 167 | pip install twine 168 | twine upload --repository pypi --config-file $(PYPIRC_CONFIG.secureFilePath) dist/* 169 | displayName: 'Upload to PyPI' 170 | condition: and(succeeded(), eq(variables['Build.SourceBranchName'], variables['packageVersionFormatted'])) 171 | 172 | -------------------------------------------------------------------------------- /ci_scripts/install.sh: -------------------------------------------------------------------------------- 1 | if [[ "$DISTRIB" == "conda" ]]; then 2 | 3 | # Deactivate the travis-provided virtual environment and setup a 4 | # conda-based environment instead 5 | if [ $TRAVIS_OS_NAME = 'linux' ]; then 6 | # Only Linux has a virtual environment activated; Mac does not. 7 | deactivate 8 | fi 9 | 10 | # Use the miniconda installer for faster download / install of conda 11 | # itself 12 | pushd . 13 | cd 14 | mkdir -p download 15 | cd download 16 | echo "Cached in $HOME/download :" 17 | ls -l 18 | echo 19 | # For now, ignoring the cached file. 20 | # if [[ ! 
-f miniconda.sh ]] 21 | # then 22 | if [ $TRAVIS_OS_NAME = 'osx' ]; then 23 | # MacOS URL found here: https://docs.conda.io/en/latest/miniconda.html 24 | wget \ 25 | https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh \ 26 | -O miniconda.sh 27 | else 28 | wget \ 29 | http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 30 | -O miniconda.sh 31 | fi 32 | # fi 33 | chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda 34 | cd .. 35 | export PATH=$HOME/miniconda/bin:$HOME/miniconda3/bin:$PATH 36 | conda update --yes conda 37 | popd 38 | 39 | # Configure the conda environment and put it in the path using the 40 | # provided versions 41 | # conda create -n testenv --yes python=$PYTHON_VERSION pip \ 42 | # numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION numba=$NUMBA_VERSION scikit-learn \ 43 | # pytest "tensorflow-mkl>=2.2.0" 44 | if [ $TRAVIS_OS_NAME = 'osx' ]; then 45 | conda create -q -n testenv --yes python=$PYTHON_VERSION numpy scipy scikit-learn \ 46 | numba pytest pandas 47 | # pip install bokeh 48 | # pip install datashader 49 | # pip install holoviews 50 | conda install --yes "tensorflow>=2.0.0" 51 | else 52 | conda create -q -n testenv --yes python=$PYTHON_VERSION numpy scipy scikit-learn \ 53 | numba pandas bokeh holoviews datashader scikit-image pytest pytest-benchmark \ 54 | "tensorflow-mkl>=2.2.0" 55 | fi 56 | 57 | source activate testenv 58 | 59 | # black requires Python 3.x; don't try to install for Python 2.7 test 60 | if [[ "$PYTHON_VERSION" != "2.7" ]]; then 61 | pip install black 62 | pip install pynndescent 63 | fi 64 | 65 | if [[ "$COVERAGE" == "true" ]]; then 66 | pip install coverage coveralls 67 | pip install pytest-cov pytest-benchmark # pytest coverage plugin 68 | fi 69 | 70 | python --version 71 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 72 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 73 | python -c "import numba; print('numba %s' % numba.__version__)" 74 | 
python -c "import sklearn; print('scikit-learn %s' % sklearn.__version__)" 75 | python setup.py develop 76 | else 77 | pip install pynndescent # test with optional pynndescent dependency 78 | pip install pandas 79 | pip install bokeh 80 | pip install datashader 81 | pip install matplotlib 82 | pip install holoviews 83 | pip install scikit-image 84 | pip install "tensorflow>=2.2.0" 85 | pip install -e . 86 | fi 87 | -------------------------------------------------------------------------------- /ci_scripts/success.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | if [[ "$COVERAGE" == "true" ]]; then 4 | # # Need to run coveralls from a git checkout, so we copy .coverage 5 | # # from TEST_DIR where nosetests has been run 6 | # cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR 7 | # cd $TRAVIS_BUILD_DIR 8 | # Ignore coveralls failures as the coveralls server is not 9 | # very reliable but we don't want travis to report a failure 10 | # in the github UI just because the coverage report failed to 11 | # be published. 
12 | coveralls || echo "Coveralls upload failed" 13 | fi 14 | -------------------------------------------------------------------------------- /ci_scripts/test.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | #if [[ "$COVERAGE" == "true" ]]; then 4 | # black --check $MODULE 5 | #fi 6 | 7 | if [[ "$COVERAGE" == "true" ]]; then 8 | export NUMBA_DISABLE_JIT=1 9 | pytest --cov=umap/ --cov-report=xml --cov-report=html --show-capture=no -v --disable-warnings 10 | else 11 | pytest --show-capture=no -v --disable-warnings 12 | fi 13 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | umap 3 | setup.py 4 | paper.md 5 | paper.bib 6 | LICENSE.txt 7 | CODE_OF_CONDUCT.md 8 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = umap 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /doc/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/_static/.gitkeep -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | UMAP API Guide 2 | ============== 3 | 4 | UMAP has only two classes, :class:`UMAP`, and :class:`ParametricUMAP`, which inherits from it. 5 | 6 | UMAP 7 | ---- 8 | 9 | .. autoclass:: umap.umap_.UMAP 10 | :members: 11 | 12 | ParametricUMAP 13 | -------------- 14 | 15 | .. autoclass:: umap.parametric_umap.ParametricUMAP 16 | :members: 17 | 18 | A number of internal functions can also be accessed separately for more fine tuned work. 19 | 20 | Useful Functions 21 | ---------------- 22 | 23 | ..
automodule:: umap.umap_ 24 | :members: 25 | :exclude-members: UMAP 26 | 27 | -------------------------------------------------------------------------------- /doc/bokeh_digits_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | import pandas as pd 4 | 5 | digits = load_digits() 6 | 7 | import umap 8 | 9 | reducer = umap.UMAP(random_state=42) 10 | embedding = reducer.fit_transform(digits.data) 11 | 12 | from io import BytesIO 13 | from PIL import Image 14 | import base64 15 | 16 | 17 | def embeddable_image(data): 18 | img_data = 255 - 15 * data.astype(np.uint8) 19 | image = Image.fromarray(img_data, mode="L").resize((64, 64), Image.BICUBIC) 20 | buffer = BytesIO() 21 | image.save(buffer, format="png") 22 | for_encoding = buffer.getvalue() 23 | return "data:image/png;base64," + base64.b64encode(for_encoding).decode() 24 | 25 | 26 | from bokeh.plotting import figure, show, output_file 27 | from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper 28 | from bokeh.palettes import Spectral10 29 | 30 | output_file("basic_usage_bokeh_example.html") 31 | 32 | digits_df = pd.DataFrame(embedding, columns=("x", "y")) 33 | digits_df["digit"] = [str(x) for x in digits.target] 34 | digits_df["image"] = list(map(embeddable_image, digits.images)) 35 | 36 | datasource = ColumnDataSource(digits_df) 37 | color_mapping = CategoricalColorMapper( 38 | factors=[str(9 - x) for x in digits.target_names], palette=Spectral10 39 | ) 40 | 41 | plot_figure = figure( 42 | title="UMAP projection of the Digits dataset", 43 | plot_width=600, 44 | plot_height=600, 45 | tools=("pan, wheel_zoom, reset"), 46 | ) 47 | 48 | plot_figure.add_tools( 49 | HoverTool( 50 | tooltips=""" 51 |
52 |
53 | 54 |
55 |
56 | Digit: 57 | @digit 58 |
59 |
60 | """ 61 | ) 62 | ) 63 | 64 | plot_figure.circle( 65 | "x", 66 | "y", 67 | source=datasource, 68 | color=dict(field="digit", transform=color_mapping), 69 | line_alpha=0.6, 70 | fill_alpha=0.6, 71 | size=4, 72 | ) 73 | show(plot_figure) 74 | -------------------------------------------------------------------------------- /doc/doc_requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.13 2 | scipy>=0.19 3 | scikit-learn>=0.19 4 | numba>=0.37 5 | bokeh>=0.13 6 | datashader>=0.6 7 | seaborn>=0.8 8 | tqdm 9 | sphinx-gallery 10 | numpydoc 11 | -------------------------------------------------------------------------------- /doc/exploratory_analysis.rst: -------------------------------------------------------------------------------- 1 | Exploratory Analysis of Interesting Datasets 2 | ============================================ 3 | 4 | UMAP is a useful tool for general exploratory analysis of data -- it can provide 5 | a unique lens through which to view data that can highlight structures and 6 | properties hiding in data that are not as apparent when analysed with other techniques. 7 | Below is a selection of use cases of UMAP being used for interesting explorations 8 | of intriguing datasets -- everything from pure math and outputs of neural networks, 9 | to philosophy articles, and scientific texts. 10 | 11 | Prime factorizations of numbers 12 | ------------------------------- 13 | What would happen if we applied UMAP to the integers? First we would need a way 14 | to express an integer in a high dimensional space. That can be done by looking 15 | at the prime factorization of each number. Next you have to take enough numbers 16 | to actually generate an interesting visualization. John Williamson set about doing 17 | exactly this, and the results are fascinating.
While they may not actually tell us 18 | anything new about number theory they do highlight interesting structures 19 | in prime factorizations, and demonstrate how UMAP can aid in interesting explorations 20 | of datasets that we might think we know well. It's worth visiting the linked article 21 | below as Dr. Williamson provides a rich and detailed exploration of UMAP as 22 | applied to prime factorizations of integers. 23 | 24 | .. image:: images/umap_primes.png 25 | :width: 400px 26 | 27 | `UMAP on prime factorizations `__ 28 | 29 | Thanks to John Williamson. 30 | 31 | Structure of Recent Philosophy 32 | ------------------------------ 33 | Philosophy is an incredibly diverse subject, ranging from social and moral philosophy to 34 | logic and philosophy of math; from analysis of ancient Greek philosophy to modern business 35 | ethics. If we could get an overview of all the philosophy papers published in the last 36 | century what might it look like? Maximilian Noichl provides just such an exploration, 37 | looking at a large sampling of philosophy papers and comparing them according to their 38 | citations. The results are intriguing, and can be explored interactively in the 39 | viewer Maximilian built for it. 40 | 41 | .. image:: images/structure_recent_phil.png 42 | :width: 400px 43 | 44 | `Structure of Recent Philosophy `__ 45 | 46 | Thanks to Maximilian Noichl. 47 | 48 | Language, Context, and Geometry in Neural Networks 49 | -------------------------------------------------- 50 | Among recent developments in natural language processing is the BERT neural network 51 | based technique for analysis of language. Among many things that BERT can do one is 52 | context sensitive embeddings of words -- providing numeric vector representations of words 53 | that are sensitive to the context of how the word is used. 
Exactly what goes on inside 54 | the neural network to do this is a little mysterious (since the network is very complex 55 | with many many parameters). A team of researchers from Google set out to explore the 56 | word embedding space generated by BERT, and among the tools used was UMAP. The linked 57 | blog post provides a detailed and inspiring analysis of what BERT's word embeddings 58 | look like, and how the different layers of BERT represent different aspects of language. 59 | 60 | .. image:: images/bert_embedding.png 61 | :width: 400px 62 | 63 | `Language, Context, and Geometry in Neural Networks `__ 64 | 65 | Thanks to Andy Coenen, Emily Reif, Ann Yuan, Been Kim, Adam Pearce, Fernanda Viégas, and Martin Wattenberg. 66 | 67 | Activation Atlas 68 | ---------------- 69 | Understanding the image processing capabilities (and deficits!) of modern 70 | convolutional neural networks is a challenge. Certainly these models are capable 71 | of amazing feats in, for example, image classification. They can also be brittle 72 | in unexpected ways, with carefully designed images able to induce otherwise 73 | baffling mis-classifications. To better understand this researchers from 74 | Google and OpenAI built the activation atlas -- analysing the space of activations 75 | of a neural network. Here UMAP provides a means to compress the activation landscape 76 | down to 2 dimensions for visualization. The result was an impressive interactive paper 77 | in the Distill journal, providing rich visualizations and new insights into 78 | the working of convolutional neural networks. 79 | 80 | .. image:: images/activation_atlas.png 81 | :width: 400px 82 | 83 | `The Activation Atlas `__ 84 | 85 | Thanks to Shan Carter, Zan Armstrong, Ludwig Schubert, Ian Johnson, and Chris Olah. 86 | 87 | Open Syllabus Galaxy 88 | -------------------- 89 | Suppose you wanted to explore the space of commonly assigned texts from Open Syllabus? That 90 | gives you over 150,000 texts to consider.
Since the texts are open you can actually analyse 91 | the text content involved. With some NLP and neural network wizardry David McClure built 92 | a network of such texts and then used node2vec and UMAP to generate a map of them. The result 93 | is a galaxy of textbooks showing inter-relationships between subjects, similar and related texts, 94 | and generally just an interesting landscape of science to be explored. As with some 95 | of the other projects here David made a great interactive viewer allowing for rich exploration 96 | of the results. 97 | 98 | .. image:: images/syllabus_galaxy.png 99 | :width: 400px 100 | 101 | `Open Syllabus Galaxy `__ 102 | 103 | Thanks to David McClure. 104 | -------------------------------------------------------------------------------- /doc/images/20newsgroups_hellinger_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/20newsgroups_hellinger_counts.png -------------------------------------------------------------------------------- /doc/images/20newsgroups_hellinger_tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/20newsgroups_hellinger_tfidf.png -------------------------------------------------------------------------------- /doc/images/BasicUsage_13_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/BasicUsage_13_1.png -------------------------------------------------------------------------------- /doc/images/BasicUsage_18_0.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/BasicUsage_18_0.png -------------------------------------------------------------------------------- /doc/images/BasicUsage_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/BasicUsage_20_1.png -------------------------------------------------------------------------------- /doc/images/BasicUsage_26_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/BasicUsage_26_1.png -------------------------------------------------------------------------------- /doc/images/BasicUsage_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/BasicUsage_6_1.png -------------------------------------------------------------------------------- /doc/images/ESM_metagenomic_atlas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/ESM_metagenomic_atlas.png -------------------------------------------------------------------------------- /doc/images/Hyperbolic_tiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/Hyperbolic_tiling.png -------------------------------------------------------------------------------- /doc/images/SupervisedUMAP_10_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/SupervisedUMAP_10_1.png -------------------------------------------------------------------------------- /doc/images/SupervisedUMAP_15_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/SupervisedUMAP_15_1.png -------------------------------------------------------------------------------- /doc/images/SupervisedUMAP_22_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/SupervisedUMAP_22_1.png -------------------------------------------------------------------------------- /doc/images/SupervisedUMAP_31_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/SupervisedUMAP_31_0.png -------------------------------------------------------------------------------- /doc/images/SupervisedUMAP_33_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/SupervisedUMAP_33_0.png -------------------------------------------------------------------------------- /doc/images/UMAPTransform_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/UMAPTransform_15_0.png -------------------------------------------------------------------------------- /doc/images/UMAPTransform_21_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/UMAPTransform_21_0.png -------------------------------------------------------------------------------- /doc/images/UMAP_zoo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/UMAP_zoo.png -------------------------------------------------------------------------------- /doc/images/activation_atlas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/activation_atlas.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_basic_usage_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_basic_usage_15_0.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_basic_usage_22_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_basic_usage_22_0.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_basic_usage_29_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_basic_usage_29_0.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_basic_usage_29_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_basic_usage_29_1.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_basic_usage_5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_basic_usage_5_1.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_pendigits_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_pendigits_3d.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_pendigits_3d_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_pendigits_3d_1.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_pendigits_anim.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_pendigits_anim.gif -------------------------------------------------------------------------------- /doc/images/aligned_umap_politics_demo_31_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_politics_demo_31_0.png -------------------------------------------------------------------------------- /doc/images/aligned_umap_politics_demo_spaghetti.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/aligned_umap_politics_demo_spaghetti.png -------------------------------------------------------------------------------- /doc/images/audio_explorer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/audio_explorer.png -------------------------------------------------------------------------------- /doc/images/basic_usage_17_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/basic_usage_17_1.png -------------------------------------------------------------------------------- /doc/images/basic_usage_22_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/basic_usage_22_0.png -------------------------------------------------------------------------------- /doc/images/basic_usage_24_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/basic_usage_24_2.png -------------------------------------------------------------------------------- /doc/images/basic_usage_30_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/basic_usage_30_1.png -------------------------------------------------------------------------------- /doc/images/basic_usage_8_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/basic_usage_8_1.png -------------------------------------------------------------------------------- /doc/images/bert_embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/bert_embedding.png -------------------------------------------------------------------------------- /doc/images/c_elegans_3d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/c_elegans_3d.jpg -------------------------------------------------------------------------------- /doc/images/clustering_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/clustering_10_1.png -------------------------------------------------------------------------------- /doc/images/clustering_16_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/clustering_16_1.png -------------------------------------------------------------------------------- /doc/images/clustering_27_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/clustering_27_1.png -------------------------------------------------------------------------------- /doc/images/clustering_31_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/clustering_31_1.png -------------------------------------------------------------------------------- /doc/images/clustering_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/clustering_6_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_11_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_11_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_14_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_14_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_18_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_18_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_20_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_23_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_23_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_36_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_36_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_38_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_38_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_42_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_42_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_44_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_44_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_47_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_47_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_50_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_50_1.png -------------------------------------------------------------------------------- /doc/images/composing_models_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/composing_models_6_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_10_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_13_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_13_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_16_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_16_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_19_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_19_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_21_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_21_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_24_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_24_1.png -------------------------------------------------------------------------------- /doc/images/densmap_demo_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/densmap_demo_6_1.png -------------------------------------------------------------------------------- /doc/images/embedding_projector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_projector.png -------------------------------------------------------------------------------- /doc/images/embedding_space_11_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_11_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_15_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_15_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_18_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_18_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_26_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_26_0.png -------------------------------------------------------------------------------- /doc/images/embedding_space_29_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_29_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_33_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_33_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_37_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_37_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_39_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_39_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_47_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_47_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_52_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_52_1.png -------------------------------------------------------------------------------- /doc/images/embedding_space_55_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_55_0.png -------------------------------------------------------------------------------- /doc/images/embedding_space_59_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_59_0.png -------------------------------------------------------------------------------- /doc/images/embedding_space_7_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/embedding_space_7_1.png -------------------------------------------------------------------------------- /doc/images/exploring_fashion_mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/exploring_fashion_mnist.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_basic_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_basic_graph.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_fuzzy_open_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_fuzzy_open_cover.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_local_metric_open_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_local_metric_open_cover.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_open_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_open_cover.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_raw_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_raw_data.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_raw_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_raw_graph.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_umap_graph.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_umap_graph.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_umap_graph_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_umap_graph_layout.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_umap_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_umap_layout.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_umap_open_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_umap_open_cover.png -------------------------------------------------------------------------------- /doc/images/how_umap_works_uniform_distribution_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/how_umap_works_uniform_distribution_cover.png -------------------------------------------------------------------------------- /doc/images/inverse_transform_13_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/inverse_transform_13_0.png 
-------------------------------------------------------------------------------- /doc/images/inverse_transform_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/inverse_transform_20_1.png -------------------------------------------------------------------------------- /doc/images/inverse_transform_26_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/inverse_transform_26_0.png -------------------------------------------------------------------------------- /doc/images/inverse_transform_7_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/inverse_transform_7_1.png -------------------------------------------------------------------------------- /doc/images/mutual_nn_umap_20ngc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/mutual_nn_umap_20ngc.png -------------------------------------------------------------------------------- /doc/images/mutual_nn_umap_FMNIST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/mutual_nn_umap_FMNIST.png -------------------------------------------------------------------------------- /doc/images/mutual_nn_umap_MNIST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/mutual_nn_umap_MNIST.png 
-------------------------------------------------------------------------------- /doc/images/mutual_nn_umap_connectivity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/mutual_nn_umap_connectivity.png -------------------------------------------------------------------------------- /doc/images/mutual_nn_umap_lc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/mutual_nn_umap_lc.png -------------------------------------------------------------------------------- /doc/images/mutual_nn_umap_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/mutual_nn_umap_results.png -------------------------------------------------------------------------------- /doc/images/organogenesis_paper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/organogenesis_paper.png -------------------------------------------------------------------------------- /doc/images/orion_particles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/orion_particles.png -------------------------------------------------------------------------------- /doc/images/outliers_10_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_10_0.png 
-------------------------------------------------------------------------------- /doc/images/outliers_12_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_12_2.png -------------------------------------------------------------------------------- /doc/images/outliers_13_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_13_2.png -------------------------------------------------------------------------------- /doc/images/outliers_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_15_0.png -------------------------------------------------------------------------------- /doc/images/outliers_19_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_19_0.png -------------------------------------------------------------------------------- /doc/images/outliers_22_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_22_2.png -------------------------------------------------------------------------------- /doc/images/outliers_27_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_27_0.png -------------------------------------------------------------------------------- /doc/images/outliers_5_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_5_0.png -------------------------------------------------------------------------------- /doc/images/outliers_7_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_7_2.png -------------------------------------------------------------------------------- /doc/images/outliers_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/outliers_9_0.png -------------------------------------------------------------------------------- /doc/images/parameters_13_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_1.png -------------------------------------------------------------------------------- /doc/images/parameters_13_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_2.png -------------------------------------------------------------------------------- /doc/images/parameters_13_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_3.png -------------------------------------------------------------------------------- /doc/images/parameters_13_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_4.png -------------------------------------------------------------------------------- /doc/images/parameters_13_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_5.png -------------------------------------------------------------------------------- /doc/images/parameters_13_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_6.png -------------------------------------------------------------------------------- /doc/images/parameters_13_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_13_7.png -------------------------------------------------------------------------------- /doc/images/parameters_16_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_16_1.png -------------------------------------------------------------------------------- /doc/images/parameters_16_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_16_2.png -------------------------------------------------------------------------------- /doc/images/parameters_16_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_16_3.png -------------------------------------------------------------------------------- /doc/images/parameters_16_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_16_4.png -------------------------------------------------------------------------------- /doc/images/parameters_16_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_16_5.png -------------------------------------------------------------------------------- /doc/images/parameters_16_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_16_6.png -------------------------------------------------------------------------------- /doc/images/parameters_19_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_19_1.png -------------------------------------------------------------------------------- /doc/images/parameters_21_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_21_1.png -------------------------------------------------------------------------------- /doc/images/parameters_32_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_32_1.png -------------------------------------------------------------------------------- /doc/images/parameters_32_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_32_2.png -------------------------------------------------------------------------------- /doc/images/parameters_32_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_32_3.png -------------------------------------------------------------------------------- /doc/images/parameters_32_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_32_4.png -------------------------------------------------------------------------------- /doc/images/parameters_32_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_32_5.png -------------------------------------------------------------------------------- /doc/images/parameters_8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/parameters_8_1.png -------------------------------------------------------------------------------- /doc/images/performance_14_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/performance_14_1.png -------------------------------------------------------------------------------- /doc/images/performance_15_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/performance_15_1.png -------------------------------------------------------------------------------- /doc/images/performance_17_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/performance_17_1.png -------------------------------------------------------------------------------- /doc/images/performance_18_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/performance_18_1.png -------------------------------------------------------------------------------- /doc/images/performance_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/performance_20_1.png -------------------------------------------------------------------------------- /doc/images/performance_21_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/performance_21_1.png -------------------------------------------------------------------------------- /doc/images/pixplot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/pixplot.png -------------------------------------------------------------------------------- /doc/images/plotting_10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_10_1.png -------------------------------------------------------------------------------- /doc/images/plotting_12_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_12_1.png -------------------------------------------------------------------------------- /doc/images/plotting_14_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_14_1.png -------------------------------------------------------------------------------- /doc/images/plotting_19_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_19_2.png -------------------------------------------------------------------------------- /doc/images/plotting_21_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_21_2.png -------------------------------------------------------------------------------- /doc/images/plotting_32_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_32_2.png 
-------------------------------------------------------------------------------- /doc/images/plotting_34_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_34_2.png -------------------------------------------------------------------------------- /doc/images/plotting_38_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_38_1.png -------------------------------------------------------------------------------- /doc/images/plotting_40_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_40_1.png -------------------------------------------------------------------------------- /doc/images/plotting_42_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_42_0.png -------------------------------------------------------------------------------- /doc/images/plotting_44_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_44_1.png -------------------------------------------------------------------------------- /doc/images/plotting_8_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/plotting_8_2.png -------------------------------------------------------------------------------- /doc/images/population_umap.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/population_umap.jpg -------------------------------------------------------------------------------- /doc/images/precomputed_k-nn11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/precomputed_k-nn11.png -------------------------------------------------------------------------------- /doc/images/precomputed_k-nn13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/precomputed_k-nn13.png -------------------------------------------------------------------------------- /doc/images/precomputed_k-nn17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/precomputed_k-nn17.png -------------------------------------------------------------------------------- /doc/images/precomputed_k-nn6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/precomputed_k-nn6.png -------------------------------------------------------------------------------- /doc/images/pumap-only.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/pumap-only.png -------------------------------------------------------------------------------- /doc/images/reproducibility_10_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/reproducibility_10_1.png -------------------------------------------------------------------------------- /doc/images/reproducibility_14_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/reproducibility_14_1.png -------------------------------------------------------------------------------- /doc/images/reproducibility_18_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/reproducibility_18_1.png -------------------------------------------------------------------------------- /doc/images/reproducibility_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/reproducibility_6_1.png -------------------------------------------------------------------------------- /doc/images/retrain_pumap_emb_x1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/retrain_pumap_emb_x1.png -------------------------------------------------------------------------------- /doc/images/retrain_pumap_emb_x2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/retrain_pumap_emb_x2.png -------------------------------------------------------------------------------- /doc/images/retrain_pumap_history.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/retrain_pumap_history.png -------------------------------------------------------------------------------- /doc/images/retrain_pumap_p_emb_x1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/retrain_pumap_p_emb_x1.png -------------------------------------------------------------------------------- /doc/images/retrain_pumap_p_emb_x2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/retrain_pumap_p_emb_x2.png -------------------------------------------------------------------------------- /doc/images/retrain_pumap_summary_2_removed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/retrain_pumap_summary_2_removed.png -------------------------------------------------------------------------------- /doc/images/simplices.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/simplices.png -------------------------------------------------------------------------------- /doc/images/single_cell_umap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/single_cell_umap.jpg -------------------------------------------------------------------------------- /doc/images/sparse_11_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/sparse_11_1.png -------------------------------------------------------------------------------- /doc/images/sparse_18_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/sparse_18_0.png -------------------------------------------------------------------------------- /doc/images/sparse_31_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/sparse_31_1.png -------------------------------------------------------------------------------- /doc/images/sparse_35_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/sparse_35_0.png -------------------------------------------------------------------------------- /doc/images/structure_recent_phil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/structure_recent_phil.png -------------------------------------------------------------------------------- /doc/images/syllabus_galaxy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/syllabus_galaxy.png -------------------------------------------------------------------------------- /doc/images/time_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/time_cluster.png 
-------------------------------------------------------------------------------- /doc/images/umap-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/umap-loss.png -------------------------------------------------------------------------------- /doc/images/umap-only.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/umap-only.png -------------------------------------------------------------------------------- /doc/images/umap_explorer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/umap_explorer.png -------------------------------------------------------------------------------- /doc/images/umap_primes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/umap_primes.png -------------------------------------------------------------------------------- /doc/images/umap_surrey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/umap_surrey.png -------------------------------------------------------------------------------- /doc/images/umap_vae_pca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/images/umap_vae_pca.png -------------------------------------------------------------------------------- /doc/index.rst: 
-------------------------------------------------------------------------------- 1 | .. umap documentation master file, created by 2 | sphinx-quickstart on Fri Jun 8 10:09:40 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: logo_large.png 7 | :width: 600 8 | :align: center 9 | 10 | UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction 11 | =========================================================================== 12 | 13 | Uniform Manifold Approximation and Projection (UMAP) is a dimension reduction 14 | technique that can be used for visualisation similarly to t-SNE, but also for 15 | general non-linear dimension reduction. The algorithm is founded on three 16 | assumptions about the data 17 | 18 | 1. The data is uniformly distributed on Riemannian manifold; 19 | 2. The Riemannian metric is locally constant (or can be approximated as such); 20 | 3. The manifold is locally connected. 21 | 22 | From these assumptions it is possible to model the manifold with a fuzzy 23 | topological structure. The embedding is found by searching for a low dimensional 24 | projection of the data that has the closest possible equivalent fuzzy 25 | topological structure. 26 | 27 | The details for the underlying mathematics can be found in 28 | `our paper on ArXiv `_: 29 | 30 | McInnes, L, Healy, J, *UMAP: Uniform Manifold Approximation and Projection 31 | for Dimension Reduction*, ArXiv e-prints 1802.03426, 2018 32 | 33 | You can find the software `on github `_. 34 | 35 | **Installation** 36 | 37 | Conda install, via the excellent work of the conda-forge team: 38 | 39 | .. code:: bash 40 | 41 | conda install -c conda-forge umap-learn 42 | 43 | The conda-forge packages are available for linux, OS X, and Windows 64 bit. 44 | 45 | PyPI install, presuming you have numba and sklearn and all its requirements 46 | (numpy and scipy) installed: 47 | 48 | .. 
code:: bash 49 | 50 | pip install umap-learn 51 | 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | :caption: User Guide / Tutorial: 56 | 57 | basic_usage 58 | parameters 59 | plotting 60 | reproducibility 61 | transform 62 | inverse_transform 63 | parametric_umap 64 | transform_landmarked_pumap 65 | sparse 66 | supervised 67 | clustering 68 | outliers 69 | composing_models 70 | densmap_demo 71 | mutual_nn_umap 72 | document_embedding 73 | embedding_space 74 | aligned_umap_basic_usage 75 | aligned_umap_politics_demo 76 | precomputed_k-nn 77 | benchmarking 78 | release_notes 79 | faq 80 | 81 | .. toctree:: 82 | :maxdepth: 2 83 | :caption: Background on UMAP: 84 | 85 | how_umap_works 86 | performance 87 | 88 | .. toctree:: 89 | :maxdepth: 2 90 | :caption: Examples of UMAP usage 91 | 92 | interactive_viz 93 | exploratory_analysis 94 | scientific_papers 95 | nomic_atlas_umap_of_text_embeddings 96 | nomic_atlas_visualizing_mnist_training 97 | 98 | .. toctree:: 99 | :caption: API Reference: 100 | 101 | api 102 | 103 | 104 | 105 | Indices and tables 106 | ================== 107 | 108 | * :ref:`genindex` 109 | * :ref:`modindex` 110 | * :ref:`search` 111 | -------------------------------------------------------------------------------- /doc/interactive_viz.rst: -------------------------------------------------------------------------------- 1 | Interactive Visualizations 2 | ========================== 3 | 4 | UMAP has found use in a number of interesting interactive visualization projects, analyzing everything from 5 | images from photo archives, to word embedding, animal point clouds, and even sound. Sometimes it has also 6 | been used in interesting interactive tools that simply help a user to get an intuition for what the algorithm 7 | is doing (by applying it to intuitive 3D data). Below are some amazing projects that make use of UMAP. 8 | 9 | UMAP Zoo 10 | -------- 11 | An exploration of how UMAP behaves when dimension reducing point clouds of animals. 
It is 12 | interactive, letting you switch between 2D and 3D representations and has a wide selection 13 | of different animals. Attempting to guess the animal from the 2D UMAP representation is a 14 | fun game. In practice this tool can go a long way to helping to build at least some intuitions 15 | for what UMAP tends to do with data. 16 | 17 | .. image:: images/UMAP_zoo.png 18 | :width: 400px 19 | 20 | `UMAP Zoo `__ 21 | 22 | Thanks to Douglas Duhaime. 23 | 24 | Tensorflow Embedding Projector 25 | ------------------------------ 26 | If you just want to explore UMAP embeddings of datasets then the Embedding Projector 27 | from Tensorflow is a great way to do that. As well as having a good interactive 3D view 28 | it also has facilities for inspecting and searching labels and tags on the data. By default 29 | it loads up word2vec vectors, but you can upload any data you wish. You can then select 30 | the UMAP option among the tabs for embeddings choices (alongside PCA and t-SNE). 31 | 32 | .. image:: images/embedding_projector.png 33 | :width: 400px 34 | 35 | `Embedding Projector `__ 36 | 37 | Thanks to Andy Coenen and the Embedding Projector team. 38 | 39 | PixPlot 40 | ------- 41 | PixPlot provides an overview of large photo-collections. In the demonstration app 42 | from Yale's Digital Humanities lab it provides a window on the Meserve-Kunhardt Collection 43 | of historical photographs. The approach uses convolutional neural nets to reduce the images 44 | to 2048 dimensions, and then uses UMAP to present them in a 2-dimensional map which the 45 | user can interactive pan and zoom around in. This process results in similar photos 46 | ending up in similar regions of the map allowing for easy perusal of large photo 47 | collections. The PixPlot project is also available on github in case you wish to train 48 | it on your own photo collection. 49 | 50 | .. 
image:: images/pixplot.png 51 | :width: 400px 52 | 53 | `PixPlot `__ 54 | 55 | Thanks to Douglas Duhaime and the Digital Humanities lab at Yale. 56 | 57 | UMAP Explorer 58 | ------------- 59 | A great demonstration of building a web based app for interactively exploring a UMAP embedding. 60 | In this case it provides an exploration of UMAP run on the MNIST digits dataset. Each point in 61 | the embedding is rendered as the digit image, and coloured according to the digit class. Mousing 62 | over the images will make them larger and provide a view of the digit in the upper left. You can also pan 63 | and zoom around the emebdding to get a better understanding of how UMAP has mapped the different styles of 64 | handwritten digits down to 2 dimensions. 65 | 66 | .. image:: images/umap_explorer.png 67 | :width: 400px 68 | 69 | `UMAP Explorer `__ 70 | 71 | Thanks to Grant Custer. 72 | 73 | Audio Explorer 74 | -------------- 75 | The Audio Explorer uses UMAP to embed sound samples into a 2 dimensional space for easy exploration. 76 | The goal here is to take a large library of sounds samples and put similar sounds in similar regions 77 | of the map, allowing a user to quickly mouse over and listen to various variations of a given sample 78 | to quickly find exactly the right sound sample to use. Audio explorer uses MFCCs and/or WaveNet to 79 | provide an initial useful vector representation of the sound samples, before applying UMAP to 80 | generate the 2D embedding. 81 | 82 | .. image:: images/audio_explorer.png 83 | :width: 400px 84 | 85 | `Audio Explorer `__ 86 | 87 | Thanks to Leon Fedden. 88 | 89 | Orion Search 90 | ------------ 91 | Orion is an open source research measurement and knowledge discovery tool that enables you to monitor 92 | progress in science, visually explore the scientific landscape and search for relevant publications. 
93 | Orion encodes bioRxiv paper abstracts to dense vectors with Sentence Transformers and projects them to 94 | an interactive 3D visualisation with UMAP. You can filter the UMAP embeddings by topic and country. 95 | You can also select a subset of the UMAP embeddings and retrieve those papers and their metadata. 96 | 97 | .. image:: images/orion_particles.png 98 | :width: 400px 99 | 100 | `Orion Search `__ 101 | 102 | Thanks to Kostas Stathoulopoulos, Zac Ioannidis and Lilia Villafuerte. 103 | 104 | Exploring Fashion MNIST 105 | ----------------------- 106 | A web based interactive exploration of a 3D UMAP embedding ran on the Fashion MNIST dataset. Users can 107 | freely navigate the 3D space, jumping to a specific image by clicking an image or entering an image id. 108 | Like Grant Custer's UMAP Explorer, each point is rendered as the actual image and colored according to 109 | the label. It is also similar to the Tensorflow Embedding Projector, but designed more specifically for 110 | Fashion MNIST, thus more efficient and capable of showing all the 70k images. 111 | 112 | .. image:: images/exploring_fashion_mnist.png 113 | :width: 400px 114 | 115 | `Exploring Fashion MNIST `__ 116 | 117 | Thanks to stwind. 118 | 119 | ESM Metagenomic Atlas 120 | --------------------- 121 | The ESM Metagenomic Atlas contains over 600 million predicted protein structures, revealing the 122 | metagenomic world in a way we have never seen before. The Explore page visualizes a sample of 1 123 | million of these. (That’s about how much a browser can handle.) We represent each protein in this 124 | dataset as a single point, and reveal the actual protein structure when zooming in or when hovering 125 | over it. The color of each point corresponds to the similarity to the closest match we could find in 126 | UniRef90, the reference database of known protein sequences. 
The position in the map is a 127 | two-dimensional projection, which groups sequences by similarity, as determined by our language 128 | model’s internal representation. The map reveals structure at different scales: local neighbors in 129 | the same cluster tend to have similar structures, while nearby clusters preserve certain patterns 130 | like secondary structure elements. 131 | 132 | .. image:: images/ESM_metagenomic_atlas.png 133 | :width: 400px 134 | 135 | Thanks to the authors of "Evolutionary-scale prediction of atomic level protein structure 136 | with a language model". 137 | 138 | `ESM Metagenomic Atlas `__ 139 | 140 | Interactive UMAP with Nomic Atlas 141 | --------------------------------- 142 | 143 | `Nomic Atlas `_ is a platform for interactively visualizing and exploring massive datasets. It automates the creation of embeddings and 2D coordinate projections using UMAP. 144 | 145 | .. image:: https://assets.nomicatlas.com/mnist-training-embeddings-umap-short.gif 146 | :alt: UMAP interactive visualization with Nomic Atlas 147 | :align: center 148 | :width: 600 149 | 150 | Atlas provides: 151 | 152 | * In-browser analysis of your UMAP data with the `Atlas Analyst `_ 153 | * Vector search over your UMAP data using the `Nomic API `_ 154 | * Interactive features like zooming, recoloring, searching, and filtering in the `Nomic Atlas data map `_ 155 | * Scalability for millions of data points 156 | * Rich information display on hover 157 | * Shareable UMAPs via URL links to your embeddings and data maps in Atlas 158 | 159 | ----------- 160 | 161 | .. 
toctree:: 162 | :maxdepth: 1 163 | :caption: Nomic Atlas Examples 164 | 165 | nomic_atlas_umap_of_text_embeddings 166 | nomic_atlas_visualizing_mnist_training_dynamics 167 | -------------------------------------------------------------------------------- /doc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/logo.png -------------------------------------------------------------------------------- /doc/logo_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/doc/logo_large.png -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=umap 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /doc/mutual_nn_umap.rst: -------------------------------------------------------------------------------- 1 | Improving the Separation Between Similar Classes Using a Mutual k-NN Graph 2 | ========================================================================== 3 | 4 | This post briefly explains how the connectivity of the original graphical representation can adversely affect the resulting UMAP embeddings. 5 | 6 | In default UMAP, a weighted k nearest neighbor (k-NN) graph, which connects each 7 | datapoint to its 𝑘 nearest neighbors based on some distance metric, is constructed 8 | and used to generate the initial topological representation of a dataset. 9 | 10 | However, previous research has shown that using a weighted k-NN 11 | graph may not provide an accurate representation of the underlying local 12 | structure for a high dimensional dataset. The k-NN graph is relatively susceptible 13 | to the “curse of dimensionality” and the associated distance concentration 14 | effect, where distances are similar in high dimensions, as well as the 15 | hub effect, where certain points become highly influential when highly 16 | connected. This skews the local representation of high dimensional data, 17 | deteriorating its performance for various similarity-based machine learning 18 | tasks. 
19 | 20 | A recent paper titled 21 | `Clustering with UMAP: Why and How Connectivity Matters `__ 22 | proposes a refinement in the graph construction stage of the UMAP algorithm 23 | that uses a weighted mutual k-NN graph rather than it vanilla counterpart, 24 | to reduce the undesired distance concentration and hub effects. 25 | 26 | Mutual k-NN graphs have been shown to contain many 27 | desirable properties when combating the “curse of dimensionality” as discussed in 28 | `this paper `__ . However, one pitfall of using a 29 | mutual k-NN graph over the original k-NN graph is that it often 30 | contains disconnected components and potential isolated vertices. 31 | 32 | This violates one of UMAP primary assumptions that "The manifold is locally connected." To 33 | combat the issue of isolated components, the authors consider different methods that have 34 | been previously used to augment and increase the connectivity of the mutual k-NN graph: 35 | 36 | 1. ``NN``: To minimally connect isolated vertices and satisfy the assumption that the underlying manifold is locally connected, we add an undirected edge between each isolated vertex and its original nearest neighbor (de Sousa, Rezende, and Batista 2013).Note that the resulting graph may still contain disconnected components. 37 | 2. ``MST-min``: To achieve a connected graph, add the minimum number of edges from a maximum spanning tree to the mutual k-NN graph that has been weighted with similarity-based metrics(Ozaki et al. 2011). We adapt this by calculating the minimum spanning tree for distances. 38 | 3. ``MST-all``: Adding all the edges of the MST. 39 | 40 | .. image:: images/mutual_nn_umap_connectivity.png 41 | 42 | They also different ways to obtain the new local neighborhood for each point ``x_i``: 43 | 44 | 1. ``Adjacent Neighbors``: Only consider neighbors that are directly connected(adjacent) to ``x_i`` in the connected mutual k-NN graph. 45 | 2. 
``Path Neighbors``: Using shortest path distance to find the new k closest points to ``x_i`` with respect to the connected mutual k-NN graph. This shortest path distance can be considered a new distance metric as it directly aligns with UMAP’s definition of an extended pseudo-metric space. 46 | 47 | .. image:: images/mutual_nn_umap_lc.png 48 | :width: 600 49 | :align: center 50 | 51 | 52 | Visualizing the Results 53 | ---------------------------------------------- 54 | To see the differences between using a mutual k-NN graph vs the original k-NN graph as 55 | the starting topology for UMAP, let's visualize the 2D projections generated for MNIST, FMNIST, and 20 56 | NG Count Vectors using each of the discussed methods. For all code snippets to reproduce the results and visualizations, please refer 57 | to this `Github repo `__. Will be adding this soon as a 58 | mode to the original implementation. 59 | 60 | We’ll start with MNIST digits, a collection of 70,000 gray-scale images of hand-written digits: 61 | 62 | .. image:: images/mutual_nn_umap_MNIST.png 63 | :width: 850 64 | :align: center 65 | 66 | In general, for most of the mutual k-NN graph based vectors, there 67 | is a better separation between similar classes than the original UMAP vectors 68 | regardless of connectivity (NN, MST variants). Connecting isolated vertices in 69 | the mutual k-NN graph to their original nearest neighbor produced the desired 70 | separation between similar classes such as with the 4, 7, 9 in MNIST. This follows 71 | our intuition given that mutual k-NN graphs have previously been shown as a useful 72 | method for removing edges between points that are only loosely similar. 73 | 74 | Similar results are observed for the Fashion-MNIST(FMNIST) dataset, a collection of 70,000 75 | gray-scale images of fashion items: 76 | 77 | .. 
image:: images/mutual_nn_umap_FMNIST.png 78 | :width: 850 79 | :align: center 80 | 81 | For the FMNIST dataset, the vectors using the aforementioned methods preserve 82 | the global structure between clothing classes (T-shirt/top, Coat, Trouser, and etc.) 83 | from footwear classes (Sandal, Sneaker, Ankle-boot) while also depicting a clearer 84 | separation between the footwear classes. This is contrasted with original 85 | UMAP which has poorer separation between similar classes like the footwear classes. 86 | 87 | For both MNIST and FMNIST, NN which naively connects isolated vertices 88 | to their nearest neighbor had multiple small clusters of points scattered 89 | throughout the vector space. This makes sense given using NN for connectivity can 90 | still cause the resulting manifold to be broken into many small components. 91 | 92 | It would be fair to assume that augmenting the mutual k-NN graph with a "higher connectivity" 93 | would always be better as it reduces random scattering of points. However, 94 | too much connectivity such as with MST-all can also hurt which is further discussed in the paper. 95 | 96 | Finally, we depict the embeddings generated using the 20 newsgroup dataset, a collection of 97 | 18846 documents, transformed using sklearns CountVectorizer: 98 | 99 | .. image:: images/mutual_nn_umap_20ngc.png 100 | :width: 850 101 | :align: center 102 | 103 | We can see there is better distinction between similar subjects such as the recreation 104 | (rec) topics. 105 | 106 | Visually, the vector generated using the Adjacent Neighbors 107 | and MST-min result in disperse dense clusters of points e.g, the footwear classes in 108 | FMNIST and the recreation topics in 20 NG. However for Path Neighbors, the groups of 109 | points belonging to the same class are less dispersed. This is because Adjacent Neighbors are not guaranteed to have k connected neighbors for each local 110 | neighborhood. 
Points with smaller neighborhoods will be close to primarily few adjacent 111 | neighbors and repelled further away from the other points. 112 | 113 | To evaluate these methods quantitatively, the authors compare the clustering performance 114 | of the resulting low dimensional vectors generated. Below shows the Normalised Mutual 115 | Information NMI results after performing KMeans(for more information of the results please refer to `the full 116 | paper `__). 117 | 118 | .. image:: images/mutual_nn_umap_results.png 119 | 120 | These quantitative experiments show that MST variants combined with Path 121 | Neighbors can help produce better clustering results and how the initialization 122 | of a weighted connected graph is critical to the success of topology based 123 | dimensionality reduction methods like UMAP. 124 | 125 | 126 | Citing our work 127 | --------------- 128 | If you use this implementation or reference the results in your work, please cite the paper: 129 | 130 | .. code:: bibtex 131 | 132 | @article{Dalmia2021UMAPConnectivity, 133 | author={Ayush Dalmia and Suzanna Sia}, 134 | title={Clustering with {UMAP:} Why and How Connectivity Matters}, 135 | journal={CoRR}, 136 | volume={abs/2108.05525}, 137 | year={2021}, 138 | url={https://arxiv.org/abs/2108.05525}, 139 | eprinttype={arXiv}, 140 | eprint={2108.05525}, 141 | timestamp={Wed, 18 Aug 2021 19:45:42 +0200}, 142 | biburl={https://dblp.org/rec/journals/corr/abs-2108-05525.bib}, 143 | bibsource={dblp computer science bibliography, https://dblp.org} 144 | } 145 | -------------------------------------------------------------------------------- /doc/nomic_atlas_umap_of_text_embeddings.rst: -------------------------------------------------------------------------------- 1 | UMAP of Text Embeddings with Nomic Atlas 2 | ======================= 3 | 4 | `Nomic Atlas `_ is a platform for interactively visualizing and exploring massive datasets. 
It automates the creation of embeddings and 2D coordinate projections using UMAP. 5 | 6 | .. image:: https://assets.nomicatlas.com/airline-reviews-umap.gif 7 | :alt: UMAP interactive visualization with Nomic Atlas 8 | :align: center 9 | :width: 600 10 | 11 | Nomic Atlas automatically generates embeddings for your data and allows you to explore large datasets in a web browser. Atlas provides: 12 | 13 | * In-browser analysis of your UMAP data with the `Atlas Analyst `_ 14 | * Vector search over your UMAP data using the `Nomic API `_ 15 | * Interactive features like zooming, recoloring, searching, and filtering in the `Nomic Atlas data map `_ 16 | * Scalability for millions of data points 17 | * Rich information display on hover 18 | * Shareable UMAPs via URL links to your embeddings and data maps in Atlas 19 | 20 | This example demonstrates how to use `Nomic Atlas `_ to create interactive maps of text using embeddings and UMAP. 21 | 22 | Setup 23 | ----- 24 | 25 | 1. Get the required python packages with ``pip instll nomic pandas`` 26 | 2. Get a Nomic API key `here `_ 27 | 3. Run ``nomic login nk-...`` in a terminal window or use the following code: 28 | 29 | .. code:: python3 30 | 31 | import nomic 32 | nomic.login('nk-...') 33 | 34 | 35 | 36 | Download Example Data 37 | -------------------- 38 | 39 | .. code:: python3 40 | 41 | import pandas as pd 42 | 43 | # Example data 44 | df = pd.read_csv("https://docs.nomic.ai/singapore_airlines_reviews.csv") 45 | 46 | Create Atlas Dataset 47 | -------------------- 48 | 49 | .. code:: python3 50 | 51 | from nomic import AtlasDataset 52 | dataset = AtlasDataset("airline-reviews-data") 53 | 54 | Upload to Atlas 55 | --------------- 56 | 57 | .. code:: python3 58 | 59 | dataset.add_data(df) 60 | 61 | Create Data Map 62 | --------------- 63 | 64 | We specify the ``text`` field from ``df`` as the field to create embeddings from. We choose some standard UMAP parameters as well. 65 | 66 | .. 
code:: python3 67 | 68 | from nomic.data_inference import ProjectionOptions 69 | 70 | # model="umap" is how you choose UMAP in Nomic Atlas 71 | # You can adjust n_neighbors, min_dist, 72 | # and n_epochs as you would with the UMAP library. 73 | atlas_map = dataset.create_index( 74 | indexed_field='text', 75 | projection=ProjectionOptions( 76 | model="umap", 77 | n_neighbors=20, 78 | min_dist=0.01, 79 | n_epochs=200 80 | ) 81 | ) 82 | 83 | print(f"Explore your interactive map at: {atlas_map.map_link}") 84 | 85 | Your map will be available in your `Atlas Dashboard `_. -------------------------------------------------------------------------------- /doc/plotting_example_interactive.py: -------------------------------------------------------------------------------- 1 | import sklearn.datasets 2 | import pandas as pd 3 | import numpy as np 4 | import umap 5 | import umap.plot 6 | 7 | fmnist = sklearn.datasets.fetch_openml("Fashion-MNIST") 8 | 9 | mapper = umap.UMAP().fit(fmnist.data[:30000]) 10 | 11 | hover_data = pd.DataFrame({"index": np.arange(30000), "label": fmnist.target[:30000]}) 12 | hover_data["item"] = hover_data.label.map( 13 | { 14 | "0": "T-shirt/top", 15 | "1": "Trouser", 16 | "2": "Pullover", 17 | "3": "Dress", 18 | "4": "Coat", 19 | "5": "Sandal", 20 | "6": "Shirt", 21 | "7": "Sneaker", 22 | "8": "Bag", 23 | "9": "Ankle Boot", 24 | } 25 | ) 26 | 27 | umap.plot.output_file("plotting_interactive_example.html") 28 | 29 | p = umap.plot.interactive( 30 | mapper, labels=fmnist.target[:30000], hover_data=hover_data, point_size=2 31 | ) 32 | umap.plot.show(p) 33 | -------------------------------------------------------------------------------- /doc/plotting_example_nomic_atlas.py: -------------------------------------------------------------------------------- 1 | from nomic import AtlasDataset 2 | from nomic.data_inference import ProjectionOptions 3 | import pandas as pd 4 | 5 | # Example data 6 | df = 
pd.read_csv("https://docs.nomic.ai/singapore_airlines_reviews.csv") 7 | 8 | dataset = AtlasDataset("example-dataset-airline-reviews") 9 | 10 | dataset.add_data(df) 11 | 12 | atlas_map = dataset.create_index( 13 | indexed_field="text", 14 | projection=ProjectionOptions( 15 | model="umap", n_neighbors=20, min_dist=0.01, n_epochs=200 16 | ), 17 | ) 18 | -------------------------------------------------------------------------------- /doc/release_notes.rst: -------------------------------------------------------------------------------- 1 | Release Notes 2 | ============= 3 | 4 | Some notes on new features in various releases 5 | 6 | What's new in 0.5 7 | ----------------- 8 | 9 | * ParametricUMAP learns embeddings with neural networks. 10 | * AlignedUMAP can align multiple embeddings using relations between datasets. 11 | * DensMAP can preserve local density information in embeddings. 12 | * UMAP now depends on PyNNDescent, but has faster more parallel performance as a result. 13 | * UMAP now supports an ``update`` method to add new data and retrain. 14 | * Various performance improvements and bug fixes. 15 | * Additional plotting support, including text searching in interactive plots. 16 | * Support for "maximal distances" in neighbor graphs. 17 | 18 | What's new in 0.4 19 | ----------------- 20 | 21 | * Inverse transform method. Generate points in the original space corresponding to points in embedded space. (Thanks to Joseph Courtney) 22 | * Different embedding spaces. Support for embedding to a variety of different spaces other than Euclidean. (Thanks to Joseph Courtney) 23 | * New metrics, including Hellinger distance for sparse count data. 24 | * New discrete/label metrics, including hierarchical categories, counts, ordinal data, and string edit distance. 25 | * Support for parallelism in neighbor search and layout optimization. (Thanks to Tom White) 26 | * Support for alternative methods to handling duplicated data samples. 
(Thanks to John Healy) 27 | * New plotting methods for fast and easy plots. 28 | * Initial support for dataframe embedding -- still experimental, but worth trying. 29 | * Support for transform methods with sparse data. 30 | * Multithreading support when no random seed is set. 31 | 32 | 33 | What's new in 0.3 34 | ----------------- 35 | 36 | * Supervised and semi-supervised dimension reduction. Support for using labels or partial labels for dimension reduction. 37 | * Transform method. Support for adding new unseen points to an existing embedding. 38 | * Performance improvements. 39 | 40 | 41 | What's new in 0.2 42 | ----------------- 43 | 44 | * A new layout algorithm that handles large datasets (more) correctly. 45 | * Performance improvements. -------------------------------------------------------------------------------- /doc/reproducibility.rst: -------------------------------------------------------------------------------- 1 | 2 | UMAP Reproducibility 3 | ==================== 4 | 5 | UMAP is a stochastic algorithm -- it makes use of randomness both to 6 | speed up approximation steps, and to aid in solving hard optimization 7 | problems. This means that different runs of UMAP can produce different 8 | results. UMAP is relatively stable -- thus the variance between runs 9 | should ideally be relatively small -- but different runs may have 10 | variations none the less. To ensure that results can be reproduced 11 | exactly UMAP allows the user to set a random seed state. 12 | 13 | Since version 0.4 UMAP also support multi-threading for faster 14 | performance; when performing optimization this exploits the fact that 15 | race conditions between the threads are acceptable within certain 16 | optimization phases. Unfortunately this means that the randomness in 17 | UMAP outputs for the multi-threaded case depends not only on the random 18 | seed input, but also on race conditions between threads during 19 | optimization, over which no control can be had. 
This means that 20 | multi-threaded UMAP results cannot be explicitly reproduced. 21 | 22 | In this tutorial we'll look at how UMAP can be used in multi-threaded 23 | mode for performance purposes, and alternatively how we can fix random 24 | states to ensure exact reproducibility at the cost of some performance. 25 | First let's load the relevant libraries and get some data; in this case 26 | the MNIST digits dataset. 27 | 28 | .. code:: python3 29 | 30 | import numpy as np 31 | import sklearn.datasets 32 | import umap 33 | import umap.plot 34 | 35 | .. code:: python3 36 | 37 | data, labels = sklearn.datasets.fetch_openml( 38 | 'mnist_784', version=1, return_X_y=True 39 | ) 40 | 41 | With data in hand let's run UMAP on it, and note how long it takes to 42 | run: 43 | 44 | .. code:: python3 45 | 46 | %%time 47 | mapper1 = umap.UMAP().fit(data) 48 | 49 | 50 | .. parsed-literal:: 51 | 52 | CPU times: user 3min 18s, sys: 3.84 s, total: 3min 22s 53 | Wall time: 1min 29s 54 | 55 | 56 | The thing to note here is that the "Wall time" is significantly smaller 57 | than the CPU time -- this means that multiple CPU cores were used. For 58 | this particular demonstration I am making use of the latest version of 59 | PyNNDescent for nearest neighbor search (UMAP will use it if you have it 60 | installed) which supports multi-threading as well. The result is a very 61 | fast fitting to the data that does an effective job of using several 62 | cores. If you are on a large server with many cores available and don't 63 | wish to use them *all* (which is the default situation) you can 64 | currently control the number of cores used by setting the numba 65 | environment variable ``NUMBA_NUM_THREADS``; see the `numba 66 | documentation `__ 67 | for more details. 68 | 69 | Now let's plot our result to see what the embedding looks like: 70 | 71 | .. code:: python3 72 | 73 | umap.plot.points(mapper1, labels=labels) 74 | 75 | 76 | .. 
image:: images/reproducibility_6_1.png 77 | 78 | 79 | Now, let's run UMAP again and compare the results to that of our first 80 | run. 81 | 82 | .. code:: python3 83 | 84 | %%time 85 | mapper2 = umap.UMAP().fit(data) 86 | 87 | 88 | .. parsed-literal:: 89 | 90 | CPU times: user 2min 53s, sys: 4.16 s, total: 2min 57s 91 | Wall time: 1min 5s 92 | 93 | 94 | You will note that this time we ran *even faster*. This is because 95 | during the first run numba was still JIT compiling some of the code in 96 | the background. In contrast, this time that work has already been done, 97 | so it no longer takes up any of our run-time. We see that we are still 98 | making use of mutliple cores well. 99 | 100 | Now let's plot the results of this second run and compare to the first: 101 | 102 | .. code:: python3 103 | 104 | umap.plot.points(mapper2, labels=labels) 105 | 106 | 107 | .. image:: images/reproducibility_10_1.png 108 | 109 | 110 | Qualitatively this looks very similar, but a little closer inspection 111 | will quickly show that the results are actually different between the 112 | runs. Note that even in versions of UMAP prior to 0.4 this would have 113 | been the case -- since we fixed no specific random seed, and were thus 114 | using the current random state of the system which will naturally differ 115 | between runs. This is the default behaviour, as is standard with sklearn 116 | estimators that are stochastic. Rather than having a default random seed 117 | the user is required to explicitly provide one should they want a 118 | reproducible result. As noted by Vito Zanotelli 119 | 120 | ... setting a random seed is like signing a waiver "I am aware that 121 | this is a stochastic algorithm and I have done sufficient tests to 122 | confirm that my main conclusions are not affected by this 123 | randomness". 124 | 125 | With that in mind, let's see what happens if we set an explicit 126 | ``random_state`` value: 127 | 128 | .. 
code:: python3 129 | 130 | %%time 131 | mapper3 = umap.UMAP(random_state=42).fit(data) 132 | 133 | 134 | .. parsed-literal:: 135 | 136 | CPU times: user 2min 27s, sys: 4.16 s, total: 2min 31s 137 | Wall time: 1min 56s 138 | 139 | 140 | The first thing to note that that this run took significantly longer 141 | (despite having all the functions JIT compiled by numba already). Then 142 | note that the Wall time and CPU times are now much closer to each other 143 | -- we are no longer exploiting multiple cores to anywhere near the same 144 | degree. This is because by setting a ``random_state`` we are effectively 145 | turning off any of the multi-threading that does not support explicit 146 | reproducibility. Let's plot the results: 147 | 148 | .. code:: python3 149 | 150 | umap.plot.points(mapper3, labels=labels) 151 | 152 | 153 | .. image:: images/reproducibility_14_1.png 154 | 155 | 156 | We arrive at much the same results as before from a qualitative point of 157 | view, but again inspection will show that there are some differences. 158 | More importantly this result should now be reproducible. Thus we can run 159 | UMAP again, with the same ``random_state`` set ... 160 | 161 | .. code:: python3 162 | 163 | %%time 164 | mapper4 = umap.UMAP(random_state=42).fit(data) 165 | 166 | 167 | .. parsed-literal:: 168 | 169 | CPU times: user 2min 26s, sys: 4.13 s, total: 2min 30s 170 | Wall time: 1min 54s 171 | 172 | 173 | Again, this takes longer than the earlier runs with no ``random_state`` 174 | set. However when we plot the results of the second run we see that they 175 | look not merely qualitatively similar, but instead appear to be almost 176 | identical: 177 | 178 | .. code:: python3 179 | 180 | umap.plot.points(mapper4, labels=labels) 181 | 182 | .. 
image:: images/reproducibility_18_1.png 183 | 184 | 185 | We can, in fact, check that the results are identical by verifying that 186 | each and every coordinate of the resulting embeddings match perfectly: 187 | 188 | .. code:: python3 189 | 190 | np.all(mapper3.embedding_ == mapper4.embedding_) 191 | 192 | 193 | .. parsed-literal:: 194 | 195 | True 196 | 197 | So we have, in fact, reproduced the embedding exactly. 198 | -------------------------------------------------------------------------------- /doc/scientific_papers.rst: -------------------------------------------------------------------------------- 1 | Scientific Papers 2 | ================= 3 | 4 | UMAP has been used in a wide variety of scientific publications from a diverse range of 5 | fields. Here we will highlight a small selection of papers that demonstrate both 6 | the depth of analysis, and breadth of subjects, UMAP can be used for. These range from biology, 7 | to machine learning, and even social science. 8 | 9 | 10 | The single-cell transcriptional landscape of mammalian organogenesis 11 | -------------------------------------------------------------------- 12 | A detailed look at the development of mouse embryos from a single-cell view. UMAP 13 | is used as a core piece of The Monocle3 software suite for identifying cell types 14 | and trajectories. This was a major paper in Nature, demonstrating the power 15 | of UMAP for large scale scientific endeavours. 16 | 17 | .. image:: images/organogenesis_paper.png 18 | :width: 400px 19 | 20 | `Link to the paper `__ 21 | 22 | A lineage-resolved molecular atlas of C. elegans embryogenesis at single-cell resolution 23 | ---------------------------------------------------------------------------------------- 24 | Still in the realm of single cell biology this paper looks at the developmental 25 | landscape of the round-word C. elegans. 
UMAP is used for detailed analysis of 26 | the developmental trajectories of cells, looking at global scales, and then 27 | digging down to look at individual organs. The result is an impressive 28 | array of UMAP visualisations that tease out ever finer structures in 29 | cellular development. 30 | 31 | .. image:: images/c_elegans_3d.jpg 32 | :width: 400px 33 | 34 | `Link to the paper `__ 35 | 36 | Exploring Neural Networks with Activation Atlases 37 | ------------------------------------------------- 38 | Understanding the image processing capabilities (and deficits!) of modern 39 | convolutional neural networks is a challenge. This interactive paper from 40 | Distill seeks to provide a way to "peek inside the black box" by looking 41 | at the activations throughout the network. By mapping this high dimensional 42 | data down to 2D with UMAP the authors can construct an "atlas" of how 43 | different images are perceived by the network. 44 | 45 | .. image:: images/activation_atlas.png 46 | :width: 400px 47 | 48 | `Link to the paper `__ 49 | 50 | TimeCluster: dimension reduction applied to temporal data for visual analytics 51 | ------------------------------------------------------------------------------ 52 | An interesting approach to time-series analysis, targeted toward cases where the 53 | time series has repeating patterns -- though no necessarily of a consistently 54 | periodic nature. The approach involves dimension reduction and clustering 55 | of sliding window blocks of the time-series. The result is a map where 56 | repeating behaviour is exposed as loop structures. This can be useful 57 | for both clustering similar blocks within a time-series, or finding 58 | outliers. 59 | 60 | .. 
image:: images/time_cluster.png 61 | :width: 400px 62 | 63 | `Link to the paper `__ 64 | 65 | Dimensionality reduction for visualizing single-cell data using UMAP 66 | -------------------------------------------------------------------- 67 | An early paper on applying UMAP to single-cell biology data. It looks at 68 | both, gene-expression data and flow-cytometry data, and compares UMAP to 69 | t-SNE both in terms of performance and quality of results. This is a good 70 | introduction to using UMAP for single-cell biology data. 71 | 72 | .. image:: images/single_cell_umap.jpg 73 | :width: 400px 74 | 75 | `Link to the paper `__ 76 | 77 | 78 | Revealing multi-scale population structure in large cohorts 79 | ----------------------------------------------------------- 80 | A paper looking at population genetics which uses UMAP as a means 81 | to visualise population structures. This produced some intriguing 82 | visualizations, and was one of the first of several papers taking 83 | this visualization approach. It also includes some novel visualizations 84 | using UMAP projections to 3D as RGB color specifications for 85 | data points, allowing the UMAP structure to be visualized in 86 | geographic maps based on where the samples were drawn from. 87 | 88 | .. image:: images/population_umap.jpg 89 | :width: 400px 90 | 91 | `Link to the paper `__ 92 | 93 | 94 | Understanding Vulnerability of Children in Surrey 95 | -------------------------------------------------- 96 | An example of the use of UMAP in sociological studies -- in this case 97 | looking at children in Surrey, British Columbia. Here UMAP is used as 98 | a tool to aid in general data analysis, and proves effective for the 99 | tasks to which it was put. 100 | 101 | .. 
image:: images/umap_surrey.png 102 | :width: 400px 103 | 104 | `Link to the paper `__ -------------------------------------------------------------------------------- /docs_requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=1.8 2 | sphinx_gallery 3 | matplotlib 4 | pillow 5 | sphinx_rtd_theme 6 | numpydoc 7 | scipy 8 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | Gallery of Examples of UMAP usage 2 | --------------------------------- 3 | 4 | A small gallery collection examples of UMAP usage. Do you 5 | have an interesting UMAP plot that uses publicly available 6 | data? Submit a pull request to have it added as an example! -------------------------------------------------------------------------------- /examples/digits/digits.py: -------------------------------------------------------------------------------- 1 | from bokeh.plotting import figure, output_file, show 2 | from bokeh.models import CategoricalColorMapper, ColumnDataSource 3 | from bokeh.palettes import Category10 4 | 5 | import umap 6 | from sklearn.datasets import load_digits 7 | 8 | digits = load_digits() 9 | embedding = umap.UMAP().fit_transform(digits.data) 10 | 11 | output_file("digits.html") 12 | 13 | targets = [str(d) for d in digits.target_names] 14 | 15 | source = ColumnDataSource( 16 | dict( 17 | x=[e[0] for e in embedding], 18 | y=[e[1] for e in embedding], 19 | label=[targets[d] for d in digits.target], 20 | ) 21 | ) 22 | 23 | cmap = CategoricalColorMapper(factors=targets, palette=Category10[10]) 24 | 25 | p = figure(title="test umap") 26 | p.circle( 27 | x="x", 28 | y="y", 29 | source=source, 30 | color={"field": "label", "transform": cmap}, 31 | legend="label", 32 | ) 33 | 34 | show(p) 35 | -------------------------------------------------------------------------------- 
/examples/inverse_transform_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn.datasets import fetch_openml 6 | 7 | import umap 8 | 9 | mnist = fetch_openml("Fashion-MNIST", version=1) 10 | 11 | 12 | trans = umap.UMAP( 13 | n_neighbors=10, 14 | random_state=42, 15 | metric="euclidean", 16 | output_metric="euclidean", 17 | init="spectral", 18 | verbose=True, 19 | ).fit(mnist.data) 20 | 21 | corners = np.array( 22 | [ 23 | [-5.1, 2.9], 24 | [-1.9, 6.4], 25 | [-5.4, -6.3], 26 | [8.3, 4.0], 27 | ] # 7 # 4 # 1 # 0 28 | ) 29 | 30 | test_pts = np.array( 31 | [ 32 | (corners[0] * (1 - x) + corners[1] * x) * (1 - y) 33 | + (corners[2] * (1 - x) + corners[3] * x) * y 34 | for y in np.linspace(0, 1, 10) 35 | for x in np.linspace(0, 1, 10) 36 | ] 37 | ) 38 | 39 | inv_transformed_points = trans.inverse_transform(test_pts) 40 | 41 | plt.scatter( 42 | trans.embedding_[:, 0], 43 | trans.embedding_[:, 1], 44 | c=mnist.target, 45 | cmap="Spectral", 46 | s=0.25, 47 | ) 48 | plt.colorbar(boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10)) 49 | plt.scatter(test_pts[:, 0], test_pts[:, 1], marker="x", c="k") 50 | 51 | fig, ax = plt.subplots(10, 10) 52 | for i in range(10): 53 | for j in range(10): 54 | ax[i, j].imshow( 55 | inv_transformed_points[i * 10 + j].reshape(28, 28), origin="upper" 56 | ) 57 | ax[i, j].get_xaxis().set_visible(False) 58 | ax[i, j].get_yaxis().set_visible(False) 59 | 60 | plt.show() 61 | -------------------------------------------------------------------------------- /examples/iris/iris.py: -------------------------------------------------------------------------------- 1 | from bokeh.plotting import figure, output_file, show 2 | from bokeh.models import CategoricalColorMapper, ColumnDataSource 3 | from bokeh.palettes import Category10 4 | 5 | import umap 6 | from sklearn.datasets import load_iris 7 | 8 | iris = 
load_iris() 9 | embedding = umap.UMAP( 10 | n_neighbors=50, learning_rate=0.5, init="random", min_dist=0.001 11 | ).fit_transform(iris.data) 12 | 13 | output_file("iris.html") 14 | 15 | 16 | targets = [str(d) for d in iris.target_names] 17 | 18 | source = ColumnDataSource( 19 | dict( 20 | x=[e[0] for e in embedding], 21 | y=[e[1] for e in embedding], 22 | label=[targets[d] for d in iris.target], 23 | ) 24 | ) 25 | 26 | cmap = CategoricalColorMapper(factors=targets, palette=Category10[10]) 27 | 28 | p = figure(title="Test UMAP on Iris dataset") 29 | p.circle( 30 | x="x", 31 | y="y", 32 | source=source, 33 | color={"field": "label", "transform": cmap}, 34 | legend="label", 35 | ) 36 | 37 | show(p) 38 | -------------------------------------------------------------------------------- /examples/mnist_torus_sphere_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import matplotlib.pyplot as plt 4 | import numba 5 | import numpy as np 6 | from mayavi import mlab 7 | from sklearn.datasets import load_digits 8 | from sklearn.model_selection import train_test_split 9 | 10 | import umap 11 | 12 | digits = load_digits() 13 | X_train, X_test, y_train, y_test = train_test_split( 14 | digits.data, digits.target, stratify=digits.target, random_state=42 15 | ) 16 | 17 | target_spaces = ["plane", "torus", "sphere"] 18 | 19 | if "plane" in target_spaces: 20 | # embed onto a plane 21 | 22 | trans = umap.UMAP( 23 | n_neighbors=10, 24 | random_state=42, 25 | metric="euclidean", 26 | output_metric="euclidean", 27 | init="spectral", 28 | verbose=True, 29 | ).fit(X_train) 30 | 31 | plt.scatter( 32 | trans.embedding_[:, 0], trans.embedding_[:, 1], c=y_train, cmap="Spectral" 33 | ) 34 | plt.show() 35 | 36 | if "torus" in target_spaces: 37 | # embed onto a torus 38 | # note: this is a topological torus, not a geometric torus. Think 39 | # Pacman, not donut. 
40 | 41 | @numba.njit(fastmath=True) 42 | def torus_euclidean_grad(x, y, torus_dimensions=(2 * np.pi, 2 * np.pi)): 43 | """Standard euclidean distance. 44 | 45 | ..math:: 46 | D(x, y) = \sqrt{\sum_i (x_i - y_i)^2} 47 | """ 48 | distance_sqr = 0.0 49 | g = np.zeros_like(x) 50 | for i in range(x.shape[0]): 51 | a = abs(x[i] - y[i]) 52 | if 2 * a < torus_dimensions[i]: 53 | distance_sqr += a**2 54 | g[i] = x[i] - y[i] 55 | else: 56 | distance_sqr += (torus_dimensions[i] - a) ** 2 57 | g[i] = (x[i] - y[i]) * (a - torus_dimensions[i]) / a 58 | distance = np.sqrt(distance_sqr) 59 | return distance, g / (1e-6 + distance) 60 | 61 | trans = umap.UMAP( 62 | n_neighbors=10, 63 | random_state=42, 64 | metric="euclidean", 65 | output_metric=torus_euclidean_grad, 66 | init="spectral", 67 | min_dist=0.15, # requires adjustment since the torus has limited space 68 | verbose=True, 69 | ).fit(X_train) 70 | 71 | mlab.clf() 72 | x, y, z = np.mgrid[-3:3:50j, -3:3:50j, -3:3:50j] 73 | 74 | # Plot a torus 75 | R = 2 76 | r = 1 77 | values = (R - np.sqrt(x**2 + y**2)) ** 2 + z**2 - r**2 78 | mlab.contour3d(x, y, z, values, color=(1.0, 1.0, 1.0), contours=[0]) 79 | 80 | # torus angles -> 3D 81 | x = (R + r * np.cos(trans.embedding_[:, 0])) * np.cos(trans.embedding_[:, 1]) 82 | y = (R + r * np.cos(trans.embedding_[:, 0])) * np.sin(trans.embedding_[:, 1]) 83 | z = r * np.sin(trans.embedding_[:, 0]) 84 | 85 | pts = mlab.points3d( 86 | x, y, z, y_train, colormap="spectral", scale_mode="none", scale_factor=0.1 87 | ) 88 | 89 | mlab.show() 90 | 91 | if "sphere" in target_spaces: 92 | # embed onto a sphere 93 | trans = umap.UMAP( 94 | n_neighbors=10, 95 | random_state=42, 96 | metric="euclidean", 97 | output_metric="haversine", 98 | init="spectral", 99 | min_dist=0.15, # requires adjustment since the sphere has limited space 100 | verbose=True, 101 | ).fit(X_train) 102 | 103 | mlab.clf() 104 | x, y, z = np.mgrid[-3:3:50j, -3:3:50j, -3:3:50j] 105 | 106 | # Plot a sphere 107 | r = 3 108 | values = 
x**2 + y**2 + z**2 - r**2 109 | mlab.contour3d(x, y, z, values, color=(1.0, 1.0, 1.0), contours=[0]) 110 | 111 | # latitude, longitude -> 3D 112 | x = r * np.sin(trans.embedding_[:, 0]) * np.cos(trans.embedding_[:, 1]) 113 | y = r * np.sin(trans.embedding_[:, 0]) * np.sin(trans.embedding_[:, 1]) 114 | z = r * np.cos(trans.embedding_[:, 0]) 115 | 116 | pts = mlab.points3d( 117 | x, y, z, y_train, colormap="spectral", scale_mode="none", scale_factor=0.2 118 | ) 119 | 120 | mlab.show() 121 | -------------------------------------------------------------------------------- /examples/mnist_transform_new_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | UMAP on the MNIST Digits dataset 5 | -------------------------------- 6 | 7 | A simple example demonstrating how to use UMAP on a larger 8 | dataset such as MNIST. We first pull the MNIST dataset and 9 | then use UMAP to reduce it to only 2-dimensions for 10 | easy visualisation. 11 | 12 | Note that UMAP manages to both group the individual digit 13 | classes, but also to retain the overall global structure 14 | among the different digit classes -- keeping 1 far from 15 | 0, and grouping triplets of 3,5,8 and 4,7,9 which can 16 | blend into one another in some cases. 
17 | """ 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | from sklearn.datasets import fetch_openml 21 | from sklearn.model_selection import train_test_split 22 | 23 | import umap 24 | 25 | sns.set(context="paper", style="white") 26 | 27 | mnist = fetch_openml("mnist_784", version=1) 28 | X_train, X_test, y_train, y_test = train_test_split( 29 | mnist.data, mnist.target, stratify=mnist.target, random_state=42 30 | ) 31 | 32 | reducer = umap.UMAP(random_state=42) 33 | embedding_train = reducer.fit_transform(X_train) 34 | embedding_test = reducer.transform(X_test) 35 | 36 | fig, ax = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(12, 10)) 37 | ax[0].scatter( 38 | embedding_train[:, 0], embedding_train[:, 1], c=y_train, cmap="Spectral" # , s=0.1 39 | ) 40 | ax[1].scatter( 41 | embedding_test[:, 0], embedding_test[:, 1], c=y_test, cmap="Spectral" # , s=0.1 42 | ) 43 | plt.setp(ax[0], xticks=[], yticks=[]) 44 | plt.setp(ax[1], xticks=[], yticks=[]) 45 | plt.suptitle("MNIST data embedded into two dimensions by UMAP", fontsize=18) 46 | ax[0].set_title("Training Set", fontsize=12) 47 | ax[1].set_title("Test Set", fontsize=12) 48 | plt.show() 49 | -------------------------------------------------------------------------------- /examples/plot_algorithm_comparison.py: -------------------------------------------------------------------------------- 1 | """ 2 | Comparison of Dimension Reduction Techniques 3 | -------------------------------------------- 4 | 5 | A comparison of several different dimension reduction 6 | techniques on a variety of toy datasets. The datasets 7 | are all toy datasets, but should provide a representative 8 | range of the strengths and weaknesses of the different 9 | algorithms. 10 | 11 | The time to perform the dimension reduction with each 12 | algorithm and each dataset is recorded in the lower 13 | right of each plot. 
14 | 15 | Things to note about the datasets: 16 | 17 | - Blobs: A set of five gaussian blobs in 10 dimensional 18 | space. This should be a prototypical example 19 | of something that should clearly separate 20 | even in a reduced dimension space. 21 | - Iris: a classic small dataset with one distinct class 22 | and two classes that are not clearly separated. 23 | - Digits: handwritten digits -- ideally different digit 24 | classes should form distinct groups. Due to 25 | the nature of handwriting digits may have several 26 | forms (crossed or uncrossed sevens, capped or 27 | straight line oes, etc.) 28 | - Wine: wine characteristics ideally used for a toy 29 | regression. Ultimately the data is essentially 30 | one dimensional in nature. 31 | - Swiss Roll: data is essentially a rectangle, but 32 | has been "rolled up" like a swiss roll 33 | in three dimensional space. Ideally a 34 | dimension reduction technique should 35 | be able to "unroll" it. The data 36 | has been coloured according to one dimension 37 | of the rectangle, so should form 38 | a rectangle of smooth color variation. 39 | - Sphere: the two dimensional surface of a three 40 | dimensional sphere. This cannot be represented 41 | accurately in two dimensions without tearing. 42 | The sphere has been coloured with hue around 43 | the equator and black to white from the south 44 | to north pole. 
45 | """ 46 | 47 | import numpy as np 48 | import matplotlib.pyplot as plt 49 | import seaborn as sns 50 | import time 51 | 52 | from sklearn import datasets, decomposition, manifold, preprocessing 53 | from colorsys import hsv_to_rgb 54 | 55 | import umap 56 | 57 | sns.set(context="paper", style="white") 58 | 59 | blobs, blob_labels = datasets.make_blobs( 60 | n_samples=500, n_features=10, centers=5, random_state=42 61 | ) 62 | iris = datasets.load_iris() 63 | digits = datasets.load_digits(n_class=10) 64 | wine = datasets.load_wine() 65 | swissroll, swissroll_labels = datasets.make_swiss_roll( 66 | n_samples=1000, noise=0.1, random_state=42 67 | ) 68 | sphere = np.random.normal(size=(600, 3)) 69 | sphere = preprocessing.normalize(sphere) 70 | sphere_hsv = np.array( 71 | [ 72 | ( 73 | (np.arctan2(c[1], c[0]) + np.pi) / (2 * np.pi), 74 | np.abs(c[2]), 75 | min((c[2] + 1.1), 1.0), 76 | ) 77 | for c in sphere 78 | ] 79 | ) 80 | sphere_colors = np.array([hsv_to_rgb(*c) for c in sphere_hsv]) 81 | 82 | reducers = [ 83 | (manifold.TSNE, {"perplexity": 50}), 84 | # (manifold.LocallyLinearEmbedding, {'n_neighbors':10, 'method':'hessian'}), 85 | (manifold.Isomap, {"n_neighbors": 30}), 86 | (manifold.MDS, {}), 87 | (decomposition.PCA, {}), 88 | (umap.UMAP, {"n_neighbors": 30, "min_dist": 0.3}), 89 | ] 90 | 91 | test_data = [ 92 | (blobs, blob_labels), 93 | (iris.data, iris.target), 94 | (digits.data, digits.target), 95 | (wine.data, wine.target), 96 | (swissroll, swissroll_labels), 97 | (sphere, sphere_colors), 98 | ] 99 | dataset_names = ["Blobs", "Iris", "Digits", "Wine", "Swiss Roll", "Sphere"] 100 | 101 | n_rows = len(test_data) 102 | n_cols = len(reducers) 103 | ax_index = 1 104 | ax_list = [] 105 | 106 | # plt.figure(figsize=(9 * 2 + 3, 12.5)) 107 | plt.figure(figsize=(10, 8)) 108 | plt.subplots_adjust( 109 | left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01 110 | ) 111 | for data, labels in test_data: 112 | for reducer, args in reducers: 113 | 
start_time = time.time() 114 | embedding = reducer(n_components=2, **args).fit_transform(data) 115 | elapsed_time = time.time() - start_time 116 | ax = plt.subplot(n_rows, n_cols, ax_index) 117 | if isinstance(labels[0], tuple): 118 | ax.scatter(*embedding.T, s=10, c=labels, alpha=0.5) 119 | else: 120 | ax.scatter(*embedding.T, s=10, c=labels, cmap="Spectral", alpha=0.5) 121 | ax.text( 122 | 0.99, 123 | 0.01, 124 | "{:.2f} s".format(elapsed_time), 125 | transform=ax.transAxes, 126 | size=14, 127 | horizontalalignment="right", 128 | ) 129 | ax_list.append(ax) 130 | ax_index += 1 131 | plt.setp(ax_list, xticks=[], yticks=[]) 132 | 133 | for i in np.arange(n_rows) * n_cols: 134 | ax_list[i].set_ylabel(dataset_names[i // n_cols], size=16) 135 | for i in range(n_cols): 136 | ax_list[i].set_xlabel(repr(reducers[i][0]()).split("(")[0], size=16) 137 | ax_list[i].xaxis.set_label_position("top") 138 | 139 | plt.tight_layout() 140 | plt.show() 141 | -------------------------------------------------------------------------------- /examples/plot_fashion-mnist_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | UMAP on the Fashion MNIST Digits dataset using Datashader 3 | --------------------------------------------------------- 4 | 5 | This is a simple example of using UMAP on the Fashion-MNIST 6 | dataset. The goal of this example is largely to demonstrate 7 | the use of datashader as an effective tool for visualising 8 | UMAP results. In particular datashader allows visualisation 9 | of very large datasets where overplotting can be a serious 10 | problem. It supports coloring by categorical variables 11 | (as shown in this example), or by continuous variables, 12 | or by density (as is common in datashader examples). 
13 | """ 14 | 15 | import umap 16 | import numpy as np 17 | import pandas as pd 18 | import requests 19 | import os 20 | import datashader as ds 21 | import datashader.utils as utils 22 | import datashader.transfer_functions as tf 23 | import matplotlib.pyplot as plt 24 | import seaborn as sns 25 | 26 | sns.set(context="paper", style="white") 27 | 28 | if not os.path.isfile("fashion-mnist.csv"): 29 | csv_data = requests.get("https://www.openml.org/data/get_csv/18238735/phpnBqZGZ") 30 | with open("fashion-mnist.csv", "w") as f: 31 | f.write(csv_data.text) 32 | source_df = pd.read_csv("fashion-mnist.csv") 33 | 34 | data = source_df.iloc[:, :784].values.astype(np.float32) 35 | target = source_df["class"].values 36 | 37 | pal = [ 38 | "#9e0142", 39 | "#d8434e", 40 | "#f67a49", 41 | "#fdbf6f", 42 | "#feeda1", 43 | "#f1f9a9", 44 | "#bfe5a0", 45 | "#74c7a5", 46 | "#378ebb", 47 | "#5e4fa2", 48 | ] 49 | color_key = {str(d): c for d, c in enumerate(pal)} 50 | 51 | reducer = umap.UMAP(random_state=42) 52 | embedding = reducer.fit_transform(data) 53 | 54 | df = pd.DataFrame(embedding, columns=("x", "y")) 55 | df["class"] = pd.Series([str(x) for x in target], dtype="category") 56 | 57 | cvs = ds.Canvas(plot_width=400, plot_height=400) 58 | agg = cvs.points(df, "x", "y", ds.count_cat("class")) 59 | img = tf.shade(agg, color_key=color_key, how="eq_hist") 60 | 61 | utils.export_image(img, filename="fashion-mnist", background="black") 62 | 63 | image = plt.imread("fashion-mnist.png") 64 | fig, ax = plt.subplots(figsize=(6, 6)) 65 | plt.imshow(image) 66 | plt.setp(ax, xticks=[], yticks=[]) 67 | plt.title( 68 | "Fashion MNIST data embedded\n" 69 | "into two dimensions by UMAP\n" 70 | "visualised with Datashader", 71 | fontsize=12, 72 | ) 73 | 74 | plt.show() 75 | -------------------------------------------------------------------------------- /examples/plot_feature_extraction_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 
| UMAP as a Feature Extraction Technique for Classification 3 | --------------------------------------------------------- 4 | 5 | The following script shows how UMAP can be used as a feature extraction 6 | technique to improve the accuracy on a classification task. It also shows 7 | how UMAP can be integrated in standard scikit-learn pipelines. 8 | 9 | The first step is to create a dataset for a classification task, which is 10 | performed with the function ``sklearn.datasets.make_classification``. The 11 | dataset is then split into a training set and a test set using the 12 | ``sklearn.model_selection.train_test_split`` function. 13 | 14 | Second, a linear SVM is fitted on the training set. To choose the best 15 | hyperparameters automatically, a gridsearch is performed on the training set. 16 | The performance of the model is then evaluated on the test set with the 17 | accuracy metric. 18 | 19 | Third, the previous step is repeated with a slight modification: UMAP is 20 | used as a feature extraction technique. This small change results in a 21 | substantial improvement compared to the model where raw data is used. 
22 | """ 23 | 24 | from sklearn.datasets import make_classification 25 | from sklearn.model_selection import train_test_split, GridSearchCV 26 | from sklearn.pipeline import Pipeline 27 | from sklearn.svm import LinearSVC 28 | 29 | from umap import UMAP 30 | 31 | # Make a toy dataset 32 | X, y = make_classification( 33 | n_samples=1000, 34 | n_features=300, 35 | n_informative=250, 36 | n_redundant=0, 37 | n_repeated=0, 38 | n_classes=2, 39 | random_state=1212, 40 | ) 41 | 42 | # Split the dataset into a training set and a test set 43 | X_train, X_test, y_train, y_test = train_test_split( 44 | X, y, test_size=0.2, random_state=42 45 | ) 46 | 47 | # Classification with a linear SVM 48 | svc = LinearSVC(dual=False, random_state=123) 49 | params_grid = {"C": [10**k for k in range(-3, 4)]} 50 | clf = GridSearchCV(svc, params_grid) 51 | clf.fit(X_train, y_train) 52 | print( 53 | "Accuracy on the test set with raw data: {:.3f}".format(clf.score(X_test, y_test)) 54 | ) 55 | 56 | # Transformation with UMAP followed by classification with a linear SVM 57 | umap = UMAP(random_state=456) 58 | pipeline = Pipeline([("umap", umap), ("svc", svc)]) 59 | params_grid_pipeline = { 60 | "umap__n_neighbors": [5, 20], 61 | "umap__n_components": [15, 25, 50], 62 | "svc__C": [10**k for k in range(-3, 4)], 63 | } 64 | 65 | 66 | clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline) 67 | clf_pipeline.fit(X_train, y_train) 68 | print( 69 | "Accuracy on the test set with UMAP transformation: {:.3f}".format( 70 | clf_pipeline.score(X_test, y_test) 71 | ) 72 | ) 73 | -------------------------------------------------------------------------------- /examples/plot_mnist_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | UMAP on the MNIST Digits dataset 3 | -------------------------------- 4 | 5 | A simple example demonstrating how to use UMAP on a larger 6 | dataset such as MNIST. 
We first pull the MNIST dataset and 7 | then use UMAP to reduce it to only 2-dimensions for 8 | easy visualisation. 9 | 10 | Note that UMAP manages to both group the individual digit 11 | classes, but also to retain the overall global structure 12 | among the different digit classes -- keeping 1 far from 13 | 0, and grouping triplets of 3,5,8 and 4,7,9 which can 14 | blend into one another in some cases. 15 | """ 16 | 17 | import umap 18 | from sklearn.datasets import fetch_openml 19 | import matplotlib.pyplot as plt 20 | import seaborn as sns 21 | 22 | sns.set(context="paper", style="white") 23 | 24 | mnist = fetch_openml("mnist_784", version=1) 25 | 26 | reducer = umap.UMAP(random_state=42) 27 | embedding = reducer.fit_transform(mnist.data) 28 | 29 | fig, ax = plt.subplots(figsize=(12, 10)) 30 | color = mnist.target.astype(int) 31 | plt.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap="Spectral", s=0.1) 32 | plt.setp(ax, xticks=[], yticks=[]) 33 | plt.title("MNIST data embedded into two dimensions by UMAP", fontsize=18) 34 | 35 | plt.show() 36 | -------------------------------------------------------------------------------- /images/densmap_example_mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/densmap_example_mnist.png -------------------------------------------------------------------------------- /images/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/iris.png -------------------------------------------------------------------------------- /images/mnist_digits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/mnist_digits.png 
-------------------------------------------------------------------------------- /images/sklearn_digits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/sklearn_digits.png -------------------------------------------------------------------------------- /images/umap_example_fashion_mnist1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/umap_example_fashion_mnist1.png -------------------------------------------------------------------------------- /images/umap_example_mnist1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/umap_example_mnist1.png -------------------------------------------------------------------------------- /images/umap_example_shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/images/umap_example_shuttle.png -------------------------------------------------------------------------------- /paper.bib: -------------------------------------------------------------------------------- 1 | @article{umap_arxiv, 2 | author = {{McInnes}, L. 
and {Healy}, J.}, 3 | title = "{UMAP: Uniform Manifold Approximation 4 | and Projection for Dimension Reduction}", 5 | journal = {ArXiv e-prints}, 6 | archivePrefix = "arXiv", 7 | eprint = {1802.03426}, 8 | primaryClass = "stat.ML", 9 | keywords = {Statistics - Machine Learning, 10 | Computer Science - Computational Geometry, 11 | Computer Science - Learning}, 12 | year = 2018, 13 | month = feb, 14 | } 15 | 16 | @online{umap_repo, 17 | author = {Leland McInnes and John Healy and Nathaniel Saul and Lukas Grossberger}, 18 | title = {UMAP}, 19 | year = 2018, 20 | url = {https://github.com/lmcinnes/umap}, 21 | urldate = {2018-07-22} 22 | } -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'UMAP: Uniform Manifold Approximation and Projection' 3 | tags: 4 | - manifold learning 5 | - dimension reduction 6 | - unsupervised learning 7 | authors: 8 | - name: Leland McInnes 9 | orcid: 0000-0003-2143-6834 10 | affiliation: 1 11 | - name: John Healy 12 | affiliation: 1 13 | - name: Nathaniel Saul 14 | affiliation: 2 15 | - name: Lukas Großberger 16 | affiliation: "3, 4" 17 | affiliations: 18 | - name: Tutte Institute for Mathematics and Computing 19 | index: 1 20 | - name: Department of Mathematics and Statistics, Washington State University 21 | index: 2 22 | - name: Ernst Strüngmann Institute for Neuroscience in cooperation with Max Planck Society 23 | index: 3 24 | - name: Donders Institute for Brain, Cognition and Behaviour, Radboud Universiteit 25 | index: 4 26 | date: 26 July 2018 27 | bibliography: paper.bib 28 | --- 29 | 30 | # Summary 31 | 32 | Uniform Manifold Approximation and Projection (UMAP) is a dimension reduction technique 33 | that can be used for visualisation similarly to t-SNE, but also for general non-linear 34 | dimension reduction. 
UMAP has a rigorous mathematical foundation, but is simple to use, 35 | with a scikit-learn compatible API. UMAP is among the fastest manifold learning 36 | implementations available -- significantly faster than most t-SNE implementations. 37 | 38 | UMAP supports a number of useful features, including the ability to use labels 39 | (or partial labels) for supervised (or semi-supervised) dimension reduction, 40 | and the ability to transform new unseen data into a pretrained embedding space. 41 | 42 | For details of the mathematical underpinnings see [@umap_arxiv]. The implementation 43 | can be found at [@umap_repo]. 44 | 45 | -![Fashion MNIST embedded via UMAP](images/umap_example_fashion_mnist1.png) 46 | 47 | # References 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from setuptools import setup 3 | 4 | 5 | def readme(): 6 | try: 7 | with open("README.rst", encoding="UTF-8") as readme_file: 8 | return readme_file.read() 9 | except TypeError: 10 | # Python 2.7 doesn't support encoding argument in builtin open 11 | import io 12 | 13 | with io.open("README.rst", encoding="UTF-8") as readme_file: 14 | return readme_file.read() 15 | 16 | 17 | configuration = { 18 | "name": "umap-learn", 19 | "version": "0.5.8", 20 | "description": "Uniform Manifold Approximation and Projection", 21 | "long_description": readme(), 22 | "long_description_content_type": "text/x-rst", 23 | "classifiers": [ 24 | "Development Status :: 3 - Alpha", 25 | "Intended Audience :: Science/Research", 26 | "Intended Audience :: Developers", 27 | "License :: OSI Approved", 28 | "Programming Language :: C", 29 | "Programming Language :: Python", 30 | "Topic :: Software Development", 31 | "Topic :: Scientific/Engineering", 32 | "Operating System :: Microsoft :: Windows", 33 | "Operating System :: POSIX", 34 | "Operating System :: Unix", 35 | 
"Operating System :: MacOS", 36 | "Programming Language :: Python :: 3.9", 37 | "Programming Language :: Python :: 3.10", 38 | "Programming Language :: Python :: 3.11", 39 | "Programming Language :: Python :: 3.12", 40 | ], 41 | "keywords": "dimension reduction t-sne manifold", 42 | "url": "http://github.com/lmcinnes/umap", 43 | "maintainer": "Leland McInnes", 44 | "maintainer_email": "leland.mcinnes@gmail.com", 45 | "license": "BSD", 46 | "packages": ["umap"], 47 | "install_requires": [ 48 | "numpy >= 1.23", 49 | "scipy >= 1.3.1", 50 | "scikit-learn >= 1.6", 51 | "numba >= 0.51.2", 52 | "pynndescent >= 0.5", 53 | "tqdm", 54 | ], 55 | "extras_require": { 56 | "plot": [ 57 | "pandas", 58 | "matplotlib", 59 | "datashader", 60 | "bokeh", 61 | "holoviews", 62 | "colorcet", 63 | "seaborn", 64 | "scikit-image", 65 | "dask", 66 | ], 67 | "parametric_umap": ["tensorflow >= 2.1"], 68 | "tbb": ["tbb >= 2019.0"], 69 | }, 70 | "ext_modules": [], 71 | "cmdclass": {}, 72 | "test_suite": "pytest", 73 | "tests_require": ["pytest"], 74 | "data_files": (), 75 | "zip_safe": False, 76 | } 77 | 78 | setup(**configuration) 79 | -------------------------------------------------------------------------------- /umap/__init__.py: -------------------------------------------------------------------------------- 1 | from warnings import warn, catch_warnings, simplefilter 2 | from .umap_ import UMAP 3 | 4 | try: 5 | with catch_warnings(): 6 | simplefilter("ignore") 7 | from .parametric_umap import ParametricUMAP, load_ParametricUMAP 8 | except ImportError: 9 | warn( 10 | "Tensorflow not installed; ParametricUMAP will be unavailable", 11 | category=ImportWarning, 12 | ) 13 | 14 | # Add a dummy class to raise an error 15 | class ParametricUMAP(object): 16 | def __init__(self, **kwds): 17 | warn( 18 | """The umap.parametric_umap package requires Tensorflow > 2.0 to be installed. 
19 | You can install Tensorflow at https://www.tensorflow.org/install 20 | 21 | or you can install the CPU version of Tensorflow using 22 | 23 | pip install umap-learn[parametric_umap] 24 | 25 | """ 26 | ) 27 | raise ImportError( 28 | "umap.parametric_umap requires Tensorflow >= 2.0" 29 | ) from None 30 | 31 | 32 | from .aligned_umap import AlignedUMAP 33 | 34 | # Workaround: https://github.com/numba/numba/issues/3341 35 | import numba 36 | 37 | from importlib.metadata import version, PackageNotFoundError 38 | 39 | try: 40 | __version__ = version("umap-learn") 41 | except PackageNotFoundError: 42 | __version__ = "0.5-dev" 43 | -------------------------------------------------------------------------------- /umap/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Suite for UMAP to ensure things are working as expected. 3 | 4 | The test suite comprises multiple testing modules, 5 | including multiple test cases related to a specific 6 | set of UMAP features under test. 7 | 8 | Backend 9 | ------- 10 | pytest is the reference backend for testing environment and execution, 11 | also integrating with pre-existent nose-based tests 12 | 13 | Shared Testing code 14 | ------------------- 15 | Whenever needed, each module includes a set of 16 | _utility_ functions that specify shared (and repeated) 17 | testing operations. 18 | 19 | Fixtures 20 | -------- 21 | All data dependency has been implemented 22 | as test fixtures (preferred to shared global variables). 23 | All the fixtures shared by multiple test cases 24 | are defined in the `conftest.py` module. 25 | 26 | Fixtures allow the execution of each test module in isolation, as well 27 | as within the whole test suite. 
28 | 29 | Modules in Tests (to keep up to date) 30 | ------------------------------------- 31 | - conftest: pytrest fixtures 32 | - test_plot: basic tests for umap.plot 33 | - test_umap_df_validation_params: 34 | Tests on parameters validation for DataFrameUMAP 35 | - test_umap_metrics: 36 | Tests for UMAP metrics - spatial, binary, and sparse 37 | - test_umap_nn: 38 | Tests for NearestNeighbours 39 | - test_umap_on_iris: 40 | Tests for UMAP on Iris Dataset 41 | - test_umap_ops: 42 | Tests for general UMAP ops (e.g. clusterability, transform stability) 43 | - test_umap_repeated_data: 44 | UMAP tests on repeated data (sparse|dense; spatial|binary) 45 | - test_umap_trustworthiness: 46 | Tests on UMAP Trustworthiness 47 | - test_umap_validation_params: 48 | Tests for fit parameters validation 49 | 50 | """ 51 | -------------------------------------------------------------------------------- /umap/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # =========================== 2 | # Testing (session) Fixture 3 | # ========================== 4 | 5 | import pytest 6 | import numpy as np 7 | from scipy import sparse 8 | from sklearn.datasets import load_iris 9 | from umap import UMAP, AlignedUMAP 10 | 11 | # Globals, used for all the tests 12 | SEED = 189212 # 0b101110001100011100 13 | np.random.seed(SEED) 14 | 15 | 16 | # Spatial and Binary Data 17 | # ----------------------- 18 | @pytest.fixture(scope="session") 19 | def spatial_data(): 20 | # - Spatial Data 21 | spatial_data = np.random.randn(10, 20) 22 | # Add some all zero data for corner case test 23 | return np.vstack([spatial_data, np.zeros((2, 20))]) 24 | 25 | 26 | @pytest.fixture(scope="session") 27 | def binary_data(): 28 | binary_data = np.random.choice(a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66]) 29 | # Add some all zero data for corner case test 30 | binary_data = np.vstack([binary_data, np.zeros((2, 20), dtype="bool")]) 31 | return binary_data 32 | 
33 | 34 | # Sparse Spatial and Binary Data 35 | # ------------------------------ 36 | @pytest.fixture(scope="session") 37 | def sparse_spatial_data(spatial_data, binary_data): 38 | return sparse.csr_matrix(spatial_data * binary_data) 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def sparse_binary_data(binary_data): 43 | return sparse.csr_matrix(binary_data) 44 | 45 | 46 | # Nearest Neighbour Data 47 | # ----------------------- 48 | @pytest.fixture(scope="session") 49 | def nn_data(): 50 | nn_data = np.random.uniform(0, 1, size=(1000, 5)) 51 | nn_data = np.vstack( 52 | [nn_data, np.zeros((2, 5))] 53 | ) # Add some all zero data for corner case test 54 | return nn_data 55 | 56 | 57 | @pytest.fixture(scope="session") 58 | def binary_nn_data(): 59 | binary_nn_data = np.random.choice( 60 | a=[False, True], size=(1000, 5), p=[0.66, 1 - 0.66] 61 | ) 62 | binary_nn_data = np.vstack( 63 | [binary_nn_data, np.zeros((2, 5), dtype="bool")] 64 | ) # Add some all zero data for corner case test 65 | return binary_nn_data 66 | 67 | 68 | @pytest.fixture(scope="session") 69 | def sparse_nn_data(): 70 | return sparse.random(1000, 50, density=0.5, format="csr") 71 | 72 | 73 | # Data With Repetitions 74 | # --------------------- 75 | 76 | 77 | @pytest.fixture(scope="session") 78 | def repetition_dense(): 79 | # Dense data for testing small n 80 | return np.array( 81 | [ 82 | [5, 6, 7, 8], 83 | [5, 6, 7, 8], 84 | [5, 6, 7, 8], 85 | [5, 6, 7, 8], 86 | [5, 6, 7, 8], 87 | [5, 6, 7, 8], 88 | [1, 1, 1, 1], 89 | [1, 2, 3, 4], 90 | [1, 1, 2, 1], 91 | ] 92 | ) 93 | 94 | 95 | @pytest.fixture(scope="session") 96 | def spatial_repeats(spatial_data): 97 | # spatial data repeats 98 | spatial_repeats = np.vstack( 99 | [np.repeat(spatial_data[0:2], [2, 0], axis=0), spatial_data, np.zeros((2, 20))] 100 | ) 101 | # Add some all zero data for corner case test. 
Make the first three rows identical 102 | # binary Data Repeat 103 | return spatial_repeats 104 | 105 | 106 | @pytest.fixture(scope="session") 107 | def binary_repeats(binary_data): 108 | binary_repeats = np.vstack( 109 | [ 110 | np.repeat(binary_data[0:2], [2, 0], axis=0), 111 | binary_data, 112 | np.zeros((2, 20), dtype="bool"), 113 | ] 114 | ) 115 | # Add some all zero data for corner case test. Make the first three rows identical 116 | return binary_repeats 117 | 118 | 119 | @pytest.fixture(scope="session") 120 | def sparse_spatial_data_repeats(spatial_repeats, binary_repeats): 121 | return sparse.csr_matrix(spatial_repeats * binary_repeats) 122 | 123 | 124 | @pytest.fixture(scope="session") 125 | def sparse_binary_data_repeats(binary_repeats): 126 | return sparse.csr_matrix(binary_repeats) 127 | 128 | 129 | @pytest.fixture(scope="session") 130 | def sparse_test_data(nn_data, binary_nn_data): 131 | return sparse.csr_matrix(nn_data * binary_nn_data) 132 | 133 | 134 | @pytest.fixture(scope="session") 135 | def iris(): 136 | return load_iris() 137 | 138 | 139 | @pytest.fixture(scope="session") 140 | def iris_selection(): 141 | return np.random.choice([True, False], 150, replace=True, p=[0.75, 0.25]) 142 | 143 | 144 | @pytest.fixture(scope="session") 145 | def aligned_iris(iris): 146 | slices = [iris.data[i : i + 50] for i in range(0, 125, 25)] 147 | target = [iris.target[i : i + 50] for i in range(0, 125, 25)] 148 | return slices, target 149 | 150 | 151 | @pytest.fixture(scope="session") 152 | def aligned_iris_relations(): 153 | return [{a: a + 25 for a in range(25)} for i in range(4)] 154 | 155 | 156 | @pytest.fixture(scope="session") 157 | def iris_model(iris): 158 | return UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(iris.data) 159 | 160 | 161 | @pytest.fixture(scope="session") 162 | def iris_model_large(iris): 163 | return UMAP( 164 | n_neighbors=10, 165 | min_dist=0.01, 166 | random_state=42, 167 | force_approximation_algorithm=True, 168 | 
).fit(iris.data) 169 | 170 | 171 | @pytest.fixture(scope="session") 172 | def iris_subset_model(iris, iris_selection): 173 | return UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit( 174 | iris.data[iris_selection] 175 | ) 176 | 177 | 178 | @pytest.fixture(scope="session") 179 | def iris_subset_model_large(iris, iris_selection): 180 | return UMAP( 181 | n_neighbors=10, 182 | min_dist=0.01, 183 | random_state=42, 184 | force_approximation_algorithm=True, 185 | ).fit(iris.data[iris_selection]) 186 | 187 | 188 | @pytest.fixture(scope="session") 189 | def supervised_iris_model(iris): 190 | return UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200, random_state=42).fit( 191 | iris.data, iris.target 192 | ) 193 | 194 | 195 | @pytest.fixture(scope="session") 196 | def aligned_iris_model(aligned_iris, aligned_iris_relations): 197 | data, target = aligned_iris 198 | model = AlignedUMAP() 199 | model.fit(data, relations=aligned_iris_relations) 200 | return model 201 | 202 | 203 | # UMAP Distance Metrics 204 | # --------------------- 205 | @pytest.fixture(scope="session") 206 | def spatial_distances(): 207 | return ( 208 | "euclidean", 209 | "manhattan", 210 | "chebyshev", 211 | "minkowski", 212 | "hamming", 213 | "canberra", 214 | "braycurtis", 215 | "cosine", 216 | "correlation", 217 | ) 218 | 219 | 220 | @pytest.fixture(scope="session") 221 | def binary_distances(): 222 | return ( 223 | "jaccard", 224 | "matching", 225 | "dice", 226 | "kulsinski", 227 | "rogerstanimoto", 228 | "russellrao", 229 | "sokalmichener", 230 | "sokalsneath", 231 | "yule", 232 | ) 233 | -------------------------------------------------------------------------------- /umap/tests/digits_embedding_42.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/umap/15e55bb6a1ca23b8d6040d9d6184a7ae98325ace/umap/tests/digits_embedding_42.npy -------------------------------------------------------------------------------- 
/umap/tests/test_aligned_umap.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from umap import AlignedUMAP 3 | from sklearn.metrics import pairwise_distances 4 | from sklearn.cluster import KMeans 5 | import numpy as np 6 | from sklearn.metrics import adjusted_rand_score 7 | 8 | # =============================== 9 | # Test AlignedUMAP on sliced iris 10 | # =============================== 11 | 12 | 13 | def nn_accuracy(true_nn, embd_nn): 14 | num_correct = 0.0 15 | for i in range(true_nn.shape[0]): 16 | num_correct += np.sum(np.isin(true_nn[i], embd_nn[i])) 17 | return num_correct / true_nn.size 18 | 19 | 20 | def test_neighbor_local_neighbor_accuracy(aligned_iris, aligned_iris_model): 21 | data, target = aligned_iris 22 | for i, slice in enumerate(data): 23 | data_dmat = pairwise_distances(slice) 24 | true_nn = np.argsort(data_dmat, axis=1)[:, :10] 25 | embd_dmat = pairwise_distances(aligned_iris_model.embeddings_[i]) 26 | embd_nn = np.argsort(embd_dmat, axis=1)[:, :10] 27 | assert nn_accuracy(true_nn, embd_nn) >= 0.65 28 | 29 | 30 | def test_local_clustering(aligned_iris, aligned_iris_model): 31 | data, target = aligned_iris 32 | 33 | embd = aligned_iris_model.embeddings_[1] 34 | clusters = KMeans(n_clusters=2).fit_predict(embd) 35 | ari = adjusted_rand_score(target[1], clusters) 36 | assert ari >= 0.75 37 | 38 | embd = aligned_iris_model.embeddings_[3] 39 | clusters = KMeans(n_clusters=2).fit_predict(embd) 40 | ari = adjusted_rand_score(target[3], clusters) 41 | assert ari >= 0.40 42 | 43 | 44 | def test_aligned_update(aligned_iris, aligned_iris_relations): 45 | data, target = aligned_iris 46 | small_aligned_model = AlignedUMAP() 47 | small_aligned_model.fit(data[:3], relations=aligned_iris_relations[:2]) 48 | small_aligned_model.update(data[3], relations=aligned_iris_relations[2]) 49 | for i, slice in enumerate(data[:4]): 50 | data_dmat = pairwise_distances(slice) 51 | true_nn = np.argsort(data_dmat, 
axis=1)[:, :10] 52 | embd_dmat = pairwise_distances(small_aligned_model.embeddings_[i]) 53 | embd_nn = np.argsort(embd_dmat, axis=1)[:, :10] 54 | assert nn_accuracy(true_nn, embd_nn) >= 0.45 55 | 56 | 57 | def test_aligned_update_params(aligned_iris, aligned_iris_relations): 58 | data, target = aligned_iris 59 | n_neighbors = [15, 15, 15, 15, 15] 60 | small_aligned_model = AlignedUMAP(n_neighbors=n_neighbors[:3]) 61 | small_aligned_model.fit(data[:3], relations=aligned_iris_relations[:2]) 62 | small_aligned_model.update( 63 | data[3], relations=aligned_iris_relations[2], n_neighbors=n_neighbors[3] 64 | ) 65 | for i, slice in enumerate(data[:4]): 66 | data_dmat = pairwise_distances(slice) 67 | true_nn = np.argsort(data_dmat, axis=1)[:, :10] 68 | embd_dmat = pairwise_distances(small_aligned_model.embeddings_[i]) 69 | embd_nn = np.argsort(embd_dmat, axis=1)[:, :10] 70 | assert nn_accuracy(true_nn, embd_nn) >= 0.45 71 | 72 | 73 | @pytest.mark.skip(reason="Temporarily disable") 74 | def test_aligned_update_array_error(aligned_iris, aligned_iris_relations): 75 | data, target = aligned_iris 76 | n_neighbors = [15, 15, 15, 15, 15] 77 | small_aligned_model = AlignedUMAP(n_neighbors=n_neighbors[:3]) 78 | small_aligned_model.fit(data[:3], relations=aligned_iris_relations[:2]) 79 | 80 | with pytest.raises(ValueError): 81 | small_aligned_model.update( 82 | data[3:], relations=aligned_iris_relations[2:], n_neighbors=n_neighbors[3:] 83 | ) 84 | -------------------------------------------------------------------------------- /umap/tests/test_composite_models.py: -------------------------------------------------------------------------------- 1 | from umap import UMAP 2 | import pytest 3 | 4 | try: 5 | # works for sklearn>=0.22 6 | from sklearn.manifold import trustworthiness 7 | except ImportError: 8 | # this is to comply with requirements (scikit-learn>=0.20) 9 | # More recent versions of sklearn have exposed trustworthiness 10 | # in top level module API 11 | # see: 
https://github.com/scikit-learn/scikit-learn/pull/15337 12 | from sklearn.manifold.t_sne import trustworthiness 13 | 14 | 15 | def test_composite_trustworthiness(nn_data, iris_model): 16 | data = nn_data[:50] 17 | model1 = UMAP(n_neighbors=10, min_dist=0.01, random_state=42, n_epochs=50).fit(data) 18 | model2 = UMAP( 19 | n_neighbors=30, 20 | min_dist=0.01, 21 | random_state=42, 22 | n_epochs=50, 23 | init=model1.embedding_, 24 | ).fit(data) 25 | model3 = model1 * model2 26 | trust = trustworthiness(data, model3.embedding_, n_neighbors=10) 27 | assert ( 28 | trust >= 0.80 29 | ), "Insufficiently trustworthy embedding for" "nn dataset: {}".format(trust) 30 | model4 = model1 + model2 31 | trust = trustworthiness(data, model4.embedding_, n_neighbors=10) 32 | assert ( 33 | trust >= 0.80 34 | ), "Insufficiently trustworthy embedding for" "nn dataset: {}".format(trust) 35 | 36 | with pytest.raises(ValueError): 37 | _ = model1 + iris_model 38 | 39 | with pytest.raises(ValueError): 40 | _ = model1 * iris_model 41 | 42 | with pytest.raises(ValueError): 43 | _ = model1 - iris_model 44 | 45 | 46 | @pytest.mark.skip(reason="Marked as Skipped test") 47 | def test_composite_trustworthiness_random_init(nn_data): # pragma: no cover 48 | data = nn_data[:50] 49 | model1 = UMAP( 50 | n_neighbors=10, 51 | min_dist=0.01, 52 | random_state=42, 53 | n_epochs=50, 54 | init="random", 55 | ).fit(data) 56 | model2 = UMAP( 57 | n_neighbors=30, 58 | min_dist=0.01, 59 | random_state=42, 60 | n_epochs=50, 61 | init="random", 62 | ).fit(data) 63 | model3 = model1 * model2 64 | trust = trustworthiness(data, model3.embedding_, n_neighbors=10) 65 | assert ( 66 | trust >= 0.82 67 | ), "Insufficiently trustworthy embedding for" "nn dataset: {}".format(trust) 68 | model4 = model1 + model2 69 | trust = trustworthiness(data, model4.embedding_, n_neighbors=10) 70 | assert ( 71 | trust >= 0.82 72 | ), "Insufficiently trustworthy embedding for" "nn dataset: {}".format(trust) 73 | 74 | 75 | def 
test_composite_trustworthiness_on_iris(iris): 76 | iris_model1 = UMAP( 77 | n_neighbors=10, 78 | min_dist=0.01, 79 | random_state=42, 80 | n_epochs=100, 81 | ).fit(iris.data[:, :2]) 82 | iris_model2 = UMAP( 83 | n_neighbors=10, 84 | min_dist=0.01, 85 | random_state=42, 86 | n_epochs=100, 87 | ).fit(iris.data[:, 2:]) 88 | embedding = (iris_model1 + iris_model2).embedding_ 89 | trust = trustworthiness(iris.data, embedding, n_neighbors=10) 90 | assert ( 91 | trust >= 0.82 92 | ), "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust) 93 | embedding = (iris_model1 * iris_model2).embedding_ 94 | trust = trustworthiness(iris.data, embedding, n_neighbors=10) 95 | assert ( 96 | trust >= 0.82 97 | ), "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust) 98 | 99 | 100 | def test_contrastive_trustworthiness_on_iris(iris): 101 | iris_model1 = UMAP( 102 | n_neighbors=10, 103 | min_dist=0.01, 104 | random_state=42, 105 | n_epochs=100, 106 | ).fit(iris.data[:, :2]) 107 | iris_model2 = UMAP( 108 | n_neighbors=10, 109 | min_dist=0.01, 110 | random_state=42, 111 | n_epochs=100, 112 | ).fit(iris.data[:, 2:]) 113 | embedding = (iris_model1 - iris_model2).embedding_ 114 | trust = trustworthiness(iris.data, embedding, n_neighbors=10) 115 | assert ( 116 | trust >= 0.75 117 | ), "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust) 118 | -------------------------------------------------------------------------------- /umap/tests/test_data_input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest as pytest 3 | from numba import njit 4 | from umap import UMAP 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def all_finite_data(): 9 | return np.arange(100.0).reshape(25, 4) 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def inverse_data(): 14 | return np.arange(50).reshape(25, 2) 15 | 16 | 17 | @njit 18 | def nan_dist(a: np.ndarray, b: np.ndarray): 
19 | a[0] = np.nan 20 | a[1] = np.inf 21 | return 0, a 22 | 23 | 24 | def test_check_input_data(all_finite_data, inverse_data): 25 | """ 26 | Data input to UMAP gets checked for liability. 27 | This tests checks the if data input is dismissed/accepted 28 | according to the "ensure_all_finite" keyword as used by 29 | sklearn. 30 | 31 | Parameters 32 | ---------- 33 | all_finite_data 34 | inverse_data 35 | ------- 36 | 37 | """ 38 | inf_data = all_finite_data.copy() 39 | inf_data[0] = np.inf 40 | nan_data = all_finite_data.copy() 41 | nan_data[0] = np.nan 42 | inf_nan_data = all_finite_data.copy() 43 | inf_nan_data[0] = np.nan 44 | inf_nan_data[1] = np.inf 45 | 46 | # wrapper to call each data handling function of UMAP in a convenient way 47 | def call_umap_functions(data, ensure_all_finite): 48 | u = UMAP(metric=nan_dist) 49 | if ensure_all_finite is None: 50 | u.fit_transform(data) 51 | u.fit(data) 52 | u.transform(data) 53 | u.update(data) 54 | u.inverse_transform(inverse_data) 55 | else: 56 | u.fit_transform(data, ensure_all_finite=ensure_all_finite) 57 | u.fit(data, ensure_all_finite=ensure_all_finite) 58 | u.transform(data, ensure_all_finite=ensure_all_finite) 59 | u.update(data, ensure_all_finite=ensure_all_finite) 60 | u.inverse_transform(inverse_data) 61 | 62 | # Check whether correct data input is accepted 63 | call_umap_functions(all_finite_data, None) 64 | call_umap_functions(all_finite_data, True) 65 | 66 | call_umap_functions(nan_data, "allow-nan") 67 | call_umap_functions(all_finite_data, "allow-nan") 68 | 69 | call_umap_functions(inf_data, False) 70 | call_umap_functions(inf_nan_data, False) 71 | call_umap_functions(nan_data, False) 72 | call_umap_functions(all_finite_data, False) 73 | 74 | # Check whether illegal data raises a ValueError 75 | with pytest.raises(ValueError): 76 | call_umap_functions(nan_data, None) 77 | call_umap_functions(inf_data, None) 78 | call_umap_functions(inf_nan_data, None) 79 | 80 | call_umap_functions(nan_data, True) 81 | 
call_umap_functions(inf_data, True) 82 | call_umap_functions(inf_nan_data, True) 83 | 84 | call_umap_functions(inf_data, "allow-nan") 85 | call_umap_functions(inf_nan_data, "allow-nan") 86 | -------------------------------------------------------------------------------- /umap/tests/test_densmap.py: -------------------------------------------------------------------------------- 1 | from umap import UMAP 2 | import pytest 3 | 4 | try: 5 | # works for sklearn>=0.22 6 | from sklearn.manifold import trustworthiness 7 | except ImportError: 8 | # this is to comply with requirements (scikit-learn>=0.20) 9 | # More recent versions of sklearn have exposed trustworthiness 10 | # in top level module API 11 | # see: https://github.com/scikit-learn/scikit-learn/pull/15337 12 | from sklearn.manifold.t_sne import trustworthiness 13 | 14 | 15 | def test_densmap_trustworthiness(nn_data): 16 | data = nn_data[:50] 17 | embedding, rad_h, rad_l = UMAP( 18 | n_neighbors=10, 19 | min_dist=0.01, 20 | random_state=42, 21 | n_epochs=100, 22 | densmap=True, 23 | output_dens=True, 24 | ).fit_transform(data) 25 | trust = trustworthiness(data, embedding, n_neighbors=10) 26 | assert ( 27 | trust >= 0.72 28 | ), "Insufficiently trustworthy embedding for" "nn dataset: {}".format(trust) 29 | 30 | 31 | @pytest.mark.skip() 32 | def test_densmap_trustworthiness_random_init(nn_data): # pragma: no cover 33 | data = nn_data[:50] 34 | embedding = UMAP( 35 | n_neighbors=10, 36 | min_dist=0.01, 37 | random_state=42, 38 | init="random", 39 | densmap=True, 40 | ).fit_transform(data) 41 | trust = trustworthiness(data, embedding, n_neighbors=10) 42 | assert ( 43 | trust >= 0.75 44 | ), "Insufficiently trustworthy embedding for" "nn dataset: {}".format(trust) 45 | 46 | 47 | def test_densmap_trustworthiness_on_iris(iris): 48 | densmap_iris_model = UMAP( 49 | n_neighbors=10, 50 | min_dist=0.01, 51 | random_state=42, 52 | densmap=True, 53 | verbose=True, 54 | ).fit(iris.data) 55 | embedding = 
densmap_iris_model.embedding_ 56 | trust = trustworthiness(iris.data, embedding, n_neighbors=10) 57 | assert ( 58 | trust >= 0.97 59 | ), "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust) 60 | 61 | with pytest.raises(NotImplementedError): 62 | densmap_iris_model.transform(iris.data[:10]) 63 | 64 | with pytest.raises(ValueError): 65 | densmap_iris_model.inverse_transform(embedding[:10]) 66 | 67 | 68 | def test_densmap_trustworthiness_on_iris_supervised(iris): 69 | densmap_iris_model = UMAP( 70 | n_neighbors=10, 71 | min_dist=0.01, 72 | random_state=42, 73 | densmap=True, 74 | verbose=True, 75 | ).fit(iris.data, y=iris.target) 76 | embedding = densmap_iris_model.embedding_ 77 | trust = trustworthiness(iris.data, embedding, n_neighbors=10) 78 | assert ( 79 | trust >= 0.97 80 | ), "Insufficiently trustworthy embedding for" "iris dataset: {}".format(trust) 81 | -------------------------------------------------------------------------------- /umap/tests/test_parametric_umap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tempfile 3 | import pytest 4 | from sklearn.datasets import make_moons 5 | from sklearn.model_selection import train_test_split 6 | from numpy.testing import assert_array_almost_equal 7 | import platform 8 | 9 | try: 10 | import tensorflow as tf 11 | 12 | IMPORT_TF = True 13 | except ImportError: 14 | IMPORT_TF = False 15 | else: 16 | from umap.parametric_umap import ParametricUMAP, load_ParametricUMAP 17 | 18 | tf_only = pytest.mark.skipif(not IMPORT_TF, reason="TensorFlow >= 2.0 is not installed") 19 | not_windows = pytest.mark.skipif( 20 | platform.system() == "Windows", reason="Windows file access issues" 21 | ) 22 | 23 | 24 | @pytest.fixture(scope="session") 25 | def moon_dataset(): 26 | X, _ = make_moons(200) 27 | return X 28 | 29 | 30 | @tf_only 31 | def test_create_model(moon_dataset): 32 | """test a simple parametric UMAP network""" 33 | embedder = 
ParametricUMAP() 34 | embedding = embedder.fit_transform(moon_dataset) 35 | # completes successfully 36 | assert embedding is not None 37 | assert embedding.shape == (moon_dataset.shape[0], 2) 38 | 39 | 40 | @tf_only 41 | def test_global_loss(moon_dataset): 42 | """test a simple parametric UMAP network""" 43 | embedder = ParametricUMAP(global_correlation_loss_weight=1.0) 44 | embedding = embedder.fit_transform(moon_dataset) 45 | # completes successfully 46 | assert embedding is not None 47 | assert embedding.shape == (moon_dataset.shape[0], 2) 48 | 49 | 50 | @tf_only 51 | def test_inverse_transform(moon_dataset): 52 | """tests inverse_transform""" 53 | 54 | def norm(x): 55 | return (x - np.min(x)) / (np.max(x) - np.min(x)) 56 | 57 | X = norm(moon_dataset) 58 | embedder = ParametricUMAP(parametric_reconstruction=True) 59 | Z = embedder.fit_transform(X) 60 | X_r = embedder.inverse_transform(Z) 61 | # completes successfully 62 | assert X_r is not None 63 | assert X_r.shape == X.shape 64 | 65 | 66 | @tf_only 67 | def test_custom_encoder_decoder(moon_dataset): 68 | """test using a custom encoder / decoder""" 69 | dims = (2,) 70 | n_components = 2 71 | encoder = tf.keras.Sequential( 72 | [ 73 | tf.keras.layers.Input(shape=dims), 74 | tf.keras.layers.Flatten(), 75 | tf.keras.layers.Dense(units=100, activation="relu"), 76 | tf.keras.layers.Dense(units=100, activation="relu"), 77 | tf.keras.layers.Dense(units=100, activation="relu"), 78 | tf.keras.layers.Dense(units=n_components, name="z"), 79 | ] 80 | ) 81 | 82 | decoder = tf.keras.Sequential( 83 | [ 84 | tf.keras.layers.Input(shape=(n_components,)), 85 | tf.keras.layers.Dense(units=100, activation="relu"), 86 | tf.keras.layers.Dense(units=100, activation="relu"), 87 | tf.keras.layers.Dense(units=100, activation="relu"), 88 | tf.keras.layers.Dense( 89 | units=np.prod(dims), name="recon", activation=None 90 | ), 91 | tf.keras.layers.Reshape(dims), 92 | ] 93 | ) 94 | 95 | embedder = ParametricUMAP( 96 | encoder=encoder, 97 | 
@tf_only
def test_validation(moon_dataset):
    """Fitting with a held-out reconstruction-validation set must succeed."""
    X_train, X_valid = train_test_split(moon_dataset, train_size=0.5)
    embedder = ParametricUMAP(
        parametric_reconstruction=True, reconstruction_validation=X_valid, verbose=True
    )
    embedding = embedder.fit_transform(X_train)
    # completes successfully
    assert embedding is not None
    assert embedding.shape == (X_train.shape[0], 2)


@not_windows
@tf_only
def test_save_load(moon_dataset):
    """Saving then loading a fitted model must reproduce its transform."""
    import shutil

    embedder = ParametricUMAP()
    embedding = embedder.fit_transform(moon_dataset)
    # completes successfully
    assert embedding is not None
    assert embedding.shape == (moon_dataset.shape[0], 2)

    # Portable tempfile. mkdtemp does not auto-delete, so clean up in a
    # finally block to avoid accumulating orphaned model directories on disk.
    model_path = tempfile.mkdtemp(suffix="_umap_model")
    try:
        embedder.save(model_path)
        loaded_model = load_ParametricUMAP(model_path)
        assert loaded_model is not None

        loaded_embedding = loaded_model.transform(moon_dataset)
        assert_array_almost_equal(
            embedding,
            loaded_embedding,
            decimal=5,
            err_msg="Loaded model transform fails to match original embedding",
        )
    finally:
        shutil.rmtree(model_path, ignore_errors=True)
@pytest.fixture(scope="session")
def mapper(iris):
    """Fit a single shared UMAP model on the iris data for all plot tests."""
    return umap.UMAP(n_epochs=100).fit(iris.data)


# These tests requires revision: Refactoring is
# needed as there is no assertion nor
# property verification.
@plot_only
def test_plot_runs_at_all(mapper, iris, iris_selection):
    """Smoke-test: every plotting entry point runs without raising."""
    from umap import plot as umap_plot

    first_column = iris.data[:, 0]

    umap_plot.points(mapper)
    umap_plot.points(mapper, labels=iris.target)
    umap_plot.points(mapper, values=first_column)
    umap_plot.points(mapper, labels=iris.target, subset_points=iris_selection)
    umap_plot.points(mapper, values=first_column, subset_points=iris_selection)
    umap_plot.points(mapper, theme="fire")
    umap_plot.diagnostic(mapper, diagnostic_type="all")
    umap_plot.diagnostic(mapper, diagnostic_type="neighborhood")
    umap_plot.connectivity(mapper)
    umap_plot.connectivity(mapper, theme="fire")
    umap_plot.connectivity(mapper, edge_bundling="hammer")
    umap_plot.interactive(mapper)
    umap_plot.interactive(mapper, labels=iris.target)
    umap_plot.interactive(mapper, values=first_column)
    umap_plot.interactive(mapper, labels=iris.target, subset_points=iris_selection)
    umap_plot.interactive(mapper, values=first_column, subset_points=iris_selection)
    umap_plot.interactive(mapper, theme="fire")
    umap_plot._datashade_points(mapper.embedding_)
    umap_plot._datashade_points(mapper.embedding_, labels=iris.target)
    umap_plot._datashade_points(mapper.embedding_, values=first_column)
scipy_full_version = tuple(
    int(part)
    for part in re.findall(r"[0-9]+\.[0-9]+\.?[0-9]*", scipy_full_version_)[0].split(
        "."
    )
)


@pytest.mark.skipif(
    scipy_full_version < (1, 10) or scipy_full_version >= (1, 15),
    reason="SciPy installing with Python 3.7 does not converge under same circumstances",
)
def test_tsw_spectral_init(iris):
    """The tsvd-warmed spectral init should closely match the standard one."""
    seed = 42
    rng = np.random.default_rng(seed=seed)
    # Build an arbitrary dense symmetric affinity matrix; it must be of
    # sufficient size or lobpcg will refuse to work on it.
    n = 20
    graph = rng.standard_normal(n * n).reshape((n, n)) ** 2
    graph = graph.T * graph

    spec = spectral_layout(None, graph, 2, random_state=seed**2)
    tsw_spec = tswspectral_layout(None, graph, 2, random_state=seed**2, tol=1e-8)

    # Mean squared per-point distance between the two embeddings.
    rmsd = np.mean(np.sum((spec - tsw_spec) ** 2, axis=1))
    assert (
        rmsd < 1e-6
    ), "tsvd-warmed spectral init insufficiently close to standard spectral init"


@pytest.mark.skipif(
    scipy_full_version < (1, 10),
    reason="SciPy installing with Py 3.7 does not warn reliably on convergence failure",
)
def test_ensure_fallback_to_random_on_spectral_failure():
    """When the eigensolver fails, a warning signals the random fallback."""
    dim = 1000
    k = 10
    assert k >= 10
    assert dim // 10 > k
    y = np.eye(dim, k=1)
    u = np.random.random((dim, dim // 10))
    graph = y + y.T + u @ u.T
    with pytest.warns(UserWarning, match="Spectral initialisation failed!"):
        tswspectral_layout(u, graph, k, random_state=42, maxiter=2, method="lobpcg")
def test_get_feature_names_out_default():
    """With no input_features, names default to 'umap<i>' per component."""
    X, _ = make_classification(n_samples=30, n_features=10, random_state=42)
    model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        n_components=3,
    ).fit(X)
    default_result = model.get_feature_names_out()
    np.testing.assert_array_equal(default_result, ["umap0", "umap1", "umap2"])


def test_get_feature_names_out_multicomponent():
    """The number of output names must track n_components."""
    X, _ = make_classification(n_samples=30, n_features=10, random_state=42)
    model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        n_components=9,
    ).fit(X)
    names = model.get_feature_names_out()
    assert len(names) == 9
    np.testing.assert_array_equal(names, [f"umap{i}" for i in range(9)])


def test_get_feature_names_out_featureunion():
    """Inside a FeatureUnion, names are prefixed with the transformer name."""
    X, _ = make_classification(n_samples=30, n_features=10, random_state=42)
    pipeline = Pipeline(
        [
            (
                "umap_pipeline",
                FeatureUnion(
                    [
                        ("umap1", UMAP(n_components=2)),
                        ("umap2", UMAP(n_components=3)),
                    ]
                ),
            )
        ]
    )
    pipeline.fit(X)
    feature_names = pipeline.get_feature_names_out()
    expected_feature_names = np.array(
        [
            "umap1__umap0",
            "umap1__umap1",
            "umap2__umap0",
            "umap2__umap1",
            "umap2__umap2",
        ]
    )
    np.testing.assert_array_equal(feature_names, expected_feature_names)
# ===================================================
# Nearest Neighbour Test cases
# ===================================================


# nearest_neighbours metric parameter validation
# -----------------------------------------------
def test_nn_bad_metric(nn_data):
    """A metric that is neither a known name nor callable raises ValueError."""
    with pytest.raises(ValueError):
        nearest_neighbors(nn_data, 10, 42, {}, False, np.random)


def test_nn_bad_metric_sparse_data(sparse_nn_data):
    """A metric with no sparse implementation raises ValueError on sparse input."""
    with pytest.raises(ValueError):
        nearest_neighbors(
            sparse_nn_data,
            10,
            "seuclidean",
            {},
            False,
            np.random,
        )


# -------------------------------------------------
# Utility functions for Nearest Neighbour
# -------------------------------------------------


def knn(indices, nn_data):  # pragma: no cover
    """Return the fraction of true 10-NN (per KDTree) recovered in ``indices``."""
    tree = KDTree(nn_data)
    true_indices = tree.query(nn_data, 10, return_distance=False)
    num_correct = 0.0
    for i in range(nn_data.shape[0]):
        num_correct += np.sum(np.isin(true_indices[i], indices[i]))
    return num_correct / (nn_data.shape[0] * 10)


def smooth_knn(nn_data, local_connectivity=1.0):
    """Compute per-point l1 norms of the smoothed knn membership weights."""
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "euclidean", {}, False, np.random
    )
    sigmas, rhos = smooth_knn_dist(
        knn_dists, 10.0, local_connectivity=local_connectivity
    )
    shifted_dists = knn_dists - rhos[:, np.newaxis]
    shifted_dists[shifted_dists < 0.0] = 0.0
    vals = np.exp(-(shifted_dists / sigmas[:, np.newaxis]))
    norms = np.sum(vals, axis=1)
    return norms


# NOTE: assertion messages below are kept consistent with the actual
# thresholds being checked; several previously claimed a different percentage.
@pytest.mark.skip()
def test_nn_descent_neighbor_accuracy(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "euclidean", {}, False, np.random
    )
    percent_correct = knn(knn_indices, nn_data)
    assert (
        percent_correct >= 0.85
    ), "NN-descent did not get 85% accuracy on nearest neighbors"


@pytest.mark.skip()
def test_nn_descent_neighbor_accuracy_low_memory(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "euclidean", {}, False, np.random, low_memory=True
    )
    percent_correct = knn(knn_indices, nn_data)
    assert (
        percent_correct >= 0.89
    ), "NN-descent did not get 89% accuracy on nearest neighbors"


@pytest.mark.skip()
def test_angular_nn_descent_neighbor_accuracy(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "cosine", {}, True, np.random
    )
    angular_data = normalize(nn_data, norm="l2")
    percent_correct = knn(knn_indices, angular_data)
    assert (
        percent_correct >= 0.85
    ), "NN-descent did not get 85% accuracy on nearest neighbors"


@pytest.mark.skip()
def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        sparse_nn_data, 20, "euclidean", {}, False, np.random
    )
    percent_correct = knn(knn_indices, sparse_nn_data.todense())
    assert (
        percent_correct >= 0.75
    ), "Sparse NN-descent did not get 75% accuracy on nearest neighbors"


@pytest.mark.skip()
def test_sparse_nn_descent_neighbor_accuracy_low_memory(
    sparse_nn_data,
):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        sparse_nn_data, 20, "euclidean", {}, False, np.random, low_memory=True
    )
    percent_correct = knn(knn_indices, sparse_nn_data.todense())
    assert (
        percent_correct >= 0.85
    ), "Sparse NN-descent did not get 85% accuracy on nearest neighbors"


@pytest.mark.skip()
def test_nn_descent_neighbor_accuracy_callable_metric(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, dist.euclidean, {}, False, np.random
    )

    percent_correct = knn(knn_indices, nn_data)
    assert (
        percent_correct >= 0.95
    ), "NN-descent did not get 95% accuracy on nearest neighbors with callable metric"


@pytest.mark.skip()
def test_sparse_angular_nn_descent_neighbor_accuracy(
    sparse_nn_data,
):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        sparse_nn_data, 20, "cosine", {}, True, np.random
    )
    angular_data = normalize(sparse_nn_data, norm="l2").toarray()
    percent_correct = knn(knn_indices, angular_data)
    assert (
        percent_correct >= 0.90
    ), "Sparse NN-descent did not get 90% accuracy on nearest neighbors"


def test_smooth_knn_dist_l1norms(nn_data):
    norms = smooth_knn(nn_data)
    assert_array_almost_equal(
        norms,
        1.0 + np.log2(10) * np.ones(norms.shape[0]),
        decimal=3,
        # BUG FIX: implicit concatenation previously produced "expectednorms".
        err_msg="Smooth knn-dists does not give expected norms",
    )


def test_smooth_knn_dist_l1norms_w_connectivity(nn_data):
    norms = smooth_knn(nn_data, local_connectivity=1.75)
    assert_array_almost_equal(
        norms,
        1.0 + np.log2(10) * np.ones(norms.shape[0]),
        decimal=3,
        err_msg="Smooth knn-dists does not give expected "
        "norms for local_connectivity=1.75",
    )
# ===================================================
# Spatial Data Test cases
# ===================================================
# Use force_approximation_algorithm in order to test
# the region of the code that is called for n>4096
# ---------------------------------------------------


def test_repeated_points_large_sparse_spatial(sparse_spatial_data_repeats):
    """Duplicate sparse spatial points must collapse to one embedding location."""
    fitted = UMAP(
        n_neighbors=3,
        unique=True,
        force_approximation_algorithm=True,
        n_epochs=20,
        verbose=True,
    ).fit(sparse_spatial_data_repeats)
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


def test_repeated_points_small_sparse_spatial(sparse_spatial_data_repeats):
    """Same check as above using the exact (small-n) code path."""
    fitted = UMAP(n_neighbors=3, unique=True, n_epochs=20).fit(
        sparse_spatial_data_repeats
    )
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


# Use force_approximation_algorithm in order to test the region
# of the code that is called for n>4096
def test_repeated_points_large_dense_spatial(spatial_repeats):
    """Duplicate dense spatial points collapse under the approximate path."""
    fitted = UMAP(
        n_neighbors=3, unique=True, force_approximation_algorithm=True, n_epochs=50
    ).fit(spatial_repeats)
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


def test_repeated_points_small_dense_spatial(spatial_repeats):
    """Duplicate dense spatial points collapse under the exact path."""
    fitted = UMAP(n_neighbors=3, unique=True, n_epochs=20).fit(spatial_repeats)
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


# ===================================================
# Binary Data Test cases
# ===================================================
# Use force_approximation_algorithm in order to test
# the region of the code that is called for n>4096
# ---------------------------------------------------


def test_repeated_points_large_sparse_binary(sparse_binary_data_repeats):
    """Duplicate sparse binary points collapse under the approximate path."""
    fitted = UMAP(
        n_neighbors=3, unique=True, force_approximation_algorithm=True, n_epochs=50
    ).fit(sparse_binary_data_repeats)
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


def test_repeated_points_small_sparse_binary(sparse_binary_data_repeats):
    """Duplicate sparse binary points collapse under the exact path."""
    fitted = UMAP(n_neighbors=3, unique=True, n_epochs=20).fit(
        sparse_binary_data_repeats
    )
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


# Use force_approximation_algorithm in order to test
# the region of the code that is called for n>4096
def test_repeated_points_large_dense_binary(binary_repeats):
    """Duplicate dense binary points collapse under the approximate path."""
    fitted = UMAP(
        n_neighbors=3, unique=True, force_approximation_algorithm=True, n_epochs=20
    ).fit(binary_repeats)
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


def test_repeated_points_small_dense_binary(binary_repeats):
    """Duplicate dense binary points collapse under the exact path."""
    fitted = UMAP(n_neighbors=3, unique=True, n_epochs=20).fit(binary_repeats)
    # Sanity-check the fixture itself contains a duplicated pair.
    assert np.unique(binary_repeats[0:2], axis=0).shape[0] == 1
    assert np.unique(fitted.embedding_[0:2], axis=0).shape[0] == 1


# ===================================================
# Repeated Data Test cases
# ===================================================


# ----------------------------------------------------
# This should test whether the n_neighbours are being
# reduced properly when your n_neighbours is larger
# than the unique data set size
# ----------------------------------------------------
def test_repeated_points_large_n(repetition_dense):
    """n_neighbors is clipped to the unique-point count of the data."""
    fitted = UMAP(n_neighbors=5, unique=True, n_epochs=20).fit(repetition_dense)
    assert fitted._n_neighbors == 3
try:
    # works for sklearn>=0.22
    from sklearn.manifold import trustworthiness
except ImportError:
    # this is to comply with requirements (scikit-learn>=0.20)
    # More recent versions of sklearn have exposed trustworthiness
    # in top level module API
    # see: https://github.com/scikit-learn/scikit-learn/pull/15337
    from sklearn.manifold.t_sne import trustworthiness

# ===================================================
# UMAP Trustworthiness Test cases
# ===================================================
# NOTE: failure messages below previously relied on implicit string
# concatenation with a missing space ("...embedding fornn dataset");
# they are written as single strings here.


def test_umap_sparse_trustworthiness(sparse_test_data):
    embedding = UMAP(n_neighbors=10, n_epochs=100).fit_transform(sparse_test_data[:100])
    trust = trustworthiness(sparse_test_data[:100].toarray(), embedding, n_neighbors=10)
    assert (
        trust >= 0.88
    ), "Insufficiently trustworthy embedding for sparse test dataset: {}".format(trust)


def test_umap_trustworthiness_fast_approx(nn_data):
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.8
    ), "Insufficiently trustworthy embedding for nn dataset: {}".format(trust)


def test_umap_trustworthiness_random_init(nn_data):
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42, n_epochs=100, init="random"
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.8
    ), "Insufficiently trustworthy embedding for nn dataset: {}".format(trust)


def test_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42, n_epochs=100
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.95
    ), "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust)


def test_semisupervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels[10:30] = -1
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42, n_epochs=100
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.95
    ), "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust)


def test_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="l1",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.95
    ), "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust)


def test_string_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.95
    ), "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust)


def test_discrete_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.95
    ), "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust)


def test_count_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels = (labels**2) + 2 * labels
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="count",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.95
    ), "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust)


def test_sparse_precomputed_metric_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    dmat = scipy.sparse.csr_matrix(pairwise_distances(data))
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        metric="precomputed",
    ).fit_transform(dmat)
    trust = trustworthiness(data, embedding, n_neighbors=10)
    assert (
        trust >= 0.75
    ), "Insufficiently trustworthy embedding for nn dataset: {}".format(trust)
# Author: Leland McInnes
#
# License: BSD 3 clause

import time
from warnings import warn

import numpy as np
import numba
from sklearn.utils.validation import check_is_fitted
import scipy.sparse


@numba.njit(parallel=True)
def fast_knn_indices(X, n_neighbors):
    """A fast computation of knn indices.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        The input data to compute the k-neighbor indices of.

    n_neighbors: int
        The number of nearest neighbors to compute for each sample in ``X``.

    Returns
    -------
    knn_indices: array of shape (n_samples, n_neighbors)
        The indices of the ``n_neighbors`` closest points in the dataset.
    """
    knn_indices = np.empty((X.shape[0], n_neighbors), dtype=np.int32)
    for row in numba.prange(X.shape[0]):
        # argsort must be called as a method (not np.argsort) for numba.
        order = X[row].argsort(kind="quicksort")
        knn_indices[row] = order[:n_neighbors]
    return knn_indices


@numba.njit("i4(i8[:])")
def tau_rand_int(state):
    """A fast (pseudo)-random number generator.

    Parameters
    ----------
    state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    A (pseudo)-random int32 value
    """
    # Three-component Tausworthe generator: each component is shifted,
    # masked to 32 bits, and the results are combined by xor.
    state[0] = (((state[0] & 4294967294) << 12) & 0xFFFFFFFF) ^ (
        (((state[0] << 13) & 0xFFFFFFFF) ^ state[0]) >> 19
    )
    state[1] = (((state[1] & 4294967288) << 4) & 0xFFFFFFFF) ^ (
        (((state[1] << 2) & 0xFFFFFFFF) ^ state[1]) >> 25
    )
    state[2] = (((state[2] & 4294967280) << 17) & 0xFFFFFFFF) ^ (
        (((state[2] << 3) & 0xFFFFFFFF) ^ state[2]) >> 11
    )
    return state[0] ^ state[1] ^ state[2]


@numba.njit("f4(i8[:])")
def tau_rand(state):
    """A fast (pseudo)-random number generator for floats in the range [0,1]

    Parameters
    ----------
    state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    A (pseudo)-random float32 in the interval [0, 1]
    """
    draw = tau_rand_int(state)
    return abs(float(draw) / 0x7FFFFFFF)


@numba.njit()
def norm(vec):
    """Compute the (standard l2) norm of a vector.

    Parameters
    ----------
    vec: array of shape (dim,)

    Returns
    -------
    The l2 norm of vec.
    """
    total = 0.0
    for idx in range(vec.shape[0]):
        total += vec[idx] ** 2
    return np.sqrt(total)


@numba.njit(parallel=True)
def submatrix(dmat, indices_col, n_neighbors):
    """Return a submatrix given an orginal matrix and the indices to keep.

    Parameters
    ----------
    dmat: array, shape (n_samples, n_samples)
        Original matrix.

    indices_col: array, shape (n_samples, n_neighbors)
        Indices to keep. Each row consists of the indices of the columns.

    n_neighbors: int
        Number of neighbors.

    Returns
    -------
    submat: array, shape (n_samples, n_neighbors)
        The corresponding submatrix.
    """
    n_samples_transform, n_samples_fit = dmat.shape
    submat = np.zeros((n_samples_transform, n_neighbors), dtype=dmat.dtype)
    for r in numba.prange(n_samples_transform):
        for c in numba.prange(n_neighbors):
            submat[r, c] = dmat[r, indices_col[r, c]]
    return submat
# Generates a timestamp for use in logging messages when verbose=True
def ts():
    """Return the current wall-clock time as a human-readable string."""
    return time.ctime(time.time())


# I'm not enough of a numba ninja to numba this successfully.
# np.arrays of lists, which are objects...
def csr_unique(matrix, return_index=True, return_inverse=True, return_counts=True):
    """Find the unique elements of a sparse csr matrix.

    We don't explicitly construct the unique matrix, leaving that to the user
    who may not want to duplicate a massive array in memory.

    matrix: a csr matrix
    return_index: bool, optional
        If true, return the row indices of 'matrix' that give the unique values.
    return_inverse: bool, optional
        If true, return the indices of the unique array that can be
        used to reconstruct 'matrix'.
    return_counts: bool, optional
        If true, return the number of times each unique item appears in 'matrix'.

    The unique matrix can be computed via
        unique_matrix = matrix[index]
    and the original matrix reconstructed via
        unique_matrix[inverse]
    """
    lil_matrix = matrix.tolil()
    # Encode each row as a (column-indices..., values...) tuple so np.unique
    # can compare rows; dtype=object keeps ragged tuples in a 1-d array.
    rows = np.asarray(
        [tuple(x + y) for x, y in zip(lil_matrix.rows, lil_matrix.data)], dtype=object
    )
    return_values = return_counts + return_inverse + return_index
    # Slice off the leading "unique values" entry; callers only asked for the
    # index/inverse/counts arrays.
    return np.unique(
        rows,
        return_index=return_index,
        return_inverse=return_inverse,
        return_counts=return_counts,
    )[1 : (return_values + 1)]


def disconnected_vertices(model):
    """
    Returns a boolean vector indicating which vertices are disconnected from the umap graph.
    These vertices will often be scattered across the space and make it difficult to focus on the main
    manifold. They can either be filtered and have UMAP re-run or simply filtered from the interactive plotting tool
    via the subset_points parameter.
    Use ~disconnected_vertices(model) to only plot the connected points.
    Parameters
    ----------
    model: a trained UMAP model

    Returns
    -------
    A boolean vector indicating which points are disconnected
    """
    check_is_fitted(model, "graph_")
    if model.unique:
        # Map back through the unique-row inverse so the mask aligns with the
        # original (possibly duplicated) input rows.
        vertices_disconnected = (
            np.array(model.graph_[model._unique_inverse_].sum(axis=1)).flatten() == 0
        )
    else:
        vertices_disconnected = np.array(model.graph_.sum(axis=1)).flatten() == 0
    return vertices_disconnected


def average_nn_distance(dist_matrix):
    """Calculate the average distance to each points nearest neighbors.

    Parameters
    ----------
    dist_matrix: a csr_matrix
        A distance matrix (usually umap_model.graph_)

    Returns
    -------
    An array with the average distance to each points nearest neighbors

    """
    (row_idx, col_idx, val) = scipy.sparse.find(dist_matrix)

    # Count/sum is done per row
    count_non_zero_elems = np.bincount(row_idx)
    sum_non_zero_elems = np.bincount(row_idx, weights=val)
    averages = sum_non_zero_elems / count_non_zero_elems

    if any(np.isnan(averages)):
        # BUG FIX: the two message fragments previously concatenated without
        # a separating space ("ignored.Use umap.utils...").
        warn(
            "Embedding contains disconnected vertices which will be ignored. "
            "Use umap.utils.disconnected_vertices() to identify them."
        )

    return averages
@numba.njit()
def trustworthiness_vector_bulk(
    indices_source, indices_embedded, max_k
):  # pragma: no cover
    """Accumulate trustworthiness scores for all k in [1, max_k] given
    precomputed source-space and embedded-space neighbor orderings."""
    n_samples = indices_embedded.shape[0]
    trustworthiness = np.zeros(max_k + 1, dtype=np.float64)

    for i in range(n_samples):
        for j in range(max_k):
            # Rank of the j-th embedded neighbor within the source ordering.
            rank = 0
            while indices_source[i, rank] != indices_embedded[i, j]:
                rank += 1

            # Penalise every k for which this neighbor is an intruder.
            for k in range(j + 1, max_k + 1):
                if rank > k:
                    trustworthiness[k] += rank - k

    for k in range(1, max_k + 1):
        trustworthiness[k] = 1.0 - trustworthiness[k] * (
            2.0 / (n_samples * k * (2.0 * n_samples - 3.0 * k - 1.0))
        )

    return trustworthiness


def make_trustworthiness_calculator(metric):  # pragma: no cover
    """Build a numba-compiled trustworthiness routine that recomputes source
    distances on the fly with ``metric`` instead of storing a full ordering."""

    @numba.njit(parallel=True)
    def trustworthiness_vector_lowmem(source, indices_embedded, max_k):
        n_samples = indices_embedded.shape[0]
        trustworthiness = np.zeros(max_k + 1, dtype=np.float64)
        dist_vector = np.zeros(n_samples, dtype=np.float64)

        for i in range(n_samples):
            # One row of source-space distances at a time keeps memory low.
            for j in numba.prange(n_samples):
                dist_vector[j] = metric(source[i], source[j])

            indices_source = np.argsort(dist_vector)

            for j in range(max_k):
                rank = 0
                while indices_source[rank] != indices_embedded[i, j]:
                    rank += 1

                for k in range(j + 1, max_k + 1):
                    if rank > k:
                        trustworthiness[k] += rank - k

        for k in range(1, max_k + 1):
            trustworthiness[k] = 1.0 - trustworthiness[k] * (
                2.0 / (n_samples * k * (2.0 * n_samples - 3.0 * k - 1.0))
            )

        trustworthiness[0] = 1.0

        return trustworthiness

    return trustworthiness_vector_lowmem


def trustworthiness_vector(
    source, embedding, max_k, metric="euclidean"
):  # pragma: no cover
    """Return trustworthiness of ``embedding`` vs ``source`` for each k up to
    ``max_k`` using the named distance metric."""
    tree = KDTree(embedding, metric=metric)
    indices_embedded = tree.query(embedding, k=max_k, return_distance=False)
    # Drop the actual point itself
    indices_embedded = indices_embedded[:, 1:]

    dist = named_distances[metric]
    vec_calculator = make_trustworthiness_calculator(dist)
    return vec_calculator(source, indices_embedded, max_k)