├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── data ├── 10x_pooled_400.mat ├── BranchedSynDat.mat ├── GSE60361_dat.mat ├── SCDE_k2_sup.mat ├── SCDE_test.mat └── SynMouseESprog_1000.mat ├── deploy_pypi.sh ├── docs ├── Makefile ├── _build │ ├── doctrees │ │ ├── best_practices.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ ├── modules.doctree │ │ ├── nmf_wrapper.doctree │ │ ├── preprocessing.doctree │ │ ├── readme_link.doctree │ │ ├── run_se.doctree │ │ ├── state_estimation.doctree │ │ ├── things_we_tried.doctree │ │ ├── uncurl.doctree │ │ ├── uncurl.robust.doctree │ │ ├── uncurl.scalable.doctree │ │ ├── uncurl_2.doctree │ │ ├── uncurl_pub.doctree │ │ └── unsupported_methods.doctree │ └── html │ │ ├── .buildinfo │ │ ├── .nojekyll │ │ ├── _modules │ │ ├── index.html │ │ └── uncurl │ │ │ ├── clustering.html │ │ │ ├── dim_reduce.html │ │ │ ├── dimensionality_reduction.html │ │ │ ├── evaluation.html │ │ │ ├── experiment_runner.html │ │ │ ├── lineage.html │ │ │ ├── nb_cluster.html │ │ │ ├── nb_clustering.html │ │ │ ├── nb_state_estimation.html │ │ │ ├── nmf_wrapper.html │ │ │ ├── pois_ll.html │ │ │ ├── poisson_cluster.html │ │ │ ├── preprocessing.html │ │ │ ├── qual2quant.html │ │ │ ├── robust │ │ │ └── state_estimation.html │ │ │ ├── run_se.html │ │ │ ├── scalable │ │ │ └── state_estimation.html │ │ │ ├── simulation.html │ │ │ ├── spatial_inference.html │ │ │ └── state_estimation.html │ │ ├── _sources │ │ ├── best_practices.rst.txt │ │ ├── index.rst.txt │ │ ├── modules.rst.txt │ │ ├── nmf_wrapper.rst.txt │ │ ├── preprocessing.rst.txt │ │ ├── readme_link.rst.txt │ │ ├── run_se.rst.txt │ │ ├── state_estimation.rst.txt │ │ ├── things_we_tried.rst.txt │ │ ├── uncurl.robust.rst.txt │ │ ├── uncurl.rst.txt │ │ ├── uncurl.scalable.rst.txt │ │ ├── uncurl_2.rst.txt │ │ ├── uncurl_pub.rst.txt │ │ └── unsupported_methods.rst.txt │ │ ├── _static │ │ ├── ajax-loader.gif │ │ ├── alabaster.css │ │ ├── basic.css │ │ ├── classic.css │ │ ├── custom.css │ │ ├── doctools.js │ │ ├── jquery-3.1.0.js │ │ ├── jquery.js │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── sidebar.js │ │ ├── underscore-1.3.1.js │ │ ├── underscore.js │ │ └── websupport.js │ │ ├── best_practices.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── modules.html │ │ ├── nmf_wrapper.html │ │ ├── objects.inv │ │ ├── preprocessing.html │ │ ├── py-modindex.html │ │ ├── readme_link.html │ │ ├── run_se.html │ │ ├── search.html │ │ ├── searchindex.js │ │ ├── state_estimation.html │ │ ├── things_we_tried.html │ │ ├── uncurl.html │ │ ├── uncurl.robust.html │ │ ├── uncurl.scalable.html │ │ ├── uncurl_2.html │ │ ├── uncurl_pub.html │ │ └── unsupported_methods.html ├── conf.py ├── index.rst ├── readme_link.rst ├── things_we_tried.rst ├── uncurl.rst ├── uncurl_pub.rst └── unsupported_methods.rst ├── examples ├── example.py ├── lineage_example.py ├── means_weights.npy ├── synthetic_example.py └── zeisel_subset_example.py ├── notebooks ├── Imputation.ipynb └── Tutorial.ipynb ├── optional_requirements.txt ├── push-docs.sh ├── requirements.txt ├── setup.py ├── tests ├── test_cluster.py ├── test_cluster_sparse.py ├── test_dim_reduce.py ├── test_experiment_runner.py ├── test_fit_dist.py ├── test_gap_score.py ├── test_lineage.py ├── test_nb.py ├── test_nb_state_estimation.py ├── test_nmf.py ├── test_poisson.py ├── test_preprocessing.py ├── test_qual2quant.py ├── test_real_data.py ├── test_state_estimation.py ├── test_state_estimation_sparse.py └── test_zip_state_estimation.py └── uncurl ├── __init__.py ├── clustering.py ├── 
dimensionality_reduction.py ├── ensemble.py ├── evaluation.py ├── experiment_runner.py ├── fit_dist_data.py ├── gap_score.py ├── lightlda_utils.py ├── lineage.py ├── nb_clustering.py ├── nb_state_estimation.py ├── nmf_wrapper.py ├── nolips.pyx ├── nolips_parallel.pyx ├── plda_utils.py ├── pois_ll.py ├── preprocessing.py ├── qual2quant.py ├── run_se.py ├── sampling.py ├── simulation.py ├── sparse_utils.pyx ├── state_estimation.py ├── vis.py ├── zip_clustering.py ├── zip_state_estimation.py └── zip_utils.py /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: enabled 3 | python: 4 | - 2.7 5 | - 3.4 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install pytest 9 | - pip install flaky 10 | - pip install . 11 | script: pytest 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Yue Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include tests/*.py 2 | include uncurl/*.pyx -------------------------------------------------------------------------------- /data/10x_pooled_400.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/data/10x_pooled_400.mat -------------------------------------------------------------------------------- /data/BranchedSynDat.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/data/BranchedSynDat.mat -------------------------------------------------------------------------------- /data/GSE60361_dat.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/data/GSE60361_dat.mat -------------------------------------------------------------------------------- /data/SCDE_k2_sup.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/data/SCDE_k2_sup.mat -------------------------------------------------------------------------------- /data/SCDE_test.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/data/SCDE_test.mat -------------------------------------------------------------------------------- /data/SynMouseESprog_1000.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/data/SynMouseESprog_1000.mat -------------------------------------------------------------------------------- /deploy_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # before running this: 4 | # 1. increase the version number in setup.py, git commit 5 | # 2. git tag v<version> 6 | # 3. git push --tags 7 | 8 | # delete existing dists 9 | rm dist/*.tar.gz 10 | rm dist/*.whl 11 | 12 | # create a source distribution 13 | python3 setup.py sdist 14 | 15 | # create wheels - note: these can't be uploaded to PyPI 16 | python3 setup.py bdist_wheel 17 | 18 | # upload 19 | twine upload dist/uncurl_seq-*.tar.gz --verbose 20 | 21 | # TODO: how to upload built wheels? This requires the 'manylinux1' platform tag? 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXAUTO = sphinx-apidoc 8 | SPHINXPROJ = UNCURL 9 | SOURCEDIR = . 10 | PYTHONDIR = ../uncurl 11 | BUILDDIR = _build 12 | 13 | # Put it first so that "make" without argument is like "make help".
14 | help: 15 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 16 | 17 | .PHONY: help Makefile 18 | 19 | # Catch-all target: route all unknown targets to Sphinx using the new 20 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 21 | %: Makefile 22 | @$(SPHINXAUTO) -o ./ "$(PYTHONDIR)" 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/_build/doctrees/best_practices.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/best_practices.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/modules.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/modules.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/nmf_wrapper.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/nmf_wrapper.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/preprocessing.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/preprocessing.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/readme_link.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/readme_link.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/run_se.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/run_se.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/state_estimation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/state_estimation.doctree 
-------------------------------------------------------------------------------- /docs/_build/doctrees/things_we_tried.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/things_we_tried.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/uncurl.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/uncurl.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/uncurl.robust.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/uncurl.robust.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/uncurl.scalable.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/uncurl.scalable.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/uncurl_2.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/uncurl_2.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/uncurl_pub.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/uncurl_pub.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/unsupported_methods.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/doctrees/unsupported_methods.doctree -------------------------------------------------------------------------------- /docs/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 7a41e62564e52ad3fce21832d1242a34 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/_build/html/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/html/.nojekyll -------------------------------------------------------------------------------- /docs/_build/html/_modules/index.html: -------------------------------------------------------------------------------- [Sphinx-generated page "Overview: module code — UNCURL 0.2.3 documentation": the index of rendered module source listings; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/_modules/uncurl/spatial_inference.html: -------------------------------------------------------------------------------- [Sphinx-generated page "uncurl.spatial_inference — UNCURL 0.2.3 documentation": the rendered source listing for uncurl/spatial_inference.py, which contains only the header comment "# Spatial inference using Poisson clustering" and a stub "def spatial(data):" with an empty docstring; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/_sources/best_practices.rst.txt: --------------------------------------------------------------------------------

UNCURL Best Practices
=====================


Distribution Selection
======================

In general, the best distribution to use for a given dataset is the one for which the most genes have the lowest error: see (cite figure here).

State Estimation
================

Gene subset selection
---------------------

.. code-block:: python

    import uncurl

    # data is a genes x cells count matrix;
    # select a high-variance gene subset before running state estimation
    genes = uncurl.max_variance_genes(data)
    data_subset = data[genes, :]


Initialization
--------------

The default initialization is based on truncated SVD followed by K-means; the alternatives are described under "Uncurl initialization options" in "things we tried".

Semi-supervision, using bulk or qualitative data
------------------------------------------------

Semi-supervision is done using the ``qualNorm`` function, which converts bulk or qualitative prior expression data into starting means for state estimation.
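A minimal sketch of this workflow (here ``qual_matrix``, a genes x cell-types binary matrix, is a placeholder for your own prior data, and the ``init_means`` keyword is an assumption — check the ``state_estimation`` module documentation for the exact argument name):

.. code-block:: python

    import numpy as np
    import uncurl

    data = np.loadtxt('counts.txt')  # genes x cells

    # convert qualitative prior information into starting means
    starting_means = uncurl.qualNorm(data, qual_matrix)

    # run state estimation initialized from those means
    M, W, ll = uncurl.poisson_estimate_state(data, qual_matrix.shape[1],
                                             init_means=starting_means)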
Clustering
==========

Visualization
=============

-------------------------------------------------------------------------------- /docs/_build/html/_sources/index.rst.txt: --------------------------------------------------------------------------------

.. UNCURL documentation master file, created by
   sphinx-quickstart on Mon Mar 27 13:42:21 2017.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to UNCURL's documentation!
==================================


.. toctree::
   :maxdepth: 2
   :caption: Contents:

   readme_link
   unsupported_methods
   things_we_tried
   uncurl



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

-------------------------------------------------------------------------------- /docs/_build/html/_sources/modules.rst.txt: --------------------------------------------------------------------------------

uncurl
======

.. toctree::
   :maxdepth: 4

   uncurl

-------------------------------------------------------------------------------- /docs/_build/html/_sources/nmf_wrapper.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/html/_sources/nmf_wrapper.rst.txt -------------------------------------------------------------------------------- /docs/_build/html/_sources/preprocessing.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/html/_sources/preprocessing.rst.txt -------------------------------------------------------------------------------- /docs/_build/html/_sources/readme_link.rst.txt: --------------------------------------------------------------------------------

.. include:: ../README.rst

-------------------------------------------------------------------------------- /docs/_build/html/_sources/things_we_tried.rst.txt: --------------------------------------------------------------------------------

Non-default parameters: things we tried and their results
==========================================================

There are a number of uncurl parameters and run configurations that we experimented with. Here are some of the results.


Cell normalization
------------------

This option involves normalizing the cells by their read counts. First, we calculate the total read count of each cell and divide all counts for each cell by its total read count. Then, we find the median total read count over all cells and multiply the entire matrix by that value. This method has been used previously for scRNA-seq datasets [see paper for reference].

Clustering performance after cell normalization was substantially better on count-valued datasets, and either unchanged or marginally worse on RPKM-normalized data and other data that has already been normalized in some other way. So we would suggest using this option for unnormalized count-valued datasets. The downside is that it might lose some information (if certain cell types are correlated with larger read counts), but we are not sure whether that happens in practice.

[TODO: include graphs]

To use this option, run ``data_normalized = uncurl.preprocessing.cell_normalize(data)``, and run uncurl on ``data_normalized``.
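The computation itself is small; here is a dense-matrix sketch of what ``cell_normalize`` does (``uncurl.preprocessing.cell_normalize`` is the supported entry point, and it also handles sparse matrices):

.. code-block:: python

    import numpy as np

    def cell_normalize_dense(data):
        # data: genes x cells count matrix; assumes every cell has at least one count
        counts_per_cell = data.sum(0)        # total read count of each cell
        normalized = data / counts_per_cell  # divide each cell's counts by its total
        # rescale the whole matrix by the median total read count
        return normalized * np.median(counts_per_cell)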
Constrained W
-------------

When this option is activated, the ``W`` matrix is normalized so that its columns sum to 1 after each round of alternating minimization. Without this option, ``W`` is only constrained to be nonnegative during the optimization process, and is normalized after the end of the optimization.

In clustering experiments, this option had mixed results: it performed marginally better on some datasets and marginally worse on others. On the 10X datasets, constrained W performed slightly better when combined with cell normalization, and worse without cell normalization.

[TODO: include graphs]

To use this option, add the argument ``constrain_w=True`` to ``run_state_estimation`` or ``poisson_estimate_state``. This does not work for the NMF-based methods.


Uncurl initialization options
-----------------------------

We provide a variety of initialization options for uncurl. Most initialization methods first perform a clustering, then initialize M based on the cluster means and W based on the cluster assignments. The default initialization is based on truncated SVD followed by K-means. We also provide initializations based on Poisson clustering, and on Poisson k-means++ with randomized W.

In clustering experiments, truncated SVD initialization usually performed the best, but there were some datasets on which Poisson clustering initialization performed better. For example, on randomly downsampled data, Poisson clustering initialization seems to perform better.

To use different initializations, use the argument ``initialization=<method>``, where ``<method>`` can be one of ``tsvd`` (truncated SVD + K-means), ``cluster`` (Poisson clustering), ``kmpp`` (Poisson k-means++), or ``km`` (k-means on the full data).


Alternative to QualNorm: mean-normalized initialization
-------------------------------------------------------

Given prior gene expression data, there are a variety of methods for initializing uncurl. ``QualNorm`` is one way of doing this initialization. Another way, when we have real-valued prior data, is to normalize the prior data so that each cell type sums to 1, and then multiply that by the mean per-cell read count of the actual data; a sketch is shown below.

This performed better than QualNorm on sparse datasets such as the 10X datasets.
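A sketch of the mean-normalized initialization (``prior``, a genes x cell-types matrix of real-valued prior expression, is a placeholder for your own data, and the helper name is ours, not part of the package):

.. code-block:: python

    import numpy as np

    def mean_normalized_init(data, prior):
        # scale each cell type (column) of the prior so that it sums to 1
        prior_normalized = prior / prior.sum(0)
        # rescale by the mean per-cell read count of the actual data
        return prior_normalized * data.sum(0).mean()

The result can then be used as the initial means for state estimation in place of the QualNorm output.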
Optimization methods
--------------------

The default optimization method for Poisson state estimation is NoLips [see paper for reference].

Before settling on NoLips as the default, we also tried a variety of different optimization methods. The first was L-BFGS, as implemented in scipy. We also tried gradient descent, stochastic gradient descent, and a custom method based on alternating iteratively reweighted least squares on a Poisson regression model. These methods are not included in the uncurl package because they had poor performance characteristics compared to NoLips. We settled on NoLips because it was easy to port to sparse matrices and was easily parallelizable. L-BFGS tends to converge in fewer iterations, but the per-iteration time for NoLips is much lower: it has closed-form updates that don't require gradient or objective value calculations, and the updates take advantage of data sparsity.

To use different optimization methods, use the argument ``method=<method>``, where ``<method>`` can be either ``NoLips`` (the default) or ``L-BFGS-B``.

-------------------------------------------------------------------------------- /docs/_build/html/_sources/uncurl.robust.rst.txt: --------------------------------------------------------------------------------

uncurl.robust package
=====================

Submodules
----------

uncurl.robust.state_estimation module
-------------------------------------

.. automodule:: uncurl.robust.state_estimation
    :members:
    :undoc-members:
    :show-inheritance:


Module contents
---------------

.. automodule:: uncurl.robust
    :members:
    :undoc-members:
    :show-inheritance:

-------------------------------------------------------------------------------- /docs/_build/html/_sources/uncurl.rst.txt: --------------------------------------------------------------------------------

uncurl package
==============

Submodules
----------

uncurl.preprocessing module
---------------------------

.. automodule:: uncurl.preprocessing
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.run_se module
--------------------

.. automodule:: uncurl.run_se
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.state_estimation module
------------------------------

.. automodule:: uncurl.state_estimation
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.nmf_wrapper module
-------------------------

.. automodule:: uncurl.nmf_wrapper
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.qual2quant module
------------------------

.. automodule:: uncurl.qual2quant
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.clustering module
------------------------

.. automodule:: uncurl.clustering
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.dimensionality_reduction module
--------------------------------------

.. automodule:: uncurl.dimensionality_reduction
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.evaluation module
------------------------

.. automodule:: uncurl.evaluation
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.experiment_runner module
-------------------------------

.. automodule:: uncurl.experiment_runner
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.lineage module
---------------------

.. automodule:: uncurl.lineage
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.nb_cluster module
------------------------

.. automodule:: uncurl.nb_cluster
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.nb_state_estimation module
---------------------------------

.. automodule:: uncurl.nb_state_estimation
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.pois_ll module
---------------------

.. automodule:: uncurl.pois_ll
    :members:
    :undoc-members:
    :show-inheritance:

uncurl.simulation module
------------------------

.. automodule:: uncurl.simulation
    :members:
    :undoc-members:
    :show-inheritance:


Module contents
---------------

.. automodule:: uncurl
    :members:
    :undoc-members:
    :show-inheritance:

-------------------------------------------------------------------------------- /docs/_build/html/_sources/uncurl.scalable.rst.txt: --------------------------------------------------------------------------------

uncurl.scalable package
=======================

Submodules
----------

uncurl.scalable.state_estimation module
---------------------------------------

.. automodule:: uncurl.scalable.state_estimation
    :members:
    :undoc-members:
    :show-inheritance:


Module contents
---------------

.. automodule:: uncurl.scalable
    :members:
    :undoc-members:
    :show-inheritance:
-------------------------------------------------------------------------------- /docs/_build/html/_sources/uncurl_2.rst.txt: --------------------------------------------------------------------------------

UNCURL public functions
=======================

.. automodule:: uncurl
    :members: poisson_cluster, nb_cluster, qual2quant, poisson_estimate_state, lineage

-------------------------------------------------------------------------------- /docs/_build/html/_sources/uncurl_pub.rst.txt: --------------------------------------------------------------------------------

UNCURL public functions
=======================

uncurl.max_variance_genes
-------------------------

.. autofunction:: uncurl.max_variance_genes

uncurl.qualNorm
---------------

.. autofunction:: uncurl.qualNorm

uncurl.poisson_cluster
----------------------

.. autofunction:: uncurl.poisson_cluster

uncurl.nb_cluster
-----------------

.. autofunction:: uncurl.nb_cluster

uncurl.poisson_estimate_state
-----------------------------

.. autofunction:: uncurl.poisson_estimate_state

uncurl.nb_estimate_state
------------------------

.. autofunction:: uncurl.nb_estimate_state

uncurl.mds
----------

.. autofunction:: uncurl.mds

uncurl.lineage
--------------

.. autofunction:: uncurl.lineage

uncurl.pseudotime
-----------------

.. autofunction:: uncurl.pseudotime

-------------------------------------------------------------------------------- /docs/_build/html/_sources/unsupported_methods.rst.txt: --------------------------------------------------------------------------------

Details on unsupported methods
==============================

There are a number of unsupported or experimental methods that are part of the UNCURL package. We provide information on them here for the sake of completeness, but cannot vouch for their correctness.

Alternative state estimation methods
------------------------------------

We provide implementations of the convex mixture model for the negative binomial (NB) and zero-inflated Poisson (ZIP) distributions. In our experiments they did not work as well as the Poisson model on most datasets, and they are substantially less efficient.

We also provide methods based on LDA (latent Dirichlet allocation), using the LightLDA implementation. The outputs of these methods can be interpreted as state estimation with a binomial sampling distribution. See ``lightlda_utils.py``. In practice, they had worse performance than Poisson state estimation in accuracy, runtime, and memory usage, especially on larger datasets.

Alternative clustering methods
------------------------------

As with state estimation, we provide NB and ZIP versions of k-means. The same efficiency considerations apply; an illustrative call is sketched below.
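As an illustration, the NB version follows the same call pattern as ``poisson_cluster`` (the exact return values here are an assumption — check ``nb_clustering.py`` for the authoritative signature):

.. code-block:: python

    import numpy as np
    from uncurl import nb_cluster

    data = np.loadtxt('counts.txt')  # genes x cells count matrix

    # cluster the cells into 4 groups under a negative binomial model;
    # returns per-cell cluster assignments and the fitted NB parameters
    assignments, P, R = nb_cluster(data, 4)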
Dimensionality reduction
------------------------

The ``mds`` function performs dimensionality reduction using MDS. This works by running MDS on M to convert it into a projection matrix, and then using that matrix to project W onto 2d space. This is much faster than tSNE or even PCA, at the cost of some fidelity, but it might work as a first pass.

Example:

.. code-block:: python

    import numpy as np
    from uncurl import mds, poisson_estimate_state

    data = np.loadtxt('counts.txt')

    # dimensionality reduction using MDS on state estimation means
    M, W, ll = poisson_estimate_state(data, 4)
    # proj is a 2d projection of the data.
    proj = mds(M, W, 2)


Lineage estimation
------------------

The ``lineage`` function performs lineage estimation from the output of ``poisson_estimate_state``. It fits the data to a different 5th degree polynomial for each cell type.

The ``pseudotime`` function calculates the pseudotime for each cell given the output of ``lineage`` and a starting cell.

Example (including visualization):

.. code-block:: python

    import numpy as np
    import matplotlib.pyplot as plt

    from uncurl import poisson_estimate_state, mds, lineage, pseudotime

    data = np.loadtxt('counts.txt')

    # pretend that there are three natural clusters in the dataset.
    M, W, ll = poisson_estimate_state(data, 3)

    curve_params, smoothed_points, edges, cell_assignments = lineage(M, W)

    # assume the "root" is cell 0
    ptime = pseudotime(0, edges, smoothed_points)

    # visualizing the lineage
    proj = mds(M, W, 2)

    plt.scatter(proj[0,:], proj[1,:], s=10, c=cell_assignments, edgecolors='none', alpha=0.7)
    plt.scatter(smoothed_points[0,:], smoothed_points[1,:], s=30, c=cell_assignments, edgecolors='none', alpha=0.7)
    # connect the lines
    for edge in edges:
        plt.plot((smoothed_points[0, edge[0]], smoothed_points[0, edge[1]]),
                 (smoothed_points[1, edge[0]], smoothed_points[1, edge[1]]), 'black', linewidth=2)
    plt.xlabel('dim 1')
    plt.ylabel('dim 2')


Ensemble Methods
----------------

Consensus clustering, consensus clustering-based initialization for uncurl, etc. This requires the `Cluster_Ensembles package `_.


Visualization
-------------

see ``vis.py``

-------------------------------------------------------------------------------- /docs/_build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /docs/_build/html/_static/classic.css: -------------------------------------------------------------------------------- 1 | /* 2 | * classic.css_t 3 | * ~~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- classic theme. 6 | * 7 | * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details.
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: sans-serif; 18 | font-size: 100%; 19 | background-color: #11303d; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | background-color: #1c4e63; 27 | } 28 | 29 | div.documentwrapper { 30 | float: left; 31 | width: 100%; 32 | } 33 | 34 | div.bodywrapper { 35 | margin: 0 0 0 230px; 36 | } 37 | 38 | div.body { 39 | background-color: #ffffff; 40 | color: #000000; 41 | padding: 0 20px 30px 20px; 42 | } 43 | 44 | div.footer { 45 | color: #ffffff; 46 | width: 100%; 47 | padding: 9px 0 9px 0; 48 | text-align: center; 49 | font-size: 75%; 50 | } 51 | 52 | div.footer a { 53 | color: #ffffff; 54 | text-decoration: underline; 55 | } 56 | 57 | div.related { 58 | background-color: #133f52; 59 | line-height: 30px; 60 | color: #ffffff; 61 | } 62 | 63 | div.related a { 64 | color: #ffffff; 65 | } 66 | 67 | div.sphinxsidebar { 68 | } 69 | 70 | div.sphinxsidebar h3 { 71 | font-family: 'Trebuchet MS', sans-serif; 72 | color: #ffffff; 73 | font-size: 1.4em; 74 | font-weight: normal; 75 | margin: 0; 76 | padding: 0; 77 | } 78 | 79 | div.sphinxsidebar h3 a { 80 | color: #ffffff; 81 | } 82 | 83 | div.sphinxsidebar h4 { 84 | font-family: 'Trebuchet MS', sans-serif; 85 | color: #ffffff; 86 | font-size: 1.3em; 87 | font-weight: normal; 88 | margin: 5px 0 0 0; 89 | padding: 0; 90 | } 91 | 92 | div.sphinxsidebar p { 93 | color: #ffffff; 94 | } 95 | 96 | div.sphinxsidebar p.topless { 97 | margin: 5px 10px 10px 10px; 98 | } 99 | 100 | div.sphinxsidebar ul { 101 | margin: 10px; 102 | padding: 0; 103 | color: #ffffff; 104 | } 105 | 106 | div.sphinxsidebar a { 107 | color: #98dbcc; 108 | } 109 | 110 | div.sphinxsidebar input { 111 | border: 1px solid #98dbcc; 112 | font-family: sans-serif; 113 | font-size: 1em; 114 | } 115 | 116 | 117 | 118 | /* -- hyperlink styles ------------------------------------------------------ */ 119 | 120 | a { 121 | color: #355f7c; 122 | text-decoration: none; 123 | } 124 | 125 | a:visited { 126 | color: #355f7c; 127 | text-decoration: none; 128 | } 129 | 130 | a:hover { 131 | text-decoration: underline; 132 | } 133 | 134 | 135 | 136 | /* -- body styles ----------------------------------------------------------- */ 137 | 138 | div.body h1, 139 | div.body h2, 140 | div.body h3, 141 | div.body h4, 142 | div.body h5, 143 | div.body h6 { 144 | font-family: 'Trebuchet MS', sans-serif; 145 | background-color: #f2f2f2; 146 | font-weight: normal; 147 | color: #20435c; 148 | border-bottom: 1px solid #ccc; 149 | margin: 20px -20px 10px -20px; 150 | padding: 3px 0 3px 10px; 151 | } 152 | 153 | div.body h1 { margin-top: 0; font-size: 200%; } 154 | div.body h2 { font-size: 160%; } 155 | div.body h3 { font-size: 140%; } 156 | div.body h4 { font-size: 120%; } 157 | div.body h5 { font-size: 110%; } 158 | div.body h6 { font-size: 100%; } 159 | 160 | a.headerlink { 161 | color: #c60f0f; 162 | font-size: 0.8em; 163 | padding: 0 4px 0 4px; 164 | text-decoration: none; 165 | } 166 | 167 | a.headerlink:hover { 168 | background-color: #c60f0f; 169 | color: white; 170 | } 171 | 172 | div.body p, div.body dd, div.body li, div.body blockquote { 173 | text-align: justify; 174 | line-height: 130%; 175 | } 176 | 177 | div.admonition p.admonition-title + p { 178 | display: inline; 179 | } 180 | 181 | div.admonition p { 182 | margin-bottom: 5px; 183 | } 184 | 185 | div.admonition pre { 186 | margin-bottom: 5px; 187 | 
} 188 | 189 | div.admonition ul, div.admonition ol { 190 | margin-bottom: 5px; 191 | } 192 | 193 | div.note { 194 | background-color: #eee; 195 | border: 1px solid #ccc; 196 | } 197 | 198 | div.seealso { 199 | background-color: #ffc; 200 | border: 1px solid #ff6; 201 | } 202 | 203 | div.topic { 204 | background-color: #eee; 205 | } 206 | 207 | div.warning { 208 | background-color: #ffe4e4; 209 | border: 1px solid #f66; 210 | } 211 | 212 | p.admonition-title { 213 | display: inline; 214 | } 215 | 216 | p.admonition-title:after { 217 | content: ":"; 218 | } 219 | 220 | pre { 221 | padding: 5px; 222 | background-color: #eeffcc; 223 | color: #333333; 224 | line-height: 120%; 225 | border: 1px solid #ac9; 226 | border-left: none; 227 | border-right: none; 228 | } 229 | 230 | code { 231 | background-color: #ecf0f3; 232 | padding: 0 1px 0 1px; 233 | font-size: 0.95em; 234 | } 235 | 236 | th { 237 | background-color: #ede; 238 | } 239 | 240 | .warning code { 241 | background: #efc2c2; 242 | } 243 | 244 | .note code { 245 | background: #d6d6d6; 246 | } 247 | 248 | .viewcode-back { 249 | font-family: sans-serif; 250 | } 251 | 252 | div.viewcode-block:target { 253 | background-color: #f4debf; 254 | border-top: 1px solid #ac9; 255 | border-bottom: 1px solid #ac9; 256 | } 257 | 258 | div.code-block-caption { 259 | color: #efefef; 260 | background-color: #1c4e63; 261 | } -------------------------------------------------------------------------------- /docs/_build/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. */ 2 | -------------------------------------------------------------------------------- /docs/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ 8 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 9 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 10 | .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ 11 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 12 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 13 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 14 | .highlight .ge { font-style: italic } /* Generic.Emph */ 15 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 16 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 17 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 18 | .highlight .go { color: #333333 } /* Generic.Output */ 19 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 20 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 21 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 22 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 23 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 24 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 25 | .highlight .kn { color: #007020; font-weight: bold } /* 
Keyword.Namespace */ 26 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 27 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 28 | .highlight .kt { color: #902000 } /* Keyword.Type */ 29 | .highlight .m { color: #208050 } /* Literal.Number */ 30 | .highlight .s { color: #4070a0 } /* Literal.String */ 31 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 32 | .highlight .nb { color: #007020 } /* Name.Builtin */ 33 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 34 | .highlight .no { color: #60add5 } /* Name.Constant */ 35 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 36 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 37 | .highlight .ne { color: #007020 } /* Name.Exception */ 38 | .highlight .nf { color: #06287e } /* Name.Function */ 39 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 40 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 41 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 42 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 43 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 44 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 45 | .highlight .mb { color: #208050 } /* Literal.Number.Bin */ 46 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 47 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 48 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 49 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 50 | .highlight .sa { color: #4070a0 } /* Literal.String.Affix */ 51 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 52 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 53 | .highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ 54 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 55 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 56 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 57 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 58 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 59 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 60 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 61 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 62 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 63 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 64 | .highlight .fm { color: #06287e } /* Name.Function.Magic */ 65 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 66 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 67 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 68 | .highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ 69 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_build/html/_static/sidebar.js: -------------------------------------------------------------------------------- 1 | /* 2 | * sidebar.js 3 | * ~~~~~~~~~~ 4 | * 5 | * This script makes the Sphinx sidebar collapsible. 6 | * 7 | * .sphinxsidebar contains .sphinxsidebarwrapper. This script adds 8 | * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton 9 | * used to collapse and expand the sidebar. 
10 | * 11 | * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden 12 | * and the width of the sidebar and the margin-left of the document 13 | * are decreased. When the sidebar is expanded the opposite happens. 14 | * This script saves a per-browser/per-session cookie used to 15 | * remember the position of the sidebar among the pages. 16 | * Once the browser is closed the cookie is deleted and the position 17 | * reset to the default (expanded). 18 | * 19 | * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. 20 | * :license: BSD, see LICENSE for details. 21 | * 22 | */ 23 | 24 | $(function() { 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | // global elements used by the functions. 34 | // the 'sidebarbutton' element is defined as global after its 35 | // creation, in the add_sidebar_button function 36 | var bodywrapper = $('.bodywrapper'); 37 | var sidebar = $('.sphinxsidebar'); 38 | var sidebarwrapper = $('.sphinxsidebarwrapper'); 39 | 40 | // for some reason, the document has no sidebar; do not run into errors 41 | if (!sidebar.length) return; 42 | 43 | // original margin-left of the bodywrapper and width of the sidebar 44 | // with the sidebar expanded 45 | var bw_margin_expanded = bodywrapper.css('margin-left'); 46 | var ssb_width_expanded = sidebar.width(); 47 | 48 | // margin-left of the bodywrapper and width of the sidebar 49 | // with the sidebar collapsed 50 | var bw_margin_collapsed = '.8em'; 51 | var ssb_width_collapsed = '.8em'; 52 | 53 | // colors used by the current theme 54 | var dark_color = $('.related').css('background-color'); 55 | var light_color = $('.document').css('background-color'); 56 | 57 | function sidebar_is_collapsed() { 58 | return sidebarwrapper.is(':not(:visible)'); 59 | } 60 | 61 | function toggle_sidebar() { 62 | if (sidebar_is_collapsed()) 63 | expand_sidebar(); 64 | else 65 | collapse_sidebar(); 66 | } 67 | 68 | function collapse_sidebar() { 69 | sidebarwrapper.hide(); 70 | sidebar.css('width', ssb_width_collapsed); 71 | bodywrapper.css('margin-left', bw_margin_collapsed); 72 | sidebarbutton.css({ 73 | 'margin-left': '0', 74 | 'height': bodywrapper.height() 75 | }); 76 | sidebarbutton.find('span').text('»'); 77 | sidebarbutton.attr('title', _('Expand sidebar')); 78 | document.cookie = 'sidebar=collapsed'; 79 | } 80 | 81 | function expand_sidebar() { 82 | bodywrapper.css('margin-left', bw_margin_expanded); 83 | sidebar.css('width', ssb_width_expanded); 84 | sidebarwrapper.show(); 85 | sidebarbutton.css({ 86 | 'margin-left': ssb_width_expanded-12, 87 | 'height': bodywrapper.height() 88 | }); 89 | sidebarbutton.find('span').text('«'); 90 | sidebarbutton.attr('title', _('Collapse sidebar')); 91 | document.cookie = 'sidebar=expanded'; 92 | } 93 | 94 | function add_sidebar_button() { 95 | sidebarwrapper.css({ 96 | 'float': 'left', 97 | 'margin-right': '0', 98 | 'width': ssb_width_expanded - 28 99 | }); 100 | // create the button 101 | sidebar.append( 102 | '
<div id="sidebarbutton"><span>&laquo;</span></div>
' 103 | ); 104 | var sidebarbutton = $('#sidebarbutton'); 105 | light_color = sidebarbutton.css('background-color'); 106 | // find the height of the viewport to center the '<<' in the page 107 | var viewport_height; 108 | if (window.innerHeight) 109 | viewport_height = window.innerHeight; 110 | else 111 | viewport_height = $(window).height(); 112 | sidebarbutton.find('span').css({ 113 | 'display': 'block', 114 | 'margin-top': (viewport_height - sidebar.position().top - 20) / 2 115 | }); 116 | 117 | sidebarbutton.click(toggle_sidebar); 118 | sidebarbutton.attr('title', _('Collapse sidebar')); 119 | sidebarbutton.css({ 120 | 'color': '#FFFFFF', 121 | 'border-left': '1px solid ' + dark_color, 122 | 'font-size': '1.2em', 123 | 'cursor': 'pointer', 124 | 'height': bodywrapper.height(), 125 | 'padding-top': '1px', 126 | 'margin-left': ssb_width_expanded - 12 127 | }); 128 | 129 | sidebarbutton.hover( 130 | function () { 131 | $(this).css('background-color', dark_color); 132 | }, 133 | function () { 134 | $(this).css('background-color', light_color); 135 | } 136 | ); 137 | } 138 | 139 | function set_position_from_cookie() { 140 | if (!document.cookie) 141 | return; 142 | var items = document.cookie.split(';'); 143 | for(var k=0; k<items.length; k++) { 144 | var key_val = items[k].split('='); 145 | var key = key_val[0].trim(); 146 | if (key == 'sidebar') { 147 | var value = key_val[1]; 148 | if ((value == 'collapsed') && (!sidebar_is_collapsed())) 149 | collapse_sidebar(); 150 | else if ((value == 'expanded') && (sidebar_is_collapsed())) 151 | expand_sidebar(); 152 | } 153 | } 154 | } 155 | 156 | add_sidebar_button(); 157 | var sidebarbutton = $('#sidebarbutton'); 158 | set_position_from_cookie(); 159 | }); -------------------------------------------------------------------------------- /docs/_build/html/best_practices.html: --------------------------------------------------------------------------------
[Sphinx-rendered page "UNCURL Best Practices — UNCURL 0.2.3 documentation": rendered duplicate of _sources/best_practices.rst.txt above; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/modules.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "uncurl — UNCURL 0.2.3 documentation": rendered duplicate of _sources/modules.rst.txt above; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/nmf_wrapper.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "<no title> — UNCURL 0.2.3 documentation"; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/docs/_build/html/objects.inv
-------------------------------------------------------------------------------- /docs/_build/html/preprocessing.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "<no title> — UNCURL 0.2.3 documentation"; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/run_se.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "<no title> — UNCURL 0.2.3 documentation"; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/search.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "Search — UNCURL 0.2.3 documentation": the standard Sphinx search form and "Please activate JavaScript" notice; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/state_estimation.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "<no title> — UNCURL 0.2.3 documentation"; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/_build/html/uncurl.robust.html: -------------------------------------------------------------------------------- [Sphinx-rendered page "uncurl.robust package — UNCURL 0.2.3 documentation": rendered duplicate of _sources/uncurl.robust.rst.txt above; navigation markup omitted]
-------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # UNCURL documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Mar 27 13:42:21 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | sys.path.insert(0, os.path.abspath('../uncurl/')) 22 | sys.path.insert(0, os.path.abspath('../')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.mathjax', 37 | 'sphinx.ext.viewcode', 38 | 'sphinx.ext.githubpages', 39 | 'sphinx.ext.napoleon'] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | # source_suffix = ['.rst', '.md'] 48 | source_suffix = '.rst' 49 | 50 | # The master toctree document. 51 | master_doc = 'index' 52 | 53 | # General information about the project. 54 | project = u'UNCURL' 55 | copyright = u'2017, Sumit Mukherjee, Yue Zhang' 56 | author = u'Sumit Mukherjee, Yue Zhang' 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = u'0.2.3' 64 | # The full version, including alpha/beta/rc tags. 65 | release = u'0.2.3' 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 69 | # 70 | # This is also used if you do content translation via gettext catalogs. 71 | # Usually you set "language" from the command line for these cases. 72 | language = None 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | # This patterns also effect to html_static_path and html_extra_path 77 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 78 | 79 | # The name of the Pygments (syntax highlighting) style to use. 80 | pygments_style = 'sphinx' 81 | 82 | # If true, `todo` and `todoList` produce output, else they produce nothing. 83 | todo_include_todos = False 84 | 85 | 86 | # -- Options for HTML output ---------------------------------------------- 87 | 88 | # The theme to use for HTML and HTML Help pages. See the documentation for 89 | # a list of builtin themes.
90 | # 91 | html_theme = 'classic' 92 | 93 | # Theme options are theme-specific and customize the look and feel of a theme 94 | # further. For a list of options available for each theme, see the 95 | # documentation. 96 | # 97 | # html_theme_options = {} 98 | 99 | # Add any paths that contain custom static files (such as style sheets) here, 100 | # relative to this directory. They are copied after the builtin static files, 101 | # so a file named "default.css" will overwrite the builtin "default.css". 102 | html_static_path = ['_static'] 103 | 104 | 105 | # -- Options for HTMLHelp output ------------------------------------------ 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = 'UNCURLdoc' 109 | 110 | 111 | # -- Options for LaTeX output --------------------------------------------- 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | 118 | # The font size ('10pt', '11pt' or '12pt'). 119 | # 120 | # 'pointsize': '10pt', 121 | 122 | # Additional stuff for the LaTeX preamble. 123 | # 124 | # 'preamble': '', 125 | 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'UNCURL.tex', u'UNCURL Documentation', 136 | u'Sumit Mukherjee, Yue Zhang', 'manual'), 137 | ] 138 | 139 | 140 | # -- Options for manual page output --------------------------------------- 141 | 142 | # One entry per manual page. List of tuples 143 | # (source start file, name, description, authors, manual section). 144 | man_pages = [ 145 | (master_doc, 'uncurl', u'UNCURL Documentation', 146 | [author], 1) 147 | ] 148 | 149 | 150 | # -- Options for Texinfo output ------------------------------------------- 151 | 152 | # Grouping the document tree into Texinfo files. List of tuples 153 | # (source start file, target name, title, author, 154 | # dir menu entry, description, category) 155 | texinfo_documents = [ 156 | (master_doc, 'UNCURL', u'UNCURL Documentation', 157 | author, 'UNCURL', 'One line description of project.', 158 | 'Miscellaneous'), 159 | ] 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. UNCURL documentation master file, created by 2 | sphinx-quickstart on Mon Mar 27 13:42:21 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to UNCURL's documentation! 7 | ================================== 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | :caption: Contents: 13 | 14 | readme_link 15 | unsupported_methods 16 | things_we_tried 17 | uncurl 18 | 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /docs/readme_link.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/things_we_tried.rst: -------------------------------------------------------------------------------- 1 | Non-default parameters: things we tried and their results 2 | ========================================================= 3 | 4 | There are a number of uncurl options (not parameters exactly; more like run configurations) that we experimented with. Here are some results. 5 | 6 | 7 | Cell normalization 8 | ------------------ 9 | 10 | This option involves normalizing the cells by their read counts. First, we calculate the total read count of each cell, and divide all counts for each cell by its total read count. Then, we find the median total read count over all cells, and multiply the entire matrix by that value. This method has been used previously for scRNA-seq datasets [see paper for reference]. 11 | 12 | Clustering performance after cell normalization was substantially better on count-valued datasets, and it either had no effect or was marginally worse on RPKM-normalized data and other data that has already been normalized in some other way. So we would suggest using this option for unnormalized count-valued datasets. The downside is that it might lose some information (if certain cell types were correlated with larger read counts), but we are not sure whether that happens in practice. 13 | 14 | [TODO: include graphs] 15 | 16 | To use this option, run ``data_normalized = uncurl.preprocessing.cell_normalize(data)``, and run uncurl on ``data_normalized``. A sketch of the underlying computation is given further down this page. 17 | 18 | 19 | Constrained W 20 | ------------- 21 | 22 | When this option is activated, the ``W`` matrix is normalized so that its columns sum to 1 after each round of alternating minimization. Without this option, ``W`` is only constrained to be nonnegative during the optimization process, and normalized after the end of the optimization. 23 | 24 | In clustering experiments, this option had mixed results. It performed marginally better on some datasets and marginally worse on others. On the 10X datasets, constrained W performed slightly better when combined with cell normalization, and worse without cell normalization. 25 | 26 | [TODO: include graphs] 27 | 28 | To use this option, add the argument ``constrain_w=True`` to ``run_state_estimation`` or ``poisson_estimate_state``. This does not work for the NMF-based methods. 29 | 30 | 31 | Uncurl initialization options 32 | ----------------------------- 33 | 34 | We provide a variety of initialization options for uncurl. Most initialization methods first perform a clustering, initialize M based on the cluster means, and W based on the cluster assignments. The default initialization is based on truncated SVD followed by K-means. We also provide initializations based on Poisson clustering, and Poisson k-means++ with randomized W. 35 | 36 | In clustering experiments, truncated SVD initialization usually performed the best, but there were some datasets on which Poisson clustering initialization performed better. For example, on randomly downsampled data, Poisson clustering initialization seems to perform better. 37 | 38 | To use different initializations, use the argument ``initialization=<method>``, where ``<method>`` can be one of ``tsvd`` (truncated SVD + K-means), ``cluster`` (Poisson clustering), ``kmpp`` (Poisson k-means++), or ``km`` (k-means on the full data).
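For reference, here is a minimal sketch of the cell-normalization computation described above, written for a dense genes-by-cells matrix. This is an illustration only, not the implementation; the actual ``uncurl.preprocessing.cell_normalize`` also handles sparse matrices.

.. code-block:: python

    import numpy as np

    def cell_normalize_dense(data):
        # data: genes x cells count matrix
        counts = data.sum(0)                   # total read count of each cell
        normalized = data / counts             # divide each cell's counts by its total
        return normalized * np.median(counts)  # rescale by the median total read count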
39 | 40 | 41 | Alternative to QualNorm: mean-normalized initialization 42 | ------------------------------------------------------- 43 | 44 | Given prior gene expression data, there are a variety of methods for initializing uncurl. ``QualNorm`` is one way of doing this initialization. Alternatively, when we have real-valued prior data, we can normalize the prior data so that each cell type sums to 1, and then multiply it by the mean per-cell read count of the actual data. 45 | 46 | This performed better than QualNorm on sparse datasets such as the 10X datasets. 47 | 48 | 49 | Optimization methods 50 | -------------------- 51 | 52 | The default optimization method for Poisson state estimation is NoLips [see paper for reference]. 53 | 54 | Before settling on NoLips as a default, we also tried a variety of different optimization methods. The first was L-BFGS, as implemented in scipy. We also tried gradient descent, stochastic gradient descent, and a custom method based on alternating iteratively reweighted least squares on a Poisson regression model. These methods are not included in the uncurl package because they had poor performance characteristics compared to NoLips. We settled on NoLips because it was easy to port to sparse matrices and was easily parallelizable. L-BFGS tends to converge in fewer iterations, but the per-iteration time for NoLips is much lower: it has closed-form updates that don't require gradient or objective value calculations, and the updates take advantage of data sparsity. 55 | 56 | To use different optimization methods, use the argument ``method=<method>``, where ``<method>`` can be either ``NoLips`` (default) or ``L-BFGS-B``. 57 | -------------------------------------------------------------------------------- /docs/uncurl.rst: -------------------------------------------------------------------------------- 1 | uncurl package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | uncurl.preprocessing module 8 | --------------------------- 9 | 10 | .. automodule:: uncurl.preprocessing 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | uncurl.run_se module 16 | ------------------------------ 17 | 18 | .. automodule:: uncurl.run_se 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | uncurl.state_estimation module 24 | ------------------------------ 25 | 26 | .. automodule:: uncurl.state_estimation 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | uncurl.nmf_wrapper module 32 | ------------------------------ 33 | 34 | .. automodule:: uncurl.nmf_wrapper 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | uncurl.qual2quant module 40 | ------------------------ 41 | 42 | .. automodule:: uncurl.qual2quant 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | uncurl.clustering module 48 | ------------------------ 49 | 50 | .. automodule:: uncurl.clustering 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | uncurl.dimensionality_reduction module 56 | -------------------------------------- 57 | 58 | .. automodule:: uncurl.dimensionality_reduction 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | uncurl.evaluation module 64 | ------------------------ 65 | 66 | .. automodule:: uncurl.evaluation 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | uncurl.experiment_runner module 72 | ------------------------------- 73 | 74 | ..
automodule:: uncurl.experiment_runner 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | uncurl.lineage module 80 | --------------------- 81 | 82 | .. automodule:: uncurl.lineage 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | uncurl.nb_cluster module 88 | ------------------------ 89 | 90 | .. automodule:: uncurl.nb_cluster 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | uncurl.nb_state_estimation module 96 | --------------------------------- 97 | 98 | .. automodule:: uncurl.nb_state_estimation 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | uncurl.pois_ll module 104 | --------------------- 105 | 106 | .. automodule:: uncurl.pois_ll 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | uncurl.simulation module 112 | ------------------------ 113 | 114 | .. automodule:: uncurl.simulation 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | 120 | Module contents 121 | --------------- 122 | 123 | .. automodule:: uncurl 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | -------------------------------------------------------------------------------- /docs/uncurl_pub.rst: -------------------------------------------------------------------------------- 1 | UNCURL public functions 2 | ======================= 3 | 4 | uncurl.max_variance_genes 5 | ------------------------- 6 | 7 | .. autofunction:: uncurl.max_variance_genes 8 | 9 | uncurl.qualNorm 10 | ----------------- 11 | 12 | .. autofunction:: uncurl.qualNorm 13 | 14 | uncurl.poisson_cluster 15 | ---------------------- 16 | 17 | .. autofunction:: uncurl.poisson_cluster 18 | 19 | uncurl.nb_cluster 20 | ----------------- 21 | 22 | .. autofunction:: uncurl.nb_cluster 23 | 24 | uncurl.poisson_estimate_state 25 | ----------------------------- 26 | 27 | .. autofunction:: uncurl.poisson_estimate_state 28 | 29 | uncurl.nb_estimate_state 30 | ----------------------------- 31 | 32 | .. autofunction:: uncurl.nb_estimate_state 33 | 34 | uncurl.mds 35 | ----------------- 36 | 37 | .. autofunction:: uncurl.mds 38 | 39 | uncurl.lineage 40 | -------------- 41 | 42 | .. autofunction:: uncurl.lineage 43 | 44 | uncurl.pseudotime 45 | ----------------- 46 | 47 | .. autofunction:: uncurl.pseudotime 48 | -------------------------------------------------------------------------------- /docs/unsupported_methods.rst: -------------------------------------------------------------------------------- 1 | Details on unsupported methods 2 | ============================== 3 | 4 | There are a number of unsupported or experimental methods that are part of the UNCURL package. We provide information on them here for the sake of completeness, but cannot vouch for their correctness. 5 | 6 | Alternative state estimation methods 7 | ------------------------------------ 8 | 9 | We provide implementations of the convex mixture model for the negative binomial (NB) and zero-inflated Poisson (ZIP) distributions. In our experiments they did not work as well as the Poisson model on most datasets, and they are substantially less efficient. A usage sketch is given below. 10 | 11 | We also provide methods based on LDA (latent Dirichlet allocation), using the LightLDA implementation. The outputs of these methods can be interpreted as state estimation with a binomial sampling distribution. See ``lightlda_utils.py``. In practice, they had worse performance than Poisson state estimation in accuracy, runtime, and memory usage, especially on larger datasets.
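As a usage sketch for the NB convex mixture model: the call below mirrors the signature exercised in ``tests/test_nb_state_estimation.py`` (the optional ``init_means`` and ``R`` arguments used there are omitted here). Since these methods are unsupported, treat the interface as subject to change.

.. code-block:: python

    import numpy as np
    from uncurl.nb_state_estimation import nb_estimate_state

    data = np.loadtxt('counts.txt')  # genes x cells

    # returns means M (genes x k), weights W (k x cells),
    # NB dispersion parameters R, and the final log-likelihood
    M, W, R, ll = nb_estimate_state(data, 2)
    labels = W.argmax(0)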
12 | 13 | Alternative clustering methods 14 | ------------------------------ 15 | 16 | As with state estimation, we provide NB and ZIP versions of k-means. The same efficiency considerations apply; a brief usage sketch is given further down this page. 17 | 18 | Dimensionality reduction 19 | ------------------------ 20 | 21 | The ``mds`` function performs dimensionality reduction using MDS. This works by running MDS on M to convert it into a projection matrix, and then using that matrix to project W onto 2d space. This is much faster than tSNE or even PCA, at the cost of some fidelity, but it might work as a first pass. 22 | 23 | Example: 24 | 25 | .. code-block:: python 26 | 27 | import numpy as np 28 | from uncurl import mds, poisson_estimate_state 29 | 30 | data = np.loadtxt('counts.txt') 31 | 32 | # dimensionality reduction using MDS on state estimation means 33 | M, W, ll = poisson_estimate_state(data, 4) 34 | # proj is a 2d projection of the data. 35 | proj = mds(M, W, 2) 36 | 37 | 38 | Lineage estimation 39 | ------------------ 40 | 41 | The ``lineage`` function performs lineage estimation from the output of ``poisson_estimate_state``. It fits the data to a different 5th degree polynomial for each cell type. 42 | 43 | The ``pseudotime`` function calculates the pseudotime for each cell given the output of ``lineage`` and a starting cell. 44 | 45 | Example (including visualization): 46 | 47 | .. code-block:: python 48 | 49 | import numpy as np 50 | import matplotlib.pyplot as plt 51 | 52 | from uncurl import poisson_estimate_state, mds, lineage, pseudotime 53 | 54 | data = np.loadtxt('counts.txt') 55 | 56 | # pretend that there are three natural clusters in the dataset. 57 | M, W, ll = poisson_estimate_state(data, 3) 58 | 59 | curve_params, smoothed_points, edges, cell_assignments = lineage(M, W) 60 | 61 | # assume the "root" is cell 0 62 | ptime = pseudotime(0, edges, smoothed_points) 63 | 64 | # visualizing the lineage 65 | proj = mds(M, W, 2) 66 | 67 | plt.scatter(proj[0,:], proj[1,:], s=10, c=cell_assignments, edgecolors='none', alpha=0.7) 68 | plt.scatter(smoothed_points[0,:], smoothed_points[1,:], s=30, c=cell_assignments, edgecolors='none', alpha=0.7) 69 | # connect the lines 70 | for edge in edges: 71 | plt.plot((smoothed_points[0, edge[0]], smoothed_points[0, edge[1]]), 72 | (smoothed_points[1, edge[0]], smoothed_points[1, edge[1]]), 'black', linewidth=2) 73 | plt.xlabel('dim 1') 74 | plt.ylabel('dim 2') 75 | 76 | 77 | Ensemble Methods 78 | ---------------- 79 | 80 | Consensus clustering, consensus clustering-based initialization for uncurl, etc. This requires the ``Cluster_Ensembles`` package (listed in ``optional_requirements.txt``).
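For the NB and ZIP k-means variants mentioned under *Alternative clustering methods* above, usage follows the same pattern as ``poisson_cluster``; the calls below match those in ``examples/example.py``:

.. code-block:: python

    import numpy as np
    import uncurl

    data = np.loadtxt('counts.txt')  # genes x cells

    # NB clustering: returns assignments and the fitted NB parameters P, R
    assignments_nb, P, R = uncurl.nb_cluster(data, 2)
    # ZIP clustering: returns assignments, means, and zero-inflation parameters
    assignments_zip, M, L = uncurl.zip_cluster(data, 2)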
81 | 82 | 83 | Visualization 84 | ------------- 85 | 86 | See ``vis.py``. 87 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from scipy.io import loadmat 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | import uncurl 7 | from uncurl.evaluation import purity 8 | if __name__ == '__main__': 9 | dat = loadmat('data/SCDE_test.mat') 10 | data = dat['dat'].toarray() 11 | centers, assignments = uncurl.kmeans_pp(data, 2) 12 | lls = uncurl.poisson_ll(data, centers) 13 | # Poisson clustering 14 | assignments_poisson, centers = uncurl.poisson_cluster(data, 2, init=centers) 15 | # NB clustering 16 | assignments_nb, P, R = uncurl.nb_cluster(data, 2) 17 | # ZIP clustering 18 | assignments_zip, M, L = uncurl.zip_cluster(data, 2) 19 | true_labs = dat['Lab'][0] 20 | print('poisson purity:', purity(assignments_poisson, true_labs)) 21 | print('NB purity:', purity(assignments_nb, true_labs)) 22 | print('ZIP purity:', purity(assignments_zip, true_labs)) 23 | # State estimation 24 | means, weights, ll = uncurl.poisson_estimate_state(data, 2, disp=False) 25 | w_classes = weights.argmax(0) 26 | print('W argmax purity:', purity(w_classes, true_labs)) 27 | # dimensionality reduction 28 | X = uncurl.dim_reduce(means, weights, 2) 29 | proj = np.dot(X.T, weights) 30 | # plotting dimensionality reduction 31 | plt.cla() 32 | # weight plot 33 | plt.title('Dimensionality reduction plot - assigned weight labels') 34 | plt.scatter(proj[0,:], proj[1,:], s=100, cmap='seismic', c=weights[0,:]) 35 | plt.xlabel('dim 1') 36 | plt.ylabel('dim 2') 37 | plt.savefig('dat.png') 38 | plt.cla() 39 | # Poisson cluster plot 40 | plt.title('Dimensionality reduction plot - Poisson clustering labels') 41 | plt.scatter(proj[0,:], proj[1,:], s=100, cmap='seismic', c=assignments_poisson) 42 | plt.xlabel('dim 1') 43 | plt.ylabel('dim 2') 44 | plt.savefig('poisson_cluster_dat.png') 45 | plt.cla() 46 | # NB cluster plot 47 | plt.title('Dimensionality reduction plot - NB clustering labels') 48 | plt.scatter(proj[0,:], proj[1,:], s=100, cmap='seismic', c=assignments_nb) 49 | plt.xlabel('dim 1') 50 | plt.ylabel('dim 2') 51 | plt.savefig('nb_cluster_dat.png') 52 | plt.cla() 53 | # ZIP cluster plot 54 | plt.title('Dimensionality reduction plot - ZIP clustering labels') 55 | plt.scatter(proj[0,:], proj[1,:], s=100, cmap='seismic', c=assignments_zip) 56 | plt.xlabel('dim 1') 57 | plt.ylabel('dim 2') 58 | plt.savefig('zip_cluster_dat.png') 59 | plt.cla() 60 | # true label plot 61 | plt.title('Dimensionality reduction plot - true labels') 62 | plt.scatter(proj[0,:], proj[1,:], cmap='bwr', s=100, alpha=0.7, c=dat['Lab']) 63 | plt.xlabel('dim 1') 64 | plt.ylabel('dim 2') 65 | plt.savefig('labels.png') 66 | -------------------------------------------------------------------------------- /examples/lineage_example.py: -------------------------------------------------------------------------------- 1 | from scipy.io import loadmat 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | import uncurl 6 | from uncurl.lineage import fourier_series 7 | 8 | if __name__ == '__main__': 9 | dat = loadmat('data/BranchedSynDat.mat') 10 | data = dat['Dat'].astype(float) 11 | # Poisson clustering 12 | assignments, centers = uncurl.poisson_cluster(data, 3) 13 | # State estimation 14 | means, weights, ll = uncurl.run_state_estimation(data, 3) 15 | #means, weights = np.load('means_weights.npy') 16 | #
dimensionality reduction 17 | X = uncurl.dim_reduce(means, weights, 2) 18 | proj = np.dot(X.T, weights) 19 | cluster_curves, cluster_fitted_vals, cluster_edges, cluster_assignments = uncurl.run_lineage(means, weights, curve_function='poly') 20 | # dimensionality reduction with true data 21 | true_weights = dat['X'] 22 | true_means = dat['M'] 23 | X = uncurl.dim_reduce(true_means, true_weights, 2) 24 | proj_true = np.dot(X.T, true_weights) 25 | true_curves, true_fitted, true_edges, true_assignments = uncurl.run_lineage(true_means, true_weights) 26 | # plotting dimensionality reduction, fitted curves 27 | plt.clf() 28 | plt.cla() 29 | plt.title('Dimensionality reduction plot') 30 | plt.scatter(proj[0,:], proj[1,:], s=30, c=weights.argmax(0), edgecolors='none', alpha=0.7) 31 | plt.scatter(cluster_fitted_vals[0,:], cluster_fitted_vals[1,:], s=30, c=weights.argmax(0), edgecolors='none', alpha=0.7) 32 | # connect the lines 33 | for edge in cluster_edges: 34 | plt.plot((cluster_fitted_vals[0, edge[0]], cluster_fitted_vals[0, edge[1]]), 35 | (cluster_fitted_vals[1, edge[0]], cluster_fitted_vals[1, edge[1]]), 'black', linewidth=2) 36 | plt.xlabel('dim 1') 37 | plt.ylabel('dim 2') 38 | plt.savefig('branching_dim_reduce_fitted_poly.png') 39 | plt.cla() 40 | # true label plot 41 | """ 42 | plt.title('Dimensionality reduction plot - true labels') 43 | plt.scatter(proj[0,:], proj[1,:], cmap='bwr', s=100, alpha=0.7, c=dat['Lab']) 44 | plt.xlabel('dim 1') 45 | plt.ylabel('dim 2') 46 | plt.savefig('labels.png') 47 | """ 48 | -------------------------------------------------------------------------------- /examples/means_weights.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjzhang/uncurl_python/0113ba6ca874549ac1d760ef961dd6e82ebfcc67/examples/means_weights.npy -------------------------------------------------------------------------------- /examples/synthetic_example.py: -------------------------------------------------------------------------------- 1 | from scipy.io import loadmat 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | import uncurl 6 | 7 | if __name__ == '__main__': 8 | dat = loadmat('data/SynMouseESprog_1000.mat') 9 | data = dat['Dat'].toarray() 10 | centers, assignments = uncurl.kmeans_pp(data, 2) 11 | lls = uncurl.poisson_ll(data, centers) 12 | # Poisson clustering 13 | assignments, centers = uncurl.poisson_cluster(data, 3) 14 | # State estimation 15 | means, weights, ll = uncurl.poisson_estimate_state(data, 3, max_iters=5) 16 | # dimensionality reduction 17 | X = uncurl.dim_reduce(means, weights, 2) 18 | proj = np.dot(X.T, weights) 19 | # plotting dimensionality reduction 20 | plt.cla() 21 | # weight plot 22 | plt.title('Dimensionality reduction plot - assigned weight labels') 23 | plt.scatter(proj[0,:], proj[1,:], s=100, c=weights.argmax(0)) 24 | plt.xlabel('dim 1') 25 | plt.ylabel('dim 2') 26 | plt.savefig('synthetic_dim_reduce.png') 27 | plt.cla() 28 | # true label plot 29 | """ 30 | plt.title('Dimensionality reduction plot - true labels') 31 | plt.scatter(proj[0,:], proj[1,:], cmap='bwr', s=100, alpha=0.7, c=dat['Lab']) 32 | plt.xlabel('dim 1') 33 | plt.ylabel('dim 2') 34 | plt.savefig('labels.png') 35 | """ 36 | -------------------------------------------------------------------------------- /examples/zeisel_subset_example.py: -------------------------------------------------------------------------------- 1 | from scipy.io import loadmat 2 | import numpy as np 3 | import matplotlib.pyplot as 
plt 4 | from scipy import sparse 5 | from sklearn.manifold import TSNE 6 | 7 | import uncurl 8 | from uncurl.sparse_utils import symmetric_kld 9 | from uncurl.vis import visualize_dim_red 10 | 11 | # note: this whole script should finish in under a few minutes. 12 | 13 | if __name__ == '__main__': 14 | 15 | # 1. load data - 753 cells, 19971 genes 16 | dat = loadmat('data/GSE60361_dat.mat') 17 | data = dat['Dat'] 18 | true_labels = dat['ActLabs'].flatten() 19 | data_csc = sparse.csc_matrix(data) 20 | 21 | # 2. gene selection 22 | genes = uncurl.max_variance_genes(data_csc, nbins=5, frac=0.2) 23 | data_subset = data_csc[genes,:] 24 | 25 | # 3. state estimation 26 | k = 7 # number of clusters to use 27 | M, W, ll = uncurl.poisson_estimate_state(data_subset, k) 28 | argmax_labels = W.argmax(0) 29 | 30 | # 4. visualization 31 | 32 | # mds visualization 33 | mds_proj = uncurl.mds(M, W, 2) 34 | visualize_dim_red(mds_proj, true_labels, 'GSE60361_mds_true_labels.png', title='MDS', figsize=(12,7), alpha=0.5) 35 | 36 | # tsne visualization 37 | tsne = TSNE(2, metric=symmetric_kld) 38 | tsne_w = tsne.fit_transform(W.T) 39 | # plot using true labels 40 | visualize_dim_red(tsne_w.T, true_labels, 'GSE60361_tsne_true_labels.png', title='TSNE(W)', figsize=(12,7), alpha=0.5) 41 | # plot using assigned labels 42 | visualize_dim_red(tsne_w.T, argmax_labels, 'GSE60361_tsne_argmax_labels.png', title='TSNE(W)', figsize=(12,7), alpha=0.5) 43 | -------------------------------------------------------------------------------- /optional_requirements.txt: -------------------------------------------------------------------------------- 1 | Cluster-Ensembles 2 | matplotlib 3 | SIMLR 4 | -------------------------------------------------------------------------------- /push-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run this from the master branch to build/push documentation 4 | 5 | cd docs 6 | make html 7 | git add _build 8 | git commit 9 | 10 | cd .. 
11 | 12 | git subtree push --prefix docs/_build/html origin gh-pages 13 | 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython>=0.27 2 | numpy>=1.12 3 | scipy>=0.19 4 | scikit-learn>=0.19 5 | matplotlib 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from distutils.extension import Extension 3 | from Cython.Build import cythonize 4 | import numpy 5 | 6 | #directive_defaults['linetrace'] = True 7 | #directive_defaults['binding'] = True 8 | 9 | extensions = [ 10 | Extension('uncurl.nolips', ['uncurl/nolips.pyx'], 11 | extra_compile_args=['-O3', '-ffast-math']), 12 | Extension('uncurl.sparse_utils', ['uncurl/sparse_utils.pyx'], 13 | extra_compile_args=['-O3', '-ffast-math']) 14 | ] 15 | 16 | parallel_extensions = [ 17 | Extension('uncurl.nolips_parallel', ['uncurl/nolips_parallel.pyx'], 18 | extra_compile_args=['-O3', '-ffast-math', '-fopenmp'], 19 | extra_link_args=['-fopenmp']) 20 | ] 21 | 22 | long_description = '' 23 | with open('README.rst') as f: 24 | long_description = f.read() 25 | 26 | setup(name='uncurl_seq', 27 | version='0.2.16', 28 | description='Tool for pre-processing single-cell RNASeq data', 29 | long_description=long_description, 30 | long_description_content_type='text/plain', 31 | url='https://github.com/yjzhang/uncurl_python', 32 | author='Yue Zhang', 33 | author_email='yjzhang@cs.washington.edu', 34 | license='MIT', 35 | include_dirs=[numpy.get_include()], 36 | ext_modules = cythonize(extensions + parallel_extensions), 37 | packages=find_packages("."), 38 | install_requires=[ 39 | 'numpy', 40 | 'scipy', 41 | 'cython', 42 | 'scikit-learn', 43 | ], 44 | test_suite='nose.collector', 45 | tests_require=['nose', 'flaky'], 46 | classifiers=[ 47 | 'Development Status :: 3 - Alpha', 48 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 49 | 'License :: OSI Approved :: MIT License', 50 | 'Programming Language :: Python :: 2.7', 51 | 'Programming Language :: Python :: 3.5', 52 | ], 53 | zip_safe=False) 54 | -------------------------------------------------------------------------------- /tests/test_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | from unittest import TestCase 5 | from flaky import flaky 6 | 7 | import numpy as np 8 | from scipy.io import loadmat 9 | 10 | import uncurl 11 | from uncurl.simulation import generate_poisson_data, generate_zip_data 12 | from uncurl.evaluation import purity 13 | from uncurl.zip_clustering import zip_fit_params_mle 14 | 15 | @flaky(max_runs=3) 16 | class ClusterTest(TestCase): 17 | 18 | def setUp(self): 19 | self.dat = loadmat('data/SCDE_k2_sup.mat') 20 | 21 | def test_kmeans_pp(self): 22 | data = self.dat['Dat'] 23 | genes, cells = data.shape 24 | centers, assignments = uncurl.kmeans_pp(data, 3) 25 | self.assertEqual(centers.shape[0], genes) 26 | self.assertEqual(centers.shape[1], 3) 27 | # the center assignments are nondeterministic so... 
28 | self.assertFalse(np.equal(centers[:,0], centers[:,1]).all()) 29 | self.assertFalse(np.equal(centers[:,1], centers[:,2]).all()) 30 | 31 | def test_cluster(self): 32 | data = self.dat['Dat'] 33 | assignments, centers = uncurl.poisson_cluster(data, 3) 34 | self.assertEqual(assignments.shape[0], data.shape[1]) 35 | self.assertEqual(centers.shape[0], data.shape[0]) 36 | # just checking that the values are valid 37 | self.assertFalse(np.isnan(centers).any()) 38 | 39 | def test_simulation(self): 40 | """ 41 | Basically this is to test that the Poisson EM can correctly separate 42 | clusters in simulated data. 43 | """ 44 | centers = np.array([[1,10,20], [1, 11, 1], [50, 1, 100]]) 45 | centers = centers.astype(float) 46 | data, labs = generate_poisson_data(centers, 500) 47 | data = data.astype(float) 48 | assignments, c_centers = uncurl.poisson_cluster(data, 3) 49 | distances = np.zeros((3,3)) 50 | for i in range(3): 51 | for j in range(3): 52 | distances[i,j] = uncurl.poisson_dist(centers[:,i], c_centers[:,j]) 53 | self.assertTrue(purity(assignments, labs) > 0.8) 54 | 55 | @flaky(max_runs=3) 56 | @unittest.skip('zip methods are unsupported') 57 | def test_zip_simulation(self): 58 | """ 59 | ZIP clustering on poisson-simulated data 60 | """ 61 | centers = np.array([[0.1,10,20], [0.1, 11, 0.1], [50, 0.1, 100]]) 62 | centers = centers.astype(float) 63 | data, labs = generate_poisson_data(centers, 500) 64 | data = data.astype(float) 65 | assignments, c_centers, c_zeros = uncurl.zip_cluster(data, 3) 66 | self.assertTrue(purity(assignments, labs) > 0.8) 67 | 68 | @flaky(max_runs=3) 69 | @unittest.skip('zip methods are unsupported') 70 | def test_zip_fit(self): 71 | """ 72 | Tests the algorithm for fitting a ZIP distribution. 73 | """ 74 | for i in range(10): 75 | centers = np.random.randint(10, 1000, (3,1)) 76 | M = np.random.random((3,1)) 77 | data, labs = generate_zip_data(centers, M, 300) 78 | L_, M_ = zip_fit_params_mle(data) 79 | self.assertFalse(np.isnan(L_).any()) 80 | self.assertFalse(np.isnan(M_).any()) 81 | self.assertFalse(np.isnan(L_).any()) 82 | self.assertFalse(np.isnan(M_).any()) 83 | self.assertTrue(np.mean(np.abs(M.flatten() - M_)) < 0.2) 84 | self.assertTrue(np.mean(np.abs(centers.flatten() - L_)) < 10) 85 | 86 | @flaky(max_runs=3) 87 | @unittest.skip('zip methods are unsupported') 88 | def test_zip_simulation_2(self): 89 | """ 90 | ZIP clustering on ZIP-simulated data 91 | """ 92 | centers = np.random.randint(10, 1000, (3,3)) 93 | L = np.random.random((3,3)) 94 | print(centers) 95 | print(L) 96 | centers = centers.astype(float) 97 | data, labs = generate_zip_data(centers, L, 1000) 98 | data = data.astype(float) 99 | print(data) 100 | assignments, c_centers, c_zeros = uncurl.zip_cluster(data, 3) 101 | distances = np.zeros((3,3)) 102 | for i in range(3): 103 | for j in range(3): 104 | distances[i,j] = uncurl.poisson_dist(centers[:,i], c_centers[:,j]) 105 | print(c_centers) 106 | print(c_zeros) 107 | print(purity(assignments, labs)) 108 | self.assertTrue(purity(assignments, labs) > 0.6) 109 | #self.assertFalse(correspond[0]==correspond[1]) 110 | #self.assertFalse(correspond[1]==correspond[2]) 111 | -------------------------------------------------------------------------------- /tests/test_cluster_sparse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | from flaky import flaky 5 | 6 | import numpy as np 7 | from scipy.io import loadmat 8 | from scipy import sparse 
9 | 10 | import uncurl 11 | from uncurl.simulation import generate_poisson_data 12 | from uncurl.evaluation import purity 13 | 14 | @flaky 15 | class SparseClusterTest(TestCase): 16 | 17 | def setUp(self): 18 | dat = loadmat('data/SCDE_k2_sup.mat') 19 | self.data = sparse.csc_matrix(dat['Dat']) 20 | self.labs = dat['Lab'].flatten() 21 | 22 | def test_kmeans_pp(self): 23 | data = self.data 24 | genes, cells = data.shape 25 | centers, assignments = uncurl.kmeans_pp(data, 3) 26 | self.assertEqual(centers.shape[0], genes) 27 | self.assertEqual(centers.shape[1], 3) 28 | # the center assignments are nondeterministic so... 29 | self.assertFalse(np.equal(centers[:,0], centers[:,1]).all()) 30 | self.assertFalse(np.equal(centers[:,1], centers[:,2]).all()) 31 | 32 | def test_cluster(self): 33 | data = self.data 34 | assignments, centers = uncurl.poisson_cluster(data, 2) 35 | self.assertEqual(assignments.shape[0], data.shape[1]) 36 | self.assertEqual(centers.shape[0], data.shape[0]) 37 | # just checking that the values are valid 38 | self.assertFalse(np.isnan(centers).any()) 39 | self.assertTrue(purity(assignments, self.labs) > 0.8) 40 | 41 | def test_simulation(self): 42 | """ 43 | Basically this is to test that the Poisson EM can correctly separate 44 | clusters in simulated data. 45 | """ 46 | centers = np.array([[1,10,20], [1, 11, 1], [50, 1, 100]]) 47 | centers = centers.astype(float) 48 | data, labs = generate_poisson_data(centers, 500) 49 | data = data.astype(float) 50 | data = sparse.csc_matrix(data) 51 | assignments, c_centers = uncurl.poisson_cluster(data, 3) 52 | distances = np.zeros((3,3)) 53 | for i in range(3): 54 | for j in range(3): 55 | distances[i,j] = uncurl.poisson_dist(centers[:,i], c_centers[:,j]) 56 | print(assignments) 57 | print(labs) 58 | print(purity(assignments, labs)) 59 | self.assertTrue(purity(assignments, labs) > 0.65) 60 | 61 | -------------------------------------------------------------------------------- /tests/test_dim_reduce.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | 5 | from uncurl import simulation, dim_reduce, dim_reduce_data, mds 6 | 7 | class DimReduceTest(TestCase): 8 | 9 | def setUp(self): 10 | pass 11 | 12 | def test_dim_reduce(self): 13 | """ 14 | Test dimensionality reduction using sample data 15 | """ 16 | sim_means = np.array([[20.,30.,1.], 17 | [10.,3.,8.], 18 | [90.,50.,20.], 19 | [10.,4.,30.]]) 20 | sim_assignments = np.array([[0.1,0.2,0.3,0.4,0.5,0.1,0.8], 21 | [0.5,0.3,0.2,0.4,0.2,0.2,0.1], 22 | [0.4,0.5,0.5,0.2,0.3,0.7,0.1]]) 23 | sim_data = simulation.generate_state_data(sim_means, sim_assignments) 24 | sim_data = sim_data + 1e-8 25 | X = dim_reduce(sim_means, sim_assignments, 2) 26 | self.assertEqual(X.shape, (3, 2)) 27 | X2 = dim_reduce_data(sim_data, 2) 28 | self.assertEqual(X2.shape, (sim_data.shape[1], 2)) 29 | projections = np.dot(X.transpose(), sim_assignments) 30 | mds_proj = mds(sim_means, sim_assignments, 2) 31 | self.assertTrue(np.abs(mds_proj - projections).sum() < 1e-6) 32 | # assert something about the distances??? 33 | # 1-NN based error? 
34 | -------------------------------------------------------------------------------- /tests/test_experiment_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | import numpy as np 6 | from scipy import sparse 7 | from scipy.io import loadmat 8 | 9 | import uncurl 10 | 11 | class ExperimentRunnerTest(TestCase): 12 | # TODO: test dataset 13 | 14 | def setUp(self): 15 | dat = loadmat('data/SCDE_test.mat') 16 | self.data = dat['dat'].toarray()[0:500, :] 17 | self.data = sparse.csc_matrix(self.data) 18 | self.labs = dat['Lab'][0] 19 | 20 | def test_run(self): 21 | se = uncurl.experiment_runner.PoissonSE(clusters=2) 22 | results, ll = se.run(self.data) 23 | self.assertTrue(len(results)==1) 24 | self.assertTrue(results[0].shape[0]==2) 25 | 26 | def test_runExperiment(self): 27 | se = uncurl.experiment_runner.PoissonSE(clusters=2, max_iters=10, inner_max_iters=50) 28 | argmax = uncurl.experiment_runner.Argmax(n_classes=2) 29 | km = uncurl.experiment_runner.KM(n_classes=2) 30 | methods = [(se, [argmax, km])] 31 | results, names, other = uncurl.experiment_runner.run_experiment(methods, self.data, 2, self.labs, n_runs=2) 32 | self.assertEqual(len(results), 2) 33 | self.assertTrue('clusterings' in other) 34 | self.assertTrue('timing' in other) 35 | self.assertTrue('preprocessing' in other) 36 | print(results) 37 | self.assertTrue(results[0][0]>0.95) 38 | 39 | def test_runExperiment_2(self): 40 | se = uncurl.experiment_runner.PoissonSE(clusters=2, max_iters=10, inner_max_iters=50) 41 | pre = uncurl.experiment_runner.Preprocess() 42 | argmax = uncurl.experiment_runner.Argmax(n_classes=2) 43 | km = uncurl.experiment_runner.KM(n_classes=2) 44 | pca_km = uncurl.experiment_runner.PcaKm(k=8, n_classes=2) 45 | methods = [(se, [argmax, km]), (pre, [km, pca_km])] 46 | results, names, other = uncurl.experiment_runner.run_experiment(methods, self.data, 2, self.labs, n_runs=2) 47 | self.assertEqual(len(results), 2) 48 | self.assertTrue('clusterings' in other) 49 | self.assertTrue('timing' in other) 50 | self.assertTrue('preprocessing' in other) 51 | self.assertTrue(results[0][0]>0.95) 52 | -------------------------------------------------------------------------------- /tests/test_fit_dist.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import TestCase 3 | from flaky import flaky 4 | 5 | import numpy as np 6 | 7 | from scipy.io import loadmat 8 | from scipy import sparse 9 | 10 | import uncurl 11 | from uncurl.simulation import generate_poisson_data 12 | from uncurl import fit_dist_data 13 | 14 | @flaky(max_runs=4) 15 | class FitDistTest(TestCase): 16 | 17 | def setUp(self): 18 | pass 19 | 20 | def testPoissonData(self): 21 | """ 22 | Test with generated unimodal Poisson dataset. 23 | """ 24 | centers = np.array([[1], [10], [50]]) 25 | centers = centers.astype(float) 26 | data, labs = generate_poisson_data(centers, 500) 27 | fit_errors = fit_dist_data.DistFitDataset(data) 28 | self.assertTrue((fit_errors['poiss'] < fit_errors['norm']).all()) 29 | self.assertTrue((fit_errors['poiss'] < fit_errors['lognorm']).all()) 30 | 31 | def testNormalData(self): 32 | """ 33 | Test with generated unimodal Normal dataset. 
34 | """ 35 | centers = np.array([[100], [20], [50]]) 36 | variances = np.array([[1.0], [1.0], [5.0]]) 37 | centers = centers.astype(float) 38 | data = np.random.normal(centers, variances, size=(3,500)) 39 | fit_errors = fit_dist_data.DistFitDataset(data) 40 | self.assertTrue((fit_errors['poiss'] > fit_errors['norm']).all()) 41 | self.assertTrue((fit_errors['norm'] < fit_errors['lognorm']).all()) 42 | 43 | @unittest.skip('still working on this') 44 | def testLogNormalData(self): 45 | """ 46 | Test with generated unimodal Log-Normal dataset. 47 | """ 48 | centers = np.array([[-1.0], [0.0], [-2]]) 49 | variances = np.array([[2.0], [1.2], [1.5]]) 50 | centers = centers.astype(float) 51 | data = np.random.lognormal(centers, variances, size=(3,500)) 52 | print(data.round()) 53 | print(data.round().max(1)) 54 | fit_errors = fit_dist_data.DistFitDataset(data) 55 | print(fit_errors) 56 | self.assertTrue((fit_errors['poiss'] > fit_errors['lognorm']).all()) 57 | self.assertTrue((fit_errors['norm'] > fit_errors['lognorm']).all()) 58 | 59 | -------------------------------------------------------------------------------- /tests/test_gap_score.py: -------------------------------------------------------------------------------- 1 | """ 2 | Using gap score to determine optimal cluster number 3 | """ 4 | 5 | import unittest 6 | from unittest import TestCase 7 | from flaky import flaky 8 | 9 | import numpy as np 10 | import scipy 11 | 12 | from uncurl import gap_score 13 | 14 | class GapScoreTest(TestCase): 15 | 16 | def setUp(self): 17 | pass 18 | 19 | def test_gap_score(self): 20 | data_mat = scipy.io.loadmat('data/10x_pooled_400.mat') 21 | data = data_mat['data'] 22 | data_tsvd = gap_score.preproc_data(data, gene_subset=True) 23 | max_k, gap_vals, sk_vals = gap_score.run_gap_k_selection(data_tsvd, 24 | k_min=1, k_max=50, skip=5, B=5) 25 | # just test that the score is in a very broad range 26 | self.assertTrue(max_k > 3) 27 | self.assertTrue(max_k < 20) 28 | 29 | def test_gap_score_2(self): 30 | data_mat = scipy.io.loadmat('data/GSE60361_dat.mat') 31 | data = data_mat['Dat'] 32 | data_tsvd = gap_score.preproc_data(data, gene_subset=True) 33 | max_k, gap_vals, sk_vals = gap_score.run_gap_k_selection(data_tsvd, 34 | k_min=1, k_max=50, skip=5, B=5) 35 | self.assertTrue(max_k > 3) 36 | self.assertTrue(max_k < 30) 37 | 38 | @flaky(max_runs=3) 39 | def test_gap_score_3(self): 40 | data_mat = scipy.io.loadmat('data/SCDE_test.mat') 41 | data = data_mat['dat'] 42 | data_tsvd = gap_score.preproc_data(data, gene_subset=True) 43 | max_k, gap_vals, sk_vals = gap_score.run_gap_k_selection(data_tsvd, 44 | k_min=1, k_max=50, skip=5, B=5) 45 | self.assertTrue(max_k < 10) 46 | 47 | 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | 52 | 53 | -------------------------------------------------------------------------------- /tests/test_lineage.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from flaky import flaky 3 | 4 | import numpy as np 5 | 6 | from uncurl import simulation, run_lineage, pseudotime 7 | 8 | @flaky 9 | class LineageTest(TestCase): 10 | 11 | def setUp(self): 12 | pass 13 | 14 | def test_lineage(self): 15 | """ 16 | Testing lineage using randomly generated lineage data 17 | """ 18 | M, W = simulation.generate_poisson_lineage(3, 100, 50) 19 | sim_data = simulation.generate_state_data(M, W) 20 | sim_data = sim_data + 1e-8 21 | m2 = M + np.random.random(M.shape) - 0.5 22 | curves, fitted_vals, edges, assignments = 
run_lineage(m2, W) 23 | # TODO: assert something about the distances??? 24 | print(len(edges)) 25 | adjacent_count = 0 26 | for e in edges: 27 | if np.abs(e[0]-e[1]) <= 1: 28 | adjacent_count += 1 29 | self.assertTrue(adjacent_count>150) 30 | 31 | def test_pseudotime(self): 32 | """ 33 | Test pseudotime calculations 34 | """ 35 | M, W = simulation.generate_poisson_lineage(3, 100, 50) 36 | sim_data = simulation.generate_state_data(M, W) 37 | sim_data = sim_data + 1e-8 38 | m2 = M + np.random.random(M.shape) - 0.5 39 | curves, fitted_vals, edges, assignments = run_lineage(m2, W) 40 | ptime = pseudotime(0, edges, fitted_vals) 41 | # assert that the cells are generally increasing in ptime 42 | # test each cluster 43 | old_p = 0 44 | for i in range(100): 45 | p = ptime[i] 46 | self.assertTrue(p >= old_p) 47 | old_p = p 48 | old_p = 0 49 | for i in range(100, 200): 50 | p = ptime[i] 51 | self.assertTrue(p >= old_p) 52 | self.assertTrue(p > 0) 53 | old_p = p 54 | old_p = 0 55 | for i in range(200, 300): 56 | p = ptime[i] 57 | self.assertTrue(p >= old_p) 58 | self.assertTrue(p > 0) 59 | old_p = p 60 | -------------------------------------------------------------------------------- /tests/test_nb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | from unittest import TestCase 5 | from flaky import flaky 6 | 7 | import numpy as np 8 | 9 | from uncurl import nb_cluster, simulation 10 | from uncurl.nb_clustering import nb_ll, nb_fit 11 | from uncurl.evaluation import purity 12 | 13 | 14 | @flaky 15 | @unittest.skip('nb methods currently not supported') 16 | class NBTest(TestCase): 17 | 18 | def setUp(self): 19 | self.p1 = np.array([1.,2.,3.]) 20 | self.p2 = np.array([2.,2.,3.]) 21 | 22 | def test_negative_binomial(self): 23 | """ 24 | Test NB log-likelihood, nb_cluster 25 | """ 26 | P = np.array([[0.5,0.4,0.8], 27 | [0.5,0.3,0.7], 28 | [0.5,0.3,0.9]]) 29 | R = np.array([[1.,8.,10.], 30 | [2.,8.,24], 31 | [3.,6.,30.]]) 32 | data, labels = simulation.generate_nb_data(P, R, 100) 33 | data = data.astype(float) 34 | #data += 1e-8 35 | ll = nb_ll(data, P, R) 36 | self.assertEqual(ll.shape, (100,3)) 37 | self.assertFalse(np.isnan(ll).any()) 38 | self.assertFalse(np.isinf(ll).any()) 39 | # test derivative 40 | # test nb cluster 41 | # how to test the results... they're often not good... 42 | a,p,r = nb_cluster(data,3) 43 | self.assertEqual(p.shape, P.shape) 44 | self.assertEqual(r.shape, R.shape) 45 | p_nans = np.isnan(p) 46 | r_nans = np.isnan(r) 47 | self.assertFalse(p_nans.any()) 48 | self.assertFalse(r_nans.any()) 49 | # assert that all the points aren't being put into 50 | # the same cluster. 
51 | self.assertTrue(purity(labels, a) > 0.8) 52 | self.assertFalse((a==a[0]).all()) 53 | 54 | 55 | def test_nb_fit(self): 56 | """ 57 | Tests fitting an NB distribution 58 | """ 59 | P = np.array([[0.5], 60 | [0.3], 61 | [0.4]]) 62 | R = np.array([[1.], 63 | [8.], 64 | [2.]]) 65 | data, _ = simulation.generate_nb_data(P, R, 500) 66 | p, r = nb_fit(data) 67 | p_nans = np.isnan(p) 68 | r_nans = np.isnan(r) 69 | self.assertFalse(p_nans.any()) 70 | self.assertFalse(r_nans.any()) 71 | self.assertFalse(np.isinf(p).any()) 72 | self.assertFalse(np.isinf(r).any()) 73 | self.assertTrue(np.sum(np.abs(p - P.flatten())**2)/3 < 0.5) 74 | print(r) 75 | print(np.sqrt(np.sum(np.abs(r - R.flatten())**2))/3) 76 | self.assertTrue(np.sqrt(np.sum(np.abs(r - R.flatten())**2))/3 < 3) 77 | 78 | def test_nb_fit_random(self): 79 | """ 80 | Tests fitting an NB distribution with random parameters 81 | """ 82 | for i in range(5): 83 | P = np.random.random((3,1))*0.9+0.1 84 | R = np.random.randint(1, 100, (3,1)) 85 | data, _ = simulation.generate_nb_data(P, R, 500) 86 | try: 87 | p, r = nb_fit(data) 88 | except ValueError: 89 | continue 90 | p_nans = np.isnan(p) 91 | r_nans = np.isnan(r) 92 | print(P) 93 | print(R) 94 | print(p) 95 | print(r) 96 | print(np.sqrt(np.sum(np.abs(r - R.flatten())**2))/3) 97 | self.assertTrue(np.sqrt(np.sum(np.abs(r - R.flatten())**2))/3 < 35) 98 | self.assertFalse(p_nans.any()) 99 | self.assertFalse(r_nans.any()) 100 | self.assertFalse(np.isinf(p).any()) 101 | self.assertFalse(np.isinf(r).any()) 102 | self.assertTrue(np.sum(np.abs(p - P.flatten())**2)/3 < 0.5) 103 | 104 | -------------------------------------------------------------------------------- /tests/test_nb_state_estimation.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | from unittest import TestCase 5 | 6 | import numpy as np 7 | 8 | from uncurl import nb_state_estimation, simulation 9 | from uncurl.evaluation import purity 10 | 11 | @unittest.skip('nb methods currently not supported') 12 | class StateEstimationTest(TestCase): 13 | 14 | def setUp(self): 15 | pass 16 | 17 | def test_random_1(self): 18 | """ 19 | Test NB state estimation with random parameters 20 | """ 21 | M, W, R = simulation.generate_nb_states(2, 200, 20) 22 | data = simulation.generate_nb_state_data(M, W, R) 23 | M_noised = M + 0.1*(np.random.random(M.shape)-0.5) 24 | M_, W_, R_, ll = nb_state_estimation.nb_estimate_state(data, 2, init_means=M_noised, R = R, disp=False) 25 | c1 = W.argmax(0) 26 | c2 = W_.argmax(0) 27 | p = purity(c2, c1) 28 | print(p) 29 | print(data) 30 | print(M) 31 | print(M_) 32 | self.assertTrue(p > 0.7) 33 | -------------------------------------------------------------------------------- /tests/test_nmf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | import numpy as np 6 | from scipy import sparse 7 | from scipy.io import loadmat 8 | 9 | import uncurl 10 | 11 | class NMFTest(TestCase): 12 | 13 | def setUp(self): 14 | dat = loadmat('data/SCDE_test.mat') 15 | self.data = dat['dat'].toarray()[0:500, :] 16 | self.data_sparse = sparse.csc_matrix(self.data) 17 | self.labs = dat['Lab'][0] 18 | 19 | def test_run_lognorm_nmf(self): 20 | w, h, cost = uncurl.nmf_wrapper.log_norm_nmf(self.data, 2) 21 | labs = h.argmax(0) 22 | self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.85) 23 | 24 | def 
test_run_norm_nmf(self): 25 | w, h, cost = uncurl.nmf_wrapper.norm_nmf(self.data, 2) 26 | labs = h.argmax(0) 27 | self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.8) 28 | 29 | def test_run_se(self): 30 | w, h, cost = uncurl.run_state_estimation(self.data, 2, dist='log-norm') 31 | labs = h.argmax(0) 32 | self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.85) 33 | w1, h1, cost = uncurl.run_state_estimation(self.data, 2, dist='gaussian') 34 | labs = h1.argmax(0) 35 | self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.8) 36 | -------------------------------------------------------------------------------- /tests/test_poisson.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import TestCase 3 | from flaky import flaky 4 | 5 | import numpy as np 6 | 7 | from scipy import sparse 8 | 9 | import uncurl 10 | from uncurl import pois_ll 11 | from uncurl.simulation import generate_poisson_data 12 | 13 | class PoissonTest(TestCase): 14 | 15 | def setUp(self): 16 | self.p1 = np.array([1.,2.,3.]) 17 | self.p2 = np.array([2.,2.,3.]) 18 | 19 | def test_poisson_dist(self): 20 | self.assertEqual(uncurl.poisson_dist(self.p1, self.p1), 0.0) 21 | self.assertEqual(uncurl.poisson_dist(self.p2, self.p2), 0.0) 22 | self.assertTrue(uncurl.poisson_dist(self.p1, self.p2) > 0.0) 23 | self.assertTrue( 24 | np.abs(uncurl.sparse_utils.poisson_dist(self.p1, self.p2) - 25 | uncurl.poisson_dist(self.p1, self.p2)) < 1e-4) 26 | 27 | def test_sparse_poisson_dist(self): 28 | sp1 = sparse.csc_matrix(self.p1) 29 | sp2 = sparse.csc_matrix(self.p2) 30 | self.assertTrue( 31 | np.abs(uncurl.sparse_utils.poisson_dist(self.p1, self.p2) - 32 | uncurl.poisson_dist(self.p1, self.p2)) < 1e-4) 33 | 34 | 35 | def test_poisson_ll(self): 36 | """ 37 | Test Poisson log-likelihood 38 | """ 39 | centers = np.array([[1,10,20], [1, 11, 1], [50, 1, 100]]) 40 | centers = centers.astype(float) 41 | data, labs = generate_poisson_data(centers, 500) 42 | data = data.astype(float) 43 | starting_centers = centers 44 | poisson_ll = pois_ll.poisson_ll(data, starting_centers) 45 | p_isnan = np.isnan(poisson_ll) 46 | # just test that it's not nan 47 | self.assertFalse(p_isnan.any()) 48 | 49 | def test_sparse_poisson_ll(self): 50 | """ 51 | Test Poisson log-likelihood 52 | """ 53 | centers = np.array([[0.1,10,20], [5, 15, 1], [50, 1, 0.1]]) 54 | centers = centers.astype(float) 55 | data, labs = generate_poisson_data(centers, 500) 56 | data = data.astype(float) 57 | data = sparse.csc_matrix(data) 58 | starting_centers = centers 59 | poisson_ll = pois_ll.poisson_ll(data, starting_centers) 60 | p_isnan = np.isnan(poisson_ll) 61 | self.assertFalse(p_isnan.any()) 62 | labels = poisson_ll.argmax(1) 63 | self.assertTrue((labels==labs).sum() >= 450) 64 | 65 | 66 | -------------------------------------------------------------------------------- /tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | from scipy.io import loadmat 5 | from scipy import sparse 6 | 7 | import uncurl 8 | from uncurl.preprocessing import sparse_mean_var, cell_normalize 9 | from uncurl.simulation import generate_poisson_data 10 | from uncurl.evaluation import purity 11 | 12 | class PreprocessingTest(TestCase): 13 | 14 | def setUp(self): 15 | dat = loadmat('data/SCDE_k2_sup.mat') 16 | self.data_sparse = sparse.csc_matrix(dat['Dat']) 17 | self.data_dense = dat['Dat'] 18 | 
self.labs = dat['Lab'].flatten() 19 | 20 | def testSparseVar(self): 21 | """ 22 | Test sparse variance 23 | """ 24 | dense_var = np.var(self.data_dense, 1) 25 | mean, sp_var = sparse_mean_var(self.data_sparse) 26 | se = np.sqrt(np.sum((sp_var - dense_var)**2)) 27 | print(se) 28 | self.assertTrue(se < 1e-5) 29 | 30 | def testMaxVarGenes(self): 31 | """ 32 | test max variance genes for dense and sparse matrices 33 | """ 34 | n_genes =self.data_sparse.shape[0] 35 | genes1 = uncurl.max_variance_genes(self.data_dense, nbins=1, frac=0.5) 36 | genes2 = uncurl.max_variance_genes(self.data_sparse, nbins=1, frac=0.5) 37 | self.assertEqual(set(genes1), set(genes2)) 38 | self.assertEqual(len(genes1), int(0.5*n_genes)) 39 | genes1 = uncurl.max_variance_genes(self.data_dense, nbins=5, frac=0.2) 40 | genes2 = uncurl.max_variance_genes(self.data_sparse, nbins=5, frac=0.2) 41 | self.assertEqual(set(genes1), set(genes2)) 42 | self.assertEqual(len(genes1), 5*int((n_genes/5)*0.2)) 43 | 44 | def testCellNormalize(self): 45 | sparse_cell_norm = cell_normalize(self.data_sparse) 46 | dense_cell_norm = cell_normalize(self.data_dense) 47 | diff = dense_cell_norm - sparse_cell_norm.toarray() 48 | diff = np.sqrt(np.sum(diff**2)) 49 | self.assertTrue(diff < 1e-6) 50 | -------------------------------------------------------------------------------- /tests/test_qual2quant.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from unittest import TestCase 4 | 5 | import numpy as np 6 | from scipy import sparse 7 | from scipy.io import loadmat 8 | 9 | import uncurl 10 | 11 | class Qual2QuantTest(TestCase): 12 | # TODO: test dataset 13 | 14 | def setUp(self): 15 | dat = loadmat('data/SCDE_test.mat') 16 | self.data = dat['dat'].toarray()[0:500, :] 17 | self.qualData = dat['M'].toarray()[0:500, :] 18 | 19 | 20 | def test_qual2quant(self): 21 | # simulated test data? 22 | # no... use M as a starting matrix 23 | # qual_matrix = np.zeros((self.data.shape[0], 2)) 24 | starting_points = uncurl.qualNorm(self.data, self.qualData) 25 | self.assertTrue(starting_points.shape==(500, 2)) 26 | self.assertFalse(np.isnan(starting_points).any()) 27 | print((starting_points[:,0] == starting_points[:,1]).sum()) 28 | self.assertTrue((starting_points[:,0] == starting_points[:,1]).sum() < 10) 29 | 30 | 31 | def test_qual2quant_sparse(self): 32 | # simulated test data? 33 | # no... use M as a starting matrix 34 | # qual_matrix = np.zeros((self.data.shape[0], 2)) 35 | data_sparse = sparse.csc_matrix(self.data) 36 | starting_points = uncurl.qualNorm(data_sparse, self.qualData) 37 | self.assertTrue(starting_points.shape==(500, 2)) 38 | self.assertFalse(np.isnan(starting_points).any()) 39 | print((starting_points[:,0] == starting_points[:,1]).sum()) 40 | self.assertTrue((starting_points[:,0] == starting_points[:,1]).sum() < 10) 41 | 42 | 43 | def test_qual2quant_missing_data(self): 44 | # simulated test data? 45 | # no... 
use M as a starting matrix 46 | # qual_matrix = np.zeros((self.data.shape[0], 2)) 47 | qualData_m = self.qualData.copy() 48 | for i in range(300): 49 | qualData_m[i,:] = -1 50 | starting_points = uncurl.qualNorm(self.data, qualData_m) 51 | self.assertTrue(starting_points.shape==(500, 2)) 52 | self.assertFalse(np.isnan(starting_points).any()) 53 | print((starting_points[:,0] == starting_points[:,1]).sum()) 54 | self.assertTrue((starting_points[:,0] == starting_points[:,1]).sum() < 10) 55 | 56 | 57 | -------------------------------------------------------------------------------- /tests/test_real_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | from unittest import TestCase 5 | 6 | import numpy as np 7 | from scipy import sparse 8 | from scipy.io import loadmat 9 | 10 | import uncurl 11 | 12 | class RealDataTest(TestCase): 13 | """ 14 | tests results on actual datasets: 10x_pooled, Zeisel 7-cluster subset, 15 | maybe add others? 16 | """ 17 | 18 | def setUp(self): 19 | dat = loadmat('data/10x_pooled_400.mat') 20 | self.data = sparse.csc_matrix(dat['data']) 21 | self.labs = dat['labels'].flatten() 22 | dat_z = loadmat('data/GSE60361_dat.mat') 23 | self.data_z = sparse.csc_matrix(dat_z['Dat']) 24 | self.labs_z = dat_z['ActLabs'].flatten() 25 | 26 | def test_10xSE(self): 27 | # gene selection 28 | genes = uncurl.max_variance_genes(self.data) 29 | data_subset = self.data[genes,:] 30 | # smaller # of iterations than default so it finishes faster... 31 | se = uncurl.experiment_runner.PoissonSE(clusters=8, max_iters=10, 32 | inner_max_iters=80) 33 | argmax = uncurl.experiment_runner.Argmax(n_classes=8) 34 | km = uncurl.experiment_runner.KM(n_classes=8) 35 | methods = [(se, [argmax, km])] 36 | results, names, other = uncurl.experiment_runner.run_experiment( 37 | methods, data_subset, 8, self.labs, n_runs=1, 38 | use_purity=False, use_nmi=True) 39 | print(results) 40 | # NMI should be > 0.75 on 10x_pure_pooled 41 | # (accounting for lower than default iter count) 42 | self.assertTrue(results[0][0]>0.75) 43 | self.assertTrue(results[0][1]>0.75) 44 | 45 | def test_Zeisel(self): 46 | # gene selection 47 | genes = uncurl.max_variance_genes(self.data_z) 48 | data_subset = self.data_z[genes,:] 49 | # smaller # of iterations than default so it finishes faster... 50 | se = uncurl.experiment_runner.PoissonSE(clusters=7, max_iters=10, 51 | inner_max_iters=80) 52 | argmax = uncurl.experiment_runner.Argmax(n_classes=7) 53 | km = uncurl.experiment_runner.KM(n_classes=7) 54 | methods = [(se, [argmax, km])] 55 | results, names, other = uncurl.experiment_runner.run_experiment( 56 | methods, data_subset, 7, self.labs_z, n_runs=1, 57 | use_purity=False, use_nmi=True) 58 | print(results) 59 | # NMI should be > 0.75 on Zeisel subset as well 60 | self.assertTrue(results[0][0]>0.75) 61 | self.assertTrue(results[0][1]>0.75) 62 | 63 | def test_10x_auto_cluster(self): 64 | """ 65 | Test using automatic cluster size determination 66 | """ 67 | from sklearn.metrics.cluster import normalized_mutual_info_score as nmi 68 | # gene selection 69 | genes = uncurl.max_variance_genes(self.data) 70 | data_subset = self.data[genes,:] 71 | # smaller # of iterations than default so it finishes faster... 
 72 |         M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
 73 |                 max_iters=10, inner_max_iters=80)
 74 |         labels = W.argmax(0)
 75 |         # NMI should be > 0.6 on 10x_pure_pooled
 76 |         # (accounting for lower than default iter count)
 77 |         self.assertTrue(nmi(self.labs, labels)>0.6)
 78 |         # test RMSE
 79 |         test_data = np.dot(M, W)
 80 |         error = data_subset.toarray() - test_data
 81 |         error = np.sqrt(np.mean(error**2))
 82 |         print('data subset RMSE:', error)
 83 |         self.assertTrue(error < 2.0)
 84 | 
 85 |     def test_10x_update_m(self):
 86 |         """
 87 |         Test after updating M
 88 |         """
 89 |         from uncurl.state_estimation import update_m
 90 |         genes = uncurl.max_variance_genes(self.data)
 91 |         data_subset = self.data[genes,:]
 92 |         # smaller # of iterations than default so it finishes faster...
 93 |         M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
 94 |                 max_iters=10, inner_max_iters=50)
 95 |         new_M = update_m(self.data, M, W, genes)
 96 |         self.assertEqual(new_M.shape, (self.data.shape[0], W.shape[0]))
 97 |         self.assertFalse(np.isnan(new_M).any())
 98 |         # test RMSE
 99 |         test_data = np.dot(new_M, W)
100 |         error = self.data.toarray() - test_data
101 |         error = np.sqrt(np.mean(error**2))
102 |         print('M update RMSE:', error)
103 |         self.assertTrue(error < 2.0)
104 | 
105 | if __name__ == '__main__':
106 |     unittest.main()
107 | 
--------------------------------------------------------------------------------
/tests/test_state_estimation.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import itertools
  4 | 
  5 | from unittest import TestCase
  6 | from flaky import flaky
  7 | 
  8 | import numpy as np
  9 | from scipy.io import loadmat
 10 | 
 11 | from uncurl import state_estimation, simulation, run_state_estimation
 12 | 
 13 | class StateEstimationTest(TestCase):
 14 | 
 15 |     def setUp(self):
 16 |         pass
 17 | 
 18 |     @flaky
 19 |     def test_state_estimation(self):
 20 |         """
 21 |         Generate sample data from a small set to see that the state
 22 |         estimation is accurate.
 23 | 
 24 |         7 cells, 4 genes, 2 clusters
 25 |         """
 26 |         sim_means = np.array([[20.,30.],
 27 |                               [10.,3.],
 28 |                               [90.,50.],
 29 |                               [10.,4.]])
 30 |         sim_assignments = np.array([[0.1,0.2,0.3,0.4,0.5,0.8,0.9],
 31 |                                     [0.9,0.8,0.7,0.6,0.5,0.2,0.1]])
 32 |         sim_data = simulation.generate_state_data(sim_means, sim_assignments)
 33 |         sim_data = sim_data + 1e-8
 34 |         print(sim_data)
 35 |         # add noise to the mean
 36 |         sim_means_noised = sim_means + 5*(np.random.random(sim_means.shape)-0.5)
 37 |         m, w, ll = state_estimation.poisson_estimate_state(sim_data, 2, init_means=sim_means_noised, max_iters=10, disp=False)
 38 |         print(m)
 39 |         print(w)
 40 |         self.assertTrue(np.max(w.sum(0) - 1.0)<0.01)
 41 |         # mean error in M is less than 10
 42 |         self.assertTrue(np.mean(np.abs(sim_means-m))<10.0 or
 43 |                 np.mean(np.abs(sim_means-m[:,[1,0]]))<10.0)
 44 |         # mean error in W is less than 0.3 (arbitrary boundary)
 45 |         self.assertTrue(np.mean(np.abs(sim_assignments-w))<0.3 or
 46 |                 np.mean(np.abs(sim_assignments-w[[1,0],:]))<0.3)
 47 | 
 48 |     def test_state_estimation_2(self):
 49 |         """
 50 |         Generate sample data from a slightly larger set to see that the state
 51 |         estimation is accurate.
 52 | 
 53 |         11 cells, 5 genes, 3 clusters
 54 | 
 55 |         This might fail due to inherent randomness...
56 | """ 57 | sim_means = np.array([[20.,30.,4.], 58 | [10.,3.,9.], 59 | [90.,50.,10.], 60 | [10.,4.,30.], 61 | [35.,10.,2.]]) 62 | sim_assignments = np.array([[0.1,0.2,0.3,0.4,0.1,0.7,0.6,0.9,0.5,0.2,0.1], 63 | [0.6,0.7,0.3,0.4,0.1,0.2,0.1,0.1,0.0,0.3,0.8], 64 | [0.3,0.1,0.4,0.2,0.8,0.1,0.3,0.0,0.5,0.5,0.1]]) 65 | sim_data = simulation.generate_state_data(sim_means, sim_assignments) 66 | sim_data = sim_data + 1e-8 67 | print(sim_data) 68 | # add noise to the mean 69 | sim_means_noised = sim_means + 5*(np.random.random(sim_means.shape)-0.5) 70 | m, w, ll = state_estimation.poisson_estimate_state(sim_data, 3, init_means=sim_means_noised, max_iters=10, disp=False) 71 | print(m) 72 | print(w) 73 | print(w.sum(0)) 74 | self.assertTrue(np.max(w.sum(0) - 1.0)<0.01) 75 | # mean error in M is less than 10 76 | means_good = False 77 | weights_good = False 78 | # test every permutation of clusters 79 | for p in itertools.permutations([0,1,2]): 80 | means_good = means_good or (np.mean(np.abs(sim_means-m[:,p]))<10.0) 81 | weights_good = weights_good or (np.mean(np.abs(sim_assignments-w[p,:]))<0.2) 82 | self.assertTrue(means_good) 83 | self.assertTrue(weights_good) 84 | 85 | def test_random_means(self): 86 | """ 87 | Test state estimation with random means and weights. 88 | 89 | 200 cells, 20 genes, 2 clusters 90 | """ 91 | sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20) 92 | sim_data = simulation.generate_state_data(sim_m, sim_w) 93 | sim_means_noised = sim_m + 5*(np.random.random(sim_m.shape)-0.5) 94 | m, w, ll = state_estimation.poisson_estimate_state(sim_data, 2, init_means=sim_means_noised, max_iters=10, disp=False, method='L-BFGS-B') 95 | self.assertTrue(np.max(w.sum(0) - 1.0)<0.001) 96 | means_good = False 97 | weights_good = False 98 | for p in itertools.permutations([0,1]): 99 | means_good = means_good or (np.mean(np.abs(sim_m-m[:,p]))<20.0) 100 | weights_good = weights_good or (np.mean(np.abs(sim_w-w[p,:]))<0.3) 101 | self.assertTrue(means_good) 102 | self.assertTrue(weights_good) 103 | 104 | def test_random_means_2(self): 105 | """ 106 | Test state estimation with random means and weights. 
107 | 
108 |         20 cells, 200 genes, 2 clusters
109 |         """
110 |         sim_m, sim_w = simulation.generate_poisson_states(2, 20, 200)
111 |         sim_data = simulation.generate_state_data(sim_m, sim_w)
112 |         sim_means_noised = sim_m + 5*(np.random.random(sim_m.shape)-0.5)
113 |         m, w, ll = state_estimation.poisson_estimate_state(sim_data, 2, init_means=sim_means_noised, max_iters=10, disp=False)
114 |         means_good = False
115 |         weights_good = False
116 |         for p in itertools.permutations([0,1]):
117 |             means_good = means_good or (np.mean(np.abs(sim_m-m[:,p]))<20.0)
118 |             weights_good = weights_good or (np.mean(np.abs(sim_w-w[p,:]))<0.2)
119 |         self.assertTrue(means_good)
120 |         self.assertTrue(weights_good)
121 | 
122 |     def test_run_se(self):
123 |         """
124 |         test the run_state_estimation function
125 |         """
126 |         sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
127 |         sim_data = simulation.generate_state_data(sim_m, sim_w)
128 |         m, w, ll = run_state_estimation(sim_data, 2, dist='Poiss', max_iters=10, disp=False)
129 |         means_good = False
130 |         weights_good = False
131 |         for p in itertools.permutations([0,1]):
132 |             means_good = means_good or (np.mean(np.abs(sim_m-m[:,p]))<20.0)
133 |             weights_good = weights_good or (np.mean(np.abs(sim_w-w[p,:]))<0.3)
134 |         self.assertTrue(means_good)
135 |         self.assertTrue(weights_good)
136 | 
--------------------------------------------------------------------------------
/tests/test_zip_state_estimation.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import unittest
  4 | from unittest import TestCase
  5 | from flaky import flaky
  6 | 
  7 | import numpy as np
  8 | from scipy.io import loadmat
  9 | 
 10 | from uncurl import zip_state_estimation, simulation
 11 | 
 12 | @flaky
 13 | @unittest.skip('zip methods currently not supported')
 14 | class ZIPStateEstimationTest(TestCase):
 15 | 
 16 |     def setUp(self):
 17 |         pass
 18 | 
 19 |     def test_state_estimation(self):
 20 |         """
 21 |         Generate sample data from a small set to see that the state
 22 |         estimation is accurate.
 23 | 
 24 |         7 cells, 4 genes, 2 clusters
 25 |         """
 26 |         sim_means = np.array([[20.,30.],
 27 |                               [10.,3.],
 28 |                               [90.,50.],
 29 |                               [10.,4.]])
 30 |         sim_assignments = np.array([[0.1,0.2,0.3,0.4,0.5,0.8,0.9],
 31 |                                     [0.9,0.8,0.7,0.6,0.5,0.2,0.1]])
 32 |         sim_data = simulation.generate_zip_state_data(sim_means, sim_assignments, 0.3)
 33 |         sim_data = sim_data + 1e-8
 34 |         print(sim_data)
 35 |         # add noise to the mean
 36 |         sim_means_noised = sim_means + 5*(np.random.random(sim_means.shape)-0.5)
 37 |         m, w, ll = zip_state_estimation.zip_estimate_state(sim_data, 2, init_means=sim_means_noised, max_iters=10, disp=False)
 38 |         print(m)
 39 |         print(w)
 40 |         print(w.sum(0))
 41 |         self.assertTrue(np.max(w.sum(0) - 1.0)<0.01)
 42 |         # mean error in M is less than 10
 43 |         self.assertTrue(np.mean(np.abs(sim_means-m))<10.0)
 44 |         # mean error in W is less than 0.3 (arbitrary boundary)
 45 |         self.assertTrue(np.mean(np.abs(sim_assignments-w))<0.3)
 46 | 
 47 |     def test_state_estimation_2(self):
 48 |         """
 49 |         Generate sample data from a slightly larger set to see that the state
 50 |         estimation is accurate.
 51 | 
 52 |         11 cells, 5 genes, 3 clusters
 53 | 
 54 |         This might fail due to inherent randomness...
55 | """ 56 | sim_means = np.array([[20.,30.,4.], 57 | [10.,3.,9.], 58 | [90.,50.,10.], 59 | [10.,4.,30.], 60 | [35.,10.,2.]]) 61 | sim_assignments = np.array([[0.1,0.2,0.3,0.4,0.1,0.7,0.6,0.9,0.5,0.2,0.1], 62 | [0.6,0.7,0.3,0.4,0.1,0.2,0.1,0.1,0.0,0.3,0.8], 63 | [0.3,0.1,0.4,0.2,0.8,0.1,0.3,0.0,0.5,0.5,0.1]]) 64 | sim_data = simulation.generate_zip_state_data(sim_means, sim_assignments, 0.3) 65 | sim_data = sim_data + 1e-8 66 | print(sim_data) 67 | # add noise to the mean 68 | sim_means_noised = sim_means + 5*(np.random.random(sim_means.shape)-0.5) 69 | m, w, ll = zip_state_estimation.zip_estimate_state(sim_data, 3, init_means=sim_means_noised, max_iters=10, disp=False) 70 | print(m) 71 | print(w) 72 | print(w.sum(0)) 73 | self.assertTrue(np.max(w.sum(0) - 1.0)<0.01) 74 | # mean error in M is less than 10 75 | self.assertTrue(np.mean(np.abs(sim_means-m))<10.0) 76 | # mean error in W is less than 0.4 (arbitrary boundary) 77 | self.assertTrue(np.mean(np.abs(sim_assignments-w))<0.4) 78 | 79 | def test_random_means(self): 80 | """ 81 | Test state estimation with random means and weights. 82 | 83 | 200 cells, 20 genes, 2 clusters 84 | """ 85 | sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20) 86 | z = np.random.random()/2 87 | sim_data = simulation.generate_zip_state_data(sim_m, sim_w, z) 88 | sim_means_noised = sim_m + 5*(np.random.random(sim_m.shape)-0.5) 89 | m, w, ll = zip_state_estimation.zip_estimate_state(sim_data, 2, init_means=sim_means_noised, max_iters=10, disp=False) 90 | self.assertTrue(np.max(w.sum(0) - 1.0)<0.001) 91 | self.assertTrue(np.mean(np.abs(sim_m-m))<50.0) 92 | self.assertTrue(np.mean(np.abs(sim_w-w))<0.4) 93 | 94 | def test_random_means_2(self): 95 | """ 96 | Test state estimation with random means and weights. 97 | 98 | 20 cells, 200 genes, 2 clusters 99 | """ 100 | sim_m, sim_w = simulation.generate_poisson_states(2, 20, 200) 101 | sim_data = simulation.generate_state_data(sim_m, sim_w) 102 | sim_means_noised = sim_m + 5*(np.random.random(sim_m.shape)-0.5) 103 | m, w, ll = zip_state_estimation.zip_estimate_state(sim_data, 2, init_means=sim_means_noised, max_iters=10, disp=False) 104 | self.assertTrue(np.max(w.sum(0) - 1.0)<0.001) 105 | self.assertTrue(np.mean(np.abs(sim_m-m))<60.0) 106 | self.assertTrue(np.mean(np.abs(sim_w-w))<0.5) 107 | -------------------------------------------------------------------------------- /uncurl/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .clustering import poisson_cluster, kmeans_pp 3 | from .zip_clustering import zip_cluster 4 | from .pois_ll import poisson_ll, poisson_dist 5 | from .qual2quant import qualNorm 6 | from .state_estimation import poisson_estimate_state, update_m 7 | from .run_se import run_state_estimation 8 | from .nb_state_estimation import nb_estimate_state 9 | from .zip_state_estimation import zip_estimate_state 10 | from .dimensionality_reduction import mds, dim_reduce, dim_reduce_data 11 | from .lineage import run_lineage, pseudotime 12 | from .nb_clustering import nb_cluster 13 | 14 | from .preprocessing import max_variance_genes 15 | 16 | from . import sampling 17 | 18 | from .nmf_wrapper import log_norm_nmf 19 | 20 | try: 21 | from . 
import experiment_runner
22 | except ImportError:
23 |     print('unable to import experiment_runner')
24 | 
--------------------------------------------------------------------------------
/uncurl/clustering.py:
--------------------------------------------------------------------------------
 1 | # poisson clustering
 2 | 
 3 | import numpy as np
 4 | from scipy import sparse
 5 | 
 6 | from .pois_ll import poisson_ll, poisson_dist
 7 | 
 8 | eps = 1e-10
 9 | 
10 | def kmeans_pp(data, k, centers=None):
11 |     """
12 |     Generates kmeans++ initial centers.
13 | 
14 |     Args:
15 |         data (array): A 2d array- genes x cells
16 |         k (int): Number of clusters
17 |         centers (array, optional): if provided, these are one or more known cluster centers. 2d array of genes x number of centers (<=k).
18 | 
19 |     Returns:
20 |         centers - a genes x k array of cluster means.
21 |         assignments - a cells x 1 array of cluster assignments
22 |     """
23 |     # TODO: what if there is missing data for a given gene?
24 |     # missing data could be if all the entries are -1.
25 |     genes, cells = data.shape
26 |     if sparse.issparse(data) and not sparse.isspmatrix_csc(data):
27 |         data = sparse.csc_matrix(data)
28 |     num_known_centers = 0
29 |     if centers is None:
30 |         centers = np.zeros((genes, k))
31 |     else:
32 |         num_known_centers = centers.shape[1]
33 |         centers = np.concatenate((centers, np.zeros((genes, k-num_known_centers))), 1)
34 |     distances = np.zeros((cells, k))
35 |     distances[:] = np.inf
36 |     if num_known_centers == 0:
37 |         init = np.random.randint(0, cells)
38 |         if sparse.issparse(data):
39 |             centers[:,0] = data[:, init].toarray().flatten()
40 |         else:
41 |             centers[:,0] = data[:, init]
42 |         num_known_centers += 1
43 |     available_cells = list(range(cells))
44 |     for c in range(num_known_centers, k):
45 |         c2 = c-1
46 |         # use different formulation for distance... if sparse, use lls
47 |         # if not sparse, use poisson_dist
48 |         if sparse.issparse(data):
49 |             lls = poisson_ll(data, centers[:,c2:c2+1]).flatten()
50 |             distances[:,c2] = 1 + lls.max() - lls
51 |             distances[:,c2] /= distances[:,c2].max()
52 |         else:
53 |             for cell in range(cells):
54 |                 distances[cell, c2] = poisson_dist(data[:,cell], centers[:,c2])
55 |         # choose a new data point as center... probability proportional
56 |         # to distance^2
57 |         min_distances = np.min(distances, 1)
58 |         min_distances = min_distances**2
59 |         min_distances = min_distances[available_cells]
60 |         # should be sampling without replacement
61 |         min_dist = np.random.choice(available_cells,
62 |                 p=min_distances/min_distances.sum())
63 |         available_cells.remove(min_dist)
64 |         if sparse.issparse(data):
65 |             centers[:,c] = data[:, min_dist].toarray().flatten()
66 |         else:
67 |             centers[:,c] = data[:, min_dist]
68 |     lls = poisson_ll(data, centers)
69 |     new_assignments = np.argmax(lls, 1)
70 |     centers[centers==0.0] = eps
71 |     return centers, new_assignments
72 | 
73 | def poisson_cluster(data, k, init=None, max_iters=100):
74 |     """
75 |     Performs Poisson hard EM on the given data.
76 | 
77 |     Args:
78 |         data (array): A 2d array- genes x cells. Can be dense or sparse; for best performance, sparse matrices should be in CSC format.
79 |         k (int): Number of clusters
80 |         init (array, optional): Initial centers - genes x k array. Default: None, use kmeans++
81 |         max_iters (int, optional): Maximum number of iterations. Default: 100
82 | 
83 |     Returns:
84 |         a tuple of two arrays: a cells x 1 vector of cluster assignments,
85 |         and a genes x k array of cluster means.
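
    Example (a minimal sketch; `data` here is a hypothetical toy count
    matrix, not one of the bundled datasets):

        import numpy as np
        from uncurl import poisson_cluster

        data = np.random.poisson(5, size=(100, 50))  # 100 genes, 50 cells
        assignments, centers = poisson_cluster(data, 3)
        # assignments has length 50; centers has shape (100, 3)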
 86 |     """
 87 |     # TODO: be able to use a combination of fixed and unknown starting points
 88 |     # e.g., have init values only for certain genes, have a row of all
 89 |     # zeros indicating that kmeans++ should be used for that row.
 90 |     genes, cells = data.shape
 91 |     #print 'starting: ', centers
 92 |     if sparse.issparse(data) and not sparse.isspmatrix_csc(data):
 93 |         data = sparse.csc_matrix(data)
 94 |     init, assignments = kmeans_pp(data, k, centers=init)
 95 |     centers = np.copy(init)
 96 |     assignments = np.zeros(cells)
 97 |     for it in range(max_iters):
 98 |         lls = poisson_ll(data, centers)
 99 |         #cluster_dists = np.zeros((cells, k))
100 |         new_assignments = np.argmax(lls, 1)
101 |         if np.equal(assignments, new_assignments).all():
102 |             #print 'ending: ', centers
103 |             return new_assignments, centers
104 |         for c in range(k):
105 |             if sparse.issparse(data):
106 |                 if data[:,new_assignments==c].shape[1]==0:
107 |                     # empty cluster - re-initialize its center
108 |                     new_c, _ = kmeans_pp(data, k, centers[:,:c])
109 |                     centers[:,c] = new_c[:,c]
110 |                 else:
111 |                     centers[:,c] = np.asarray(data[:,new_assignments==c].mean(1)).flatten()
112 |             else:
113 |                 if data[:,new_assignments==c].shape[1]==0:
114 |                     new_c, _ = kmeans_pp(data, k, centers[:,:c])
115 |                     centers[:,c] = new_c[:,c]
116 |                 else:
117 |                     centers[:,c] = np.mean(data[:,new_assignments==c], 1)
118 |         assignments = new_assignments
119 |     return assignments, centers
120 | 
--------------------------------------------------------------------------------
/uncurl/dimensionality_reduction.py:
--------------------------------------------------------------------------------
 1 | # dimensionality reduction
 2 | 
 3 | import numpy as np
 4 | from .pois_ll import poisson_dist
 5 | 
 6 | eps=1e-8
 7 | max_or_zero = np.vectorize(lambda x: max(0.0,x))
 8 | 
 9 | def diffusion_mds(means, weights, d, diffusion_rounds=10):
10 |     """
11 |     Dimensionality reduction using MDS, while running diffusion on W.
12 | 
13 |     Args:
14 |         means (array): genes x clusters
15 |         weights (array): clusters x cells
16 |         d (int): desired dimensionality
17 | 
18 |     Returns:
19 |         W_reduced (array): array of shape (d, cells)
20 |     """
21 |     for i in range(diffusion_rounds):
22 |         weights = weights*weights
23 |         weights = weights/weights.sum(0)
24 |     X = dim_reduce(means, weights, d)
25 |     if X.shape[0]==2:
26 |         return X.dot(weights)
27 |     else:
28 |         return X.T.dot(weights)
29 | 
30 | 
31 | def mds(means, weights, d):
32 |     """
33 |     Dimensionality reduction using MDS.
34 | 
35 |     Args:
36 |         means (array): genes x clusters
37 |         weights (array): clusters x cells
38 |         d (int): desired dimensionality
39 | 
40 |     Returns:
41 |         W_reduced (array): array of shape (d, cells)
42 |     """
43 |     X = dim_reduce(means, weights, d)
44 |     if X.shape[0]==2:
45 |         return X.dot(weights)
46 |     else:
47 |         return X.T.dot(weights)
48 | 
49 | def dim_reduce(means, weights, d):
50 |     """
51 |     Dimensionality reduction using Poisson distances and MDS.
52 | 
53 |     Args:
54 |         means (array): genes x clusters
55 |         weights (array): clusters x cells
56 |         d (int): desired dimensionality
57 | 
58 |     Returns:
59 |         X, a clusters x d matrix representing the reduced dimensions
60 |         of the cluster centers.
61 |     """
62 |     return dim_reduce_data(means, d)
63 | 
64 | def dim_reduce_data(data, d):
65 |     """
66 |     Does an MDS on the data directly, not on the means.
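    Note that this builds the full cells x cells Poisson distance matrix,
    so it is quadratic in the number of cells.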
67 | 
68 |     Args:
69 |         data (array): genes x cells
70 |         d (int): desired dimensionality
71 | 
72 |     Returns:
73 |         X, a cells x d matrix
74 |     """
75 |     genes, cells = data.shape
76 |     distances = np.zeros((cells, cells))
77 |     for i in range(cells):
78 |         for j in range(cells):
79 |             distances[i,j] = poisson_dist(data[:,i], data[:,j])
80 |     # do MDS on the distance matrix (procedure from Wikipedia)
81 |     proximity = distances**2
82 |     J = np.eye(cells) - 1./cells
83 |     B = -0.5*np.dot(J, np.dot(proximity, J))
84 |     # B should be symmetric, so we can use eigh
85 |     e_val, e_vec = np.linalg.eigh(B)
86 |     # Note: lam should be ordered to be the largest eigenvalues
87 |     lam = np.diag(e_val[-d:][::-1])
88 |     #lam = max_or_zero(lam)
89 |     E = e_vec[:,-d:][:,::-1]
90 |     X = np.dot(E, lam**0.5)
91 |     return X
92 | 
--------------------------------------------------------------------------------
/uncurl/evaluation.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | 
 3 | import numpy as np
 4 | from sklearn.neighbors import BallTree
 5 | 
 6 | def purity(labels, true_labels):
 7 |     """
 8 |     Calculates the purity score for the given labels.
 9 | 
10 |     Args:
11 |         labels (array): 1D array of integers
12 |         true_labels (array): 1D array of integers - true labels
13 | 
14 |     Returns:
15 |         purity score - a float between 0 and 1. Closer to 1 is better.
16 |     """
17 |     purity = 0.0
18 |     for i in set(labels):
19 |         indices = (labels==i)
20 |         true_clusters = true_labels[indices]
21 |         if len(true_clusters)==0:
22 |             continue
23 |         counts = Counter(true_clusters)
24 |         lab, count = counts.most_common()[0]
25 |         purity += count
26 |     return float(purity)/len(labels)
27 | 
28 | def nne(dim_red, true_labels):
29 |     """
30 |     Calculates the nearest neighbor accuracy (basically leave-one-out cross
31 |     validation with a 1NN classifier).
32 | 
33 |     Args:
34 |         dim_red (array): dimensions (k, cells)
35 |         true_labels (array): 1d array of integers
36 | 
37 |     Returns:
38 |         Nearest neighbor accuracy - fraction of points for which the
39 |         1NN classifier returns the correct value.
40 |     """
41 |     # use sklearn's BallTree
42 |     bt = BallTree(dim_red.T)
43 |     correct = 0
44 |     for i, l in enumerate(true_labels):
45 |         dist, ind = bt.query([dim_red[:,i]], k=2)
46 |         closest_cell = ind[0, 1]
47 |         if true_labels[closest_cell] == l:
48 |             correct += 1
49 |     return float(correct)/len(true_labels)
50 | 
51 | def mdl(ll, k, data):
52 |     """
53 |     Returns the minimum description length score of the model given its
54 |     log-likelihood and k, the number of cell types.
55 | 
56 |     A lower cost is better...
57 |     """
58 | 
59 |     """
60 |     N - no. of genes
61 |     n - no. of cells
62 |     k - no. of cell types
63 |     R - sum(Dataset) i.e. total no. of reads
64 | 
65 |     function TotCost = TotBits(N,m,p,R,C)
66 |     # C is the cost from the cost function
67 |     TotCost = C + (N*m + m*p)*(log(R/(N*p)));
68 |     """
69 |     N, m = data.shape
70 |     cost = ll + (N*m + m*k)*(np.log(data.sum()/(N*k)))
71 |     return cost
72 | 
--------------------------------------------------------------------------------
/uncurl/fit_dist_data.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import division
 4 | import numpy as np
 5 | from scipy.stats import poisson
 6 | from scipy.stats import norm
 7 | import math as math
 8 | 
 9 | 
10 | 
11 | def GetDistFitError(Dat):
12 |     #Assumes data to be in the form of a numpy matrix
13 |     # TODO: make this work for sparse inputs?
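    # Approach: build an empirical frequency histogram over the integer
    # values 0..max(Dat), generate matching Poisson / Normal / Log-Normal
    # densities from moment estimates, and report the L2 distance between
    # the normalized histogram and each candidate density.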
14 | # TODO: fixed number of bins, rather than use the maximum value? 15 | # use np.histogram 16 | Dat = np.round(Dat).astype(int) 17 | Dat2 = np.log(1 + Dat) 18 | BinDat = np.zeros(max(Dat)+1) 19 | Poiss = np.zeros(max(Dat)+1) 20 | Norm = np.zeros(max(Dat)+1) 21 | LogNorm = np.zeros(max(Dat)+1) 22 | 23 | m = np.mean(Dat) 24 | std = np.std(Dat, ddof=1) 25 | m_l = np.mean(Dat2) 26 | std_l = np.std(Dat2, ddof=1) 27 | 28 | #Create a bin of frequencies and generate frequencies based on distr 29 | for i in range(0,len(BinDat)): 30 | # this is EXTREMELY INEFFICIENT!!!!! 31 | # n^2 since Dat==i requires iterating through the whole array 32 | BinDat[i] = sum(Dat==i) 33 | Poiss[i] = poisson.pmf(i,m) 34 | Norm[i] = norm.pdf((i-m+1)/std) 35 | LogNorm[i] = norm.pdf((i-m_l)/std_l) 36 | BinDat = BinDat/sum(BinDat) + 0.0 37 | Poiss = Poiss/sum(Poiss) + 0.0 38 | Norm = Norm/sum(Norm) + 0.0 39 | LogNorm = LogNorm/sum(LogNorm) + 0.0 40 | #Get error for each distribution 41 | PoissErr = np.linalg.norm(BinDat - Poiss) 42 | NormErr = np.linalg.norm(BinDat - Norm) 43 | LogNormErr = np.linalg.norm(BinDat - LogNorm) 44 | d = {} 45 | d['poiss'] = PoissErr 46 | d['norm'] = NormErr 47 | d['lognorm'] = LogNormErr 48 | return d 49 | 50 | def NormPDF(x,mu,std): 51 | pi = math.pi 52 | temp = np.exp(-((x-mu)**2)/(2*std**2))/np.sqrt(2*pi*std**2) 53 | return temp 54 | 55 | def DistFitDataset(Dat): 56 | """ 57 | Given a data matrix, this returns the per-gene fit error for the 58 | Poisson, Normal, and Log-Normal distributions. 59 | 60 | Args: 61 | Dat (array): numpy array with shape (genes, cells) 62 | 63 | Returns: 64 | d (dict): 'poiss', 'norm', 'lognorm' give the fit error for each distribution. 65 | """ 66 | #Assumes data to be in the form of a numpy matrix 67 | (r,c) = Dat.shape 68 | Poiss = np.zeros(r) 69 | Norm = np.zeros(r) 70 | LogNorm = np.zeros(r) 71 | for i in range(r): 72 | temp = GetDistFitError(Dat[i]) 73 | Poiss[i] = temp['poiss'] 74 | Norm[i] = temp['norm'] 75 | LogNorm[i] = temp['lognorm'] 76 | d = {} 77 | d['poiss'] = Poiss 78 | d['norm'] = Norm 79 | d['lognorm'] = LogNorm 80 | return d 81 | 82 | 83 | #Dat = np.array([[0,0,0,1,1,2,2,3,4],[0,0,0,1,1,1,3,5,7]]) 84 | #Dat = np.array([2,3,4,5]) 85 | #print GetDistFitError(Dat) 86 | #n = 100 87 | #Dat = np.random.poisson(lam = [[2]*n,[.5]*n], size = (2,n)) 88 | #d = DistFitDataset(Dat) 89 | -------------------------------------------------------------------------------- /uncurl/gap_score.py: -------------------------------------------------------------------------------- 1 | """ 2 | Using gap score to determine optimal cluster number 3 | """ 4 | import numpy as np 5 | from sklearn.cluster import KMeans 6 | 7 | def preproc_data(data, gene_subset=False, **kwargs): 8 | """ 9 | basic data preprocessing before running gap score 10 | 11 | Assumes that data is a matrix of shape (genes, cells). 12 | 13 | Returns a matrix of shape (cells, 8), using the first 8 SVD 14 | components. Why 8? It's an arbitrary selection... 
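
    A minimal usage sketch (`counts` is assumed to be a genes x cells
    array or scipy.sparse matrix):

        data_tsvd = preproc_data(counts, gene_subset=True)
        # data_tsvd has shape (cells, k), k <= 8, ready for KMeans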
15 | """ 16 | import uncurl 17 | from uncurl.preprocessing import log1p, cell_normalize 18 | from sklearn.decomposition import TruncatedSVD 19 | data_subset = data 20 | if gene_subset: 21 | gene_subset = uncurl.max_variance_genes(data) 22 | data_subset = data[gene_subset, :] 23 | tsvd = TruncatedSVD(min(8, data_subset.shape[0] - 1)) 24 | data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data_subset)).T) 25 | return data_tsvd 26 | 27 | def calculate_bounding_box(data): 28 | """ 29 | Returns a 2 x m array indicating the min and max along each 30 | dimension. 31 | """ 32 | mins = data.min(0) 33 | maxes = data.max(0) 34 | return mins, maxes 35 | 36 | def calculate_gap(data, clustering, km, B=50, **kwargs): 37 | """ 38 | See: https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/ 39 | 40 | https://web.stanford.edu/~hastie/Papers/gap.pdf 41 | 42 | Returns two results: the gap score, and s_k. 43 | """ 44 | k = len(set(clustering)) 45 | Wk = km.inertia_ 46 | mins, maxes = calculate_bounding_box(data) 47 | Wk_est = [] 48 | for i in range(B): 49 | data_sample = (maxes-mins)*np.random.random(data.shape) + mins 50 | km_b = KMeans(k) 51 | km_b.fit_predict(data_sample) 52 | Wk_est.append(km_b.inertia_) 53 | Wk_est = np.log(np.array(Wk_est)) 54 | Wk_mean = np.mean(Wk_est) 55 | Wk_std = np.std(Wk_est) 56 | gap = Wk_mean - np.log(Wk) 57 | sk = np.sqrt(1 + 1.0/B)*Wk_std 58 | return gap, sk 59 | 60 | 61 | def run_gap_k_selection(data, k_min=1, k_max=50, B=5, 62 | skip=5, **kwargs): 63 | """ 64 | Runs gap score for all k from k_min to k_max. 65 | """ 66 | if k_min == k_max: 67 | return k_min 68 | gap_vals = [] 69 | sk_vals = [] 70 | k_range = list(range(k_min, k_max, skip)) 71 | min_k = 0 72 | min_i = 0 73 | for i, k in enumerate(k_range): 74 | km = KMeans(k) 75 | clusters = km.fit_predict(data) 76 | gap, sk = calculate_gap(data, clusters, km, B=B) 77 | if len(gap_vals) > 1: 78 | if gap_vals[-1] >= gap - (skip+1)*sk: 79 | min_i = i 80 | min_k = k_range[i-1] 81 | break 82 | #return k_range[-1], gap_vals, sk_vals 83 | gap_vals.append(gap) 84 | sk_vals.append(sk) 85 | if min_k == 0: 86 | min_k = k_max 87 | if skip == 1: 88 | return min_k, gap_vals, sk_vals 89 | gap_vals = [] 90 | sk_vals = [] 91 | for k in range(min_k - skip, min_k + skip): 92 | km = KMeans(k) 93 | clusters = km.fit_predict(data) 94 | gap, sk = calculate_gap(data, clusters, km, B=B) 95 | if len(gap_vals) > 1: 96 | if gap_vals[-1] >= gap - sk: 97 | min_k = k-1 98 | return min_k, gap_vals, sk_vals 99 | gap_vals.append(gap) 100 | sk_vals.append(sk) 101 | return k, gap_vals, sk_vals 102 | 103 | -------------------------------------------------------------------------------- /uncurl/nb_state_estimation.py: -------------------------------------------------------------------------------- 1 | # state estimation with NB convex mixture model 2 | 3 | from .clustering import kmeans_pp 4 | from .nb_clustering import nb_fit, find_nb_genes 5 | from .state_estimation import initialize_from_assignments 6 | 7 | import numpy as np 8 | from scipy.optimize import minimize 9 | 10 | eps=1e-8 11 | 12 | def _create_w_objective(m, X, R): 13 | """ 14 | Creates an objective function and its derivative for W, given M and X (data) 15 | 16 | Args: 17 | m (array): genes x clusters 18 | X (array): genes x cells 19 | R (array): 1 x genes 20 | """ 21 | genes, clusters = m.shape 22 | cells = X.shape[1] 23 | R1 = R.reshape((genes, 1)).dot(np.ones((1, cells))) 24 | def objective(w): 25 | # convert w into a matrix first... 
because it's a vector for 26 | # optimization purposes 27 | w = w.reshape((m.shape[1], X.shape[1])) 28 | d = m.dot(w)+eps 29 | return np.sum((X + R1)*np.log(d + R1) - X*np.log(d))/genes 30 | def deriv(w): 31 | # derivative of objective wrt all elements of w 32 | # for w_{ij}, the derivative is... m_j1+...+m_jn sum over genes minus 33 | # x_ij 34 | w2 = w.reshape((m.shape[1], X.shape[1])) 35 | d = m.dot(w2)+eps 36 | temp = X/d 37 | temp2 = (X+R1)/(d+R1) 38 | m1 = m.T.dot(temp2) 39 | m2 = m.T.dot(temp) 40 | deriv = m1 - m2 41 | return deriv.flatten()/genes 42 | return objective, deriv 43 | 44 | def _create_m_objective(w, X, R): 45 | """ 46 | Creates an objective function and its derivative for M, given W and X 47 | 48 | Args: 49 | w (array): clusters x cells 50 | X (array): genes x cells 51 | R (array): 1 x genes 52 | """ 53 | clusters, cells = w.shape 54 | genes = X.shape[0] 55 | R1 = R.reshape((genes, 1)).dot(np.ones((1, cells))) 56 | def objective(m): 57 | m = m.reshape((X.shape[0], w.shape[0])) 58 | d = m.dot(w)+eps 59 | return np.sum((X+R1)*np.log(d + R1) - X*np.log(d))/genes 60 | def deriv(m): 61 | m2 = m.reshape((X.shape[0], w.shape[0])) 62 | d = m2.dot(w)+eps 63 | temp = X/d 64 | temp2 = (X+R1)/(d+R1) 65 | w1 = w.dot(temp2.T) 66 | w2 = w.dot(temp.T) 67 | deriv = w1.T - w2.T 68 | return deriv.flatten()/genes 69 | return objective, deriv 70 | 71 | def nb_estimate_state(data, clusters, R=None, init_means=None, init_weights=None, max_iters=10, tol=1e-4, disp=True, inner_max_iters=400, normalize=True): 72 | """ 73 | Uses a Negative Binomial Mixture model to estimate cell states and 74 | cell state mixing weights. 75 | 76 | If some of the genes do not fit a negative binomial distribution 77 | (mean > var), then the genes are discarded from the analysis. 78 | 79 | Args: 80 | data (array): genes x cells 81 | clusters (int): number of mixture components 82 | R (array, optional): vector of length genes containing the dispersion estimates for each gene. Default: use nb_fit 83 | init_means (array, optional): initial centers - genes x clusters. Default: kmeans++ initializations 84 | init_weights (array, optional): initial weights - clusters x cells. Default: random(0,1) 85 | max_iters (int, optional): maximum number of iterations. Default: 10 86 | tol (float, optional): if both M and W change by less than tol (in RMSE), then the iteration is stopped. Default: 1e-4 87 | disp (bool, optional): whether or not to display optimization parameters. Default: True 88 | inner_max_iters (int, optional): Number of iterations to run in the scipy minimizer for M and W. Default: 400 89 | normalize (bool, optional): True if the resulting W should sum to 1 for each cell. Default: True. 90 | 91 | Returns: 92 | M (array): genes x clusters - state centers 93 | W (array): clusters x cells - state mixing components for each cell 94 | R (array): 1 x genes - NB dispersion parameter for each gene 95 | ll (float): Log-likelihood of final iteration 96 | """ 97 | # TODO: deal with non-NB data... just ignore it? or do something else? 98 | data_subset = data.copy() 99 | genes, cells = data_subset.shape 100 | # 1. 
use nb_fit to get initial Rs
101 |     if R is None:
102 |         nb_indices = find_nb_genes(data)
103 |         data_subset = data[nb_indices, :]
104 |         if init_means is not None and len(init_means) > sum(nb_indices):
105 |             init_means = init_means[nb_indices, :]
106 |         genes, cells = data_subset.shape
107 |         R = np.zeros(genes)
108 |         P, R = nb_fit(data_subset)
109 |     if init_means is None:
110 |         means, assignments = kmeans_pp(data_subset, clusters)
111 |     else:
112 |         means = init_means.copy()
113 |     clusters = means.shape[1]
114 |     w_init = np.random.random(cells*clusters)
115 |     if init_weights is not None:
116 |         if len(init_weights.shape)==1:
117 |             init_weights = initialize_from_assignments(init_weights, clusters)
118 |         w_init = init_weights.reshape(cells*clusters)
119 |     m_init = means.reshape(genes*clusters)
120 |     ll = np.inf
121 |     # repeat steps 1 and 2 until convergence:
122 |     for i in range(max_iters):
123 |         if disp:
124 |             print('iter: {0}'.format(i))
125 |         w_bounds = [(0, 1.0) for x in w_init]
126 |         m_bounds = [(0, None) for x in m_init]
127 |         # step 1: given M, estimate W
128 |         w_objective, w_deriv = _create_w_objective(means, data_subset, R)
129 |         w_res = minimize(w_objective, w_init, method='L-BFGS-B', jac=w_deriv, bounds=w_bounds, options={'disp':disp, 'maxiter':inner_max_iters})
130 |         w_diff = np.sqrt(np.sum((w_res.x-w_init)**2))/w_init.size
131 |         w_new = w_res.x.reshape((clusters, cells))
132 |         w_init = w_res.x
133 |         # step 2: given W, update M
134 |         m_objective, m_deriv = _create_m_objective(w_new, data_subset, R)
135 |         # method could be 'L-BFGS-B' or 'SLSQP'... SLSQP gives a memory error...
136 |         # or use TNC...
137 |         m_res = minimize(m_objective, m_init, method='L-BFGS-B', jac=m_deriv, bounds=m_bounds, options={'disp':disp, 'maxiter':inner_max_iters})
138 |         m_diff = np.sqrt(np.sum((m_res.x-m_init)**2))/m_init.size
139 |         m_new = m_res.x.reshape((genes, clusters))
140 |         m_init = m_res.x
141 |         ll = m_res.fun
142 |         means = m_new
143 |         if w_diff < tol and m_diff < tol:
144 |             break
145 |     if normalize:
146 |         w_new = w_new/w_new.sum(0)
147 |     return m_new, w_new, R, ll
148 | 
--------------------------------------------------------------------------------
/uncurl/nolips_parallel.pyx:
--------------------------------------------------------------------------------
 1 | # parallel sparse implementation of nolips Poisson optimization
 2 | 
 3 | #import cython
 4 | cimport cython
 5 | 
 6 | from cython.parallel import prange
 7 | 
 8 | from scipy import sparse
 9 | 
10 | import numpy as np
11 | cimport numpy as np
12 | #DTYPE = np.double
13 | #ctypedef np.double_t DTYPE_t
14 | 
15 | # TODO: use fused types
16 | ctypedef fused int2:
17 |     short
18 |     int
19 |     long
20 |     long long
21 | 
22 | ctypedef fused DTYPE_t:
23 |     float
24 |     double
25 | 
26 | cdef double eps = 1e-10
27 | 
28 | @cython.boundscheck(False)
29 | @cython.wraparound(False)
30 | @cython.nonecheck(False)
31 | cdef inline void _update(int2 i, DTYPE_t[:] data_, int2[:] indices, int2[:] indptr, DTYPE_t[:,:] cij, double[:] R_view, double[:,:] M_view, double[:,:] W_view, double[:,:] Wnew_view, double lam, double eps, int2 k, double regularization) nogil:
32 |     # NoLips in-place update for a single cell/column of w.
33 |     # all these updates can run in parallel.
34 | cdef int2 start_ind = indptr[i] 35 | cdef int2 end_ind = indptr[i+1] 36 | cdef int2 g, k2, j, ind 37 | cdef double mw, divisor 38 | for ind in range(start_ind, end_ind): 39 | g = indices[ind] 40 | mw = eps 41 | for k2 in range(k): 42 | mw += M_view[g,k2]*W_view[k2,i] 43 | mw = data_[ind]/mw 44 | for j in range(k): 45 | cij[i,j] += M_view[g,j]*mw 46 | for j in range(k): 47 | # divisor has to be >= 0 48 | divisor = 1+lam*W_view[j,i]*(regularization + R_view[j]-cij[i,j]) 49 | if divisor > 0: 50 | Wnew_view[j,i] = W_view[j,i]/divisor 51 | else: 52 | Wnew_view[j,i] = 0.0 53 | 54 | @cython.boundscheck(False) 55 | @cython.wraparound(False) 56 | @cython.nonecheck(False) 57 | def sparse_nolips_update_w(np.ndarray[DTYPE_t, ndim=1] X_data, 58 | np.ndarray[int2, ndim=1] X_indices, 59 | np.ndarray[int2, ndim=1] X_indptr, 60 | int2 cells, 61 | int2 genes, 62 | np.ndarray[DTYPE_t, ndim=2] M, 63 | np.ndarray[DTYPE_t, ndim=2] W, 64 | np.ndarray[DTYPE_t, ndim=1] lams, 65 | np.ndarray[DTYPE_t, ndim=1] m_sum, int2 n_threads=4, disp=False, 66 | double regularization=0.0): 67 | """ 68 | Parallel nolips... 69 | 70 | Args: 71 | X (csc sparse array): data with shape genes x cells 72 | M (array): genes x k 73 | W (array): k x cells 74 | lams (array): 1/(2*X.sum(0)) - sum each column of X - has length cells 75 | m_sum (array): M.sum(0) 76 | n_threads (int2): number of threads 77 | disp (bool): currently unused 78 | regularization (double): regularization factor for L1 regularization 79 | 80 | Returns: 81 | Updated copy of W 82 | """ 83 | cdef int2 k = W.shape[0] 84 | cdef double[:,:] M_view = M 85 | #cdef np.ndarray[DTYPE_t, ndim=1] R = M.sum(0) 86 | cdef double[:] mw_view 87 | cdef double[:] R_view = m_sum 88 | cdef np.ndarray[DTYPE_t, ndim=1] z = np.zeros(k) 89 | cdef double lam, mw, xig 90 | #cdef np.ndarray[DTYPE_t, ndim=1] lams = 1/(2*Xsum) 91 | cdef double[:] lams_view = lams 92 | cdef double[:,:] Wnew_view = np.empty((k, cells), dtype=np.double) 93 | cdef double[:,:] W_view = W 94 | cdef Py_ssize_t i 95 | #X_csc = sparse.csc_matrix(X) 96 | # when there are more than 2^31 elements, will be long. 97 | # so this function won't work - have to deal with this in the calling 98 | # function. 99 | #cdef int2[:] indices, indptr 100 | cdef int2[:] indices = X_indices 101 | cdef int2[:] indptr = X_indptr 102 | cdef DTYPE_t[:] data_ = X_data 103 | cdef DTYPE_t[:,:] cij = np.zeros((cells, k)) 104 | # schedules: guided, 105 | for i in prange(cells, schedule="guided", nogil=True, num_threads=n_threads): 106 | _update(i, data_, indices, indptr, cij, R_view, M_view, W_view, Wnew_view, lams_view[i], eps, k, regularization) 107 | return np.asarray(Wnew_view) 108 | 109 | -------------------------------------------------------------------------------- /uncurl/plda_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import os 5 | import subprocess 6 | 7 | from scipy import sparse 8 | 9 | from uncurl.sparse_utils import sparse_create_plda_file 10 | 11 | PLDA_FOLDER = "/home/yjzhang/plda" 12 | eps=1e-10 13 | 14 | # Contains methods to process input and output files for PLDA. 
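# A typical round trip (sketch; file names are illustrative):
#   create_plda_file(matrix, 'plda/data.txt')   # write counts in PLDA format
#   M, W = plda_estimate_state(matrix, k)       # train + infer via mpi_lda
#   # or read results back directly:
#   M = parse_model_file('model.txt')
#   W = parse_result_file('result.txt')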
15 | 
16 | # Generates an input file for PLDA from the matrix
17 | # Assumes "matrix" is a Numpy array containing only integers, with dimensions (genes x cells)
18 | def create_plda_file(matrix, filename):
19 |     if sparse.issparse(matrix):
20 |         sparse_create_plda_file(matrix, filename)
21 |         return
22 |     f = open(filename, "w")
23 |     (r,c) = matrix.shape
24 |     strings = []
25 |     # PLDA input format requires one line per document (cell). Each line contains a sparse
26 |     # representation of the counts of the words (genes) present. Example:
27 |     # G1 12 G2 4 G5 6
28 |     # (G1 appears 12 times, G2 appears 4 times, G5 appears 6 times)
29 |     for i in range(c):
30 |         for j in range(r):
31 |             strings.append("G" + str(j) + " " + str(int(matrix[j,i])) + " ")
32 |         strings.append("\n")
33 |     f.write("".join(strings))
34 | 
35 | 
36 | # Parses the "model file" outputted by PLDA into a
37 | # (word x topic) (or gene x archetype) matrix.
38 | def parse_model_file(model_file, word_topic=None, included_genes=None):
39 |     f = open(model_file, "r")
40 |     lines = f.readlines()
41 |     num_words = len(lines) # There's 1 line for each word
42 |     num_topics = len(lines[1].split()) - 1
43 | 
44 |     if word_topic is None:
45 |         word_topic = np.zeros((num_words, num_topics))
46 |     if included_genes is None:
47 |         included_genes = np.arange(num_words)
48 |     for line in lines:
49 |         tokens = line.split()
50 |         gene_number = int(tokens[0][1:])
51 |         original_row_number = included_genes[gene_number]
52 |         word_topic[original_row_number, :] = np.array(list(map(float, tokens[1:])))
53 |     return word_topic
54 | 
55 | 
56 | # Parses the "inference result" matrix outputted by PLDA into a
57 | # (topic x document) (or archetype x cell) matrix.
58 | def parse_result_file(result_file):
59 |     document_topic = np.loadtxt(result_file, dtype="float")
60 |     return document_topic.T
61 | 
62 | 
63 | 
64 | # Given a PLDA input file (each line is a "document", with each word followed by
65 | # its count), return a corresponding data matrix.
66 | def parse_plda_input(input_file, num_columns):
67 |     f = open(input_file, "r")
68 |     lines = f.readlines()
69 |     num_lines = len(lines)
70 |     matrix = np.zeros((num_lines, num_columns))
71 |     row = 0
72 | 
73 |     for line in lines:
74 |         tokens = line.split()
75 |         i = 1
76 |         while i < len(tokens):
77 |             gene_number = int(tokens[i-1][1:])
78 |             matrix[row, gene_number] = int(tokens[i])
79 |             i += 2
80 |         row += 1
81 |     return matrix
82 | 
83 | 
84 | # Given a PLDA input file, runs PLDA to find the M/W matrices.
85 | # Note: please call "create_plda_file()" beforehand to create a PLDA input
86 | # file from your matrix.
87 | def plda_estimate_state(data, k, threads=4, num_iterations=150, plda_folder=None): 88 | if plda_folder is None: 89 | plda_folder = PLDA_FOLDER 90 | data_mean = np.array(data.mean(0)).flatten() 91 | try: 92 | os.mkdir('plda') 93 | except: 94 | pass 95 | filename = os.path.join(os.getcwd(), 'plda', 'data.txt') 96 | create_plda_file(data, filename) 97 | print("Training PLDA") 98 | train_args = ("mpiexec", "-n", str(threads), os.path.join(plda_folder, "mpi_lda"), 99 | "--num_topics", str(k), "--alpha", "0.1", 100 | "--beta", "0.01", "--training_data_file", filename, 101 | "--model_file", "model.txt", "--burn_in_iterations", "100", "--total_iterations", str(num_iterations)) 102 | subprocess.call(train_args) #, stdout=subprocess.PIPE) 103 | 104 | print("TRAINED") 105 | 106 | inference_args = (os.path.join(plda_folder, "infer"), "--alpha", "0.1", "--beta", 107 | "0.01", "--inference_data_file", filename, "--inference_result_file", 108 | "result.txt", "--model_file", "model.txt", "--total_iterations", 109 | "50", "--burn_in_iterations", "20") 110 | subprocess.call(inference_args) #, stdout=subprocess.PIPE) 111 | 112 | M = parse_model_file("model.txt") 113 | W = parse_result_file("result.txt") 114 | M *= (data_mean / np.mean(M)) 115 | W = W/W.sum(axis=0, keepdims=1) 116 | return M, W 117 | 118 | -------------------------------------------------------------------------------- /uncurl/pois_ll.py: -------------------------------------------------------------------------------- 1 | # Poisson log-likelihood 2 | 3 | import numpy as np 4 | from scipy import sparse 5 | from scipy.special import xlogy, gammaln 6 | 7 | from uncurl.sparse_utils import sparse_poisson_ll_csc 8 | 9 | eps = 1e-10 10 | 11 | def sparse_poisson_ll(data, means): 12 | data = sparse.csc_matrix(data) 13 | return sparse_poisson_ll_csc( 14 | data.data, 15 | data.indices, 16 | data.indptr, 17 | data.shape[0], 18 | data.shape[1], 19 | means, 20 | eps) 21 | 22 | def poisson_ll(data, means): 23 | """ 24 | Calculates the Poisson log-likelihood. 25 | 26 | Args: 27 | data (array): 2d numpy array of genes x cells 28 | means (array): 2d numpy array of genes x k 29 | 30 | Returns: 31 | cells x k array of log-likelihood for each cell/cluster pair 32 | """ 33 | if sparse.issparse(data): 34 | return sparse_poisson_ll(data, means) 35 | genes, cells = data.shape 36 | clusters = means.shape[1] 37 | ll = np.zeros((cells, clusters)) 38 | for i in range(clusters): 39 | means_i = np.tile(means[:,i], (cells, 1)) 40 | means_i = means_i.transpose() + eps 41 | #ll[:,i] = np.sum(xlogy(data, means_i) - gammaln(data+1) - means_i, 0) 42 | ll[:,i] = np.sum(xlogy(data, means_i) - means_i, 0) 43 | return ll 44 | 45 | def poisson_ll_2(p1, p2): 46 | """ 47 | Calculates Poisson LL(p1|p2). 48 | """ 49 | p1_1 = p1 + eps 50 | p2_1 = p2 + eps 51 | return np.sum(-p2_1 + p1_1*np.log(p2_1)) 52 | 53 | def poisson_dist(p1, p2): 54 | """ 55 | Calculates the Poisson distance between two vectors. 56 | 57 | p1 can be a sparse matrix, while p2 has to be a dense matrix. 58 | """ 59 | # ugh... 60 | p1_ = p1 + eps 61 | p2_ = p2 + eps 62 | return np.dot(p1_-p2_, np.log(p1_/p2_)) 63 | 64 | -------------------------------------------------------------------------------- /uncurl/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Misc functions... 
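Preprocessing utilities: per-gene mean/variance for sparse matrices,
max-variance gene selection, per-cell count normalization, and log1p.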
 3 | """
 4 | 
 5 | import numpy as np
 6 | from scipy import sparse
 7 | 
 8 | from uncurl.sparse_utils import sparse_cell_normalize, sparse_means_var_csc
 9 | 
10 | def sparse_mean_var(data):
11 |     """
12 |     Calculates the variance for each row of a sparse matrix,
13 |     using the relationship Var = E[x^2] - E[x]^2.
14 | 
15 |     Returns:
16 |         pair of matrices mean, variance.
17 |     """
18 |     data = sparse.csc_matrix(data)
19 |     return sparse_means_var_csc(data.data,
20 |             data.indices,
21 |             data.indptr,
22 |             data.shape[1],
23 |             data.shape[0])
24 | 
25 | def max_variance_genes(data, nbins=5, frac=0.2):
26 |     """
27 |     This function identifies the genes that have the max variance
28 |     across a number of bins sorted by mean.
29 | 
30 |     Args:
31 |         data (array): genes x cells
32 |         nbins (int): number of bins to sort genes by mean expression level. Default: 5.
33 |         frac (float): fraction of genes to return per bin - between 0 and 1. Default: 0.2
34 | 
35 |     Returns:
36 |         list of gene indices (list of ints)
37 |     """
38 |     # TODO: profile, make more efficient for large matrices
39 |     # 8000 cells: 0.325 seconds
40 |     # top time: sparse.csc_tocsr, csc_matvec, astype, copy, mul_scalar
41 |     # 73233 cells: 5.347 seconds, 4.762 s in sparse_var
42 |     # csc_tocsr: 1.736 s
43 |     # copy: 1.028 s
44 |     # astype: 0.999 s
45 |     # there is almost certainly something superlinear in this method
46 |     # maybe it's to_csr?
47 |     indices = []
48 |     if sparse.issparse(data):
49 |         means, var = sparse_mean_var(data)
50 |     else:
51 |         means = data.mean(1)
52 |         var = data.var(1)
53 |     mean_indices = means.argsort()
54 |     n_elements = int(data.shape[0]/nbins)
55 |     frac_elements = int(n_elements*frac)
56 |     for i in range(nbins):
57 |         bin_i = mean_indices[i*n_elements : (i+1)*n_elements]
58 |         if i==nbins-1:
59 |             bin_i = mean_indices[i*n_elements :]
60 |         var_i = var[bin_i]
61 |         var_sorted = var_i.argsort()
62 |         top_var_indices = var_sorted[len(bin_i) - frac_elements:]
63 |         ind = bin_i[top_var_indices]
64 |         # filter out genes with zero variance
65 |         ind = [index for index in ind if var[index]>0]
66 |         indices.extend(ind)
67 |     return indices
68 | 
69 | def cell_normalize(data, multiply_means=True):
70 |     """
71 |     Returns the data where the expression is normalized so that the total
72 |     count per cell is equal.
73 | 
74 |     If multiply_means is true, then the data will be multiplied to have the median UMI count for all cells.
75 |     """
76 |     if sparse.issparse(data):
77 |         data = sparse.csc_matrix(data.astype(float))
78 |         # normalize in-place
79 |         sparse_cell_normalize(data.data,
80 |                 data.indices,
81 |                 data.indptr,
82 |                 data.shape[1],
83 |                 data.shape[0],
84 |                 multiply_means)
85 |         return data
86 |     data_norm = data.astype(float)
87 |     total_umis = []
88 |     for i in range(data.shape[1]):
89 |         di = data_norm[:,i]
90 |         total_umis.append(di.sum())
91 |         di /= total_umis[i]
92 |     if multiply_means:
93 |         med = np.median(total_umis)
94 |         data_norm *= med
95 |     return data_norm
96 | 
97 | def log1p(data):
98 |     """
99 |     Returns ln(data+1), whether the original data is dense or sparse.
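
    For example, log1p(cell_normalize(data)) is the transform used before
    TruncatedSVD in gap_score.preproc_data.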
100 | """ 101 | if sparse.issparse(data): 102 | return data.log1p() 103 | else: 104 | return np.log1p(data) 105 | -------------------------------------------------------------------------------- /uncurl/run_se.py: -------------------------------------------------------------------------------- 1 | # state estimation with poisson convex mixture model 2 | 3 | from .state_estimation import poisson_estimate_state 4 | from .nb_state_estimation import nb_estimate_state 5 | from .zip_state_estimation import zip_estimate_state 6 | from .nmf_wrapper import log_norm_nmf, norm_nmf 7 | 8 | import numpy as np 9 | from scipy import sparse 10 | 11 | def run_state_estimation(data, clusters, dist='Poiss', reps=1, **kwargs): 12 | """ 13 | Runs state estimation for multiple initializations, returning the result with the highest log-likelihood. All the arguments are passed to the underlying state estimation functions (poisson_estimate_state, nb_estimate_state, zip_estimate_state). 14 | 15 | Args: 16 | data (array): genes x cells 17 | clusters (int): number of mixture components. If this is set to 0, this is automatically estimated using gap score. 18 | dist (str, optional): Distribution used in state estimation. Options: 'Poiss', 'NB', 'ZIP', 'LogNorm', 'Gaussian'. Default: 'Poiss' 19 | reps (int, optional): number of times to run the state estimation, taking the result with the highest log-likelihood. 20 | **kwargs: arguments to pass to the underlying state estimation function. 21 | 22 | Returns: 23 | M (array): genes x clusters - state means 24 | W (array): clusters x cells - state mixing components for each cell 25 | ll (float): final log-likelihood 26 | """ 27 | clusters = int(clusters) 28 | func = poisson_estimate_state 29 | dist = dist.lower() 30 | if dist=='poiss' or dist=='poisson': 31 | pass 32 | elif dist=='nb': 33 | func = nb_estimate_state 34 | elif dist=='zip': 35 | func = zip_estimate_state 36 | elif dist=='lognorm' or dist=='log-normal' or dist=='lognormal': 37 | func = log_norm_nmf 38 | elif dist=='gaussian' or dist=='norm' or dist=='normal': 39 | func = norm_nmf 40 | elif dist=='none': 41 | func = run_baseline 42 | else: 43 | print('dist should be one of Poiss, NB, ZIP, LogNorm, or Gaussian. 
Using Poiss.')
44 |     # TODO: estimate number of clusters
45 |     if clusters == 0:
46 |         from .gap_score import run_gap_k_selection, preproc_data
47 |         data_tsvd = preproc_data(data, gene_subset=False)
48 |         max_k, gap_vals, sk_vals = run_gap_k_selection(data_tsvd,
49 |                 k_min=1, k_max=50, skip=5, B=6)
50 |         clusters = min(max_k, data.shape[0] - 1, data.shape[1] - 1)
51 |     best_ll = np.inf
52 |     best_M = None
53 |     best_W = None
54 |     for i in range(reps):
55 |         results = func(data, clusters, **kwargs)
56 |         M = results[0]
57 |         W = results[1]
58 |         if dist=='nb':  # dist was lower-cased above; nb returns (M, W, R, ll)
59 |             ll = results[3]
60 |         else:
61 |             ll = results[2]
62 |         if ll < best_ll:
63 |             best_ll = ll
64 |             best_M = M
65 |             best_W = W
66 |     return best_M, best_W, best_ll
67 | 
68 | 
69 | def run_baseline(data, clusters, **kwargs):
70 |     """
71 |     Run "baseline" tSVD + k-means
72 |     """
73 |     from .state_estimation import initialize_means_weights
74 |     m, w = initialize_means_weights(data, clusters, initialization='tsvd',
75 |             max_assign_weight=0.95, use_log_norm=False)
76 |     return m, w, 0
77 | 
--------------------------------------------------------------------------------
/uncurl/sampling.py:
--------------------------------------------------------------------------------
 1 | # downsampling count datasets (for comparisons)
 2 | 
 3 | import numpy as np
 4 | 
 5 | from scipy import sparse
 6 | 
 7 | def downsample(data, percent):
 8 |     """
 9 |     downsample the data by removing a given percentage of the reads.
10 | 
11 |     Args:
12 |         data: genes x cells array or sparse matrix
13 |         percent: float between 0 and 1
14 |     """
15 |     n_genes = data.shape[0]
16 |     n_cells = data.shape[1]
17 |     new_data = data.copy()
18 |     total_count = float(data.sum())
19 |     to_remove = int(total_count*percent)  # multinomial requires an integer count
20 |     # sum of read counts per cell
21 |     cell_sums = data.sum(0).astype(float)
22 |     # probability of selecting genes per cell
23 |     cell_gene_probs = data/cell_sums
24 |     # probability of selecting cells
25 |     cell_probs = np.array(cell_sums/total_count).flatten()
26 |     cells_selected = np.random.multinomial(to_remove, pvals=cell_probs)
27 |     for i, num_selected in enumerate(cells_selected):
28 |         cell_gene = np.array(cell_gene_probs[:,i]).flatten()
29 |         genes_selected = np.random.multinomial(num_selected, pvals=cell_gene)
30 |         if sparse.issparse(data):
31 |             genes_selected = sparse.csc_matrix(genes_selected).T
32 |         new_data[:,i] -= genes_selected
33 |     new_data[new_data < 0] = 0
34 |     return new_data
35 | 
--------------------------------------------------------------------------------
/uncurl/vis.py:
--------------------------------------------------------------------------------
 1 | # basic functions for visualization of clustering, state estimation, lineage
 2 | 
 3 | import matplotlib.pyplot as plt
 4 | from sklearn.decomposition import PCA
 5 | 
 6 | def visualize_poisson_w(w, labels, filename, method='pca', figsize=(18,10), title='', **scatter_options):
 7 |     """
 8 |     Saves a scatter plot of a visualization of W, the result from Poisson SE.
 9 |     """
10 |     if method == 'pca':
11 |         pca = PCA(2)
12 |         r_dim_red = pca.fit_transform(w.T).T
13 |     elif method == 'tsne':
14 |         return  # tsne is not implemented yet; fall-through would hit an undefined r_dim_red
15 |     else:
16 |         print("Method is not available. use 'pca' (default) or 'tsne'.")
17 |         return
18 |     visualize_dim_red(r_dim_red, labels, filename, figsize, title, **scatter_options)
19 | 
20 | def visualize_dim_red(r, labels, filename=None, figsize=(18,10), title='', legend=True, label_map=None, label_scale=False, label_color_map=None, **scatter_options):
21 |     """
22 |     Saves a scatter plot of a (2,n) matrix r, where each column is a cell.
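    In this package, r typically comes from mds() or diffusion_mds() with d=2.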
23 | 
24 |     Args:
25 |         r (array): (2,n) matrix
26 |         labels (array): (n,) array of ints/strings or floats. Can be None.
27 |         filename (string): string to save the output graph. If None, then this just displays the plot.
28 |         figsize (tuple): Default: (18, 10)
29 |         title (string): graph title
30 |         legend (bool): Default: True
31 |         label_map (dict): map of labels to label names. Default: None
32 |         label_scale (bool): True if labels should be treated as floats. Default: False
33 |         label_color_map (array): (n,) array or list of colors for each label.
34 |     """
35 |     fig = plt.figure(figsize=figsize)
36 |     plt.cla()
37 |     if not label_scale:
38 |         for i in set(labels):
39 |             label = i
40 |             if label_map is not None:
41 |                 label = label_map[i]
42 |             if label_color_map is not None:
43 |                 c = label_color_map[i]
44 |                 plt.scatter(r[0, labels==i], r[1, labels==i], label=label, c=c, **scatter_options)
45 |             else:
46 |                 plt.scatter(r[0, labels==i], r[1, labels==i], label=label, **scatter_options)
47 |     else:
48 |         if labels is None:
49 |             plt.scatter(r[0,:], r[1,:], **scatter_options)
50 |         else:
51 |             plt.scatter(r[0,:], r[1,:], c=labels/labels.max(), **scatter_options)
52 |     plt.title(title)
53 |     if legend:
54 |         plt.legend()
55 |     if filename is not None:
56 |         plt.savefig(filename, dpi=100)
57 |         plt.close()
58 |     return fig
59 | 
--------------------------------------------------------------------------------
/uncurl/zip_clustering.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy import sparse
 3 | from scipy.optimize import minimize
 4 | 
 5 | from .clustering import kmeans_pp
 6 | from .zip_utils import zip_ll, zip_ll_row
 7 | 
 8 | eps = 1e-8
 9 | 
10 | 
11 | def zip_fit_params(data):
12 |     """
13 |     Returns the ZIP parameters that best fit a given data set.
14 | 
15 |     Args:
16 |         data (array): 2d array of genes x cells belonging to a given cluster
17 | 
18 |     Returns:
19 |         L (array): 1d array of means
20 |         M (array): 1d array of zero-inflation parameter
21 |     """
22 |     genes, cells = data.shape
23 |     m = data.mean(1)
24 |     v = data.var(1)
25 |     M = (v-m)/(m**2+v-m)
26 |     #M = v/(v+m**2)
27 |     #M[np.isnan(M)] = 0.0
28 |     M = np.array([min(1.0, max(0.0, x)) for x in M])
29 |     L = m + v/m - 1.0
30 |     #L = (v + m**2)/m
31 |     L[np.isnan(L)] = 0.0
32 |     L = np.array([max(0.0, x) for x in L])
33 |     return L, M
34 | 
35 | def zip_fit_params_mle(data):
36 |     genes, cells = data.shape
37 |     L, M = zip_fit_params(data)
38 |     for i in range(genes):
39 |         result = minimize(zip_ll_row, [L[i], M[i]], args=(data[i,:],),
40 |                 bounds=[(eps, None),(0,1)])
41 |         params = result.x
42 |         L[i] = params[0]
43 |         M[i] = params[1]
44 |     return L, M
45 | 
46 | def zip_cluster(data, k, init=None, max_iters=100):
47 |     """
48 |     Performs hard EM clustering using the zero-inflated Poisson distribution.
49 | 
50 |     Args:
51 |         data (array): A 2d array- genes x cells
52 |         k (int): Number of clusters
53 |         init (array, optional): Initial centers - genes x k array. Default: None, use kmeans++
54 |         max_iters (int, optional): Maximum number of iterations.
Default: 100 55 | 56 | Returns: 57 | assignments (array): integer assignments of cells to clusters (length cells) 58 | L (array): Poisson parameter (genes x k) 59 | M (array): zero-inflation parameter (genes x k) 60 | """ 61 | genes, cells = data.shape 62 | init, new_assignments = kmeans_pp(data+eps, k, centers=init) 63 | centers = np.copy(init) 64 | M = np.zeros(centers.shape) 65 | assignments = new_assignments 66 | for c in range(k): 67 | centers[:,c], M[:,c] = zip_fit_params_mle(data[:, assignments==c]) 68 | for it in range(max_iters): 69 | lls = zip_ll(data, centers, M) 70 | new_assignments = np.argmax(lls, 1) 71 | if np.equal(assignments, new_assignments).all(): 72 | return assignments, centers, M 73 | for c in range(k): 74 | centers[:,c], M[:,c] = zip_fit_params_mle(data[:, assignments==c]) 75 | assignments = new_assignments 76 | return assignments, centers, M 77 | 78 | -------------------------------------------------------------------------------- /uncurl/zip_state_estimation.py: -------------------------------------------------------------------------------- 1 | # state estimation with Zero-Inflated Poisson model 2 | # TODO 3 | 4 | from .clustering import kmeans_pp 5 | from .zip_clustering import zip_fit_params_mle 6 | from .state_estimation import initialize_from_assignments 7 | 8 | import numpy as np 9 | from scipy.optimize import minimize 10 | 11 | eps=1e-8 12 | 13 | def _create_w_objective(m, X, Z=None): 14 | """ 15 | Creates an objective function and its derivative for W, given M and X (data) 16 | 17 | Args: 18 | m (array): genes x clusters 19 | X (array): genes x cells 20 | Z (array): zero-inflation parameters - genes x 1 21 | """ 22 | genes, clusters = m.shape 23 | cells = X.shape[1] 24 | nonzeros = (X!=0) 25 | def objective(w): 26 | # convert w into a matrix first... because it's a vector for 27 | # optimization purposes 28 | w = w.reshape((m.shape[1], X.shape[1])) 29 | d = m.dot(w)+eps 30 | # derivative of objective wrt all elements of w 31 | # for w_{ij}, the derivative is... m_j1+...+m_jn sum over genes minus 32 | # x_ij 33 | temp = X/d 34 | m_sum = m.T.dot(nonzeros) 35 | m2 = m.T.dot(temp) 36 | deriv = m_sum - m2 37 | return np.sum(nonzeros*(d - X*np.log(d)))/genes, deriv.flatten()/genes 38 | return objective 39 | 40 | def _create_m_objective(w, X, Z=None): 41 | """ 42 | Creates an objective function and its derivative for M, given W and X 43 | 44 | Args: 45 | w (array): clusters x cells 46 | X (array): genes x cells 47 | Z (array): zero-inflation parameters - genes x 1 48 | """ 49 | clusters, cells = w.shape 50 | genes = X.shape[0] 51 | nonzeros = (X!=0) 52 | def objective(m): 53 | m = m.reshape((X.shape[0], w.shape[0])) 54 | d = m.dot(w)+eps 55 | temp = nonzeros*(X/d) 56 | w_sum = w.dot(nonzeros.T) 57 | w2 = w.dot(temp.T) 58 | deriv = w_sum.T - w2.T 59 | return np.sum(nonzeros*(d - X*np.log(d)))/genes, deriv.flatten()/genes 60 | return objective 61 | 62 | 63 | 64 | def zip_estimate_state(data, clusters, init_means=None, init_weights=None, max_iters=10, tol=1e-4, disp=True, inner_max_iters=400, normalize=True): 65 | """ 66 | Uses a Zero-inflated Poisson Mixture model to estimate cell states and 67 | cell state mixing weights. 68 | 69 | Args: 70 | data (array): genes x cells 71 | clusters (int): number of mixture components 72 | init_means (array, optional): initial centers - genes x clusters. Default: kmeans++ initializations 73 | init_weights (array, optional): initial weights - clusters x cells. 
--------------------------------------------------------------------------------
/uncurl/zip_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import poisson
3 | from scipy.special import xlogy
4 | 
5 | eps = 1e-10
6 | 
7 | 
8 | def zip_ll(data, means, M):
9 |     """
10 |     Calculates the zero-inflated Poisson log-likelihood.
11 | 
12 |     Args:
13 |         data (array): genes x cells
14 |         means (array): genes x k
15 |         M (array): genes x k - the zero-inflation parameters
16 | 
17 |     Returns:
18 |         cells x k array of log-likelihoods, one per cell/cluster pair.
19 |     """
20 |     genes, cells = data.shape
21 |     clusters = means.shape[1]
22 |     ll = np.zeros((cells, clusters))
23 |     d0 = (data==0)
24 |     d1 = (data>0)
25 |     for i in range(clusters):
26 |         means_i = np.tile(means[:,i], (cells, 1))
27 |         means_i = means_i.transpose()
28 |         M_i = np.tile(M[:,i], (cells, 1))
29 |         M_i = M_i.transpose()
30 |         # P(x=0) = pi + (1-pi)*exp(-lambda)
31 |         ll_0 = np.log(M_i + (1 - M_i)*np.exp(-means_i))
32 |         # when pi is 0 the expression above can underflow to log(0);
33 |         # the exact value in that case is -lambda
34 |         ll_0 = np.where(M_i==0, -means_i, ll_0)
35 |         # P(x>0) = (1-pi)*Poisson(x; lambda), not including constant factors
36 |         ll_1 = np.log(1 - M_i) + xlogy(data, means_i) - means_i
37 |         ll_0 = np.where(d0, ll_0, 0.0)
38 |         ll_1 = np.where(d1, ll_1, 0.0)
39 |         ll[:,i] = np.sum(ll_0 + ll_1, 0)
40 |     return ll
41 | 
42 | def zip_ll_row(params, data_row):
43 |     """
44 |     Returns the negative log-likelihood of a single gene's counts under a
45 |     ZIP model with the given parameters.
46 | 
47 |     Args:
48 |         params (list): [lambda, zero-inflation]
49 |         data_row (array): 1d array of counts
50 | 
51 |     Returns:
52 |         negative log-likelihood
53 |     """
54 |     l = params[0]
55 |     pi = params[1]
56 |     d0 = (data_row==0)
57 |     # ZIP pmf: pi at zero plus (1-pi) times the Poisson pmf; eps guards log(0)
58 |     likelihood = d0*pi + (1-pi)*poisson.pmf(data_row, l)
59 |     return -np.log(likelihood+eps).sum()
60 | 
--------------------------------------------------------------------------------
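For reference, the per-entry log-likelihood that zip_ll accumulates (with the constant log(x!) term dropped, as the code notes) can be written as

\log p(x \mid \lambda, \pi) =
\begin{cases}
\log\bigl(\pi + (1 - \pi)\,e^{-\lambda}\bigr), & x = 0,\\
\log(1 - \pi) + x \log \lambda - \lambda, & x > 0.
\end{cases}

zip_ll_row evaluates the same density through scipy's poisson.pmf (so the log(x!) constant is included there) and returns the negated sum, which is the form scipy.optimize.minimize expects in zip_fit_params_mle.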
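To make those conventions concrete, the following sketch (illustrative only; it assumes uncurl is importable) checks zip_ll against a direct per-entry computation with scipy. Since zip_ll drops the log(x!) constant, and that constant is identical across clusters for a given cell, the two computations should differ by exactly that per-cell constant:

import numpy as np
from scipy.special import gammaln
from scipy.stats import poisson

from uncurl.zip_utils import zip_ll

np.random.seed(0)
genes, cells, k = 5, 4, 2
data = np.random.poisson(3, size=(genes, cells))
means = np.random.uniform(1, 5, size=(genes, k))
M = np.random.uniform(0.1, 0.4, size=(genes, k))

ll = zip_ll(data, means, M)

# direct computation: exact ZIP log-pmf summed over genes
direct = np.zeros((cells, k))
for c in range(cells):
    for j in range(k):
        p = M[:, j]*(data[:, c] == 0) + (1 - M[:, j])*poisson.pmf(data[:, c], means[:, j])
        direct[c, j] = np.log(p).sum()

# zip_ll omits sum_g log(x_gc!), which does not depend on the cluster
const = gammaln(data + 1).sum(0)  # per-cell constant
assert np.allclose(ll, direct + const[:, None])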