├── .gitignore ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── bin └── scHPF ├── docs ├── Makefile ├── changelog.rst ├── cli-man.rst ├── conf.py ├── genelists.rst ├── img │ ├── cell-type-rep-01.png │ └── k_selection_minifig-01.png ├── index.rst ├── install.rst ├── make.bat ├── prep-cli.rst ├── project.rst ├── references.rst ├── score-cli.rst ├── select-k.rst └── train-cli.rst ├── resources ├── README.md ├── gencode.v24.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── gencode.v31.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── gencode.vM10.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt └── gencode.vM19.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── schpf ├── __init__.py ├── _version.py ├── hpf_numba.py ├── loss.py ├── preprocessing.py ├── scHPF_.py └── util.py ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── _data ├── PJ030merge.c300t400_g0t500.matrix.txt └── sample_blacklist.txt ├── conftest.py ├── test_inference.py ├── test_misc.py ├── test_preprocessing.py ├── test_scHPF_model.py └── test_util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | 
formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - method: pip 24 | path: . 25 | extra_requirements: 26 | - docs 27 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at h.mendes.levitin@columbia.edu. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Sims Lab. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single-cell Hierarchical Poisson Factorization 2 | 3 | ## About 4 | scHPF is a tool for _de novo_ discovery of both discrete and continuous expression patterns in single-cell RNA\-sequencing (scRNA-seq). We find that scHPF’s sparse low-dimensional representations, non-negativity, and explicit modeling of variable sparsity across genes and cells produce highly interpretable factors. 
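A minimal sketch of the Python API, mirroring the calls the bundled command-line interface makes internally (`run_trials`, `cell_score`, `gene_score`); the file names and the choice of `nfactors`/`ntrials` below are only illustrative:

```python
import joblib
from scipy.io import mmread
from schpf import run_trials

# load a prepped UMI-count matrix (the `scHPF prep` output)
X = mmread('filtered.mtx')

# run several trials at K=10; the trial with the best loss is returned
model = run_trials(X, nfactors=10, ntrials=5)

# trained models serialize with joblib
joblib.dump(model, 'scHPF_K10.joblib')

# per-cell and per-gene factor scores, one column per factor
cell_score = model.cell_score()
gene_score = model.gene_score()
```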
5 | 
6 | - [Documentation](https://schpf.readthedocs.io/en/latest/) 
7 | - [Changelog](https://schpf.readthedocs.io/en/latest/changelog.html) 
8 | - [Paper at Molecular Systems Biology](https://doi.org/10.15252/msb.20188557) 
9 | - [Application to human tissue T cells across multiple donors and tissues](https://doi.org/10.1038/s41467-019-12464-3) 
10 | 
11 | ## Installation 
12 | ### Environment & Dependencies 
13 | scHPF requires Python >= 3.6 and the packages: 
14 | - numba ([version needed depends on Python version](https://schpf.readthedocs.io/en/latest/install.html#numba-compatibility), but should be safe with 0.45) 
15 | - scikit-learn 
16 | - pandas 
17 | - (optional) loompy 
18 | 
19 | The easiest way to set up an environment for scHPF is with the Anaconda 
20 | Python distribution in [Miniconda](https://conda.io/miniconda.html) or 
21 | [anaconda](https://www.continuum.io/downloads): 
22 | 
23 | ``` 
24 | conda create -n schpf_p37 python=3.7 scikit-learn numba=0.50 pandas 
25 | 
26 | # for newer anaconda versions 
27 | conda activate schpf_p37 
28 | # XOR older anaconda versions 
29 | source activate schpf_p37 
30 | 
31 | # Optional, for using loom files as input to preprocessing 
32 | pip install -U loompy 
33 | ``` 
34 | 
35 | ### Installing from source 
36 | Once you have set up the environment, clone this repository and install. 
37 | ``` 
38 | git clone git@github.com:simslab/scHPF.git 
39 | cd scHPF 
40 | pip install . 
41 | ``` 
42 | 
43 | ### Testing your installation 
44 | This step is important because not all micro-versions of numba play nicely with 
45 | all micro-versions of Python or numpy, and sometimes issues vary across 
46 | machines. Testing will catch some but not all such issues. From the scHPF base 
47 | directory do: 
48 | ``` 
49 | conda install pytest 
50 | pytest 
51 | ``` 
52 | Please get in touch if tests fail, or if you get segmentation faults or very 
53 | long train times with no automatic parallelization, and I'm happy to try to 
54 | help. 
55 | 
56 | ## Quick Start: Command Line Interface 
57 | 
58 | 1. [Prepare your data](https://schpf.readthedocs.io/en/latest/prep-cli.html). 
59 | 
60 | 2. [Train a model](https://schpf.readthedocs.io/en/latest/train-cli.html). 
61 | 
62 | 3. [Get gene and cell scores](https://schpf.readthedocs.io/en/latest/score-cli.html). 
63 | 
64 | 
65 | ## API 
66 | scHPF has a scikit-learn-like API, sketched in the example above. Trained models are stored in a serialized 
67 | joblib format. 
68 | 
69 | 
70 | ## Help and support 
71 | If you have any questions/errors/issues, please [open an issue](https://github.com/simslab/scHPF/issues/new) 
72 | and I'll be happy to provide whatever help and guidance I can. 
73 | 
74 | 
75 | ## Contributing 
76 | Contributions to scHPF are welcome. Please get in touch if you would like to 
77 | discuss/check whether it's something I've already done but haven't pushed to master yet. 
78 | To contribute, please [fork 
79 | scHPF](https://github.com/simslab/scHPF/issues#fork-destination-box), make your 
80 | changes, and submit a pull request. 
81 | 
82 | ## References 
83 | Hanna Mendes Levitin, Jinzhou Yuan, Yim Ling Cheng, Francisco JR Ruiz, Erin C Bush, 
84 | Jeffrey N Bruce, Peter Canoll, Antonio Iavarone, Anna Lasorella, David M Blei, Peter A Sims. 
85 | __"*De novo* gene signature identification from single‐cell RNA‐seq with hierarchical Poisson 
86 | factorization."__ Molecular Systems Biology, 2019. [[Open access article]](http://msb.embopress.org/content/15/2/e8557.full.pdf) 
87 | 
88 | Peter A. Szabo\*, Hanna Mendes Levitin\*, Michelle Miron, Mark E.
Snyder, 89 | Takashi Senda, Jinzhou Yuan, Yim Ling Cheng, Erin C. Bush, Pranay Dogra, Puspa 90 | Thapa, Donna L. Farber, Peter A. Sims. __"Single-cell transcriptomics of human 91 | T cells reveals tissue and activation signatures in health and disease."__ Nature Communications, 2019. 92 | [[Open access article]](https://doi.org/10.1038/s41467-019-12464-3) 93 | \* Co-first authors 94 | 95 | -------------------------------------------------------------------------------- /bin/scHPF: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import json 7 | import time 8 | from functools import partial 9 | 10 | if not sys.warnoptions: 11 | import warnings 12 | warnings.simplefilter("ignore") 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from scipy.io import mmread, mmwrite 17 | from scipy.sparse import coo_matrix 18 | import joblib 19 | 20 | from schpf import scHPF, run_trials, run_trials_pool 21 | from schpf.util import max_pairwise_table, mean_cellscore_fraction_list 22 | from schpf.preprocessing import load_coo, load_and_filter, load_like 23 | from schpf.preprocessing import split_validation_cells 24 | 25 | def _parser(): 26 | # usage = """scHPF [] 27 | 28 | # The most commonly used scHPF commands are: 29 | # prep Prepare data 30 | # train Train a model from data 31 | # score Get cell-scores, gene-scores, and other data 32 | 33 | # Some advanced scHPF commands are: 34 | # prep-like Prepare data with the same genes & order as other data 35 | # project Project data onto a pre-trained model 36 | # train-pool Train a model, parallelized at the level of trials 37 | # """ 38 | parser = argparse.ArgumentParser() 39 | subparsers = parser.add_subparsers(dest='cmd') 40 | 41 | ### Preprocess command 42 | prep = subparsers.add_parser('prep', 43 | help='Prepare data for training') 44 | # data 45 | prep.add_argument('-i', '--input', required=True, 46 | help='Input data. Currently accepts either: (1) a whitespace-' 47 | 'delimited gene by cell UMI count matrix with 2 leading columns ' 48 | 'of gene attributes (ENSEMBL_ID and GENE_NAME respectively), or ' 49 | '(2) a loom file with at least one of the row attributes ' 50 | '`Accession` or `Gene`, where `Accession` is an ENSEMBL id and ' 51 | '`Gene` is the name.' 52 | ) 53 | prep.add_argument('-o', '--outdir', 54 | help='Output directory. Does not need to exist.') 55 | prep.add_argument('-p', '--prefix', default='', 56 | help='Prefix for output files. Optional.') 57 | 58 | # gene filtering criteria 59 | prep.add_argument('-m', '--min-cells', type=float, default=0.01, 60 | help='Minimum number of cells in which we must observe at ' 61 | 'least one transcript of a gene for the gene to pass ' 62 | 'filtering. If 0 <`min_cells`< 1, sets threshold to be ' 63 | '`min_cells` * ncells, rounded to the nearest integer.' 64 | ' [Default 0.01]') 65 | prep.add_argument('-w', '--whitelist', default='', 66 | help='Tab-delimited file where first column contains ENSEMBL gene ' 67 | 'ids to accept, and second column contains corresponding gene ' 68 | 'names. If given, genes not on the whitelist are filtered from ' 69 | 'the input matrix. Superseded by blacklist. Optional.') 70 | prep.add_argument('-b', '--blacklist', default='', 71 | help='Tab-delimited file where first column contains ENSEMBL gene ' 72 | 'ids to exclude, and second column is the corresponding gene name. ' 73 | 'Only performed if file given. 
Genes on the blacklist are ' 
74 | 'excluded even if they are also on the whitelist. Optional.') 
75 | 
76 | # optional selection of cells for validation set 
77 | prep.add_argument('-nvc', '--n-validation-cells', type=int, default=0, 
78 | help='Number of cells to randomly select for validation.') 
79 | prep.add_argument('-vgid', '--validation-group-ids', default=None, 
80 | help= 'Single column file of cell group ids readable with ' 
81 | ' np.loadtxt. If `--n-validation-cells` is > 0, cells will be ' 
82 | ' randomly selected approximately evenly across the groups in ' 
83 | ' this file, under the constraint that at most' 
84 | ' `--validation-max-group-frac` * (ncells in group) are selected' 
85 | ' from every group.') 
86 | prep.add_argument('--validation-max-group-frac', type=float, default=0.5, 
87 | help='If `-nvc`>0 and `validation-group-ids` is a valid file, at' 
88 | ' most `validation-max-group-frac`*(ncells in group) cells are' 
89 | ' selected from each group.') 
90 | 
91 | # other options 
92 | prep.add_argument('--filter-by-gene-name', default=False, 
93 | action='store_true', help='Use gene name rather than ENSEMBL' 
94 | ' id to filter (with whitelist or blacklist). Useful for' 
95 | ' datasets where only gene symbols are given. Applies to both' 
96 | ' whitelist and blacklist. Used by default when input is a loom' 
97 | ' file (unless there is an Accession attribute in the loom).') 
98 | prep.add_argument('--no-split-on-dot', default=False, action='store_true', 
99 | help='Don\'t split gene symbol or name on period before ' 
100 | 'filtering whitelist and blacklist. We do this by default for ' 
101 | 'ENSEMBL ids.') 
102 | 
103 | 
104 | #### Prepare like 
105 | prep_like = subparsers.add_parser('prep-like', 
106 | help='Prepare a data set like another (ie with the same genes in' 
107 | ' the same order)') 
108 | # data 
109 | prep_like.add_argument('-i', '--input', required=True, 
110 | help='Input data to format. Currently accepts either: (1) a' 
111 | ' whitespace-delimited gene by cell UMI count matrix with 2' 
112 | ' leading columns of gene attributes (ENSEMBL_ID and GENE_NAME' 
113 | ' respectively), or (2) a loom file with at least one of the row' 
114 | ' attributes `Accession` or `Gene`, where `Accession` is an' 
115 | ' ENSEMBL id and `Gene` is the name.') 
116 | prep_like.add_argument('-r', '--reference', required=True, 
117 | help='Two-column tab-delimited file of ENSEMBL ids and gene names' 
118 | ' to select from `input` and order like. All genes in `reference`' 
119 | ' must be present in `input`.') 
120 | prep_like.add_argument('-o', '--outdir', required=True, 
121 | help='Output directory. Does not need to exist.') 
122 | prep_like.add_argument('-p', '--prefix', default='', 
123 | help='Prefix for output files. Optional.') 
124 | # other options 
125 | prep_like.add_argument('--by-gene-name', default=False, 
126 | action='store_true', help='Use gene name rather than ENSEMBL' 
127 | ' id when matching against reference. Useful for datasets' 
128 | ' where only gene symbols are given. Used by default when input' 
129 | ' is a loom file (unless there is an Accession attr in the loom).') 
130 | prep_like.add_argument('--no-split-on-dot', default=False, action='store_true', 
131 | help='Don\'t split gene symbol or name on period' 
132 | ' when matching to reference. 
We do this by default for ENSEMBL' 
133 | ' ids.') 
134 | 
135 | 
136 | ###### Train command 
137 | train = subparsers.add_parser('train', 
138 | help='Train a model with automatic parallelization across' 
139 | ' computations with numba') 
140 | # data and saving 
141 | train.add_argument('-i', '--input', required=True, 
142 | help="Training data. Expects either the mtx file output by the " 
143 | "prep command or a tab-separated tsv file formatted like:" 
144 | "`CELL_ID\tGENE_ID\tUMI_COUNT`. In the latter case, ids are " 
145 | "assumed to be 0 indexed and we assume no duplicates." 
146 | ) 
147 | train.add_argument('-o', '--outdir', 
148 | help='Output directory for scHPF model. Will be created if it does ' 
149 | 'not exist.') 
150 | train.add_argument('-p', '--prefix', default='', 
151 | help='Prefix for output files. Optional.') 
152 | 
153 | # Required model hyperparameter 
154 | train.add_argument('-k', '--nfactors', type=int, required=True, 
155 | help='Number of factors.') 
156 | 
157 | # training parameters 
158 | train.add_argument('-t', '--ntrials', type=int, default=1, 
159 | help='Number of times to run scHPF, selecting the trial with ' 
160 | 'best loss (on training data unless validation is given).' 
161 | ' [Default 1]') 
162 | train.add_argument('-v', '--validation-cells', default=None, 
163 | help='Cells to use to assess convergence and choose a model.' 
164 | ' Expects same format as ``-i/--input``. Training data used by' 
165 | ' default.' 
166 | ) 
167 | train.add_argument('-M', '--max-iter', type=int, default=1000, 
168 | help='Maximum iterations. [Default 1000].') 
169 | train.add_argument('-m', '--min-iter', type=int, default=30, 
170 | help='Minimum iterations. [Default 30]') 
171 | train.add_argument('-e', '--epsilon', type=float, default=0.001, 
172 | help='Minimum percent decrease in loss between checks to continue ' 
173 | 'inference (convergence criteria). [Default 0.001].') 
174 | train.add_argument('-f', '--check-freq', type=int, default=10, 
175 | help='Number of iterations to run between convergence checks. ' 
176 | '[Default 10].') 
177 | train.add_argument('--better-than-n-ago', default=5, type=int, 
178 | help= 'Stop condition if loss is getting worse. Stops training ' 
179 | 'if loss is worse than it was `better_than_n_ago`*`check-freq` training ' 
180 | 'steps ago and getting worse. Normally not necessary to change.') 
181 | train.add_argument('-a', type=float, default=0.3, 
182 | help='Value for hyperparameter a. Setting to -2 will auto-set to' 
183 | ' 1/sqrt(nfactors). [Default 0.3]') 
184 | train.add_argument('-c', type=float, default=0.3, 
185 | help='Value for hyperparameter c. Setting to -2 will auto-set to' 
186 | ' 1/sqrt(nfactors). [Default 0.3]') 
187 | train.add_argument('--float32', action='store_true', 
188 | help="Use 32-bit floats instead of default 64-bit floats in" 
189 | " variational distributions") 
190 | train.add_argument('-bs', '--batchsize', default=0, type=int, 
191 | help="Number of cells to use per training round. All cells used if" 
192 | " 0. Note that using batches changes the order of updates during" 
193 | " inference.") 
194 | train.add_argument('-sl', '--smooth-loss', default=1, type=int, 
195 | help="Average loss over the last `--smooth-loss` iterations." 
196 | " Intended for when using minibatches, where int(ncells/batchsize)" 
197 | " is a reasonable value" 
198 | ) 
199 | train.add_argument('-bts', '--beta-theta-simultaneous', action='store_true', 
200 | help="If False (default), compute beta update, then compute theta" 
201 | " based on the updated beta. 
Note that if batching is used, this" 
202 | " order is reversed. If True, update both beta and theta based on" 
203 | " values from the last training round. The latter slows the rate of" 
204 | " convergence and sometimes results in better log-likelihoods, but" 
205 | " may increase convergence time, especially for large numbers of" 
206 | " cells." 
207 | ) 
208 | train.add_argument('-sa', '--save-all', action='store_true', 
209 | help="Save all trials") 
210 | train.add_argument('-rp', '--reproject', action='store_true', 
211 | help="Reproject data onto fixed global (gene) parameters after" 
212 | " convergence, but before model selection. Recommended with" 
213 | " batching") 
214 | train.add_argument('--quiet', dest='verbose', action='store_false', 
215 | default=True, help="Don't print intermediate llh.") 
216 | 
217 | ###### train with trials in threadpool 
218 | train_pool = subparsers.add_parser('train-pool', parents=[train], 
219 | add_help=False, conflict_handler='resolve') 
220 | train_pool.add_argument('--njobs', type=int, default=0, 
221 | help='Max number of processes to spawn. 0 will use the minimum of' 
222 | ' all available cores and ntrials.') 
223 | # Required model hyperparameter 
224 | train_pool.add_argument('-k', '--nfactors', nargs='+', type=int, 
225 | required=True, help='Number of factors.') 
226 | 
227 | 
228 | ### Score command 
229 | score = subparsers.add_parser('score', 
230 | help='Create useful files such as gene scores, cell scores, and' 
231 | ' ranked gene lists in txt format.') 
232 | score.add_argument('-m', '--model', required=True, 
233 | help='Saved scHPF model from train command. Should have extension' 
234 | '`.joblib`') 
235 | score.add_argument('-o', '--outdir', default=None, 
236 | help='Output directory for score files. If not given, a new' 
237 | ' subdirectory of the dir containing the model will be made with' 
238 | ' the same name as the model file (without extension)') 
239 | score.add_argument('-p', '--prefix', default='', 
240 | help='Prefix for output files. Optional.') 
241 | score.add_argument('-g', '--genefile', default=None, 
242 | help='Create an additional file with gene names ranked by score ' 
243 | 'for each factor. Expects the genes.txt file output by the scHPF ' 
244 | 'prep command or a similarly formatted tab-delimited file without ' 
245 | 'headers. Uses the zero-indexed ``--name-col``\'th column as gene ' 
246 | 'names. Optional.') 
247 | score.add_argument('--name-col', type=int, default=1, 
248 | help='The zero-indexed column of `genefile` to use as a gene name ' 
249 | 'when (optionally) ranking genes. If ``--name-col`` is greater' 
250 | ' than the index of ``--genefile``\'s last column, it is ' 
251 | ' automatically reset to the last column\'s index. [Default 1]' 
252 | ) 
253 | 
254 | 
255 | 
256 | # ###### Project command 
257 | proj = subparsers.add_parser('project', 
258 | help='Project new data onto a trained model.') 
259 | # data and saving 
260 | proj.add_argument('-m', '--model', required=True, 
261 | help='The model to project onto.') 
262 | proj.add_argument('-i', '--input', required=True, 
263 | help='Data to project onto model. Expects either the mtx file' 
264 | ' output by the prep or prep-like commands or a tab-delimited' 
265 | ' tsv file formatted like: `CELL_ID\tGENE_ID\tUMI_COUNT`. In the' 
266 | ' latter case, ids are assumed to be 0 indexed and we assume no' 
267 | ' duplicates.') 
268 | proj.add_argument('-o', '--outdir', 
269 | help='Output directory for projected scHPF model. 
Will be created' 
270 | ' if it does not exist.') 
271 | proj.add_argument('-p', '--prefix', default='', 
272 | help='Prefix for output files. Optional.') 
273 | 
274 | # projection-specific args 
275 | proj.add_argument('--recalc-bp', action='store_true', 
276 | help='Recalculate hyperparameter bp for the new data') 
277 | 
278 | # Training parameters (same as train, different defaults, no short names) 
279 | proj.add_argument('--max-iter', type=int, default=500, 
280 | help='Maximum iterations. [Default 500].') 
281 | proj.add_argument('--min-iter', type=int, default=10, 
282 | help='Minimum iterations. [Default 10]') 
283 | proj.add_argument('--epsilon', type=float, default=0.001, 
284 | help='Minimum percent decrease in loss between checks to continue ' 
285 | 'inference (convergence criteria). [Default 0.001].') 
286 | proj.add_argument('--check-freq', type=int, default=10, 
287 | help='Number of iterations to run between convergence checks. ' 
288 | '[Default 10].') 
289 | 
290 | return parser 
291 | 
292 | 
293 | if __name__=='__main__': 
294 | parser = _parser() 
295 | args = parser.parse_args() 
296 | 
297 | # print help if no subparser given 
298 | if len(sys.argv)==1: 
299 | parser.print_help(sys.stderr) 
300 | sys.exit(1) 
301 | 
302 | # setup paths and prefixes 
303 | 
304 | if args.outdir is None: 
305 | if args.cmd in ['prep', 'prep-like', 'train', 'train-pool']: 
306 | args.outdir = args.input.rsplit('/', 1)[0] 
307 | elif args.cmd=='project': 
308 | args.outdir = args.model.rsplit('/',1)[0] 
309 | elif args.cmd=='score': 
310 | args.outdir = args.model.split('.joblib')[0] 
311 | 
312 | if args.outdir is not None and not os.path.exists(args.outdir): 
313 | print("Creating output directory {} ".format(args.outdir)) 
314 | os.makedirs(args.outdir) 
315 | prefix = args.prefix.rstrip('.') + '.' if len(args.prefix) > 0 else '' 
316 | outprefix = args.outdir + '/' + prefix 
317 | 
318 | if args.cmd == 'prep': 
319 | filtered, genes = load_and_filter(args.input, 
320 | min_cells=args.min_cells, 
321 | whitelist=args.whitelist, 
322 | blacklist=args.blacklist, 
323 | filter_by_gene_name=args.filter_by_gene_name, 
324 | no_split_on_dot=args.no_split_on_dot) 
325 | 
326 | print('Writing filtered data to file.....') 
327 | mmwrite('{}filtered.mtx'.format(outprefix), filtered, field='integer') 
328 | genes.to_csv('{}genes.txt'.format(outprefix), sep='\t', header=None, 
329 | index=None) 
330 | 
331 | if args.n_validation_cells > 0: 
332 | print('Selecting train/validation cells.....') 
333 | Xtrn, Xvld, vld_ix = split_validation_cells( filtered, 
334 | args.n_validation_cells, args.validation_group_ids, 
335 | max_group_frac = args.validation_max_group_frac) 
336 | trn_ix = np.setdiff1d(np.arange(filtered.shape[0]), vld_ix) 
337 | 
338 | print('Writing train/validation splits.....') 
339 | mmwrite('{}train_cells.mtx'.format(outprefix), Xtrn, 
340 | field='integer') 
341 | np.savetxt('{}train_cell_ix.txt'.format(outprefix), trn_ix, 
342 | fmt='%d') 
343 | mmwrite('{}validation_cells.mtx'.format(outprefix), Xvld, 
344 | field='integer') 
345 | np.savetxt('{}validation_cell_ix.txt'.format(outprefix), vld_ix, 
346 | fmt='%d') 
347 | 
348 | print('Writing commandline arguments to file.....') 
349 | cmdfile = '{}prep_commandline_args.json'.format(outprefix) 
350 | with open(cmdfile, 'w') as f: 
351 | json.dump(args.__dict__, f, indent=2) 
352 | 
353 | 
354 | 
355 | elif args.cmd == 'prep-like': 
356 | print('Loading and reordering input like reference.....
') 357 | filtered, genes = load_like(args.input, reference=args.reference, 358 | by_gene_name=args.by_gene_name, 359 | no_split_on_dot=args.no_split_on_dot) 360 | print('Writing prepared data to file.....') 361 | mmwrite('{}filtered.mtx'.format(outprefix), filtered, field='integer') 362 | genes.to_csv('{}genes.txt'.format(outprefix), sep='\t', header=None, 363 | index=None) 364 | print('Writing commandline arguments to file.....') 365 | cmdfile = '{}prep-like_commandline_args.json'.format(outprefix) 366 | with open(cmdfile, 'w') as f: 367 | json.dump(args.__dict__, f, indent=2) 368 | 369 | 370 | elif args.cmd in ['train', 'train-pool']: 371 | # load data 372 | print( 'Loading data.....' ) 373 | load_fnc = mmread if args.input.endswith('.mtx') else load_coo 374 | train = load_fnc(args.input) 375 | 376 | ncells, ngenes = train.shape 377 | msg = '.....found {} cells and {} genes in {}'.format( 378 | ncells, ngenes, args.input) 379 | print(msg) 380 | 381 | if args.batchsize and ncells > args.batchsize and not args.reproject: 382 | msg = '\nWARNING: running with minibatches but without reproject.' \ 383 | + ' We recommend adding the --reproject flag when running with'\ 384 | + ' batches to synchronize cell variational distributions. \n' 385 | print(msg) 386 | 387 | if args.validation_cells is not None: 388 | vcells = load_fnc(args.validation_cells) 389 | msg = '.....found {} validation cells and {} genes in {}'.format( 390 | vcells.shape[0], vcells.shape[1], args.validation_cells) 391 | print(msg) 392 | msg = 'WARNING: scHPF models with validation cells can be slow' 393 | msg += ' to converge.\n\tIf you observe this, try either (or both)' 394 | msg += ' increasing epsilon (-e, currently set to {})'.format( 395 | args.epsilon) 396 | msg += ' or increasing the number of validation cells (using prep)' 397 | print(msg) 398 | else: 399 | vcells = None 400 | 401 | # create model 402 | print('Running trials.....' 
) 
403 | dtype = np.float32 if args.float32 else np.float64 
404 | model_kwargs = dict(a=args.a, c=args.c) 
405 | 
406 | if args.cmd == 'train': 
407 | run_fnc = run_trials 
408 | else: 
409 | if args.njobs < 0: 
410 | msg = 'njobs must be an int >= 0, received {}' 
411 | raise ValueError(msg.format(args.njobs)) 
412 | run_fnc = partial(run_trials_pool, njobs=args.njobs) 
413 | 
414 | # TODO get rid of repeated code 
415 | reject = None 
416 | if args.save_all: 
417 | model, reject = run_fnc(train, vcells=vcells, 
418 | nfactors=args.nfactors, ntrials=args.ntrials, 
419 | min_iter=args.min_iter, max_iter=args.max_iter, 
420 | check_freq=args.check_freq, epsilon=args.epsilon, 
421 | better_than_n_ago=args.better_than_n_ago, dtype=dtype, 
422 | verbose=args.verbose, model_kwargs=model_kwargs, 
423 | return_all=True, reproject=args.reproject, 
424 | batchsize=args.batchsize, 
425 | beta_theta_simultaneous=args.beta_theta_simultaneous, 
426 | loss_smoothing=args.smooth_loss 
427 | ) 
428 | else: 
429 | model = run_fnc(train, vcells=vcells, nfactors=args.nfactors, 
430 | ntrials=args.ntrials, min_iter=args.min_iter, 
431 | max_iter=args.max_iter, check_freq=args.check_freq, 
432 | epsilon=args.epsilon, 
433 | better_than_n_ago=args.better_than_n_ago, dtype=dtype, 
434 | verbose=args.verbose, model_kwargs=model_kwargs, 
435 | return_all=False, reproject=args.reproject, 
436 | batchsize=args.batchsize, 
437 | beta_theta_simultaneous=args.beta_theta_simultaneous, 
438 | loss_smoothing=args.smooth_loss 
439 | ) 
440 | 
441 | # save the model/models 
442 | if isinstance(args.nfactors, int): 
443 | klist = [args.nfactors] 
444 | model = [model] 
445 | if reject is not None: 
446 | reject = [reject] 
447 | else: 
448 | klist = args.nfactors 
449 | for i, (K,m) in enumerate(zip(klist, model)): 
450 | model_outprefix = '{}scHPF_K{}{}_{}trials'.format( 
451 | outprefix, K, 
452 | f'_b{args.batchsize}' if args.batchsize and ncells > args.batchsize else '', 
453 | args.ntrials) 
454 | if vcells is None: 
455 | print('Saving best model ({} factors).....'.format(K)) 
456 | joblib.dump(m, model_outprefix + '.joblib') 
457 | else: 
458 | print('Saving best model (training data, {} factors).....'\ 
459 | .format(K)) 
460 | joblib.dump(m, model_outprefix + '.train.joblib') 
461 | 
462 | print('Computing final validation projection ({} factors).....'\ 
463 | .format(K)) 
464 | projection = m.project(vcells, replace=False) 
465 | print('Saving validation projection.....({} factors)'.format(K)) 
466 | joblib.dump(projection, 
467 | model_outprefix + '.validation_proj.joblib') 
468 | if args.save_all: 
469 | for j,r in enumerate(reject[i]): 
470 | joblib.dump(r, model_outprefix + f'_reject{j+1}.joblib') 
471 | 
472 | 
473 | print('Writing commandline arguments to file.....') 
474 | cmdfile = '{}train_commandline_args.json'.format(outprefix) 
475 | print(cmdfile) 
476 | if os.path.exists(cmdfile): 
477 | cmdfile = '{}train_commandline_args.{}.json'.format(outprefix, 
478 | time.strftime("%Y%m%d-%H%M%S")) 
479 | with open(cmdfile, 'w') as f: 
480 | json.dump(args.__dict__, f, indent=2) 
481 | 
482 | print('\n') 
483 | 
484 | 
485 | elif args.cmd == 'score': 
486 | print('Loading model.....') 
487 | model = joblib.load(args.model) 
488 | 
489 | print('Calculating scores.....') 
490 | cell_score = model.cell_score() 
491 | gene_score = model.gene_score() 
492 | 
493 | print('Saving scores.....') 
494 | np.savetxt(outprefix + 'cell_score.txt', cell_score, delimiter='\t') 
495 | np.savetxt(outprefix + 'gene_score.txt', gene_score, delimiter='\t') 
496 | 
497 | print('Calculating mean cellscore fractions.....') 
498 | frac_list = 
mean_cellscore_fraction_list(cell_score) 499 | with open(outprefix + 'mean_cellscore_fraction.txt', 'w') as h: 500 | h.write('nfactors\tmean_cellscore_fraction\n') 501 | for i,csf in enumerate(frac_list): 502 | h.write('{}\t{}\n'.format(i+1,csf)) 503 | 504 | print('Calculating maximum pairwise overlaps.....') 505 | table = max_pairwise_table(gene_score, 506 | ntop_list=[50,100,150,200,250,300,350,400,450,500]) 507 | table.to_csv(outprefix + 'maximum_overlaps.txt', sep='\t', index=False) 508 | 509 | if args.genefile is not None: 510 | print('Ranking genes.....') 511 | # load and format gene file 512 | genes = np.loadtxt(args.genefile, delimiter='\t', dtype=str) 513 | if len(genes.shape) == 1: 514 | genes = genes[:,None] 515 | # get column to use for gene names 516 | last_col = genes.shape[1] - 1 517 | name_col = last_col if args.name_col > last_col else args.name_col 518 | print('.....using {}\'th column of genefile as gene label'.format( 519 | name_col)) 520 | 521 | # rank the genes by gene_score 522 | ranks = np.argsort(gene_score, axis=0)[::-1] 523 | ranked_genes = [] 524 | for i in range(gene_score.shape[1]): 525 | ranked_genes.append(genes[ranks[:,i], name_col]) 526 | ranked_genes = np.stack(ranked_genes).T 527 | print('Saving ranked genes.....') 528 | np.savetxt(outprefix + 'ranked_genes.txt', ranked_genes, 529 | fmt="%s", delimiter='\t') 530 | 531 | print('Writing commandline arguments to file.....') 532 | cmdfile = '{}score_commandline_args.json'.format(outprefix) 533 | with open(cmdfile, 'w') as f: 534 | json.dump(args.__dict__, f, indent=2) 535 | 536 | elif args.cmd == 'project': 537 | print('Loading reference model.....') 538 | model = joblib.load(args.model) 539 | print('Loading data.....') 540 | load_fnc = mmread if args.input.endswith('.mtx') else load_coo 541 | proj_data = load_fnc(args.input) 542 | print('Projecting data.....') 543 | projection = model.project(proj_data, replace=False, verbose=True, 544 | recalc_bp=args.recalc_bp, 545 | min_iter=args.min_iter, 546 | max_iter=args.max_iter, 547 | check_freq=args.check_freq, 548 | epsilon=args.epsilon, ) 549 | print('Saving projection.....') 550 | if args.recalc_bp: 551 | outprefix += '{}.'.format('recalc_bp') 552 | proj_out = '{}{}.proj.joblib'.format(outprefix, 553 | args.model.rsplit('.',1)[0].split('/')[-1]) 554 | joblib.dump(projection, proj_out) 555 | 556 | print('Writing commandline arguments to file.....') 557 | cmdfile = '{}project_commandline_args.json'.format(outprefix) 558 | with open(cmdfile, 'w') as f: 559 | json.dump(args.__dict__, f, indent=2) 560 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 
21 | 
-------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 
1 | .. _changelog: 
2 | 
3 | ********* 
4 | Changelog 
5 | ********* 
6 | 
7 | 0.4.0 
8 | ===== 
9 | - train-pool for training parallelized at the level of trials rather than 
10 | computations 
11 | - reproject and save all options during training 
12 | - add separate joblib dependency (should be installed w/scikit-learn, 
13 | sklearn.externals.joblib is deprecated) 
14 | 
15 | 
16 | 0.3.0 
17 | ===== 
18 | 
19 | - Refactor so loss can be an arbitrary function 
20 | - Fix bugs in and expand options for projection 
21 | - prep-like CLI to prepare data for projection onto a trained model 
22 | - cellscore fraction file for score CLI 
23 | - Verbose option for load_txt 
24 | - Update options for validation cells & selection 
25 | - Version as an object attribute 
26 | - Handle change in scipy API 
27 | - new GENCODE files 
28 | - (feature request) options to specify a and c from the train CLI 
29 | - Documentation with ReadTheDocs 
30 | 
31 | 
32 | 0.2.4 
33 | ===== 
34 | - Emergency patch for a preprocessing error with loom files. Also fixed an errant test. 
35 | Not really enough to justify a new release but fixed a pretty 
36 | irritating/embarrassing error. 
37 | 
38 | 0.2.3 
39 | ===== 
40 | - fix no split on dot bug 
41 | - Max pairwise table + default max pairwise in score 
42 | - Note about ld.so error 
43 | - Fix max pairwise second greatest bug 
44 | - Some integration tests 
45 | 
46 | 
47 | 0.2.2 
48 | ===== 
49 | - partial test suite 
50 | - max pairwise test for gene overlap 
51 | - faster preprocessing of large text files 
52 | - refactor preprocessing and training control flow out of CLI 
53 | - move load and save methods outside of scHPF object 
54 | 
55 | 
56 | 0.2.1 
57 | ===== 
58 | - Slight speedup during inference for Xphi 
59 | - Fix bug (introduced in 0.2.0-alpha) that occurs when genes in 
60 | whitespace-delim input to prep have no counts 
61 | 
62 | 
63 | 0.2.0 
64 | ===== 
65 | Numba implementation with scikit-learn-like API 
66 | 
67 | 
68 | 0.1.0 
69 | ===== 
70 | - Tensorflow implementation 
71 | 
72 | 
-------------------------------------------------------------------------------- /docs/cli-man.rst: -------------------------------------------------------------------------------- 
1 | 
2 | .. _cli-man: 
3 | 
4 | 
5 | ********************** 
6 | Complete CLI Reference 
7 | ********************** 
8 | 
9 | .. _cli-prep: 
10 | 
11 | scHPF prep 
12 | ========== 
13 | 
14 | .. argparse:: 
15 | :filename: ../bin/scHPF 
16 | :func: _parser 
17 | :prog: scHPF 
18 | :path: prep 
19 | 
20 | 
21 | .. _cli-train: 
22 | 
23 | scHPF train 
24 | =========== 
25 | 
26 | .. argparse:: 
27 | :filename: ../bin/scHPF 
28 | :func: _parser 
29 | :prog: scHPF 
30 | :path: train 
31 | 
32 | 
33 | .. _cli-score: 
34 | 
35 | scHPF score 
36 | =========== 
37 | 
38 | .. argparse:: 
39 | :filename: ../bin/scHPF 
40 | :func: _parser 
41 | :prog: scHPF 
42 | :path: score 
43 | 
44 | 
45 | .. _cli-prep-like: 
46 | 
47 | scHPF prep-like 
48 | =============== 
49 | 
50 | .. argparse:: 
51 | :filename: ../bin/scHPF 
52 | :func: _parser 
53 | :prog: scHPF 
54 | :path: prep-like 
55 | 
56 | 
57 | .. _cli-project: 
58 | 
59 | scHPF project 
60 | ============= 
61 | 
62 | ..
argparse:: 63 | :filename: ../bin/scHPF 64 | :func: _parser 65 | :prog: scHPF 66 | :path: project 67 | 68 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../schpf')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'scHPF' 21 | copyright = '2019, Hanna Mendes Levitin' 22 | author = 'Hanna Mendes Levitin' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | import schpf 26 | version = schpf.__version__ 27 | release = version 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.intersphinx', 38 | 'sphinxarg.ext', 39 | # 'sphinx.ext.doctest', 40 | # 'sphinx.ext.coverage', 41 | # 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.autosummary', 45 | ] 46 | 47 | autosummary_generate = True 48 | autodoc_member_order = 'bysource' 49 | napoleon_google_docstring = False 50 | napoleon_numpy_docstring = True 51 | napoleon_include_init_with_doc = False 52 | napoleon_use_rtype = True # having a separate entry generally helps readability 53 | napoleon_use_param = True 54 | napoleon_custom_sections = [('Params', 'Parameters')] 55 | todo_include_todos = True 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ['_templates'] 59 | 60 | # List of patterns, relative to source directory, that match files and 61 | # directories to ignore when looking for source files. 62 | # This pattern also affects html_static_path and html_extra_path. 63 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 64 | 65 | 66 | # -- Options for HTML output ------------------------------------------------- 67 | 68 | # The theme to use for HTML and HTML Help pages. See the documentation for 69 | # a list of builtin themes. 70 | # 71 | # html_theme = 'alabaster' 72 | html_theme = 'sphinx_rtd_theme' 73 | 74 | # Add any paths that contain custom static files (such as style sheets) here, 75 | # relative to this directory. They are copied after the builtin static files, 76 | # so a file named "default.css" will overwrite the builtin "default.css". 
77 | html_static_path = ['_static'] 
78 | 
79 | html_show_sphinx = False 
80 | html_context = dict( 
81 | display_github=True, # Integrate GitHub 
82 | github_user='simslab', # Username 
83 | github_repo='scHPF', # Repo name 
84 | github_version='master', # Version 
85 | conf_py_path='/docs/', # Path in the checkout to the docs root 
86 | ) 
87 | gh_url = 'https://github.com/{github_user}/{github_repo}'.format_map(html_context) 
-------------------------------------------------------------------------------- /docs/genelists.rst: -------------------------------------------------------------------------------- 
1 | .. _premade lists: https://github.com/simslab/scHPF/tree/master/resources 
2 | .. _stable identifiers: https://useast.ensembl.org/info/genome/stable_ids/index.html 
3 | .. _biotypes: https://www.gencodegenes.org/pages/biotypes.html 
4 | 
5 | .. _genelists: 
6 | 
7 | ********** 
8 | Gene lists 
9 | ********** 
10 | 
11 | About 
12 | ===== 
13 | We recommend restricting analysis to protein-coding genes, and bundle 
14 | `premade lists`_ of coding genes for human and mouse with the scHPF code. The 
15 | :ref:`prep CLI command <prep-cli>` optionally uses these lists to filter input 
16 | data. Although ENSEMBL ids are theoretically unambiguous and consistent across 
17 | releases (ie `stable identifiers`_), you may want to generate your own list 
18 | from a different annotation (eg one matching the GENCODE version used for 
19 | alignment) or with different parameters for gene inclusion (eg including lncRNA). 
20 | 
21 | Premade lists 
22 | ============= 
23 | The scHPF code includes tab-delimited lists of ENSEMBL ids and names for genes 
24 | with protein coding, T-cell receptor constant, or immunoglobulin constant 
25 | `biotypes`_ for human and mouse. 
26 | 
27 | Premade lists can be found in the 
28 | `code's resources folder <https://github.com/simslab/scHPF/tree/master/resources>`_: 
29 | 
30 | * Human (GENCODE v24, v29, v31) 
31 | * Mouse (GENCODE vM10, vM19) 
32 | 
33 | Format 
34 | ====== 
35 | Example tab-delimited gene list:: 
36 | 
37 | ENSG00000186092 OR4F5 
38 | ENSG00000284733 OR4F29 
39 | ENSG00000284662 OR4F16 
40 | ENSG00000187634 SAMD11 
41 | ENSG00000188976 NOC2L 
42 | ENSG00000187961 KLHL17 
43 | 
44 | By default, the prep command assumes a two-column, tab-delimited text file of 
45 | ENSEMBL gene ids and names, and uses the first column (assumed to be ENSEMBL id) 
46 | to filter genes. See the 
47 | :ref:`prep command documentation <prep-cli>` for other options. 
48 | 
49 | .. note:: 
50 | ENSEMBL ids may end in a period followed by an unstable version 
51 | number (eg ENSG00000186092.6). By default, the prep command ignores anything 
52 | after the period. This means ``[ENS-ID].[VERSION]`` is equivalent to 
53 | ``[ENS-ID]``. See the :ref:`prep command <prep-cli>` for other options. 
54 | 
55 | 
56 | Making custom gene lists 
57 | ======================== 
58 | Although ENSEMBL ids aim to be unambiguous and consistent across 
59 | releases (ie `stable identifiers`_), you may want to generate your own list from 
60 | a different annotation or with different parameters for gene inclusion. 
61 | 
62 | 
63 | Example creation script 
64 | ~~~~~~~~~~~~~~~~~~~~~~~ 
65 | Reference files of ids and names for genes with 
66 | ``protein_coding``, ``TR_C_gene``, or ``IG_C_gene`` biotypes in the GENCODE 
67 | main annotation (in this case ``gencode.v29.annotation.gtf``) were generated as follows: 
68 | 
69 | ..
code:: bash 
70 | 
71 | # Select genes with feature gene and level 1 or 2 
72 | awk '{if($3=="gene" && $0~"level (1|2);"){print $0}}' gencode.v29.annotation.gtf > gencode.v29.annotation.gene_l1l2.gtf 
73 | 
74 | # Only include biotypes protein_coding, TR_C_g* and IG_C_g* 
75 | awk '{if($12~"TR_C_g" || $12~"IG_C_g" || $12~"protein_coding"){print $0}}' gencode.v29.annotation.gene_l1l2.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf 
76 | 
77 | # Retrieve ENSEMBL gene id and name 
78 | awk '{{OFS="\t"}{gsub(/"/, "", $10); gsub(/;/, "", $10); gsub(/"/, "", $14); gsub(/;/, "", $14); print $10, $14}}' gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt 
79 | 
80 | 
81 | .. note:: 
82 | For older GENCODE versions, you may need to adjust the field indices in 
83 | the third line of code (for example changing all instances of $14 to $16). 
84 | 
-------------------------------------------------------------------------------- /docs/img/cell-type-rep-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simslab/scHPF/aff30d674039359395cbee4ca4ddc85f3a5c8b56/docs/img/cell-type-rep-01.png 
-------------------------------------------------------------------------------- /docs/img/k_selection_minifig-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simslab/scHPF/aff30d674039359395cbee4ca4ddc85f3a5c8b56/docs/img/k_selection_minifig-01.png 
-------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 
1 | .. scHPF documentation master file, created by 
2 | sphinx-quickstart on Mon Jul 8 17:02:06 2019. 
3 | You can adapt this file completely to your liking, but it should at least 
4 | contain the root `toctree` directive. 
5 | 
6 | Single-cell Hierarchical Poisson Factorization 
7 | ============================================== 
8 | 
9 | Single-cell Hierarchical Poisson Factorization (scHPF) is a tool for *de novo* 
10 | discovery of discrete and continuous expression patterns in single-cell 
11 | RNA\-sequencing (scRNA-seq). 
12 | 
13 | We find that scHPF’s sparse low-dimensional representations, non-negativity, 
14 | and explicit modeling of variable sparsity across genes and cells produce 
15 | highly interpretable factors. The algorithm takes genome-wide molecular counts 
16 | as input, avoids prior normalization, and has fast, memory-efficient inference 
17 | on sparse scRNA-seq datasets. 
18 | 
19 | Algorithmic details, benchmarking against alternative methods, and scHPF's 
20 | application to a spatially sampled high-grade glioma can be found in 
21 | `our paper at Molecular Systems Biology`_. 
22 | 
23 | .. _our paper at Molecular Systems Biology: https://doi.org/10.15252/msb.20188557 
24 | 
25 | You can find the software `on github <https://github.com/simslab/scHPF>`_. 
26 | 
27 | .. toctree:: 
28 | :maxdepth: 1 
29 | :caption: Setup 
30 | 
31 | install 
32 | genelists 
33 | 
34 | .. toctree:: 
35 | :maxdepth: 2 
36 | :caption: Commandline workflow 
37 | 
38 | prep-cli 
39 | train-cli 
40 | score-cli 
41 | 
42 | .. toctree:: 
43 | :maxdepth: 2 
44 | :caption: Advanced options 
45 | 
46 | select-k 
47 | project 
48 | 
49 | ..
toctree:: 
50 | :maxdepth: 1 
51 | :caption: Misc 
52 | 
53 | cli-man 
54 | changelog 
55 | 
56 | 
57 | Indices and tables 
58 | ================== 
59 | 
60 | * :ref:`genindex` 
61 | * :ref:`modindex` 
62 | * :ref:`search` 
63 | 
-------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 
1 | .. _install: 
2 | 
3 | ************ 
4 | Installation 
5 | ************ 
6 | 
7 | Environment & Dependencies 
8 | ========================== 
9 | 
10 | scHPF requires Python >= 3.6 and the packages: 
11 | 
12 | * numba (:ref:`version requirement depends on python version <numba>`, but will be safe with 0.45, and probably 0.45+) 
13 | * scikit-learn 
14 | * pandas 
15 | * (optional) loompy 
16 | 
17 | The easiest way to set up a Python environment for scHPF is with `anaconda`_ (or 
18 | its stripped-down version `miniconda`_): 
19 | 
20 | .. _anaconda: https://www.anaconda.com/distribution 
21 | .. _miniconda: https://docs.conda.io/en/latest/miniconda.html 
22 | 
23 | .. code:: bash 
24 | 
25 | conda create -n schpf_p37 python=3.7 scikit-learn numba=0.50 pandas numpy=1.18 
26 | 
27 | # for newer anaconda versions 
28 | conda activate schpf_p37 
29 | # XOR older anaconda versions 
30 | source activate schpf_p37 
31 | 
32 | # Optional, for using loom files as input to preprocessing 
33 | pip install -U loompy 
34 | 
35 | 
36 | 
37 | .. _numba: 
38 | 
39 | numba/Python compatibility 
40 | -------------------------- 
41 | Certain micro-versions of Python and numba do not play well together, resulting 
42 | in segmentation faults and/or horrible performance (at least for the ops scHPF 
43 | uses). In our experience, micro-version combos that avoid these issues are 
44 | listed below, as well as known-bad combinations, but note this is not an 
45 | exhaustive list: 
46 | 
47 | **Python 3.7.9** 
48 | Compatible numba: 0.45-0.50 
49 | 
50 | DO NOT USE: 0.44 or earlier 
51 | **Python 3.7.5 - 3.7.8** 
52 | Not tested 
53 | **Python 3.7.4** 
54 | Compatible numba: 0.44, 0.45 
55 | 
56 | DO NOT USE: 0.43 or earlier 
57 | **Python <=3.7.3** 
58 | Compatible numba: 0.39, 0.40, 0.44, 0.45 
59 | 
60 | DO NOT USE: 0.41-0.43 
61 | 
62 | *Please* let me know about any weird errors/slowness you experience so we can 
63 | document them! 
64 | 
65 | 
66 | 
67 | 
68 | Installing scHPF 
69 | ================ 
70 | 
71 | Once you have set up the environment, clone ``simslab/scHPF`` from github and 
72 | install. 
73 | 
74 | .. code:: bash 
75 | 
76 | git clone git@github.com:simslab/scHPF.git 
77 | cd scHPF 
78 | pip install . 
79 | 
80 | 
81 | .. _tests: 
82 | 
83 | Test your installation 
84 | ---------------------- 
85 | Highly recommended, as this will catch some annoying problems with python/numba/numpy incompatibilities. From your scHPF home directory: 
86 | 
87 | 
88 | .. code:: bash 
89 | 
90 | conda install pytest 
91 | pytest 
92 | 
93 | 
94 | If any tests fail, please get in touch and I'll be happy to help. 
95 | 
-------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 
1 | @ECHO OFF 
2 | 
3 | pushd %~dp0 
4 | 
5 | REM Command file for Sphinx documentation 
6 | 
7 | if "%SPHINXBUILD%" == "" ( 
8 | set SPHINXBUILD=sphinx-build 
9 | ) 
10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 
12 | 
13 | if "%1" == "" goto help 
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL 
16 | if errorlevel 9009 ( 
17 | echo. 
18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 
19 | echo.installed, then set the SPHINXBUILD environment variable to point 
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 
21 | echo.may add the Sphinx directory to PATH. 
22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 
24 | echo.http://sphinx-doc.org/ 
25 | exit /b 1 
26 | ) 
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 
29 | goto end 
30 | 
31 | :help 
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 
33 | 
34 | :end 
35 | popd 
36 | 
-------------------------------------------------------------------------------- /docs/prep-cli.rst: -------------------------------------------------------------------------------- 
1 | .. _loompy docs: http://loompy.org/ 
2 | .. _resources folder: https://github.com/simslab/scHPF/tree/rewrite_release/resources 
3 | 
4 | .. _prep-cli: 
5 | 
6 | ********** 
7 | scHPF prep 
8 | ********** 
9 | 
10 | Basic usage 
11 | =========== 
12 | 
13 | To preprocess genome-wide UMI counts for a typical run, use the command: 
14 | 
15 | .. code:: bash 
16 | 
17 | scHPF prep -i UMICOUNT_MATRIX -o OUTDIR -m 10 -w WHITELIST 
18 | 
19 | As written, the command prepares a 
20 | :ref:`matrix of molecular counts <matrix-format>` for training and only includes 
21 | genes that are: 
22 | 
23 | - on a :ref:`whitelist`, for example one of the lists of protein 
24 | coding genes bundled in the scHPF code's resources folder 
25 | (``-w``/``--whitelist``) 
26 | - observed in at least 10 cells (``-m``/``--min-cells``). 
27 | 
28 | After running this command, ``OUTDIR`` should contain a matrix market file, 
29 | ``filtered.mtx``, and an ordered list of genes, ``genes.txt``. An optional prefix 
30 | argument can be added, which is prepended to the output file names. 
31 | 
32 | Now we can train the model with the |scHPF train|_. 
33 | 
34 | .. |scHPF train| replace:: ``scHPF train`` utility 
35 | .. _scHPF train: train-cli.html 
36 | 
37 | .. _matrix-format: 
38 | 
39 | Input matrix format 
40 | =================== 
41 | ``scHPF prep`` takes a molecular count matrix for an scRNA-seq experiment 
42 | and formats it for training. The input matrix has two allowed formats: 
43 | 
44 | 1. A **whitespace-delimited matrix** formatted as follows, with no header:: 
45 | 
46 | ENSEMBL_ID GENE_NAME UMICOUNT_CELL0 UMICOUNT_CELL1 ... 
47 | 
48 | 2. A **loom file** (see `loompy docs`_). The loom file must have at least 
49 | one of the row attributes ``Accession`` or ``Gene``, where ``Accession`` 
50 | is an ENSEMBL id and ``Gene`` is a gene name. 
51 | 
52 | .. _whitelist: 
53 | 
54 | Whitelisting genes 
55 | ================== 
56 | 
57 | About 
58 | ----- 
59 | We recommend restricting analysis to protein-coding genes. The 
60 | ``-w``/``--whitelist`` option removes all genes in the input data that are *not 
61 | in* a two column, tab-delimited text file of ENSEMBL gene ids and names. 
62 | Symmetrically, the ``-b``/``--blacklist`` option removes all genes that are *in* 
63 | such a file. 
64 | 
65 | Whitelists for human and mouse are provided in the `resources folder`_, and 
66 | details on formatting and custom lists are in the 
67 | :ref:`gene list documentation <genelists>`. 
68 | 
69 | .. Attention:: 
70 | ENSEMBL ids may end in a period followed by an unstable version 
71 | number (eg ENSG00000186092.6). By default, the prep command ignores anything 
72 | after the period. This means ``[ENS-ID].[VERSION]`` is equivalent to 
73 | ``[ENS-ID]``. This behavior can be overridden with the 
74 | ``--no-split-on-dot`` flag. 
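For example, a run that keeps genes on the bundled GENCODE v29 whitelist while additionally excluding a custom set of genes might look like this (a sketch; file paths are illustrative):

.. code:: bash

    scHPF prep -i UMICOUNT_MATRIX -o OUTDIR -m 10 \
        -w resources/gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt \
        -b my_blacklist.txt

Genes in the blacklist file are excluded even if they also appear on the whitelist.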
75 | 
76 | Whitespace-delimited input matrix
77 | ---------------------------------
78 | For whitespace-delimited UMI-count files, filtering is performed using the input
79 | matrix's first column (assumed to be a unique identifier) by default, but can be
80 | done with the gene name (next column) using the ``--filter-by-gene-name`` flag.
81 | This is useful for data that does not include a gene id.
82 | 
83 | 
84 | loom input matrix
85 | -----------------
86 | For loom files, we filter the loom ``Accession`` row attribute against the
87 | whitelist's ENSEMBL ids if ``Accession`` is present in the loom's row attributes,
88 | and filter the loom's ``Gene`` row attribute against the gene names in the
89 | whitelist otherwise.
90 | 
91 | 
92 | .. _prep-options:
93 | 
94 | Complete options
95 | ================
96 | 
97 | For complete options, see the :ref:`complete CLI reference <cli-man>` or use the
98 | ``-h`` option on the command line:
99 | 
100 | .. code:: bash
101 | 
102 |     scHPF prep -h
103 | 
104 | 
-------------------------------------------------------------------------------- /docs/project.rst: --------------------------------------------------------------------------------
1 | 
2 | .. _project:
3 | 
4 | ************************************
5 | Projecting data onto a trained model
6 | ************************************
7 | 
8 | Full writeup coming soon. Use the ``prep-like`` and ``project`` command-line
9 | programs.
10 | 
11 | Preparing data for projection
12 | =============================
13 | 
14 | For complete options, see the :ref:`complete CLI reference <cli-man>` or
15 | use the ``-h`` option on the command line:
16 | 
17 | .. code:: bash
18 | 
19 |     scHPF prep-like -h
20 | 
21 | Projecting new data
22 | ====================
23 | 
24 | For complete options, see the :ref:`complete CLI reference <cli-man>` or
25 | use the ``-h`` option on the command line:
26 | 
27 | .. code:: bash
28 | 
29 |     scHPF project -h
30 | 
-------------------------------------------------------------------------------- /docs/references.rst: --------------------------------------------------------------------------------
1 | References
2 | ----------
3 | 
4 | .. [Levitin2019] Levitin *et al.* (2019),
5 |    *De novo gene signature identification from single-cell RNA-seq with hierarchical Poisson factorization*,
6 |    *Molecular Systems Biology*.
7 | 
8 | 
9 | .. [SzaboLevitin2019] Szabo, Levitin *et al.* (2019),
10 |    *Single-cell transcriptomics of human T cells reveals tissue and activation signatures in health and disease*,
11 |    *Nature Communications*.
-------------------------------------------------------------------------------- /docs/score-cli.rst: --------------------------------------------------------------------------------
1 | 
2 | .. _score-cli:
3 | 
4 | ***********
5 | scHPF score
6 | ***********
7 | 
8 | Basic usage
9 | ===========
10 | To get gene- and cell-scores in a tab-delimited file, ordered like the genes and
11 | cells in the train file and with a column for each factor:
12 | 
13 | .. code:: bash
14 | 
15 |     scHPF score -m MODEL_JOBLIB -o OUTDIR -p PREFIX
16 | 
17 | To also generate a tab-delimited file of gene names, ranked by gene-score for
18 | each factor:
19 | 
20 | .. code:: bash
21 | 
22 |     scHPF score -m MODEL_JOBLIB -o OUTDIR -p PREFIX -g GENE_FILE
23 | 
24 | ``GENE_FILE`` is intended to be the genes.txt file output by the
25 | |scHPF prep command|_, but can in theory be any tab-delimited file where the
26 | number of rows is equal to the number of genes in the scHPF model. The score
27 | command automatically uses column 1 (zero-indexed) of ``GENE_FILE`` (or
28 | the only column if there is only one); however, the column used can be specified
29 | with ``--name-col``.
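The ranked lists can also be reproduced from the score matrix itself (a minimal
sketch; the file names 'gene_score.txt' and 'genes.txt' are assumptions here --
adjust them to the actual outputs of your score/prep runs, including any prefix):

.. code:: python

    import numpy as np
    import pandas as pd

    gene_scores = np.loadtxt('gene_score.txt')                   # (ngenes, nfactors)
    names = pd.read_csv('genes.txt', sep='\t', header=None)[1]   # gene names
    # sort each factor's column of gene names by descending gene-score
    ranked = names.values[np.argsort(-gene_scores, axis=0)]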
30 | 
31 | .. |scHPF prep command| replace:: ``scHPF prep`` command
32 | .. _scHPF prep command: prep-cli.html
33 | 
34 | If ``OUTDIR`` is omitted, the command will make a new subdirectory of the
35 | directory containing the model. The new subdirectory will have the same name as
36 | the model file, but without the joblib extension.
37 | 
38 | The command also outputs files which can be used to
39 | :ref:`select the number of factors <select-k>` using trained models.
40 | 
41 | 
42 | Complete options
43 | ================
44 | 
45 | For complete options, see the :ref:`complete CLI reference <cli-man>` or use the
46 | ``-h`` option on the command line:
47 | 
48 | .. code:: bash
49 | 
50 |     scHPF score -h
51 | 
-------------------------------------------------------------------------------- /docs/select-k.rst: --------------------------------------------------------------------------------
1 | 
2 | .. _select-k:
3 | 
4 | *************
5 | Selecting *K*
6 | *************
7 | 
8 | General comments
9 | ================
10 | 
11 | The number of factors, *K*, determines scHPF's granularity. An appropriate
12 | number of factors depends on both the data being fit and the intended
13 | application of the scHPF model. In our experience, subsequent analyses on cell
14 | scores (e.g. UMAP) are stable across a reasonable range of *K*, while
15 | interpretability (gene scores) can be more *K*-dependent.
16 | 
17 | 
18 | .. _k-workflow:
19 | 
20 | Example workflows
21 | =================
22 | 
23 | 1. Exploratory analysis on a single sample
24 | ------------------------------------------
25 | In some cases, if a user has a single sample, it may be appropriate to increase
26 | or decrease *K* manually according to the desired resolution. Granularity at
27 | the level of expression programs can be assessed qualitatively using the
28 | per-factor ranked gene lists in *ranked_genes.txt* (from |scHPF score|_ with
29 | the ``-g`` option). For example, if genes for two cell types appear in the same
30 | factor, one might increase *K*. Resolution can also be assessed quantitatively
31 | using
32 | :ref:`cell type representation <type-rep>`, or
33 | :ref:`other quantitative criteria <k-criteria>`.
34 | 
35 | When using this approach, we encourage the user to always try at least two
36 | values of *K* in either direction, as scHPF is multimodal and behavior is not
37 | always monotonic. *K* in the neighborhood of the number of clusters is often a
38 | good starting point.
39 | 
40 | .. _multi-model-example:
41 | 
42 | 2. Consistent choices across multiple models
43 | --------------------------------------------
44 | Applying scHPF separately to multiple partitions (as in [SzaboLevitin2019]_)
45 | necessitates a uniform procedure for choosing the number of factors. To
46 | maximize interpretability while being quantitative and consistent across
47 | models, we usually train scHPF across a range of *K*'s for each partition and
48 | select the per-dataset number of factors using a heuristic suitable to our
49 | intended application
50 | (:ref:`example criteria <k-criteria>`). An example workflow might be:
51 | 
52 | 
53 | 1. Choose an appropriate selection criterion for the problem at hand
54 |    (:ref:`examples <k-criteria>`).
55 | 
56 | 2. Guess a minimum number of factors, |K_min|. Values slightly less than
57 |    the number of clusters in the dataset are usually a good starting point
58 |    (e.g. |K_min| = number of clusters - 2). Guess a maximum number of
59 |    factors, |K_max|, not worrying too much if we are low since we'll refine
60 |    later (e.g. |K_max| = |K_min| + 8).
61 | 
62 | 3. :ref:`Train <train-cli>` scHPF models for K in
63 |    range(|K_min|, |K_max| +1), as in the loop sketched after this list.
64 |    *Advanced note*: I sometimes use a step
65 |    size of 2 or 3 on the first pass to check that the range is reasonable,
66 |    but recommend a final step of 1 (scHPF is multimodal, so results may not
67 |    be monotonic).
68 | 
69 | 4. Evaluate the models using the selection criterion from 1. Expand/refine
70 |    the range accordingly. For example, if |K_max| passes our criteria, we
71 |    should increase |K_max|.
72 | 
73 | 5. Repeat steps 3 and 4 as needed.
74 | 
75 | 
76 | .. |K_min| replace:: *K*:sub:`min`
77 | 
78 | .. |K_max| replace:: *K*:sub:`max`
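A minimal sketch of the training sweep in step 3 (the range and trial count are
illustrative; flags are as in the train documentation):

.. code:: bash

    for K in $(seq 7 15); do
        scHPF train -i TRAIN_FILE -o OUTDIR -p k${K}_ -k ${K} -t 5
    done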
79 | 
80 | .. _k-criteria:
81 | 
82 | Example selection criteria
83 | ===========================
84 | 
85 | .. _type-rep:
86 | 
87 | 1. Cell type representation
88 | ---------------------------
89 | 
90 | In [Levitin2019]_, we chose *K* based on scHPF's representation of cell types
91 | in the data. Specifically, we selected the smallest *K* such that every
92 | well-defined cluster was most strongly associated with at least one unique
93 | factor `[Levitin2019, Appendix Figure S8]`_. This method is intuitive, and can
94 | work well when many cell types are present, but depends on the quality and
95 | granularity of clustering. It is also difficult to standardize across multiple
96 | models trained on different data.
97 | 
98 | .. _[Levitin2019, Appendix Figure S8]: https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.20188557&file=msb188557-sup-0001-Appendix.pdf
99 | 
100 | 
101 | .. figure:: ./img/cell-type-rep-01.png
102 | 
103 |    Median cell score per factor and cluster in a high-grade glioma for 12,
104 |    13, and 14 factors in [Levitin2019]_. At 14 factors, all clusters are most
105 |    closely associated with at least one unique factor.
106 | 
107 | 
108 | .. _signature-overlap:
109 | 
110 | .. sidebar:: Evaluating top gene overlap
111 | 
112 |    .. figure:: ./img/k_selection_minifig-01.png
113 | 
114 |       Hypergeometric -log10 *p*-value of the maximum pairwise overlap
115 |       of the highest scoring genes in each factor for Donor 2 Bone Marrow in
116 |       [SzaboLevitin2019]_ at different values of *K*.
117 | 
118 | 2. Gene signature overlap
119 | -------------------------
120 | 
121 | To find common patterns of gene expression across multiple models in
122 | [SzaboLevitin2019]_, we selected *K* such that factors in the same model did
123 | not have significant overlap in their top genes (where top genes are defined as
124 | the *n* highest scoring genes per factor). This reflected our prior belief that
125 | programs should be distinctive with respect to gene scores, and the further
126 | requirement that models should have similar granularity across datasets with
127 | different levels of complexity.
128 | 
129 | The |scHPF score|_ command automatically produces the file
130 | *maximum_overlaps.txt*, which contains factors' maximum pairwise overlap and
131 | corresponding hypergeometric *p*-values at different cutoffs.
132 | 
133 | For standard significance thresholds and reasonable *n*, this method can be
134 | quite strict, resulting in lower granularity factorizations for some datasets.
135 | Using :ref:`cellular resolution <cell-res>` or
136 | :ref:`cell type representation <type-rep>` may find higher resolution
137 | factorizations in these cases.
138 | 
139 | .. |scHPF score| replace:: ``scHPF score``
140 | .. _scHPF score: score-cli.html
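For intuition, the tail probability for the overlap of two factors' top-*n* gene
sets can be computed with a hypergeometric test (an illustrative sketch with
made-up numbers, not the exact code behind *maximum_overlaps.txt*):

.. code:: python

    from scipy.stats import hypergeom

    M, n = 15000, 100   # total genes in the model; top genes per factor
    k = 12              # observed overlap between two factors' top-n sets
    # P(overlap >= k) for two independent size-n draws from M genes
    pval = hypergeom.sf(k - 1, M, n, n)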
141 | 
142 | 
143 | .. _cell-res:
144 | 
145 | 
146 | 3. Cellular resolution
147 | ----------------------
148 | 
149 | Cellular resolution directly evaluates a model's granularity by specifying how
150 | many factors, on average, should explain a given portion of a cell's total cell
151 | score. We have found it especially useful for datasets where
152 | :ref:`gene signature overlap <signature-overlap>` is too strict.
153 | 
154 | We define cellular resolution as the maximum *K* such that, on average, cells'
155 | *n* highest scoring factors contain at least *r*\*100 percent of their total
156 | score across all factors. So if we want to find a model where the 3 factors
157 | with the highest score in a cell contain at least 70% of its total score (on
158 | average), *n* would be 3 and *r* would be 0.7.
159 | 
160 | We can evaluate cellular resolution using one of |scHPF score|_'s outputs, a
161 | file called *mean_cellscore_fraction.txt* (potentially with a prefix). The
162 | file's two columns, *nfactors* and *mean_cellscore_fraction*, represent the
163 | mean fraction of each cell's total cell score allocated to its top *nfactors*
164 | factors. If we want to find a model at *n* = 3 and *r* = 0.7 resolution, we
165 | might follow the :ref:`example workflow <multi-model-example>` above, and select the
166 | largest *K* such that *mean_cellscore_fraction* >= 0.7 when *nfactors* = 3.
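A sketch of that selection in pandas (it assumes one output file per trained
model, the documented two-column layout, and a header row; adjust the paths,
separator, and *K* range to your own runs):

.. code:: python

    import pandas as pd

    def passes(path, n=3, r=0.7):
        df = pd.read_csv(path, sep='\t')
        frac = df.loc[df['nfactors'] == n, 'mean_cellscore_fraction'].iloc[0]
        return frac >= r

    ok = {K: passes(f'k{K}/mean_cellscore_fraction.txt') for K in range(7, 16)}
    best_K = max(K for K, passed in ok.items() if passed)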
-------------------------------------------------------------------------------- /docs/train-cli.rst: --------------------------------------------------------------------------------
1 | .. _joblib: https://scikit-learn.org/stable/modules/model_persistence.html
2 | 
3 | .. _train-cli:
4 | 
5 | ***********
6 | scHPF train
7 | ***********
8 | 
9 | Basic usage
10 | ===========
11 | A typical command to train an scHPF model (using data prepared by the
12 | |scHPF prep command|_):
13 | 
14 | .. |scHPF prep command| replace:: ``scHPF prep`` command
15 | .. _scHPF prep command: prep-cli.html
16 | 
17 | .. code:: bash
18 | 
19 |     scHPF train -i TRAIN_FILE -o OUTDIR -p PREFIX -k 7 -t 5
20 | 
21 | This command performs approximate Bayesian inference on scHPF with, in this
22 | instance, seven factors and five different random initializations. scHPF will
23 | automatically select the trial with the lowest negative log-likelihood, and
24 | save the model in ``OUTDIR`` as a serialized `joblib`_ file.
25 | 
26 | Input file format
27 | =================
28 | scHPF's train command accepts two formats:
29 | 
30 | 1. **Matrix Market (.mtx) files**, where rows are cells, columns are genes, and
31 |    values are nonzero molecular counts. Matrix market files are output by
32 |    the current |scHPF prep command|_.
33 | 
34 | 2. **Tab-delimited COO matrix coordinates**, output by a previous version of the
35 |    preprocessing command. These files are essentially the same as .mtx
36 |    files, except they do not have a header and are zero-indexed.
37 | 
38 | 
39 | Debugging
40 | =========
41 | .. hint::
42 |    If you get an error like "Inconsistency detected by ld.so: dl-version.c: 224:
43 |    _dl_check_map_versions" and are running numba 0.40.0, try downgrading to
44 |    0.39.0.
45 | 
46 | .. hint::
47 |    If you get an error like "Segmentation fault (core dumped)" and are running
48 |    Python 3.7.4, try upgrading numba to version 0.45 or downgrading Python to
49 |    3.7.3 :ref:`[More details] <numba>`
50 | 
51 | 
52 | Complete options
53 | ================
54 | 
55 | For complete options, see the :ref:`complete CLI reference <cli-man>` or use the
56 | ``-h`` option on the command line:
57 | 
58 | .. code:: bash
59 | 
60 |     scHPF train -h
61 | 
-------------------------------------------------------------------------------- /resources/README.md: --------------------------------------------------------------------------------
1 | ## Gene Files
2 | 
3 | Two-column, tab-delimited text files of ENSEMBL gene ids and names with protein coding, T-cell receptor constant or immunoglobulin constant biotypes in the GENCODE main annotation for [human](https://www.gencodegenes.org/human/) or [mouse](https://www.gencodegenes.org/mouse/).
4 | 
5 | ### Included files
6 | Human: `gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt`
7 | Mouse: `gencode.vM19.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt`
8 | 
9 | ### Generating gene files
10 | Files were generated from GENCODE GTFs as follows:
11 | ```
12 | # Select genes with feature gene and level 1 or 2
13 | awk '{if($3=="gene" && $0~"level (1|2);"){print $0}}' gencode.v29.annotation.gtf > gencode.v29.annotation.gene_l1l2.gtf
14 | 
15 | # Only include biotypes protein_coding, TR_C_g* and IG_C_g*
16 | awk '{if($12~"TR_C_g" || $12~"IG_C_g" || $12~"protein_coding"){print $0}}' gencode.v29.annotation.gene_l1l2.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf
17 | 
18 | # Retrieve ENSEMBL gene id and name
19 | awk '{{OFS="\t"}{gsub(/"/, "", $10); gsub(/;/, "", $10); gsub(/"/, "", $14); gsub(/;/, "", $14); print $10, $14}}' gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt
20 | ```
-------------------------------------------------------------------------------- /schpf/__init__.py: --------------------------------------------------------------------------------
1 | from .scHPF_ import *
2 | from .util import *
3 | from ._version import __version__
4 | 
-------------------------------------------------------------------------------- /schpf/_version.py: --------------------------------------------------------------------------------
1 | __version__='0.5.0'
2 | 
-------------------------------------------------------------------------------- /schpf/hpf_numba.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | import ctypes
4 | import numpy as np
5 | from scipy.sparse import coo_matrix
6 | try:
7 |     from scipy.misc import logsumexp
8 | except ImportError:
9 |     from scipy.special import logsumexp
10 | 
11 | import numba
12 | from numba.extending import get_cython_function_address as getaddr
13 | 
14 | # get numba-compatible digamma/psi and gammaln
15 | # psi/digamma
16 | psi_fnaddr = getaddr("scipy.special.cython_special", "__pyx_fuse_1psi")
17 | psi_ftype = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)
18 | psi = psi_ftype(psi_fnaddr)
19 | # gammaln
20 | gammaln_fnaddr = getaddr("scipy.special.cython_special", "gammaln")
21 | gammaln_ftype = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)
22 | cgammaln = gammaln_ftype(gammaln_fnaddr)
23 | 
24 | @numba.njit(parallel=True, nogil=True, fastmath=True)
25 | def compute_pois_llh(X_data, X_row, X_col,
26 |                      theta_vi_shape, theta_vi_rate,
27 |                      beta_vi_shape, beta_vi_rate):
28 |     ncells, ngenes = 
(theta_vi_shape.shape[0], beta_vi_shape.shape[0]) 29 | nfactors, nnz = (theta_vi_shape.shape[1], X_data.shape[0]) 30 | dtype = theta_vi_shape.dtype 31 | 32 | # precompute expectations 33 | theta_e_x = np.zeros_like(theta_vi_shape, dtype=dtype) 34 | for i in numba.prange(ncells): 35 | for k in range(nfactors): 36 | theta_e_x[i,k] = theta_vi_shape[i,k] / theta_vi_rate[i,k] 37 | 38 | beta_e_x = np.zeros_like(beta_vi_shape, dtype=dtype) 39 | for i in numba.prange(ngenes): 40 | for k in range(nfactors): 41 | beta_e_x[i,k] = beta_vi_shape[i,k] / beta_vi_rate[i,k] 42 | 43 | # compute llh 44 | llh = np.zeros(X_data.shape, dtype=dtype) 45 | for i in numba.prange(nnz): 46 | e_rate = np.zeros(1, dtype=dtype)[0] 47 | for k in range(nfactors): 48 | e_rate += theta_e_x[X_row[i],k] * beta_e_x[X_col[i], k] 49 | llh[i] = X_data[i] * np.log(e_rate) - e_rate \ 50 | - cgammaln(X_data[i] + 1.0) 51 | return llh 52 | 53 | 54 | @numba.njit(parallel=True, nogil=True) 55 | def compute_Xphi_data(X_data, X_row, X_col, 56 | theta_vi_shape, theta_vi_rate, 57 | beta_vi_shape, beta_vi_rate): 58 | """ Fast version of Xphi computation using numba & gsl_digamma 59 | 60 | Parameters 61 | ---------- 62 | X_data : ndarray of np.int32 63 | (number_nonzero, ) array of nonzero values 64 | X_row : ndarray of np.int32 65 | (number_nonzero, ) array of row ids for each nonzero value 66 | X_col : ndarray (np.int32) 67 | (number_nonzero, ) array of column ids for each nonzero value 68 | theta_vi_shape : ndarray 69 | (ncells, nfactors) array of values for theta's variational shape 70 | theta_vi_rate : ndarray 71 | (ncells, nfactors) array of values for theta's variational rate 72 | beta_vi_shape : ndarray 73 | (ngenes, nfactors) array of values for beta's variational shape 74 | beta_vi_rate : ndarray 75 | (ngenes, nfactors) array of values for beta's variational rate 76 | """ 77 | # convenience 78 | ncells, ngenes = (theta_vi_shape.shape[0], beta_vi_shape.shape[0]) 79 | nfactors, nnz = (theta_vi_shape.shape[1], X_data.shape[0]) 80 | dtype = theta_vi_shape.dtype 81 | 82 | # precompute theta.e_logx 83 | theta_e_logx = np.zeros_like(theta_vi_shape, dtype=dtype) 84 | for i in numba.prange(ncells): 85 | for k in range(nfactors): 86 | theta_e_logx[i,k] = psi(theta_vi_shape[i,k]) \ 87 | - np.log(theta_vi_rate[i,k]) 88 | 89 | # precompute beta.e_logx 90 | beta_e_logx = np.zeros_like(beta_vi_shape, dtype=dtype) 91 | for i in numba.prange(ngenes): 92 | for k in range(nfactors): 93 | beta_e_logx[i,k] = psi(beta_vi_shape[i,k]) \ 94 | - np.log(beta_vi_rate[i,k]) 95 | 96 | # compute Xphi 97 | Xphi = np.zeros((X_row.shape[0], theta_e_logx.shape[1]), dtype=dtype) 98 | for i in numba.prange(nnz): 99 | logrho = np.zeros((Xphi.shape[1]), dtype=dtype) 100 | for k in range(nfactors): 101 | logrho[k] = theta_e_logx[X_row[i],k] + beta_e_logx[X_col[i], k] 102 | 103 | #log normalizer trick 104 | rho_shift = np.zeros((Xphi.shape[1]), dtype=dtype) 105 | normalizer = np.zeros(1, dtype=dtype)[0] 106 | largest_in = np.max(logrho) 107 | for k in range(nfactors): 108 | rho_shift[k] = np.exp(logrho[k] - largest_in) 109 | normalizer += rho_shift[k] 110 | 111 | for k in range(nfactors): 112 | Xphi[i,k] = X_data[i] * rho_shift[k] / normalizer 113 | 114 | return Xphi 115 | 116 | 117 | def compute_Xphi_data_numpy(X, theta, beta, theta_ix=None): 118 | """Single-threaded version of compute_Xphi_data 119 | """ 120 | if theta_ix is None: 121 | logrho = theta.e_logx[X.row, :] + beta.e_logx[X.col, :] 122 | else: 123 | logrho = theta.e_logx[theta_ix,:][X.row, :] + 
beta.e_logx[X.col,:]
124 |     logphi = logrho - logsumexp(logrho, axis=1)[:,None]
125 |     return X.data[:,None] * np.exp(logphi)
126 | 
127 | 
128 | @numba.njit(fastmath=True) #results unstable with prange. don't do it.
129 | def compute_loading_shape_update(Xphi_data, X_keep, nkeep, shape_prior):
130 |     """Compute gamma shape updates for theta or beta using numba
131 | 
132 |     Parameters
133 |     ----------
134 |     Xphi_data : ndarray
135 |         (number_nonzero, nfactors) array of X * phi
136 |     X_keep : ndarray
137 |         (number_nonzero,) vector of indices along the axis of interest.
138 |         If X is an (ncell,ngene) coo_matrix, this should be X.row when
139 |         computing updates for theta and X.col when computing updates for
140 |         beta
141 |     nkeep : int
142 |         Number of items on the axis of interest. ncells when computing
143 |         updates for theta, and ngenes for updates for beta
144 |     shape_prior : float
145 |         Hyperprior for parameter. a for theta, c for beta.
146 | 
147 |     """
148 |     nnz, nfactors = Xphi_data.shape
149 |     dtype = Xphi_data.dtype
150 | 
151 |     result = shape_prior * np.ones((nkeep, nfactors), dtype=dtype)
152 |     for i in range(nnz):
153 |         ikeep = X_keep[i]
154 |         for k in range(nfactors):
155 |             result[ikeep, k] += Xphi_data[i,k]
156 |     return result
157 | 
158 | 
159 | @numba.njit(fastmath=True)
160 | def compute_loading_rate_update(prior_vi_shape, prior_vi_rate,
161 |         other_loading_vi_shape, other_loading_vi_rate,):
162 |     # shorter names
163 |     pvs, pvr = (prior_vi_shape, prior_vi_rate)
164 |     olvs, olvr = (other_loading_vi_shape, other_loading_vi_rate)
165 |     dtype = prior_vi_shape.dtype
166 | 
167 |     other_loading_e_x_sum = np.zeros((olvs.shape[1]), dtype=dtype)
168 |     for i in range(olvs.shape[0]):
169 |         for k in range(olvs.shape[1]):
170 |             other_loading_e_x_sum[k] += olvs[i,k] / olvr[i,k]
171 | 
172 |     result = np.zeros((pvs.shape[0], olvs.shape[1]), dtype=dtype)
173 |     for i in range(pvs.shape[0]):
174 |         prior_e_x = pvs[i] / pvr[i]
175 |         for k in range(olvs.shape[1]):
176 |             result[i, k] = prior_e_x + other_loading_e_x_sum[k]
177 |     return result
178 | 
179 | 
180 | @numba.njit(fastmath=True)
181 | def compute_capacity_rate_update(loading_vi_shape, loading_vi_rate, prior_rate):
182 |     dtype = loading_vi_shape.dtype
183 |     result = prior_rate * np.ones((loading_vi_shape.shape[0],),
184 |             dtype=dtype)
185 |     for k in range(loading_vi_shape.shape[1]):
186 |         for i in range(loading_vi_shape.shape[0]):
187 |             result[i] += loading_vi_shape[i,k] / loading_vi_rate[i,k]
188 |     return result
189 | 
-------------------------------------------------------------------------------- /schpf/loss.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | """
4 | Loss functions and higher order functions that return loss functions for a
5 | given dataset
6 | 
7 | """
8 | 
9 | import functools
10 | import numpy as np
11 | from scipy.special import gammaln
12 | 
13 | from schpf.hpf_numba import compute_pois_llh
14 | 
15 | ### Higher order loss functions
16 | 
17 | def loss_function_for_data(loss_function, X):
18 |     """ Get a loss function for a fixed dataset
19 | 
20 |     Parameters
21 |     ----------
22 |     loss_function : function
23 |         The loss function to use. The data parameter for the function must
24 |         be `X`
25 |     X : coo_matrix
26 |         coo_matrix of data to apply loss function to
27 | 
28 |     Returns
29 |     -------
30 |     fixed_data_loss_function : function
31 |         A loss function which takes all the same parameters as the input
32 |         `loss_function`, except for the data parameter `X` which is fixed
33 |     """
34 |     return functools.partial(loss_function, X=X)
35 | 
36 | 
37 | def projection_loss_function(loss_function, X, nfactors,
38 |         model_kwargs={}, proj_kwargs={}):
39 |     """ Project new data onto an existing model and calculate loss from it
40 | 
41 |     Parameters
42 |     ----------
43 |     loss_function : function
44 |         the loss function to use on the projected data
45 |     X : coo_matrix
46 |         Data to project onto the existing model. Can have an arbitrary number
47 |         of rows (cells) > 0, but must have the same number of columns (genes)
48 |         as the existing model
49 |     nfactors : int
50 |         Number of factors in model
51 |     model_kwargs : dict, optional
52 |         additional keyword arguments for scHPF()
53 |     proj_kwargs : dict, optional
54 |         additional keyword arguments for scHPF.project(). By default, sets
55 |         reinit=False, min_iter=10, max_iter=10, and check_freq=max_iter+1.
56 | 
57 | 
58 |     Returns
59 |     -------
60 |     projection_loss_function : function
61 |         A function which takes `a`, `ap`, `bp`, `c`, `cp`, `dp`, `eta`, and
62 |         `beta` for an scHPF model, projects a fixed dataset onto it, and takes
63 |         the loss (using a fixed function) with respect to both the model and
64 |         the data's projection.
65 | 
66 |     """
67 |     # have to do import here to avoid issue with files importing each other
68 |     from schpf import scHPF
69 | 
70 |     # make the model used for projection
71 |     pmodel = scHPF(nfactors=nfactors, **model_kwargs)
72 | 
73 |     # actual loss function for data
74 |     def _projection_loss_function(*, a, ap, bp, c, cp, dp, eta, beta, **kwargs):
75 |         assert eta.dims[0] == beta.dims[0]
76 |         assert beta.dims[1] == nfactors
77 | 
78 |         pmodel.a = a
79 |         pmodel.ap = ap
80 |         pmodel.bp = bp
81 |         pmodel.c = c
82 |         pmodel.cp = cp
83 |         pmodel.dp = dp
84 |         pmodel.eta = eta
85 |         pmodel.beta = beta
86 | 
87 |         # defaults if not given
88 |         if 'reinit' not in proj_kwargs: proj_kwargs['reinit'] = False
89 |         if 'max_iter' not in proj_kwargs: proj_kwargs['max_iter'] = 10
90 |         if 'min_iter' not in proj_kwargs: proj_kwargs['min_iter'] = 10
91 |         if 'check_freq' not in proj_kwargs:
92 |             proj_kwargs['check_freq'] = proj_kwargs['max_iter'] + 1
93 | 
94 |         # do the projection
95 |         pmodel.project(X, replace=True, **proj_kwargs)
96 | 
97 |         # calculate loss
98 |         return loss_function(X, a=pmodel.a, ap=pmodel.ap, bp=pmodel.bp,
99 |                 c=pmodel.c, cp=pmodel.cp, dp=pmodel.dp, xi=pmodel.xi,
100 |                 eta=pmodel.eta, theta=pmodel.theta, beta=pmodel.beta)
101 | 
102 |     return _projection_loss_function
103 | 
104 | 
105 | #### Loss functions
106 | 
107 | def pois_llh_pointwise(X, *, theta, beta, single_process=False, **kwargs):
108 |     """Poisson log-likelihood for each nonzero entry
109 | 
110 |     Parameters
111 |     ----------
112 |     X: coo_matrix
113 |         Data to compute Poisson log likelihood of. Assumed to be nonzero.
114 |     theta : HPF_Gamma
115 |     beta : HPF_Gamma
116 |     single_process: bool, optional (Default: False)
117 |         use single-threaded version of llh
118 |     **kwargs : dict, optional
119 |         extra arguments not used in this loss function
120 | 
121 |     Returns
122 |     -------
123 |     llh: ndarray
124 | 
125 | 
126 |     Note
127 |     ----
128 |     Like all loss functions in this module, all parameters except the data
129 |     must be passed to the function as a keyword argument, and the function
130 |     will accept unused keyword args.
131 |     """
132 |     if single_process:
133 |         e_rate = (theta.e_x[X.row] * beta.e_x[X.col]).sum(axis=1)
134 |         llh = X.data * np.log(e_rate) - e_rate - gammaln(X.data + 1)
135 |     else:
136 |         llh = compute_pois_llh(X.data, X.row, X.col,
137 |                 theta.vi_shape, theta.vi_rate,
138 |                 beta.vi_shape, beta.vi_rate)
139 |     return llh
140 | 
141 | 
142 | def mean_negative_pois_llh(X, *, theta, beta, single_process=False, **kwargs):
143 |     """Mean negative Poisson log-likelihood over nonzero entries
144 | 
145 |     Parameters
146 |     ----------
147 |     X: coo_matrix
148 |         Data to compute Poisson log likelihood of. Assumed to be nonzero.
149 |     theta : HPF_Gamma
150 |     beta : HPF_Gamma
151 |     single_process: bool, optional (Default: False)
152 |         use single-threaded version of pointwise loss
153 |     **kwargs : dict, optional
154 |         extra arguments not used in this loss function
155 | 
156 |     Returns
157 |     -------
158 |     mean_neg_llh : float
159 | 
160 | 
161 |     Note
162 |     ----
163 |     Like all loss functions in this module, all parameters except the data
164 |     must be passed to the function as a keyword argument, and the function
165 |     will accept unused keyword args.
166 |     """
167 |     return np.mean( -pois_llh_pointwise(X=X, theta=theta, beta=beta,
168 |             single_process=single_process) )
-------------------------------------------------------------------------------- /schpf/preprocessing.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import warnings
4 | import numpy as np
5 | from scipy.sparse import coo_matrix
6 | import pandas as pd
7 | 
8 | from schpf.util import split_coo_rows
9 | 
10 | 
11 | def load_coo(filename):
12 |     """Load a sparse coo matrix
13 | 
14 |     Assumes first column (dense row ids) are cells, second column (dense
15 |     column ids) are genes, and third column are nonzero counts. Also assumes
16 |     row and column ids are 0-indexed.
17 | 
18 |     Parameters
19 |     ----------
20 |     filename : str
21 |         file to load
22 | 
23 |     Returns
24 |     -------
25 |     coo : coo_matrix
26 |     """
27 |     raw = np.loadtxt(filename, delimiter='\t', dtype=int)
28 |     sparse = coo_matrix((raw[:,2], (raw[:,0],raw[:,1])))
29 |     return sparse
30 | 
31 | 
32 | def load_loom(filename):
33 |     """Load data from a loom file
34 | 
35 |     Parameters
36 |     ----------
37 |     filename: str
38 |         file to load
39 | 
40 |     Returns
41 |     -------
42 |     coo : coo_matrix
43 |         cell x gene sparse count matrix
44 |     genes : Dataframe
45 |         Dataframe of gene attributes. Attributes are ordered so Accession and Gene are the first columns, if those attributes are
Attributes are ordered so 46 | Accession and Gene are the first columns, if those attributs are 47 | present 48 | """ 49 | import loompy 50 | # load the loom file 51 | with loompy.connect(filename) as ds: 52 | loom_genes = pd.DataFrame(dict(ds.ra.items())) 53 | loom_coo = ds.sparse().T 54 | 55 | # order gene attributes so Accession and Gene are the first two columns, 56 | # if they are present 57 | first_cols = [] 58 | for colname in ['Accession', 'Gene']: 59 | if colname in loom_genes.columns: 60 | first_cols.append(colname) 61 | rest_cols = loom_genes.columns.difference(first_cols).tolist() 62 | loom_genes = loom_genes[first_cols + rest_cols] 63 | 64 | return loom_coo,loom_genes 65 | 66 | 67 | def load_txt(filename, ngene_cols=2, verbose=True): 68 | """Load data from a whitespace delimited txt file 69 | 70 | Parameters 71 | ---------- 72 | filename : str 73 | file to load. Expected to be a gene x cell whitespace-delimited file 74 | without a header where the first `ngene_cols` are gene identifiers, 75 | names or other metadata. 76 | ngene_cols : int, optional (default: 2) 77 | The number of columns that contain row attributes (ie gene id/names) 78 | verbose : bool, optional (default: True) 79 | print progress messages 80 | 81 | Returns 82 | ------- 83 | coo : coo_matrix 84 | cell x gene sparse count matrix 85 | genes : pd.DataFrame 86 | ngenes x ngene_cols array of gene names/attributes 87 | """ 88 | assert( ngene_cols > 0 ) 89 | gene_cols = list(range(ngene_cols)) 90 | 91 | if filename.endswith('.gz') or filename.endswith('.bz2'): 92 | msg = '.....' 93 | msg+= 'WARNING: Input file {} is compressed. '.format(filename) 94 | msg+= 'It may be faster to manually decompress before loading.' 95 | print(msg) 96 | 97 | df = pd.read_csv(filename, header=None, memory_map=True, 98 | delim_whitespace=True) 99 | 100 | genes = df[gene_cols] 101 | dense = df.drop(columns=gene_cols).values.T 102 | nz = np.nonzero(dense) 103 | coo = coo_matrix((dense[nz], nz), shape=dense.shape, dtype=np.int32) 104 | else: 105 | genes, rows, cols, values = [], [], [], [] 106 | 107 | # load row by row to conserve memory + actually often faster 108 | with open(filename) as f: 109 | # for each gene/row 110 | for g, l in enumerate(f): 111 | llist = l.split() 112 | genes.append(llist[:ngene_cols]) 113 | r, c, val = [], [], [] 114 | 115 | # for each cell/column 116 | for cell,v in enumerate(llist[ngene_cols:]): 117 | if v != '0': 118 | r.append(int(cell)) 119 | c.append(int(g)) 120 | val.append(int(v)) 121 | 122 | rows.extend(r) 123 | cols.extend(c) 124 | values.extend(val) 125 | 126 | if verbose and ((g+1)%10000 == 0) and (g!=0): 127 | print('\tloaded {} genes for {} cells'.format( 128 | g+1, cell+1)) 129 | 130 | ncells, ngenes = len(llist[ngene_cols:]), g+1 131 | coo = coo_matrix((np.array(values), (np.array(rows),np.array(cols))), 132 | shape=(ncells,ngenes), dtype=np.int32) 133 | genes = pd.DataFrame(genes) 134 | 135 | return coo, genes 136 | 137 | 138 | def min_cells_expressing_mask(counts, min_cells, verbose=True): 139 | """Get a mask for genes expressed by a minimum number of cells 140 | 141 | Parameters 142 | ---------- 143 | counts : ndarray or coo_matrix 144 | A cell x gene coo_matrix of counts 145 | min_cells: numeric 146 | the minimum number (if int) or proportion (if float between 0 and 1) 147 | of cells in which we must observe transcripts of the gene for 148 | inclusion in the dataset. 
If `min_cells` is between 0 and 1, sets 149 | the threshold to round(min_cells * ncells) 150 | verbose : bool, default True 151 | if True, print the number of cells when a numbr between 0 and 1 is given 152 | 153 | Returns 154 | ------- 155 | passing_mask : ndarray 156 | boolean array of passing genes 157 | 158 | TODO verbose option + return min_cells 159 | """ 160 | if min_cells < 1 and min_cells > 0: 161 | min_cells_frac = min_cells 162 | min_cells = round(min_cells_frac * counts.shape[0]) 163 | msg = '.....requiring {}% of cells = {} cells observed expressing for' 164 | msg += ' gene inclusion' 165 | print(msg.format(100 * min_cells_frac, min_cells)) 166 | return counts.astype(bool).sum(axis=0).A[0,:] >= min_cells 167 | 168 | 169 | def genelist_mask(candidates, genelist, whitelist=True, split_on_dot=True): 170 | """Get a mask for genes on or off a list 171 | 172 | Parameters 173 | ---------- 174 | candidates : pd.Series 175 | Candidate genes (from matrix) 176 | genelist : pd.Series 177 | List of genes to filter against 178 | whitelist : bool, default True 179 | Is the gene list a whitelist (True), where only genes on it should 180 | be kept or a blacklist (False) where all genes on it should be 181 | excluded 182 | split_on_dot : bool, default True 183 | If True, remove part of gene identifier after '.'. We do this by 184 | default because ENSEMBL IDs contain version numbers after periods. 185 | 186 | Returns 187 | ------- 188 | passing_mask : ndarray 189 | boolean array of passing genes 190 | """ 191 | if split_on_dot: 192 | candidates = candidates.str.split('.').str[0] 193 | genelist = genelist.str.split('.').str[0] 194 | 195 | if whitelist: 196 | mask = candidates.isin(genelist) 197 | else: 198 | mask = ~candidates.isin(genelist) 199 | 200 | return mask.values 201 | 202 | 203 | def subsample_cell_ixs(choices, nselect, group_ids=None, max_group_frac=0.5): 204 | """Randomly select cells, potentially accounting for groups 205 | 206 | Parameters 207 | ---------- 208 | choices : ndarray or int 209 | Indices of cells to choose from. If int is give, indices assumend 210 | to be np.arange(`choices`) 211 | nselect : int 212 | number of indices to return 213 | group_ids : ndarray, optional 214 | Group ids of cells. len(`group_ids`) must == `choices` if `choices` 215 | is an int, and == len(`choices`) otherwise. If `group_ids` is given, 216 | selected cells will be distributed approximately evenly over the 217 | labels under the constraint that at most floor(group_size * 218 | `max_group_frac`) can be selected from a group. 219 | max_group_frac : float, optional (default: 0.5) 220 | If `group_ids` given, the maximum fraction of cells in a group that 221 | can be selected. 222 | 223 | Returns 224 | ------- 225 | selected_ix : ndarray 226 | 1d array of selected ids (sorted). 
227 |     """
228 |     if isinstance(choices, int):
229 |         choices = np.arange(choices)
230 | 
231 |     if group_ids is None:
232 |         return np.sort(np.random.choice(choices, nselect, replace=False))
233 |     else:
234 |         assert len(group_ids) == len(choices)
235 | 
236 |     label, remaining = np.unique(group_ids, return_counts=True)
237 |     constraint = np.floor( remaining * max_group_frac ).astype(int)
238 | 
239 |     selected, n_remain = [], nselect
240 |     # while unconstrained cells left and more requested
241 |     while np.sum(constraint) > 0 and n_remain > 0:
242 |         # calculate goals given remaining cells to select and
243 |         # unconstrained cells left
244 |         weights = (constraint > 0) / (constraint > 0).sum()
245 |         goal_floor = np.floor(weights * n_remain).astype(int)
246 |         remainder = np.sum(np.ceil(weights * n_remain) - goal_floor
247 |                 ).astype(int)
248 |         goal = goal_floor + np.random.multinomial(remainder, weights)
249 |         # for each group
250 |         for i in range(len(remaining)):
251 |             # if there are unconstrained cells left in the group
252 |             if constraint[i] > 0:
253 |                 my_nchoose = min(goal[i], constraint[i])
254 |                 my_choices = np.setdiff1d(choices[group_ids == label[i]],
255 |                         selected)
256 |                 # select the cells
257 |                 chosen = np.random.choice(my_choices, my_nchoose,
258 |                         replace=False)
259 |                 selected.extend(list(chosen))
260 |                 # update constraint
261 |                 constraint[i] -= my_nchoose
262 |                 n_remain -= my_nchoose
263 |     if n_remain > 0:
264 |         msg = "Could not select {} cells".format(nselect)
265 |         msg += " with given group_ids under constraint max_group_frac"
266 |         msg += "={}. {} cells selected.".format(max_group_frac, nselect - n_remain)
267 |         warnings.warn(msg, UserWarning)
268 | 
269 |     return np.sort(selected)
270 | 
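# Usage sketch (illustrative values, not part of the library's API):
# draw a 100-cell validation split from a 1000-cell dataset, spreading picks
# across cluster labels where possible:
#
#   labels = np.loadtxt('cluster_labels.txt')   # hypothetical file, 1 id/cell
#   val_ix = subsample_cell_ixs(1000, 100, group_ids=labels,
#                               max_group_frac=0.5)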
271 | 
272 | def split_validation_cells(X, nselect, group_id_file='', max_group_frac=0.5,
273 |         verbose=True):
274 |     """ Split train and validation cells, potentially accounting for groups
275 | 
276 |     Parameters
277 |     ----------
278 |     X : coo_matrix
279 |         Matrix to select validation cells from
280 |     nselect : int
281 |         Number of cells to select
282 |     group_id_file : str, optional
283 |         File containing group ids. Should be loadable with np.loadtxt
284 |     max_group_frac : float, optional (default: 0.5)
285 |         If `group_id_file` given, the maximum fraction of cells in a group that
286 |         can be selected.
287 |     verbose : bool, optional (default: True)
288 |         Verbose output
289 | 
290 |     Returns
291 |     -------
292 |     Xtrain : coo_matrix
293 |         X with validation rows removed
294 |     Xvalidation : coo_matrix
295 |         Selected rows from X
296 |     validation_ix : ndarray
297 |         Indexes of selected rows in the input matrix `X`
298 |     """
299 |     # load groups
300 |     if group_id_file is not None and len(group_id_file):
301 |         group_ids = np.loadtxt(group_id_file)
302 |     else:
303 |         group_ids = None
304 | 
305 |     # select cells
306 |     selected_ids = subsample_cell_ixs(X.shape[0], nselect, group_ids,
307 |             max_group_frac)
308 | 
309 |     # write a message
310 |     if verbose:
311 |         ncells = len(selected_ids)
312 |         msg = '.....{} cells selected'.format(ncells)
313 |         if group_ids is not None:
314 |             msg += ' ~evenly from groups in {}'.format(group_id_file)
315 |             msg += ' under constraint max_group_frac={}'.format(max_group_frac)
316 |             msg += '\n\tGroup counts:'
317 |             ids, id_counts = np.unique(group_ids[selected_ids],
318 |                     return_counts=True)
319 |             for i, c in zip(ids, id_counts):
320 |                 msg += '\n\t\t[{}] {}'.format(i, c)
321 |         print(msg)
322 | 
323 |     # split cells
324 |     Xvalidation, Xtrain = split_coo_rows(X, selected_ids)
325 |     return Xtrain, Xvalidation, selected_ids
326 | 
327 | 
328 | def load_and_filter(infile, min_cells, whitelist='', blacklist='',
329 |         filter_by_gene_name=False, no_split_on_dot=False, verbose=True):
330 |     """ Composite of loading and filtering intended for use by CLI
331 | 
332 |     Parameters
333 |     ----------
334 |     infile : str
335 |         Input data. Currently accepts either: (1) a whitespace-delimited gene
336 |         by cell UMI count matrix with 2 leading columns of gene attributes
337 |         (ENSEMBL_ID and GENE_NAME respectively), or (2) a loom file with at
338 |         least one of the row attributes `Accession` or `Gene`, where `Accession`
339 |         is an ENSEMBL id and `Gene` is the name.
340 |     min_cells : float or int
341 |         Minimum number of cells in which we must observe at least one transcript
342 |         of a gene for the gene to pass filtering. If 0 < `min_cells` < 1, sets
343 |         threshold to be `min_cells` * ncells, rounded to the nearest integer.
344 |     whitelist : str, optional
345 |         Tab-delimited file where first column contains ENSEMBL gene ids to
346 |         accept, and second column contains corresponding gene names. If given,
347 |         genes not on the whitelist are filtered from the input matrix.
348 |         Superseded by blacklist. Default None.
349 |     blacklist : str, optional
350 |         Tab-delimited file where first column contains ENSEMBL gene ids to
351 |         exclude, and second column is the corresponding gene name. Only
352 |         performed if file given. Genes on the blacklist are excluded even if
353 |         they are also on the whitelist.
354 |     filter_by_gene_name : bool, optional
355 |         Use gene name rather than ENSEMBL id to filter (with whitelist or
356 |         blacklist). Useful for datasets where only gene symbols are given.
357 |         Applies to both whitelist and blacklist. Used by default when input
358 |         is a loom file. Default False.
359 |     no_split_on_dot : bool, optional
360 |         Don't split gene symbol or name on period before filtering white and
361 |         blacklist. We do this by default for ENSEMBL ids. Default False.
362 |     verbose : bool, optional
363 |         Print progress messages. Default True
364 | 
365 |     Returns
366 |     -------
367 |     filtered : ndarray
368 |     genes : pd.DataFrame
369 | 
370 |     Raises
371 |     ------
372 |     ValueError
373 |     """
374 |     if verbose:
375 |         print('Loading data.....')
376 | 
377 |     if infile.endswith('.loom'):
378 |         umis, genes = load_loom(infile)
379 |         if 'Accession' in genes.columns:
380 |             candidate_names = genes['Accession']
381 |             genelist_col = 0
382 |         elif 'Gene' in genes.columns:
383 |             candidate_names = genes['Gene']
384 |             genelist_col = 1
385 |         else:
386 |             msg = 'loom files must have at least one of the row '
387 |             msg+= 'attributes: `Gene` or `Accession`.'
388 |             raise ValueError(msg)
389 |     else:
390 |         umis, genes = load_txt(infile)
391 |         genelist_col = 1 if filter_by_gene_name else 0
392 |         candidate_names = genes[genelist_col]
393 |     ncells, ngenes = umis.shape
394 |     if verbose:
395 |         print('.....found {} cells and {} genes'.format(ncells, ngenes))
396 |         print('Generating masks for filtering.....')
397 | 
398 |     if min_cells < 0:
399 |         raise ValueError('min_cells must be >= 0')
400 |     mask = min_cells_expressing_mask(umis, min_cells)
401 |     if whitelist is not None and len(whitelist):
402 |         whitelist = pd.read_csv(whitelist, delim_whitespace=True, header=None)
403 |         mask &= genelist_mask(candidate_names, whitelist[genelist_col],
404 |                 split_on_dot = not no_split_on_dot)  # logical `not`: bitwise ~ on a bool gives -1/-2, which is always truthy
405 |     if blacklist is not None and len(blacklist):
406 |         blacklist = pd.read_csv(blacklist, delim_whitespace=True, header=None)
407 |         mask &= genelist_mask(candidate_names, blacklist[genelist_col],
408 |                 whitelist=False, split_on_dot = not no_split_on_dot)
409 | 
410 |     if verbose:
411 |         print('Filtering data.....')
412 |     genes = genes.loc[mask]
413 |     filtered = umis.tolil()[:,mask].tocoo() # must convert to apply mask
414 | 
415 |     return filtered, genes
416 | 
417 | 
418 | def load_like(infile, reference, by_gene_name=False,
419 |         no_split_on_dot=False):
420 |     """Load expression matrix, selecting genes and ordering like a reference
421 |     gene list
422 | 
423 |     Parameters
424 |     ----------
425 |     infile : str
426 |         Input data. Currently accepts either: (1) a whitespace-delimited gene
427 |         by cell UMI count matrix with 2 leading columns of gene attributes
428 |         (ENSEMBL_ID and GENE_NAME respectively), or (2) a loom file with at
429 |         least one of the row attributes `Accession` or `Gene`, where `Accession`
430 |         is an ENSEMBL id and `Gene` is the name.
431 |     reference : str
432 |         Tab-delimited file where first column contains ENSEMBL gene ids and
433 |         second column contains corresponding gene names. Returned array
434 |         will contain exactly these genes, in this order, for counts in cells
435 |         in `infile`
436 |     by_gene_name : bool, optional (Default: False)
437 |         match files by gene name (the second column, 1-indexed)
438 |     no_split_on_dot : bool, optional
439 |         Don't split gene symbol or name on period before filtering white and
440 |         blacklist. We do this by default for ENSEMBL ids. Default False.
441 | 
442 |     Returns
443 |     -------
444 |     reordered_coo : coo_matrix
445 |         cell x gene sparse count matrix with genes filtered and ordered like
446 |         reference
447 |     reordered_genes : pd.DataFrame
448 |         ngenes x ngene_cols array of gene names/attributes. Should basically be
Should basically by 449 | a duplicate of reference 450 | 451 | 452 | Raises 453 | ------ 454 | ValueError : if a gene from the reference is not in infile 455 | """ 456 | if infile.endswith('.loom'): 457 | umis, genes = load_loom(infile) 458 | if 'Accession' in genes.columns: 459 | candidate_names = genes['Accession'] 460 | genelist_col = 0 461 | elif 'Gene' in genes.columns: 462 | candidate_names = genes['Gene'] 463 | genelist_col = 1 464 | else: 465 | msg = 'loom files must have at least one of the row ' 466 | msg+= 'attributes: `Gene` or `Accession`.' 467 | raise ValueError(msg) 468 | else: 469 | umis, genes = load_txt(infile) 470 | genelist_col = 1 if by_gene_name else 0 471 | candidate_names = genes[genelist_col] 472 | ncells, ngenes = umis.shape 473 | 474 | # load the reference order 475 | ref = pd.read_csv(reference, delim_whitespace=True, header=None 476 | )[genelist_col] 477 | # select input column and process names unless told not to 478 | if no_split_on_dot: 479 | ingenes = candidate_names 480 | else: 481 | ref = ref.str.split('.').str[0] 482 | ingenes = candidate_names.str.split('.').str[0] 483 | 484 | perm = [] 485 | try: 486 | for g in ref: 487 | perm.append(np.where(ingenes==g)[0][0]) 488 | except IndexError as e: 489 | msg = 'Reference gene `{}` in reference `{}` not found in infile `{}`' 490 | msg = msg.format(g, reference, infile) 491 | raise ValueError(msg) 492 | 493 | reordered_umis = umis.tocsr()[:,perm].tocoo() 494 | reordered_genes = genes.loc[perm] 495 | return reordered_umis, reordered_genes 496 | -------------------------------------------------------------------------------- /schpf/scHPF_.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from copy import deepcopy 3 | from warnings import warn 4 | from functools import partial 5 | from multiprocessing import cpu_count 6 | 7 | import numpy as np 8 | from scipy.sparse import coo_matrix 9 | from scipy.special import digamma, gammaln, psi 10 | try: 11 | from scipy.misc import logsumexp 12 | except ImportError: 13 | from scipy.special import logsumexp 14 | 15 | from sklearn.base import BaseEstimator 16 | import joblib 17 | from joblib import Parallel, delayed 18 | 19 | 20 | # TODO warn if can't import, and allow computation with slow 21 | from schpf.hpf_numba import * 22 | from schpf.util import minibatch_ix_generator 23 | import schpf.loss as ls 24 | import schpf 25 | 26 | 27 | class HPF_Gamma(object): 28 | """Gamma variational distributions 29 | 30 | Parameters 31 | ---------- 32 | vi_shape: np.ndarray 33 | Gamma shape parameter for the variational Gamma distributions. 34 | Ndarray.shape[0] must match `vi_rate` 35 | vi_rate: np.ndarray 36 | Gamma rate parameter for the variational Gamma distributions. 37 | Ndarray.shape[0] must match `vi_shape` 38 | 39 | Attributes 40 | ---------- 41 | vi_shape : ndarray 42 | vi_rate : ndarray 43 | dims : ndarray 44 | The shape of vi_shape and vi_rate 45 | dtype : dtype 46 | dtype of both vi_shape and vi_rate 47 | """ 48 | 49 | @staticmethod 50 | def random_gamma_factory(dims, shape_prior, rate_prior, dtype=np.float64): 51 | """Factory method to randomly initialize variational distributions 52 | 53 | Parameters 54 | ---------- 55 | dims: list-like 56 | Numpy-style shape of the matrix of Gammas. 57 | shape_prior: float 58 | Prior for variational Gammas' shapes. Must be greater than 0. 59 | rate_prior: float 60 | Prior for variational Gammas' rates. Must be greater than 0. 
61 | 62 | Returns 63 | ------- 64 | A randomly initialized HPF_Gamma instance 65 | """ 66 | vi_shape = np.random.uniform(0.5 * shape_prior, 1.5 * shape_prior, 67 | dims).astype(dtype) 68 | vi_rate = np.random.uniform(0.5 * rate_prior, 1.5 * rate_prior, 69 | dims).astype(dtype) 70 | return HPF_Gamma(vi_shape,vi_rate) 71 | 72 | 73 | def __init__(self, vi_shape, vi_rate): 74 | """Initializes HPF_Gamma with variational shape and rates""" 75 | assert(vi_shape.shape == vi_rate.shape) 76 | assert(vi_shape.dtype == vi_rate.dtype) 77 | assert(np.all(vi_shape > 0)) 78 | assert(np.all(vi_rate > 0)) 79 | self.vi_shape = vi_shape 80 | self.vi_rate = vi_rate 81 | self.dtype = vi_shape.dtype 82 | 83 | 84 | def __eq__(self, other): 85 | if isinstance(other, self.__class__): 86 | shape_equal = np.array_equal(self.vi_shape, other.vi_shape) 87 | rate_equal = np.array_equal(self.vi_rate, other.vi_rate) 88 | dtype_equal = self.dtype == other.dtype 89 | return shape_equal and rate_equal and dtype_equal 90 | return False 91 | 92 | 93 | @property 94 | def dims(self): 95 | assert self.vi_shape.shape == self.vi_rate.shape 96 | return self.vi_shape.shape 97 | 98 | 99 | @property 100 | def e_x(self): 101 | """Expected value of the random variable(s) given variational 102 | distribution(s) 103 | """ 104 | return self.vi_shape / self.vi_rate 105 | 106 | 107 | @property 108 | def e_logx(self): 109 | """Expectation of the log of random variable given variational 110 | distribution(s)""" 111 | return digamma(self.vi_shape) - np.log(self.vi_rate) 112 | 113 | 114 | @property 115 | def entropy(self): 116 | """Entropy of variational Gammas""" 117 | return self.vi_shape - np.log(self.vi_rate) \ 118 | + gammaln(self.vi_shape) \ 119 | + (1 - self.vi_shape) * digamma(self.vi_shape) 120 | 121 | 122 | def sample(self, nsamples=1): 123 | """Sample from variational distributions 124 | 125 | Parameters 126 | ---------- 127 | nsamples: int (optional, default 1) 128 | Number of samples to take. 129 | 130 | Returns 131 | ------- 132 | X_rep : np.ndarray 133 | An ndarray of samples from the variational distributions, where 134 | the last dimension is the number of samples `nsamples` 135 | """ 136 | samples = [] 137 | for i in range(nsamples): 138 | samples.append(np.random.gamma(self.vi_shape, 1/self.vi_rate).T) 139 | return np.stack(samples).T 140 | 141 | 142 | def combine(self, other, other_ixs): 143 | """ Combine with another HPF_Gamma 144 | 145 | Useful for combining variational distributions from training data with 146 | variational distributions from cells that were projected onto frozen 147 | beta and eta 148 | 149 | Parameters 150 | ---------- 151 | other : `HPF_Gamma` 152 | Other HPF_Gamma to merge with 153 | other_ixs : list or ndarray 154 | Ordered indices of other in the merged HPF_Gamma. Must have len 155 | equal to other.shape[0]. Must have a maximum value less than 156 | self.dims[0] + other.shape[0]. May not have any repeat values. 
157 | 
158 |         Returns
159 |         -------
160 |         combined_model : `HPF_Gamma`
161 |         """
162 |         assert other.dims[0] == len(other_ixs)
163 |         assert len(np.unique(other_ixs)) == len(other_ixs)
164 |         assert self.dims[0] + other.dims[0] > np.max(other_ixs)
165 | 
166 |         new_dims = [self.dims[0]+other.dims[0], *self.dims[1:]]
167 |         self_ixs = np.setdiff1d(np.arange(new_dims[0]),
168 |                 other_ixs)
169 | 
170 |         new_vi_shape = np.empty(new_dims, dtype=self.dtype)
171 |         new_vi_shape[self_ixs] = self.vi_shape
172 |         new_vi_shape[other_ixs] = other.vi_shape
173 | 
174 |         new_vi_rate = np.empty(new_dims, dtype=self.dtype)
175 |         new_vi_rate[self_ixs] = self.vi_rate
176 |         new_vi_rate[other_ixs] = other.vi_rate
177 | 
178 |         return HPF_Gamma(new_vi_shape, new_vi_rate)
179 | 
180 | 
181 | class scHPF(BaseEstimator):
182 |     """scHPF as described in Levitin et al., Molecular Systems Biology 2019
183 | 
184 |     Parameters
185 |     ----------
186 |     nfactors: int
187 |         Number of factors (K)
188 |     a: float, (optional, default 0.3)
189 |         Hyperparameter a
190 |     ap: float (optional, default 1.0)
191 |         Hyperparameter a'
192 |     bp: float (optional, default None)
193 |         Hyperparameter b'. Set empirically from observed data if not
194 |         given.
195 |     c: float, (optional, default 0.3)
196 |         Hyperparameter c
197 |     cp: float (optional, default 1.0)
198 |         Hyperparameter c'
199 |     dp: float (optional, default None)
200 |         Hyperparameter d'. Set empirically from observed data if not
201 |         given.
202 |     min_iter: int (optional, default 30):
203 |         Minimum number of iterations for training.
204 |     max_iter: int (optional, default 1000):
205 |         Maximum number of iterations for training.
206 |     check_freq: int (optional, default 10)
207 |         Number of training iterations between calculating loss.
208 |     epsilon: float (optional, default 0.001)
209 |         Percent change of loss for convergence.
210 |     better_than_n_ago: int (optional, default 5)
211 |         Stop condition if loss is getting worse. Stops training if loss
212 |         is worse than `better_than_n_ago`*`check_freq` training steps
213 |         ago and getting worse.
214 |     xi: HPF_Gamma (optional, default None)
215 |         Variational distributions for xi
216 |     theta: HPF_Gamma (optional, default None)
217 |         Variational distributions for theta
218 |     eta: HPF_Gamma (optional, default None)
219 |         Variational distributions for eta
220 |     beta: HPF_Gamma (optional, default None)
221 |         Variational distributions for beta
222 |     verbose: bool (optional, default True)
223 |         Print messages at each check_freq
224 |     """
225 |     def __init__(
226 |             self,
227 |             nfactors,
228 |             a=0.3,
229 |             ap=1,
230 |             bp=None,
231 |             c=0.3,
232 |             cp=1,
233 |             dp=None,
234 |             min_iter=30,
235 |             max_iter=1000,
236 |             check_freq=10,
237 |             epsilon=0.001,
238 |             better_than_n_ago=5,
239 |             dtype=np.float64,
240 |             xi=None,
241 |             theta=None,
242 |             eta=None,
243 |             beta=None,
244 |             loss=None,
245 |             verbose=True,
246 |             ):
247 |         """Initialize HPF instance"""
248 |         self.version = schpf.__version__
249 |         self.nfactors = nfactors
250 |         self.a = a
251 |         self.ap = ap
252 |         self.bp = bp
253 |         self.c = c
254 |         self.cp = cp
255 |         self.dp = dp
256 |         self.min_iter = min_iter
257 |         self.max_iter = max_iter
258 |         self.check_freq = check_freq
259 |         self.epsilon = epsilon
260 |         self.better_than_n_ago = better_than_n_ago
261 |         self.dtype = dtype
262 |         self.verbose = verbose
263 | 
264 |         self.xi = xi
265 |         self.eta = eta
266 |         self.theta = theta
267 |         self.beta = beta
268 | 
269 |         self.loss = [] if loss is None else loss  # None default avoids sharing a mutable list
270 | 
271 | 
272 |     @property
273 |     def a(self):
274 |         try:
275 |             return self._a
276 |         except AttributeError:
277 |             msg = 'Automatically using a=0.3. If you are loading a model'
278 |             msg += ' generated with scHPF version < 0.5 and set a custom value'
279 |             msg += ' for a, you must manually reset it and re-save the model.'
280 |             warn(msg, RuntimeWarning)
281 |             return 0.3
282 | 
283 | 
284 |     @a.setter
285 |     def a(self, val):
286 |         if val == -2:
287 |             if self.nfactors is None:
288 |                 raise ValueError('Can only set a as a function of nfactors when'
289 |                         ' nfactors is not None')
290 |             else:
291 |                 self._a = 1/np.sqrt(self.nfactors)
292 |         else:
293 |             assert val > 0
294 |             self._a = val
295 | 
296 | 
297 |     @property
298 |     def c(self):
299 |         try:
300 |             return self._c
301 |         except AttributeError:
302 |             msg = 'Automatically using c=0.3. If you are loading a model '
303 |             msg += ' generated with scHPF version < 0.5 and set a custom value'
304 |             msg += ' for c, you must manually reset it and re-save the model.'
305 |             warn(msg, RuntimeWarning)
306 |             return 0.3
307 | 
308 | 
309 |     @c.setter
310 |     def c(self, val):
311 |         if val == -2:
312 |             if self.nfactors is None:
313 |                 raise ValueError('Can only set c as a function of nfactors when'
314 |                         ' nfactors is not None')
315 |             else:
316 |                 self._c = 1/np.sqrt(self.nfactors)
317 |         else:
318 |             assert val > 0
319 |             self._c = val
320 | 
321 | 
322 |     @property
323 |     def ngenes(self):
324 |         return self.eta.dims[0] if self.eta is not None else None
325 | 
326 | 
327 |     @property
328 |     def ncells(self):
329 |         return self.xi.dims[0] if self.xi is not None else None
330 | 
331 | 
332 |     def cell_score(self, xi=None, theta=None):
333 |         """Get cell score from xi and theta
334 | 
335 |         Parameters
336 |         ----------
337 |         xi : HPF_Gamma, (optional, default self.xi)
338 |             variational distributions for xi
339 |         theta : HPF_Gamma, (optional, default self.theta)
340 |             variational distributions for theta
341 | 
342 |         Returns
343 |         -------
344 |         cell_score : ndarray
345 |             ncell x nfactor array of cell scores
346 |         """
347 |         xi = self.xi if xi is None else xi
348 |         theta = self.theta if theta is None else theta
349 |         return self._score(xi, theta)
350 | 
351 | 
352 |     def gene_score(self, eta=None, beta=None):
353 |         """Get gene score from eta and beta
354 | 
355 |         Parameters
356 |         ----------
357 |         eta : HPF_Gamma, (optional, default self.eta)
358 |             variational distributions for eta
359 |         beta : HPF_Gamma, (optional, default self.beta)
360 |             variational distributions for beta
361 | 
362 |         Returns
363 |         -------
364 |         gene_score : ndarray
365 |             ngene x nfactor array of gene scores
366 |         """
367 |         eta = self.eta if eta is None else eta
368 |         beta = self.beta if beta is None else beta
369 |         return self._score(eta, beta)
370 | 
371 | 
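    # Usage sketch (illustrative, for a fitted model `m`):
    #   m.cell_score()  ->  (ncells, nfactors) array, E[theta] scaled by E[xi]
    #   m.gene_score()  ->  (ngenes, nfactors) array, E[beta] scaled by E[eta]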
372 |     def pois_llh_pointwise(self, X, theta=None, beta=None):
373 |         """Poisson log-likelihood (for each nonzero entry)
374 |
375 |         Attempts to use numba/cffi/gsl, and falls back to numpy otherwise
376 |
377 |         Parameters
378 |         ----------
379 |         X: coo_matrix
380 |             Data to compute Poisson log likelihood of. Assumed to be nonzero.
381 |         theta : HPF_Gamma, optional
382 |             If given, use for theta instead of self.theta
383 |         beta : HPF_Gamma, optional
384 |             If given, use for beta instead of self.beta
385 |
386 |         Returns
387 |         -------
388 |         llh: ndarray
389 |         """
390 |         theta = self.theta if theta is None else theta
391 |         beta = self.beta if beta is None else beta
392 |         return ls.pois_llh_pointwise(X=X, theta=theta, beta=beta)
393 |
394 |
395 |     def cellmean_negative_pois_llh(self, X, theta=None, beta=None):
396 |         """Convenience method for mean negative llh of nonzero entries,
397 |         averaged by cell
398 |
399 |         """
400 |         theta = self.theta if theta is None else theta
401 |         assert(theta.vi_shape.shape[0] == X.shape[0])
402 |         beta = self.beta if beta is None else beta
403 |         llh_pointwise = self.pois_llh_pointwise(X=X, theta=theta, beta=beta)
404 |
405 |         llh_csr = coo_matrix((-llh_pointwise, (X.row,X.col)), shape=X.shape).tocsr()
406 |         sums = llh_csr.sum(axis=1).A1
407 |         counts = np.diff(llh_csr.indptr)
408 |         averages = sums/counts
409 |
410 |         assert(averages.shape[0] == theta.vi_shape.shape[0])
411 |         return averages
412 |
413 |
414 |
415 |
416 |     def mean_negative_pois_llh(self, X, theta=None, beta=None, **kwargs):
417 |         """Convenience method for mean negative llh of nonzero entries
418 |
419 |         """
420 |         theta = self.theta if theta is None else theta
421 |         beta = self.beta if beta is None else beta
422 |         return ls.mean_negative_pois_llh(X=X, theta=theta, beta=beta)
423 |
424 |
425 |     def fit(self, X, **kwargs):
426 |         """Fit an scHPF model
427 |
428 |         Parameters
429 |         ----------
430 |         X: coo_matrix
431 |             Data to fit
432 |         loss_function : function, optional (Default: None)
433 |             Loss function to use for the fit. Set to the negative Poisson
434 |             likelihood of X if not given
435 |         """
436 |         (bp, dp, xi, eta, theta, beta, loss) = self._fit(
437 |                 X, **kwargs)
438 |         self.bp = bp
439 |         self.dp = dp
440 |         self.xi = xi
441 |         self.eta = eta
442 |         self.theta = theta
443 |         self.beta = beta
444 |         self.loss = loss
445 |         return self
446 |
447 |
448 |     def project(self, X, recalc_bp=False, replace=False, min_iter=2, max_iter=50,
449 |             check_freq=2, **kwargs):
450 |         """Project new cells into latent space
451 |
452 |         Gene distributions (beta and eta) are fixed.
453 |
454 |         Parameters
455 |         ----------
456 |         X: coo_matrix
457 |             Data to project. Should have self.ngenes columns
458 |         recalc_bp : bool, optional (Default: False)
459 |             Recalculate the value of the empirical hyperparameter bp. Do
460 |             not do this for withheld test data.
461 |         replace: bool, optional (Default: False)
462 |             Replace theta and xi with projected values in self. Note that
463 |             loss will not be updated
464 |         min_iter: int, (Default: 2)
465 |             Replaces self.min_iter if not None. Few iterations are needed
466 |             because beta and eta are fixed.
467 |         max_iter: int, (Default: 50)
468 |             Replaces self.max_iter if not None. Few iterations are needed
469 |             because beta and eta are fixed.
470 |         check_freq: int, optional (Default: 2)
471 |             Number of training iterations between calculating loss.
472 |
473 |         Returns
474 |         -------
475 |         result : scHPF or list
476 |             If replace=`False`, an scHPF object with variational
477 |             distributions theta and xi (for the new cells in `X`) and the
478 |             same variational distributions as self for gene distributions
479 |             beta and eta. If replace=`True`, the loss for the projection
480 |             (xi and theta will be updated in self but not returned). In both
481 |             cases, bp will only be updated for the new data if self.bp is
482 |             None or recalc_bp=`True`.
483 |
484 |         """
485 |         if replace and recalc_bp:
486 |             msg = 'Cannot replace `bp` with recalculated value'
487 |             raise ValueError(msg)
488 |
489 |         model = self if replace else deepcopy(self)
490 |         if recalc_bp: model.bp = None
491 |         (bp, _, xi, _, theta, _, loss) = model._fit(X,
492 |                 min_iter=min_iter, max_iter=max_iter, check_freq=check_freq,
493 |                 freeze_genes=True, **kwargs)
494 |         if replace:
495 |             self.xi = xi
496 |             self.theta = theta
497 |             return loss
498 |         else:
499 |             model.bp = bp
500 |             model.xi = xi
501 |             model.theta = theta
502 |             model.loss = loss
503 |             return model
504 |
505 |
506 |     def _score(self, capacity, loading):
507 |         """Get the hierarchically normalized loadings which we call the cell
508 |         or gene score in the scHPF paper
509 |
510 |         Parameters
511 |         ----------
512 |         capacity : HPF_Gamma
513 |             xi or eta
514 |         loading : HPF_Gamma
515 |             theta or beta
516 |
517 |
518 |         Returns
519 |         -------
520 |         score : ndarray
521 |         """
522 |         assert(loading.dims[0] == capacity.dims[0])
523 |         return loading.e_x * capacity.e_x[:,None]
524 |
525 |
526 |     def _fit(self, X, freeze_genes=False, reinit=True, loss_function=None,
527 |             min_iter=None, max_iter=None, epsilon=None, check_freq=None,
528 |             single_process=False, checkstep_function=None, verbose=None,
529 |             batchsize=None, beta_theta_simultaneous=False,
530 |             loss_smoothing=1):
531 |         """Combined internal fit/transform function
532 |
533 |         Parameters
534 |         ----------
535 |         X: coo_matrix
536 |             Data to fit
537 |         freeze_genes: bool, (optional, default False)
538 |             If True, do not update gene variational distributions eta and beta
539 |         reinit: bool, (optional, default True)
540 |             Randomly initialize variational distributions even if they
541 |             already exist. Superseded by freeze_genes. Does not affect
542 |             self.bp and self.dp which will only be set empirically if they
543 |             are None
544 |         loss_function : function, (optional, default None)
545 |             Function to use for loss, which is assumed to be nonzero and
546 |             decrease with improvement. Must accept hyperparameters a, ap,
547 |             bp, c, cp, and dp and the variational distributions for xi, eta,
548 |             theta, and beta even if only some of these values are used.
549 |             Should have an internal reference to any data used (_fit will
550 |             not pass it any data). If `loss_function` is not given or equal
551 |             to None, the mean negative log likelihood of nonzero values in
552 |             training data `X` is used.
553 |         min_iter: int (optional, default None)
554 |             Replaces self.min_iter if given. Useful when projecting
555 |             new data onto an existing scHPF model.
556 |         max_iter: int (optional, default None)
557 |             Replaces self.max_iter if given. Useful when projecting
558 |             new data onto an existing scHPF model.
559 |         epsilon: float (optional, default None)
560 |             Replaces self.epsilon if given. Percent change of loss for
561 |             convergence.
562 |         check_freq : int, optional (Default: None)
563 |             Replaces self.check_freq if given. Useful when projecting
564 |             new data onto an existing scHPF model.
565 |         single_process : bool, optional (Default: False)
566 |             Use single-threaded versions of updates
567 |         checkstep_function : function (optional, default None)
568 |             A function that takes arguments bp, dp, xi, eta, theta, beta,
569 |             and t and, if given, is called at check_interval. Intended use
570 |             is to check additional stats during training, potentially with
571 |             hardcoded data, but is unrestricted. Use at own risk.
572 |         verbose: bool (optional, default None)
573 |             If not None, overrides self.verbose
574 |         batchsize: int, optional (Default: None)
575 |             number of cells per batch. All cells are used when None or 0
576 |         beta_theta_simultaneous: bool, optional (Default False)
577 |             Whether updates for beta and theta should be computed simultaneously.
578 |             If False, beta is updated first, and theta is updated using
579 |             that beta
580 |         loss_smoothing: int, optional (Default: 1)
581 |             Smooth loss up to `loss_smoothing` check frequencies ago. 1 results
582 |             in no smoothing. Intended to be used with batching when assessing
583 |             convergence based on training loss, where a good value might be
584 |             int(ncells/n_batches)
585 |
586 |         Returns
587 |         -------
588 |         bp: float
589 |             Empirically set value for bp
590 |         dp: float
591 |             Empirically set value for dp. Unchanged if freeze_genes.
592 |         xi: HPF_Gamma
593 |             Learned variational distributions for xi
594 |         eta: HPF_Gamma
595 |             Learned variational distributions for eta. Unchanged if
596 |             freeze_genes.
597 |         theta: HPF_Gamma
598 |             Learned variational distributions for theta
599 |         beta: HPF_Gamma
600 |             Learned variational distributions for beta. Unchanged if
601 |             freeze_genes.
602 |         loss : list
603 |             loss at each checkstep
604 |         """
605 |         assert loss_smoothing > 0
606 |
607 |         # local (convenience) vars for model
608 |         nfactors, (ncells, ngenes) = self.nfactors, X.shape
609 |         a, ap, c, cp = self.a, self.ap, self.c, self.cp
610 |
611 |         # get empirically set hyperparameters and variational distributions
612 |         bp, dp, xi, eta, theta, beta = self._setup(X, freeze_genes, reinit)
613 |
614 |         # Make first updates for hierarchical shape prior
615 |         # (vi_shape is constant, but want to update full distribution)
616 |         xi.vi_shape[:] = ap + nfactors * a
617 |         if not freeze_genes:
618 |             eta.vi_shape[:] = cp + nfactors * c
619 |
620 |         # setup loss function as mean negative llh of nonzero training data
621 |         # if the loss function is not given
622 |         if loss_function is None:
623 |             loss_function = ls.loss_function_for_data(
624 |                     ls.mean_negative_pois_llh, X)
625 |
626 |         # setup batch_ix iterator
627 |         if batchsize is not None and batchsize > 1 and batchsize <= ncells:
628 |             batched = True
629 |             batch_ix_generator = minibatch_ix_generator(ncells, batchsize)
630 |         else:
631 |             batched = False
632 |             batch_ix_generator = None
633 |
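        # For reference, minibatch_ix_generator (schpf.util) cycles through a
        # shuffled permutation of cell indices, wrapping around at the end.
        # An illustrative trace (assuming the shuffle gave [3, 0, 4, 1, 2]):
        #
        #     >>> gen = minibatch_ix_generator(ncells=5, batchsize=2)
        #     >>> next(gen)       # array([3, 0])
        #     >>> next(gen)       # array([4, 1])
        #     >>> next(gen)       # array([2, 3])  <- wraps around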
634 |         ## init
635 |         loss, unsmoothed_loss, pct_change = [], [], []
636 |         # check variable overrides
637 |         min_iter = self.min_iter if min_iter is None else min_iter
638 |         max_iter = self.max_iter if max_iter is None else max_iter
639 |         epsilon = self.epsilon if epsilon is None else epsilon
640 |         check_freq = self.check_freq if check_freq is None else check_freq
641 |         verbose = self.verbose if verbose is None else verbose
642 |         for t in range(max_iter):
643 |             # setup batching
644 |             if batch_ix_generator is None:
645 |                 batch_ix = np.arange(X.shape[0])
646 |                 batchsize = ncells
647 |                 X_batch = X
648 |             else:
649 |                 batch_ix = next(batch_ix_generator)
650 |                 X_batch = X.tocsr()[batch_ix,:].tocoo()
651 |
652 |             if t==0 and reinit: #randomize phi for first iteration
653 |                 random_phi = np.random.dirichlet( np.ones(nfactors),
654 |                         X_batch.data.shape[0])
655 |                 Xphi_data = X_batch.data[:,None] * random_phi
656 |             else:
657 |                 if single_process:
658 |                     Xphi_data = compute_Xphi_data_numpy(X_batch, theta, beta,
659 |                             theta_ix=batch_ix)
660 |                 else:
661 |                     Xphi_data = compute_Xphi_data(
662 |                             X_batch.data, X_batch.row, X_batch.col,
663 |                             theta.vi_shape[batch_ix], theta.vi_rate[batch_ix],
664 |                             beta.vi_shape, beta.vi_rate)
665 |
666 |             if beta_theta_simultaneous:
667 |                 # calculate gene updates but don't assign yet
668 |                 if not freeze_genes:
669 |                     bvs = compute_loading_shape_update(Xphi_data,
670 |                             X_batch.col, ngenes, c)
671 |                     bvr = compute_loading_rate_update(eta.vi_shape,
672 |                             eta.vi_rate, theta.vi_shape[batch_ix],
673 |                             theta.vi_rate[batch_ix])
674 |                 # cell updates
675 |                 theta.vi_shape[batch_ix] = compute_loading_shape_update(
676 |                         Xphi_data, X_batch.row, batchsize, a)
677 |                 theta.vi_rate[batch_ix] = compute_loading_rate_update(
678 |                         xi.vi_shape[batch_ix], xi.vi_rate[batch_ix],
679 |                         beta.vi_shape, beta.vi_rate)
680 |                 xi.vi_rate[batch_ix] = bp + theta.e_x[batch_ix].sum(1)
681 |                 # make gene updates
682 |                 if not freeze_genes:
683 |                     beta.vi_shape = bvs
684 |                     beta.vi_rate = bvr
685 |                     eta.vi_rate = dp + beta.e_x.sum(1)
686 |
687 |             else:
688 |                 if batched:
689 |                     # cell updates, must do first for batching
690 |                     theta.vi_shape[batch_ix] = compute_loading_shape_update(
691 |                             Xphi_data, X_batch.row, batchsize, a)
692 |                     theta.vi_rate[batch_ix] = compute_loading_rate_update(
693 |                             xi.vi_shape[batch_ix], xi.vi_rate[batch_ix],
694 |                             beta.vi_shape, beta.vi_rate)
695 |                     xi.vi_rate[batch_ix] = bp + theta.e_x[batch_ix].sum(1)
696 |
697 |                 if not freeze_genes:
698 |                     #gene updates
699 |                     beta.vi_shape = compute_loading_shape_update(Xphi_data,
700 |                             X_batch.col, ngenes, c)
701 |                     beta.vi_rate = compute_loading_rate_update(eta.vi_shape,
702 |                             eta.vi_rate, theta.vi_shape[batch_ix],
703 |                             theta.vi_rate[batch_ix])
704 |                     eta.vi_rate = dp + beta.e_x.sum(1)
705 |
706 |                 if not batched:
707 |                     # cell updates, doing after gene updates when not batched
708 |                     # for legacy consistency
709 |                     theta.vi_shape[batch_ix] = compute_loading_shape_update(
710 |                             Xphi_data, X_batch.row, batchsize, a)
711 |                     theta.vi_rate[batch_ix] = compute_loading_rate_update(
712 |                             xi.vi_shape[batch_ix], xi.vi_rate[batch_ix],
713 |                             beta.vi_shape, beta.vi_rate)
714 |                     xi.vi_rate[batch_ix] = bp + theta.e_x[batch_ix].sum(1)
715 |
716 |
717 |             # record llh/percent change and check for convergence
718 |             if t % check_freq == 0:
719 |
720 |                 # check llh
721 |                 # vX = validation_data if validation_data is not None else X
722 |                 try:
723 |                     curr = loss_function(
724 |                             a=a, ap=ap, bp=bp, c=c, cp=cp, dp=dp,
725 |                             xi=xi, eta=eta, theta=theta, beta=beta)
726 |                     unsmoothed_loss.append(curr)
727 |                     if len(unsmoothed_loss) > loss_smoothing:
728 |                         unsmoothed_loss = unsmoothed_loss[1:]
729 |                     # normally this is just curr as loss_smoothing=1 by default
730 |                     loss.append(np.mean(unsmoothed_loss))
731 |                 except NameError as e:
732 |                     print('Invalid loss function')
733 |                     raise e
734 |
735 |                 # calculate percent change
736 |                 try:
737 |                     curr, prev = loss[-1], loss[-2]
738 |                     pct_change.append(100 * (curr - prev) / np.abs(prev))
739 |                 except IndexError:
740 |                     pct_change.append(100)
741 |                 if verbose:
742 |                     msg = '[Iter. {0: >4}] loss:{1:.6f} pct:{2:.9f}'.format(
743 |                             t, curr, pct_change[-1])
744 |                     print(msg)
745 |                 if checkstep_function is not None:
746 |                     checkstep_function(bp=bp, dp=dp, xi=xi, eta=eta, theta=theta,
747 |                             beta=beta, t=t)
748 |
749 |                 # check convergence
750 |                 if len(loss) > 3 and t >= min_iter:
751 |                     # convergence conditions (all must be met)
752 |                     current_small = np.abs(pct_change[-1]) < epsilon
753 |                     prev_small = np.abs(pct_change[-2]) < epsilon
754 |                     not_inflection = not (
755 |                             (np.abs(loss[-3]) < np.abs(prev)) \
756 |                             and (np.abs(prev) > np.abs(curr)))
757 |                     converged = current_small and prev_small and not_inflection
758 |                     if converged:
759 |                         if verbose:
760 |                             print('converged')
761 |                         break
762 |
763 |                     # getting worse, and has been for better_than_n_ago checks
764 |                     # (don't waste time on a bad run)
765 |                     if len(loss) > self.better_than_n_ago \
66 |                             and self.better_than_n_ago:
767 |                         nprev = loss[-self.better_than_n_ago] \
768 |                                 if len(loss)>self.better_than_n_ago else loss[0]
769 |                         worse_than_n_ago = np.abs(nprev) < np.abs(curr)
770 |                         getting_worse = np.abs(prev) < np.abs(curr)
771 |                         if worse_than_n_ago and getting_worse:
772 |                             if verbose:
773 |                                 print('getting worse break')
774 |                             break
775 |
776 |             # TODO message or warning or something
777 |             if t >= max_iter:
778 |                 break
779 |
780 |         return (bp, dp, xi, eta, theta, beta, loss)
781 |
782 |
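    # For orientation before _setup: the capacity rate hyperparameters are
    # set empirically in _get_empirical_hypers (below) from mean/variance
    # ratios of the data margins, i.e. bp = ap * mean(cell_sums)/var(cell_sums)
    # and dp = cp * mean(gene_sums)/var(gene_sums). A minimal numpy sketch of
    # the same computation (illustrative only; X is a coo_matrix):
    #
    #     >>> cell_sums = np.asarray(X.sum(axis=1)).ravel()
    #     >>> bp = ap * cell_sums.mean() / cell_sums.var()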
783 |     def _setup(self, X, freeze_genes=False, reinit=True, clip=True):
784 |         """Setup variational distributions
785 |
786 |         Parameters
787 |         ----------
788 |         X: coo_matrix
789 |             Data to fit
790 |         freeze_genes: bool, optional (Default: False)
791 |             If True, do not update gene variational distributions eta and beta
792 |         reinit: bool, optional (Default: True)
793 |             Randomly initialize variational distributions even if they
794 |             already exist. Superseded by freeze_genes. Does not affect
795 |             self.bp and self.dp (which will only be set empirically if
796 |             they are None)
797 |         clip : bool, optional (Default: True)
798 |             If empirically calculating dp and bp > 1000 * dp, clip dp to
799 |             bp / 1000.
800 |
801 |         Returns
802 |         -------
803 |         bp : float
804 |         dp : float
805 |         xi : HPF_Gamma
806 |         eta : HPF_Gamma
807 |         theta : HPF_Gamma
808 |         beta : HPF_Gamma
809 |
810 |         """
811 |         # locals for convenience
812 |         nfactors, (ncells, ngenes) = self.nfactors, X.shape
813 |         a, ap, c, cp = self.a, self.ap, self.c, self.cp
814 |         bp, dp = self.bp, self.dp
815 |
816 |         xi, eta, theta, beta = (self.xi, self.eta, self.theta, self.beta)
817 |
818 |         # empirically set bp and dp
819 |         bp, dp = self._get_empirical_hypers(X, freeze_genes, clip)
820 |
821 |         if reinit or (xi is None):
822 |             xi = HPF_Gamma.random_gamma_factory((ncells,), ap, bp,
823 |                     dtype=self.dtype)
824 |         if reinit or (theta is None):
825 |             theta = HPF_Gamma.random_gamma_factory((ncells,nfactors), a, bp,
826 |                     dtype=self.dtype)
827 |
828 |         # Check if variational distributions for genes exist, create if not
829 |         # Error if freeze_genes and eta and beta don't exist
830 |         if freeze_genes:
831 |             if eta is None or beta is None:
832 |                 msg = 'To fit with frozen gene variational distributions ' \
833 |                     + '(`freeze_genes`==True), eta and beta must be set to ' \
834 |                     + 'valid HPF_Gamma instances.'
835 |                 raise ValueError(msg)
836 |         else:
837 |             if reinit or (eta is None):
838 |                 eta = HPF_Gamma.random_gamma_factory((ngenes,), cp, dp,
839 |                         dtype=self.dtype)
840 |             if reinit or (beta is None):
841 |                 beta = HPF_Gamma.random_gamma_factory((ngenes,nfactors),
842 |                         c, dp, dtype=self.dtype)
843 |
844 |         return (bp, dp, xi, eta, theta, beta)
845 |
846 |
847 |     def _get_empirical_hypers(self, X, freeze_genes=False, clip=True):
848 |         """Get empirical values for bp, dp
849 |
850 |         Parameters
851 |         ----------
852 |         X : coo_matrix
853 |             Data to fit
854 |
855 |         Returns
856 |         -------
857 |         bp : float
858 |         dp : float
859 |         """
860 |         bp, dp = self.bp, self.dp
861 |         # empirically set bp and dp
862 |         def mean_var_ratio(X, axis):
863 |             axis_sum = X.sum(axis=axis)
864 |             return np.mean(axis_sum) / np.var(axis_sum)
865 |         if bp is None:
866 |             bp = self.ap * mean_var_ratio(X, axis=1)
867 |         if dp is None: # raise an informative error if dp cannot be set
868 |             if freeze_genes:
869 |                 msg = 'dp is None and cannot be set'
870 |                 msg += ' when freeze_genes is True.'
871 |                 raise ValueError(msg)
872 |             else:
873 |                 dp = self.cp * mean_var_ratio(X, axis=0)
874 |                 if clip and bp > 1000 * dp:
875 |                     old_val = dp
876 |                     dp = bp / 1000
877 |                     print('Clipping dp: was {} now {}'.format(old_val, dp))
878 |
879 |         return bp, dp
880 |
881 |
882 |     def _initialize(self, X, freeze_genes=False):
883 |         """Shortcut to setup random distributions & set variables
884 |         """
885 |         bp, dp, xi, eta, theta, beta = self._setup(X, freeze_genes,
886 |                 reinit=True)
887 |         self.bp = bp
888 |         self.dp = dp
889 |         self.xi = xi
890 |         self.eta = eta
891 |         self.theta = theta
892 |         self.beta = beta
893 |
894 |
895 | def load_model(file_name):
896 |     """Load a model from a joblib file
897 |
898 |     Parameters
899 |     ----------
900 |     file_name : str
901 |         Joblib file containing a saved scHPF model
902 |
903 |
904 |     Returns
905 |     -------
906 |     model : scHPF
907 |         The scHPF model in the file
908 |     """
909 |     return joblib.load(file_name)
910 |
911 |
912 | def save_model(model, file_name):
913 |     """Save model to (joblib) file
914 |
915 |     Serialize scHPF model as a joblib file. Joblib is similar to pickle,
916 |     but preferable for objects with many numpy arrays
917 |
918 |     Parameters
919 |     ----------
920 |     model : scHPF
921 |         The scHPF model object to save
922 |     file_name : str
923 |         Name of file to save model to
924 |     """
925 |     joblib.dump(model, file_name)
926 |
927 |
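# A brief usage sketch for model persistence (the file name is illustrative):
#
#     >>> model = scHPF(nfactors=7)
#     >>> model.fit(X)                           # X: preprocessed coo_matrix
#     >>> save_model(model, 'schpf_k7.joblib')
#     >>> model = load_model('schpf_k7.joblib')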
928 | def combine_across_cells(x, y, y_ixs):
929 |     """Combine theta & xi from two scHPF instances with the same beta & eta
930 |
931 |     Intended for combining variational distributions for local
932 |     variables (theta,xi) from training data with variational distributions
933 |     for local variables from validation or other data that was projected
934 |     onto the same global variational distributions (beta,eta)
935 |
936 |     If `x.bp` != `y.bp`, returned model `xy.bp` is set to None. All other
937 |     attributes (except for the merged xi and theta) are inherited from `x`.
938 |
939 |     Parameters
940 |     ----------
941 |     x : `scHPF`
942 |     y : `scHPF`
943 |         The scHPF instance whose rows in the output should be at the
944 |         corresponding indices `y_ixs`
945 |     y_ixs : ndarray
946 |         Row indices of `y` in the returned distributions. Must be 1-d and
947 |         have same number of rows as `y`, have no repeats, and have no index
948 |         greater than or equal to x.ncells + y.ncells.
949 |
950 |
951 |     Returns
952 |     -------
953 |     xy : `scHPF`
954 |         The combined model
955 |     """
956 |     assert x.dp == y.dp
957 |     assert x.eta == y.eta
958 |     assert x.beta == y.beta
959 |
960 |     xy = deepcopy(x)
961 |     if y.bp != x.bp:
962 |         xy.bp = None
963 |     xy.xi = x.xi.combine(y.xi, y_ixs)
964 |     xy.theta = x.theta.combine(y.theta, y_ixs)
965 |     return xy
966 |
967 |
968 | def run_trials(X, nfactors,
969 |         ntrials=5,
970 |         min_iter=30,
971 |         max_iter=1000,
972 |         check_freq=10,
973 |         epsilon=0.001,
974 |         better_than_n_ago=5,
975 |         dtype=np.float64,
976 |         verbose=True,
977 |         vcells = None,
978 |         vX = None,
979 |         loss_function=None,
980 |         model_kwargs = {},
981 |         return_all = False,
982 |         reproject = False,
983 |         reproject_kwargs = {},
984 |         batchsize=0,
985 |         beta_theta_simultaneous=False,
986 |         loss_smoothing=1
987 |         ):
988 |     """
989 |     Train with multiple random initializations, selecting model with best loss
990 |
991 |     As scHPF uses non-convex optimization, it benefits from training with
992 |     multiple random initializations to avoid local minima.
993 |
994 |     Parameters
995 |     ----------
996 |     X: coo_matrix
997 |         Data to fit
998 |     nfactors: int
999 |         Number of factors (K)
1000 |     ntrials : int, optional (Default 5)
1001 |         Number of random initializations for training
1002 |     min_iter: int, optional (Default 30)
1003 |         Minimum number of iterations for training.
1004 |     max_iter: int, optional (Default 1000)
1005 |         Maximum number of iterations for training.
1006 |     check_freq: int, optional (Default 10)
1007 |         Number of training iterations between calculating loss.
1008 |     epsilon: float, optional (Default 0.001)
1009 |         Percent change of loss for convergence.
1010 |     better_than_n_ago: int, optional (Default 5)
1011 |         Stop condition if loss is getting worse. Stops training if loss
1012 |         is worse than `better_than_n_ago`*`check_freq` training steps
1013 |         ago and getting worse.
1014 |     dtype : datatype, optional (Default np.float64)
1015 |         np.float64 or np.float32
1016 |     verbose: bool, optional (Default True)
1017 |         Print status messages during training
1018 |     vcells : coo_matrix, optional (Default None)
1019 |         cells to use in a validation loss function
1020 |     vX : coo_matrix, optional (Default None)
1021 |         held-out nonzero entries for the cells in `X` (same shape as `X`)
1022 |     loss_function : function, optional (Default None)
1023 |         A loss function to assess convergence that accepts data, model
1024 |         variational parameters, and model hyperparameters. Note this is
1025 |         distinct from the `loss_function` argument in scHPF._fit (called by
1026 |         scHPF.fit and scHPF.project), which assumes a fixed reference to data
1027 |         is included in the function and *does not* accept data as an argument.
1028 |     model_kwargs: dict, optional (Default {})
1029 |         dictionary of additional keyword arguments for model
1030 |         initialization
1031 |     return_all: bool, optional (Default False)
1032 |         return all models
1033 |     reproject: bool, optional (Default False)
1034 |         Reproject the data onto the frozen gene variables before calculating
1035 |         loss. The reprojected loss will be added to the end of loss as a
1036 |         sublist. Note that this reprojection will *not* use the `loss_function`
1037 |         argument, and instead use the default provided log likelihood
1038 |     reproject_kwargs: dict, optional (Default {})
1039 |         Only used if `reproject` is True. Keyword args for scHPF.project.
1040 |         'replace':True cannot be changed, and will be overwritten if given
1041 |     batchsize: int, optional (Default 0)
1042 |         Number of cells to use per training round. All cells used if 0.
1043 |     loss_smoothing: int, optional (Default: 1)
1044 |         Smooth loss up to `loss_smoothing` check frequencies ago. 1 results in
1045 |         no smoothing. Intended to be used with batching when assessing
1046 |         convergence based on training loss, where a good value might be
1047 |         int(ncells/n_batches)
1048 |
1049 |     Returns
1050 |     -------
1051 |     best_model: scHPF
1052 |         The model with the best loss after `ntrials` random initializations
1053 |         and training runs
1054 |     rejected_models: list, optional
1055 |         Rejected models, ordered by increasing loss. Only returned if
1056 |         return_all is True
1057 |     """
1058 |     ncells, ngenes = X.shape
1059 |     if ngenes >= 20000:
1060 |         msg = 'WARNING: you are running scHPF with {} genes,'.format(ngenes)
1061 |         msg += ' which is more than the ~20k protein coding genes in the'
1062 |         msg += ' human genome. We suggest running scHPF on protein-coding'
1063 |         msg += ' genes only.'
1064 |         print(msg)
1065 |
1066 |     # get the loss function for any data
1067 |     if loss_function is None:
1068 |         loss_function = partial(ls.mean_negative_pois_llh,
1069 |                 single_process=False)
1070 |
1071 |     # check data we're using for loss
1072 |     if vcells is not None:
1073 |         assert X.shape[1] == vcells.shape[1]
1074 |     if vX is not None:
1075 |         assert vX.shape == X.shape
1076 |     else:
1077 |         vX = X
1078 |     # setup loss fnc w/data (will be overridden if vcells is not None)
1079 |     data_loss_function = ls.loss_function_for_data(loss_function, vX)
1080 |
1081 |
1082 |     # run trials
1083 |     best_loss, best_model, best_t = np.finfo(np.float64).max, None, None
1084 |     models, losses = [], [] # only used if return_all
1085 |     for t in range(ntrials):
1086 |         # make a new model
1087 |         model = scHPF(nfactors=nfactors,
1088 |                 min_iter=min_iter, max_iter=max_iter,
1089 |                 check_freq=check_freq, epsilon=epsilon,
1090 |                 better_than_n_ago=better_than_n_ago,
1091 |                 verbose=verbose, dtype=dtype,
1092 |                 **model_kwargs
1093 |                 )
1094 |
1095 |         # override the loss function data if we have vcells
1096 |         # (must be redone for each new model)
1097 |         if vcells is not None:
1098 |             proj_kwargs = dict(reinit=False,
1099 |                     min_iter=1,
1100 |                     max_iter=min(10, check_freq),
1101 |                     check_freq=check_freq+1,
1102 |                     verbose=False
1103 |                     )
1104 |             data_loss_function = ls.projection_loss_function(
1105 |                     loss_function, vcells, nfactors,
1106 |                     proj_kwargs=proj_kwargs)
1107 |             def checkstep_function(**kwargs):
1108 |                 loss = ls.loss_function_for_data(loss_function, X)
1109 |                 print('\ttrain:', '{0:.6f}'.format(loss(**kwargs)))
1110 |         else:
1111 |             checkstep_function = None
1112 |
1113 |         # fit the model
1114 |         model.fit(X, loss_function=data_loss_function,
1115 |                 checkstep_function=checkstep_function,
1116 |                 batchsize=batchsize, loss_smoothing=loss_smoothing,
1117 |                 beta_theta_simultaneous=beta_theta_simultaneous)
1118 |         if reproject:
1119 |             print('Reprojecting data...')
1120 |             reproject_kwargs['replace'] = True
1121 |             reproject_kwargs['reinit'] = False
1122 |             proj_loss = model.project(X, **reproject_kwargs)
1123 |             model.loss.append(proj_loss)
1124 |             loss = proj_loss[-1]
1125 |         else:
1126 |             loss = model.loss[-1]
1127 |
1128 |         if loss < best_loss:
1129 |             best_model = model
1130 |             best_loss = loss
1131 |             best_t = t
1132 |             if verbose:
1133 |                 print('New best!')
1134 |         if return_all:
1135 |             models.append(model)
1136 |             losses.append(loss)
1137 |         if verbose:
1138 |             print('Trial {0} loss: {1:.6f}'.format(t, loss))
1139 |             print('Best loss: {0:.6f} (trial {1})'.format(best_loss, best_t))
1140 |
1141 |     if return_all:
1142 |         return_order = np.argsort(losses)
1143 |         ordered_models = [models[i] for i in return_order]
1144 |         assert ordered_models[0] == best_model
1145 |         return best_model, ordered_models[1:]
1146 |     else:
1147 |         return best_model
1148 |
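# A minimal end-to-end sketch for run_trials (file name and parameter values
# are illustrative; assumes a matrix prepared with schpf.preprocessing):
#
#     >>> from schpf import preprocessing as prep
#     >>> X, genes = prep.load_txt('expression.matrix.txt')
#     >>> best = run_trials(X, nfactors=7, ntrials=5)
#     >>> save_model(best, 'schpf_k7.joblib')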
1149 |
1150 | # TODO deal with verbosity
1151 | def run_trials_pool(X, nfactors,
1152 |         ntrials=5,
1153 |         njobs=0,
1154 |         max_threads=None,
1155 |         min_iter=30,
1156 |         max_iter=1000,
1157 |         check_freq=10,
1158 |         epsilon=0.001,
1159 |         better_than_n_ago=5,
1160 |         dtype=np.float64,
1161 |         verbose=True,
1162 |         vcells = None,
1163 |         vX = None,
1164 |         loss_function=None,
1165 |         model_kwargs = {},
1166 |         return_all = False,
1167 |         reproject = False,
1168 |         reproject_kwargs = {},
1169 |         batchsize=0,
1170 |         beta_theta_simultaneous=False,
1171 |         loss_smoothing=1
1172 |         ):
1173 |     """
1174 |     Train with multiple random initializations, selecting model with best loss.
1175 |     Parallelization is done at the trial level rather than within computations
1176 |
1177 |     As scHPF uses non-convex optimization, it benefits from training with
1178 |     multiple random initializations to avoid local minima.
1179 |
1180 |     Parameters
1181 |     ----------
1182 |     X: coo_matrix
1183 |         Data to fit
1184 |     nfactors: int or list of ints
1185 |         Number of factors (K), may be a list for multiple k
1186 |     ntrials : int, optional (Default 5)
1187 |         Number of random initializations for training
1188 |     njobs : int, optional (Default 0)
1189 |         Maximum number of threads in the threadpool. 0 will use all available.
1190 |     min_iter: int, optional (Default 30)
1191 |         Minimum number of iterations for training.
1192 |     max_iter: int, optional (Default 1000)
1193 |         Maximum number of iterations for training.
1194 |     check_freq: int, optional (Default 10)
1195 |         Number of training iterations between calculating loss.
1196 |     epsilon: float, optional (Default 0.001)
1197 |         Percent change of loss for convergence.
1198 |     better_than_n_ago: int, optional (Default 5)
1199 |         Stop condition if loss is getting worse. Stops training if loss
1200 |         is worse than `better_than_n_ago`*`check_freq` training steps
1201 |         ago and getting worse.
1202 |     dtype : datatype, optional (Default np.float64)
1203 |         np.float64 or np.float32
1204 |     verbose: bool, optional (Default True)
1205 |         Print status messages during training
1206 |     vcells : coo_matrix, optional (Default None)
1207 |         cells to use in a validation loss function
1208 |     vX : coo_matrix, optional (Default None)
1209 |         held-out nonzero entries for the cells in `X` (same shape as `X`)
1210 |     loss_function : function, optional (Default None)
1211 |         A loss function that accepts data, model variational parameters,
1212 |         and model hyperparameters. Note this is distinct from the
1213 |         `loss_function` argument in scHPF._fit (called by scHPF.fit and
1214 |         scHPF.project), which assumes a fixed reference to data is included
1215 |         in the function and *does not* accept data as an argument.
1216 |     model_kwargs: dict, optional (Default {})
1217 |         dictionary of additional keyword arguments for model
1218 |         initialization
1219 |     return_all: bool, optional (Default False)
1220 |         return all models
1221 |     reproject: bool, optional (Default False)
1222 |         Reproject the data onto the frozen gene variables before calculating
1223 |         loss. The reprojected loss will be added to the end of loss as a
1224 |         sublist. Note that this reprojection will *not* use the `loss_function`
1225 |         argument, and instead use the default provided log likelihood
1226 |     reproject_kwargs: dict, optional (Default {})
1227 |         Only used if `reproject` is True. Keyword args for scHPF.project.
1228 |         'replace':True cannot be changed, and will be overwritten if given
1229 |     batchsize: int, optional (Default 0)
1230 |         Number of cells to use per training round. All cells used if 0.
1231 |     loss_smoothing: int, optional (Default: 1)
1232 |         Smooth loss up to `loss_smoothing` check frequencies ago. 1 results in
1233 |         no smoothing. Intended to be used with batching when assessing
1234 |         convergence based on training loss, where a good value might be
1235 |         int(ncells/n_batches)
1236 |
1237 |
1238 |     Returns
1239 |     -------
1240 |     best_models: list(scHPF)
1241 |         The model with the best loss after `ntrials` random initializations
1242 |         and training runs for each value in nfactors
1243 |     rejected_models: list(list(scHPF)), optional
1244 |         Rejected models, ordered by corresponding nfactors and then by
1245 |         increasing loss. Only returned if return_all is True
1246 |     """
1247 |     ngenes = X.shape[1]
1248 |     if ngenes >= 20000:
1249 |         msg = 'WARNING: you are running scHPF with {} genes,'.format(ngenes)
1250 |         msg += ' which is more than the ~20k protein coding genes in the'
1251 |         msg += ' human genome. We suggest running scHPF on protein-coding'
1252 |         msg += ' genes only.'
1253 |         print(msg)
1254 |
1255 |     # get the loss function for any data
1256 |     if loss_function is None:
1257 |         loss_function = partial(ls.mean_negative_pois_llh,
1258 |                 single_process=True)
1259 |
1260 |     # check data we're using for loss
1261 |     if vcells is not None:
1262 |         assert X.shape[1] == vcells.shape[1]
1263 |     if vX is not None:
1264 |         assert vX.shape == X.shape
1265 |     else:
1266 |         vX = X
1267 |     # setup loss fnc w/data (will be overridden if vcells is not None)
1268 |     data_loss_function = ls.loss_function_for_data(loss_function, vX)
1269 |
1270 |     # only need to create once because will be copied to processes
1271 |     # override the loss function data if we have vcells
1272 |     # (must be redone for each new model)
1273 |     if vcells is not None:
1274 |         proj_kwargs = dict(reinit=False,
1275 |                 min_iter=1,
1276 |                 max_iter=min(10, check_freq),
1277 |                 check_freq=check_freq+1,
1278 |                 verbose=False
1279 |                 )
1280 |         data_loss_function = ls.projection_loss_function(
1281 |                 loss_function, vcells, nfactors,
1282 |                 proj_kwargs=proj_kwargs)
1283 |
1284 |
1285 |     # function to fit model
1286 |     def fit_model(nfactors):
1287 |         model = scHPF(nfactors=nfactors,
1288 |                 min_iter=min_iter, max_iter=max_iter,
1289 |                 check_freq=check_freq, epsilon=epsilon,
1290 |                 better_than_n_ago=better_than_n_ago,
1291 |                 verbose=False, dtype=dtype,
1292 |                 **model_kwargs
1293 |                 )
1294 |         # fit the model
1295 |         model.fit(X, loss_function=data_loss_function,
1296 |                 checkstep_function=None, single_process=True,
1297 |                 batchsize=batchsize, loss_smoothing=loss_smoothing)
1298 |         if reproject:
1299 |             # print('Reprojecting data...')
1300 |             reproject_kwargs['replace'] = True
1301 |             proj_loss = model.project(X, loss_function=data_loss_function,
1302 |                     **reproject_kwargs)
1303 |             model.loss.append(proj_loss)
1304 |         return model
1305 |
1306 |     # get nfactors for every trial
1307 |     if isinstance(nfactors, int):
1308 |         nfactors = [nfactors]
1309 |     trial_nfactors = [t for trial_set in [[K]*ntrials for K in nfactors] \
1310 |             for t in trial_set]
1311 |
1312 |     # set max processes if not given
1313 |     if njobs == 0: njobs = min(cpu_count(), len(trial_nfactors))
1314 |
1315 |     # training
1316 |     with Parallel(n_jobs=njobs, verbose=10) as pool: # make the pool
1317 |         candidates = pool( delayed(fit_model)(K) for K in trial_nfactors)
1318 |
1319 |     # get the best model for every K
1320 |     ordered_best, ordered_reject = [], []
1321 |     for i,K in enumerate(nfactors):
1322 |         my_candidates = candidates[i*ntrials : (i+1)*ntrials]
1323 |         loss = [m.loss[-1][-1] if reproject else m.loss[-1] for m in
1324 |                 my_candidates]
1325 |         # print(list(zip([m.nfactors for m in my_candidates],loss)))
1326 |         best_ix = np.argmin(loss)
1327 |         ordered_best.append(my_candidates[best_ix])
1328 |         ordered_reject.append([my_candidates[i] for i in np.argsort(loss)[1:]])
1329 |     if return_all:
1330 |         return ordered_best, ordered_reject
1331 |     else:
1332 |         return ordered_best
1333 |
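# A short usage sketch for run_trials_pool with several values of K (values
# are illustrative; one best model is returned per requested K):
#
#     >>> best_per_k = run_trials_pool(X, nfactors=[5, 7, 9], ntrials=5)
#     >>> [m.nfactors for m in best_per_k]
#     [5, 7, 9]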
-------------------------------------------------------------------------------- /schpf/util.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from collections import namedtuple
4 |
5 | import numpy as np
6 | from scipy.stats import hypergeom
7 | from scipy.sparse import csr_matrix
8 | import pandas as pd
9 |
10 |
11 | def mean_cellscore_fraction(cell_scores, ntop_factors=1):
12 |     """ Get the mean fraction of cells' total scores that falls on a
13 |     small number of factors
14 |
15 |     Parameters
16 |     ----------
17 |     cell_scores : ndarray
18 |         (ncells, nfactors) array of cell scores
19 |     ntop_factors : int, optional (Default: 1)
20 |         number of factors that can count towards dominance
21 |
22 |     Returns
23 |     -------
24 |     mean_cellscore_fraction : float
25 |         The mean fraction of cells' scores that are contained within
26 |         their top `ntop_factors` highest scoring factors
27 |
28 |     """
29 |     totals = np.sum(cell_scores, axis=1)
30 |     ntop_scores = np.sort(cell_scores,axis=1)[:, -ntop_factors:]
31 |     domsum = np.sum(ntop_scores, axis=1)
32 |     domfrac = domsum/totals
33 |     return np.mean(domfrac)
34 |
35 |
36 | def mean_cellscore_fraction_list(cell_scores):
37 |     """ Make a list of the mean dominant fraction at all possible numbers
38 |     of ntop_factors
39 |     """
40 |     return [mean_cellscore_fraction(cell_scores, i+1)
41 |             for i in range(cell_scores.shape[1])]
42 |
43 |
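# A small worked example (hypothetical scores for 2 cells x 3 factors):
#
#     >>> scores = np.array([[8., 1., 1.],
#     ...                    [2., 2., 6.]])
#     >>> mean_cellscore_fraction(scores, ntop_factors=1)
#     0.7   # mean of 8/10 and 6/10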
65 | """ 66 | tops = np.argsort(gene_scores, axis=0)[-ntop:] 67 | max_pairwise, last_max = 0, 0 68 | for i in range(tops.shape[1]): 69 | for j in range(tops.shape[1]): 70 | if i >= j: 71 | continue 72 | overlap = len(np.intersect1d(tops[:,i], tops[:,j])) 73 | if overlap > max_pairwise: 74 | last_max = max_pairwise 75 | max_pairwise = overlap 76 | elif overlap > last_max: 77 | last_max = overlap 78 | 79 | overlap = last_max if second_greatest else max_pairwise 80 | p = hypergeom.pmf(k=overlap, M=gene_scores.shape[0], 81 | N=ntop, n=ntop) \ 82 | + hypergeom.sf(k=overlap, M=gene_scores.shape[0], 83 | N=ntop, n=ntop) 84 | Overlap = namedtuple('Overlap', ['overlap', 'p']) 85 | return Overlap(overlap, p) 86 | 87 | 88 | def max_pairwise_table(gene_scores, ntop_list=[50,100,150,200,250,300]): 89 | """ Get the maximum pairwise overlap at 90 | 91 | Parameters 92 | ---------- 93 | gene_scores : ndarray 94 | (ngenes, nfactors) array of gene scores 95 | ntop_list : list, optional 96 | List of values of ntop to evaluate 97 | 98 | Returns 99 | ------- 100 | df : DataFrame 101 | """ 102 | max_overlap, p_max, max2_overlap, p_max2 = [],[],[],[] 103 | for ntop in ntop_list: 104 | o = max_pairwise(gene_scores, ntop, False) 105 | max_overlap.append( o.overlap ) 106 | p_max.append( o.p ) 107 | 108 | o2 = max_pairwise(gene_scores, ntop, True) 109 | max2_overlap.append( o2.overlap ) 110 | p_max2.append( o2.p ) 111 | df = pd.DataFrame({'ntop' : ntop_list, 'max_overlap' : max_overlap, 112 | 'p_max' : p_max, 'max2_overlap' : max2_overlap, 'p_max2' : p_max2}) 113 | return df 114 | 115 | 116 | def split_coo_rows(X, split_indices): 117 | """Split a coo matrix into two 118 | 119 | Parameters 120 | ---------- 121 | X : coo_matrix 122 | Matrix to split into two by row 123 | split_indices : ndarray 124 | Indices to use for the split. 125 | 126 | Returns 127 | ------- 128 | a : coo_matrix 129 | rows from X specified in split_indices 130 | b : coo_matrix 131 | rows from X *not* specified in split_indices 132 | 133 | """ 134 | a_indices = split_indices 135 | b_indices = np.setdiff1d(np.arange(X.shape[0]), split_indices) 136 | 137 | X_csr = X.tocsr() 138 | a = X_csr[a_indices, :].tocoo() 139 | b = X_csr[b_indices, :].tocoo() 140 | return a, b 141 | 142 | 143 | def collapse_coo_rows(coo): 144 | """Collapse the empty rows of a coo_matrix 145 | 146 | Parameters 147 | ---------- 148 | coo : coo_matrix 149 | Input coo_matrix which may have empty rows 150 | 151 | 152 | Returns 153 | ------- 154 | collapsed_coo : coo_matrix 155 | coo with row indices adjusted to removed empty rows 156 | collapsed_indices : ndarray 157 | Indices of the returned rows in the original input matrix 158 | """ 159 | nz_idx = np.where(coo.getnnz(1) > 0)[0] 160 | return coo.tocsr()[nz_idx].tocoo(), nz_idx 161 | 162 | 163 | def insert_coo_rows(a, b, b_indices): 164 | """Insert rows from b into a at specified row indeces 165 | 166 | Parameters 167 | ---------- 168 | a : sparse matrix 169 | b : sparse matrix 170 | b_indices : ndarray 171 | Indices in final matrix where b's rows should be. np.max(`b_indices`) 172 | must be a valid row index in the merged matrix with shape[0] = 173 | a.shape[0] + b.shape[0]. Must me ordered and unique. 
163 | def insert_coo_rows(a, b, b_indices):
164 |     """Insert rows from b into a at specified row indices
165 |
166 |     Parameters
167 |     ----------
168 |     a : sparse matrix
169 |     b : sparse matrix
170 |     b_indices : ndarray
171 |         Indices in final matrix where b's rows should be. np.max(`b_indices`)
172 |         must be a valid row index in the merged matrix with shape[0] =
173 |         a.shape[0] + b.shape[0]. Must be ordered and unique.
174 |
175 |     Returns
176 |     -------
177 |     ab : coo_matrix
178 |         coo_matrix with rows re-indexed to have rows from b
179 |     """
180 |     # check arguments
181 |     if a.shape[1] != b.shape[1]:
182 |         msg = 'a.shape[1] must equal b.shape[1], received a with shape'
183 |         msg += ' {} and b with shape {}'.format(a.shape, b.shape)
184 |         raise ValueError(msg)
185 |     if np.max(b_indices) >= a.shape[0] + b.shape[0]:
186 |         msg = 'Invalid row indices {} for array with '.format(b_indices)
187 |         msg += 'a.shape[0] + b.shape[0] = {} '.format(a.shape[0])
188 |         msg += '+ {} = {}'.format(b.shape[0], a.shape[0]+b.shape[0])
189 |         raise ValueError(msg)
190 |     if not np.all(np.diff(b_indices) > 0):
191 |         msg = '`b_indices` must be ordered without repeats. Received '
192 |         msg += '{}'.format(b_indices)
193 |         raise ValueError(msg)
194 |
195 |     out_shape = (a.shape[0] + b.shape[0], a.shape[1])
196 |     a = a.tocsr()
197 |     b = b.tocsr()
198 |
199 |     a_row, b_row = 0, 0
200 |     data, indices, indptr = [], [], [0]
201 |     for ab_row in range(out_shape[0]):
202 |         if b_row < len(b_indices) and ab_row == b_indices[b_row]:
203 |             my_row = b[b_row, :]
204 |             b_row += 1
205 |         else:
206 |             my_row = a[a_row, :]
207 |             a_row += 1
208 |         data.append(my_row.data)
209 |         indices.append(my_row.indices)
210 |         indptr.append(indptr[-1] + my_row.indptr[1])
211 |
212 |     ab = csr_matrix(
213 |             (np.hstack(data), np.hstack(indices), np.array(indptr)),
214 |             out_shape).tocoo()
215 |     return ab
216 |
217 |
218 | def minibatch_ix_generator(ncells, batchsize):
219 |     assert ncells >= batchsize # allow equality for testing
220 |     ixs = np.arange(ncells)
221 |     np.random.shuffle(ixs)
222 |     start = 0
223 |     while True:
224 |         stop = start + batchsize
225 |         if stop > ncells:
226 |             stop = stop % ncells
227 |             res = np.hstack([ixs[start:ncells], ixs[0:stop]])
228 |         else:
229 |             res = ixs[start:stop]
230 |         start = stop % ncells # need mod for case where ncells=batchsize
231 |         yield res
232 |
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [tool:pytest]
2 | filterwarnings =
3 |     ignore:.*matrix subclass is not the recommended.*:PendingDeprecationWarning
4 |     ignore:.*importing the ABCs from.*:DeprecationWarning
5 |     ignore:.*Could not select.*cells with given group_ids.*:UserWarning
6 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import find_packages, setup
4 |
5 | # get version from file
6 | __version__ = '0.0.0'
7 | exec(open('schpf/_version.py').read())
8 |
9 | requires = ['scikit-learn',
10 |             "numba >= 0.39, !=0.41, !=0.42, !=0.43; python_version<='3.7.3'",
11 |             "numba >= 0.44; python_version=='3.7.4'",
12 |             "numba >= 0.45; python_version>'3.7.4'",
13 |             'scipy >= 1.1',
14 |             'numpy',
15 |             'pandas',
16 |             'joblib'
17 |             ]
18 |
19 | tests_require = ['pytest']
20 | extras_require = {
21 |     'loompy' : ['loompy'],
22 |     'docs' : ['sphinx-argparse'],
23 | }
24 |
25 | setup(
26 |     name='scHPF',
27 |     version=__version__,
28 |     packages=find_packages(),
29 |     scripts=['bin/scHPF'],
30 |     python_requires='>=3.6',
31 |     install_requires=requires,
32 |     tests_require=tests_require,
33 |     extras_require=extras_require,
34 |     author = 'Hanna Mendes Levitin',
35 |     author_email = 'hml2134@columbia.edu',
36 |     description='Single-cell Hierarchical Poisson Factorization',
37 |     license="BSD",
38 |     url='https://www.github.com/simslab/scHPF',
39 | )
40 |
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/simslab/scHPF/aff30d674039359395cbee4ca4ddc85f3a5c8b56/tests/__init__.py
-------------------------------------------------------------------------------- /tests/_data/sample_blacklist.txt: --------------------------------------------------------------------------------
1 | ENSG00000130772.13 MED18
2 | ENSG00000142609.18 CFAP74
3 | ENSG00000125945.14 ZNF436
4 | ENSG00000158246.7 TENT5B
5 | ENSG00000189280.3 GJB5
6 | ENSG00000120948.17 TARDBP
7 | ENSG00000142733.15 MAP3K6
8 | ENSG00000157978.11 LDLRAP1
9 | ENSG00000116819.7 TFAP2E
10 | ENSG00000284733.1 OR4F29
11 | ENSG00000121766.15 ZCCHC17
12 | ENSG00000182330.10 PRAMEF8
13 | ENSG00000025800.13 KPNA6
14 | ENSG00000271741.1 AC114490.2
15 | ENSG00000157881.13 PANK4
16 | ENSG00000107404.19 DVL1
17 | ENSG00000078900.14 TP73
18 | ENSG00000116731.22 PRDM2
19 | ENSG00000070831.15 CDC42
20 | ENSG00000197921.5 HES5
21 |
-------------------------------------------------------------------------------- /tests/conftest.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | from scipy.sparse import coo_matrix
5 | import pytest
6 | from schpf import scHPF
7 |
8 | np.random.seed(42)
9 |
10 | N_CELLS, N_GENES, NZ_FRAC, N_FACTORS = (300, 1000, 0.03, 4)
11 | NNZ = int(N_CELLS * N_GENES * NZ_FRAC)
12 |
13 | # Fixtures
14 | @pytest.fixture()
15 | def data():
16 |     X_data = np.random.negative_binomial(2, 0.5, NNZ)
17 |     X_data[X_data==0] = 1
18 |     cell_ix = np.random.randint(0, N_CELLS, NNZ, dtype=np.int32)
19 |     gene_ix = np.random.randint(0, N_GENES, NNZ, dtype=np.int32)
20 |     X = coo_matrix(
21 |         (X_data, (cell_ix, gene_ix)),
22 |         (N_CELLS, N_GENES),
23 |         dtype=np.int32)
24 |     X.sum_duplicates()
25 |     return X
26 |
27 |
28 | # TODO make these actual unit tests by making distributions from scratch
29 | @pytest.fixture(params=[np.float64, np.float32])
30 | def model_uninit(request):
31 |     model = scHPF(N_FACTORS, dtype=request.param)
32 |     return model
33 |
34 |
35 | @pytest.fixture()
36 | def model(model_uninit, data):
37 |     model_uninit._initialize(data)
38 |     return model_uninit
-------------------------------------------------------------------------------- /tests/test_inference.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | from scipy.sparse import coo_matrix
5 | from scipy.special import logsumexp, digamma, gammaln
6 |
7 | import pytest
8 | from numpy.testing import assert_allclose
9 |
10 | from schpf import hpf_numba, scHPF
11 |
12 | # globals & seed
13 | np.random.seed(42)
14 |
15 | @pytest.fixture()
16 | def Xphi(data, model):
17 |     random_phi = np.random.dirichlet( np.ones(model.nfactors),
18 |             data.data.shape[0]).astype(model.dtype)
19 |     return data.data[:,None] * random_phi
20 |
21 |
22 | # Tests
23 |
24 | @pytest.mark.parametrize('x', [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000])
25 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
26 | def test_cython_digamma(x, dtype):
27 |     x = dtype(x)
28 |     # using approx_equal for float32 :(
29 |     assert_allclose(hpf_numba.psi(x), digamma(x))
30 |
31 |
32 | @pytest.mark.parametrize('x', [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000])
33 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
34 | def test_cython_gammaln(x, dtype):
35 |     x = dtype(x)
36 |     # using approx_equal for float32 :(
37 |     assert_allclose(hpf_numba.cgammaln(x), gammaln(x))
38 |
39 |
40 | def test_compute_Xphi_numba(data, model):
41 |     def compute_Xphi_numpy(X, theta, beta):
42 |         logrho = theta.e_logx[X.row, :] + beta.e_logx[X.col, :]
43 |         logphi = logrho - logsumexp(logrho, axis=1)[:,None]
44 |         return X.data[:,None] * np.exp(logphi)
45 |     Xphi = compute_Xphi_numpy(data, model.theta, model.beta)
46 |     # increase rtol for float32
47 |     assert_allclose(
48 |         hpf_numba.compute_Xphi_data(
49 |             data.data, data.row, data.col,
50 |             model.theta.vi_shape, model.theta.vi_rate,
51 |             model.beta.vi_shape, model.beta.vi_rate),
52 |         Xphi,
53 |         rtol=1e-5 if model.dtype==np.float32 else 1e-7, atol=0)
54 |     assert_allclose(
55 |         hpf_numba.compute_Xphi_data_numpy(data, model.theta, model.beta),
56 |         Xphi,
57 |         rtol=1e-5 if model.dtype==np.float32 else 1e-7, atol=0)
58 |
59 |
60 | def test_compute_theta_shape_numba(model, Xphi, data):
61 |     nfactors = model.nfactors
62 |     reference = np.zeros((model.ncells, nfactors), dtype=model.dtype)
63 |     for k in range(nfactors):
64 |         reference[:,k] = coo_matrix(
65 |             (Xphi[:, k], (data.row, data.col)),
66 |             (model.ncells, model.ngenes)
67 |             ).sum(1).A[:,0]
68 |     reference += model.a
69 |     assert_allclose(
70 |         hpf_numba.compute_loading_shape_update(
71 |             Xphi, data.row, model.ncells, model.a),
72 |         reference,
73 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
74 |
75 |
76 | def test_compute_beta_shape_numba(model, Xphi, data):
77 |     reference = np.zeros((model.ngenes, model.nfactors), dtype=model.dtype)
78 |     for k in range(model.nfactors):
79 |         reference[:,k] = coo_matrix(
80 |             (Xphi[:, k], (data.col, data.row)),
81 |             (model.ngenes, model.ncells)
82 |             ).sum(1).A[:,0]
83 |     reference += model.c
84 |     assert_allclose(
85 |         hpf_numba.compute_loading_shape_update(
86 |             Xphi, data.col, model.ngenes, model.c),
87 |         reference,
88 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
89 |
90 |
91 | def test_compute_theta_rate_numba(model):
92 |     reference = model.xi.e_x[:,None] + model.beta.e_x.sum(0)[None,:]
93 |     assert_allclose(
94 |         hpf_numba.compute_loading_rate_update(
95 |             model.xi.vi_shape, model.xi.vi_rate,
96 |             model.beta.vi_shape, model.beta.vi_rate),
97 |         reference
98 |         )
99 |
100 |
101 | def test_compute_eta_rate_numba(model):
102 |     reference = model.beta.e_x.sum(axis=1) + model.dp
103 |     assert_allclose(
104 |         hpf_numba.compute_capacity_rate_update(
105 |             model.beta.vi_shape, model.beta.vi_rate,
106 |             model.dp),
107 |         reference,
108 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
109 |
110 |
111 | def test_llh_pois(data, model):
112 |     e_rate = model.theta.e_x @ model.beta.e_x.T
113 |     desired = data.data * np.log(e_rate[data.row, data.col]) \
114 |             - e_rate[data.row, data.col] \
115 |             - gammaln(data.data + 1)
116 |     assert_allclose(
117 |         hpf_numba.compute_pois_llh(data.data, data.row, data.col,
118 |             model.theta.vi_shape, model.theta.vi_rate,
119 |             model.beta.vi_shape, model.beta.vi_rate),
120 |         desired,
121 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
122 |
123 |
124 |
-------------------------------------------------------------------------------- /tests/test_misc.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import schpf
4 |
5 | def test_version():
6 |     assert schpf.__version__ is not None
7 |     # assert schpf.__version__ == '0.2.5'
8 |
-------------------------------------------------------------------------------- /tests/test_preprocessing.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pathlib import Path
4 | import numpy as np
5 | import pandas as pd
6 | from scipy.sparse import coo_matrix
7 |
8 | import pytest
9 | from numpy.testing import assert_equal, assert_array_equal
10 |
11 | from schpf import preprocessing as prep
12 |
13 |
14 | TXT = str(Path(__file__).parent / \
15 |         Path('_data/PJ030merge.c300t400_g0t500.matrix.txt'))
16 | NCELLS = 100
17 | NGENES = 500
18 |
19 | # TODO figure out how to get this without going this far up tree or doubling
20 | # perhaps make a small copy?
21 | PROTEIN_CODING = str(
22 |     Path(*Path(__file__).parts[:-2]) / Path(
23 |         'resources/gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt'))
24 | BLIST = str( Path(__file__).parent / Path('_data/sample_blacklist.txt') )
25 |
26 |
27 | @pytest.fixture()
28 | def protein_coding():
29 |     return pd.read_csv(PROTEIN_CODING, delim_whitespace=True, header=None)
30 |
31 |
32 | @pytest.fixture()
33 | def blacklist():
34 |     return pd.read_csv(BLIST, delim_whitespace=True, header=None)
35 |
36 |
37 | @pytest.fixture()
38 | def exp_genes():
39 |     return pd.read_csv(TXT, delim_whitespace=True, header=None)[[0,1]]
40 |
41 |
42 | @pytest.mark.parametrize('ngene_cols', [2,3])
43 | def test_load_txt(ngene_cols):
44 |     coo, genes = prep.load_txt(TXT, ngene_cols)
45 |     assert genes.shape[1] == ngene_cols
46 |     assert coo.shape[1] == NGENES
47 |     assert genes.shape[0] == NGENES
48 |     assert coo.shape[0] == NCELLS + 2 - ngene_cols
49 |
50 |
51 | # TODO add loom and test, also check that this works since passed shouldn't?
52 | def test_load_like(tmp_path):
53 |     gene_file = str(tmp_path / 'genes.txt')
54 |
55 |     # make a permutation
56 |     perm = np.random.choice(NGENES, NGENES-10, replace=False)
57 |
58 |     # load data to make reference and permute
59 |     umis, genes = prep.load_txt(TXT)
60 |     umis = umis.A[:, perm]
61 |     genes = genes.loc[perm]
62 |
63 |     # write permuted/subsampled reference file
64 |     genes.to_csv(gene_file, header=None, sep='\t', index=None)
65 |
66 |     # load like permuted reference
67 |     ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file)
68 |     assert_equal(len(ll_genes), len(perm))
69 |     assert_array_equal(umis, ll_umi.A)
70 |
71 |     # repeat with no_split_on_dot
72 |     ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file,
73 |             no_split_on_dot=True)
74 |     assert_equal(len(ll_genes), len(perm))
75 |     assert_array_equal(umis, ll_umi.A)
76 |
77 |     # by gene name
78 |     ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file,
79 |             by_gene_name=True)
80 |     assert_equal(len(ll_genes), len(perm))
81 |     assert_array_equal(umis, ll_umi.A)
82 |
83 |     # corrupt the permuted reference
84 |     bad_genes = genes.copy()
85 |     bad_genes.loc[5, 0] = 'random'
86 |     bad_genes.to_csv(gene_file, header=None, sep='\t', index=None)
87 |     with pytest.raises(ValueError):
88 |         ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file)
89 |
90 |
91 | def test_min_cells_expressing(data):
92 |     ncells, ngenes = data.shape
93 |     # test all true when 0
94 |     min_cells = 0
95 |     assert_equal(prep.min_cells_expressing_mask(data, min_cells).sum(),
96 |                  ngenes)
97 |
98 |     # test all false when > ncells
99 |     min_cells = ncells + 1
100 |     assert_equal(prep.min_cells_expressing_mask(data, min_cells).sum(),
101 |                  0)
102 |     min_cells = 0.9999999
103 |     assert min_cells < 1
104 |     assert_equal(prep.min_cells_expressing_mask(data, min_cells).sum(),
105 |                  0)
106 |
107 |     # test for reasonable value
108 |     min_cells = 5
109 |     n_expressing = data.astype(bool).sum(axis=0).A[0, :]
110 |     mask = n_expressing >= min_cells
111 |     assert_array_equal(prep.min_cells_expressing_mask(data, min_cells),
112 |                        mask)
113 |     # test same for proportion
114 |     min_cells_prop = min_cells / ncells
115 |     assert_array_equal(prep.min_cells_expressing_mask(data, min_cells_prop),
116 |                        mask)
117 |
118 |
119 | def test_genelist_mask(protein_coding, exp_genes):
120 |     shared_ens = exp_genes[0].str.split('.').str[0].isin(
121 |             protein_coding[0].str.split('.').str[0])
122 |     shared_gene = exp_genes[1].isin(protein_coding[1])
123 |
124 |     # whitelist
125 |     assert_array_equal(prep.genelist_mask(exp_genes[0], protein_coding[0]),
126 |                        shared_ens)
127 |     assert_array_equal(prep.genelist_mask(exp_genes[1], protein_coding[1]),
128 |                        shared_gene)
129 |
130 |     # blacklist
131 |     assert_array_equal(prep.genelist_mask(exp_genes[0], protein_coding[0],
132 |                            whitelist=False),
133 |                        ~shared_ens)
134 |     assert_array_equal(prep.genelist_mask(exp_genes[1], protein_coding[1],
135 |                            whitelist=False),
136 |                        ~shared_gene)
137 |
138 |
139 | def test_subsample_cell_ixs():
140 |     # int for choices
141 |     assert_equal(len(prep.subsample_cell_ixs(20, 10)), 10)
142 |     # array of choices
143 |     assert_equal(len(prep.subsample_cell_ixs(np.arange(20), 10)), 10)
144 |
145 |     # test picks one from a group
146 |     group_ids = np.array([0] * 100 + [1,1])
147 |     idx = prep.subsample_cell_ixs(102, 10, group_ids=group_ids,
148 |             max_group_frac=0.5)
149 |     assert (100 in idx) ^ (101 in idx) #xor
150 |     assert_equal(len(idx), 10)
151 |
152 |     # test doesn't pick when can't under constraint
153 |     group_ids = np.array([0] * 18 + [1,1])
154 |     idx = prep.subsample_cell_ixs(20, 5, group_ids=group_ids,
155 |             max_group_frac=0.4)
156 |     assert (not 18 in idx) and (not 19 in idx) #neither of the group 1 indexes
157 |     assert_equal(len(idx), 5) # but still have 5 items
158 |
159 |
160 |     # test doesn't pick more than it can under constraint
161 |     group_ids = np.array([0] * 18 + [1,1])
162 |     idx = prep.subsample_cell_ixs(20, 5, group_ids=group_ids,
163 |             max_group_frac=0.25)
164 |     assert (not 18 in idx) and (not 19 in idx) #neither of the group 1 indexes
165 |     assert_equal(len(idx), 4) # should have floor(0.25*18) items
166 |     with pytest.warns(UserWarning) as record:
167 |         idx = prep.subsample_cell_ixs(20, 5, group_ids=group_ids,
168 |                 max_group_frac=0.25)
169 |     assert len(record) == 1
170 |
171 |
172 | def test_load_and_filter(protein_coding, blacklist):
173 |     filtered_m2, genes_m2 = prep.load_and_filter(TXT, min_cells=2,
174 |             whitelist=PROTEIN_CODING, blacklist=BLIST)
175 |     assert_equal(filtered_m2.shape[0], NCELLS)
176 |     assert filtered_m2.shape[1] <= NGENES
177 |     assert_equal(filtered_m2.shape[1], len(genes_m2))
178 |     assert_equal(genes_m2[0].str.split('.').str[0].isin(
179 |             blacklist[0].str.split('.').str[0]).sum(),
180 |         0)
181 |     assert_equal(genes_m2[0].str.split('.').str[0].isin(
182 |             protein_coding[0].str.split('.').str[0]).sum(),
183 |         len(genes_m2))
184 |
185 |     filtered_m5, genes_m5 = prep.load_and_filter(TXT, min_cells=5,
186 |             whitelist=PROTEIN_CODING, blacklist=BLIST)
187 |     assert filtered_m5.shape[1] <= filtered_m2.shape[1]
188 |     assert np.all(filtered_m5.astype(bool).sum(axis=0).A >= 5)
189 |
-------------------------------------------------------------------------------- /tests/test_scHPF_model.py: --------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/tests/test_scHPF_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | 
5 | import pytest
6 | from numpy.testing import assert_equal
7 | from numpy.testing import assert_array_equal
8 | 
9 | from schpf import HPF_Gamma, scHPF, combine_across_cells
10 | 
11 | """For tests of inference, see test_inference.py
12 | """
13 | 
14 | def test__setup_meanvar(model_uninit, data):
15 |     bp, dp, xi, eta, theta, beta = model_uninit._setup(X=data,
16 |             freeze_genes=False, reinit=True)
17 |     cell_sums = data.sum(axis=1)
18 |     gene_sums = data.sum(axis=0)
19 | 
20 |     # test hyperparams set to mean/var ratios
21 |     assert_equal(bp, np.mean(cell_sums) / np.var(cell_sums))
22 |     assert_equal(dp, np.mean(gene_sums) / np.var(gene_sums))
23 | 
24 | 
25 | def test__setup_dims(model_uninit, data):
26 |     bp, dp, xi, eta, theta, beta = model_uninit._setup(X=data,
27 |             freeze_genes=False, reinit=True)
28 | 
29 |     assert_equal(xi.vi_shape.shape[0], data.shape[0])
30 |     assert_equal(xi.vi_rate.shape[0], data.shape[0])
31 |     assert_equal(len(xi.vi_shape.shape), 1)
32 |     assert_equal(len(xi.vi_rate.shape), 1)
33 | 
34 |     assert_equal(eta.vi_shape.shape[0], data.shape[1])
35 |     assert_equal(eta.vi_rate.shape[0], data.shape[1])
36 |     assert_equal(len(eta.vi_shape.shape), 1)
37 |     assert_equal(len(eta.vi_rate.shape), 1)
38 | 
39 |     assert_equal(theta.vi_shape.shape[0], data.shape[0])
40 |     assert_equal(theta.vi_rate.shape[0], data.shape[0])
41 |     assert_equal(theta.vi_shape.shape[1], model_uninit.nfactors)
42 |     assert_equal(theta.vi_rate.shape[1], model_uninit.nfactors)
43 | 
44 |     assert_equal(beta.vi_shape.shape[0], data.shape[1])
45 |     assert_equal(beta.vi_rate.shape[0], data.shape[1])
46 |     assert_equal(beta.vi_shape.shape[1], model_uninit.nfactors)
47 |     assert_equal(beta.vi_rate.shape[1], model_uninit.nfactors)
48 | 
49 | 
50 | def test__setup_freeze(model, data):
51 |     my_data = data.tocsr()[:20].tocoo()
52 |     bp, dp, xi, eta, theta, beta = (model.bp, model.dp, model.xi,
53 |             model.eta, model.theta, model.beta)
54 | 
55 |     model.bp = None
56 |     bp2, dp2, xi2, eta2, theta2, beta2 = model._setup(X=my_data,
57 |             freeze_genes=True, reinit=True)
58 | 
59 |     # gene-side values (should be the same when genes are frozen)
60 |     assert_equal(dp2, dp)
61 |     assert_equal(eta2, eta)
62 |     assert_equal(beta2, beta)
63 | 
64 |     # cell-side values (should be reinitialized for the new data)
65 |     assert bp2 != bp
66 |     assert xi2.dims != xi.dims
67 |     assert theta2.dims != theta.dims
68 | 
69 |     # check bp not updated w/ freeze_genes if already set
70 |     model.bp = bp
71 |     bp3, _, _, _, _, _ = model._setup(X=my_data, freeze_genes=True, reinit=True)
72 |     print(bp, bp2, bp3)
73 |     assert bp3 == bp
74 |     assert bp3 != bp2
75 | 
76 | 
77 | def test__set_ac(model_uninit):
78 |     model_uninit.nfactors = None
79 |     with pytest.raises(ValueError):
80 |         model_uninit.a = -2
81 |     with pytest.raises(ValueError):
82 |         model_uninit.c = -2
83 | 
84 |     model_uninit.nfactors = 15
85 |     model_uninit.a = -2
86 |     assert model_uninit.a == 1/np.sqrt(15)
87 |     model_uninit.c = -2
88 |     assert model_uninit.c == 1/np.sqrt(15)
89 | 
90 | @pytest.mark.parametrize('a_dims', [[5,], [5,10]])
91 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
92 | def test_HPF_Gamma_combine(a_dims, dtype):
93 |     a_vi_shape = np.ones(a_dims, dtype=dtype)
94 |     a_vi_rate = np.ones(a_dims, dtype=dtype)
95 |     a = HPF_Gamma(a_vi_shape, a_vi_rate)
96 | 
97 |     b_dims = a_dims.copy()
98 |     b_dims[0] = 3
99 |     b_vi_shape = 2*np.ones(b_dims, dtype=dtype)
100 |     b_vi_rate = 2*np.ones(b_dims, dtype=dtype)
101 |     b = HPF_Gamma(b_vi_shape, b_vi_rate)
102 | 
103 |     b_ix = [0,5,7]
104 |     ab = a.combine(b, b_ix)
105 |     assert_equal(ab.dims[0], a.dims[0] + b.dims[0])
106 |     # check b rows landed at b_ix
107 |     assert_array_equal(ab.vi_shape[b_ix], b.vi_shape)
108 |     assert_array_equal(ab.vi_rate[b_ix], b.vi_rate)
109 |     # check a rows too
110 |     a_ix = np.setdiff1d(np.arange(ab.dims[0]), b_ix)
111 |     print(a_ix)
112 |     assert_array_equal(ab.vi_shape[a_ix], a.vi_shape)
113 |     assert_array_equal(ab.vi_rate[a_ix], a.vi_rate)
114 | 
115 |     b_ix = [4]  # too few indices
116 |     with pytest.raises(AssertionError):
117 |         ab = a.combine(b, b_ix)
118 | 
119 |     b_ix = [0,1,2,3]  # too many indices
120 |     with pytest.raises(AssertionError):
121 |         ab = a.combine(b, b_ix)
122 | 
123 | 
124 |     b_ix = [0,1,2,2]  # duplicate index
125 |     with pytest.raises(AssertionError):
126 |         ab = a.combine(b, b_ix)
127 | 
128 |     b_ix = [7,8,9]  # out of range for the combined distribution
129 |     with pytest.raises(AssertionError):
130 |         ab = a.combine(b, b_ix)
131 | 
132 | 
133 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
134 | def test_project(data, dtype):
135 |     # get b indices
136 |     b_idx = np.random.choice(data.shape[0], 10, replace=False)
137 |     # get remaining indices (for a)
138 |     a_idx = np.setdiff1d(np.arange(data.shape[0]), b_idx)
139 |     # split data
140 |     data_csr = data.tocsr()
141 |     a_data = data_csr[a_idx].tocoo()
142 |     b_data = data_csr[b_idx].tocoo()
143 | 
144 |     # setup model for a_data
145 |     a_model = scHPF(5, dtype=dtype)
146 |     a_model._initialize(a_data)
147 |     bp = a_model.bp
148 | 
149 |     # project b_data onto a_model
150 |     b_model = a_model.project(b_data)
151 |     # check genes frozen
152 |     assert_equal(b_model.eta, a_model.eta)
153 |     assert_equal(b_model.beta, a_model.beta)
154 |     # check cells different
155 |     assert_equal(a_model.ncells, a_data.shape[0])
156 |     assert_equal(b_model.ncells, b_data.shape[0])
157 |     # check bp unchanged
158 |     assert_equal(b_model.bp, bp)
159 | 
160 |     # check bp updates when we ask it to
161 |     c_model = a_model.project(b_data, recalc_bp=True)
162 |     assert c_model.bp != bp
163 | 
164 | 
165 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
166 | def test_combine_across_cells(data, dtype):
167 |     # get b indices
168 |     b_ixs = np.random.choice(data.shape[0], 10, replace=False)
169 |     # get a indices (remaining)
170 |     a_ixs = np.setdiff1d(np.arange(data.shape[0]), b_ixs)
171 |     # split data
172 |     data_csr = data.tocsr()
173 |     a_data = data_csr[a_ixs].tocoo()
174 |     b_data = data_csr[b_ixs].tocoo()
175 | 
176 |     # setup model for a_data
177 |     a = scHPF(5, dtype=dtype)
178 |     a._initialize(a_data)
179 |     # setup model for b_data w/ same dp, eta, beta
180 |     b = scHPF(5, dtype=dtype, dp=a.dp, eta=a.eta, beta=a.beta)
181 |     b._initialize(b_data, freeze_genes=True)
182 | 
183 |     ab = combine_across_cells(a, b, b_ixs)
184 | 
185 |     # check bp is None since it is different across the two models
186 |     assert_equal(ab.bp, None)
187 |     # check a locals where they should be in xi and theta
188 |     assert_array_equal(ab.xi.vi_shape[a_ixs], a.xi.vi_shape)
189 |     assert_array_equal(ab.xi.vi_rate[a_ixs], a.xi.vi_rate)
190 |     assert_array_equal(ab.theta.vi_shape[a_ixs], a.theta.vi_shape)
191 |     assert_array_equal(ab.theta.vi_rate[a_ixs], a.theta.vi_rate)
192 | 
193 |     # check b locals where they should be in xi and theta
194 |     assert_array_equal(ab.xi.vi_shape[b_ixs], b.xi.vi_shape)
195 |     assert_array_equal(ab.xi.vi_rate[b_ixs], b.xi.vi_rate)
196 |     assert_array_equal(ab.theta.vi_shape[b_ixs], b.theta.vi_shape)
197 |     assert_array_equal(ab.theta.vi_rate[b_ixs], b.theta.vi_rate)
198 | 
199 |     # check globals unchanged
200 |     assert_equal(ab.eta, a.eta)
201 |     assert_equal(ab.eta, b.eta)
202 |     assert_equal(ab.beta, a.beta)
203 |     assert_equal(ab.beta, b.beta)
204 | 
205 | 
206 | # TODO write this, also do for run_trials_pool
207 | # def test_run_trials(data):
208 | #     pass
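[Editor's note] A possible starting point for the TODO above, offered as a
hedged sketch rather than repository code: it assumes schpf exports a
run_trials(X, nfactors, ntrials=...) that returns the best trained model
(check the signature in scHPF_.py before adopting), and would need a
counterpart for run_trials_pool once its signature is confirmed.

    import schpf

    def test_run_trials_smoke(data):
        # skip rather than fail if this scHPF version lacks the export
        if not hasattr(schpf, 'run_trials'):
            pytest.skip('schpf.run_trials not available')
        # assumed signature: run_trials(X, nfactors, ntrials=...)
        best = schpf.run_trials(data, 5, ntrials=2)
        assert_equal(best.ncells, data.shape[0])
        assert_equal(best.ngenes, data.shape[1])
        assert_equal(best.theta.vi_shape.shape, (data.shape[0], 5))
        assert_equal(best.beta.vi_shape.shape, (data.shape[1], 5))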
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | from numpy.testing import assert_equal, assert_array_equal
5 | from scipy.sparse import coo_matrix
6 | import pytest
7 | 
8 | from schpf import max_pairwise, max_pairwise_table
9 | from schpf.util import split_coo_rows, collapse_coo_rows, insert_coo_rows
10 | from schpf.util import mean_cellscore_fraction
11 | 
12 | 
13 | def test_mean_cellscore_fraction():
14 |     X = np.array([
15 |         [10, 1, 1, 1, 1],
16 |         [1, 1, 1, 1, 1],
17 |         [10, 10, 1, 1, 1],
18 |         [1, 1, 1, 1, 10]])
19 |     assert_equal(mean_cellscore_fraction(X, 5), 1.0)
20 |     assert np.abs(mean_cellscore_fraction(X, 1) - 0.5) < 0.02
21 | 
22 | 
23 | def test_overlap():
24 |     X = np.array([
25 |         [0.33973751, 0.72029002, 0.52763837, 0.94012605, 0.20375346],
26 |         [0.32460224, 0.43595206, 0.8304655 , 0.31792094, 0.77330563],
27 |         [0.00507031, 0.42707696, 0.26948512, 0.50554657, 0.31438824],
28 |         [0.52583849, 0.54531833, 0.08530654, 0.35516516, 0.10617843],
29 |         [0.78608326, 0.59571929, 0.09737211, 0.09474643, 0.55319175],
30 |         [0.04245016, 0.43322226, 0.99748447, 0.45731582, 0.65861378],
31 |         [0.04364505, 0.97239799, 0.68847276, 0.96692073, 0.60268244],
32 |         [0.13364376, 0.40121588, 0.32770517, 0.02352124, 0.04974099],
33 |         [0.92531954, 0.23635494, 0.29327799, 0.40788107, 0.95974159],
34 |         [0.42295065, 0.5725946 , 0.59206089, 0.76534785, 0.77961214]])
35 |     assert_equal(max_pairwise(X, ntop=3)[0], 2)
36 |     assert_equal(max_pairwise(X, ntop=3, second_greatest=True)[0], 1)
37 | 
38 | 
39 | def test_overlap_table():
40 |     X = np.array([
41 |         [13, 11,  2, 13,  5, 12,  5],
42 |         [ 8,  6,  0,  6,  8, 14,  6],
43 |         [11, 13, 11, 11, 14,  8, 10],
44 |         [ 1, 12,  8, 14,  7,  1,  3],
45 |         [ 0,  1, 10, 12,  3,  5,  2],
46 |         [ 3,  2,  6,  5,  9,  2,  1],
47 |         [ 9,  3,  7,  2,  4,  3,  7],
48 |         [ 4,  4, 12,  7, 13, 10,  0],
49 |         [ 7,  0,  9,  8,  6,  6, 12],
50 |         [14, 14, 13,  9, 10,  9, 14],
51 |         [ 2,  9,  4, 10, 12,  7, 13],
52 |         [ 6, 10,  3,  1,  0, 11,  4],
53 |         [12,  8,  1,  0,  2, 13,  9],
54 |         [ 5,  7, 14,  3, 11,  4, 11],
55 |         [10,  5,  5,  4,  1,  0,  8]])
56 |     ntop_list = [1,2,3,4,5,6]
57 |     table = max_pairwise_table(X, ntop_list=ntop_list)
58 |     assert np.all(table.max_overlap >= table.max2_overlap)
59 |     assert np.any(table.max_overlap > table.max2_overlap)
60 |     assert np.all(np.diff(table.max_overlap.values) >= 0)
61 |     assert np.all(np.diff(table.max2_overlap.values) >= 0)
62 | 
63 | 
64 | def test_split_coo_rows():
65 |     row = np.array([0, 0, 2, 3, 3, 3])
66 |     col = np.array([0, 2, 2, 0, 1, 2])
67 |     data = np.array([1, 2, 3, 4, 5, 6])
68 |     X = coo_matrix((data, (row, col)))
69 | 
70 |     a, b = split_coo_rows(X, np.array([0,2,3]))
71 |     assert_equal(a.shape[0], 3)
72 |     assert_equal(a.shape[1], 3)
73 |     assert_equal(b.shape[0], 1)
74 |     assert_equal(b.shape[1], 3)
75 |     assert_array_equal(b.todense()[0,:], X.todense()[1,:])
76 | 
77 | 
78 | def test_collapse_coo_rows():
79 |     a_row = np.array([0, 0, 2, 3, 3, 3])
80 |     a_col = np.array([0, 2, 2, 0, 1, 2])
81 |     a_data = np.array([1, 2, 3, 4, 5, 6])
82 |     a = coo_matrix((a_data, (a_row, a_col)))
83 | 
84 |     collapsed, nz = collapse_coo_rows(a)
85 |     assert_equal(collapsed.shape[0], a.shape[0]-1)
86 |     assert_array_equal(nz, np.array([0,2,3]))
87 | 
88 | 
89 | def test_insert_coo_rows():
90 |     a_row = np.array([0, 0, 1, 2, 2, 2])
91 |     a_col = np.array([0, 2, 2, 0, 1, 2])
92 |     a_data = np.array([1, 2, 3, 4, 5, 6])
93 |     a = coo_matrix((a_data, (a_row, a_col)))
94 | 
95 |     b_row = np.array([0, 1, 1])
96 |     b_col = np.array([2, 1, 2])
97 |     b_data = np.array([11, 12, 13])
98 |     b = coo_matrix((b_data, (b_row, b_col)))
99 | 
100 |     b_indices = [0,1]
101 |     ab = insert_coo_rows(a, b, b_indices)
102 |     assert_equal(ab.shape[0], a.shape[0] + b.shape[0])
103 |     assert_array_equal(ab.todense()[0, :], b.todense()[0,:])
104 |     assert_array_equal(ab.todense()[1, :], b.todense()[1,:])
105 | 
106 |     b_indices = [1,4]
107 |     ab = insert_coo_rows(a, b, b_indices)
108 |     assert_equal(ab.shape[0], a.shape[0] + b.shape[0])
109 |     assert_array_equal(ab.todense()[0, :], a.todense()[0,:])
110 |     assert_array_equal(ab.todense()[1, :], b.todense()[0,:])
111 | 
112 |     with pytest.raises(ValueError) as execinfo:
113 |         b_indices = [1,4]
114 |         b = coo_matrix((b_data, (b_row, b_col)), shape=[3, 5])
115 |         insert_coo_rows(a, b, b_indices)
116 |     assert "a.shape[1] must equal b.shape[1]" in str(execinfo.value)
117 | 
118 |     with pytest.raises(ValueError) as execinfo:
119 |         b_indices = [1,7]
120 |         b = coo_matrix((b_data, (b_row, b_col)))
121 |         insert_coo_rows(a, b, b_indices)
122 |     assert "Invalid row indices" in str(execinfo.value)
123 | 
124 |     with pytest.raises(ValueError) as execinfo:
125 |         b_indices = [2,1]
126 |         insert_coo_rows(a, b, b_indices)
127 |     assert "must be ordered" in str(execinfo.value)
128 | 
129 |     with pytest.raises(ValueError) as execinfo:
130 |         b_indices = [1,1]
131 |         insert_coo_rows(a, b, b_indices)
132 |     assert "must be ordered" in str(execinfo.value)
133 | 
134 | 
--------------------------------------------------------------------------------