├── .gitignore ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── bin └── scHPF ├── docs ├── Makefile ├── changelog.rst ├── cli-man.rst ├── conf.py ├── genelists.rst ├── img │ ├── cell-type-rep-01.png │ └── k_selection_minifig-01.png ├── index.rst ├── install.rst ├── make.bat ├── prep-cli.rst ├── project.rst ├── references.rst ├── score-cli.rst ├── select-k.rst └── train-cli.rst ├── resources ├── README.md ├── gencode.v24.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── gencode.v31.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── gencode.vM10.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt └── gencode.vM19.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt ├── schpf ├── __init__.py ├── _version.py ├── hpf_numba.py ├── loss.py ├── preprocessing.py ├── scHPF_.py └── util.py ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── _data ├── PJ030merge.c300t400_g0t500.matrix.txt └── sample_blacklist.txt ├── conftest.py ├── test_inference.py ├── test_misc.py ├── test_preprocessing.py ├── test_scHPF_model.py └── test_util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | 
formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - method: pip 24 | path: . 25 | extra_requirements: 26 | - docs 27 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at h.mendes.levitin@columbia.edu. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Sims Lab. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single-cell Hierarchical Poisson Factorization 2 | 3 | ## About 4 | scHPF is a tool for _de novo_ discovery of both discrete and continuous expression patterns in single-cell RNA\-sequencing (scRNA-seq). We find that scHPF’s sparse low-dimensional representations, non-negativity, and explicit modeling of variable sparsity across genes and cells produce highly interpretable factors. 
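A minimal sketch of the Python API, mirroring the calls the bundled command-line interface makes internally (`run_trials`, `cell_score`, `gene_score`); the file names and the choice of `nfactors`/`ntrials` below are only illustrative:

```python
import joblib
from scipy.io import mmread
from schpf import run_trials

# load a prepped UMI-count matrix (the `scHPF prep` output)
X = mmread('filtered.mtx')

# run several trials at K=10; the trial with the best loss is returned
model = run_trials(X, nfactors=10, ntrials=5)

# trained models serialize with joblib
joblib.dump(model, 'scHPF_K10.joblib')

# per-cell and per-gene factor scores, one column per factor
cell_score = model.cell_score()
gene_score = model.gene_score()
```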
5 | 
6 | - [Documentation](https://schpf.readthedocs.io/en/latest/) 
7 | - [Changelog](https://schpf.readthedocs.io/en/latest/changelog.html) 
8 | - [Paper at Molecular Systems Biology](https://doi.org/10.15252/msb.20188557) 
9 | - [Application to human tissue T cells across multiple donors and tissues](https://doi.org/10.1038/s41467-019-12464-3) 
10 | 
11 | ## Installation 
12 | ### Environment & Dependencies 
13 | scHPF requires Python >= 3.6 and the packages: 
14 | - numba ([version needed depends on Python version](https://schpf.readthedocs.io/en/latest/install.html#numba-compatibility), but should be safe with 0.45) 
15 | - scikit-learn 
16 | - pandas 
17 | - (optional) loompy 
18 | 
19 | The easiest way to set up an environment for scHPF is with the Anaconda 
20 | Python distribution in [Miniconda](https://conda.io/miniconda.html) or 
21 | [anaconda](https://www.continuum.io/downloads): 
22 | 
23 | ``` 
24 | conda create -n schpf_p37 python=3.7 scikit-learn numba=0.50 pandas 
25 | 
26 | # for newer anaconda versions 
27 | conda activate schpf_p37 
28 | # XOR older anaconda versions 
29 | source activate schpf_p37 
30 | 
31 | # Optional, for using loom files as input to preprocessing 
32 | pip install -U loompy 
33 | ``` 
34 | 
35 | ### Installing from source 
36 | Once you have set up the environment, clone this repository and install. 
37 | ``` 
38 | git clone git@github.com:simslab/scHPF.git 
39 | cd scHPF 
40 | pip install . 
41 | ``` 
42 | 
43 | ### Testing your installation 
44 | This step is important because not all micro-versions of numba play nicely with 
45 | all micro-versions of Python or numpy, and sometimes issues vary across 
46 | machines. Testing will catch some but not all such issues. From the scHPF base 
47 | directory do: 
48 | ``` 
49 | conda install pytest 
50 | pytest 
51 | ``` 
52 | Please get in touch if tests fail, or if you get segmentation faults or very 
53 | long train times with no automatic parallelization, and I'm happy to try to 
54 | help. 
55 | 
56 | ## Quick Start: Command Line Interface 
57 | 
58 | 1. [Prepare your data](https://schpf.readthedocs.io/en/latest/prep-cli.html). 
59 | 
60 | 2. [Train a model](https://schpf.readthedocs.io/en/latest/train-cli.html). 
61 | 
62 | 3. [Get gene and cell scores](https://schpf.readthedocs.io/en/latest/score-cli.html). 
63 | 
64 | 
65 | ## API 
66 | scHPF has a scikit-learn-like API, sketched in the example above. Trained models are stored in a serialized 
67 | joblib format. 
68 | 
69 | 
70 | ## Help and support 
71 | If you have any questions/errors/issues, please [open an issue](https://github.com/simslab/scHPF/issues/new) 
72 | and I'll be happy to provide whatever help and guidance I can. 
73 | 
74 | 
75 | ## Contributing 
76 | Contributions to scHPF are welcome. Please get in touch if you would like to 
77 | discuss/check whether it's something I've already done but haven't pushed to master yet. 
78 | To contribute, please [fork 
79 | scHPF](https://github.com/simslab/scHPF/issues#fork-destination-box), make your 
80 | changes, and submit a pull request. 
81 | 
82 | ## References 
83 | Hanna Mendes Levitin, Jinzhou Yuan, Yim Ling Cheng, Francisco JR Ruiz, Erin C Bush, 
84 | Jeffrey N Bruce, Peter Canoll, Antonio Iavarone, Anna Lasorella, David M Blei, Peter A Sims. 
85 | __"*De novo* gene signature identification from single‐cell RNA‐seq with hierarchical Poisson 
86 | factorization."__ Molecular Systems Biology, 2019. [[Open access article]](http://msb.embopress.org/content/15/2/e8557.full.pdf) 
87 | 
88 | Peter A. Szabo\*, Hanna Mendes Levitin\*, Michelle Miron, Mark E.
Snyder, 89 | Takashi Senda, Jinzhou Yuan, Yim Ling Cheng, Erin C. Bush, Pranay Dogra, Puspa 90 | Thapa, Donna L. Farber, Peter A. Sims. __"Single-cell transcriptomics of human 91 | T cells reveals tissue and activation signatures in health and disease."__ Nature Communications, 2019. 92 | [[Open access article]](https://doi.org/10.1038/s41467-019-12464-3) 93 | \* Co-first authors 94 | 95 | -------------------------------------------------------------------------------- /bin/scHPF: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import json 7 | import time 8 | from functools import partial 9 | 10 | if not sys.warnoptions: 11 | import warnings 12 | warnings.simplefilter("ignore") 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from scipy.io import mmread, mmwrite 17 | from scipy.sparse import coo_matrix 18 | import joblib 19 | 20 | from schpf import scHPF, run_trials, run_trials_pool 21 | from schpf.util import max_pairwise_table, mean_cellscore_fraction_list 22 | from schpf.preprocessing import load_coo, load_and_filter, load_like 23 | from schpf.preprocessing import split_validation_cells 24 | 25 | def _parser(): 26 | # usage = """scHPF [] 27 | 28 | # The most commonly used scHPF commands are: 29 | # prep Prepare data 30 | # train Train a model from data 31 | # score Get cell-scores, gene-scores, and other data 32 | 33 | # Some advanced scHPF commands are: 34 | # prep-like Prepare data with the same genes & order as other data 35 | # project Project data onto a pre-trained model 36 | # train-pool Train a model, parallelized at the level of trials 37 | # """ 38 | parser = argparse.ArgumentParser() 39 | subparsers = parser.add_subparsers(dest='cmd') 40 | 41 | ### Preprocess command 42 | prep = subparsers.add_parser('prep', 43 | help='Prepare data for training') 44 | # data 45 | prep.add_argument('-i', '--input', required=True, 46 | help='Input data. Currently accepts either: (1) a whitespace-' 47 | 'delimited gene by cell UMI count matrix with 2 leading columns ' 48 | 'of gene attributes (ENSEMBL_ID and GENE_NAME respectively), or ' 49 | '(2) a loom file with at least one of the row attributes ' 50 | '`Accession` or `Gene`, where `Accession` is an ENSEMBL id and ' 51 | '`Gene` is the name.' 52 | ) 53 | prep.add_argument('-o', '--outdir', 54 | help='Output directory. Does not need to exist.') 55 | prep.add_argument('-p', '--prefix', default='', 56 | help='Prefix for output files. Optional.') 57 | 58 | # gene filtering criteria 59 | prep.add_argument('-m', '--min-cells', type=float, default=0.01, 60 | help='Minimum number of cells in which we must observe at ' 61 | 'least one transcript of a gene for the gene to pass ' 62 | 'filtering. If 0 <`min_cells`< 1, sets threshold to be ' 63 | '`min_cells` * ncells, rounded to the nearest integer.' 64 | ' [Default 0.01]') 65 | prep.add_argument('-w', '--whitelist', default='', 66 | help='Tab-delimited file where first column contains ENSEMBL gene ' 67 | 'ids to accept, and second column contains corresponding gene ' 68 | 'names. If given, genes not on the whitelist are filtered from ' 69 | 'the input matrix. Superseded by blacklist. Optional.') 70 | prep.add_argument('-b', '--blacklist', default='', 71 | help='Tab-delimited file where first column contains ENSEMBL gene ' 72 | 'ids to exclude, and second column is the corresponding gene name. ' 73 | 'Only performed if file given. 
Genes on the blacklist are ' 
74 | 'excluded even if they are also on the whitelist. Optional.') 
75 | 
76 | # optional selection of cells for validation set 
77 | prep.add_argument('-nvc', '--n-validation-cells', type=int, default=0, 
78 | help='Number of cells to randomly select for validation.') 
79 | prep.add_argument('-vgid', '--validation-group-ids', default=None, 
80 | help= 'Single column file of cell group ids readable with ' 
81 | ' np.loadtxt. If `--n-validation-cells` is > 0, cells will be ' 
82 | ' randomly selected approximately evenly across the groups in ' 
83 | ' this file, under the constraint that at most' 
84 | ' `--validation-max-group-frac` * (ncells in group) are selected' 
85 | ' from every group.') 
86 | prep.add_argument('--validation-max-group-frac', type=float, default=0.5, 
87 | help='If `-nvc`>0 and `validation-group-ids` is a valid file, at' 
88 | ' most `validation-max-group-frac`*(ncells in group) cells are' 
89 | ' selected from each group.') 
90 | 
91 | # other options 
92 | prep.add_argument('--filter-by-gene-name', default=False, 
93 | action='store_true', help='Use gene name rather than ENSEMBL' 
94 | ' id to filter (with whitelist or blacklist). Useful for' 
95 | ' datasets where only gene symbols are given. Applies to both' 
96 | ' whitelist and blacklist. Used by default when input is a loom' 
97 | ' file (unless there is an Accession attribute in the loom).') 
98 | prep.add_argument('--no-split-on-dot', default=False, action='store_true', 
99 | help='Don\'t split gene symbol or name on period before ' 
100 | 'filtering whitelist and blacklist. We do this by default for ' 
101 | 'ENSEMBL ids.') 
102 | 
103 | 
104 | #### Prepare like 
105 | prep_like = subparsers.add_parser('prep-like', 
106 | help='Prepare a data set like another (ie with the same genes in' 
107 | ' the same order)') 
108 | # data 
109 | prep_like.add_argument('-i', '--input', required=True, 
110 | help='Input data to format. Currently accepts either: (1) a' 
111 | ' whitespace-delimited gene by cell UMI count matrix with 2' 
112 | ' leading columns of gene attributes (ENSEMBL_ID and GENE_NAME' 
113 | ' respectively), or (2) a loom file with at least one of the row' 
114 | ' attributes `Accession` or `Gene`, where `Accession` is an' 
115 | ' ENSEMBL id and `Gene` is the name.') 
116 | prep_like.add_argument('-r', '--reference', required=True, 
117 | help='Two-column tab-delimited file of ENSEMBL ids and gene names' 
118 | ' to select from `input` and order like. All genes in `reference`' 
119 | ' must be present in `input`.') 
120 | prep_like.add_argument('-o', '--outdir', required=True, 
121 | help='Output directory. Does not need to exist.') 
122 | prep_like.add_argument('-p', '--prefix', default='', 
123 | help='Prefix for output files. Optional.') 
124 | # other options 
125 | prep_like.add_argument('--by-gene-name', default=False, 
126 | action='store_true', help='Use gene name rather than ENSEMBL' 
127 | ' id when matching against reference. Useful for datasets' 
128 | ' where only gene symbols are given. Used by default when input' 
129 | ' is a loom file (unless there is an Accession attr in the loom).') 
130 | prep_like.add_argument('--no-split-on-dot', default=False, action='store_true', 
131 | help='Don\'t split gene symbol or name on period' 
132 | ' when matching to reference. 
We do this by default for ENSEMBL' 
133 | ' ids.') 
134 | 
135 | 
136 | ###### Train command 
137 | train = subparsers.add_parser('train', 
138 | help='Train a model with automatic parallelization across' 
139 | ' computations with numba') 
140 | # data and saving 
141 | train.add_argument('-i', '--input', required=True, 
142 | help="Training data. Expects either the mtx file output by the " 
143 | "prep command or a tab-separated tsv file formatted like:" 
144 | "`CELL_ID\tGENE_ID\tUMI_COUNT`. In the latter case, ids are " 
145 | "assumed to be 0 indexed and we assume no duplicates." 
146 | ) 
147 | train.add_argument('-o', '--outdir', 
148 | help='Output directory for scHPF model. Will be created if it does ' 
149 | 'not exist.') 
150 | train.add_argument('-p', '--prefix', default='', 
151 | help='Prefix for output files. Optional.') 
152 | 
153 | # Required model hyperparameter 
154 | train.add_argument('-k', '--nfactors', type=int, required=True, 
155 | help='Number of factors.') 
156 | 
157 | # training parameters 
158 | train.add_argument('-t', '--ntrials', type=int, default=1, 
159 | help='Number of times to run scHPF, selecting the trial with ' 
160 | 'best loss (on training data unless validation is given).' 
161 | ' [Default 1]') 
162 | train.add_argument('-v', '--validation-cells', default=None, 
163 | help='Cells to use to assess convergence and choose a model.' 
164 | ' Expects same format as ``-i/--input``. Training data used by' 
165 | ' default.' 
166 | ) 
167 | train.add_argument('-M', '--max-iter', type=int, default=1000, 
168 | help='Maximum iterations. [Default 1000].') 
169 | train.add_argument('-m', '--min-iter', type=int, default=30, 
170 | help='Minimum iterations. [Default 30]') 
171 | train.add_argument('-e', '--epsilon', type=float, default=0.001, 
172 | help='Minimum percent decrease in loss between checks to continue ' 
173 | 'inference (convergence criteria). [Default 0.001].') 
174 | train.add_argument('-f', '--check-freq', type=int, default=10, 
175 | help='Number of iterations to run between convergence checks. ' 
176 | '[Default 10].') 
177 | train.add_argument('--better-than-n-ago', default=5, type=int, 
178 | help= 'Stop condition if loss is getting worse. Stops training ' 
179 | 'if loss is worse than it was `better_than_n_ago`*`check-freq` training ' 
180 | 'steps ago and getting worse. Normally not necessary to change.') 
181 | train.add_argument('-a', type=float, default=0.3, 
182 | help='Value for hyperparameter a. Setting to -2 will auto-set to' 
183 | ' 1/sqrt(nfactors). [Default 0.3]') 
184 | train.add_argument('-c', type=float, default=0.3, 
185 | help='Value for hyperparameter c. Setting to -2 will auto-set to' 
186 | ' 1/sqrt(nfactors). [Default 0.3]') 
187 | train.add_argument('--float32', action='store_true', 
188 | help="Use 32-bit floats instead of default 64-bit floats in" 
189 | " variational distributions") 
190 | train.add_argument('-bs', '--batchsize', default=0, type=int, 
191 | help="Number of cells to use per training round. All cells used if" 
192 | " 0. Note that using batches changes the order of updates during" 
193 | " inference.") 
194 | train.add_argument('-sl', '--smooth-loss', default=1, type=int, 
195 | help="Average loss over the last `--smooth-loss` iterations." 
196 | " Intended for when using minibatches, where int(ncells/batchsize)" 
197 | " is a reasonable value" 
198 | ) 
199 | train.add_argument('-bts', '--beta-theta-simultaneous', action='store_true', 
200 | help="If False (default), compute beta update, then compute theta" 
201 | " based on the updated beta. 
Note that if batching is used, this" 
202 | " order is reversed. If True, update both beta and theta based on" 
203 | " values from the last training round. The latter slows the rate of" 
204 | " convergence and sometimes results in better log-likelihoods, but" 
205 | " may increase convergence time, especially for large numbers of" 
206 | " cells." 
207 | ) 
208 | train.add_argument('-sa', '--save-all', action='store_true', 
209 | help="Save all trials") 
210 | train.add_argument('-rp', '--reproject', action='store_true', 
211 | help="Reproject data onto fixed global (gene) parameters after" 
212 | " convergence, but before model selection. Recommended with" 
213 | " batching") 
214 | train.add_argument('--quiet', dest='verbose', action='store_false', 
215 | default=True, help="Don't print intermediate llh.") 
216 | 
217 | ###### train with trials in threadpool 
218 | train_pool = subparsers.add_parser('train-pool', parents=[train], 
219 | add_help=False, conflict_handler='resolve') 
220 | train_pool.add_argument('--njobs', type=int, default=0, 
221 | help='Max number of processes to spawn. 0 will use the minimum of' 
222 | ' all available cores and ntrials.') 
223 | # Required model hyperparameter 
224 | train_pool.add_argument('-k', '--nfactors', nargs='+', type=int, 
225 | required=True, help='Number of factors.') 
226 | 
227 | 
228 | ### Score command 
229 | score = subparsers.add_parser('score', 
230 | help='Create useful files such as gene scores, cell scores, and' 
231 | ' ranked gene lists in txt format.') 
232 | score.add_argument('-m', '--model', required=True, 
233 | help='Saved scHPF model from train command. Should have extension' 
234 | '`.joblib`') 
235 | score.add_argument('-o', '--outdir', default=None, 
236 | help='Output directory for score files. If not given, a new' 
237 | ' subdirectory of the dir containing the model will be made with' 
238 | ' the same name as the model file (without extension)') 
239 | score.add_argument('-p', '--prefix', default='', 
240 | help='Prefix for output files. Optional.') 
241 | score.add_argument('-g', '--genefile', default=None, 
242 | help='Create an additional file with gene names ranked by score ' 
243 | 'for each factor. Expects the genes.txt file output by the scHPF ' 
244 | 'prep command or a similarly formatted tab-delimited file without ' 
245 | 'headers. Uses the zero-indexed ``--name-col``\'th column as gene ' 
246 | 'names. Optional.') 
247 | score.add_argument('--name-col', type=int, default=1, 
248 | help='The zero-indexed column of `genefile` to use as a gene name ' 
249 | 'when (optionally) ranking genes. If ``--name-col`` is greater' 
250 | ' than the index of ``--genefile``\'s last column, it is ' 
251 | ' automatically reset to the last column\'s index. [Default 1]' 
252 | ) 
253 | 
254 | 
255 | 
256 | # ###### Project command 
257 | proj = subparsers.add_parser('project', 
258 | help='Project new data onto a trained model.') 
259 | # data and saving 
260 | proj.add_argument('-m', '--model', required=True, 
261 | help='The model to project onto.') 
262 | proj.add_argument('-i', '--input', required=True, 
263 | help='Data to project onto model. Expects either the mtx file' 
264 | ' output by the prep or prep-like commands or a tab-delimited' 
265 | ' tsv file formatted like: `CELL_ID\tGENE_ID\tUMI_COUNT`. In the' 
266 | ' latter case, ids are assumed to be 0 indexed and we assume no' 
267 | ' duplicates.') 
268 | proj.add_argument('-o', '--outdir', 
269 | help='Output directory for projected scHPF model. 
Will be created' 
270 | ' if it does not exist.') 
271 | proj.add_argument('-p', '--prefix', default='', 
272 | help='Prefix for output files. Optional.') 
273 | 
274 | # projection-specific args 
275 | proj.add_argument('--recalc-bp', action='store_true', 
276 | help='Recalculate hyperparameter bp for the new data') 
277 | 
278 | # Training parameters (same as train, different defaults, no short names) 
279 | proj.add_argument('--max-iter', type=int, default=500, 
280 | help='Maximum iterations. [Default 500].') 
281 | proj.add_argument('--min-iter', type=int, default=10, 
282 | help='Minimum iterations. [Default 10]') 
283 | proj.add_argument('--epsilon', type=float, default=0.001, 
284 | help='Minimum percent decrease in loss between checks to continue ' 
285 | 'inference (convergence criteria). [Default 0.001].') 
286 | proj.add_argument('--check-freq', type=int, default=10, 
287 | help='Number of iterations to run between convergence checks. ' 
288 | '[Default 10].') 
289 | 
290 | return parser 
291 | 
292 | 
293 | if __name__=='__main__': 
294 | parser = _parser() 
295 | args = parser.parse_args() 
296 | 
297 | # print help if no subparser given 
298 | if len(sys.argv)==1: 
299 | parser.print_help(sys.stderr) 
300 | sys.exit(1) 
301 | 
302 | # setup paths and prefixes 
303 | 
304 | if args.outdir is None: 
305 | if args.cmd in ['prep', 'prep-like', 'train', 'train-pool']: 
306 | args.outdir = args.input.rsplit('/', 1)[0] 
307 | elif args.cmd=='project': 
308 | args.outdir = args.model.rsplit('/',1)[0] 
309 | elif args.cmd=='score': 
310 | args.outdir = args.model.split('.joblib')[0] 
311 | 
312 | if args.outdir is not None and not os.path.exists(args.outdir): 
313 | print("Creating output directory {} ".format(args.outdir)) 
314 | os.makedirs(args.outdir) 
315 | prefix = args.prefix.rstrip('.') + '.' if len(args.prefix) > 0 else '' 
316 | outprefix = args.outdir + '/' + prefix 
317 | 
318 | if args.cmd == 'prep': 
319 | filtered, genes = load_and_filter(args.input, 
320 | min_cells=args.min_cells, 
321 | whitelist=args.whitelist, 
322 | blacklist=args.blacklist, 
323 | filter_by_gene_name=args.filter_by_gene_name, 
324 | no_split_on_dot=args.no_split_on_dot) 
325 | 
326 | print('Writing filtered data to file.....') 
327 | mmwrite('{}filtered.mtx'.format(outprefix), filtered, field='integer') 
328 | genes.to_csv('{}genes.txt'.format(outprefix), sep='\t', header=None, 
329 | index=None) 
330 | 
331 | if args.n_validation_cells > 0: 
332 | print('Selecting train/validation cells.....') 
333 | Xtrn, Xvld, vld_ix = split_validation_cells( filtered, 
334 | args.n_validation_cells, args.validation_group_ids, 
335 | max_group_frac = args.validation_max_group_frac) 
336 | trn_ix = np.setdiff1d(np.arange(filtered.shape[0]), vld_ix) 
337 | 
338 | print('Writing train/validation splits.....') 
339 | mmwrite('{}train_cells.mtx'.format(outprefix), Xtrn, 
340 | field='integer') 
341 | np.savetxt('{}train_cell_ix.txt'.format(outprefix), trn_ix, 
342 | fmt='%d') 
343 | mmwrite('{}validation_cells.mtx'.format(outprefix), Xvld, 
344 | field='integer') 
345 | np.savetxt('{}validation_cell_ix.txt'.format(outprefix), vld_ix, 
346 | fmt='%d') 
347 | 
348 | print('Writing commandline arguments to file.....') 
349 | cmdfile = '{}prep_commandline_args.json'.format(outprefix) 
350 | with open(cmdfile, 'w') as f: 
351 | json.dump(args.__dict__, f, indent=2) 
352 | 
353 | 
354 | 
355 | elif args.cmd == 'prep-like': 
356 | print('Loading and reordering input like reference.....
') 357 | filtered, genes = load_like(args.input, reference=args.reference, 358 | by_gene_name=args.by_gene_name, 359 | no_split_on_dot=args.no_split_on_dot) 360 | print('Writing prepared data to file.....') 361 | mmwrite('{}filtered.mtx'.format(outprefix), filtered, field='integer') 362 | genes.to_csv('{}genes.txt'.format(outprefix), sep='\t', header=None, 363 | index=None) 364 | print('Writing commandline arguments to file.....') 365 | cmdfile = '{}prep-like_commandline_args.json'.format(outprefix) 366 | with open(cmdfile, 'w') as f: 367 | json.dump(args.__dict__, f, indent=2) 368 | 369 | 370 | elif args.cmd in ['train', 'train-pool']: 371 | # load data 372 | print( 'Loading data.....' ) 373 | load_fnc = mmread if args.input.endswith('.mtx') else load_coo 374 | train = load_fnc(args.input) 375 | 376 | ncells, ngenes = train.shape 377 | msg = '.....found {} cells and {} genes in {}'.format( 378 | ncells, ngenes, args.input) 379 | print(msg) 380 | 381 | if args.batchsize and ncells > args.batchsize and not args.reproject: 382 | msg = '\nWARNING: running with minibatches but without reproject.' \ 383 | + ' We recommend adding the --reproject flag when running with'\ 384 | + ' batches to synchronize cell variational distributions. \n' 385 | print(msg) 386 | 387 | if args.validation_cells is not None: 388 | vcells = load_fnc(args.validation_cells) 389 | msg = '.....found {} validation cells and {} genes in {}'.format( 390 | vcells.shape[0], vcells.shape[1], args.validation_cells) 391 | print(msg) 392 | msg = 'WARNING: scHPF models with validation cells can be slow' 393 | msg += ' to converge.\n\tIf you observe this, try either (or both)' 394 | msg += ' increasing epsilon (-e, currently set to {})'.format( 395 | args.epsilon) 396 | msg += ' or increasing the number of validation cells (using prep)' 397 | print(msg) 398 | else: 399 | vcells = None 400 | 401 | # create model 402 | print('Running trials.....' 
) 
403 | dtype = np.float32 if args.float32 else np.float64 
404 | model_kwargs = dict(a=args.a, c=args.c) 
405 | 
406 | if args.cmd == 'train': 
407 | run_fnc = run_trials 
408 | else: 
409 | if args.njobs < 0: 
410 | msg = 'njobs must be an int >= 0, received {}' 
411 | raise ValueError(msg.format(args.njobs)) 
412 | run_fnc = partial(run_trials_pool, njobs=args.njobs) 
413 | 
414 | # TODO get rid of repeated code 
415 | reject = None 
416 | if args.save_all: 
417 | model, reject = run_fnc(train, vcells=vcells, 
418 | nfactors=args.nfactors, ntrials=args.ntrials, 
419 | min_iter=args.min_iter, max_iter=args.max_iter, 
420 | check_freq=args.check_freq, epsilon=args.epsilon, 
421 | better_than_n_ago=args.better_than_n_ago, dtype=dtype, 
422 | verbose=args.verbose, model_kwargs=model_kwargs, 
423 | return_all=True, reproject=args.reproject, 
424 | batchsize=args.batchsize, 
425 | beta_theta_simultaneous=args.beta_theta_simultaneous, 
426 | loss_smoothing=args.smooth_loss 
427 | ) 
428 | else: 
429 | model = run_fnc(train, vcells=vcells, nfactors=args.nfactors, 
430 | ntrials=args.ntrials, min_iter=args.min_iter, 
431 | max_iter=args.max_iter, check_freq=args.check_freq, 
432 | epsilon=args.epsilon, 
433 | better_than_n_ago=args.better_than_n_ago, dtype=dtype, 
434 | verbose=args.verbose, model_kwargs=model_kwargs, 
435 | return_all=False, reproject=args.reproject, 
436 | batchsize=args.batchsize, 
437 | beta_theta_simultaneous=args.beta_theta_simultaneous, 
438 | loss_smoothing=args.smooth_loss 
439 | ) 
440 | 
441 | # save the model/models 
442 | if isinstance(args.nfactors, int): 
443 | klist = [args.nfactors] 
444 | model = [model] 
445 | if reject is not None: 
446 | reject = [reject] 
447 | else: 
448 | klist = args.nfactors 
449 | for i, (K,m) in enumerate(zip(klist, model)): 
450 | model_outprefix = '{}scHPF_K{}{}_{}trials'.format( 
451 | outprefix, K, 
452 | f'_b{args.batchsize}' if args.batchsize and ncells > args.batchsize else '', 
453 | args.ntrials) 
454 | if vcells is None: 
455 | print('Saving best model ({} factors).....'.format(K)) 
456 | joblib.dump(m, model_outprefix + '.joblib') 
457 | else: 
458 | print('Saving best model (training data, {} factors).....'\ 
459 | .format(K)) 
460 | joblib.dump(m, model_outprefix + '.train.joblib') 
461 | 
462 | print('Computing final validation projection ({} factors).....'\ 
463 | .format(K)) 
464 | projection = m.project(vcells, replace=False) 
465 | print('Saving validation projection.....({} factors)'.format(K)) 
466 | joblib.dump(projection, 
467 | model_outprefix + '.validation_proj.joblib') 
468 | if args.save_all: 
469 | for j,r in enumerate(reject[i]): 
470 | joblib.dump(r, model_outprefix + f'_reject{j+1}.joblib') 
471 | 
472 | 
473 | print('Writing commandline arguments to file.....') 
474 | cmdfile = '{}train_commandline_args.json'.format(outprefix) 
475 | print(cmdfile) 
476 | if os.path.exists(cmdfile): 
477 | cmdfile = '{}train_commandline_args.{}.json'.format(outprefix, 
478 | time.strftime("%Y%m%d-%H%M%S")) 
479 | with open(cmdfile, 'w') as f: 
480 | json.dump(args.__dict__, f, indent=2) 
481 | 
482 | print('\n') 
483 | 
484 | 
485 | elif args.cmd == 'score': 
486 | print('Loading model.....') 
487 | model = joblib.load(args.model) 
488 | 
489 | print('Calculating scores.....') 
490 | cell_score = model.cell_score() 
491 | gene_score = model.gene_score() 
492 | 
493 | print('Saving scores.....') 
494 | np.savetxt(outprefix + 'cell_score.txt', cell_score, delimiter='\t') 
495 | np.savetxt(outprefix + 'gene_score.txt', gene_score, delimiter='\t') 
496 | 
497 | print('Calculating mean cellscore fractions.....') 
498 | frac_list = 
mean_cellscore_fraction_list(cell_score) 499 | with open(outprefix + 'mean_cellscore_fraction.txt', 'w') as h: 500 | h.write('nfactors\tmean_cellscore_fraction\n') 501 | for i,csf in enumerate(frac_list): 502 | h.write('{}\t{}\n'.format(i+1,csf)) 503 | 504 | print('Calculating maximum pairwise overlaps.....') 505 | table = max_pairwise_table(gene_score, 506 | ntop_list=[50,100,150,200,250,300,350,400,450,500]) 507 | table.to_csv(outprefix + 'maximum_overlaps.txt', sep='\t', index=False) 508 | 509 | if args.genefile is not None: 510 | print('Ranking genes.....') 511 | # load and format gene file 512 | genes = np.loadtxt(args.genefile, delimiter='\t', dtype=str) 513 | if len(genes.shape) == 1: 514 | genes = genes[:,None] 515 | # get column to use for gene names 516 | last_col = genes.shape[1] - 1 517 | name_col = last_col if args.name_col > last_col else args.name_col 518 | print('.....using {}\'th column of genefile as gene label'.format( 519 | name_col)) 520 | 521 | # rank the genes by gene_score 522 | ranks = np.argsort(gene_score, axis=0)[::-1] 523 | ranked_genes = [] 524 | for i in range(gene_score.shape[1]): 525 | ranked_genes.append(genes[ranks[:,i], name_col]) 526 | ranked_genes = np.stack(ranked_genes).T 527 | print('Saving ranked genes.....') 528 | np.savetxt(outprefix + 'ranked_genes.txt', ranked_genes, 529 | fmt="%s", delimiter='\t') 530 | 531 | print('Writing commandline arguments to file.....') 532 | cmdfile = '{}score_commandline_args.json'.format(outprefix) 533 | with open(cmdfile, 'w') as f: 534 | json.dump(args.__dict__, f, indent=2) 535 | 536 | elif args.cmd == 'project': 537 | print('Loading reference model.....') 538 | model = joblib.load(args.model) 539 | print('Loading data.....') 540 | load_fnc = mmread if args.input.endswith('.mtx') else load_coo 541 | proj_data = load_fnc(args.input) 542 | print('Projecting data.....') 543 | projection = model.project(proj_data, replace=False, verbose=True, 544 | recalc_bp=args.recalc_bp, 545 | min_iter=args.min_iter, 546 | max_iter=args.max_iter, 547 | check_freq=args.check_freq, 548 | epsilon=args.epsilon, ) 549 | print('Saving projection.....') 550 | if args.recalc_bp: 551 | outprefix += '{}.'.format('recalc_bp') 552 | proj_out = '{}{}.proj.joblib'.format(outprefix, 553 | args.model.rsplit('.',1)[0].split('/')[-1]) 554 | joblib.dump(projection, proj_out) 555 | 556 | print('Writing commandline arguments to file.....') 557 | cmdfile = '{}project_commandline_args.json'.format(outprefix) 558 | with open(cmdfile, 'w') as f: 559 | json.dump(args.__dict__, f, indent=2) 560 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 
21 | 
-------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 
1 | .. _changelog: 
2 | 
3 | ********* 
4 | Changelog 
5 | ********* 
6 | 
7 | 0.4.0 
8 | ===== 
9 | - train-pool for training parallelized at the level of trials rather than 
10 | computations 
11 | - reproject and save all options during training 
12 | - add separate joblib dependency (should be installed w/scikit-learn, 
13 | sklearn.externals.joblib is deprecated) 
14 | 
15 | 
16 | 0.3.0 
17 | ===== 
18 | 
19 | - Refactor so loss can be an arbitrary function 
20 | - Fix bugs in and expand options for projection 
21 | - prep-like CLI to prepare data for projection onto a trained model 
22 | - cellscore fraction file for score CLI 
23 | - Verbose option for load_txt 
24 | - Update options for validation cells & selection 
25 | - Version as an object attribute 
26 | - Handle change in scipy API 
27 | - new GENCODE files 
28 | - (feature request) options to specify a and c from the train CLI 
29 | - Documentation with ReadTheDocs 
30 | 
31 | 
32 | 0.2.4 
33 | ===== 
34 | - Emergency patch for a preprocessing error with loom files. Also fixed an errant test. 
35 | Not really enough to justify a new release but fixed a pretty 
36 | irritating/embarrassing error. 
37 | 
38 | 0.2.3 
39 | ===== 
40 | - fix no split on dot bug 
41 | - Max pairwise table + default max pairwise in score 
42 | - Note about ld.so error 
43 | - Fix max pairwise second greatest bug 
44 | - Some integration tests 
45 | 
46 | 
47 | 0.2.2 
48 | ===== 
49 | - partial test suite 
50 | - max pairwise test for gene overlap 
51 | - faster preprocessing of large text files 
52 | - refactor preprocessing and training control flow out of CLI 
53 | - move load and save methods outside of scHPF object 
54 | 
55 | 
56 | 0.2.1 
57 | ===== 
58 | - Slight speedup during inference for Xphi 
59 | - Fix bug (introduced in 0.2.0-alpha) that occurs when genes in 
60 | whitespace-delim input to prep have no counts 
61 | 
62 | 
63 | 0.2.0 
64 | ===== 
65 | Numba implementation with scikit-learn-like API 
66 | 
67 | 
68 | 0.1.0 
69 | ===== 
70 | - Tensorflow implementation 
71 | 
72 | 
-------------------------------------------------------------------------------- /docs/cli-man.rst: -------------------------------------------------------------------------------- 
1 | 
2 | .. _cli-man: 
3 | 
4 | 
5 | ********************** 
6 | Complete CLI Reference 
7 | ********************** 
8 | 
9 | .. _cli-prep: 
10 | 
11 | scHPF prep 
12 | ========== 
13 | 
14 | .. argparse:: 
15 | :filename: ../bin/scHPF 
16 | :func: _parser 
17 | :prog: scHPF 
18 | :path: prep 
19 | 
20 | 
21 | .. _cli-train: 
22 | 
23 | scHPF train 
24 | =========== 
25 | 
26 | .. argparse:: 
27 | :filename: ../bin/scHPF 
28 | :func: _parser 
29 | :prog: scHPF 
30 | :path: train 
31 | 
32 | 
33 | .. _cli-score: 
34 | 
35 | scHPF score 
36 | =========== 
37 | 
38 | .. argparse:: 
39 | :filename: ../bin/scHPF 
40 | :func: _parser 
41 | :prog: scHPF 
42 | :path: score 
43 | 
44 | 
45 | .. _cli-prep-like: 
46 | 
47 | scHPF prep-like 
48 | =============== 
49 | 
50 | .. argparse:: 
51 | :filename: ../bin/scHPF 
52 | :func: _parser 
53 | :prog: scHPF 
54 | :path: prep-like 
55 | 
56 | 
57 | .. _cli-project: 
58 | 
59 | scHPF project 
60 | ============= 
61 | 
62 | ..
argparse:: 63 | :filename: ../bin/scHPF 64 | :func: _parser 65 | :prog: scHPF 66 | :path: project 67 | 68 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../schpf')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'scHPF' 21 | copyright = '2019, Hanna Mendes Levitin' 22 | author = 'Hanna Mendes Levitin' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | import schpf 26 | version = schpf.__version__ 27 | release = version 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.intersphinx', 38 | 'sphinxarg.ext', 39 | # 'sphinx.ext.doctest', 40 | # 'sphinx.ext.coverage', 41 | # 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.autosummary', 45 | ] 46 | 47 | autosummary_generate = True 48 | autodoc_member_order = 'bysource' 49 | napoleon_google_docstring = False 50 | napoleon_numpy_docstring = True 51 | napoleon_include_init_with_doc = False 52 | napoleon_use_rtype = True # having a separate entry generally helps readability 53 | napoleon_use_param = True 54 | napoleon_custom_sections = [('Params', 'Parameters')] 55 | todo_include_todos = True 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ['_templates'] 59 | 60 | # List of patterns, relative to source directory, that match files and 61 | # directories to ignore when looking for source files. 62 | # This pattern also affects html_static_path and html_extra_path. 63 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 64 | 65 | 66 | # -- Options for HTML output ------------------------------------------------- 67 | 68 | # The theme to use for HTML and HTML Help pages. See the documentation for 69 | # a list of builtin themes. 70 | # 71 | # html_theme = 'alabaster' 72 | html_theme = 'sphinx_rtd_theme' 73 | 74 | # Add any paths that contain custom static files (such as style sheets) here, 75 | # relative to this directory. They are copied after the builtin static files, 76 | # so a file named "default.css" will overwrite the builtin "default.css". 
77 | html_static_path = ['_static'] 
78 | 
79 | html_show_sphinx = False 
80 | html_context = dict( 
81 | display_github=True, # Integrate GitHub 
82 | github_user='simslab', # Username 
83 | github_repo='scHPF', # Repo name 
84 | github_version='master', # Version 
85 | conf_py_path='/docs/', # Path in the checkout to the docs root 
86 | ) 
87 | gh_url = 'https://github.com/{github_user}/{github_repo}'.format_map(html_context) 
-------------------------------------------------------------------------------- /docs/genelists.rst: -------------------------------------------------------------------------------- 
1 | .. _premade lists: https://github.com/simslab/scHPF/tree/master/resources 
2 | .. _stable identifiers: https://useast.ensembl.org/info/genome/stable_ids/index.html 
3 | .. _biotypes: https://www.gencodegenes.org/pages/biotypes.html 
4 | 
5 | .. _genelists: 
6 | 
7 | ********** 
8 | Gene lists 
9 | ********** 
10 | 
11 | About 
12 | ===== 
13 | We recommend restricting analysis to protein-coding genes, and bundle 
14 | `premade lists`_ of coding genes for human and mouse with the scHPF code. The 
15 | :ref:`prep CLI command <prep-cli>` optionally uses these lists to filter input 
16 | data. Although ENSEMBL ids are theoretically unambiguous and consistent across 
17 | releases (ie `stable identifiers`_), you may want to generate your own list 
18 | from a different annotation (eg one matching the GENCODE version used for 
19 | alignment) or with different parameters for gene inclusion (eg including lncRNA). 
20 | 
21 | Premade lists 
22 | ============= 
23 | The scHPF code includes tab-delimited lists of ENSEMBL ids and names for genes 
24 | with protein coding, T-cell receptor constant, or immunoglobulin constant 
25 | `biotypes`_ for human and mouse. 
26 | 
27 | Premade lists can be found in the 
28 | `code's resources folder <https://github.com/simslab/scHPF/tree/master/resources>`_: 
29 | 
30 | * Human (GENCODE v24, v29, v31) 
31 | * Mouse (GENCODE vM10, vM19) 
32 | 
33 | Format 
34 | ====== 
35 | Example tab-delimited gene list:: 
36 | 
37 | ENSG00000186092 OR4F5 
38 | ENSG00000284733 OR4F29 
39 | ENSG00000284662 OR4F16 
40 | ENSG00000187634 SAMD11 
41 | ENSG00000188976 NOC2L 
42 | ENSG00000187961 KLHL17 
43 | 
44 | By default, the prep command assumes a two-column, tab-delimited text file of 
45 | ENSEMBL gene ids and names, and uses the first column (assumed to be ENSEMBL id) 
46 | to filter genes. See the 
47 | :ref:`prep command documentation <prep-cli>` for other options. 
48 | 
49 | .. note:: 
50 | ENSEMBL ids may end in a period followed by an unstable version 
51 | number (eg ENSG00000186092.6). By default, the prep command ignores anything 
52 | after the period. This means ``[ENS-ID].[VERSION]`` is equivalent to 
53 | ``[ENS-ID]``. See the :ref:`prep command <prep-cli>` for other options. 
54 | 
55 | 
56 | Making custom gene lists 
57 | ======================== 
58 | Although ENSEMBL ids aim to be unambiguous and consistent across 
59 | releases (ie `stable identifiers`_), you may want to generate your own list from 
60 | a different annotation or with different parameters for gene inclusion. 
61 | 
62 | 
63 | Example creation script 
64 | ~~~~~~~~~~~~~~~~~~~~~~~ 
65 | Reference files of ids and names for genes with 
66 | ``protein_coding``, ``TR_C_gene``, or ``IG_C_gene`` biotypes in the GENCODE 
67 | main annotation (in this case ``gencode.v29.annotation.gtf``) were generated as follows: 
68 | 
69 | ..
code:: bash 
70 | 
71 | # Select genes with feature gene and level 1 or 2 
72 | awk '{if($3=="gene" && $0~"level (1|2);"){print $0}}' gencode.v29.annotation.gtf > gencode.v29.annotation.gene_l1l2.gtf 
73 | 
74 | # Only include biotypes protein_coding, TR_C_g* and IG_C_g* 
75 | awk '{if($12~"TR_C_g" || $12~"IG_C_g" || $12~"protein_coding"){print $0}}' gencode.v29.annotation.gene_l1l2.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf 
76 | 
77 | # Retrieve ENSEMBL gene id and name 
78 | awk '{{OFS="\t"}{gsub(/"/, "", $10); gsub(/;/, "", $10); gsub(/"/, "", $14); gsub(/;/, "", $14); print $10, $14}}' gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt 
79 | 
80 | 
81 | .. note:: 
82 | For older GENCODE versions, you may need to adjust the field indices in 
83 | the third line of code (for example changing all instances of $14 to $16). 
84 | 
-------------------------------------------------------------------------------- /docs/img/cell-type-rep-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simslab/scHPF/aff30d674039359395cbee4ca4ddc85f3a5c8b56/docs/img/cell-type-rep-01.png 
-------------------------------------------------------------------------------- /docs/img/k_selection_minifig-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simslab/scHPF/aff30d674039359395cbee4ca4ddc85f3a5c8b56/docs/img/k_selection_minifig-01.png 
-------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 
1 | .. scHPF documentation master file, created by 
2 | sphinx-quickstart on Mon Jul 8 17:02:06 2019. 
3 | You can adapt this file completely to your liking, but it should at least 
4 | contain the root `toctree` directive. 
5 | 
6 | Single-cell Hierarchical Poisson Factorization 
7 | ============================================== 
8 | 
9 | Single-cell Hierarchical Poisson Factorization (scHPF) is a tool for *de novo* 
10 | discovery of discrete and continuous expression patterns in single-cell 
11 | RNA\-sequencing (scRNA-seq). 
12 | 
13 | We find that scHPF’s sparse low-dimensional representations, non-negativity, 
14 | and explicit modeling of variable sparsity across genes and cells produce 
15 | highly interpretable factors. The algorithm takes genome-wide molecular counts 
16 | as input, avoids prior normalization, and has fast, memory-efficient inference 
17 | on sparse scRNA-seq datasets. 
18 | 
19 | Algorithmic details, benchmarking against alternative methods, and scHPF's 
20 | application to a spatially sampled high-grade glioma can be found in 
21 | `our paper at Molecular Systems Biology`_. 
22 | 
23 | .. _our paper at Molecular Systems Biology: https://doi.org/10.15252/msb.20188557 
24 | 
25 | You can find the software `on github <https://github.com/simslab/scHPF>`_. 
26 | 
27 | .. toctree:: 
28 | :maxdepth: 1 
29 | :caption: Setup 
30 | 
31 | install 
32 | genelists 
33 | 
34 | .. toctree:: 
35 | :maxdepth: 2 
36 | :caption: Commandline workflow 
37 | 
38 | prep-cli 
39 | train-cli 
40 | score-cli 
41 | 
42 | .. toctree:: 
43 | :maxdepth: 2 
44 | :caption: Advanced options 
45 | 
46 | select-k 
47 | project 
48 | 
49 | ..
toctree:: 
50 | :maxdepth: 1 
51 | :caption: Misc 
52 | 
53 | cli-man 
54 | changelog 
55 | 
56 | 
57 | Indices and tables 
58 | ================== 
59 | 
60 | * :ref:`genindex` 
61 | * :ref:`modindex` 
62 | * :ref:`search` 
63 | 
-------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 
1 | .. _install: 
2 | 
3 | ************ 
4 | Installation 
5 | ************ 
6 | 
7 | Environment & Dependencies 
8 | ========================== 
9 | 
10 | scHPF requires Python >= 3.6 and the packages: 
11 | 
12 | * numba (:ref:`version requirement depends on python version <numba>`, but will be safe with 0.45, and probably 0.45+) 
13 | * scikit-learn 
14 | * pandas 
15 | * (optional) loompy 
16 | 
17 | The easiest way to set up a Python environment for scHPF is with `anaconda`_ (or 
18 | its stripped-down version `miniconda`_): 
19 | 
20 | .. _anaconda: https://www.anaconda.com/distribution 
21 | .. _miniconda: https://docs.conda.io/en/latest/miniconda.html 
22 | 
23 | .. code:: bash 
24 | 
25 | conda create -n schpf_p37 python=3.7 scikit-learn numba=0.50 pandas numpy=1.18 
26 | 
27 | # for newer anaconda versions 
28 | conda activate schpf_p37 
29 | # XOR older anaconda versions 
30 | source activate schpf_p37 
31 | 
32 | # Optional, for using loom files as input to preprocessing 
33 | pip install -U loompy 
34 | 
35 | 
36 | 
37 | .. _numba: 
38 | 
39 | numba/Python compatibility 
40 | -------------------------- 
41 | Certain micro-versions of Python and numba do not play well together, resulting 
42 | in segmentation faults and/or horrible performance (at least for the ops scHPF 
43 | uses). In our experience, micro-version combos that avoid these issues are 
44 | listed below, as well as known-bad combinations, but note this is not an 
45 | exhaustive list: 
46 | 
47 | **Python 3.7.9** 
48 | Compatible numba: 0.45-0.50 
49 | 
50 | DO NOT USE: 0.44 or earlier 
51 | **Python 3.7.5 - 3.7.8** 
52 | Not tested 
53 | **Python 3.7.4** 
54 | Compatible numba: 0.44, 0.45 
55 | 
56 | DO NOT USE: 0.43 or earlier 
57 | **Python <=3.7.3** 
58 | Compatible numba: 0.39, 0.40, 0.44, 0.45 
59 | 
60 | DO NOT USE: 0.41-0.43 
61 | 
62 | *Please* let me know about any weird errors/slowness you experience so we can 
63 | document them! 
64 | 
65 | 
66 | 
67 | 
68 | Installing scHPF 
69 | ================ 
70 | 
71 | Once you have set up the environment, clone ``simslab/scHPF`` from github and 
72 | install. 
73 | 
74 | .. code:: bash 
75 | 
76 | git clone git@github.com:simslab/scHPF.git 
77 | cd scHPF 
78 | pip install . 
79 | 
80 | 
81 | .. _tests: 
82 | 
83 | Test your installation 
84 | ---------------------- 
85 | Highly recommended, as this will catch some annoying problems with python/numba/numpy incompatibilities. From your scHPF home directory: 
86 | 
87 | 
88 | .. code:: bash 
89 | 
90 | conda install pytest 
91 | pytest 
92 | 
93 | 
94 | If any tests fail, please get in touch and I'll be happy to help. 
95 | 
-------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 
1 | @ECHO OFF 
2 | 
3 | pushd %~dp0 
4 | 
5 | REM Command file for Sphinx documentation 
6 | 
7 | if "%SPHINXBUILD%" == "" ( 
8 | set SPHINXBUILD=sphinx-build 
9 | ) 
10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 
12 | 
13 | if "%1" == "" goto help 
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL 
16 | if errorlevel 9009 ( 
17 | echo. 
18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 
19 | echo.installed, then set the SPHINXBUILD environment variable to point 
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 
21 | echo.may add the Sphinx directory to PATH. 
22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 
24 | echo.http://sphinx-doc.org/ 
25 | exit /b 1 
26 | ) 
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 
29 | goto end 
30 | 
31 | :help 
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 
33 | 
34 | :end 
35 | popd 
36 | 
-------------------------------------------------------------------------------- /docs/prep-cli.rst: -------------------------------------------------------------------------------- 
1 | .. _loompy docs: http://loompy.org/ 
2 | .. _resources folder: https://github.com/simslab/scHPF/tree/rewrite_release/resources 
3 | 
4 | .. _prep-cli: 
5 | 
6 | ********** 
7 | scHPF prep 
8 | ********** 
9 | 
10 | Basic usage 
11 | =========== 
12 | 
13 | To preprocess genome-wide UMI counts for a typical run, use the command: 
14 | 
15 | .. code:: bash 
16 | 
17 | scHPF prep -i UMICOUNT_MATRIX -o OUTDIR -m 10 -w WHITELIST 
18 | 
19 | As written, the command prepares a 
20 | :ref:`matrix of molecular counts <matrix-format>` for training and only includes 
21 | genes that are: 
22 | 
23 | - on a :ref:`whitelist`, for example one of the lists of protein 
24 | coding genes bundled in the scHPF code's resources folder 
25 | (``-w``/``--whitelist``) 
26 | - observed in at least 10 cells (``-m``/``--min-cells``). 
27 | 
28 | After running this command, ``OUTDIR`` should contain a matrix market file, 
29 | ``filtered.mtx``, and an ordered list of genes, ``genes.txt``. An optional prefix 
30 | argument can be added, which is prepended to the output file names. 
31 | 
32 | Now we can train the model with the |scHPF train|_. 
33 | 
34 | .. |scHPF train| replace:: ``scHPF train`` utility 
35 | .. _scHPF train: train-cli.html 
36 | 
37 | .. _matrix-format: 
38 | 
39 | Input matrix format 
40 | =================== 
41 | ``scHPF prep`` takes a molecular count matrix for an scRNA-seq experiment 
42 | and formats it for training. The input matrix has two allowed formats: 
43 | 
44 | 1. A **whitespace-delimited matrix** formatted as follows, with no header:: 
45 | 
46 | ENSEMBL_ID GENE_NAME UMICOUNT_CELL0 UMICOUNT_CELL1 ... 
47 | 
48 | 2. A **loom file** (see `loompy docs`_). The loom file must have at least 
49 | one of the row attributes ``Accession`` or ``Gene``, where ``Accession`` 
50 | is an ENSEMBL id and ``Gene`` is a gene name. 
51 | 
52 | .. _whitelist: 
53 | 
54 | Whitelisting genes 
55 | ================== 
56 | 
57 | About 
58 | ----- 
59 | We recommend restricting analysis to protein-coding genes. The 
60 | ``-w``/``--whitelist`` option removes all genes in the input data that are *not 
61 | in* a two column, tab-delimited text file of ENSEMBL gene ids and names. 
62 | Symmetrically, the ``-b``/``--blacklist`` option removes all genes that are *in* 
63 | such a file. 
64 | 
65 | Whitelists for human and mouse are provided in the `resources folder`_, and 
66 | details on formatting and custom lists are in the 
67 | :ref:`gene list documentation <genelists>`. 
68 | 
69 | .. Attention:: 
70 | ENSEMBL ids may end in a period followed by an unstable version 
71 | number (eg ENSG00000186092.6). By default, the prep command ignores anything 
72 | after the period. This means ``[ENS-ID].[VERSION]`` is equivalent to 
73 | ``[ENS-ID]``. This behavior can be overridden with the 
74 | ``--no-split-on-dot`` flag. 
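For example, a run that keeps genes on the bundled GENCODE v29 whitelist while additionally excluding a custom set of genes might look like this (a sketch; file paths are illustrative):

.. code:: bash

    scHPF prep -i UMICOUNT_MATRIX -o OUTDIR -m 10 \
        -w resources/gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt \
        -b my_blacklist.txt

Genes in the blacklist file are excluded even if they also appear on the whitelist.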
75 | 
76 | Whitespace-delimited input matrix
77 | ---------------------------------
78 | For whitespace-delimited UMI-count files, filtering is performed using the input
79 | matrix's first column (assumed to be a unique identifier) by default, but can be
80 | done with the gene name (next column) using the ``--filter-by-gene-name`` flag.
81 | This is useful for data that does not include a gene id.
82 | 
83 | 
84 | loom input matrix
85 | -----------------
86 | For loom files, we filter the loom ``Accession`` row attribute against the
87 | whitelist's ENSEMBL ids if ``Accession`` is present in the loom's row attributes,
88 | and filter the loom's ``Gene`` row attribute against the gene names in the
89 | whitelist otherwise.
90 | 
91 | 
92 | .. _prep-options:
93 | 
94 | Complete options
95 | ================
96 | 
97 | For complete options, see the :ref:`complete CLI reference <cli-man>` or use the
98 | ``-h`` option on the command line:
99 | 
100 | .. code:: bash
101 | 
102 |     scHPF prep -h
103 | 
104 | 
-------------------------------------------------------------------------------- /docs/project.rst: --------------------------------------------------------------------------------
1 | 
2 | .. _project:
3 | 
4 | ************************************
5 | Projecting data onto a trained model
6 | ************************************
7 | 
8 | Full writeup coming soon. Use the ``prep-like`` and ``project`` command-line
9 | programs.
10 | 
11 | Preparing data for projection
12 | =============================
13 | 
14 | For complete options, see the :ref:`complete CLI reference <cli-man>` or
15 | use the ``-h`` option on the command line:
16 | 
17 | .. code:: bash
18 | 
19 |     scHPF prep-like -h
20 | 
21 | Projecting new data
22 | ====================
23 | 
24 | For complete options, see the :ref:`complete CLI reference <cli-man>` or
25 | use the ``-h`` option on the command line:
26 | 
27 | .. code:: bash
28 | 
29 |     scHPF project -h
30 | 
-------------------------------------------------------------------------------- /docs/references.rst: --------------------------------------------------------------------------------
1 | References
2 | ----------
3 | 
4 | .. [Levitin2019] Levitin *et al.* (2019),
5 |    *De novo gene signature identification from single-cell RNA-seq with hierarchical Poisson factorization*,
6 |    *Molecular Systems Biology*.
7 | 
8 | 
9 | .. [SzaboLevitin2019] Szabo, Levitin *et al.* (2019),
10 |    *Single-cell transcriptomics of human T cells reveals tissue and activation signatures in health and disease*,
11 |    *Nature Communications*.
-------------------------------------------------------------------------------- /docs/score-cli.rst: --------------------------------------------------------------------------------
1 | 
2 | .. _score-cli:
3 | 
4 | ***********
5 | scHPF score
6 | ***********
7 | 
8 | Basic usage
9 | ===========
10 | To get gene- and cell-scores in a tab-delimited file, ordered like the genes and
11 | cells in the train file and with a column for each factor:
12 | 
13 | .. code:: bash
14 | 
15 |     scHPF score -m MODEL_JOBLIB -o OUTDIR -p PREFIX
16 | 
17 | To also generate a tab-delimited file of gene names, ranked by gene-score for
18 | each factor:
19 | 
20 | .. code:: bash
21 | 
22 |     scHPF score -m MODEL_JOBLIB -o OUTDIR -p PREFIX -g GENE_FILE
23 | 
24 | ``GENE_FILE`` is intended to be the genes.txt file output by the
25 | |scHPF prep command|_, but can in theory be any tab-delimited file where the
26 | number of rows is equal to the number of genes in the scHPF model. The score
27 | command automatically uses column 1 (zero-indexed) of ``GENE_FILE`` (or
28 | the only column if there is only one); however, the column used can be specified
29 | with ``--name-col``.
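The ranked lists can also be reproduced from the score matrix itself (a minimal
sketch; the file names 'gene_score.txt' and 'genes.txt' are assumptions here --
adjust them to the actual outputs of your score/prep runs, including any prefix):

.. code:: python

    import numpy as np
    import pandas as pd

    gene_scores = np.loadtxt('gene_score.txt')                   # (ngenes, nfactors)
    names = pd.read_csv('genes.txt', sep='\t', header=None)[1]   # gene names
    # sort each factor's column of gene names by descending gene-score
    ranked = names.values[np.argsort(-gene_scores, axis=0)]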
30 | 
31 | .. |scHPF prep command| replace:: ``scHPF prep`` command
32 | .. _scHPF prep command: prep-cli.html
33 | 
34 | If ``OUTDIR`` is omitted, the command will make a new subdirectory of the
35 | directory containing the model. The new subdirectory will have the same name as
36 | the model file, but without the joblib extension.
37 | 
38 | The command also outputs files which can be used to
39 | :ref:`select the number of factors <select-k>` using trained models.
40 | 
41 | 
42 | Complete options
43 | ================
44 | 
45 | For complete options, see the :ref:`complete CLI reference <cli-man>` or use the
46 | ``-h`` option on the command line:
47 | 
48 | .. code:: bash
49 | 
50 |     scHPF score -h
51 | 
-------------------------------------------------------------------------------- /docs/select-k.rst: --------------------------------------------------------------------------------
1 | 
2 | .. _select-k:
3 | 
4 | *************
5 | Selecting *K*
6 | *************
7 | 
8 | General comments
9 | ================
10 | 
11 | The number of factors, *K*, determines scHPF's granularity. An appropriate
12 | number of factors depends on both the data being fit and the intended
13 | application of the scHPF model. In our experience, subsequent analyses on cell
14 | scores (e.g. UMAP) are stable across a reasonable range of *K*, while
15 | interpretability (gene scores) can be more *K*-dependent.
16 | 
17 | 
18 | .. _k-workflow:
19 | 
20 | Example workflows
21 | =================
22 | 
23 | 1. Exploratory analysis on a single sample
24 | ------------------------------------------
25 | In some cases, if a user has a single sample, it may be appropriate to increase
26 | or decrease *K* manually according to the desired resolution. Granularity at
27 | the level of expression programs can be assessed qualitatively using the
28 | per-factor ranked gene lists in *ranked_genes.txt* (from |scHPF score|_ with
29 | the ``-g`` option). For example, if genes for two cell types appear in the same
30 | factor, one might increase *K*. Resolution can also be assessed quantitatively
31 | using
32 | :ref:`cell type representation <type-rep>`, or
33 | :ref:`other quantitative criteria <k-criteria>`.
34 | 
35 | When using this approach, we encourage the user to always try at least two
36 | values of *K* in either direction, as scHPF is multimodal and behavior is not
37 | always monotonic. *K* in the neighborhood of the number of clusters is often a
38 | good starting point.
39 | 
40 | .. _multi-model-example:
41 | 
42 | 2. Consistent choices across multiple models
43 | --------------------------------------------
44 | Applying scHPF separately to multiple partitions (as in [SzaboLevitin2019]_)
45 | necessitates a uniform procedure for choosing the number of factors. To
46 | maximize interpretability while being quantitative and consistent across
47 | models, we usually train scHPF across a range of *K*'s for each partition and
48 | select the per-dataset number of factors using a heuristic suitable to our
49 | intended application
50 | (:ref:`example criteria <k-criteria>`). An example workflow might be:
51 | 
52 | 
53 | 1. Choose an appropriate selection criterion for the problem at hand
54 |    (:ref:`examples <k-criteria>`).
55 | 
56 | 2. Guess a minimum number of factors, |K_min|. Values slightly less than
57 |    the number of clusters in the dataset are usually a good starting point
58 |    (e.g. |K_min| = number of clusters - 2). Guess a maximum number of
59 |    factors, |K_max|, not worrying too much if we are low since we'll refine
60 |    later (e.g. |K_max| = |K_min| + 8).
61 | 
62 | 3. :ref:`Train <train-cli>` scHPF models for K in
63 |    range(|K_min|, |K_max| +1), as in the loop sketched after this list.
64 |    *Advanced note*: I sometimes use a step
65 |    size of 2 or 3 on the first pass to check that the range is reasonable,
66 |    but recommend a final step of 1 (scHPF is multimodal, so results may not
67 |    be monotonic).
68 | 
69 | 4. Evaluate the models using the selection criterion from 1. Expand/refine
70 |    the range accordingly. For example, if |K_max| passes our criteria, we
71 |    should increase |K_max|.
72 | 
73 | 5. Repeat steps 3 and 4 as needed.
74 | 
75 | 
76 | .. |K_min| replace:: *K*:sub:`min`
77 | 
78 | .. |K_max| replace:: *K*:sub:`max`
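A minimal sketch of the training sweep in step 3 (the range and trial count are
illustrative; flags are as in the train documentation):

.. code:: bash

    for K in $(seq 7 15); do
        scHPF train -i TRAIN_FILE -o OUTDIR -p k${K}_ -k ${K} -t 5
    done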
79 | 
80 | .. _k-criteria:
81 | 
82 | Example selection criteria
83 | ===========================
84 | 
85 | .. _type-rep:
86 | 
87 | 1. Cell type representation
88 | ---------------------------
89 | 
90 | In [Levitin2019]_, we chose *K* based on scHPF's representation of cell types
91 | in the data. Specifically, we selected the smallest *K* such that every
92 | well-defined cluster was most strongly associated with at least one unique
93 | factor `[Levitin2019, Appendix Figure S8]`_. This method is intuitive, and can
94 | work well when many cell types are present, but depends on the quality and
95 | granularity of clustering. It is also difficult to standardize across multiple
96 | models trained on different data.
97 | 
98 | .. _[Levitin2019, Appendix Figure S8]: https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.20188557&file=msb188557-sup-0001-Appendix.pdf
99 | 
100 | 
101 | .. figure:: ./img/cell-type-rep-01.png
102 | 
103 |    Median cell score per factor and cluster in a high-grade glioma for 12,
104 |    13, and 14 factors in [Levitin2019]_. At 14 factors, all clusters are most
105 |    closely associated with at least one unique factor.
106 | 
107 | 
108 | .. _signature-overlap:
109 | 
110 | .. sidebar:: Evaluating top gene overlap
111 | 
112 |    .. figure:: ./img/k_selection_minifig-01.png
113 | 
114 |       Hypergeometric -log10 *p*-value of the maximum pairwise overlap
115 |       of the highest scoring genes in each factor for Donor 2 Bone Marrow in
116 |       [SzaboLevitin2019]_ at different values of *K*.
117 | 
118 | 2. Gene signature overlap
119 | -------------------------
120 | 
121 | To find common patterns of gene expression across multiple models in
122 | [SzaboLevitin2019]_, we selected *K* such that factors in the same model did
123 | not have significant overlap in their top genes (where top genes are defined as
124 | the *n* highest scoring genes per factor). This reflected our prior belief that
125 | programs should be distinctive with respect to gene scores, and the further
126 | requirement that models should have similar granularity across datasets with
127 | different levels of complexity.
128 | 
129 | The |scHPF score|_ command automatically produces the file
130 | *maximum_overlaps.txt*, which contains factors' maximum pairwise overlap and
131 | corresponding hypergeometric *p*-values at different cutoffs.
132 | 
133 | For standard significance thresholds and reasonable *n*, this method can be
134 | quite strict, resulting in lower granularity factorizations for some datasets.
135 | Using :ref:`cellular resolution <cell-res>` or
136 | :ref:`cell type representation <type-rep>` may find higher resolution
137 | factorizations in these cases.
138 | 
139 | .. |scHPF score| replace:: ``scHPF score``
140 | .. _scHPF score: score-cli.html
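For intuition, the tail probability for the overlap of two factors' top-*n* gene
sets can be computed with a hypergeometric test (an illustrative sketch with
made-up numbers, not the exact code behind *maximum_overlaps.txt*):

.. code:: python

    from scipy.stats import hypergeom

    M, n = 15000, 100   # total genes in the model; top genes per factor
    k = 12              # observed overlap between two factors' top-n sets
    # P(overlap >= k) for two independent size-n draws from M genes
    pval = hypergeom.sf(k - 1, M, n, n)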
141 | 
142 | 
143 | .. _cell-res:
144 | 
145 | 
146 | 3. Cellular resolution
147 | ----------------------
148 | 
149 | Cellular resolution directly evaluates a model's granularity by specifying how
150 | many factors, on average, should explain a given portion of a cell's total cell
151 | score. We have found it especially useful for datasets where
152 | :ref:`gene signature overlap <signature-overlap>` is too strict.
153 | 
154 | We define cellular resolution as the maximum *K* such that, on average, cells'
155 | *n* highest scoring factors contain at least *r*\*100 percent of their total
156 | score across all factors. So if we want to find a model where the 3 factors
157 | with the highest score in a cell contain at least 70% of its total score (on
158 | average), *n* would be 3 and *r* would be 0.7.
159 | 
160 | We can evaluate cellular resolution using one of |scHPF score|_'s outputs, a
161 | file called *mean_cellscore_fraction.txt* (potentially with a prefix). The
162 | file's two columns, *nfactors* and *mean_cellscore_fraction*, represent the
163 | mean fraction of each cell's total cell score allocated to its top *nfactors*
164 | factors. If we want to find a model at *n* = 3 and *r* = 0.7 resolution, we
165 | might follow the :ref:`example workflow <multi-model-example>` above, and select the
166 | largest *K* such that *mean_cellscore_fraction* >= 0.7 when *nfactors* = 3.
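A sketch of that selection in pandas (it assumes one output file per trained
model, the documented two-column layout, and a header row; adjust the paths,
separator, and *K* range to your own runs):

.. code:: python

    import pandas as pd

    def passes(path, n=3, r=0.7):
        df = pd.read_csv(path, sep='\t')
        frac = df.loc[df['nfactors'] == n, 'mean_cellscore_fraction'].iloc[0]
        return frac >= r

    ok = {K: passes(f'k{K}/mean_cellscore_fraction.txt') for K in range(7, 16)}
    best_K = max(K for K, passed in ok.items() if passed)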
-------------------------------------------------------------------------------- /docs/train-cli.rst: --------------------------------------------------------------------------------
1 | .. _joblib: https://scikit-learn.org/stable/modules/model_persistence.html
2 | 
3 | .. _train-cli:
4 | 
5 | ***********
6 | scHPF train
7 | ***********
8 | 
9 | Basic usage
10 | ===========
11 | A typical command to train an scHPF model (using data prepared by the
12 | |scHPF prep command|_):
13 | 
14 | .. |scHPF prep command| replace:: ``scHPF prep`` command
15 | .. _scHPF prep command: prep-cli.html
16 | 
17 | .. code:: bash
18 | 
19 |     scHPF train -i TRAIN_FILE -o OUTDIR -p PREFIX -k 7 -t 5
20 | 
21 | This command performs approximate Bayesian inference on scHPF with, in this
22 | instance, seven factors and five different random initializations. scHPF will
23 | automatically select the trial with the lowest negative log-likelihood, and
24 | save the model in ``OUTDIR`` as a serialized `joblib`_ file.
25 | 
26 | Input file format
27 | =================
28 | scHPF's train command accepts two formats:
29 | 
30 | 1. **Matrix Market (.mtx) files**, where rows are cells, columns are genes, and
31 |    values are nonzero molecular counts. Matrix market files are output by
32 |    the current |scHPF prep command|_.
33 | 
34 | 2. **Tab-delimited COO matrix coordinates**, output by a previous version of the
35 |    preprocessing command. These files are essentially the same as .mtx
36 |    files, except they do not have a header and are zero-indexed.
37 | 
38 | 
39 | Debugging
40 | =========
41 | .. hint::
42 |    If you get an error like "Inconsistency detected by ld.so: dl-version.c: 224:
43 |    _dl_check_map_versions" and are running numba 0.40.0, try downgrading to
44 |    0.39.0.
45 | 
46 | .. hint::
47 |    If you get an error like "Segmentation fault (core dumped)" and are running
48 |    Python 3.7.4, try upgrading numba to version 0.45 or downgrading Python to
49 |    3.7.3 :ref:`[More details] <numba>`
50 | 
51 | 
52 | Complete options
53 | ================
54 | 
55 | For complete options, see the :ref:`complete CLI reference <cli-man>` or use the
56 | ``-h`` option on the command line:
57 | 
58 | .. code:: bash
59 | 
60 |     scHPF train -h
61 | 
-------------------------------------------------------------------------------- /resources/README.md: --------------------------------------------------------------------------------
1 | ## Gene Files
2 | 
3 | Two-column, tab-delimited text files of ENSEMBL gene ids and names with protein coding, T-cell receptor constant or immunoglobulin constant biotypes in the GENCODE main annotation for [human](https://www.gencodegenes.org/human/) or [mouse](https://www.gencodegenes.org/mouse/).
4 | 
5 | ### Included files
6 | Human: `gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt`
7 | Mouse: `gencode.vM19.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt`
8 | 
9 | ### Generating gene files
10 | Files were generated from GENCODE GTFs as follows:
11 | ```
12 | # Select genes with feature gene and level 1 or 2
13 | awk '{if($3=="gene" && $0~"level (1|2);"){print $0}}' gencode.v29.annotation.gtf > gencode.v29.annotation.gene_l1l2.gtf
14 | 
15 | # Only include biotypes protein_coding, TR_C_g* and IG_C_g*
16 | awk '{if($12~"TR_C_g" || $12~"IG_C_g" || $12~"protein_coding"){print $0}}' gencode.v29.annotation.gene_l1l2.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf
17 | 
18 | # Retrieve ENSEMBL gene id and name
19 | awk '{{OFS="\t"}{gsub(/"/, "", $10); gsub(/;/, "", $10); gsub(/"/, "", $14); gsub(/;/, "", $14); print $10, $14}}' gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.gtf > gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt
20 | ```
-------------------------------------------------------------------------------- /schpf/__init__.py: --------------------------------------------------------------------------------
1 | from .scHPF_ import *
2 | from .util import *
3 | from ._version import __version__
4 | 
-------------------------------------------------------------------------------- /schpf/_version.py: --------------------------------------------------------------------------------
1 | __version__='0.5.0'
2 | 
-------------------------------------------------------------------------------- /schpf/hpf_numba.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | import ctypes
4 | import numpy as np
5 | from scipy.sparse import coo_matrix
6 | try:
7 |     from scipy.misc import logsumexp
8 | except ImportError:
9 |     from scipy.special import logsumexp
10 | 
11 | import numba
12 | from numba.extending import get_cython_function_address as getaddr
13 | 
14 | # get numba-compatible digamma/psi and gammaln
15 | # psi/digamma
16 | psi_fnaddr = getaddr("scipy.special.cython_special", "__pyx_fuse_1psi")
17 | psi_ftype = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)
18 | psi = psi_ftype(psi_fnaddr)
19 | # gammaln
20 | gammaln_fnaddr = getaddr("scipy.special.cython_special", "gammaln")
21 | gammaln_ftype = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)
22 | cgammaln = gammaln_ftype(gammaln_fnaddr)
23 | 
24 | @numba.njit(parallel=True, nogil=True, fastmath=True)
25 | def compute_pois_llh(X_data, X_row, X_col,
26 |                      theta_vi_shape, theta_vi_rate,
27 |                      beta_vi_shape, beta_vi_rate):
28 |     ncells, ngenes = 
(theta_vi_shape.shape[0], beta_vi_shape.shape[0]) 29 | nfactors, nnz = (theta_vi_shape.shape[1], X_data.shape[0]) 30 | dtype = theta_vi_shape.dtype 31 | 32 | # precompute expectations 33 | theta_e_x = np.zeros_like(theta_vi_shape, dtype=dtype) 34 | for i in numba.prange(ncells): 35 | for k in range(nfactors): 36 | theta_e_x[i,k] = theta_vi_shape[i,k] / theta_vi_rate[i,k] 37 | 38 | beta_e_x = np.zeros_like(beta_vi_shape, dtype=dtype) 39 | for i in numba.prange(ngenes): 40 | for k in range(nfactors): 41 | beta_e_x[i,k] = beta_vi_shape[i,k] / beta_vi_rate[i,k] 42 | 43 | # compute llh 44 | llh = np.zeros(X_data.shape, dtype=dtype) 45 | for i in numba.prange(nnz): 46 | e_rate = np.zeros(1, dtype=dtype)[0] 47 | for k in range(nfactors): 48 | e_rate += theta_e_x[X_row[i],k] * beta_e_x[X_col[i], k] 49 | llh[i] = X_data[i] * np.log(e_rate) - e_rate \ 50 | - cgammaln(X_data[i] + 1.0) 51 | return llh 52 | 53 | 54 | @numba.njit(parallel=True, nogil=True) 55 | def compute_Xphi_data(X_data, X_row, X_col, 56 | theta_vi_shape, theta_vi_rate, 57 | beta_vi_shape, beta_vi_rate): 58 | """ Fast version of Xphi computation using numba & gsl_digamma 59 | 60 | Parameters 61 | ---------- 62 | X_data : ndarray of np.int32 63 | (number_nonzero, ) array of nonzero values 64 | X_row : ndarray of np.int32 65 | (number_nonzero, ) array of row ids for each nonzero value 66 | X_col : ndarray (np.int32) 67 | (number_nonzero, ) array of column ids for each nonzero value 68 | theta_vi_shape : ndarray 69 | (ncells, nfactors) array of values for theta's variational shape 70 | theta_vi_rate : ndarray 71 | (ncells, nfactors) array of values for theta's variational rate 72 | beta_vi_shape : ndarray 73 | (ngenes, nfactors) array of values for beta's variational shape 74 | beta_vi_rate : ndarray 75 | (ngenes, nfactors) array of values for beta's variational rate 76 | """ 77 | # convenience 78 | ncells, ngenes = (theta_vi_shape.shape[0], beta_vi_shape.shape[0]) 79 | nfactors, nnz = (theta_vi_shape.shape[1], X_data.shape[0]) 80 | dtype = theta_vi_shape.dtype 81 | 82 | # precompute theta.e_logx 83 | theta_e_logx = np.zeros_like(theta_vi_shape, dtype=dtype) 84 | for i in numba.prange(ncells): 85 | for k in range(nfactors): 86 | theta_e_logx[i,k] = psi(theta_vi_shape[i,k]) \ 87 | - np.log(theta_vi_rate[i,k]) 88 | 89 | # precompute beta.e_logx 90 | beta_e_logx = np.zeros_like(beta_vi_shape, dtype=dtype) 91 | for i in numba.prange(ngenes): 92 | for k in range(nfactors): 93 | beta_e_logx[i,k] = psi(beta_vi_shape[i,k]) \ 94 | - np.log(beta_vi_rate[i,k]) 95 | 96 | # compute Xphi 97 | Xphi = np.zeros((X_row.shape[0], theta_e_logx.shape[1]), dtype=dtype) 98 | for i in numba.prange(nnz): 99 | logrho = np.zeros((Xphi.shape[1]), dtype=dtype) 100 | for k in range(nfactors): 101 | logrho[k] = theta_e_logx[X_row[i],k] + beta_e_logx[X_col[i], k] 102 | 103 | #log normalizer trick 104 | rho_shift = np.zeros((Xphi.shape[1]), dtype=dtype) 105 | normalizer = np.zeros(1, dtype=dtype)[0] 106 | largest_in = np.max(logrho) 107 | for k in range(nfactors): 108 | rho_shift[k] = np.exp(logrho[k] - largest_in) 109 | normalizer += rho_shift[k] 110 | 111 | for k in range(nfactors): 112 | Xphi[i,k] = X_data[i] * rho_shift[k] / normalizer 113 | 114 | return Xphi 115 | 116 | 117 | def compute_Xphi_data_numpy(X, theta, beta, theta_ix=None): 118 | """Single-threaded version of compute_Xphi_data 119 | """ 120 | if theta_ix is None: 121 | logrho = theta.e_logx[X.row, :] + beta.e_logx[X.col, :] 122 | else: 123 | logrho = theta.e_logx[theta_ix,:][X.row, :] + 
beta.e_logx[X.col,:]
124 |     logphi = logrho - logsumexp(logrho, axis=1)[:,None]
125 |     return X.data[:,None] * np.exp(logphi)
126 | 
127 | 
128 | @numba.njit(fastmath=True) #results unstable with prange. don't do it.
129 | def compute_loading_shape_update(Xphi_data, X_keep, nkeep, shape_prior):
130 |     """Compute gamma shape updates for theta or beta using numba
131 | 
132 |     Parameters
133 |     ----------
134 |     Xphi_data : ndarray
135 |         (number_nonzero, nfactors) array of X * phi
136 |     X_keep : ndarray
137 |         (number_nonzero,) vector of indices along the axis of interest.
138 |         If X is an (ncell,ngene) coo_matrix, this should be X.row when
139 |         computing updates for theta and X.col when computing updates for
140 |         beta
141 |     nkeep : int
142 |         Number of items on the axis of interest. ncells when computing
143 |         updates for theta, and ngenes for updates for beta
144 |     shape_prior : float
145 |         Hyperprior for parameter. a for theta, c for beta.
146 | 
147 |     """
148 |     nnz, nfactors = Xphi_data.shape
149 |     dtype = Xphi_data.dtype
150 | 
151 |     result = shape_prior * np.ones((nkeep, nfactors), dtype=dtype)
152 |     for i in range(nnz):
153 |         ikeep = X_keep[i]
154 |         for k in range(nfactors):
155 |             result[ikeep, k] += Xphi_data[i,k]
156 |     return result
157 | 
158 | 
159 | @numba.njit(fastmath=True)
160 | def compute_loading_rate_update(prior_vi_shape, prior_vi_rate,
161 |         other_loading_vi_shape, other_loading_vi_rate,):
162 |     # shorter names
163 |     pvs, pvr = (prior_vi_shape, prior_vi_rate)
164 |     olvs, olvr = (other_loading_vi_shape, other_loading_vi_rate)
165 |     dtype = prior_vi_shape.dtype
166 | 
167 |     other_loading_e_x_sum = np.zeros((olvs.shape[1]), dtype=dtype)
168 |     for i in range(olvs.shape[0]):
169 |         for k in range(olvs.shape[1]):
170 |             other_loading_e_x_sum[k] += olvs[i,k] / olvr[i,k]
171 | 
172 |     result = np.zeros((pvs.shape[0], olvs.shape[1]), dtype=dtype)
173 |     for i in range(pvs.shape[0]):
174 |         prior_e_x = pvs[i] / pvr[i]
175 |         for k in range(olvs.shape[1]):
176 |             result[i, k] = prior_e_x + other_loading_e_x_sum[k]
177 |     return result
178 | 
179 | 
180 | @numba.njit(fastmath=True)
181 | def compute_capacity_rate_update(loading_vi_shape, loading_vi_rate, prior_rate):
182 |     dtype = loading_vi_shape.dtype
183 |     result = prior_rate * np.ones((loading_vi_shape.shape[0],),
184 |             dtype=dtype)
185 |     for k in range(loading_vi_shape.shape[1]):
186 |         for i in range(loading_vi_shape.shape[0]):
187 |             result[i] += loading_vi_shape[i,k] / loading_vi_rate[i,k]
188 |     return result
189 | 
-------------------------------------------------------------------------------- /schpf/loss.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | """
4 | Loss functions and higher order functions that return loss functions for a
5 | given dataset
6 | 
7 | """
8 | 
9 | import functools
10 | import numpy as np
11 | from scipy.special import gammaln
12 | 
13 | from schpf.hpf_numba import compute_pois_llh
14 | 
15 | ### Higher order loss functions
16 | 
17 | def loss_function_for_data(loss_function, X):
18 |     """ Get a loss function for a fixed dataset
19 | 
20 |     Parameters
21 |     ----------
22 |     loss_function : function
23 |         The loss function to use. The data parameter for the function must
24 |         be `X`
25 |     X : coo_matrix
26 |         coo_matrix of data to apply loss function to
27 | 
28 |     Returns
29 |     -------
30 |     fixed_data_loss_function : function
31 |         A loss function which takes all the same parameters as the input
32 |         `loss_function`, except for the data parameter `X` which is fixed
33 |     """
34 |     return functools.partial(loss_function, X=X)
35 | 
36 | 
37 | def projection_loss_function(loss_function, X, nfactors,
38 |         model_kwargs={}, proj_kwargs={}):
39 |     """ Project new data onto an existing model and calculate loss from it
40 | 
41 |     Parameters
42 |     ----------
43 |     loss_function : function
44 |         the loss function to use on the projected data
45 |     X : coo_matrix
46 |         Data to project onto the existing model. Can have an arbitrary number
47 |         of rows (cells) > 0, but must have the same number of columns (genes)
48 |         as the existing model
49 |     nfactors : int
50 |         Number of factors in model
51 |     model_kwargs : dict, optional
52 |         additional keyword arguments for scHPF()
53 |     proj_kwargs : dict, optional
54 |         additional keyword arguments for scHPF.project(). By default, sets
55 |         reinit=False, min_iter=10, max_iter=10, and check_freq=max_iter+1.
56 | 
57 | 
58 |     Returns
59 |     -------
60 |     projection_loss_function : function
61 |         A function which takes `a`, `ap`, `bp`, `c`, `cp`, `dp`, `eta`, and
62 |         `beta` for an scHPF model, projects a fixed dataset onto it, and takes
63 |         the loss (using a fixed function) with respect to both the model and
64 |         the data's projection.
65 | 
66 |     """
67 |     # have to do import here to avoid issue with files importing each other
68 |     from schpf import scHPF
69 | 
70 |     # make the model used for projection
71 |     pmodel = scHPF(nfactors=nfactors, **model_kwargs)
72 | 
73 |     # actual loss function for data
74 |     def _projection_loss_function(*, a, ap, bp, c, cp, dp, eta, beta, **kwargs):
75 |         assert eta.dims[0] == beta.dims[0]
76 |         assert beta.dims[1] == nfactors
77 | 
78 |         pmodel.a = a
79 |         pmodel.ap = ap
80 |         pmodel.bp = bp
81 |         pmodel.c = c
82 |         pmodel.cp = cp
83 |         pmodel.dp = dp
84 |         pmodel.eta = eta
85 |         pmodel.beta = beta
86 | 
87 |         # defaults if not given
88 |         if 'reinit' not in proj_kwargs: proj_kwargs['reinit'] = False
89 |         if 'max_iter' not in proj_kwargs: proj_kwargs['max_iter'] = 10
90 |         if 'min_iter' not in proj_kwargs: proj_kwargs['min_iter'] = 10
91 |         if 'check_freq' not in proj_kwargs:
92 |             proj_kwargs['check_freq'] = proj_kwargs['max_iter'] + 1
93 | 
94 |         # do the projection
95 |         pmodel.project(X, replace=True, **proj_kwargs)
96 | 
97 |         # calculate loss
98 |         return loss_function(X, a=pmodel.a, ap=pmodel.ap, bp=pmodel.bp,
99 |                 c=pmodel.c, cp=pmodel.cp, dp=pmodel.dp, xi=pmodel.xi,
100 |                 eta=pmodel.eta, theta=pmodel.theta, beta=pmodel.beta)
101 | 
102 |     return _projection_loss_function
103 | 
104 | 
105 | #### Loss functions
106 | 
107 | def pois_llh_pointwise(X, *, theta, beta, single_process=False, **kwargs):
108 |     """Poisson log-likelihood for each nonzero entry
109 | 
110 |     Parameters
111 |     ----------
112 |     X: coo_matrix
113 |         Data to compute Poisson log likelihood of. Assumed to be nonzero.
114 |     theta : HPF_Gamma
115 |     beta : HPF_Gamma
116 |     single_process: bool, optional (Default: False)
117 |         use single-threaded version of llh
118 |     **kwargs : dict, optional
119 |         extra arguments not used in this loss function
120 | 
121 |     Returns
122 |     -------
123 |     llh: ndarray
124 | 
125 | 
126 |     Note
127 |     ----
128 |     Like all loss functions in this module, all parameters except the data
129 |     must be passed to the function as a keyword argument, and the function
130 |     will accept unused keyword args.
131 |     """
132 |     if single_process:
133 |         e_rate = (theta.e_x[X.row] * beta.e_x[X.col]).sum(axis=1)
134 |         llh = X.data * np.log(e_rate) - e_rate - gammaln(X.data + 1)
135 |     else:
136 |         llh = compute_pois_llh(X.data, X.row, X.col,
137 |                 theta.vi_shape, theta.vi_rate,
138 |                 beta.vi_shape, beta.vi_rate)
139 |     return llh
140 | 
141 | 
142 | def mean_negative_pois_llh(X, *, theta, beta, single_process=False, **kwargs):
143 |     """Mean negative Poisson log-likelihood over nonzero entries
144 | 
145 |     Parameters
146 |     ----------
147 |     X: coo_matrix
148 |         Data to compute Poisson log likelihood of. Assumed to be nonzero.
149 |     theta : HPF_Gamma
150 |     beta : HPF_Gamma
151 |     single_process: bool, optional (Default: False)
152 |         use single-threaded version of pointwise loss
153 |     **kwargs : dict, optional
154 |         extra arguments not used in this loss function
155 | 
156 |     Returns
157 |     -------
158 |     mean_neg_llh : float
159 | 
160 | 
161 |     Note
162 |     ----
163 |     Like all loss functions in this module, all parameters except the data
164 |     must be passed to the function as a keyword argument, and the function
165 |     will accept unused keyword args.
166 |     """
167 |     return np.mean( -pois_llh_pointwise(X=X, theta=theta, beta=beta,
168 |             single_process=single_process) )
-------------------------------------------------------------------------------- /schpf/preprocessing.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import warnings
4 | import numpy as np
5 | from scipy.sparse import coo_matrix
6 | import pandas as pd
7 | 
8 | from schpf.util import split_coo_rows
9 | 
10 | 
11 | def load_coo(filename):
12 |     """Load a sparse coo matrix
13 | 
14 |     Assumes first column (dense row ids) are cells, second column (dense
15 |     column ids) are genes, and third column are nonzero counts. Also assumes
16 |     row and column ids are 0-indexed.
17 | 
18 |     Parameters
19 |     ----------
20 |     filename : str
21 |         file to load
22 | 
23 |     Returns
24 |     -------
25 |     coo : coo_matrix
26 |     """
27 |     raw = np.loadtxt(filename, delimiter='\t', dtype=int)
28 |     sparse = coo_matrix((raw[:,2], (raw[:,0],raw[:,1])))
29 |     return sparse
30 | 
31 | 
32 | def load_loom(filename):
33 |     """Load data from a loom file
34 | 
35 |     Parameters
36 |     ----------
37 |     filename: str
38 |         file to load
39 | 
40 |     Returns
41 |     -------
42 |     coo : coo_matrix
43 |         cell x gene sparse count matrix
44 |     genes : Dataframe
45 |         Dataframe of gene attributes. Attributes are ordered so Accession and Gene are the first columns, if those attributes are
Attributes are ordered so 46 | Accession and Gene are the first columns, if those attributs are 47 | present 48 | """ 49 | import loompy 50 | # load the loom file 51 | with loompy.connect(filename) as ds: 52 | loom_genes = pd.DataFrame(dict(ds.ra.items())) 53 | loom_coo = ds.sparse().T 54 | 55 | # order gene attributes so Accession and Gene are the first two columns, 56 | # if they are present 57 | first_cols = [] 58 | for colname in ['Accession', 'Gene']: 59 | if colname in loom_genes.columns: 60 | first_cols.append(colname) 61 | rest_cols = loom_genes.columns.difference(first_cols).tolist() 62 | loom_genes = loom_genes[first_cols + rest_cols] 63 | 64 | return loom_coo,loom_genes 65 | 66 | 67 | def load_txt(filename, ngene_cols=2, verbose=True): 68 | """Load data from a whitespace delimited txt file 69 | 70 | Parameters 71 | ---------- 72 | filename : str 73 | file to load. Expected to be a gene x cell whitespace-delimited file 74 | without a header where the first `ngene_cols` are gene identifiers, 75 | names or other metadata. 76 | ngene_cols : int, optional (default: 2) 77 | The number of columns that contain row attributes (ie gene id/names) 78 | verbose : bool, optional (default: True) 79 | print progress messages 80 | 81 | Returns 82 | ------- 83 | coo : coo_matrix 84 | cell x gene sparse count matrix 85 | genes : pd.DataFrame 86 | ngenes x ngene_cols array of gene names/attributes 87 | """ 88 | assert( ngene_cols > 0 ) 89 | gene_cols = list(range(ngene_cols)) 90 | 91 | if filename.endswith('.gz') or filename.endswith('.bz2'): 92 | msg = '.....' 93 | msg+= 'WARNING: Input file {} is compressed. '.format(filename) 94 | msg+= 'It may be faster to manually decompress before loading.' 95 | print(msg) 96 | 97 | df = pd.read_csv(filename, header=None, memory_map=True, 98 | delim_whitespace=True) 99 | 100 | genes = df[gene_cols] 101 | dense = df.drop(columns=gene_cols).values.T 102 | nz = np.nonzero(dense) 103 | coo = coo_matrix((dense[nz], nz), shape=dense.shape, dtype=np.int32) 104 | else: 105 | genes, rows, cols, values = [], [], [], [] 106 | 107 | # load row by row to conserve memory + actually often faster 108 | with open(filename) as f: 109 | # for each gene/row 110 | for g, l in enumerate(f): 111 | llist = l.split() 112 | genes.append(llist[:ngene_cols]) 113 | r, c, val = [], [], [] 114 | 115 | # for each cell/column 116 | for cell,v in enumerate(llist[ngene_cols:]): 117 | if v != '0': 118 | r.append(int(cell)) 119 | c.append(int(g)) 120 | val.append(int(v)) 121 | 122 | rows.extend(r) 123 | cols.extend(c) 124 | values.extend(val) 125 | 126 | if verbose and ((g+1)%10000 == 0) and (g!=0): 127 | print('\tloaded {} genes for {} cells'.format( 128 | g+1, cell+1)) 129 | 130 | ncells, ngenes = len(llist[ngene_cols:]), g+1 131 | coo = coo_matrix((np.array(values), (np.array(rows),np.array(cols))), 132 | shape=(ncells,ngenes), dtype=np.int32) 133 | genes = pd.DataFrame(genes) 134 | 135 | return coo, genes 136 | 137 | 138 | def min_cells_expressing_mask(counts, min_cells, verbose=True): 139 | """Get a mask for genes expressed by a minimum number of cells 140 | 141 | Parameters 142 | ---------- 143 | counts : ndarray or coo_matrix 144 | A cell x gene coo_matrix of counts 145 | min_cells: numeric 146 | the minimum number (if int) or proportion (if float between 0 and 1) 147 | of cells in which we must observe transcripts of the gene for 148 | inclusion in the dataset. 
If `min_cells` is between 0 and 1, sets 149 | the threshold to round(min_cells * ncells) 150 | verbose : bool, default True 151 | if True, print the number of cells when a numbr between 0 and 1 is given 152 | 153 | Returns 154 | ------- 155 | passing_mask : ndarray 156 | boolean array of passing genes 157 | 158 | TODO verbose option + return min_cells 159 | """ 160 | if min_cells < 1 and min_cells > 0: 161 | min_cells_frac = min_cells 162 | min_cells = round(min_cells_frac * counts.shape[0]) 163 | msg = '.....requiring {}% of cells = {} cells observed expressing for' 164 | msg += ' gene inclusion' 165 | print(msg.format(100 * min_cells_frac, min_cells)) 166 | return counts.astype(bool).sum(axis=0).A[0,:] >= min_cells 167 | 168 | 169 | def genelist_mask(candidates, genelist, whitelist=True, split_on_dot=True): 170 | """Get a mask for genes on or off a list 171 | 172 | Parameters 173 | ---------- 174 | candidates : pd.Series 175 | Candidate genes (from matrix) 176 | genelist : pd.Series 177 | List of genes to filter against 178 | whitelist : bool, default True 179 | Is the gene list a whitelist (True), where only genes on it should 180 | be kept or a blacklist (False) where all genes on it should be 181 | excluded 182 | split_on_dot : bool, default True 183 | If True, remove part of gene identifier after '.'. We do this by 184 | default because ENSEMBL IDs contain version numbers after periods. 185 | 186 | Returns 187 | ------- 188 | passing_mask : ndarray 189 | boolean array of passing genes 190 | """ 191 | if split_on_dot: 192 | candidates = candidates.str.split('.').str[0] 193 | genelist = genelist.str.split('.').str[0] 194 | 195 | if whitelist: 196 | mask = candidates.isin(genelist) 197 | else: 198 | mask = ~candidates.isin(genelist) 199 | 200 | return mask.values 201 | 202 | 203 | def subsample_cell_ixs(choices, nselect, group_ids=None, max_group_frac=0.5): 204 | """Randomly select cells, potentially accounting for groups 205 | 206 | Parameters 207 | ---------- 208 | choices : ndarray or int 209 | Indices of cells to choose from. If int is give, indices assumend 210 | to be np.arange(`choices`) 211 | nselect : int 212 | number of indices to return 213 | group_ids : ndarray, optional 214 | Group ids of cells. len(`group_ids`) must == `choices` if `choices` 215 | is an int, and == len(`choices`) otherwise. If `group_ids` is given, 216 | selected cells will be distributed approximately evenly over the 217 | labels under the constraint that at most floor(group_size * 218 | `max_group_frac`) can be selected from a group. 219 | max_group_frac : float, optional (default: 0.5) 220 | If `group_ids` given, the maximum fraction of cells in a group that 221 | can be selected. 222 | 223 | Returns 224 | ------- 225 | selected_ix : ndarray 226 | 1d array of selected ids (sorted). 
227 |     """
228 |     if isinstance(choices, int):
229 |         choices = np.arange(choices)
230 | 
231 |     if group_ids is None:
232 |         return np.sort(np.random.choice(choices, nselect, replace=False))
233 |     else:
234 |         assert len(group_ids) == len(choices)
235 | 
236 |     label, remaining = np.unique(group_ids, return_counts=True)
237 |     constraint = np.floor( remaining * max_group_frac ).astype(int)
238 | 
239 |     selected, n_remain = [], nselect
240 |     # while unconstrained cells left and more requested
241 |     while np.sum(constraint) > 0 and n_remain > 0:
242 |         # calculate goals given remaining cells to select and
243 |         # unconstrained cells left
244 |         weights = (constraint > 0) / (constraint > 0).sum()
245 |         goal_floor = np.floor(weights * n_remain).astype(int)
246 |         remainder = np.sum(np.ceil(weights * n_remain) - goal_floor
247 |                 ).astype(int)
248 |         goal = goal_floor + np.random.multinomial(remainder, weights)
249 |         # for each group
250 |         for i in range(len(remaining)):
251 |             # if there are unconstrained cells left in the group
252 |             if constraint[i] > 0:
253 |                 my_nchoose = min(goal[i], constraint[i])
254 |                 my_choices = np.setdiff1d(choices[group_ids == label[i]],
255 |                         selected)
256 |                 # select the cells
257 |                 chosen = np.random.choice(my_choices, my_nchoose,
258 |                         replace=False)
259 |                 selected.extend(list(chosen))
260 |                 # update constraint
261 |                 constraint[i] -= my_nchoose
262 |                 n_remain -= my_nchoose
263 |     if n_remain > 0:
264 |         msg = "Could not select {} cells".format(nselect)
265 |         msg += " with given group_ids under constraint max_group_frac"
266 |         msg += "={}. {} cells selected.".format(max_group_frac, nselect - n_remain)
267 |         warnings.warn(msg, UserWarning)
268 | 
269 |     return np.sort(selected)
270 | 
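# Usage sketch (illustrative values, not part of the library's API):
# draw a 100-cell validation split from a 1000-cell dataset, spreading picks
# across cluster labels where possible:
#
#   labels = np.loadtxt('cluster_labels.txt')   # hypothetical file, 1 id/cell
#   val_ix = subsample_cell_ixs(1000, 100, group_ids=labels,
#                               max_group_frac=0.5)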
271 | 
272 | def split_validation_cells(X, nselect, group_id_file='', max_group_frac=0.5,
273 |         verbose=True):
274 |     """ Split train and validation cells, potentially accounting for groups
275 | 
276 |     Parameters
277 |     ----------
278 |     X : coo_matrix
279 |         Matrix to select validation cells from
280 |     nselect : int
281 |         Number of cells to select
282 |     group_id_file : str, optional
283 |         File containing group ids. Should be loadable with np.loadtxt
284 |     max_group_frac : float, optional (default: 0.5)
285 |         If `group_id_file` given, the maximum fraction of cells in a group that
286 |         can be selected.
287 |     verbose : bool, optional (default: True)
288 |         Verbose output
289 | 
290 |     Returns
291 |     -------
292 |     Xtrain : coo_matrix
293 |         X with validation rows removed
294 |     Xvalidation : coo_matrix
295 |         Selected rows from X
296 |     validation_ix : ndarray
297 |         Indexes of selected rows in the input matrix `X`
298 |     """
299 |     # load groups
300 |     if group_id_file is not None and len(group_id_file):
301 |         group_ids = np.loadtxt(group_id_file)
302 |     else:
303 |         group_ids = None
304 | 
305 |     # select cells
306 |     selected_ids = subsample_cell_ixs(X.shape[0], nselect, group_ids,
307 |             max_group_frac)
308 | 
309 |     # write a message
310 |     if verbose:
311 |         ncells = len(selected_ids)
312 |         msg = '.....{} cells selected'.format(ncells)
313 |         if group_ids is not None:
314 |             msg += ' ~evenly from groups in {}'.format(group_id_file)
315 |             msg += ' under constraint max_group_frac={}'.format(max_group_frac)
316 |             msg += '\n\tGroup counts:'
317 |             ids, id_counts = np.unique(group_ids[selected_ids],
318 |                     return_counts=True)
319 |             for i, c in zip(ids, id_counts):
320 |                 msg += '\n\t\t[{}] {}'.format(i, c)
321 |         print(msg)
322 | 
323 |     # split cells
324 |     Xvalidation, Xtrain = split_coo_rows(X, selected_ids)
325 |     return Xtrain, Xvalidation, selected_ids
326 | 
327 | 
328 | def load_and_filter(infile, min_cells, whitelist='', blacklist='',
329 |         filter_by_gene_name=False, no_split_on_dot=False, verbose=True):
330 |     """ Composite of loading and filtering intended for use by CLI
331 | 
332 |     Parameters
333 |     ----------
334 |     infile : str
335 |         Input data. Currently accepts either: (1) a whitespace-delimited gene
336 |         by cell UMI count matrix with 2 leading columns of gene attributes
337 |         (ENSEMBL_ID and GENE_NAME respectively), or (2) a loom file with at
338 |         least one of the row attributes `Accession` or `Gene`, where `Accession`
339 |         is an ENSEMBL id and `Gene` is the name.
340 |     min_cells : float or int
341 |         Minimum number of cells in which we must observe at least one transcript
342 |         of a gene for the gene to pass filtering. If 0 < `min_cells` < 1, sets
343 |         threshold to be `min_cells` * ncells, rounded to the nearest integer.
344 |     whitelist : str, optional
345 |         Tab-delimited file where first column contains ENSEMBL gene ids to
346 |         accept, and second column contains corresponding gene names. If given,
347 |         genes not on the whitelist are filtered from the input matrix.
348 |         Superseded by blacklist. Default None.
349 |     blacklist : str, optional
350 |         Tab-delimited file where first column contains ENSEMBL gene ids to
351 |         exclude, and second column is the corresponding gene name. Only
352 |         performed if file given. Genes on the blacklist are excluded even if
353 |         they are also on the whitelist.
354 |     filter_by_gene_name : bool, optional
355 |         Use gene name rather than ENSEMBL id to filter (with whitelist or
356 |         blacklist). Useful for datasets where only gene symbols are given.
357 |         Applies to both whitelist and blacklist. Used by default when input
358 |         is a loom file. Default False.
359 |     no_split_on_dot : bool, optional
360 |         Don't split gene symbol or name on period before filtering white and
361 |         blacklist. We do this by default for ENSEMBL ids. Default False.
362 |     verbose : bool, optional
363 |         Print progress messages. Default True
364 | 
365 |     Returns
366 |     -------
367 |     filtered : ndarray
368 |     genes : pd.DataFrame
369 | 
370 |     Raises
371 |     ------
372 |     ValueError
373 |     """
374 |     if verbose:
375 |         print('Loading data.....')
376 | 
377 |     if infile.endswith('.loom'):
378 |         umis, genes = load_loom(infile)
379 |         if 'Accession' in genes.columns:
380 |             candidate_names = genes['Accession']
381 |             genelist_col = 0
382 |         elif 'Gene' in genes.columns:
383 |             candidate_names = genes['Gene']
384 |             genelist_col = 1
385 |         else:
386 |             msg = 'loom files must have at least one of the row '
387 |             msg+= 'attributes: `Gene` or `Accession`.'
388 |             raise ValueError(msg)
389 |     else:
390 |         umis, genes = load_txt(infile)
391 |         genelist_col = 1 if filter_by_gene_name else 0
392 |         candidate_names = genes[genelist_col]
393 |     ncells, ngenes = umis.shape
394 |     if verbose:
395 |         print('.....found {} cells and {} genes'.format(ncells, ngenes))
396 |         print('Generating masks for filtering.....')
397 | 
398 |     if min_cells < 0:
399 |         raise ValueError('min_cells must be >= 0')
400 |     mask = min_cells_expressing_mask(umis, min_cells)
401 |     if whitelist is not None and len(whitelist):
402 |         whitelist = pd.read_csv(whitelist, delim_whitespace=True, header=None)
403 |         mask &= genelist_mask(candidate_names, whitelist[genelist_col],
404 |                 split_on_dot = not no_split_on_dot)  # logical `not`: bitwise ~ on a bool gives -1/-2, which is always truthy
405 |     if blacklist is not None and len(blacklist):
406 |         blacklist = pd.read_csv(blacklist, delim_whitespace=True, header=None)
407 |         mask &= genelist_mask(candidate_names, blacklist[genelist_col],
408 |                 whitelist=False, split_on_dot = not no_split_on_dot)
409 | 
410 |     if verbose:
411 |         print('Filtering data.....')
412 |     genes = genes.loc[mask]
413 |     filtered = umis.tolil()[:,mask].tocoo() # must convert to apply mask
414 | 
415 |     return filtered, genes
416 | 
417 | 
418 | def load_like(infile, reference, by_gene_name=False,
419 |         no_split_on_dot=False):
420 |     """Load expression matrix, selecting genes and ordering like a reference
421 |     gene list
422 | 
423 |     Parameters
424 |     ----------
425 |     infile : str
426 |         Input data. Currently accepts either: (1) a whitespace-delimited gene
427 |         by cell UMI count matrix with 2 leading columns of gene attributes
428 |         (ENSEMBL_ID and GENE_NAME respectively), or (2) a loom file with at
429 |         least one of the row attributes `Accession` or `Gene`, where `Accession`
430 |         is an ENSEMBL id and `Gene` is the name.
431 |     reference : str
432 |         Tab-delimited file where first column contains ENSEMBL gene ids and
433 |         second column contains corresponding gene names. Returned array
434 |         will contain exactly these genes, in this order, for counts in cells
435 |         in `infile`
436 |     by_gene_name : bool, optional (Default: False)
437 |         match files by gene name (the second column, 1-indexed)
438 |     no_split_on_dot : bool, optional
439 |         Don't split gene symbol or name on period before filtering white and
440 |         blacklist. We do this by default for ENSEMBL ids. Default False.
441 | 
442 |     Returns
443 |     -------
444 |     reordered_coo : coo_matrix
445 |         cell x gene sparse count matrix with genes filtered and ordered like
446 |         reference
447 |     reordered_genes : pd.DataFrame
448 |         ngenes x ngene_cols array of gene names/attributes. Should basically be
Should basically by 449 | a duplicate of reference 450 | 451 | 452 | Raises 453 | ------ 454 | ValueError : if a gene from the reference is not in infile 455 | """ 456 | if infile.endswith('.loom'): 457 | umis, genes = load_loom(infile) 458 | if 'Accession' in genes.columns: 459 | candidate_names = genes['Accession'] 460 | genelist_col = 0 461 | elif 'Gene' in genes.columns: 462 | candidate_names = genes['Gene'] 463 | genelist_col = 1 464 | else: 465 | msg = 'loom files must have at least one of the row ' 466 | msg+= 'attributes: `Gene` or `Accession`.' 467 | raise ValueError(msg) 468 | else: 469 | umis, genes = load_txt(infile) 470 | genelist_col = 1 if by_gene_name else 0 471 | candidate_names = genes[genelist_col] 472 | ncells, ngenes = umis.shape 473 | 474 | # load the reference order 475 | ref = pd.read_csv(reference, delim_whitespace=True, header=None 476 | )[genelist_col] 477 | # select input column and process names unless told not to 478 | if no_split_on_dot: 479 | ingenes = candidate_names 480 | else: 481 | ref = ref.str.split('.').str[0] 482 | ingenes = candidate_names.str.split('.').str[0] 483 | 484 | perm = [] 485 | try: 486 | for g in ref: 487 | perm.append(np.where(ingenes==g)[0][0]) 488 | except IndexError as e: 489 | msg = 'Reference gene `{}` in reference `{}` not found in infile `{}`' 490 | msg = msg.format(g, reference, infile) 491 | raise ValueError(msg) 492 | 493 | reordered_umis = umis.tocsr()[:,perm].tocoo() 494 | reordered_genes = genes.loc[perm] 495 | return reordered_umis, reordered_genes 496 | -------------------------------------------------------------------------------- /schpf/scHPF_.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from copy import deepcopy 3 | from warnings import warn 4 | from functools import partial 5 | from multiprocessing import cpu_count 6 | 7 | import numpy as np 8 | from scipy.sparse import coo_matrix 9 | from scipy.special import digamma, gammaln, psi 10 | try: 11 | from scipy.misc import logsumexp 12 | except ImportError: 13 | from scipy.special import logsumexp 14 | 15 | from sklearn.base import BaseEstimator 16 | import joblib 17 | from joblib import Parallel, delayed 18 | 19 | 20 | # TODO warn if can't import, and allow computation with slow 21 | from schpf.hpf_numba import * 22 | from schpf.util import minibatch_ix_generator 23 | import schpf.loss as ls 24 | import schpf 25 | 26 | 27 | class HPF_Gamma(object): 28 | """Gamma variational distributions 29 | 30 | Parameters 31 | ---------- 32 | vi_shape: np.ndarray 33 | Gamma shape parameter for the variational Gamma distributions. 34 | Ndarray.shape[0] must match `vi_rate` 35 | vi_rate: np.ndarray 36 | Gamma rate parameter for the variational Gamma distributions. 37 | Ndarray.shape[0] must match `vi_shape` 38 | 39 | Attributes 40 | ---------- 41 | vi_shape : ndarray 42 | vi_rate : ndarray 43 | dims : ndarray 44 | The shape of vi_shape and vi_rate 45 | dtype : dtype 46 | dtype of both vi_shape and vi_rate 47 | """ 48 | 49 | @staticmethod 50 | def random_gamma_factory(dims, shape_prior, rate_prior, dtype=np.float64): 51 | """Factory method to randomly initialize variational distributions 52 | 53 | Parameters 54 | ---------- 55 | dims: list-like 56 | Numpy-style shape of the matrix of Gammas. 57 | shape_prior: float 58 | Prior for variational Gammas' shapes. Must be greater than 0. 59 | rate_prior: float 60 | Prior for variational Gammas' rates. Must be greater than 0. 
61 | 62 | Returns 63 | ------- 64 | A randomly initialized HPF_Gamma instance 65 | """ 66 | vi_shape = np.random.uniform(0.5 * shape_prior, 1.5 * shape_prior, 67 | dims).astype(dtype) 68 | vi_rate = np.random.uniform(0.5 * rate_prior, 1.5 * rate_prior, 69 | dims).astype(dtype) 70 | return HPF_Gamma(vi_shape,vi_rate) 71 | 72 | 73 | def __init__(self, vi_shape, vi_rate): 74 | """Initializes HPF_Gamma with variational shape and rates""" 75 | assert(vi_shape.shape == vi_rate.shape) 76 | assert(vi_shape.dtype == vi_rate.dtype) 77 | assert(np.all(vi_shape > 0)) 78 | assert(np.all(vi_rate > 0)) 79 | self.vi_shape = vi_shape 80 | self.vi_rate = vi_rate 81 | self.dtype = vi_shape.dtype 82 | 83 | 84 | def __eq__(self, other): 85 | if isinstance(other, self.__class__): 86 | shape_equal = np.array_equal(self.vi_shape, other.vi_shape) 87 | rate_equal = np.array_equal(self.vi_rate, other.vi_rate) 88 | dtype_equal = self.dtype == other.dtype 89 | return shape_equal and rate_equal and dtype_equal 90 | return False 91 | 92 | 93 | @property 94 | def dims(self): 95 | assert self.vi_shape.shape == self.vi_rate.shape 96 | return self.vi_shape.shape 97 | 98 | 99 | @property 100 | def e_x(self): 101 | """Expected value of the random variable(s) given variational 102 | distribution(s) 103 | """ 104 | return self.vi_shape / self.vi_rate 105 | 106 | 107 | @property 108 | def e_logx(self): 109 | """Expectation of the log of random variable given variational 110 | distribution(s)""" 111 | return digamma(self.vi_shape) - np.log(self.vi_rate) 112 | 113 | 114 | @property 115 | def entropy(self): 116 | """Entropy of variational Gammas""" 117 | return self.vi_shape - np.log(self.vi_rate) \ 118 | + gammaln(self.vi_shape) \ 119 | + (1 - self.vi_shape) * digamma(self.vi_shape) 120 | 121 | 122 | def sample(self, nsamples=1): 123 | """Sample from variational distributions 124 | 125 | Parameters 126 | ---------- 127 | nsamples: int (optional, default 1) 128 | Number of samples to take. 129 | 130 | Returns 131 | ------- 132 | X_rep : np.ndarray 133 | An ndarray of samples from the variational distributions, where 134 | the last dimension is the number of samples `nsamples` 135 | """ 136 | samples = [] 137 | for i in range(nsamples): 138 | samples.append(np.random.gamma(self.vi_shape, 1/self.vi_rate).T) 139 | return np.stack(samples).T 140 | 141 | 142 | def combine(self, other, other_ixs): 143 | """ Combine with another HPF_Gamma 144 | 145 | Useful for combining variational distributions from training data with 146 | variational distributions from cells that were projected onto frozen 147 | beta and eta 148 | 149 | Parameters 150 | ---------- 151 | other : `HPF_Gamma` 152 | Other HPF_Gamma to merge with 153 | other_ixs : list or ndarray 154 | Ordered indices of other in the merged HPF_Gamma. Must have len 155 | equal to other.shape[0]. Must have a maximum value less than 156 | self.dims[0] + other.shape[0]. May not have any repeat values. 
157 | 
158 |         Returns
159 |         -------
160 |         combined_model : `HPF_Gamma`
161 |         """
162 |         assert other.dims[0] == len(other_ixs)
163 |         assert len(np.unique(other_ixs)) == len(other_ixs)
164 |         assert self.dims[0] + other.dims[0] > np.max(other_ixs)
165 | 
166 |         new_dims = [self.dims[0]+other.dims[0], *self.dims[1:]]
167 |         self_ixs = np.setdiff1d(np.arange(new_dims[0]),
168 |                 other_ixs)
169 | 
170 |         new_vi_shape = np.empty(new_dims, dtype=self.dtype)
171 |         new_vi_shape[self_ixs] = self.vi_shape
172 |         new_vi_shape[other_ixs] = other.vi_shape
173 | 
174 |         new_vi_rate = np.empty(new_dims, dtype=self.dtype)
175 |         new_vi_rate[self_ixs] = self.vi_rate
176 |         new_vi_rate[other_ixs] = other.vi_rate
177 | 
178 |         return HPF_Gamma(new_vi_shape, new_vi_rate)
179 | 
180 | 
181 | class scHPF(BaseEstimator):
182 |     """scHPF as described in Levitin et al., Molecular Systems Biology 2019
183 | 
184 |     Parameters
185 |     ----------
186 |     nfactors: int
187 |         Number of factors (K)
188 |     a: float, (optional, default 0.3)
189 |         Hyperparameter a
190 |     ap: float (optional, default 1.0)
191 |         Hyperparameter a'
192 |     bp: float (optional, default None)
193 |         Hyperparameter b'. Set empirically from observed data if not
194 |         given.
195 |     c: float, (optional, default 0.3)
196 |         Hyperparameter c
197 |     cp: float (optional, default 1.0)
198 |         Hyperparameter c'
199 |     dp: float (optional, default None)
200 |         Hyperparameter d'. Set empirically from observed data if not
201 |         given.
202 |     min_iter: int (optional, default 30):
203 |         Minimum number of iterations for training.
204 |     max_iter: int (optional, default 1000):
205 |         Maximum number of iterations for training.
206 |     check_freq: int (optional, default 10)
207 |         Number of training iterations between calculating loss.
208 |     epsilon: float (optional, default 0.001)
209 |         Percent change of loss for convergence.
210 |     better_than_n_ago: int (optional, default 5)
211 |         Stop condition if loss is getting worse. Stops training if loss
212 |         is worse than `better_than_n_ago`*`check_freq` training steps
213 |         ago and getting worse.
214 |     xi: HPF_Gamma (optional, default None)
215 |         Variational distributions for xi
216 |     theta: HPF_Gamma (optional, default None)
217 |         Variational distributions for theta
218 |     eta: HPF_Gamma (optional, default None)
219 |         Variational distributions for eta
220 |     beta: HPF_Gamma (optional, default None)
221 |         Variational distributions for beta
222 |     verbose: bool (optional, default True)
223 |         Print messages at each check_freq
224 |     """
225 |     def __init__(
226 |             self,
227 |             nfactors,
228 |             a=0.3,
229 |             ap=1,
230 |             bp=None,
231 |             c=0.3,
232 |             cp=1,
233 |             dp=None,
234 |             min_iter=30,
235 |             max_iter=1000,
236 |             check_freq=10,
237 |             epsilon=0.001,
238 |             better_than_n_ago=5,
239 |             dtype=np.float64,
240 |             xi=None,
241 |             theta=None,
242 |             eta=None,
243 |             beta=None,
244 |             loss=None,
245 |             verbose=True,
246 |             ):
247 |         """Initialize HPF instance"""
248 |         self.version = schpf.__version__
249 |         self.nfactors = nfactors
250 |         self.a = a
251 |         self.ap = ap
252 |         self.bp = bp
253 |         self.c = c
254 |         self.cp = cp
255 |         self.dp = dp
256 |         self.min_iter = min_iter
257 |         self.max_iter = max_iter
258 |         self.check_freq = check_freq
259 |         self.epsilon = epsilon
260 |         self.better_than_n_ago = better_than_n_ago
261 |         self.dtype = dtype
262 |         self.verbose = verbose
263 | 
264 |         self.xi = xi
265 |         self.eta = eta
266 |         self.theta = theta
267 |         self.beta = beta
268 | 
269 |         self.loss = [] if loss is None else loss  # None default avoids sharing a mutable list
270 | 
271 | 
272 |     @property
273 |     def a(self):
274 |         try:
275 |             return self._a
276 |         except AttributeError:
277 |             msg = 'Automatically using a=0.3. If you are loading a model'
278 |             msg += ' generated with scHPF version < 0.5 and set a custom value'
279 |             msg += ' for a, you must manually reset it and re-save the model.'
280 |             warn(msg, RuntimeWarning)
281 |             return 0.3
282 | 
283 | 
284 |     @a.setter
285 |     def a(self, val):
286 |         if val == -2:
287 |             if self.nfactors is None:
288 |                 raise ValueError('Can only set a as a function of nfactors when'
289 |                         ' nfactors is not None')
290 |             else:
291 |                 self._a = 1/np.sqrt(self.nfactors)
292 |         else:
293 |             assert val > 0
294 |             self._a = val
295 | 
296 | 
297 |     @property
298 |     def c(self):
299 |         try:
300 |             return self._c
301 |         except AttributeError:
302 |             msg = 'Automatically using c=0.3. If you are loading a model '
303 |             msg += ' generated with scHPF version < 0.5 and set a custom value'
304 |             msg += ' for c, you must manually reset it and re-save the model.'
305 |             warn(msg, RuntimeWarning)
306 |             return 0.3
307 | 
308 | 
309 |     @c.setter
310 |     def c(self, val):
311 |         if val == -2:
312 |             if self.nfactors is None:
313 |                 raise ValueError('Can only set c as a function of nfactors when'
314 |                         ' nfactors is not None')
315 |             else:
316 |                 self._c = 1/np.sqrt(self.nfactors)
317 |         else:
318 |             assert val > 0
319 |             self._c = val
320 | 
321 | 
322 |     @property
323 |     def ngenes(self):
324 |         return self.eta.dims[0] if self.eta is not None else None
325 | 
326 | 
327 |     @property
328 |     def ncells(self):
329 |         return self.xi.dims[0] if self.xi is not None else None
330 | 
331 | 
332 |     def cell_score(self, xi=None, theta=None):
333 |         """Get cell score from xi and theta
334 | 
335 |         Parameters
336 |         ----------
337 |         xi : HPF_Gamma, (optional, default self.xi)
338 |             variational distributions for xi
339 |         theta : HPF_Gamma, (optional, default self.theta)
340 |             variational distributions for theta
341 | 
342 |         Returns
343 |         -------
344 |         cell_score : ndarray
345 |             ncell x nfactor array of cell scores
346 |         """
347 |         xi = self.xi if xi is None else xi
348 |         theta = self.theta if theta is None else theta
349 |         return self._score(xi, theta)
350 | 
351 | 
352 |     def gene_score(self, eta=None, beta=None):
353 |         """Get gene score from eta and beta
354 | 
355 |         Parameters
356 |         ----------
357 |         eta : HPF_Gamma, (optional, default self.eta)
358 |             variational distributions for eta
359 |         beta : HPF_Gamma, (optional, default self.beta)
360 |             variational distributions for beta
361 | 
362 |         Returns
363 |         -------
364 |         gene_score : ndarray
365 |             ngene x nfactor array of gene scores
366 |         """
367 |         eta = self.eta if eta is None else eta
368 |         beta = self.beta if beta is None else beta
369 |         return self._score(eta, beta)
370 | 
371 | 
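    # Usage sketch (illustrative, for a fitted model `m`):
    #   m.cell_score()  ->  (ncells, nfactors) array, E[theta] scaled by E[xi]
    #   m.gene_score()  ->  (ngenes, nfactors) array, E[beta] scaled by E[eta]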
372 |     def pois_llh_pointwise(self, X, theta=None, beta=None):
373 |         """Poisson log-likelihood (for each nonzero entry)
374 |
375 |         Attempts to use numba/cffi/gsl, and falls back to numpy otherwise
376 |
377 |         Parameters
378 |         ----------
379 |         X: coo_matrix
380 |             Data to compute Poisson log likelihood of. Assumed to be nonzero.
381 |         theta : HPF_Gamma, optional
382 |             If given, use for theta instead of self.theta
383 |         beta : HPF_Gamma, optional
384 |             If given, use for beta instead of self.beta
385 |
386 |         Returns
387 |         -------
388 |         llh: ndarray
389 |         """
390 |         theta = self.theta if theta is None else theta
391 |         beta = self.beta if beta is None else beta
392 |         return ls.pois_llh_pointwise(X=X, theta=theta, beta=beta)
393 |
394 |
395 |     def cellmean_negative_pois_llh(self, X, theta=None, beta=None):
396 |         """Convenience method for mean negative llh of nonzero entries,
397 |         averaged by cell
398 |
399 |         """
400 |         theta = self.theta if theta is None else theta
401 |         assert(theta.vi_shape.shape[0] == X.shape[0])
402 |         beta = self.beta if beta is None else beta
403 |         llh_pointwise = self.pois_llh_pointwise(X=X, theta=theta, beta=beta)
404 |
405 |         llh_csr = coo_matrix((-llh_pointwise, (X.row,X.col)), shape=X.shape).tocsr()
406 |         sums = llh_csr.sum(axis=1).A1
407 |         counts = np.diff(llh_csr.indptr)
408 |         averages = sums/counts
409 |
410 |         assert(averages.shape[0] == theta.vi_shape.shape[0])
411 |         return averages
412 |
413 |
414 |
415 |
416 |     def mean_negative_pois_llh(self, X, theta=None, beta=None, **kwargs):
417 |         """Convenience method for mean negative llh of nonzero entries
418 |
419 |         """
420 |         theta = self.theta if theta is None else theta
421 |         beta = self.beta if beta is None else beta
422 |         return ls.mean_negative_pois_llh(X=X, theta=theta, beta=beta)
423 |
424 |
425 |     def fit(self, X, **kwargs):
426 |         """Fit an scHPF model
427 |
428 |         Parameters
429 |         ----------
430 |         X: coo_matrix
431 |             Data to fit
432 |         loss_function : function, optional (Default: None)
433 |             Loss function to use for the fit. Set to the negative Poisson
434 |             likelihood of X if not given
435 |         """
436 |         (bp, dp, xi, eta, theta, beta, loss) = self._fit(
437 |                 X, **kwargs)
438 |         self.bp = bp
439 |         self.dp = dp
440 |         self.xi = xi
441 |         self.eta = eta
442 |         self.theta = theta
443 |         self.beta = beta
444 |         self.loss = loss
445 |         return self
446 |
447 |
448 |     def project(self, X, recalc_bp=False, replace=False, min_iter=2, max_iter=50,
449 |             check_freq=2, **kwargs):
450 |         """Project new cells into latent space
451 |
452 |         Gene distributions (beta and eta) are fixed.
453 |
454 |         Parameters
455 |         ----------
456 |         X: coo_matrix
457 |             Data to project. Should have self.ngenes columns
458 |         recalc_bp : bool, optional (Default: False)
459 |             Recalculate the value of the empirical hyperparameter bp. Do
460 |             not do this for withheld test data.
461 |         replace: bool, optional (Default: False)
462 |             Replace theta and xi with projected values in self. Note that
463 |             loss will not be updated
464 |         min_iter: int, (Default: 2)
465 |             Replaces self.min_iter if not None. Few iterations are needed
466 |             because beta and eta are fixed.
467 |         max_iter: int, (Default: 50)
468 |             Replaces self.max_iter if not None. Few iterations are needed
469 |             because beta and eta are fixed.
470 |         check_freq: int, optional (Default: 2)
471 |             Number of training iterations between calculating loss.
472 |
473 |         Returns
474 |         -------
475 |         result : scHPF or list
476 |             If replace=`False`, an scHPF object with variational
477 |             distributions theta and xi (for the new cells in `X`) and the
478 |             same variational distributions as self for gene distributions
479 |             beta and eta. If replace=`True`, the loss for the projection
480 |             (xi and theta will be updated in self but not returned). In both
481 |             cases, bp will only be updated for the new data if self.bp is
482 |             None or recalc_bp=`True`.
483 |
484 |         """
485 |         if replace and recalc_bp:
486 |             msg = 'Cannot replace `bp` with recalculated value'
487 |             raise ValueError(msg)
488 |
489 |         model = self if replace else deepcopy(self)
490 |         if recalc_bp: model.bp = None
491 |         (bp, _, xi, _, theta, _, loss) = model._fit(X,
492 |                 min_iter=min_iter, max_iter=max_iter, check_freq=check_freq,
493 |                 freeze_genes=True, **kwargs)
494 |         if replace:
495 |             self.xi = xi
496 |             self.theta = theta
497 |             return loss
498 |         else:
499 |             model.bp = bp
500 |             model.xi = xi
501 |             model.theta = theta
502 |             model.loss = loss
503 |             return model
504 |
505 |
506 |     def _score(self, capacity, loading):
507 |         """Get the hierarchically normalized loadings which we call the cell
508 |         or gene score in the scHPF paper
509 |
510 |         Parameters
511 |         ----------
512 |         capacity : HPF_Gamma
513 |             xi or eta
514 |         loading : HPF_Gamma
515 |             theta or beta
516 |
517 |
518 |         Returns
519 |         -------
520 |         score : ndarray
521 |         """
522 |         assert(loading.dims[0] == capacity.dims[0])
523 |         return loading.e_x * capacity.e_x[:,None]
524 |
525 |
526 |     def _fit(self, X, freeze_genes=False, reinit=True, loss_function=None,
527 |             min_iter=None, max_iter=None, epsilon=None, check_freq=None,
528 |             single_process=False, checkstep_function=None, verbose=None,
529 |             batchsize=None, beta_theta_simultaneous=False,
530 |             loss_smoothing=1):
531 |         """Combined internal fit/transform function
532 |
533 |         Parameters
534 |         ----------
535 |         X: coo_matrix
536 |             Data to fit
537 |         freeze_genes: bool, (optional, default False)
538 |             If True, do not update gene variational distributions eta and beta
539 |         reinit: bool, (optional, default True)
540 |             Randomly initialize variational distributions even if they
541 |             already exist. Superseded by freeze_genes. Does not affect
542 |             self.bp and self.dp which will only be set empirically if they
543 |             are None
544 |         loss_function : function, (optional, default None)
545 |             Function to use for loss, which is assumed to be nonzero and
546 |             decrease with improvement. Must accept hyperparameters a, ap,
547 |             bp, c, cp, and dp and the variational distributions for xi, eta,
548 |             theta, and beta even if only some of these values are used.
549 |             Should have an internal reference to any data used (_fit will
550 |             not pass it any data). If `loss_function` is not given or equal
551 |             to None, the mean negative log likelihood of nonzero values in
552 |             training data `X` is used.
553 |         min_iter: int (optional, default None)
554 |             Replaces self.min_iter if given. Useful when projecting
555 |             new data onto an existing scHPF model.
556 |         max_iter: int (optional, default None)
557 |             Replaces self.max_iter if given. Useful when projecting
558 |             new data onto an existing scHPF model.
559 |         epsilon: float (optional, default None)
560 |             Replaces self.epsilon if given. Percent change of loss for
561 |             convergence.
562 |         check_freq : int, optional (Default: None)
563 |             Replaces self.check_freq if given. Useful when projecting
564 |             new data onto an existing scHPF model.
565 |         single_process : bool, optional (Default: False)
566 |             Use single-threaded versions of updates
567 |         checkstep_function : function (optional, default None)
568 |             A function that takes arguments bp, dp, xi, eta, theta, beta,
569 |             and t and, if given, is called at check_interval. Intended use
570 |             is to check additional stats during training, potentially with
571 |             hardcoded data, but is unrestricted. Use at own risk.
572 |         verbose: bool (optional, default None)
573 |             If not None, overrides self.verbose
574 |         batchsize: int, optional (Default: None)
575 |             number of cells per batch. All cells are used when None or 0
576 |         beta_theta_simultaneous: bool, optional (Default False)
577 |             Whether updates for beta and theta should be computed simultaneously.
578 |             If False, beta is updated first, and theta is updated using
579 |             that beta
580 |         loss_smoothing: int, optional (Default: 1)
581 |             Smooth loss up to `loss_smoothing` check frequencies ago. 1 results
582 |             in no smoothing. Intended to be used with batching when assessing
583 |             convergence based on training loss, where a good value might be
584 |             int(ncells/n_batches)
585 |
586 |         Returns
587 |         -------
588 |         bp: float
589 |             Empirically set value for bp
590 |         dp: float
591 |             Empirically set value for dp. Unchanged if freeze_genes.
592 |         xi: HPF_Gamma
593 |             Learned variational distributions for xi
594 |         eta: HPF_Gamma
595 |             Learned variational distributions for eta. Unchanged if
596 |             freeze_genes.
597 |         theta: HPF_Gamma
598 |             Learned variational distributions for theta
599 |         beta: HPF_Gamma
600 |             Learned variational distributions for beta. Unchanged if
601 |             freeze_genes.
602 |         loss : list
603 |             loss at each checkstep
604 |         """
605 |         assert loss_smoothing > 0
606 |
607 |         # local (convenience) vars for model
608 |         nfactors, (ncells, ngenes) = self.nfactors, X.shape
609 |         a, ap, c, cp = self.a, self.ap, self.c, self.cp
610 |
611 |         # get empirically set hyperparameters and variational distributions
612 |         bp, dp, xi, eta, theta, beta = self._setup(X, freeze_genes, reinit)
613 |
614 |         # Make first updates for hierarchical shape prior
615 |         # (vi_shape is constant, but want to update full distribution)
616 |         xi.vi_shape[:] = ap + nfactors * a
617 |         if not freeze_genes:
618 |             eta.vi_shape[:] = cp + nfactors * c
619 |
620 |         # setup loss function as mean negative llh of nonzero training data
621 |         # if the loss function is not given
622 |         if loss_function is None:
623 |             loss_function = ls.loss_function_for_data(
624 |                     ls.mean_negative_pois_llh, X)
625 |
626 |         # setup batch_ix iterator
627 |         if batchsize is not None and batchsize > 1 and batchsize <= ncells:
628 |             batched = True
629 |             batch_ix_generator = minibatch_ix_generator(ncells, batchsize)
630 |         else:
631 |             batched = False
632 |             batch_ix_generator = None
633 |
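        # For reference, minibatch_ix_generator (schpf.util) cycles through a
        # shuffled permutation of cell indices, wrapping around at the end.
        # An illustrative trace (assuming the shuffle gave [3, 0, 4, 1, 2]):
        #
        #     >>> gen = minibatch_ix_generator(ncells=5, batchsize=2)
        #     >>> next(gen)       # array([3, 0])
        #     >>> next(gen)       # array([4, 1])
        #     >>> next(gen)       # array([2, 3])  <- wraps around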
634 |         ## init
635 |         loss, unsmoothed_loss, pct_change = [], [], []
636 |         # check variable overrides
637 |         min_iter = self.min_iter if min_iter is None else min_iter
638 |         max_iter = self.max_iter if max_iter is None else max_iter
639 |         epsilon = self.epsilon if epsilon is None else epsilon
640 |         check_freq = self.check_freq if check_freq is None else check_freq
641 |         verbose = self.verbose if verbose is None else verbose
642 |         for t in range(max_iter):
643 |             # setup batching
644 |             if batch_ix_generator is None:
645 |                 batch_ix = np.arange(X.shape[0])
646 |                 batchsize = ncells
647 |                 X_batch = X
648 |             else:
649 |                 batch_ix = next(batch_ix_generator)
650 |                 X_batch = X.tocsr()[batch_ix,:].tocoo()
651 |
652 |             if t==0 and reinit: #randomize phi for first iteration
653 |                 random_phi = np.random.dirichlet( np.ones(nfactors),
654 |                         X_batch.data.shape[0])
655 |                 Xphi_data = X_batch.data[:,None] * random_phi
656 |             else:
657 |                 if single_process:
658 |                     Xphi_data = compute_Xphi_data_numpy(X_batch, theta, beta,
659 |                             theta_ix=batch_ix)
660 |                 else:
661 |                     Xphi_data = compute_Xphi_data(
662 |                             X_batch.data, X_batch.row, X_batch.col,
663 |                             theta.vi_shape[batch_ix], theta.vi_rate[batch_ix],
664 |                             beta.vi_shape, beta.vi_rate)
665 |
666 |             if beta_theta_simultaneous:
667 |                 # calculate gene updates but don't assign yet
668 |                 if not freeze_genes:
669 |                     bvs = compute_loading_shape_update(Xphi_data,
670 |                             X_batch.col, ngenes, c)
671 |                     bvr = compute_loading_rate_update(eta.vi_shape,
672 |                             eta.vi_rate, theta.vi_shape[batch_ix],
673 |                             theta.vi_rate[batch_ix])
674 |                 # cell updates
675 |                 theta.vi_shape[batch_ix] = compute_loading_shape_update(
676 |                         Xphi_data, X_batch.row, batchsize, a)
677 |                 theta.vi_rate[batch_ix] = compute_loading_rate_update(
678 |                         xi.vi_shape[batch_ix], xi.vi_rate[batch_ix],
679 |                         beta.vi_shape, beta.vi_rate)
680 |                 xi.vi_rate[batch_ix] = bp + theta.e_x[batch_ix].sum(1)
681 |                 # make gene updates
682 |                 if not freeze_genes:
683 |                     beta.vi_shape = bvs
684 |                     beta.vi_rate = bvr
685 |                     eta.vi_rate = dp + beta.e_x.sum(1)
686 |
687 |             else:
688 |                 if batched:
689 |                     # cell updates, must do first for batching
690 |                     theta.vi_shape[batch_ix] = compute_loading_shape_update(
691 |                             Xphi_data, X_batch.row, batchsize, a)
692 |                     theta.vi_rate[batch_ix] = compute_loading_rate_update(
693 |                             xi.vi_shape[batch_ix], xi.vi_rate[batch_ix],
694 |                             beta.vi_shape, beta.vi_rate)
695 |                     xi.vi_rate[batch_ix] = bp + theta.e_x[batch_ix].sum(1)
696 |
697 |                 if not freeze_genes:
698 |                     #gene updates
699 |                     beta.vi_shape = compute_loading_shape_update(Xphi_data,
700 |                             X_batch.col, ngenes, c)
701 |                     beta.vi_rate = compute_loading_rate_update(eta.vi_shape,
702 |                             eta.vi_rate, theta.vi_shape[batch_ix],
703 |                             theta.vi_rate[batch_ix])
704 |                     eta.vi_rate = dp + beta.e_x.sum(1)
705 |
706 |                 if not batched:
707 |                     # cell updates, doing after gene updates when not batched
708 |                     # for legacy consistency
709 |                     theta.vi_shape[batch_ix] = compute_loading_shape_update(
710 |                             Xphi_data, X_batch.row, batchsize, a)
711 |                     theta.vi_rate[batch_ix] = compute_loading_rate_update(
712 |                             xi.vi_shape[batch_ix], xi.vi_rate[batch_ix],
713 |                             beta.vi_shape, beta.vi_rate)
714 |                     xi.vi_rate[batch_ix] = bp + theta.e_x[batch_ix].sum(1)
715 |
716 |
717 |             # record llh/percent change and check for convergence
718 |             if t % check_freq == 0:
719 |
720 |                 # check llh
721 |                 # vX = validation_data if validation_data is not None else X
722 |                 try:
723 |                     curr = loss_function(
724 |                             a=a, ap=ap, bp=bp, c=c, cp=cp, dp=dp,
725 |                             xi=xi, eta=eta, theta=theta, beta=beta)
726 |                     unsmoothed_loss.append(curr)
727 |                     if len(unsmoothed_loss) > loss_smoothing:
728 |                         unsmoothed_loss = unsmoothed_loss[1:]
729 |                     # normally this is just curr as loss_smoothing=1 by default
730 |                     loss.append(np.mean(unsmoothed_loss))
731 |                 except NameError as e:
732 |                     print('Invalid loss function')
733 |                     raise e
734 |
735 |                 # calculate percent change
736 |                 try:
737 |                     curr, prev = loss[-1], loss[-2]
738 |                     pct_change.append(100 * (curr - prev) / np.abs(prev))
739 |                 except IndexError:
740 |                     pct_change.append(100)
741 |                 if verbose:
742 |                     msg = '[Iter. {0: >4}] loss:{1:.6f} pct:{2:.9f}'.format(
743 |                             t, curr, pct_change[-1])
744 |                     print(msg)
745 |                 if checkstep_function is not None:
746 |                     checkstep_function(bp=bp, dp=dp, xi=xi, eta=eta, theta=theta,
747 |                             beta=beta, t=t)
748 |
749 |                 # check convergence
750 |                 if len(loss) > 3 and t >= min_iter:
751 |                     # convergence conditions (all must be met)
752 |                     current_small = np.abs(pct_change[-1]) < epsilon
753 |                     prev_small = np.abs(pct_change[-2]) < epsilon
754 |                     not_inflection = not (
755 |                             (np.abs(loss[-3]) < np.abs(prev)) \
756 |                             and (np.abs(prev) > np.abs(curr)))
757 |                     converged = current_small and prev_small and not_inflection
758 |                     if converged:
759 |                         if verbose:
760 |                             print('converged')
761 |                         break
762 |
763 |                     # getting worse, and has been for better_than_n_ago checks
764 |                     # (don't waste time on a bad run)
765 |                     if len(loss) > self.better_than_n_ago \
66 |                             and self.better_than_n_ago:
767 |                         nprev = loss[-self.better_than_n_ago] \
768 |                                 if len(loss)>self.better_than_n_ago else loss[0]
769 |                         worse_than_n_ago = np.abs(nprev) < np.abs(curr)
770 |                         getting_worse = np.abs(prev) < np.abs(curr)
771 |                         if worse_than_n_ago and getting_worse:
772 |                             if verbose:
773 |                                 print('getting worse break')
774 |                             break
775 |
776 |             # TODO message or warning or something
777 |             if t >= max_iter:
778 |                 break
779 |
780 |         return (bp, dp, xi, eta, theta, beta, loss)
781 |
782 |
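    # For orientation before _setup: the capacity rate hyperparameters are
    # set empirically in _get_empirical_hypers (below) from mean/variance
    # ratios of the data margins, i.e. bp = ap * mean(cell_sums)/var(cell_sums)
    # and dp = cp * mean(gene_sums)/var(gene_sums). A minimal numpy sketch of
    # the same computation (illustrative only; X is a coo_matrix):
    #
    #     >>> cell_sums = np.asarray(X.sum(axis=1)).ravel()
    #     >>> bp = ap * cell_sums.mean() / cell_sums.var()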
783 |     def _setup(self, X, freeze_genes=False, reinit=True, clip=True):
784 |         """Setup variational distributions
785 |
786 |         Parameters
787 |         ----------
788 |         X: coo_matrix
789 |             Data to fit
790 |         freeze_genes: bool, optional (Default: False)
791 |             If True, do not update gene variational distributions eta and beta
792 |         reinit: bool, optional (Default: True)
793 |             Randomly initialize variational distributions even if they
794 |             already exist. Superseded by freeze_genes. Does not affect
795 |             self.bp and self.dp (which will only be set empirically if
796 |             they are None)
797 |         clip : bool, optional (Default: True)
798 |             If empirically calculating dp and bp > 1000 * dp, clip dp to
799 |             bp / 1000.
800 |
801 |         Returns
802 |         -------
803 |         bp : float
804 |         dp : float
805 |         xi : HPF_Gamma
806 |         eta : HPF_Gamma
807 |         theta : HPF_Gamma
808 |         beta : HPF_Gamma
809 |
810 |         """
811 |         # locals for convenience
812 |         nfactors, (ncells, ngenes) = self.nfactors, X.shape
813 |         a, ap, c, cp = self.a, self.ap, self.c, self.cp
814 |         bp, dp = self.bp, self.dp
815 |
816 |         xi, eta, theta, beta = (self.xi, self.eta, self.theta, self.beta)
817 |
818 |         # empirically set bp and dp
819 |         bp, dp = self._get_empirical_hypers(X, freeze_genes, clip)
820 |
821 |         if reinit or (xi is None):
822 |             xi = HPF_Gamma.random_gamma_factory((ncells,), ap, bp,
823 |                     dtype=self.dtype)
824 |         if reinit or (theta is None):
825 |             theta = HPF_Gamma.random_gamma_factory((ncells,nfactors), a, bp,
826 |                     dtype=self.dtype)
827 |
828 |         # Check if variational distributions for genes exist, create if not
829 |         # Error if freeze_genes and eta and beta don't exist
830 |         if freeze_genes:
831 |             if eta is None or beta is None:
832 |                 msg = 'To fit with frozen gene variational distributions ' \
833 |                     + '(`freeze_genes`==True), eta and beta must be set to ' \
834 |                     + 'valid HPF_Gamma instances.'
835 |                 raise ValueError(msg)
836 |         else:
837 |             if reinit or (eta is None):
838 |                 eta = HPF_Gamma.random_gamma_factory((ngenes,), cp, dp,
839 |                         dtype=self.dtype)
840 |             if reinit or (beta is None):
841 |                 beta = HPF_Gamma.random_gamma_factory((ngenes,nfactors),
842 |                         c, dp, dtype=self.dtype)
843 |
844 |         return (bp, dp, xi, eta, theta, beta)
845 |
846 |
847 |     def _get_empirical_hypers(self, X, freeze_genes=False, clip=True):
848 |         """Get empirical values for bp, dp
849 |
850 |         Parameters
851 |         ----------
852 |         X : coo_matrix
853 |             Data to fit
854 |
855 |         Returns
856 |         -------
857 |         bp : float
858 |         dp : float
859 |         """
860 |         bp, dp = self.bp, self.dp
861 |         # empirically set bp and dp
862 |         def mean_var_ratio(X, axis):
863 |             axis_sum = X.sum(axis=axis)
864 |             return np.mean(axis_sum) / np.var(axis_sum)
865 |         if bp is None:
866 |             bp = self.ap * mean_var_ratio(X, axis=1)
867 |         if dp is None: # raise an informative error if dp cannot be set
868 |             if freeze_genes:
869 |                 msg = 'dp is None and cannot be set'
870 |                 msg += ' when freeze_genes is True.'
871 |                 raise ValueError(msg)
872 |             else:
873 |                 dp = self.cp * mean_var_ratio(X, axis=0)
874 |                 if clip and bp > 1000 * dp:
875 |                     old_val = dp
876 |                     dp = bp / 1000
877 |                     print('Clipping dp: was {} now {}'.format(old_val, dp))
878 |
879 |         return bp, dp
880 |
881 |
882 |     def _initialize(self, X, freeze_genes=False):
883 |         """Shortcut to setup random distributions & set variables
884 |         """
885 |         bp, dp, xi, eta, theta, beta = self._setup(X, freeze_genes,
886 |                 reinit=True)
887 |         self.bp = bp
888 |         self.dp = dp
889 |         self.xi = xi
890 |         self.eta = eta
891 |         self.theta = theta
892 |         self.beta = beta
893 |
894 |
895 | def load_model(file_name):
896 |     """Load a model from a joblib file
897 |
898 |     Parameters
899 |     ----------
900 |     file_name : str
901 |         Joblib file containing a saved scHPF model
902 |
903 |
904 |     Returns
905 |     -------
906 |     model : scHPF
907 |         The scHPF model in the file
908 |     """
909 |     return joblib.load(file_name)
910 |
911 |
912 | def save_model(model, file_name):
913 |     """Save model to (joblib) file
914 |
915 |     Serialize scHPF model as a joblib file. Joblib is similar to pickle,
916 |     but preferable for objects with many numpy arrays
917 |
918 |     Parameters
919 |     ----------
920 |     model : scHPF
921 |         The scHPF model object to save
922 |     file_name : str
923 |         Name of file to save model to
924 |     """
925 |     joblib.dump(model, file_name)
926 |
927 |
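# A brief usage sketch for model persistence (the file name is illustrative):
#
#     >>> model = scHPF(nfactors=7)
#     >>> model.fit(X)                           # X: preprocessed coo_matrix
#     >>> save_model(model, 'schpf_k7.joblib')
#     >>> model = load_model('schpf_k7.joblib')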
928 | def combine_across_cells(x, y, y_ixs):
929 |     """Combine theta & xi from two scHPF instances with the same beta & eta
930 |
931 |     Intended for combining variational distributions for local
932 |     variables (theta,xi) from training data with variational distributions
933 |     for local variables from validation or other data that was projected
934 |     onto the same global variational distributions (beta,eta)
935 |
936 |     If `x.bp` != `y.bp`, returned model `xy.bp` is set to None. All other
937 |     attributes (except for the merged xi and theta) are inherited from `x`.
938 |
939 |     Parameters
940 |     ----------
941 |     x : `scHPF`
942 |     y : `scHPF`
943 |         The scHPF instance whose rows in the output should be at the
944 |         corresponding indices `y_ixs`
945 |     y_ixs : ndarray
946 |         Row indices of `y` in the returned distributions. Must be 1-d and
947 |         have same number of rows as `y`, have no repeats, and have no index
948 |         greater than or equal to x.ncells + y.ncells.
949 |
950 |
951 |     Returns
952 |     -------
953 |     xy : `scHPF`
954 |         The combined model
955 |     """
956 |     assert x.dp == y.dp
957 |     assert x.eta == y.eta
958 |     assert x.beta == y.beta
959 |
960 |     xy = deepcopy(x)
961 |     if y.bp != x.bp:
962 |         xy.bp = None
963 |     xy.xi = x.xi.combine(y.xi, y_ixs)
964 |     xy.theta = x.theta.combine(y.theta, y_ixs)
965 |     return xy
966 |
967 |
968 | def run_trials(X, nfactors,
969 |         ntrials=5,
970 |         min_iter=30,
971 |         max_iter=1000,
972 |         check_freq=10,
973 |         epsilon=0.001,
974 |         better_than_n_ago=5,
975 |         dtype=np.float64,
976 |         verbose=True,
977 |         vcells = None,
978 |         vX = None,
979 |         loss_function=None,
980 |         model_kwargs = {},
981 |         return_all = False,
982 |         reproject = False,
983 |         reproject_kwargs = {},
984 |         batchsize=0,
985 |         beta_theta_simultaneous=False,
986 |         loss_smoothing=1
987 |         ):
988 |     """
989 |     Train with multiple random initializations, selecting model with best loss
990 |
991 |     As scHPF uses non-convex optimization, it benefits from training with
992 |     multiple random initializations to avoid local minima.
993 |
994 |     Parameters
995 |     ----------
996 |     X: coo_matrix
997 |         Data to fit
998 |     nfactors: int
999 |         Number of factors (K)
1000 |     ntrials : int, optional (Default 5)
1001 |         Number of random initializations for training
1002 |     min_iter: int, optional (Default 30)
1003 |         Minimum number of iterations for training.
1004 |     max_iter: int, optional (Default 1000)
1005 |         Maximum number of iterations for training.
1006 |     check_freq: int, optional (Default 10)
1007 |         Number of training iterations between calculating loss.
1008 |     epsilon: float, optional (Default 0.001)
1009 |         Percent change of loss for convergence.
1010 |     better_than_n_ago: int, optional (Default 5)
1011 |         Stop condition if loss is getting worse. Stops training if loss
1012 |         is worse than `better_than_n_ago`*`check_freq` training steps
1013 |         ago and getting worse.
1014 |     dtype : datatype, optional (Default np.float64)
1015 |         np.float64 or np.float32
1016 |     verbose: bool, optional (Default True)
1017 |         Print status messages during training
1018 |     vcells : coo_matrix, optional (Default None)
1019 |         cells to use in a validation loss function
1020 |     vX : coo_matrix, optional (Default None)
1021 |         held-out nonzero entries for the cells in `X` (same shape as `X`)
1022 |     loss_function : function, optional (Default None)
1023 |         A loss function to assess convergence that accepts data, model
1024 |         variational parameters, and model hyperparameters. Note this is
1025 |         distinct from the `loss_function` argument in scHPF._fit (called by
1026 |         scHPF.fit and scHPF.project), which assumes a fixed reference to data
1027 |         is included in the function and *does not* accept data as an argument.
1028 |     model_kwargs: dict, optional (Default {})
1029 |         dictionary of additional keyword arguments for model
1030 |         initialization
1031 |     return_all: bool, optional (Default False)
1032 |         return all models
1033 |     reproject: bool, optional (Default False)
1034 |         Reproject the data onto the frozen gene variables before calculating
1035 |         loss. The reprojected loss will be added to the end of loss as a
1036 |         sublist. Note that this reprojection will *not* use the `loss_function`
1037 |         argument, and instead use the default provided log likelihood
1038 |     reproject_kwargs: dict, optional (Default {})
1039 |         Only used if `reproject` is True. Keyword args for scHPF.project.
1040 |         'replace':True cannot be changed, and will be overwritten if given
1041 |     batchsize: int, optional (Default 0)
1042 |         Number of cells to use per training round. All cells used if 0.
1043 |     loss_smoothing: int, optional (Default: 1)
1044 |         Smooth loss up to `loss_smoothing` check frequencies ago. 1 results in
1045 |         no smoothing. Intended to be used with batching when assessing
1046 |         convergence based on training loss, where a good value might be
1047 |         int(ncells/n_batches)
1048 |
1049 |     Returns
1050 |     -------
1051 |     best_model: scHPF
1052 |         The model with the best loss after `ntrials` random initializations
1053 |         and training runs
1054 |     rejected_models: list, optional
1055 |         Rejected models, ordered by increasing loss. Only returned if
1056 |         return_all is True
1057 |     """
1058 |     ncells, ngenes = X.shape
1059 |     if ngenes >= 20000:
1060 |         msg = 'WARNING: you are running scHPF with {} genes,'.format(ngenes)
1061 |         msg += ' which is more than the ~20k protein coding genes in the'
1062 |         msg += ' human genome. We suggest running scHPF on protein-coding'
1063 |         msg += ' genes only.'
1064 |         print(msg)
1065 |
1066 |     # get the loss function for any data
1067 |     if loss_function is None:
1068 |         loss_function = partial(ls.mean_negative_pois_llh,
1069 |                 single_process=False)
1070 |
1071 |     # check data we're using for loss
1072 |     if vcells is not None:
1073 |         assert X.shape[1] == vcells.shape[1]
1074 |     if vX is not None:
1075 |         assert vX.shape == X.shape
1076 |     else:
1077 |         vX = X
1078 |     # setup loss fnc w/data (will be overridden if vcells is not None)
1079 |     data_loss_function = ls.loss_function_for_data(loss_function, vX)
1080 |
1081 |
1082 |     # run trials
1083 |     best_loss, best_model, best_t = np.finfo(np.float64).max, None, None
1084 |     models, losses = [], [] # only used if return_all
1085 |     for t in range(ntrials):
1086 |         # make a new model
1087 |         model = scHPF(nfactors=nfactors,
1088 |                 min_iter=min_iter, max_iter=max_iter,
1089 |                 check_freq=check_freq, epsilon=epsilon,
1090 |                 better_than_n_ago=better_than_n_ago,
1091 |                 verbose=verbose, dtype=dtype,
1092 |                 **model_kwargs
1093 |                 )
1094 |
1095 |         # override the loss function data if we have vcells
1096 |         # (must be redone for each new model)
1097 |         if vcells is not None:
1098 |             proj_kwargs = dict(reinit=False,
1099 |                     min_iter=1,
1100 |                     max_iter=min(10, check_freq),
1101 |                     check_freq=check_freq+1,
1102 |                     verbose=False
1103 |                     )
1104 |             data_loss_function = ls.projection_loss_function(
1105 |                     loss_function, vcells, nfactors,
1106 |                     proj_kwargs=proj_kwargs)
1107 |             def checkstep_function(**kwargs):
1108 |                 loss = ls.loss_function_for_data(loss_function, X)
1109 |                 print('\ttrain:', '{0:.6f}'.format(loss(**kwargs)))
1110 |         else:
1111 |             checkstep_function = None
1112 |
1113 |         # fit the model
1114 |         model.fit(X, loss_function=data_loss_function,
1115 |                 checkstep_function=checkstep_function,
1116 |                 batchsize=batchsize, loss_smoothing=loss_smoothing,
1117 |                 beta_theta_simultaneous=beta_theta_simultaneous)
1118 |         if reproject:
1119 |             print('Reprojecting data...')
1120 |             reproject_kwargs['replace'] = True
1121 |             reproject_kwargs['reinit'] = False
1122 |             proj_loss = model.project(X, **reproject_kwargs)
1123 |             model.loss.append(proj_loss)
1124 |             loss = proj_loss[-1]
1125 |         else:
1126 |             loss = model.loss[-1]
1127 |
1128 |         if loss < best_loss:
1129 |             best_model = model
1130 |             best_loss = loss
1131 |             best_t = t
1132 |             if verbose:
1133 |                 print('New best!')
1134 |         if return_all:
1135 |             models.append(model)
1136 |             losses.append(loss)
1137 |         if verbose:
1138 |             print('Trial {0} loss: {1:.6f}'.format(t, loss))
1139 |             print('Best loss: {0:.6f} (trial {1})'.format(best_loss, best_t))
1140 |
1141 |     if return_all:
1142 |         return_order = np.argsort(losses)
1143 |         ordered_models = [models[i] for i in return_order]
1144 |         assert ordered_models[0] == best_model
1145 |         return best_model, ordered_models[1:]
1146 |     else:
1147 |         return best_model
1148 |
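# A minimal end-to-end sketch for run_trials (file name and parameter values
# are illustrative; assumes a matrix prepared with schpf.preprocessing):
#
#     >>> from schpf import preprocessing as prep
#     >>> X, genes = prep.load_txt('expression.matrix.txt')
#     >>> best = run_trials(X, nfactors=7, ntrials=5)
#     >>> save_model(best, 'schpf_k7.joblib')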
1149 |
1150 | # TODO deal with verbosity
1151 | def run_trials_pool(X, nfactors,
1152 |         ntrials=5,
1153 |         njobs=0,
1154 |         max_threads=None,
1155 |         min_iter=30,
1156 |         max_iter=1000,
1157 |         check_freq=10,
1158 |         epsilon=0.001,
1159 |         better_than_n_ago=5,
1160 |         dtype=np.float64,
1161 |         verbose=True,
1162 |         vcells = None,
1163 |         vX = None,
1164 |         loss_function=None,
1165 |         model_kwargs = {},
1166 |         return_all = False,
1167 |         reproject = False,
1168 |         reproject_kwargs = {},
1169 |         batchsize=0,
1170 |         beta_theta_simultaneous=False,
1171 |         loss_smoothing=1
1172 |         ):
1173 |     """
1174 |     Train with multiple random initializations, selecting model with best loss.
1175 |     Parallelization is done at the trial level rather than within computations
1176 |
1177 |     As scHPF uses non-convex optimization, it benefits from training with
1178 |     multiple random initializations to avoid local minima.
1179 |
1180 |     Parameters
1181 |     ----------
1182 |     X: coo_matrix
1183 |         Data to fit
1184 |     nfactors: int or list of ints
1185 |         Number of factors (K), may be a list for multiple k
1186 |     ntrials : int, optional (Default 5)
1187 |         Number of random initializations for training
1188 |     njobs : int, optional (Default 0)
1189 |         Maximum number of threads in the threadpool. 0 will use all available.
1190 |     min_iter: int, optional (Default 30)
1191 |         Minimum number of iterations for training.
1192 |     max_iter: int, optional (Default 1000)
1193 |         Maximum number of iterations for training.
1194 |     check_freq: int, optional (Default 10)
1195 |         Number of training iterations between calculating loss.
1196 |     epsilon: float, optional (Default 0.001)
1197 |         Percent change of loss for convergence.
1198 |     better_than_n_ago: int, optional (Default 5)
1199 |         Stop condition if loss is getting worse. Stops training if loss
1200 |         is worse than `better_than_n_ago`*`check_freq` training steps
1201 |         ago and getting worse.
1202 |     dtype : datatype, optional (Default np.float64)
1203 |         np.float64 or np.float32
1204 |     verbose: bool, optional (Default True)
1205 |         Print status messages during training
1206 |     vcells : coo_matrix, optional (Default None)
1207 |         cells to use in a validation loss function
1208 |     vX : coo_matrix, optional (Default None)
1209 |         held-out nonzero entries for the cells in `X` (same shape as `X`)
1210 |     loss_function : function, optional (Default None)
1211 |         A loss function that accepts data, model variational parameters,
1212 |         and model hyperparameters. Note this is distinct from the
1213 |         `loss_function` argument in scHPF._fit (called by scHPF.fit and
1214 |         scHPF.project), which assumes a fixed reference to data is included
1215 |         in the function and *does not* accept data as an argument.
1216 |     model_kwargs: dict, optional (Default {})
1217 |         dictionary of additional keyword arguments for model
1218 |         initialization
1219 |     return_all: bool, optional (Default False)
1220 |         return all models
1221 |     reproject: bool, optional (Default False)
1222 |         Reproject the data onto the frozen gene variables before calculating
1223 |         loss. The reprojected loss will be added to the end of loss as a
1224 |         sublist. Note that this reprojection will *not* use the `loss_function`
1225 |         argument, and instead use the default provided log likelihood
1226 |     reproject_kwargs: dict, optional (Default {})
1227 |         Only used if `reproject` is True. Keyword args for scHPF.project.
1228 |         'replace':True cannot be changed, and will be overwritten if given
1229 |     batchsize: int, optional (Default 0)
1230 |         Number of cells to use per training round. All cells used if 0.
1231 |     loss_smoothing: int, optional (Default: 1)
1232 |         Smooth loss up to `loss_smoothing` check frequencies ago. 1 results in
1233 |         no smoothing. Intended to be used with batching when assessing
1234 |         convergence based on training loss, where a good value might be
1235 |         int(ncells/n_batches)
1236 |
1237 |
1238 |     Returns
1239 |     -------
1240 |     best_models: list(scHPF)
1241 |         The model with the best loss after `ntrials` random initializations
1242 |         and training runs for each value in nfactors
1243 |     rejected_models: list(list(scHPF)), optional
1244 |         Rejected models, ordered by corresponding nfactors and then by
1245 |         increasing loss. Only returned if return_all is True
1246 |     """
1247 |     ngenes = X.shape[1]
1248 |     if ngenes >= 20000:
1249 |         msg = 'WARNING: you are running scHPF with {} genes,'.format(ngenes)
1250 |         msg += ' which is more than the ~20k protein coding genes in the'
1251 |         msg += ' human genome. We suggest running scHPF on protein-coding'
1252 |         msg += ' genes only.'
1253 |         print(msg)
1254 |
1255 |     # get the loss function for any data
1256 |     if loss_function is None:
1257 |         loss_function = partial(ls.mean_negative_pois_llh,
1258 |                 single_process=True)
1259 |
1260 |     # check data we're using for loss
1261 |     if vcells is not None:
1262 |         assert X.shape[1] == vcells.shape[1]
1263 |     if vX is not None:
1264 |         assert vX.shape == X.shape
1265 |     else:
1266 |         vX = X
1267 |     # setup loss fnc w/data (will be overridden if vcells is not None)
1268 |     data_loss_function = ls.loss_function_for_data(loss_function, vX)
1269 |
1270 |     # only need to create once because will be copied to processes
1271 |     # override the loss function data if we have vcells
1272 |     # (must be redone for each new model)
1273 |     if vcells is not None:
1274 |         proj_kwargs = dict(reinit=False,
1275 |                 min_iter=1,
1276 |                 max_iter=min(10, check_freq),
1277 |                 check_freq=check_freq+1,
1278 |                 verbose=False
1279 |                 )
1280 |         data_loss_function = ls.projection_loss_function(
1281 |                 loss_function, vcells, nfactors,
1282 |                 proj_kwargs=proj_kwargs)
1283 |
1284 |
1285 |     # function to fit model
1286 |     def fit_model(nfactors):
1287 |         model = scHPF(nfactors=nfactors,
1288 |                 min_iter=min_iter, max_iter=max_iter,
1289 |                 check_freq=check_freq, epsilon=epsilon,
1290 |                 better_than_n_ago=better_than_n_ago,
1291 |                 verbose=False, dtype=dtype,
1292 |                 **model_kwargs
1293 |                 )
1294 |         # fit the model
1295 |         model.fit(X, loss_function=data_loss_function,
1296 |                 checkstep_function=None, single_process=True,
1297 |                 batchsize=batchsize, loss_smoothing=loss_smoothing)
1298 |         if reproject:
1299 |             # print('Reprojecting data...')
1300 |             reproject_kwargs['replace'] = True
1301 |             proj_loss = model.project(X, loss_function=data_loss_function,
1302 |                     **reproject_kwargs)
1303 |             model.loss.append(proj_loss)
1304 |         return model
1305 |
1306 |     # get nfactors for every trial
1307 |     if isinstance(nfactors, int):
1308 |         nfactors = [nfactors]
1309 |     trial_nfactors = [t for trial_set in [[K]*ntrials for K in nfactors] \
1310 |             for t in trial_set]
1311 |
1312 |     # set max processes if not given
1313 |     if njobs == 0: njobs = min(cpu_count(), len(trial_nfactors))
1314 |
1315 |     # training
1316 |     with Parallel(n_jobs=njobs, verbose=10) as pool: # make the pool
1317 |         candidates = pool( delayed(fit_model)(K) for K in trial_nfactors)
1318 |
1319 |     # get the best model for every K
1320 |     ordered_best, ordered_reject = [], []
1321 |     for i,K in enumerate(nfactors):
1322 |         my_candidates = candidates[i*ntrials : (i+1)*ntrials]
1323 |         loss = [m.loss[-1][-1] if reproject else m.loss[-1] for m in
1324 |                 my_candidates]
1325 |         # print(list(zip([m.nfactors for m in my_candidates],loss)))
1326 |         best_ix = np.argmin(loss)
1327 |         ordered_best.append(my_candidates[best_ix])
1328 |         ordered_reject.append([my_candidates[i] for i in np.argsort(loss)[1:]])
1329 |     if return_all:
1330 |         return ordered_best, ordered_reject
1331 |     else:
1332 |         return ordered_best
1333 |
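# A short usage sketch for run_trials_pool with several values of K (values
# are illustrative; one best model is returned per requested K):
#
#     >>> best_per_k = run_trials_pool(X, nfactors=[5, 7, 9], ntrials=5)
#     >>> [m.nfactors for m in best_per_k]
#     [5, 7, 9]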
-------------------------------------------------------------------------------- /schpf/util.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from collections import namedtuple
4 |
5 | import numpy as np
6 | from scipy.stats import hypergeom
7 | from scipy.sparse import csr_matrix
8 | import pandas as pd
9 |
10 |
11 | def mean_cellscore_fraction(cell_scores, ntop_factors=1):
12 |     """ Get the mean fraction of cells' total scores that falls on a
13 |     small number of factors
14 |
15 |     Parameters
16 |     ----------
17 |     cell_scores : ndarray
18 |         (ncells, nfactors) array of cell scores
19 |     ntop_factors : int, optional (Default: 1)
20 |         number of factors that can count towards dominance
21 |
22 |     Returns
23 |     -------
24 |     mean_cellscore_fraction : float
25 |         The mean fraction of cells' scores that are contained within
26 |         their top `ntop_factors` highest scoring factors
27 |
28 |     """
29 |     totals = np.sum(cell_scores, axis=1)
30 |     ntop_scores = np.sort(cell_scores,axis=1)[:, -ntop_factors:]
31 |     domsum = np.sum(ntop_scores, axis=1)
32 |     domfrac = domsum/totals
33 |     return np.mean(domfrac)
34 |
35 |
36 | def mean_cellscore_fraction_list(cell_scores):
37 |     """ Make a list of the mean dominant fraction at all possible numbers
38 |     of ntop_factors
39 |     """
40 |     return [mean_cellscore_fraction(cell_scores, i+1)
41 |             for i in range(cell_scores.shape[1])]
42 |
43 |
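# A small worked example (hypothetical scores for 2 cells x 3 factors):
#
#     >>> scores = np.array([[8., 1., 1.],
#     ...                    [2., 2., 6.]])
#     >>> mean_cellscore_fraction(scores, ntop_factors=1)
#     0.7   # mean of 8/10 and 6/10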
65 | """ 66 | tops = np.argsort(gene_scores, axis=0)[-ntop:] 67 | max_pairwise, last_max = 0, 0 68 | for i in range(tops.shape[1]): 69 | for j in range(tops.shape[1]): 70 | if i >= j: 71 | continue 72 | overlap = len(np.intersect1d(tops[:,i], tops[:,j])) 73 | if overlap > max_pairwise: 74 | last_max = max_pairwise 75 | max_pairwise = overlap 76 | elif overlap > last_max: 77 | last_max = overlap 78 | 79 | overlap = last_max if second_greatest else max_pairwise 80 | p = hypergeom.pmf(k=overlap, M=gene_scores.shape[0], 81 | N=ntop, n=ntop) \ 82 | + hypergeom.sf(k=overlap, M=gene_scores.shape[0], 83 | N=ntop, n=ntop) 84 | Overlap = namedtuple('Overlap', ['overlap', 'p']) 85 | return Overlap(overlap, p) 86 | 87 | 88 | def max_pairwise_table(gene_scores, ntop_list=[50,100,150,200,250,300]): 89 | """ Get the maximum pairwise overlap at 90 | 91 | Parameters 92 | ---------- 93 | gene_scores : ndarray 94 | (ngenes, nfactors) array of gene scores 95 | ntop_list : list, optional 96 | List of values of ntop to evaluate 97 | 98 | Returns 99 | ------- 100 | df : DataFrame 101 | """ 102 | max_overlap, p_max, max2_overlap, p_max2 = [],[],[],[] 103 | for ntop in ntop_list: 104 | o = max_pairwise(gene_scores, ntop, False) 105 | max_overlap.append( o.overlap ) 106 | p_max.append( o.p ) 107 | 108 | o2 = max_pairwise(gene_scores, ntop, True) 109 | max2_overlap.append( o2.overlap ) 110 | p_max2.append( o2.p ) 111 | df = pd.DataFrame({'ntop' : ntop_list, 'max_overlap' : max_overlap, 112 | 'p_max' : p_max, 'max2_overlap' : max2_overlap, 'p_max2' : p_max2}) 113 | return df 114 | 115 | 116 | def split_coo_rows(X, split_indices): 117 | """Split a coo matrix into two 118 | 119 | Parameters 120 | ---------- 121 | X : coo_matrix 122 | Matrix to split into two by row 123 | split_indices : ndarray 124 | Indices to use for the split. 125 | 126 | Returns 127 | ------- 128 | a : coo_matrix 129 | rows from X specified in split_indices 130 | b : coo_matrix 131 | rows from X *not* specified in split_indices 132 | 133 | """ 134 | a_indices = split_indices 135 | b_indices = np.setdiff1d(np.arange(X.shape[0]), split_indices) 136 | 137 | X_csr = X.tocsr() 138 | a = X_csr[a_indices, :].tocoo() 139 | b = X_csr[b_indices, :].tocoo() 140 | return a, b 141 | 142 | 143 | def collapse_coo_rows(coo): 144 | """Collapse the empty rows of a coo_matrix 145 | 146 | Parameters 147 | ---------- 148 | coo : coo_matrix 149 | Input coo_matrix which may have empty rows 150 | 151 | 152 | Returns 153 | ------- 154 | collapsed_coo : coo_matrix 155 | coo with row indices adjusted to removed empty rows 156 | collapsed_indices : ndarray 157 | Indices of the returned rows in the original input matrix 158 | """ 159 | nz_idx = np.where(coo.getnnz(1) > 0)[0] 160 | return coo.tocsr()[nz_idx].tocoo(), nz_idx 161 | 162 | 163 | def insert_coo_rows(a, b, b_indices): 164 | """Insert rows from b into a at specified row indeces 165 | 166 | Parameters 167 | ---------- 168 | a : sparse matrix 169 | b : sparse matrix 170 | b_indices : ndarray 171 | Indices in final matrix where b's rows should be. np.max(`b_indices`) 172 | must be a valid row index in the merged matrix with shape[0] = 173 | a.shape[0] + b.shape[0]. Must me ordered and unique. 
163 | def insert_coo_rows(a, b, b_indices):
164 |     """Insert rows from b into a at specified row indices
165 |
166 |     Parameters
167 |     ----------
168 |     a : sparse matrix
169 |     b : sparse matrix
170 |     b_indices : ndarray
171 |         Indices in final matrix where b's rows should be. np.max(`b_indices`)
172 |         must be a valid row index in the merged matrix with shape[0] =
173 |         a.shape[0] + b.shape[0]. Must be ordered and unique.
174 |
175 |     Returns
176 |     -------
177 |     ab : coo_matrix
178 |         coo_matrix with rows re-indexed to have rows from b
179 |     """
180 |     # check arguments
181 |     if a.shape[1] != b.shape[1]:
182 |         msg = 'a.shape[1] must equal b.shape[1], received a with shape'
183 |         msg += ' {} and b with shape {}'.format(a.shape, b.shape)
184 |         raise ValueError(msg)
185 |     if np.max(b_indices) >= a.shape[0] + b.shape[0]:
186 |         msg = 'Invalid row indices {} for array with '.format(b_indices)
187 |         msg += 'a.shape[0] + b.shape[0] = {} '.format(a.shape[0])
188 |         msg += '+ {} = {}'.format(b.shape[0], a.shape[0]+b.shape[0])
189 |         raise ValueError(msg)
190 |     if not np.all(np.diff(b_indices) > 0):
191 |         msg = '`b_indices` must be ordered without repeats. Received '
192 |         msg += '{}'.format(b_indices)
193 |         raise ValueError(msg)
194 |
195 |     out_shape = (a.shape[0] + b.shape[0], a.shape[1])
196 |     a = a.tocsr()
197 |     b = b.tocsr()
198 |
199 |     a_row, b_row = 0, 0
200 |     data, indices, indptr = [], [], [0]
201 |     for ab_row in range(out_shape[0]):
202 |         if b_row < len(b_indices) and ab_row == b_indices[b_row]:
203 |             my_row = b[b_row, :]
204 |             b_row += 1
205 |         else:
206 |             my_row = a[a_row, :]
207 |             a_row += 1
208 |         data.append(my_row.data)
209 |         indices.append(my_row.indices)
210 |         indptr.append(indptr[-1] + my_row.indptr[1])
211 |
212 |     ab = csr_matrix(
213 |             (np.hstack(data), np.hstack(indices), np.array(indptr)),
214 |             out_shape).tocoo()
215 |     return ab
216 |
217 |
218 | def minibatch_ix_generator(ncells, batchsize):
219 |     assert ncells >= batchsize # allow equality for testing
220 |     ixs = np.arange(ncells)
221 |     np.random.shuffle(ixs)
222 |     start = 0
223 |     while True:
224 |         stop = start + batchsize
225 |         if stop > ncells:
226 |             stop = stop % ncells
227 |             res = np.hstack([ixs[start:ncells], ixs[0:stop]])
228 |         else:
229 |             res = ixs[start:stop]
230 |         start = stop % ncells # need mod for case where ncells=batchsize
231 |         yield res
232 |
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [tool:pytest]
2 | filterwarnings =
3 |     ignore:.*matrix subclass is not the recommended.*:PendingDeprecationWarning
4 |     ignore:.*importing the ABCs from.*:DeprecationWarning
5 |     ignore:.*Could not select.*cells with given group_ids.*:UserWarning
6 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import find_packages, setup
4 |
5 | # get version from file
6 | __version__ = '0.0.0'
7 | exec(open('schpf/_version.py').read())
8 |
9 | requires = ['scikit-learn',
10 |             "numba >= 0.39, !=0.41, !=0.42, !=0.43; python_version<='3.7.3'",
11 |             "numba >= 0.44; python_version=='3.7.4'",
12 |             "numba >= 0.45; python_version>'3.7.4'",
13 |             'scipy >= 1.1',
14 |             'numpy',
15 |             'pandas',
16 |             'joblib'
17 |             ]
18 |
19 | tests_require = ['pytest']
20 | extras_require = {
21 |     'loompy' : ['loompy'],
22 |     'docs' : ['sphinx-argparse'],
23 | }
24 |
25 | setup(
26 |     name='scHPF',
27 |     version=__version__,
28 |     packages=find_packages(),
29 |     scripts=['bin/scHPF'],
30 |     python_requires='>=3.6',
31 |     install_requires=requires,
32 |     tests_require=tests_require,
33 |     extras_require=extras_require,
34 |     author = 'Hanna Mendes Levitin',
35 |     author_email = 'hml2134@columbia.edu',
36 |     description='Single-cell Hierarchical Poisson Factorization',
37 |     license="BSD",
38 |     url='https://www.github.com/simslab/scHPF',
39 | )
40 |
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/simslab/scHPF/aff30d674039359395cbee4ca4ddc85f3a5c8b56/tests/__init__.py
-------------------------------------------------------------------------------- /tests/_data/sample_blacklist.txt: --------------------------------------------------------------------------------
1 | ENSG00000130772.13 MED18
2 | ENSG00000142609.18 CFAP74
3 | ENSG00000125945.14 ZNF436
4 | ENSG00000158246.7 TENT5B
5 | ENSG00000189280.3 GJB5
6 | ENSG00000120948.17 TARDBP
7 | ENSG00000142733.15 MAP3K6
8 | ENSG00000157978.11 LDLRAP1
9 | ENSG00000116819.7 TFAP2E
10 | ENSG00000284733.1 OR4F29
11 | ENSG00000121766.15 ZCCHC17
12 | ENSG00000182330.10 PRAMEF8
13 | ENSG00000025800.13 KPNA6
14 | ENSG00000271741.1 AC114490.2
15 | ENSG00000157881.13 PANK4
16 | ENSG00000107404.19 DVL1
17 | ENSG00000078900.14 TP73
18 | ENSG00000116731.22 PRDM2
19 | ENSG00000070831.15 CDC42
20 | ENSG00000197921.5 HES5
21 |
-------------------------------------------------------------------------------- /tests/conftest.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | from scipy.sparse import coo_matrix
5 | import pytest
6 | from schpf import scHPF
7 |
8 | np.random.seed(42)
9 |
10 | N_CELLS, N_GENES, NZ_FRAC, N_FACTORS = (300, 1000, 0.03, 4)
11 | NNZ = int(N_CELLS * N_GENES * NZ_FRAC)
12 |
13 | # Fixtures
14 | @pytest.fixture()
15 | def data():
16 |     X_data = np.random.negative_binomial(2, 0.5, NNZ)
17 |     X_data[X_data==0] = 1
18 |     cell_ix = np.random.randint(0, N_CELLS, NNZ, dtype=np.int32)
19 |     gene_ix = np.random.randint(0, N_GENES, NNZ, dtype=np.int32)
20 |     X = coo_matrix(
21 |         (X_data, (cell_ix, gene_ix)),
22 |         (N_CELLS, N_GENES),
23 |         dtype=np.int32)
24 |     X.sum_duplicates()
25 |     return X
26 |
27 |
28 | # TODO make these actual unit tests by making distributions from scratch
29 | @pytest.fixture(params=[np.float64, np.float32])
30 | def model_uninit(request):
31 |     model = scHPF(N_FACTORS, dtype=request.param)
32 |     return model
33 |
34 |
35 | @pytest.fixture()
36 | def model(model_uninit, data):
37 |     model_uninit._initialize(data)
38 |     return model_uninit
-------------------------------------------------------------------------------- /tests/test_inference.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | from scipy.sparse import coo_matrix
5 | from scipy.special import logsumexp, digamma, gammaln
6 |
7 | import pytest
8 | from numpy.testing import assert_allclose
9 |
10 | from schpf import hpf_numba, scHPF
11 |
12 | # globals & seed
13 | np.random.seed(42)
14 |
15 | @pytest.fixture()
16 | def Xphi(data, model):
17 |     random_phi = np.random.dirichlet( np.ones(model.nfactors),
18 |             data.data.shape[0]).astype(model.dtype)
19 |     return data.data[:,None] * random_phi
20 |
21 |
22 | # Tests
23 |
24 | @pytest.mark.parametrize('x', [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000])
25 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
26 | def test_cython_digamma(x, dtype):
27 |     x = dtype(x)
28 |     # using approx_equal for float32 :(
29 |     assert_allclose(hpf_numba.psi(x), digamma(x))
30 |
31 |
32 | @pytest.mark.parametrize('x', [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000])
33 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
34 | def test_cython_gammaln(x, dtype):
35 |     x = dtype(x)
36 |     # using approx_equal for float32 :(
37 |     assert_allclose(hpf_numba.cgammaln(x), gammaln(x))
38 |
39 |
40 | def test_compute_Xphi_numba(data, model):
41 |     def compute_Xphi_numpy(X, theta, beta):
42 |         logrho = theta.e_logx[X.row, :] + beta.e_logx[X.col, :]
43 |         logphi = logrho - logsumexp(logrho, axis=1)[:,None]
44 |         return X.data[:,None] * np.exp(logphi)
45 |     Xphi = compute_Xphi_numpy(data, model.theta, model.beta)
46 |     # increase rtol for float32
47 |     assert_allclose(
48 |         hpf_numba.compute_Xphi_data(
49 |             data.data, data.row, data.col,
50 |             model.theta.vi_shape, model.theta.vi_rate,
51 |             model.beta.vi_shape, model.beta.vi_rate),
52 |         Xphi,
53 |         rtol=1e-5 if model.dtype==np.float32 else 1e-7, atol=0)
54 |     assert_allclose(
55 |         hpf_numba.compute_Xphi_data_numpy(data, model.theta, model.beta),
56 |         Xphi,
57 |         rtol=1e-5 if model.dtype==np.float32 else 1e-7, atol=0)
58 |
59 |
60 | def test_compute_theta_shape_numba(model, Xphi, data):
61 |     nfactors = model.nfactors
62 |     reference = np.zeros((model.ncells, nfactors), dtype=model.dtype)
63 |     for k in range(nfactors):
64 |         reference[:,k] = coo_matrix(
65 |             (Xphi[:, k], (data.row, data.col)),
66 |             (model.ncells, model.ngenes)
67 |             ).sum(1).A[:,0]
68 |     reference += model.a
69 |     assert_allclose(
70 |         hpf_numba.compute_loading_shape_update(
71 |             Xphi, data.row, model.ncells, model.a),
72 |         reference,
73 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
74 |
75 |
76 | def test_compute_beta_shape_numba(model, Xphi, data):
77 |     reference = np.zeros((model.ngenes, model.nfactors), dtype=model.dtype)
78 |     for k in range(model.nfactors):
79 |         reference[:,k] = coo_matrix(
80 |             (Xphi[:, k], (data.col, data.row)),
81 |             (model.ngenes, model.ncells)
82 |             ).sum(1).A[:,0]
83 |     reference += model.c
84 |     assert_allclose(
85 |         hpf_numba.compute_loading_shape_update(
86 |             Xphi, data.col, model.ngenes, model.c),
87 |         reference,
88 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
89 |
90 |
91 | def test_compute_theta_rate_numba(model):
92 |     reference = model.xi.e_x[:,None] + model.beta.e_x.sum(0)[None,:]
93 |     assert_allclose(
94 |         hpf_numba.compute_loading_rate_update(
95 |             model.xi.vi_shape, model.xi.vi_rate,
96 |             model.beta.vi_shape, model.beta.vi_rate),
97 |         reference
98 |         )
99 |
100 |
101 | def test_compute_eta_rate_numba(model):
102 |     reference = model.beta.e_x.sum(axis=1) + model.dp
103 |     assert_allclose(
104 |         hpf_numba.compute_capacity_rate_update(
105 |             model.beta.vi_shape, model.beta.vi_rate,
106 |             model.dp),
107 |         reference,
108 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
109 |
110 |
111 | def test_llh_pois(data, model):
112 |     e_rate = model.theta.e_x @ model.beta.e_x.T
113 |     desired = data.data * np.log(e_rate[data.row, data.col]) \
114 |             - e_rate[data.row, data.col] \
115 |             - gammaln(data.data + 1)
116 |     assert_allclose(
117 |         hpf_numba.compute_pois_llh(data.data, data.row, data.col,
118 |             model.theta.vi_shape, model.theta.vi_rate,
119 |             model.beta.vi_shape, model.beta.vi_rate),
120 |         desired,
121 |         rtol=1e-6 if model.dtype==np.float32 else 1e-7, atol=0)
122 |
123 |
124 |
-------------------------------------------------------------------------------- /tests/test_misc.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import schpf
4 |
5 | def test_version():
6 |     assert schpf.__version__ is not None
7 |     # assert schpf.__version__ == '0.2.5'
8 |
-------------------------------------------------------------------------------- /tests/test_preprocessing.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pathlib import Path
4 | import numpy as np
5 | import pandas as pd
6 | from scipy.sparse import coo_matrix
7 |
8 | import pytest
9 | from numpy.testing import assert_equal, assert_array_equal
10 |
11 | from schpf import preprocessing as prep
12 |
13 |
14 | TXT = str(Path(__file__).parent / \
15 |         Path('_data/PJ030merge.c300t400_g0t500.matrix.txt'))
16 | NCELLS = 100
17 | NGENES = 500
18 |
19 | # TODO figure out how to get this without going this far up tree or doubling
20 | # perhaps make a small copy?
21 | PROTEIN_CODING = str(
22 |     Path(*Path(__file__).parts[:-2]) / Path(
23 |         'resources/gencode.v29.annotation.gene_l1l2.pc_TRC_IGC.stripped.txt'))
24 | BLIST = str( Path(__file__).parent / Path('_data/sample_blacklist.txt') )
25 |
26 |
27 | @pytest.fixture()
28 | def protein_coding():
29 |     return pd.read_csv(PROTEIN_CODING, delim_whitespace=True, header=None)
30 |
31 |
32 | @pytest.fixture()
33 | def blacklist():
34 |     return pd.read_csv(BLIST, delim_whitespace=True, header=None)
35 |
36 |
37 | @pytest.fixture()
38 | def exp_genes():
39 |     return pd.read_csv(TXT, delim_whitespace=True, header=None)[[0,1]]
40 |
41 |
42 | @pytest.mark.parametrize('ngene_cols', [2,3])
43 | def test_load_txt(ngene_cols):
44 |     coo, genes = prep.load_txt(TXT, ngene_cols)
45 |     assert genes.shape[1] == ngene_cols
46 |     assert coo.shape[1] == NGENES
47 |     assert genes.shape[0] == NGENES
48 |     assert coo.shape[0] == NCELLS + 2 - ngene_cols
49 |
50 |
51 | # TODO add loom and test, also check that this works since passed shouldn't?
52 | def test_load_like(tmp_path):
53 |     gene_file = str(tmp_path / 'genes.txt')
54 |
55 |     # make a permutation
56 |     perm = np.random.choice(NGENES, NGENES-10, replace=False)
57 |
58 |     # load data to make reference and permute
59 |     umis, genes = prep.load_txt(TXT)
60 |     umis = umis.A[:, perm]
61 |     genes = genes.loc[perm]
62 |
63 |     # write permuted/subsampled reference file
64 |     genes.to_csv(gene_file, header=None, sep='\t', index=None)
65 |
66 |     # load like permuted reference
67 |     ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file)
68 |     assert_equal(len(ll_genes), len(perm))
69 |     assert_array_equal(umis, ll_umi.A)
70 |
71 |     # repeat with no_split_on_dot
72 |     ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file,
73 |             no_split_on_dot=True)
74 |     assert_equal(len(ll_genes), len(perm))
75 |     assert_array_equal(umis, ll_umi.A)
76 |
77 |     # by gene name
78 |     ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file,
79 |             by_gene_name=True)
80 |     assert_equal(len(ll_genes), len(perm))
81 |     assert_array_equal(umis, ll_umi.A)
82 |
83 |     # corrupt the permuted reference
84 |     bad_genes = genes.copy()
85 |     bad_genes.loc[5, 0] = 'random'
86 |     bad_genes.to_csv(gene_file, header=None, sep='\t', index=None)
87 |     with pytest.raises(ValueError):
88 |         ll_umi, ll_genes = prep.load_like(TXT, reference=gene_file)
89 |
90 |
91 | def test_min_cells_expressing(data):
92 |     ncells, ngenes = data.shape
93 |     # test all true when 0
94 |     min_cells = 0
95 |     assert_equal(prep.min_cells_expressing_mask(data, min_cells).sum(),
96 |                  ngenes)
97 |
98 |     # test all false when > ncells
99 |     min_cells = ncells + 1
100 |     assert_equal(prep.min_cells_expressing_mask(data, min_cells).sum(),
101 |                  0)
102 |     min_cells = 0.9999999
103 |     assert min_cells < 1
104 |     assert_equal(prep.min_cells_expressing_mask(data, min_cells).sum(),
105 |                  0)
106 |
107 |     # test for reasonable value
108 |     min_cells = 5
109 |     n_expressing = data.astype(bool).sum(axis=0).A[0, :]
110 |     mask = n_expressing >= min_cells
111 |     assert_array_equal(prep.min_cells_expressing_mask(data, min_cells),
112 |                        mask)
113 |     # test same for proportion
114 |     min_cells_prop = min_cells / ncells
115 |     assert_array_equal(prep.min_cells_expressing_mask(data, min_cells_prop),
116 |                        mask)
117 |
118 |
119 | def test_genelist_mask(protein_coding, exp_genes):
120 |     shared_ens = exp_genes[0].str.split('.').str[0].isin(
121 |             protein_coding[0].str.split('.').str[0])
122 |     shared_gene = exp_genes[1].isin(protein_coding[1])
123 |
124 |     # whitelist
125 |     assert_array_equal(prep.genelist_mask(exp_genes[0], protein_coding[0]),
126 |                        shared_ens)
127 |     assert_array_equal(prep.genelist_mask(exp_genes[1], protein_coding[1]),
128 |                        shared_gene)
129 |
130 |     # blacklist
131 |     assert_array_equal(prep.genelist_mask(exp_genes[0], protein_coding[0],
132 |                            whitelist=False),
133 |                        ~shared_ens)
134 |     assert_array_equal(prep.genelist_mask(exp_genes[1], protein_coding[1],
135 |                            whitelist=False),
136 |                        ~shared_gene)
137 |
138 |
139 | def test_subsample_cell_ixs():
140 |     # int for choices
141 |     assert_equal(len(prep.subsample_cell_ixs(20, 10)), 10)
142 |     # array of choices
143 |     assert_equal(len(prep.subsample_cell_ixs(np.arange(20), 10)), 10)
144 |
145 |     # test picks one from a group
146 |     group_ids = np.array([0] * 100 + [1,1])
147 |     idx = prep.subsample_cell_ixs(102, 10, group_ids=group_ids,
148 |             max_group_frac=0.5)
149 |     assert (100 in idx) ^ (101 in idx) #xor
150 |     assert_equal(len(idx), 10)
151 |
152 |     # test doesn't pick when can't under constraint
153 |     group_ids = np.array([0] * 18 + [1,1])
154 |     idx = prep.subsample_cell_ixs(20, 5, group_ids=group_ids,
155 |             max_group_frac=0.4)
156 |     assert (not 18 in idx) and (not 19 in idx) #neither of the group 1 indexes
157 |     assert_equal(len(idx), 5) # but still have 5 items
158 |
159 |
160 |     # test doesn't pick more than it can under constraint
161 |     group_ids = np.array([0] * 18 + [1,1])
162 |     idx = prep.subsample_cell_ixs(20, 5, group_ids=group_ids,
163 |             max_group_frac=0.25)
164 |     assert (not 18 in idx) and (not 19 in idx) #neither of the group 1 indexes
165 |     assert_equal(len(idx), 4) # should have floor(0.25*18) items
166 |     with pytest.warns(UserWarning) as record:
167 |         idx = prep.subsample_cell_ixs(20, 5, group_ids=group_ids,
168 |                 max_group_frac=0.25)
169 |     assert len(record) == 1
170 |
171 |
172 | def test_load_and_filter(protein_coding, blacklist):
173 |     filtered_m2, genes_m2 = prep.load_and_filter(TXT, min_cells=2,
174 |             whitelist=PROTEIN_CODING, blacklist=BLIST)
175 |     assert_equal(filtered_m2.shape[0], NCELLS)
176 |     assert filtered_m2.shape[1] <= NGENES
177 |     assert_equal(filtered_m2.shape[1], len(genes_m2))
178 |     assert_equal(genes_m2[0].str.split('.').str[0].isin(
179 |             blacklist[0].str.split('.').str[0]).sum(),
180 |         0)
181 |     assert_equal(genes_m2[0].str.split('.').str[0].isin(
182 |             protein_coding[0].str.split('.').str[0]).sum(),
183 |         len(genes_m2))
184 |
185 |     filtered_m5, genes_m5 = prep.load_and_filter(TXT, min_cells=5,
186 |             whitelist=PROTEIN_CODING, blacklist=BLIST)
187 |     assert filtered_m5.shape[1] <= filtered_m2.shape[1]
188 |     assert np.all(filtered_m5.astype(bool).sum(axis=0).A >= 5)
189 |
-------------------------------------------------------------------------------- /tests/test_scHPF_model.py: --------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/tests/test_scHPF_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | 
5 | import pytest
6 | from numpy.testing import assert_equal
7 | from numpy.testing import assert_array_equal
8 | 
9 | from schpf import HPF_Gamma, scHPF, combine_across_cells
10 | 
11 | """For tests of inference, see test_inference.py
12 | """
13 | 
14 | def test__setup_meanvar(model_uninit, data):
15 |     bp, dp, xi, eta, theta, beta = model_uninit._setup(X=data,
16 |             freeze_genes=False, reinit=True)
17 |     cell_sums = data.sum(axis=1)
18 |     gene_sums = data.sum(axis=0)
19 | 
20 |     # test hyperparams set to mean/var ratios
21 |     assert_equal(bp, np.mean(cell_sums) / np.var(cell_sums))
22 |     assert_equal(dp, np.mean(gene_sums) / np.var(gene_sums))
23 | 
24 | 
25 | def test__setup_dims(model_uninit, data):
26 |     bp, dp, xi, eta, theta, beta = model_uninit._setup(X=data,
27 |             freeze_genes=False, reinit=True)
28 | 
29 |     assert_equal(xi.vi_shape.shape[0], data.shape[0])
30 |     assert_equal(xi.vi_rate.shape[0], data.shape[0])
31 |     assert_equal(len(xi.vi_shape.shape), 1)
32 |     assert_equal(len(xi.vi_rate.shape), 1)
33 | 
34 |     assert_equal(eta.vi_shape.shape[0], data.shape[1])
35 |     assert_equal(eta.vi_rate.shape[0], data.shape[1])
36 |     assert_equal(len(eta.vi_shape.shape), 1)
37 |     assert_equal(len(eta.vi_rate.shape), 1)
38 | 
39 |     assert_equal(theta.vi_shape.shape[0], data.shape[0])
40 |     assert_equal(theta.vi_rate.shape[0], data.shape[0])
41 |     assert_equal(theta.vi_shape.shape[1], model_uninit.nfactors)
42 |     assert_equal(theta.vi_rate.shape[1], model_uninit.nfactors)
43 | 
44 |     assert_equal(beta.vi_shape.shape[0], data.shape[1])
45 |     assert_equal(beta.vi_rate.shape[0], data.shape[1])
46 |     assert_equal(beta.vi_shape.shape[1], model_uninit.nfactors)
47 |     assert_equal(beta.vi_rate.shape[1], model_uninit.nfactors)
48 | 
49 | 
50 | def test__setup_freeze(model, data):
51 |     my_data = data.tocsr()[:20].tocoo()
52 |     bp, dp, xi, eta, theta, beta = (model.bp, model.dp, model.xi,
53 |             model.eta, model.theta, model.beta)
54 | 
55 |     model.bp = None
56 |     bp2, dp2, xi2, eta2, theta2, beta2 = model._setup(X=my_data,
57 |             freeze_genes=True, reinit=True)
58 | 
59 |     # gene-side values (should be the same when genes are frozen)
60 |     assert_equal(dp2, dp)
61 |     assert_equal(eta2, eta)
62 |     assert_equal(beta2, beta)
63 | 
64 |     # cell-side values (should be reinitialized for the new data)
65 |     assert bp2 != bp
66 |     assert xi2.dims != xi.dims
67 |     assert theta2.dims != theta.dims
68 | 
69 |     # check bp not updated w/ freeze_genes if already set
70 |     model.bp = bp
71 |     bp3, _, _, _, _, _ = model._setup(X=my_data, freeze_genes=True, reinit=True)
72 |     print(bp, bp2, bp3)
73 |     assert bp3 == bp
74 |     assert bp3 != bp2
75 | 
76 | 
77 | def test__set_ac(model_uninit):
78 |     model_uninit.nfactors = None
79 |     with pytest.raises(ValueError):
80 |         model_uninit.a = -2
81 |     with pytest.raises(ValueError):
82 |         model_uninit.c = -2
83 | 
84 |     model_uninit.nfactors = 15
85 |     model_uninit.a = -2
86 |     assert model_uninit.a == 1/np.sqrt(15)
87 |     model_uninit.c = -2
88 |     assert model_uninit.c == 1/np.sqrt(15)
89 | 
90 | @pytest.mark.parametrize('a_dims', [[5,], [5,10]])
91 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
92 | def test_HPF_Gamma_combine(a_dims, dtype):
93 |     a_vi_shape = np.ones(a_dims, dtype=dtype)
94 |     a_vi_rate = np.ones(a_dims, dtype=dtype)
95 |     a = HPF_Gamma(a_vi_shape, a_vi_rate)
96 | 
97 |     b_dims = a_dims.copy()
98 |     b_dims[0] = 3
99 |     b_vi_shape = 2*np.ones(b_dims, dtype=dtype)
100 |     b_vi_rate = 2*np.ones(b_dims, dtype=dtype)
101 |     b = HPF_Gamma(b_vi_shape, b_vi_rate)
102 | 
103 |     b_ix = [0,5,7]
104 |     ab = a.combine(b, b_ix)
105 |     assert_equal(ab.dims[0], a.dims[0] + b.dims[0])
106 |     # check b rows landed at b_ix
107 |     assert_array_equal(ab.vi_shape[b_ix], b.vi_shape)
108 |     assert_array_equal(ab.vi_rate[b_ix], b.vi_rate)
109 |     # check a rows too
110 |     a_ix = np.setdiff1d(np.arange(ab.dims[0]), b_ix)
111 |     print(a_ix)
112 |     assert_array_equal(ab.vi_shape[a_ix], a.vi_shape)
113 |     assert_array_equal(ab.vi_rate[a_ix], a.vi_rate)
114 | 
115 |     b_ix = [4]  # too few indices
116 |     with pytest.raises(AssertionError):
117 |         ab = a.combine(b, b_ix)
118 | 
119 |     b_ix = [0,1,2,3]  # too many indices
120 |     with pytest.raises(AssertionError):
121 |         ab = a.combine(b, b_ix)
122 | 
123 | 
124 |     b_ix = [0,1,2,2]  # duplicate index
125 |     with pytest.raises(AssertionError):
126 |         ab = a.combine(b, b_ix)
127 | 
128 |     b_ix = [7,8,9]  # out of range for the combined distribution
129 |     with pytest.raises(AssertionError):
130 |         ab = a.combine(b, b_ix)
131 | 
132 | 
133 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
134 | def test_project(data, dtype):
135 |     # get b indices
136 |     b_idx = np.random.choice(data.shape[0], 10, replace=False)
137 |     # get remaining indices (for a)
138 |     a_idx = np.setdiff1d(np.arange(data.shape[0]), b_idx)
139 |     # split data
140 |     data_csr = data.tocsr()
141 |     a_data = data_csr[a_idx].tocoo()
142 |     b_data = data_csr[b_idx].tocoo()
143 | 
144 |     # setup model for a_data
145 |     a_model = scHPF(5, dtype=dtype)
146 |     a_model._initialize(a_data)
147 |     bp = a_model.bp
148 | 
149 |     # project b_data onto a_model
150 |     b_model = a_model.project(b_data)
151 |     # check genes frozen
152 |     assert_equal(b_model.eta, a_model.eta)
153 |     assert_equal(b_model.beta, a_model.beta)
154 |     # check cells different
155 |     assert_equal(a_model.ncells, a_data.shape[0])
156 |     assert_equal(b_model.ncells, b_data.shape[0])
157 |     # check bp unchanged
158 |     assert_equal(b_model.bp, bp)
159 | 
160 |     # check bp updates when we ask it to
161 |     c_model = a_model.project(b_data, recalc_bp=True)
162 |     assert c_model.bp != bp
163 | 
164 | 
165 | @pytest.mark.parametrize('dtype', [np.float64, np.float32])
166 | def test_combine_across_cells(data, dtype):
167 |     # get b indices
168 |     b_ixs = np.random.choice(data.shape[0], 10, replace=False)
169 |     # get a indices (remaining)
170 |     a_ixs = np.setdiff1d(np.arange(data.shape[0]), b_ixs)
171 |     # split data
172 |     data_csr = data.tocsr()
173 |     a_data = data_csr[a_ixs].tocoo()
174 |     b_data = data_csr[b_ixs].tocoo()
175 | 
176 |     # setup model for a_data
177 |     a = scHPF(5, dtype=dtype)
178 |     a._initialize(a_data)
179 |     # setup model for b_data w/ same dp, eta, beta
180 |     b = scHPF(5, dtype=dtype, dp=a.dp, eta=a.eta, beta=a.beta)
181 |     b._initialize(b_data, freeze_genes=True)
182 | 
183 |     ab = combine_across_cells(a, b, b_ixs)
184 | 
185 |     # check bp is None since it is different across the two models
186 |     assert_equal(ab.bp, None)
187 |     # check a locals where they should be in xi and theta
188 |     assert_array_equal(ab.xi.vi_shape[a_ixs], a.xi.vi_shape)
189 |     assert_array_equal(ab.xi.vi_rate[a_ixs], a.xi.vi_rate)
190 |     assert_array_equal(ab.theta.vi_shape[a_ixs], a.theta.vi_shape)
191 |     assert_array_equal(ab.theta.vi_rate[a_ixs], a.theta.vi_rate)
192 | 
193 |     # check b locals where they should be in xi and theta
194 |     assert_array_equal(ab.xi.vi_shape[b_ixs], b.xi.vi_shape)
195 |     assert_array_equal(ab.xi.vi_rate[b_ixs], b.xi.vi_rate)
196 |     assert_array_equal(ab.theta.vi_shape[b_ixs], b.theta.vi_shape)
197 |     assert_array_equal(ab.theta.vi_rate[b_ixs], b.theta.vi_rate)
198 | 
199 |     # check globals unchanged
200 |     assert_equal(ab.eta, a.eta)
201 |     assert_equal(ab.eta, b.eta)
202 |     assert_equal(ab.beta, a.beta)
203 |     assert_equal(ab.beta, b.beta)
204 | 
205 | 
206 | # TODO write this, also do for run_trials_pool
207 | # def test_run_trials(data):
208 | #     pass
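[Editor's note] A possible starting point for the TODO above, offered as a
hedged sketch rather than repository code: it assumes schpf exports a
run_trials(X, nfactors, ntrials=...) that returns the best trained model
(check the signature in scHPF_.py before adopting), and would need a
counterpart for run_trials_pool once its signature is confirmed.

    import schpf

    def test_run_trials_smoke(data):
        # skip rather than fail if this scHPF version lacks the export
        if not hasattr(schpf, 'run_trials'):
            pytest.skip('schpf.run_trials not available')
        # assumed signature: run_trials(X, nfactors, ntrials=...)
        best = schpf.run_trials(data, 5, ntrials=2)
        assert_equal(best.ncells, data.shape[0])
        assert_equal(best.ngenes, data.shape[1])
        assert_equal(best.theta.vi_shape.shape, (data.shape[0], 5))
        assert_equal(best.beta.vi_shape.shape, (data.shape[1], 5))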
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | from numpy.testing import assert_equal, assert_array_equal
5 | from scipy.sparse import coo_matrix
6 | import pytest
7 | 
8 | from schpf import max_pairwise, max_pairwise_table
9 | from schpf.util import split_coo_rows, collapse_coo_rows, insert_coo_rows
10 | from schpf.util import mean_cellscore_fraction
11 | 
12 | 
13 | def test_mean_cellscore_fraction():
14 |     X = np.array([
15 |         [10, 1, 1, 1, 1],
16 |         [1, 1, 1, 1, 1],
17 |         [10, 10, 1, 1, 1],
18 |         [1, 1, 1, 1, 10]])
19 |     assert_equal(mean_cellscore_fraction(X, 5), 1.0)
20 |     assert np.abs(mean_cellscore_fraction(X, 1) - 0.5) < 0.02
21 | 
22 | 
23 | def test_overlap():
24 |     X = np.array([
25 |         [0.33973751, 0.72029002, 0.52763837, 0.94012605, 0.20375346],
26 |         [0.32460224, 0.43595206, 0.8304655 , 0.31792094, 0.77330563],
27 |         [0.00507031, 0.42707696, 0.26948512, 0.50554657, 0.31438824],
28 |         [0.52583849, 0.54531833, 0.08530654, 0.35516516, 0.10617843],
29 |         [0.78608326, 0.59571929, 0.09737211, 0.09474643, 0.55319175],
30 |         [0.04245016, 0.43322226, 0.99748447, 0.45731582, 0.65861378],
31 |         [0.04364505, 0.97239799, 0.68847276, 0.96692073, 0.60268244],
32 |         [0.13364376, 0.40121588, 0.32770517, 0.02352124, 0.04974099],
33 |         [0.92531954, 0.23635494, 0.29327799, 0.40788107, 0.95974159],
34 |         [0.42295065, 0.5725946 , 0.59206089, 0.76534785, 0.77961214]])
35 |     assert_equal(max_pairwise(X, ntop=3)[0], 2)
36 |     assert_equal(max_pairwise(X, ntop=3, second_greatest=True)[0], 1)
37 | 
38 | 
39 | def test_overlap_table():
40 |     X = np.array([
41 |         [13, 11,  2, 13,  5, 12,  5],
42 |         [ 8,  6,  0,  6,  8, 14,  6],
43 |         [11, 13, 11, 11, 14,  8, 10],
44 |         [ 1, 12,  8, 14,  7,  1,  3],
45 |         [ 0,  1, 10, 12,  3,  5,  2],
46 |         [ 3,  2,  6,  5,  9,  2,  1],
47 |         [ 9,  3,  7,  2,  4,  3,  7],
48 |         [ 4,  4, 12,  7, 13, 10,  0],
49 |         [ 7,  0,  9,  8,  6,  6, 12],
50 |         [14, 14, 13,  9, 10,  9, 14],
51 |         [ 2,  9,  4, 10, 12,  7, 13],
52 |         [ 6, 10,  3,  1,  0, 11,  4],
53 |         [12,  8,  1,  0,  2, 13,  9],
54 |         [ 5,  7, 14,  3, 11,  4, 11],
55 |         [10,  5,  5,  4,  1,  0,  8]])
56 |     ntop_list = [1,2,3,4,5,6]
57 |     table = max_pairwise_table(X, ntop_list=ntop_list)
58 |     assert np.all(table.max_overlap >= table.max2_overlap)
59 |     assert np.any(table.max_overlap > table.max2_overlap)
60 |     assert np.all(np.diff(table.max_overlap.values) >= 0)
61 |     assert np.all(np.diff(table.max2_overlap.values) >= 0)
62 | 
63 | 
64 | def test_split_coo_rows():
65 |     row = np.array([0, 0, 2, 3, 3, 3])
66 |     col = np.array([0, 2, 2, 0, 1, 2])
67 |     data = np.array([1, 2, 3, 4, 5, 6])
68 |     X = coo_matrix((data, (row, col)))
69 | 
70 |     a, b = split_coo_rows(X, np.array([0,2,3]))
71 |     assert_equal(a.shape[0], 3)
72 |     assert_equal(a.shape[1], 3)
73 |     assert_equal(b.shape[0], 1)
74 |     assert_equal(b.shape[1], 3)
75 |     assert_array_equal(b.todense()[0,:], X.todense()[1,:])
76 | 
77 | 
78 | def test_collapse_coo_rows():
79 |     a_row = np.array([0, 0, 2, 3, 3, 3])
80 |     a_col = np.array([0, 2, 2, 0, 1, 2])
81 |     a_data = np.array([1, 2, 3, 4, 5, 6])
82 |     a = coo_matrix((a_data, (a_row, a_col)))
83 | 
84 |     collapsed, nz = collapse_coo_rows(a)
85 |     assert_equal(collapsed.shape[0], a.shape[0]-1)
86 |     assert_array_equal(nz, np.array([0,2,3]))
87 | 
88 | 
89 | def test_insert_coo_rows():
90 |     a_row = np.array([0, 0, 1, 2, 2, 2])
91 |     a_col = np.array([0, 2, 2, 0, 1, 2])
92 |     a_data = np.array([1, 2, 3, 4, 5, 6])
93 |     a = coo_matrix((a_data, (a_row, a_col)))
94 | 
95 |     b_row = np.array([0, 1, 1])
96 |     b_col = np.array([2, 1, 2])
97 |     b_data = np.array([11, 12, 13])
98 |     b = coo_matrix((b_data, (b_row, b_col)))
99 | 
100 |     b_indices = [0,1]
101 |     ab = insert_coo_rows(a, b, b_indices)
102 |     assert_equal(ab.shape[0], a.shape[0] + b.shape[0])
103 |     assert_array_equal(ab.todense()[0, :], b.todense()[0,:])
104 |     assert_array_equal(ab.todense()[1, :], b.todense()[1,:])
105 | 
106 |     b_indices = [1,4]
107 |     ab = insert_coo_rows(a, b, b_indices)
108 |     assert_equal(ab.shape[0], a.shape[0] + b.shape[0])
109 |     assert_array_equal(ab.todense()[0, :], a.todense()[0,:])
110 |     assert_array_equal(ab.todense()[1, :], b.todense()[0,:])
111 | 
112 |     with pytest.raises(ValueError) as execinfo:
113 |         b_indices = [1,4]
114 |         b = coo_matrix((b_data, (b_row, b_col)), shape=[3, 5])
115 |         insert_coo_rows(a, b, b_indices)
116 |     assert "a.shape[1] must equal b.shape[1]" in str(execinfo.value)
117 | 
118 |     with pytest.raises(ValueError) as execinfo:
119 |         b_indices = [1,7]
120 |         b = coo_matrix((b_data, (b_row, b_col)))
121 |         insert_coo_rows(a, b, b_indices)
122 |     assert "Invalid row indices" in str(execinfo.value)
123 | 
124 |     with pytest.raises(ValueError) as execinfo:
125 |         b_indices = [2,1]
126 |         insert_coo_rows(a, b, b_indices)
127 |     assert "must be ordered" in str(execinfo.value)
128 | 
129 |     with pytest.raises(ValueError) as execinfo:
130 |         b_indices = [1,1]
131 |         insert_coo_rows(a, b, b_indices)
132 |     assert "must be ordered" in str(execinfo.value)
133 | 
134 | 
--------------------------------------------------------------------------------