├── .gitattributes ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .readthedocs.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── build.py ├── cassiopeia ├── __init__.py ├── critique │ ├── __init__.py │ ├── compare.py │ └── critique_utilities.py ├── data │ ├── CassiopeiaTree.py │ ├── Layers.py │ ├── __init__.py │ └── utilities.py ├── mixins │ ├── __init__.py │ ├── errors.py │ ├── logging.py │ ├── utilities.py │ └── warnings.py ├── plotting │ ├── __init__.py │ ├── itol_utilities.py │ ├── local.py │ ├── local_3d.py │ ├── palettes.py │ └── utilities.py ├── preprocess │ ├── UMI_utils.py │ ├── __init__.py │ ├── alignment_utilities.py │ ├── cassiopeia_preprocess.py │ ├── collapse_cython.pyx │ ├── constants.py │ ├── doublet_utils.py │ ├── lineage_utils.py │ ├── map_utils.py │ ├── pipeline.py │ ├── setup_utilities.py │ └── utilities.py ├── simulator │ ├── BirthDeathFitnessSimulator.py │ ├── BrownianSpatialDataSimulator.py │ ├── Cas9LineageTracingDataSimulator.py │ ├── ClonalSpatialDataSimulator.py │ ├── CompleteBinarySimulator.py │ ├── DataSimulator.py │ ├── LeafSubsampler.py │ ├── LineageTracingDataSimulator.py │ ├── SequentialLineageTracingDataSimulator.py │ ├── SimpleFitSubcloneSimulator.py │ ├── SpatialDataSimulator.py │ ├── SpatialLeafSubsampler.py │ ├── SupercellularSampler.py │ ├── TreeSimulator.py │ ├── UniformLeafSubsampler.py │ ├── __init__.py │ └── ecDNABirthDeathSimulator.py ├── solver │ ├── CassiopeiaSolver.py │ ├── DistanceSolver.py │ ├── GreedySolver.py │ ├── HybridSolver.py │ ├── ILPSolver.py │ ├── MaxCutGreedySolver.py │ ├── MaxCutSolver.py │ ├── NeighborJoiningSolver.py │ ├── PercolationSolver.py │ ├── SharedMutationJoiningSolver.py │ ├── SpectralGreedySolver.py │ ├── SpectralNeighborJoiningSolver.py │ ├── SpectralSolver.py │ ├── UPGMASolver.py │ ├── VanillaGreedySolver.py │ ├── __init__.py │ ├── dissimilarity_functions.py │ ├── graph_utilities.py │ ├── ilp_solver_utilities.pyx │ ├── 
missing_data_methods.py │ └── solver_utilities.py └── tools │ ├── __init__.py │ ├── autocorrelation.py │ ├── branch_length_estimator │ ├── BranchLengthEstimator.py │ ├── IIDExponentialBayesian.py │ ├── IIDExponentialMLE.py │ ├── __init__.py │ ├── _iid_exponential_bayesian.pxd │ ├── _iid_exponential_bayesian.pyx │ ├── _iid_exponential_bayesian_cpp.cpp │ └── _iid_exponential_bayesian_cpp.h │ ├── coupling.py │ ├── fitness_estimator │ ├── _FitnessEstimator.py │ ├── __init__.py │ ├── _jungle │ │ ├── LICENSE │ │ ├── examples │ │ │ ├── FitnessScore.ipynb │ │ │ ├── SignaturesSelection.ipynb │ │ │ ├── Tree_neutral.nwk │ │ │ ├── Tree_positive_selection.nwk │ │ │ ├── node_features.tsv │ │ │ └── node_features_leaves.tsv │ │ ├── jungle │ │ │ ├── __init__.py │ │ │ ├── forest.py │ │ │ ├── forest.py.bak │ │ │ ├── resources │ │ │ │ ├── FitnessInference │ │ │ │ │ ├── .gitignore │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── README.md │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── infer_fitness.py │ │ │ │ │ ├── prediction_src │ │ │ │ │ │ ├── README │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── ancestral.py │ │ │ │ │ │ ├── fitness_inference.py │ │ │ │ │ │ ├── node_ranking.py │ │ │ │ │ │ ├── sequence_ranking.py │ │ │ │ │ │ ├── solve_survival.py │ │ │ │ │ │ └── tree_utils.py │ │ │ │ │ └── rank_sequences.py │ │ │ │ ├── __init__.py │ │ │ │ └── betatree │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── README │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── src │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── betatree.py │ │ │ │ │ ├── sfs.py │ │ │ │ │ ├── sfs.py.bak │ │ │ │ │ ├── sfs_py3.py │ │ │ │ │ └── sfs_py3.py.bak │ │ │ ├── sfs.py │ │ │ ├── sfs.py.bak │ │ │ ├── size_matched_model.py │ │ │ ├── tree.py │ │ │ └── tree.py.bak │ │ ├── reference_data │ │ │ ├── generate_annotate_forest.py │ │ │ └── generate_annotate_forest.py.bak │ │ └── tests │ │ │ ├── concat.ipynb │ │ │ ├── generate_annotate_forest.sh │ │ │ └── size_matched_model.ipynb │ └── _lbi_jungle.py │ ├── parameter_estimators.py │ ├── small_parsimony.py │ ├── topology.py │ └── 
tree_metrics.py ├── codecov.yml ├── conftest.py ├── data ├── PCT48.ref.fasta ├── ccphylo_config.ini ├── itolconfig_example ├── preprocess.cfg ├── preprocess_gestalt.cfg └── spatial_preprocess.cfg ├── docs ├── Makefile ├── _static │ ├── computer-24px.svg │ ├── css │ │ ├── override.css │ │ └── sphinx_gallery.css │ ├── library_books-24px.svg │ ├── logo.png │ ├── play_circle_outline-24px.svg │ ├── question-mark-svgrepo-com.svg │ └── tutorials │ │ ├── benchmark.png │ │ ├── benchmark.svg │ │ ├── local_plotting.png │ │ ├── preprocess.png │ │ ├── preprocess.svg │ │ ├── reconstruct.png │ │ └── reconstruct.svg ├── _templates │ ├── autosummary │ │ └── class.rst │ └── layout.html ├── api │ ├── critique.rst │ ├── data.rst │ ├── index.rst │ ├── plotting.rst │ ├── preprocess.rst │ ├── simulator.rst │ ├── solver.rst │ └── tools.rst ├── authors.rst ├── conf.py ├── contributing.rst ├── extensions │ └── typed_returns.py ├── index.rst ├── installation.rst ├── make.bat ├── notebooks ├── references.rst └── user_guide.rst ├── notebooks ├── benchmark.ipynb ├── data │ ├── 3432_NT_T1_alleletable.txt │ └── 3432_NT_T1_tree.processed.tree ├── local_plotting.ipynb ├── preprocess.ipynb ├── reconstruct.ipynb └── simulate_ecDNA.ipynb ├── pyproject.toml ├── setup.py ├── test ├── critique_tests │ └── compare_tree_test.py ├── data_tests │ ├── cassiopeia_tree_test.py │ ├── data_utilities_test.py │ └── layers_test.py ├── mixin_tests │ └── mixin_utilities_test.py ├── plotting_tests │ ├── itol_plotting_test.py │ ├── local_3d_test.py │ ├── local_test.py │ └── utilities_test.py ├── preprocess_tests │ ├── align_sequence_test.py │ ├── call_alleles_test.py │ ├── call_lineage_groups_test.py │ ├── character_matrix_test.py │ ├── collapse_umi_test.py │ ├── config_parser_test.py │ ├── convert_fastqs_to_unmapped_bam_test.py │ ├── error_correct_cellbcs_to_whitelist_test.py │ ├── error_correct_intbcs_to_whitelist_test.py │ ├── error_correct_umi_test.py │ ├── filter_bam_test.py │ ├── filter_molecule_table_test.py │ 
├── resolve_umi_sequence_test.py │ └── test_files │ │ ├── 10xv3_1.fastq.gz │ │ ├── 10xv3_2.fastq.gz │ │ ├── 10xv3_unmapped.bam │ │ ├── 10xv3_whitelist.txt │ │ ├── basic_grouping.csv │ │ ├── clustered_intbc.png │ │ ├── collapse_header_required.bam │ │ ├── collapse_header_required.collapsed.bam │ │ ├── doublet.csv │ │ ├── filter_and_reassign.csv │ │ ├── indropsv3_1.fastq.gz │ │ ├── indropsv3_2.fastq.gz │ │ ├── indropsv3_3.fastq.gz │ │ ├── intbc_whitelist.txt │ │ ├── lineageGrp_piv_heatmaps │ │ └── lg_1_piv_heatmap.png │ │ ├── reassign.csv │ │ ├── slideseq2_1.fastq.gz │ │ ├── slideseq2_2.fastq.gz │ │ ├── slideseq2_unmapped.bam │ │ ├── slideseq2_whitelist.txt │ │ ├── test.bam │ │ ├── test_sorted.bam │ │ ├── test_sorted.bayesian_collapsed.bam │ │ ├── test_sorted.collapsed.bam │ │ ├── test_sorted.collapsed.txt │ │ ├── test_uncorrected.bam │ │ ├── test_uncorrected_sorted.bam │ │ └── test_uncorrected_sorted.collapsed.bam ├── simulator_tests │ ├── birth_death_simulator_test.py │ ├── brownian_spatial_simulator_test.py │ ├── cas9_lineage_tracing_simulator_test.py │ ├── clonal_spatial_simulator_test.py │ ├── complete_binary_simulator_test.py │ ├── ecdna_birth_death_simulator_test.py │ ├── sequential_lineage_tracing_simulator_test.py │ ├── simple_fit_subclone_simulator_test.py │ ├── spatial_leaf_subsampler_test.py │ ├── supercellular_sampler_test.py │ └── unifom_leaf_subsampler_test.py ├── solver_tests │ ├── ccphylo_solver_test.py │ ├── dissimilarity_functions_test.py │ ├── greedy_variants_test.py │ ├── hybrid_solver_test.py │ ├── ilp_solver_test.py │ ├── maxcut_test.py │ ├── neighborjoining_solver_test.py │ ├── percolation_test.py │ ├── sharedmutationjoiner_test.py │ ├── snj_solver_test.py │ ├── spectral_test.py │ ├── upgma_test.py │ └── vanillagreedy_test.py └── tools_tests │ ├── autocorrelation_test.py │ ├── branch_length_estimator_tests │ ├── iid_exponential_bayesian_test.py │ └── iid_exponential_mle_test.py │ ├── coupling_test.py │ ├── fitness_estimator_tests │ └── 
lbi_jungle_test.py │ ├── parameter_estimators_test.py │ ├── small_parsimony_test.py │ ├── topology_test.py │ └── tree_metrics_test.py └── version.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py linguist-language=python 2 | *.ipynb linguist-documentation 3 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: cassiopeia 5 | 6 | on: 7 | push: 8 | branches: [master] 9 | pull_request: 10 | branches: [master] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | timeout-minutes: 20 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Cache pip 27 | uses: actions/cache@v2 28 | with: 29 | path: ~/.cache/pip 30 | key: ${{ runner.os }}-pip-v1-${{ hashFiles('**/requirements.txt') }} 31 | restore-keys: | 32 | ${{ runner.os }}-pip-v1- 33 | - name: Install dependencies 34 | run: | 35 | pip install pytest-cov 36 | pip install codecov 37 | pip install . 
38 | - name: Test with pytest 39 | run: | 40 | pytest -vv test/ --cov-report=xml --cov=cassiopeia 41 | - name: After success 42 | run: | 43 | codecov 44 | pip list 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to 
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152 | #.idea/ 153 | 154 | .DS_Store 155 | .idea 156 | *.pyc 157 | *.so 158 | _build 159 | _static 160 | _templates 161 | build 162 | *.egg-info 163 | *.c 164 | stdout.log 165 | notebooks/.ipynb_checkpoints 166 | cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.cpp 167 | docs/api/reference/** 168 | .vscode 169 | cassiopeia/config.ini 170 | environment.yml 171 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: "ubuntu-22.04" 4 | tools: 5 | python: "3.9" 6 | sphinx: 7 | configuration: docs/conf.py 8 | python: 9 | install: 10 | - method: pip 11 | path: . 12 | extra_requirements: 13 | - docs 14 | - spatial -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | * Matthew Jones 6 | * Alex Khodaverdian 7 | * Richard Zhang 8 | * Sebastian Prillo 9 | * Joseph Min 10 | * Jeffrey Quinn 11 | * Jeffrey Hussmann 12 | * Michelle Chan 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Your contributions are much appreciated! Feel free to contribute in one of these ways: 8 | 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/YosefLab/Cassiopeia/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 
23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 28 | wanted" is open to whoever wants to implement it. 29 | 30 | Ideally, when you are fixing a bug, please first provide a test that breaks 31 | due to the bug. Your contributed code should then fix this test. 32 | 33 | Implement Features 34 | ~~~~~~~~~~~~~~~~~~ 35 | 36 | Look through the GitHub issues for features. Anything tagged with "enhancement" 37 | and "help wanted" is open to whoever wants to implement it. 38 | 39 | Write Documentation 40 | ~~~~~~~~~~~~~~~~~~~ 41 | 42 | Cassiopeia could always use more documentation, whether as part of the 43 | official Cassiopeia docs, in docstrings, or even on the web in blog posts, 44 | articles, and such. 45 | 46 | Submit Feedback 47 | ~~~~~~~~~~~~~~~ 48 | 49 | The best way to send feedback is to file an issue at https://github.com/YosefLab/Cassiopeia/issues. 50 | 51 | If you are proposing a feature: 52 | 53 | * Explain in detail how it would work. 54 | * Keep the scope as narrow as possible, to make it easier to implement. 55 | * Remember that this is a volunteer-driven project, and that contributions 56 | are welcome 57 | 58 | Get Started! 59 | ------------ 60 | 61 | Ready to contribute? Here's how to set up `cassiopeia` for local development. 62 | 63 | 1. Fork the `cassiopeia` repo on GitHub. 64 | 2. Clone your fork locally:: 65 | 66 | $ git clone git@github.com:your_name_here/Cassiopeia.git 67 | 68 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 69 | 70 | $ mkvirtualenv cassiopeia 71 | $ cd cassiopeia/ 72 | $ make install 73 | 74 | 4. Create a branch for local development:: 75 | 76 | $ git checkout -b name-of-your-bugfix-or-feature 77 | 78 | Now you can make your changes locally. 79 | 80 | 5. 
At worst, it's stored in previous commits, from before it was commented out.
Check 116 | https://travis-ci.org/YosefLab/Cassiopeia/pull_requests 117 | and make sure that the tests pass for all supported Python versions. 118 | 119 | Deploying 120 | --------- 121 | 122 | A reminder for the maintainers on how to deploy. 123 | Make sure all your changes are committed (including an entry in HISTORY.rst). 124 | Then run:: 125 | 126 | $ bumpversion patch # possible: major / minor / patch 127 | $ git push 128 | $ git push --tags 129 | 130 | Travis will then deploy to PyPI if tests pass. 131 | 132 | Also, make sure you've tested your code using tox by running:: 133 | 134 | $ tox 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019, Matthew G Jones 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | recursive-include cassiopeia * 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL=bash 2 | python=python 3 | pip=pip 4 | tests=./test 5 | version:=$(shell $(python) version.py) 6 | sdist_name:=cassiopeia-$(version).tar.gz 7 | 8 | develop: 9 | $(pip) install -e . 10 | 11 | clean_develop: 12 | - $(pip) uninstall -y cassiopeia 13 | - rm -rf *.egg-info 14 | 15 | clean_sdist: 16 | - rm -rf dist 17 | 18 | clean: clean_develop clean_pypi 19 | 20 | install: 21 | - $(python) -m pip install . 22 | 23 | check_build_reqs: 24 | @$(python) -c 'import pytest' \ 25 | || ( printf "$(redpip)Build requirements are missing. Run 'make prepare' to install them.$(normal)" ; false ) 26 | 27 | test: check_build_reqs 28 | $(python) -m pytest -vv $(tests) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cassiopeia 2 | 3 | [![Stars](https://img.shields.io/github/stars/YosefLab/cassiopeia?logo=GitHub&color=yellow)](https://github.com/YosefLab/cassiopeia/stargazers) 4 | [![Documentation Status](https://readthedocs.org/projects/cassiopeia/badge/?version=latest)](https://cassiopeia.readthedocs.io/en/stable/?badge=stable) 5 | ![Build 6 | Status](https://github.com/YosefLab/cassiopeia/workflows/cassiopeia/badge.svg) 7 | [![Coverage](https://codecov.io/gh/YosefLab/cassiopeia/branch/master/graph/badge.svg)](https://codecov.io/gh/YosefLab/cassiopeia) 8 | 9 | Cassiopeia: A pipeline for single-cell lineage tracing data 10 | ============================================================= 11 | 12 | Cassiopeia is an end-to-end pipeline for 
You can also find our original paper describing Cassiopeia published in [Genome Biology](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02000-8).
To verify that it's working correctly, use the following tests: 46 | * Run the command ``gurobi.sh`` from a terminal window 47 | * From the Gurobi installation directory (where there is a setup.py file), use ``python setup.py install --user`` 48 | 49 | 4. [Optional] To use fast versions of Neighbor-Joining and UPGMA, install [CCPhylo](https://bitbucket.org/genomicepidemiology/ccphylo/src/master/). Then copy the file `./data/ccphylo_config.ini` to your `./cassiopeia` directory, rename it `config.ini` and set the Path variable to point to your CCPhylo installation. 50 | 51 | 5. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .` or `make install`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`. 52 | 53 | 6. [Optional] To use tools built for the analysis of spatial lineage tracing datasets, you can install Cassiopeia with `pip install .[spatial]`. Please note that we recommend using Python >= 3.9 for these analyses as some features might not be available otherwise, due to package dependencies (especially 3D visualization). 54 | 55 | To verify that it installed correctly, install `pytest` (`pip install pytest`) and try running our tests with `make test`. 56 | 57 | Reference 58 | ---------------------- 59 | 60 | If you've found Cassiopeia useful for your research, please consider citing our paper published in Genome Biology: 61 | 62 | 63 | Matthew G Jones*, Alex Khodaverdian*, Jeffrey J Quinn*, Michelle M Chan, Jeffrey A Hussmann, Robert Wang, Chenling Xu, Jonathan S Weissman, Nir Yosef. 
def build():
    """Compile Cassiopeia's Cython/C++ extension modules in place.

    Cythonizes the extension sources, runs the ``build_ext`` command on a
    throwaway distribution object, then copies every built artifact back
    next to its source and mirrors its read bits into execute bits.

    NOTE(review): this relies on ``distutils``, which was removed from the
    standard library in Python 3.12 — a migration to the ``setuptools``
    equivalents should be considered.
    """
    compiled_modules = cythonize(
        _extension_specs(),
        compiler_directives={"language_level": 3},
    )

    dist = Distribution({"name": "extended", "ext_modules": compiled_modules})
    dist.package_dir = "extended"

    builder = build_ext(dist)
    builder.ensure_finalized()
    builder.run()

    # Copy built extensions back into the source tree.
    for artifact in builder.get_outputs():
        _install_artifact(artifact, builder.build_lib)


def _extension_specs():
    """Return the Extension definitions for every compiled module."""
    cpp_flags = ["-std=c++17", "-Wall", "-Wextra", "-pedantic", "-O3"]
    bayesian_dir = "cassiopeia/tools/branch_length_estimator"
    return [
        Extension(
            "cassiopeia.preprocess.collapse_cython",
            ["cassiopeia/preprocess/collapse_cython.pyx"],
        ),
        Extension(
            "cassiopeia.solver.ilp_solver_utilities",
            ["cassiopeia/solver/ilp_solver_utilities.pyx"],
            include_dirs=[numpy.get_include()],
        ),
        Extension(
            "cassiopeia.tools.branch_length_estimator._iid_exponential_bayesian",
            sources=[
                f"{bayesian_dir}/_iid_exponential_bayesian.pyx",
                f"{bayesian_dir}/_iid_exponential_bayesian_cpp.cpp",
            ],
            extra_compile_args=cpp_flags,
            language="c++",
        ),
    ]


def _install_artifact(output, build_lib):
    """Copy one built extension beside its source, widening permissions.

    ``mode | ((mode & 0o444) >> 2)`` grants execute wherever read is set.
    """
    destination = os.path.relpath(output, build_lib)
    shutil.copyfile(output, destination)
    permissions = os.stat(destination).st_mode
    os.chmod(destination, permissions | ((permissions & 0o444) >> 2))


if __name__ == "__main__":
    build()
class Layers(dict):
    """Mapping of named character matrices attached to a CassiopeiaTree.

    Behaves like a dictionary keyed by layer name, where each value is a
    ``pd.DataFrame`` with one row per cell in the parent tree. Values are
    validated on insertion: their row count must match the parent tree's
    ``n_cell``. Inspired by AnnData's ``layers`` for count matrices.

    Note on implementation: entries live in the private ``self._data`` dict,
    not in the inherited ``dict`` storage. The inherited C-level
    ``keys``/``items``/``values``/``get``/``update`` would therefore always
    see an empty mapping (``dict.update`` also bypasses ``__setitem__`` on
    subclasses), so those methods are explicitly overridden below to
    delegate to ``self._data``. NOTE(review): other inherited dict methods
    (``pop``, ``setdefault``, ``clear``) still operate on the empty base
    mapping — extend the delegation if they are ever used.
    """

    # Name under which this mapping hangs off the tree; used in error text.
    attrname = "layers"

    # Declared for API parity with AnnData-style aligned mappings.
    parent_mapping: Mapping[str, pd.DataFrame]

    def __init__(
        self, parent: "CassiopeiaTree", layers: Optional[Mapping] = None
    ):
        """Initialize the layer mapping.

        Args:
            parent: The CassiopeiaTree this mapping belongs to; only its
                ``n_cell`` attribute is read here (for validation).
            layers: Optional initial mapping of name -> character matrix;
                each entry is validated via ``__setitem__``.
        """
        self._parent = parent
        self._data = dict()
        if layers is not None:
            self.update(layers)

    def __repr__(self):
        return f"{type(self).__name__} with keys: {', '.join(self.keys())}"

    def _ipython_key_completions_(self) -> List[str]:
        # Enables tab-completion of layer names in IPython/Jupyter.
        return list(self.keys())

    def copy(self) -> "Layers":
        """Return a Layers with the same parent and copied DataFrames."""
        d = Layers(self._parent)
        for k, v in self.items():
            d[k] = v.copy()
        return d

    def __getitem__(self, key: str) -> pd.DataFrame:
        return self._data[key]

    def __setitem__(self, key: str, value: pd.DataFrame):
        value = self._validate_value(value, key)
        self._data[key] = value

    def __delitem__(self, key: str):
        del self._data[key]

    def __contains__(self, key: str) -> bool:
        return key in self._data

    def __iter__(self) -> Iterator[str]:
        return iter(self._data)

    def __len__(self) -> int:
        return len(self._data)

    def keys(self):
        return self._data.keys()

    def values(self):
        return self._data.values()

    def items(self):
        return self._data.items()

    def get(self, key: str, default: Optional[pd.DataFrame] = None):
        return self._data.get(key, default)

    def update(self, other=(), **kwargs):
        """Insert entries from ``other``/``kwargs``, validating each one.

        Routes every insertion through ``__setitem__`` so that shape
        validation is applied (the inherited ``dict.update`` would skip it
        and write to the wrong storage).
        """
        if hasattr(other, "keys"):
            for k in other.keys():
                self[k] = other[k]
        else:
            for k, v in other:
                self[k] = v
        for k, v in kwargs.items():
            self[k] = v

    def _validate_value(self, val: pd.DataFrame, key: str) -> pd.DataFrame:
        """Checks passed value for correct structure.

        Raises:
            ValueError: If ``val`` does not have one row per cell of the
                parent tree.
        """
        if val.shape[0] != self._parent.n_cell:
            raise ValueError(
                f"Value passed for key {key!r} is of incorrect shape. "
                f"Values of {self.attrname} must have the same number of "
                f"samples as the tree. Value had {val.shape[0]} while it "
                f"should have had {self._parent.n_cell} samples."
            )
        return val
shape. " 73 | f"Values of {self.attrname} must have the same number of " 74 | f"samples as the tree. Value had {val.shape[0]} while it " 75 | f"should have had {self._parent.n_cell} samples." 76 | ) 77 | return val 78 | -------------------------------------------------------------------------------- /cassiopeia/data/__init__.py: -------------------------------------------------------------------------------- 1 | """Top level for data.""" 2 | 3 | from .CassiopeiaTree import CassiopeiaTree 4 | from .utilities import ( 5 | compute_dissimilarity_map, 6 | compute_inter_cluster_distances, 7 | compute_phylogenetic_weight_matrix, 8 | get_lca_characters, 9 | net_relatedness_index, 10 | sample_bootstrap_allele_tables, 11 | sample_bootstrap_character_matrices, 12 | to_newick, 13 | ) 14 | -------------------------------------------------------------------------------- /cassiopeia/mixins/__init__.py: -------------------------------------------------------------------------------- 1 | from .errors import * 2 | from .logging import logger 3 | from .utilities import * 4 | from .warnings import * 5 | -------------------------------------------------------------------------------- /cassiopeia/mixins/errors.py: -------------------------------------------------------------------------------- 1 | class AutocorrelationError(Exception): 2 | """An Exception for the tools.autocorrelation methods.""" 3 | 4 | pass 5 | 6 | 7 | class CassiopeiaError(Exception): 8 | """An general exception for the Cassiopeia software.""" 9 | 10 | pass 11 | 12 | 13 | class CassiopeiaTreeError(Exception): 14 | """An Exception class for the CassiopeiaTree class.""" 15 | 16 | pass 17 | 18 | 19 | class DataSimulatorError(Exception): 20 | """Generic error for the DataSimulator subclasses""" 21 | 22 | pass 23 | 24 | 25 | class DistanceSolverError(Exception): 26 | """An Exception class for all DistanceSolver subclasses.""" 27 | 28 | pass 29 | 30 | class ecDNABirthDeathSimulatorError(Exception): 31 | """An 
class ecDNABirthDeathSimulatorError(Exception):
    """An Exception class for the ecDNABirthDeathSimulator class."""

    pass


class FitchCountError(Exception):
    """An Exception class for FitchCount."""

    pass


class GreedySolverError(Exception):
    """An Exception class for all GreedySolver subclasses."""

    pass


class HybridSolverError(Exception):
    """An Exception class for all HybridSolver subclasses."""

    pass


class ILPSolverError(Exception):
    """An Exception class for all ILPError subclasses."""

    pass


class iTOLError(Exception):
    """An Exception class for the iTOL plotting utilities."""

    pass


class LeafSubsamplerError(Exception):
    """An Exception class for the LeafSubsampler class."""

    pass


class PreprocessError(Exception):
    """An Exception class for the preprocessing pipeline."""

    pass


class PriorTransformationError(Exception):
    """An Exception class for generating weights from priors."""

    pass


class SharedMutationJoiningSolverError(Exception):
    """An Exception class for SharedMutationJoiningSolver."""

    pass


class TreeSimulatorError(Exception):
    """An Exception class for all exceptions generated by
    TreeSimulator or a subclass of TreeSimulator
    """

    pass


class UnknownCigarStringError(Exception):
    """An Exception class for unparseable CIGAR strings during alignment."""

    pass


class UnspecifiedConfigParameterError(Exception):
    """An Exception class for missing pipeline configuration parameters."""

    pass


class BranchLengthEstimatorError(Exception):
    """An Exception class for the BranchLengthEstimator class."""

    pass


class IIDExponentialMLEError(BranchLengthEstimatorError):
    """An Exception class for the IIDExponentialMLE estimator."""

    pass


class TreeMetricError(Exception):
    """An Exception class for calculating tree metrics"""

    pass


class ParameterEstimateError(Exception):
    """An Exception class for the estimation and retrieval of tree parameters"""

    pass


class PlottingError(Exception):
    """An Exception class for the plotting utilities."""

    pass
import functools
import importlib
from types import ModuleType
from typing import Dict, List, Optional, Tuple, Union

import numpy as np


def is_ambiguous_state(state: Union[int, Tuple[int, ...]]) -> bool:
    """Determine whether the provided state is ambiguous.

    Note that this function operates on a single (indel) state.

    Args:
        state: Single, possibly ambiguous, character state

    Returns:
        True if the state is ambiguous, False otherwise.
    """
    # Ambiguity is encoded by packing the candidate states into a tuple.
    return isinstance(state, tuple)


def try_import(module: str) -> Optional[ModuleType]:
    """Helper function to import a possibly not-installed module.

    Args:
        module: Module to try and import

    Returns:
        The imported module, if the module exists, or None
    """
    try:
        return importlib.import_module(module)
    except ModuleNotFoundError:
        return None


def unravel_ambiguous_states(
    state_array: List[Union[int, Tuple[int, ...]]]
) -> List[int]:
    """Flatten a list of possibly-ambiguous states.

    Ambiguous (tuple-valued) states are expanded in place; order is
    preserved and duplicates are NOT removed. (The previous docstring
    promised "unique states", which the implementation never provided.)

    Args:
        state_array: A list of states, potentially containing ambiguous
            states.

    Returns:
        A flat list of all states contained in the input, in order.
    """
    # A flat comprehension replaces functools.reduce(lambda a, b: a + b, ...),
    # which was O(n^2) in total list copies and raised TypeError on an
    # empty input (reduce with no initializer).
    return [
        unraveled
        for state in state_array
        for unraveled in (state if is_ambiguous_state(state) else (state,))
    ]


def find_duplicate_groups(character_matrix) -> Dict[str, Tuple[str, ...]]:
    """Maps duplicated indices in character matrix to groups.

    Groups together samples in a character matrix if they have the same
    character states.

    Args:
        character_matrix: Character matrix, potentially with ambiguous
            states.

    Returns:
        A mapping of a single sample name to the set of samples that have
        the same character states.
    """
    # NOTE(review): this mutates the caller's DataFrame index name in place.
    character_matrix.index.name = "index"

    # convert to sets to support ambiguous states
    character_matrix_sets = character_matrix.copy()
    character_matrix_sets = character_matrix_sets.apply(
        lambda x: [
            set(s) if is_ambiguous_state(s) else set([s]) for s in x.values
        ],
        axis=0,
    ).apply(tuple, axis=1)
    # keep=False marks *every* member of a duplicated group, not just the
    # later occurrences.
    is_duplicated = character_matrix_sets.duplicated(keep=False)
    unique_states = np.unique(character_matrix_sets[is_duplicated])
    duplicate_groups = [
        character_matrix_sets[character_matrix_sets == val].index.values
        for val in unique_states
    ]
    # The first member of each group serves as the representative key.
    duplicate_mappings = {g[0]: tuple(g) for g in duplicate_groups}

    return duplicate_mappings
class ParameterEstimateWarning(UserWarning):
    """A warning class for the estimation and retrieval of tree parameters."""

    pass


class PlottingWarning(UserWarning):
    """A warning class for the plotting utilities."""

    pass


class LeafSubsamplerWarning(UserWarning):
    """A warning class for the LeafSubsampler class."""

    pass
# 102-color categorical palette ("godsnot"), vendored here so that scanpy
# does not have to be a dependency just for its color palette.
# https://github.com/scverse/scanpy/blob/master/scanpy/plotting/palettes.py
godsnot_102 = [
    "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6",
    "#A30059", "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762",
    "#004D43", "#8FB0FF", "#997D87", "#5A0007", "#809693", "#6A3A4C",
    "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", "#61615A",
    "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA",
    "#D16100", "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018",
    "#0AA6D8", "#013349", "#00846F", "#372101", "#FFB500", "#C2FFED",
    "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", "#00489C",
    "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1",
    "#788D66", "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459",
    "#456648", "#0086ED", "#886F4C", "#34362D", "#B4A8BD", "#00A6AA",
    "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", "#575329",
    "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1",
    "#1E6E00", "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C",
    "#772600", "#D790FF", "#9B9700", "#549E79", "#FFF69F", "#201625",
    "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", "#5B4534",
    "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72",
]
# Cython helpers for UMI collapsing: Hamming-distance computations and
# UMI error-correction used by the preprocessing pipeline.
cimport cython
import numpy as np

# Phred+33 offset: converts a raw quality threshold to its ASCII encoding.
cdef int OFFSET = 33

def hamming_distance(char* first, char* second):
    # Number of mismatched positions between two byte strings.
    # Iterates over len(first); assumes `second` is at least as long.
    cdef int i
    cdef int d = 0
    cdef int length = len(first)

    for i in range(length):
        if first[i] != second[i]:
            d += 1

    return d

@cython.boundscheck(False)
def hamming_distance_matrix(seqs):
    # Pairwise Hamming distances between equal-length sequences.
    # Returns an (n x n) integer matrix with only the upper triangle
    # filled; the lower triangle is left at 0.
    cdef int i, j, k, d, n, seq_length

    ints = np.array([list(s.encode()) for s in seqs])
    cdef long[:, ::1] ints_view = ints
    n, seq_length = ints.shape

    ds = np.zeros((n, n), int)
    cdef long[:, ::1] ds_view = ds

    for i in range(n):
        for j in range(i + 1, n):
            d = 0
            for k in range(seq_length):
                if ints_view[i, k] != ints_view[j, k]:
                    d += 1

            ds_view[i, j] = d

    return ds

@cython.boundscheck(False)
def register_corrections(long[:, ::1] ds, int max_UMI_distance, UMIs):
    # Build a {UMI -> corrected UMI} mapping from a pairwise distance
    # matrix. NOTE(review): index order in `UMIs` appears to encode
    # abundance (lower index = more common) — confirm callers sort this way.
    cdef int i, j, n
    n = len(ds)
    corrections = {}

    # Moving from least common to most common, register a correction
    # from a UMI to the most common UMI that is within Hamming distance
    # max_UMI_distance of it.
    for j in range(n - 1, -1, -1):
        for i in range(j - 1, -1, -1):
            if ds[i, j] <= max_UMI_distance:
                corrections[UMIs[j]] = UMIs[i]

    # If a correction points to a UMI that is itself going to be corrected,
    # propagate this correction through.
    for from_, to in list(corrections.items()):
        while to in corrections:
            to = corrections[to]

        corrections[from_] = to

    return corrections

def hq_hamming_distance(char* first_seq, char* second_seq, char* first_qual, char* second_qual, int min_q):
    # Hamming distance counting only positions where BOTH reads have an
    # ASCII-encoded (Phred+33) quality of at least min_q.
    cdef int i
    cdef int d = 0
    cdef int length = len(first_seq)
    cdef int floor = min_q + OFFSET

    for i in range(length):
        if (first_seq[i] != second_seq[i]) and (first_qual[i] >= floor) and (second_qual[i] >= floor):
            d += 1

    return d

def hq_mismatches_from_seed(char* seed, char* seq, char[:] qual, int min_q):
    # Mismatches vs. a seed sequence at positions with quality >= min_q.
    # NOTE(review): unlike hq_hamming_distance, no OFFSET is added here —
    # `qual` appears to hold numeric quality scores rather than ASCII
    # characters; confirm against callers before unifying.
    cdef int i
    cdef int d = 0
    cdef int length = len(seq)
    cdef int floor = min_q

    for i in range(length):
        if (seq[i] != seed[i]) and (qual[i] >= floor):
            d += 1

    return d
"""
Stores constants for the ProcessingPipeline module
"""

# BAM tag names and Phred-quality thresholds used when reading/writing
# sequencing data during preprocessing.
BAM_CONSTANTS = {
    "RAW_CELL_BC_TAG": "CR",
    "RAW_CELL_BC_QUALITY_TAG": "CY",
    "CELL_BC_TAG": "CB",
    "UMI_TAG": "UR",
    "UMI_QUALITY_TAG": "UY",
    "NUM_READS_TAG": "ZR",
    "CLUSTER_ID_TAG": "ZC",
    "N_Q": 2,
    "HIGH_Q": 31,
    "LOW_Q": 10,
}

# (sequence tag, quality tag) pairs keyed by the feature they encode.
SINGLE_CELL_BAM_TAGS = {
    "umi": (BAM_CONSTANTS["UMI_TAG"], BAM_CONSTANTS["UMI_QUALITY_TAG"]),
    "cell_barcode": (
        BAM_CONSTANTS["RAW_CELL_BC_TAG"],
        BAM_CONSTANTS["RAW_CELL_BC_QUALITY_TAG"],
    ),
}
SPATIAL_BAM_TAGS = {
    "umi": (BAM_CONSTANTS["UMI_TAG"], BAM_CONSTANTS["UMI_QUALITY_TAG"]),
    "spot_barcode": (
        BAM_CONSTANTS["RAW_CELL_BC_TAG"],
        BAM_CONSTANTS["RAW_CELL_BC_QUALITY_TAG"],
    ),
}
# Maps a sequencing-chemistry name to the tag set appropriate for it.
CHEMISTRY_BAM_TAGS = {
    "dropseq": SINGLE_CELL_BAM_TAGS,
    "10xv2": SINGLE_CELL_BAM_TAGS,
    "10xv3": SINGLE_CELL_BAM_TAGS,
    "indropsv3": SINGLE_CELL_BAM_TAGS,
    "slideseq2": SPATIAL_BAM_TAGS,
}


# Match/mismatch scores for DNA alignment; "Z" and "N" are scored neutrally.
DNA_SUBSTITUTION_MATRIX = {
    "A": {"A": 5, "T": -4, "C": -4, "G": -4, "Z": 0, "N": 0},
    "T": {"A": -4, "T": 5, "C": -4, "G": -4, "Z": 0, "N": 0},
    "C": {"A": -4, "T": -4, "C": 5, "G": -4, "Z": 0, "N": 0},
    "G": {"A": -4, "T": -4, "C": -4, "G": 5, "Z": 0, "N": 0},
    "Z": {"A": 0, "T": 0, "C": 0, "G": 0, "Z": 0, "N": 0},
    "N": {"A": 0, "T": 0, "C": 0, "G": 0, "Z": 0, "N": 0},
}

# Default parameters for each pipeline stage, keyed by stage name.
# NOTE(review): some string values carry embedded quotes (e.g. "'convert'",
# "'local'") — presumably they are written verbatim into a generated config
# file; confirm before normalizing them.
DEFAULT_PIPELINE_PARAMETERS = {
    "general": {
        "entry": "'convert'",
        "exit": "'call_lineages'",
        "verbose": False,
    },
    "convert": {},
    "filter_bam": {"quality_threshold": 10},
    "error_correct_cellbcs_to_whitelist": {},
    "collapse": {"max_hq_mismatches": 3, "max_indels": 2},
    "resolve": {
        "min_avg_reads_per_umi": 2.0,
        "min_umi_per_cell": 10,
        "plot": True,
    },
    "align": {
        "gap_open_penalty": 20,
        "gap_extend_penalty": 1,
        "method": "'local'",
    },
    "call_alleles": {
        "barcode_interval": (20, 34),
        "cutsite_locations": [112, 166, 220],
        "cutsite_width": 12,
        "context": True,
        "context_size": 5,
    },
    "error_correct_intbcs_to_whitelist": {"intbc_dist_thresh": 1},
    "error_correct_umis": {"max_umi_distance": 2},
    "filter_molecule_table": {
        "min_umi_per_cell": 10,
        "min_avg_reads_per_umi": 2.0,
        "min_reads_per_umi": -1,
        "intbc_prop_thresh": 0.5,
        "intbc_umi_thresh": 10,
        "intbc_dist_thresh": 1,
        "doublet_threshold": 0.35,
        "plot": True,
    },
    "call_lineages": {
        "min_umi_per_cell": 10,
        "min_avg_reads_per_umi": 2.0,
        "min_cluster_prop": 0.005,
        "min_intbc_thresh": 0.05,
        "inter_doublet_threshold": 0.35,
        "kinship_thresh": 0.25,
        "plot": True,
    },
}
@utilities.log_molecule_table
def map_intbcs(molecule_table: pd.DataFrame) -> pd.DataFrame:
    """Assign one allele to each intBC/cellBC pair.

    For each intBC/cellBC pairing, selects the most frequent allele (by UMI
    count, with ties broken by read count) and removes alignments that do
    not have that allele.

    NOTE(review): the original docstring and comments claimed read count was
    the primary criterion, but the sort keys below are ["UMI", "readCount"],
    i.e. UMI-count-major. Documentation here matches the code — confirm the
    intended priority.

    Args:
        molecule_table: A molecule table of cellBC-UMI pairs to be filtered

    Returns:
        An allele table with one allele per cellBC-intBC pair
    """

    # Have to drop out all intBCs that are NaN
    molecule_table = molecule_table.dropna(subset=["intBC"])

    # For each cellBC-intBC pair, select the allele that has the highest
    # UMI count; on ties, use summed read count (see NOTE above).
    allele_table = (
        molecule_table.groupby(["cellBC", "intBC", "allele"])
        .agg({"readCount": "sum", "UMI": "count"})
        .reset_index()
        .sort_values(["UMI", "readCount"], ascending=False)
    )
    # After the descending sort, the first occurrence of each
    # (cellBC, intBC) key is the winning allele; `duplicated` marks all
    # later (losing) rows.
    duplicated_mask = allele_table.duplicated(["cellBC", "intBC"])
    mapped_alleles = set(
        allele_table[~duplicated_mask][
            ["cellBC", "intBC", "allele"]
        ].itertuples(index=False, name=None)
    )

    # True for rows that contain the mapped allele; False for ones to filter out
    selection_mask = (
        molecule_table[["cellBC", "intBC", "allele"]]
        .apply(tuple, axis=1)
        .isin(mapped_alleles)
    )

    mapped_table = molecule_table[selection_mask]
    logger.debug(f"Alleles removed: {duplicated_mask.sum()}")
    logger.debug(f"UMIs removed: {(~selection_mask).sum()}")
    return mapped_table
5 | """ 6 | from typing import Generator, Optional 7 | 8 | import networkx as nx 9 | import numpy as np 10 | 11 | from cassiopeia.data.CassiopeiaTree import CassiopeiaTree 12 | from cassiopeia.mixins import TreeSimulatorError 13 | from cassiopeia.simulator.TreeSimulator import TreeSimulator 14 | 15 | 16 | class CompleteBinarySimulator(TreeSimulator): 17 | """Simulate a complete binary tree. 18 | 19 | Internally, this class uses :func:`nx.balanced_tree` to generate a 20 | perfectly balanced binary tree of specified size. Only one of ``num_cells`` 21 | or ``depth`` should be provided. All branches have equal length that is 22 | normalized by the height of the tree (i.e. the tree has height 1). 23 | 24 | Args: 25 | num_cells: Number of cells to simulate. Needs to be a power of 2. The 26 | depth of the tree will be `log2(num_cells)`. 27 | depth: Depth of the tree. The number of cells will be `2^depth`. 28 | 29 | Raises: 30 | TreeSimulatorError if neither or both ``num_cells`` or ``depth`` are 31 | provided, if ``num_cells`` is not a power of 2, or if the calculated 32 | depth is not greater than 0. 33 | """ 34 | 35 | def __init__( 36 | self, num_cells: Optional[int] = None, depth: Optional[int] = None 37 | ): 38 | if (num_cells is None) == (depth is None): 39 | raise TreeSimulatorError( 40 | "One of `num_cells` or `depth` must be provided." 41 | ) 42 | if num_cells is not None: 43 | log2_num_cells = np.log2(num_cells) 44 | if log2_num_cells != int(log2_num_cells): 45 | raise TreeSimulatorError("`num_cells` must be a power of 2.") 46 | depth = int(log2_num_cells) 47 | if depth <= 0: 48 | raise TreeSimulatorError("`depth` must be grater than 0.") 49 | self.depth = depth 50 | 51 | def simulate_tree( 52 | self, 53 | ) -> CassiopeiaTree: 54 | """Simulates a complete binary tree. 
55 | 56 | Returns: 57 | A CassiopeiaTree with the tree topology initialized with the 58 | simulated tree 59 | """ 60 | 61 | def node_name_generator() -> Generator[str, None, None]: 62 | """Generates unique node names for the tree.""" 63 | i = 0 64 | while True: 65 | yield str(i) 66 | i += 1 67 | 68 | names = node_name_generator() 69 | 70 | tree = nx.balanced_tree(2, self.depth, create_using=nx.DiGraph) 71 | mapping = {"root": next(names)} 72 | mapping.update({node: next(names) for node in tree.nodes}) 73 | # Add root, which indicates the initiating cell 74 | tree.add_edge("root", 0) 75 | nx.relabel_nodes(tree, mapping, copy=False) 76 | cassiopeia_tree = CassiopeiaTree(tree=tree) 77 | 78 | # Initialize branch lengths 79 | time_dict = { 80 | node: cassiopeia_tree.get_time(node) / (self.depth + 1) 81 | for node in cassiopeia_tree.nodes 82 | } 83 | cassiopeia_tree.set_times(time_dict) 84 | return cassiopeia_tree 85 | -------------------------------------------------------------------------------- /cassiopeia/simulator/DataSimulator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Abstract class DataSimulator, for overlaying data onto a CassiopeiaTree. 3 | 4 | All data simulators are derived classes of this abstract class, and at a minimum 5 | implement a method called `overlay_data`. 6 | """ 7 | import abc 8 | 9 | from cassiopeia.data import CassiopeiaTree 10 | from cassiopeia.mixins import DataSimulatorError 11 | 12 | 13 | class DataSimulator(abc.ABC): 14 | """ 15 | DataSimulator is an abstract class that all data overlayers derive from. 16 | 17 | A DataSimulator is very generic and meant to give users the flexibility to 18 | overlay any kind of data onto the tree using this single API. The prime 19 | example of data a user might want to overlay on a tree is lineage tracing 20 | data, for which there is a specific subclass LineageTracingDataSimulator. 
"""
Abstract class LeafSubsampler. Samples the leaves of CassiopeiaTrees and
generates a tree that keeps only the lineages pertaining to the sample.

All leaf subsamplers are derived classes of this abstract class, and at a
minimum implement a method called `subsample_leaves`.
"""

import abc
# NOTE(review): networkx and numpy are imported but unused in this module;
# presumably kept for subclasses — confirm before removing.
import networkx as nx
import numpy as np
from typing import Optional

from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import (
    LeafSubsamplerError,
    LeafSubsamplerWarning,
)


class LeafSubsampler(abc.ABC):
    """
    Abstract base class for all leaf samplers.

    A LeafSubsampler implements a method 'subsample_leaves' which, given a
    tree, generates a sample of the observed leaves in that tree and returns
    a new tree which is the induced subtree (tree containing only lineages
    that contain a sampled leaf) of the original tree on that sample.
    """

    @abc.abstractmethod
    def subsample_leaves(self, tree: CassiopeiaTree) -> CassiopeiaTree:
        """
        Subsamples the leaves of a CassiopeiaTree.

        Returns a new CassiopeiaTree which is the result of subsampling the
        leaves in the original CassiopeiaTree and removing ancestral nodes
        no longer relevant to the sample. All fields on the original
        character matrix persist, but maintains character states, meta data,
        and the dissimilarity map for the sampled cells only.

        Args:
            tree: The CassiopeiaTree for which to subsample leaves

        Returns:
            A new CassiopeiaTree that is the induced subtree on a sample of
            the leaves in the given tree.
        """
"""
This file stores an abstract subclass of DataSimulator, the
LineageTracingDataSimulator. A LineageTracingDataSimulator overlays lineage
tracing data onto a CassiopeiaTree, i.e. it sets the character states of a
CassiopeiaTree (in particular, the character matrix).
"""
import abc

from cassiopeia.data import CassiopeiaTree
from cassiopeia.simulator.DataSimulator import DataSimulator


class LineageTracingDataSimulator(DataSimulator):
    """
    LineageTracingDataSimulator is an abstract class that all lineage
    tracing data simulators derive from.

    A LineageTracingDataSimulator is useful for simulating lineage tracing
    assays in silico, allowing us to explore the utility of lineage tracing
    technologies such as base editors, GESTALT, etc. for recovering the
    ground truth cell phylogeny. In a typical simulation pipeline, a
    LineageTracingDataSimulator is used to overlay lineage tracing data on a
    CassiopeiaTree, and then a CassiopeiaSolver is used to reconstruct the
    tree topology.

    As a result, LineageTracingDataSimulators allow us to study the impact
    of different aspects of the lineage tracing assay - such as number of
    barcodes, mutation rates, etc. - on our ability to recover the ground
    truth phylogeny.
    """

    @abc.abstractmethod
    def overlay_data(self, tree: CassiopeiaTree) -> None:
        """
        Overlay lineage tracing data onto the CassiopeiaTree (in-place).

        This sets the character states of all nodes in the tree, as well
        as the character matrix. The tree is expected to have its topology
        initialized, as well as meaningful branch lengths.

        Args:
            tree: the CassiopeiaTree to overlay the lineage tracing data on.
                The tree topology must be initialized.
        """
"""
This file stores an abstract subclass of DataSimulator, the
SpatialDataSimulator. A SpatialDataSimulator overlays spatial data onto a
CassiopeiaTree, i.e. it sets the spatial coordinates of a CassiopeiaTree
(in particular, as attributes of the nodes of the tree and the cell meta).
"""
import abc

from cassiopeia.data import CassiopeiaTree
from cassiopeia.simulator.DataSimulator import DataSimulator


class SpatialDataSimulator(DataSimulator):
    """
    SpatialDataSimulator is an abstract class that all spatial data
    simulators derive from.

    A SpatialDataSimulator is useful for simulating spatial assays in
    silico. In a typical simulation pipeline, a SpatialDataSimulator is used
    to overlay spatial coordinates on a CassiopeiaTree, and then a
    CassiopeiaSolver is used to reconstruct the tree topology (to simulate
    single-cell-resolution spatial assays) or a SpatialLeafSubsampler is
    used (to simulate non-single-cell-resolution spatial assays).
    """

    @abc.abstractmethod
    def overlay_data(self, tree: CassiopeiaTree) -> None:
        """
        Overlay spatial data onto the CassiopeiaTree (in-place).

        This sets the spatial coordinates of all nodes in the tree. These
        coordinates are stored as the `spatial` node attribute. For leaves,
        these exact coordinates are saved as columns in the `cell_meta`
        attribute of the CassiopeiaTree.

        Args:
            tree: the CassiopeiaTree to overlay the spatial data on.
                The tree topology must be initialized.
        """


"""
Abstract class TreeSimulator, for tree simulation module.

All tree simulators are derived classes of this abstract class, and at a
minimum implement a method called `simulate_tree`.
"""


class TreeSimulator(abc.ABC):
    """
    TreeSimulator is an abstract class that all tree simulators derive from.

    A TreeSimulator returns a CassiopeiaTree with at least its tree topology
    initialized. The character matrix need not be initialized (this is
    accomplished instead using a LineageTracingDataSimulator object). The
    branch lengths may be interpretable or not depending on the specific
    TreeSimulator.

    The purpose of the TreeSimulator is to allow users to perform in silico
    simulations of single-cell phylogenies, such as tumor phylogenies,
    organism development, etc., providing a ground truth phylogeny and thus
    a means to evaluate methodologies for reconstructing and analyzing
    single-cell phylogenies.
    """

    @abc.abstractmethod
    def simulate_tree(self) -> CassiopeiaTree:
        """
        Simulate a CassiopeiaTree.

        The returned tree will have at least its tree topology initialized.
        """
class UniformLeafSubsampler(LeafSubsampler):
    """Uniformly subsamples the leaves of a CassiopeiaTree.

    Exactly one of ``ratio`` or ``number_of_leaves`` must be provided at
    construction; the chosen criterion determines how many leaves are kept
    when ``subsample_leaves`` is called.
    """

    def __init__(
        self,
        ratio: Optional[float] = None,
        number_of_leaves: Optional[int] = None,
    ):
        """
        Uniformly subsample leaf samples of a CassiopeiaTree.

        If 'ratio' is provided, samples 'ratio' of the leaves, rounded down,
        uniformly at random. If instead 'number_of_leaves' is provided,
        'number_of_leaves' of the leaves are sampled uniformly at random. Only
        one of the two criteria can be provided.

        Args:
            ratio: Specifies the number of leaves to be sampled as a ratio of
                the total number of leaves
            number_of_leaves: Explicitly specifies the number of leaves to be
                sampled

        Raises:
            LeafSubsamplerError if neither or both of the two criteria are
                specified.
        """
        if ratio is None and number_of_leaves is None:
            raise LeafSubsamplerError(
                "At least one of 'ratio' and 'number_of_leaves' "
                "must be specified."
            )
        if ratio is not None and number_of_leaves is not None:
            # Fix: the two adjacent string literals previously lacked a
            # separating space, rendering as "...'number_of_leaves'must be...".
            raise LeafSubsamplerError(
                "Exactly one of 'ratio' and 'number_of_leaves' "
                "must be specified."
            )
        self.__ratio = ratio
        self.__number_of_leaves = number_of_leaves

    def subsample_leaves(
        self, tree: CassiopeiaTree, keep_singular_root_edge: bool = True
    ) -> CassiopeiaTree:
        """Uniformly subsample leaf samples of a given tree.

        Generates a uniform random sample on the leaves of the given
        CassiopeiaTree and returns a tree pruned to contain lineages relevant
        to only leaves in the sample (the "induced subtree" on the sample).
        All fields on the original character matrix persist, but maintains
        character states, meta data, and the dissimilarity map for the sampled
        cells only.

        Has the option to keep the single edge leading from the root in the
        induced subtree, if it exists. This edge is often used to represent the
        time that the root lives before any divisions occur in the phyologeny,
        and is useful in instances where the branch lengths are critical, like
        simulating ground truth phylogenies or estimating branch lengths.

        Args:
            tree: The CassiopeiaTree for which to subsample leaves
            keep_singular_root_edge: Whether or not to collapse the single edge
                leading from the root in the subsample, if it exists

        Returns:
            A new CassiopeiaTree that is the induced subtree on a sample of the
            leaves in the given tree

        Raises:
            LeafSubsamplerError if the sample size is <= 0, or larger than the
                number of leaves in the tree
        """
        ratio = self.__ratio
        number_of_leaves = self.__number_of_leaves
        # Resolve the sample size from whichever criterion was provided;
        # the ratio-based count is rounded down via int().
        n_subsample = (
            number_of_leaves
            if number_of_leaves is not None
            else int(tree.n_cell * ratio)
        )
        if n_subsample <= 0:
            raise LeafSubsamplerError(
                "Specified number of leaves sampled is <= 0."
            )
        if n_subsample > tree.n_cell:
            raise LeafSubsamplerError(
                "Specified number of leaves sampled is greater than the number"
                " of leaves in the given tree."
            )

        n_remove = len(tree.leaves) - n_subsample
        # Operate on a deep copy so the caller's tree is left untouched.
        subsampled_tree = copy.deepcopy(tree)
        leaf_remove = np.random.choice(
            subsampled_tree.leaves, n_remove, replace=False
        )

        subsampled_tree.remove_leaves_and_prune_lineages(leaf_remove)

        # Keep the singular root edge if it exists and is indicated to be kept
        if (
            len(subsampled_tree.children(subsampled_tree.root)) == 1
            and keep_singular_root_edge
        ):
            collapse_source = subsampled_tree.children(subsampled_tree.root)[0]
        else:
            collapse_source = None
        subsampled_tree.collapse_unifurcations(source=collapse_source)

        # Copy and annotate branch lengths and times from the original tree,
        # so surviving nodes retain their ground-truth times.
        subsampled_tree.set_times(
            dict(
                [(node, tree.get_time(node)) for node in subsampled_tree.nodes]
            )
        )

        return subsampled_tree
from .TreeSimulator import TreeSimulator
from .UniformLeafSubsampler import UniformLeafSubsampler
from .SpatialLeafSubsampler import SpatialLeafSubsampler


# Public API of the simulator subpackage. Fixes relative to the previous
# version: "SeqeuntialLineageTracingDataSimulator" was misspelled (it matched
# no exported name, so `from cassiopeia.simulator import *` raised
# AttributeError), and ClonalSpatialDataSimulator / SpatialLeafSubsampler
# were imported above but missing from __all__.
__all__ = [
    "BirthDeathFitnessSimulator",
    "BrownianSpatialDataSimulator",
    "Cas9LineageTracingDataSimulator",
    "ClonalSpatialDataSimulator",
    "CompleteBinarySimulator",
    "DataSimulator",
    "ecDNABirthDeathSimulator",
    "LeafSubsampler",
    "LineageTracingDataSimulator",
    "SequentialLineageTracingDataSimulator",
    "SimpleFitSubcloneSimulator",
    "SpatialLeafSubsampler",
    "SupercellularSampler",
    "TreeSimulator",
    "UniformLeafSubsampler",
]
Supports the following transformations: 24 | "negative_log": Transforms each probability by the negative log 25 | "inverse": Transforms each probability p by taking 1/p 26 | "square_root_inverse": Transforms each probability by the 27 | the square root of 1/p 28 | """ 29 | 30 | def __init__(self, prior_transformation: str = "negative_log"): 31 | 32 | self.prior_transformation = prior_transformation 33 | 34 | @abc.abstractmethod 35 | def solve( 36 | self, 37 | cassiopeia_tree: CassiopeiaTree, 38 | layer: Optional[str] = None, 39 | collapse_mutationless_edges: bool = False, 40 | logfile: str = "stdout.log", 41 | ): 42 | """Solves the inference problem. 43 | 44 | Args: 45 | cassiopeia_tree: CassiopeiaTree storing character information for 46 | phylogenetic inference. 47 | layer: Layer storing the character matrix for solving. If None, the 48 | default character matrix is used in the CassiopeiaTree. 49 | collapse_mutationless_edges: Indicates if the final reconstructed 50 | tree should collapse mutationless edges based on internal states 51 | inferred by Camin-Sokal parsimony. In scoring accuracy, this 52 | removes artifacts caused by arbitrarily resolving polytomies. 53 | logfile: File location to log output. 
def assign_missing_average(
    character_matrix: pd.DataFrame,
    missing_state_indicator: int,
    left_set: List[str],
    right_set: List[str],
    missing: List[str],
    weights: Optional[Dict[int, Dict[int, float]]] = None,
) -> Tuple[List[str], List[str]]:
    """Implements the "Average" missing data imputation method.

    An on-the-fly missing data imputation method for the VanillaGreedy
    Solver and variants. It takes in a set of samples that have a missing
    value at the character chosen to split on in a partition. For each of
    these samples, it calculates the average number of mutations that
    samples on each side of the partition share with it and places the
    sample on the side with the higher value.

    Note:
        ``left_set`` and ``right_set`` are mutated in place (imputed samples
        are appended) and also returned.

    Args:
        character_matrix: The character matrix containing the observed
            character states for the samples
        missing_state_indicator: The character representing missing values
        left_set: A list of the samples on the left of the partition,
            represented by their names in the original character matrix
        right_set: A list of the samples on the right of the partition,
            represented by their names in the original character matrix
        missing: A list of samples with missing data to be imputed,
            represented by their names in the original character matrix
        weights: A set of optional weights for character/state mutation pairs

    Returns:
        A tuple of lists, representing the left and right partitions with
        missing samples imputed
    """

    # A helper function to calculate the number of shared character/state pairs
    # shared between a missing sample and a side of the partition
    sample_names = list(character_matrix.index)
    # Work on the raw ndarray for fast row/column slicing below.
    character_array = character_matrix.to_numpy()
    left_indices = solver_utilities.convert_sample_names_to_indices(
        sample_names, left_set
    )
    right_indices = solver_utilities.convert_sample_names_to_indices(
        sample_names, right_set
    )
    missing_indices = solver_utilities.convert_sample_names_to_indices(
        sample_names, missing
    )

    def score_side(subset_character_states, query_states, weights):
        # Sum (optionally weighted) counts of character/state matches between
        # the query sample and one side of the partition.
        score = 0
        for char in range(len(subset_character_states)):

            # Uncut (0) and missing states never count as shared mutations.
            query_state = [
                q
                for q in query_states[char]
                if q != 0 and q != missing_state_indicator
            ]
            all_states = np.array(subset_character_states[char])
            for q in query_state:
                if weights:
                    score += weights[char][q] * np.count_nonzero(
                        all_states == q
                    )
                else:
                    score += np.count_nonzero(all_states == q)

        return score

    subset_character_array_left = character_array[left_indices, :]
    subset_character_array_right = character_array[right_indices, :]

    # Flatten ambiguous (tuple-valued) entries so each column becomes a flat
    # list of candidate states for that character on each side.
    all_left_states = [
        unravel_ambiguous_states(subset_character_array_left[:, char])
        for char in range(subset_character_array_left.shape[1])
    ]
    all_right_states = [
        unravel_ambiguous_states(subset_character_array_right[:, char])
        for char in range(subset_character_array_right.shape[1])
    ]

    for sample_index in missing_indices:

        all_states_for_sample = [
            unravel_ambiguous_states([character_array[sample_index, char]])
            for char in range(character_array.shape[1])
        ]

        # dtype=object preserves the ragged per-character state lists.
        left_score = score_side(
            np.array(all_left_states, dtype=object),
            np.array(all_states_for_sample, dtype=object),
            weights,
        )
        right_score = score_side(
            np.array(all_right_states, dtype=object),
            np.array(all_states_for_sample, dtype=object),
            weights,
        )

        # Compare average (per-sample) shared-mutation scores; ties go right.
        if (left_score / len(left_set)) > (right_score / len(right_set)):
            left_set.append(sample_names[sample_index])
        else:
            right_set.append(sample_names[sample_index])

    return left_set, right_set
cassiopeia.mixins import PriorTransformationError 14 | 15 | 16 | def node_name_generator() -> Generator[str, None, None]: 17 | """Generates unique node names for building the reconstructed tree. 18 | 19 | Creates a generator object that produces unique node names by hashing 20 | timestamps. 21 | 22 | Returns: 23 | A generator object 24 | """ 25 | 26 | while True: 27 | k = str(time.time()).encode("utf-8") 28 | h = blake2b(key=k, digest_size=12) 29 | yield "cassiopeia_internal_node" + h.hexdigest() 30 | 31 | 32 | def collapse_unifurcations(tree: ete3.Tree) -> ete3.Tree: 33 | """Collapse unifurcations. 34 | Collapse all unifurcations in the tree, namely any node with only one child 35 | should be removed and all children should be connected to the parent node. 36 | Args: 37 | tree: tree to be collapsed 38 | Returns: 39 | A collapsed tree. 40 | """ 41 | 42 | collapse_fn = lambda x: (len(x.children) == 1) 43 | 44 | collapsed_tree = tree.copy() 45 | to_collapse = [n for n in collapsed_tree.traverse() if collapse_fn(n)] 46 | 47 | for n in to_collapse: 48 | n.delete() 49 | 50 | return collapsed_tree 51 | 52 | 53 | def transform_priors( 54 | priors: Optional[Dict[int, Dict[int, float]]], 55 | prior_transformation: str = "negative_log", 56 | ) -> Dict[int, Dict[int, float]]: 57 | """Generates a dictionary of weights from priors. 58 | 59 | Generates a dictionary of weights from given priors for each character/state 60 | pair for use in algorithms that inherit the GreedySolver. Supported 61 | transformations include negative log, negative log square root, and inverse. 62 | 63 | Args: 64 | priors: A dictionary of prior probabilities for each character/state 65 | pair 66 | prior_transformation: A function defining a transformation on the priors 67 | in forming weights. 
Supports the following transformations: 68 | "negative_log": Transforms each probability by the negative log 69 | "inverse": Transforms each probability p by taking 1/p 70 | "square_root_inverse": Transforms each probability by the 71 | the square root of 1/p 72 | 73 | Returns: 74 | A dictionary of weights for each character/state pair 75 | """ 76 | if prior_transformation not in [ 77 | "negative_log", 78 | "inverse", 79 | "square_root_inverse", 80 | ]: 81 | raise PriorTransformationError( 82 | "Please select one of the supported prior transformations." 83 | ) 84 | 85 | prior_function = lambda x: -np.log(x) 86 | 87 | if prior_transformation == "square_root_inverse": 88 | prior_function = lambda x: (np.sqrt(1 / x)) 89 | if prior_transformation == "inverse": 90 | prior_function = lambda x: 1 / x 91 | 92 | weights = {} 93 | for character in priors: 94 | state_weights = {} 95 | for state in priors[character]: 96 | p = priors[character][state] 97 | if p <= 0.0 or p > 1.0: 98 | raise PriorTransformationError( 99 | "Please make sure all priors have a value between 0 and 1" 100 | ) 101 | state_weights[state] = prior_function(p) 102 | weights[character] = state_weights 103 | return weights 104 | 105 | 106 | def convert_sample_names_to_indices( 107 | names: List[str], samples: List[str] 108 | ) -> List[int]: 109 | """Maps samples to their integer indices in a given set of names. 110 | 111 | Used to map sample string names to the their integer positions in the index 112 | of the original character matrix for efficient indexing operations. 
def save_dissimilarity_as_phylip(
    dissimilarity_map: pd.DataFrame, path: str
) -> None:
    """Saves a dissimilarity map as a phylip file.

    Writes the sample count on the first line, then one line per sample
    containing the sample name followed by the tab-separated lower-triangular
    distances (four decimal places), i.e. the lower-triangular phylip layout.

    Args:
        dissimilarity_map: A dissimilarity map
        path: The path to save the phylip file

    Returns:
        None
    """
    matrix = dissimilarity_map.to_numpy()
    num_samples = matrix.shape[0]

    # Assemble all lines up front and write them in a single call.
    lines = [f"{num_samples}\n"]
    for row_idx, sample_name in enumerate(dissimilarity_map.index):
        lower_triangle = matrix[row_idx, : row_idx + 1]
        formatted = "\t".join(f"{value:.4f}" for value in lower_triangle)
        lines.append(f"{sample_name}\t{formatted}\n")

    with open(path, "w") as handle:
        handle.writelines(lines)
def compute_morans_i(
    tree: CassiopeiaTree,
    meta_columns: Optional[List] = None,
    X: Optional[pd.DataFrame] = None,
    W: Optional[pd.DataFrame] = None,
    inverse_weight_fn: Callable[[Union[int, float]], float] = lambda x: 1.0 / x,
) -> Union[float, pd.DataFrame]:
    """Computes Moran's I statistic.

    Using the cross-correlation between leaves as specified on the tree,
    compute the Moran's I statistic for each of the data items specified. This
    will only work for numerical data, and will throw an error otherwise.

    Generally, this statistic takes in a weight matrix (which can be computed
    directly from a phylogenetic tree) and a set of numerical observations
    that are centered and standardized (i.e., mean 0 and population standard
    deviation of 1). Then, the Moran's I statistic is:

    I = X' * Wn * X

    where X' denotes a transpose, * denotes the matrix multiplier, and Wn is
    the normalized weight matrix such that sum([w_i,j for all i,j]) = 1.

    Inspired from the tools and code used in Chaligne et al, Nature Genetics
    2021.

    The mathematical details of the statistic can be found in:
    Wartenberg, "Multivariate Spatial Correlation: A Method for Exploratory
    Geographical Analysis", Geographical Analysis (1985)

    Args:
        tree: CassiopeiaTree
        meta_columns: Columns in the Cassiopeia Tree :attr:cell_meta object
            for which to compute autocorrelations
        X: Extra data matrix for computing autocorrelations, with one column
            per variable and one row per leaf.
        W: Phylogenetic weight matrix. If this is not specified, then the
            weight matrix will be computed within the function.
        inverse_weight_fn: Inverse function to apply to the weights, if the
            weight matrix must be computed.

    Returns:
        Moran's I statistic

    Raises:
        AutocorrelationError if no data is specified, if indices mismatch the
            tree's leaves, or if any specified column is non-numeric.
    """

    if X is None and meta_columns is None:
        raise AutocorrelationError(
            "Specify data for computing autocorrelations."
        )

    _X = None
    if meta_columns is not None:
        _X = tree.cell_meta[meta_columns]

    if X is not None:
        if len(np.intersect1d(tree.leaves, X.index)) != tree.n_cell:
            raise AutocorrelationError(
                "Specified argument X must be a dataframe with identical"
                " indices to the leaves of the CassiopeiaTree."
            )

        # Fix: each variable is a column, so the extra data matrix must be
        # appended column-wise (axis=1). The previous axis=0 stacked rows,
        # duplicating the leaf index and breaking the X' * W * X product
        # (W is n x n, but _X would have 2n rows) whenever both meta_columns
        # and X were supplied. pd.concat drops a None entry, so this still
        # reduces to X alone when meta_columns is None.
        _X = pd.concat([_X, X], axis=1)

    # check to make sure all values are numerical
    if not np.all(
        _X.apply(lambda s: pd.to_numeric(s, errors="coerce").notnull().all())
    ):
        raise AutocorrelationError(
            "There are some columns that are not numeric in the specified data."
        )

    # cast to numeric
    _X = _X.apply(lambda s: pd.to_numeric(s, errors="coerce"))

    # instantiate the weight matrix if None is specified
    if W is None:
        W = utilities.compute_phylogenetic_weight_matrix(
            tree, inverse=True, inverse_fn=inverse_weight_fn
        )

    # make sure that W has the correct indices
    if len(np.intersect1d(tree.leaves, W.index)) != tree.n_cell:
        raise AutocorrelationError(
            "Weight matrix does not have the same leaves as the tree."
        )

    # normalize W to 1
    _W = W / W.sum().sum()

    # center and standardize _X (population std, ddof=0)
    _X = (_X - _X.mean()) / _X.std(axis=0, ddof=0)

    I = _X.T.dot(_W).dot(_X)

    # if we're only testing one variable, return a float
    if _X.shape[1] == 1:
        I = I.iloc[0, 0]

    return I
26 | """ 27 | -------------------------------------------------------------------------------- /cassiopeia/tools/branch_length_estimator/__init__.py: -------------------------------------------------------------------------------- 1 | """Top level for branch length estimator.""" 2 | 3 | from .IIDExponentialMLE import IIDExponentialMLE 4 | from .IIDExponentialBayesian import IIDExponentialBayesian 5 | -------------------------------------------------------------------------------- /cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | from libcpp.map cimport map 3 | from libcpp.pair cimport pair 4 | 5 | # Declare the class with cdef 6 | cdef extern from "_iid_exponential_bayesian_cpp.h": 7 | cdef cppclass _InferPosteriorTimes: 8 | _InferPosteriorTimes() except + 9 | void run( 10 | int N, 11 | vector[vector[int]] children, 12 | int root, 13 | vector[int] is_internal_node, 14 | vector[int] get_number_of_mutated_characters_in_node, 15 | vector[int] non_root_internal_nodes, 16 | vector[int] leaves, 17 | vector[int] parent, 18 | int K, 19 | vector[int] K_non_missing, 20 | int T, 21 | double r, 22 | double lam, 23 | double sampling_probability, 24 | vector[int] is_leaf, 25 | ) except + 26 | vector[pair[int, double]] get_posterior_means_res() 27 | vector[pair[int, vector[double]]] get_posteriors_res() 28 | vector[pair[int, vector[double]]] get_log_joints_res() 29 | double get_log_likelihood_res() 30 | -------------------------------------------------------------------------------- /cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c++ 2 | 3 | from ._iid_exponential_bayesian cimport _InferPosteriorTimes 4 | from libcpp.vector cimport vector 5 | from libcpp.map cimport map 6 | 7 | from typing import List, 
Tuple 8 | 9 | # Create a Cython extension type which holds a C++ instance 10 | # as an attribute and create a bunch of forwarding methods 11 | # Python extension type. 12 | cdef class _PyInferPosteriorTimes: 13 | """ 14 | Infer posterior node times under the Bayesian model. 15 | 16 | The method 'run' takes in all the information needed to perform inference. 17 | """ 18 | cdef _InferPosteriorTimes* c_infer_posterior_times 19 | 20 | def __cinit__(self): 21 | self.c_infer_posterior_times = new _InferPosteriorTimes(); 22 | 23 | def run( 24 | self, 25 | int N, 26 | vector[vector[int]] children, 27 | int root, 28 | vector[int] is_internal_node, 29 | vector[int] get_number_of_mutated_characters_in_node, 30 | vector[int] non_root_internal_nodes, 31 | vector[int] leaves, 32 | vector[int] parent, 33 | int K, 34 | vector[int] K_non_missing, 35 | int T, 36 | double r, 37 | double lam, 38 | double sampling_probability, 39 | vector[int] is_leaf, 40 | ): 41 | """ 42 | Infer posterior node time distribution. 43 | 44 | Args: 45 | N: Number of nodes in tree. 46 | children: Adjacency list of graph. 47 | root: Root of graph. 48 | is_internal_node: Binary indicator for whether the node is internal 49 | or not. 50 | get_number_of_mutated_characters_in_node: Number of mutated 51 | characters in the node. 52 | non_root_internal_nodes: The non-root internal nodes. 53 | leaves: The leaves of the tree. 54 | parent: The parent of each node in the tree (or a negative number 55 | for the root) 56 | K: The number of characters 57 | K_non_missing: The number of non-missing characters in each node. 58 | T: The number of timesteps of the discretization. 59 | r: The CRISRP/Cas9 mutation rate. 60 | lam: The birth rate. 61 | sampling_probability: The probability that a leaf is subsampled from 62 | the ground truth phylogeny. 63 | is_leaf: Binary indicator for whether a node is a leaf or not. 64 | 65 | Raises: 66 | ValueError if the discretization level T is too small. 
67 | """ 68 | self.c_infer_posterior_times.run( 69 | N, 70 | children, 71 | root, 72 | is_internal_node, 73 | get_number_of_mutated_characters_in_node, 74 | non_root_internal_nodes, 75 | leaves, 76 | parent, 77 | K, 78 | K_non_missing, 79 | T, 80 | r, 81 | lam, 82 | sampling_probability, 83 | is_leaf, 84 | ) 85 | 86 | def get_posterior_means_res(self) -> List[Tuple[int, float]]: 87 | """ 88 | Posterior mean node times. 89 | 90 | Returns a list of tuples (node, posterior_time), containing the posterior mean 91 | time 'posterior_time' of node 'node'. 92 | """ 93 | return self.c_infer_posterior_times.get_posterior_means_res() 94 | 95 | def get_posteriors_res(self) -> List[Tuple[int, List[float]]]: 96 | """ 97 | Posterior node time distributions. 98 | 99 | Returns a list of tuples (node, posterior_time_distribution), containing 100 | the posterior time 'posterior_time_distribution' of node 'node'. Here 101 | 'posterior_time_distribution' is a list of length T + 1, where 102 | posterior_time_distribution[t] is the posterior probability that node 103 | 'node' has (discretized) time t. 104 | 105 | Note that this is the normalized version of get_log_joints_res. 106 | """ 107 | return self.c_infer_posterior_times.get_posteriors_res() 108 | 109 | def get_log_joints_res(self) -> List[Tuple[int, List[float]]]: 110 | """ 111 | Joint (node, time) log probabilities. 112 | 113 | Returns a list of tuples (node, log_joint), containing the log joint 114 | probability of node 'node' taking a given (discretized) time t (given 115 | the observed character matrix and tree topology); this is log_joint[t]. 116 | 117 | Note that this is the unnormalized version of get_log_joints_res. 118 | """ 119 | return self.c_infer_posterior_times.get_log_joints_res() 120 | 121 | def get_log_likelihood_res(self): 122 | """ 123 | Log likelihood of the observed data. 124 | 125 | The log likelihood of the observed character matrix and tree topology 126 | under the Bayesian model. 
127 | 128 | Note that this is just the log-sum-exp of get_log_joints_res for any 129 | node. 130 | """ 131 | return self.c_infer_posterior_times.get_log_likelihood_res() 132 | 133 | def __dealloc__(self): 134 | del self.c_infer_posterior_times 135 | 136 | -------------------------------------------------------------------------------- /cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian_cpp.h: -------------------------------------------------------------------------------- 1 | #ifndef _IID_EXPONENTIAL_BAYESIAN_CPP_H 2 | #define _IID_EXPONENTIAL_BAYESIAN_CPP_H 3 | 4 | #include 5 | 6 | using namespace std; 7 | 8 | class _InferPosteriorTimes{ 9 | public: 10 | _InferPosteriorTimes(); 11 | ~_InferPosteriorTimes(); 12 | void run( 13 | int N, 14 | vector > children, 15 | int root, 16 | vector is_internal_node, 17 | vector get_number_of_mutated_characters_in_node, 18 | vector non_root_internal_nodes, 19 | vector leaves, 20 | vector parent, 21 | int K, 22 | vector K_non_missing, 23 | int T, 24 | double r, 25 | double lam, 26 | double sampling_probability, 27 | vector is_leaf 28 | ); 29 | // The following methods access the results of the run() method. 30 | vector > get_posterior_means_res(); 31 | vector > > get_posteriors_res(); 32 | vector > > get_log_joints_res(); 33 | double get_log_likelihood_res(); 34 | 35 | private: 36 | // These are the parameters to the run() call. 37 | int N; 38 | vector > children; 39 | int root; 40 | vector is_internal_node; 41 | vector get_number_of_mutated_characters_in_node; 42 | vector non_root_internal_nodes; 43 | vector leaves; 44 | vector parent; 45 | int K; 46 | vector K_non_missing; 47 | int T; 48 | double r; 49 | double lam; 50 | double sampling_probability; 51 | vector is_leaf; 52 | 53 | // These are computed internally. 
54 | double dt; 55 | double*** down_cache; // [N][T + 1][K] 56 | double*** up_cache; // [N][T + 1][K] 57 | double* p_unsampled; // [T + 1] 58 | double** log_joints; // [N][T + 1] 59 | double** posteriors; // [N][T + 1] 60 | double* posterior_means; // [N] 61 | 62 | void allocate_memory(); 63 | void deallocate_memory(); 64 | void precompute_p_unsampled(); 65 | pair valid_cuts_range(int v); 66 | bool state_is_valid(int v, int x); 67 | double down(int v, int t, int x); 68 | double up(int v, int t, int x); 69 | void populate_down_res(); 70 | void populate_up_res(); 71 | void populate_log_likelihood_res(); 72 | double compute_log_joint(int v, int t); 73 | void populate_log_joints_res(); 74 | void populate_posteriors_res(); 75 | void populate_posterior_means_res(); 76 | void populate_posterior_results(); 77 | 78 | vector, double> > down_res; 79 | vector, double> > up_res; 80 | vector > posterior_means_res; 81 | vector > > posteriors_res; 82 | vector > > log_joints_res; 83 | double log_likelihood_res; 84 | }; 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /cassiopeia/tools/coupling.py: -------------------------------------------------------------------------------- 1 | """ 2 | File storing functionality for computing coupling statistics between meta 3 | variables on a tree. 
def compute_evolutionary_coupling(
    tree: CassiopeiaTree,
    meta_variable: str,
    minimum_proportion: float = 0.05,
    number_of_shuffles: int = 500,
    random_state: Optional[np.random.RandomState] = None,
    dissimilarity_map: Optional[pd.DataFrame] = None,
    cluster_comparison_function: Callable = data_utilities.net_relatedness_index,
    **comparison_kwargs,
) -> pd.DataFrame:
    """Computes Evolutionary Coupling of categorical variables.

    Using the methodology described in Yang, Jones et al, BioRxiv (2021), this
    function will compute the "evolutionary coupling" statistic between values
    that a categorical variable can take on with the tree. For example, this
    categorical variable can be a "cell type", and this function will compute
    the evolutionary couplings between all types of cell types. This indicates
    how closely related these cell types are to one another.

    Briefly, this statistic is the Z-normalized mean distance between
    categories in the specified categorical variable. Note that empirical
    nulls that have a standard deviation of 0 lead to NaNs in the resulting
    evolutionary coupling matrix.

    The computational complexity of this function is
    O(n^2 log n + (B+1)(K^2 * O(distance_function)) for a tree with n leaves,
    a variable with K categories, and B random shuffles.

    Args:
        tree: CassiopeiaTree
        meta_variable: Column in `tree.cell_meta` that stores a categorical
            variable with K categories.
        minimum_proportion: Minimum proportion of cells that a category needs
            to appear in to be considered.
        number_of_shuffles: Number of times to shuffle the data to compute the
            empirical Z score.
        random_state: Numpy random state to parameterize the shuffling.
        dissimilarity_map: A precomputed dissimilarity map between all leaves.
        cluster_comparison_function: A function for comparing the mean
            distance between groups. By default, this is the Net Relatedness
            Index.
        **comparison_kwargs: Extra arguments to pass to the cluster comparison
            function.

    Returns:
        A K x K evolutionary coupling dataframe.
    """

    W = (
        data_utilities.compute_phylogenetic_weight_matrix(tree)
        if (dissimilarity_map is None)
        else dissimilarity_map
    )

    meta_data = tree.cell_meta[meta_variable]

    # Drop categories observed in at most `minimum_proportion` of the leaves,
    # and restrict the weight matrix to the remaining cells.
    if minimum_proportion > 0:
        filter_threshold = int(len(tree.leaves) * minimum_proportion)
        category_frequencies = meta_data.value_counts()
        passing_categories = category_frequencies[
            category_frequencies > filter_threshold
        ].index.values
        meta_data = meta_data[meta_data.isin(passing_categories)]
        W = W.loc[meta_data.index.values, meta_data.index.values]

    # Observed inter-cluster distances for the real category assignments.
    inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
        tree,
        meta_data=meta_data,
        dissimilarity_map=W,
        distance_function=cluster_comparison_function,
        **comparison_kwargs,
    )

    # Build the empirical null by repeatedly shuffling which cell carries
    # which category (the assignments are permuted by permuting the index).
    # Use an explicit `is not None` check so a user-supplied RandomState is
    # honored regardless of truthiness, and fall back to the global numpy
    # RNG otherwise.
    rng = random_state if random_state is not None else np.random
    background = defaultdict(list)
    for _ in tqdm(
        range(number_of_shuffles), desc="Creating empirical background"
    ):
        permuted_assignments = meta_data.copy()
        permuted_assignments.index = rng.permutation(meta_data.index.values)
        background_distances = data_utilities.compute_inter_cluster_distances(
            tree,
            meta_data=permuted_assignments,
            dissimilarity_map=W,
            distance_function=cluster_comparison_function,
            **comparison_kwargs,
        )
        for s1 in background_distances.index:
            for s2 in background_distances.columns:
                background[(s1, s2)].append(background_distances.loc[s1, s2])

    # Z-normalize each observed distance against its empirical null.
    # A null with zero standard deviation yields NaN (documented above).
    Z_scores = inter_cluster_distances.copy()
    for s1 in Z_scores.index:
        for s2 in Z_scores.columns:
            mean = np.mean(background[(s1, s2)])
            sd = np.std(background[(s1, s2)])

            Z_scores.loc[s1, s2] = (
                inter_cluster_distances.loc[s1, s2] - mean
            ) / sd

    return Z_scores
32 | 33 | Args: 34 | cassiopeia_tree: CassiopeiaTree storing an initialized 35 | tree topology with estimated branch lengths. 36 | """ 37 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/__init__.py: -------------------------------------------------------------------------------- 1 | """Top level for fitness estimator.""" 2 | 3 | from ._FitnessEstimator import FitnessEstimator, FitnessEstimatorError 4 | from ._lbi_jungle import LBIJungle 5 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Felix Horns 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/__init__.py: -------------------------------------------------------------------------------- 1 | from .forest import * 2 | from .sfs import * 3 | from .size_matched_model import * 4 | from .tree import * 5 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/.gitignore: -------------------------------------------------------------------------------- 1 | *fasta 2 | *pickle 3 | *dat 4 | *~ 5 | *pyc 6 | *txt 7 | *nwk 8 | notes/#* 9 | src/#* 10 | adaptation_ms/#* 11 | .#* 12 | *aux 13 | *out 14 | *log 15 | *bbl 16 | *blg 17 | *.o 18 | auto 19 | *el 20 | *.fdb_latexmk 21 | *.py.* 22 | 23 | *.py[cod] 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Packages 29 | *.egg 30 | *.egg-info 31 | dist 32 | build 33 | eggs 34 | parts 35 | bin 36 | var 37 | sdist 38 | develop-eggs 39 | .installed.cfg 40 | lib 41 | lib64 42 | __pycache__ 43 | 44 | # Installer logs 45 | pip-log.txt 46 | 47 | # Unit test / coverage reports 48 | .coverage 49 | .tox 50 | nosetests.xml 51 | 52 | # Translations 53 | *.mo 54 | 55 | *.py[cod] 56 | 57 | # C extensions 58 | *.so 59 | 60 | # Packages 61 | *.egg 62 | *.egg-info 63 | dist 64 | build 65 | eggs 66 | parts 67 | bin 68 | var 69 | sdist 70 | develop-eggs 71 | .installed.cfg 72 | lib 73 | lib64 74 | __pycache__ 75 | 76 | # Installer logs 77 | pip-log.txt 78 | 79 | # Unit test / coverage reports 80 | .coverage 81 | .tox 82 | nosetests.xml 83 | 84 | # Translations 85 | *.mo 86 | 87 | # EMACS 88 | *~ 89 | \#*\# 90 | /.emacs.desktop 91 | /.emacs.desktop.lock 92 | .elc 93 | auto-save-list 94 | tramp 95 | .\#* 96 | 97 | # Org-mode 98 | .org-id-locations 99 | *_archive 100 | 101 | # VIM 102 | *.s[a-w][a-z] 103 | *.un~ 104 | Session.vim 105 | .netrwhist 106 | *~ 107 | 108 | # Eclipse 109 | *.pydevproject 110 | .project 111 
| .metadata 112 | bin/** 113 | tmp/** 114 | tmp/**/* 115 | *.tmp 116 | *.bak 117 | *.swp 118 | *~.nib 119 | local.properties 120 | .classpath 121 | .settings/ 122 | .loadpath 123 | 124 | # External tool builders 125 | .externalToolBuilders/ 126 | 127 | # Locally stored "Eclipse launch configurations" 128 | *.launch 129 | 130 | # CDT-specific 131 | .cproject 132 | 133 | # PDT-specific 134 | .buildpath 135 | 136 | flupred.geany 137 | flupred.komodoproject 138 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 rneher 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/README.md: -------------------------------------------------------------------------------- 1 | ### Inferring fitness from the shape of trees 2 | 3 | This repository contains the code associated with the manuscript 4 | 5 | Neher, Russell, Shraiman: "Predicting evolution from the shape of genealogical trees". accepted for publication in eLife 6 | 7 | --- 8 | 9 | The directory *prediction_src* contains the code base used for the fitness inference and prediction algorithms as well as classes to hold sequence data and trees adapted. 10 | 11 | --- 12 | 13 | The directory *flu* contains the code specific to our analysis of historical influenza data, scripts that generate the figures, the influenza sequences and annotation, analysis results and figure files. 14 | 15 | --- 16 | 17 | The directory *toy_data* contains the code to simulate adapting populations building on the FFPopSim library. In addition, it contains scripts to analyze this simulated data, the data itself and the resulting figures. 18 | 19 | --- 20 | 21 | #### Ranking sequences by the local branching index (LBI) 22 | 23 | The script *rank_sequences.py* is a simple wrapper for the prediction tool that takes a multiple sequence alignment and the name of the outgroup as input (this outgroup needs to be in the MSA). It produces a folder containing a ranking of sequences, the inferred ancestral sequences, the reconstructed tree, and optionally a pdf of the marked up tree. This script uses the local branching index (LBI), rather than the full fitness inference to rank sequences. 
24 | 25 | build-in help and optional arguments: 26 | 27 | ./rank_sequences.py --help 28 | usage: rank_sequences.py [-h] --aln ALN --outgroup OUTGROUP 29 | [--eps_branch EPS_BRANCH] [--tau TAU] 30 | [--collapse [COLLAPSE]] [--plot [PLOT]] 31 | 32 | rank sequences in a multiple sequence aligment 33 | 34 | optional arguments: 35 | -h, --help show this help message and exit 36 | --aln ALN alignment of sequences to by ranked 37 | --outgroup OUTGROUP name of outgroup sequence 38 | --eps_branch EPS_BRANCH 39 | minimal branch length for inference 40 | --tau TAU time scale for local tree length estimation (relative 41 | to average pairwise distance) 42 | --collapse [COLLAPSE] 43 | collapse internal branches with identical sequences 44 | --plot [PLOT] plot trees 45 | 46 | #### Inferring fitness distribution of nodes in the tree 47 | 48 | The script *infer_fitness.py* also takes an alignment and outgroup as argument, but uses the full fitness inference to rank sequences and calculate the mean posterior and the variance of the posterior. Note that plausible posterior distributions require a that the parameter omega is well chosen. Also, the time conversion factor might need to be different from gamma=1 for optimal results. 
49 | 50 | ./infer_fitness.py --help 51 | usage: infer_fitness.py [-h] --aln ALN --outgroup OUTGROUP 52 | [--eps_branch EPS_BRANCH] [--diffusion DIFFUSION] 53 | [--gamma GAMMA] [--omega OMEGA] 54 | [--collapse [COLLAPSE]] [--plot [PLOT]] 55 | 56 | rank sequences in a multiple sequence aligment 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --aln ALN alignment of sequences to by ranked 61 | --outgroup OUTGROUP name of outgroup sequence 62 | --eps_branch EPS_BRANCH 63 | minimal branch length for inference 64 | --diffusion DIFFUSION 65 | fitness diffusion coefficient 66 | --gamma GAMMA scale factor for time scale, choose high (>2) for 67 | prediction, 1 for fitness inference 68 | --omega OMEGA approximate sampling fraction diveded by the fitness 69 | standard deviation 70 | --collapse [COLLAPSE] 71 | collapse internal branches with identical sequences 72 | --plot [PLOT] plot trees 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/__init__.py -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/prediction_src/README: -------------------------------------------------------------------------------- 1 | This folder contains scripts at the core of the fitness estimation and prediction machinery. 
2 | 3 | ############################################################ 4 | solve_survival.py 5 | 6 | this script provides a class that solves the branching process equation numerically and uses it to integrate the branch propagator between the desired time points. 7 | 8 | ############################################################ 9 | fitness_inference.py 10 | 11 | this script provides a class with the basic fitness inference. It uses the actual numerical solution for the propagator rather than the tree length approximation. 12 | 13 | ############################################################ 14 | node_ranking.py 15 | 16 | this script provides a number of utility functions for trees, in particular building trees, labeling, translating, etc. It also contains the class that establishes a tree, infers the ancestral states and then infers fitness of all nodes. It provides functions for ranking nodes by different methods, the major being the inferred fitness. It also colors its own trees. 17 | 18 | ############################################################ 19 | sequence_ranking.py 20 | 21 | this script provides two classes: An alignment class which dresses a biopython alignment with an outgroup, a tree and an amino acid alignment if a coding region is provided. 22 | 23 | The other class is a subclass of node_ranking that takes an alignment as input and runs a prediction. 24 | 25 | ############################################################ 26 | ancestral.py 27 | 28 | inference of ancestral sequences on a tree using a variant of dynamic programming to calculate the most likely sequences of internal nodes.
29 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/prediction_src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/prediction_src/__init__.py -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/__init__.py -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Richard Neher 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/README: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | 3 | betatree is a collection of python scripts to generate trees from the beta coalescent ensemble, calculate properties of those trees and gather statistics across many instances. 4 | 5 | Authors: Taylor Kessinger and Richard Neher 6 | 7 | Contact: richard.neher@tuebingen.mpg.de 8 | 9 | Example output of the betatree generator are given as pdf files in example_trees. Example site frequency spectra for different parameters of the beta-coalescent ensemble are provided in example_SFS. 10 | 11 | If you use betatree in a publication, please refer to 12 | 13 | Neher, Kessinger, and Shraiman. "Coalescence and Genetic Diversity in Sexual Populations under Selection." PNAS 110: 15836-41. doi:10.1073/pnas.1309697110. 14 | 15 | 16 | ################################################################################ 17 | Tree generation 18 | 19 | the script src/betatree.py provides a class that generates coalescent trees using pseudorandom numbers given an initial sample size n and a parameter alpha of the beta measure of the Lambda coalescent process. 
The following 3 lines will generate a tree of a sample size of 100 and draw it with the Biopython.Phylo package. 20 | 21 | myT = betatree(100,alpha = 2) 22 | myT.coalesce() 23 | Phylo.draw(myT.BioTree) 24 | 25 | the tree is internally stored as a biopython.phylo tree with all the associated functionality. 26 | 27 | Sample code is appended to the definition of the class as will be exectuted if betatree.py is run as main. 28 | 29 | ################################################################################ 30 | Site frequency spectra 31 | 32 | the script src/sfs.py uses the class betatree to generate many trees and calculate the SFS assuming that mutation are uniformly distributed on the tree. The following three lines will generate an SFS for a sample size 100 and alpha=1.5 by averaging 1000 trees. 33 | 34 | mySFS = SFS(n=100,alpha=1.5) 35 | mySFS.getSFS(ntrees=1000) 36 | 37 | The sfs is accessible as mySFS.sfs and can be binned using different binning schemes or a user defined binning. 38 | 39 | mySFS.binSFS(mode='logit', bins=20) 40 | plt.plot(mySFS.bin_center, mySFS.binned_sfs) 41 | 42 | The sfs can be saved and loaded by member functions. 43 | 44 | Sample code is appended to the definition of the class as will be exectuted if sfs.py is run as main. 
class SizeMatchedModel:
    """A probability model whose parameters are chosen by a binned size.

    Holds a scipy-style distribution together with one parameter tuple per
    size bin. Queries (`pvalue`, `model_mean`) first look up the parameters
    of the bin that the given size falls into, then evaluate the
    distribution with those parameters.
    """

    def __init__(self, bins, params, distribution, name=None):
        """Initialize SizeMatchedModel from a list of bins, a list of parameters, and a distribution.

        Args:
            bins: Monotonically increasing bin edges; len(bins) must be
                len(params) + 1.
            params: One parameter tuple per bin, unpacked into the
                distribution's methods.
            distribution: A scipy.stats-style distribution exposing
                `cdf(x, *params)` and `mean(*params)`.
            name: Optional human-readable name for the model.
        """
        assert (
            len(params) == len(bins) - 1
        ), "Length of params must be one less than length of bins"
        self.bins = bins
        self.params = params
        self.distribution = distribution
        self.name = name

    @classmethod
    def from_json(cls, filename):
        """Load SizeMatchedModel from JSON file.

        Args:
            filename: Path to a JSON file written by `to_json`.

        Returns:
            A SizeMatchedModel instance.
        """
        # JSON is UTF-8 by specification; be explicit rather than relying on
        # the platform default encoding.
        with open(filename, encoding="utf-8") as f:
            attributes_str = json.load(f)
        attributes = dict()
        attributes["bins"] = ast.literal_eval(attributes_str["bins"])
        attributes["params"] = ast.literal_eval(attributes_str["params"])
        attributes["name"] = ast.literal_eval(attributes_str["name"])
        # SECURITY: eval() executes arbitrary code taken from the file.
        # Only load model files from trusted sources.
        distribution = eval(
            attributes_str["distribution"]
        )()  # evaluate class name and instantiate
        return SizeMatchedModel(
            attributes["bins"],
            attributes["params"],
            distribution,
            attributes["name"],
        )

    def to_json(self, outfile):
        """Write SizeMatchedModel to JSON file.

        Args:
            outfile: Path of the JSON file to create/overwrite.
        """
        attributes = dict()
        attributes["bins"] = json.dumps(self.bins)
        attributes["params"] = json.dumps(self.params)
        attributes["name"] = json.dumps(self.name)

        # Get distribution class name.
        # Distribution is an object, so we parse out its fully qualified
        # class name to save it in JSON format (re-instantiated by eval in
        # `from_json`).
        distribution_str = (
            self.distribution.__class__.__module__
            + "."
            + self.distribution.__class__.__name__
        )
        attributes["distribution"] = distribution_str

        with open(outfile, "w", encoding="utf-8") as out:
            json.dump(attributes, out)

    def _params_for_size(self, size, strict_bounds=True):
        """Find parameters for bin that matches size.

        Args:
            size: The size value to match against the bin edges.
            strict_bounds: If True, raise ValueError when `size` falls
                outside the bin edges; if False, clamp to the nearest bin.

        Returns:
            The parameter tuple of the matching bin.

        Raises:
            ValueError: If `size` is out of bounds and strict_bounds=True.
        """

        # Find matching bin based on size
        bin_index = np.digitize(
            size, self.bins
        )  # digitize returns the index of the bin to which value belongs

        if (bin_index == 0 or bin_index == len(self.bins)) and strict_bounds:
            # if strict bounds are used, only allow values that fall strictly
            # within the bins
            raise ValueError(
                "Size must be within bounds of bins (if strict_bounds=True)"
            )

        if bin_index == 0 and not strict_bounds:
            # if loose bounds are used, values less than the bounds of bins
            # should be set to smallest bin
            bin_index = 1

        if bin_index == len(self.bins) and not strict_bounds:
            # if loose bounds are used, values greater than the bounds of
            # bins should be set to largest bin
            bin_index = len(self.bins) - 1

        # Adjust bin index to match indexing of params.
        # np.digitize returns a one-indexed value, whereas params is a
        # zero-indexed value. This line shifts the index, so that it matches
        # the indexing of params.
        bin_index = bin_index - 1

        # Get parameters of matching bin
        params_match = self.params[bin_index]

        return params_match

    def pvalue(self, x, size, invert_cdf=False, strict_bounds=True):
        """Calculate P value of x under model.

        Args:
            x: The observed value.
            size: Size covariate used to select the bin parameters.
            invert_cdf: If True, return the upper-tail probability (1 - CDF).
            strict_bounds: Passed through to `_params_for_size`.

        Returns:
            The CDF of `x` under the size-matched distribution (or its
            complement when invert_cdf=True).
        """

        # Find model parameters for matching bin based on size
        params = self._params_for_size(size, strict_bounds)

        # Calculate probability of finding the observed x, or more extreme,
        # under the model
        p = self.distribution.cdf(x, *params)

        if invert_cdf:
            p = 1 - p

        return p

    def model_mean(self, size, strict_bounds=True):
        """Find mean of model for given size.

        Args:
            size: Size covariate used to select the bin parameters.
            strict_bounds: Passed through to `_params_for_size`.

        Returns:
            The mean of the size-matched distribution.
        """

        # Find model parameters for matching bin based on size
        params = self._params_for_size(size, strict_bounds)

        # Calculate mean of model
        mean = self.distribution.mean(*params)

        return mean
F.resolve_polytomy() 51 | F.annotate_standard_node_features() 52 | F.annotate_colless() 53 | 54 | # Dump to file 55 | F.dump(outfile) 56 | 57 | # Track run time 58 | elapsed_time = time.time() - start_time 59 | 60 | # Report run time 61 | if verbose: 62 | print("Done!!") 63 | print(("Elapsed time (s):", elapsed_time)) 64 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/reference_data/generate_annotate_forest.py.bak: -------------------------------------------------------------------------------- 1 | # Generate and annotate a Forest 2 | # Usage: python generate_annotate_forest.py [n_leaves] [n_trees] [alpha] [output_dir] 3 | # Saves Forest as a gzipped pickle archive. 4 | 5 | import sys 6 | import uuid 7 | import time 8 | 9 | sys.path.append("../../jungle/") 10 | import jungle as jg 11 | 12 | verbose = True 13 | 14 | # Specify parameters 15 | n_leaves = int(sys.argv[1]) # Number of leaves in tree 16 | n_trees = int(sys.argv[2]) # Number of trees in forest 17 | alpha = float(sys.argv[3]) # Shape parameter alpha (alpha = 2.0 for neutral Kingman trees, alpha = 1.0 for positive selection Bolthausen-Sznitman trees) 18 | outfile_dir = sys.argv[4] # Output directory 19 | 20 | # Specify output file 21 | outfile_vars = (n_leaves, n_trees, alpha, str(uuid.uuid4())[0:8]) 22 | outfile_basename = "forest_nleaves{0}_ntrees{1}_alpha{2}_uuid{3}.pickle.gz".format(*outfile_vars) 23 | outfile = outfile_dir + "/" + outfile_basename 24 | 25 | # Report parameters 26 | if verbose: 27 | print("Parameters") 28 | print("n_leaves", n_leaves) 29 | print("n_trees", n_trees) 30 | print("alpha", alpha) 31 | print("outfile_dir", outfile_dir) 32 | print("outfile", outfile) 33 | 34 | if verbose: 35 | print("Starting tree generation...") 36 | 37 | # Track run time 38 | start_time = time.time() 39 | 40 | # Generate and annotate trees 41 | F = jg.Forest.generate(n_trees=n_trees, params={"n_leaves": n_leaves, "alpha": alpha}) 
42 | F.resolve_polytomy() 43 | F.annotate_standard_node_features() 44 | F.annotate_colless() 45 | 46 | # Dump to file 47 | F.dump(outfile) 48 | 49 | # Track run time 50 | elapsed_time = time.time() - start_time 51 | 52 | # Report run time 53 | if verbose: 54 | print("Done!!") 55 | print("Elapsed time (s):", elapsed_time) 56 | -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_jungle/tests/generate_annotate_forest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Test generating and annotating a forest 3 | python ../reference_data/generate_annotate_forest.py 100 5 2.0 ../reference_data/ -------------------------------------------------------------------------------- /cassiopeia/tools/fitness_estimator/_lbi_jungle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | from typing import Optional 5 | 6 | import networkx as nx 7 | 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | sys.path.append(os.path.join(dir_path, "_jungle")) 10 | import jungle as jg 11 | import numpy as np 12 | 13 | from cassiopeia.data import CassiopeiaTree 14 | 15 | from ._FitnessEstimator import FitnessEstimator, FitnessEstimatorError 16 | 17 | 18 | def _to_newick(tree: nx.DiGraph, record_branch_lengths: bool = False) -> str: 19 | """Converts a networkx graph to a newick string. 
20 | 21 | Args: 22 | tree: A networkx tree 23 | record_branch_lengths: Whether to record branch lengths on the tree in 24 | the newick string 25 | 26 | Returns: 27 | A newick string representing the topology of the tree 28 | """ 29 | 30 | def _to_newick_str(g, node): 31 | is_leaf = g.out_degree(node) == 0 32 | weight_string = "" 33 | 34 | if record_branch_lengths and g.in_degree(node) > 0: 35 | parent = list(g.predecessors(node))[0] 36 | weight_string = ":" + str(g[parent][node]["length"]) 37 | if not is_leaf: 38 | weight_string = node + weight_string 39 | 40 | _name = str(node) 41 | return ( 42 | "%s" % (_name,) + weight_string 43 | if is_leaf 44 | else ( 45 | "(" 46 | + ",".join( 47 | _to_newick_str(g, child) for child in g.successors(node) 48 | ) 49 | + ")" 50 | + weight_string 51 | ) 52 | ) 53 | 54 | root = [node for node in tree if tree.in_degree(node) == 0][0] 55 | return _to_newick_str(tree, root) + ";" 56 | 57 | 58 | class LBIJungle(FitnessEstimator): 59 | """ 60 | LBI as implemented by the jungle package. 61 | 62 | Implements the LBI fitness estimator described by Neher et al. (2014). 63 | This is a simple wrapper on top of the Jungle package, which is in turn 64 | a wrapper around Neher et al.'s code. 65 | 66 | Caveat: LBIJungle does not estimate fitness for the root of this tree 67 | (artifact of the Jungle package). This is rarely of interest though. 68 | 69 | Args: 70 | random_seed: Random seed to set in numpy before running fitness 71 | estimates. (A random seed is used by the LBI to estimate the 72 | characteristic timescale `tau` of the underlying process. 73 | See Neher et al. 2014, and the LBIJungle package for details.) 74 | """ 75 | 76 | def __init__(self, random_seed: Optional[int] = None): 77 | self._random_seed = random_seed 78 | 79 | def estimate_fitness(self, tree: CassiopeiaTree) -> None: 80 | """ 81 | Sets attribute `fitness` for each node in the tree using the LBI. 
82 | 83 | Caveat: LBIJungle does not estimate fitness for the root of this tree 84 | (artifact of the Jungle package). This is rarely of interest though. 85 | 86 | Will raise a FitnessEstimatorError if the CassiopeiaTree cannot be 87 | serialized to networkx. 88 | 89 | Also, due to the underlying implementation in the Jungle package that we 90 | wrap, leaf names cannot start with an underscore. A 91 | FitnessEstimatorError will also be raised in this case. 92 | 93 | Raises: 94 | FitnessEstimatorError 95 | """ 96 | if any([leaf.startswith("_") for leaf in tree.leaves]): 97 | raise FitnessEstimatorError( 98 | "Leaf names must NOT start with '_'. Please rename your leaves" 99 | " to use LBIJungle." 100 | ) 101 | with tempfile.NamedTemporaryFile("w") as outfile: 102 | outfilename = outfile.name 103 | tree_newick = _to_newick( 104 | tree.get_tree_topology(), record_branch_lengths=True 105 | ) 106 | outfile.write(tree_newick) 107 | outfile.flush() 108 | if self._random_seed is not None: 109 | np.random.seed(self._random_seed) 110 | try: 111 | T_empirical = jg.Tree.from_newick(outfilename) 112 | except Exception: 113 | raise Exception(f"Could not read newick str:\n{tree_newick}") 114 | T_empirical.annotate_standard_node_features() 115 | T_empirical.infer_fitness(params={}) 116 | res_df = T_empirical.node_features() 117 | node_names = res_df.name 118 | node_fitnesses = res_df.mean_fitness 119 | for v, f in zip(node_names, node_fitnesses): 120 | if v != "" and v[0] != "_": 121 | tree.set_attribute(v, "fitness", f) 122 | elif v != "" and v[0] == "_": 123 | # (Non-root) internal node! 
import pytest


def pytest_addoption(parser):
    """Register the opt-in command-line flags for slow and spatial tests."""
    parser.addoption(
        "--runslow", action="store_true", default=False, help="run slow tests"
    )
    parser.addoption(
        "--runspatial", action="store_true", default=False, help="run spatial tests"
    )


def pytest_configure(config):
    """Declare the custom markers so pytest does not warn about them."""
    for marker_line in (
        "slow: mark test as slow to run",
        "spatial: mark test as spatial to run",
    ):
        config.addinivalue_line("markers", marker_line)


def pytest_collection_modifyitems(config, items):
    """Skip slow/spatial tests unless the matching flag was passed."""
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    skip_spatial = pytest.mark.skip(reason="need --runspatial option to run")
    slow_enabled = config.getoption("--runslow")
    spatial_enabled = config.getoption("--runspatial")

    for item in items:
        if not slow_enabled and "slow" in item.keywords:
            item.add_marker(skip_slow)
        if not spatial_enabled and "spatial" in item.keywords:
            item.add_marker(skip_spatial)
AATCCAGCTAGCTGTGCAGCNNNNNNNNNNNNNNATTCAACTGCAGTAATGCTACCTCGTACTCACGCTTTCCAAGTGCTTGGCGTCGCATCTCGGTCCTTTGTACGCCGAAAAATGGCCTGACAACTAAGCTACGGCACGCTGCCATGTTGGGTCATAACGATATCTCTGGTTCATCCGTGACCGAACATGTCATGGAGTAGCAGGAGCTATTAATTCGCGGAGGACAATGCGGTTCGTAGTCACTGTCTTCCGCAATCGTCCATCGCTCCTGCAGGTGGCCTAGAGGGCCCGTTTAAACCCGCTGATCAGCCTCGACTGTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAATGAGGAAATTGCATCGCATTGTCTGAGTAGGTGTCATTCTATTCTGGGGGGTGGGGTGGGGCAGGACAGCAAGGGGGAGGATTGGGAAGACAATAGCAGGCATGCTGGGGATGCGGTGGGCTCTATGGTCTAGAGCGGGCCCGGTACTAACCAAACTGGATCTCTGCTGTCCCTGTAATAAACCCGAAAATTTTGAATTTTTGTAATTTGTTTTTGTAATTCTTTAGTTTGTATGTCTGTTGCTATTATGTCTACTATTCTTTCCCCTGCACTGTACCCCCCAATCCCCCCTTTTCTTTTAAAATTGTGGATGAATACTGCCATTTGTCTGCAGA 3 | -------------------------------------------------------------------------------- /data/ccphylo_config.ini: -------------------------------------------------------------------------------- 1 | [Paths] 2 | ccphylo_path = /path/to/ccphylo/ccphylo 3 | -------------------------------------------------------------------------------- /data/itolconfig_example: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | api_key = 3 | project_name = MyProjectonIToL 4 | -------------------------------------------------------------------------------- /data/preprocess.cfg: -------------------------------------------------------------------------------- 1 | # cassiopeia-preprocess configuration example 2 | # See notebooks/preprocess.ipynb for parameter descriptions. 
3 | 4 | [general] 5 | name = "test_sample" 6 | output_directory = "/mnt/e/scratch/cassiopeia/pipeline_test" 7 | reference_filepath = "/mnt/e/scratch/cassiopeia/PCT48-long.ref.fa" 8 | entry = "convert" 9 | exit = "call_lineages" 10 | input_files = ["/mnt/e/scratch/cassiopeia/smaller_1.fastq.gz", "/mnt/e/scratch/cassiopeia/smaller_2.fastq.gz"] 11 | n_threads = 32 12 | allow_allele_conflicts = False 13 | verbose = True 14 | 15 | [convert] 16 | chemistry = "10xv3" 17 | 18 | [filter_bam] 19 | quality_threshold = 10 20 | 21 | [error_correct_cellbcs_to_whitelist] 22 | # Set to None to turn off this step. 23 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/10x_version3_whitelist.txt" 24 | 25 | [collapse] 26 | max_hq_mismatches = 3 27 | max_indels = 2 28 | method = "cutoff" 29 | 30 | [resolve] 31 | min_avg_reads_per_umi = 2.0 32 | min_umi_per_cell = 10 33 | plot = True 34 | 35 | [align] 36 | gap_open_penalty = 20 37 | gap_extend_penalty = 1 38 | method = "local" 39 | 40 | [call_alleles] 41 | barcode_interval = (20, 34) 42 | cutsite_locations = [112, 166, 220] 43 | cutsite_width = 12 44 | context = True 45 | context_size = 5 46 | 47 | [error_correct_intbcs_to_whitelist] 48 | # Set to None to turn off this step. 
49 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/intbc_whitelist.txt" 50 | intbc_dist_thresh = 1 51 | 52 | [error_correct_umis] 53 | max_umi_distance = 2 54 | 55 | [filter_molecule_table] 56 | min_umi_per_cell = 10 57 | min_avg_reads_per_umi = 2.0 58 | min_reads_per_umi = -1 59 | intbc_prop_thresh = 0.5 60 | intbc_umi_thresh = 10 61 | intbc_dist_thresh = 1 62 | doublet_threshold = 0.35 63 | plot = True 64 | 65 | [call_lineages] 66 | min_umi_per_cell = 10 67 | min_avg_reads_per_umi = 2.0 68 | min_cluster_prop = 0.005 69 | min_intbc_thresh = 0.05 70 | inter_doublet_threshold = 0.35 71 | kinship_thresh = 0.25 72 | plot = True 73 | -------------------------------------------------------------------------------- /data/preprocess_gestalt.cfg: -------------------------------------------------------------------------------- 1 | [general] 2 | name = "test_sample" 3 | output_directory = "/gestalt_barcode_data/cassiopeia_output" 4 | reference_filepath = "/gestalt_barcode_data/reference/gestalt.abbrv.fa" 5 | entry = "collapse" 6 | exit = "filter_molecule_table" 7 | input_files = ["/gestalt_barcode_data/raw/possorted_genome_bam.bam", "/gestalt_barcode_data/raw/possorted_genome_bam.bam.bai"] 8 | n_threads = 32 9 | allow_allele_conflicts = False 10 | verbose = True 11 | 12 | [collapse] 13 | max_hq_mismatches = 3 14 | max_indels = 2 15 | method = "cutoff" 16 | 17 | [resolve] 18 | min_avg_reads_per_umi = 2.0 19 | min_umi_per_cell = 3 20 | plot = True 21 | 22 | [align] 23 | gap_open_penalty = 20 24 | gap_extend_penalty = 1 25 | method = "global" 26 | 27 | [call_alleles] 28 | barcode_interval = (0, 0) 29 | cutsite_locations = [42, 69, 96, 123, 150, 177, 204, 231, 258, 285] 30 | cutsite_width = 4 31 | context = True 32 | context_size = 5 33 | 34 | [error_correct_umis] 35 | max_umi_distance = 2 36 | 37 | [filter_molecule_table] 38 | min_umi_per_cell = 3 39 | min_avg_reads_per_umi = 2.0 40 | min_reads_per_umi = -1 41 | intbc_prop_thresh = 0.5 42 | intbc_umi_thresh = 3 43 | 
intbc_dist_thresh = 1 44 | doublet_threshold = None 45 | plot = True 46 | -------------------------------------------------------------------------------- /data/spatial_preprocess.cfg: -------------------------------------------------------------------------------- 1 | # cassiopeia-preprocess configuration example for spatial assays 2 | # See notebooks/preprocess.ipynb for parameter descriptions. 3 | 4 | [general] 5 | name = "test_sample" 6 | output_directory = "/mnt/e/scratch/cassiopeia/pipeline_test" 7 | reference_filepath = "/mnt/e/scratch/cassiopeia/PCT48-long.ref.fa" 8 | entry = "convert" 9 | exit = "call_lineages" 10 | input_files = ["/mnt/e/scratch/cassiopeia/smaller_1.fastq.gz", "/mnt/e/scratch/cassiopeia/smaller_2.fastq.gz"] 11 | n_threads = 32 12 | allow_allele_conflicts = True 13 | verbose = True 14 | 15 | [convert] 16 | chemistry = "slideseq2" 17 | 18 | [filter_bam] 19 | quality_threshold = 10 20 | 21 | [error_correct_cellbcs_to_whitelist] 22 | # Set to None to turn off this step. 23 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/puck_whitelist.txt" 24 | 25 | [collapse] 26 | max_hq_mismatches = 3 27 | max_indels = 2 28 | method = "likelihood" 29 | skip_existing = False 30 | 31 | [resolve] 32 | min_avg_reads_per_umi = 2.0 33 | min_umi_per_cell = 10 34 | plot = True 35 | 36 | [align] 37 | gap_open_penalty = 20 38 | gap_extend_penalty = 1 39 | method = "local" 40 | 41 | [call_alleles] 42 | barcode_interval = (20, 34) 43 | cutsite_locations = [112, 166, 220] 44 | cutsite_width = 12 45 | context = True 46 | context_size = 5 47 | 48 | [error_correct_intbcs_to_whitelist] 49 | # Set to None to turn off this step. 
50 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/intbc_whitelist.txt" 51 | intbc_dist_thresh = 1 52 | 53 | [error_correct_umis] 54 | max_umi_distance = 2 55 | 56 | [filter_molecule_table] 57 | min_umi_per_cell = 10 58 | min_avg_reads_per_umi = 2.0 59 | min_reads_per_umi = -1 60 | intbc_prop_thresh = 0.5 61 | intbc_umi_thresh = 10 62 | intbc_dist_thresh = 1 63 | doublet_threshold = 0.35 64 | plot = True 65 | 66 | [call_lineages] 67 | min_umi_per_cell = 10 68 | min_avg_reads_per_umi = 2.0 69 | min_cluster_prop = 0.005 70 | min_intbc_thresh = 0.05 71 | inter_doublet_threshold = 0.35 72 | kinship_thresh = 0.25 73 | plot = True 74 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = cassiopeia 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/computer-24px.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/_static/css/override.css: -------------------------------------------------------------------------------- 1 | /* 2 | Furo CSS variables 3 | https://github.com/pradyunsg/furo/blob/main/src/furo/assets/styles/variables/_index.scss 4 | https://github.com/pradyunsg/furo/blob/main/src/furo/theme/partials/_head_css_variables.html 5 | https://github.com/streamlink/streamlink/blob/17a4088c38709123c0bcab4a150549bd16d19e07/docs/_static/styles/custom.css 6 | */ 7 | 8 | dt:target, span.highlighted { 9 | background-color: #f0f0f0; 10 | } 11 | 12 | code.docutils.literal.notranslate.xref, a code { 13 | background: transparent; 14 | font-weight: bold; 15 | color: inherit; 16 | } 17 | 18 | a > code { 19 | color: inherit; 20 | } 21 | 22 | code.docutils.literal.notranslate { 23 | background: #f8f9fb; 24 | font-size: 87.5%; 25 | border-radius: .2em; 26 | color: #000000; 27 | word-wrap: break-word; 28 | padding: .1em .2em; 29 | } 30 | 31 | dl.citation > dt { 32 | float: left; 33 | margin-right: 15px; 34 | font-weight: bold; 35 | } 36 | 37 | /* Parameters normalize size and captialized, */ 38 | dl.c .field-list dt, dl.cpp .field-list dt, dl.js .field-list dt, dl.py .field-list dt { 39 | font-size: var(--font-size--normal); 40 | text-transform: none; 41 | } 42 | 43 | /* examples and headings in classes */ 44 | p.rubric { 45 | font-size: var(--font-size--normal); 46 | text-transform: none; 47 | font-weight: 500; 48 | } 49 | 50 | 51 | /* Getting started index page */ 52 | 53 | .intro-card { 54 | background: #fff; 55 | border-radius: 0; 56 | padding: 30px 10px 10px 10px; 57 | margin: 10px 0px; 58 | } 59 | 60 | 
div.nbinput.container, div.nboutput.container {
    display: -webkit-flex;
    display: flex;
    align-items: flex-start;
    margin-top: 0px;
    margin-right: 0px;
    margin-bottom: 5px;
    margin-left: 0px;
    padding-right: 0px;
    padding-left: 0px;
    /* The missing ";" after "padding-top: 0px" made the following
       "width: 100%" declaration invalid and silently dropped. */
    padding-top: 0px;
    width: 100%;
}
.sphx-glr-thumbcontainer .headerlink { 22 | display: none !important; 23 | } 24 | 25 | div.sphx-glr-thumbcontainer span { 26 | font-style: normal !important; 27 | } 28 | 29 | .sphx-glr-thumbcontainer a.internal { 30 | padding: 140px 10px 0!important; 31 | } 32 | 33 | .sphx-glr-thumbcontainer .figure.align-center { 34 | text-align: center; 35 | margin-left: 0%; 36 | transform: translate(0%); 37 | } -------------------------------------------------------------------------------- /docs/_static/library_books-24px.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/play_circle_outline-24px.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/_static/question-mark-svgrepo-com.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /docs/_static/tutorials/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/benchmark.png -------------------------------------------------------------------------------- /docs/_static/tutorials/local_plotting.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/local_plotting.png -------------------------------------------------------------------------------- /docs/_static/tutorials/preprocess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/preprocess.png -------------------------------------------------------------------------------- /docs/_static/tutorials/reconstruct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/reconstruct.png -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline }} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :members: 7 | :undoc-members: 8 | 9 | .. rubric:: Methods 10 | 11 | .. autoautosummary:: {{ objname }} 12 | :methods: -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "pydata_sphinx_theme/layout.html" %} 2 | 3 | {% block fonts %} 4 | 5 | 6 | 7 | 8 | 17 | {% endblock %} 18 | -------------------------------------------------------------------------------- /docs/api/critique.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Critique 3 | =========== 4 | .. 
currentmodule:: cassiopeia 5 | 6 | Critique 7 | ~~~~~~~~~~~~~~~~~~~ 8 | 9 | We support functionality for comparing trees to one another, for example when benchmarking new algorithms. 10 | 11 | .. autosummary:: 12 | :toctree: reference/ 13 | 14 | critique.robinson_foulds 15 | critique.triplets_correct -------------------------------------------------------------------------------- /docs/api/data.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Data 3 | =========== 4 | 5 | .. module:: cassiopeia.data 6 | .. currentmodule:: cassiopeia 7 | 8 | CassiopeiaTrees 9 | ~~~~~~~~~~~~~~~~~~~ 10 | 11 | The main data structure that Cassiopeia uses for all tree-based analyses is the CassiopeiaTree: 12 | 13 | .. autosummary:: 14 | :toctree: reference/ 15 | 16 | data.CassiopeiaTree 17 | 18 | Utilities 19 | ~~~~~~~~~~~~~~~~~~~ 20 | 21 | We also have several utilities that are useful for working with various data related to phylogenetics: 22 | 23 | .. autosummary:: 24 | :toctree: reference/ 25 | 26 | data.compute_dissimilarity_map 27 | data.compute_phylogenetic_weight_matrix 28 | data.get_lca_characters 29 | data.sample_bootstrap_allele_tables 30 | data.sample_bootstrap_character_matrices 31 | data.to_newick -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | 6 | Import Cassiopeia as:: 7 | 8 | import cassiopeia as cas 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | preprocess 14 | data 15 | critique 16 | solver 17 | simulator 18 | plotting 19 | tools -------------------------------------------------------------------------------- /docs/api/plotting.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Plotting 3 | ========== 4 | 5 | .. 
currentmodule:: cassiopeia 6 | 7 | Plotting 8 | ~~~~~~~~~~~~~~~~~~~ 9 | 10 | Plotting functionality is divided into two broad categories: local and remote 11 | (a.k.a. iTOL). Previously, we only supported tree visualization using the rich 12 | iTOL framework. However, we are now in the process of deprecating the use of 13 | this service for most use cases. We recommend all users to visualize their 14 | trees using the local plotting functions, which either use Matplotlib or 15 | Plotly, as this option is free and is more reminiscent of plotting in other 16 | packages such as Scanpy. 17 | 18 | .. autosummary:: 19 | :toctree: reference/ 20 | 21 | pl.labels_from_coordinates 22 | pl.plot_matplotlib 23 | pl.plot_plotly 24 | pl.Tree3D 25 | pl.upload_and_export_itol 26 | -------------------------------------------------------------------------------- /docs/api/preprocess.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Preprocess 3 | =========== 4 | .. currentmodule:: cassiopeia 5 | 6 | Data Preprocessing 7 | ~~~~~~~~~~~~~~~~~~~ 8 | 9 | We have several functions that are part of our pipeline for processing sequencing data from single-cell lineage tracing technologies: 10 | 11 | .. autosummary:: 12 | :toctree: reference/ 13 | 14 | pp.align_sequences 15 | pp.call_alleles 16 | pp.call_lineage_groups 17 | pp.collapse_umis 18 | pp.convert_fastqs_to_unmapped_bam 19 | pp.error_correct_cellbcs_to_whitelist 20 | pp.error_correct_intbcs_to_whitelist 21 | pp.error_correct_umis 22 | pp.filter_bam 23 | pp.filter_molecule_table 24 | pp.filter_cells 25 | pp.filter_umis 26 | pp.resolve_umi_sequence 27 | 28 | 29 | 30 | 31 | Data Utilities 32 | ~~~~~~~~~~~~~~~~~~~ 33 | 34 | We also have several functions that are useful for converting between data formats for downstream analyses: 35 | 36 | .. 
autosummary:: 37 | :toctree: reference/ 38 | 39 | pp.compute_empirical_indel_priors 40 | pp.convert_alleletable_to_character_matrix 41 | pp.convert_alleletable_to_lineage_profile 42 | pp.convert_lineage_profile_to_character_matrix -------------------------------------------------------------------------------- /docs/api/simulator.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Simulator 3 | =========== 4 | .. currentmodule:: cassiopeia 5 | 6 | 7 | Our simulators for cassiopeia are split up into those that simulate topologies and those that simulate data on top of the topologies. 8 | 9 | Tree Simulators 10 | ~~~~~~~~~~~~~~~~~~~ 11 | 12 | We have several frameworks available for simulating topologies: 13 | 14 | .. autosummary:: 15 | :toctree: reference/ 16 | 17 | sim.BirthDeathFitnessSimulator 18 | sim.ecDNABirthDeathSimulator 19 | sim.CompleteBinarySimulator 20 | sim.SimpleFitSubcloneSimulator 21 | 22 | 23 | Data Simulators 24 | ~~~~~~~~~~~~~~~~~~~ 25 | 26 | These simulators are subclasses of the `DataSimulator` class and implement the `overlay_data` method which simulates data according to a given topology. 27 | 28 | .. autosummary:: 29 | :toctree: reference/ 30 | 31 | sim.Cas9LineageTracingDataSimulator 32 | 33 | 34 | Spatial Simulators 35 | ~~~~~~~~~~~~~~~~~~~ 36 | These simulators are subclasses of the `SpatialSimulator` class and implement the `overlay_data` method which adds spatial coordinates to a given topology. `SpatialSimulator`s are a special sublcass of `DataSimulator` and can be used in addition to other `DataSimulator`s that simulate lineage tracing data. 37 | 38 | .. autosummary:: 39 | :toctree: reference/ 40 | 41 | sim.BrownianSpatialDataSimulator 42 | sim.ClonalSpatialDataSimulator 43 | 44 | 45 | Leaf SubSamplers 46 | ~~~~~~~~~~~~~~~~~~~ 47 | These are utilities for subsampling lineages for benchmarking purposes. 
For example, sampling a random proportion of leaves or grouping together cells into clades to model spatial data. 48 | 49 | .. autosummary:: 50 | :toctree: reference/ 51 | 52 | sim.SupercellularSampler 53 | sim.SpatialLeafSubsampler 54 | sim.UniformLeafSubsampler -------------------------------------------------------------------------------- /docs/api/solver.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Solver 3 | =========== 4 | .. currentmodule:: cassiopeia 5 | 6 | CassiopeiaSolvers 7 | ~~~~~~~~~~~~~~~~~~~ 8 | 9 | We have several algorithms available for solving phylogenies: 10 | 11 | .. autosummary:: 12 | :toctree: reference/ 13 | 14 | solver.HybridSolver 15 | solver.ILPSolver 16 | solver.MaxCutSolver 17 | solver.MaxCutGreedySolver 18 | solver.NeighborJoiningSolver 19 | solver.PercolationSolver 20 | solver.SharedMutationJoiningSolver 21 | solver.SpectralSolver 22 | solver.SpectralGreedySolver 23 | solver.UPGMASolver 24 | solver.VanillaGreedySolver 25 | 26 | 27 | Dissimilarity Maps 28 | ~~~~~~~~~~~~~~~~~~~ 29 | 30 | For use in our distance-based solver and for comparing character states, we also have available several dissimilarity functions: 31 | 32 | .. autosummary:: 33 | :toctree: reference/ 34 | 35 | solver.dissimilarity_functions.cluster_dissimilarity 36 | solver.dissimilarity_functions.hamming_distance 37 | solver.dissimilarity_functions.hamming_similarity_normalized_over_missing 38 | solver.dissimilarity_functions.hamming_similarity_without_missing 39 | solver.dissimilarity_functions.weighted_hamming_distance 40 | solver.dissimilarity_functions.weighted_hamming_similarity -------------------------------------------------------------------------------- /docs/api/tools.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Tools 3 | ========== 4 | 5 | .. 
currentmodule:: cassiopeia 6 | 7 | This library stores code for post-reconstruction analysis of trees. We are 8 | always in the process of developing new statistics and tools for helping us 9 | interpret trees, and adding them to this library. 10 | 11 | Autocorrelation 12 | ~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autosummary:: 15 | :toctree: reference/ 16 | 17 | tl.compute_morans_i 18 | 19 | Branch Length Estimation (BLE) 20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 21 | 22 | .. autosummary:: 23 | :toctree: reference/ 24 | 25 | tl.IIDExponentialBayesian 26 | tl.IIDExponentialMLE 27 | 28 | Coupling 29 | ~~~~~~~~~~~ 30 | 31 | .. autosummary:: 32 | :toctree: reference/ 33 | 34 | tl.compute_evolutionary_coupling 35 | 36 | Metrics 37 | ~~~~~~~~ 38 | .. autosummary:: 39 | :toctree: reference/ 40 | 41 | tl.calculate_likelihood_continuous 42 | tl.calculate_likelihood_discrete 43 | tl.calculate_parsimony 44 | 45 | Parameter Estimation 46 | ~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | .. autosummary:: 49 | :toctree: reference/ 50 | 51 | tl.estimate_missing_data_rates 52 | tl.estimate_mutation_rate 53 | 54 | 55 | Small-Parsimony 56 | ~~~~~~~~~~~~~~~~~~~ 57 | 58 | .. autosummary:: 59 | :toctree: reference/ 60 | 61 | tl.fitch_count 62 | tl.fitch_hartigan 63 | tl.score_small_parsimony 64 | 65 | Topology 66 | ~~~~~~~~~~~~~~~~~~~ 67 | .. autosummary:: 68 | :toctree: reference/ 69 | 70 | tl.compute_expansion_pvalues -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/extensions/typed_returns.py: -------------------------------------------------------------------------------- 1 | # code from https://github.com/theislab/scanpy/blob/master/docs/extensions/typed_returns.py 2 | # with some minor adjustment 3 | import re 4 | 5 | from sphinx.application import Sphinx 6 | from sphinx.ext.napoleon import NumpyDocstring 7 | 8 | 9 | def process_return(lines): 10 | for line in lines: 11 | m = re.fullmatch(r"(?P\w+)\s+:\s+(?P[\w.]+)", line) 12 | if m: 13 | # Once this is in scanpydoc, we can use the fancy hover stuff 14 | yield f'-{m["param"]} (:class:`~{m["type"]}`)' 15 | else: 16 | yield line 17 | 18 | 19 | def scanpy_parse_returns_section(self, section): 20 | lines_raw = list(process_return(self._dedent(self._consume_to_next_section()))) 21 | lines = self._format_block(":returns: ", lines_raw) 22 | if lines and lines[-1]: 23 | lines.append("") 24 | return lines 25 | 26 | 27 | def setup(app: Sphinx): 28 | NumpyDocstring._parse_returns_section = scanpy_parse_returns_section 29 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Cassiopeia documentation master file, created by 2 | sphinx-quickstart on Sat Jan 26 12:35:18 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ======================== 7 | Welcome! 8 | ======================== 9 | 10 | This website serves as documentation to the Cassiopeia software suite, maintained by the `Yosef Lab 11 | `_ at UC Berkeley. 12 | 13 | Cassiopeia [Jones20]_ is a package for end-to-end phylogenetic reconstruction of single-cell lineage tracing data. 
The package is composed of four independent modules: 14 | 15 | * ``preprocess`` for processing sequencing FASTQ data to character matrices 16 | * ``solver`` for performing tree inference 17 | * ``simulator`` for simulating trees and character-level data 18 | * ``plotting`` for plotting trees. 19 | 20 | If you find this useful for your research, please consider citing Cassiopeia [Jones20]_. 21 | 22 | .. raw:: html 23 | 24 |
25 |
26 |
27 |
28 | installation with cassiopeia action icon 29 |
30 |
Installation
31 |

New to Cassiopeia? Check out the installation guide. 32 |

33 | 34 | .. container:: custom-button 35 | 36 | :doc:`To the installation guide` 37 | 38 | .. raw:: html 39 | 40 |
41 |
42 |
43 |
44 |
45 | cassiopeia user guide action icon 46 |
47 |
User guide
48 |

The tutorials provide in-depth information on running Cassiopeia.

49 | 50 | .. container:: custom-button 51 | 52 | :doc:`To the user guide` 53 | 54 | .. raw:: html 55 | 56 |
57 |
58 |
59 |
60 |
61 | api of scvi action icon 62 |
63 |
API reference
64 |

The API reference contains a detailed description of 65 | the Cassiopeia API.

66 | 67 | .. container:: custom-button 68 | 69 | :doc:`To the API reference` 70 | 71 | .. raw:: html 72 | 73 |
74 |
75 |
76 |
77 |
78 | questions about cassiopeia 79 |
80 |
Questions & Issues
81 |

Have a question or found a bug? File an issue.

82 | 83 | .. container:: custom-button 84 | 85 | `File an issue `_ 86 | 87 | .. raw:: html 88 | 89 |
90 |
91 |
92 |
93 |
94 | 95 | 96 | .. toctree:: 97 | :maxdepth: 1 98 | :hidden: 99 | 100 | installation 101 | api/index 102 | user_guide 103 | contributing 104 | authors 105 | references -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | Prerequisites 5 | ~~~~~~~~~~~~~~ 6 | 7 | Cassiopeia currently requires python version 3.7 or greater, which is publicly available. 8 | 9 | Cassiopeia needs to be downloaded from Github by cloning the directory onto your machine: 10 | 11 | :: 12 | 13 | git clone https://github.com/YosefLab/Cassiopeia.git 14 | 15 | To run some of the models in Cassiopeia, you will also need to install `Gurobi `_. Licenses are free to academic users and can be downloaded `here `_. 16 | 17 | 18 | Installing 19 | ~~~~~~~~~~~ 20 | 21 | Once Cassiopeia is cloned into a directory onto your machine, enter into the directory with `cd Cassiopeia`. To make installation simple, we have wrapped the installation steps into a MAKEFILE - this allows you to install Cassiopeia with the command: 22 | 23 | :: 24 | 25 | make install 26 | 27 | To make sure that the package has been installed correctly, we recommend you also run all the unit tests with another command from the MAKEFILE: 28 | 29 | :: 30 | 31 | make test 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/notebooks: -------------------------------------------------------------------------------- 1 | ../notebooks -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- 1 | References 2 | ---------- 3 | 4 | .. [Jones20] Matthew G Jones*, Alex Khodaverdian*, Jeffrey J Quinn*, Michelle M Chan, Jeffrey A Hussmann, Robert Wang, Chenling Xu, Jonathan S Weissman, Nir Yosef. (2020), 5 | *Inference of single-cell phylogenies from lineage tracing data using Cassiopeia*, 6 | `Genome Biology `__. 7 | -------------------------------------------------------------------------------- /docs/user_guide.rst: -------------------------------------------------------------------------------- 1 | User guide 2 | ========== 3 | 4 | Cassiopeia is a flexible tool for analyzing lineage-tracing data and benchmarking new algorithms. Perhaps the easiest way to get started with Cassiopeia is by following along with our tutorials. Generally, we'll expect that you've already successfully installed Cassiopeia using the :doc:`installation guide`. 5 | 6 | For any questions about Cassiopeia, please file an issue on `Github `_. If you'd like to contribute a tutorial or a new algorithm, please follow our :doc:`Contributing guide`. 
7 | 8 | Main Tutorials 9 | ----------- 10 | 11 | .. nbgallery:: 12 | 13 | notebooks/preprocess 14 | notebooks/benchmark 15 | notebooks/reconstruct 16 | notebooks/local_plotting 17 | 18 | 19 | Other Tutorials 20 | ----------------- 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | 25 | notebooks/simulate_ecDNA 26 | 27 | Contributed tutorials 28 | --------------------- 29 | 30 | Currently we have no contributed tutorials -- if you are interested, check out our :doc:`Contributing guide`! -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["Matthew Jones ", "Alex Khodaverdian", "Richard Zhang", "Sebastian Prillo", "Joseph Min"] 3 | classifiers = [ 4 | "Development Status :: 4 - Beta", 5 | "Intended Audience :: Science/Research", 6 | "Natural Language :: English", 7 | "Programming Language :: Python :: 3.8", 8 | "Programming Language :: Python :: 3.9", 9 | "Programming Language :: Python :: 3.10", 10 | "Programming Language :: Python :: 3.11", 11 | "Operating System :: MacOS :: MacOS X", 12 | "Operating System :: Microsoft :: Windows", 13 | "Operating System :: POSIX :: Linux", 14 | "Topic :: Scientific/Engineering :: Bio-Informatics", 15 | ] 16 | description = "Single Cell Lineage Reconstruction with Cas9-Enabled Lineage Recorders" 17 | documentation = "https://cassiopeia-lineage.readthedocs.io/" 18 | homepage = "https://github.com/YosefLab/Cassiopeia" 19 | keywords = ['scLT'] 20 | license = "MIT" 21 | name = "cassiopeia-lineage" 22 | readme = 'README.md' 23 | repository = "https://github.com/YosefLab/Cassiopeia" 24 | version = "2.1.0" 25 | 26 | include = [ 27 | {path = "cassiopeia/preprocess/*.so", format = "wheel"}, 28 | {path = "cassiopeia/preprocess/*.pyx", format = "wheel"}, 29 | {path = "cassiopeia/solver/*.so", format = "wheel"}, 30 | {path = "cassiopeia/solver/*.pyx", format = "wheel"}, 31 | {path = 
"cassiopeia/tools/branch_length_estimator/*.so", format = "wheel"}, 32 | {path = "cassiopeia/tools/branch_length_estimator/*.pyx", format = "wheel"}, 33 | {path = "cassiopeia/config.ini"}, 34 | ] 35 | packages = [ 36 | {include = "cassiopeia"}, 37 | ] 38 | 39 | [tool.poetry.dependencies] 40 | Biopython = ">=1.71" 41 | Cython = ">=0.29.2" 42 | PyYAML = ">=3.12" 43 | black = {version = ">=20.8b1", optional = true} 44 | bokeh = ">=0.12.15" 45 | cchardet = {version = ">=2.1.7", optional = true} 46 | codecov = {version = ">=2.0.8", optional = true} 47 | cvxpy = "*" 48 | ete3 = ">=3.1.1" 49 | hits = "*" 50 | ipython = {version = ">=7.20", optional = true} 51 | isort = {version = ">=5.7", optional = true} 52 | itolapi = "*" 53 | jupyter = {version = ">=1.0", optional = true} 54 | matplotlib = ">=2.2.2" 55 | nbconvert = {version = ">=5.4.0", optional = true} 56 | nbformat = {version = ">=4.4.0", optional = true} 57 | nbsphinx = {version = "*", optional = true} 58 | nbsphinx-link = {version = "*", optional = true} 59 | networkx = ">=3.1" 60 | ngs-tools = ">=1.5.6" 61 | numba = ">=0.51.0" 62 | numpy = ">=1.22, <3.0" 63 | opencv-python = {version = ">=4.5.4.60", optional = true} 64 | pandas = ">=1.1.4" 65 | parameterized = "*" 66 | plotly = ">=5.0.0" 67 | poisson-disc = {version = ">=0.2.1", optional = true} 68 | pre-commit = {version = ">=2.7.1", optional = true} 69 | pydata-sphinx-theme = {version = ">=0.4.3", optional = true} 70 | pysam = ">=0.14.1" 71 | pyseq-align = ">=1.0.2" 72 | pytest = {version = ">=4.4", optional = true} 73 | python = ">=3.7,<4.0" 74 | pyvista = {version = "=0.41.0", optional = true} 75 | scanpydoc = {version = ">=0.5", optional = true} 76 | scikit-image = {version = ">=0.19.1", optional = true} 77 | scikit-learn = {version = ">=1.0.2", optional = true} 78 | scipy = ">=1.2.0" 79 | sphinx = {version = ">=3.4", optional = true} 80 | sphinx-autodoc-typehints = {version = "*", optional = true} 81 | sphinx-gallery = {version = ">0.6", optional = true} 82 
| trame = {version = ">=3.2.4", optional = true} 83 | trame-vtk = {version = ">=2.5.8", optional = true} 84 | trame-vuetify = {version = ">=2.3.1", optional = true} 85 | tqdm = ">=4" 86 | typing-extensions = ">=3.7.4" 87 | typing_extensions = {version = "*", python = "<3.8", optional = true} 88 | vtk = {version = ">=9.2", optional = true} 89 | 90 | [tool.poetry.build] 91 | generate-setup-file = false 92 | script = "build.py" 93 | 94 | [build-system] 95 | build-backend = "poetry.core.masonry.api" 96 | requires = ["poetry-core>=1.0.7", "Cython", "numpy>=1.19.5", "setuptools", "pip>=22.0.0"] 97 | 98 | [tool.poetry.scripts] 99 | cassiopeia-preprocess = 'cassiopeia.preprocess.cassiopeia_preprocess:main' 100 | 101 | [tool.poetry.extras] 102 | dev = ["black", "pytest", "flake8", "codecov", "jupyter", "pre-commit", "isort"] 103 | docs = [ 104 | "sphinx", 105 | "scanpydoc", 106 | "nbconvert", 107 | "nbformat", 108 | "nbsphinx", 109 | "nbsphinx-link", 110 | "ipython", 111 | "pydata-sphinx-theme", 112 | "typing_extensions", 113 | "sphinx-autodoc-typehints", 114 | "sphinx_gallery", 115 | ] 116 | spatial = [ 117 | "opencv-python", 118 | "poisson-disc", 119 | "vtk", 120 | "scikit-image", 121 | "scikit-learn", 122 | "trame", 123 | "trame-vuetify", 124 | "trame-vtk", 125 | "cchardet", 126 | "pyvista" 127 | ] 128 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This is a shim to hopefully allow Github to detect the package, build is done with poetry 4 | 5 | import setuptools 6 | 7 | if __name__ == "__main__": 8 | setuptools.setup(name="cassiopeia") 9 | -------------------------------------------------------------------------------- /test/mixin_tests/mixin_utilities_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file tests the utilities stored in 
cassiopeia/data/utilities.py 3 | """ 4 | 5 | import unittest 6 | 7 | import pandas as pd 8 | 9 | from cassiopeia.mixins import utilities 10 | 11 | 12 | class TestMixinUtilities(unittest.TestCase): 13 | def test_is_ambiguous_state(self): 14 | self.assertTrue(utilities.is_ambiguous_state((1, 2))) 15 | self.assertFalse(utilities.is_ambiguous_state(1)) 16 | 17 | def test_unravel_states(self): 18 | state_array = [0, (1, 2), 3, 4, 5] 19 | self.assertListEqual( 20 | [0, 1, 2, 3, 4, 5], utilities.unravel_ambiguous_states(state_array) 21 | ) 22 | 23 | state_array = [0, 1, 2, 3, 4, 5] 24 | self.assertListEqual( 25 | [0, 1, 2, 3, 4, 5], utilities.unravel_ambiguous_states(state_array) 26 | ) 27 | 28 | def test_find_duplicated_character_states(self): 29 | 30 | character_matrix = pd.DataFrame.from_dict( 31 | { 32 | "c1": [(5, 1), 0, 1, 2, 0], 33 | "c2": [(5, 1), 0, 1, 2, 0], 34 | "c3": [4, 0, 3, 2, -1], 35 | "c4": [-1, 4, 0, 2, 2], 36 | "c5": [0, 4, 1, 2, 2], 37 | "c6": [4, 0, 0, 2, (2, 1)], 38 | "c6_dup": [4, 0, 0, 2, (1, 2)], 39 | }, 40 | orient="index", 41 | columns=["a", "b", "c", "d", "e"], 42 | ) 43 | 44 | duplicated_mappings = utilities.find_duplicate_groups(character_matrix) 45 | 46 | expected_entries = [('c1', ('c1', 'c2')), 47 | ('c6', ('c6', 'c6_dup'))] 48 | 49 | for k, grp in expected_entries: 50 | self.assertIn(k, list(duplicated_mappings.keys())) 51 | self.assertSetEqual(set(grp), set(duplicated_mappings[k])) 52 | 53 | 54 | if __name__ == "__main__": 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /test/plotting_tests/local_3d_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | 4 | import numpy as np 5 | 6 | import cassiopeia as cas 7 | from cassiopeia.plotting import local_3d 8 | 9 | 10 | class TestLocal3DPlotting(unittest.TestCase): 11 | def setUp(self): 12 | np.random.seed(0) 13 | simulator = 
cas.sim.CompleteBinarySimulator(num_cells=8) 14 | self.tree = simulator.simulate_tree() 15 | 16 | spatial_simulator = cas.sim.ClonalSpatialDataSimulator((10, 10)) 17 | spatial_simulator.overlay_data(self.tree) 18 | 19 | self.labels = local_3d.labels_from_coordinates(self.tree) 20 | 21 | @pytest.mark.spatial 22 | def test_interpolate_branch(self): 23 | parent = (0, 0, 0) 24 | child = (1, 1, 1) 25 | np.testing.assert_array_equal( 26 | [[0, 0, 0], [1, 1, 0], [1, 1, 1]], 27 | local_3d.interpolate_branch(parent, child), 28 | ) 29 | 30 | @pytest.mark.spatial 31 | def test_polyline_from_points(self): 32 | points = np.array( 33 | [ 34 | [0, 0, 0], 35 | [1, 1, 1], 36 | [1, 1, 0], 37 | ] 38 | ) 39 | poly = local_3d.polyline_from_points(points) 40 | np.testing.assert_array_equal(points, poly.points) 41 | 42 | @pytest.mark.spatial 43 | def test_average_mixing(self): 44 | c1 = (0, 0, 0) 45 | c2 = (0.1, 0.2, 0.3) 46 | c3 = (0.5, 0.7, 0.0) 47 | np.testing.assert_allclose( 48 | (0.2, 0.3, 0.1), local_3d.average_mixing(c1, c2, c3) 49 | ) 50 | 51 | @pytest.mark.spatial 52 | def test_highlight(self): 53 | c = (0.8, 0.2, 0.0) 54 | np.testing.assert_allclose((1.0, 0.25, 0.0), local_3d.highlight(c)) 55 | 56 | @pytest.mark.spatial 57 | def test_lowlight(self): 58 | c = (0.8, 0.2, 0.0) 59 | np.testing.assert_allclose((0.3, 0.075, 0.0), local_3d.lowlight(c)) 60 | 61 | @pytest.mark.spatial 62 | def test_labels_from_coordinates(self): 63 | # invalid shape 64 | with self.assertRaises(ValueError): 65 | local_3d.labels_from_coordinates(self.tree, shape=(10,10,10)) 66 | with self.assertRaises(ValueError): 67 | local_3d.labels_from_coordinates(self.tree, shape=("10","10")) 68 | with self.assertRaises(ValueError): 69 | local_3d.labels_from_coordinates(self.tree, shape=(-1,10)) 70 | # invalid attribute 71 | with self.assertRaises(ValueError): 72 | local_3d.labels_from_coordinates(self.tree, attribute_key="foo") 73 | # edits tree metadata 74 | for leaf in self.tree.leaves: 75 | x, y = 
self.tree.get_attribute(leaf, "spatial") 76 | self.assertEqual( 77 | self.labels[int(x), int(y)], 78 | self.tree.cell_meta["spatial_label"][leaf], 79 | ) 80 | # not square 81 | labels = local_3d.labels_from_coordinates(self.tree, shape=(1000, 500)) 82 | self.assertEqual(labels.shape, (1000, 500)) 83 | # dense spatial positions 84 | dense_tree = self.tree.copy() 85 | spatial_simulator = cas.sim.ClonalSpatialDataSimulator((1,1)) 86 | spatial_simulator.overlay_data(dense_tree) 87 | labels = local_3d.labels_from_coordinates(dense_tree, shape=(100, 100)) 88 | 89 | @pytest.mark.spatial 90 | def test_Tree3D(self): 91 | # There isn't a good way to test this, other than making sure there 92 | # are no errors on initialization. 93 | tree3d = local_3d.Tree3D(self.tree, self.labels) 94 | tree3d.plot(show=False) 95 | # without labels 96 | tree3d = local_3d.Tree3D(self.tree) 97 | tree3d.plot(show=False) 98 | 99 | if __name__ == "__main__": 100 | unittest.main() 101 | -------------------------------------------------------------------------------- /test/preprocess_tests/align_sequence_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the sequence alignment in pipeline.py. 
3 | """ 4 | import unittest 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import cassiopeia 10 | 11 | 12 | class TestAlignSequence(unittest.TestCase): 13 | def setUp(self): 14 | 15 | self.queries = pd.DataFrame.from_dict( 16 | { 17 | "cellBC": ["A", "A", "A", "B", "B", "C", "C", "C"], 18 | "UMI": ["1", "2", "3", "1", "2", "1", "2", "3"], 19 | "readCount": [20, 30, 30, 40, 40, 10, 10, 15], 20 | "seq": [ 21 | "AACCTTGG", 22 | "ACTG", 23 | "AACCTTGGACTGCATCG", 24 | "AATTAA", 25 | "ACTGGACT", 26 | "AACCTTGGGG", 27 | "AAAAAAAAAAA", 28 | "TACTCTATA", 29 | ], 30 | } 31 | ) 32 | self.queries["readName"] = self.queries.apply( 33 | lambda x: "_".join([x.cellBC, x.UMI, str(x.readCount)]), axis=1 34 | ) 35 | 36 | self.reference = "AACCTTGG" 37 | 38 | def test_alignment_dataframe_structure(self): 39 | 40 | aln_df = cassiopeia.pp.align_sequences( 41 | self.queries, 42 | ref=self.reference, 43 | gap_open_penalty=20, 44 | gap_extend_penalty=1, 45 | n_threads=2, 46 | ) 47 | 48 | self.assertEqual(aln_df.shape[0], self.queries.shape[0]) 49 | 50 | for cellBC in self.queries["cellBC"].unique(): 51 | self.assertIn(cellBC, aln_df["cellBC"].unique()) 52 | 53 | expected_columns = [ 54 | "cellBC", 55 | "UMI", 56 | "AlignmentScore", 57 | "CIGAR", 58 | "QueryBegin", 59 | "ReferenceBegin", 60 | "Seq", 61 | "readName", 62 | "readCount", 63 | ] 64 | 65 | for column in expected_columns: 66 | self.assertIn(column, aln_df.columns) 67 | 68 | def test_extremely_large_gap_open_penalty(self): 69 | 70 | aln_df = cassiopeia.pp.align_sequences( 71 | self.queries, 72 | ref=self.reference, 73 | gap_open_penalty=255, 74 | gap_extend_penalty=1, 75 | ) 76 | 77 | # since the gap open penalty is so large, enforce that 78 | # no gaps should occur 79 | for ind, row in aln_df.iterrows(): 80 | 81 | self.assertNotIn("D", row.CIGAR) 82 | self.assertNotIn("I", row.CIGAR) 83 | 84 | def test_default_alignment_works(self): 85 | 86 | aln_df = cassiopeia.pp.align_sequences( 87 | self.queries, 88 | 
ref=self.reference, 89 | gap_open_penalty=2, 90 | gap_extend_penalty=1, 91 | ) 92 | 93 | expected_alignments = { 94 | "A_1_20": ("8M", 40), 95 | "A_2_30": ("2M2D2M", 17), 96 | "A_3_30": ("8M", 40), 97 | "B_1_40": ("2M2D2M", 17), 98 | "B_2_40": ("2M2D3M", 22), 99 | "C_1_10": ("8M", 40), 100 | "C_2_10": ("2M", 10), 101 | "C_3_15": ("2M1I2M1I1M", 21), 102 | } 103 | 104 | for read_name in aln_df["readName"].unique(): 105 | 106 | expected_cigar = expected_alignments[read_name][0] 107 | expected_score = expected_alignments[read_name][1] 108 | 109 | self.assertEqual( 110 | aln_df.loc[aln_df["readName"] == read_name, "CIGAR"].iloc[0], 111 | expected_cigar, 112 | ) 113 | self.assertEqual( 114 | aln_df.loc[aln_df["readName"] == read_name, "AlignmentScore"].iloc[0], 115 | expected_score, 116 | ) 117 | 118 | def test_global_alignment(self): 119 | 120 | aln_df = cassiopeia.pp.align_sequences( 121 | self.queries, 122 | ref=self.reference, 123 | gap_open_penalty=2, 124 | gap_extend_penalty=1, 125 | method="global", 126 | ) 127 | 128 | expected_alignments = { 129 | "A_1_20": ("8M", 40), 130 | "A_2_30": ("1M2D2M1D1M1D", 15), 131 | "A_3_30": ("8M9I", 40), 132 | "B_1_40": ("2M2D2M2D2I", 14), 133 | "B_2_40": ("1M2D2M1D2M3I", 20), 134 | "C_1_10": ("8M2I", 40), 135 | "C_2_10": ("2M6D9I", 3), 136 | "C_3_15": ("1I1M1D1M1I2M1I1M1I2D", 15), 137 | } 138 | 139 | for read_name in aln_df["readName"].unique(): 140 | 141 | expected_cigar = expected_alignments[read_name][0] 142 | expected_score = expected_alignments[read_name][1] 143 | 144 | self.assertEqual( 145 | aln_df.loc[aln_df["readName"] == read_name, "CIGAR"].iloc[0], 146 | expected_cigar, 147 | ) 148 | self.assertEqual( 149 | aln_df.loc[aln_df["readName"] == read_name, "AlignmentScore"].iloc[0], 150 | expected_score, 151 | ) 152 | 153 | 154 | if __name__ == "__main__": 155 | unittest.main() 156 | -------------------------------------------------------------------------------- 
/test/preprocess_tests/error_correct_cellbcs_to_whitelist_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for correcting raw barcodes to a whitelist pipeline.py 3 | """ 4 | import os 5 | import unittest 6 | import tempfile 7 | 8 | import pysam 9 | import ngs_tools as ngs 10 | 11 | from cassiopeia.preprocess import pipeline 12 | 13 | 14 | class TestErrorCorrectCellBCsToWhitelist(unittest.TestCase): 15 | def setUp(self): 16 | dir_path = os.path.dirname(os.path.realpath(__file__)) 17 | test_files_path = os.path.join(dir_path, "test_files") 18 | 19 | self.bam_10xv3_fp = os.path.join(test_files_path, "10xv3_unmapped.bam") 20 | self.whitelist_10xv3_fp = os.path.join( 21 | test_files_path, "10xv3_whitelist.txt" 22 | ) 23 | self.whitelist_10xv3 = ["TACGTCATCTCCTACG", "TTAGATCGTTAGAAAG"] 24 | self.bam_slideseq2_fp = os.path.join( 25 | test_files_path, "slideseq2_unmapped.bam" 26 | ) 27 | self.whitelist_slideseq2_fp = os.path.join( 28 | test_files_path, "slideseq2_whitelist.txt" 29 | ) 30 | self.whitelist_slideseq2 = ["CTTTGNTCAAAGTT"] 31 | 32 | def test_10xv3(self): 33 | bam_fp = pipeline.error_correct_cellbcs_to_whitelist( 34 | self.bam_10xv3_fp, self.whitelist_10xv3_fp, tempfile.mkdtemp() 35 | ) 36 | with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f: 37 | alignments = list(f.fetch(until_eof=True)) 38 | self.assertEqual(2, len(alignments)) 39 | self.assertEqual( 40 | ["TACGTCATCTCCTACG", "TTAGATCGTTAGAAAG"], 41 | [al.get_tag("CB") for al in alignments], 42 | ) 43 | 44 | def test_10xv3_whitelist_list(self): 45 | bam_fp = pipeline.error_correct_cellbcs_to_whitelist( 46 | self.bam_10xv3_fp, self.whitelist_10xv3, tempfile.mkdtemp() 47 | ) 48 | with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f: 49 | alignments = list(f.fetch(until_eof=True)) 50 | self.assertEqual(2, len(alignments)) 51 | self.assertEqual( 52 | ["TACGTCATCTCCTACG", "TTAGATCGTTAGAAAG"], 53 | [al.get_tag("CB") for al in alignments], 54 
| ) 55 | 56 | def test_slideseq2(self): 57 | bam_fp = pipeline.error_correct_cellbcs_to_whitelist( 58 | self.bam_slideseq2_fp, 59 | self.whitelist_slideseq2_fp, 60 | tempfile.mkdtemp(), 61 | ) 62 | with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f: 63 | alignments = list(f.fetch(until_eof=True)) 64 | self.assertEqual(2, len(alignments)) 65 | self.assertEqual([True, False], [al.has_tag("CB") for al in alignments]) 66 | self.assertEqual("CTTTGNTCAAAGTT", alignments[0].get_tag("CB")) 67 | 68 | def test_slideseq2_whitelist_list(self): 69 | bam_fp = pipeline.error_correct_cellbcs_to_whitelist( 70 | self.bam_slideseq2_fp, self.whitelist_slideseq2, tempfile.mkdtemp() 71 | ) 72 | with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f: 73 | alignments = list(f.fetch(until_eof=True)) 74 | self.assertEqual(2, len(alignments)) 75 | self.assertEqual([True, False], [al.has_tag("CB") for al in alignments]) 76 | self.assertEqual("CTTTGNTCAAAGTT", alignments[0].get_tag("CB")) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /test/preprocess_tests/error_correct_intbcs_to_whitelist_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import cassiopeia 8 | 9 | 10 | class TestErrorCorrectIntBCstoWhitelist(unittest.TestCase): 11 | def setUp(self): 12 | dir_path = os.path.dirname(os.path.realpath(__file__)) 13 | test_files_path = os.path.join(dir_path, "test_files") 14 | self.whitelist_fp = os.path.join(test_files_path, "intbc_whitelist.txt") 15 | self.whitelist = ["ACTT", "TAAG"] 16 | 17 | self.multi_case = pd.DataFrame.from_dict( 18 | { 19 | "cellBC": [ 20 | "A", 21 | "A", 22 | "A", 23 | "B", 24 | "B", 25 | "C", 26 | "C", 27 | "C", 28 | "C", 29 | "D", 30 | "D", 31 | ], 32 | "UMI": [ 33 | "AACCT", 34 | "AACCG", 35 | "AACCC", 36 | "AACCT", 37 | 
"AACCG", 38 | "AACCT", 39 | "AACCG", 40 | "AAGGA", 41 | "AACCT", 42 | "AACCT", 43 | "AAGGG", 44 | ], 45 | "readCount": [20, 30, 30, 40, 50, 10, 10, 15, 10, 10, 10], 46 | "Seq": [ 47 | "AACCTTGG", 48 | "AACCTTGG", 49 | "AACCTTCC", 50 | "AACCTTGG", 51 | "AACCTTGC", 52 | "AACCTTCC", 53 | "AACCTTCG", 54 | "AACCTCAG", 55 | "AACCTTGG", 56 | "AACCTTGG", 57 | "AACCTAAA", 58 | ], 59 | "intBC": [ 60 | "ACTT", 61 | "AAGG", 62 | "ACTA", 63 | "AAGN", 64 | "TACT", 65 | "TAAG", 66 | "TNNG", 67 | "ANNN", 68 | "GCTT", 69 | "NNNN", 70 | "AAAA", 71 | ], 72 | "r1": ["1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"], 73 | "r2": ["2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"], 74 | "r3": ["3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"], 75 | "AlignmentScore": [ 76 | "20", 77 | "20", 78 | "20", 79 | "20", 80 | "20", 81 | "20", 82 | "20", 83 | "20", 84 | "20", 85 | "20", 86 | "20", 87 | ], 88 | "CIGAR": [ 89 | "NA", 90 | "NA", 91 | "NA", 92 | "NA", 93 | "NA", 94 | "NA", 95 | "NA", 96 | "NA", 97 | "NA", 98 | "NA", 99 | "NA", 100 | ], 101 | } 102 | ) 103 | self.multi_case["readName"] = self.multi_case.apply( 104 | lambda x: "_".join([x.cellBC, x.UMI, str(x.readCount)]), axis=1 105 | ) 106 | 107 | self.multi_case["allele"] = self.multi_case.apply( 108 | lambda x: "_".join([x.r1, x.r2, x.r3]), axis=1 109 | ) 110 | self.corrections = { 111 | "ACTT": "ACTT", 112 | "TAAG": "TAAG", 113 | "ACTA": "ACTT", 114 | "TNNG": "TAAG", 115 | "ANNN": "ACTT", 116 | } 117 | 118 | def test_correct(self): 119 | 120 | df = cassiopeia.pp.error_correct_intbcs_to_whitelist( 121 | self.multi_case, self.whitelist_fp, intbc_dist_thresh=1 122 | ) 123 | expected_df = self.multi_case.copy() 124 | expected_df["intBC"] = expected_df["intBC"].map(self.corrections) 125 | expected_df.dropna(subset=["intBC"], inplace=True) 126 | 127 | pd.testing.assert_frame_equal(df, expected_df) 128 | 129 | def test_correct_whitelist_list(self): 130 | 131 | df = cassiopeia.pp.error_correct_intbcs_to_whitelist( 132 | 
self.multi_case, self.whitelist, intbc_dist_thresh=1 133 | ) 134 | expected_df = self.multi_case.copy() 135 | expected_df["intBC"] = expected_df["intBC"].map(self.corrections) 136 | expected_df.dropna(subset=["intBC"], inplace=True) 137 | 138 | pd.testing.assert_frame_equal(df, expected_df) 139 | 140 | 141 | if __name__ == "__main__": 142 | unittest.main() 143 | -------------------------------------------------------------------------------- /test/preprocess_tests/filter_bam_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for correcting raw barcodes to a whitelist pipeline.py 3 | """ 4 | import os 5 | import unittest 6 | import tempfile 7 | 8 | import pysam 9 | import ngs_tools as ngs 10 | 11 | from cassiopeia.preprocess import pipeline 12 | 13 | 14 | class TestFilterBam(unittest.TestCase): 15 | def setUp(self): 16 | dir_path = os.path.dirname(os.path.realpath(__file__)) 17 | test_files_path = os.path.join(dir_path, "test_files") 18 | 19 | self.bam_10xv3_fp = os.path.join(test_files_path, "10xv3_unmapped.bam") 20 | 21 | def test_filter(self): 22 | bam_fp = pipeline.filter_bam(self.bam_10xv3_fp, tempfile.mkdtemp(), 10) 23 | with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f: 24 | alignments = list(f.fetch(until_eof=True)) 25 | self.assertEqual(len(alignments), 2) 26 | 27 | bam_fp = pipeline.filter_bam(self.bam_10xv3_fp, tempfile.mkdtemp(), 20) 28 | with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f: 29 | alignments = list(f.fetch(until_eof=True)) 30 | self.assertEqual(len(alignments), 0) 31 | 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /test/preprocess_tests/resolve_umi_sequence_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the UMI Resolution module in pipeline.py. 
3 | """ 4 | import os 5 | import unittest 6 | 7 | import shutil 8 | import tempfile 9 | 10 | import pandas as pd 11 | from cassiopeia.preprocess import pipeline 12 | 13 | 14 | class TestResolveUMISequence(unittest.TestCase): 15 | def setUp(self): 16 | 17 | collapsed_umi_table_dict = { 18 | "cellBC": [ 19 | "cell1", 20 | "cell1", 21 | "cell1", 22 | "cell2", 23 | "cell2", 24 | "cell3", 25 | "cell3", 26 | ], 27 | "UMI": ["UMIA", "UMIA", "UMIC", "UMIA", "UMIB", "UMIA", "UMIB"], 28 | "readCount": [9, 20, 11, 2, 1, 40, 30], 29 | "grpFlag": [0, 0, 0, 0, 0, 0, 0], 30 | "seq": [ 31 | "AATCCG", 32 | "AAGGTT", 33 | "CCATTA", 34 | "ATACTG", 35 | "GGGAAT", 36 | "TTTCCTT", 37 | "CCAATTG", 38 | ], 39 | "qual": [ 40 | "FFFFFF", 41 | "FFFFFF", 42 | "FFFFFF", 43 | "FFFFFF", 44 | "FFFFFF", 45 | "FFFFFF", 46 | "FFFFFF", 47 | ], 48 | "readName": [ 49 | "cell1_UMIA_9_0", 50 | "cell1_UMIA_20_0", 51 | "cell1_UMIC_11_0", 52 | "cell2_UMIA_2", 53 | "cell2_UMIB_1", 54 | "cell3_UMIA_40", 55 | "cell3_UMIB_30", 56 | ], 57 | } 58 | self.collapsed_umi_table = pd.DataFrame.from_dict( 59 | collapsed_umi_table_dict 60 | ) 61 | 62 | # set up temporary directory 63 | self.temporary_directory = tempfile.mkdtemp() 64 | 65 | def test_resolve_umi(self): 66 | 67 | resolved_mt = pipeline.resolve_umi_sequence( 68 | self.collapsed_umi_table, self.temporary_directory, min_umi_per_cell=1, plot=False 69 | ) 70 | 71 | # check that cell1-UMIA was selected correctly 72 | expected_seq = "AAGGTT" 73 | observed_seq = resolved_mt.loc[ 74 | resolved_mt["readName"] == "cell1_UMIA_20_0", "seq" 75 | ].values 76 | self.assertEqual(expected_seq, observed_seq) 77 | 78 | # check that cell2 was filtered 79 | self.assertNotIn("cell2", resolved_mt["cellBC"].unique()) 80 | 81 | # check that cell3 didn't lose UMIs 82 | self.assertEqual( 83 | 2, resolved_mt[resolved_mt["cellBC"] == "cell3"].shape[0] 84 | ) 85 | 86 | # check expected reads 87 | expected = {"cell1": 31, "cell3": 70} 88 | for n, g in resolved_mt.groupby("cellBC"): 89 | 
90 | self.assertEqual(expected[n], g["readCount"].sum()) 91 | 92 | def test_filter_by_reads(self): 93 | 94 | resolved_mt = pipeline.resolve_umi_sequence( 95 | self.collapsed_umi_table, 96 | self.temporary_directory, 97 | min_avg_reads_per_umi=30, 98 | min_umi_per_cell=1, 99 | plot=True, 100 | ) 101 | 102 | expected_cells = ["cell3"] 103 | expected_removed_cells = ["cell1", "cell2"] 104 | 105 | # print(expected_cells) 106 | 107 | for cell in expected_cells: 108 | self.assertIn(cell, resolved_mt["cellBC"].unique()) 109 | 110 | for cell in expected_removed_cells: 111 | self.assertNotIn(cell, resolved_mt["cellBC"].unique()) 112 | 113 | def tearDown(self): 114 | 115 | shutil.rmtree(self.temporary_directory) 116 | 117 | 118 | if __name__ == "__main__": 119 | unittest.main() 120 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/10xv3_1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/10xv3_1.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/10xv3_2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/10xv3_2.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/10xv3_unmapped.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/10xv3_unmapped.bam -------------------------------------------------------------------------------- 
/test/preprocess_tests/test_files/10xv3_whitelist.txt: -------------------------------------------------------------------------------- 1 | TACGTCATCTCCTACG 2 | TTAGATCGTTAGAAAG 3 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/basic_grouping.csv: -------------------------------------------------------------------------------- 1 | cellBC intBC allele r1 r2 r3 lineageGrp UMI readCount Sample 2 | A XX 1_2_3 1 2 3 1 3 70 A 3 | A YZ 1_2_3 1 2 3 1 1 40 A 4 | B XX 1_2_3 1 2 3 1 1 10 B 5 | B YZ 1_2_3 1 2 3 1 1 110 B 6 | C XY 1_2_2 1 2 2 2 1 10 C 7 | C XZ 1_2_2 1 2 2 2 1 10 C 8 | C YX 1_2_3 1 2 3 2 1 15 C 9 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/clustered_intbc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/clustered_intbc.png -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/collapse_header_required.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/collapse_header_required.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/collapse_header_required.collapsed.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/collapse_header_required.collapsed.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/doublet.csv: 
-------------------------------------------------------------------------------- 1 | cellBC intBC allele r1 r2 r3 lineageGrp UMI readCount Sample 2 | A XX 1_2_3 1 2 3 1 2 40 A 3 | B XX 1_2_3 1 2 3 1 2 70 B 4 | D XY 1_2_3 1 2 3 2 2 35 D 5 | E XY 1_2_3 1 2 3 2 2 20 E 6 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/filter_and_reassign.csv: -------------------------------------------------------------------------------- 1 | cellBC intBC allele r1 r2 r3 lineageGrp UMI readCount Sample 2 | A XX 1_2_3 1 2 3 1 1 30 A 3 | B XX 1_2_3 1 2 3 1 1 40 B 4 | C XX 1_2_3 1 2 3 1 1 10 C 5 | D XX 1_2_3 1 2 3 1 1 20 D 6 | E XZ 1_2_3 1 2 3 2 2 20 E 7 | F XZ 1_2_3 1 2 3 2 2 20 F 8 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/indropsv3_1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/indropsv3_1.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/indropsv3_2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/indropsv3_2.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/indropsv3_3.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/indropsv3_3.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/intbc_whitelist.txt: 
-------------------------------------------------------------------------------- 1 | ACTT 2 | TAAG 3 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/lineageGrp_piv_heatmaps/lg_1_piv_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/lineageGrp_piv_heatmaps/lg_1_piv_heatmap.png -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/reassign.csv: -------------------------------------------------------------------------------- 1 | cellBC intBC allele r1 r2 r3 lineageGrp UMI readCount Sample 2 | A XX 1_2_3 1 2 3 1 2 40 A 3 | B XX 1_2_3 1 2 3 1 2 70 B 4 | C XX 1_2_3 1 2 3 1 2 120 C 5 | D XX 1_2_3 1 2 3 1 1 20 D 6 | D YZ 1_2_3 1 2 3 1 1 15 D 7 | E XZ 1_2_3 1 2 3 1 1 10 E 8 | E YZ 1_2_3 1 2 3 1 1 10 E 9 | F XZ 1_2_3 1 2 3 1 3 30 F 10 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/slideseq2_1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/slideseq2_1.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/slideseq2_2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/slideseq2_2.fastq.gz -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/slideseq2_unmapped.bam: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/slideseq2_unmapped.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/slideseq2_whitelist.txt: -------------------------------------------------------------------------------- 1 | CTTTGNTCAAAGTT 2 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test_sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_sorted.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test_sorted.bayesian_collapsed.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_sorted.bayesian_collapsed.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test_sorted.collapsed.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_sorted.collapsed.bam -------------------------------------------------------------------------------- 
/test/preprocess_tests/test_files/test_sorted.collapsed.txt: -------------------------------------------------------------------------------- 1 | cellBC UMI readCount grpFlag seq qual readName 2 | CAACCTCGTGGGTATG-1 GATAACATCG 000007 0+ AATCCAGCTAGCTGA @@@@@@@@@@@@@@@ CAACCTCGTGGGTATG-1_GATAACATCG_000007_0+ 3 | CTCACACTCGAATGCT-1 TGGCCTTTAA 000001 0 TATCCAGCTAGCTGA FFFFFFFFFFFFFFF CTCACACTCGAATGCT-1_TGGCCTTTAA_000001_0 4 | CTCACACTCGAATGCT-1 TGGCCTTTAT 000002 0 NATCCAGCTAGCTGA #@@@@@@@@@@@@@@ CTCACACTCGAATGCT-1_TGGCCTTTAT_000002_0 5 | GACCCTCGTGGGTATG-1 GATAACATCG 000003 0 AATCCAGCTAGCTGA @@@@@@@@@@@@@@@ GACCCTCGTGGGTATG-1_GATAACATCG_000003_0 6 | GACCCTCGTGGGTATG-1 GATAACATCG 000003 1 CCGCCAGCTAGCTGA @@@@@@@@@@@@@@@ GACCCTCGTGGGTATG-1_GATAACATCG_000003_1 7 | -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test_uncorrected.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_uncorrected.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test_uncorrected_sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_uncorrected_sorted.bam -------------------------------------------------------------------------------- /test/preprocess_tests/test_files/test_uncorrected_sorted.collapsed.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_uncorrected_sorted.collapsed.bam -------------------------------------------------------------------------------- 
/test/simulator_tests/complete_binary_simulator_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | 4 | from cassiopeia.mixins import TreeSimulatorError 5 | from cassiopeia.simulator import CompleteBinarySimulator 6 | 7 | 8 | class TestCompleteBinarySimulator(unittest.TestCase): 9 | def test_init(self): 10 | with self.assertRaises(TreeSimulatorError): 11 | CompleteBinarySimulator() 12 | 13 | with self.assertRaises(TreeSimulatorError): 14 | CompleteBinarySimulator(num_cells=3) 15 | 16 | with self.assertRaises(TreeSimulatorError): 17 | CompleteBinarySimulator(depth=0) 18 | 19 | simulator = CompleteBinarySimulator(num_cells=4) 20 | self.assertEqual(simulator.depth, 2) 21 | 22 | def test_simulate_tree(self): 23 | tree = CompleteBinarySimulator(depth=2).simulate_tree() 24 | 25 | self.assertEqual( 26 | set(tree.nodes), {"0", "1", "2", "3", "4", "5", "6", "7"} 27 | ) 28 | self.assertEqual(set(tree.leaves), {"4", "5", "6", "7"}) 29 | self.assertEqual( 30 | set(tree.edges), 31 | { 32 | ("0", "1"), 33 | ("1", "2"), 34 | ("1", "3"), 35 | ("2", "4"), 36 | ("2", "5"), 37 | ("3", "6"), 38 | ("3", "7"), 39 | }, 40 | ) 41 | 42 | # Test branch lengths 43 | self.assertEqual( 44 | tree.get_times(), 45 | { 46 | "0": 0.0, 47 | "1": 1 / 3, 48 | "2": 2 / 3, 49 | "3": 2 / 3, 50 | "4": 1.0, 51 | "5": 1.0, 52 | "6": 1.0, 53 | "7": 1.0, 54 | }, 55 | ) 56 | -------------------------------------------------------------------------------- /test/simulator_tests/simple_fit_subclone_simulator_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | 4 | from cassiopeia.simulator import SimpleFitSubcloneSimulator 5 | 6 | 7 | class TestSimpleFitSubcloneSimulator(unittest.TestCase): 8 | def test_deterministic(self): 9 | r""" 10 | Small test that can be drawn by hand. 11 | Checks that the generated phylogeny is correct. 
12 | """ 13 | tree = SimpleFitSubcloneSimulator( 14 | branch_length_neutral=1, 15 | branch_length_fit=0.5, 16 | experiment_duration=1.9, 17 | generations_until_fit_subclone=1, 18 | ).simulate_tree() 19 | self.assertListEqual( 20 | tree.nodes, 21 | ["0_neutral", "1_neutral", "2_fit", "3_neutral", "4_fit", "5_fit"], 22 | ) 23 | self.assertListEqual( 24 | tree.edges, 25 | [ 26 | ("0_neutral", "1_neutral"), 27 | ("1_neutral", "2_fit"), 28 | ("1_neutral", "3_neutral"), 29 | ("2_fit", "4_fit"), 30 | ("2_fit", "5_fit"), 31 | ], 32 | ) 33 | self.assertDictEqual( 34 | tree.get_times(), 35 | { 36 | "0_neutral": 0.0, 37 | "1_neutral": 1.0, 38 | "2_fit": 1.5, 39 | "3_neutral": 1.9, 40 | "4_fit": 1.9, 41 | "5_fit": 1.9, 42 | }, 43 | ) 44 | 45 | def test_stochastic(self): 46 | r""" 47 | We test the functionality that allows providing a callable for branch 48 | lengths. Because the test is stochastic, we don't assert anything 49 | besides the branch lengths being all different. 50 | """ 51 | np.random.seed(1) 52 | 53 | def branch_length_neutral() -> float: 54 | return np.random.exponential(1.0) 55 | 56 | def branch_length_fit() -> float: 57 | return np.random.exponential(0.5) 58 | 59 | tree = SimpleFitSubcloneSimulator( 60 | branch_length_neutral=branch_length_neutral, 61 | branch_length_fit=branch_length_fit, 62 | experiment_duration=4.9, 63 | generations_until_fit_subclone=2, 64 | ).simulate_tree() 65 | # Just check that all branch lengths are distinct to confirm 66 | # non-determinism. We exclude the leaves because sister leaves have the 67 | # same branch length. 
68 | branch_lengths = [ 69 | tree.get_branch_length(p, c) 70 | for (p, c) in tree.edges 71 | if not tree.is_leaf(c) 72 | ] 73 | assert len(branch_lengths) == len(set(branch_lengths)) 74 | -------------------------------------------------------------------------------- /test/tools_tests/autocorrelation_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for the autocorrelation functions in 3 | cassiopeia/tools/autocorrelation.py 4 | """ 5 | import unittest 6 | 7 | import networkx as nx 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import cassiopeia as cas 12 | from cassiopeia.mixins.errors import AutocorrelationError 13 | from cassiopeia.tools.autocorrelation import compute_morans_i 14 | 15 | 16 | class TestAutocorrelation(unittest.TestCase): 17 | def setUp(self) -> None: 18 | 19 | tree = nx.DiGraph() 20 | tree.add_nodes_from(["A", "B", "C", "D", "E", "F"]) 21 | tree.add_edge("F", "A", length=0.1) 22 | tree.add_edge("F", "B", length=0.2) 23 | tree.add_edge("F", "E", length=0.5) 24 | tree.add_edge("E", "C", length=0.3) 25 | tree.add_edge("E", "D", length=0.4) 26 | 27 | self.basic_tree = cas.data.CassiopeiaTree(tree=tree) 28 | 29 | example_obs = pd.DataFrame.from_dict( 30 | { 31 | "nUMI": [10, 10, 3, 3], 32 | "GeneX": [3, 5, 10, 2], 33 | "GeneY": [30, 30, 1, 1], 34 | }, 35 | orient="index", 36 | columns=["A", "B", "C", "D"], 37 | ).T 38 | 39 | self.X = example_obs 40 | 41 | def test_simple_moran_single_variable(self): 42 | """ 43 | Tests Moran's I, comparing values gotten from the function implemented 44 | in Chaligne et al, Nat Genetics 2021 45 | """ 46 | 47 | I = cas.tl.compute_morans_i( 48 | self.basic_tree, X=pd.DataFrame(self.X["nUMI"]) 49 | ) 50 | 51 | self.assertAlmostEqual(I, 0.084456, delta=0.001) 52 | 53 | def test_moran_bivariate(self): 54 | """ 55 | Statistics compared to the function implemented in Chaligne et al, 56 | Nat Gen 2021 57 | """ 58 | I = 
cas.tl.compute_morans_i(self.basic_tree, X=self.X) 59 | 60 | expected_correlations = pd.DataFrame.from_dict( 61 | { 62 | "nUMI": [0.08445, -0.00874, 0.08446], 63 | "GeneX": [-0.00874, -0.31810, -0.00874], 64 | "GeneY": [0.08446, -0.00874, 0.08446], 65 | }, 66 | orient="index", 67 | columns=["nUMI", "GeneX", "GeneY"], 68 | ) 69 | 70 | pd.testing.assert_frame_equal( 71 | I, expected_correlations, check_exact=False, atol=0.001 72 | ) 73 | 74 | def test_moran_custom_weights(self): 75 | 76 | W = pd.DataFrame.from_dict( 77 | { 78 | "A": [0, 1 / 2, 1 / 3, 1 / 3], 79 | "B": [1 / 2, 0, 1 / 3, 1 / 3], 80 | "C": [1 / 3, 1 / 3, 0, 1 / 2], 81 | "D": [1 / 3, 1 / 3, 1 / 2, 0], 82 | }, 83 | orient="index", 84 | columns=["A", "B", "C", "D"], 85 | ) 86 | 87 | I = cas.tl.compute_morans_i( 88 | self.basic_tree, X=pd.DataFrame(self.X["nUMI"]), W=W 89 | ) 90 | 91 | self.assertAlmostEqual(I, -0.1428571, delta=0.0001) 92 | 93 | def test_moran_exceptions(self): 94 | 95 | # check typing 96 | string_type_meta = pd.DataFrame( 97 | ["type1", "type2", "type1", "type3"], 98 | index=["A", "B", "C", "D"], 99 | columns=["CellType"], 100 | ) 101 | 102 | X = pd.concat([self.X, string_type_meta]) 103 | 104 | self.assertRaises( 105 | AutocorrelationError, 106 | cas.tl.compute_morans_i, 107 | self.basic_tree, 108 | None, 109 | X, 110 | ) 111 | 112 | # check all leaves are accounted for 113 | new_row = pd.DataFrame.from_dict( 114 | {"E": [5, 5, 5]}, orient="index", columns=["nUMI", "GeneX", "GeneY"] 115 | ) 116 | 117 | X = pd.concat([self.X, new_row], axis=1) 118 | 119 | self.assertRaises( 120 | AutocorrelationError, 121 | cas.tl.compute_morans_i, 122 | self.basic_tree, 123 | None, 124 | X, 125 | ) 126 | 127 | # make sure some data is passed in 128 | self.assertRaises( 129 | AutocorrelationError, 130 | cas.tl.compute_morans_i, 131 | self.basic_tree, 132 | None, 133 | None, 134 | ) 135 | 136 | # make sure weight matrix has the right leaves 137 | W = pd.DataFrame.from_dict( 138 | { 139 | "A": [0, 1 / 2, 1 
/ 3], 140 | "B": [1 / 2, 0, 1 / 3], 141 | "C": [1 / 3, 1 / 3, 0], 142 | }, 143 | orient="index", 144 | columns=["A", "B", "C"], 145 | ) 146 | self.assertRaises( 147 | AutocorrelationError, 148 | cas.tl.compute_morans_i, 149 | self.basic_tree, 150 | None, 151 | self.X, 152 | W 153 | ) 154 | 155 | if __name__ == "__main__": 156 | unittest.main() 157 | -------------------------------------------------------------------------------- /test/tools_tests/fitness_estimator_tests/lbi_jungle_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test LBIJungle in cassiopeia.tools. 3 | """ 4 | import unittest 5 | 6 | import networkx as nx 7 | 8 | from cassiopeia.data import CassiopeiaTree 9 | from cassiopeia.tools import FitnessEstimatorError, LBIJungle 10 | 11 | 12 | class TestLBIJungle(unittest.TestCase): 13 | def test_small_tree(self): 14 | """ 15 | Run LBI jungle on small tree and see that fitness estimates make sense. 16 | """ 17 | tree = nx.DiGraph() 18 | nodes = [ 19 | "root", 20 | "internal-1", 21 | "internal-2", 22 | "internal-3", 23 | "leaf-1", 24 | "leaf-2", 25 | "leaf-3", 26 | "leaf-4", 27 | "leaf-5", 28 | ] 29 | tree.add_nodes_from(nodes) 30 | tree.add_edges_from( 31 | [ 32 | ("root", "internal-1"), 33 | ("internal-1", "internal-2"), 34 | ("internal-1", "internal-3"), 35 | ("internal-2", "leaf-1"), 36 | ("internal-2", "leaf-2"), 37 | ("internal-2", "leaf-3"), 38 | ("internal-3", "leaf-4"), 39 | ("internal-3", "leaf-5"), 40 | ] 41 | ) 42 | tree = CassiopeiaTree(tree=tree) 43 | tree.set_times( 44 | { 45 | "root": 0.0, 46 | "internal-1": 0.25, 47 | "internal-2": 0.5, 48 | "internal-3": 0.5, 49 | "leaf-1": 1.0, 50 | "leaf-2": 1.0, 51 | "leaf-3": 1.0, 52 | "leaf-4": 1.0, 53 | "leaf-5": 1.0, 54 | } 55 | ) 56 | fitness_estimator = LBIJungle() 57 | fitness_estimator.estimate_fitness(tree) 58 | fitness_estimates = { 59 | node: tree.get_attribute(node, "fitness") 60 | for node in nodes 61 | if node != tree.root # LBIJungle 
doesn't report root fitness. 62 | } 63 | # internal node 2 has strictly more branching than internal node 3, so 64 | # fitness estimate should be higher 65 | self.assertGreater( 66 | fitness_estimates["internal-2"], fitness_estimates["internal-3"] 67 | ) 68 | # Leaves 1, 2, 3 should have the same fitness 69 | self.assertAlmostEqual( 70 | fitness_estimates["leaf-1"], fitness_estimates["leaf-2"] 71 | ) 72 | self.assertAlmostEqual( 73 | fitness_estimates["leaf-2"], fitness_estimates["leaf-3"] 74 | ) 75 | # Leaves 4, 5 should have the same fitness 76 | self.assertAlmostEqual( 77 | fitness_estimates["leaf-4"], fitness_estimates["leaf-5"] 78 | ) 79 | # Leaves 1, 2, 3 should have higher fitness than leaves 4, 5 80 | self.assertGreater( 81 | fitness_estimates["leaf-1"], fitness_estimates["leaf-4"] 82 | ) 83 | # Leaves should have lower fitness than their parent (by LBI property) 84 | self.assertGreater( 85 | fitness_estimates["internal-2"], fitness_estimates["leaf-1"] 86 | ) 87 | self.assertGreater( 88 | fitness_estimates["internal-3"], fitness_estimates["leaf-4"] 89 | ) 90 | 91 | def test_raises_error_if_leaf_name_startswith_underscore(self): 92 | """ 93 | Leaf names cannot start with an underscore. 94 | 95 | (This is due to the underlying Jungle implementation we wrap.) 
96 | """ 97 | tree = nx.DiGraph() 98 | nodes = [ 99 | "root", 100 | "_leaf", 101 | ] 102 | tree.add_nodes_from(nodes) 103 | tree.add_edges_from( 104 | [ 105 | ("root", "_leaf"), 106 | ] 107 | ) 108 | tree = CassiopeiaTree(tree=tree) 109 | fitness_estimator = LBIJungle() 110 | with self.assertRaises(FitnessEstimatorError): 111 | fitness_estimator.estimate_fitness(tree) 112 | 113 | if __name__ == "__main__": 114 | unittest.main() 115 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | version = '2.0.0' 2 | 3 | if __name__ == '__main__': 4 | print(version) --------------------------------------------------------------------------------