├── .gitattributes
├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── .readthedocs.yml
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── build.py
├── cassiopeia
    ├── __init__.py
    ├── critique
    │   ├── __init__.py
    │   ├── compare.py
    │   └── critique_utilities.py
    ├── data
    │   ├── CassiopeiaTree.py
    │   ├── Layers.py
    │   ├── __init__.py
    │   └── utilities.py
    ├── mixins
    │   ├── __init__.py
    │   ├── errors.py
    │   ├── logging.py
    │   ├── utilities.py
    │   └── warnings.py
    ├── plotting
    │   ├── __init__.py
    │   ├── itol_utilities.py
    │   ├── local.py
    │   ├── local_3d.py
    │   ├── palettes.py
    │   └── utilities.py
    ├── preprocess
    │   ├── UMI_utils.py
    │   ├── __init__.py
    │   ├── alignment_utilities.py
    │   ├── cassiopeia_preprocess.py
    │   ├── collapse_cython.pyx
    │   ├── constants.py
    │   ├── doublet_utils.py
    │   ├── lineage_utils.py
    │   ├── map_utils.py
    │   ├── pipeline.py
    │   ├── setup_utilities.py
    │   └── utilities.py
    ├── simulator
    │   ├── BirthDeathFitnessSimulator.py
    │   ├── BrownianSpatialDataSimulator.py
    │   ├── Cas9LineageTracingDataSimulator.py
    │   ├── ClonalSpatialDataSimulator.py
    │   ├── CompleteBinarySimulator.py
    │   ├── DataSimulator.py
    │   ├── LeafSubsampler.py
    │   ├── LineageTracingDataSimulator.py
    │   ├── SequentialLineageTracingDataSimulator.py
    │   ├── SimpleFitSubcloneSimulator.py
    │   ├── SpatialDataSimulator.py
    │   ├── SpatialLeafSubsampler.py
    │   ├── SupercellularSampler.py
    │   ├── TreeSimulator.py
    │   ├── UniformLeafSubsampler.py
    │   ├── __init__.py
    │   └── ecDNABirthDeathSimulator.py
    ├── solver
    │   ├── CassiopeiaSolver.py
    │   ├── DistanceSolver.py
    │   ├── GreedySolver.py
    │   ├── HybridSolver.py
    │   ├── ILPSolver.py
    │   ├── MaxCutGreedySolver.py
    │   ├── MaxCutSolver.py
    │   ├── NeighborJoiningSolver.py
    │   ├── PercolationSolver.py
    │   ├── SharedMutationJoiningSolver.py
    │   ├── SpectralGreedySolver.py
    │   ├── SpectralNeighborJoiningSolver.py
    │   ├── SpectralSolver.py
    │   ├── UPGMASolver.py
    │   ├── VanillaGreedySolver.py
    │   ├── __init__.py
    │   ├── dissimilarity_functions.py
    │   ├── graph_utilities.py
    │   ├── ilp_solver_utilities.pyx
    │   ├── missing_data_methods.py
    │   └── solver_utilities.py
    └── tools
    │   ├── __init__.py
    │   ├── autocorrelation.py
    │   ├── branch_length_estimator
    │       ├── BranchLengthEstimator.py
    │       ├── IIDExponentialBayesian.py
    │       ├── IIDExponentialMLE.py
    │       ├── __init__.py
    │       ├── _iid_exponential_bayesian.pxd
    │       ├── _iid_exponential_bayesian.pyx
    │       ├── _iid_exponential_bayesian_cpp.cpp
    │       └── _iid_exponential_bayesian_cpp.h
    │   ├── coupling.py
    │   ├── fitness_estimator
    │       ├── _FitnessEstimator.py
    │       ├── __init__.py
    │       ├── _jungle
    │       │   ├── LICENSE
    │       │   ├── examples
    │       │   │   ├── FitnessScore.ipynb
    │       │   │   ├── SignaturesSelection.ipynb
    │       │   │   ├── Tree_neutral.nwk
    │       │   │   ├── Tree_positive_selection.nwk
    │       │   │   ├── node_features.tsv
    │       │   │   └── node_features_leaves.tsv
    │       │   ├── jungle
    │       │   │   ├── __init__.py
    │       │   │   ├── forest.py
    │       │   │   ├── forest.py.bak
    │       │   │   ├── resources
    │       │   │   │   ├── FitnessInference
    │       │   │   │   │   ├── .gitignore
    │       │   │   │   │   ├── LICENSE
    │       │   │   │   │   ├── README.md
    │       │   │   │   │   ├── __init__.py
    │       │   │   │   │   ├── infer_fitness.py
    │       │   │   │   │   ├── prediction_src
    │       │   │   │   │   │   ├── README
    │       │   │   │   │   │   ├── __init__.py
    │       │   │   │   │   │   ├── ancestral.py
    │       │   │   │   │   │   ├── fitness_inference.py
    │       │   │   │   │   │   ├── node_ranking.py
    │       │   │   │   │   │   ├── sequence_ranking.py
    │       │   │   │   │   │   ├── solve_survival.py
    │       │   │   │   │   │   └── tree_utils.py
    │       │   │   │   │   └── rank_sequences.py
    │       │   │   │   ├── __init__.py
    │       │   │   │   └── betatree
    │       │   │   │   │   ├── LICENSE
    │       │   │   │   │   ├── README
    │       │   │   │   │   ├── __init__.py
    │       │   │   │   │   └── src
    │       │   │   │   │       ├── __init__.py
    │       │   │   │   │       ├── betatree.py
    │       │   │   │   │       ├── sfs.py
    │       │   │   │   │       ├── sfs.py.bak
    │       │   │   │   │       ├── sfs_py3.py
    │       │   │   │   │       └── sfs_py3.py.bak
    │       │   │   ├── sfs.py
    │       │   │   ├── sfs.py.bak
    │       │   │   ├── size_matched_model.py
    │       │   │   ├── tree.py
    │       │   │   └── tree.py.bak
    │       │   ├── reference_data
    │       │   │   ├── generate_annotate_forest.py
    │       │   │   └── generate_annotate_forest.py.bak
    │       │   └── tests
    │       │   │   ├── concat.ipynb
    │       │   │   ├── generate_annotate_forest.sh
    │       │   │   └── size_matched_model.ipynb
    │       └── _lbi_jungle.py
    │   ├── parameter_estimators.py
    │   ├── small_parsimony.py
    │   ├── topology.py
    │   └── tree_metrics.py
├── codecov.yml
├── conftest.py
├── data
    ├── PCT48.ref.fasta
    ├── ccphylo_config.ini
    ├── itolconfig_example
    ├── preprocess.cfg
    ├── preprocess_gestalt.cfg
    └── spatial_preprocess.cfg
├── docs
    ├── Makefile
    ├── _static
    │   ├── computer-24px.svg
    │   ├── css
    │   │   ├── override.css
    │   │   └── sphinx_gallery.css
    │   ├── library_books-24px.svg
    │   ├── logo.png
    │   ├── play_circle_outline-24px.svg
    │   ├── question-mark-svgrepo-com.svg
    │   └── tutorials
    │   │   ├── benchmark.png
    │   │   ├── benchmark.svg
    │   │   ├── local_plotting.png
    │   │   ├── preprocess.png
    │   │   ├── preprocess.svg
    │   │   ├── reconstruct.png
    │   │   └── reconstruct.svg
    ├── _templates
    │   ├── autosummary
    │   │   └── class.rst
    │   └── layout.html
    ├── api
    │   ├── critique.rst
    │   ├── data.rst
    │   ├── index.rst
    │   ├── plotting.rst
    │   ├── preprocess.rst
    │   ├── simulator.rst
    │   ├── solver.rst
    │   └── tools.rst
    ├── authors.rst
    ├── conf.py
    ├── contributing.rst
    ├── extensions
    │   └── typed_returns.py
    ├── index.rst
    ├── installation.rst
    ├── make.bat
    ├── notebooks
    ├── references.rst
    └── user_guide.rst
├── notebooks
    ├── benchmark.ipynb
    ├── data
    │   ├── 3432_NT_T1_alleletable.txt
    │   └── 3432_NT_T1_tree.processed.tree
    ├── local_plotting.ipynb
    ├── preprocess.ipynb
    ├── reconstruct.ipynb
    └── simulate_ecDNA.ipynb
├── pyproject.toml
├── setup.py
├── test
    ├── critique_tests
    │   └── compare_tree_test.py
    ├── data_tests
    │   ├── cassiopeia_tree_test.py
    │   ├── data_utilities_test.py
    │   └── layers_test.py
    ├── mixin_tests
    │   └── mixin_utilities_test.py
    ├── plotting_tests
    │   ├── itol_plotting_test.py
    │   ├── local_3d_test.py
    │   ├── local_test.py
    │   └── utilities_test.py
    ├── preprocess_tests
    │   ├── align_sequence_test.py
    │   ├── call_alleles_test.py
    │   ├── call_lineage_groups_test.py
    │   ├── character_matrix_test.py
    │   ├── collapse_umi_test.py
    │   ├── config_parser_test.py
    │   ├── convert_fastqs_to_unmapped_bam_test.py
    │   ├── error_correct_cellbcs_to_whitelist_test.py
    │   ├── error_correct_intbcs_to_whitelist_test.py
    │   ├── error_correct_umi_test.py
    │   ├── filter_bam_test.py
    │   ├── filter_molecule_table_test.py
    │   ├── resolve_umi_sequence_test.py
    │   └── test_files
    │   │   ├── 10xv3_1.fastq.gz
    │   │   ├── 10xv3_2.fastq.gz
    │   │   ├── 10xv3_unmapped.bam
    │   │   ├── 10xv3_whitelist.txt
    │   │   ├── basic_grouping.csv
    │   │   ├── clustered_intbc.png
    │   │   ├── collapse_header_required.bam
    │   │   ├── collapse_header_required.collapsed.bam
    │   │   ├── doublet.csv
    │   │   ├── filter_and_reassign.csv
    │   │   ├── indropsv3_1.fastq.gz
    │   │   ├── indropsv3_2.fastq.gz
    │   │   ├── indropsv3_3.fastq.gz
    │   │   ├── intbc_whitelist.txt
    │   │   ├── lineageGrp_piv_heatmaps
    │   │       └── lg_1_piv_heatmap.png
    │   │   ├── reassign.csv
    │   │   ├── slideseq2_1.fastq.gz
    │   │   ├── slideseq2_2.fastq.gz
    │   │   ├── slideseq2_unmapped.bam
    │   │   ├── slideseq2_whitelist.txt
    │   │   ├── test.bam
    │   │   ├── test_sorted.bam
    │   │   ├── test_sorted.bayesian_collapsed.bam
    │   │   ├── test_sorted.collapsed.bam
    │   │   ├── test_sorted.collapsed.txt
    │   │   ├── test_uncorrected.bam
    │   │   ├── test_uncorrected_sorted.bam
    │   │   └── test_uncorrected_sorted.collapsed.bam
    ├── simulator_tests
    │   ├── birth_death_simulator_test.py
    │   ├── brownian_spatial_simulator_test.py
    │   ├── cas9_lineage_tracing_simulator_test.py
    │   ├── clonal_spatial_simulator_test.py
    │   ├── complete_binary_simulator_test.py
    │   ├── ecdna_birth_death_simulator_test.py
    │   ├── sequential_lineage_tracing_simulator_test.py
    │   ├── simple_fit_subclone_simulator_test.py
    │   ├── spatial_leaf_subsampler_test.py
    │   ├── supercellular_sampler_test.py
    │   └── unifom_leaf_subsampler_test.py
    ├── solver_tests
    │   ├── ccphylo_solver_test.py
    │   ├── dissimilarity_functions_test.py
    │   ├── greedy_variants_test.py
    │   ├── hybrid_solver_test.py
    │   ├── ilp_solver_test.py
    │   ├── maxcut_test.py
    │   ├── neighborjoining_solver_test.py
    │   ├── percolation_test.py
    │   ├── sharedmutationjoiner_test.py
    │   ├── snj_solver_test.py
    │   ├── spectral_test.py
    │   ├── upgma_test.py
    │   └── vanillagreedy_test.py
    └── tools_tests
    │   ├── autocorrelation_test.py
    │   ├── branch_length_estimator_tests
    │       ├── iid_exponential_bayesian_test.py
    │       └── iid_exponential_mle_test.py
    │   ├── coupling_test.py
    │   ├── fitness_estimator_tests
    │       └── lbi_jungle_test.py
    │   ├── parameter_estimators_test.py
    │   ├── small_parsimony_test.py
    │   ├── topology_test.py
    │   └── tree_metrics_test.py
└── version.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py linguist-language=python
2 | *.ipynb linguist-documentation
3 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: cassiopeia
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [master]
 9 |   pull_request:
10 |     branches: [master]
11 | 
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 |     timeout-minutes: 20
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.8", "3.9", "3.10", "3.11"]
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v2
22 |     - name: Set up Python ${{ matrix.python-version }}
23 |       uses: actions/setup-python@v2
24 |       with:
25 |         python-version: ${{ matrix.python-version }}
26 |     - name: Cache pip
27 |       uses: actions/cache@v2
28 |       with:
29 |           path: ~/.cache/pip
30 |           key: ${{ runner.os }}-pip-v1-${{ hashFiles('**/requirements.txt') }}
31 |           restore-keys: |
32 |             ${{ runner.os }}-pip-v1-
33 |     - name: Install dependencies
34 |       run: |
35 |         pip install pytest-cov
36 |         pip install codecov
37 |         pip install .
38 |     - name: Test with pytest
39 |       run: |
40 |         pytest -vv test/ --cov-report=xml --cov=cassiopeia
41 |     - name: After success
42 |       run: |
43 |         codecov
44 |         pip list
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 | 
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 | 
111 | # SageMath parsed files
112 | *.sage.py
113 | 
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 | 
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 | 
127 | # Rope project settings
128 | .ropeproject
129 | 
130 | # mkdocs documentation
131 | /site
132 | 
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 | 
138 | # Pyre type checker
139 | .pyre/
140 | 
141 | # pytype static type analyzer
142 | .pytype/
143 | 
144 | # Cython debug symbols
145 | cython_debug/
146 | 
147 | # PyCharm
148 | #  JetBrains specific template is maintainted in a separate JetBrains.gitignore that can
149 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
151 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
152 | #.idea/
153 | 
154 | .DS_Store
155 | .idea
156 | *.pyc
157 | *.so
158 | _build
159 | _static
160 | _templates
161 | build
162 | *.egg-info
163 | *.c
164 | stdout.log
165 | notebooks/.ipynb_checkpoints
166 | cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.cpp
167 | docs/api/reference/**
168 | .vscode
169 | cassiopeia/config.ini
170 | environment.yml
171 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | build:
 3 |   os: "ubuntu-22.04"
 4 |   tools:
 5 |     python: "3.9"
 6 | sphinx:
 7 |   configuration: docs/conf.py
 8 | python:
 9 |   install:
10 |     - method: pip
11 |       path: .
12 |       extra_requirements:
13 |       - docs
14 |       - spatial


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | * Matthew Jones <matthew.jones@ucsf.edu>
 6 | * Alex Khodaverdian <alexkhodaverdian@berkeley.edu>
 7 | * Richard Zhang
 8 | * Sebastian Prillo
 9 | * Joseph Min
10 | * Jeffrey Quinn <jeffrey.quinn@ucsf.edu>
11 | * Jeffrey Hussmann <jeffrey.hussmann@ucsf.edu>
12 | * Michelle Chan <michelle.chan@ucsf.edu>
13 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | .. highlight:: shell
  2 | 
  3 | ============
  4 | Contributing
  5 | ============
  6 | 
  7 | Your contributions are much appreciated! Feel free to contribute in one of these ways:
  8 | 
  9 | 
 10 | Types of Contributions
 11 | ----------------------
 12 | 
 13 | Report Bugs
 14 | ~~~~~~~~~~~
 15 | 
 16 | Report bugs at https://github.com/YosefLab/Cassiopeia/issues.
 17 | 
 18 | If you are reporting a bug, please include:
 19 | 
 20 | * Your operating system name and version.
 21 | * Any details about your local setup that might be helpful in troubleshooting.
 22 | * Detailed steps to reproduce the bug.
 23 | 
 24 | Fix Bugs
 25 | ~~~~~~~~
 26 | 
 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
 28 | wanted" is open to whoever wants to implement it.
 29 | 
 30 | Ideally, when you are fixing a bug, please first provide a test that breaks
 31 | due to the bug. Your contributed code should then fix this test.
 32 | 
 33 | Implement Features
 34 | ~~~~~~~~~~~~~~~~~~
 35 | 
 36 | Look through the GitHub issues for features. Anything tagged with "enhancement"
 37 | and "help wanted" is open to whoever wants to implement it.
 38 | 
 39 | Write Documentation
 40 | ~~~~~~~~~~~~~~~~~~~
 41 | 
 42 | Cassiopeia could always use more documentation, whether as part of the
 43 | official Cassiopeia docs, in docstrings, or even on the web in blog posts,
 44 | articles, and such.
 45 | 
 46 | Submit Feedback
 47 | ~~~~~~~~~~~~~~~
 48 | 
 49 | The best way to send feedback is to file an issue at https://github.com/YosefLab/Cassiopeia/issues.
 50 | 
 51 | If you are proposing a feature:
 52 | 
 53 | * Explain in detail how it would work.
 54 | * Keep the scope as narrow as possible, to make it easier to implement.
 55 | * Remember that this is a volunteer-driven project, and that contributions
 56 |   are welcome
 57 | 
 58 | Get Started!
 59 | ------------
 60 | 
 61 | Ready to contribute? Here's how to set up `cassiopeia` for local development.
 62 | 
 63 | 1. Fork the `cassiopeia` repo on GitHub.
 64 | 2. Clone your fork locally::
 65 | 
 66 |     $ git clone git@github.com:your_name_here/Cassiopeia.git
 67 | 
 68 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
 69 | 
 70 |     $ mkvirtualenv cassiopeia
 71 |     $ cd cassiopeia/
 72 |     $ make install
 73 | 
 74 | 4. Create a branch for local development::
 75 | 
 76 |     $ git checkout -b name-of-your-bugfix-or-feature
 77 | 
 78 |    Now you can make your changes locally.
 79 | 
 80 | 5. When you're done making changes, check that your changes pass flake8 and the
 81 |    tests, including testing other Python versions with tox::
 82 | 
 83 |     $ flake8 cassiopeia tests
 84 |     $ make test
 85 | 
 86 |    To get flake8, just pip install them into your virtualenv.
 87 | 
 88 | 6. Commit your changes and push your branch to GitHub::
 89 | 
 90 |     $ git add .
 91 |     $ git commit -m "Your detailed description of your changes."
 92 |     $ git push origin name-of-your-bugfix-or-feature
 93 | 
 94 | 7. Submit a pull request through the GitHub website.
 95 | 
 96 | Coding Standards
 97 | -----------------------
 98 | 1. Don't duplicate code. Certainly no blocks longer than a couple of lines. It's almost always better to refactor than to duplicate blocks of code.
 99 | 2. Almost all code should at least be run by a unit tests. No pull request should decrease unit test coverage by much.
100 | 3. Document each new method and each new class with a docstring.
101 | 4. Don't commit commented-out code. Just delete it or store it somewhere outside of the repo. You probably aren't going to need it. At worse, it's stored in previous commits, from before it was commented out.
102 | 5. A pull request (PR) will typically close at least one Github issue. For these pull requests, write the issue it closes in the description, e.g. ``closes #210``. The issue will be automatically closed when the PR is merged.
103 | 6. Don't commit data to the repository, except perhaps a few small (< 50 KB) files of test data.
104 | 
105 | 
106 | Pull Request Guidelines
107 | -----------------------
108 | 
109 | Before you submit a pull request, check that it meets these guidelines:
110 | 
111 | 1. The pull request should include tests.
112 | 2. If the pull request adds functionality, the docs should be updated. Put
113 |    your new functionality into a function with a docstring, and add the
114 |    feature to the list in README.rst.
115 | 3. The pull request should work for Python >= 3.8. Check
116 |    https://travis-ci.org/YosefLab/Cassiopeia/pull_requests
117 |    and make sure that the tests pass for all supported Python versions.
118 | 
119 | Deploying
120 | ---------
121 | 
122 | A reminder for the maintainers on how to deploy.
123 | Make sure all your changes are committed (including an entry in HISTORY.rst).
124 | Then run::
125 | 
126 | $ bumpversion patch # possible: major / minor / patch
127 | $ git push
128 | $ git push --tags
129 | 
130 | Travis will then deploy to PyPI if tests pass.
131 | 
132 | Also, make sure you've tested your code using tox by running::
133 | 
134 | $ tox
135 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019, Matthew G Jones
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | recursive-include cassiopeia *
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=bash
 2 | python=python
 3 | pip=pip
 4 | tests=./test
 5 | version:=$(shell $(python) version.py)
 6 | sdist_name:=cassiopeia-$(version).tar.gz
 7 | 
 8 | develop:
 9 | 	$(pip) install -e .
10 | 
11 | clean_develop:
12 | 	- $(pip) uninstall -y cassiopeia
13 | 	- rm -rf *.egg-info
14 | 
15 | clean_sdist:
16 | 	- rm -rf dist
17 | 
18 | clean: clean_develop clean_pypi
19 | 
20 | install:
21 | 	- $(python) -m pip install .
22 | 
23 | check_build_reqs:
24 | 	@$(python) -c 'import pytest' \
25 |                 || ( printf "$(redpip)Build requirements are missing. Run 'make prepare' to install them.$(normal)" ; false )
26 | 
27 | test: check_build_reqs
28 | 	$(python) -m pytest -vv $(tests)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <img src="https://github.com/YosefLab/cassiopeia/blob/master/docs/_static/logo.png?raw=true" width="400" alt="cassiopeia">
 2 | 
 3 | [![Stars](https://img.shields.io/github/stars/YosefLab/cassiopeia?logo=GitHub&color=yellow)](https://github.com/YosefLab/cassiopeia/stargazers)
 4 | [![Documentation Status](https://readthedocs.org/projects/cassiopeia/badge/?version=latest)](https://cassiopeia.readthedocs.io/en/stable/?badge=stable)
 5 | ![Build
 6 | Status](https://github.com/YosefLab/cassiopeia/workflows/cassiopeia/badge.svg)
 7 | [![Coverage](https://codecov.io/gh/YosefLab/cassiopeia/branch/master/graph/badge.svg)](https://codecov.io/gh/YosefLab/cassiopeia)
 8 | 
 9 | Cassiopeia: A pipeline for single-cell lineage tracing data
10 | =============================================================
11 | 
12 | Cassiopeia is an end-to-end pipeline for single-cell lineage tracing experiments.
13 | The software contained here comes equipped with modules for processing sequencing reads,
14 | reconstructing & plotting trees, analyzing these trees, and benchmarking new algorithms.
15 | 
16 | You can find all of our [documentation here](https://cassiopeia-lineage.readthedocs.io/en/latest/index.html).
17 | 
18 | We also have provided tutorials for three modules:
19 | 
20 | - [processing fastqs](https://github.com/YosefLab/Cassiopeia/blob/master/notebooks/preprocess.ipynb)
21 | - [reconstructing trees](https://github.com/YosefLab/Cassiopeia/blob/master/notebooks/reconstruct.ipynb)
22 | - [simulating trees and benchmarking](https://github.com/YosefLab/Cassiopeia/blob/master/notebooks/benchmark.ipynb)
23 | - [plotting trees with our local library](https://github.com/YosefLab/Cassiopeia/blob/master/notebooks/local_plotting.ipynb)
24 | 
25 | 
26 | You can also find our originally describing Cassiopeia published in [Genome Biology](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02000-8).
27 | 
28 | Free Software: MIT License
29 | 
30 | Installation
31 | --------------
32 | 
33 | For users:
34 | 
35 | ```
36 | pip install git+https://github.com/YosefLab/Cassiopeia@master#egg=cassiopeia-lineage
37 | ```
38 | 
39 | For developers:
40 | 
41 | 1. Clone the package as so: ``git clone https://github.com/YosefLab/Cassiopeia.git``
42 | 
43 | 2. Ensure that you have Python >= 3.8 installed. (Due to dependencies no longer supporting Python 3.7, we have now stopped supporting Python <= 3.7) We prefer using [miniconda](https://docs.conda.io/en/latest/miniconda.html).
44 | 
45 | 3. [Optional] If you intend to use the CassiopeiaILP solver, please be sure that Gurobi is installed. You can follow the instructions listed [here](http://www.gurobi.com/academia/for-universities). To verify that it's working correctly, use the following tests:
46 |     * Run the command ``gurobi.sh`` from a terminal window
47 |     * From the Gurobi installation directory (where there is a setup.py file), use ``python setup.py install --user``
48 | 
49 | 4. [Optional] To use fast versions of Neighbor-Joining and UPGMA, install [CCPhylo](https://bitbucket.org/genomicepidemiology/ccphylo/src/master/). Then copy the file `./data/ccphylo_config.ini` to your `./cassiopeia` directory, rename it `config.ini` and set the Path variable to point to your CCPhylo installation.
50 | 
51 | 5. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .` or `make install`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.
52 | 
53 | 6. [Optional] To use tools built for the analysis of spatial lineage tracing datasets, you can install Cassiopeia with `pip install .[spatial]`. Please note that we recommend using Python >= 3.9 for these analyses as some features might not be available otherwise, due to package dependencies (especially 3D visualization).
54 | 
55 | To verify that it installed correctly, install `pytest` (`pip install pytest`) and try running our tests with `make test`.
56 | 
57 | Reference
58 | ----------------------
59 | 
60 | If you've found Cassiopeia useful for your research, please consider citing our paper published in Genome Biology:
61 | 
62 | 
63 | Matthew G Jones*, Alex Khodaverdian*, Jeffrey J Quinn*, Michelle M Chan, Jeffrey A Hussmann, Robert Wang, Chenling Xu, Jonathan S Weissman, Nir Yosef. (2020), [*Inference of single-cell phylogenies from lineage tracing data using Cassiopeia*](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02000-8), Genome Biology
64 | 
65 | 


--------------------------------------------------------------------------------
/build.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | from distutils.command.build_ext import build_ext
 4 | from distutils.core import Distribution, Extension
 5 | 
 6 | import numpy
 7 | from Cython.Build import cythonize
 8 | 
 9 | # https://github.com/mdgoldberg/poetry-cython-example
10 | 
11 | 
12 | def build():
13 |     extensions = [
14 |         Extension(
15 |             "cassiopeia.preprocess.collapse_cython",
16 |             ["cassiopeia/preprocess/collapse_cython.pyx"],
17 |         ),
18 |         Extension(
19 |             "cassiopeia.solver.ilp_solver_utilities",
20 |             ["cassiopeia/solver/ilp_solver_utilities.pyx"],
21 |             include_dirs=[numpy.get_include()],
22 |         ),
23 |         Extension(
24 |             "cassiopeia.tools.branch_length_estimator._iid_exponential_bayesian",
25 |             sources=[
26 |                 "cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.pyx",
27 |                 "cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian_cpp.cpp",
28 |             ],
29 |             extra_compile_args=[
30 |                 "-std=c++17",
31 |                 "-Wall",
32 |                 "-Wextra",
33 |                 "-pedantic",
34 |                 "-O3",
35 |             ],
36 |             language="c++",
37 |         ),
38 |     ]
39 |     ext_modules = cythonize(
40 |         extensions,
41 |         compiler_directives={"language_level": 3},
42 |     )
43 | 
44 |     distribution = Distribution({"name": "extended", "ext_modules": ext_modules})
45 |     distribution.package_dir = "extended"
46 | 
47 |     cmd = build_ext(distribution)
48 |     cmd.ensure_finalized()
49 |     cmd.run()
50 | 
51 |     # Copy built extensions back to the project
52 |     for output in cmd.get_outputs():
53 |         relative_extension = os.path.relpath(output, cmd.build_lib)
54 |         shutil.copyfile(output, relative_extension)
55 |         mode = os.stat(relative_extension).st_mode
56 |         mode |= (mode & 0o444) >> 2
57 |         os.chmod(relative_extension, mode)
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     build()


--------------------------------------------------------------------------------
/cassiopeia/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """Top-level for Cassiopeia development."""
 4 | 
 5 | from . import preprocess as pp
 6 | from . import solver
 7 | from . import plotting as pl
 8 | from . import data
 9 | from . import critique
10 | from . import simulator as sim
11 | from . import tools as tl
12 | 
13 | # https://github.com/python-poetry/poetry/pull/2366#issuecomment-652418094
14 | # https://github.com/python-poetry/poetry/issues/144#issuecomment-623927302
15 | try:
16 |     import importlib.metadata as importlib_metadata
17 | except ModuleNotFoundError:
18 |     import importlib_metadata
19 | package_name = "cassiopeia-lineage"
20 | __version__ = importlib_metadata.version(package_name)
21 | 
22 | import sys
23 | 
24 | sys.modules.update({f"{__name__}.{m}": globals()[m] for m in ["tl", "pp", "pl", "sim"]})
25 | del sys
26 | 
27 | __all__ = ["pp", "solver", "pl", "data", "critique", "sim", "tl"]
28 | 


--------------------------------------------------------------------------------
/cassiopeia/critique/__init__.py:
--------------------------------------------------------------------------------
1 | """Top level for the cassiopeia critique module."""
2 | 
3 | from .compare import robinson_foulds, triplets_correct
4 | 


--------------------------------------------------------------------------------
/cassiopeia/data/Layers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file stores the data structure behind layers in a CassiopeiaTree object.
 3 | 
 4 | Briefly, Layers are collection of character matrices that can be used to
 5 | store multiple versions of mutation data for each cell. This can be useful,
 6 | for example, during simulation when a user is experimenting with imputing
 7 | missing data.
 8 | 
 9 | The data structure behaves like a dictionary, allowing users to store, retrieve,
10 | and delete entries using canonical commands.
11 | 
12 | This data structure is inspired by AnnData's layer functionality for scRNA-seq
13 | count matrices. Much of the code and logic is derived from the AnnData project.
14 | """
15 | from typing import Iterator, List, Mapping, Optional
16 | 
17 | import pandas as pd
18 | 
19 | from cassiopeia.data import CassiopeiaTree
20 | 
21 | 
22 | class Layers(dict):
23 | 
24 |     attrname = "layers"
25 | 
26 |     parent_mapping: Mapping[str, pd.DataFrame]
27 | 
28 |     def __init__(
29 |         self, parent: CassiopeiaTree, layers: Optional[Mapping] = None
30 |     ):
31 |         self._parent = parent
32 |         self._data = dict()
33 |         if layers is not None:
34 |             self.update(layers)
35 | 
36 |     def __repr__(self):
37 |         return f"{type(self).__name__} with keys: {', '.join(self.keys())}"
38 | 
39 |     def _ipython_key_completions_(self) -> List[str]:
40 |         return list(self.keys())
41 | 
42 |     def copy(self):
43 |         d = Layers(self._parent)
44 |         for k, v in self.items():
45 |             d[k] = v.copy()
46 |         return d
47 | 
48 |     def __getitem__(self, key: str) -> pd.DataFrame:
49 |         return self._data[key]
50 | 
51 |     def __setitem__(self, key: str, value: pd.DataFrame):
52 |         value = self._validate_value(value, key)
53 |         self._data[key] = value
54 | 
55 |     def __delitem__(self, key: str):
56 |         del self._data[key]
57 | 
58 |     def __contains__(self, key: str) -> bool:
59 |         return key in self._data
60 | 
61 |     def __iter__(self) -> Iterator[str]:
62 |         return iter(self._data)
63 | 
64 |     def __len__(self) -> int:
65 |         return len(self._data)
66 | 
67 |     def _validate_value(self, val: pd.DataFrame, key: str) -> pd.DataFrame:
68 |         """Checks passed value for correct structure."""
69 | 
70 |         if val.shape[0] != self._parent.n_cell:
71 |             raise ValueError(
72 |                 f"Value passed for key {key!r} is of incorrect shape. "
73 |                 f"Values of {self.attrname} must have the same number of "
74 |                 f"samples as the tree. Value had {val.shape[0]} while it "
75 |                 f"should have had {self._parent.n_cell} samples."
76 |             )
77 |         return val
78 | 


--------------------------------------------------------------------------------
/cassiopeia/data/__init__.py:
--------------------------------------------------------------------------------
 1 | """Top level for data."""
 2 | 
 3 | from .CassiopeiaTree import CassiopeiaTree
 4 | from .utilities import (
 5 |     compute_dissimilarity_map,
 6 |     compute_inter_cluster_distances,
 7 |     compute_phylogenetic_weight_matrix,
 8 |     get_lca_characters,
 9 |     net_relatedness_index,
10 |     sample_bootstrap_allele_tables,
11 |     sample_bootstrap_character_matrices,
12 |     to_newick,
13 | )
14 | 


--------------------------------------------------------------------------------
/cassiopeia/mixins/__init__.py:
--------------------------------------------------------------------------------
1 | from .errors import *
2 | from .logging import logger
3 | from .utilities import *
4 | from .warnings import *
5 | 


--------------------------------------------------------------------------------
/cassiopeia/mixins/errors.py:
--------------------------------------------------------------------------------
  1 | class AutocorrelationError(Exception):
  2 |     """An Exception for the tools.autocorrelation methods."""
  3 | 
  4 |     pass
  5 | 
  6 | 
  7 | class CassiopeiaError(Exception):
  8 |     """An general exception for the Cassiopeia software."""
  9 | 
 10 |     pass
 11 | 
 12 | 
 13 | class CassiopeiaTreeError(Exception):
 14 |     """An Exception class for the CassiopeiaTree class."""
 15 | 
 16 |     pass
 17 | 
 18 | 
 19 | class DataSimulatorError(Exception):
 20 |     """Generic error for the DataSimulator subclasses"""
 21 | 
 22 |     pass
 23 | 
 24 | 
 25 | class DistanceSolverError(Exception):
 26 |     """An Exception class for all DistanceSolver subclasses."""
 27 | 
 28 |     pass
 29 | 
 30 | class ecDNABirthDeathSimulatorError(Exception):
 31 |     """An ExceptionClass for ecDNABirthDeathSimulator class."""
 32 | 
 33 |     pass
 34 | 
 35 | class FitchCountError(Exception):
 36 |     """An ExceptionClass for FitchCount."""
 37 | 
 38 |     pass
 39 | 
 40 | 
 41 | class GreedySolverError(Exception):
 42 |     pass
 43 | 
 44 | 
 45 | class HybridSolverError(Exception):
 46 |     """An Exception class for all HybridSolver subclasses."""
 47 | 
 48 |     pass
 49 | 
 50 | 
 51 | class ILPSolverError(Exception):
 52 |     """An Exception class for all ILPError subclasses."""
 53 | 
 54 |     pass
 55 | 
 56 | 
 57 | class iTOLError(Exception):
 58 |     pass
 59 | 
 60 | 
 61 | class LeafSubsamplerError(Exception):
 62 |     """An Exception class for the LeafSubsampler class."""
 63 | 
 64 |     pass
 65 | 
 66 | 
 67 | class PreprocessError(Exception):
 68 |     pass
 69 | 
 70 | 
 71 | class PriorTransformationError(Exception):
 72 |     """An Exception class for generating weights from priors."""
 73 | 
 74 |     pass
 75 | 
 76 | 
 77 | class SharedMutationJoiningSolverError(Exception):
 78 |     """An Exception class for SharedMutationJoiningSolver."""
 79 | 
 80 |     pass
 81 | 
 82 | 
 83 | class TreeSimulatorError(Exception):
 84 |     """An Exception class for all exceptions generated by
 85 |     TreeSimulator or a subclass of TreeSimulator
 86 |     """
 87 | 
 88 |     pass
 89 | 
 90 | 
 91 | class UnknownCigarStringError(Exception):
 92 |     pass
 93 | 
 94 | 
 95 | class UnspecifiedConfigParameterError(Exception):
 96 |     pass
 97 | 
 98 | 
 99 | class BranchLengthEstimatorError(Exception):
100 |     """An Exception class for the BranchLengthEstimator class."""
101 | 
102 |     pass
103 | 
104 | 
105 | class IIDExponentialMLEError(BranchLengthEstimatorError):
106 |     pass
107 | 
108 | 
109 | class TreeMetricError(Exception):
110 |     """An Exception class for calculating tree metrics"""
111 | 
112 |     pass
113 | 
114 | 
115 | class ParameterEstimateError(Exception):
116 |     """An Exception class for the estimation and retrieval of tree parameters"""
117 |     
118 |     pass
119 | 
120 |   
121 | class PlottingError(Exception):
122 |     pass
123 | 


--------------------------------------------------------------------------------
/cassiopeia/mixins/logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import ngs_tools as ngs
4 | 
5 | logger = ngs.logging.Logger(__name__)
6 | logger.setLevel(logging.INFO)
7 | 


--------------------------------------------------------------------------------
/cassiopeia/mixins/utilities.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | import importlib
 3 | from types import ModuleType
 4 | from typing import Dict, List, Optional, Tuple, Union
 5 | 
 6 | import numpy as np
 7 | 
 8 | 
 9 | def is_ambiguous_state(state: Union[int, Tuple[int, ...]]) -> bool:
10 |     """Determine whether the provided state is ambiguous.
11 | 
12 |     Note that this function operates on a single (indel) state.
13 | 
14 |     Args:
15 |         state: Single, possibly ambiguous, character state
16 | 
17 |     Returns:
18 |         True if the state is ambiguous, False otherwise.
19 |     """
20 |     return isinstance(state, tuple)
21 | 
22 | 
23 | def try_import(module: str) -> Optional[ModuleType]:
24 |     """Helper function to import a possibly not-installed module.
25 | 
26 |     Args:
27 |         module: Module to try and import
28 | 
29 |     Returns:
30 |         The imported module, if the module exists, or None
31 |     """
32 |     try:
33 |         return importlib.import_module(module)
34 |     except ModuleNotFoundError:
35 |         return None
36 | 
37 | 
38 | def unravel_ambiguous_states(
39 |     state_array: List[Union[int, Tuple[int, ...]]]
40 | ) -> List[int]:
41 |     """Helper function to unravel ambiguous states.
42 | 
43 |     Args:
44 |         A list of states, potentially containing ambiguous states.
45 | 
46 |     Returns:
47 |         A list of unique states contained in the list.
48 |     """
49 |     all_states = [
50 |         list(state) if is_ambiguous_state(state) else [state]
51 |         for state in state_array
52 |     ]
53 |     return functools.reduce(lambda a, b: a + b, all_states)
54 | 
55 | def find_duplicate_groups(character_matrix) -> Dict[str, Tuple[str, ...]]:
56 |     """Maps duplicated indices in character matrix to groups.
57 | 
58 |     Groups together samples in a character matrix if they have the same
59 |     character states.
60 | 
61 |     Args:
62 |         character_matrix: Character matrix, potentially with ambiguous states.
63 | 
64 |     Returns:
65 |         A mapping of a single sample name to the set of of samples that have
66 |             the same character states.
67 |     """
68 | 
69 |     character_matrix.index.name = "index"
70 | 
71 |      # convert to sets to support ambiguous states
72 |     character_matrix_sets = character_matrix.copy()
73 |     character_matrix_sets = character_matrix_sets.apply(
74 |             lambda x: [
75 |                 set(s) if is_ambiguous_state(s) else set([s])
76 |                 for s in x.values
77 |             ],
78 |             axis=0,
79 |         ).apply(tuple, axis=1)
80 |     is_duplicated = (
81 |         character_matrix_sets.duplicated(keep=False)
82 |     )
83 |     unique_states = np.unique(character_matrix_sets[is_duplicated])
84 |     duplicate_groups = [character_matrix_sets[character_matrix_sets == val].index.values for val in unique_states]
85 |     duplicate_mappings =  {g[0]: tuple(g) for g in duplicate_groups}
86 | 
87 |     return duplicate_mappings
88 | 


--------------------------------------------------------------------------------
/cassiopeia/mixins/warnings.py:
--------------------------------------------------------------------------------
 1 | class CassiopeiaTreeWarning(UserWarning):
 2 |     """A Warning for the CassiopeiaTree class."""
 3 | 
 4 |     pass
 5 | 
 6 | class DataSimulatorWarning(UserWarning):
 7 |     pass
 8 | 
 9 | 
10 | class PreprocessWarning(UserWarning):
11 |     pass
12 | 
13 | 
14 | class SharedMutationJoiningSolverWarning(UserWarning):
15 |     """A warning class for SharedMutationJoiningSolver."""
16 | 
17 |     pass
18 | 
19 | 
20 | class ParameterEstimateWarning(UserWarning):
21 |     """An warning class for the estimation and retrieval of tree parameters"""
22 | 
23 |     pass
24 | 
25 | 
26 | class PlottingWarning(UserWarning):
27 |     pass
28 | 
29 | class LeafSubsamplerWarning(UserWarning):
30 | 
31 |     pass
32 | 


--------------------------------------------------------------------------------
/cassiopeia/plotting/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """Top level for plotting."""
 4 | 
 5 | from .itol_utilities import upload_and_export_itol
 6 | from .local import plot_matplotlib, plot_plotly
 7 | from .local_3d import labels_from_coordinates, Tree3D
 8 | 
 9 | 
10 | __all__ = [
11 |     "upload_and_export_itol",
12 |     "plot_matplotlib",
13 |     "plot_plotly",
14 |     "labels_from_coordinates",
15 |     "Tree3D"
16 | ]
17 | 


--------------------------------------------------------------------------------
/cassiopeia/plotting/palettes.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A file storing palettes to be used for plotting.
  3 | """
  4 | 
  5 | # Don't want to have to set scanpy as a dependency just to use its color palette.
  6 | # https://github.com/scverse/scanpy/blob/master/scanpy/plotting/palettes.py
  7 | godsnot_102 = [
  8 |     "#FFFF00",
  9 |     "#1CE6FF",
 10 |     "#FF34FF",
 11 |     "#FF4A46",
 12 |     "#008941",
 13 |     "#006FA6",
 14 |     "#A30059",
 15 |     "#FFDBE5",
 16 |     "#7A4900",
 17 |     "#0000A6",
 18 |     "#63FFAC",
 19 |     "#B79762",
 20 |     "#004D43",
 21 |     "#8FB0FF",
 22 |     "#997D87",
 23 |     "#5A0007",
 24 |     "#809693",
 25 |     "#6A3A4C",
 26 |     "#1B4400",
 27 |     "#4FC601",
 28 |     "#3B5DFF",
 29 |     "#4A3B53",
 30 |     "#FF2F80",
 31 |     "#61615A",
 32 |     "#BA0900",
 33 |     "#6B7900",
 34 |     "#00C2A0",
 35 |     "#FFAA92",
 36 |     "#FF90C9",
 37 |     "#B903AA",
 38 |     "#D16100",
 39 |     "#DDEFFF",
 40 |     "#000035",
 41 |     "#7B4F4B",
 42 |     "#A1C299",
 43 |     "#300018",
 44 |     "#0AA6D8",
 45 |     "#013349",
 46 |     "#00846F",
 47 |     "#372101",
 48 |     "#FFB500",
 49 |     "#C2FFED",
 50 |     "#A079BF",
 51 |     "#CC0744",
 52 |     "#C0B9B2",
 53 |     "#C2FF99",
 54 |     "#001E09",
 55 |     "#00489C",
 56 |     "#6F0062",
 57 |     "#0CBD66",
 58 |     "#EEC3FF",
 59 |     "#456D75",
 60 |     "#B77B68",
 61 |     "#7A87A1",
 62 |     "#788D66",
 63 |     "#885578",
 64 |     "#FAD09F",
 65 |     "#FF8A9A",
 66 |     "#D157A0",
 67 |     "#BEC459",
 68 |     "#456648",
 69 |     "#0086ED",
 70 |     "#886F4C",
 71 |     "#34362D",
 72 |     "#B4A8BD",
 73 |     "#00A6AA",
 74 |     "#452C2C",
 75 |     "#636375",
 76 |     "#A3C8C9",
 77 |     "#FF913F",
 78 |     "#938A81",
 79 |     "#575329",
 80 |     "#00FECF",
 81 |     "#B05B6F",
 82 |     "#8CD0FF",
 83 |     "#3B9700",
 84 |     "#04F757",
 85 |     "#C8A1A1",
 86 |     "#1E6E00",
 87 |     "#7900D7",
 88 |     "#A77500",
 89 |     "#6367A9",
 90 |     "#A05837",
 91 |     "#6B002C",
 92 |     "#772600",
 93 |     "#D790FF",
 94 |     "#9B9700",
 95 |     "#549E79",
 96 |     "#FFF69F",
 97 |     "#201625",
 98 |     "#72418F",
 99 |     "#BC23FF",
100 |     "#99ADC0",
101 |     "#3A2465",
102 |     "#922329",
103 |     "#5B4534",
104 |     "#FDE8DC",
105 |     "#404E55",
106 |     "#0089A3",
107 |     "#CB7E98",
108 |     "#A4E804",
109 |     "#324E72",
110 | ]
111 | 


--------------------------------------------------------------------------------
/cassiopeia/preprocess/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """Top level for preprocess."""
 4 | 
 5 | from .pipeline import (
 6 |     align_sequences,
 7 |     call_alleles,
 8 |     collapse_umis,
 9 |     convert_fastqs_to_unmapped_bam,
10 |     error_correct_cellbcs_to_whitelist,
11 |     error_correct_intbcs_to_whitelist,
12 |     error_correct_umis,
13 |     filter_bam,
14 |     resolve_umi_sequence,
15 |     filter_molecule_table,
16 |     call_lineage_groups,
17 | )
18 | from .utilities import (
19 |     compute_empirical_indel_priors,
20 |     convert_alleletable_to_character_matrix,
21 |     convert_alleletable_to_lineage_profile,
22 |     convert_lineage_profile_to_character_matrix,
23 |     filter_cells,
24 |     filter_umis,
25 | )
26 | from .setup_utilities import setup
27 | 
28 | 
29 | __all__ = [
30 |     "align_sequences",
31 |     "call_alleles",
32 |     "collapse_umis",
33 |     "convert_fastqs_to_unmapped_bam",
34 |     "error_correct_cellbcs_to_whitelist",
35 |     "error_correct_intbcs_to_whitelist",
36 |     "error_correct_umis",
37 |     "filter_bam",
38 |     "resolve_umi_sequence",
39 |     "filter_molecule_table",
40 |     "call_lineage_groups",
41 |     "compute_empirical_indel_priors",
42 |     "convert_alleletable_to_character_matrix",
43 |     "convert_alleletable_to_lineage_profile",
44 |     "convert_lineage_profile_to_character_matrix",
45 |     "filter_cells",
46 |     "filter_umis",
47 |     "setup",
48 | ]
49 | 


--------------------------------------------------------------------------------
/cassiopeia/preprocess/collapse_cython.pyx:
--------------------------------------------------------------------------------
 1 | cimport cython
 2 | import numpy as np
 3 | cdef int OFFSET = 33
 4 | 
 5 | def hamming_distance(char* first, char* second):
 6 |     cdef int i
 7 |     cdef int d = 0
 8 |     cdef int length = len(first)
 9 |     
10 |     for i in range(length):
11 |         if first[i] != second[i]:
12 |             d += 1
13 |             
14 |     return d
15 | 
16 | @cython.boundscheck(False)
17 | def hamming_distance_matrix(seqs):
18 |     cdef int i, j, k, d, n, seq_length
19 | 
20 |     ints = np.array([list(s.encode()) for s in seqs])
21 |     cdef long[:, ::1] ints_view = ints
22 |     n, seq_length = ints.shape
23 | 
24 |     ds = np.zeros((n, n), int)
25 |     cdef long[:, ::1] ds_view = ds
26 | 
27 |     for i in range(n):
28 |         for j in range(i + 1, n):
29 |             d = 0
30 |             for k in range(seq_length):
31 |                 if ints_view[i, k] != ints_view[j, k]:
32 |                     d += 1
33 | 
34 |             ds_view[i, j] = d
35 | 
36 |     return ds
37 | 
38 | @cython.boundscheck(False)
39 | def register_corrections(long[:, ::1] ds, int max_UMI_distance, UMIs):
40 |     cdef int i, j, n
41 |     n = len(ds)
42 |     corrections = {}
43 | 
44 |     # Moving from least common to most common, register a correction
45 |     # from a UMI to the most common UMI that is within Hamming distance
46 |     # max_UMI_distance of it.
47 |     for j in range(n - 1, -1, -1):
48 |         for i in range(j - 1, -1, -1):
49 |             if ds[i, j] <= max_UMI_distance:
50 |                 corrections[UMIs[j]] = UMIs[i]
51 |     
52 |     # If a correction points to a UMI that is itself going to be corrected,
53 |     # propogate this correction through.  
54 |     for from_, to in list(corrections.items()):
55 |         while to in corrections:
56 |             to = corrections[to]
57 | 
58 |         corrections[from_] = to
59 |     
60 |     return corrections
61 | 
62 | def hq_hamming_distance(char* first_seq, char* second_seq, char* first_qual, char* second_qual, int min_q):
63 |     cdef int i
64 |     cdef int d = 0
65 |     cdef int length = len(first_seq)
66 |     cdef int floor = min_q + OFFSET
67 |     
68 |     for i in range(length):
69 |         if (first_seq[i] != second_seq[i]) and (first_qual[i] >= floor) and (second_qual[i] >= floor):
70 |             d += 1
71 |             
72 |     return d
73 | 
74 | def hq_mismatches_from_seed(char* seed, char* seq, char[:] qual, int min_q):
75 |     cdef int i
76 |     cdef int d = 0
77 |     cdef int length = len(seq)
78 |     cdef int floor = min_q
79 |     
80 |     for i in range(length):
81 |         if (seq[i] != seed[i]) and (qual[i] >= floor):
82 |             d += 1
83 |             
84 |     return d
85 | 


--------------------------------------------------------------------------------
/cassiopeia/preprocess/constants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Stores constants for the ProcessingPipeline module
 3 | """
 4 | 
 5 | BAM_CONSTANTS = {
 6 |     "RAW_CELL_BC_TAG": "CR",
 7 |     "RAW_CELL_BC_QUALITY_TAG": "CY",
 8 |     "CELL_BC_TAG": "CB",
 9 |     "UMI_TAG": "UR",
10 |     "UMI_QUALITY_TAG": "UY",
11 |     "NUM_READS_TAG": "ZR",
12 |     "CLUSTER_ID_TAG": "ZC",
13 |     "N_Q": 2,
14 |     "HIGH_Q": 31,
15 |     "LOW_Q": 10,
16 | }
17 | 
18 | SINGLE_CELL_BAM_TAGS = {
19 |     "umi": (BAM_CONSTANTS["UMI_TAG"], BAM_CONSTANTS["UMI_QUALITY_TAG"]),
20 |     "cell_barcode": (
21 |         BAM_CONSTANTS["RAW_CELL_BC_TAG"],
22 |         BAM_CONSTANTS["RAW_CELL_BC_QUALITY_TAG"],
23 |     ),
24 | }
25 | SPATIAL_BAM_TAGS = {
26 |     "umi": (BAM_CONSTANTS["UMI_TAG"], BAM_CONSTANTS["UMI_QUALITY_TAG"]),
27 |     "spot_barcode": (
28 |         BAM_CONSTANTS["RAW_CELL_BC_TAG"],
29 |         BAM_CONSTANTS["RAW_CELL_BC_QUALITY_TAG"],
30 |     ),
31 | }
32 | CHEMISTRY_BAM_TAGS = {
33 |     "dropseq": SINGLE_CELL_BAM_TAGS,
34 |     "10xv2": SINGLE_CELL_BAM_TAGS,
35 |     "10xv3": SINGLE_CELL_BAM_TAGS,
36 |     "indropsv3": SINGLE_CELL_BAM_TAGS,
37 |     "slideseq2": SPATIAL_BAM_TAGS,
38 | }
39 | 
40 | 
41 | DNA_SUBSTITUTION_MATRIX = {
42 |     "A": {"A": 5, "T": -4, "C": -4, "G": -4, "Z": 0, "N": 0},
43 |     "T": {"A": -4, "T": 5, "C": -4, "G": -4, "Z": 0, "N": 0},
44 |     "C": {"A": -4, "T": -4, "C": 5, "G": -4, "Z": 0, "N": 0},
45 |     "G": {"A": -4, "T": -4, "C": -4, "G": 5, "Z": 0, "N": 0},
46 |     "Z": {"A": 0, "T": 0, "C": 0, "G": 0, "Z": 0, "N": 0},
47 |     "N": {"A": 0, "T": 0, "C": 0, "G": 0, "Z": 0, "N": 0},
48 | }
49 | 
50 | DEFAULT_PIPELINE_PARAMETERS = {
51 |     "general": {
52 |         "entry": "'convert'",
53 |         "exit": "'call_lineages'",
54 |         "verbose": False,
55 |     },
56 |     "convert": {},
57 |     "filter_bam": {"quality_threshold": 10},
58 |     "error_correct_cellbcs_to_whitelist": {},
59 |     "collapse": {"max_hq_mismatches": 3, "max_indels": 2},
60 |     "resolve": {
61 |         "min_avg_reads_per_umi": 2.0,
62 |         "min_umi_per_cell": 10,
63 |         "plot": True,
64 |     },
65 |     "align": {
66 |         "gap_open_penalty": 20,
67 |         "gap_extend_penalty": 1,
68 |         "method": "'local'",
69 |     },
70 |     "call_alleles": {
71 |         "barcode_interval": (20, 34),
72 |         "cutsite_locations": [112, 166, 220],
73 |         "cutsite_width": 12,
74 |         "context": True,
75 |         "context_size": 5,
76 |     },
77 |     "error_correct_intbcs_to_whitelist": {"intbc_dist_thresh": 1},
78 |     "error_correct_umis": {"max_umi_distance": 2},
79 |     "filter_molecule_table": {
80 |         "min_umi_per_cell": 10,
81 |         "min_avg_reads_per_umi": 2.0,
82 |         "min_reads_per_umi": -1,
83 |         "intbc_prop_thresh": 0.5,
84 |         "intbc_umi_thresh": 10,
85 |         "intbc_dist_thresh": 1,
86 |         "doublet_threshold": 0.35,
87 |         "plot": True,
88 |     },
89 |     "call_lineages": {
90 |         "min_umi_per_cell": 10,
91 |         "min_avg_reads_per_umi": 2.0,
92 |         "min_cluster_prop": 0.005,
93 |         "min_intbc_thresh": 0.05,
94 |         "inter_doublet_threshold": 0.35,
95 |         "kinship_thresh": 0.25,
96 |         "plot": True,
97 |     },
98 | }
99 | 


--------------------------------------------------------------------------------
/cassiopeia/preprocess/map_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file contains functions pertaining to mapping intBCs.
 3 | Invoked through pipeline.py and supports the filter_molecule_table function.
 4 | """
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from cassiopeia.mixins import logger
 9 | from cassiopeia.preprocess import utilities
10 | 
11 | 
12 | @utilities.log_molecule_table
13 | def map_intbcs(molecule_table: pd.DataFrame) -> pd.DataFrame:
14 |     """Assign one allele to each intBC/cellBC pair.
15 | 
16 |     For each intBC/cellBC pairing, selects the most frequent allele (by read
17 |     count, and then by UMI) and removes alignments that don't have that allele.
18 | 
19 |     Args:
20 |         molecule_table: A molecule table of cellBC-UMI pairs to be filtered
21 | 
22 |     Returns:
23 |         An allele table with one allele per cellBC-intBC pair
24 |     """
25 | 
26 |     # Have to drop out all intBCs that are NaN
27 |     molecule_table = molecule_table.dropna(subset=["intBC"])
28 | 
29 |     # For each cellBC-intBC pair, select the allele that has the highest
30 |     # readCount; on ties, use UMI count
31 |     allele_table = (
32 |         molecule_table.groupby(["cellBC", "intBC", "allele"])
33 |         .agg({"readCount": "sum", "UMI": "count"})
34 |         .reset_index()
35 |         .sort_values(["UMI", "readCount"], ascending=False)
36 |     )
37 |     duplicated_mask = allele_table.duplicated(["cellBC", "intBC"])
38 |     mapped_alleles = set(
39 |         allele_table[~duplicated_mask][
40 |             ["cellBC", "intBC", "allele"]
41 |         ].itertuples(index=False, name=None)
42 |     )
43 | 
44 |     # True for rows that contain the mapped allele; False for ones to filter out
45 |     selection_mask = (
46 |         molecule_table[["cellBC", "intBC", "allele"]]
47 |         .apply(tuple, axis=1)
48 |         .isin(mapped_alleles)
49 |     )
50 | 
51 |     mapped_table = molecule_table[selection_mask]
52 |     logger.debug(f"Alleles removed: {duplicated_mask.sum()}")
53 |     logger.debug(f"UMIs removed: {(~selection_mask).sum()}")
54 |     return mapped_table
55 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/CompleteBinarySimulator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file defines the CompleteBinarySimulator, which inherits TreeSimulator,
 3 | that simulates complte binary trees. In this sense, this is the simplest tree
 4 | simulator.
 5 | """
 6 | from typing import Generator, Optional
 7 | 
 8 | import networkx as nx
 9 | import numpy as np
10 | 
11 | from cassiopeia.data.CassiopeiaTree import CassiopeiaTree
12 | from cassiopeia.mixins import TreeSimulatorError
13 | from cassiopeia.simulator.TreeSimulator import TreeSimulator
14 | 
15 | 
16 | class CompleteBinarySimulator(TreeSimulator):
17 |     """Simulate a complete binary tree.
18 | 
19 |     Internally, this class uses :func:`nx.balanced_tree` to generate a
20 |     perfectly balanced binary tree of specified size. Only one of ``num_cells``
21 |     or ``depth`` should be provided. All branches have equal length that is
22 |     normalized by the height of the tree (i.e. the tree has height 1).
23 | 
24 |     Args:
25 |         num_cells: Number of cells to simulate. Needs to be a power of 2. The
26 |             depth of the tree will be `log2(num_cells)`.
27 |         depth: Depth of the tree. The number of cells will be `2^depth`.
28 | 
29 |     Raises:
30 |         TreeSimulatorError if neither or both ``num_cells`` or ``depth`` are
31 |             provided, if ``num_cells`` is not a power of 2, or if the calculated
32 |             depth is not greater than 0.
33 |     """
34 | 
35 |     def __init__(
36 |         self, num_cells: Optional[int] = None, depth: Optional[int] = None
37 |     ):
38 |         if (num_cells is None) == (depth is None):
39 |             raise TreeSimulatorError(
40 |                 "One of `num_cells` or `depth` must be provided."
41 |             )
42 |         if num_cells is not None:
43 |             log2_num_cells = np.log2(num_cells)
44 |             if log2_num_cells != int(log2_num_cells):
45 |                 raise TreeSimulatorError("`num_cells` must be a power of 2.")
46 |             depth = int(log2_num_cells)
47 |         if depth <= 0:
48 |             raise TreeSimulatorError("`depth` must be grater than 0.")
49 |         self.depth = depth
50 | 
51 |     def simulate_tree(
52 |         self,
53 |     ) -> CassiopeiaTree:
54 |         """Simulates a complete binary tree.
55 | 
56 |         Returns:
57 |             A CassiopeiaTree with the tree topology initialized with the
58 |             simulated tree
59 |         """
60 | 
61 |         def node_name_generator() -> Generator[str, None, None]:
62 |             """Generates unique node names for the tree."""
63 |             i = 0
64 |             while True:
65 |                 yield str(i)
66 |                 i += 1
67 | 
68 |         names = node_name_generator()
69 | 
70 |         tree = nx.balanced_tree(2, self.depth, create_using=nx.DiGraph)
71 |         mapping = {"root": next(names)}
72 |         mapping.update({node: next(names) for node in tree.nodes})
73 |         # Add root, which indicates the initiating cell
74 |         tree.add_edge("root", 0)
75 |         nx.relabel_nodes(tree, mapping, copy=False)
76 |         cassiopeia_tree = CassiopeiaTree(tree=tree)
77 | 
78 |         # Initialize branch lengths
79 |         time_dict = {
80 |             node: cassiopeia_tree.get_time(node) / (self.depth + 1)
81 |             for node in cassiopeia_tree.nodes
82 |         }
83 |         cassiopeia_tree.set_times(time_dict)
84 |         return cassiopeia_tree
85 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/DataSimulator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract class DataSimulator, for overlaying data onto a CassiopeiaTree.
 3 | 
 4 | All data simulators are derived classes of this abstract class, and at a minimum
 5 | implement a method called `overlay_data`.
 6 | """
 7 | import abc
 8 | 
 9 | from cassiopeia.data import CassiopeiaTree
10 | from cassiopeia.mixins import DataSimulatorError
11 | 
12 | 
13 | class DataSimulator(abc.ABC):
14 |     """
15 |     DataSimulator is an abstract class that all data overlayers derive from.
16 | 
17 |     A DataSimulator is very generic and meant to give users the flexibility to
18 |     overlay any kind of data onto the tree using this single API. The prime
19 |     example of data a user might want to overlay on a tree is lineage tracing
20 |     data, for which there is a specific subclass LineageTracingDataSimulator.
21 |     Other data of interest might include: transcriptomes, proteomes, etc.
22 |     """
23 | 
24 |     @abc.abstractmethod
25 |     def overlay_data(self, tree: CassiopeiaTree) -> None:
26 |         """
27 |         Overlay data on a CassiopeiaTree (in-place).
28 | 
29 |         The tree topology must be initialized.
30 | 
31 |         Args:
32 |             tree: the CassiopeiaTree to overlay the data on. The tree topology
33 |                 must be initialized.
34 |         """
35 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/LeafSubsampler.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract class LeafSubsampler. Samples the leaves of CassiopeiaTrees and
 3 | generates a tree that keeps only the lineages pertaining to the sample.
 4 | 
 5 | All leaf subsamplers are derived classes of this abstract class, and at a minimum
 6 | implement a method called `subsample_leaves`.
 7 | """
 8 | 
 9 | import abc
10 | import networkx as nx
11 | import numpy as np
12 | from typing import Optional
13 | 
14 | from cassiopeia.data import CassiopeiaTree
15 | from cassiopeia.mixins import (
16 |     LeafSubsamplerError,
17 |     LeafSubsamplerWarning,
18 | )
19 | 
20 | 
21 | class LeafSubsampler(abc.ABC):
22 |     """
23 |     Abstract base class for all leaf samplers.
24 | 
25 |     A LeafSubsampler implements a method 'subsample_leaves' which, given a
26 |     tree, generates a sample of the observed leaves in that tree and returns a
27 |     new tree which is the induced subtree (tree containing only lineages that
28 |     contain a sampled leaf) of the original tree on that sample.
29 |     """
30 | 
31 |     @abc.abstractmethod
32 |     def subsample_leaves(self, tree: CassiopeiaTree) -> CassiopeiaTree:
33 |         """
34 |         Subsamples the leaves of a CassiopeiaTree.
35 | 
36 |         Returns a new CassiopeiaTree which is the result of subsampling the
37 |         leaves in the original CassiopeiaTree and removing ancestral nodes no
38 |         longer relevant to the sample. All fields on the original character
39 |         matrix persist, but maintains character states, meta data, and the
40 |         dissimilarity map for the sampled cells only.
41 | 
42 |         Args:
43 |             tree: The CassiopeiaTree for which to subsample leaves
44 | 
45 |         Returns:
46 |             A new CassiopeiaTree that is the induced subtree on a sample of the
47 |             leaves in the given tree.
48 |         """
49 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/LineageTracingDataSimulator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file stores an abstract subclass of DataSimulator, the
 3 | LineageTracingDataSimulator. A LineageTracingDataSimulator overlays lineage
 4 | tracing data onto a CassiopeiaTree, i.e. it sets the character states of a
 5 | CassiopeiaTree (in particular, the character matrix).
 6 | """
 7 | import abc
 8 | 
 9 | from cassiopeia.data import CassiopeiaTree
10 | from cassiopeia.simulator.DataSimulator import DataSimulator
11 | 
12 | 
13 | class LineageTracingDataSimulator(DataSimulator):
14 |     """
15 |     LineageTracingDataSimulator is an abstract class that all lineage tracing
16 |     data simulators derive from.
17 | 
18 |     A LineageTracingDataSimulator is useful for simulating lineage tracing
19 |     assays in silico, allowing us to explore the utility of lineage tracing
20 |     technologies such as base editors, GESTALT, etc. for recovering the ground
21 |     truth cell phylogeny. In a typical simulation pipeline, a
22 |     LineageTracingDataSimulator is used to overlay lineage tracing data on a
23 |     CassiopeiaTree, and then a CassiopeiaSolver is used to reconstruct the tree
24 |     topology.
25 | 
26 |     As a result, LineageTracingDataSimulators allow us to study the impact of
27 |     different aspects of the lineage tracing assay - such as number of
28 |     barcodes, mutation rates, etc. - on our ability to recover the ground
29 |     truth phylogeny.
30 |     """
31 | 
32 |     @abc.abstractmethod
33 |     def overlay_data(self, tree: CassiopeiaTree) -> None:
34 |         """
35 |         Overlay lineage tracing data onto the CassiopeiaTree (in-place).
36 | 
37 |         This sets the character states of all nodes in the tree, as well
38 |         as the character matrix. The tree is expected to have its topology
39 |         initialized, as well as meaningful branch lengths.
40 | 
41 |         Args:
42 |             tree: the CassiopeiaTree to overlay the lineage tracing data on.
43 |                 The tree topology must be initialized.
44 |         """
45 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/SpatialDataSimulator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file stores an abstract subclass of DataSimulator, the
 3 | SpatialDataSimulator. A SpatialDataSimulator overlays spatial data onto a
 4 | Cassiopeiatree, i.e. it sets the spatial coordinates of a CassiopeiaTree
 5 | (in particular, as attributes of the nodes of the tree and the cell meta).
 6 | """
 7 | import abc
 8 | 
 9 | from cassiopeia.data import CassiopeiaTree
10 | from cassiopeia.simulator.DataSimulator import DataSimulator
11 | 
12 | 
13 | class SpatialDataSimulator(DataSimulator):
14 |     """
15 |     SpatialDataSimulator is an abstract class that all spatial data simulators
16 |     derive from.
17 | 
18 |     A SpatialDataSimulator is useful for simulating spatial assays in silico.
19 |     In a typical simulation pipeline, a SpatialDataSimulator is used to overlay
20 |     spatial coordinates on a CassiopeiaTree, and then a CassiopeiaSolver is used
21 |     to reconstruct the tree topology (to simulate single-cell-resolution spatial
22 |     assays) or a SpatialLeafSubsampler is used (to simulate
23 |     non-single-cell-resoultion spatial assays).
24 |     """
25 | 
26 |     @abc.abstractmethod
27 |     def overlay_data(self, tree: CassiopeiaTree) -> None:
28 |         """
29 |         Overlay spatial data onto the CassiopeiaTree (in-place).
30 | 
31 |         This sets the spatial coordinates of all nodes in the tree. These
32 |         coordinates are stored as the `spatial` node attribute. For leaves,
33 |         these exact coordinates are saved as columns in the `cell_meta`
34 |         attribute of the CassiopeiaTree.
35 | 
36 |         Args:
37 |             tree: the CassiopeiaTree to overlay the lineage tracing data on.
38 |                 The tree topology must be initialized.
39 |         """
40 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/TreeSimulator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract class TreeSimulator, for tree simulation module.
 3 | 
 4 | All tree simulators are derived classes of this abstract class, and at a minimum
 5 | implement a method called `simulate_tree`.
 6 | """
 7 | import abc
 8 | 
 9 | from cassiopeia.data import CassiopeiaTree
10 | from cassiopeia.mixins import TreeSimulatorError
11 | 
12 | 
13 | class TreeSimulator(abc.ABC):
14 |     """
15 |     TreeSimulator is an abstract class that all tree simulators derive from.
16 | 
17 |     A TreeSimulator returns a CassiopeiaTree with at least its tree topology
18 |     initialized. The character matrix need not be initialized (this is
19 |     accomplished instead using a LineageTracingDataSimulator object). The
20 |     branch lengths may be interpretable or not depending on the specific
21 |     TreeSimulator.
22 | 
23 |     The purpose of the TreeSimulator is to allow users to perform in silico
24 |     simulations of single-cell phylogenies, such as tumor phylogenies, organism
25 |     development, etc., providing a ground truth phylogeny and thus a means to
26 |     evaluate methodologies for reconstructing and analyzing single-cell
27 |     phylogenies.
28 |     """
29 | 
30 |     @abc.abstractmethod
31 |     def simulate_tree(self) -> CassiopeiaTree:
32 |         """
33 |         Simulate a CassiopeiaTree.
34 | 
35 |         The returned tree will have at least its tree topology initialized.
36 |         """
37 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/UniformLeafSubsampler.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A subclass of LeafSubsampler, the UniformLeafSubsampler. 
  3 | 
  4 | Takes a uniform random sample of the leaves of a CassiopeiaTree and produces a
  5 | new CassiopeiaTree that keeps only the lineages pertaining to the sample.
  6 | """
  7 | 
  8 | import copy
  9 | import numpy as np
 10 | from typing import Optional
 11 | 
 12 | from cassiopeia.data import CassiopeiaTree
 13 | from cassiopeia.simulator.LeafSubsampler import (
 14 |     LeafSubsampler,
 15 |     LeafSubsamplerError,
 16 | )
 17 | 
 18 | 
 19 | class UniformLeafSubsampler(LeafSubsampler):
 20 |     def __init__(
 21 |         self,
 22 |         ratio: Optional[float] = None,
 23 |         number_of_leaves: Optional[int] = None,
 24 |     ):
 25 |         """
 26 |         Uniformly subsample leaf samples of a CassiopeiaTree.
 27 | 
 28 |         If 'ratio' is provided, samples 'ratio' of the leaves, rounded down,
 29 |         uniformly at random. If instead 'number_of_leaves' is provided,
 30 |         'number_of_leaves' of the leaves are sampled uniformly at random. Only
 31 |         one of the two criteria can be provided.
 32 | 
 33 |         Args:
 34 |             ratio: Specifies the number of leaves to be sampled as a ratio of
 35 |                 the total number of leaves
 36 |             number_of_leaves: Explicitly specifies the number of leaves to be sampled
 37 |         """
 38 |         if ratio is None and number_of_leaves is None:
 39 |             raise LeafSubsamplerError(
 40 |                 "At least one of 'ratio' and 'number_of_leaves' "
 41 |                 "must be specified."
 42 |             )
 43 |         if ratio is not None and number_of_leaves is not None:
 44 |             raise LeafSubsamplerError(
 45 |                 "Exactly one of 'ratio' and 'number_of_leaves'"
 46 |                 "must be specified."
 47 |             )
 48 |         self.__ratio = ratio
 49 |         self.__number_of_leaves = number_of_leaves
 50 | 
 51 |     def subsample_leaves(
 52 |         self, tree: CassiopeiaTree, keep_singular_root_edge: bool = True
 53 |     ) -> CassiopeiaTree:
 54 |         """Uniformly subsample leaf samples of a given tree.
 55 | 
 56 |         Generates a uniform random sample on the leaves of the given
 57 |         CassiopeiaTree and returns a tree pruned to contain lineages relevant
 58 |         to only leaves in the sample (the "induced subtree" on the sample).
 59 |         All fields on the original character matrix persist, but maintains
 60 |         character states, meta data, and the dissimilarity map for the sampled
 61 |         cells only.
 62 | 
 63 |         Has the option to keep the single edge leading from the root in the
 64 |         induced subtree, if it exists. This edge is often used to represent the
 65 |         time that the root lives before any divisions occur in the phyologeny,
 66 |         and is useful in instances where the branch lengths are critical, like
 67 |         simulating ground truth phylogenies or estimating branch lengths.
 68 | 
 69 |         Args:
 70 |             tree: The CassiopeiaTree for which to subsample leaves
 71 |             keep_singular_root_edge: Whether or not to collapse the single edge
 72 |                 leading from the root in the subsample, if it exists
 73 | 
 74 |         Returns:
 75 |             A new CassiopeiaTree that is the induced subtree on a sample of the
 76 |                 leaves in the given tree
 77 | 
 78 |         Raises:
 79 |             LeafSubsamplerError if the sample size is <= 0, or larger than the
 80 |                 number of leaves in the tree
 81 |         """
 82 |         ratio = self.__ratio
 83 |         number_of_leaves = self.__number_of_leaves
 84 |         n_subsample = (
 85 |             number_of_leaves
 86 |             if number_of_leaves is not None
 87 |             else int(tree.n_cell * ratio)
 88 |         )
 89 |         if n_subsample <= 0:
 90 |             raise LeafSubsamplerError(
 91 |                 "Specified number of leaves sampled is <= 0."
 92 |             )
 93 |         if n_subsample > tree.n_cell:
 94 |             raise LeafSubsamplerError(
 95 |                 "Specified number of leaves sampled is greater than the number"
 96 |                 " of leaves in the given tree."
 97 |             )
 98 | 
 99 |         n_remove = len(tree.leaves) - n_subsample
100 |         subsampled_tree = copy.deepcopy(tree)
101 |         leaf_remove = np.random.choice(
102 |             subsampled_tree.leaves, n_remove, replace=False
103 |         )
104 | 
105 |         subsampled_tree.remove_leaves_and_prune_lineages(leaf_remove)
106 | 
107 |         # Keep the singular root edge if it exists and is indicated to be kept
108 |         if (
109 |             len(subsampled_tree.children(subsampled_tree.root)) == 1
110 |             and keep_singular_root_edge
111 |         ):
112 |             collapse_source = subsampled_tree.children(subsampled_tree.root)[0]
113 |         else:
114 |             collapse_source = None
115 |         subsampled_tree.collapse_unifurcations(source=collapse_source)
116 | 
117 |         # Copy and annotate branch lengths and times
118 |         subsampled_tree.set_times(
119 |             dict(
120 |                 [(node, tree.get_time(node)) for node in subsampled_tree.nodes]
121 |             )
122 |         )
123 | 
124 |         return subsampled_tree
125 | 


--------------------------------------------------------------------------------
/cassiopeia/simulator/__init__.py:
--------------------------------------------------------------------------------
 1 | """Top level for simulator."""
 2 | 
 3 | from .BirthDeathFitnessSimulator import BirthDeathFitnessSimulator
 4 | from .BrownianSpatialDataSimulator import BrownianSpatialDataSimulator
 5 | from .Cas9LineageTracingDataSimulator import Cas9LineageTracingDataSimulator
 6 | from .SequentialLineageTracingDataSimulator import (
 7 |     SequentialLineageTracingDataSimulator,
 8 | )
 9 | from .ClonalSpatialDataSimulator import ClonalSpatialDataSimulator
10 | from .CompleteBinarySimulator import CompleteBinarySimulator
11 | from .DataSimulator import DataSimulator
12 | from .ecDNABirthDeathSimulator import ecDNABirthDeathSimulator
13 | from .LeafSubsampler import LeafSubsampler
14 | from .LineageTracingDataSimulator import LineageTracingDataSimulator
15 | from .SimpleFitSubcloneSimulator import SimpleFitSubcloneSimulator
16 | from .SupercellularSampler import SupercellularSampler
17 | from .TreeSimulator import TreeSimulator
18 | from .UniformLeafSubsampler import UniformLeafSubsampler
19 | from .SpatialLeafSubsampler import SpatialLeafSubsampler
20 | 
21 | 
22 | __all__ = [
23 |     "BirthDeathFitnessSimulator",
24 |     "BrownianSpatialDataSimulator",
25 |     "Cas9LineageTracingDataSimulator",
26 |     "SeqeuntialLineageTracingDataSimulator",
27 |     "CompleteBinarySimulator",
28 |     "DataSimulator",
29 |     "ecDNABirthDeathSimulator",
30 |     "LeafSubsampler",
31 |     "LineageTracingDataSimulator",
32 |     "SimpleFitSubcloneSimulator",
33 |     "SupercellularSampler",
34 |     "TreeSimulator",
35 |     "UniformLeafSubsampler",
36 | ]
37 | 


--------------------------------------------------------------------------------
/cassiopeia/solver/CassiopeiaSolver.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract class CassiopeiaSolver, for the phylogenetic inference module.
 3 | 
 4 | All algorithms are derived classes of this abstract class, and at a minimum
 5 | store an input character matrix and implement a method called `solve`. Each
 6 | derived class stores more information around the parameters necessary for
 7 | inferring a phylogenetic tree.
 8 | """
 9 | import abc
10 | from typing import Optional
11 | 
12 | from cassiopeia.data import CassiopeiaTree
13 | 
14 | 
15 | class CassiopeiaSolver(abc.ABC):
16 |     """
17 |     CassiopeiaSolver is an abstract class that all inference algorithms derive
18 |     from. At minimum, all CassiopeiaSolver subclasses will store a character
19 |     matrix and implement a solver procedure.
20 | 
21 |     Args:
22 |         prior_transformation: A function defining a transformation on the priors
23 |             in forming weights. Supports the following transformations:
24 |                 "negative_log": Transforms each probability by the negative log
25 |                 "inverse": Transforms each probability p by taking 1/p
26 |                 "square_root_inverse": Transforms each probability by the
27 |                     the square root of 1/p
28 |     """
29 | 
30 |     def __init__(self, prior_transformation: str = "negative_log"):
31 | 
32 |         self.prior_transformation = prior_transformation
33 | 
34 |     @abc.abstractmethod
35 |     def solve(
36 |         self,
37 |         cassiopeia_tree: CassiopeiaTree,
38 |         layer: Optional[str] = None,
39 |         collapse_mutationless_edges: bool = False,
40 |         logfile: str = "stdout.log",
41 |     ):
42 |         """Solves the inference problem.
43 | 
44 |         Args:
45 |             cassiopeia_tree: CassiopeiaTree storing character information for
46 |                 phylogenetic inference.
47 |             layer: Layer storing the character matrix for solving. If None, the
48 |                 default character matrix is used in the CassiopeiaTree.
49 |             collapse_mutationless_edges: Indicates if the final reconstructed
50 |                 tree should collapse mutationless edges based on internal states
51 |                 inferred by Camin-Sokal parsimony. In scoring accuracy, this
52 |                 removes artifacts caused by arbitrarily resolving polytomies.
53 |             logfile: File location to log output.
54 |         """
55 |         pass
56 | 


--------------------------------------------------------------------------------
/cassiopeia/solver/__init__.py:
--------------------------------------------------------------------------------
 1 | """Top level for Tree Solver development."""
 2 | 
 3 | from .HybridSolver import HybridSolver
 4 | from .ILPSolver import ILPSolver
 5 | from .MaxCutGreedySolver import MaxCutGreedySolver
 6 | from .MaxCutSolver import MaxCutSolver
 7 | from .NeighborJoiningSolver import NeighborJoiningSolver
 8 | from .PercolationSolver import PercolationSolver
 9 | from .SharedMutationJoiningSolver import SharedMutationJoiningSolver
10 | from .SpectralGreedySolver import SpectralGreedySolver
11 | from .SpectralSolver import SpectralSolver
12 | from .UPGMASolver import UPGMASolver
13 | from .VanillaGreedySolver import VanillaGreedySolver
14 | from .SpectralNeighborJoiningSolver import SpectralNeighborJoiningSolver
15 | from . import dissimilarity_functions as dissimilarity
16 | 


--------------------------------------------------------------------------------
/cassiopeia/solver/missing_data_methods.py:
--------------------------------------------------------------------------------
  1 | """This file contains included missing data imputation methods."""
  2 | 
  3 | from typing import Dict, List, Optional, Tuple, Union
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | 
  8 | from cassiopeia.mixins import is_ambiguous_state, unravel_ambiguous_states
  9 | from cassiopeia.solver import solver_utilities
 10 | 
 11 | 
 12 | def assign_missing_average(
 13 |     character_matrix: pd.DataFrame,
 14 |     missing_state_indicator: int,
 15 |     left_set: List[str],
 16 |     right_set: List[str],
 17 |     missing: List[str],
 18 |     weights: Optional[Dict[int, Dict[int, float]]] = None,
 19 | ) -> Tuple[List[str], List[str]]:
 20 |     """Implements the "Average" missing data imputation method.
 21 | 
 22 |     An on-the-fly missing data imputation method for the VanillaGreedy
 23 |     Solver and variants. It takes in a set of samples that have a missing
 24 |     value at the character chosen to split on in a partition. For each of
 25 |     these samples, it calculates the average number of mutations that
 26 |     samples on each side of the partition share with it and places the
 27 |     sample on the side with the higher value.
 28 | 
 29 |     Args:
 30 |         character_matrix: The character matrix containing the observed
 31 |             character states for the samples
 32 |         missing_state_indicator: The character representing missing values
 33 |         left_set: A list of the samples on the left of the partition,
 34 |             represented by their names in the original character matrix
 35 |         right_set: A list of the samples on the right of the partition,
 36 |             represented by their names in the original character matrix
 37 |         missing: A list of samples with missing data to be imputed,
 38 |             represented by their names in the original character matrix
 39 |         weights: A set of optional weights for character/state mutation pairs
 40 | 
 41 |     Returns:
 42 |         A tuple of lists, representing the left and right partitions with
 43 |         missing samples imputed
 44 |     """
 45 | 
 46 |     # A helper function to calculate the number of shared character/state pairs
 47 |     # shared between a missing sample and a side of the partition
 48 |     sample_names = list(character_matrix.index)
 49 |     character_array = character_matrix.to_numpy()
 50 |     left_indices = solver_utilities.convert_sample_names_to_indices(
 51 |         sample_names, left_set
 52 |     )
 53 |     right_indices = solver_utilities.convert_sample_names_to_indices(
 54 |         sample_names, right_set
 55 |     )
 56 |     missing_indices = solver_utilities.convert_sample_names_to_indices(
 57 |         sample_names, missing
 58 |     )
 59 | 
 60 |     def score_side(subset_character_states, query_states, weights):
 61 |         score = 0
 62 |         for char in range(len(subset_character_states)):
 63 | 
 64 |             query_state = [
 65 |                 q
 66 |                 for q in query_states[char]
 67 |                 if q != 0 and q != missing_state_indicator
 68 |             ]
 69 |             all_states = np.array(subset_character_states[char])
 70 |             for q in query_state:
 71 |                 if weights:
 72 |                     score += weights[char][q] * np.count_nonzero(
 73 |                         all_states == q
 74 |                     )
 75 |                 else:
 76 |                     score += np.count_nonzero(all_states == q)
 77 | 
 78 |         return score
 79 | 
 80 |     subset_character_array_left = character_array[left_indices, :]
 81 |     subset_character_array_right = character_array[right_indices, :]
 82 | 
 83 |     all_left_states = [
 84 |         unravel_ambiguous_states(subset_character_array_left[:, char])
 85 |         for char in range(subset_character_array_left.shape[1])
 86 |     ]
 87 |     all_right_states = [
 88 |         unravel_ambiguous_states(subset_character_array_right[:, char])
 89 |         for char in range(subset_character_array_right.shape[1])
 90 |     ]
 91 | 
 92 |     for sample_index in missing_indices:
 93 | 
 94 |         all_states_for_sample = [
 95 |             unravel_ambiguous_states([character_array[sample_index, char]])
 96 |             for char in range(character_array.shape[1])
 97 |         ]
 98 | 
 99 |         left_score = score_side(
100 |             np.array(all_left_states, dtype=object),
101 |             np.array(all_states_for_sample, dtype=object),
102 |             weights,
103 |         )
104 |         right_score = score_side(
105 |             np.array(all_right_states, dtype=object),
106 |             np.array(all_states_for_sample, dtype=object),
107 |             weights,
108 |         )
109 | 
110 |         if (left_score / len(left_set)) > (right_score / len(right_set)):
111 |             left_set.append(sample_names[sample_index])
112 |         else:
113 |             right_set.append(sample_names[sample_index])
114 | 
115 |     return left_set, right_set
116 | 


--------------------------------------------------------------------------------
/cassiopeia/solver/solver_utilities.py:
--------------------------------------------------------------------------------
  1 | """This file contains general utilities to be called by functions throughout
  2 | the solver module"""
  3 | 
  4 | import logging
  5 | from typing import Dict, Generator, List, Optional
  6 | 
  7 | import ete3
  8 | from hashlib import blake2b
  9 | import numpy as np
 10 | import pandas as pd
 11 | import time
 12 | 
 13 | from cassiopeia.mixins import PriorTransformationError
 14 | 
 15 | 
 16 | def node_name_generator() -> Generator[str, None, None]:
 17 |     """Generates unique node names for building the reconstructed tree.
 18 | 
 19 |     Creates a generator object that produces unique node names by hashing
 20 |     timestamps.
 21 | 
 22 |     Returns:
 23 |         A generator object
 24 |     """
 25 | 
 26 |     while True:
 27 |         k = str(time.time()).encode("utf-8")
 28 |         h = blake2b(key=k, digest_size=12)
 29 |         yield "cassiopeia_internal_node" + h.hexdigest()
 30 | 
 31 | 
 32 | def collapse_unifurcations(tree: ete3.Tree) -> ete3.Tree:
 33 |     """Collapse unifurcations.
 34 |     Collapse all unifurcations in the tree, namely any node with only one child
 35 |     should be removed and all children should be connected to the parent node.
 36 |     Args:
 37 |         tree: tree to be collapsed
 38 |     Returns:
 39 |         A collapsed tree.
 40 |     """
 41 | 
 42 |     collapse_fn = lambda x: (len(x.children) == 1)
 43 | 
 44 |     collapsed_tree = tree.copy()
 45 |     to_collapse = [n for n in collapsed_tree.traverse() if collapse_fn(n)]
 46 | 
 47 |     for n in to_collapse:
 48 |         n.delete()
 49 | 
 50 |     return collapsed_tree
 51 | 
 52 | 
 53 | def transform_priors(
 54 |     priors: Optional[Dict[int, Dict[int, float]]],
 55 |     prior_transformation: str = "negative_log",
 56 | ) -> Dict[int, Dict[int, float]]:
 57 |     """Generates a dictionary of weights from priors.
 58 | 
 59 |     Generates a dictionary of weights from given priors for each character/state
 60 |     pair for use in algorithms that inherit the GreedySolver. Supported
 61 |     transformations include negative log, negative log square root, and inverse.
 62 | 
 63 |     Args:
 64 |         priors: A dictionary of prior probabilities for each character/state
 65 |             pair
 66 |         prior_transformation: A function defining a transformation on the priors
 67 |             in forming weights. Supports the following transformations:
 68 |                 "negative_log": Transforms each probability by the negative log
 69 |                 "inverse": Transforms each probability p by taking 1/p
 70 |                 "square_root_inverse": Transforms each probability by the
 71 |                     the square root of 1/p
 72 | 
 73 |     Returns:
 74 |         A dictionary of weights for each character/state pair
 75 |     """
 76 |     if prior_transformation not in [
 77 |         "negative_log",
 78 |         "inverse",
 79 |         "square_root_inverse",
 80 |     ]:
 81 |         raise PriorTransformationError(
 82 |             "Please select one of the supported prior transformations."
 83 |         )
 84 | 
 85 |     prior_function = lambda x: -np.log(x)
 86 | 
 87 |     if prior_transformation == "square_root_inverse":
 88 |         prior_function = lambda x: (np.sqrt(1 / x))
 89 |     if prior_transformation == "inverse":
 90 |         prior_function = lambda x: 1 / x
 91 | 
 92 |     weights = {}
 93 |     for character in priors:
 94 |         state_weights = {}
 95 |         for state in priors[character]:
 96 |             p = priors[character][state]
 97 |             if p <= 0.0 or p > 1.0:
 98 |                 raise PriorTransformationError(
 99 |                     "Please make sure all priors have a value between 0 and 1"
100 |                 )
101 |             state_weights[state] = prior_function(p)
102 |         weights[character] = state_weights
103 |     return weights
104 | 
105 | 
106 | def convert_sample_names_to_indices(
107 |     names: List[str], samples: List[str]
108 | ) -> List[int]:
109 |     """Maps samples to their integer indices in a given set of names.
110 | 
111 |     Used to map sample string names to the their integer positions in the index
112 |     of the original character matrix for efficient indexing operations.
113 | 
114 |     Args:
115 |         names: A list of sample names, represented by their string names in the
116 |             original character matrix
117 |         samples: A list of sample names representing the subset to be mapped to
118 |             integer indices
119 | 
120 |     Returns:
121 |         A list of samples mapped to integer indices
122 |     """
123 |     name_to_index = dict(zip(names, range(len(names))))
124 | 
125 |     return list(map(lambda x: name_to_index[x], samples))
126 | 
127 | def save_dissimilarity_as_phylip(
128 |         dissimilarity_map: pd.DataFrame, path: str
129 |     ) -> None:
130 |     """Saves a dissimilarity map as a phylip file.
131 | 
132 |     Args:
133 |         dissimilarity_map: A dissimilarity map
134 |         path: The path to save the phylip file
135 | 
136 |     Returns:
137 |         None
138 |     """
139 |     dissimilarity_np = dissimilarity_map.to_numpy()
140 |     n = dissimilarity_np.shape[0]
141 |     with open(path, "w") as f:
142 |         f.write("{}\n".format(n))
143 |         for i in range(n):
144 |             row = dissimilarity_np[i, :i+1]
145 |             formatted_values = '\t'.join(map('{:.4f}'.format, row))
146 |             f.write("{}\t{}\n".format(dissimilarity_map.index[i], formatted_values))
147 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/__init__.py:
--------------------------------------------------------------------------------
 1 | """Top level for tools."""
 2 | 
 3 | from .autocorrelation import compute_morans_i
 4 | from .branch_length_estimator import IIDExponentialBayesian, IIDExponentialMLE
 5 | from .coupling import compute_evolutionary_coupling
 6 | from .fitness_estimator import (
 7 |     FitnessEstimator,
 8 |     FitnessEstimatorError,
 9 |     LBIJungle,
10 | )
11 | from .parameter_estimators import (
12 |     estimate_missing_data_rates,
13 |     estimate_mutation_rate,
14 | )
15 | from .small_parsimony import fitch_count, fitch_hartigan, score_small_parsimony
16 | from .topology import compute_cophenetic_correlation, compute_expansion_pvalues
17 | from .tree_metrics import (
18 |     calculate_likelihood_continuous,
19 |     calculate_likelihood_discrete,
20 |     calculate_parsimony,
21 | )
22 | 
23 | 
24 | __all__ = [
25 |     "calculate_likelihood_continuous",
26 |     "calculate_likelihood_discrete",
27 |     "calculate_parsimony",
28 |     "compute_morans_i",
29 |     "compute_evolutionary_coupling",
30 |     "estimate_missing_data_rates",
31 |     "estimate_mutation_rate",
32 |     "fitch_count",
33 |     "fitch_hartigan",
34 |     "score_small_parsimony",
35 |     "compute_cophenetic_correlation",
36 |     "compute_expansion_pvalues",
37 | ]
38 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/autocorrelation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility file for computing autocorrelation statistics on trees.
  3 | """
  4 | from typing import Callable, List, Optional, Union
  5 | import numpy as np
  6 | import pandas as pd
  7 | 
  8 | from cassiopeia.data import CassiopeiaTree
  9 | from cassiopeia.mixins import AutocorrelationError
 10 | from cassiopeia.data import utilities
 11 | 
 12 | 
 13 | def compute_morans_i(
 14 |     tree: CassiopeiaTree,
 15 |     meta_columns: Optional[List] = None,
 16 |     X: Optional[pd.DataFrame] = None,
 17 |     W: Optional[pd.DataFrame] = None,
 18 |     inverse_weight_fn: Callable[[Union[int, float]], float] = lambda x: 1.0 / x,
 19 | ) -> Union[float, pd.DataFrame]:
 20 |     """Computes Moran's I statistic.
 21 | 
 22 |     Using the cross-correlation between leaves as specified on the tree, compute
 23 |     the Moran's I statistic for each of the data items specified. This will
 24 |     only work for numerical data, and will thrown an error otherwise.
 25 | 
 26 |     Generally, this statistic takes in a weight matrix (which can be computed
 27 |     directly from a phylogenetic tree) and a set of numerical observations that
 28 |     are centered and standardized (i.e., mean 0 and population standard deviation
 29 |     of 1). Then, the Moran's I statistic is:
 30 | 
 31 |     I = X' * Wn * X
 32 | 
 33 |     where X' denotes a tranpose, * denotes the matrix multiplier, and Wn is the
 34 |     normalized weight matrix such that sum([w_i,j for all i,j]) = 1. 
 35 | 
 36 |     Inspired from the tools and code used in Chaligne et al, Nature Genetics
 37 |     2021.
 38 | 
 39 |     The mathematical details of the statistic can be found in:
 40 |         Wartenberg, "Multivariate Spatial Correlation: A Method for Exploratory
 41 |         Geographical Analysis", Geographical Analysis (1985)
 42 | 
 43 |     Args:
 44 |         tree: CassiopeiaTree
 45 |         meta_columns: Columns in the Cassiopeia Tree :attr:cell_meta object
 46 |             for which to compute autocorrelations
 47 |         X: Extra data matrix for computing autocorrelations.
 48 |         W: Phylogenetic weight matrix. If this is not specified, then the
 49 |             weight matrix will be computed within the function.
 50 |         inverse_weight_fn: Inverse function to apply to the weights, if the
 51 |             weight matrix must be computed.
 52 | 
 53 |     Returns:
 54 |         Moran's I statistic
 55 |     """
 56 | 
 57 |     if X is None and meta_columns is None:
 58 |         raise AutocorrelationError(
 59 |             "Specify data for computing autocorrelations."
 60 |         )
 61 | 
 62 |     _X = None
 63 |     if meta_columns is not None:
 64 |         _X = tree.cell_meta[meta_columns]
 65 | 
 66 |     if X is not None:
 67 |         if len(np.intersect1d(tree.leaves, X.index)) != tree.n_cell:
 68 |             raise AutocorrelationError(
 69 |                 "Specified argument X must be a dataframe with identical"
 70 |                 " indices to the leaves of the CassiopeiaTree."
 71 |             )
 72 | 
 73 |         _X = pd.concat([_X, X], axis=0)
 74 | 
 75 |     # check to make sure all values are numerical
 76 |     if not np.all(
 77 |         _X.apply(lambda s: pd.to_numeric(s, errors="coerce").notnull().all())
 78 |     ):
 79 |         raise AutocorrelationError(
 80 |             "There are some columns that are not numeric in the specified data."
 81 |         )
 82 |     
 83 |     # cast to numeric
 84 |     _X = _X.apply(lambda s: pd.to_numeric(s, errors="coerce"))
 85 | 
 86 |     # instantiate the weight matrix if None is specified
 87 |     if W is None:
 88 |         W = utilities.compute_phylogenetic_weight_matrix(
 89 |             tree, inverse=True, inverse_fn=inverse_weight_fn
 90 |         )
 91 | 
 92 |     # make sure that W has the correct indices
 93 |     if len(np.intersect1d(tree.leaves, W.index)) != tree.n_cell:
 94 |             raise AutocorrelationError(
 95 |                 "Weight matrix does not have the same leaves as the tree."
 96 |             )
 97 | 
 98 |     N = tree.n_cell
 99 | 
100 |     # normalize W to 1
101 |     _W = W / W.sum().sum()
102 | 
103 |     # center and standardize _X
104 |     _X = (_X - _X.mean()) / _X.std(axis=0, ddof=0)
105 | 
106 |     I = _X.T.dot(_W).dot(_X)
107 | 
108 |     # if we're only testing one variable, return a float
109 |     if _X.shape[1] == 1:
110 |         I = I.iloc[0, 0]
111 | 
112 |     return I
113 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/branch_length_estimator/BranchLengthEstimator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract class BranchLengthEstimator, for the branch length estimation module.
 3 | 
 4 | All algorithms are derived classes of this abstract class, and at a minimum
 5 | implement a method called `estimate_branch_lengths`.
 6 | """
 7 | import abc
 8 | 
 9 | from cassiopeia.data import CassiopeiaTree
10 | 
11 | 
12 | class BranchLengthEstimator(abc.ABC):
13 |     """
14 |     BranchLengthEstimator is an abstract class that all branch length
15 |     estimation algorithms derive from. At minimum, all BranchLengthEstimator
16 |     subclasses will implement a method called `estimate_branch_lengths`.
17 |     """
18 | 
19 |     @abc.abstractmethod
20 |     def estimate_branch_lengths(self, tree: CassiopeiaTree) -> None:
21 |         """Estimates branch lengths for the given tree.
22 | 
23 |         Args:
24 |             cassiopeia_tree: CassiopeiaTree storing character information
25 |                 and an initialized tree topology.
26 |         """
27 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/branch_length_estimator/__init__.py:
--------------------------------------------------------------------------------
1 | """Top level for branch length estimator."""
2 | 
3 | from .IIDExponentialMLE import IIDExponentialMLE
4 | from .IIDExponentialBayesian import IIDExponentialBayesian
5 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.pxd:
--------------------------------------------------------------------------------
 1 | from libcpp.vector cimport vector
 2 | from libcpp.map cimport map
 3 | from libcpp.pair cimport pair
 4 | 
 5 | # Declare the class with cdef
 6 | cdef extern from "_iid_exponential_bayesian_cpp.h":
 7 |     cdef cppclass _InferPosteriorTimes:
 8 |         _InferPosteriorTimes() except +
 9 |         void run(
10 |             int N,
11 |             vector[vector[int]] children,
12 |             int root,
13 |             vector[int] is_internal_node,
14 |             vector[int] get_number_of_mutated_characters_in_node,
15 |             vector[int] non_root_internal_nodes,
16 |             vector[int] leaves,
17 |             vector[int] parent,
18 |             int K,
19 |             vector[int] K_non_missing,
20 |             int T,
21 |             double r,
22 |             double lam,
23 |             double sampling_probability,
24 |             vector[int] is_leaf,
25 |         ) except +
26 |         vector[pair[int, double]] get_posterior_means_res()
27 |         vector[pair[int, vector[double]]] get_posteriors_res()
28 |         vector[pair[int, vector[double]]] get_log_joints_res()
29 |         double get_log_likelihood_res()
30 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.pyx:
--------------------------------------------------------------------------------
  1 | # distutils: language = c++
  2 | 
  3 | from ._iid_exponential_bayesian cimport _InferPosteriorTimes
  4 | from libcpp.vector cimport vector
  5 | from libcpp.map cimport map
  6 | 
  7 | from typing import List, Tuple
  8 | 
  9 | # Create a Cython extension type which holds a C++ instance
 10 | # as an attribute and create a bunch of forwarding methods
 11 | # Python extension type.
 12 | cdef class _PyInferPosteriorTimes:
 13 |     """
 14 |     Infer posterior node times under the Bayesian model.
 15 | 
 16 |     The method 'run' takes in all the information needed to perform inference.
 17 |     """
 18 |     cdef _InferPosteriorTimes* c_infer_posterior_times
 19 | 
 20 |     def __cinit__(self):
 21 |         self.c_infer_posterior_times = new _InferPosteriorTimes();
 22 | 
 23 |     def run(
 24 |         self,
 25 |         int N,
 26 |         vector[vector[int]] children,
 27 |         int root,
 28 |         vector[int] is_internal_node,
 29 |         vector[int] get_number_of_mutated_characters_in_node,
 30 |         vector[int] non_root_internal_nodes,
 31 |         vector[int] leaves,
 32 |         vector[int] parent,
 33 |         int K,
 34 |         vector[int] K_non_missing,
 35 |         int T,
 36 |         double r,
 37 |         double lam,
 38 |         double sampling_probability,
 39 |         vector[int] is_leaf,
 40 |     ):
 41 |         """
 42 |         Infer posterior node time distribution.
 43 | 
 44 |         Args:
 45 |             N: Number of nodes in tree.
 46 |             children: Adjacency list of graph.
 47 |             root: Root of graph.
 48 |             is_internal_node: Binary indicator for whether the node is internal
 49 |                 or not.
 50 |             get_number_of_mutated_characters_in_node: Number of mutated
 51 |                 characters in the node.
 52 |             non_root_internal_nodes: The non-root internal nodes.
 53 |             leaves: The leaves of the tree.
 54 |             parent: The parent of each node in the tree (or a negative number
 55 |                 for the root)
 56 |             K: The number of characters
 57 |             K_non_missing: The number of non-missing characters in each node.
 58 |             T: The number of timesteps of the discretization.
 59 |             r: The CRISRP/Cas9 mutation rate.
 60 |             lam: The birth rate.
 61 |             sampling_probability: The probability that a leaf is subsampled from
 62 |                 the ground truth phylogeny.
 63 |             is_leaf: Binary indicator for whether a node is a leaf or not.
 64 | 
 65 |         Raises:
 66 |             ValueError if the discretization level T is too small.
 67 |         """
 68 |         self.c_infer_posterior_times.run(
 69 |             N,
 70 |             children,
 71 |             root,
 72 |             is_internal_node,
 73 |             get_number_of_mutated_characters_in_node,
 74 |             non_root_internal_nodes,
 75 |             leaves,
 76 |             parent,
 77 |             K,
 78 |             K_non_missing,
 79 |             T,
 80 |             r,
 81 |             lam,
 82 |             sampling_probability,
 83 |             is_leaf,
 84 |         )
 85 | 
 86 |     def get_posterior_means_res(self) -> List[Tuple[int, float]]:
 87 |         """
 88 |         Posterior mean node times.
 89 | 
 90 |         Returns a list of tuples (node, posterior_time), containing the posterior mean
 91 |         time 'posterior_time' of node 'node'.
 92 |         """
 93 |         return self.c_infer_posterior_times.get_posterior_means_res()
 94 |     
 95 |     def get_posteriors_res(self) -> List[Tuple[int, List[float]]]:
 96 |         """
 97 |         Posterior node time distributions.
 98 | 
 99 |         Returns a list of tuples (node, posterior_time_distribution), containing
100 |         the posterior time 'posterior_time_distribution' of node 'node'. Here
101 |         'posterior_time_distribution' is a list of length T + 1, where
102 |         posterior_time_distribution[t] is the posterior probability that node
103 |         'node' has (discretized) time t.
104 | 
105 |         Note that this is the normalized version of get_log_joints_res.
106 |         """
107 |         return self.c_infer_posterior_times.get_posteriors_res()
108 |     
109 |     def get_log_joints_res(self) -> List[Tuple[int, List[float]]]:
110 |         """
111 |         Joint (node, time) log probabilities.
112 | 
113 |         Returns a list of tuples (node, log_joint), containing the log joint
114 |         probability of node 'node' taking a given (discretized) time t (given
115 |         the observed character matrix and tree topology); this is log_joint[t].
116 | 
117 |         Note that this is the unnormalized version of get_log_joints_res.
118 |         """
119 |         return self.c_infer_posterior_times.get_log_joints_res()
120 |     
121 |     def get_log_likelihood_res(self):
122 |         """
123 |         Log likelihood of the observed data.
124 | 
125 |         The log likelihood of the observed character matrix and tree topology
126 |         under the Bayesian model.
127 | 
128 |         Note that this is just the log-sum-exp of get_log_joints_res for any
129 |         node.
130 |         """
131 |         return self.c_infer_posterior_times.get_log_likelihood_res()
132 | 
133 |     def __dealloc__(self):
134 |         del self.c_infer_posterior_times
135 | 
136 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian_cpp.h:
--------------------------------------------------------------------------------
 1 | #ifndef _IID_EXPONENTIAL_BAYESIAN_CPP_H
 2 | #define _IID_EXPONENTIAL_BAYESIAN_CPP_H
 3 | 
 4 | #include <vector>
 5 | 
 6 | using namespace std;
 7 | 
 8 | class _InferPosteriorTimes{
 9 |     public:
10 |         _InferPosteriorTimes();
11 |         ~_InferPosteriorTimes();
12 |         void run(
13 |             int N,
14 |             vector<vector<int> > children,
15 |             int root,
16 |             vector<int> is_internal_node,
17 |             vector<int> get_number_of_mutated_characters_in_node,
18 |             vector<int> non_root_internal_nodes,
19 |             vector<int> leaves,
20 |             vector<int> parent,
21 |             int K,
22 |             vector<int> K_non_missing,
23 |             int T,
24 |             double r,
25 |             double lam,
26 |             double sampling_probability,
27 |             vector<int> is_leaf
28 |         );
29 |         // The following methods access the results of the run() method.
30 |         vector<pair<int, double> > get_posterior_means_res();
31 |         vector<pair<int, vector<double> > > get_posteriors_res();
32 |         vector<pair<int, vector<double> > > get_log_joints_res();
33 |         double get_log_likelihood_res();
34 | 
35 |         private:
36 |             // These are the parameters to the run() call.
37 |             int N;
38 |             vector<vector<int> > children;
39 |             int root;
40 |             vector<int> is_internal_node;
41 |             vector<int> get_number_of_mutated_characters_in_node;
42 |             vector<int> non_root_internal_nodes;
43 |             vector<int> leaves;
44 |             vector<int> parent;
45 |             int K;
46 |             vector<int> K_non_missing;
47 |             int T;
48 |             double r;
49 |             double lam;
50 |             double sampling_probability;
51 |             vector<int> is_leaf;
52 | 
53 |             // These are computed internally.
54 |             double dt;
55 |             double*** down_cache;  // [N][T + 1][K]
56 |             double*** up_cache;  // [N][T + 1][K]
57 |             double* p_unsampled;  // [T + 1]
58 |             double** log_joints;  // [N][T + 1]
59 |             double** posteriors;  // [N][T + 1]
60 |             double* posterior_means;  // [N]
61 | 
62 |             void allocate_memory();
63 |             void deallocate_memory();
64 |             void precompute_p_unsampled();
65 |             pair<int, int> valid_cuts_range(int v);
66 |             bool state_is_valid(int v, int x);
67 |             double down(int v, int t, int x);
68 |             double up(int v, int t, int x);
69 |             void populate_down_res();
70 |             void populate_up_res();
71 |             void populate_log_likelihood_res();
72 |             double compute_log_joint(int v, int t);
73 |             void populate_log_joints_res();
74 |             void populate_posteriors_res();
75 |             void populate_posterior_means_res();
76 |             void populate_posterior_results();
77 | 
78 |             vector<pair<vector<int>, double> > down_res;
79 |             vector<pair<vector<int>, double> > up_res;
80 |             vector<pair<int, double> > posterior_means_res;
81 |             vector<pair<int, vector<double> > > posteriors_res;
82 |             vector<pair<int, vector<double> > > log_joints_res;
83 |             double log_likelihood_res;
84 | };
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/coupling.py:
--------------------------------------------------------------------------------
  1 | """
  2 | File storing functionality for computing coupling statistics between meta
  3 | variables on a tree.
  4 | """
  5 | from typing import Callable, Optional
  6 | 
  7 | from collections import defaultdict
  8 | import numpy as np
  9 | import pandas as pd
 10 | from tqdm import tqdm
 11 | 
 12 | from cassiopeia.data import CassiopeiaTree
 13 | from cassiopeia.data import utilities as data_utilities
 14 | 
 15 | 
 16 | def compute_evolutionary_coupling(
 17 |     tree: CassiopeiaTree,
 18 |     meta_variable: str,
 19 |     minimum_proportion: float = 0.05,
 20 |     number_of_shuffles: int = 500,
 21 |     random_state: Optional[np.random.RandomState] = None,
 22 |     dissimilarity_map: Optional[pd.DataFrame] = None,
 23 |     cluster_comparison_function: Callable = data_utilities.net_relatedness_index,
 24 |     **comparison_kwargs,
 25 | ) -> pd.DataFrame:
 26 |     """Computes Evolutionary Coupling of categorical variables.
 27 | 
 28 |     Using the methodology described in Yang, Jones et al, BioRxiv (2021), this
 29 |     function will compute the "evolutionary coupling" statistic between values
 30 |     that a categorical variable can take on with the tree. For example, this
 31 |     categorical variable can be a "cell type", and this function will compute
 32 |     the evolutionary couplings between all types of cell types. This indicates
 33 |     how closely related these cell types are to one another.
 34 | 
 35 |     Briefly, this statistic is the Z-normalized mean distance between categories
 36 |     in the specified categorical variable. Note that empirical nulls that have a
 37 |     standard deviation of 0 lead to NaNs in the resulting evolutionary coupling
 38 |     matrix. 
 39 | 
 40 |     The computational complexity of this function is
 41 |     O(n^2 log n + (B+1)(K^2 * O(distance_function)) for a tree with n leaves, a
 42 |     variable with K categories, and B random shuffles.
 43 | 
 44 |     Args:
 45 |         tree: CassiopeiaTree
 46 |         meta_variable: Column in `tree.cell_meta` that stores a categorical
 47 |             variable with K categories.
 48 |         minimum_proportion: Minimum proportion of cells that a category needs
 49 |             to appear in to be considered.
 50 |         number_of_shuffles: Number of times to shuffle the data to compute the
 51 |             empirical Z score.
 52 |         random_state: Numpy random state to parameterize the shuffling.
 53 |         dissimilarity_map: A precomputed dissimilarity map between all leaves.
 54 |         cluster_comparison_function: A function for comparing the mean distance
 55 |             between groups. By default, this is the Net Relatedness Index.
 56 |         **comparison_kwargs: Extra arguments to pass to the cluster comparison
 57 |             function.
 58 | 
 59 |     Returns:
 60 |         A K x K evolutionary coupling dataframe. 
 61 |     """
 62 | 
 63 |     W = (
 64 |         data_utilities.compute_phylogenetic_weight_matrix(tree)
 65 |         if (dissimilarity_map is None)
 66 |         else dissimilarity_map
 67 |     )
 68 | 
 69 |     meta_data = tree.cell_meta[meta_variable]
 70 | 
 71 |     # subset meta data by minimum proportion
 72 |     if minimum_proportion > 0:
 73 |         filter_threshold = int(len(tree.leaves) * minimum_proportion)
 74 |         category_frequencies = meta_data.value_counts()
 75 |         passing_categories = category_frequencies[
 76 |             category_frequencies > filter_threshold
 77 |         ].index.values
 78 |         meta_data = meta_data[meta_data.isin(passing_categories)]
 79 |         W = W.loc[meta_data.index.values, meta_data.index.values]
 80 | 
 81 |     # compute inter-cluster distances
 82 |     inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
 83 |         tree,
 84 |         meta_data=meta_data,
 85 |         dissimilarity_map=W,
 86 |         distance_function=cluster_comparison_function,
 87 |         **comparison_kwargs,
 88 |     )
 89 | 
 90 |     # compute background for Z-scoring
 91 |     background = defaultdict(list)
 92 |     for _ in tqdm(
 93 |         range(number_of_shuffles), desc="Creating empirical background"
 94 |     ):
 95 |         permuted_assignments = meta_data.copy()
 96 |         if random_state:
 97 |             permuted_assignments.index = random_state.permutation(
 98 |                 meta_data.index.values
 99 |             )
100 |         else:
101 |             permuted_assignments.index = np.random.permutation(
102 |                 meta_data.index.values
103 |             )
104 |         background_distances = data_utilities.compute_inter_cluster_distances(
105 |             tree,
106 |             meta_data=permuted_assignments,
107 |             dissimilarity_map=W,
108 |             distance_function=cluster_comparison_function,
109 |             **comparison_kwargs,
110 |         )
111 |         for s1 in background_distances.index:
112 |             for s2 in background_distances.columns:
113 |                 background[(s1, s2)].append(background_distances.loc[s1, s2])
114 | 
115 |     Z_scores = inter_cluster_distances.copy()
116 |     for s1 in Z_scores.index:
117 |         for s2 in Z_scores.columns:
118 |             mean = np.mean(background[(s1, s2)])
119 |             sd = np.std(background[(s1, s2)])
120 | 
121 |             Z_scores.loc[s1, s2] = (
122 |                 inter_cluster_distances.loc[s1, s2] - mean
123 |             ) / sd
124 | 
125 |     return Z_scores
126 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_FitnessEstimator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract class FitnessEstimator, for the fitness estimation module.
 3 | 
 4 | All algorithms are derived classes of this abstract class, and at a minimum
 5 | implement a method called `estimate_fitness`. Fitness will be stored as
 6 | the attribute 'fitness' of each node.
 7 | """
 8 | import abc
 9 | 
10 | from cassiopeia.data import CassiopeiaTree
11 | 
12 | 
13 | class FitnessEstimatorError(Exception):
14 |     """An Exception class for the FitnessEstimator class."""
15 | 
16 |     pass
17 | 
18 | 
19 | class FitnessEstimator(abc.ABC):
20 |     """
21 |     FitnessEstimator is an abstract class that all fitness
22 |     estimation algorithms derive from. At minimum, all FitnessEstimator
23 |     subclasses will implement a method called `estimate_fitness`.
24 |     Fitness will be stored as the attribute 'fitness' of each node.
25 |     """
26 | 
27 |     @abc.abstractmethod
28 |     def estimate_fitness(self, tree: CassiopeiaTree) -> None:
29 |         """Estimates fitness for each node in the tree.
30 | 
31 |         Fitness will be stored as the attribute 'fitness' of each node.
32 | 
33 |         Args:
34 |             cassiopeia_tree: CassiopeiaTree storing an initialized
35 |             tree topology with estimated branch lengths.
36 |         """
37 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/__init__.py:
--------------------------------------------------------------------------------
1 | """Top level for fitness estimator."""
2 | 
3 | from ._FitnessEstimator import FitnessEstimator, FitnessEstimatorError
4 | from ._lbi_jungle import LBIJungle
5 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 Felix Horns
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/__init__.py:
--------------------------------------------------------------------------------
1 | from .forest import *
2 | from .sfs import *
3 | from .size_matched_model import *
4 | from .tree import *
5 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/.gitignore:
--------------------------------------------------------------------------------
  1 | *fasta
  2 | *pickle
  3 | *dat
  4 | *~
  5 | *pyc
  6 | *txt
  7 | *nwk
  8 | notes/#*
  9 | src/#*
 10 | adaptation_ms/#*
 11 | .#*
 12 | *aux
 13 | *out
 14 | *log
 15 | *bbl
 16 | *blg
 17 | *.o
 18 | auto
 19 | *el
 20 | *.fdb_latexmk
 21 | *.py.*
 22 | 
 23 | *.py[cod]
 24 | 
 25 | # C extensions
 26 | *.so
 27 | 
 28 | # Packages
 29 | *.egg
 30 | *.egg-info
 31 | dist
 32 | build
 33 | eggs
 34 | parts
 35 | bin
 36 | var
 37 | sdist
 38 | develop-eggs
 39 | .installed.cfg
 40 | lib
 41 | lib64
 42 | __pycache__
 43 | 
 44 | # Installer logs
 45 | pip-log.txt
 46 | 
 47 | # Unit test / coverage reports
 48 | .coverage
 49 | .tox
 50 | nosetests.xml
 51 | 
 52 | # Translations
 53 | *.mo
 54 | 
 55 | *.py[cod]
 56 | 
 57 | # C extensions
 58 | *.so
 59 | 
 60 | # Packages
 61 | *.egg
 62 | *.egg-info
 63 | dist
 64 | build
 65 | eggs
 66 | parts
 67 | bin
 68 | var
 69 | sdist
 70 | develop-eggs
 71 | .installed.cfg
 72 | lib
 73 | lib64
 74 | __pycache__
 75 | 
 76 | # Installer logs
 77 | pip-log.txt
 78 | 
 79 | # Unit test / coverage reports
 80 | .coverage
 81 | .tox
 82 | nosetests.xml
 83 | 
 84 | # Translations
 85 | *.mo
 86 | 
 87 | # EMACS
 88 | *~
 89 | \#*\#
 90 | /.emacs.desktop
 91 | /.emacs.desktop.lock
 92 | .elc
 93 | auto-save-list
 94 | tramp
 95 | .\#*
 96 | 
 97 | # Org-mode
 98 | .org-id-locations
 99 | *_archive
100 | 
101 | # VIM
102 | *.s[a-w][a-z]
103 | *.un~
104 | Session.vim
105 | .netrwhist
106 | *~
107 | 
108 | # Eclipse
109 | *.pydevproject
110 | .project
111 | .metadata
112 | bin/**
113 | tmp/**
114 | tmp/**/*
115 | *.tmp
116 | *.bak
117 | *.swp
118 | *~.nib
119 | local.properties
120 | .classpath
121 | .settings/
122 | .loadpath
123 | 
124 | # External tool builders
125 | .externalToolBuilders/
126 | 
127 | # Locally stored "Eclipse launch configurations"
128 | *.launch
129 | 
130 | # CDT-specific
131 | .cproject
132 | 
133 | # PDT-specific
134 | .buildpath
135 | 
136 | flupred.geany
137 | flupred.komodoproject
138 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 rneher
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/README.md:
--------------------------------------------------------------------------------
 1 | ### Inferring fitness from the shape of trees 
 2 | 
 3 | This repository contains the code associated with the manuscript
 4 | 
 5 | Neher, Russell, Shraiman: "Predicting evolution from the shape of genealogical trees". accepted for publication in eLife
 6 | 
 7 | ---
 8 | 
 9 | The directory *prediction_src* contains the code base used for the fitness inference and prediction algorithms as well as classes to hold sequence data and trees adapted.
10 | 
11 | ---
12 | 
13 | The directory *flu* contains the code specific to our analysis of historical influenza data, scripts that generate the figures, the influenza sequences and annotation, analysis results and figure files.
14 | 
15 | ---
16 | 
17 | The directory *toy_data* contains the code to simulate adapting populations building on the FFPopSim library. In addition, it contains scripts to analyze this simulated data, the data itself and the resulting figures.
18 | 
19 | ---
20 | 
21 | #### Ranking sequences by the local branching index (LBI)
22 | 
23 | The script *rank_sequences.py* is a simple wrapper for the prediction tool that takes a multiple sequence alignment and the name of the outgroup as input (this outgroup needs to be in the MSA). It produces a folder containing a ranking of sequences, the inferred ancestral sequences, the reconstructed tree, and optionally a pdf of the marked up tree. This script uses the local branching index (LBI), rather than the full fitness inference to rank sequences.  
24 | 
25 | build-in help and optional arguments:
26 | 
27 |     ./rank_sequences.py --help
28 |     usage: rank_sequences.py [-h] --aln ALN --outgroup OUTGROUP
29 |                              [--eps_branch EPS_BRANCH] [--tau TAU]
30 |                              [--collapse [COLLAPSE]] [--plot [PLOT]]
31 |     
32 |     rank sequences in a multiple sequence aligment
33 |     
34 |     optional arguments:
35 |       -h, --help            show this help message and exit
36 |       --aln ALN             alignment of sequences to by ranked
37 |       --outgroup OUTGROUP   name of outgroup sequence
38 |       --eps_branch EPS_BRANCH
39 |                             minimal branch length for inference
40 |       --tau TAU             time scale for local tree length estimation (relative
41 |                             to average pairwise distance)
42 |       --collapse [COLLAPSE]
43 |                             collapse internal branches with identical sequences
44 |       --plot [PLOT]         plot trees
45 |     
46 | #### Inferring fitness distribution of nodes in the tree
47 | 
48 | The script *infer_fitness.py* also takes an alignment and outgroup as argument, but uses the full fitness inference to rank sequences and calculate the mean posterior and the variance of the posterior. Note that plausible posterior distributions require a that the parameter omega is well chosen. Also, the time conversion factor might need to be different from gamma=1 for optimal results.
49 | 
50 |     ./infer_fitness.py --help
51 |     usage: infer_fitness.py [-h] --aln ALN --outgroup OUTGROUP
52 |                             [--eps_branch EPS_BRANCH] [--diffusion DIFFUSION]
53 |                             [--gamma GAMMA] [--omega OMEGA]
54 |                             [--collapse [COLLAPSE]] [--plot [PLOT]]
55 |     
56 |     rank sequences in a multiple sequence aligment
57 |     
58 |     optional arguments:
59 |       -h, --help            show this help message and exit
60 |       --aln ALN             alignment of sequences to by ranked
61 |       --outgroup OUTGROUP   name of outgroup sequence
62 |       --eps_branch EPS_BRANCH
63 |                             minimal branch length for inference
64 |       --diffusion DIFFUSION
65 |                             fitness diffusion coefficient
66 |       --gamma GAMMA         scale factor for time scale, choose high (>2) for
67 |                             prediction, 1 for fitness inference
68 |       --omega OMEGA         approximate sampling fraction diveded by the fitness
69 |                             standard deviation
70 |       --collapse [COLLAPSE]
71 |                             collapse internal branches with identical sequences
72 |       --plot [PLOT]         plot trees
73 |     
74 | 
75 | 
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/__init__.py


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/prediction_src/README:
--------------------------------------------------------------------------------
 1 | This folder contains scripts at the core of the fitness estimation and prediction machinery.
 2 | 
 3 | ############################################################
 4 | solve_survival.py
 5 | 
 6 | this scripts provides a class that solves the branching process equation numerically and uses it to integrate the branch propagator between the desired time points. 
 7 | 
 8 | ############################################################
 9 | fitness_inference.py
10 | 
11 | this script provdies a class with the basic fitness inference. It uses the actual numerical solution for the propagator rather than the tree length approximation. 
12 | 
13 | ############################################################
14 | node_ranking.py
15 | 
16 | this scripts provides a number of utility functions for trees, in particular building trees, labeling, translating, etc. It also contains the class that establises a tree, infers the ancestral states and then infers fitness of all nodes. It provides functions for ranking nodes by different  methods, the major being the inferred fitness. It also colors its own trees.
17 | 
18 | ############################################################
19 | sequence_ranking.py
20 | 
21 | this script provides two classes: An alignment class which dresses an biopython alignment with an outgroup, a tree and an amino acid alignment if a coding region is provided. 
22 | 
23 | The other class is a subclass of node_ranking that takes an alignment as input and runs a prediction.
24 | 
25 | ############################################################
26 | ancestral.py
27 | 
28 | inference of ancestral sequences on a tree using a variant of dynamic programming to calculate the most likely sequences of internal nodes.
29 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/prediction_src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/FitnessInference/prediction_src/__init__.py


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/__init__.py


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Richard Neher
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/README:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | 
 3 | betatree is a collection of python scripts to generate trees from the beta coalescent ensemble, calculate properties of those trees and gather statistics across many instances. 
 4 | 
 5 | Authors: Taylor Kessinger and Richard Neher
 6 | 
 7 | Contact: richard.neher@tuebingen.mpg.de
 8 | 
 9 | Example output of the betatree generator are given as pdf files in example_trees. Example site frequency spectra for different parameters of the beta-coalescent ensemble are provided in example_SFS.
10 | 
11 | If you use betatree in a publication, please refer to 
12 | 
13 | Neher, Kessinger, and Shraiman. "Coalescence and Genetic Diversity in Sexual Populations under Selection." PNAS 110: 15836-41. doi:10.1073/pnas.1309697110.
14 | 
15 | 
16 | ################################################################################
17 | Tree generation
18 | 
19 | the script src/betatree.py provides a class that generates coalescent trees using pseudorandom numbers given an initial sample size n and a parameter alpha of the beta measure of the Lambda coalescent process. The following 3 lines will generate a tree of a sample size of 100 and draw it with the Biopython.Phylo package.
20 | 
21 |     myT = betatree(100,alpha = 2)
22 |     myT.coalesce()
23 |     Phylo.draw(myT.BioTree)
24 | 
25 | the tree is internally stored as a biopython.phylo tree with all the associated  functionality.
26 | 
27 | Sample code is appended to the definition of the class as will be exectuted if betatree.py is run as main.
28 | 
29 | ################################################################################
30 | Site frequency spectra
31 | 
32 | the script src/sfs.py uses the class betatree to generate many trees and calculate the SFS assuming that mutation are uniformly distributed on the tree. The following three lines will generate an SFS for a sample size 100 and alpha=1.5 by averaging 1000 trees.
33 | 
34 |     mySFS = SFS(n=100,alpha=1.5)
35 |     mySFS.getSFS(ntrees=1000)
36 | 
37 | The sfs is accessible as mySFS.sfs and can be binned using different binning schemes or a user defined binning. 
38 | 
39 |     mySFS.binSFS(mode='logit', bins=20)
40 |     plt.plot(mySFS.bin_center, mySFS.binned_sfs)
41 | 
42 | The sfs can be saved and loaded by member functions.
43 | 
44 | Sample code is appended to the definition of the class as will be exectuted if sfs.py is run as main.
45 | 
46 | 
47 | ################################################################################
48 | Dependencies:
49 | 
50 | python 2.7
51 | numpy
52 | scipy
53 | biopython
54 | matplotlib for plotting examples
55 | 
56 | ################################################################################
57 | 
58 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/__init__.py


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/cassiopeia/tools/fitness_estimator/_jungle/jungle/resources/betatree/src/__init__.py


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/jungle/size_matched_model.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | import json
  3 | 
  4 | import numpy as np
  5 | import scipy  # import scipy so its available in namespace for evaluating distributions
  6 | 
  7 | # TODO evaluate distribution in global namespace, so that import is not necessary here
  8 | 
  9 | 
 10 | class SizeMatchedModel:
 11 |     def __init__(self, bins, params, distribution, name=None):
 12 |         """Initialize SizeMatchedModel from a list of bins, a list of parameters, and a distribution"""
 13 |         assert (
 14 |             len(params) == len(bins) - 1
 15 |         ), "Length of params must be one less than length of bins"
 16 |         self.bins = bins
 17 |         self.params = params
 18 |         self.distribution = distribution
 19 |         self.name = name
 20 | 
 21 |     # @classmethod
 22 |     # def from_dict(cls, bin_to_params, distribution):
 23 |     #     """ Initialize SizeMatchedModel from a dictionary of bin-parameter mappings and a distribution """
 24 |     #     bins = bin_to_params.keys()
 25 |     #     params = bin_to_params.values()
 26 |     #     return SizeMatchedModel(bins, params, distribution)
 27 | 
 28 |     @classmethod
 29 |     def from_json(cls, filename):
 30 |         """Load SizeMatchedModel from JSON file"""
 31 |         with open(filename) as f:
 32 |             attributes_str = json.load(f)
 33 |         attributes = dict()
 34 |         attributes["bins"] = ast.literal_eval(attributes_str["bins"])
 35 |         attributes["params"] = ast.literal_eval(attributes_str["params"])
 36 |         attributes["name"] = ast.literal_eval(attributes_str["name"])
 37 |         distribution = eval(
 38 |             attributes_str["distribution"]
 39 |         )()  # evaluate class name and instantiate
 40 |         return SizeMatchedModel(
 41 |             attributes["bins"],
 42 |             attributes["params"],
 43 |             distribution,
 44 |             attributes["name"],
 45 |         )
 46 | 
 47 |     def to_json(self, outfile):
 48 |         """Write SizeMatchedModel to JSON file"""
 49 |         attributes = dict()
 50 |         attributes["bins"] = json.dumps(self.bins)
 51 |         attributes["params"] = json.dumps(self.params)
 52 |         attributes["name"] = json.dumps(self.name)
 53 | 
 54 |         # Get distribution class name
 55 |         # Distribution is a function, so we need to parse out the class name to save it in JSON format
 56 |         distribution_str = (
 57 |             self.distribution.__class__.__module__
 58 |             + "."
 59 |             + self.distribution.__class__.__name__
 60 |         )
 61 |         attributes["distribution"] = distribution_str
 62 | 
 63 |         with open(outfile, "w") as out:
 64 |             json.dump(attributes, out)
 65 | 
 66 |     def _params_for_size(self, size, strict_bounds=True):
 67 |         """Find parameters for bin that matches size"""
 68 | 
 69 |         # Find matching bin based on size
 70 |         bin_index = np.digitize(
 71 |             size, self.bins
 72 |         )  # digitize returns the index of the bin to which value belongs
 73 | 
 74 |         if (bin_index == 0 or bin_index == len(self.bins)) and strict_bounds:
 75 |             # if strict bounds are used, only allow values that fall strictly within the bins
 76 |             raise ValueError(
 77 |                 "Size must be within bounds of bins (if strict_bounds=True)"
 78 |             )
 79 | 
 80 |         if bin_index == 0 and not strict_bounds:
 81 |             # if loose bounds are used, values less than the bounds of bins should be set to smallest bin
 82 |             bin_index = 1
 83 | 
 84 |         if bin_index == len(self.bins) and not strict_bounds:
 85 |             # if loose bounds are used, values greater than the bounds of binds should be set to largest bin
 86 |             bin_index = len(self.bins) - 1
 87 | 
 88 |         # Adjust bin index to match indexing of params
 89 |         # np.digitize returns a one-indexed value, whereas params is a zero-indexed value
 90 |         # This line shifts the index, so that it matches the indexing of params
 91 |         bin_index = bin_index - 1
 92 | 
 93 |         # Get parameters of matching bin
 94 |         params_match = self.params[bin_index]
 95 | 
 96 |         return params_match
 97 | 
 98 |     def pvalue(self, x, size, invert_cdf=False, strict_bounds=True):
 99 |         """Calculate P value of x under model"""
100 | 
101 |         # Find model parameters for matching bin based on size
102 |         params = self._params_for_size(size, strict_bounds)
103 | 
104 |         # Calculate probability of finding the observed x, or more extreme, under model
105 |         p = self.distribution.cdf(x, *params)
106 | 
107 |         if invert_cdf:
108 |             p = 1 - p
109 | 
110 |         return p
111 | 
112 |     def model_mean(self, size, strict_bounds=True):
113 |         """Find mean of model for given size"""
114 | 
115 |         # Find model parameters for matching bin based on size
116 |         params = self._params_for_size(size, strict_bounds)
117 | 
118 |         # Calculate mean of model
119 |         mean = self.distribution.mean(*params)
120 | 
121 |         return mean
122 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/reference_data/generate_annotate_forest.py:
--------------------------------------------------------------------------------
 1 | # Generate and annotate a Forest
 2 | # Usage: python generate_annotate_forest.py [n_leaves] [n_trees] [alpha] [output_dir]
 3 | # Saves Forest as a gzipped pickle archive.
 4 | 
 5 | import sys
 6 | import time
 7 | import uuid
 8 | 
 9 | sys.path.append("../../jungle/")
10 | import jungle as jg
11 | 
12 | verbose = True
13 | 
14 | # Specify parameters
15 | n_leaves = int(sys.argv[1])  # Number of leaves in tree
16 | n_trees = int(sys.argv[2])  # Number of trees in forest
17 | alpha = float(
18 |     sys.argv[3]
19 | )  # Shape parameter alpha (alpha = 2.0 for neutral Kingman trees, alpha = 1.0 for positive selection Bolthausen-Sznitman trees)
20 | outfile_dir = sys.argv[4]  # Output directory
21 | 
22 | # Specify output file
23 | outfile_vars = (n_leaves, n_trees, alpha, str(uuid.uuid4())[0:8])
24 | outfile_basename = (
25 |     "forest_nleaves{0}_ntrees{1}_alpha{2}_uuid{3}.pickle.gz".format(
26 |         *outfile_vars
27 |     )
28 | )
29 | outfile = outfile_dir + "/" + outfile_basename
30 | 
31 | # Report parameters
32 | if verbose:
33 |     print("Parameters")
34 |     print(("n_leaves", n_leaves))
35 |     print(("n_trees", n_trees))
36 |     print(("alpha", alpha))
37 |     print(("outfile_dir", outfile_dir))
38 |     print(("outfile", outfile))
39 | 
40 | if verbose:
41 |     print("Starting tree generation...")
42 | 
43 | # Track run time
44 | start_time = time.time()
45 | 
46 | # Generate and annotate trees
47 | F = jg.Forest.generate(
48 |     n_trees=n_trees, params={"n_leaves": n_leaves, "alpha": alpha}
49 | )
50 | F.resolve_polytomy()
51 | F.annotate_standard_node_features()
52 | F.annotate_colless()
53 | 
54 | # Dump to file
55 | F.dump(outfile)
56 | 
57 | # Track run time
58 | elapsed_time = time.time() - start_time
59 | 
60 | # Report run time
61 | if verbose:
62 |     print("Done!!")
63 |     print(("Elapsed time (s):", elapsed_time))
64 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/reference_data/generate_annotate_forest.py.bak:
--------------------------------------------------------------------------------
 1 | # Generate and annotate a Forest
 2 | # Usage: python generate_annotate_forest.py [n_leaves] [n_trees] [alpha] [output_dir]
 3 | # Saves Forest as a gzipped pickle archive.
 4 | 
 5 | import sys
 6 | import uuid
 7 | import time
 8 | 
 9 | sys.path.append("../../jungle/")
10 | import jungle as jg
11 | 
12 | verbose = True
13 | 
14 | # Specify parameters
15 | n_leaves = int(sys.argv[1]) # Number of leaves in tree
16 | n_trees = int(sys.argv[2]) # Number of trees in forest
17 | alpha = float(sys.argv[3]) # Shape parameter alpha (alpha = 2.0 for neutral Kingman trees, alpha = 1.0 for positive selection Bolthausen-Sznitman trees)
18 | outfile_dir = sys.argv[4] # Output directory
19 | 
20 | # Specify output file
21 | outfile_vars = (n_leaves, n_trees, alpha, str(uuid.uuid4())[0:8])
22 | outfile_basename = "forest_nleaves{0}_ntrees{1}_alpha{2}_uuid{3}.pickle.gz".format(*outfile_vars)
23 | outfile = outfile_dir + "/" + outfile_basename
24 | 
25 | # Report parameters
26 | if verbose:
27 |     print("Parameters")
28 |     print("n_leaves", n_leaves)
29 |     print("n_trees", n_trees)
30 |     print("alpha", alpha)
31 |     print("outfile_dir", outfile_dir)
32 |     print("outfile", outfile)
33 | 
34 | if verbose:
35 |     print("Starting tree generation...")
36 | 
37 | # Track run time
38 | start_time = time.time()
39 | 
40 | # Generate and annotate trees
41 | F = jg.Forest.generate(n_trees=n_trees, params={"n_leaves": n_leaves, "alpha": alpha})
42 | F.resolve_polytomy()
43 | F.annotate_standard_node_features()
44 | F.annotate_colless()
45 | 
46 | # Dump to file
47 | F.dump(outfile)
48 | 
49 | # Track run time
50 | elapsed_time = time.time() - start_time
51 | 
52 | # Report run time
53 | if verbose:
54 |     print("Done!!")
55 |     print("Elapsed time (s):", elapsed_time)
56 | 


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_jungle/tests/generate_annotate_forest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Test generating and annotating a forest
3 | python ../reference_data/generate_annotate_forest.py 100 5 2.0 ../reference_data/


--------------------------------------------------------------------------------
/cassiopeia/tools/fitness_estimator/_lbi_jungle.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import tempfile
  4 | from typing import Optional
  5 | 
  6 | import networkx as nx
  7 | 
  8 | dir_path = os.path.dirname(os.path.realpath(__file__))
  9 | sys.path.append(os.path.join(dir_path, "_jungle"))
 10 | import jungle as jg
 11 | import numpy as np
 12 | 
 13 | from cassiopeia.data import CassiopeiaTree
 14 | 
 15 | from ._FitnessEstimator import FitnessEstimator, FitnessEstimatorError
 16 | 
 17 | 
 18 | def _to_newick(tree: nx.DiGraph, record_branch_lengths: bool = False) -> str:
 19 |     """Converts a networkx graph to a newick string.
 20 | 
 21 |     Args:
 22 |         tree: A networkx tree
 23 |         record_branch_lengths: Whether to record branch lengths on the tree in
 24 |             the newick string
 25 | 
 26 |     Returns:
 27 |         A newick string representing the topology of the tree
 28 |     """
 29 | 
 30 |     def _to_newick_str(g, node):
 31 |         is_leaf = g.out_degree(node) == 0
 32 |         weight_string = ""
 33 | 
 34 |         if record_branch_lengths and g.in_degree(node) > 0:
 35 |             parent = list(g.predecessors(node))[0]
 36 |             weight_string = ":" + str(g[parent][node]["length"])
 37 |             if not is_leaf:
 38 |                 weight_string = node + weight_string
 39 | 
 40 |         _name = str(node)
 41 |         return (
 42 |             "%s" % (_name,) + weight_string
 43 |             if is_leaf
 44 |             else (
 45 |                 "("
 46 |                 + ",".join(
 47 |                     _to_newick_str(g, child) for child in g.successors(node)
 48 |                 )
 49 |                 + ")"
 50 |                 + weight_string
 51 |             )
 52 |         )
 53 | 
 54 |     root = [node for node in tree if tree.in_degree(node) == 0][0]
 55 |     return _to_newick_str(tree, root) + ";"
 56 | 
 57 | 
 58 | class LBIJungle(FitnessEstimator):
 59 |     """
 60 |     LBI as implemented by the jungle package.
 61 | 
 62 |     Implements the LBI fitness estimator described by Neher et al. (2014).
 63 |     This is a simple wrapper on top of the Jungle package, which is in turn
 64 |     a wrapper around Neher et al.'s code.
 65 | 
 66 |     Caveat: LBIJungle does not estimate fitness for the root of this tree
 67 |     (artifact of the Jungle package). This is rarely of interest though.
 68 | 
 69 |     Args:
 70 |         random_seed: Random seed to set in numpy before running fitness
 71 |             estimates. (A random seed is used by the LBI to estimate the
 72 |             characteristic timescale `tau` of the underlying process.
 73 |             See Neher et al. 2014, and the LBIJungle package for details.)
 74 |     """
 75 | 
 76 |     def __init__(self, random_seed: Optional[int] = None):
 77 |         self._random_seed = random_seed
 78 | 
 79 |     def estimate_fitness(self, tree: CassiopeiaTree) -> None:
 80 |         """
 81 |         Sets attribute `fitness` for each node in the tree using the LBI.
 82 | 
 83 |         Caveat: LBIJungle does not estimate fitness for the root of this tree
 84 |         (artifact of the Jungle package). This is rarely of interest though.
 85 | 
 86 |         Will raise a FitnessEstimatorError if the CassiopeiaTree cannot be
 87 |         serialized to networkx.
 88 | 
 89 |         Also, due to the underlying implementation in the Jungle package that we
 90 |         wrap, leaf names cannot start with an underscore. A
 91 |         FitnessEstimatorError will also be raised in this case.
 92 | 
 93 |         Raises:
 94 |             FitnessEstimatorError
 95 |         """
 96 |         if any([leaf.startswith("_") for leaf in tree.leaves]):
 97 |             raise FitnessEstimatorError(
 98 |                 "Leaf names must NOT start with '_'. Please rename your leaves"
 99 |                 " to use LBIJungle."
100 |             )
101 |         with tempfile.NamedTemporaryFile("w") as outfile:
102 |             outfilename = outfile.name
103 |             tree_newick = _to_newick(
104 |                 tree.get_tree_topology(), record_branch_lengths=True
105 |             )
106 |             outfile.write(tree_newick)
107 |             outfile.flush()
108 |             if self._random_seed is not None:
109 |                 np.random.seed(self._random_seed)
110 |             try:
111 |                 T_empirical = jg.Tree.from_newick(outfilename)
112 |             except Exception:
113 |                 raise Exception(f"Could not read newick str:\n{tree_newick}")
114 |             T_empirical.annotate_standard_node_features()
115 |             T_empirical.infer_fitness(params={})
116 |             res_df = T_empirical.node_features()
117 |             node_names = res_df.name
118 |             node_fitnesses = res_df.mean_fitness
119 |             for v, f in zip(node_names, node_fitnesses):
120 |                 if v != "" and v[0] != "_":
121 |                     tree.set_attribute(v, "fitness", f)
122 |                 elif v != "" and v[0] == "_":
123 |                     # (Non-root) internal node!
124 |                     tree.set_attribute(v[1:], "fitness", f)
125 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | # Run to check if valid
2 | # curl --data-binary @codecov.yml https://codecov.io/validate
3 | coverage:
4 |   status:
5 |     project:
6 |       default:
7 |         target: 85%
8 |         threshold: 1%
9 |     patch: off


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | def pytest_addoption(parser):
 5 |     parser.addoption(
 6 |         "--runslow", action="store_true", default=False, help="run slow tests",
 7 |     )
 8 |     parser.addoption(
 9 |         "--runspatial", action="store_true", default=False, help="run spatial tests",
10 |     )
11 | 
12 | 
13 | def pytest_configure(config):
14 |     config.addinivalue_line("markers", "slow: mark test as slow to run")
15 |     config.addinivalue_line("markers", "spatial: mark test as spatial to run")
16 |     
17 | 
18 | 
19 | def pytest_collection_modifyitems(config, items):
20 |     run_slow = config.getoption("--runslow")
21 |     run_spatial = config.getoption("--runspatial")
22 |     skip_slow = pytest.mark.skip(reason="need --runslow option to run")
23 |     skip_spatial = pytest.mark.skip(reason="need --runspatial option to run")
24 |     
25 |     for item in items:
26 |         if "slow" in item.keywords and not run_slow:
27 |             item.add_marker(skip_slow)
28 |         if "spatial" in item.keywords and not run_spatial:
29 |             item.add_marker(skip_spatial)
30 | 


--------------------------------------------------------------------------------
/data/PCT48.ref.fasta:
--------------------------------------------------------------------------------
1 | >PCT48.ref
2 | AATCCAGCTAGCTGTGCAGCNNNNNNNNNNNNNNATTCAACTGCAGTAATGCTACCTCGTACTCACGCTTTCCAAGTGCTTGGCGTCGCATCTCGGTCCTTTGTACGCCGAAAAATGGCCTGACAACTAAGCTACGGCACGCTGCCATGTTGGGTCATAACGATATCTCTGGTTCATCCGTGACCGAACATGTCATGGAGTAGCAGGAGCTATTAATTCGCGGAGGACAATGCGGTTCGTAGTCACTGTCTTCCGCAATCGTCCATCGCTCCTGCAGGTGGCCTAGAGGGCCCGTTTAAACCCGCTGATCAGCCTCGACTGTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAATGAGGAAATTGCATCGCATTGTCTGAGTAGGTGTCATTCTATTCTGGGGGGTGGGGTGGGGCAGGACAGCAAGGGGGAGGATTGGGAAGACAATAGCAGGCATGCTGGGGATGCGGTGGGCTCTATGGTCTAGAGCGGGCCCGGTACTAACCAAACTGGATCTCTGCTGTCCCTGTAATAAACCCGAAAATTTTGAATTTTTGTAATTTGTTTTTGTAATTCTTTAGTTTGTATGTCTGTTGCTATTATGTCTACTATTCTTTCCCCTGCACTGTACCCCCCAATCCCCCCTTTTCTTTTAAAATTGTGGATGAATACTGCCATTTGTCTGCAGA
3 | 


--------------------------------------------------------------------------------
/data/ccphylo_config.ini:
--------------------------------------------------------------------------------
1 | [Paths]
2 | ccphylo_path = /path/to/ccphylo/ccphylo
3 | 


--------------------------------------------------------------------------------
/data/itolconfig_example:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | api_key = <itol private api key>
3 | project_name = MyProjectonIToL
4 | 


--------------------------------------------------------------------------------
/data/preprocess.cfg:
--------------------------------------------------------------------------------
 1 | # cassiopeia-preprocess configuration example
 2 | # See notebooks/preprocess.ipynb for parameter descriptions.
 3 | 
 4 | [general]
 5 | name = "test_sample"
 6 | output_directory = "/mnt/e/scratch/cassiopeia/pipeline_test"
 7 | reference_filepath = "/mnt/e/scratch/cassiopeia/PCT48-long.ref.fa"
 8 | entry = "convert"
 9 | exit = "call_lineages"
10 | input_files = ["/mnt/e/scratch/cassiopeia/smaller_1.fastq.gz", "/mnt/e/scratch/cassiopeia/smaller_2.fastq.gz"]
11 | n_threads = 32
12 | allow_allele_conflicts = False
13 | verbose = True
14 | 
15 | [convert]
16 | chemistry = "10xv3"
17 | 
18 | [filter_bam]
19 | quality_threshold = 10
20 | 
21 | [error_correct_cellbcs_to_whitelist]
22 | # Set to None to turn off this step.
23 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/10x_version3_whitelist.txt"
24 | 
25 | [collapse]
26 | max_hq_mismatches = 3
27 | max_indels = 2
28 | method = "cutoff"
29 | 
30 | [resolve]
31 | min_avg_reads_per_umi = 2.0
32 | min_umi_per_cell = 10
33 | plot = True
34 | 
35 | [align]
36 | gap_open_penalty = 20
37 | gap_extend_penalty = 1
38 | method = "local"
39 | 
40 | [call_alleles]
41 | barcode_interval = (20, 34)
42 | cutsite_locations = [112, 166, 220]
43 | cutsite_width = 12
44 | context = True
45 | context_size = 5
46 | 
47 | [error_correct_intbcs_to_whitelist]
48 | # Set to None to turn off this step.
49 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/intbc_whitelist.txt"
50 | intbc_dist_thresh = 1
51 | 
52 | [error_correct_umis]
53 | max_umi_distance = 2
54 | 
55 | [filter_molecule_table]
56 | min_umi_per_cell = 10
57 | min_avg_reads_per_umi = 2.0
58 | min_reads_per_umi = -1
59 | intbc_prop_thresh = 0.5
60 | intbc_umi_thresh = 10
61 | intbc_dist_thresh = 1
62 | doublet_threshold = 0.35
63 | plot = True
64 | 
65 | [call_lineages]
66 | min_umi_per_cell = 10
67 | min_avg_reads_per_umi = 2.0
68 | min_cluster_prop = 0.005
69 | min_intbc_thresh = 0.05
70 | inter_doublet_threshold = 0.35
71 | kinship_thresh = 0.25
72 | plot = True
73 | 


--------------------------------------------------------------------------------
/data/preprocess_gestalt.cfg:
--------------------------------------------------------------------------------
 1 | [general]
 2 | name = "test_sample"
 3 | output_directory = "/gestalt_barcode_data/cassiopeia_output"
 4 | reference_filepath = "/gestalt_barcode_data/reference/gestalt.abbrv.fa"
 5 | entry = "collapse"
 6 | exit = "filter_molecule_table"
 7 | input_files = ["/gestalt_barcode_data/raw/possorted_genome_bam.bam", "/gestalt_barcode_data/raw/possorted_genome_bam.bam.bai"]
 8 | n_threads = 32
 9 | allow_allele_conflicts = False
10 | verbose = True
11 | 
12 | [collapse]
13 | max_hq_mismatches = 3
14 | max_indels = 2
15 | method = "cutoff"
16 | 
17 | [resolve]
18 | min_avg_reads_per_umi = 2.0
19 | min_umi_per_cell = 3
20 | plot = True
21 | 
22 | [align]
23 | gap_open_penalty = 20
24 | gap_extend_penalty = 1
25 | method = "global"
26 | 
27 | [call_alleles]
28 | barcode_interval = (0, 0)
29 | cutsite_locations = [42, 69, 96, 123, 150, 177, 204, 231, 258, 285]
30 | cutsite_width = 4
31 | context = True
32 | context_size = 5
33 | 
34 | [error_correct_umis]
35 | max_umi_distance = 2
36 | 
37 | [filter_molecule_table]
38 | min_umi_per_cell = 3
39 | min_avg_reads_per_umi = 2.0
40 | min_reads_per_umi = -1
41 | intbc_prop_thresh = 0.5
42 | intbc_umi_thresh = 3
43 | intbc_dist_thresh = 1
44 | doublet_threshold = None
45 | plot = True
46 | 


--------------------------------------------------------------------------------
/data/spatial_preprocess.cfg:
--------------------------------------------------------------------------------
 1 | # cassiopeia-preprocess configuration example for spatial assays
 2 | # See notebooks/preprocess.ipynb for parameter descriptions.
 3 | 
 4 | [general]
 5 | name = "test_sample"
 6 | output_directory = "/mnt/e/scratch/cassiopeia/pipeline_test"
 7 | reference_filepath = "/mnt/e/scratch/cassiopeia/PCT48-long.ref.fa"
 8 | entry = "convert"
 9 | exit = "call_lineages"
10 | input_files = ["/mnt/e/scratch/cassiopeia/smaller_1.fastq.gz", "/mnt/e/scratch/cassiopeia/smaller_2.fastq.gz"]
11 | n_threads = 32
12 | allow_allele_conflicts = True
13 | verbose = True
14 | 
15 | [convert]
16 | chemistry = "slideseq2"
17 | 
18 | [filter_bam]
19 | quality_threshold = 10
20 | 
21 | [error_correct_cellbcs_to_whitelist]
22 | # Set to None to turn off this step.
23 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/puck_whitelist.txt"
24 | 
25 | [collapse]
26 | max_hq_mismatches = 3
27 | max_indels = 2
28 | method = "likelihood"
29 | skip_existing = False
30 | 
31 | [resolve]
32 | min_avg_reads_per_umi = 2.0
33 | min_umi_per_cell = 10
34 | plot = True
35 | 
36 | [align]
37 | gap_open_penalty = 20
38 | gap_extend_penalty = 1
39 | method = "local"
40 | 
41 | [call_alleles]
42 | barcode_interval = (20, 34)
43 | cutsite_locations = [112, 166, 220]
44 | cutsite_width = 12
45 | context = True
46 | context_size = 5
47 | 
48 | [error_correct_intbcs_to_whitelist]
49 | # Set to None to turn off this step.
50 | whitelist = "/mnt/e/scratch/cassiopeia/pipeline_test/intbc_whitelist.txt"
51 | intbc_dist_thresh = 1
52 | 
53 | [error_correct_umis]
54 | max_umi_distance = 2
55 | 
56 | [filter_molecule_table]
57 | min_umi_per_cell = 10
58 | min_avg_reads_per_umi = 2.0
59 | min_reads_per_umi = -1
60 | intbc_prop_thresh = 0.5
61 | intbc_umi_thresh = 10
62 | intbc_dist_thresh = 1
63 | doublet_threshold = 0.35
64 | plot = True
65 | 
66 | [call_lineages]
67 | min_umi_per_cell = 10
68 | min_avg_reads_per_umi = 2.0
69 | min_cluster_prop = 0.005
70 | min_intbc_thresh = 0.05
71 | inter_doublet_threshold = 0.35
72 | kinship_thresh = 0.25
73 | plot = True
74 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = cassiopeia
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/_static/computer-24px.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="24" viewBox="0 0 24 24" width="24"><path d="M0 0h24v24H0z" fill="none"/><path d="M20 18c1.1 0 1.99-.9 1.99-2L22 6c0-1.1-.9-2-2-2H4c-1.1 0-2 .9-2 2v10c0 1.1.9 2 2 2H0v2h24v-2h-4zM4 6h16v10H4V6z"/></svg>
2 | 


--------------------------------------------------------------------------------
/docs/_static/css/override.css:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Furo CSS variables
  3 |   https://github.com/pradyunsg/furo/blob/main/src/furo/assets/styles/variables/_index.scss
  4 |   https://github.com/pradyunsg/furo/blob/main/src/furo/theme/partials/_head_css_variables.html
  5 | https://github.com/streamlink/streamlink/blob/17a4088c38709123c0bcab4a150549bd16d19e07/docs/_static/styles/custom.css
  6 | */
  7 | 
  8 | dt:target, span.highlighted {
  9 |     background-color: #f0f0f0;
 10 | }
 11 | 
 12 | code.docutils.literal.notranslate.xref, a code {
 13 |     background: transparent;
 14 |     font-weight: bold;
 15 |     color: inherit;
 16 | }
 17 | 
 18 | a > code {
 19 |     color: inherit;
 20 | }
 21 | 
 22 | code.docutils.literal.notranslate {
 23 |     background: #f8f9fb;
 24 |     font-size: 87.5%;
 25 |     border-radius: .2em;
 26 |     color: #000000;
 27 |     word-wrap: break-word;
 28 |     padding: .1em .2em;
 29 | }
 30 | 
 31 | dl.citation > dt {
 32 |     float: left;
 33 |     margin-right: 15px;
 34 |     font-weight: bold;
 35 | }
 36 | 
 37 | /* Parameters normalize size and captialized, */
 38 | dl.c .field-list dt, dl.cpp .field-list dt, dl.js .field-list dt, dl.py .field-list dt {
 39 |     font-size: var(--font-size--normal);
 40 |     text-transform: none;
 41 | }
 42 | 
 43 | /* examples and headings in classes */
 44 | p.rubric {
 45 |     font-size: var(--font-size--normal);
 46 |     text-transform: none;
 47 |     font-weight: 500;
 48 | }
 49 | 
 50 | 
 51 | /* Getting started index page */
 52 | 
 53 | .intro-card {
 54 |     background: #fff;
 55 |     border-radius: 0;
 56 |     padding: 30px 10px 10px 10px;
 57 |     margin: 10px 0px;
 58 |   }
 59 | 
 60 |   .intro-card .card-text {
 61 |     margin: 20px 0px;
 62 |     /*min-height: 150px; */
 63 |   }
 64 | 
 65 |   .custom-button {
 66 |     background-color: #dcdcdc;
 67 |     border: none;
 68 |     color: #484848;
 69 |     text-align: center;
 70 |     text-decoration: none;
 71 |     display: inline-block;
 72 |     font-size: 0.9rem;
 73 |     border-radius: 0.5rem;
 74 |     max-width: 220px;
 75 |     padding: 0.5rem 0rem;
 76 |   }
 77 | 
 78 |   .custom-button a {
 79 |     color: #484848;
 80 |     display: block;
 81 |   }
 82 | 
 83 |   .custom-button p {
 84 |     margin-top: 0;
 85 |     margin-bottom: 0rem;
 86 |     color: #484848;
 87 |   }
 88 | 
 89 | 
 90 | 
 91 |   /* from https://github.com/dask/dask-sphinx-theme/blob/main/dask_sphinx_theme/static/css/nbsphinx.css */
 92 | 
 93 |   .nbinput .prompt,
 94 |   .nboutput .prompt {
 95 |       display: none;
 96 |   }
 97 |   .nboutput .stderr{
 98 |       display: none;
 99 |   }
100 | 
101 |   div.nblast.container {
102 |     padding-bottom: 5px;
103 |     padding-right: 0px;
104 |     padding-left: 0px;
105 |     margin-bottom: 10px;
106 | }
107 | 
108 | div.nbinput.container, div.nboutput.container {
109 |     display: -webkit-flex;
110 |     display: flex;
111 |     align-items: flex-start;
112 |     margin-top: 0px;
113 |     margin-right: 0px;
114 |     margin-bottom: 5px;
115 |     margin-left: 0px;
116 |     padding-right: 0px;
117 |     padding-left: 0px;
118 |     padding-top: 0px
119 |     width: 100%;
120 | }


--------------------------------------------------------------------------------
/docs/_static/css/sphinx_gallery.css:
--------------------------------------------------------------------------------
 1 | /* .sphx-glr-thumbcontainer {
 2 |     background: none !important;
 3 |     border: 1px solid #003262!important;
 4 |     text-align: center !important;
 5 |     min-height: 220px !important;
 6 | }
 7 | .sphx-glr-thumbcontainer a.internal:hover {
 8 |     color: #003262!important;
 9 | }
10 | p.sphx-glr-timing {
11 |     margin: 0 !important;
12 |     padding-top: 24px;
13 |     border-top: 1px solid #000;
14 | }
15 | .sphx-glr-thumbcontainer:hover {
16 |     box-shadow: 0 0 10px #003262!important
17 | }
18 |  */
19 | 
20 | 
21 |  .sphx-glr-thumbcontainer .headerlink {
22 |     display: none !important;
23 | }
24 | 
25 | div.sphx-glr-thumbcontainer span {
26 |     font-style: normal !important;
27 | }
28 | 
29 | .sphx-glr-thumbcontainer a.internal {
30 |     padding: 140px 10px 0!important;
31 | }
32 | 
33 | .sphx-glr-thumbcontainer .figure.align-center {
34 |     text-align: center;
35 |     margin-left: 0%;
36 |     transform: translate(0%);
37 | }


--------------------------------------------------------------------------------
/docs/_static/library_books-24px.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="24" viewBox="0 0 24 24" width="24"><path d="M0 0h24v24H0z" fill="none"/><path d="M4 6H2v14c0 1.1.9 2 2 2h14v-2H4V6zm16-4H8c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h12c1.1 0 2-.9 2-2V4c0-1.1-.9-2-2-2zm-1 9H9V9h10v2zm-4 4H9v-2h6v2zm4-8H9V5h10v2z"/></svg>
2 | 


--------------------------------------------------------------------------------
/docs/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/logo.png


--------------------------------------------------------------------------------
/docs/_static/play_circle_outline-24px.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="24" viewBox="0 0 24 24" width="24"><path d="M0 0h24v24H0z" fill="none"/><path d="M10 16.5l6-4.5-6-4.5v9zM12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm0 18c-4.41 0-8-3.59-8-8s3.59-8 8-8 8 3.59 8 8-3.59 8-8 8z"/></svg>
2 | 


--------------------------------------------------------------------------------
/docs/_static/question-mark-svgrepo-com.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="iso-8859-1"?>
 2 | <!-- Generator: Adobe Illustrator 18.1.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="0 0 32 32" style="enable-background:new 0 0 32 32;" xml:space="preserve">
 5 | <g>
 6 | 	<g id="question_x5F_mark">
 7 | 		<g>
 8 | 			<path style="fill:#030104;" d="M17.6,30c0,1.102-0.895,2-2,2s-2-0.898-2-2c0-1.109,0.895-2,2-2S17.6,28.891,17.6,30z"/>
 9 | 			<path style="fill:#030104;" d="M15.676,25.977c-1.336,0-2.59-0.523-3.535-1.469c-0.945-1.105-1.465-2.359-1.465-3.695
10 | 				s0.52-2.59,1.465-3.371l6.688-6.688C19.584,9.996,20,8.992,20,7.926c0-1.07-0.416-2.074-1.172-2.828
11 | 				c-1.559-1.559-4.096-1.562-5.654,0C12.416,5.852,12,6.855,12,7.926H8c0-2.137,0.834-4.148,2.348-5.66
12 | 				c3.02-3.023,8.285-3.02,11.309,0.004C23.168,3.777,24,5.785,24,7.926c0,2.137-0.832,4.145-2.344,5.656l-6.688,6.523
13 | 				c-0.389,0.391-0.389,1.023,0,1.414c0.391,0.391,1.023,0.391,1.414,0c0.254-0.258,0.293-0.555,0.293-0.711h4
14 | 				c0,1.336-0.52,2.594-1.465,3.699C18.266,25.453,17.012,25.977,15.676,25.977L15.676,25.977z"/>
15 | 		</g>
16 | 	</g>
17 | </g>
18 | <g>
19 | </g>
20 | <g>
21 | </g>
22 | <g>
23 | </g>
24 | <g>
25 | </g>
26 | <g>
27 | </g>
28 | <g>
29 | </g>
30 | <g>
31 | </g>
32 | <g>
33 | </g>
34 | <g>
35 | </g>
36 | <g>
37 | </g>
38 | <g>
39 | </g>
40 | <g>
41 | </g>
42 | <g>
43 | </g>
44 | <g>
45 | </g>
46 | <g>
47 | </g>
48 | </svg>
49 | 


--------------------------------------------------------------------------------
/docs/_static/tutorials/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/benchmark.png


--------------------------------------------------------------------------------
/docs/_static/tutorials/local_plotting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/local_plotting.png


--------------------------------------------------------------------------------
/docs/_static/tutorials/preprocess.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/preprocess.png


--------------------------------------------------------------------------------
/docs/_static/tutorials/reconstruct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/docs/_static/tutorials/reconstruct.png


--------------------------------------------------------------------------------
/docs/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
 1 | {{ fullname | escape | underline }}
 2 | 
 3 | .. currentmodule:: {{ module }}
 4 | 
 5 | .. autoclass:: {{ objname }}
 6 |    :members:
 7 |    :undoc-members:
 8 | 
 9 |    .. rubric:: Methods
10 | 
11 |    .. autoautosummary:: {{ objname }}
12 |       :methods:


--------------------------------------------------------------------------------
/docs/_templates/layout.html:
--------------------------------------------------------------------------------
 1 | {% extends "pydata_sphinx_theme/layout.html" %}
 2 | 
 3 | {% block fonts %}
 4 | <!-- add `style` or `link` tags with your CSS `@font-face` declarations here -->
 5 | <!-- ... and a `style` tag with setting `font-family` in `body` and `.header-style` -->
 6 | <!-- ... and optionally preload the `woff2` for snappier page loads -->
 7 | <!-- or add a `style` tag with a font fallback chain with good cross-platform coverage -->
 8 | <style>
 9 |     body {
10 |         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
11 |     }
12 | 
13 |     .header-style {
14 |         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
15 |     }
16 | </style>
17 | {% endblock %}
18 | 


--------------------------------------------------------------------------------
/docs/api/critique.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Critique
 3 | ===========
 4 | .. currentmodule:: cassiopeia
 5 | 
 6 | Critique
 7 | ~~~~~~~~~~~~~~~~~~~
 8 | 
 9 | We support functionality for comparing trees to one another, for example when benchmarking new algorithms.
10 | 
11 | .. autosummary::
12 |    :toctree: reference/
13 | 
14 |    critique.robinson_foulds
15 |    critique.triplets_correct


--------------------------------------------------------------------------------
/docs/api/data.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Data
 3 | ===========
 4 | 
 5 | .. module:: cassiopeia.data
 6 | .. currentmodule:: cassiopeia
 7 | 
 8 | CassiopeiaTrees
 9 | ~~~~~~~~~~~~~~~~~~~
10 | 
11 | The main data structure that Cassiopeia uses for all tree-based analyses is the CassiopeiaTree:
12 | 
13 | .. autosummary::
14 |    :toctree: reference/
15 | 
16 |    data.CassiopeiaTree
17 | 
18 | Utilities
19 | ~~~~~~~~~~~~~~~~~~~
20 | 
21 | We also have several utilities that are useful for working with various data related to phylogenetics:
22 | 
23 | .. autosummary::
24 |    :toctree: reference/
25 | 
26 |    data.compute_dissimilarity_map
27 |    data.compute_phylogenetic_weight_matrix
28 |    data.get_lca_characters
29 |    data.sample_bootstrap_allele_tables
30 |    data.sample_bootstrap_character_matrices
31 |    data.to_newick


--------------------------------------------------------------------------------
/docs/api/index.rst:
--------------------------------------------------------------------------------
 1 | ===
 2 | API
 3 | ===
 4 | 
 5 | 
 6 | Import Cassiopeia as::
 7 | 
 8 |    import cassiopeia as cas
 9 | 
10 | .. toctree::
11 |    :maxdepth: 1
12 | 
13 |    preprocess
14 |    data
15 |    critique
16 |    solver
17 |    simulator
18 |    plotting
19 |    tools


--------------------------------------------------------------------------------
/docs/api/plotting.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Plotting
 3 | ==========
 4 | 
 5 | .. currentmodule:: cassiopeia
 6 | 
 7 | Plotting
 8 | ~~~~~~~~~~~~~~~~~~~
 9 | 
10 | Plotting functionality is divided into two broad categories: local and remote
11 | (a.k.a. iTOL). Previously, we only supported tree visualization using the rich
12 | iTOL framework. However, we are now in the process of deprecating the use of
13 | this service for most use cases. We recommend all users to visualize their
14 | trees using the local plotting functions, which either use Matplotlib or
15 | Plotly, as this option is free and is more reminiscent of plotting in other
16 | packages such as Scanpy.
17 | 
18 | .. autosummary::
19 |    :toctree: reference/
20 | 
21 |    pl.labels_from_coordinates
22 |    pl.plot_matplotlib
23 |    pl.plot_plotly
24 |    pl.Tree3D
25 |    pl.upload_and_export_itol
26 | 


--------------------------------------------------------------------------------
/docs/api/preprocess.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Preprocess
 3 | ===========
 4 | .. currentmodule:: cassiopeia
 5 | 
 6 | Data Preprocessing
 7 | ~~~~~~~~~~~~~~~~~~~
 8 | 
 9 | We have several functions that are part of our pipeline for processing sequencing data from single-cell lineage tracing technologies:
10 | 
11 | .. autosummary::
12 |    :toctree: reference/
13 | 
14 |    pp.align_sequences
15 |    pp.call_alleles
16 |    pp.call_lineage_groups
17 |    pp.collapse_umis
18 |    pp.convert_fastqs_to_unmapped_bam
19 |    pp.error_correct_cellbcs_to_whitelist
20 |    pp.error_correct_intbcs_to_whitelist
21 |    pp.error_correct_umis
22 |    pp.filter_bam
23 |    pp.filter_molecule_table
24 |    pp.filter_cells
25 |    pp.filter_umis
26 |    pp.resolve_umi_sequence
27 | 
28 | 
29 | 
30 | 
31 | Data Utilities
32 | ~~~~~~~~~~~~~~~~~~~
33 | 
34 | We also have several functions that are useful for converting between data formats for downstream analyses:
35 | 
36 | .. autosummary::
37 |    :toctree: reference/
38 | 
39 |    pp.compute_empirical_indel_priors
40 |    pp.convert_alleletable_to_character_matrix
41 |    pp.convert_alleletable_to_lineage_profile
42 |    pp.convert_lineage_profile_to_character_matrix


--------------------------------------------------------------------------------
/docs/api/simulator.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Simulator
 3 | ===========
 4 | .. currentmodule:: cassiopeia
 5 | 
 6 | 
 7 | Our simulators for cassiopeia are split up into those that simulate topologies and those that simulate data on top of the topologies.
 8 | 
 9 | Tree Simulators
10 | ~~~~~~~~~~~~~~~~~~~
11 | 
12 | We have several frameworks available for simulating topologies:
13 | 
14 | .. autosummary::
15 |    :toctree: reference/
16 | 
17 |    sim.BirthDeathFitnessSimulator
18 |    sim.ecDNABirthDeathSimulator
19 |    sim.CompleteBinarySimulator
20 |    sim.SimpleFitSubcloneSimulator
21 | 
22 | 
23 | Data Simulators
24 | ~~~~~~~~~~~~~~~~~~~
25 | 
26 | These simulators are subclasses of the `DataSimulator` class and implement the `overlay_data` method which simulates data according to a given topology.
27 | 
28 | .. autosummary::
29 |    :toctree: reference/
30 | 
31 |    sim.Cas9LineageTracingDataSimulator
32 | 
33 | 
34 | Spatial Simulators
35 | ~~~~~~~~~~~~~~~~~~~
36 | These simulators are subclasses of the `SpatialSimulator` class and implement the `overlay_data` method which adds spatial coordinates to a given topology. `SpatialSimulator`s are a special sublcass of `DataSimulator` and can be used in addition to other `DataSimulator`s that simulate lineage tracing data.
37 | 
38 | .. autosummary::
39 |    :toctree: reference/
40 | 
41 |    sim.BrownianSpatialDataSimulator
42 |    sim.ClonalSpatialDataSimulator
43 |    
44 | 
45 | Leaf SubSamplers
46 | ~~~~~~~~~~~~~~~~~~~
47 | These are utilities for subsampling lineages for benchmarking purposes. For example, sampling a random proportion of leaves or grouping together cells into clades to model spatial data.
48 | 
49 | .. autosummary::
50 |    :toctree: reference/
51 | 
52 |    sim.SupercellularSampler
53 |    sim.SpatialLeafSubsampler
54 |    sim.UniformLeafSubsampler


--------------------------------------------------------------------------------
/docs/api/solver.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Solver
 3 | ===========
 4 | .. currentmodule:: cassiopeia
 5 | 
 6 | CassiopeiaSolvers
 7 | ~~~~~~~~~~~~~~~~~~~
 8 | 
 9 | We have several algorithms available for solving phylogenies:
10 | 
11 | .. autosummary::
12 |    :toctree: reference/
13 | 
14 |    solver.HybridSolver
15 |    solver.ILPSolver
16 |    solver.MaxCutSolver
17 |    solver.MaxCutGreedySolver
18 |    solver.NeighborJoiningSolver
19 |    solver.PercolationSolver
20 |    solver.SharedMutationJoiningSolver
21 |    solver.SpectralSolver
22 |    solver.SpectralGreedySolver
23 |    solver.UPGMASolver
24 |    solver.VanillaGreedySolver
25 | 
26 | 
27 | Dissimilarity Maps
28 | ~~~~~~~~~~~~~~~~~~~
29 | 
30 | For use in our distance-based solver and for comparing character states, we also have available several dissimilarity functions:
31 | 
32 | .. autosummary::
33 |    :toctree: reference/
34 | 
35 |    solver.dissimilarity_functions.cluster_dissimilarity
36 |    solver.dissimilarity_functions.hamming_distance
37 |    solver.dissimilarity_functions.hamming_similarity_normalized_over_missing
38 |    solver.dissimilarity_functions.hamming_similarity_without_missing
39 |    solver.dissimilarity_functions.weighted_hamming_distance
40 |    solver.dissimilarity_functions.weighted_hamming_similarity


--------------------------------------------------------------------------------
/docs/api/tools.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Tools
 3 | ==========
 4 | 
 5 | .. currentmodule:: cassiopeia
 6 | 
 7 | This library stores code for post-reconstruction analysis of trees. We are
 8 | always in the process of developing new statistics and tools for helping us
 9 | interpret trees, and adding them to this library.
10 | 
11 | Autocorrelation
12 | ~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autosummary::
15 |    :toctree: reference/
16 | 
17 |    tl.compute_morans_i
18 | 
19 | Branch Length Estimation (BLE)
20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
21 | 
22 | .. autosummary::
23 |    :toctree: reference/
24 | 
25 |    tl.IIDExponentialBayesian
26 |    tl.IIDExponentialMLE
27 | 
28 | Coupling
29 | ~~~~~~~~~~~
30 | 
31 | .. autosummary::
32 |    :toctree: reference/
33 | 
34 |    tl.compute_evolutionary_coupling
35 | 
36 | Metrics
37 | ~~~~~~~~
38 | .. autosummary::
39 |    :toctree: reference/
40 | 
41 |    tl.calculate_likelihood_continuous
42 |    tl.calculate_likelihood_discrete
43 |    tl.calculate_parsimony
44 | 
45 | Parameter Estimation
46 | ~~~~~~~~~~~~~~~~~~~~~~
47 | 
48 | .. autosummary::
49 |    :toctree: reference/
50 | 
51 |    tl.estimate_missing_data_rates
52 |    tl.estimate_mutation_rate
53 | 
54 | 
55 | Small-Parsimony
56 | ~~~~~~~~~~~~~~~~~~~
57 | 
58 | .. autosummary::
59 |    :toctree: reference/
60 | 
61 |    tl.fitch_count
62 |    tl.fitch_hartigan
63 |    tl.score_small_parsimony
64 | 
65 | Topology
66 | ~~~~~~~~~~~~~~~~~~~
67 | .. autosummary::
68 |    :toctree: reference/
69 | 
70 |    tl.compute_expansion_pvalues


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 


--------------------------------------------------------------------------------
/docs/extensions/typed_returns.py:
--------------------------------------------------------------------------------
 1 | # code from https://github.com/theislab/scanpy/blob/master/docs/extensions/typed_returns.py
 2 | # with some minor adjustment
 3 | import re
 4 | 
 5 | from sphinx.application import Sphinx
 6 | from sphinx.ext.napoleon import NumpyDocstring
 7 | 
 8 | 
 9 | def process_return(lines):
10 |     for line in lines:
11 |         m = re.fullmatch(r"(?P<param>\w+)\s+:\s+(?P<type>[\w.]+)", line)
12 |         if m:
13 |             # Once this is in scanpydoc, we can use the fancy hover stuff
14 |             yield f'-{m["param"]} (:class:`~{m["type"]}`)'
15 |         else:
16 |             yield line
17 | 
18 | 
19 | def scanpy_parse_returns_section(self, section):
20 |     lines_raw = list(process_return(self._dedent(self._consume_to_next_section())))
21 |     lines = self._format_block(":returns: ", lines_raw)
22 |     if lines and lines[-1]:
23 |         lines.append("")
24 |     return lines
25 | 
26 | 
27 | def setup(app: Sphinx):
28 |     NumpyDocstring._parse_returns_section = scanpy_parse_returns_section
29 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
  1 | .. Cassiopeia documentation master file, created by
  2 |    sphinx-quickstart on Sat Jan 26 12:35:18 2019.
  3 |    You can adapt this file completely to your liking, but it should at least
  4 |    contain the root `toctree` directive.
  5 | 
  6 | ========================
  7 | Welcome!
  8 | ========================
  9 | 
 10 | This website serves as documentation to the Cassiopeia software suite, maintained by the `Yosef Lab
 11 | <https://yoseflab.github.io/>`_ at UC Berkeley.
 12 | 
 13 | Cassiopeia [Jones20]_ is a package for end-to-end phylogenetic reconstruction of single-cell lineage tracing data. The package is composed of four independent modules:
 14 | 
 15 | * ``preprocess`` for processing sequencing FASTQ data to character matrices
 16 | * ``solver`` for performing tree inference
 17 | * ``simulator`` for simulating trees and character-level data
 18 | * ``plotting`` for plotting trees.
 19 | 
 20 | If you find this useful for your research, please consider citing Cassiopeia [Jones20]_.
 21 | 
 22 | .. raw:: html
 23 | 
 24 |     <div class="container">
 25 |         <div class="row">
 26 |             <div class="col-lg-6 col-md-6 col-sm-6 col-xs-12 d-flex">
 27 |                 <div class="card text-center intro-card shadow">
 28 |                 <img src="_static/computer-24px.svg" class="card-img-top" alt="installation with cassiopeia action icon" height="52">
 29 |                 <div class="card-body flex-fill">
 30 |                     <h5 class="card-title">Installation</h5>
 31 |                     <p class="card-text">New to <em>Cassiopeia</em>? Check out the installation guide.
 32 |                     </p>
 33 | 
 34 | .. container:: custom-button
 35 | 
 36 |     :doc:`To the installation guide<installation>`
 37 | 
 38 | .. raw:: html
 39 | 
 40 |                 </div>
 41 |                 </div>
 42 |             </div>
 43 |             <div class="col-lg-6 col-md-6 col-sm-6 col-xs-12 d-flex">
 44 |                 <div class="card text-center intro-card shadow">
 45 |                 <img src="_static/play_circle_outline-24px.svg" class="card-img-top" alt="cassiopeia user guide action icon" height="52">
 46 |                 <div class="card-body flex-fill">
 47 |                     <h5 class="card-title">User guide</h5>
 48 |                     <p class="card-text">The tutorials provide in-depth information on running Cassiopeia.</p>
 49 | 
 50 | .. container:: custom-button
 51 | 
 52 |     :doc:`To the user guide<user_guide>`
 53 | 
 54 | .. raw:: html
 55 | 
 56 |                 </div>
 57 |                 </div>
 58 |             </div>
 59 |             <div class="col-lg-6 col-md-6 col-sm-6 col-xs-12 d-flex">
 60 |                 <div class="card text-center intro-card shadow">
 61 |                 <img src="_static/library_books-24px.svg" class="card-img-top" alt="api of scvi action icon" height="52">
 62 |                 <div class="card-body flex-fill">
 63 |                     <h5 class="card-title">API reference</h5>
 64 |                     <p class="card-text">The API reference contains a detailed description of
 65 |                     the Cassiopeia API.</p>
 66 | 
 67 | .. container:: custom-button
 68 | 
 69 |     :doc:`To the API reference<api/index>`
 70 | 
 71 | .. raw:: html
 72 | 
 73 |                 </div>
 74 |                 </div>
 75 |             </div>
 76 |             <div class="col-lg-6 col-md-6 col-sm-6 col-xs-12 d-flex">
 77 |                 <div class="card text-center intro-card shadow">
 78 |                 <img src="_static/question-mark-svgrepo-com.svg" class="card-img-top" alt="questions about cassiopeia" height="52">
 79 |                 <div class="card-body flex-fill">
 80 |                     <h5 class="card-title">Questions & Issues</h5>
 81 |                     <p class="card-text">Have a question or found a bug? File an issue.</p>
 82 | 
 83 | .. container:: custom-button
 84 | 
 85 |     `File an issue <https://github.com/YosefLab/Cassiopeia/issues>`_
 86 | 
 87 | .. raw:: html
 88 | 
 89 |                 </div>
 90 |                 </div>
 91 |             </div>
 92 |         </div>
 93 |     </div>
 94 | 
 95 | 
 96 | .. toctree::
 97 |    :maxdepth: 1
 98 |    :hidden:
 99 | 
100 |    installation
101 |    api/index
102 |    user_guide
103 |    contributing
104 |    authors
105 |    references


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ------------
 3 | 
 4 | Prerequisites
 5 | ~~~~~~~~~~~~~~
 6 | 
 7 | Cassiopeia currently requires python version 3.6, which is publicly available.
 8 | 
 9 | Cassiopeia needs to be downloaded from Github by cloning the directory onto your machine:
10 | 
11 | ::
12 | 
13 |     git clone https://github.com/YosefLab/Cassiopeia.git
14 | 
15 | To run some of the models in Cassiopeia, you will also need to install `Gurobi <https://www.gurobi.com/>`_. Licenses are free to academic users and can be downloaded `here <https://www.gurobi.com/downloads/end-user-license-agreement-academic/>`_.
16 | 
17 | 
18 | Installing
19 | ~~~~~~~~~~~
20 | 
21 | Once Cassiopeia is cloned into a directory onto your machine, enter into the directory with `cd Cassiopeia`. To make installation simple, we have wrapped the installation steps into a MAKEFILE - this allows you to install Cassiopeia with the command:
22 | 
23 | ::
24 | 
25 |     make install
26 | 
27 | To make sure that the package has been installed correctly, we recommend you also run all the unit tests with another command from the MAKEFILE:
28 | 
29 | ::
30 | 
31 |     make test
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/notebooks:
--------------------------------------------------------------------------------
1 | ../notebooks


--------------------------------------------------------------------------------
/docs/references.rst:
--------------------------------------------------------------------------------
1 | References
2 | ----------
3 | 
4 | .. [Jones20] Matthew G Jones*, Alex Khodaverdian*, Jeffrey J Quinn*, Michelle M Chan, Jeffrey A Hussmann, Robert Wang, Chenling Xu, Jonatahn S Weissman, Nir Yosef. (2020),
5 |    *Inference of single-cell phylogenies from lineage tracing data using Cassiopeia*,
6 |    `Genome Biology <https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02000-8>`__.
7 |    


--------------------------------------------------------------------------------
/docs/user_guide.rst:
--------------------------------------------------------------------------------
 1 | User guide
 2 | ==========
 3 | 
 4 | Cassiopeia is a flexible tool for analyzing lineage-tracing data and benchmarking new algorithms. Perhaps the easiest way to get started with Cassiopeia is by following along with or tutorials. Generally, we'll expect that you've already successfully installed Cassiopeia using the :doc:`installation guide<installation>`.
 5 | 
 6 | For any questions about Cassiopeia, please file an issue on `Github <https://github.com/YosefLab/Cassiopeia/issues>`_. If you'd like to contribute a tutorial or a new algorithm, please follow our :doc:`Contributing guide<contributing>`.
 7 | 
 8 | Main Tutorials
 9 | -----------
10 | 
11 | .. nbgallery::
12 | 
13 |    notebooks/preprocess
14 |    notebooks/benchmark
15 |    notebooks/reconstruct
16 |    notebooks/local_plotting
17 |    
18 | 
19 | Other Tutorials
20 | -----------------
21 | 
22 | .. toctree::
23 |    :maxdepth: 1
24 |    
25 |    notebooks/simulate_ecDNA
26 |    
27 | Contributed tutorials
28 | ---------------------
29 | 
30 | Currently we have no contributed tutorials -- if you are interested, check out our :doc:`Contributing guide<contributing>`!


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [tool.poetry]
  2 | authors = ["Matthew Jones <mattjones315@berkeley.edu>", "Alex Khodaverdian", "Richard Zhang", "Sebastian Prillo", "Joseph Min"]
  3 | classifiers = [
  4 |   "Development Status :: 4 - Beta",
  5 |   "Intended Audience :: Science/Research",
  6 |   "Natural Language :: English",
  7 |   "Programming Language :: Python :: 3.8",
  8 |   "Programming Language :: Python :: 3.9",
  9 |   "Programming Language :: Python :: 3.10",
 10 |   "Programming Language :: Python :: 3.11",
 11 |   "Operating System :: MacOS :: MacOS X",
 12 |   "Operating System :: Microsoft :: Windows",
 13 |   "Operating System :: POSIX :: Linux",
 14 |   "Topic :: Scientific/Engineering :: Bio-Informatics",
 15 | ]
 16 | description = "Single Cell Lineage Reconstruction with Cas9-Enabled Lineage Recorders"
 17 | documentation = "https://cassiopeia-lineage.readthedocs.io/"
 18 | homepage = "https://github.com/YosefLab/Cassiopeia"
 19 | keywords = ['scLT']
 20 | license = "MIT"
 21 | name = "cassiopeia-lineage"
 22 | readme = 'README.md'
 23 | repository = "https://github.com/YosefLab/Cassiopeia"
 24 | version = "2.1.0"
 25 | 
 26 | include = [
 27 |   {path = "cassiopeia/preprocess/*.so", format = "wheel"},
 28 |   {path = "cassiopeia/preprocess/*.pyx", format = "wheel"},
 29 |   {path = "cassiopeia/solver/*.so", format = "wheel"},
 30 |   {path = "cassiopeia/solver/*.pyx", format = "wheel"},
 31 |   {path = "cassiopeia/tools/branch_length_estimator/*.so", format = "wheel"},
 32 |   {path = "cassiopeia/tools/branch_length_estimator/*.pyx", format = "wheel"},
 33 |   {path = "cassiopeia/config.ini"},
 34 | ]
 35 | packages = [
 36 |   {include = "cassiopeia"},
 37 | ]
 38 | 
 39 | [tool.poetry.dependencies]
 40 | Biopython = ">=1.71"
 41 | Cython = ">=0.29.2"
 42 | PyYAML = ">=3.12"
 43 | black = {version = ">=20.8b1", optional = true}
 44 | bokeh = ">=0.12.15"
 45 | cchardet = {version = ">=2.1.7", optional = true}
 46 | codecov = {version = ">=2.0.8", optional = true}
 47 | cvxpy = "*"
 48 | ete3 = ">=3.1.1"
 49 | hits = "*"
 50 | ipython = {version = ">=7.20", optional = true}
 51 | isort = {version = ">=5.7", optional = true}
 52 | itolapi = "*"
 53 | jupyter = {version = ">=1.0", optional = true}
 54 | matplotlib = ">=2.2.2"
 55 | nbconvert = {version = ">=5.4.0", optional = true}
 56 | nbformat = {version = ">=4.4.0", optional = true}
 57 | nbsphinx = {version = "*", optional = true}
 58 | nbsphinx-link = {version = "*", optional = true}
 59 | networkx = ">=3.1"
 60 | ngs-tools = ">=1.5.6"
 61 | numba = ">=0.51.0"
 62 | numpy = ">=1.22, <3.0"
 63 | opencv-python = {version = ">=4.5.4.60", optional = true}
 64 | pandas = ">=1.1.4"
 65 | parameterized = "*"
 66 | plotly = ">=5.0.0"
 67 | poisson-disc = {version = ">=0.2.1", optional = true}
 68 | pre-commit = {version = ">=2.7.1", optional = true}
 69 | pydata-sphinx-theme = {version = ">=0.4.3", optional = true}
 70 | pysam = ">=0.14.1"
 71 | pyseq-align = ">=1.0.2"
 72 | pytest = {version = ">=4.4", optional = true}
 73 | python = ">=3.7,<4.0"
 74 | pyvista = {version = "=0.41.0", optional = true}
 75 | scanpydoc = {version = ">=0.5", optional = true}
 76 | scikit-image = {version = ">=0.19.1", optional = true}
 77 | scikit-learn = {version = ">=1.0.2", optional = true}
 78 | scipy = ">=1.2.0"
 79 | sphinx = {version = ">=3.4", optional = true}
 80 | sphinx-autodoc-typehints = {version = "*", optional = true}
 81 | sphinx-gallery = {version = ">0.6", optional = true}
 82 | trame = {version = ">=3.2.4", optional = true}
 83 | trame-vtk = {version = ">=2.5.8", optional = true}
 84 | trame-vuetify = {version = ">=2.3.1", optional = true}
 85 | tqdm = ">=4"
 86 | typing-extensions = ">=3.7.4"
 87 | typing_extensions = {version = "*", python = "<3.8", optional = true}
 88 | vtk = {version = ">=9.2", optional = true}
 89 | 
 90 | [tool.poetry.build]
 91 | generate-setup-file = false
 92 | script = "build.py"
 93 | 
 94 | [build-system]
 95 | build-backend = "poetry.core.masonry.api"
 96 | requires = ["poetry-core>=1.0.7", "Cython", "numpy>=1.19.5", "setuptools", "pip>=22.0.0"]
 97 | 
 98 | [tool.poetry.scripts]
 99 | cassiopeia-preprocess = 'cassiopeia.preprocess.cassiopeia_preprocess:main'
100 | 
101 | [tool.poetry.extras]
102 | dev = ["black", "pytest", "flake8", "codecov", "jupyter", "pre-commit", "isort"]
103 | docs = [
104 |   "sphinx",
105 |   "scanpydoc",
106 |   "nbconvert",
107 |   "nbformat",
108 |   "nbsphinx",
109 |   "nbsphinx-link",
110 |   "ipython",
111 |   "pydata-sphinx-theme",
112 |   "typing_extensions",
113 |   "sphinx-autodoc-typehints",
114 |   "sphinx_gallery",
115 | ]
116 | spatial = [
117 |   "opencv-python",
118 |   "poisson-disc",
119 |   "vtk",
120 |   "scikit-image", 
121 |   "scikit-learn",
122 |   "trame", 
123 |   "trame-vuetify", 
124 |   "trame-vtk", 
125 |   "cchardet",
126 |   "pyvista"
127 | ]
128 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # This is a shim to hopefully allow Github to detect the package, build is done with poetry
4 | 
5 | import setuptools
6 | 
7 | if __name__ == "__main__":
8 |     setuptools.setup(name="cassiopeia")
9 | 


--------------------------------------------------------------------------------
/test/mixin_tests/mixin_utilities_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file tests the utilities stored in cassiopeia/data/utilities.py
 3 | """
 4 | 
 5 | import unittest
 6 | 
 7 | import pandas as pd
 8 | 
 9 | from cassiopeia.mixins import utilities
10 | 
11 | 
12 | class TestMixinUtilities(unittest.TestCase):
13 |     def test_is_ambiguous_state(self):
14 |         self.assertTrue(utilities.is_ambiguous_state((1, 2)))
15 |         self.assertFalse(utilities.is_ambiguous_state(1))
16 | 
17 |     def test_unravel_states(self):
18 |         state_array = [0, (1, 2), 3, 4, 5]
19 |         self.assertListEqual(
20 |             [0, 1, 2, 3, 4, 5], utilities.unravel_ambiguous_states(state_array)
21 |         )
22 |         
23 |         state_array = [0, 1, 2, 3, 4, 5]
24 |         self.assertListEqual(
25 |             [0, 1, 2, 3, 4, 5], utilities.unravel_ambiguous_states(state_array)
26 |         )
27 | 
28 |     def test_find_duplicated_character_states(self):
29 | 
30 |         character_matrix = pd.DataFrame.from_dict(
31 |             {
32 |                 "c1": [(5, 1), 0, 1, 2, 0],
33 |                 "c2": [(5, 1), 0, 1, 2, 0],
34 |                 "c3": [4, 0, 3, 2, -1],
35 |                 "c4": [-1, 4, 0, 2, 2],
36 |                 "c5": [0, 4, 1, 2, 2],
37 |                 "c6": [4, 0, 0, 2, (2, 1)],
38 |                 "c6_dup": [4, 0, 0, 2, (1, 2)],
39 |             },
40 |             orient="index",
41 |             columns=["a", "b", "c", "d", "e"],
42 |         )
43 | 
44 |         duplicated_mappings = utilities.find_duplicate_groups(character_matrix)
45 |         
46 |         expected_entries = [('c1', ('c1', 'c2')),
47 |                             ('c6', ('c6', 'c6_dup'))]
48 |         
49 |         for k, grp in expected_entries:
50 |             self.assertIn(k, list(duplicated_mappings.keys()))
51 |             self.assertSetEqual(set(grp), set(duplicated_mappings[k]))
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     unittest.main()
56 | 


--------------------------------------------------------------------------------
/test/plotting_tests/local_3d_test.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import unittest
  3 | 
  4 | import numpy as np
  5 | 
  6 | import cassiopeia as cas
  7 | from cassiopeia.plotting import local_3d
  8 | 
  9 | 
 10 | class TestLocal3DPlotting(unittest.TestCase):
 11 |     def setUp(self):
 12 |         np.random.seed(0)
 13 |         simulator = cas.sim.CompleteBinarySimulator(num_cells=8)
 14 |         self.tree = simulator.simulate_tree()
 15 | 
 16 |         spatial_simulator = cas.sim.ClonalSpatialDataSimulator((10, 10))
 17 |         spatial_simulator.overlay_data(self.tree)
 18 | 
 19 |         self.labels = local_3d.labels_from_coordinates(self.tree)
 20 | 
 21 |     @pytest.mark.spatial
 22 |     def test_interpolate_branch(self):
 23 |         parent = (0, 0, 0)
 24 |         child = (1, 1, 1)
 25 |         np.testing.assert_array_equal(
 26 |             [[0, 0, 0], [1, 1, 0], [1, 1, 1]],
 27 |             local_3d.interpolate_branch(parent, child),
 28 |         )
 29 | 
 30 |     @pytest.mark.spatial
 31 |     def test_polyline_from_points(self):
 32 |         points = np.array(
 33 |             [
 34 |                 [0, 0, 0],
 35 |                 [1, 1, 1],
 36 |                 [1, 1, 0],
 37 |             ]
 38 |         )
 39 |         poly = local_3d.polyline_from_points(points)
 40 |         np.testing.assert_array_equal(points, poly.points)
 41 | 
 42 |     @pytest.mark.spatial
 43 |     def test_average_mixing(self):
 44 |         c1 = (0, 0, 0)
 45 |         c2 = (0.1, 0.2, 0.3)
 46 |         c3 = (0.5, 0.7, 0.0)
 47 |         np.testing.assert_allclose(
 48 |             (0.2, 0.3, 0.1), local_3d.average_mixing(c1, c2, c3)
 49 |         )
 50 | 
 51 |     @pytest.mark.spatial
 52 |     def test_highlight(self):
 53 |         c = (0.8, 0.2, 0.0)
 54 |         np.testing.assert_allclose((1.0, 0.25, 0.0), local_3d.highlight(c))
 55 | 
 56 |     @pytest.mark.spatial
 57 |     def test_lowlight(self):
 58 |         c = (0.8, 0.2, 0.0)
 59 |         np.testing.assert_allclose((0.3, 0.075, 0.0), local_3d.lowlight(c))
 60 | 
 61 |     @pytest.mark.spatial
 62 |     def test_labels_from_coordinates(self):
 63 |         # invalid shape
 64 |         with self.assertRaises(ValueError):
 65 |             local_3d.labels_from_coordinates(self.tree, shape=(10,10,10))
 66 |         with self.assertRaises(ValueError):
 67 |             local_3d.labels_from_coordinates(self.tree, shape=("10","10"))
 68 |         with self.assertRaises(ValueError):
 69 |             local_3d.labels_from_coordinates(self.tree, shape=(-1,10))
 70 |         # invalid attribute
 71 |         with self.assertRaises(ValueError):
 72 |             local_3d.labels_from_coordinates(self.tree, attribute_key="foo")
 73 |         # edits tree metadata
 74 |         for leaf in self.tree.leaves:
 75 |             x, y = self.tree.get_attribute(leaf, "spatial")
 76 |             self.assertEqual(
 77 |                 self.labels[int(x), int(y)],
 78 |                 self.tree.cell_meta["spatial_label"][leaf],
 79 |             )
 80 |         # not square
 81 |         labels = local_3d.labels_from_coordinates(self.tree, shape=(1000, 500))
 82 |         self.assertEqual(labels.shape, (1000, 500))
 83 |         # dense spatial positions
 84 |         dense_tree = self.tree.copy()
 85 |         spatial_simulator = cas.sim.ClonalSpatialDataSimulator((1,1))
 86 |         spatial_simulator.overlay_data(dense_tree)
 87 |         labels = local_3d.labels_from_coordinates(dense_tree, shape=(100, 100))
 88 | 
 89 |     @pytest.mark.spatial
 90 |     def test_Tree3D(self):
 91 |         # There isn't a good way to test this, other than making sure there
 92 |         # are no errors on initialization.
 93 |         tree3d = local_3d.Tree3D(self.tree, self.labels)
 94 |         tree3d.plot(show=False)
 95 |         # without labels
 96 |         tree3d = local_3d.Tree3D(self.tree)
 97 |         tree3d.plot(show=False)
 98 | 
 99 | if __name__ == "__main__":
100 |     unittest.main()
101 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/align_sequence_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests for the sequence alignment in pipeline.py.
  3 | """
  4 | import unittest
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | import cassiopeia
 10 | 
 11 | 
 12 | class TestAlignSequence(unittest.TestCase):
 13 |     def setUp(self):
 14 | 
 15 |         self.queries = pd.DataFrame.from_dict(
 16 |             {
 17 |                 "cellBC": ["A", "A", "A", "B", "B", "C", "C", "C"],
 18 |                 "UMI": ["1", "2", "3", "1", "2", "1", "2", "3"],
 19 |                 "readCount": [20, 30, 30, 40, 40, 10, 10, 15],
 20 |                 "seq": [
 21 |                     "AACCTTGG",
 22 |                     "ACTG",
 23 |                     "AACCTTGGACTGCATCG",
 24 |                     "AATTAA",
 25 |                     "ACTGGACT",
 26 |                     "AACCTTGGGG",
 27 |                     "AAAAAAAAAAA",
 28 |                     "TACTCTATA",
 29 |                 ],
 30 |             }
 31 |         )
 32 |         self.queries["readName"] = self.queries.apply(
 33 |             lambda x: "_".join([x.cellBC, x.UMI, str(x.readCount)]), axis=1
 34 |         )
 35 | 
 36 |         self.reference = "AACCTTGG"
 37 | 
 38 |     def test_alignment_dataframe_structure(self):
 39 | 
 40 |         aln_df = cassiopeia.pp.align_sequences(
 41 |             self.queries,
 42 |             ref=self.reference,
 43 |             gap_open_penalty=20,
 44 |             gap_extend_penalty=1,
 45 |             n_threads=2,
 46 |         )
 47 | 
 48 |         self.assertEqual(aln_df.shape[0], self.queries.shape[0])
 49 | 
 50 |         for cellBC in self.queries["cellBC"].unique():
 51 |             self.assertIn(cellBC, aln_df["cellBC"].unique())
 52 | 
 53 |         expected_columns = [
 54 |             "cellBC",
 55 |             "UMI",
 56 |             "AlignmentScore",
 57 |             "CIGAR",
 58 |             "QueryBegin",
 59 |             "ReferenceBegin",
 60 |             "Seq",
 61 |             "readName",
 62 |             "readCount",
 63 |         ]
 64 | 
 65 |         for column in expected_columns:
 66 |             self.assertIn(column, aln_df.columns)
 67 | 
 68 |     def test_extremely_large_gap_open_penalty(self):
 69 | 
 70 |         aln_df = cassiopeia.pp.align_sequences(
 71 |             self.queries,
 72 |             ref=self.reference,
 73 |             gap_open_penalty=255,
 74 |             gap_extend_penalty=1,
 75 |         )
 76 | 
 77 |         # since the gap open penalty is so large, enforce that
 78 |         # no gaps should occur
 79 |         for ind, row in aln_df.iterrows():
 80 | 
 81 |             self.assertNotIn("D", row.CIGAR)
 82 |             self.assertNotIn("I", row.CIGAR)
 83 | 
 84 |     def test_default_alignment_works(self):
 85 | 
 86 |         aln_df = cassiopeia.pp.align_sequences(
 87 |             self.queries,
 88 |             ref=self.reference,
 89 |             gap_open_penalty=2,
 90 |             gap_extend_penalty=1,
 91 |         )
 92 | 
 93 |         expected_alignments = {
 94 |             "A_1_20": ("8M", 40),
 95 |             "A_2_30": ("2M2D2M", 17),
 96 |             "A_3_30": ("8M", 40),
 97 |             "B_1_40": ("2M2D2M", 17),
 98 |             "B_2_40": ("2M2D3M", 22),
 99 |             "C_1_10": ("8M", 40),
100 |             "C_2_10": ("2M", 10),
101 |             "C_3_15": ("2M1I2M1I1M", 21),
102 |         }
103 | 
104 |         for read_name in aln_df["readName"].unique():
105 | 
106 |             expected_cigar = expected_alignments[read_name][0]
107 |             expected_score = expected_alignments[read_name][1]
108 | 
109 |             self.assertEqual(
110 |                 aln_df.loc[aln_df["readName"] == read_name, "CIGAR"].iloc[0],
111 |                 expected_cigar,
112 |             )
113 |             self.assertEqual(
114 |                 aln_df.loc[aln_df["readName"] == read_name, "AlignmentScore"].iloc[0],
115 |                 expected_score,
116 |             )
117 | 
118 |     def test_global_alignment(self):
119 | 
120 |         aln_df = cassiopeia.pp.align_sequences(
121 |             self.queries,
122 |             ref=self.reference,
123 |             gap_open_penalty=2,
124 |             gap_extend_penalty=1,
125 |             method="global",
126 |         )
127 | 
128 |         expected_alignments = {
129 |             "A_1_20": ("8M", 40),
130 |             "A_2_30": ("1M2D2M1D1M1D", 15),
131 |             "A_3_30": ("8M9I", 40),
132 |             "B_1_40": ("2M2D2M2D2I", 14),
133 |             "B_2_40": ("1M2D2M1D2M3I", 20),
134 |             "C_1_10": ("8M2I", 40),
135 |             "C_2_10": ("2M6D9I", 3),
136 |             "C_3_15": ("1I1M1D1M1I2M1I1M1I2D", 15),
137 |         }
138 | 
139 |         for read_name in aln_df["readName"].unique():
140 | 
141 |             expected_cigar = expected_alignments[read_name][0]
142 |             expected_score = expected_alignments[read_name][1]
143 | 
144 |             self.assertEqual(
145 |                 aln_df.loc[aln_df["readName"] == read_name, "CIGAR"].iloc[0],
146 |                 expected_cigar,
147 |             )
148 |             self.assertEqual(
149 |                 aln_df.loc[aln_df["readName"] == read_name, "AlignmentScore"].iloc[0],
150 |                 expected_score,
151 |             )
152 | 
153 | 
154 | if __name__ == "__main__":
155 |     unittest.main()
156 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/error_correct_cellbcs_to_whitelist_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tests for correcting raw barcodes to a whitelist pipeline.py
 3 | """
 4 | import os
 5 | import unittest
 6 | import tempfile
 7 | 
 8 | import pysam
 9 | import ngs_tools as ngs
10 | 
11 | from cassiopeia.preprocess import pipeline
12 | 
13 | 
14 | class TestErrorCorrectCellBCsToWhitelist(unittest.TestCase):
15 |     def setUp(self):
16 |         dir_path = os.path.dirname(os.path.realpath(__file__))
17 |         test_files_path = os.path.join(dir_path, "test_files")
18 | 
19 |         self.bam_10xv3_fp = os.path.join(test_files_path, "10xv3_unmapped.bam")
20 |         self.whitelist_10xv3_fp = os.path.join(
21 |             test_files_path, "10xv3_whitelist.txt"
22 |         )
23 |         self.whitelist_10xv3 = ["TACGTCATCTCCTACG", "TTAGATCGTTAGAAAG"]
24 |         self.bam_slideseq2_fp = os.path.join(
25 |             test_files_path, "slideseq2_unmapped.bam"
26 |         )
27 |         self.whitelist_slideseq2_fp = os.path.join(
28 |             test_files_path, "slideseq2_whitelist.txt"
29 |         )
30 |         self.whitelist_slideseq2 = ["CTTTGNTCAAAGTT"]
31 | 
32 |     def test_10xv3(self):
33 |         bam_fp = pipeline.error_correct_cellbcs_to_whitelist(
34 |             self.bam_10xv3_fp, self.whitelist_10xv3_fp, tempfile.mkdtemp()
35 |         )
36 |         with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
37 |             alignments = list(f.fetch(until_eof=True))
38 |         self.assertEqual(2, len(alignments))
39 |         self.assertEqual(
40 |             ["TACGTCATCTCCTACG", "TTAGATCGTTAGAAAG"],
41 |             [al.get_tag("CB") for al in alignments],
42 |         )
43 | 
44 |     def test_10xv3_whitelist_list(self):
45 |         bam_fp = pipeline.error_correct_cellbcs_to_whitelist(
46 |             self.bam_10xv3_fp, self.whitelist_10xv3, tempfile.mkdtemp()
47 |         )
48 |         with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
49 |             alignments = list(f.fetch(until_eof=True))
50 |         self.assertEqual(2, len(alignments))
51 |         self.assertEqual(
52 |             ["TACGTCATCTCCTACG", "TTAGATCGTTAGAAAG"],
53 |             [al.get_tag("CB") for al in alignments],
54 |         )
55 | 
56 |     def test_slideseq2(self):
57 |         bam_fp = pipeline.error_correct_cellbcs_to_whitelist(
58 |             self.bam_slideseq2_fp,
59 |             self.whitelist_slideseq2_fp,
60 |             tempfile.mkdtemp(),
61 |         )
62 |         with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
63 |             alignments = list(f.fetch(until_eof=True))
64 |         self.assertEqual(2, len(alignments))
65 |         self.assertEqual([True, False], [al.has_tag("CB") for al in alignments])
66 |         self.assertEqual("CTTTGNTCAAAGTT", alignments[0].get_tag("CB"))
67 | 
68 |     def test_slideseq2_whitelist_list(self):
69 |         bam_fp = pipeline.error_correct_cellbcs_to_whitelist(
70 |             self.bam_slideseq2_fp, self.whitelist_slideseq2, tempfile.mkdtemp()
71 |         )
72 |         with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
73 |             alignments = list(f.fetch(until_eof=True))
74 |         self.assertEqual(2, len(alignments))
75 |         self.assertEqual([True, False], [al.has_tag("CB") for al in alignments])
76 |         self.assertEqual("CTTTGNTCAAAGTT", alignments[0].get_tag("CB"))
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     unittest.main()
81 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/error_correct_intbcs_to_whitelist_test.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import unittest
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | import cassiopeia
  8 | 
  9 | 
 10 | class TestErrorCorrectIntBCstoWhitelist(unittest.TestCase):
 11 |     def setUp(self):
 12 |         dir_path = os.path.dirname(os.path.realpath(__file__))
 13 |         test_files_path = os.path.join(dir_path, "test_files")
 14 |         self.whitelist_fp = os.path.join(test_files_path, "intbc_whitelist.txt")
 15 |         self.whitelist = ["ACTT", "TAAG"]
 16 | 
 17 |         self.multi_case = pd.DataFrame.from_dict(
 18 |             {
 19 |                 "cellBC": [
 20 |                     "A",
 21 |                     "A",
 22 |                     "A",
 23 |                     "B",
 24 |                     "B",
 25 |                     "C",
 26 |                     "C",
 27 |                     "C",
 28 |                     "C",
 29 |                     "D",
 30 |                     "D",
 31 |                 ],
 32 |                 "UMI": [
 33 |                     "AACCT",
 34 |                     "AACCG",
 35 |                     "AACCC",
 36 |                     "AACCT",
 37 |                     "AACCG",
 38 |                     "AACCT",
 39 |                     "AACCG",
 40 |                     "AAGGA",
 41 |                     "AACCT",
 42 |                     "AACCT",
 43 |                     "AAGGG",
 44 |                 ],
 45 |                 "readCount": [20, 30, 30, 40, 50, 10, 10, 15, 10, 10, 10],
 46 |                 "Seq": [
 47 |                     "AACCTTGG",
 48 |                     "AACCTTGG",
 49 |                     "AACCTTCC",
 50 |                     "AACCTTGG",
 51 |                     "AACCTTGC",
 52 |                     "AACCTTCC",
 53 |                     "AACCTTCG",
 54 |                     "AACCTCAG",
 55 |                     "AACCTTGG",
 56 |                     "AACCTTGG",
 57 |                     "AACCTAAA",
 58 |                 ],
 59 |                 "intBC": [
 60 |                     "ACTT",
 61 |                     "AAGG",
 62 |                     "ACTA",
 63 |                     "AAGN",
 64 |                     "TACT",
 65 |                     "TAAG",
 66 |                     "TNNG",
 67 |                     "ANNN",
 68 |                     "GCTT",
 69 |                     "NNNN",
 70 |                     "AAAA",
 71 |                 ],
 72 |                 "r1": ["1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"],
 73 |                 "r2": ["2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"],
 74 |                 "r3": ["3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"],
 75 |                 "AlignmentScore": [
 76 |                     "20",
 77 |                     "20",
 78 |                     "20",
 79 |                     "20",
 80 |                     "20",
 81 |                     "20",
 82 |                     "20",
 83 |                     "20",
 84 |                     "20",
 85 |                     "20",
 86 |                     "20",
 87 |                 ],
 88 |                 "CIGAR": [
 89 |                     "NA",
 90 |                     "NA",
 91 |                     "NA",
 92 |                     "NA",
 93 |                     "NA",
 94 |                     "NA",
 95 |                     "NA",
 96 |                     "NA",
 97 |                     "NA",
 98 |                     "NA",
 99 |                     "NA",
100 |                 ],
101 |             }
102 |         )
103 |         self.multi_case["readName"] = self.multi_case.apply(
104 |             lambda x: "_".join([x.cellBC, x.UMI, str(x.readCount)]), axis=1
105 |         )
106 | 
107 |         self.multi_case["allele"] = self.multi_case.apply(
108 |             lambda x: "_".join([x.r1, x.r2, x.r3]), axis=1
109 |         )
110 |         self.corrections = {
111 |             "ACTT": "ACTT",
112 |             "TAAG": "TAAG",
113 |             "ACTA": "ACTT",
114 |             "TNNG": "TAAG",
115 |             "ANNN": "ACTT",
116 |         }
117 | 
118 |     def test_correct(self):
119 | 
120 |         df = cassiopeia.pp.error_correct_intbcs_to_whitelist(
121 |             self.multi_case, self.whitelist_fp, intbc_dist_thresh=1
122 |         )
123 |         expected_df = self.multi_case.copy()
124 |         expected_df["intBC"] = expected_df["intBC"].map(self.corrections)
125 |         expected_df.dropna(subset=["intBC"], inplace=True)
126 | 
127 |         pd.testing.assert_frame_equal(df, expected_df)
128 | 
129 |     def test_correct_whitelist_list(self):
130 | 
131 |         df = cassiopeia.pp.error_correct_intbcs_to_whitelist(
132 |             self.multi_case, self.whitelist, intbc_dist_thresh=1
133 |         )
134 |         expected_df = self.multi_case.copy()
135 |         expected_df["intBC"] = expected_df["intBC"].map(self.corrections)
136 |         expected_df.dropna(subset=["intBC"], inplace=True)
137 | 
138 |         pd.testing.assert_frame_equal(df, expected_df)
139 | 
140 | 
141 | if __name__ == "__main__":
142 |     unittest.main()
143 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/filter_bam_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tests for correcting raw barcodes to a whitelist pipeline.py
 3 | """
 4 | import os
 5 | import unittest
 6 | import tempfile
 7 | 
 8 | import pysam
 9 | import ngs_tools as ngs
10 | 
11 | from cassiopeia.preprocess import pipeline
12 | 
13 | 
14 | class TestFilterBam(unittest.TestCase):
15 |     def setUp(self):
16 |         dir_path = os.path.dirname(os.path.realpath(__file__))
17 |         test_files_path = os.path.join(dir_path, "test_files")
18 | 
19 |         self.bam_10xv3_fp = os.path.join(test_files_path, "10xv3_unmapped.bam")
20 | 
21 |     def test_filter(self):
22 |         bam_fp = pipeline.filter_bam(self.bam_10xv3_fp, tempfile.mkdtemp(), 10)
23 |         with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
24 |             alignments = list(f.fetch(until_eof=True))
25 |         self.assertEqual(len(alignments), 2)
26 | 
27 |         bam_fp = pipeline.filter_bam(self.bam_10xv3_fp, tempfile.mkdtemp(), 20)
28 |         with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
29 |             alignments = list(f.fetch(until_eof=True))
30 |         self.assertEqual(len(alignments), 0)
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     unittest.main()
35 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/resolve_umi_sequence_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests for the UMI Resolution module in pipeline.py.
  3 | """
  4 | import os
  5 | import unittest
  6 | 
  7 | import shutil
  8 | import tempfile
  9 | 
 10 | import pandas as pd
 11 | from cassiopeia.preprocess import pipeline
 12 | 
 13 | 
 14 | class TestResolveUMISequence(unittest.TestCase):
 15 |     def setUp(self):
 16 | 
 17 |         collapsed_umi_table_dict = {
 18 |             "cellBC": [
 19 |                 "cell1",
 20 |                 "cell1",
 21 |                 "cell1",
 22 |                 "cell2",
 23 |                 "cell2",
 24 |                 "cell3",
 25 |                 "cell3",
 26 |             ],
 27 |             "UMI": ["UMIA", "UMIA", "UMIC", "UMIA", "UMIB", "UMIA", "UMIB"],
 28 |             "readCount": [9, 20, 11, 2, 1, 40, 30],
 29 |             "grpFlag": [0, 0, 0, 0, 0, 0, 0],
 30 |             "seq": [
 31 |                 "AATCCG",
 32 |                 "AAGGTT",
 33 |                 "CCATTA",
 34 |                 "ATACTG",
 35 |                 "GGGAAT",
 36 |                 "TTTCCTT",
 37 |                 "CCAATTG",
 38 |             ],
 39 |             "qual": [
 40 |                 "FFFFFF",
 41 |                 "FFFFFF",
 42 |                 "FFFFFF",
 43 |                 "FFFFFF",
 44 |                 "FFFFFF",
 45 |                 "FFFFFF",
 46 |                 "FFFFFF",
 47 |             ],
 48 |             "readName": [
 49 |                 "cell1_UMIA_9_0",
 50 |                 "cell1_UMIA_20_0",
 51 |                 "cell1_UMIC_11_0",
 52 |                 "cell2_UMIA_2",
 53 |                 "cell2_UMIB_1",
 54 |                 "cell3_UMIA_40",
 55 |                 "cell3_UMIB_30",
 56 |             ],
 57 |         }
 58 |         self.collapsed_umi_table = pd.DataFrame.from_dict(
 59 |             collapsed_umi_table_dict
 60 |         )
 61 | 
 62 |         # set up temporary directory
 63 |         self.temporary_directory = tempfile.mkdtemp()
 64 | 
 65 |     def test_resolve_umi(self):
 66 | 
 67 |         resolved_mt = pipeline.resolve_umi_sequence(
 68 |             self.collapsed_umi_table,  self.temporary_directory, min_umi_per_cell=1, plot=False
 69 |         )
 70 | 
 71 |         # check that cell1-UMIA was selected correctly
 72 |         expected_seq = "AAGGTT"
 73 |         observed_seq = resolved_mt.loc[
 74 |             resolved_mt["readName"] == "cell1_UMIA_20_0", "seq"
 75 |         ].values
 76 |         self.assertEqual(expected_seq, observed_seq)
 77 | 
 78 |         # check that cell2 was filtered
 79 |         self.assertNotIn("cell2", resolved_mt["cellBC"].unique())
 80 | 
 81 |         # check that cell3 didn't lose UMIs
 82 |         self.assertEqual(
 83 |             2, resolved_mt[resolved_mt["cellBC"] == "cell3"].shape[0]
 84 |         )
 85 | 
 86 |         # check expected reads
 87 |         expected = {"cell1": 31, "cell3": 70}
 88 |         for n, g in resolved_mt.groupby("cellBC"):
 89 | 
 90 |             self.assertEqual(expected[n], g["readCount"].sum())
 91 | 
 92 |     def test_filter_by_reads(self):
 93 | 
 94 |         resolved_mt = pipeline.resolve_umi_sequence(
 95 |             self.collapsed_umi_table,
 96 |             self.temporary_directory,
 97 |             min_avg_reads_per_umi=30,
 98 |             min_umi_per_cell=1,
 99 |             plot=True,
100 |         )
101 | 
102 |         expected_cells = ["cell3"]
103 |         expected_removed_cells = ["cell1", "cell2"]
104 | 
105 |         # print(expected_cells)
106 | 
107 |         for cell in expected_cells:
108 |             self.assertIn(cell, resolved_mt["cellBC"].unique())
109 | 
110 |         for cell in expected_removed_cells:
111 |             self.assertNotIn(cell, resolved_mt["cellBC"].unique())
112 | 
113 |     def tearDown(self):
114 | 
115 |         shutil.rmtree(self.temporary_directory)
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     unittest.main()
120 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/10xv3_1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/10xv3_1.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/10xv3_2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/10xv3_2.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/10xv3_unmapped.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/10xv3_unmapped.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/10xv3_whitelist.txt:
--------------------------------------------------------------------------------
1 | TACGTCATCTCCTACG
2 | TTAGATCGTTAGAAAG
3 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/basic_grouping.csv:
--------------------------------------------------------------------------------
1 | cellBC	intBC	allele	r1	r2	r3	lineageGrp	UMI	readCount	Sample
2 | A	XX	1_2_3	1	2	3	1	3	70	A
3 | A	YZ	1_2_3	1	2	3	1	1	40	A
4 | B	XX	1_2_3	1	2	3	1	1	10	B
5 | B	YZ	1_2_3	1	2	3	1	1	110	B
6 | C	XY	1_2_2	1	2	2	2	1	10	C
7 | C	XZ	1_2_2	1	2	2	2	1	10	C
8 | C	YX	1_2_3	1	2	3	2	1	15	C
9 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/clustered_intbc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/clustered_intbc.png


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/collapse_header_required.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/collapse_header_required.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/collapse_header_required.collapsed.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/collapse_header_required.collapsed.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/doublet.csv:
--------------------------------------------------------------------------------
1 | cellBC	intBC	allele	r1	r2	r3	lineageGrp	UMI	readCount	Sample
2 | A	XX	1_2_3	1	2	3	1	2	40	A
3 | B	XX	1_2_3	1	2	3	1	2	70	B
4 | D	XY	1_2_3	1	2	3	2	2	35	D
5 | E	XY	1_2_3	1	2	3	2	2	20	E
6 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/filter_and_reassign.csv:
--------------------------------------------------------------------------------
1 | cellBC	intBC	allele	r1	r2	r3	lineageGrp	UMI	readCount	Sample
2 | A	XX	1_2_3	1	2	3	1	1	30	A
3 | B	XX	1_2_3	1	2	3	1	1	40	B
4 | C	XX	1_2_3	1	2	3	1	1	10	C
5 | D	XX	1_2_3	1	2	3	1	1	20	D
6 | E	XZ	1_2_3	1	2	3	2	2	20	E
7 | F	XZ	1_2_3	1	2	3	2	2	20	F
8 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/indropsv3_1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/indropsv3_1.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/indropsv3_2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/indropsv3_2.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/indropsv3_3.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/indropsv3_3.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/intbc_whitelist.txt:
--------------------------------------------------------------------------------
1 | ACTT
2 | TAAG
3 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/lineageGrp_piv_heatmaps/lg_1_piv_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/lineageGrp_piv_heatmaps/lg_1_piv_heatmap.png


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/reassign.csv:
--------------------------------------------------------------------------------
 1 | cellBC	intBC	allele	r1	r2	r3	lineageGrp	UMI	readCount	Sample
 2 | A	XX	1_2_3	1	2	3	1	2	40	A
 3 | B	XX	1_2_3	1	2	3	1	2	70	B
 4 | C	XX	1_2_3	1	2	3	1	2	120	C
 5 | D	XX	1_2_3	1	2	3	1	1	20	D
 6 | D	YZ	1_2_3	1	2	3	1	1	15	D
 7 | E	XZ	1_2_3	1	2	3	1	1	10	E
 8 | E	YZ	1_2_3	1	2	3	1	1	10	E
 9 | F	XZ	1_2_3	1	2	3	1	3	30	F
10 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/slideseq2_1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/slideseq2_1.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/slideseq2_2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/slideseq2_2.fastq.gz


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/slideseq2_unmapped.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/slideseq2_unmapped.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/slideseq2_whitelist.txt:
--------------------------------------------------------------------------------
1 | CTTTGNTCAAAGTT
2 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_sorted.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_sorted.bayesian_collapsed.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_sorted.bayesian_collapsed.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_sorted.collapsed.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_sorted.collapsed.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_sorted.collapsed.txt:
--------------------------------------------------------------------------------
1 | cellBC	UMI	readCount	grpFlag	seq	qual	readName
2 | CAACCTCGTGGGTATG-1	GATAACATCG	000007	0+	AATCCAGCTAGCTGA	@@@@@@@@@@@@@@@	CAACCTCGTGGGTATG-1_GATAACATCG_000007_0+
3 | CTCACACTCGAATGCT-1	TGGCCTTTAA	000001	0	TATCCAGCTAGCTGA	FFFFFFFFFFFFFFF	CTCACACTCGAATGCT-1_TGGCCTTTAA_000001_0
4 | CTCACACTCGAATGCT-1	TGGCCTTTAT	000002	0	NATCCAGCTAGCTGA	#@@@@@@@@@@@@@@	CTCACACTCGAATGCT-1_TGGCCTTTAT_000002_0
5 | GACCCTCGTGGGTATG-1	GATAACATCG	000003	0	AATCCAGCTAGCTGA	@@@@@@@@@@@@@@@	GACCCTCGTGGGTATG-1_GATAACATCG_000003_0
6 | GACCCTCGTGGGTATG-1	GATAACATCG	000003	1	CCGCCAGCTAGCTGA	@@@@@@@@@@@@@@@	GACCCTCGTGGGTATG-1_GATAACATCG_000003_1
7 | 


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_uncorrected.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_uncorrected.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_uncorrected_sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_uncorrected_sorted.bam


--------------------------------------------------------------------------------
/test/preprocess_tests/test_files/test_uncorrected_sorted.collapsed.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YosefLab/Cassiopeia/3b05f92fbf02a4041cb0d82cab9c7dca065033d9/test/preprocess_tests/test_files/test_uncorrected_sorted.collapsed.bam


--------------------------------------------------------------------------------
/test/simulator_tests/complete_binary_simulator_test.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import unittest
 3 | 
 4 | from cassiopeia.mixins import TreeSimulatorError
 5 | from cassiopeia.simulator import CompleteBinarySimulator
 6 | 
 7 | 
 8 | class TestCompleteBinarySimulator(unittest.TestCase):
 9 |     def test_init(self):
10 |         with self.assertRaises(TreeSimulatorError):
11 |             CompleteBinarySimulator()
12 | 
13 |         with self.assertRaises(TreeSimulatorError):
14 |             CompleteBinarySimulator(num_cells=3)
15 | 
16 |         with self.assertRaises(TreeSimulatorError):
17 |             CompleteBinarySimulator(depth=0)
18 | 
19 |         simulator = CompleteBinarySimulator(num_cells=4)
20 |         self.assertEqual(simulator.depth, 2)
21 | 
22 |     def test_simulate_tree(self):
23 |         tree = CompleteBinarySimulator(depth=2).simulate_tree()
24 | 
25 |         self.assertEqual(
26 |             set(tree.nodes), {"0", "1", "2", "3", "4", "5", "6", "7"}
27 |         )
28 |         self.assertEqual(set(tree.leaves), {"4", "5", "6", "7"})
29 |         self.assertEqual(
30 |             set(tree.edges),
31 |             {
32 |                 ("0", "1"),
33 |                 ("1", "2"),
34 |                 ("1", "3"),
35 |                 ("2", "4"),
36 |                 ("2", "5"),
37 |                 ("3", "6"),
38 |                 ("3", "7"),
39 |             },
40 |         )
41 | 
42 |         # Test branch lengths
43 |         self.assertEqual(
44 |             tree.get_times(),
45 |             {
46 |                 "0": 0.0,
47 |                 "1": 1 / 3,
48 |                 "2": 2 / 3,
49 |                 "3": 2 / 3,
50 |                 "4": 1.0,
51 |                 "5": 1.0,
52 |                 "6": 1.0,
53 |                 "7": 1.0,
54 |             },
55 |         )
56 | 


--------------------------------------------------------------------------------
/test/simulator_tests/simple_fit_subclone_simulator_test.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import unittest
 3 | 
 4 | from cassiopeia.simulator import SimpleFitSubcloneSimulator
 5 | 
 6 | 
 7 | class TestSimpleFitSubcloneSimulator(unittest.TestCase):
 8 |     def test_deterministic(self):
 9 |         r"""
10 |         Small test that can be drawn by hand.
11 |         Checks that the generated phylogeny is correct.
12 |         """
13 |         tree = SimpleFitSubcloneSimulator(
14 |             branch_length_neutral=1,
15 |             branch_length_fit=0.5,
16 |             experiment_duration=1.9,
17 |             generations_until_fit_subclone=1,
18 |         ).simulate_tree()
19 |         self.assertListEqual(
20 |             tree.nodes,
21 |             ["0_neutral", "1_neutral", "2_fit", "3_neutral", "4_fit", "5_fit"],
22 |         )
23 |         self.assertListEqual(
24 |             tree.edges,
25 |             [
26 |                 ("0_neutral", "1_neutral"),
27 |                 ("1_neutral", "2_fit"),
28 |                 ("1_neutral", "3_neutral"),
29 |                 ("2_fit", "4_fit"),
30 |                 ("2_fit", "5_fit"),
31 |             ],
32 |         )
33 |         self.assertDictEqual(
34 |             tree.get_times(),
35 |             {
36 |                 "0_neutral": 0.0,
37 |                 "1_neutral": 1.0,
38 |                 "2_fit": 1.5,
39 |                 "3_neutral": 1.9,
40 |                 "4_fit": 1.9,
41 |                 "5_fit": 1.9,
42 |             },
43 |         )
44 | 
45 |     def test_stochastic(self):
46 |         r"""
47 |         We test the functionality that allows providing a callable for branch
48 |         lengths. Because the test is stochastic, we don't assert anything
49 |         besides the branch lengths being all different.
50 |         """
51 |         np.random.seed(1)
52 | 
53 |         def branch_length_neutral() -> float:
54 |             return np.random.exponential(1.0)
55 | 
56 |         def branch_length_fit() -> float:
57 |             return np.random.exponential(0.5)
58 | 
59 |         tree = SimpleFitSubcloneSimulator(
60 |             branch_length_neutral=branch_length_neutral,
61 |             branch_length_fit=branch_length_fit,
62 |             experiment_duration=4.9,
63 |             generations_until_fit_subclone=2,
64 |         ).simulate_tree()
65 |         # Just check that all branch lengths are distinct to confirm
66 |         # non-determinism. We exclude the leaves because sister leaves have the
67 |         # same branch length.
68 |         branch_lengths = [
69 |             tree.get_branch_length(p, c)
70 |             for (p, c) in tree.edges
71 |             if not tree.is_leaf(c)
72 |         ]
73 |         assert len(branch_lengths) == len(set(branch_lengths))
74 | 


--------------------------------------------------------------------------------
/test/tools_tests/autocorrelation_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test suite for the autocorrelation functions in
  3 | cassiopeia/tools/autocorrelation.py
  4 | """
  5 | import unittest
  6 | 
  7 | import networkx as nx
  8 | import numpy as np
  9 | import pandas as pd
 10 | 
 11 | import cassiopeia as cas
 12 | from cassiopeia.mixins.errors import AutocorrelationError
 13 | from cassiopeia.tools.autocorrelation import compute_morans_i
 14 | 
 15 | 
 16 | class TestAutocorrelation(unittest.TestCase):
 17 |     def setUp(self) -> None:
 18 | 
 19 |         tree = nx.DiGraph()
 20 |         tree.add_nodes_from(["A", "B", "C", "D", "E", "F"])
 21 |         tree.add_edge("F", "A", length=0.1)
 22 |         tree.add_edge("F", "B", length=0.2)
 23 |         tree.add_edge("F", "E", length=0.5)
 24 |         tree.add_edge("E", "C", length=0.3)
 25 |         tree.add_edge("E", "D", length=0.4)
 26 | 
 27 |         self.basic_tree = cas.data.CassiopeiaTree(tree=tree)
 28 | 
 29 |         example_obs = pd.DataFrame.from_dict(
 30 |             {
 31 |                 "nUMI": [10, 10, 3, 3],
 32 |                 "GeneX": [3, 5, 10, 2],
 33 |                 "GeneY": [30, 30, 1, 1],
 34 |             },
 35 |             orient="index",
 36 |             columns=["A", "B", "C", "D"],
 37 |         ).T
 38 | 
 39 |         self.X = example_obs
 40 | 
 41 |     def test_simple_moran_single_variable(self):
 42 |         """
 43 |         Tests Moran's I, comparing values gotten from the function implemented
 44 |         in Chaligne et al, Nat Genetics 2021
 45 |         """
 46 | 
 47 |         I = cas.tl.compute_morans_i(
 48 |             self.basic_tree, X=pd.DataFrame(self.X["nUMI"])
 49 |         )
 50 | 
 51 |         self.assertAlmostEqual(I, 0.084456, delta=0.001)
 52 | 
 53 |     def test_moran_bivariate(self):
 54 |         """
 55 |         Statistics compared to the function implemented in Chaligne et al,
 56 |         Nat Gen 2021
 57 |         """
 58 |         I = cas.tl.compute_morans_i(self.basic_tree, X=self.X)
 59 | 
 60 |         expected_correlations = pd.DataFrame.from_dict(
 61 |             {
 62 |                 "nUMI": [0.08445, -0.00874, 0.08446],
 63 |                 "GeneX": [-0.00874, -0.31810, -0.00874],
 64 |                 "GeneY": [0.08446, -0.00874, 0.08446],
 65 |             },
 66 |             orient="index",
 67 |             columns=["nUMI", "GeneX", "GeneY"],
 68 |         )
 69 | 
 70 |         pd.testing.assert_frame_equal(
 71 |             I, expected_correlations, check_exact=False, atol=0.001
 72 |         )
 73 | 
 74 |     def test_moran_custom_weights(self):
 75 | 
 76 |         W = pd.DataFrame.from_dict(
 77 |             {
 78 |                 "A": [0, 1 / 2, 1 / 3, 1 / 3],
 79 |                 "B": [1 / 2, 0, 1 / 3, 1 / 3],
 80 |                 "C": [1 / 3, 1 / 3, 0, 1 / 2],
 81 |                 "D": [1 / 3, 1 / 3, 1 / 2, 0],
 82 |             },
 83 |             orient="index",
 84 |             columns=["A", "B", "C", "D"],
 85 |         )
 86 | 
 87 |         I = cas.tl.compute_morans_i(
 88 |             self.basic_tree, X=pd.DataFrame(self.X["nUMI"]), W=W
 89 |         )
 90 | 
 91 |         self.assertAlmostEqual(I, -0.1428571, delta=0.0001)
 92 | 
 93 |     def test_moran_exceptions(self):
 94 | 
 95 |         # check typing
 96 |         string_type_meta = pd.DataFrame(
 97 |             ["type1", "type2", "type1", "type3"],
 98 |             index=["A", "B", "C", "D"],
 99 |             columns=["CellType"],
100 |         )
101 | 
102 |         X = pd.concat([self.X, string_type_meta])
103 | 
104 |         self.assertRaises(
105 |             AutocorrelationError,
106 |             cas.tl.compute_morans_i,
107 |             self.basic_tree,
108 |             None,
109 |             X,
110 |         )
111 | 
112 |         # check all leaves are accounted for
113 |         new_row = pd.DataFrame.from_dict(
114 |             {"E": [5, 5, 5]}, orient="index", columns=["nUMI", "GeneX", "GeneY"]
115 |         )
116 | 
117 |         X = pd.concat([self.X, new_row], axis=1)
118 | 
119 |         self.assertRaises(
120 |             AutocorrelationError,
121 |             cas.tl.compute_morans_i,
122 |             self.basic_tree,
123 |             None,
124 |             X,
125 |         )
126 | 
127 |         # make sure some data is passed in
128 |         self.assertRaises(
129 |             AutocorrelationError,
130 |             cas.tl.compute_morans_i,
131 |             self.basic_tree,
132 |             None,
133 |             None,
134 |         )
135 |         
136 |         # make sure weight matrix has the right leaves
137 |         W = pd.DataFrame.from_dict(
138 |             {
139 |                 "A": [0, 1 / 2, 1 / 3],
140 |                 "B": [1 / 2, 0, 1 / 3],
141 |                 "C": [1 / 3, 1 / 3, 0],
142 |             },
143 |             orient="index",
144 |             columns=["A", "B", "C"],
145 |         )
146 |         self.assertRaises(
147 |             AutocorrelationError,
148 |             cas.tl.compute_morans_i,
149 |             self.basic_tree,
150 |             None,
151 |             self.X,
152 |             W
153 |         )
154 | 
155 | if __name__ == "__main__":
156 |     unittest.main()
157 | 


--------------------------------------------------------------------------------
/test/tools_tests/fitness_estimator_tests/lbi_jungle_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test LBIJungle in cassiopeia.tools.
  3 | """
  4 | import unittest
  5 | 
  6 | import networkx as nx
  7 | 
  8 | from cassiopeia.data import CassiopeiaTree
  9 | from cassiopeia.tools import FitnessEstimatorError, LBIJungle
 10 | 
 11 | 
 12 | class TestLBIJungle(unittest.TestCase):
 13 |     def test_small_tree(self):
 14 |         """
 15 |         Run LBI jungle on small tree and see that fitness estimates make sense.
 16 |         """
 17 |         tree = nx.DiGraph()
 18 |         nodes = [
 19 |             "root",
 20 |             "internal-1",
 21 |             "internal-2",
 22 |             "internal-3",
 23 |             "leaf-1",
 24 |             "leaf-2",
 25 |             "leaf-3",
 26 |             "leaf-4",
 27 |             "leaf-5",
 28 |         ]
 29 |         tree.add_nodes_from(nodes)
 30 |         tree.add_edges_from(
 31 |             [
 32 |                 ("root", "internal-1"),
 33 |                 ("internal-1", "internal-2"),
 34 |                 ("internal-1", "internal-3"),
 35 |                 ("internal-2", "leaf-1"),
 36 |                 ("internal-2", "leaf-2"),
 37 |                 ("internal-2", "leaf-3"),
 38 |                 ("internal-3", "leaf-4"),
 39 |                 ("internal-3", "leaf-5"),
 40 |             ]
 41 |         )
 42 |         tree = CassiopeiaTree(tree=tree)
 43 |         tree.set_times(
 44 |             {
 45 |                 "root": 0.0,
 46 |                 "internal-1": 0.25,
 47 |                 "internal-2": 0.5,
 48 |                 "internal-3": 0.5,
 49 |                 "leaf-1": 1.0,
 50 |                 "leaf-2": 1.0,
 51 |                 "leaf-3": 1.0,
 52 |                 "leaf-4": 1.0,
 53 |                 "leaf-5": 1.0,
 54 |             }
 55 |         )
 56 |         fitness_estimator = LBIJungle()
 57 |         fitness_estimator.estimate_fitness(tree)
 58 |         fitness_estimates = {
 59 |             node: tree.get_attribute(node, "fitness")
 60 |             for node in nodes
 61 |             if node != tree.root  # LBIJungle doesn't report root fitness.
 62 |         }
 63 |         # internal node 2 has strictly more branching than internal node 3, so
 64 |         # fitness estimate should be higher
 65 |         self.assertGreater(
 66 |             fitness_estimates["internal-2"], fitness_estimates["internal-3"]
 67 |         )
 68 |         # Leaves 1, 2, 3 should have the same fitness
 69 |         self.assertAlmostEqual(
 70 |             fitness_estimates["leaf-1"], fitness_estimates["leaf-2"]
 71 |         )
 72 |         self.assertAlmostEqual(
 73 |             fitness_estimates["leaf-2"], fitness_estimates["leaf-3"]
 74 |         )
 75 |         # Leaves 4, 5 should have the same fitness
 76 |         self.assertAlmostEqual(
 77 |             fitness_estimates["leaf-4"], fitness_estimates["leaf-5"]
 78 |         )
 79 |         # Leaves 1, 2, 3 should have higher fitness than leaves 4, 5
 80 |         self.assertGreater(
 81 |             fitness_estimates["leaf-1"], fitness_estimates["leaf-4"]
 82 |         )
 83 |         # Leaves should have lower fitness than their parent (by LBI property)
 84 |         self.assertGreater(
 85 |             fitness_estimates["internal-2"], fitness_estimates["leaf-1"]
 86 |         )
 87 |         self.assertGreater(
 88 |             fitness_estimates["internal-3"], fitness_estimates["leaf-4"]
 89 |         )
 90 | 
 91 |     def test_raises_error_if_leaf_name_startswith_underscore(self):
 92 |         """
 93 |         Leaf names cannot start with an underscore.
 94 | 
 95 |         (This is due to the underlying Jungle implementation we wrap.)
 96 |         """
 97 |         tree = nx.DiGraph()
 98 |         nodes = [
 99 |             "root",
100 |             "_leaf",
101 |         ]
102 |         tree.add_nodes_from(nodes)
103 |         tree.add_edges_from(
104 |             [
105 |                 ("root", "_leaf"),
106 |             ]
107 |         )
108 |         tree = CassiopeiaTree(tree=tree)
109 |         fitness_estimator = LBIJungle()
110 |         with self.assertRaises(FitnessEstimatorError):
111 |             fitness_estimator.estimate_fitness(tree)
112 | 
113 | if __name__ == "__main__":
114 |     unittest.main()
115 | 


--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
1 | version = '2.0.0'
2 | 
3 | if __name__ == '__main__':
4 |     print(version)


--------------------------------------------------------------------------------