├── .gitignore ├── .readthedocs.yaml ├── GRCh38_resources ├── HLA_regions.bed ├── genetic_map_GRCh38_merged.tab.gz ├── hgTables_hg38_gencode.txt └── ig_gene_list.txt ├── LICENSE ├── README.md ├── calicost.smk ├── config.yaml ├── configuration_cna ├── configuration_cna_multi ├── configuration_purity ├── docs ├── _ext │ └── typed_returns.py ├── _static │ ├── css │ │ ├── custom.css │ │ ├── dataframe.css │ │ ├── nbsphinx.css │ │ └── sphinx_gallery.css │ └── img │ │ ├── acn_color_palette.png │ │ ├── overview4_combine.pdf │ │ └── overview4_combine.png ├── conf.py ├── index.rst ├── installation.rst ├── notebooks │ └── tutorials │ │ ├── prostate_tutorial.ipynb │ │ └── simulated_data_tutorial.ipynb ├── parameters.rst ├── references.rst └── tutorials.rst ├── environment.yml ├── examples ├── CalicoST_example.tar.gz ├── example_input_filelist ├── prostate_example.tar.gz └── simulated_example.tar.gz ├── pyproject.toml ├── setup.py ├── src └── calicost │ ├── __init__.py │ ├── allele_starch_generateconfig.py │ ├── arg_parse.py │ ├── calicost_main.py │ ├── calicost_supervised.py │ ├── estimate_tumor_proportion.py │ ├── find_integer_copynumber.py │ ├── hmm_NB_BB_nophasing.py │ ├── hmm_NB_BB_nophasing_v2.py │ ├── hmm_NB_BB_phaseswitch.py │ ├── hmm_NB_sharedstates.py │ ├── hmm_gaussian.py │ ├── hmrf.py │ ├── hmrf_normalmixture.py │ ├── joint_allele_generateconfig.py │ ├── oldcode.py │ ├── parse_input.py │ ├── phasing.py │ ├── phylogeny_startle.py │ ├── phylogeography.py │ ├── simple_sctransform.py │ ├── utils_IO.py │ ├── utils_distribution_fitting.py │ ├── utils_hmm.py │ ├── utils_hmrf.py │ ├── utils_phase_switch.py │ └── utils_plotting.py └── utils ├── filter_snps_forphasing.py ├── get_snp_matrix.py ├── maya_plotter.py ├── merge_bamfile.py ├── plot_hatchet.py ├── process_snps.sh └── process_snps_merged.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.10" 7 | 8 | sphinx: 9 | builder: html 10 | configuration: docs/conf.py 11 | fail_on_warning: false 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 17 | extra_requirements: [docs] 18 | 19 | submodules: 20 | include: [docs/notebooks] 21 | recursive: true -------------------------------------------------------------------------------- /GRCh38_resources/HLA_regions.bed: -------------------------------------------------------------------------------- 1 | chr6 29722775 29738528 2 | chr6 29726601 29749049 3 | chr6 29826967 29831125 4 | chr6 29941260 29945884 5 | chr6 30489509 30494194 6 | chr6 31268749 31272130 7 | chr6 31269491 31357188 8 | chr6 32439878 32445046 9 | chr6 32517353 32530287 10 | chr6 32578769 32589848 11 | chr6 32628179 32647062 12 | chr6 32659467 32668383 13 | chr6 32659880 32660729 14 | chr6 32741391 32747198 15 | chr6 32756098 32763532 16 | chr6 32812763 32820466 17 | chr6 32934629 32941028 18 | chr6 32948613 32969094 19 | chr6 33004182 33009591 20 | chr6 33064569 33080775 21 | chr6 33075990 33089696 22 | -------------------------------------------------------------------------------- /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/GRCh38_resources/genetic_map_GRCh38_merged.tab.gz -------------------------------------------------------------------------------- /GRCh38_resources/ig_gene_list.txt: -------------------------------------------------------------------------------- 1 | IGKV3OR2-268 2 | IGKC 3 | IGKJ5 4 | IGKJ4 5 | IGKJ3 6 | IGKJ2 7 | IGKJ1 8 | IGKV4-1 9 | IGKV5-2 10 | IGKV1-5 11 | IGKV1-6 12 | IGKV3-7 13 | IGKV1-8 14 | IGKV1-9 15 | IGKV3-11 16 | IGKV1-12 17 | IGKV3-15 18 | IGKV1-16 19 | IGKV1-17 20 | IGKV3-20 21 | IGKV6-21 22 | IGKV2-24 23 | IGKV1-27 24 | IGKV2-28 25 | IGKV2-30 26 | IGKV1-33 27 | IGKV1-37 28 | IGKV1-39 29 | IGKV2-40 30 | IGKV2D-40 31 | IGKV1D-39 32 | IGKV1D-37 33 | IGKV1D-33 34 | IGKV2D-30 35 | IGKV2D-29 36 | IGKV2D-28 37 | IGKV2D-26 38 | IGKV2D-24 39 | IGKV6D-21 40 | IGKV3D-20 41 | IGKV6D-41 42 | IGKV1D-17 43 | IGKV1D-16 44 | IGKV3D-15 45 | IGKV1D-13 46 | IGKV1D-12 47 | IGKV3D-11 48 | IGKV1D-42 49 | IGKV1D-43 50 | IGKV1D-8 51 | IGKV3D-7 52 | IGKV1OR2-108 53 | IGHA2 54 | IGHE 55 | IGHG4 56 | IGHG2 57 | IGHA1 58 | IGHG1 59 | IGHG3 60 | 
IGHD 61 | IGHM 62 | IGHJ6 63 | IGHJ5 64 | IGHJ4 65 | IGHJ3 66 | IGHJ2 67 | IGHJ1 68 | IGHD7-27 69 | IGHD1-26 70 | IGHD6-25 71 | IGHD5-24 72 | IGHD4-23 73 | IGHD3-22 74 | IGHD2-21 75 | IGHD1-20 76 | IGHD6-19 77 | IGHD5-18 78 | IGHD4-17 79 | IGHD3-16 80 | IGHD2-15 81 | IGHD1-14 82 | IGHD6-13 83 | IGHD5-12 84 | IGHD4-11 85 | IGHD3-10 86 | IGHD3-9 87 | IGHD2-8 88 | IGHD1-7 89 | IGHD6-6 90 | IGHD5-5 91 | IGHD4-4 92 | IGHD3-3 93 | IGHD2-2 94 | IGHD1-1 95 | IGHV6-1 96 | IGHV1-2 97 | IGHV1-3 98 | IGHV4-4 99 | IGHV7-4-1 100 | IGHV2-5 101 | IGHV3-7 102 | IGHV3-64D 103 | IGHV5-10-1 104 | IGHV3-11 105 | IGHV3-13 106 | IGHV3-15 107 | IGHV3-16 108 | IGHV1-18 109 | IGHV3-20 110 | IGHV3-21 111 | IGHV3-23 112 | IGHV1-24 113 | IGHV2-26 114 | IGHV4-28 115 | IGHV3-30 116 | IGHV4-31 117 | IGHV3-33 118 | IGHV4-34 119 | IGHV3-35 120 | IGHV3-38 121 | IGHV4-39 122 | IGHV3-43 123 | IGHV1-45 124 | IGHV1-46 125 | IGHV3-48 126 | IGHV3-49 127 | IGHV5-51 128 | IGHV3-53 129 | IGHV1-58 130 | IGHV4-59 131 | IGHV4-61 132 | IGHV3-64 133 | IGHV3-66 134 | IGHV1-69 135 | IGHV2-70D 136 | IGHV1-69-2 137 | IGHV1-69D 138 | IGHV2-70 139 | IGHV3-72 140 | IGHV3-73 141 | IGHV3-74 142 | IGHV7-81 143 | IGHV1OR15-9 144 | IGHV3OR15-7 145 | IGHD5OR15-5A 146 | IGHD4OR15-4A 147 | IGHD3OR15-3A 148 | IGHD2OR15-2A 149 | IGHD1OR15-1A 150 | IGHD5OR15-5B 151 | IGHD4OR15-4B 152 | IGHD3OR15-3B 153 | IGHD2OR15-2B 154 | IGHD1OR15-1B 155 | AC135068.8 156 | AC135068.2 157 | IGHV1OR15-1 158 | IGHV4OR15-8 159 | IGHV3OR16-9 160 | IGHV2OR16-5 161 | IGHV3OR16-10 162 | IGHV3OR16-8 163 | IGHV3OR16-12 164 | IGHV3OR16-13 165 | IGHV1OR21-1 166 | IGLV4-69 167 | IGLV10-54 168 | IGLV8-61 169 | IGLV4-60 170 | IGLV6-57 171 | IGLV11-55 172 | IGLV5-52 173 | IGLV1-51 174 | IGLV1-50 175 | IGLV9-49 176 | IGLV5-48 177 | IGLV1-47 178 | IGLV7-46 179 | IGLV5-45 180 | IGLV1-44 181 | IGLV7-43 182 | IGLV1-40 183 | IGLV5-37 184 | IGLV1-36 185 | IGLV2-33 186 | IGLV3-32 187 | IGLV3-27 188 | IGLV3-25 189 | IGLV2-23 190 | IGLV3-22 191 | IGLV3-21 192 | IGLV3-19 193 | IGLV2-18 194 | IGLV3-16 195 | IGLV2-14 196 | IGLV3-12 197 | IGLV2-11 198 | IGLV3-10 199 | IGLV3-9 200 | IGLV2-8 201 | IGLV4-3 202 | IGLV3-1 203 | IGLJ1 204 | IGLC1 205 | IGLJ2 206 | IGLC2 207 | IGLJ3 208 | IGLC3 209 | IGLJ4 210 | IGLJ5 211 | IGLJ6 212 | IGLJ7 213 | IGLC7 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Princeton University 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CalicoST 2 | 3 |

[figure: docs/_static/img/overview4_combine.png, CalicoST overview] 4 | 5 |

6 | 7 | CalicoST is a probabilistic model that infers allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics. CalicoST has the following key features: 8 | 1. Identifies allele-specific integer copy numbers for each transcribed region, revealing events such as copy neutral loss of heterozygosity (CNLOH) and mirrored subclonal CNAs that are invisible to total copy number analysis. 9 | 2. Assigns each spot a clone label indicating whether the spot is primarily normal cells or a cancer clone with an aberrant copy number profile. 10 | 3. Infers a phylogeny relating the identified cancer clones as well as a phylogeography that combines genetic evolution and spatial dissemination of clones. 11 | 4. Handles normal cell admixture in SRT technologies that are not at single-cell resolution (e.g. 10x Genomics Visium) to infer more accurate allele-specific copy numbers and cancer clones. 12 | 5. Simultaneously analyzes multiple regions or aligned SRT slices from the same tumor. 13 | 14 | # System requirements 15 | The package has been tested on the following Linux operating systems: Springdale Open Enterprise 9.2 (Parma) and CentOS Linux 7 (Core). 16 | 17 | # Installation 18 | ## Minimum installation 19 | First set up a conda environment from the `environment.yml` file: 20 | ``` 21 | git clone https://github.com/raphael-group/CalicoST.git 22 | cd CalicoST 23 | conda env create -f environment.yml --name calicost_env 24 | ``` 25 | 26 | 27 | Then install CalicoST using pip: 28 | ``` 29 | conda activate calicost_env 30 | pip install -e . 31 | ``` 32 | 33 | Setting up the conda environment takes around 15 minutes on an HPC head node. 34 | 35 | ## Additional installation for SNP parsing 36 | CalicoST requires allele count matrices of reference-phased A and B alleles to infer allele-specific CNAs, and provides a snakemake pipeline for obtaining the required matrices from a BAM file. Run the following commands in the CalicoST directory to install the additional package [Eagle2](https://alkesgroup.broadinstitute.org/Eagle/) used by the snakemake preprocessing pipeline. 37 | 38 | ``` 39 | mkdir external 40 | wget --directory-prefix=external https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz 41 | tar -xzf external/Eagle_v2.4.1.tar.gz -C external 42 | ``` 43 | 44 | ## Additional installation for reconstructing phylogeny 45 | Based on the cancer clones and allele-specific CNAs inferred by CalicoST, we apply Startle to reconstruct a phylogenetic tree of the clones. Install Startle by 46 | ``` 47 | git clone --recurse-submodules https://github.com/raphael-group/startle.git 48 | cd startle 49 | mkdir build; cd build 50 | cmake -DLIBLEMON_ROOT=<lemon path>\ 51 | -DCPLEX_INC_DIR=<cplex include path>\ 52 | -DCPLEX_LIB_DIR=<cplex library path>\ 53 | -DCONCERT_INC_DIR=<concert include path>\ 54 | -DCONCERT_LIB_DIR=<concert library path>\ 55 | .. 56 | make 57 | ``` 58 | 59 | 60 | # Getting started 61 | ### Preprocessing: genotyping and reference-based phasing 62 | To infer allele-specific CNAs, we generate allele count matrices in this preprocessing step. We follow the pipeline recommended by [Numbat](https://kharchenkolab.github.io/numbat/), which was designed for inferring clones and CNAs from scRNA-seq data: genotyping from the BAM file with cellsnp-lite (included in the conda environment), followed by reference-based phasing with Eagle2. Download the following panels for genotyping and reference-based phasing. 63 | * [SNP panel](https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz) - 0.5GB in size.
You can also choose other SNP panels from the [cellsnp-lite webpage](https://cellsnp-lite.readthedocs.io/en/latest/main/data.html#data-list-of-common-snps). 64 | * [Phasing panel](http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip) - 9.0GB in size. Unzip the panel after downloading. 65 | 66 | Replace the following paths in `config.yaml`: 67 | * `region_vcf`: Replace with the path of the downloaded SNP panel. 68 | * `phasing_panel`: Replace with the unzipped directory of the downloaded phasing panel. 69 | * `spaceranger_dir`: Replace with the spaceranger directory of your Visium data, which should contain the BAM file `possorted_genome_bam.bam`. 70 | * `output_snpinfo`: Replace with the desired output directory. 71 | * Replace `calicost_dir` and `eagledir` with the paths to the cloned CalicoST directory and the downloaded Eagle2 directory. 72 | 73 | Then you can run the preprocessing pipeline by 74 | ``` 75 | snakemake --cores <number of threads> --configfile config.yaml --snakefile calicost.smk all 76 | ``` 77 | 78 | ### Inferring tumor purity per spot (optional) 79 | Replace the paths in the parameter configuration file `configuration_purity` with the corresponding data/reference file paths and run 80 | ``` 81 | OMP_NUM_THREADS=1 python <calicost_dir>/src/calicost/estimate_tumor_proportion.py -c configuration_purity 82 | ``` 83 | 84 | ### Inferring clones and allele-specific CNAs 85 | Replace the paths in the parameter configuration file `configuration_cna` with the corresponding data/reference file paths and run 86 | ``` 87 | OMP_NUM_THREADS=1 python <calicost_dir>/src/calicost/calicost_main.py -c configuration_cna 88 | ``` 89 | 90 | When jointly inferring clones and CNAs across multiple SRT slices, prepare a tab-separated table with the following columns (see [`examples/example_input_filelist`](https://github.com/raphael-group/CalicoST/blob/main/examples/example_input_filelist) as an example): 91 | path to BAM file | sample ID | path to Spaceranger outs 92 | Modify `configuration_cna_multi` with the path to this table and run 93 | ``` 94 | OMP_NUM_THREADS=1 python <calicost_dir>/src/calicost/calicost_main.py -c configuration_cna_multi 95 | ``` 96 | 97 | ### Reconstructing phylogeography 98 | Based on the inferred clones and allele-specific CNAs, reconstruct the phylogeography by 99 | ``` 100 | python <calicost_dir>/src/calicost/phylogeny_startle.py -c <...> -s <...> -o <...> 101 | ``` 102 | 103 | 104 | # Tutorials 105 | Check out our [readthedocs](https://calicost.readthedocs.io/en/latest/) for the following tutorials: 106 | 1. [Inferring clones and allele-specific CNAs on simulated data](https://calicost.readthedocs.io/en/latest/notebooks/tutorials/simulated_data_tutorial.html) 107 | The simulated count matrices and parameter configuration file are available from [`examples/simulated_example.tar.gz`](https://github.com/raphael-group/CalicoST/blob/main/examples/simulated_example.tar.gz). CalicoST takes about 2h to finish on this example. 108 | 109 | 2. [Inferring tumor purity, clones, allele-specific CNAs, and phylogeography on prostate cancer data](https://calicost.readthedocs.io/en/latest/notebooks/tutorials/prostate_tutorial.html) 110 | This sample contains five slices and over 10000 spots; CalicoST takes about 9h to jointly infer CNAs and cancer clones across the slices. 111 | 112 | 115 | 116 | 132 | 133 | ### Understanding the output 134 | The above CalicoST run creates a folder `calicost` in the directory of the downloaded example data. Within this folder, each random initialization of CalicoST generates a subdirectory `calicost/clone*`. 135 | 136 | CalicoST generates the following key files for each random initialization: 137 | * clone_labels.tsv: The inferred clone labels for each spot.
138 | * cnv_seglevel.tsv: Allele-specific copy numbers of each clone for each genome segment. 139 | * cnv_genelevel.tsv: The allele-specific copy numbers projected from genome segments onto the covered genes. 140 | * cnv_diploid_seglevel.tsv, cnv_triploid_seglevel.tsv, cnv_tetraploid_seglevel.tsv, cnv_diploid_genelevel.tsv, cnv_triploid_genelevel.tsv, cnv_tetraploid_genelevel.tsv: Allele-specific copy numbers when enforcing a ploidy for each genome segment or each gene. 141 | 142 | See the following examples of the key files; a short Python snippet for loading them follows the color legend below. 143 | ``` 144 | head -10 calicost/clone3_rectangle0_w1.0/clone_labels.tsv 145 | BARCODES clone_label 146 | spot_0 2 147 | spot_1 2 148 | spot_2 2 149 | spot_3 2 150 | spot_4 2 151 | spot_5 2 152 | spot_6 2 153 | spot_7 2 154 | spot_8 0 155 | ``` 156 | 157 | ``` 158 | head -10 calicost/clone3_rectangle0_w1.0/cnv_seglevel.tsv 159 | CHR START END clone0 A clone0 B clone1 A clone1 B clone2 A clone2 B 160 | 1 1001138 1616548 1 1 1 1 1 1 161 | 1 1635227 2384877 1 1 1 1 1 1 162 | 1 2391775 6101016 1 1 1 1 1 1 163 | 1 6185020 6653223 1 1 1 1 1 1 164 | 1 6785454 7780639 1 1 1 1 1 1 165 | 1 7784320 8020748 1 1 1 1 1 1 166 | 1 8026738 9271273 1 1 1 1 1 1 167 | 1 9292894 10375267 1 1 1 1 1 1 168 | 1 10398592 11922488 1 1 1 1 1 1 169 | ``` 170 | 171 | ``` 172 | head -10 calicost/clone3_rectangle0_w1.0/cnv_genelevel.tsv 173 | gene clone0 A clone0 B clone1 A clone1 B clone2 A clone2 B 174 | A1BG 1 1 1 1 1 1 175 | A1CF 1 1 1 1 1 1 176 | A2M 1 1 1 1 1 1 177 | A2ML1-AS1 1 1 1 1 1 1 178 | AACS 1 1 1 1 1 1 179 | AADAC 1 1 1 1 1 1 180 | AADACL2-AS1 1 1 1 1 1 1 181 | AAK1 1 1 1 1 1 1 182 | AAMP 1 1 1 1 1 1 183 | ``` 184 | 185 | CalicoST produces the following plots for visualizing the inferred cancer clones in space and the allele-specific copy number profiles for each random initialization. 186 | * plots/clone_spatial.pdf: The spatial distribution of the inferred cancer clones and normal regions (grey color, clone 0 by default). 187 | * plots/rdr_baf_defaultcolor.pdf: The read depth ratio (RDR) and B allele frequency (BAF) along the genome for each clone. A higher RDR indicates a higher total copy number, and a BAF deviating from 0.5 indicates allele imbalance due to allele-specific CNAs. 188 | * plots/acn_genome.pdf: The default allele-specific copy numbers along the genome. 189 | * plots/acn_genome_diploid.pdf, plots/acn_genome_triploid.pdf, plots/acn_genome_tetraploid.pdf: Allele-specific copy numbers when enforcing a ploidy. 190 | 191 | The allele-specific copy number plots use the following color legend. 192 |

[figure: docs/_static/img/acn_color_palette.png, allele-specific copy number color legend] 193 | 194 |
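To pull these outputs into downstream analysis, the key tables load directly with pandas. Below is a minimal sketch, assuming the example output directory shown above and that the tables are tab-separated with the headers shown; adjust the paths and clone column names to your own run.
```python
import pandas as pd

# Output directory of one random initialization (example path from above).
outdir = "calicost/clone3_rectangle0_w1.0"

# Spot-to-clone assignments: columns BARCODES and clone_label.
clone_labels = pd.read_csv(f"{outdir}/clone_labels.tsv", sep="\t", index_col=0)
print(clone_labels["clone_label"].value_counts())

# Gene-level allele-specific copy numbers: one "<clone> A"/"<clone> B" column pair per clone.
acn = pd.read_csv(f"{outdir}/cnv_genelevel.tsv", sep="\t", index_col=0)

# Genes with allelic imbalance in one clone, e.g. CNLOH shows as (A, B) = (2, 0).
imbalanced = acn[acn["clone2 A"] != acn["clone2 B"]]
print(imbalanced.head())
```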

195 | 196 | 197 | # Software dependencies 198 | CalicoST uses the following command-line packages and Python libraries for extracting the BAF information: 199 | * samtools 200 | * cellsnp-lite 201 | * Eagle2 202 | * pysam 203 | * snakemake 204 | 205 | CalicoST uses the following packages for the remaining steps to infer allele-specific copy numbers and cancer clones: 206 | * numpy 207 | * scipy 208 | * pandas 209 | * scikit-learn 210 | * scanpy 211 | * anndata 212 | * numba 213 | * tqdm 214 | * statsmodels 215 | * networkx 216 | * matplotlib 217 | * seaborn 218 | * snakemake 219 | 220 | 221 | # Citations 222 | The CalicoST manuscript is available on bioRxiv. If you use CalicoST for your work, please cite our paper. 223 | ``` 224 | @article{ma2024inferring, 225 | title={Inferring allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics}, 226 | author={Ma, Cong and Balaban, Metin and Liu, Jingxian and Chen, Siqi and Ding, Li and Raphael, Benjamin}, 227 | journal={bioRxiv}, 228 | pages={2024--03}, 229 | year={2024}, 230 | publisher={Cold Spring Harbor Laboratory} 231 | } 232 | ``` -------------------------------------------------------------------------------- /calicost.smk: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy 4 | import calicost.arg_parse 5 | import calicost.parse_input 6 | 7 | 8 | rule all: 9 | input: 10 | f"{config['output_snpinfo']}/cell_snp_Aallele.npz", 11 | 12 | 13 | rule link_or_merge_bam: 14 | output: 15 | bam="{outputdir}/possorted_genome_bam.bam", 16 | bai="{outputdir}/possorted_genome_bam.bam.bai", 17 | barcodefile="{outputdir}/barcodes.txt", 18 | params: 19 | outputdir = "{outputdir}", 20 | samtools_sorting_mem=config['samtools_sorting_mem'] 21 | threads: 1 22 | log: 23 | "{outputdir}/logs/link_or_merge_bam.log" 24 | run: 25 | if "bamlist" in config: 26 | # merged BAM file 27 | shell(f"python {config['calicost_dir']}/utils/merge_bamfile.py -b {config['bamlist']} -o {params.outputdir}/ >> {log} 2>&1") 28 | shell(f"samtools sort -m {params.samtools_sorting_mem} -o {output.bam} {params.outputdir}/unsorted_possorted_genome_bam.bam >> {log} 2>&1") 29 | shell(f"samtools index {output.bam}") 30 | shell(f"rm -fr {params.outputdir}/unsorted_possorted_genome_bam.bam") 31 | 32 | # merged barcodes 33 | df_entries = pd.read_csv(config["bamlist"], sep='\t', index_col=None, header=None) 34 | df_barcodes = [] 35 | for i in range(df_entries.shape[0]): 36 | tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz", header=None, index_col=None) 37 | tmpdf.iloc[:,0] = [f"{x}_{df_entries.iloc[i,1]}" for x in tmpdf.iloc[:,0]] 38 | df_barcodes.append( tmpdf ) 39 | df_barcodes = pd.concat(df_barcodes, ignore_index=True) 40 | df_barcodes.to_csv(f"{output.barcodefile}", sep='\t', index=False, header=False) 41 | else: 42 | # BAM file 43 | assert "spaceranger_dir" in config 44 | print("softlink of possorted_genome_bam.bam") 45 | shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam {output.bam}") 46 | shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam.bai {output.bai}") 47 | # barcodes 48 | shell(f"gunzip -c {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz > {output.barcodefile}") 49 | 50 | 51 | 52 | rule genotype: 53 | input: 54 | barcodefile="{outputdir}/barcodes.txt", 55 | bam="{outputdir}/possorted_genome_bam.bam", 56 | bai="{outputdir}/possorted_genome_bam.bam.bai" 57 |
output: 58 | vcf="{outputdir}/genotyping/cellSNP.base.vcf.gz" 59 | params: 60 | outputdir="{outputdir}", 61 | region_vcf=config['region_vcf'] 62 | threads: config['nthreads_cellsnplite'] 63 | log: 64 | "{outputdir}/logs/genotyping.log" 65 | run: 66 | shell(f"mkdir -p {params.outputdir}/genotyping") 67 | command = f"cellsnp-lite -s {input.bam} " + \ 68 | f"-b {input.barcodefile} " + \ 69 | f"-O {params.outputdir}/genotyping/ " + \ 70 | f"-R {params.region_vcf} " + \ 71 | f"-p {threads} " + \ 72 | f"--minMAF 0 --minCOUNT 2 --UMItag {config['UMItag']} --cellTAG {config['cellTAG']} --gzip >> {log} 2>&1" 73 | print(command) 74 | shell(command) 75 | 76 | 77 | 78 | rule pre_phasing: 79 | input: 80 | vcf="{outputdir}/genotyping/cellSNP.base.vcf.gz" 81 | output: 82 | expand("{{outputdir}}/phasing/chr{chrname}.vcf.gz", chrname=config["chromosomes"]) 83 | params: 84 | outputdir="{outputdir}", 85 | threads: 1 86 | run: 87 | shell(f"mkdir -p {params.outputdir}/phasing") 88 | print(f"python {config['calicost_dir']}/utils/filter_snps_forphasing.py -c {params.outputdir}/genotyping -o {params.outputdir}/phasing") 89 | shell(f"python {config['calicost_dir']}/utils/filter_snps_forphasing.py -c {params.outputdir}/genotyping -o {params.outputdir}/phasing") 90 | for chrname in config["chromosomes"]: 91 | shell(f"bgzip -f {params.outputdir}/phasing/chr{chrname}.vcf") 92 | shell(f"tabix -f {params.outputdir}/phasing/chr{chrname}.vcf.gz") 93 | 94 | 95 | rule phasing: 96 | input: 97 | vcf="{outputdir}/phasing/chr{chrname}.vcf.gz" 98 | output: 99 | "{outputdir}/phasing/chr{chrname}.phased.vcf.gz" 100 | params: 101 | outputdir="{outputdir}", 102 | chrname="{chrname}", 103 | threads: 2 104 | log: 105 | "{outputdir}/logs/phasing_chr{chrname}.log", 106 | run: 107 | command = f"{config['eagledir']}/eagle --numThreads {threads} --vcfTarget {input.vcf} " + \ 108 | f"--vcfRef {config['phasing_panel']}/chr{params.chrname}.genotypes.bcf " + \ 109 | f"--geneticMapFile={config['eagledir']}/tables/genetic_map_hg38_withX.txt.gz "+ \ 110 | f"--outPrefix {params.outputdir}/phasing/chr{params.chrname}.phased >> {log} 2>&1" 111 | shell(command) 112 | 113 | 114 | 115 | rule parse_final_snp: 116 | input: 117 | "{outputdir}/genotyping/cellSNP.base.vcf.gz", 118 | expand("{{outputdir}}/phasing/chr{chrname}.phased.vcf.gz", chrname=config["chromosomes"]), 119 | output: 120 | "{outputdir}/cell_snp_Aallele.npz", 121 | "{outputdir}/cell_snp_Ballele.npz", 122 | "{outputdir}/unique_snp_ids.npy" 123 | params: 124 | outputdir="{outputdir}", 125 | threads: 1 126 | log: 127 | "{outputdir}/logs/parse_final_snp.log" 128 | run: 129 | command = f"python {config['calicost_dir']}/utils/get_snp_matrix.py " + \ 130 | f"-c {params.outputdir}/genotyping -e {params.outputdir}/phasing -b {params.outputdir}/barcodes.txt -o {params.outputdir}/ >> {log} 2>&1" 131 | shell( command ) 132 | 133 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # path to executables or their parent directories 2 | calicost_dir: 3 | eagledir: 4 | 5 | # running parameters 6 | # samtools sort (only used when jointly calling from multiple slices) 7 | samtools_sorting_mem: "4G" 8 | # cellsnp-lite 9 | UMItag: "Auto" 10 | cellTAG: "CB" 11 | nthreads_cellsnplite: 20 12 | region_vcf: 13 | # Eagle phasing 14 | phasing_panel: 15 | chromosomes: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] 16 | 17 | # input 18 | spaceranger_dir: 19 |
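# Note: spaceranger_dir must contain possorted_genome_bam.bam (and its .bai index)
# and filtered_feature_bc_matrix/barcodes.tsv.gz; to jointly call SNPs from multiple
# slices, provide a "bamlist" table instead of spaceranger_dir (see calicost.smk).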
20 | # output 21 | output_snpinfo: 22 | -------------------------------------------------------------------------------- /configuration_cna: -------------------------------------------------------------------------------- 1 | 2 | spaceranger_dir : 3 | snp_dir : 4 | output_dir : 5 | 6 | # supporting files and preprocessing arguments 7 | geneticmap_file : /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz 8 | hgtable_file : /GRCh38_resources/hgTables_hg38_gencode.txt 9 | normalidx_file : None 10 | tumorprop_file : None 11 | supervision_clone_file : None 12 | filtergenelist_file : /GRCh38_resources/ig_gene_list.txt 13 | filterregion_file : /GRCh38_resources/HLA_regions.bed 14 | secondary_min_umi : 300 15 | bafonly : False 16 | 17 | # phase switch probability 18 | nu : 1.0 19 | logphase_shift : -2.0 20 | npart_phasing : 3 21 | 22 | # HMRF configurations 23 | n_clones : 3 24 | n_clones_rdr : 2 25 | min_spots_per_clone : 100 26 | min_avgumi_per_clone : 10 27 | maxspots_pooling : 7 28 | tumorprop_threshold : 0.5 29 | max_iter_outer : 20 30 | nodepotential : weighted_sum 31 | initialization_method : rectangle 32 | num_hmrf_initialization_start : 0 33 | num_hmrf_initialization_end : 1 34 | spatial_weight : 1.0 35 | construct_adjacency_method : hexagon 36 | construct_adjacency_w : 1.0 37 | 38 | # HMM configurations 39 | n_states : 7 40 | params : smp 41 | t : 1-1e-5 42 | t_phaseing : 0.9999 43 | fix_NB_dispersion : False 44 | shared_NB_dispersion : True 45 | fix_BB_dispersion : False 46 | shared_BB_dispersion : True 47 | max_iter : 30 48 | tol : 0.0001 49 | gmm_random_state : 0 50 | np_threshold : 1.0 51 | np_eventminlen : 10 52 | 53 | # integer copy number 54 | nonbalance_bafdist : 1.0 55 | nondiploid_rdrdist : 10.0 56 | 57 | -------------------------------------------------------------------------------- /configuration_cna_multi: -------------------------------------------------------------------------------- 1 | 2 | input_filelist: 3 | snp_dir : 4 | output_dir : 5 | 6 | # supporting files and preprocessing arguments 7 | geneticmap_file : /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz 8 | hgtable_file : /GRCh38_resources/hgTables_hg38_gencode.txt 9 | normalidx_file : None 10 | tumorprop_file : None 11 | alignment_files : 12 | supervision_clone_file : None 13 | filtergenelist_file : /GRCh38_resources/ig_gene_list.txt 14 | filterregion_file : /GRCh38_resources/HLA_regions.bed 15 | secondary_min_umi : 300 16 | bafonly : False 17 | 18 | # phase switch probability 19 | nu : 1.0 20 | logphase_shift : -2.0 21 | npart_phasing : 3 22 | 23 | # HMRF configurations 24 | n_clones : 3 25 | n_clones_rdr : 2 26 | min_spots_per_clone : 100 27 | min_avgumi_per_clone : 10 28 | maxspots_pooling : 7 29 | tumorprop_threshold : 0.5 30 | max_iter_outer : 20 31 | nodepotential : weighted_sum 32 | initialization_method : rectangle 33 | num_hmrf_initialization_start : 0 34 | num_hmrf_initialization_end : 1 35 | spatial_weight : 1.0 36 | construct_adjacency_method : hexagon 37 | construct_adjacency_w : 1.0 38 | 39 | # HMM configurations 40 | n_states : 7 41 | params : smp 42 | t : 1-1e-5 43 | t_phaseing : 0.9999 44 | fix_NB_dispersion : False 45 | shared_NB_dispersion : True 46 | fix_BB_dispersion : False 47 | shared_BB_dispersion : True 48 | max_iter : 30 49 | tol : 0.0001 50 | gmm_random_state : 0 51 | np_threshold : 1.0 52 | np_eventminlen : 10 53 | 54 | # integer copy number 55 | nonbalance_bafdist : 1.0 56 | nondiploid_rdrdist : 10.0 57 | 58 | 
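# Note: input_filelist points to a tab-separated table with one row per slice and
# three columns: path to possorted_genome_bam.bam, sample ID, path to the
# spaceranger outs/ directory. See examples/example_input_filelist for a concrete example.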
-------------------------------------------------------------------------------- /configuration_purity: -------------------------------------------------------------------------------- 1 | 2 | spaceranger_dir : 3 | snp_dir : 4 | output_dir : 5 | 6 | # supporting files and preprocessing arguments 7 | geneticmap_file : /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz 8 | hgtable_file : /GRCh38_resources/hgTables_hg38_gencode.txt 9 | normalidx_file : None 10 | tumorprop_file : None 11 | alignment_files : 12 | supervision_clone_file : None 13 | filtergenelist_file : /GRCh38_resources/ig_gene_list.txt 14 | filterregion_file : /GRCh38_resources/HLA_regions.bed 15 | secondary_min_umi : 400 16 | bafonly : False 17 | 18 | # phase switch probability 19 | nu : 1.0 20 | logphase_shift : -2.0 21 | npart_phasing : 3 22 | 23 | # HMRF configurations 24 | n_clones : 5 25 | n_clones_rdr : 2 26 | min_spots_per_clone : 100 27 | min_avgumi_per_clone : 10 28 | maxspots_pooling : 19 29 | tumorprop_threshold : 0.5 30 | max_iter_outer : 20 31 | nodepotential : weighted_sum 32 | initialization_method : rectangle 33 | num_hmrf_initialization_start : 0 34 | num_hmrf_initialization_end : 1 35 | spatial_weight : 1.0 36 | construct_adjacency_method : hexagon 37 | construct_adjacency_w : 1.0 38 | 39 | # HMM configurations 40 | n_states : 7 41 | params : smp 42 | t : 1-1e-4 43 | t_phaseing : 0.9999 44 | fix_NB_dispersion : False 45 | shared_NB_dispersion : True 46 | fix_BB_dispersion : False 47 | shared_BB_dispersion : True 48 | max_iter : 30 49 | tol : 0.0001 50 | gmm_random_state : 0 51 | np_threshold : 1.0 52 | np_eventminlen : 10 53 | 54 | # integer copy number 55 | nonbalance_bafdist : 1.0 56 | nondiploid_rdrdist : 10.0 57 | 58 | -------------------------------------------------------------------------------- /docs/_ext/typed_returns.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Iterable, Iterator, List 3 | 4 | from sphinx.application import Sphinx 5 | from sphinx.ext.napoleon import NumpyDocstring 6 | 7 | 8 | def _process_return(lines: Iterable[str]) -> Iterator[str]: 9 | for line in lines: 10 | m = re.fullmatch(r"(?P<param>\w+)\s+:\s+(?P<type>[\w.]+)", line) 11 | if m: 12 | # Once this is in scanpydoc, we can use the fancy hover stuff 13 | yield f'**{m["param"]}** : :class:`~{m["type"]}`' 14 | else: 15 | yield line 16 | 17 | 18 | def _parse_returns_section(self: NumpyDocstring, section: str) -> list[str]: 19 | lines_raw = list(_process_return(self._dedent(self._consume_to_next_section()))) 20 | lines: list[str] = self._format_block(":returns: ", lines_raw) 21 | if lines and lines[-1]: 22 | lines.append("") 23 | return lines 24 | 25 | 26 | def setup(app: Sphinx) -> None: 27 | NumpyDocstring._parse_returns_section = _parse_returns_section 28 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .small { 2 | font-size: 55%; 3 | } 4 | 5 | div.version { 6 | color: #FFD92C!important; 7 | } 8 | 9 | .wy-nav-side { 10 | background: #242335; 11 | } 12 | 13 | .wy-side-nav-search { 14 | background-color: #242335; 15 | } 16 | 17 | .wy-side-nav-search input[type="text"] { 18 | border-radius: 6px!important; 19 | } 20 | 21 | .wy-nav-content { 22 | max-width: 950px; 23 | } 24 | 25 | .wy-menu-vertical a { 26 | color: #eceef4; 27 | } 28 | 29 | .wy-menu-vertical li.current { 30 | background: #f1f5fb; 31 | } 32 | 33 |
.wy-menu-vertical li.toctree-l2.current > a { 34 | background: #34377d2e; 35 | } 36 | 37 | .wy-menu-vertical li.toctree-l2.current li.toctree-l3 > a { 38 | background: #34377d4a; 39 | } 40 | 41 | .wy-menu-vertical li.toctree-l3.current li.toctree-l4 > a { 42 | background: #34377d7d; 43 | } 44 | 45 | .wy-menu-vertical a:hover { 46 | background-color: #6b86b0; 47 | } 48 | 49 | .wy-menu-vertical li.current a:hover { 50 | background: #bdcde6a3; 51 | } 52 | 53 | a { 54 | color: #5B64B1; 55 | } 56 | 57 | .rst-content .viewcode-link { 58 | color: #7013e1d9; 59 | } 60 | 61 | .highlight { 62 | background: #f1f5fb!important; 63 | } 64 | 65 | .rst-content div[class^="highlight"] { 66 | border: 1px solid #e4eaf2; 67 | } 68 | 69 | .wy-menu-vertical p.caption { 70 | color: #FFD92C; 71 | } 72 | 73 | div.output_subarea.output_html.rendered_html.output_result{ 74 | overflow: auto; 75 | } 76 | 77 | /* function/class top bar */ 78 | html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) > dt { 79 | color: #404040; 80 | border-top: solid 4px #7013e1d9; 81 | background: #FFD833A8; 82 | } 83 | 84 | /* class params */ 85 | html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list) > dt { 86 | color: #404040; 87 | border-left: solid 4px #7013e1d9; 88 | background: #FFD8338F; 89 | } 90 | 91 | /* the other elements, but more specific - leave them be */ 92 | code.docutils.literal.notranslate > span[class="pre"] { 93 | font-weight: bold; 94 | color: #404040; 95 | } 96 | 97 | /* odd rows in API */ 98 | .rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td { 99 | background-color: #f6f6f3; 100 | } 101 | 102 | .rst-content div[class^="highlight"] pre { 103 | padding: 8px; 104 | } 105 | 106 | .rst-content .seealso { 107 | background: #fafae2!important; 108 | } 109 | 110 | .rst-content .seealso .admonition-title { 111 | background: #7013e1d9!important; 112 | } 113 | -------------------------------------------------------------------------------- /docs/_static/css/dataframe.css: -------------------------------------------------------------------------------- 1 | /* Pandas dataframe css */ 2 | /* Taken from: https://github.com/spatialaudio/nbsphinx/blob/fb3ba670fc1ba5f54d4c487573dbc1b4ecf7e9ff/src/nbsphinx.py#L587-L619 */ 3 | /* modified margin-left */ 4 | 5 | table.dataframe { 6 | border: none !important; 7 | border-collapse: collapse; 8 | border-spacing: 0; 9 | border-color: transparent; 10 | color: black; 11 | font-size: 12px; 12 | table-layout: fixed; 13 | margin-left: 0!important; 14 | } 15 | 16 | table.dataframe thead { 17 | border-bottom: 1px solid black; 18 | vertical-align: bottom; 19 | } 20 | 21 | table.dataframe tr, 22 | table.dataframe th, 23 | table.dataframe td { 24 | text-align: right; 25 | vertical-align: middle; 26 | padding: 0.5em 0.5em; 27 | line-height: normal; 28 | white-space: normal; 29 | max-width: none; 30 | border: none; 31 | } 32 | 33 | table.dataframe th { 34 | font-weight: bold; 35 | } 36 | 37 | table.dataframe tbody tr:nth-child(odd) { 38 | background: #f5f5f5; 39 | } 40 | 41 | table.dataframe tbody tr:hover { 42 | background: rgba(66, 165, 245, 0.2); 43 | } 44 | -------------------------------------------------------------------------------- /docs/_static/css/nbsphinx.css: -------------------------------------------------------------------------------- 1 | div.nbinput.container div.prompt, 2 | div.nboutput.container div.prompt { 3 | 
display: none; 4 | } 5 | 6 | div.nbinput.container div.prompt > div.highlight, 7 | div.nboutput.container div.prompt > div.highlight { 8 | display: none; 9 | } 10 | 11 | div.nbinput.container div.input_area div[class*="highlight"] > pre, 12 | div.nboutput.container div.output_area div[class*="highlight"] > pre { 13 | padding: 8px!important; 14 | } 15 | 16 | div.nboutput.container div.output_area > div[class^="highlight"] { 17 | background-color: #fafae2!important; 18 | } 19 | 20 | .rst-content .output_area img { 21 | max-width: unset; 22 | width: 100% !important; 23 | height: auto !important; 24 | } 25 | -------------------------------------------------------------------------------- /docs/_static/css/sphinx_gallery.css: -------------------------------------------------------------------------------- 1 | #graph, #image, #core-tutorials, #external-tutorials, #gallery { 2 | margin-bottom: 1em; 3 | } 4 | 5 | div.sphx-glr-download a { 6 | background-color: #FFD92C9E!important; 7 | background-image: none!important; 8 | border-radius: 2px!important; 9 | border: 1px solid #f4c200!important; 10 | color: #404040!important; 11 | font-weight: bold !important; 12 | padding: 0.1cm!important; 13 | text-align: center!important; 14 | } 15 | 16 | 17 | div.sphx-glr-download a[href$=".py"] { 18 | display: none!important; 19 | } 20 | 21 | div.sphx-glr-example-title div[class="highlight"] { 22 | background-color: #F5F5F5; 23 | border: none; 24 | } 25 | 26 | /* notebook output cell */ 27 | .sphx-glr-script-out .highlight pre { 28 | background: #FDFFD9!important; 29 | } 30 | 31 | p.sphx-glr-script-out { 32 | display: none !important; 33 | } 34 | 35 | div.sphx-glr-download p { 36 | margin: 0!important; 37 | width: auto!important; 38 | } 39 | 40 | .sphx-glr-script-out { 41 | color: #404040 !important; 42 | margin: -24px 0px 0px 0px !important; 43 | } 44 | 45 | p.sphx-glr-signature { 46 | display: none!important; 47 | } 48 | 49 | div.sphx-glr-download-link-note { 50 | display: none!important; 51 | } 52 | 53 | /* this gets rid of uneven vertical padding */ 54 | div.sphx-glr-download code.download { 55 | display: block !important; 56 | } 57 | 58 | .sphx-glr-thumbcontainer { 59 | background: none !important; 60 | border: 1px solid #7013e1d9!important; 61 | text-align: center !important; 62 | min-height: 220px !important; 63 | } 64 | 65 | .sphx-glr-thumbcontainer a.internal:hover { 66 | color: #7013e1d9!important; 67 | } 68 | 69 | .sphx-glr-thumbcontainer .headerlink { 70 | display: none !important; 71 | } 72 | 73 | div.sphx-glr-thumbcontainer span { 74 | font-style: normal !important; 75 | } 76 | 77 | p.sphx-glr-timing { 78 | margin: 0 !important; 79 | padding-top: 24px; 80 | border-top: 1px solid #000; 81 | } 82 | 83 | .sphx-glr-thumbcontainer:hover { 84 | box-shadow: 0 0 10px #7013e1d9!important; 85 | } 86 | 87 | /* sphinx-gallery inserts 2 <br> after _repr_html_, ignore the 1st one */ 88 | div[class="rendered_html"] + br { 89 | display: none!important; 90 | } 91 | 92 | /* remove `Jupyter notebook: ` from `Download Jupyter notebook: `*/ 93 | div.sphx-glr-download-jupyter code.xref.download.docutils.literal.notranslate > span:nth-child(2), 94 | div.sphx-glr-download-jupyter code.xref.download.docutils.literal.notranslate > span:nth-child(3) { 95 | display: none!important; 96 | } 97 | 98 | .sphx-glr-thumbcontainer a.internal { 99 | padding: 140px 10px 0!important; 100 | } 101 | 102 | div.binder-badge img { 103 | width: 120px; 104 | } 105 | -------------------------------------------------------------------------------- /docs/_static/img/acn_color_palette.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/_static/img/acn_color_palette.png -------------------------------------------------------------------------------- /docs/_static/img/overview4_combine.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/_static/img/overview4_combine.pdf -------------------------------------------------------------------------------- /docs/_static/img/overview4_combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/_static/img/overview4_combine.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | import os 9 | import sys 10 | from datetime import datetime 11 | 12 | # from importlib.metadata import metadata 13 | from pathlib import Path 14 | 15 | from sphinx.application import Sphinx 16 | 17 | HERE = Path(__file__).parent 18 | # sys.path.insert(0, str(HERE.parent.parent)) # this way, we don't have to install squidpy 19 | # sys.path.insert(0, os.path.abspath("_ext")) 20 | 21 | sys.path.insert(0, str(HERE / "_ext")) 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'CalicoST' 26 | author = 'Ma et al.' 27 | version = '1.0.0' 28 | copyright = f"{datetime.now():%Y}, raphael-lab" 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones.
35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.napoleon", 38 | "sphinx.ext.viewcode", 39 | "sphinx_autodoc_typehints", 40 | "sphinx.ext.intersphinx", 41 | "sphinx.ext.autosummary", 42 | "sphinx.ext.mathjax", 43 | "sphinxcontrib.bibtex", 44 | "sphinx_copybutton", 45 | "myst_nb", 46 | "nbsphinx", 47 | "typed_returns", 48 | "IPython.sphinxext.ipython_console_highlighting", 49 | ] 50 | intersphinx_mapping = dict( # noqa: C408 51 | python=("https://docs.python.org/3", None), 52 | numpy=("https://numpy.org/doc/stable/", None), 53 | statsmodels=("https://www.statsmodels.org/stable/", None), 54 | scipy=("https://docs.scipy.org/doc/scipy/", None), 55 | pandas=("https://pandas.pydata.org/pandas-docs/stable/", None), 56 | anndata=("https://anndata.readthedocs.io/en/stable/", None), 57 | scanpy=("https://scanpy.readthedocs.io/en/stable/", None), 58 | matplotlib=("https://matplotlib.org/stable/", None), 59 | seaborn=("https://seaborn.pydata.org/", None), 60 | networkx=("https://networkx.org/documentation/stable/", None), 61 | sklearn=("https://scikit-learn.org/stable/", None), 62 | numba=("https://numba.readthedocs.io/en/stable/", None), 63 | ete3=("http://etetoolkit.org/docs/latest/", None), 64 | ) 65 | 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | source_suffix = {".rst": "restructuredtext", ".ipynb": "myst-nb"} 69 | master_doc = "index" 70 | pygments_style = "sphinx" 71 | 72 | # myst 73 | nb_execution_mode = "off" 74 | myst_enable_extensions = [ 75 | "colon_fence", 76 | "dollarmath", 77 | "amsmath", 78 | ] 79 | myst_heading_anchors = 2 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | # This pattern also affects html_static_path and html_extra_path. 84 | exclude_patterns = [ 85 | "notebooks/README.rst", 86 | "notebooks/CONTRIBUTING.rst", 87 | "release/changelog/*", 88 | "**.ipynb_checkpoints", 89 | "build", 90 | ] 91 | suppress_warnings = ["download.not_readable", "git.too_shallow"] 92 | 93 | # -- Options for HTML output ------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 
97 | autosummary_generate = True 98 | autodoc_member_order = "groupwise" 99 | autodoc_typehints = "signature" 100 | autodoc_docstring_signature = True 101 | napoleon_google_docstring = False 102 | napoleon_numpy_docstring = True 103 | napoleon_include_init_with_doc = False 104 | napoleon_use_rtype = True 105 | napoleon_use_param = True 106 | todo_include_todos = False 107 | 108 | # bibliography 109 | bibtex_bibfiles = ["references.bib"] 110 | bibtex_reference_style = "author_year" 111 | bibtex_default_style = "alpha" 112 | 113 | # spelling 114 | spelling_lang = "en_US" 115 | spelling_warning = True 116 | spelling_word_list_filename = "spelling_wordlist.txt" 117 | spelling_add_pypi_package_names = True 118 | spelling_show_suggestions = True 119 | spelling_exclude_patterns = ["references.rst"] 120 | # see: https://pyenchant.github.io/pyenchant/api/enchant.tokenize.html 121 | spelling_filters = [ 122 | "enchant.tokenize.URLFilter", 123 | "enchant.tokenize.EmailFilter", 124 | "docs.source.utils.ModnameFilter", 125 | "docs.source.utils.SignatureFilter", 126 | "enchant.tokenize.MentionFilter", 127 | ] 128 | # see the solution from: https://github.com/sphinx-doc/sphinx/issues/7369 129 | linkcheck_ignore = [ 130 | # 403 Client Error 131 | "https://doi.org/10.1126/science.aar7042", 132 | "https://doi.org/10.1126/science.aau5324", 133 | "https://doi.org/10.1093/bioinformatics/btab164", 134 | "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2716260/", 135 | "https://raw.githubusercontent.com/scverse/squidpy/main/docs/_static/img/figure1.png", 136 | ] 137 | 138 | # Add any paths that contain custom static files (such as style sheets) here, 139 | # relative to this directory. They are copied after the builtin static files, 140 | # so a file named "default.css" will overwrite the builtin "default.css". 141 | html_theme = "sphinx_rtd_theme" 142 | html_static_path = ["_static"] 143 | # html_logo = "_static/img/gaston_logo_v2.png" 144 | html_theme_options = {"navigation_depth": 4, "logo_only": True} 145 | html_show_sphinx = False 146 | 147 | 148 | def setup(app: Sphinx) -> None: 149 | app.add_css_file("css/custom.css") 150 | app.add_css_file("css/sphinx_gallery.css") 151 | app.add_css_file("css/nbsphinx.css") 152 | app.add_css_file("css/dataframe.css") # had to add this manually -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | CalicoST - Inferring allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics 2 | ============================================================================================================================= 3 | 4 | .. image:: https://raw.githubusercontent.com/raphael-group/CalicoST/main/docs/_static/img/overview4_combine.png 5 | :alt: CalicoST overview 6 | :width: 800px 7 | :align: center 8 | 9 | CalicoST is a probabilistic model that infers allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics. CalicoST has the following key features: 10 | 1. Identifies allele-specific integer copy numbers for each transcribed region, revealing events such as copy neutral loss of heterozygosity (CNLOH) and mirrored subclonal CNAs that are invisible to total copy number analysis. 11 | 2. Assigns each spot a clone label indicating whether the spot is primarily normal cells or a cancer clone with an aberrant copy number profile. 12 | 3.
Infers a phylogeny relating the identified cancer clones as well as a phylogeography that combines genetic evolution and spatial dissemination of clones. 13 | 4. Handles normal cell admixture in SRT technologies that are not at single-cell resolution (e.g. 10x Genomics Visium) to infer more accurate allele-specific copy numbers and cancer clones. 14 | 5. Simultaneously analyzes multiple regions or aligned SRT slices from the same tumor. 15 | 16 | 17 | Installation 18 | ------------ 19 | Find the details of installation `here <https://calicost.readthedocs.io/en/latest/installation.html>`_. 20 | 21 | Getting started with CalicoST 22 | ----------------------------- 23 | Browse the tutorials to get started with CalicoST `here <https://calicost.readthedocs.io/en/latest/tutorials.html>`_. 24 | 25 | .. toctree:: 26 | :maxdepth: 1 27 | 28 | installation 29 | tutorials 30 | parameters 31 | references 32 | 33 | .. _github: https://github.com/raphael-group/CalicoST -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | Minimum installation 4 | -------------------- 5 | First set up a conda environment from the `environment.yml` file: 6 | 7 | .. code-block:: bash 8 | 9 | git clone https://github.com/raphael-group/CalicoST.git 10 | cd CalicoST 11 | conda env create -f environment.yml --name calicost_env 12 | 13 | 14 | Then install CalicoST using pip: 15 | 16 | .. code-block:: bash 17 | 18 | conda activate calicost_env 19 | pip install -e . 20 | 21 | 22 | Setting up the conda environment takes around 15 minutes on an HPC head node. 23 | 24 | Additional installation for SNP parsing 25 | --------------------------------------- 26 | CalicoST requires allele count matrices of reference-phased A and B alleles to infer allele-specific CNAs, and provides a snakemake pipeline for obtaining the required matrices from a BAM file. Run the following commands in the CalicoST directory to install the additional package `Eagle2 <https://alkesgroup.broadinstitute.org/Eagle/>`_ used by the snakemake preprocessing pipeline. 27 | 28 | .. code-block:: bash 29 | 30 | mkdir external 31 | wget --directory-prefix=external https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz 32 | tar -xzf external/Eagle_v2.4.1.tar.gz -C external 33 | 34 | 35 | Additional installation for reconstructing phylogeny 36 | ---------------------------------------------------- 37 | Based on the cancer clones and allele-specific CNAs inferred by CalicoST, we apply Startle to reconstruct a phylogenetic tree of the clones. Install Startle by 38 | 39 | .. code-block:: bash 40 | 41 | git clone --recurse-submodules https://github.com/raphael-group/startle.git 42 | cd startle 43 | mkdir build; cd build 44 | cmake -DLIBLEMON_ROOT=<lemon path>\ 45 | -DCPLEX_INC_DIR=<cplex include path>\ 46 | -DCPLEX_LIB_DIR=<cplex library path>\ 47 | -DCONCERT_INC_DIR=<concert include path>\ 48 | -DCONCERT_LIB_DIR=<concert library path>\ 49 | .. 50 | make 51 | 52 | 53 | Prepare reference files for SNP parsing 54 | --------------------------------------- 55 | We follow the pipeline recommended by `Numbat <https://kharchenkolab.github.io/numbat/>`_ for parsing SNP information from BAM file(s): genotyping from the BAM file with cellsnp-lite (included in the conda environment), followed by reference-based phasing with Eagle2. Download the following panels for genotyping and reference-based phasing; a scripted example follows the list. 56 | 57 | * `SNP panel <https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz>`_ - 0.5GB in size. You can also choose other SNP panels from the `cellsnp-lite webpage <https://cellsnp-lite.readthedocs.io/en/latest/main/data.html#data-list-of-common-snps>`_. 58 | * `Phasing panel <http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip>`_ - 9.0GB in size. Unzip the panel after downloading.
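For convenience, the two downloads can be scripted. A minimal sketch, assuming the panels go into a local ``references/`` directory (our choice; any location works as long as the paths in ``config.yaml`` point to it):

.. code-block:: bash

    mkdir -p references
    # SNP panel for cellsnp-lite genotyping (~0.5GB)
    wget --directory-prefix=references https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz
    # phasing panel for Eagle2 (~9.0GB); unzip it after downloading
    wget --directory-prefix=references http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
    unzip references/1000G_hg38.zip -d references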
59 | -------------------------------------------------------------------------------- /docs/parameters.rst: -------------------------------------------------------------------------------- 1 | Specification of running parameters of CalicoST 2 | =============================================== 3 | 4 | Supporting reference files 5 | -------------------------- 6 | geneticmap_file: str 7 | The path to the genetic map file. 8 | 9 | hgtable_file: str 10 | The path to the file with the locations of genes in the genome. This should be a tab-delimited file with the following columns: gene_name, chrom, cdsStart, cdsEnd. 11 | 12 | normalidx_file: str, optional 13 | The path to the file containing the indices of normal spots in the spatial transcriptomics data. Each line is a single index, without a header. 14 | 15 | tumorprop_file: str, optional 16 | The path to the inferred tumor proportions per spot. This should be a tab-delimited file with the following column names: barcode, Tumor. 17 | 18 | filtergenelist_file: str, optional 19 | The path to a file listing genes to exclude from CNA inference, based on prior knowledge. 20 | 21 | filterregion_file: str, optional 22 | The path to a BED file of genomic regions to exclude from CNA inference, e.g., HLA regions. 23 | 24 | 25 | Phasing parameters 26 | ------------------ 27 | logphase_shift: float, optional 28 | Adjustment to the strength of the Markov model self-transition in phasing. The higher the value, the higher the self-transition probability. Default is -2.0. 29 | 30 | secondary_min_umi: int, optional 31 | The minimum UMI count a genome segment must have in the pseudobulk across spots during genome segmentation. Default is 300. 32 | 33 | 34 | Clone inference parameters 35 | -------------------------- 36 | n_clones: int 37 | The number of clones to infer using only BAF signals. Default is 3. 38 | 39 | n_clones_rdr: int, optional 40 | The number of clones to refine for each BAF-identified clone using RDR and BAF signals. Default is 2. 41 | 42 | min_spots_per_clone: int, optional 43 | The minimum number of spots a clone must contain to be called. Default is 100. 44 | 45 | min_avgumi_per_clone: int, optional 46 | The minimum average UMI count required for a clone. Default is 10. 47 | 48 | nodepotential: str, optional 49 | One of the following two options: "max" or "weighted_sum". "max" refers to using the MLE decoding of the HMM in evaluating the probability of spots being in each clone. "weighted_sum" refers to using the full HMM posterior probabilities to evaluate the probability of spots being in each clone. Default is "weighted_sum". 50 | 51 | spatial_weight: float, optional 52 | The strength of spatial coherence in the HMRF. The higher the value, the stronger the spatial coherence. Default is 1.0. 53 | 54 | 55 | CNA inference parameters 56 | ------------------------ 57 | n_states: int 58 | The number of allele-specific copy number states in the HMM for CNA inference. 59 | 60 | t: float, optional 61 | The self-transition probability of the HMM. The higher the value, the higher the probability that adjacent genome segments are in the same CNA state. Default is 1-1e-5. 62 | 63 | max_iter: int, optional 64 | The number of Baum-Welch steps to perform in the HMM. Default is 30. 65 | 66 | tol: float, optional 67 | The convergence threshold to terminate Baum-Welch steps. Default is 1e-4.
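As a point of reference, these CNA inference parameters appear as plain-text entries in the configuration files shipped with the repository; the corresponding excerpt of ``configuration_cna`` reads:

.. code-block:: text

    # HMM configurations (excerpt from configuration_cna)
    n_states : 7
    t : 1-1e-5
    max_iter : 30
    tol : 0.0001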
68 | 69 | 70 | Merging clones with similar CNAs 71 | -------------------------------- 72 | np_threshold: float, optional 73 | The threshold on the Neyman-Pearson statistic used to decide whether two clones have distinct CNA events. The higher the value, the more easily two clones are merged. Default is 1.0. 74 | 75 | np_eventminlen: int, optional 76 | The minimum number of consecutive genome segments to be considered a CNA event. Default is 10. 77 | -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/references.rst -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 0 3 | :caption: Contents: 4 | 5 | Allele-specific CNAs and cancer clones on simulated data 6 | -------------------------------------------------------- 7 | notebooks/tutorials/simulated_data_tutorial.ipynb 8 | 9 | Cancer clones and phylogeography of a five-slice prostate cancer 10 | ---------------------------------------------------------------- 11 | notebooks/tutorials/prostate_tutorial.ipynb -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: calicost_env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python==3.10 8 | - numpy==1.24.4 9 | - scipy==1.11.3 10 | - samtools==1.18 11 | - bcftools==1.18 12 | - cellsnp-lite 13 | - snakemake 14 | - lemon 15 | -------------------------------------------------------------------------------- /examples/CalicoST_example.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/examples/CalicoST_example.tar.gz -------------------------------------------------------------------------------- /examples/example_input_filelist: -------------------------------------------------------------------------------- 1 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_2_visium/outs/possorted_genome_bam.bam H12 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_2_visium/outs/ 2 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_4_visium/outs/possorted_genome_bam.bam H14 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_4_visium/outs/ 3 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_5_visium/outs/possorted_genome_bam.bam H15 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_5_visium/outs/ 4 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_1_visium/outs/possorted_genome_bam.bam H21 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_1_visium/outs/ 5 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_5_visium/outs/possorted_genome_bam.bam H25 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_5_visium/outs/ 6 | --------------------------------------------------------------------------------
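Note on the filelist format: each row of ``example_input_filelist`` above has three whitespace-separated columns — the path to a slice's BAM file, a short sample label, and that slice's spaceranger output directory (the label ``H12``, for instance, appears to correspond to slice ``P1_H1_2``). A hypothetical two-slice filelist, with placeholder paths, would look like:

    /path/to/sliceA/outs/possorted_genome_bam.bam    A1    /path/to/sliceA/outs/
    /path/to/sliceB/outs/possorted_genome_bam.bam    B1    /path/to/sliceB/outs/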
/examples/prostate_example.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/examples/prostate_example.tar.gz -------------------------------------------------------------------------------- /examples/simulated_example.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/examples/simulated_example.tar.gz -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "CalicoST" 7 | version = "1.0.0" 8 | authors = [ 9 | { name="Cong Ma", email="congma@princeton.edu" }, 10 | { name="Metin Balaban", email="metin@princeton.edu" }, 11 | { name="Jingxian Liu", email="jingxian.liu@wustl.edu" }, 12 | { name="Siqi Chen", email="siqichen@wustl.edu" }, 13 | { name="Li Ding", email="lding@wustl.edu" }, 14 | { name="Ben Raphael", email="braphael@cs.princeton.edu" }, 15 | ] 16 | description = "Inferring allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics" 17 | readme = "README.md" 18 | requires-python = ">=3.8" 19 | classifiers = [ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: BSD License", 22 | "Operating System :: OS Independent", 23 | ] 24 | dependencies = [ 25 | 'numpy', 26 | 'scipy', 27 | 'pandas', 28 | 'scikit-learn', 29 | 'scanpy', 30 | 'anndata', 31 | 'numba', 32 | 'tqdm', 33 | 'statsmodels', 34 | 'networkx', 35 | 'matplotlib', 36 | 'seaborn', 37 | 'pysam', 38 | 'ete3' 39 | ] 40 | 41 | [project.optional-dependencies] 42 | docs = [ 43 | "ipython", 44 | "ipywidgets>=8.0.0", 45 | "sphinx>=5.3", 46 | "sphinx-autodoc-annotation", 47 | "sphinx-autodoc-typehints>=1.10.3", 48 | "sphinx_rtd_theme", 49 | "sphinxcontrib-bibtex>=2.3.0", 50 | "sphinxcontrib-spelling>=7.6.2", 51 | "nbsphinx>=0.8.1", 52 | "myst-nb>=0.17.1", 53 | "sphinx_copybutton>=0.5.0", 54 | ] 55 | 56 | [project.urls] 57 | "Homepage" = "https://github.com/raphael-group/CalicoST" 58 | 59 | [tool.setuptools.packages.find] 60 | where = ["src"] 61 | include = ["calicost*"] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name="calicost", 5 | version="v1.0.0", 6 | python_requires=">=3.8", 7 | packages=["calicost"], 8 | package_dir={"": "src"}, 9 | author="Cong Ma", 10 | author_email="congma@princeton.edu", 11 | description="Allele-specific CNAs and spatial cancer clone inference", 12 | long_description="CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data", 13 | url="https://github.com/raphael-group/CalicoST", 14 | install_requires=[ 15 | "numpy==1.24.4", 16 | "scipy==1.11.3", 17 | "pandas==2.1.1", 18 | "scikit-learn==1.3.2", 19 | "scanpy==1.9.6", 20 | "anndata==0.10.3", 21 | "numba==0.60.0", 22 | "tqdm==4.66.1", 23 | "statsmodels==0.14.0", 24 | "networkx==3.2.1", 25 | "matplotlib==3.7.3", 26 | "seaborn==0.12.2", 27 | "pysam==0.22.1", 28 | "ete3==3.1.3", 29 | "ipykernel", 30 | ], 31 | include_package_data=True, 32 | ) 33 | 
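A quick post-install sanity check — a sketch assuming the editable ``pip install -e .`` from the installation docs succeeded; the version string is defined in ``src/calicost/__init__.py`` just below:

.. code-block:: python

    import calicost

    # the package exposes its release tag; this prints 'v1.0.0' for this release
    print(calicost.__version__)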
-------------------------------------------------------------------------------- /src/calicost/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 'v1.0.0' 2 | -------------------------------------------------------------------------------- /src/calicost/allele_starch_generateconfig.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy 4 | import pandas as pd 5 | from pathlib import Path 6 | from sklearn.metrics import adjusted_rand_score 7 | import scanpy as sc 8 | import anndata 9 | import logging 10 | import copy 11 | from pathlib import Path 12 | import subprocess 13 | from calicost.hmm_NB_BB_phaseswitch import * 14 | from calicost.utils_distribution_fitting import * 15 | from calicost.hmrf import * 16 | from calicost.utils_IO import * 17 | 18 | 19 | def read_configuration_file(filename): 20 | ##### [Default settings] ##### 21 | config = { 22 | "spaceranger_dir" : None, 23 | "snp_dir" : None, 24 | "output_dir" : None, 25 | # supporting files and preprocessing arguments 26 | "hgtable_file" : None, 27 | "normalidx_file" : None, 28 | "tumorprop_file" : None, 29 | "supervision_clone_file" : None, 30 | "filtergenelist_file" : None, 31 | "filterregion_file" : None, 32 | "binsize" : 1, 33 | "rdrbinsize" : 1, 34 | # "secondbinning_min_umi" : 500, 35 | "max_nbins" : 1200, 36 | "avg_umi_perbinspot" : 1.5, 37 | "bafonly" : True, 38 | # phase switch probability 39 | "nu" : 1, 40 | "logphase_shift" : 1, 41 | "npart_phasing" : 2, 42 | # HMRF configurations 43 | "n_clones" : None, 44 | "n_clones_rdr" : 2, 45 | "min_spots_per_clone" : 100, 46 | "min_avgumi_per_clone" : 10, 47 | "maxspots_pooling" : 7, 48 | "tumorprop_threshold" : 0.5, 49 | "max_iter_outer" : 20, 50 | "nodepotential" : "max", # max or weighted_sum 51 | "initialization_method" : "rectangle", # rectangle or datadrive 52 | "num_hmrf_initialization_start" : 0, 53 | "num_hmrf_initialization_end" : 10, 54 | "spatial_weight" : 2.0, 55 | "construct_adjacency_method" : "hexagon", 56 | "construct_adjacency_w" : 1.0, 57 | # HMM configurations 58 | "n_states" : None, 59 | "params" : None, 60 | "t" : None, 61 | "t_phaseing" : 1-1e-4, 62 | "fix_NB_dispersion" : False, 63 | "shared_NB_dispersion" : True, 64 | "fix_BB_dispersion" : False, 65 | "shared_BB_dispersion" : True, 66 | "max_iter" : 30, 67 | "tol" : 1e-3, 68 | "gmm_random_state" : 0, 69 | "np_threshold" : 2.0, 70 | "np_eventminlen" : 10 71 | } 72 | 73 | argument_type = { 74 | "spaceranger_dir" : "str", 75 | "snp_dir" : "str", 76 | "output_dir" : "str", 77 | # supporting files and preprocessing arguments 78 | "hgtable_file" : "str", 79 | "normalidx_file" : "str", 80 | "tumorprop_file" : "str", 81 | "supervision_clone_file" : "str", 82 | "filtergenelist_file" : "str", 83 | "filterregion_file" : "str", 84 | "binsize" : "int", 85 | "rdrbinsize" : "int", 86 | # "secondbinning_min_umi" : "int", 87 | "max_nbins" : "int", 88 | "avg_umi_perbinspot" : "float", 89 | "bafonly" : "bool", 90 | # phase switch probability 91 | "nu" : "float", 92 | "logphase_shift" : "float", 93 | "npart_phasing" : "int", 94 | # HMRF configurations 95 | "n_clones" : "int", 96 | "n_clones_rdr" : "int", 97 | "min_spots_per_clone" : "int", 98 | "min_avgumi_per_clone" : "int", 99 | "maxspots_pooling" : "int", 100 | "tumorprop_threshold" : "float", 101 | "max_iter_outer" : "int", 102 | "nodepotential" : "str", 103 | "initialization_method" : "str", 104 | "num_hmrf_initialization_start" : "int", 105 |
"num_hmrf_initialization_end" : "int", 106 | "spatial_weight" : "float", 107 | "construct_adjacency_method" : "str", 108 | "construct_adjacency_w" : "float", 109 | # HMM configurations 110 | "n_states" : "int", 111 | "params" : "str", 112 | "t" : "eval", 113 | "t_phaseing" : "eval", 114 | "fix_NB_dispersion" : "bool", 115 | "shared_NB_dispersion" : "bool", 116 | "fix_BB_dispersion" : "bool", 117 | "shared_BB_dispersion" : "bool", 118 | "max_iter" : "int", 119 | "tol" : "float", 120 | "gmm_random_state" : "int", 121 | "np_threshold" : "float", 122 | "np_eventminlen" : "int" 123 | } 124 | 125 | ##### [ read configuration file to update settings ] ##### 126 | with open(filename, 'r') as fp: 127 | for line in fp: 128 | if line.strip() == "" or line[0] == "#": 129 | continue 130 | # strs = [x.replace(" ", "") for x in line.strip().split(":") if x != ""] 131 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 132 | assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" 133 | if strs[1].upper() == "NONE": 134 | config[strs[0]] = None 135 | elif argument_type[strs[0]] == "str": 136 | config[strs[0]] = strs[1] 137 | elif argument_type[strs[0]] == "int": 138 | config[strs[0]] = int(strs[1]) 139 | elif argument_type[strs[0]] == "float": 140 | config[strs[0]] = float(strs[1]) 141 | elif argument_type[strs[0]] == "eval": 142 | config[strs[0]] = eval(strs[1]) 143 | elif argument_type[strs[0]] == "bool": 144 | config[strs[0]] = (strs[1].upper() == "TRUE") 145 | elif argument_type[strs[0]] == "list_str": 146 | config[strs[0]] = strs[1].split(" ") 147 | # assertions 148 | assert not config["spaceranger_dir"] is None, "No spaceranger directory!" 149 | assert not config["snp_dir"] is None, "No SNP directory!" 150 | assert not config["output_dir"] is None, "No output directory!" 
151 | 152 | return config 153 | 154 | 155 | def write_config_file(outputfilename, config): 156 | list_argument_io = ["spaceranger_dir", 157 | "snp_dir", 158 | "output_dir"] 159 | list_argument_sup = ["hgtable_file", 160 | "normalidx_file", 161 | "tumorprop_file", 162 | "supervision_clone_file", 163 | "filtergenelist_file", 164 | "filterregion_file", 165 | "binsize", 166 | "rdrbinsize", 167 | # "secondbinning_min_umi", 168 | "max_nbins", 169 | "avg_umi_perbinspot", 170 | "bafonly"] 171 | list_argument_phase = ["nu", 172 | "logphase_shift", 173 | "npart_phasing"] 174 | list_argument_hmrf = ["n_clones", 175 | "n_clones_rdr", 176 | "min_spots_per_clone", 177 | "min_avgumi_per_clone", 178 | "maxspots_pooling", 179 | "tumorprop_threshold", 180 | "max_iter_outer", 181 | "nodepotential", 182 | "initialization_method", 183 | "num_hmrf_initialization_start", 184 | "num_hmrf_initialization_end", 185 | "spatial_weight", 186 | "construct_adjacency_method", 187 | "construct_adjacency_w"] 188 | list_argument_hmm = ["n_states", 189 | "params", 190 | "t", 191 | "t_phaseing", 192 | "fix_NB_dispersion", 193 | "shared_NB_dispersion", 194 | "fix_BB_dispersion", 195 | "shared_BB_dispersion", 196 | "max_iter", 197 | "tol", 198 | "gmm_random_state", 199 | "np_threshold", 200 | "np_eventminlen"] 201 | with open(outputfilename, 'w') as fp: 202 | # 203 | for k in list_argument_io: 204 | fp.write(f"{k} : {config[k]}\n") 205 | # 206 | fp.write("\n") 207 | fp.write("# supporting files and preprocessing arguments\n") 208 | for k in list_argument_sup: 209 | fp.write(f"{k} : {config[k]}\n") 210 | # 211 | fp.write("\n") 212 | fp.write("# phase switch probability\n") 213 | for k in list_argument_phase: 214 | fp.write(f"{k} : {config[k]}\n") 215 | # 216 | fp.write("\n") 217 | fp.write("# HMRF configurations\n") 218 | for k in list_argument_hmrf: 219 | fp.write(f"{k} : {config[k]}\n") 220 | # 221 | fp.write("\n") 222 | fp.write("# HMM configurations\n") 223 | for k in list_argument_hmm: 224 | fp.write(f"{k} : {config[k]}\n") 225 | 226 | 227 | def main(argv): 228 | template_configuration_file = argv[1] 229 | outputdir = argv[2] 230 | hmrf_seed_s = int(argv[3]) 231 | hmrf_seed_t = int(argv[4]) 232 | config = read_configuration_file(template_configuration_file) 233 | for r in range(hmrf_seed_s, hmrf_seed_t): 234 | config["num_hmrf_initialization_start"] = r 235 | config["num_hmrf_initialization_end"] = r+1 236 | write_config_file(f"{outputdir}/configfile{r}", config) 237 | 238 | 239 | if __name__ == "__main__": 240 | if len(sys.argv) > 1: 241 | main(sys.argv) -------------------------------------------------------------------------------- /src/calicost/arg_parse.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy 4 | import pandas as pd 5 | import logging 6 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 7 | logger = logging.getLogger() 8 | 9 | 10 | def load_default_config(): 11 | config_joint = { 12 | "input_filelist" : None, 13 | "alignment_files" : [] 14 | } 15 | config_single = { 16 | "spaceranger_dir" : None 17 | } 18 | config_shared = { 19 | "snp_dir" : None, 20 | "output_dir" : None, 21 | # supporting files and preprocessing arguments 22 | "geneticmap_file" : None, 23 | "hgtable_file" : None, 24 | "normalidx_file" : None, 25 | "tumorprop_file" : None, 26 | "supervision_clone_file" : None, 27 | "filtergenelist_file" : None, 28 | "filterregion_file" : None, 29 | 
"secondary_min_umi" : 300, 30 | "min_snpumi_perspot" : 50, 31 | 'min_percent_expressed_spots' : 0.005, 32 | "bafonly" : False, 33 | # phase switch probability 34 | "nu" : 1.0, 35 | "logphase_shift" : -2.0, 36 | "npart_phasing" : 3, 37 | # HMRF configurations 38 | "n_clones" : None, 39 | "n_clones_rdr" : 2, 40 | "min_spots_per_clone" : 100, 41 | "min_avgumi_per_clone" : 10, 42 | "maxspots_pooling" : 7, 43 | "tumorprop_threshold" : 0.5, 44 | "max_iter_outer" : 20, 45 | "nodepotential" : "weighted_sum", # max or weighted_sum 46 | "initialization_method" : "rectangle", # rectangle or datadrive 47 | "num_hmrf_initialization_start" : 0, 48 | "num_hmrf_initialization_end" : 10, 49 | "spatial_weight" : 1.0, 50 | "construct_adjacency_method" : "hexagon", 51 | "construct_adjacency_w" : 1.0, 52 | # HMM configurations 53 | "n_states" : None, 54 | "params" : "smp", 55 | "t" : 1-1e-5, 56 | "t_phaseing" : 1-1e-4, 57 | "fix_NB_dispersion" : False, 58 | "shared_NB_dispersion" : True, 59 | "fix_BB_dispersion" : False, 60 | "shared_BB_dispersion" : True, 61 | "max_iter" : 30, 62 | "tol" : 1e-4, 63 | "gmm_random_state" : 0, 64 | "np_threshold" : 1.0, 65 | "np_eventminlen" : 10, 66 | # integer copy number 67 | "nonbalance_bafdist" : 1.0, 68 | "nondiploid_rdrdist" : 10.0 69 | } 70 | 71 | argtype_joint = { 72 | "input_filelist" : "str", 73 | "alignment_files" : "list_str" 74 | } 75 | argtype_single = { 76 | "spaceranger_dir" : "str" 77 | } 78 | argtype_shared = { 79 | "snp_dir" : "str", 80 | "output_dir" : "str", 81 | # supporting files and preprocessing arguments 82 | "geneticmap_file" : "str", 83 | "hgtable_file" : "str", 84 | "normalidx_file" : "str", 85 | "tumorprop_file" : "str", 86 | "supervision_clone_file" : "str", 87 | "filtergenelist_file" : "str", 88 | "filterregion_file" : "str", 89 | "secondary_min_umi" : "int", 90 | "min_snpumi_perspot" : "int", 91 | 'min_percent_expressed_spots' : "float", 92 | "bafonly" : "bool", 93 | # phase switch probability 94 | "nu" : "float", 95 | "logphase_shift" : "float", 96 | "npart_phasing" : "int", 97 | # HMRF configurations 98 | "n_clones" : "int", 99 | "n_clones_rdr" : "int", 100 | "min_spots_per_clone" : "int", 101 | "min_avgumi_per_clone" : "int", 102 | "maxspots_pooling" : "int", 103 | "tumorprop_threshold" : "float", 104 | "max_iter_outer" : "int", 105 | "nodepotential" : "str", 106 | "initialization_method" : "str", 107 | "num_hmrf_initialization_start" : "int", 108 | "num_hmrf_initialization_end" : "int", 109 | "spatial_weight" : "float", 110 | "construct_adjacency_method" : "str", 111 | "construct_adjacency_w" : "float", 112 | # HMM configurations 113 | "n_states" : "int", 114 | "params" : "str", 115 | "t" : "eval", 116 | "t_phaseing" : "eval", 117 | "fix_NB_dispersion" : "bool", 118 | "shared_NB_dispersion" : "bool", 119 | "fix_BB_dispersion" : "bool", 120 | "shared_BB_dispersion" : "bool", 121 | "max_iter" : "int", 122 | "tol" : "float", 123 | "gmm_random_state" : "int", 124 | "np_threshold" : "float", 125 | "np_eventminlen" : "int", 126 | # integer copy number 127 | "nonbalance_bafdist" : "float", 128 | "nondiploid_rdrdist" : "float" 129 | } 130 | 131 | category_names = ["", "# supporting files and preprocessing arguments", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"] 132 | category_elements = [["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], \ 133 | ["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", 
"filtergenelist_file", "filterregion_file", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \ 134 | ["nu", "logphase_shift", "npart_phasing"], \ 135 | ["n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", "max_iter_outer", "nodepotential", "initialization_method", "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", "construct_adjacency_w"], \ 136 | ["n_states", "params", "t", "t_phaseing", "fix_NB_dispersion", "shared_NB_dispersion", "fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \ 137 | ["nonbalance_bafdist", "nondiploid_rdrdist"]] 138 | return config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, category_names, category_elements 139 | 140 | 141 | def read_configuration_file(filename): 142 | ##### [Default settings] ##### 143 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 144 | config = {**config_shared, **config_single} 145 | argument_type = {**argtype_shared, **argtype_single} 146 | 147 | ##### [ read configuration file to update settings ] ##### 148 | with open(filename, 'r') as fp: 149 | for line in fp: 150 | if line.strip() == "" or line[0] == "#": 151 | continue 152 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 153 | # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" 154 | if (not strs[0] in config.keys()) and (not strs[0] in config_joint.keys()): 155 | # warning that the argument is not a valid configuration parameter and continue 156 | logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") 157 | continue 158 | if len(strs) == 1: 159 | config[strs[0]] = [] 160 | elif strs[1].upper() == "NONE": 161 | config[strs[0]] = None 162 | elif argument_type[strs[0]] == "str": 163 | config[strs[0]] = strs[1] 164 | elif argument_type[strs[0]] == "int": 165 | config[strs[0]] = int(strs[1]) 166 | elif argument_type[strs[0]] == "float": 167 | config[strs[0]] = float(strs[1]) 168 | elif argument_type[strs[0]] == "eval": 169 | config[strs[0]] = eval(strs[1]) 170 | elif argument_type[strs[0]] == "bool": 171 | config[strs[0]] = (strs[1].upper() == "TRUE") 172 | elif argument_type[strs[0]] == "list_str": 173 | config[strs[0]] = strs[1].split(" ") 174 | # assertions 175 | assert not config["spaceranger_dir"] is None, "No spaceranger directory!" 176 | assert not config["snp_dir"] is None, "No SNP directory!" 177 | assert not config["output_dir"] is None, "No output directory!" 178 | 179 | return config 180 | 181 | 182 | def read_joint_configuration_file(filename): 183 | ##### [Default settings] ##### 184 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 185 | config = {**config_shared, **config_joint} 186 | argument_type = {**argtype_shared, **argtype_joint} 187 | 188 | ##### [ read configuration file to update settings ] ##### 189 | with open(filename, 'r') as fp: 190 | for line in fp: 191 | if line.strip() == "" or line[0] == "#": 192 | continue 193 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 194 | # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" 195 | if (not strs[0] in config.keys()) and (not strs[0] in config_single.keys()): 196 | # warning that the argument is not a valid configuration parameter and continue 197 | logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") 198 | continue 199 | if len(strs) == 1: 200 | config[strs[0]] = [] 201 | elif strs[1].upper() == "NONE": 202 | config[strs[0]] = None 203 | elif argument_type[strs[0]] == "str": 204 | config[strs[0]] = strs[1] 205 | elif argument_type[strs[0]] == "int": 206 | config[strs[0]] = int(strs[1]) 207 | elif argument_type[strs[0]] == "float": 208 | config[strs[0]] = float(strs[1]) 209 | elif argument_type[strs[0]] == "eval": 210 | config[strs[0]] = eval(strs[1]) 211 | elif argument_type[strs[0]] == "bool": 212 | config[strs[0]] = (strs[1].upper() == "TRUE") 213 | elif argument_type[strs[0]] == "list_str": 214 | config[strs[0]] = strs[1].split(" ") 215 | # assertions 216 | assert not config["input_filelist"] is None, "No input file list!" 217 | assert not config["snp_dir"] is None, "No SNP directory!" 218 | assert not config["output_dir"] is None, "No output directory!" 219 | 220 | return config 221 | 222 | 223 | def write_config_file(outputfilename, config): 224 | _,_,_, argtype_shared, argtype_joint, argtype_single, category_names, category_elements = load_default_config() 225 | argument_type = {**argtype_shared, **argtype_joint, **argtype_single} 226 | with open(outputfilename, 'w') as fp: 227 | for i in range(len(category_names)): 228 | fp.write(f"{category_names[i]}\n") 229 | for k in category_elements[i]: 230 | if k in config: 231 | if argument_type[k] == "list_str": 232 | fp.write(f"{k} : {' '.join(config[k])}\n") 233 | else: 234 | fp.write(f"{k} : {config[k]}\n") 235 | fp.write("\n") 236 | 237 | 238 | def get_default_config_single(): 239 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 240 | config = {**config_shared, **config_single} 241 | return config 242 | 243 | 244 | def get_default_config_joint(): 245 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 246 | config = {**config_shared, **config_joint} 247 | return config 248 | 249 | 250 | def main(argv): 251 | template_configuration_file = argv[1] 252 | outputdir = argv[2] 253 | hmrf_seed_s = int(argv[3]) 254 | hmrf_seed_t = int(argv[4]) 255 | try: 256 | config = read_configuration_file(template_configuration_file) 257 | except: 258 | config = read_joint_configuration_file(template_configuration_file) 259 | 260 | for r in range(hmrf_seed_s, hmrf_seed_t): 261 | config["num_hmrf_initialization_start"] = r 262 | config["num_hmrf_initialization_end"] = r+1 263 | write_config_file(f"{outputdir}/configfile{r}", config) 264 | 265 | 266 | if __name__ == "__main__": 267 | if len(sys.argv) > 1: 268 | main(sys.argv) 269 | -------------------------------------------------------------------------------- /src/calicost/estimate_tumor_proportion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy 4 | import pandas as pd 5 | from pathlib import Path 6 | import logging 7 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 8 | logger = logging.getLogger() 9 | import copy 10 | import functools 11 | import subprocess 
import argparse  # used by the argparse.ArgumentParser in the __main__ block below
12 | from calicost.arg_parse import * 13 | from calicost.hmm_NB_BB_phaseswitch import * 14 | from calicost.parse_input import * 15 | from calicost.utils_hmrf import * 16 | from calicost.hmrf import * 17 | 18 | 19 | def main(configuration_file): 20 | try: 21 | config = read_configuration_file(configuration_file) 22 | except: 23 | config = read_joint_configuration_file(configuration_file) 24 | 25 | lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ 26 | barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) 27 | 28 | single_base_nb_mean[:,:] = 0 29 | 30 | n_states_for_tumorprop = 5 31 | n_clones_for_tumorprop = 3 32 | n_rdrclones_for_tumorprop = 3 #2 33 | max_outer_iter_for_tumorprop = 10 34 | max_iter_for_tumorprop = 20 35 | MIN_PROP_UNCERTAINTY = 0.05 36 | initial_clone_index = rectangle_initialize_initial_clone(coords, n_clones_for_tumorprop, random_state=0) 37 | # save clone initialization into npz file 38 | prefix = "initialhmm" 39 | if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): 40 | initial_assignment = np.zeros(single_X.shape[2], dtype=int) 41 | for c,idx in enumerate(initial_clone_index): 42 | initial_assignment[idx] = c 43 | allres = {"num_iterations":0, "round-1_assignment":initial_assignment} 44 | np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) 45 | 46 | hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=n_states_for_tumorprop, \ 47 | log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=max_outer_iter_for_tumorprop, nodepotential=config["nodepotential"], \ 48 | hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ 49 | fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ 50 | fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ 51 | is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) 52 | 53 | res = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") 54 | merging_groups, merged_res = merge_by_minspots(res["new_assignment"], res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*single_X.shape[0]) 55 | 56 | # further refine clones 57 | combined_assignment = copy.copy(merged_res['new_assignment']) 58 | offset_clone = 0 59 | combined_p_binom = [] 60 | offset_state = 0 61 | combined_pred_cnv = [] 62 | for bafc in range(len(merging_groups)): 63 | prefix = f"initialhmm_clone{bafc}" 64 | idx_spots = np.where(merged_res['new_assignment'] == bafc)[0] 65 | total_allele_count = np.sum(single_total_bb_RD[:, idx_spots]) 66 | if total_allele_count < single_X.shape[0] * 50: # put a minimum B allele read count on pseudobulk to split clones 67 | combined_assignment[idx_spots] = offset_clone 68 | offset_clone += 1 69 | combined_p_binom.append(merged_res['new_p_binom']) 70 | combined_pred_cnv.append(merged_res['pred_cnv'] + offset_state) 71 | offset_state += merged_res['new_p_binom'].shape[0] 72 | continue 73 | # initialize clone 74 | initial_clone_index = 
rectangle_initialize_initial_clone(coords[idx_spots], n_rdrclones_for_tumorprop, random_state=0) 75 | # save clone initialization into npz file 76 | if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): 77 | initial_assignment = np.zeros(len(idx_spots), dtype=int) 78 | for c,idx in enumerate(initial_clone_index): 79 | initial_assignment[idx] = c 80 | allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} 81 | np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) 82 | 83 | copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) 84 | hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=n_states_for_tumorprop, \ 85 | log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ 86 | hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ 87 | fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ 88 | fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ 89 | is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) 90 | 91 | cloneres = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") 92 | combined_assignment[idx_spots] = cloneres['new_assignment'] + offset_clone 93 | offset_clone += np.max(cloneres['new_assignment']) + 1 94 | combined_p_binom.append(cloneres['new_p_binom']) 95 | combined_pred_cnv.append(cloneres['pred_cnv'] + offset_state) 96 | offset_state += cloneres['new_p_binom'].shape[0] 97 | combined_p_binom = np.vstack(combined_p_binom) 98 | combined_pred_cnv = np.concatenate(combined_pred_cnv) 99 | 100 | normal_candidate = identify_normal_spots(single_X, single_total_bb_RD, merged_res['new_assignment'], merged_res['pred_cnv'], merged_res['new_p_binom'], min_count=single_X.shape[0] * 200) 101 | loh_states, is_B_lost, rdr_values, clones_hightumor = identify_loh_per_clone(single_X, combined_assignment, combined_pred_cnv, combined_p_binom, normal_candidate, single_total_bb_RD) 102 | assignments = pd.DataFrame({'coarse':merged_res['new_assignment'], 'combined':combined_assignment}) 103 | # pool across adjacency spot to increase the UMIs covering LOH region 104 | _, tp_smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, 105 | across_slice_adjacency_mat=None, construct_adjacency_method=config['construct_adjacency_method'], 106 | maxspots_pooling=7, construct_adjacency_w=config['construct_adjacency_w']) 107 | single_tumor_prop, _ = estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, combined_pred_cnv, loh_states, is_B_lost, rdr_values, clones_hightumor, smooth_mat=tp_smooth_mat) 108 | # post-processing to remove negative tumor proportions 109 | single_tumor_prop = np.where(single_tumor_prop < MIN_PROP_UNCERTAINTY, MIN_PROP_UNCERTAINTY, single_tumor_prop) 110 | single_tumor_prop[normal_candidate] = 0 111 | # save single_tumor_prop to file 112 | pd.DataFrame({"Tumor":single_tumor_prop}, 
index=barcodes).to_csv(f"{config['output_dir']}/loh_estimator_tumor_prop.tsv", header=True, sep="\t") 113 | 114 | 115 | if __name__ == "__main__": 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) 118 | args = parser.parse_args() 119 | 120 | main(args.configfile) -------------------------------------------------------------------------------- /src/calicost/hmm_NB_BB_nophasing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from numba import njit 4 | from scipy.stats import norm, multivariate_normal, poisson 5 | import scipy.special 6 | from scipy.optimize import minimize 7 | from scipy.optimize import Bounds 8 | from sklearn.mixture import GaussianMixture 9 | from tqdm import trange 10 | import statsmodels.api as sm 11 | from statsmodels.base.model import GenericLikelihoodModel 12 | import copy 13 | from calicost.utils_distribution_fitting import * 14 | from calicost.utils_hmm import * 15 | import networkx as nx 16 | 17 | 18 | ############################################################ 19 | # whole inference 20 | ############################################################ 21 | 22 | class hmm_nophasing(object): 23 | def __init__(self, params="stmp", t=1-1e-4): 24 | """ 25 | Attributes 26 | ---------- 27 | params : str 28 | Codes for parameters that need to be updated. The corresponding parameter can only be updated if it is included in this argument. "s" for start probability; "t" for transition probability; "m" for Negative Binomial RDR signal; "p" for Beta Binomial BAF signal. 29 | 30 | t : float 31 | Determine initial self transition probability to be 1-t. 32 | """ 33 | self.params = params 34 | self.t = t 35 | # 36 | @staticmethod 37 | def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): 38 | """ 39 | Attributes 40 | ---------- 41 | X : array, shape (n_observations, n_components, n_spots) 42 | Observed expression UMI count and allele frequency UMI count. 43 | 44 | base_nb_mean : array, shape (n_observations, n_spots) 45 | Mean expression under diploid state. 46 | 47 | log_mu : array, shape (n_states, n_spots) 48 | Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. 49 | 50 | alphas : array, shape (n_states, n_spots) 51 | Over-dispersion of NB distributions in HMM per state per spot. 52 | 53 | total_bb_RD : array, shape (n_observations, n_spots) 54 | SNP-covering reads for both REF and ALT across genes along genome. 55 | 56 | p_binom : array, shape (n_states, n_spots) 57 | BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. 58 | 59 | taus : array, shape (n_states, n_spots) 60 | Over-dispersion of Beta Binomial distribution in HMM per state per spot. 61 | 62 | Returns 63 | ---------- 64 | log_emission : array, shape (n_states, n_obs, n_spots) 65 | Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
66 | """ 67 | n_obs = X.shape[0] 68 | n_comp = X.shape[1] 69 | n_spots = X.shape[2] 70 | n_states = log_mu.shape[0] 71 | # initialize log_emission 72 | log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) 73 | log_emission_baf = np.zeros((n_states, n_obs, n_spots)) 74 | for i in np.arange(n_states): 75 | for s in np.arange(n_spots): 76 | # expression from NB distribution 77 | idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] 78 | if len(idx_nonzero_rdr) > 0: 79 | nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) 80 | nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) 81 | n, p = convert_params(nb_mean, nb_std) 82 | log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) 83 | # AF from BetaBinom distribution 84 | idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] 85 | if len(idx_nonzero_baf) > 0: 86 | log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) 87 | return log_emission_rdr, log_emission_baf 88 | # 89 | @staticmethod 90 | def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): 91 | """ 92 | Attributes 93 | ---------- 94 | X : array, shape (n_observations, n_components, n_spots) 95 | Observed expression UMI count and allele frequency UMI count. 96 | 97 | base_nb_mean : array, shape (n_observations, n_spots) 98 | Mean expression under diploid state. 99 | 100 | log_mu : array, shape (n_states, n_spots) 101 | Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. 102 | 103 | alphas : array, shape (n_states, n_spots) 104 | Over-dispersion of NB distributions in HMM per state per spot. 105 | 106 | total_bb_RD : array, shape (n_observations, n_spots) 107 | SNP-covering reads for both REF and ALT across genes along genome. 108 | 109 | p_binom : array, shape (n_states, n_spots) 110 | BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. 111 | 112 | taus : array, shape (n_states, n_spots) 113 | Over-dispersion of Beta Binomial distribution in HMM per state per spot. 114 | 115 | Returns 116 | ---------- 117 | log_emission : array, shape (n_states, n_obs, n_spots) 118 | Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
119 | """ 120 | n_obs = X.shape[0] 121 | n_comp = X.shape[1] 122 | n_spots = X.shape[2] 123 | n_states = log_mu.shape[0] 124 | # initialize log_emission 125 | log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) 126 | log_emission_baf = np.zeros((n_states, n_obs, n_spots)) 127 | for i in np.arange(n_states): 128 | for s in np.arange(n_spots): 129 | # expression from NB distribution 130 | idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] 131 | if len(idx_nonzero_rdr) > 0: 132 | # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) 133 | nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) 134 | nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) 135 | n, p = convert_params(nb_mean, nb_std) 136 | log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) 137 | # AF from BetaBinom distribution 138 | idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] 139 | if len(idx_nonzero_baf) > 0: 140 | # mix_p_A = p_binom[i, s] * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) 141 | # mix_p_B = (1 - p_binom[i, s]) * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) 142 | mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) 143 | mix_p_B = (1 - p_binom[i, s]) * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) 144 | log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) 145 | return log_emission_rdr, log_emission_baf 146 | # 147 | @staticmethod 148 | @njit 149 | def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): 150 | ''' 151 | Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. 152 | Input 153 | lengths: sum of lengths = n_observations. 154 | log_transmat: n_states * n_states. Transition probability after log transformation. 155 | log_startprob: n_states. Start probability after log transformation. 156 | log_emission: n_states * n_observations * n_spots. Log probability. 157 | Output 158 | log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). 159 | ''' 160 | n_obs = log_emission.shape[1] 161 | n_states = log_emission.shape[0] 162 | assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" 163 | assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" 164 | # initialize log_alpha 165 | log_alpha = np.zeros((log_emission.shape[0], n_obs)) 166 | buf = np.zeros(log_emission.shape[0]) 167 | cumlen = 0 168 | for le in lengths: 169 | # start prob 170 | # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 171 | # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
172 | log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) 173 | for t in np.arange(1, le): 174 | for j in np.arange(log_emission.shape[0]): 175 | for i in np.arange(log_emission.shape[0]): 176 | buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] 177 | log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) 178 | cumlen += le 179 | return log_alpha 180 | # 181 | @staticmethod 182 | @njit 183 | def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): 184 | ''' 185 | Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. 186 | Input 187 | X: size n_observations * n_components * n_spots. 188 | lengths: sum of lengths = n_observations. 189 | log_transmat: n_states * n_states. Transition probability after log transformation. 190 | log_startprob: n_states. Start probability after log transformation. 191 | log_emission: n_states * n_observations * n_spots. Log probability. 192 | Output 193 | log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). 194 | ''' 195 | n_obs = log_emission.shape[1] 196 | n_states = log_emission.shape[0] 197 | assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" 198 | assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" 199 | # initialize log_beta 200 | log_beta = np.zeros((log_emission.shape[0], n_obs)) 201 | buf = np.zeros(log_emission.shape[0]) 202 | cumlen = 0 203 | for le in lengths: 204 | # start prob 205 | # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 206 | # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 207 | log_beta[:, (cumlen + le - 1)] = 0 208 | for t in np.arange(le-2, -1, -1): 209 | for i in np.arange(log_emission.shape[0]): 210 | for j in np.arange(log_emission.shape[0]): 211 | buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) 212 | log_beta[i, (cumlen + t)] = mylogsumexp(buf) 213 | cumlen += le 214 | return log_beta 215 | 216 | # 217 | def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, tp_weight_by_mu=None, \ 218 | fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ 219 | is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): 220 | ''' 221 | Input 222 | X: size n_observations * n_components * n_spots. 223 | lengths: sum of lengths = n_observations. 224 | base_nb_mean: size of n_observations * n_spots. 225 | In NB-BetaBinom model, n_components = 2 226 | Intermediate 227 | log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. 228 | alpha: size of n_states. Dispersioon parameter of each HMM state. 
229 | ''' 230 | n_obs = X.shape[0] 231 | n_comp = X.shape[1] 232 | n_spots = X.shape[2] 233 | assert n_comp == 2 234 | # initialize NB logmean shift and BetaBinom prob 235 | log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu 236 | p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom 237 | # initialize (inverse of) dispersion param in NB and BetaBinom 238 | alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas 239 | taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus 240 | # initialize start probability and emission probability 241 | log_startprob = np.log( np.ones(n_states) / n_states ) 242 | if n_states > 1: 243 | transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) 244 | np.fill_diagonal(transmat, self.t) 245 | log_transmat = np.log(transmat) 246 | else: 247 | log_transmat = np.zeros((1,1)) 248 | # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) 249 | unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) 250 | unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD) 251 | # EM algorithm 252 | for r in trange(max_iter): 253 | # E step 254 | if tumor_prop is None: 255 | log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) 256 | log_emission = log_emission_rdr + log_emission_baf 257 | else: 258 | log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) 259 | log_emission = log_emission_rdr + log_emission_baf 260 | log_alpha = hmm_nophasing.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) 261 | log_beta = hmm_nophasing.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) 262 | log_gamma = compute_posterior_obs(log_alpha, log_beta) 263 | log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) 264 | # M step 265 | if "s" in self.params: 266 | new_log_startprob = update_startprob_nophasing(lengths, log_gamma) 267 | new_log_startprob = new_log_startprob.flatten() 268 | else: 269 | new_log_startprob = log_startprob 270 | if "t" in self.params: 271 | new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) 272 | else: 273 | new_log_transmat = log_transmat 274 | if "m" in self.params: 275 | if tumor_prop is None: 276 | new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ 277 | fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) 278 | else: 279 | new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ 280 | fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) 281 | else: 282 | new_log_mu = log_mu 283 | new_alphas = alphas 284 | if "p" in self.params: 285 | if tumor_prop is None: 286 | new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ 287 | 
fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) 288 | else: 289 | new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, tumor_prop, start_p_binom=p_binom, \ 290 | fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) 291 | else: 292 | new_p_binom = p_binom 293 | new_taus = taus 294 | # check convergence 295 | print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ 296 | np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ 297 | np.mean(np.abs(new_log_mu - log_mu)),\ 298 | np.mean(np.abs(new_p_binom - p_binom)) ) 299 | print( np.hstack([new_log_mu, new_p_binom]) ) 300 | if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ 301 | np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: 302 | break 303 | log_startprob = new_log_startprob 304 | log_transmat = new_log_transmat 305 | log_mu = new_log_mu 306 | alphas = new_alphas 307 | p_binom = new_p_binom 308 | taus = new_taus 309 | return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma 310 | 311 | 312 | -------------------------------------------------------------------------------- /src/calicost/hmm_NB_sharedstates.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import inspect 3 | import logging 4 | 5 | import numpy as np 6 | import scipy 7 | from scipy import linalg, special 8 | from scipy.special import logsumexp 9 | from sklearn import cluster 10 | from sklearn.utils import check_random_state 11 | from hmmlearn.hmm import BaseHMM 12 | import statsmodels 13 | import statsmodels.api as sm 14 | from statsmodels.base.model import GenericLikelihoodModel 15 | 16 | 17 | def convert_params(mean, std): 18 | """ 19 | Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports 20 | 21 | See https://mathworld.wolfram.com/NegativeBinomialDistribution.html 22 | """ 23 | p = mean/std**2 24 | n = mean*p/(1.0 - p) 25 | return n, p 26 | 27 | 28 | class Weighted_NegativeBinomial(GenericLikelihoodModel): 29 | def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): 30 | super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) 31 | self.weights = weights 32 | self.exposure = exposure 33 | self.seed = seed 34 | # 35 | def nloglikeobs(self, params): 36 | nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure 37 | nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) 38 | n, p = convert_params(nb_mean, nb_std) 39 | llf = scipy.stats.nbinom.logpmf(self.endog, n, p) 40 | neg_sum_llf = -llf.dot(self.weights) 41 | return neg_sum_llf 42 | # 43 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 44 | self.exog_names.append('alpha') 45 | 46 | if start_params is None: 47 | if hasattr(self, 'start_params'): 48 | start_params = self.start_params 49 | else: 50 | start_params = np.append(0.1 * np.ones(self.nparams), 0.01) 51 | 52 | return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, 53 | maxiter=maxiter, maxfun=maxfun, 54 | **kwds) 55 | 56 | 57 | class ConstrainedNBHMM(BaseHMM): 58 | """ 59 | HMM model with NB emission probability and constraint of all cells have the shared hidden state vector. 60 | A degenerative case is to use pseudobulk UMI count matrix of size G genes by 1 cell. 
61 | 62 | Attributes 63 | ---------- 64 | base_nb_mean : array, shape (n_genes, n_cells) 65 | Mean expression under diploid state. 66 | 67 | startprob_ : array, shape (n_components) 68 | Initial state occupation distribution. 69 | 70 | transmat_ : array, shape (n_components, n_components) 71 | Matrix of transition probabilities between states. 72 | 73 | log_mu : array, shape (n_components) 74 | Shift in log of expression due to CNV. Each CNV states (components) has it's own shift value. 75 | 76 | params : str 77 | "s" for start probability, "t" for transition probability, "m" for log of expression shift due to CNV, "a" for inverse dispersion of NB distribution. 78 | 79 | Examples 80 | ---------- 81 | base_nb_mean = eta.reshape(-1,1) * np.sum(totalUMI) 82 | hmmmodel = ConstrainedNBHMM(n_components=3) 83 | X = np.vstack( [np.sum(count,axis=0), base_nb_mean] ).T 84 | hmmmodel.fit( X ) 85 | hmmmodel.predict( X ) 86 | """ 87 | def __init__(self, n_components=1, shared_dispersion=False, 88 | startprob_prior=1.0, transmat_prior=1.0, 89 | algorithm="viterbi", random_state=None, 90 | n_iter=10, tol=1e-2, verbose=False, 91 | params="stma", 92 | init_params=""): 93 | BaseHMM.__init__(self, n_components, 94 | startprob_prior=startprob_prior, 95 | transmat_prior=transmat_prior, algorithm=algorithm, 96 | random_state=random_state, n_iter=n_iter, 97 | tol=tol, params=params, verbose=verbose, 98 | init_params=init_params) 99 | self.shared_dispersion = shared_dispersion 100 | # initialize CNV's effect 101 | self.log_mu = np.linspace(-0.1, 0.1, self.n_components) 102 | # initialize inverse of dispersion 103 | self.alphas = np.array([0.01] * self.n_components) 104 | # self.alphas = 0.01 * np.ones(s(self.n_components, self.n_genes)) 105 | # initialize start probability and transition probability 106 | self.startprob_ = np.ones(self.n_components) / self.n_components 107 | t = 0.9 108 | self.transmat_ = np.ones((self.n_components, self.n_components)) * (1-t) / (self.n_components-1) 109 | np.fill_diagonal(self.transmat_, t) 110 | # 111 | def _compute_log_likelihood(self, X): 112 | """ 113 | Compute log likelihood of X. 114 | 115 | Attributes 116 | ---------- 117 | X : array_like, shape (n_genes, 2*n_cells) 118 | First (n_genes, n_cells) is the observed UMI count matrix; second (n_genes, n_cells) is base_nb_mean. 119 | 120 | Returns 121 | ------- 122 | lpr : array_like, shape (n_genes, n_components) 123 | Array containing the log probabilities of each data point in X. 124 | """ 125 | n_genes = X.shape[0] 126 | n_cells = int(X.shape[1] / 2) 127 | base_nb_mean = X[:, n_cells:] 128 | log_prob = np.zeros((n_genes, n_cells, self.n_components)) 129 | for i in range(self.n_components): 130 | nb_mean = base_nb_mean * np.exp(self.log_mu[i]) 131 | nb_std = np.sqrt(nb_mean + self.alphas[i] * nb_mean**2) 132 | # nb_std = np.sqrt(nb_mean + self.alphas[i,:].reshape(-1,1) * nb_mean**2) 133 | n, p = convert_params(nb_mean, nb_std) 134 | log_prob[:,:,i] = scipy.stats.nbinom.logpmf(X[:, :n_cells], n, p) 135 | return log_prob.mean(axis=1) 136 | # 137 | def _initialize_sufficient_statistics(self): 138 | stats = super()._initialize_sufficient_statistics() 139 | return stats 140 | # 141 | def _accumulate_sufficient_statistics(self, stats, X, lattice, posteriors, fwdlattice, bwdlattice): 142 | super()._accumulate_sufficient_statistics( 143 | stats, X, lattice, posteriors, fwdlattice, bwdlattice) 144 | """ 145 | Update sufficient statistics from a given sample. 
146 | Parameters 147 | ---------- 148 | stats : dict 149 | Sufficient statistics as returned by 150 | :meth:`~.BaseHMM._initialize_sufficient_statistics`. 151 | X : array, shape (n_genes, n_cells) 152 | Sample sequence. 153 | lattice : array, shape (n_genes, n_components) 154 | Probabilities OR Log Probabilities of each sample 155 | under each of the model states. Depends on the choice 156 | of implementation of the Forward-Backward algorithm 157 | posteriors : array, shape (n_genes, n_components) 158 | Posterior probabilities of each sample being generated by each 159 | of the model states. 160 | fwdlattice, bwdlattice : array, shape (n_genes, n_components) 161 | forward and backward probabilities. 162 | """ 163 | if 'm' in self.params or 'a' in self.params: 164 | stats['post'] = posteriors 165 | stats['obs'] = X 166 | if 't' in self.params: 167 | # for each ij, recover sum_t xi_ij from the inferred transition matrix 168 | bothlattice = fwdlattice + bwdlattice 169 | loggamma = (bothlattice.T - logsumexp(bothlattice, axis = 1)).T 170 | 171 | # denominator for each ij is the sum of gammas over i 172 | denoms = np.sum(np.exp(loggamma), axis = 0) 173 | # transpose to perform row-wise multiplication 174 | stats['denoms'] = denoms 175 | # 176 | def _do_mstep(self, stats): 177 | n_genes = stats['obs'].shape[0] 178 | n_cells = int(stats['obs'].shape[1] / 2) 179 | base_nb_mean = stats['obs'][:, n_cells:] 180 | super()._do_mstep(stats) 181 | if 'm' in self.params and 'a' in self.params: 182 | # NB regression fit dispersion and CNV's effect simultaneously 183 | if not self.shared_dispersion: 184 | for i in range(self.n_components): 185 | model = Weighted_NegativeBinomial(stats['obs'][:, :n_cells].flatten(), \ 186 | np.ones(n_genes*n_cells).reshape(-1,1), \ 187 | weights=np.repeat(stats['post'][:,i], n_cells), exposure=base_nb_mean.flatten()) 188 | res = model.fit(disp=0, maxiter=500) 189 | self.log_mu[i] = res.params[0] 190 | self.alphas[i] = res.params[-1] 191 | # self.alphas[i,:] = res.params[-1] 192 | else: 193 | all_states_nb_mean = np.tile(base_nb_mean.flatten(), self.n_components) 194 | all_states_y = np.tile(stats['obs'][:, :n_cells].flatten(), self.n_components) 195 | all_states_weights = np.concatenate([np.repeat(stats['post'][:,i], n_cells) for i in range(self.n_components)]) 196 | all_states_features = np.zeros((self.n_components*n_genes*n_cells, self.n_components)) 197 | for i in np.arange(self.n_components): 198 | all_states_features[(i*n_genes*n_cells):((i+1)*n_genes*n_cells), i] = 1 199 | model = Weighted_NegativeBinomial(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_nb_mean) 200 | res = model.fit(disp=0, maxiter=500) 201 | self.log_mu = res.params[:-1] 202 | self.alphas[:] = res.params[-1] 203 | # self.alphas[:,:] = res.params[-1] 204 | # print(res.params) 205 | elif 'm' in self.params: 206 | # NB regression fit CNV's effect only 207 | for i in range(self.n_components): 208 | model = sm.GLM(stats['obs'].flatten(), np.ones(self.n_genes*self.n_cells).reshape(-1,1), \ 209 | family=sm.families.NegativeBinomial(alpha=self.alphas[i]), \ 210 | exposure=base_nb_mean.flatten()) 211 | # model = sm.GLM(stats['obs'][:, :n_cells].flatten(), np.ones(n_genes*n_cells).reshape(-1,1), \ 212 | # family=sm.families.NegativeBinomial(alpha=np.repeat(self.alphas[i], n_cells)), \ 213 | # exposure=base_nb_mean.flatten(), var_weights=np.repeat(stats['post'][:,i], n_cells)) 214 | res = model.fit(disp=0, maxiter=500) 215 | self.log_mu[i] = res.params[0] 216 | if 't' in 
self.params:
217 |             # the following is copied from Matt's code
218 |             denoms = stats['denoms']
219 |             x = (self.transmat_.T * denoms).T
220 | 
221 |             # numerator is the sum of ii elements
222 |             num = np.sum(np.diag(x))
223 |             # denominator is the sum of all elements
224 |             denom = np.sum(x)
225 | 
226 |             # (this is the same as sum_i gamma_i)
227 |             #assert np.isclose(denom, np.sum(denoms))
228 | 
229 |             stats['diag'] = num / denom
230 |             self.transmat_ = self.form_transition_matrix(stats['diag'])
231 |     #
232 |     def form_transition_matrix(self, diag):
233 |         tol = 1e-10
234 |         diag = np.clip(diag, tol, 1 - tol)
235 | 
236 |         offdiag = (1 - diag) / (self.n_components - 1)
237 |         transmat_ = np.diag([diag - offdiag] * self.n_components)
238 |         transmat_ += offdiag
239 |         #assert np.all(transmat_ > 0), (diag, offdiag, transmat_)
240 |         return transmat_
--------------------------------------------------------------------------------
/src/calicost/hmrf_normalmixture.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numba import njit
3 | import scipy.special
4 | import scipy.sparse
5 | from sklearn.mixture import GaussianMixture
6 | from sklearn.cluster import KMeans
7 | from sklearn.metrics import adjusted_rand_score
8 | from tqdm import trange
9 | import copy
10 | from pathlib import Path
11 | from calicost.hmm_NB_BB_phaseswitch import *
12 | from calicost.utils_distribution_fitting import *
13 | from calicost.utils_IO import *
14 | from calicost.simple_sctransform import *
15 | 
16 | import warnings
17 | from statsmodels.tools.sm_exceptions import ValueWarning
18 | 
--------------------------------------------------------------------------------
/src/calicost/joint_allele_generateconfig.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import scipy
4 | import pandas as pd
5 | from pathlib import Path
6 | from sklearn.metrics import adjusted_rand_score
7 | import scanpy as sc
8 | import anndata
9 | import logging
10 | import copy
11 | from pathlib import Path
12 | import subprocess
13 | from calicost.hmm_NB_BB_phaseswitch import *
14 | from calicost.utils_distribution_fitting import *
15 | from calicost.hmrf import *
16 | from calicost.utils_IO import *
17 | 
18 | 
19 | def read_joint_configuration_file(filename):
20 |     ##### [Default settings] #####
21 |     config = {
22 |         "input_filelist" : None,
23 |         "snp_dir" : None,
24 |         "output_dir" : None,
25 |         # supporting files and preprocessing arguments
26 |         "hgtable_file" : None,
27 |         "normalidx_file" : None,
28 |         "tumorprop_file" : None,
29 |         "supervision_clone_file" : None,
30 |         "alignment_files" : [],
31 |         "filtergenelist_file" : None,
32 |         "filterregion_file" : None,
33 |         "binsize" : 1,
34 |         "rdrbinsize" : 1,
35 |         # "secondbinning_min_umi" : 500,
36 |         "max_nbins" : 1200,
37 |         "avg_umi_perbinspot" : 1.5,
38 |         "bafonly" : True,
39 |         # phase switch probability
40 |         "nu" : 1,
41 |         "logphase_shift" : 1,
42 |         "npart_phasing" : 2,
43 |         # HMRF configurations
44 |         "n_clones" : None,
45 |         "n_clones_rdr" : 2,
46 |         "min_spots_per_clone" : 100,
47 |         "min_avgumi_per_clone" : 10,
48 |         "maxspots_pooling" : 7,
49 |         "tumorprop_threshold" : 0.5,
50 |         "max_iter_outer" : 20,
51 |         "nodepotential" : "max", # max or weighted_sum
52 |         "initialization_method" : "rectangle", # rectangle or datadrive
53 |         "num_hmrf_initialization_start" : 0,
54 |         "num_hmrf_initialization_end" : 10,
55 |         "spatial_weight" : 2.0,
56 |         "construct_adjacency_method" : "hexagon",
57 |         "construct_adjacency_w" : 1.0,
58 |         # HMM configurations
59 |         "n_states"
: None, 60 | "params" : None, 61 | "t" : None, 62 | "t_phaseing" : 1-1e-4, 63 | "fix_NB_dispersion" : False, 64 | "shared_NB_dispersion" : True, 65 | "fix_BB_dispersion" : False, 66 | "shared_BB_dispersion" : True, 67 | "max_iter" : 30, 68 | "tol" : 1e-3, 69 | "gmm_random_state" : 0, 70 | "np_threshold" : 2.0, 71 | "np_eventminlen" : 10 72 | } 73 | 74 | argument_type = { 75 | "input_filelist" : "str", 76 | "snp_dir" : "str", 77 | "output_dir" : "str", 78 | # supporting files and preprocessing arguments 79 | "hgtable_file" : "str", 80 | "normalidx_file" : "str", 81 | "tumorprop_file" : "str", 82 | "supervision_clone_file" : "str", 83 | "alignment_files" : "list_str", 84 | "filtergenelist_file" : "str", 85 | "filterregion_file" : "str", 86 | "binsize" : "int", 87 | "rdrbinsize" : "int", 88 | # "secondbinning_min_umi" : "int", 89 | "max_nbins" : "int", 90 | "avg_umi_perbinspot" : "float", 91 | "bafonly" : "bool", 92 | # phase switch probability 93 | "nu" : "float", 94 | "logphase_shift" : "float", 95 | "npart_phasing" : "int", 96 | # HMRF configurations 97 | "n_clones" : "int", 98 | "n_clones_rdr" : "int", 99 | "min_spots_per_clone" : "int", 100 | "min_avgumi_per_clone" : "int", 101 | "maxspots_pooling" : "int", 102 | "tumorprop_threshold" : "float", 103 | "max_iter_outer" : "int", 104 | "nodepotential" : "str", 105 | "initialization_method" : "str", 106 | "num_hmrf_initialization_start" : "int", 107 | "num_hmrf_initialization_end" : "int", 108 | "spatial_weight" : "float", 109 | "construct_adjacency_method" : "str", 110 | "construct_adjacency_w" : "float", 111 | # HMM configurations 112 | "n_states" : "int", 113 | "params" : "str", 114 | "t" : "eval", 115 | "t_phaseing" : "eval", 116 | "fix_NB_dispersion" : "bool", 117 | "shared_NB_dispersion" : "bool", 118 | "fix_BB_dispersion" : "bool", 119 | "shared_BB_dispersion" : "bool", 120 | "max_iter" : "int", 121 | "tol" : "float", 122 | "gmm_random_state" : "int", 123 | "np_threshold" : "float", 124 | "np_eventminlen" : "int" 125 | } 126 | 127 | ##### [ read configuration file to update settings ] ##### 128 | with open(filename, 'r') as fp: 129 | for line in fp: 130 | if line.strip() == "" or line[0] == "#": 131 | continue 132 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 133 | assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" 134 | if len(strs) == 1: 135 | config[strs[0]] = [] 136 | elif strs[1].upper() == "NONE": 137 | config[strs[0]] = None 138 | elif argument_type[strs[0]] == "str": 139 | config[strs[0]] = strs[1] 140 | elif argument_type[strs[0]] == "int": 141 | config[strs[0]] = int(strs[1]) 142 | elif argument_type[strs[0]] == "float": 143 | config[strs[0]] = float(strs[1]) 144 | elif argument_type[strs[0]] == "eval": 145 | config[strs[0]] = eval(strs[1]) 146 | elif argument_type[strs[0]] == "bool": 147 | config[strs[0]] = (strs[1].upper() == "TRUE") 148 | elif argument_type[strs[0]] == "list_str": 149 | config[strs[0]] = strs[1].split(" ") 150 | # assertions 151 | assert not config["input_filelist"] is None, "No input file list!" 152 | assert not config["snp_dir"] is None, "No SNP directory!" 153 | assert not config["output_dir"] is None, "No output directory!" 
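    # Illustrative example (not taken from the repository): a configuration file
    # read by this parser is plain text with one "key : value" pair per line, e.g.
    #
    #   input_filelist : /path/to/example_input_filelist
    #   snp_dir : /path/to/snp_dir
    #   output_dir : /path/to/output
    #   n_clones : 3
    #   n_states : 7
    #   t : 1-1e-4
    #
    # Keys must appear in the `config` dictionary above; values typed as "eval"
    # in `argument_type` (such as "t") may be arithmetic expressions like 1-1e-4.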
154 | 
155 |     return config
156 | 
157 | 
158 | 
159 | def write_joint_config_file(outputfilename, config):
160 |     list_argument_io = ["input_filelist",
161 |         "snp_dir",
162 |         "output_dir"]
163 |     list_argument_sup = ["hgtable_file",
164 |         "normalidx_file",
165 |         "tumorprop_file",
166 |         "supervision_clone_file",
167 |         "alignment_files",
168 |         "filtergenelist_file",
169 |         "filterregion_file",
170 |         "binsize",
171 |         "rdrbinsize",
172 |         # "secondbinning_min_umi",
173 |         "max_nbins",
174 |         "avg_umi_perbinspot",
175 |         "bafonly"]
176 |     list_argument_phase = ["nu",
177 |         "logphase_shift",
178 |         "npart_phasing"]
179 |     list_argument_hmrf = ["n_clones",
180 |         "n_clones_rdr",
181 |         "min_spots_per_clone",
182 |         "min_avgumi_per_clone",
183 |         "maxspots_pooling",
184 |         "tumorprop_threshold",
185 |         "max_iter_outer",
186 |         "nodepotential",
187 |         "initialization_method",
188 |         "num_hmrf_initialization_start",
189 |         "num_hmrf_initialization_end",
190 |         "spatial_weight",
191 |         "construct_adjacency_method",
192 |         "construct_adjacency_w"]
193 |     list_argument_hmm = ["n_states",
194 |         "params",
195 |         "t",
196 |         "t_phaseing",
197 |         "fix_NB_dispersion",
198 |         "shared_NB_dispersion",
199 |         "fix_BB_dispersion",
200 |         "shared_BB_dispersion",
201 |         "max_iter",
202 |         "tol",
203 |         "gmm_random_state",
204 |         "np_threshold",
205 |         "np_eventminlen"]
206 |     with open(outputfilename, 'w') as fp:
207 |         #
208 |         for k in list_argument_io:
209 |             fp.write(f"{k} : {config[k]}\n")
210 |         #
211 |         fp.write("\n")
212 |         fp.write("# supporting files and preprocessing arguments\n")
213 |         for k in list_argument_sup:
214 |             if not isinstance(config[k], list):
215 |                 fp.write(f"{k} : {config[k]}\n")
216 |             else:
217 |                 fp.write(f"{k} : " + " ".join(config[k]) + "\n")
218 |         #
219 |         fp.write("\n")
220 |         fp.write("# phase switch probability\n")
221 |         for k in list_argument_phase:
222 |             fp.write(f"{k} : {config[k]}\n")
223 |         #
224 |         fp.write("\n")
225 |         fp.write("# HMRF configurations\n")
226 |         for k in list_argument_hmrf:
227 |             fp.write(f"{k} : {config[k]}\n")
228 |         #
229 |         fp.write("\n")
230 |         fp.write("# HMM configurations\n")
231 |         for k in list_argument_hmm:
232 |             fp.write(f"{k} : {config[k]}\n")
233 | 
234 | 
235 | def main(argv):
236 |     template_configuration_file = argv[1]
237 |     outputdir = argv[2]
238 |     hmrf_seed_s = int(argv[3])
239 |     hmrf_seed_t = int(argv[4])
240 |     config = read_joint_configuration_file(template_configuration_file)
241 |     for r in range(hmrf_seed_s, hmrf_seed_t):
242 |         config["num_hmrf_initialization_start"] = r
243 |         config["num_hmrf_initialization_end"] = r+1
244 |         write_joint_config_file(f"{outputdir}/configfile{r}", config)
245 | 
246 | 
247 | if __name__ == "__main__":
248 |     if len(sys.argv) == 1:
249 |         print("Usage: python joint_allele_generateconfig.py <template_configfile> <outputdir> <hmrf_seed_start> <hmrf_seed_end>")
250 |     if len(sys.argv) > 1:
251 |         main(sys.argv)
--------------------------------------------------------------------------------
/src/calicost/parse_input.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import scipy
4 | import pandas as pd
5 | from pathlib import Path
6 | from sklearn.metrics import adjusted_rand_score
7 | import scanpy as sc
8 | import anndata
9 | import logging
10 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
11 | logger = logging.getLogger()
12 | import copy
13 | from pathlib import Path
14 | import functools
15 | import subprocess
16 | import argparse
17 | from calicost.utils_IO import *
18 | from
calicost.phasing import * 19 | from calicost.arg_parse import * 20 | 21 | 22 | def genesnp_to_bininfo(df_gene_snp): 23 | table_bininfo = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"CHR":'first', 'START':'first', 'END':'last', 'gene':set, 'snp_id':set}).reset_index() 24 | table_bininfo['ARM'] = '.' 25 | table_bininfo['INCLUDED_GENES'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.gene.values ] 26 | table_bininfo['INCLUDED_SNP_IDS'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] 27 | table_bininfo['NORMAL_COUNT'] = np.nan 28 | table_bininfo['N_SNPS'] = [ len([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] 29 | # drop the set columns 30 | table_bininfo.drop(columns=['gene', 'snp_id'], inplace=True) 31 | return table_bininfo 32 | 33 | 34 | def parse_visium(config): 35 | """ 36 | Read multiple 10X Visium SRT samples and SNP data and generate tables with counts and meta info. 37 | 38 | Attributes: 39 | ---------- 40 | config : dictionary 41 | Dictionary containing configuration parameters. Output from read_joint_configuration_file. 42 | 43 | Returns: 44 | ---------- 45 | table_bininfo : DataFrame 46 | DataFrame with columns [chr, arm, start, end, log_phase_transition, included_genes, normal count, n_snps]. 47 | 48 | table_rdrbaf : DataFrame 49 | DataFrame with columns [barcodes, exp_count, tot_count, b_count]. 50 | 51 | meta_info : DataFrame 52 | DataFrame with columns [barcodes, sample, x, y, tumor_proportion] 53 | 54 | expression : sparse matrix, (n_spots, n_genes) 55 | Gene expression UMI count matrix. 56 | 57 | adjacency_mat : array, (n_spots, n_spots) 58 | Adjacency matrix for evaluating label coherence in HMRF. 59 | 60 | smooth_mat : array, (n_spots, n_spots) 61 | KNN smoothing matrix. 
62 | """ 63 | if "input_filelist" in config: 64 | adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat = load_joint_data(config["input_filelist"], config["snp_dir"], config["alignment_files"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) 65 | sample_list = [adata.obs["sample"][0]] 66 | for i in range(1, adata.shape[0]): 67 | if adata.obs["sample"][i] != sample_list[-1]: 68 | sample_list.append( adata.obs["sample"][i] ) 69 | # convert sample name to index 70 | sample_ids = np.zeros(adata.shape[0], dtype=int) 71 | for s,sname in enumerate(sample_list): 72 | index = np.where(adata.obs["sample"] == sname)[0] 73 | sample_ids[index] = s 74 | else: 75 | adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data(config["spaceranger_dir"], config["snp_dir"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) 76 | adata.obs["sample"] = "unique_sample" 77 | sample_list = [adata.obs["sample"][0]] 78 | sample_ids = np.zeros(adata.shape[0], dtype=int) 79 | across_slice_adjacency_mat = None 80 | 81 | coords = adata.obsm["X_pos"] 82 | 83 | if not config["tumorprop_file"] is None: 84 | df_tumorprop = pd.read_csv(config["tumorprop_file"], sep="\t", header=0, index_col=0) 85 | df_tumorprop = df_tumorprop[["Tumor"]] 86 | df_tumorprop.columns = ["tumor_proportion"] 87 | adata.obs = adata.obs.join(df_tumorprop) 88 | single_tumor_prop = adata.obs["tumor_proportion"] 89 | else: 90 | single_tumor_prop = None 91 | 92 | # read original data 93 | df_gene_snp = combine_gene_snps(unique_snp_ids, config['hgtable_file'], adata) 94 | df_gene_snp = create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids) 95 | lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ 96 | adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) 97 | # infer an initial phase using pseudobulk 98 | if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): 99 | initial_clone_for_phasing = perform_partition(coords, sample_ids, x_part=config["npart_phasing"], y_part=config["npart_phasing"], single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) 100 | phase_indicator, refined_lengths = initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ 101 | "sp", config["t_phaseing"], config["gmm_random_state"], config["fix_NB_dispersion"], config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], 30, 1e-3, threshold=config["tumorprop_threshold"]) 102 | np.savez(f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths) 103 | # map phase indicator to individual snps 104 | df_gene_snp['phase'] = np.where(df_gene_snp.snp_id.isnull(), None, df_gene_snp.block_id.map({i:x for i,x in enumerate(phase_indicator)}) ) 105 | else: 106 | tmp = dict(np.load(f"{config['output_dir']}/initial_phase.npz")) 107 | phase_indicator, refined_lengths = tmp["phase_indicator"], tmp["refined_lengths"] 108 | 109 | # binning 110 | df_gene_snp = create_bin_ranges(df_gene_snp, single_total_bb_RD, 
refined_lengths, config['secondary_min_umi']) 111 | lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_bins(df_gene_snp, \ 112 | adata, single_X, single_total_bb_RD, phase_indicator, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) 113 | # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ 114 | # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) 115 | 116 | # # remove bins where normal spots have imbalanced SNPs 117 | # if not config["tumorprop_file"] is None: 118 | # for prop_threshold in np.arange(0, 0.6, 0.05): 119 | # normal_candidate = (single_tumor_prop <= prop_threshold) 120 | # if np.sum(single_X[:, 0, (normal_candidate==True)]) > single_X.shape[0] * 200: 121 | # break 122 | # index_normal = np.where(normal_candidate)[0] 123 | # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ 124 | # single_X, single_base_nb_mean, single_total_bb_RD, config["nu"], config["logphase_shift"], index_normal, config['geneticmap_file']) 125 | # assert np.sum(lengths) == single_X.shape[0] 126 | # assert single_X.shape[0] == single_total_bb_RD.shape[0] 127 | # assert single_X.shape[0] == len(log_sitewise_transmat) 128 | 129 | # expression count dataframe 130 | exp_counts = pd.DataFrame.sparse.from_spmatrix( scipy.sparse.csc_matrix(adata.layers["count"]), index=adata.obs.index, columns=adata.var.index) 131 | 132 | # smooth and adjacency matrix for each sample 133 | adjacency_mat, smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, 134 | across_slice_adjacency_mat, construct_adjacency_method=config['construct_adjacency_method'], 135 | maxspots_pooling=config['maxspots_pooling'], construct_adjacency_w=config['construct_adjacency_w']) 136 | n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) 137 | print(f"Set up number of spots to pool in HMRF: {n_pooled}") 138 | 139 | # If adjacency matrix is only constructed using gene expression similarity (e.g. 
scRNA-seq data) 140 | # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization 141 | if config["construct_adjacency_method"] == "KNN" and config["construct_adjacency_w"] == 0: 142 | sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) 143 | sc.pp.log1p(adata) 144 | sc.tl.pca(adata) 145 | sc.pp.neighbors(adata) 146 | sc.tl.umap(adata) 147 | coords = adata.obsm["X_umap"] 148 | 149 | # create RDR-BAF table 150 | table_bininfo = genesnp_to_bininfo(df_gene_snp) 151 | table_bininfo['LOG_PHASE_TRANSITION'] = log_sitewise_transmat 152 | 153 | table_rdrbaf = [] 154 | for i in range(single_X.shape[2]): 155 | table_rdrbaf.append( pd.DataFrame({"BARCODES":adata.obs.index[i], "EXP":single_X[:,0,i], "TOT":single_total_bb_RD[:,i], "B":single_X[:,1,i]}) ) 156 | table_rdrbaf = pd.concat(table_rdrbaf, ignore_index=True) 157 | 158 | # create meta info table 159 | # note that table_meta.BARCODES is equal to the unique ones of table_rdrbaf.BARCODES in the original order 160 | table_meta = pd.DataFrame({"BARCODES":adata.obs.index, "SAMPLE":adata.obs["sample"], "X":coords[:,0], "Y":coords[:,1]}) 161 | if not single_tumor_prop is None: 162 | table_meta["TUMOR_PROPORTION"] = single_tumor_prop 163 | 164 | return table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp 165 | 166 | 167 | def load_tables_to_matrices(config): 168 | """ 169 | Load tables and adjacency from parse_visium_joint or parse_visium_single, and convert to HMM input matrices. 170 | """ 171 | table_bininfo = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=0, index_col=None, sep="\t") 172 | table_rdrbaf = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=0, index_col=None, sep="\t") 173 | table_meta = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=0, index_col=None, sep="\t") 174 | adjacency_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz" ) 175 | smooth_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz" ) 176 | # 177 | df_gene_snp = pd.read_csv(f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=0, index_col=None, sep="\t") 178 | df_gene_snp = df_gene_snp.replace(np.nan, None) 179 | 180 | n_spots = table_meta.shape[0] 181 | n_bins = table_bininfo.shape[0] 182 | 183 | # construct single_X 184 | # single_X = np.zeros((n_bins, 2, n_spots), dtype=int) 185 | single_X = np.zeros((n_bins, 2, n_spots)) 186 | single_X[:, 0, :] = table_rdrbaf["EXP"].values.reshape((n_bins, n_spots), order="F") 187 | single_X[:, 1, :] = table_rdrbaf["B"].values.reshape((n_bins, n_spots), order="F") 188 | 189 | # construct single_base_nb_mean, lengths 190 | single_base_nb_mean = table_bininfo["NORMAL_COUNT"].values.reshape(-1,1) / np.sum(table_bininfo["NORMAL_COUNT"].values) @ np.sum(single_X[:,0,:], axis=0).reshape(1,-1) 191 | 192 | # construct single_total_bb_RD 193 | single_total_bb_RD = table_rdrbaf["TOT"].values.reshape((n_bins, n_spots), order="F") 194 | 195 | # construct log_sitewise_transmat 196 | log_sitewise_transmat = table_bininfo["LOG_PHASE_TRANSITION"].values 197 | 198 | # construct bin info and lengths and x_gene_list 199 | df_bininfo = table_bininfo 200 | lengths = np.array([ np.sum(table_bininfo.CHR == c) for c in df_bininfo.CHR.unique() ]) 201 | 202 | # construct barcodes 203 | barcodes = table_meta["BARCODES"] 204 | 205 | # construct coords 206 | 
coords = table_meta[["X", "Y"]].values 207 | 208 | # construct single_tumor_prop 209 | single_tumor_prop = table_meta["TUMOR_PROPORTION"].values if "TUMOR_PROPORTION" in table_meta.columns else None 210 | 211 | # construct sample_list and sample_ids 212 | sample_list = [table_meta["SAMPLE"].values[0]] 213 | for i in range(1, table_meta.shape[0]): 214 | if table_meta["SAMPLE"].values[i] != sample_list[-1]: 215 | sample_list.append( table_meta["SAMPLE"].values[i] ) 216 | sample_ids = np.zeros(table_meta.shape[0], dtype=int) 217 | for s,sname in enumerate(sample_list): 218 | index = np.where(table_meta["SAMPLE"].values == sname)[0] 219 | sample_ids[index] = s 220 | 221 | # expression UMI count matrix 222 | exp_counts = pd.read_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) 223 | 224 | return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ 225 | barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts 226 | 227 | 228 | def run_parse_n_load(config): 229 | file_exists = np.array([ Path(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz").exists(), \ 230 | Path(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz").exists(), \ 231 | Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), \ 232 | Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), \ 233 | Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), \ 234 | Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists() ]) 235 | if not np.all(file_exists): 236 | # process to tables 237 | table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp = parse_visium(config) 238 | # table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat = parse_hatchetblock(config, cellsnplite_dir, bb_file) 239 | 240 | # save file 241 | p = subprocess.Popen(f"mkdir -p {config['output_dir']}/parsed_inputs", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 242 | out,err = p.communicate() 243 | 244 | table_bininfo.to_csv( f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=True, index=False, sep="\t" ) 245 | table_rdrbaf.to_csv( f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=True, index=False, sep="\t" ) 246 | table_meta.to_csv( f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=True, index=False, sep="\t" ) 247 | exp_counts.to_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) 248 | scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat ) 249 | scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat ) 250 | # 251 | df_gene_snp.to_csv( f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=True, index=False, sep="\t" ) 252 | 253 | # load and parse data 254 | return load_tables_to_matrices(config) 255 | 256 | 257 | if __name__ == "__main__": 258 | parser = argparse.ArgumentParser() 259 | parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) 260 | args = parser.parse_args() 261 | 262 | try: 263 | config = read_configuration_file(args.configfile) 264 | except: 265 | config = read_joint_configuration_file(args.configfile) 266 | 267 | print("Configurations:") 268 | for k in sorted(list(config.keys())): 269 | print(f"\t{k} : {config[k]}") 270 | 271 | _ = run_parse_n_load(config) 272 | 
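# Usage sketch (illustrative, not part of the module). Assuming
# `read_joint_configuration_file` is provided by calicost.arg_parse, as the
# imports and the __main__ block above suggest, and a configuration file exists
# at a hypothetical path, the parsed HMM inputs can be obtained with:
#
#     from calicost.arg_parse import read_joint_configuration_file
#     from calicost.parse_input import run_parse_n_load
#
#     config = read_joint_configuration_file("/path/to/configfile")
#     (lengths, single_X, single_base_nb_mean, single_total_bb_RD,
#      log_sitewise_transmat, df_bininfo, df_gene_snp, barcodes, coords,
#      single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat,
#      exp_counts) = run_parse_n_load(config)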
--------------------------------------------------------------------------------
/src/calicost/phasing.py:
--------------------------------------------------------------------------------
1 | import logging
3 | import numpy as np
4 | from numba import njit
5 | import scipy.special
6 | import scipy.sparse
7 | from sklearn.mixture import GaussianMixture
8 | from sklearn.cluster import KMeans
9 | from sklearn.metrics import adjusted_rand_score, silhouette_score
10 | from sklearn.neighbors import kneighbors_graph
11 | import networkx as nx
12 | from tqdm import trange
13 | import copy
14 | from pathlib import Path
15 | from calicost.hmm_NB_BB_phaseswitch import *
16 | from calicost.utils_distribution_fitting import *
17 | from calicost.utils_hmrf import *
18 | import warnings
19 | from statsmodels.tools.sm_exceptions import ValueWarning
20 | 
21 | 
22 | def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_RD, n_states, log_sitewise_transmat, \
23 |     params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol):
24 |     # pseudobulk HMM for phase_prob
25 |     res = pipeline_baum_welch(None, np.sum(single_X, axis=2, keepdims=True), lengths, n_states, \
26 |         np.sum(single_base_nb_mean, axis=1, keepdims=True), np.sum(single_total_bb_RD, axis=1, keepdims=True), log_sitewise_transmat, \
27 |         hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \
28 |         fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \
29 |         fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \
30 |         init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol)
31 |     # phase_prob = np.exp(scipy.special.logsumexp(res["log_gamma"][:n_states, :], axis=0))
32 |     # return phase_prob
33 |     pred = np.argmax(res["log_gamma"], axis=0)
34 |     pred_cnv = pred % n_states
35 |     phase_indicator = (pred < n_states)
36 |     refined_lengths = []
37 |     cumlen = 0
38 |     for le in lengths:
39 |         s = 0
40 |         for i, k in enumerate(pred_cnv[cumlen:(cumlen+le)]):
41 |             if i > 0 and k != pred_cnv[cumlen+i-1]:  # break whenever the CNV state changes between adjacent bins within this chromosome
42 |                 refined_lengths.append(i - s)
43 |                 s = i
44 |         refined_lengths.append(le - s)
45 |         cumlen += le
46 |     refined_lengths = np.array(refined_lengths)
47 |     return phase_indicator, refined_lengths
48 | 
49 | 
50 | def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \
51 |     params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3):
52 |     EPS_BAF = 0.05
53 |     if single_tumor_prop is None:
54 |         X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index)
55 |         tumor_prop = None
56 |     else:
57 |         X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold)
58 | 
59 |     # pseudobulk HMM for phase_prob
60 |     baf_profiles = np.zeros((X.shape[2], X.shape[0]))
61 |     pred_cnv = np.zeros((X.shape[2], X.shape[0]))
62 |     for i in range(X.shape[2]):
63 |         if np.sum(total_bb_RD[:,i]) < min_snpumi:
64 |             baf_profiles[i,:] = 0.5
65 |         else:
66 |             res = pipeline_baum_welch(None, X[:,:,i:(i+1)], lengths, n_states, base_nb_mean[:,i:(i+1)], total_bb_RD[:,i:(i+1)],
log_sitewise_transmat, \
67 |                 hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \
68 |                 fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \
69 |                 fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \
70 |                 init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol)
71 |             #
72 |             pred = np.argmax(res["log_gamma"], axis=0)
73 |             this_baf_profiles = np.where(pred < n_states, res["new_p_binom"][pred%n_states, 0], 1-res["new_p_binom"][pred%n_states, 0])
74 |             this_baf_profiles[np.abs(this_baf_profiles - 0.5) < EPS_BAF] = 0.5
75 |             baf_profiles[i,:] = this_baf_profiles
76 |             pred_cnv[i,:] = (pred % n_states)
77 | 
78 |     if single_tumor_prop is None:
79 |         n_total_spots = np.sum([ len(x) for x in initial_clone_index ])
80 |         population_baf = np.array([ 1.0*len(x)/n_total_spots for x in initial_clone_index]) @ baf_profiles
81 |     else:
82 |         n_total_spots = np.sum([ len(x) * tumor_prop[i] for i,x in enumerate(initial_clone_index) ])
83 |         population_baf = np.array([ 1.0*len(x)*tumor_prop[i]/n_total_spots for i,x in enumerate(initial_clone_index) ]) @ baf_profiles
84 |     adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1-baf_profiles)
85 |     phase_indicator = (population_baf < 0.5)
86 |     refined_lengths = []
87 |     cumlen = 0
88 |     for le in lengths:
89 |         s = 0
90 |         for i in range(le):
91 |             if i > s + 10 and np.any(np.abs(adj_baf_profiles[:,i+cumlen] - adj_baf_profiles[:,i+cumlen-1]) > 0.1):
92 |                 refined_lengths.append(i - s)
93 |                 s = i
94 |         refined_lengths.append(le - s)
95 |         cumlen += le
96 |     refined_lengths = np.array(refined_lengths)
97 |     return phase_indicator, refined_lengths
98 | 
99 | 
100 | def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, threshold):
101 |     initial_clone_index = []
102 |     for s in range(np.max(sample_ids)+1):
103 |         index = np.where(sample_ids == s)[0]
104 |         assert len(index) > 0
105 |         if single_tumor_prop is None:
106 |             tmp_clone_index = fixed_rectangle_initialization(coords[index,:], x_part, y_part)
107 |         else:
108 |             tmp_clone_index = fixed_rectangle_initialization_mix(coords[index,:], x_part, y_part, single_tumor_prop[index], threshold=threshold)
109 |         for x in tmp_clone_index:
110 |             initial_clone_index.append( index[x] )
111 |     return initial_clone_index
112 | 
--------------------------------------------------------------------------------
/src/calicost/phylogeny_startle.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import argparse
4 | import itertools
5 | import math
6 | import subprocess
7 | import numpy as np
8 | import seaborn as sns
9 | from matplotlib import pyplot as plt
10 | 
11 | import networkx as nx
13 | from collections import deque
15 | 
16 | 
17 | def get_LoH_for_phylogeny(df_seglevel_cnv, min_segments):
18 |     """
19 |     Treating LoH as irreversible point mutations, outputs a clone-by-mutation matrix for phylogeny reconstruction.
20 |     Mutation states: 0 for no LoH, 1 for losing the A allele, 2 for losing the B allele.
21 | 
22 |     Attributes
23 |     ----------
24 |     df_seglevel_cnv : pd.DataFrame, (n_obs, 3+2*n_clones)
25 |         Dataframe from cnv_*seglevel.tsv output.
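        Inferred from the parsing code below (not documented upstream): the first
        three columns are genomic coordinates, and the remaining columns come in
        A/B pairs per clone, with headers of the form "<clone_id> A", "<clone_id> B".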
26 | 
27 |     Returns
28 |     ----------
29 |     df_loh : pd.DataFrame, (n_clones, n_segments)
30 |     """
31 |     def get_shared_intervals(acn_profile):
32 |         '''
33 |         Takes in allele-specific copy numbers and outputs a segmentation of the genome such that all clones are in the same CN state within each segment.
34 | 
35 |         acn_profile : array, (n_obs, 2*n_clones)
36 |             Allele-specific integer copy numbers for each genomic bin (obs) across all clones.
37 |         '''
38 |         intervals = []
39 |         seg_acn = []
40 |         s = 0
41 |         while s < acn_profile.shape[0]:
42 |             t = np.where( ~np.all(acn_profile[s:,] == acn_profile[s,:], axis=1) )[0]
43 |             if len(t) == 0:
44 |                 intervals.append( (s, acn_profile.shape[0]) )
45 |                 seg_acn.append( acn_profile[s,:] )
46 |                 s = acn_profile.shape[0]
47 |             else:
48 |                 t = t[0]
49 |                 intervals.append( (s,s+t) )
50 |                 seg_acn.append( acn_profile[s,:] )
51 |                 s = s+t
52 |         return intervals, seg_acn
53 | 
54 |     clone_ids = [x.split(" ")[0] for x in df_seglevel_cnv.columns[ np.arange(3, df_seglevel_cnv.shape[1], 2) ] ]
55 | 
56 |     acn_profile = df_seglevel_cnv.iloc[:,3:].values
57 |     intervals, seg_acn = get_shared_intervals(acn_profile)
58 |     df_loh = []
59 |     for i, acn in enumerate(seg_acn):
60 |         if np.all(acn != 0):
61 |             continue
62 |         if intervals[i][1] - intervals[i][0] < min_segments:
63 |             continue
64 |         idx_zero = np.where(acn == 0)[0]
65 |         idx_clones = (idx_zero / 2).astype(int)
66 |         is_A = (idx_zero % 2 == 0)
67 |         # vector of mutation states
68 |         mut = np.zeros( int(len(acn) / 2), dtype=int )
69 |         mut[idx_clones] = np.where(is_A, 1, 2)
70 |         df_loh.append( pd.DataFrame(mut.reshape(1, -1), index=[f"bin_{intervals[i][0]}_{intervals[i][1]}"], columns=clone_ids) )
71 | 
72 |     df_loh = pd.concat(df_loh).T
73 |     return df_loh
74 | 
75 | 
76 | def get_binary_matrix(df_character_matrix):
77 | 
78 |     ncells = len(df_character_matrix)
79 |     binary_col_dict = {}
80 |     for column in df_character_matrix.columns:
81 |         state_list = list(df_character_matrix[column].unique())
82 |         for s in state_list:
83 |             if s != -1 and s != 0:
84 |                 state_col = np.zeros((ncells))
85 |                 state_col[df_character_matrix[column] == s] = 1
86 |                 state_col[df_character_matrix[column] == -1] = -1
87 | 
88 |                 binary_col_dict[f'{column}_{s}'] = state_col
89 | 
90 |     df_binary = pd.DataFrame(binary_col_dict, index = df_character_matrix.index, dtype=int)
91 |     return df_binary
92 | 
93 | 
94 | def generate_perfect_phylogeny(df_binary):
95 | 
96 |     solT_mut = nx.DiGraph()
97 |     solT_mut.add_node('root')
98 | 
99 |     solT_cell = nx.DiGraph()
100 |     solT_cell.add_node('root')
101 | 
102 |     df_binary = df_binary[df_binary.sum().sort_values(ascending=False).index]
103 | 
104 |     for cell_id, row in df_binary.iterrows():
105 |         if cell_id == 'root':
106 |             continue
107 | 
108 |         curr_node = 'root'
109 |         for column in df_binary.columns[row.values == 1]:
110 |             if column in solT_mut[curr_node]:
111 |                 curr_node = column
112 |             else:
113 |                 if column in solT_mut.nodes:
114 |                     raise NameError(f'{column} is being repeated')
115 |                 solT_mut.add_edge(curr_node, column)
116 |                 solT_cell.add_edge(curr_node, column)
117 |                 curr_node = column
118 | 
119 |         solT_cell.add_edge(curr_node, cell_id)
120 | 
121 |     return solT_mut, solT_cell
122 | 
123 | 
124 | def tree_to_newick(T, root=None):
125 |     if root is None:
126 |         roots = list(filter(lambda p: p[1] == 0, T.in_degree()))
127 |         assert 1 == len(roots)
128 |         root = roots[0][0]
129 |     subgs = []
130 |     while len(T[root]) == 1:
131 |         root = list(T[root])[0]
132 |     for child in T[root]:
133 |         pathlen = 0
134 |         while len(T[child]) == 1:
135 |             child = list(T[child])[0]
136 |             pathlen +=
1
137 |         if len(T[child]) > 0:
138 |             pathlen += 1
139 |             subgs.append(tree_to_newick(T, root=child) + f":{pathlen}")
140 |         else:
141 |             subgs.append( f"{child}:{pathlen}" )
142 |     return "(" + ','.join(map(str, subgs)) + ")"
143 | 
144 | 
145 | def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="startle", min_segments=3):
146 |     # get LoH data frame
147 |     # rows are clones, columns are bins, entries are 0 (no LoH), 1 (A allele LoH), or 2 (B allele LoH)
148 |     df_seglevel_cnv = pd.read_csv(f"{calicostdir}/cnv{midfix}_seglevel.tsv", header=0, sep="\t")
149 |     df_loh = get_LoH_for_phylogeny(df_seglevel_cnv, min_segments)
150 |     df_loh.to_csv(f"{outdir}/loh_matrix.tsv", header=True, index=True, sep="\t")
151 | 
152 |     # binarize
153 |     df_binary = get_binary_matrix(df_loh)
154 | 
155 |     cell_list = list(df_binary.index)
156 |     mutation_list = list(df_binary.columns)
157 |     mutation_to_index = {x: idx for idx, x in enumerate(mutation_list)}
158 | 
159 |     # one and missing indices
160 |     # one indices
161 |     one_cell_mut_list = []
162 |     for cell_idx, cell in enumerate(cell_list):
163 |         for mut_idx, mut in enumerate(mutation_list):
164 |             if df_binary.loc[cell][mut] == 1:
165 |                 one_cell_mut_list.append((cell_idx, mut_idx))
166 |     with open(f'{outdir}/loh_one_indices.txt', 'w') as out:
167 |         for cell_idx, mut_idx in one_cell_mut_list:
168 |             out.write(f'{cell_idx} {mut_idx}\n')
169 |     # missing indices
170 |     character_list = list(set(['_'.join(x.split('_')[:-1]) for x in df_binary.columns]))
171 |     missing_cell_character_list = []
172 |     for character_idx, character in enumerate(character_list):
173 |         for cell_idx, cell in enumerate(cell_list):
174 |             if df_loh.loc[cell][character] == -1:
175 |                 missing_cell_character_list.append((cell_idx, character_idx))
176 |     with open(f'{outdir}/loh_missing_indices.txt', 'w') as out:
177 |         for cell_idx, character_idx in missing_cell_character_list:
178 |             out.write(f'{cell_idx} {character_idx}\n')
179 | 
180 |     # character mutation mapping
181 |     with open(f'{outdir}/loh_character_mutation_mapping.txt', 'w') as out:
182 |         for _, character in enumerate(character_list):
183 |             character_mutation_list = [mutation_to_index[x] for x in mutation_list if x.startswith(f'{character}_')]
184 |             out.write(' '.join(map(str, character_mutation_list)) + '\n')
185 | 
186 |     # count of character states of mutations
187 |     max_allowed_homoplasy = {}
188 |     for mutation in mutation_list:
189 |         max_allowed_homoplasy[mutation] = 2
190 |     with open(f'{outdir}/loh_counts.txt', 'w') as out:
191 |         for mutation in mutation_list:
192 |             out.write(f'{max_allowed_homoplasy[mutation]}\n')
193 | 
194 |     # weights
195 |     with open(f'{outdir}/loh_weights.txt', 'w') as out:
196 |         for mutation in mutation_list:
197 |             out.write(f"1\n")
198 | 
199 |     ##### run startle #####
200 |     m_mutations = df_binary.shape[1]
201 |     n_clones = df_binary.shape[0]
202 |     command = f"{startle_bin} -m {m_mutations} -n {n_clones} {outdir}/loh_one_indices.txt {outdir}/loh_missing_indices.txt {outdir}/loh_counts.txt {outdir}/loh_character_mutation_mapping.txt {outdir}/loh_weights.txt {outdir}/loh_cpp_output.txt"
203 |     print( command )
204 |     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
205 |     out,err = p.communicate()
206 | 
207 |     # parse output
208 |     df_cpp_output = pd.read_csv(f'{outdir}/loh_cpp_output.txt', header=None, sep=' ')
209 |     df_cpp_output = df_cpp_output.rename(columns={0:'cell_idx', 1:'mut_idx', 2:'state_idx', 3:'entry'})
210 |     df_cpp_output['name'] = df_cpp_output.apply(lambda x:
f"{mutation_list[x['mut_idx']]}_{x['state_idx']}", axis =1) 211 | 212 | sol_columns = list(df_cpp_output['name'].unique()) 213 | nsol_columns = len(sol_columns) 214 | sol_entries = np.zeros((n_clones, nsol_columns), dtype=int) 215 | for mut_idx, mut in enumerate(sol_columns): 216 | for cell_idx in df_cpp_output[(df_cpp_output['entry'] == 1) & (df_cpp_output['name'] == mut)]['cell_idx']: 217 | sol_entries[cell_idx][mut_idx] = 1 218 | df_sol_binary = pd.DataFrame(sol_entries, columns=sol_columns, index=cell_list) 219 | 220 | solT_mut, solT_cell = generate_perfect_phylogeny(df_sol_binary) 221 | with open(f'{outdir}/loh_tree.newick', 'w') as out: 222 | out.write(f"{tree_to_newick(solT_cell)};") 223 | 224 | 225 | if __name__ == "__main__": 226 | parser = argparse.ArgumentParser() 227 | parser.add_argument("-c", "--calicost_dir", help="Directory of a specific random initialization of CalicoST", type=str) 228 | parser.add_argument("-s", "--startle_bin", help="The startle executable path", default="startle", type=str) 229 | parser.add_argument("-p", "--ploidy", help="Ploidy of allele-specific integer copy numbers.", default="", type=str) 230 | parser.add_argument("--min_segments", help="Minimum number of genome segment to keep an LOH event in phylogenetic tree reconstruction.", default=3, type=int) 231 | parser.add_argument("-o", "--outputdir", help="output directory", type=str) 232 | args = parser.parse_args() 233 | 234 | output_startle_input_files(args.calicost_dir, args.outputdir, midfix=args.ploidy, startle_bin=args.startle_bin, min_segments=args.min_segments) -------------------------------------------------------------------------------- /src/calicost/phylogeography.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import numpy as np 3 | import pandas as pd 4 | import copy 5 | from matplotlib import pyplot as plt 6 | import seaborn 7 | from ete3 import Tree 8 | import networkx as nx 9 | 10 | 11 | def clone_centers(coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None, tumorprop_threshold=0.6): 12 | df_centers = [] 13 | for l in np.unique(clone_label): 14 | # get spot indices of this clone 15 | index = np.where(clone_label == l)[0] if single_tumor_prop is None else np.where((clone_label == l) & (single_tumor_prop > tumorprop_threshold))[0] 16 | # if the index contains multiple slices, get the most abundance slice 17 | if not sample_ids is None: 18 | most_abundance_slice = pd.Series(sample_ids[index]).mode().values[0] 19 | index = index[ sample_ids[index] == most_abundance_slice ] 20 | # get clone cencer 21 | if single_tumor_prop is None: 22 | center = np.mean(coords[index], axis=0) 23 | else: 24 | center = single_tumor_prop[index].dot(coords[index]) / np.sum(single_tumor_prop[index]) 25 | df_centers.append( pd.DataFrame({'clone':l, 'x':center[0], 'y':center[1]}, index=[0]) ) 26 | df_centers = pd.concat(df_centers, ignore_index=True) 27 | return df_centers 28 | 29 | 30 | def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None): 31 | # load tree 32 | with open(newick_file, 'r') as fp: 33 | t = Tree(fp.readline()) 34 | 35 | # get the 36 | list_leaf_nodes = [] 37 | list_internal_nodes = [] 38 | rootnode = np.sort( [leaf.name.replace('clone','') for leaf in t.iter_leaves() ] ) 39 | rootnode = "ancestor" + "_".join( rootnode ) 40 | for node in t.traverse(): 41 | leafnames = np.sort( [leaf.name.replace('clone','') for leaf in node.iter_leaves() ] ) 
42 | if node.name == "": 43 | node.name = "ancestor" + "_".join( leafnames ) 44 | 45 | if node.is_leaf(): 46 | list_leaf_nodes.append(node.name) 47 | else: 48 | list_internal_nodes.append(node.name) 49 | 50 | print(f"root node is {rootnode}") 51 | print(f"a list of leaf nodes: {list_leaf_nodes}") 52 | print(f"a list of internal nodes: {list_internal_nodes}") 53 | 54 | # set up multivariate Gaussian distribution to estimate internal node location 55 | N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) 56 | # pairwise distance 57 | G = nx.Graph() 58 | G.add_nodes_from( list_leaf_nodes + list_internal_nodes ) 59 | for nodename in list_leaf_nodes: 60 | node = t&f"{nodename}" 61 | while not node.is_root(): 62 | p = node.up 63 | G.add_edge(node.name, p.name, weight=node.dist) 64 | node = p 65 | 66 | G.edges(data=True) 67 | nx_pdc = dict( nx.all_pairs_dijkstra(G) ) 68 | 69 | # covariance matrix based on pairwise distance 70 | N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) 71 | Sigma_square = np.zeros((N_nodes, N_nodes)) 72 | base_var = max( np.max(np.abs(coords[:,0])), np.max(np.abs(coords[:,1])) ) 73 | 74 | for n1, name1 in enumerate(list_leaf_nodes + list_internal_nodes): 75 | for n2, name2 in enumerate(list_leaf_nodes + list_internal_nodes): 76 | if n1 == n2: 77 | Sigma_square[n1, n2] = base_var + nx_pdc[rootnode][0][name1] 78 | else: 79 | lca_node = t.get_common_ancestor([name1, name2]) 80 | # print( name1, name2, lca_node.name ) 81 | if lca_node.name == rootnode: 82 | Sigma_square[n1, n2] = base_var 83 | else: 84 | Sigma_square[n1, n2] = base_var + nx_pdc[rootnode][0][lca_node.name] 85 | 86 | # mean position 87 | mu_1 = np.zeros(( len(list_leaf_nodes),2 )) 88 | mu_2 = np.zeros(( len(list_internal_nodes),2 )) 89 | 90 | # partition covariance matrix 91 | Sigma_11 = Sigma_square[:len(list_leaf_nodes), :len(list_leaf_nodes)] 92 | Sigma_12 = Sigma_square[:len(list_leaf_nodes), :][:, len(list_leaf_nodes):] 93 | Sigma_22 = Sigma_square[len(list_leaf_nodes):, len(list_leaf_nodes):] 94 | 95 | # get leaf node locations 96 | df_centers = clone_centers(coords, clone_label, single_tumor_prop=single_tumor_prop, 97 | sample_list=sample_list, sample_ids=sample_ids) 98 | obs_1 = df_centers.set_index('clone').loc[list_leaf_nodes].values 99 | 100 | # conditional expectation internal node position | leaf node position = mu_1 101 | expected_internal = mu_2 + Sigma_12.T @ (np.linalg.inv(Sigma_11) @ (obs_1 - mu_1)) 102 | df_centers = pd.concat([ df_centers, pd.DataFrame({'clone':list_internal_nodes, 'x':expected_internal[:,0], 'y':expected_internal[:,1]}) ]) 103 | 104 | # add to tree features 105 | for node in t.traverse(): 106 | i = np.where(df_centers.clone.values == node.name)[0][0] 107 | node.add_features( x=df_centers.x.values[i], y=df_centers.y.values[i] ) 108 | 109 | return t -------------------------------------------------------------------------------- /src/calicost/simple_sctransform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import statsmodels 4 | import statsmodels.api as sm 5 | from KDEpy import FFTKDE 6 | from scipy.special import psi, polygamma 7 | 8 | 9 | # copied from sctransformPy 10 | def theta_ml(y,mu): 11 | n = y.size 12 | weights = np.ones(n) 13 | limit = 10 14 | _EPS = np.finfo(float).eps 15 | eps = (_EPS)**0.25 16 | # inner function 17 | def score(n,th,mu,y,w): 18 | return sum(w*(psi(th + y) - psi(th) + np.log(th) + 1 - np.log(th + mu) - (y + th)/(mu + th))) 19 | # inner function 20 | 
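    # score() above is the first derivative of the weighted NB log-likelihood with
    # respect to theta; info() below is its negative second derivative (the observed
    # information). The loop further down therefore performs Newton-Raphson updates
    # t0 <- t0 + score/info.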
def info(n,th,mu,y,w):
20 |         return sum(w*( - polygamma(1,th + y) + polygamma(1,th) - 1/th + 2/(mu + th) - (y + th)/(mu + th)**2))
21 |     # initialize the Newton-Raphson iteration
22 |     t0 = n/sum(weights*(y/mu - 1)**2)
23 |     it = 0
24 |     de = 1
25 |     # Newton-Raphson iteration: step size is score / information
26 |     while(it + 1 < limit and abs(de) > eps):
27 |         it+=1
28 |         t0 = abs(t0)
29 |         i = info(n, t0, mu, y, weights)
30 |         de = score(n, t0, mu, y, weights)/i
31 |         t0 += de
32 |         t0 = max(t0,0)
33 |     # note that t0 is the dispersion parameter: var = mu + mu^2 / t0
34 |     return t0
35 | 
36 | 
37 | 
38 | def sample_gene_indices(log_geometric_mean, n_subsample, n_partitions=10):
39 |     bounds = np.linspace(np.min(log_geometric_mean), np.max(log_geometric_mean), n_partitions+1)
40 |     bounds[-1] += 1e-4
41 |     idx_subsample = []
42 |     for p in range(1, n_partitions):
43 |         tmpidx = np.where(np.logical_and(log_geometric_mean >= bounds[p-1], log_geometric_mean < bounds[p]))[0]
44 |         np.random.shuffle(tmpidx)
45 |         idx_subsample.append(tmpidx[:int(n_subsample/n_partitions)])
46 |     idx_subsample = np.sort(np.concatenate(idx_subsample))
47 |     if len(idx_subsample) < n_subsample:
48 |         mask = np.array([True] * len(log_geometric_mean))
49 |         mask[idx_subsample] = False
50 |         idx_rest = np.arange(len(log_geometric_mean))[mask]
51 |         np.random.shuffle(idx_rest)
52 |         n_rest = n_subsample - len(idx_subsample)
53 |         idx_subsample = np.sort(np.concatenate([idx_subsample, idx_rest[:n_rest]]))
54 |     return idx_subsample
55 | 
56 | 
57 | def estimate_logmu_dispersion(counts, bw=None):
58 |     '''
59 |     counts of size number spots * number genes.
60 |     '''
61 |     N = counts.shape[0]
62 |     G = counts.shape[1]
63 |     eps = 1
64 |     geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps
65 |     log_geometric_mean = np.log( geometric_mean )
66 |     spot_umi = counts.sum(axis=1)
67 |     # fitting logmu and theta (dispersion)
68 |     logmu = np.zeros(G)
69 |     theta = np.zeros(G)
70 |     for i in range(G):
71 |         y = counts[:,i]
72 |         logmu[i] = np.log( np.sum(y) / np.sum(spot_umi) )
73 |         mu = spot_umi * np.exp(logmu[i])
74 |         theta[i] = theta_ml(y, mu)
75 |     # ratio between geometric mean and dispersion parameter theta
76 |     log_ratio = np.log(1 + geometric_mean / theta)
77 |     # smoothing parameter for the kernel regression
78 |     if bw is None:
79 |         z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean)
80 |         z.evaluate()
81 |         bw_adjust = 3
82 |         bw = z.bw*bw_adjust
83 |     # local-linear kernel regression for log_ratio (the log ratio between geometric mean expression and dispersion)
84 |     kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[:,None], ['c'], reg_type='ll', bw=[bw])
85 |     pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0]
86 |     pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1)
87 |     return logmu, pred_theta
88 | 
89 | 
90 | def pearson_residual(counts, logmu, pred_theta):
91 |     '''
92 |     counts of size number spots * number genes.
93 |     '''
94 |     N = counts.shape[0]
95 |     G = counts.shape[1]
96 |     spot_umi = counts.sum(axis=1)
97 |     # predicted mean and variance under NB model
98 |     mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1)
99 |     vard = mud + mud**2 / pred_theta.reshape(1,-1)
100 |     X = (counts * 1.0 - mud) / vard**0.5
101 |     # clipping
102 |     clip = np.sqrt(counts.shape[0]/30)
103 |     X[X > clip] = clip
104 |     X[X < -clip] = -clip
105 |     return X
106 | 
107 | 
108 | def deviance_residual(counts, logmu, pred_theta):
109 |     '''
110 |     Equation is taken from the Analytic Pearson Residuals paper by Lause et al.
111 |     counts of size number spots * number genes.
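    In the notation of the code below, each residual is
        sign * sqrt( 2 * ( y*log(y/mu) - (y+theta)*log((y+theta)/(mu+theta)) ) )
    with mu = exp(logmu) * spot_umi, where the y*log(y/mu) term is set to 0 when
    y = 0. Note (a description of this implementation, not of the paper): sign is
    the boolean indicator (counts > mud), so under-expressed entries are zeroed
    rather than negated.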
112 | ''' 113 | N = counts.shape[0] 114 | G = counts.shape[1] 115 | spot_umi = counts.sum(axis=1) 116 | # predicted mean 117 | mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) 118 | sign = (counts > mud) 119 | part1 = counts * np.log(counts / mud) 120 | part1[counts==0] = 0 121 | part2 = (counts + pred_theta) * np.log( (counts + pred_theta) / (mud + pred_theta) ) 122 | X = sign * np.sqrt(2 * (part1 - part2)) 123 | return X 124 | 125 | 126 | def estimate_logmu_dispersion2(counts, n_subsample=None, bw=None): 127 | ''' 128 | counts of size number spots * number genes. 129 | ''' 130 | N = counts.shape[0] 131 | G = counts.shape[1] 132 | eps = 1 133 | geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps 134 | log_geometric_mean = np.log( geometric_mean ) 135 | spot_umi = counts.sum(axis=1) 136 | logmu = np.log( np.sum(counts, axis=0) / np.sum(spot_umi) ) 137 | # fitting theta (dispersion) 138 | genes_subsample = np.array([i for i in range(G) if geometric_mean[i] > 0]) 139 | if not (n_subsample is None): 140 | np.random.seed(0) 141 | genes_subsample = sample_gene_indices(log_geometric_mean, n_subsample) 142 | theta = np.zeros(len(genes_subsample)) 143 | for idx,i in enumerate(genes_subsample): 144 | y = counts[:,i] 145 | mu = spot_umi * np.exp(logmu[i]) 146 | theta[idx] = theta_ml(y, mu) 147 | # ratio between geometric mean and dispersion parameter theta 148 | log_ratio = np.log(1 + geometric_mean[genes_subsample] / theta) 149 | # smoothing parameter for kernel ridge regression 150 | if bw is None: 151 | z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean[genes_subsample]) 152 | z.evaluate(); 153 | bw_adjust = 3 154 | bw = z.bw*bw_adjust 155 | # kernel ridge regression for log_ratio (the log ratio between geometric mean expression and dispersion) 156 | kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[genes_subsample][:,None], ['c'], reg_type='ll', bw=[bw]) 157 | pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0] 158 | pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1) 159 | return logmu, pred_theta 160 | 161 | 162 | def pearson_residual2(counts, logmu, pred_theta): 163 | ''' 164 | counts of size number spots * number genes. 
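    Identical to pearson_residual above except for the clipping threshold: residuals
    here are clipped at +/- sqrt(n_spots) rather than +/- sqrt(n_spots/30).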
165 | ''' 166 | N = counts.shape[0] 167 | G = counts.shape[1] 168 | spot_umi = counts.sum(axis=1) 169 | # predicted mean and variance under NB model 170 | mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) 171 | vard = mud + mud**2 / pred_theta.reshape(1,-1) 172 | X = (counts * 1.0 - mud) / vard**0.5 173 | # clipping 174 | clip = np.sqrt(counts.shape[0]) 175 | X[X > clip] = clip 176 | X[X < -clip] = -clip 177 | return X 178 | -------------------------------------------------------------------------------- /src/calicost/utils_distribution_fitting.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import inspect 3 | import logging 4 | 5 | import numpy as np 6 | import scipy 7 | from scipy import linalg, special 8 | from scipy.special import logsumexp, loggamma 9 | import scipy.integrate 10 | import scipy.stats 11 | from numba import jit, njit 12 | from sklearn import cluster 13 | from sklearn.utils import check_random_state 14 | import statsmodels 15 | import statsmodels.api as sm 16 | from statsmodels.base.model import GenericLikelihoodModel 17 | import os 18 | 19 | os.environ["MKL_NUM_THREADS"] = "1" 20 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 21 | os.environ["OMP_NUM_THREADS"] = "1" 22 | 23 | 24 | def convert_params(mean, std): 25 | """ 26 | Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports 27 | 28 | See https://mathworld.wolfram.com/NegativeBinomialDistribution.html 29 | """ 30 | p = mean/std**2 31 | n = mean*p/(1.0 - p) 32 | return n, p 33 | 34 | 35 | class Weighted_NegativeBinomial(GenericLikelihoodModel): 36 | """ 37 | Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. 38 | This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) 39 | 40 | Attributes 41 | ---------- 42 | endog : array, (n_samples,) 43 | Y values. 44 | 45 | exog : array, (n_samples, n_features) 46 | Design matrix. 47 | 48 | weights : array, (n_samples,) 49 | Sample weights. 50 | 51 | exposure : array, (n_samples,) 52 | Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. 
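
    Examples
    ----------
    A minimal sketch on synthetic data (all names below are illustrative, not from the pipeline):

    y = np.random.poisson(5, size=100)                    # observed counts
    exog = np.ones((100, 1))                              # intercept-only design
    model = Weighted_NegativeBinomial(y, exog, weights=np.ones(100), exposure=np.full(100, 100.0))
    res = model.fit(disp=0, maxiter=500)
    log_mu, alpha = res.params[:-1], res.params[-1]       # res.params[-1] is the fitted 'alpha' appended by fit()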
53 | """ 54 | def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): 55 | super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) 56 | self.weights = weights 57 | self.exposure = exposure 58 | self.seed = seed 59 | # 60 | def nloglikeobs(self, params): 61 | nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure 62 | nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) 63 | n, p = convert_params(nb_mean, nb_std) 64 | llf = scipy.stats.nbinom.logpmf(self.endog, n, p) 65 | neg_sum_llf = -llf.dot(self.weights) 66 | return neg_sum_llf 67 | # 68 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 69 | self.exog_names.append('alpha') 70 | if start_params is None: 71 | if hasattr(self, 'start_params'): 72 | start_params = self.start_params 73 | else: 74 | start_params = np.append(0.1 * np.ones(self.nparams), 0.01) 75 | 76 | return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, 77 | maxiter=maxiter, maxfun=maxfun, 78 | **kwds) 79 | 80 | 81 | class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): 82 | def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): 83 | super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) 84 | self.weights = weights 85 | self.exposure = exposure 86 | self.seed = seed 87 | self.tumor_prop = tumor_prop 88 | # 89 | def nloglikeobs(self, params): 90 | nb_mean = self.exposure * (self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop) 91 | nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) 92 | n, p = convert_params(nb_mean, nb_std) 93 | llf = scipy.stats.nbinom.logpmf(self.endog, n, p) 94 | neg_sum_llf = -llf.dot(self.weights) 95 | return neg_sum_llf 96 | # 97 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 98 | self.exog_names.append('alpha') 99 | if start_params is None: 100 | if hasattr(self, 'start_params'): 101 | start_params = self.start_params 102 | else: 103 | start_params = np.append(0.1 * np.ones(self.nparams), 0.01) 104 | return super(Weighted_NegativeBinomial_mix, self).fit(start_params=start_params, 105 | maxiter=maxiter, maxfun=maxfun, 106 | **kwds) 107 | 108 | 109 | class Weighted_BetaBinom(GenericLikelihoodModel): 110 | """ 111 | Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. 112 | This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) 113 | 114 | Attributes 115 | ---------- 116 | endog : array, (n_samples,) 117 | Y values. 118 | 119 | exog : array, (n_samples, n_features) 120 | Design matrix. 121 | 122 | weights : array, (n_samples,) 123 | Sample weights. 124 | 125 | exposure : array, (n_samples,) 126 | Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
127 | """ 128 | def __init__(self, endog, exog, weights, exposure, **kwds): 129 | super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) 130 | self.weights = weights 131 | self.exposure = exposure 132 | # 133 | def nloglikeobs(self, params): 134 | a = (self.exog @ params[:-1]) * params[-1] 135 | b = (1 - self.exog @ params[:-1]) * params[-1] 136 | llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) 137 | neg_sum_llf = -llf.dot(self.weights) 138 | return neg_sum_llf 139 | # 140 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 141 | self.exog_names.append("tau") 142 | if start_params is None: 143 | if hasattr(self, 'start_params'): 144 | start_params = self.start_params 145 | else: 146 | start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) 147 | return super(Weighted_BetaBinom, self).fit(start_params=start_params, 148 | maxiter=maxiter, maxfun=maxfun, 149 | **kwds) 150 | 151 | 152 | class Weighted_BetaBinom_mix(GenericLikelihoodModel): 153 | def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): 154 | super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) 155 | self.weights = weights 156 | self.exposure = exposure 157 | self.tumor_prop = tumor_prop 158 | # 159 | def nloglikeobs(self, params): 160 | a = (self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] 161 | b = ((1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] 162 | llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) 163 | neg_sum_llf = -llf.dot(self.weights) 164 | return neg_sum_llf 165 | # 166 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 167 | self.exog_names.append("tau") 168 | if start_params is None: 169 | if hasattr(self, 'start_params'): 170 | start_params = self.start_params 171 | else: 172 | start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) 173 | return super(Weighted_BetaBinom_mix, self).fit(start_params=start_params, 174 | maxiter=maxiter, maxfun=maxfun, 175 | **kwds) 176 | 177 | 178 | class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): 179 | def __init__(self, endog, exog, tau, weights, exposure, **kwds): 180 | super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) 181 | self.tau = tau 182 | self.weights = weights 183 | self.exposure = exposure 184 | # 185 | def nloglikeobs(self, params): 186 | a = (self.exog @ params) * self.tau 187 | b = (1 - self.exog @ params) * self.tau 188 | llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) 189 | neg_sum_llf = -llf.dot(self.weights) 190 | return neg_sum_llf 191 | # 192 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 193 | if start_params is None: 194 | if hasattr(self, 'start_params'): 195 | start_params = self.start_params 196 | else: 197 | start_params = 0.1 * np.ones(self.nparams) 198 | 199 | return super(Weighted_BetaBinom_fixdispersion, self).fit(start_params=start_params, 200 | maxiter=maxiter, maxfun=maxfun, 201 | **kwds) 202 | 203 | 204 | class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): 205 | def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): 206 | super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds) 207 | self.tau = tau 208 | self.weights = weights 209 | self.exposure = exposure 210 | self.tumor_prop = tumor_prop 211 | # 212 | def nloglikeobs(self, params): 213 | a = 
class Weighted_BetaBinom_mix(GenericLikelihoodModel):
    """
    Beta-binomial model for a tumor-normal mixture: the success probability is p * tumor_prop + 0.5 * (1 - tumor_prop) with p = exog @ params[:-1], and tau = params[-1] as in Weighted_BetaBinom.
    """
    def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds):
        super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds)
        self.weights = weights
        self.exposure = exposure
        self.tumor_prop = tumor_prop
    #
    def nloglikeobs(self, params):
        a = (self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1]
        b = ((1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1]
        llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if "tau" not in self.exog_names:
            self.exog_names.append("tau")
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = np.append(0.5 / self.exog.shape[1] * np.ones(self.nparams), 1)
        return super(Weighted_BetaBinom_mix, self).fit(start_params=start_params,
                                                       maxiter=maxiter, maxfun=maxfun,
                                                       **kwds)


class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel):
    """
    Weighted_BetaBinom with the dispersion tau fixed; params are only the success-probability coefficients.
    """
    def __init__(self, endog, exog, tau, weights, exposure, **kwds):
        super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds)
        self.tau = tau
        self.weights = weights
        self.exposure = exposure
    #
    def nloglikeobs(self, params):
        a = (self.exog @ params) * self.tau
        b = (1 - self.exog @ params) * self.tau
        llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = 0.1 * np.ones(self.nparams)

        return super(Weighted_BetaBinom_fixdispersion, self).fit(start_params=start_params,
                                                                 maxiter=maxiter, maxfun=maxfun,
                                                                 **kwds)


class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel):
    """
    Weighted_BetaBinom_mix with the dispersion tau fixed.
    """
    def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds):
        super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds)
        self.tau = tau
        self.weights = weights
        self.exposure = exposure
        self.tumor_prop = tumor_prop
    #
    def nloglikeobs(self, params):
        a = (self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau
        b = ((1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau
        llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = 0.1 * np.ones(self.nparams)

        return super(Weighted_BetaBinom_fixdispersion_mix, self).fit(start_params=start_params,
                                                                     maxiter=maxiter, maxfun=maxfun,
                                                                     **kwds)


class BAF_Binom(GenericLikelihoodModel):
    r"""
    Binomial model endog ~ Binom(exposure, p) with a scaled logistic link p = scaling / (1 + exp(-(exog @ params) + offset)).
    This model fits the binomial params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params)

    Attributes
    ----------
    endog : array, (n_samples,)
        Y values.

    exog : array, (n_samples, n_features)
        Design matrix.

    weights : array, (n_samples,)
        Sample weights.

    exposure : array, (n_samples,)
        Total number of trials. In the BAF case, this is the total number of SNP-covering UMIs.
    """
    def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds):
        super(BAF_Binom, self).__init__(endog, exog, **kwds)
        self.weights = weights
        self.exposure = exposure
        self.offset = offset
        self.scaling = scaling
    #
    def nloglikeobs(self, params):
        linear_term = self.exog @ params
        p = self.scaling / (1 + np.exp(-linear_term + self.offset))
        llf = scipy.stats.binom.logpmf(self.endog, self.exposure, p)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = 0.5 / self.exog.shape[1] * np.ones(self.nparams)
        return super(BAF_Binom, self).fit(start_params=start_params,
                                          maxiter=maxiter, maxfun=maxfun,
                                          **kwds)
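
# Small numeric check of BAF_Binom's scaled logistic link (hypothetical values): with
# scaling = 0.5 and offset = 0, the success probability stays in (0, 0.5).
if __name__ == "__main__":
    exog = np.array([[1.0], [0.0], [-2.0]])
    params = np.array([0.8])
    scaling, offset = 0.5, 0.0
    p = scaling / (1 + np.exp(-(exog @ params) + offset))
    print(p)  # approximately [0.345, 0.25, 0.084]; p[1] is exactly 0.25 since sigmoid(0) = 0.5
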
--------------------------------------------------------------------------------
/src/calicost/utils_phase_switch.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import trange
import scipy
import scipy.special


def get_position_cM_table(chr_pos_vector, geneticmap_file):
    """
    Parameters
    ----------
    chr_pos_vector : list of pairs
        List of (chr, pos) pairs of SNPs.

    geneticmap_file : str
        Path to the genetic map table with columns (chrom, pos, pos_cm).
    """
    df = pd.read_csv(geneticmap_file, header=0, sep="\t")
    # remove chrX
    df = df[df.chrom.isin( [f"chr{i}" for i in range(1,23)] )]
    # check the chromosome names
    if not ("chr" in str(chr_pos_vector[0][0])):
        df["chrom"] = [int(x[3:]) for x in df.chrom]
    df = df.sort_values(by=["chrom", "pos"])
    ref_chrom = np.array(df.chrom)
    ref_pos = np.array(df.pos)
    ref_cm = np.array(df.pos_cm)
    # also sort the input argument
    chr_pos_vector.sort()
    # find the centimorgan values (interpolate between (k-1)-th and k-th rows in centimorgan tables)
    position_cM = np.ones(len(chr_pos_vector)) * np.nan
    k = 0
    for i,x in enumerate(chr_pos_vector):
        chrname = x[0]
        pos = x[1]
        while k < len(ref_chrom) and (ref_chrom[k] < chrname or (ref_chrom[k] == chrname and ref_pos[k] < pos)):
            k += 1
        if k < len(ref_chrom) and ref_chrom[k] == chrname and ref_pos[k] >= pos:
            if k > 0 and ref_chrom[k-1] == chrname:
                position_cM[i] = ref_cm[k-1] + (pos - ref_pos[k-1]) / (ref_pos[k] - ref_pos[k-1]) * (ref_cm[k] - ref_cm[k-1])
            else:
                position_cM[i] = (pos - 0) / (ref_pos[k] - 0) * (ref_cm[k] - 0)
        else:
            position_cM[i] = ref_cm[k-1]
    return position_cM


def compute_phase_switch_probability_position(position_cM, chr_pos_vector, nu=1, min_prob=1e-20):
    """
    Parameters
    ----------
    position_cM : array, (number of SNP positions)
        Centimorgan of the SNP located at each entry.

    chr_pos_vector : list of pairs
        List of (chr, pos) pairs of SNPs. It is used to identify the start of a new chromosome.

    nu : float
        Scaling factor on the genetic distance in the switch-probability formula.

    min_prob : float
        Floor on the phase-switch probability.
    """
    phase_switch_prob = np.ones(len(position_cM)) * min_prob
    for i,cm in enumerate(position_cM[:-1]):
        cm_next = position_cM[i+1]
        if np.isnan(cm) or np.isnan(cm_next) or chr_pos_vector[i][0] != chr_pos_vector[i+1][0]:
            continue
        assert cm <= cm_next
        d = cm_next - cm
        phase_switch_prob[i] = (1 - np.exp(-2 * nu * d)) / 2
    phase_switch_prob[phase_switch_prob < min_prob] = min_prob
    return phase_switch_prob


def duplicate_RD(chr_baf, pos_baf, chr_rd, start_rd, end_rd, tumor_rd, normal_rd):
    tumor_reads = np.ones(len(chr_baf)) * np.nan
    normal_reads = np.ones(len(chr_baf)) * np.nan
    idx = 0
    for i in range(len(chr_baf)):
        while idx < len(chr_rd) and (chr_rd[idx] < chr_baf[i] or (chr_rd[idx] == chr_baf[i] and end_rd[idx] < pos_baf[i])):
            idx += 1
        if idx < len(chr_rd) and chr_rd[idx] == chr_baf[i] and end_rd[idx] >= pos_baf[i] and start_rd[idx] <= pos_baf[i]:
            tumor_reads[i] = tumor_rd[idx]
            normal_reads[i] = normal_rd[idx]
    return tumor_reads, normal_reads


def generate_input_from_HATCHet(hatchetdir, output_picklefile, geneticmap_file, rdrfile="abin/bulk.bb", baffile="baf/bulk.1bed", phasefile="phase/phased.vcf.gz", with_chr_prefix=True):
    """
    Read HATCHet RDR/BAF/phasing outputs and construct HMM inputs; geneticmap_file is needed by get_position_cM_table to compute site-wise phase-switch probabilities.
    """
    if with_chr_prefix:
        unique_chrs = [f"chr{i}" for i in range(1, 23)]
    else:
        unique_chrs = np.arange(1, 23)

    ### load hatchet outputs ###
    if Path(output_picklefile).exists():
        # RDR file
        df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t")
        df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True)
        df_all.sort_values(by=["#CHR", "START"], inplace=True)
        # samples
        unique_samples = np.unique(df_all["SAMPLE"])
        # allele counts
        df_baf = pd.read_pickle(output_picklefile)
    else:
        # RDR file
        df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t")
        df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True)
        df_all.sort_values(by=["#CHR", "START"], inplace=True)
        # samples
        unique_samples = np.unique(df_all["SAMPLE"])
        # allele counts for individual SNPs
        def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples):
            tmpdf = pd.read_csv(f"{hatchetdir}/{baffile}", header=None, sep="\t", names=["CHR", "POS", "SAMPLE", "REF", "ALT"])
            df_baf = []
            for chrname in unique_chrs:
                tmp = tmpdf[tmpdf.CHR == chrname]
                list_pos =
[set(list(tmp[tmp["SAMPLE"] == s].POS)) for s in unique_samples] # SNP set of each individual sample 111 | shared_pos = set.intersection(*list_pos) # SNPs that are shared across samples 112 | index = np.array([i for i in range(tmp.shape[0]) if tmp.iloc[i,1] in shared_pos]) 113 | tmp = tmp.iloc[index,:] 114 | tmp.sort_values(by=["POS", "SAMPLE"], inplace=True) 115 | df_baf.append( tmp ) 116 | df_baf = pd.concat(df_baf, ignore_index=True) 117 | return df_baf 118 | df_baf = load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples) 119 | # reference-based phasing results 120 | df_phase = pd.read_csv(f"{hatchetdir}/{phasefile}", comment="#", sep="\t", \ 121 | names=["CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLENAME"]) 122 | df_phase = df_phase[(df_phase.SAMPLENAME=="0|1") | (df_phase.SAMPLENAME=="1|0")] 123 | print("HATCHet dataframes loaded.") 124 | 125 | ### gather phased BAF info ### 126 | df_combined_baf = [] 127 | for chrname in unique_chrs: 128 | tmpdf_baf = df_baf[df_baf.CHR == chrname] 129 | tmpdf_phase = df_phase[df_phase.CHR == chrname][["POS", "SAMPLENAME"]] 130 | tmpdf_baf = tmpdf_baf.join( tmpdf_phase.set_index("POS"), on="POS") 131 | tmpdf_baf = tmpdf_baf[~tmpdf_baf.SAMPLENAME.isnull()] 132 | tmpdf_baf["B_count"] = np.where(tmpdf_baf.SAMPLENAME=="0|1", tmpdf_baf.REF, tmpdf_baf.ALT) 133 | tmpdf_baf["DP"] = tmpdf_baf.REF + tmpdf_baf.ALT 134 | df_combined_baf.append( tmpdf_baf ) 135 | df_combined_baf = pd.concat(df_combined_baf, ignore_index=True) 136 | df_combined_baf.iloc[:,0] = pd.Categorical(df_combined_baf.CHR, categories=unique_chrs, ordered=True) 137 | df_combined_baf.sort_values(by=["CHR", "POS"], inplace=True) 138 | df_baf = df_combined_baf 139 | 140 | ### duplicate RDR info for each SNP ### 141 | df_baf["TOTAL_READS"] = np.nan 142 | df_baf["NORMAL_READS"] = np.nan 143 | for s in unique_samples: 144 | index = np.where(df_baf["SAMPLE"] == s)[0] 145 | index_rd = np.where(df_all["SAMPLE"] == s)[0] 146 | tumor_reads, normal_reads = duplicate_RD(np.array(df_baf.iloc[index,:].CHR.cat.codes), np.array(df_baf.iloc[index,:].POS), \ 147 | np.array(df_all.iloc[index_rd,0].cat.codes), np.array(df_all.iloc[index_rd,:].START), np.array(df_all.iloc[index_rd,:].END), \ 148 | np.array(df_all.iloc[index_rd,:].TOTAL_READS), np.array(df_all.iloc[index_rd,:].NORMAL_READS)) 149 | df_baf.iloc[index, -2] = tumor_reads 150 | df_baf.iloc[index, -1] = normal_reads 151 | # remove SNP positions with TOTAL_READS=NAN (if NAN occurs in one sample, remove the corresponding SNPs for the other samples too) 152 | def remove_nan_RD(df_baf): 153 | idx_nan = np.where(np.logical_or( df_baf.TOTAL_READS.isnull(), df_baf.NORMAL_READS.isnull() ))[0] 154 | chr = np.array(df_baf.CHR) 155 | pos = np.array(df_baf.POS) 156 | chr_pos = np.array([f"{chr[i]}_{pos[i]}" for i in range(len(chr))]) 157 | nan_chr_pos = set(list(chr_pos[idx_nan])) 158 | idx_remain = np.array([i for i,snpid in enumerate(chr_pos) if not (snpid in nan_chr_pos)]) 159 | df_baf = df_baf.iloc[idx_remain, :] 160 | return df_baf 161 | df_baf = remove_nan_RD(df_baf) 162 | df_baf.to_pickle(output_picklefile) 163 | print("SNP-level BAF and bin-level RDR paired up.") 164 | 165 | ### from BAF, RDR table, generate HMM input ### 166 | lengths = np.array([ np.sum(np.logical_and(df_baf["CHR"]==chrname, df_baf["SAMPLE"]==unique_samples[0])) for chrname in unique_chrs ]) 167 | 168 | X = np.zeros(( np.sum(lengths), 2, len(unique_samples) )) 169 | base_nb_mean = np.zeros((np.sum(lengths), len(unique_samples) )) 170 | total_bb_RD = 
np.zeros((np.sum(lengths), len(unique_samples) ))

    for k,s in enumerate(unique_samples):
        df = df_baf[df_baf["SAMPLE"] == s]
        X[:,0,k] = df.TOTAL_READS
        X[:,1,k] = df.B_count
        total_bb_RD[:,k] = np.array(df.DP)
        df2 = df_all[df_all["SAMPLE"] == s]
        base_nb_mean[:,k] = np.array(df.NORMAL_READS / np.sum(df2.NORMAL_READS) * np.sum(df2.TOTAL_READS))

    # site-wise transition matrix
    chr_pos_vector = [(df_baf.CHR.iloc[i], df_baf.POS.iloc[i]) for i in np.where(df_baf["SAMPLE"]==unique_samples[0])[0]]
    position_cM = get_position_cM_table(chr_pos_vector, geneticmap_file)
    phase_switch_prob = compute_phase_switch_probability_position(position_cM, chr_pos_vector)
    log_sitewise_transmat = np.log(phase_switch_prob)

    return X, lengths, base_nb_mean, total_bb_RD, log_sitewise_transmat


def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, clone_pred2, p_binom2):
    import networkx as nx

    # matching predicted CNV states
    n_states = len(np.unique(state_pred1))
    uniq_pred1 = np.sort(np.unique(state_pred1))
    uniq_pred2 = np.sort(np.unique(state_pred2))
    G = nx.Graph()
    G.add_nodes_from([f"A{i}" for i in uniq_pred1], bipartite=0)
    G.add_nodes_from([f"B{j}" for j in uniq_pred2], bipartite=1)
    # G.add_weighted_edges_from( [(f"A{i}", f"B{j}", np.sum(np.logical_and(state_pred1==uniq_pred1[i], state_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    # tmp = nx.max_weight_matching(G)
    # state_matching = {x[0]:x[1] for x in tmp}
    # state_matching.update( {x[1]:x[0] for x in tmp} )
    G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(state_pred1) - np.sum(np.logical_and(state_pred1==uniq_pred1[i], state_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    state_matching = nx.bipartite.minimum_weight_full_matching(G)

    # matching predicted clones
    n_clones = len(np.unique(clone_pred1))
    uniq_pred1 = np.sort(np.unique(clone_pred1))
    uniq_pred2 = np.sort(np.unique(clone_pred2))
    G = nx.Graph()
    G.add_nodes_from([f"A{i}" for i in uniq_pred1], bipartite=0)
    G.add_nodes_from([f"B{j}" for j in uniq_pred2], bipartite=1)
    # G.add_weighted_edges_from( [(f"A{i}", f"B{j}", np.sum(np.logical_and(clone_pred1==uniq_pred1[i], clone_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    # tmp = nx.max_weight_matching(G)
    # clone_matching = {x[0]:x[1] for x in tmp}
    # clone_matching.update( {x[1]:x[0] for x in tmp} )
    G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(clone_pred1) - np.sum(np.logical_and(clone_pred1==uniq_pred1[i], clone_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    clone_matching = nx.bipartite.minimum_weight_full_matching(G)

    # l2 distance between corresponding CNV at corresponding clone
    # reorder p_binom2 based on state_matching and clone_matching
    reorder_p_binom2 = p_binom2[:, np.array([ int(clone_matching[f"A{i}"][1:]) for i in range(n_clones)])]
    reorder_p_binom2 = reorder_p_binom2[np.array([ int(state_matching[f"A{i}"][1:]) for i in range(n_states) ]), :]
    l2 = 0
    for i in range(p_binom1.shape[0]):
        l2 += min( np.sum(np.square(p_binom1[i,:] - reorder_p_binom2[i,:])), np.sum(np.square(p_binom1[i,:] - 1 + reorder_p_binom2[i,:])) )
    return l2


def get_intervals(pred_cnv):
    intervals = []
    labs = []
    s = 0
    while s < len(pred_cnv):
        t = np.where(pred_cnv[s:] != pred_cnv[s])[0]
        if len(t) == 0:
            intervals.append( (s, len(pred_cnv)) )
            labs.append( pred_cnv[s] )
            s = len(pred_cnv)
        else:
            t = t[0]
            intervals.append( (s,s+t) )
            labs.append( pred_cnv[s] )
            s = s+t
    return intervals, labs
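
# Worked example of the run-length encoding performed by get_intervals (hypothetical input):
if __name__ == "__main__":
    example_pred = np.array([0, 0, 0, 2, 2, 1])
    intervals, labs = get_intervals(example_pred)
    print(intervals)  # [(0, 3), (3, 5), (5, 6)]
    print(labs)       # [0, 2, 1]
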
def get_intervals_nd(pred_cnv):
    """
    pred_cnv : np.array of shape (n_bins, n_clones)
    """
    intervals = []
    labs = []
    s = 0
    while s < len(pred_cnv):
        t = np.where(np.any(pred_cnv[s:] != pred_cnv[s], axis=1))[0]
        if len(t) == 0:
            intervals.append( (s, len(pred_cnv)) )
            labs.append( pred_cnv[s] )
            s = len(pred_cnv)
        else:
            t = t[0]
            intervals.append( (s,s+t) )
            labs.append( pred_cnv[s] )
            s = s+t
    return intervals, labs


def postbinning_forvisual(X, base_nb_mean, total_bb_RD, lengths, res, binsize=2):
    # a list of intervals used in binning for transforming back to non-binned space
    intervals = []
    bin_lengths = []
    # variables for for-loop
    chrname = 0
    nextlen = lengths[chrname]
    s = 0
    while s < X.shape[0]:
        t = min(s+binsize, nextlen)
        intervals.append( [s,t] )
        s = t
        if s >= nextlen:
            if s < X.shape[0]:
                chrname += 1
                nextlen += lengths[chrname]
            bin_lengths.append( len(intervals) )
    bin_lengths = np.array(bin_lengths)
    bin_lengths[1:] = bin_lengths[1:] - bin_lengths[:-1]

    # binning based on previous intervals
    n_states = int(res["log_gamma"].shape[0] / 2)
    phase_prob = np.exp(scipy.special.logsumexp(res["log_gamma"][:n_states, :], axis=0))
    bin_X = np.zeros((len(intervals), X.shape[1], X.shape[2]), dtype=int)
    bin_base_nb_mean = np.zeros((len(intervals), base_nb_mean.shape[1]), dtype=int)
    bin_total_bb_RD = np.zeros((len(intervals), total_bb_RD.shape[1]), dtype=int)
    bin_pred_cnv = np.zeros(len(intervals), dtype=int)
    for i, intvl in enumerate(intervals):
        s,t = intvl
        bin_X[i,0,:] = np.sum(X[s:t, 0,:], axis=0)
        # expected phased B-allele counts per sample (the dot products already sum over the bin)
        bin_X[i,1,:] = phase_prob[s:t].dot(X[s:t, 1,:]) + (1-phase_prob[s:t]).dot(total_bb_RD[s:t,:] - X[s:t,1,:])
        bin_base_nb_mean[i,:] = np.sum(base_nb_mean[s:t,:], axis=0)
        bin_total_bb_RD[i,:] = np.sum(total_bb_RD[s:t,:], axis=0)
        bin_pred_cnv[i] = res["pred_cnv"][s]

    return bin_X, bin_base_nb_mean, bin_total_bb_RD, bin_pred_cnv, bin_lengths, intervals
--------------------------------------------------------------------------------
/utils/filter_snps_forphasing.py:
--------------------------------------------------------------------------------
#!/bin/python

import sys
import numpy as np
import pandas as pd
from pathlib import Path
import argparse


def main(cellsnplite_result_dir, eagle_out_dir, vaf_threshold=0.1):
    cellsnp_base = [str(x) for x in Path(cellsnplite_result_dir).glob("cellSNP.base*")][0]
    df_snp = pd.read_csv(cellsnp_base, comment="#", sep="\t", names=["tmpCHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
    df_snp["CHROM"] = [f"chr{x}" for x in df_snp.tmpCHR]
    df_snp["AD"] = [int(x.split(";")[0].split("=")[-1]) for x in df_snp.INFO]
    df_snp["DP"] = [int(x.split(";")[1].split("=")[-1]) for x in df_snp.INFO]
    df_snp["OTH"] = [int(x.split(";")[2].split("=")[-1]) for x in df_snp.INFO]
    # remove records with DP == 0
    df_snp = df_snp[df_snp.DP > 0]
    # keep het SNPs (AD and DP - AD both >= 2, with vaf_threshold <= AD/DP <= 1 - vaf_threshold) and hom SNPs (AD == DP >= 10, or AD == 0 with DP >= 10)
    # df_snp = df_snp[((df_snp.AD / df_snp.DP >= 0.1) & (df_snp.AD / df_snp.DP <= 0.9)) | ((df_snp.AD == df_snp.DP) & (df_snp.DP >= 10))]
    df_snp = df_snp[((df_snp.AD >= 2) & (df_snp.DP - df_snp.AD >= 2) & (df_snp.AD / df_snp.DP >= vaf_threshold) & (df_snp.AD / df_snp.DP <= 1-vaf_threshold)) | ((df_snp.AD == df_snp.DP) & (df_snp.DP >= 10)) | ((df_snp.AD == 0) & (df_snp.DP >= 10))]
    # add additional columns
    df_snp["FORMAT"] = "GT"
    # df_snp[f"{sample_id}"] = ["0/1" if row.AD < row.DP else "1/1" for i,row in df_snp.iterrows()]
    gt_column = np.array(["0/0"] * df_snp.shape[0])
    gt_column[ (df_snp.AD == df_snp.DP) ] = "1/1"
    gt_column[ (df_snp.AD > 0) & (df_snp.DP - df_snp.AD > 0) ] = "0/1"
    df_snp["SAMPLE_ID"] = gt_column
    # output one VCF per chromosome to the output folder
    for c in range(1, 23):
        df = df_snp[ (df_snp.tmpCHR == c) | (df_snp.tmpCHR == str(c)) ].copy()
        # remove records that have duplicated snp_id
        snp_id = [f"{row.tmpCHR}_{row.POS}_{row.REF}_{row.ALT}" for i,row in df.iterrows()]
        df["snp_id"] = snp_id
        df = df.groupby("snp_id").agg({"CHROM":"first", "POS":"first", "ID":"first", "REF":"first", "ALT":"first", "QUAL":"first", "FILTER":"first", \
            "INFO":"first", "FORMAT":"first", "SAMPLE_ID":"first", "AD":"sum", "DP":"sum", "OTH":"sum"})
        info = [f"AD={row.AD};DP={row.DP};OTH={row.OTH}" for i,row in df.iterrows()]
        df["INFO"] = info
        df = df[["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE_ID"]]
        df.sort_values(by="POS", inplace=True)
        fp = open(f"{eagle_out_dir}/chr{c}.vcf", 'w')
        fp.write("##fileformat=VCFv4.2\n")
        fp.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
        fp.write("#" + "\t".join(df.columns) + "\n")
        df.to_csv(fp, sep="\t", index=False, header=False)
        fp.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cellsnplite_result_dir", help="cellsnplite result directory", type=str)
    parser.add_argument("-o", "--eagle_out_dir", help="eagle output directory", type=str)
    parser.add_argument("-v", "--vaf_threshold", help="vaf threshold", default=0.1, type=float)
    args = parser.parse_args()
    main(args.cellsnplite_result_dir, args.eagle_out_dir, args.vaf_threshold)
--------------------------------------------------------------------------------
/utils/get_snp_matrix.py:
--------------------------------------------------------------------------------
#!/bin/python

import sys
import numpy as np
import pandas as pd
from scipy.special import logsumexp
import scipy.io
import scipy.sparse
from pathlib import Path
import json
import gzip
import pickle
from tqdm import trange
import copy
import argparse


def process_snp_phasing(cellsnp_folder, eagle_folder, outputfile):
    # create a (snp_id, GT) map from eagle2 output
    snp_gt_map = {}
    for c in range(1, 23):
        fname = [str(x) for x in Path(eagle_folder).glob("*chr{}.phased.vcf.gz".format(c))]
        assert len(fname) > 0
        fname = fname[0]
        tmpdf = pd.read_table(fname, compression = 'gzip', comment = '#', sep="\t", names=["CHR","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","PHASE"])
        this_snp_ids = [ "{}_{}_{}_{}".format(c, row.POS, row.REF, row.ALT) for i,row in tmpdf.iterrows() ]
        this_gt = list(tmpdf.iloc[:,-1])
        assert len(this_snp_ids) == len(this_gt)
        snp_gt_map.update( {this_snp_ids[i]:this_gt[i] for i in
range(len(this_gt))} ) 29 | # cellsnp DP (read depth) and AD (alternative allele depth) 30 | # first get a list of snp_id and spot barcodes 31 | tmpdf = pd.read_csv(cellsnp_folder + "/cellSNP.base.vcf.gz", header=1, sep="\t") 32 | snp_list = np.array([ "{}_{}_{}_{}".format(row["#CHROM"], row.POS, row.REF, row.ALT) for i,row in tmpdf.iterrows() ]) 33 | tmpdf = pd.read_csv(cellsnp_folder + "/cellSNP.samples.tsv", header=None) 34 | sample_list = np.array(list(tmpdf.iloc[:,0])) 35 | # then get the DP and AD matrix 36 | DP = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.DP.mtx").tocsr() 37 | AD = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.AD.mtx").tocsr() 38 | # remove SNPs that are not phased 39 | is_phased = np.array([ (x in snp_gt_map) for x in snp_list ]) 40 | DP = DP[is_phased,:] 41 | AD = AD[is_phased,:] 42 | snp_list = snp_list[is_phased] 43 | # generate a new dataframe with columns (cell, snp_id, DP, AD, CHROM, POS, GT) 44 | rows, cols = DP.nonzero() 45 | cell = sample_list[cols] 46 | snp_id = snp_list[rows] 47 | DP_df = DP[DP.nonzero()].A.flatten() 48 | AD_df = AD[DP.nonzero()].A.flatten() 49 | GT = [snp_gt_map[x] for x in snp_id] 50 | df = pd.DataFrame({"cell":cell, "snp_id":snp_id, "DP":DP_df, "AD":AD_df, \ 51 | "CHROM":[int(x.split("_")[0]) for x in snp_id], "POS":[int(x.split("_")[1]) for x in snp_id], "GT":GT}) 52 | df.to_csv(outputfile, sep="\t", index=False, header=True, compression={'method': 'gzip'}) 53 | return df 54 | 55 | 56 | def read_cell_by_snp(allele_counts_file): 57 | df = pd.read_csv(allele_counts_file, sep="\t", header=0) 58 | index = np.array([i for i,x in enumerate(df.GT) if x=="0|1" or x=="1|0"]) 59 | df = df.iloc[index, :] 60 | df.CHROM = df.CHROM.astype(int) 61 | return df 62 | 63 | 64 | def cell_by_gene_lefthap_counts(cellsnp_folder, eagle_folder, barcode_list): 65 | # create a (snp_id, GT) map from eagle2 output 66 | snp_gt_map = {} 67 | for c in range(1, 23): 68 | fname = [str(x) for x in Path(eagle_folder).glob("*chr{}.phased.vcf.gz".format(c))] 69 | assert len(fname) > 0 70 | fname = fname[0] 71 | tmpdf = pd.read_table(fname, compression = 'gzip', comment = '#', sep="\t", names=["CHR","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","PHASE"]) 72 | # only keep heterozygous SNPs 73 | tmpdf = tmpdf[ (tmpdf.PHASE=="0|1") | (tmpdf.PHASE=="1|0") ] 74 | this_snp_ids = (str(c) + "_" + tmpdf.POS.astype(str) +"_"+ tmpdf.REF +"_"+ tmpdf.ALT).values 75 | this_gt = tmpdf.PHASE.values 76 | assert len(this_snp_ids) == len(this_gt) 77 | snp_gt_map.update( {this_snp_ids[i]:this_gt[i] for i in range(len(this_gt))} ) 78 | 79 | # cellsnp-lite output 80 | cellsnp_base = [str(x) for x in Path(cellsnp_folder).glob("cellSNP.base*")][0] 81 | df_snp = pd.read_csv(cellsnp_base, comment="#", sep="\t", names=["tmpCHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]) 82 | df_snp['snp_id'] = df_snp.tmpCHR.astype(str) + "_" + df_snp.POS.astype(str) + "_" + df_snp.REF + "_" + df_snp.ALT 83 | tmpdf = pd.read_csv(cellsnp_folder + "/cellSNP.samples.tsv", header=None) 84 | sample_list = np.array(list(tmpdf.iloc[:,0])) 85 | barcode_mapper = {x:i for i,x in enumerate(sample_list)} 86 | # DP and AD 87 | DP = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.DP.mtx").tocsr() 88 | AD = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.AD.mtx").tocsr() 89 | # retain only SNPs that are phased 90 | is_phased = (df_snp.snp_id.isin(snp_gt_map)).values 91 | df_snp = df_snp[is_phased] 92 | df_snp['GT'] = [snp_gt_map[x] for x in df_snp.snp_id] 93 | DP = DP[is_phased,:] 94 | AD = AD[is_phased,:] 
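# Note (added for clarity): the block below converts ALT-allele counts into haplotype
# counts using the phased genotypes. For a 0|1 SNP the ALT reads come from the second
# haplotype, so phased_AD keeps AD; for a 1|0 SNP they come from the first haplotype,
# so phased_AD keeps DP - AD. The function finally returns (DP - phased_AD).T and
# phased_AD.T as the two per-cell haplotype count matrices.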
95 | 96 | # phasing 97 | phased_AD = np.where( (df_snp.GT.values == "0|1").reshape(-1,1), AD.A, (DP-AD).A ) 98 | phased_AD = scipy.sparse.csr_matrix(phased_AD) 99 | 100 | # re-order based on barcode_list 101 | index = np.array([barcode_mapper[x] for x in barcode_list if x in barcode_mapper]) 102 | DP = DP[:, index] 103 | phased_AD = phased_AD[:, index] 104 | 105 | # returned matrix has shape (N_cells, N_snps), which is the transpose of the original matrix 106 | return (DP-phased_AD).T, phased_AD.T, df_snp.snp_id.values 107 | 108 | 109 | def cell_by_gene_lefthap_counts_v2(df_cell_snp, hg_table_file, gene_list, barcode_list): 110 | # index of genes and barcodes in the current gene expression matrix 111 | barcode_mapper = {x:i for i,x in enumerate(barcode_list)} 112 | gene_mapper = {x:i for i,x in enumerate(gene_list)} 113 | # make an numpy array for CHROM and POS in df_cell_snp 114 | cell_snp_CHROM = np.array(df_cell_snp.CHROM) 115 | cell_snp_POS = np.array(df_cell_snp.POS) 116 | # read gene ranges in genome 117 | # NOTE THAT THE FOLLOWING CODE REQUIRES hg_table_file IS SORTED BY GENOMIC POSITION! 118 | df_genes = pd.read_csv(hg_table_file, header=0, index_col=0, sep="\t") 119 | index = np.array([ i for i in range(df_genes.shape[0]) if (not "_" in df_genes.chrom.iloc[i]) and \ 120 | (df_genes.chrom.iloc[i] != "chrX") and (df_genes.chrom.iloc[i] != "chrY") and (df_genes.chrom.iloc[i] != "chrM") and \ 121 | (not "GL" in df_genes.chrom.iloc[i]) and (not "KI" in df_genes.chrom.iloc[i]) ]) 122 | df_genes = df_genes.iloc[index, :] 123 | tmp_gene_ranges = {df_genes.name2.iloc[i]:(int(df_genes.chrom.iloc[i][3:]), df_genes.cdsStart.iloc[i], df_genes.cdsEnd.iloc[i]) for i in np.arange(df_genes.shape[0]) } 124 | gene_ranges = [(gname, tmp_gene_ranges[gname]) for gname in gene_list if gname in tmp_gene_ranges] 125 | del tmp_gene_ranges 126 | # aggregate snp counts to genes 127 | N = np.unique(df_cell_snp.cell).shape[0] 128 | G = len(gene_ranges) 129 | i = 0 130 | j = 0 131 | cell_gene_snp_counts = [] 132 | snp_ids = np.array(df_cell_snp.snp_id) 133 | unique_snp_ids = df_cell_snp.snp_id.unique() 134 | snp_id_mapper = {unique_snp_ids[i]:i for i in range(len(unique_snp_ids))} 135 | N_snps = len(unique_snp_ids) 136 | cell_snp_Aallele = np.zeros((len(barcode_list), N_snps)) 137 | cell_snp_Ballele = np.zeros((len(barcode_list), N_snps)) 138 | snp_gene_list = [""] * N_snps 139 | for i in trange(df_cell_snp.shape[0]): 140 | if df_cell_snp.GT.iloc[i] == "1|1" or df_cell_snp.GT.iloc[i] == "0|0": 141 | continue 142 | # check cell barcode 143 | if not df_cell_snp.cell.iloc[i] in barcode_mapper: 144 | continue 145 | cell_idx = barcode_mapper[df_cell_snp.cell.iloc[i]] 146 | # if the SNP is not within any genes 147 | if j < len(gene_ranges) and (cell_snp_CHROM[i] < gene_ranges[j][1][0] or \ 148 | (cell_snp_CHROM[i] == gene_ranges[j][1][0] and cell_snp_POS[i] < gene_ranges[j][1][1])): 149 | continue 150 | # if the SNP position passes gene j 151 | while j < len(gene_ranges) and (cell_snp_CHROM[i] > gene_ranges[j][1][0] or \ 152 | (cell_snp_CHROM[i] == gene_ranges[j][1][0] and cell_snp_POS[i] > gene_ranges[j][1][2])): 153 | j += 1 154 | # if the SNP is within gene j, add the corresponding gene ID 155 | if j < len(gene_ranges) and cell_snp_CHROM[i] == gene_ranges[j][1][0] and \ 156 | cell_snp_POS[i] >= gene_ranges[j][1][1] and cell_snp_POS[i] <= gene_ranges[j][1][2]: 157 | snp_gene_list[ snp_id_mapper[snp_ids[i]] ] = gene_ranges[j][0] 158 | # add the SNP UMI count to the corresponding cell and loci 159 | if 
df_cell_snp.GT.iloc[i] == "0|1":
            cell_snp_Aallele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.DP.iloc[i] - df_cell_snp.AD.iloc[i]
            cell_snp_Ballele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.AD.iloc[i]
        elif df_cell_snp.GT.iloc[i] == "1|0":
            cell_snp_Aallele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.AD.iloc[i]
            cell_snp_Ballele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.DP.iloc[i] - df_cell_snp.AD.iloc[i]

    # keep SNPs with at least one covering UMI (fixed: np.logical_and requires two arguments)
    index = np.where( np.sum(cell_snp_Aallele + cell_snp_Ballele, axis=0) > 0 )[0]
    cell_snp_Aallele = cell_snp_Aallele[:, index].astype(int)
    cell_snp_Ballele = cell_snp_Ballele[:, index].astype(int)
    snp_gene_list = np.array(snp_gene_list)[index]
    unique_snp_ids = unique_snp_ids[index]
    return cell_snp_Aallele, cell_snp_Ballele, snp_gene_list, unique_snp_ids


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cellsnplite_result_dir", help="cellsnplite result directory", type=str)
    parser.add_argument("-e", "--eagle_out_dir", help="eagle output directory", type=str)
    parser.add_argument("-b", "--barcodefile", help="barcode file", type=str)
    parser.add_argument("-o", "--outputdir", help="output directory", type=str)
    args = parser.parse_args()

    barcode_list = list(pd.read_csv(args.barcodefile, header=None).iloc[:,0])
    cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = cell_by_gene_lefthap_counts(args.cellsnplite_result_dir, args.eagle_out_dir, barcode_list)

    scipy.sparse.save_npz(f"{args.outputdir}/cell_snp_Aallele.npz", cell_snp_Aallele)
    scipy.sparse.save_npz(f"{args.outputdir}/cell_snp_Ballele.npz", cell_snp_Ballele)
    np.save(f"{args.outputdir}/unique_snp_ids.npy", unique_snp_ids)
--------------------------------------------------------------------------------
/utils/merge_bamfile.py:
--------------------------------------------------------------------------------
#!/bin/python

import sys
import pysam
import pandas as pd
import subprocess
import argparse


def write_merged_bam(input_bamfile_list, suffix_list, output_bam):
    fpin = pysam.AlignmentFile(input_bamfile_list[0], "rb")
    fpout = pysam.AlignmentFile(output_bam, "wb", template=fpin)
    fpin.close()
    for i, fname in enumerate(input_bamfile_list):
        fpin = pysam.AlignmentFile(fname, "rb")
        suffix = suffix_list[i]
        for read in fpin:
            if read.has_tag("CB"):
                b = read.get_tag("CB")
                read.set_tag("CB", f"{b}_{suffix}")
            fpout.write(read)
        fpin.close()
    fpout.close()


def write_merged_deconvolution(input_deconvfile_list, suffix_list, output_deconv):
    df_combined = []
    for i, fname in enumerate(input_deconvfile_list):
        suffix = suffix_list[i]
        tmpdf = pd.read_csv(fname, header=0, index_col=0, sep="\t")
        tmpdf.index = [f"{x}_{suffix}" for x in tmpdf.index]
        df_combined.append(tmpdf)
    df_combined = pd.concat(df_combined, ignore_index=False)
    df_combined.to_csv(output_deconv, header=True, index=True, sep="\t")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--bamlistfile", help="tab-separated list of input BAMs: bamfilename, suffix, cellrangerdir[, deconv_filename]", type=str)
    parser.add_argument("-o", "--output_dir", help="output directory", type=str)
    args = parser.parse_args()

    df = pd.read_csv(args.bamlistfile, sep="\t", header=None, index_col=None)
    if df.shape[1] == 3:
        df.columns=["bamfilename", "suffix", "cellrangerdir"]
    else:
        df.columns=["bamfilename", "suffix", "cellrangerdir", "deconv_filename"]

    input_bamfile_list = df.bamfilename.values
    suffix_list = df.suffix.values
    write_merged_bam(input_bamfile_list, suffix_list, f"{args.output_dir}/unsorted_possorted_genome_bam.bam")

    if df.shape[1] == 4:
        # merge deconvolution file
        assert "deconv_filename" in df.columns
        input_deconvfile_list = df.deconv_filename.values
        suffix_list = df.suffix.values
        write_merged_deconvolution(input_deconvfile_list, suffix_list, f"{args.output_dir}/merged_deconvolution.tsv")
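
# Example bamlistfile (minimal sketch; paths are hypothetical). It is a tab-separated
# table without a header, and the fourth column is optional:
#
#   /data/sampleA/possorted_genome_bam.bam <TAB> A <TAB> /data/sampleA/cellranger/outs <TAB> /data/sampleA/deconv.tsv
#   /data/sampleB/possorted_genome_bam.bam <TAB> B <TAB> /data/sampleB/cellranger/outs <TAB> /data/sampleB/deconv.tsv
#
# Barcodes from sample A become "<barcode>_A" in the merged BAM's CB tags, matching the
# suffixed barcode list that process_snps_merged.sh builds.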
--------------------------------------------------------------------------------
/utils/process_snps.sh:
--------------------------------------------------------------------------------
#!/bin/bash

##### input and output data paths #####
# SAMPLE_ID is used for setting directory/file name
SAMPLE_ID="58408_Primary"
CELLRANGER_OUT="/u/congma/ragr-data/users/congma/Datasets/MM/58408_Primary/cellranger/outs/"
BAMFILE="/u/congma/ragr-data/users/congma/Datasets/MM/58408_Primary/scRNA.unsorted.58408_Primary.bam"
OUTDIR="/u/congma/ragr-data/users/congma/Datasets/MM/58408_Primary/numbatprep/"

NTHREADS=20

##### reference file paths #####
# PHASING_PANEL is downloaded as instructed in numbat "1000G Reference Panel" and then unzipped. Link to download: wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
PHASING_PANEL="/u/congma/ragr-data/users/congma/references/phasing_ref/1000G_hg38/"
# REGION_VCF serves the same purpose as the "1000G SNP reference file" in numbat, but uses a larger SNP set. Link to download: wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz
REGION_VCF="/u/congma/ragr-data/users/congma/references/snplist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz"
# HGTABLE_FILE specifies gene positions in the genome, for mapping SNPs to genes. Link to download: https://github.com/raphael-group/STARCH/blob/develop/hgTables_hg38_gencode.txt
HGTABLE_FILE="/u/congma/ragr-data/users/congma/Codes/STARCH_crazydev/hgTables_hg38_gencode.txt"
# the Eagle folder ships the genetic map reference file used below
eagledir="/u/congma/ragr-data/users/congma/environments/Eagle_v2.4.1/"


##### Following are commands for calling + phasing + processing SNPs #####
# index bam file
if [[ ! -e ${BAMFILE}.bai ]]; then
    samtools index ${BAMFILE}
fi
# write required barcode list file
mkdir -p ${OUTDIR}
gunzip -c ${CELLRANGER_OUT}/filtered_feature_bc_matrix/barcodes.tsv.gz > ${OUTDIR}/barcodes.txt

# run cellsnp-lite
mkdir -p ${OUTDIR}/pileup/${SAMPLE_ID}
cellsnp-lite -s ${BAMFILE} \
             -b ${OUTDIR}/barcodes.txt \
             -O ${OUTDIR}/pileup/${SAMPLE_ID} \
             -R ${REGION_VCF} \
             -p ${NTHREADS} \
             --minMAF 0 --minCOUNT 2 --UMItag Auto --cellTAG CB

# run phasing
mkdir -p ${OUTDIR}/phasing/
SCRIPTDIR=$(dirname "$0")
# filter_snps_forphasing.py takes the cellsnp-lite output dir (-c) and writes per-chromosome VCFs chr{1..22}.vcf to the phasing dir (-o)
python ${SCRIPTDIR}/filter_snps_forphasing.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -o ${OUTDIR}/phasing
for chr in {1..22}; do
    bgzip -f ${OUTDIR}/phasing/chr${chr}.vcf
    tabix ${OUTDIR}/phasing/chr${chr}.vcf.gz
    eagle --numThreads ${NTHREADS} \
          --vcfTarget ${OUTDIR}/phasing/chr${chr}.vcf.gz \
          --vcfRef ${PHASING_PANEL}/chr${chr}.genotypes.bcf \
          --geneticMapFile=${eagledir}/tables/genetic_map_hg38_withX.txt.gz \
          --outPrefix ${OUTDIR}/phasing/chr${chr}.phased
done


# run the python script to get the cell-by-SNP matrices of SNP-covering UMI counts
SCRIPTDIR=$(dirname "$0")
python ${SCRIPTDIR}/get_snp_matrix.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -e ${OUTDIR}/phasing -b ${OUTDIR}/barcodes.txt -o ${OUTDIR}
--------------------------------------------------------------------------------
/utils/process_snps_merged.sh:
--------------------------------------------------------------------------------
#!/bin/bash

##### input and output data paths #####
# SAMPLE_ID is used for setting directory/file name
SAMPLE_ID="joint_H1_245_H2_1"
INPUTLIST="/u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_snps/joint_H1_245_H2_1/bamfile_list.tsv"
BAMFILE="/u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_snps/joint_H1_245_H2_1/possorted_genome_bam.bam"
OUTDIR="/u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_snps/joint_H1_245_H2_1/visium_snpnew"

NTHREADS=20

##### reference file paths #####
# PHASING_PANEL is downloaded as instructed in numbat "1000G Reference Panel" and then unzipped. Link to download: wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
PHASING_PANEL="/u/congma/ragr-data/users/congma/references/phasing_ref/1000G_hg38/"
# REGION_VCF serves the same purpose as the "1000G SNP reference file" in numbat, but uses a larger SNP set. Link to download: wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz
REGION_VCF="/u/congma/ragr-data/users/congma/references/snplist/nocpg.genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz"
# HGTABLE_FILE specifies gene positions in the genome, for mapping SNPs to genes. Link to download: https://github.com/raphael-group/STARCH/blob/develop/hgTables_hg38_gencode.txt
HGTABLE_FILE="/u/congma/ragr-data/users/congma/Codes/STARCH_crazydev/hgTables_hg38_gencode.txt"
# the Eagle folder ships the genetic map reference file used below
eagledir="/u/congma/ragr-data/users/congma/environments/Eagle_v2.4.1/"


##### Following are commands for calling + phasing + processing SNPs #####
# index bam file
if [[ ! -e ${BAMFILE}.bai ]]; then
    samtools index ${BAMFILE}
fi
# write required barcode list file
mkdir -p ${OUTDIR}
touch ${OUTDIR}/barcodes.txt
>${OUTDIR}/barcodes.txt
while read -r line; do
    CELLRANGER_OUT=$(echo ${line} | awk '{print $3}')
    suffix=$(echo ${line} | awk '{print $2}')
    gunzip -c ${CELLRANGER_OUT}/filtered_feature_bc_matrix/barcodes.tsv.gz | awk -v var=${suffix} '{print $0"_"var}' >> ${OUTDIR}/barcodes.txt
done < ${INPUTLIST}

# run cellsnp-lite
mkdir -p ${OUTDIR}/pileup/${SAMPLE_ID}
cellsnp-lite -s ${BAMFILE} \
             -b ${OUTDIR}/barcodes.txt \
             -O ${OUTDIR}/pileup/${SAMPLE_ID} \
             -R ${REGION_VCF} \
             -p ${NTHREADS} \
             --minMAF 0 --minCOUNT 2 --UMItag Auto --cellTAG CB

# run phasing
mkdir -p ${OUTDIR}/phasing/
SCRIPTDIR="/u/congma/ragr-data/users/congma/Codes/STARCH_crazydev/scripts"
# filter_snps_forphasing.py takes the cellsnp-lite output dir (-c) and writes per-chromosome VCFs chr{1..22}.vcf to the phasing dir (-o)
python ${SCRIPTDIR}/filter_snps_forphasing.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -o ${OUTDIR}/phasing
for chr in {1..22}; do
    bgzip -f ${OUTDIR}/phasing/chr${chr}.vcf
    tabix ${OUTDIR}/phasing/chr${chr}.vcf.gz
    ${eagledir}/eagle --numThreads ${NTHREADS} \
          --vcfTarget ${OUTDIR}/phasing/chr${chr}.vcf.gz \
          --vcfRef ${PHASING_PANEL}/chr${chr}.genotypes.bcf \
          --geneticMapFile=${eagledir}/tables/genetic_map_hg38_withX.txt.gz \
          --outPrefix ${OUTDIR}/phasing/chr${chr}.phased
done


# run the python script to get the cell-by-SNP matrices of SNP-covering UMI counts
#SCRIPTDIR=$(dirname "$0")
python ${SCRIPTDIR}/get_snp_matrix.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -e ${OUTDIR}/phasing -b ${OUTDIR}/barcodes.txt -o ${OUTDIR}
--------------------------------------------------------------------------------
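
As a closing usage sketch (paths hypothetical), the matrices written by get_snp_matrix.py can be loaded downstream like this:

import numpy as np
import scipy.sparse

outdir = "/path/to/OUTDIR"  # hypothetical; the -o argument passed to get_snp_matrix.py
cell_snp_Aallele = scipy.sparse.load_npz(f"{outdir}/cell_snp_Aallele.npz")  # spots x SNPs, haplotype-1 UMI counts
cell_snp_Ballele = scipy.sparse.load_npz(f"{outdir}/cell_snp_Ballele.npz")  # spots x SNPs, haplotype-2 UMI counts
unique_snp_ids = np.load(f"{outdir}/unique_snp_ids.npy", allow_pickle=True)

# pseudobulk B-allele frequency per SNP
tot = np.asarray((cell_snp_Aallele + cell_snp_Ballele).sum(axis=0)).ravel()
baf = np.asarray(cell_snp_Ballele.sum(axis=0)).ravel() / np.maximum(tot, 1)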