├── .gitignore ├── .readthedocs.yaml ├── GRCh38_resources ├── HLA_regions.bed ├── genetic_map_GRCh38_merged.tab.gz ├── hgTables_hg38_gencode.txt └── ig_gene_list.txt ├── LICENSE ├── README.md ├── calicost.smk ├── config.yaml ├── configuration_cna ├── configuration_cna_multi ├── configuration_purity ├── docs ├── _ext │ └── typed_returns.py ├── _static │ ├── css │ │ ├── custom.css │ │ ├── dataframe.css │ │ ├── nbsphinx.css │ │ └── sphinx_gallery.css │ └── img │ │ ├── acn_color_palette.png │ │ ├── overview4_combine.pdf │ │ └── overview4_combine.png ├── conf.py ├── index.rst ├── installation.rst ├── notebooks │ └── tutorials │ │ ├── prostate_tutorial.ipynb │ │ └── simulated_data_tutorial.ipynb ├── parameters.rst ├── references.rst └── tutorials.rst ├── environment.yml ├── examples ├── CalicoST_example.tar.gz ├── example_input_filelist ├── prostate_example.tar.gz └── simulated_example.tar.gz ├── pyproject.toml ├── setup.py ├── src └── calicost │ ├── __init__.py │ ├── allele_starch_generateconfig.py │ ├── arg_parse.py │ ├── calicost_main.py │ ├── calicost_supervised.py │ ├── estimate_tumor_proportion.py │ ├── find_integer_copynumber.py │ ├── hmm_NB_BB_nophasing.py │ ├── hmm_NB_BB_nophasing_v2.py │ ├── hmm_NB_BB_phaseswitch.py │ ├── hmm_NB_sharedstates.py │ ├── hmm_gaussian.py │ ├── hmrf.py │ ├── hmrf_normalmixture.py │ ├── joint_allele_generateconfig.py │ ├── oldcode.py │ ├── parse_input.py │ ├── phasing.py │ ├── phylogeny_startle.py │ ├── phylogeography.py │ ├── simple_sctransform.py │ ├── utils_IO.py │ ├── utils_distribution_fitting.py │ ├── utils_hmm.py │ ├── utils_hmrf.py │ ├── utils_phase_switch.py │ └── utils_plotting.py └── utils ├── filter_snps_forphasing.py ├── get_snp_matrix.py ├── maya_plotter.py ├── merge_bamfile.py ├── plot_hatchet.py ├── process_snps.sh └── process_snps_merged.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.10" 7 | 8 | sphinx: 9 | builder: html 10 | configuration: docs/conf.py 11 | fail_on_warning: false 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 17 | extra_requirements: [docs] 18 | 19 | submodules: 20 | include: [docs/notebooks] 21 | recursive: true -------------------------------------------------------------------------------- /GRCh38_resources/HLA_regions.bed: -------------------------------------------------------------------------------- 1 | chr6 29722775 29738528 2 | chr6 29726601 29749049 3 | chr6 29826967 29831125 4 | chr6 29941260 29945884 5 | chr6 30489509 30494194 6 | chr6 31268749 31272130 7 | chr6 31269491 31357188 8 | chr6 32439878 32445046 9 | chr6 32517353 32530287 10 | chr6 32578769 32589848 11 | chr6 32628179 32647062 12 | chr6 32659467 32668383 13 | chr6 32659880 32660729 14 | chr6 32741391 32747198 15 | chr6 32756098 32763532 16 | chr6 32812763 32820466 17 | chr6 32934629 32941028 18 | chr6 32948613 32969094 19 | chr6 33004182 33009591 20 | chr6 33064569 33080775 21 | chr6 33075990 33089696 22 | -------------------------------------------------------------------------------- /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/GRCh38_resources/genetic_map_GRCh38_merged.tab.gz -------------------------------------------------------------------------------- /GRCh38_resources/ig_gene_list.txt: -------------------------------------------------------------------------------- 1 | IGKV3OR2-268 2 | IGKC 3 | IGKJ5 4 | IGKJ4 5 | IGKJ3 6 | IGKJ2 7 | IGKJ1 8 | IGKV4-1 9 | IGKV5-2 10 | IGKV1-5 11 | IGKV1-6 12 | IGKV3-7 13 | IGKV1-8 14 | IGKV1-9 15 | IGKV3-11 16 | IGKV1-12 17 | IGKV3-15 18 | IGKV1-16 19 | IGKV1-17 20 | IGKV3-20 21 | IGKV6-21 22 | IGKV2-24 23 | IGKV1-27 24 | IGKV2-28 25 | IGKV2-30 26 | IGKV1-33 27 | IGKV1-37 28 | IGKV1-39 29 | IGKV2-40 30 | IGKV2D-40 31 | IGKV1D-39 32 | IGKV1D-37 33 | IGKV1D-33 34 | IGKV2D-30 35 | IGKV2D-29 36 | IGKV2D-28 37 | IGKV2D-26 38 | IGKV2D-24 39 | IGKV6D-21 40 | IGKV3D-20 41 | IGKV6D-41 42 | IGKV1D-17 43 | IGKV1D-16 44 | IGKV3D-15 45 | IGKV1D-13 46 | IGKV1D-12 47 | IGKV3D-11 48 | IGKV1D-42 49 | IGKV1D-43 50 | IGKV1D-8 51 | IGKV3D-7 52 | IGKV1OR2-108 53 | IGHA2 54 | IGHE 55 | IGHG4 56 | IGHG2 57 | IGHA1 58 | IGHG1 59 | IGHG3 60 | 
IGHD 61 | IGHM 62 | IGHJ6 63 | IGHJ5 64 | IGHJ4 65 | IGHJ3 66 | IGHJ2 67 | IGHJ1 68 | IGHD7-27 69 | IGHD1-26 70 | IGHD6-25 71 | IGHD5-24 72 | IGHD4-23 73 | IGHD3-22 74 | IGHD2-21 75 | IGHD1-20 76 | IGHD6-19 77 | IGHD5-18 78 | IGHD4-17 79 | IGHD3-16 80 | IGHD2-15 81 | IGHD1-14 82 | IGHD6-13 83 | IGHD5-12 84 | IGHD4-11 85 | IGHD3-10 86 | IGHD3-9 87 | IGHD2-8 88 | IGHD1-7 89 | IGHD6-6 90 | IGHD5-5 91 | IGHD4-4 92 | IGHD3-3 93 | IGHD2-2 94 | IGHD1-1 95 | IGHV6-1 96 | IGHV1-2 97 | IGHV1-3 98 | IGHV4-4 99 | IGHV7-4-1 100 | IGHV2-5 101 | IGHV3-7 102 | IGHV3-64D 103 | IGHV5-10-1 104 | IGHV3-11 105 | IGHV3-13 106 | IGHV3-15 107 | IGHV3-16 108 | IGHV1-18 109 | IGHV3-20 110 | IGHV3-21 111 | IGHV3-23 112 | IGHV1-24 113 | IGHV2-26 114 | IGHV4-28 115 | IGHV3-30 116 | IGHV4-31 117 | IGHV3-33 118 | IGHV4-34 119 | IGHV3-35 120 | IGHV3-38 121 | IGHV4-39 122 | IGHV3-43 123 | IGHV1-45 124 | IGHV1-46 125 | IGHV3-48 126 | IGHV3-49 127 | IGHV5-51 128 | IGHV3-53 129 | IGHV1-58 130 | IGHV4-59 131 | IGHV4-61 132 | IGHV3-64 133 | IGHV3-66 134 | IGHV1-69 135 | IGHV2-70D 136 | IGHV1-69-2 137 | IGHV1-69D 138 | IGHV2-70 139 | IGHV3-72 140 | IGHV3-73 141 | IGHV3-74 142 | IGHV7-81 143 | IGHV1OR15-9 144 | IGHV3OR15-7 145 | IGHD5OR15-5A 146 | IGHD4OR15-4A 147 | IGHD3OR15-3A 148 | IGHD2OR15-2A 149 | IGHD1OR15-1A 150 | IGHD5OR15-5B 151 | IGHD4OR15-4B 152 | IGHD3OR15-3B 153 | IGHD2OR15-2B 154 | IGHD1OR15-1B 155 | AC135068.8 156 | AC135068.2 157 | IGHV1OR15-1 158 | IGHV4OR15-8 159 | IGHV3OR16-9 160 | IGHV2OR16-5 161 | IGHV3OR16-10 162 | IGHV3OR16-8 163 | IGHV3OR16-12 164 | IGHV3OR16-13 165 | IGHV1OR21-1 166 | IGLV4-69 167 | IGLV10-54 168 | IGLV8-61 169 | IGLV4-60 170 | IGLV6-57 171 | IGLV11-55 172 | IGLV5-52 173 | IGLV1-51 174 | IGLV1-50 175 | IGLV9-49 176 | IGLV5-48 177 | IGLV1-47 178 | IGLV7-46 179 | IGLV5-45 180 | IGLV1-44 181 | IGLV7-43 182 | IGLV1-40 183 | IGLV5-37 184 | IGLV1-36 185 | IGLV2-33 186 | IGLV3-32 187 | IGLV3-27 188 | IGLV3-25 189 | IGLV2-23 190 | IGLV3-22 191 | IGLV3-21 192 | IGLV3-19 193 | IGLV2-18 194 | IGLV3-16 195 | IGLV2-14 196 | IGLV3-12 197 | IGLV2-11 198 | IGLV3-10 199 | IGLV3-9 200 | IGLV2-8 201 | IGLV4-3 202 | IGLV3-1 203 | IGLJ1 204 | IGLC1 205 | IGLJ2 206 | IGLC2 207 | IGLJ3 208 | IGLC3 209 | IGLJ4 210 | IGLJ5 211 | IGLJ6 212 | IGLJ7 213 | IGLC7 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Princeton University 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CalicoST 2 | 3 |

[figure: docs/_static/img/overview4_combine.png, CalicoST overview] 4 | 5 |

6 | 7 | CalicoST is a probabilistic model that infers allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics. CalicoST has the following key features: 8 | 1. Identifies allele-specific integer copy numbers for each transcribed region, revealing events such as copy neutral loss of heterozygosity (CNLOH) and mirrored subclonal CNAs that are invisible to total copy number analysis. 9 | 2. Assigns each spot a clone label indicating whether the spot is primarily normal cells or a cancer clone with an aberrant copy number profile. 10 | 3. Infers a phylogeny relating the identified cancer clones as well as a phylogeography that combines genetic evolution and spatial dissemination of clones. 11 | 4. Handles normal cell admixture in SRT technologies that are not at single-cell resolution (e.g. 10x Genomics Visium) to infer more accurate allele-specific copy numbers and cancer clones. 12 | 5. Simultaneously analyzes multiple regions or aligned SRT slices from the same tumor. 13 | 14 | # System requirements 15 | The package has been tested on the following Linux operating systems: Springdale Open Enterprise 9.2 (Parma) and CentOS Linux 7 (Core). 16 | 17 | # Installation 18 | ## Minimum installation 19 | First set up a conda environment from the `environment.yml` file: 20 | ``` 21 | git clone https://github.com/raphael-group/CalicoST.git 22 | cd CalicoST 23 | conda env create -f environment.yml --name calicost_env 24 | ``` 25 | 26 | 27 | Then install CalicoST using pip: 28 | ``` 29 | conda activate calicost_env 30 | pip install -e . 31 | ``` 32 | 33 | Setting up the conda environment takes around 15 minutes on an HPC head node. 34 | 35 | ## Additional installation for SNP parsing 36 | CalicoST requires allele count matrices of reference-phased A and B alleles to infer allele-specific CNAs, and provides a snakemake pipeline for obtaining the required matrices from a BAM file. Run the following commands in the CalicoST directory to install the additional package [Eagle2](https://alkesgroup.broadinstitute.org/Eagle/) used by the snakemake preprocessing pipeline. 37 | 38 | ``` 39 | mkdir external 40 | wget --directory-prefix=external https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz 41 | tar -xzf external/Eagle_v2.4.1.tar.gz -C external 42 | ``` 43 | 44 | ## Additional installation for reconstructing phylogeny 45 | Based on the cancer clones and allele-specific CNAs inferred by CalicoST, we apply Startle to reconstruct a phylogenetic tree of the clones. Install Startle by 46 | ``` 47 | git clone --recurse-submodules https://github.com/raphael-group/startle.git 48 | cd startle 49 | mkdir build; cd build 50 | cmake -DLIBLEMON_ROOT=<lemon path>\ 51 | -DCPLEX_INC_DIR=<cplex include path>\ 52 | -DCPLEX_LIB_DIR=<cplex library path>\ 53 | -DCONCERT_INC_DIR=<concert include path>\ 54 | -DCONCERT_LIB_DIR=<concert library path>\ 55 | .. 56 | make 57 | ``` 58 | 59 | 60 | # Getting started 61 | ### Preprocessing: genotyping and reference-based phasing 62 | To infer allele-specific CNAs, we generate allele count matrices in this preprocessing step. We follow the pipeline recommended by [Numbat](https://kharchenkolab.github.io/numbat/), which was designed for inferring clones and CNAs from scRNA-seq data: genotyping from the BAM file with cellsnp-lite (included in the conda environment), followed by reference-based phasing with Eagle2. Download the following panels for genotyping and reference-based phasing. 63 | * [SNP panel](https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz) - 0.5GB in size.
You can also choose other SNP panels from the [cellsnp-lite webpage](https://cellsnp-lite.readthedocs.io/en/latest/main/data.html#data-list-of-common-snps). 64 | * [Phasing panel](http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip) - 9.0GB in size. Unzip the panel after downloading. 65 | 66 | Replace the following paths in `config.yaml`: 67 | * `region_vcf`: Replace with the path of the downloaded SNP panel. 68 | * `phasing_panel`: Replace with the unzipped directory of the downloaded phasing panel. 69 | * `spaceranger_dir`: Replace with the spaceranger directory of your Visium data, which should contain the BAM file `possorted_genome_bam.bam`. 70 | * `output_snpinfo`: Replace with the desired output directory. 71 | * Replace `calicost_dir` and `eagledir` with the paths to the cloned CalicoST directory and the downloaded Eagle2 directory. 72 | 73 | Then you can run the preprocessing pipeline by 74 | ``` 75 | snakemake --cores <number of threads> --configfile config.yaml --snakefile calicost.smk all 76 | ``` 77 | 78 | ### Inferring tumor purity per spot (optional) 79 | Replace the paths in the parameter configuration file `configuration_purity` with the corresponding data/reference file paths and run 80 | ``` 81 | OMP_NUM_THREADS=1 python <calicost_dir>/src/calicost/estimate_tumor_proportion.py -c configuration_purity 82 | ``` 83 | 84 | ### Inferring clones and allele-specific CNAs 85 | Replace the paths in the parameter configuration file `configuration_cna` with the corresponding data/reference file paths and run 86 | ``` 87 | OMP_NUM_THREADS=1 python <calicost_dir>/src/calicost/calicost_main.py -c configuration_cna 88 | ``` 89 | 90 | When jointly inferring clones and CNAs across multiple SRT slices, prepare a tab-separated table with the following columns (see [`examples/example_input_filelist`](https://github.com/raphael-group/CalicoST/blob/main/examples/example_input_filelist) as an example): 91 | path to BAM file | sample ID | path to Spaceranger outs 92 | Modify `configuration_cna_multi` with the path to this table and run 93 | ``` 94 | OMP_NUM_THREADS=1 python <calicost_dir>/src/calicost/calicost_main.py -c configuration_cna_multi 95 | ``` 96 | 97 | ### Reconstructing phylogeography 98 | Based on the inferred clones and allele-specific CNAs, reconstruct the phylogeography by 99 | ``` 100 | python <calicost_dir>/src/calicost/phylogeny_startle.py -c <...> -s <...> -o <...> 101 | ``` 102 | 103 | 104 | # Tutorials 105 | Check out our [readthedocs](https://calicost.readthedocs.io/en/latest/) for the following tutorials: 106 | 1. [Inferring clones and allele-specific CNAs on simulated data](https://calicost.readthedocs.io/en/latest/notebooks/tutorials/simulated_data_tutorial.html) 107 | The simulated count matrices and parameter configuration file are available from [`examples/simulated_example.tar.gz`](https://github.com/raphael-group/CalicoST/blob/main/examples/simulated_example.tar.gz). CalicoST takes about 2h to finish on this example. 108 | 109 | 2. [Inferring tumor purity, clones, allele-specific CNAs, and phylogeography on prostate cancer data](https://calicost.readthedocs.io/en/latest/notebooks/tutorials/prostate_tutorial.html) 110 | This sample contains five slices and over 10000 spots; CalicoST takes about 9h to jointly infer CNAs and cancer clones across the slices. 111 | 112 | 115 | 116 | 132 | 133 | ### Understanding the output 134 | The above CalicoST run creates a folder `calicost` in the directory of the downloaded example data. Within this folder, each random initialization of CalicoST generates a subdirectory `calicost/clone*`. 135 | 136 | CalicoST generates the following key files for each random initialization: 137 | * clone_labels.tsv: The inferred clone labels for each spot.
138 | * cnv_seglevel.tsv: Allele-specific copy numbers of each clone for each genome segment. 139 | * cnv_genelevel.tsv: The allele-specific copy numbers projected from genome segments onto the covered genes. 140 | * cnv_diploid_seglevel.tsv, cnv_triploid_seglevel.tsv, cnv_tetraploid_seglevel.tsv, cnv_diploid_genelevel.tsv, cnv_triploid_genelevel.tsv, cnv_tetraploid_genelevel.tsv: Allele-specific copy numbers when enforcing a ploidy for each genome segment or each gene. 141 | 142 | See the following examples of the key files; a short Python snippet for loading them follows the color legend below. 143 | ``` 144 | head -10 calicost/clone3_rectangle0_w1.0/clone_labels.tsv 145 | BARCODES clone_label 146 | spot_0 2 147 | spot_1 2 148 | spot_2 2 149 | spot_3 2 150 | spot_4 2 151 | spot_5 2 152 | spot_6 2 153 | spot_7 2 154 | spot_8 0 155 | ``` 156 | 157 | ``` 158 | head -10 calicost/clone3_rectangle0_w1.0/cnv_seglevel.tsv 159 | CHR START END clone0 A clone0 B clone1 A clone1 B clone2 A clone2 B 160 | 1 1001138 1616548 1 1 1 1 1 1 161 | 1 1635227 2384877 1 1 1 1 1 1 162 | 1 2391775 6101016 1 1 1 1 1 1 163 | 1 6185020 6653223 1 1 1 1 1 1 164 | 1 6785454 7780639 1 1 1 1 1 1 165 | 1 7784320 8020748 1 1 1 1 1 1 166 | 1 8026738 9271273 1 1 1 1 1 1 167 | 1 9292894 10375267 1 1 1 1 1 1 168 | 1 10398592 11922488 1 1 1 1 1 1 169 | ``` 170 | 171 | ``` 172 | head -10 calicost/clone3_rectangle0_w1.0/cnv_genelevel.tsv 173 | gene clone0 A clone0 B clone1 A clone1 B clone2 A clone2 B 174 | A1BG 1 1 1 1 1 1 175 | A1CF 1 1 1 1 1 1 176 | A2M 1 1 1 1 1 1 177 | A2ML1-AS1 1 1 1 1 1 1 178 | AACS 1 1 1 1 1 1 179 | AADAC 1 1 1 1 1 1 180 | AADACL2-AS1 1 1 1 1 1 1 181 | AAK1 1 1 1 1 1 1 182 | AAMP 1 1 1 1 1 1 183 | ``` 184 | 185 | CalicoST produces the following plots for visualizing the inferred cancer clones in space and the allele-specific copy number profiles for each random initialization. 186 | * plots/clone_spatial.pdf: The spatial distribution of the inferred cancer clones and normal regions (grey color, clone 0 by default). 187 | * plots/rdr_baf_defaultcolor.pdf: The read depth ratio (RDR) and B allele frequency (BAF) along the genome for each clone. A higher RDR indicates a higher total copy number, and a BAF deviating from 0.5 indicates allele imbalance due to allele-specific CNAs. 188 | * plots/acn_genome.pdf: The default allele-specific copy numbers along the genome. 189 | * plots/acn_genome_diploid.pdf, plots/acn_genome_triploid.pdf, plots/acn_genome_tetraploid.pdf: Allele-specific copy numbers when enforcing a ploidy. 190 | 191 | The allele-specific copy number plots use the following color legend. 192 |

[figure: docs/_static/img/acn_color_palette.png, allele-specific copy number color legend] 193 | 194 |
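To pull these outputs into downstream analysis, the key tables load directly with pandas. Below is a minimal sketch, assuming the example output directory shown above and that the tables are tab-separated with the headers shown; adjust the paths and clone column names to your own run.
```python
import pandas as pd

# Output directory of one random initialization (example path from above).
outdir = "calicost/clone3_rectangle0_w1.0"

# Spot-to-clone assignments: columns BARCODES and clone_label.
clone_labels = pd.read_csv(f"{outdir}/clone_labels.tsv", sep="\t", index_col=0)
print(clone_labels["clone_label"].value_counts())

# Gene-level allele-specific copy numbers: one "<clone> A"/"<clone> B" column pair per clone.
acn = pd.read_csv(f"{outdir}/cnv_genelevel.tsv", sep="\t", index_col=0)

# Genes with allelic imbalance in one clone, e.g. CNLOH shows as (A, B) = (2, 0).
imbalanced = acn[acn["clone2 A"] != acn["clone2 B"]]
print(imbalanced.head())
```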

195 | 196 | 197 | # Software dependencies 198 | CalicoST uses the following command-line packages and Python libraries for extracting the BAF information: 199 | * samtools 200 | * cellsnp-lite 201 | * Eagle2 202 | * pysam 203 | * snakemake 204 | 205 | CalicoST uses the following packages for the remaining steps to infer allele-specific copy numbers and cancer clones: 206 | * numpy 207 | * scipy 208 | * pandas 209 | * scikit-learn 210 | * scanpy 211 | * anndata 212 | * numba 213 | * tqdm 214 | * statsmodels 215 | * networkx 216 | * matplotlib 217 | * seaborn 218 | * snakemake 219 | 220 | 221 | # Citations 222 | The CalicoST manuscript is available on bioRxiv. If you use CalicoST for your work, please cite our paper. 223 | ``` 224 | @article{ma2024inferring, 225 | title={Inferring allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics}, 226 | author={Ma, Cong and Balaban, Metin and Liu, Jingxian and Chen, Siqi and Ding, Li and Raphael, Benjamin}, 227 | journal={bioRxiv}, 228 | pages={2024--03}, 229 | year={2024}, 230 | publisher={Cold Spring Harbor Laboratory} 231 | } 232 | ``` -------------------------------------------------------------------------------- /calicost.smk: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy 4 | import calicost.arg_parse 5 | import calicost.parse_input 6 | 7 | 8 | rule all: 9 | input: 10 | f"{config['output_snpinfo']}/cell_snp_Aallele.npz", 11 | 12 | 13 | rule link_or_merge_bam: 14 | output: 15 | bam="{outputdir}/possorted_genome_bam.bam", 16 | bai="{outputdir}/possorted_genome_bam.bam.bai", 17 | barcodefile="{outputdir}/barcodes.txt", 18 | params: 19 | outputdir = "{outputdir}", 20 | samtools_sorting_mem=config['samtools_sorting_mem'] 21 | threads: 1 22 | log: 23 | "{outputdir}/logs/link_or_merge_bam.log" 24 | run: 25 | if "bamlist" in config: 26 | # merged BAM file 27 | shell(f"python {config['calicost_dir']}/utils/merge_bamfile.py -b {config['bamlist']} -o {params.outputdir}/ >> {log} 2>&1") 28 | shell(f"samtools sort -m {params.samtools_sorting_mem} -o {output.bam} {params.outputdir}/unsorted_possorted_genome_bam.bam >> {log} 2>&1") 29 | shell(f"samtools index {output.bam}") 30 | shell(f"rm -fr {params.outputdir}/unsorted_possorted_genome_bam.bam") 31 | 32 | # merged barcodes 33 | df_entries = pd.read_csv(config["bamlist"], sep='\t', index_col=None, header=None) 34 | df_barcodes = [] 35 | for i in range(df_entries.shape[0]): 36 | tmpdf = pd.read_csv(f"{df_entries.iloc[i,2]}/filtered_feature_bc_matrix/barcodes.tsv.gz", header=None, index_col=None) 37 | tmpdf.iloc[:,0] = [f"{x}_{df_entries.iloc[i,1]}" for x in tmpdf.iloc[:,0]] 38 | df_barcodes.append( tmpdf ) 39 | df_barcodes = pd.concat(df_barcodes, ignore_index=True) 40 | df_barcodes.to_csv(f"{output.barcodefile}", sep='\t', index=False, header=False) 41 | else: 42 | # BAM file 43 | assert "spaceranger_dir" in config 44 | print("softlink of possorted_genome_bam.bam") 45 | shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam {output.bam}") 46 | shell(f"ln -sf -T {config['spaceranger_dir']}/possorted_genome_bam.bam.bai {output.bai}") 47 | # barcodes 48 | shell(f"gunzip -c {config['spaceranger_dir']}/filtered_feature_bc_matrix/barcodes.tsv.gz > {output.barcodefile}") 49 | 50 | 51 | 52 | rule genotype: 53 | input: 54 | barcodefile="{outputdir}/barcodes.txt", 55 | bam="{outputdir}/possorted_genome_bam.bam", 56 | bai="{outputdir}/possorted_genome_bam.bam.bai" 57 |
output: 58 | vcf="{outputdir}/genotyping/cellSNP.base.vcf.gz" 59 | params: 60 | outputdir="{outputdir}", 61 | region_vcf=config['region_vcf'] 62 | threads: config['nthreads_cellsnplite'] 63 | log: 64 | "{outputdir}/logs/genotyping.log" 65 | run: 66 | shell(f"mkdir -p {params.outputdir}/genotyping") 67 | command = f"cellsnp-lite -s {input.bam} " + \ 68 | f"-b {input.barcodefile} " + \ 69 | f"-O {params.outputdir}/genotyping/ " + \ 70 | f"-R {params.region_vcf} " + \ 71 | f"-p {threads} " + \ 72 | f"--minMAF 0 --minCOUNT 2 --UMItag {config['UMItag']} --cellTAG {config['cellTAG']} --gzip >> {log} 2>&1" 73 | print(command) 74 | shell(command) 75 | 76 | 77 | 78 | rule pre_phasing: 79 | input: 80 | vcf="{outputdir}/genotyping/cellSNP.base.vcf.gz" 81 | output: 82 | expand("{{outputdir}}/phasing/chr{chrname}.vcf.gz", chrname=config["chromosomes"]) 83 | params: 84 | outputdir="{outputdir}", 85 | threads: 1 86 | run: 87 | shell(f"mkdir -p {params.outputdir}/phasing") 88 | print(f"python {config['calicost_dir']}/utils/filter_snps_forphasing.py -c {params.outputdir}/genotyping -o {params.outputdir}/phasing") 89 | shell(f"python {config['calicost_dir']}/utils/filter_snps_forphasing.py -c {params.outputdir}/genotyping -o {params.outputdir}/phasing") 90 | for chrname in config["chromosomes"]: 91 | shell(f"bgzip -f {params.outputdir}/phasing/chr{chrname}.vcf") 92 | shell(f"tabix -f {params.outputdir}/phasing/chr{chrname}.vcf.gz") 93 | 94 | 95 | rule phasing: 96 | input: 97 | vcf="{outputdir}/phasing/chr{chrname}.vcf.gz" 98 | output: 99 | "{outputdir}/phasing/chr{chrname}.phased.vcf.gz" 100 | params: 101 | outputdir="{outputdir}", 102 | chrname="{chrname}", 103 | threads: 2 104 | log: 105 | "{outputdir}/logs/phasing_chr{chrname}.log", 106 | run: 107 | command = f"{config['eagledir']}/eagle --numThreads {threads} --vcfTarget {input.vcf} " + \ 108 | f"--vcfRef {config['phasing_panel']}/chr{params.chrname}.genotypes.bcf " + \ 109 | f"--geneticMapFile={config['eagledir']}/tables/genetic_map_hg38_withX.txt.gz "+ \ 110 | f"--outPrefix {params.outputdir}/phasing/chr{params.chrname}.phased >> {log} 2>&1" 111 | shell(command) 112 | 113 | 114 | 115 | rule parse_final_snp: 116 | input: 117 | "{outputdir}/genotyping/cellSNP.base.vcf.gz", 118 | expand("{{outputdir}}/phasing/chr{chrname}.phased.vcf.gz", chrname=config["chromosomes"]), 119 | output: 120 | "{outputdir}/cell_snp_Aallele.npz", 121 | "{outputdir}/cell_snp_Ballele.npz", 122 | "{outputdir}/unique_snp_ids.npy" 123 | params: 124 | outputdir="{outputdir}", 125 | threads: 1 126 | log: 127 | "{outputdir}/logs/parse_final_snp.log" 128 | run: 129 | command = f"python {config['calicost_dir']}/utils/get_snp_matrix.py " + \ 130 | f"-c {params.outputdir}/genotyping -e {params.outputdir}/phasing -b {params.outputdir}/barcodes.txt -o {params.outputdir}/ >> {log} 2>&1" 131 | shell( command ) 132 | 133 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # path to executables or their parent directories 2 | calicost_dir: 3 | eagledir: 4 | 5 | # running parameters 6 | # samtools sort (only used when jointly calling from multiple slices) 7 | samtools_sorting_mem: "4G" 8 | # cellsnp-lite 9 | UMItag: "Auto" 10 | cellTAG: "CB" 11 | nthreads_cellsnplite: 20 12 | region_vcf: 13 | # Eagle phasing 14 | phasing_panel: 15 | chromosomes: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] 16 | 17 | # input 18 | spaceranger_dir: 19 |
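# Note: spaceranger_dir must contain possorted_genome_bam.bam (and its .bai index)
# and filtered_feature_bc_matrix/barcodes.tsv.gz; to jointly call SNPs from multiple
# slices, provide a "bamlist" table instead of spaceranger_dir (see calicost.smk).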
20 | # output 21 | output_snpinfo: 22 | -------------------------------------------------------------------------------- /configuration_cna: -------------------------------------------------------------------------------- 1 | 2 | spaceranger_dir : 3 | snp_dir : 4 | output_dir : 5 | 6 | # supporting files and preprocessing arguments 7 | geneticmap_file : /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz 8 | hgtable_file : /GRCh38_resources/hgTables_hg38_gencode.txt 9 | normalidx_file : None 10 | tumorprop_file : None 11 | supervision_clone_file : None 12 | filtergenelist_file : /GRCh38_resources/ig_gene_list.txt 13 | filterregion_file : /GRCh38_resources/HLA_regions.bed 14 | secondary_min_umi : 300 15 | bafonly : False 16 | 17 | # phase switch probability 18 | nu : 1.0 19 | logphase_shift : -2.0 20 | npart_phasing : 3 21 | 22 | # HMRF configurations 23 | n_clones : 3 24 | n_clones_rdr : 2 25 | min_spots_per_clone : 100 26 | min_avgumi_per_clone : 10 27 | maxspots_pooling : 7 28 | tumorprop_threshold : 0.5 29 | max_iter_outer : 20 30 | nodepotential : weighted_sum 31 | initialization_method : rectangle 32 | num_hmrf_initialization_start : 0 33 | num_hmrf_initialization_end : 1 34 | spatial_weight : 1.0 35 | construct_adjacency_method : hexagon 36 | construct_adjacency_w : 1.0 37 | 38 | # HMM configurations 39 | n_states : 7 40 | params : smp 41 | t : 1-1e-5 42 | t_phaseing : 0.9999 43 | fix_NB_dispersion : False 44 | shared_NB_dispersion : True 45 | fix_BB_dispersion : False 46 | shared_BB_dispersion : True 47 | max_iter : 30 48 | tol : 0.0001 49 | gmm_random_state : 0 50 | np_threshold : 1.0 51 | np_eventminlen : 10 52 | 53 | # integer copy number 54 | nonbalance_bafdist : 1.0 55 | nondiploid_rdrdist : 10.0 56 | 57 | -------------------------------------------------------------------------------- /configuration_cna_multi: -------------------------------------------------------------------------------- 1 | 2 | input_filelist: 3 | snp_dir : 4 | output_dir : 5 | 6 | # supporting files and preprocessing arguments 7 | geneticmap_file : /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz 8 | hgtable_file : /GRCh38_resources/hgTables_hg38_gencode.txt 9 | normalidx_file : None 10 | tumorprop_file : None 11 | alignment_files : 12 | supervision_clone_file : None 13 | filtergenelist_file : /GRCh38_resources/ig_gene_list.txt 14 | filterregion_file : /GRCh38_resources/HLA_regions.bed 15 | secondary_min_umi : 300 16 | bafonly : False 17 | 18 | # phase switch probability 19 | nu : 1.0 20 | logphase_shift : -2.0 21 | npart_phasing : 3 22 | 23 | # HMRF configurations 24 | n_clones : 3 25 | n_clones_rdr : 2 26 | min_spots_per_clone : 100 27 | min_avgumi_per_clone : 10 28 | maxspots_pooling : 7 29 | tumorprop_threshold : 0.5 30 | max_iter_outer : 20 31 | nodepotential : weighted_sum 32 | initialization_method : rectangle 33 | num_hmrf_initialization_start : 0 34 | num_hmrf_initialization_end : 1 35 | spatial_weight : 1.0 36 | construct_adjacency_method : hexagon 37 | construct_adjacency_w : 1.0 38 | 39 | # HMM configurations 40 | n_states : 7 41 | params : smp 42 | t : 1-1e-5 43 | t_phaseing : 0.9999 44 | fix_NB_dispersion : False 45 | shared_NB_dispersion : True 46 | fix_BB_dispersion : False 47 | shared_BB_dispersion : True 48 | max_iter : 30 49 | tol : 0.0001 50 | gmm_random_state : 0 51 | np_threshold : 1.0 52 | np_eventminlen : 10 53 | 54 | # integer copy number 55 | nonbalance_bafdist : 1.0 56 | nondiploid_rdrdist : 10.0 57 | 58 | 
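# Note: input_filelist points to a tab-separated table with one row per slice and
# three columns: path to possorted_genome_bam.bam, sample ID, path to the
# spaceranger outs/ directory. See examples/example_input_filelist for a concrete example.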
-------------------------------------------------------------------------------- /configuration_purity: -------------------------------------------------------------------------------- 1 | 2 | spaceranger_dir : 3 | snp_dir : 4 | output_dir : 5 | 6 | # supporting files and preprocessing arguments 7 | geneticmap_file : /GRCh38_resources/genetic_map_GRCh38_merged.tab.gz 8 | hgtable_file : /GRCh38_resources/hgTables_hg38_gencode.txt 9 | normalidx_file : None 10 | tumorprop_file : None 11 | alignment_files : 12 | supervision_clone_file : None 13 | filtergenelist_file : /GRCh38_resources/ig_gene_list.txt 14 | filterregion_file : /GRCh38_resources/HLA_regions.bed 15 | secondary_min_umi : 400 16 | bafonly : False 17 | 18 | # phase switch probability 19 | nu : 1.0 20 | logphase_shift : -2.0 21 | npart_phasing : 3 22 | 23 | # HMRF configurations 24 | n_clones : 5 25 | n_clones_rdr : 2 26 | min_spots_per_clone : 100 27 | min_avgumi_per_clone : 10 28 | maxspots_pooling : 19 29 | tumorprop_threshold : 0.5 30 | max_iter_outer : 20 31 | nodepotential : weighted_sum 32 | initialization_method : rectangle 33 | num_hmrf_initialization_start : 0 34 | num_hmrf_initialization_end : 1 35 | spatial_weight : 1.0 36 | construct_adjacency_method : hexagon 37 | construct_adjacency_w : 1.0 38 | 39 | # HMM configurations 40 | n_states : 7 41 | params : smp 42 | t : 1-1e-4 43 | t_phaseing : 0.9999 44 | fix_NB_dispersion : False 45 | shared_NB_dispersion : True 46 | fix_BB_dispersion : False 47 | shared_BB_dispersion : True 48 | max_iter : 30 49 | tol : 0.0001 50 | gmm_random_state : 0 51 | np_threshold : 1.0 52 | np_eventminlen : 10 53 | 54 | # integer copy number 55 | nonbalance_bafdist : 1.0 56 | nondiploid_rdrdist : 10.0 57 | 58 | -------------------------------------------------------------------------------- /docs/_ext/typed_returns.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Iterable, Iterator, List 3 | 4 | from sphinx.application import Sphinx 5 | from sphinx.ext.napoleon import NumpyDocstring 6 | 7 | 8 | def _process_return(lines: Iterable[str]) -> Iterator[str]: 9 | for line in lines: 10 | m = re.fullmatch(r"(?P<param>\w+)\s+:\s+(?P<type>[\w.]+)", line) 11 | if m: 12 | # Once this is in scanpydoc, we can use the fancy hover stuff 13 | yield f'**{m["param"]}** : :class:`~{m["type"]}`' 14 | else: 15 | yield line 16 | 17 | 18 | def _parse_returns_section(self: NumpyDocstring, section: str) -> list[str]: 19 | lines_raw = list(_process_return(self._dedent(self._consume_to_next_section()))) 20 | lines: list[str] = self._format_block(":returns: ", lines_raw) 21 | if lines and lines[-1]: 22 | lines.append("") 23 | return lines 24 | 25 | 26 | def setup(app: Sphinx) -> None: 27 | NumpyDocstring._parse_returns_section = _parse_returns_section 28 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .small { 2 | font-size: 55%; 3 | } 4 | 5 | div.version { 6 | color: #FFD92C!important; 7 | } 8 | 9 | .wy-nav-side { 10 | background: #242335; 11 | } 12 | 13 | .wy-side-nav-search { 14 | background-color: #242335; 15 | } 16 | 17 | .wy-side-nav-search input[type="text"] { 18 | border-radius: 6px!important; 19 | } 20 | 21 | .wy-nav-content { 22 | max-width: 950px; 23 | } 24 | 25 | .wy-menu-vertical a { 26 | color: #eceef4; 27 | } 28 | 29 | .wy-menu-vertical li.current { 30 | background: #f1f5fb; 31 | } 32 | 33 |
.wy-menu-vertical li.toctree-l2.current > a { 34 | background: #34377d2e; 35 | } 36 | 37 | .wy-menu-vertical li.toctree-l2.current li.toctree-l3 > a { 38 | background: #34377d4a; 39 | } 40 | 41 | .wy-menu-vertical li.toctree-l3.current li.toctree-l4 > a { 42 | background: #34377d7d; 43 | } 44 | 45 | .wy-menu-vertical a:hover { 46 | background-color: #6b86b0; 47 | } 48 | 49 | .wy-menu-vertical li.current a:hover { 50 | background: #bdcde6a3; 51 | } 52 | 53 | a { 54 | color: #5B64B1; 55 | } 56 | 57 | .rst-content .viewcode-link { 58 | color: #7013e1d9; 59 | } 60 | 61 | .highlight { 62 | background: #f1f5fb!important; 63 | } 64 | 65 | .rst-content div[class^="highlight"] { 66 | border: 1px solid #e4eaf2; 67 | } 68 | 69 | .wy-menu-vertical p.caption { 70 | color: #FFD92C; 71 | } 72 | 73 | div.output_subarea.output_html.rendered_html.output_result{ 74 | overflow: auto; 75 | } 76 | 77 | /* function/class top bar */ 78 | html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) > dt { 79 | color: #404040; 80 | border-top: solid 4px #7013e1d9; 81 | background: #FFD833A8; 82 | } 83 | 84 | /* class params */ 85 | html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list) > dt { 86 | color: #404040; 87 | border-left: solid 4px #7013e1d9; 88 | background: #FFD8338F; 89 | } 90 | 91 | /* the other elements, but more specific - leave them be */ 92 | code.docutils.literal.notranslate > span[class="pre"] { 93 | font-weight: bold; 94 | color: #404040; 95 | } 96 | 97 | /* odd rows in API */ 98 | .rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td { 99 | background-color: #f6f6f3; 100 | } 101 | 102 | .rst-content div[class^="highlight"] pre { 103 | padding: 8px; 104 | } 105 | 106 | .rst-content .seealso { 107 | background: #fafae2!important; 108 | } 109 | 110 | .rst-content .seealso .admonition-title { 111 | background: #7013e1d9!important; 112 | } 113 | -------------------------------------------------------------------------------- /docs/_static/css/dataframe.css: -------------------------------------------------------------------------------- 1 | /* Pandas dataframe css */ 2 | /* Taken from: https://github.com/spatialaudio/nbsphinx/blob/fb3ba670fc1ba5f54d4c487573dbc1b4ecf7e9ff/src/nbsphinx.py#L587-L619 */ 3 | /* modified margin-left */ 4 | 5 | table.dataframe { 6 | border: none !important; 7 | border-collapse: collapse; 8 | border-spacing: 0; 9 | border-color: transparent; 10 | color: black; 11 | font-size: 12px; 12 | table-layout: fixed; 13 | margin-left: 0!important; 14 | } 15 | 16 | table.dataframe thead { 17 | border-bottom: 1px solid black; 18 | vertical-align: bottom; 19 | } 20 | 21 | table.dataframe tr, 22 | table.dataframe th, 23 | table.dataframe td { 24 | text-align: right; 25 | vertical-align: middle; 26 | padding: 0.5em 0.5em; 27 | line-height: normal; 28 | white-space: normal; 29 | max-width: none; 30 | border: none; 31 | } 32 | 33 | table.dataframe th { 34 | font-weight: bold; 35 | } 36 | 37 | table.dataframe tbody tr:nth-child(odd) { 38 | background: #f5f5f5; 39 | } 40 | 41 | table.dataframe tbody tr:hover { 42 | background: rgba(66, 165, 245, 0.2); 43 | } 44 | -------------------------------------------------------------------------------- /docs/_static/css/nbsphinx.css: -------------------------------------------------------------------------------- 1 | div.nbinput.container div.prompt, 2 | div.nboutput.container div.prompt { 3 | 
display: none; 4 | } 5 | 6 | div.nbinput.container div.prompt > div.highlight, 7 | div.nboutput.container div.prompt > div.highlight { 8 | display: none; 9 | } 10 | 11 | div.nbinput.container div.input_area div[class*="highlight"] > pre, 12 | div.nboutput.container div.output_area div[class*="highlight"] > pre { 13 | padding: 8px!important; 14 | } 15 | 16 | div.nboutput.container div.output_area > div[class^="highlight"] { 17 | background-color: #fafae2!important; 18 | } 19 | 20 | .rst-content .output_area img { 21 | max-width: unset; 22 | width: 100% !important; 23 | height: auto !important; 24 | } 25 | -------------------------------------------------------------------------------- /docs/_static/css/sphinx_gallery.css: -------------------------------------------------------------------------------- 1 | #graph, #image, #core-tutorials, #external-tutorials, #gallery { 2 | margin-bottom: 1em; 3 | } 4 | 5 | div.sphx-glr-download a { 6 | background-color: #FFD92C9E!important; 7 | background-image: none!important; 8 | border-radius: 2px!important; 9 | border: 1px solid #f4c200!important; 10 | color: #404040!important; 11 | font-weight: bold !important; 12 | padding: 0.1cm!important; 13 | text-align: center!important; 14 | } 15 | 16 | 17 | div.sphx-glr-download a[href$=".py"] { 18 | display: none!important; 19 | } 20 | 21 | div.sphx-glr-example-title div[class="highlight"] { 22 | background-color: #F5F5F5; 23 | border: none; 24 | } 25 | 26 | /* notebook output cell */ 27 | .sphx-glr-script-out .highlight pre { 28 | background: #FDFFD9!important; 29 | } 30 | 31 | p.sphx-glr-script-out { 32 | display: none !important; 33 | } 34 | 35 | div.sphx-glr-download p { 36 | margin: 0!important; 37 | width: auto!important; 38 | } 39 | 40 | .sphx-glr-script-out { 41 | color: #404040 !important; 42 | margin: -24px 0px 0px 0px !important; 43 | } 44 | 45 | p.sphx-glr-signature { 46 | display: none!important; 47 | } 48 | 49 | div.sphx-glr-download-link-note { 50 | display: none!important; 51 | } 52 | 53 | /* this gets rid of uneven vertical padding */ 54 | div.sphx-glr-download code.download { 55 | display: block !important; 56 | } 57 | 58 | .sphx-glr-thumbcontainer { 59 | background: none !important; 60 | border: 1px solid #7013e1d9!important; 61 | text-align: center !important; 62 | min-height: 220px !important; 63 | } 64 | 65 | .sphx-glr-thumbcontainer a.internal:hover { 66 | color: #7013e1d9!important; 67 | } 68 | 69 | .sphx-glr-thumbcontainer .headerlink { 70 | display: none !important; 71 | } 72 | 73 | div.sphx-glr-thumbcontainer span { 74 | font-style: normal !important; 75 | } 76 | 77 | p.sphx-glr-timing { 78 | margin: 0 !important; 79 | padding-top: 24px; 80 | border-top: 1px solid #000; 81 | } 82 | 83 | .sphx-glr-thumbcontainer:hover { 84 | box-shadow: 0 0 10px #7013e1d9!important; 85 | } 86 | 87 | /* sphinx-gallery inserts 2 <br> after _repr_html_, ignore the 1st one */ 88 | div[class="rendered_html"] + br { 89 | display: none!important; 90 | } 91 | 92 | /* remove `Jupyter notebook: ` from `Download Jupyter notebook: `*/ 93 | div.sphx-glr-download-jupyter code.xref.download.docutils.literal.notranslate > span:nth-child(2), 94 | div.sphx-glr-download-jupyter code.xref.download.docutils.literal.notranslate > span:nth-child(3) { 95 | display: none!important; 96 | } 97 | 98 | .sphx-glr-thumbcontainer a.internal { 99 | padding: 140px 10px 0!important; 100 | } 101 | 102 | div.binder-badge img { 103 | width: 120px; 104 | } 105 | -------------------------------------------------------------------------------- /docs/_static/img/acn_color_palette.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/_static/img/acn_color_palette.png -------------------------------------------------------------------------------- /docs/_static/img/overview4_combine.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/_static/img/overview4_combine.pdf -------------------------------------------------------------------------------- /docs/_static/img/overview4_combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/_static/img/overview4_combine.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | import os 9 | import sys 10 | from datetime import datetime 11 | 12 | # from importlib.metadata import metadata 13 | from pathlib import Path 14 | 15 | from sphinx.application import Sphinx 16 | 17 | HERE = Path(__file__).parent 18 | # sys.path.insert(0, str(HERE.parent.parent)) # this way, we don't have to install squidpy 19 | # sys.path.insert(0, os.path.abspath("_ext")) 20 | 21 | sys.path.insert(0, str(HERE / "_ext")) 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'CalicoST' 26 | author = 'Ma et al.' 27 | version = '1.0.0' 28 | copyright = f"{datetime.now():%Y}, raphael-lab" 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones.
35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.napoleon", 38 | "sphinx.ext.viewcode", 39 | "sphinx_autodoc_typehints", 40 | "sphinx.ext.intersphinx", 41 | "sphinx.ext.autosummary", 42 | "sphinx.ext.mathjax", 43 | "sphinxcontrib.bibtex", 44 | "sphinx_copybutton", 45 | "myst_nb", 46 | "nbsphinx", 47 | "typed_returns", 48 | "IPython.sphinxext.ipython_console_highlighting", 49 | ] 50 | intersphinx_mapping = dict( # noqa: C408 51 | python=("https://docs.python.org/3", None), 52 | numpy=("https://numpy.org/doc/stable/", None), 53 | statsmodels=("https://www.statsmodels.org/stable/", None), 54 | scipy=("https://docs.scipy.org/doc/scipy/", None), 55 | pandas=("https://pandas.pydata.org/pandas-docs/stable/", None), 56 | anndata=("https://anndata.readthedocs.io/en/stable/", None), 57 | scanpy=("https://scanpy.readthedocs.io/en/stable/", None), 58 | matplotlib=("https://matplotlib.org/stable/", None), 59 | seaborn=("https://seaborn.pydata.org/", None), 60 | networkx=("https://networkx.org/documentation/stable/", None), 61 | sklearn=("https://scikit-learn.org/stable/", None), 62 | numba=("https://numba.readthedocs.io/en/stable/", None), 63 | ete3=("http://etetoolkit.org/docs/latest/", None), 64 | ) 65 | 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | source_suffix = {".rst": "restructuredtext", ".ipynb": "myst-nb"} 69 | master_doc = "index" 70 | pygments_style = "sphinx" 71 | 72 | # myst 73 | nb_execution_mode = "off" 74 | myst_enable_extensions = [ 75 | "colon_fence", 76 | "dollarmath", 77 | "amsmath", 78 | ] 79 | myst_heading_anchors = 2 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | # This pattern also affects html_static_path and html_extra_path. 84 | exclude_patterns = [ 85 | "notebooks/README.rst", 86 | "notebooks/CONTRIBUTING.rst", 87 | "release/changelog/*", 88 | "**.ipynb_checkpoints", 89 | "build", 90 | ] 91 | suppress_warnings = ["download.not_readable", "git.too_shallow"] 92 | 93 | # -- Options for HTML output ------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 
97 | autosummary_generate = True 98 | autodoc_member_order = "groupwise" 99 | autodoc_typehints = "signature" 100 | autodoc_docstring_signature = True 101 | napoleon_google_docstring = False 102 | napoleon_numpy_docstring = True 103 | napoleon_include_init_with_doc = False 104 | napoleon_use_rtype = True 105 | napoleon_use_param = True 106 | todo_include_todos = False 107 | 108 | # bibliography 109 | bibtex_bibfiles = ["references.bib"] 110 | bibtex_reference_style = "author_year" 111 | bibtex_default_style = "alpha" 112 | 113 | # spelling 114 | spelling_lang = "en_US" 115 | spelling_warning = True 116 | spelling_word_list_filename = "spelling_wordlist.txt" 117 | spelling_add_pypi_package_names = True 118 | spelling_show_suggestions = True 119 | spelling_exclude_patterns = ["references.rst"] 120 | # see: https://pyenchant.github.io/pyenchant/api/enchant.tokenize.html 121 | spelling_filters = [ 122 | "enchant.tokenize.URLFilter", 123 | "enchant.tokenize.EmailFilter", 124 | "docs.source.utils.ModnameFilter", 125 | "docs.source.utils.SignatureFilter", 126 | "enchant.tokenize.MentionFilter", 127 | ] 128 | # see the solution from: https://github.com/sphinx-doc/sphinx/issues/7369 129 | linkcheck_ignore = [ 130 | # 403 Client Error 131 | "https://doi.org/10.1126/science.aar7042", 132 | "https://doi.org/10.1126/science.aau5324", 133 | "https://doi.org/10.1093/bioinformatics/btab164", 134 | "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2716260/", 135 | "https://raw.githubusercontent.com/scverse/squidpy/main/docs/_static/img/figure1.png", 136 | ] 137 | 138 | # Add any paths that contain custom static files (such as style sheets) here, 139 | # relative to this directory. They are copied after the builtin static files, 140 | # so a file named "default.css" will overwrite the builtin "default.css". 141 | html_theme = "sphinx_rtd_theme" 142 | html_static_path = ["_static"] 143 | # html_logo = "_static/img/gaston_logo_v2.png" 144 | html_theme_options = {"navigation_depth": 4, "logo_only": True} 145 | html_show_sphinx = False 146 | 147 | 148 | def setup(app: Sphinx) -> None: 149 | app.add_css_file("css/custom.css") 150 | app.add_css_file("css/sphinx_gallery.css") 151 | app.add_css_file("css/nbsphinx.css") 152 | app.add_css_file("css/dataframe.css") # had to add this manually -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | CalicoST - Inferring allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics 2 | ============================================================================================================================= 3 | 4 | .. image:: https://raw.githubusercontent.com/raphael-group/CalicoST/main/docs/_static/img/overview4_combine.png 5 | :alt: CalicoST overview 6 | :width: 800px 7 | :align: center 8 | 9 | CalicoST is a probabilistic model that infers allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics. CalicoST has the following key features: 10 | 1. Identifies allele-specific integer copy numbers for each transcribed region, revealing events such as copy neutral loss of heterozygosity (CNLOH) and mirrored subclonal CNAs that are invisible to total copy number analysis. 11 | 2. Assigns each spot a clone label indicating whether the spot is primarily normal cells or a cancer clone with an aberrant copy number profile. 12 | 3.
Infers a phylogeny relating the identified cancer clones as well as a phylogeography that combines genetic evolution and spatial dissemination of clones. 13 | 4. Handles normal cell admixture in SRT technologies that are not at single-cell resolution (e.g. 10x Genomics Visium) to infer more accurate allele-specific copy numbers and cancer clones. 14 | 5. Simultaneously analyzes multiple regions or aligned SRT slices from the same tumor. 15 | 16 | 17 | Installation 18 | ------------ 19 | Find the details of installation `here <https://calicost.readthedocs.io/en/latest/installation.html>`_. 20 | 21 | Getting started with CalicoST 22 | ----------------------------- 23 | Browse the tutorials to get started with CalicoST `here <https://calicost.readthedocs.io/en/latest/tutorials.html>`_. 24 | 25 | .. toctree:: 26 | :maxdepth: 1 27 | 28 | installation 29 | tutorials 30 | parameters 31 | references 32 | 33 | .. _github: https://github.com/raphael-group/CalicoST -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | Minimum installation 4 | -------------------- 5 | First set up a conda environment from the `environment.yml` file: 6 | 7 | .. code-block:: bash 8 | 9 | git clone https://github.com/raphael-group/CalicoST.git 10 | cd CalicoST 11 | conda env create -f environment.yml --name calicost_env 12 | 13 | 14 | Then install CalicoST using pip: 15 | 16 | .. code-block:: bash 17 | 18 | conda activate calicost_env 19 | pip install -e . 20 | 21 | 22 | Setting up the conda environment takes around 15 minutes on an HPC head node. 23 | 24 | Additional installation for SNP parsing 25 | --------------------------------------- 26 | CalicoST requires allele count matrices of reference-phased A and B alleles to infer allele-specific CNAs, and provides a snakemake pipeline for obtaining the required matrices from a BAM file. Run the following commands in the CalicoST directory to install the additional package `Eagle2 <https://alkesgroup.broadinstitute.org/Eagle/>`_ used by the snakemake preprocessing pipeline. 27 | 28 | .. code-block:: bash 29 | 30 | mkdir external 31 | wget --directory-prefix=external https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz 32 | tar -xzf external/Eagle_v2.4.1.tar.gz -C external 33 | 34 | 35 | Additional installation for reconstructing phylogeny 36 | ---------------------------------------------------- 37 | Based on the cancer clones and allele-specific CNAs inferred by CalicoST, we apply Startle to reconstruct a phylogenetic tree of the clones. Install Startle by 38 | 39 | .. code-block:: bash 40 | 41 | git clone --recurse-submodules https://github.com/raphael-group/startle.git 42 | cd startle 43 | mkdir build; cd build 44 | cmake -DLIBLEMON_ROOT=<lemon path>\ 45 | -DCPLEX_INC_DIR=<cplex include path>\ 46 | -DCPLEX_LIB_DIR=<cplex library path>\ 47 | -DCONCERT_INC_DIR=<concert include path>\ 48 | -DCONCERT_LIB_DIR=<concert library path>\ 49 | .. 50 | make 51 | 52 | 53 | Prepare reference files for SNP parsing 54 | --------------------------------------- 55 | We follow the pipeline recommended by `Numbat <https://kharchenkolab.github.io/numbat/>`_ for parsing SNP information from BAM file(s): genotyping from the BAM file with cellsnp-lite (included in the conda environment), followed by reference-based phasing with Eagle2. Download the following panels for genotyping and reference-based phasing; a scripted example follows the list. 56 | 57 | * `SNP panel <https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz>`_ - 0.5GB in size. You can also choose other SNP panels from the `cellsnp-lite webpage <https://cellsnp-lite.readthedocs.io/en/latest/main/data.html#data-list-of-common-snps>`_. 58 | * `Phasing panel <http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip>`_ - 9.0GB in size. Unzip the panel after downloading.
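For convenience, the two downloads can be scripted. A minimal sketch, assuming the panels go into a local ``references/`` directory (our choice; any location works as long as the paths in ``config.yaml`` point to it):

.. code-block:: bash

    mkdir -p references
    # SNP panel for cellsnp-lite genotyping (~0.5GB)
    wget --directory-prefix=references https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz
    # phasing panel for Eagle2 (~9.0GB); unzip it after downloading
    wget --directory-prefix=references http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
    unzip references/1000G_hg38.zip -d references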
59 | -------------------------------------------------------------------------------- /docs/parameters.rst: -------------------------------------------------------------------------------- 1 | Specification of running parameters of CalicoST 2 | =============================================== 3 | 4 | Supporting reference files 5 | -------------------------- 6 | geneticmap_file: str 7 | The path to the genetic map file. 8 | 9 | hgtable_file: str 10 | The path to the file with the locations of genes in the genome. This should be a tab-delimited file with the following columns: gene_name, chrom, cdsStart, cdsEnd. 11 | 12 | normalidx_file: str, optional 13 | The path to the file containing the indices of normal spots in the spatial transcriptomics data. Each line is a single index, without a header. 14 | 15 | tumorprop_file: str, optional 16 | The path to the inferred tumor proportions per spot. This should be a tab-delimited file with the following column names: barcode, Tumor. 17 | 18 | filtergenelist_file: str, optional 19 | The path to a file listing genes to exclude from CNA inference, based on prior knowledge. 20 | 21 | filterregion_file: str, optional 22 | The path to a BED file of genomic regions to exclude from CNA inference, e.g., HLA regions. 23 | 24 | 25 | Phasing parameters 26 | ------------------ 27 | logphase_shift: float, optional 28 | Adjustment to the strength of the Markov model self-transition in phasing. The higher the value, the higher the self-transition probability. Default is -2.0. 29 | 30 | secondary_min_umi: int, optional 31 | The minimum UMI count a genome segment must have in the pseudobulk across spots during genome segmentation. Default is 300. 32 | 33 | 34 | Clone inference parameters 35 | -------------------------- 36 | n_clones: int 37 | The number of clones to infer using only BAF signals. Default is 3. 38 | 39 | n_clones_rdr: int, optional 40 | The number of clones to refine for each BAF-identified clone using RDR and BAF signals. Default is 2. 41 | 42 | min_spots_per_clone: int, optional 43 | The minimum number of spots a clone must contain to be called. Default is 100. 44 | 45 | min_avgumi_per_clone: int, optional 46 | The minimum average UMI count required for a clone. Default is 10. 47 | 48 | nodepotential: str, optional 49 | One of the following two options: "max" or "weighted_sum". "max" refers to using the MLE decoding of the HMM in evaluating the probability of spots being in each clone. "weighted_sum" refers to using the full HMM posterior probabilities to evaluate the probability of spots being in each clone. Default is "weighted_sum". 50 | 51 | spatial_weight: float, optional 52 | The strength of spatial coherence in the HMRF. The higher the value, the stronger the spatial coherence. Default is 1.0. 53 | 54 | 55 | CNA inference parameters 56 | ------------------------ 57 | n_states: int 58 | The number of allele-specific copy number states in the HMM for CNA inference. 59 | 60 | t: float, optional 61 | The self-transition probability of the HMM. The higher the value, the higher the probability that adjacent genome segments are in the same CNA state. Default is 1-1e-5. 62 | 63 | max_iter: int, optional 64 | The number of Baum-Welch steps to perform in the HMM. Default is 30. 65 | 66 | tol: float, optional 67 | The convergence threshold to terminate Baum-Welch steps. Default is 1e-4.
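As a point of reference, these CNA inference parameters appear as plain-text entries in the configuration files shipped with the repository; the corresponding excerpt of ``configuration_cna`` reads:

.. code-block:: text

    # HMM configurations (excerpt from configuration_cna)
    n_states : 7
    t : 1-1e-5
    max_iter : 30
    tol : 0.0001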
68 | 69 | 70 | Merging clones with similar CNAs 71 | -------------------------------- 72 | np_threshold: float, optional 73 | The threshold on the Neyman-Pearson statistic used to decide whether two clones have distinct CNA events. The higher the value, the more easily two clones are merged. Default is 1.0. 74 | 75 | np_eventminlen: int, optional 76 | The minimum number of consecutive genome segments to be considered a CNA event. Default is 10. 77 | -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/docs/references.rst -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 0 3 | :caption: Contents: 4 | 5 | Allele-specific CNAs and cancer clones on simulated data 6 | -------------------------------------------------------- 7 | notebooks/tutorials/simulated_data_tutorial.ipynb 8 | 9 | Cancer clones and phylogeography of a five-slice prostate cancer 10 | ---------------------------------------------------------------- 11 | notebooks/tutorials/prostate_tutorial.ipynb -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: calicost_env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python==3.10 8 | - numpy==1.24.4 9 | - scipy==1.11.3 10 | - samtools==1.18 11 | - bcftools==1.18 12 | - cellsnp-lite 13 | - snakemake 14 | - lemon 15 | -------------------------------------------------------------------------------- /examples/CalicoST_example.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/examples/CalicoST_example.tar.gz -------------------------------------------------------------------------------- /examples/example_input_filelist: -------------------------------------------------------------------------------- 1 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_2_visium/outs/possorted_genome_bam.bam H12 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_2_visium/outs/ 2 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_4_visium/outs/possorted_genome_bam.bam H14 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_4_visium/outs/ 3 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_5_visium/outs/possorted_genome_bam.bam H15 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H1_5_visium/outs/ 4 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_1_visium/outs/possorted_genome_bam.bam H21 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_1_visium/outs/ 5 | /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_5_visium/outs/possorted_genome_bam.bam H25 /u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_spaceranger/P1_H2_5_visium/outs/ 6 | --------------------------------------------------------------------------------
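Note on the filelist format: each row of ``example_input_filelist`` above has three whitespace-separated columns — the path to a slice's BAM file, a short sample label, and that slice's spaceranger output directory (the label ``H12``, for instance, appears to correspond to slice ``P1_H1_2``). A hypothetical two-slice filelist, with placeholder paths, would look like:

    /path/to/sliceA/outs/possorted_genome_bam.bam    A1    /path/to/sliceA/outs/
    /path/to/sliceB/outs/possorted_genome_bam.bam    B1    /path/to/sliceB/outs/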
/examples/prostate_example.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/examples/prostate_example.tar.gz -------------------------------------------------------------------------------- /examples/simulated_example.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/CalicoST/5e4a8a1230e71505667d51390dc9c035a69d60d9/examples/simulated_example.tar.gz -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "CalicoST" 7 | version = "1.0.0" 8 | authors = [ 9 | { name="Cong Ma", email="congma@princeton.edu" }, 10 | { name="Metin Balaban", email="metin@princeton.edu" }, 11 | { name="Jingxian Liu", email="jingxian.liu@wustl.edu" }, 12 | { name="Siqi Chen", email="siqichen@wustl.edu" }, 13 | { name="Li Ding", email="lding@wustl.edu" }, 14 | { name="Ben Raphael", email="braphael@cs.princeton.edu" }, 15 | ] 16 | description = "Inferring allele-specific copy number aberrations and tumor phylogeography from spatially resolved transcriptomics" 17 | readme = "README.md" 18 | requires-python = ">=3.8" 19 | classifiers = [ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: BSD License", 22 | "Operating System :: OS Independent", 23 | ] 24 | dependencies = [ 25 | 'numpy', 26 | 'scipy', 27 | 'pandas', 28 | 'scikit-learn', 29 | 'scanpy', 30 | 'anndata', 31 | 'numba', 32 | 'tqdm', 33 | 'statsmodels', 34 | 'networkx', 35 | 'matplotlib', 36 | 'seaborn', 37 | 'pysam', 38 | 'ete3' 39 | ] 40 | 41 | [project.optional-dependencies] 42 | docs = [ 43 | "ipython", 44 | "ipywidgets>=8.0.0", 45 | "sphinx>=5.3", 46 | "sphinx-autodoc-annotation", 47 | "sphinx-autodoc-typehints>=1.10.3", 48 | "sphinx_rtd_theme", 49 | "sphinxcontrib-bibtex>=2.3.0", 50 | "sphinxcontrib-spelling>=7.6.2", 51 | "nbsphinx>=0.8.1", 52 | "myst-nb>=0.17.1", 53 | "sphinx_copybutton>=0.5.0", 54 | ] 55 | 56 | [project.urls] 57 | "Homepage" = "https://github.com/raphael-group/CalicoST" 58 | 59 | [tool.setuptools.packages.find] 60 | where = ["src"] 61 | include = ["calicost*"] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name="calicost", 5 | version="v1.0.0", 6 | python_requires=">=3.8", 7 | packages=["calicost"], 8 | package_dir={"": "src"}, 9 | author="Cong Ma", 10 | author_email="congma@princeton.edu", 11 | description="Allele-specific CNAs and spatial cancer clone inference", 12 | long_description="CalicoST infers allele-specific copy number aberrations and cancer clones in spatially resolved transcriptomics data", 13 | url="https://github.com/raphael-group/CalicoST", 14 | install_requires=[ 15 | "numpy==1.24.4", 16 | "scipy==1.11.3", 17 | "pandas==2.1.1", 18 | "scikit-learn==1.3.2", 19 | "scanpy==1.9.6", 20 | "anndata==0.10.3", 21 | "numba==0.60.0", 22 | "tqdm==4.66.1", 23 | "statsmodels==0.14.0", 24 | "networkx==3.2.1", 25 | "matplotlib==3.7.3", 26 | "seaborn==0.12.2", 27 | "pysam==0.22.1", 28 | "ete3==3.1.3", 29 | "ipykernel", 30 | ], 31 | include_package_data=True, 32 | ) 33 | 
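A quick post-install sanity check — a sketch assuming the editable ``pip install -e .`` from the installation docs succeeded; the version string is defined in ``src/calicost/__init__.py`` just below:

.. code-block:: python

    import calicost

    # the package exposes its release tag; this prints 'v1.0.0' for this release
    print(calicost.__version__)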
-------------------------------------------------------------------------------- /src/calicost/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 'v1.0.0' 2 | -------------------------------------------------------------------------------- /src/calicost/allele_starch_generateconfig.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy 4 | import pandas as pd 5 | from pathlib import Path 6 | from sklearn.metrics import adjusted_rand_score 7 | import scanpy as sc 8 | import anndata 9 | import logging 10 | import copy 11 | from pathlib import Path 12 | import subprocess 13 | from calicost.hmm_NB_BB_phaseswitch import * 14 | from calicost.utils_distribution_fitting import * 15 | from calicost.hmrf import * 16 | from calicost.utils_IO import * 17 | 18 | 19 | def read_configuration_file(filename): 20 | ##### [Default settings] ##### 21 | config = { 22 | "spaceranger_dir" : None, 23 | "snp_dir" : None, 24 | "output_dir" : None, 25 | # supporting files and preprocessing arguments 26 | "hgtable_file" : None, 27 | "normalidx_file" : None, 28 | "tumorprop_file" : None, 29 | "supervision_clone_file" : None, 30 | "filtergenelist_file" : None, 31 | "filterregion_file" : None, 32 | "binsize" : 1, 33 | "rdrbinsize" : 1, 34 | # "secondbinning_min_umi" : 500, 35 | "max_nbins" : 1200, 36 | "avg_umi_perbinspot" : 1.5, 37 | "bafonly" : True, 38 | # phase switch probability 39 | "nu" : 1, 40 | "logphase_shift" : 1, 41 | "npart_phasing" : 2, 42 | # HMRF configurations 43 | "n_clones" : None, 44 | "n_clones_rdr" : 2, 45 | "min_spots_per_clone" : 100, 46 | "min_avgumi_per_clone" : 10, 47 | "maxspots_pooling" : 7, 48 | "tumorprop_threshold" : 0.5, 49 | "max_iter_outer" : 20, 50 | "nodepotential" : "max", # max or weighted_sum 51 | "initialization_method" : "rectangle", # rectangle or datadrive 52 | "num_hmrf_initialization_start" : 0, 53 | "num_hmrf_initialization_end" : 10, 54 | "spatial_weight" : 2.0, 55 | "construct_adjacency_method" : "hexagon", 56 | "construct_adjacency_w" : 1.0, 57 | # HMM configurations 58 | "n_states" : None, 59 | "params" : None, 60 | "t" : None, 61 | "t_phaseing" : 1-1e-4, 62 | "fix_NB_dispersion" : False, 63 | "shared_NB_dispersion" : True, 64 | "fix_BB_dispersion" : False, 65 | "shared_BB_dispersion" : True, 66 | "max_iter" : 30, 67 | "tol" : 1e-3, 68 | "gmm_random_state" : 0, 69 | "np_threshold" : 2.0, 70 | "np_eventminlen" : 10 71 | } 72 | 73 | argument_type = { 74 | "spaceranger_dir" : "str", 75 | "snp_dir" : "str", 76 | "output_dir" : "str", 77 | # supporting files and preprocessing arguments 78 | "hgtable_file" : "str", 79 | "normalidx_file" : "str", 80 | "tumorprop_file" : "str", 81 | "supervision_clone_file" : "str", 82 | "filtergenelist_file" : "str", 83 | "filterregion_file" : "str", 84 | "binsize" : "int", 85 | "rdrbinsize" : "int", 86 | # "secondbinning_min_umi" : "int", 87 | "max_nbins" : "int", 88 | "avg_umi_perbinspot" : "float", 89 | "bafonly" : "bool", 90 | # phase switch probability 91 | "nu" : "float", 92 | "logphase_shift" : "float", 93 | "npart_phasing" : "int", 94 | # HMRF configurations 95 | "n_clones" : "int", 96 | "n_clones_rdr" : "int", 97 | "min_spots_per_clone" : "int", 98 | "min_avgumi_per_clone" : "int", 99 | "maxspots_pooling" : "int", 100 | "tumorprop_threshold" : "float", 101 | "max_iter_outer" : "int", 102 | "nodepotential" : "str", 103 | "initialization_method" : "str", 104 | "num_hmrf_initialization_start" : "int", 105 |
"num_hmrf_initialization_end" : "int", 106 | "spatial_weight" : "float", 107 | "construct_adjacency_method" : "str", 108 | "construct_adjacency_w" : "float", 109 | # HMM configurations 110 | "n_states" : "int", 111 | "params" : "str", 112 | "t" : "eval", 113 | "t_phaseing" : "eval", 114 | "fix_NB_dispersion" : "bool", 115 | "shared_NB_dispersion" : "bool", 116 | "fix_BB_dispersion" : "bool", 117 | "shared_BB_dispersion" : "bool", 118 | "max_iter" : "int", 119 | "tol" : "float", 120 | "gmm_random_state" : "int", 121 | "np_threshold" : "float", 122 | "np_eventminlen" : "int" 123 | } 124 | 125 | ##### [ read configuration file to update settings ] ##### 126 | with open(filename, 'r') as fp: 127 | for line in fp: 128 | if line.strip() == "" or line[0] == "#": 129 | continue 130 | # strs = [x.replace(" ", "") for x in line.strip().split(":") if x != ""] 131 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 132 | assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" 133 | if strs[1].upper() == "NONE": 134 | config[strs[0]] = None 135 | elif argument_type[strs[0]] == "str": 136 | config[strs[0]] = strs[1] 137 | elif argument_type[strs[0]] == "int": 138 | config[strs[0]] = int(strs[1]) 139 | elif argument_type[strs[0]] == "float": 140 | config[strs[0]] = float(strs[1]) 141 | elif argument_type[strs[0]] == "eval": 142 | config[strs[0]] = eval(strs[1]) 143 | elif argument_type[strs[0]] == "bool": 144 | config[strs[0]] = (strs[1].upper() == "TRUE") 145 | elif argument_type[strs[0]] == "list_str": 146 | config[strs[0]] = strs[1].split(" ") 147 | # assertions 148 | assert not config["spaceranger_dir"] is None, "No spaceranger directory!" 149 | assert not config["snp_dir"] is None, "No SNP directory!" 150 | assert not config["output_dir"] is None, "No output directory!" 
151 | 152 | return config 153 | 154 | 155 | def write_config_file(outputfilename, config): 156 | list_argument_io = ["spaceranger_dir", 157 | "snp_dir", 158 | "output_dir"] 159 | list_argument_sup = ["hgtable_file", 160 | "normalidx_file", 161 | "tumorprop_file", 162 | "supervision_clone_file", 163 | "filtergenelist_file", 164 | "filterregion_file", 165 | "binsize", 166 | "rdrbinsize", 167 | # "secondbinning_min_umi", 168 | "max_nbins", 169 | "avg_umi_perbinspot", 170 | "bafonly"] 171 | list_argument_phase = ["nu", 172 | "logphase_shift", 173 | "npart_phasing"] 174 | list_argument_hmrf = ["n_clones", 175 | "n_clones_rdr", 176 | "min_spots_per_clone", 177 | "min_avgumi_per_clone", 178 | "maxspots_pooling", 179 | "tumorprop_threshold", 180 | "max_iter_outer", 181 | "nodepotential", 182 | "initialization_method", 183 | "num_hmrf_initialization_start", 184 | "num_hmrf_initialization_end", 185 | "spatial_weight", 186 | "construct_adjacency_method", 187 | "construct_adjacency_w"] 188 | list_argument_hmm = ["n_states", 189 | "params", 190 | "t", 191 | "t_phaseing", 192 | "fix_NB_dispersion", 193 | "shared_NB_dispersion", 194 | "fix_BB_dispersion", 195 | "shared_BB_dispersion", 196 | "max_iter", 197 | "tol", 198 | "gmm_random_state", 199 | "np_threshold", 200 | "np_eventminlen"] 201 | with open(outputfilename, 'w') as fp: 202 | # 203 | for k in list_argument_io: 204 | fp.write(f"{k} : {config[k]}\n") 205 | # 206 | fp.write("\n") 207 | fp.write("# supporting files and preprocessing arguments\n") 208 | for k in list_argument_sup: 209 | fp.write(f"{k} : {config[k]}\n") 210 | # 211 | fp.write("\n") 212 | fp.write("# phase switch probability\n") 213 | for k in list_argument_phase: 214 | fp.write(f"{k} : {config[k]}\n") 215 | # 216 | fp.write("\n") 217 | fp.write("# HMRF configurations\n") 218 | for k in list_argument_hmrf: 219 | fp.write(f"{k} : {config[k]}\n") 220 | # 221 | fp.write("\n") 222 | fp.write("# HMM configurations\n") 223 | for k in list_argument_hmm: 224 | fp.write(f"{k} : {config[k]}\n") 225 | 226 | 227 | def main(argv): 228 | template_configuration_file = argv[1] 229 | outputdir = argv[2] 230 | hmrf_seed_s = int(argv[3]) 231 | hmrf_seed_t = int(argv[4]) 232 | config = read_configuration_file(template_configuration_file) 233 | for r in range(hmrf_seed_s, hmrf_seed_t): 234 | config["num_hmrf_initialization_start"] = r 235 | config["num_hmrf_initialization_end"] = r+1 236 | write_config_file(f"{outputdir}/configfile{r}", config) 237 | 238 | 239 | if __name__ == "__main__": 240 | if len(sys.argv) > 1: 241 | main(sys.argv) -------------------------------------------------------------------------------- /src/calicost/arg_parse.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy 4 | import pandas as pd 5 | import logging 6 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 7 | logger = logging.getLogger() 8 | 9 | 10 | def load_default_config(): 11 | config_joint = { 12 | "input_filelist" : None, 13 | "alignment_files" : [] 14 | } 15 | config_single = { 16 | "spaceranger_dir" : None 17 | } 18 | config_shared = { 19 | "snp_dir" : None, 20 | "output_dir" : None, 21 | # supporting files and preprocessing arguments 22 | "geneticmap_file" : None, 23 | "hgtable_file" : None, 24 | "normalidx_file" : None, 25 | "tumorprop_file" : None, 26 | "supervision_clone_file" : None, 27 | "filtergenelist_file" : None, 28 | "filterregion_file" : None, 29 | 
"secondary_min_umi" : 300, 30 | "min_snpumi_perspot" : 50, 31 | 'min_percent_expressed_spots' : 0.005, 32 | "bafonly" : False, 33 | # phase switch probability 34 | "nu" : 1.0, 35 | "logphase_shift" : -2.0, 36 | "npart_phasing" : 3, 37 | # HMRF configurations 38 | "n_clones" : None, 39 | "n_clones_rdr" : 2, 40 | "min_spots_per_clone" : 100, 41 | "min_avgumi_per_clone" : 10, 42 | "maxspots_pooling" : 7, 43 | "tumorprop_threshold" : 0.5, 44 | "max_iter_outer" : 20, 45 | "nodepotential" : "weighted_sum", # max or weighted_sum 46 | "initialization_method" : "rectangle", # rectangle or datadrive 47 | "num_hmrf_initialization_start" : 0, 48 | "num_hmrf_initialization_end" : 10, 49 | "spatial_weight" : 1.0, 50 | "construct_adjacency_method" : "hexagon", 51 | "construct_adjacency_w" : 1.0, 52 | # HMM configurations 53 | "n_states" : None, 54 | "params" : "smp", 55 | "t" : 1-1e-5, 56 | "t_phaseing" : 1-1e-4, 57 | "fix_NB_dispersion" : False, 58 | "shared_NB_dispersion" : True, 59 | "fix_BB_dispersion" : False, 60 | "shared_BB_dispersion" : True, 61 | "max_iter" : 30, 62 | "tol" : 1e-4, 63 | "gmm_random_state" : 0, 64 | "np_threshold" : 1.0, 65 | "np_eventminlen" : 10, 66 | # integer copy number 67 | "nonbalance_bafdist" : 1.0, 68 | "nondiploid_rdrdist" : 10.0 69 | } 70 | 71 | argtype_joint = { 72 | "input_filelist" : "str", 73 | "alignment_files" : "list_str" 74 | } 75 | argtype_single = { 76 | "spaceranger_dir" : "str" 77 | } 78 | argtype_shared = { 79 | "snp_dir" : "str", 80 | "output_dir" : "str", 81 | # supporting files and preprocessing arguments 82 | "geneticmap_file" : "str", 83 | "hgtable_file" : "str", 84 | "normalidx_file" : "str", 85 | "tumorprop_file" : "str", 86 | "supervision_clone_file" : "str", 87 | "filtergenelist_file" : "str", 88 | "filterregion_file" : "str", 89 | "secondary_min_umi" : "int", 90 | "min_snpumi_perspot" : "int", 91 | 'min_percent_expressed_spots' : "float", 92 | "bafonly" : "bool", 93 | # phase switch probability 94 | "nu" : "float", 95 | "logphase_shift" : "float", 96 | "npart_phasing" : "int", 97 | # HMRF configurations 98 | "n_clones" : "int", 99 | "n_clones_rdr" : "int", 100 | "min_spots_per_clone" : "int", 101 | "min_avgumi_per_clone" : "int", 102 | "maxspots_pooling" : "int", 103 | "tumorprop_threshold" : "float", 104 | "max_iter_outer" : "int", 105 | "nodepotential" : "str", 106 | "initialization_method" : "str", 107 | "num_hmrf_initialization_start" : "int", 108 | "num_hmrf_initialization_end" : "int", 109 | "spatial_weight" : "float", 110 | "construct_adjacency_method" : "str", 111 | "construct_adjacency_w" : "float", 112 | # HMM configurations 113 | "n_states" : "int", 114 | "params" : "str", 115 | "t" : "eval", 116 | "t_phaseing" : "eval", 117 | "fix_NB_dispersion" : "bool", 118 | "shared_NB_dispersion" : "bool", 119 | "fix_BB_dispersion" : "bool", 120 | "shared_BB_dispersion" : "bool", 121 | "max_iter" : "int", 122 | "tol" : "float", 123 | "gmm_random_state" : "int", 124 | "np_threshold" : "float", 125 | "np_eventminlen" : "int", 126 | # integer copy number 127 | "nonbalance_bafdist" : "float", 128 | "nondiploid_rdrdist" : "float" 129 | } 130 | 131 | category_names = ["", "# supporting files and preprocessing arguments", "# phase switch probability", "# HMRF configurations", "# HMM configurations", "# integer copy number"] 132 | category_elements = [["input_filelist", "spaceranger_dir", "snp_dir", "output_dir"], \ 133 | ["geneticmap_file", "hgtable_file", "normalidx_file", "tumorprop_file", "alignment_files", "supervision_clone_file", 
"filtergenelist_file", "filterregion_file", "secondary_min_umi", "min_snpumi_perspot", "min_percent_expressed_spots", "bafonly"], \ 134 | ["nu", "logphase_shift", "npart_phasing"], \ 135 | ["n_clones", "n_clones_rdr", "min_spots_per_clone", "min_avgumi_per_clone", "maxspots_pooling", "tumorprop_threshold", "max_iter_outer", "nodepotential", "initialization_method", "num_hmrf_initialization_start", "num_hmrf_initialization_end", "spatial_weight", "construct_adjacency_method", "construct_adjacency_w"], \ 136 | ["n_states", "params", "t", "t_phaseing", "fix_NB_dispersion", "shared_NB_dispersion", "fix_BB_dispersion", "shared_BB_dispersion", "max_iter", "tol", "gmm_random_state", "np_threshold", "np_eventminlen"], \ 137 | ["nonbalance_bafdist", "nondiploid_rdrdist"]] 138 | return config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, category_names, category_elements 139 | 140 | 141 | def read_configuration_file(filename): 142 | ##### [Default settings] ##### 143 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 144 | config = {**config_shared, **config_single} 145 | argument_type = {**argtype_shared, **argtype_single} 146 | 147 | ##### [ read configuration file to update settings ] ##### 148 | with open(filename, 'r') as fp: 149 | for line in fp: 150 | if line.strip() == "" or line[0] == "#": 151 | continue 152 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 153 | # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" 154 | if (not strs[0] in config.keys()) and (not strs[0] in config_joint.keys()): 155 | # warning that the argument is not a valid configuration parameter and continue 156 | logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") 157 | continue 158 | if len(strs) == 1: 159 | config[strs[0]] = [] 160 | elif strs[1].upper() == "NONE": 161 | config[strs[0]] = None 162 | elif argument_type[strs[0]] == "str": 163 | config[strs[0]] = strs[1] 164 | elif argument_type[strs[0]] == "int": 165 | config[strs[0]] = int(strs[1]) 166 | elif argument_type[strs[0]] == "float": 167 | config[strs[0]] = float(strs[1]) 168 | elif argument_type[strs[0]] == "eval": 169 | config[strs[0]] = eval(strs[1]) 170 | elif argument_type[strs[0]] == "bool": 171 | config[strs[0]] = (strs[1].upper() == "TRUE") 172 | elif argument_type[strs[0]] == "list_str": 173 | config[strs[0]] = strs[1].split(" ") 174 | # assertions 175 | assert not config["spaceranger_dir"] is None, "No spaceranger directory!" 176 | assert not config["snp_dir"] is None, "No SNP directory!" 177 | assert not config["output_dir"] is None, "No output directory!" 178 | 179 | return config 180 | 181 | 182 | def read_joint_configuration_file(filename): 183 | ##### [Default settings] ##### 184 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 185 | config = {**config_shared, **config_joint} 186 | argument_type = {**argtype_shared, **argtype_joint} 187 | 188 | ##### [ read configuration file to update settings ] ##### 189 | with open(filename, 'r') as fp: 190 | for line in fp: 191 | if line.strip() == "" or line[0] == "#": 192 | continue 193 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 194 | # assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! 
Configuration parameters are: {list(config.keys())}" 195 | if (not strs[0] in config.keys()) and (not strs[0] in config_single.keys()): 196 | # warning that the argument is not a valid configuration parameter and continue 197 | logger.warning(f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}") 198 | continue 199 | if len(strs) == 1: 200 | config[strs[0]] = [] 201 | elif strs[1].upper() == "NONE": 202 | config[strs[0]] = None 203 | elif argument_type[strs[0]] == "str": 204 | config[strs[0]] = strs[1] 205 | elif argument_type[strs[0]] == "int": 206 | config[strs[0]] = int(strs[1]) 207 | elif argument_type[strs[0]] == "float": 208 | config[strs[0]] = float(strs[1]) 209 | elif argument_type[strs[0]] == "eval": 210 | config[strs[0]] = eval(strs[1]) 211 | elif argument_type[strs[0]] == "bool": 212 | config[strs[0]] = (strs[1].upper() == "TRUE") 213 | elif argument_type[strs[0]] == "list_str": 214 | config[strs[0]] = strs[1].split(" ") 215 | # assertions 216 | assert not config["input_filelist"] is None, "No input file list!" 217 | assert not config["snp_dir"] is None, "No SNP directory!" 218 | assert not config["output_dir"] is None, "No output directory!" 219 | 220 | return config 221 | 222 | 223 | def write_config_file(outputfilename, config): 224 | _,_,_, argtype_shared, argtype_joint, argtype_single, category_names, category_elements = load_default_config() 225 | argument_type = {**argtype_shared, **argtype_joint, **argtype_single} 226 | with open(outputfilename, 'w') as fp: 227 | for i in range(len(category_names)): 228 | fp.write(f"{category_names[i]}\n") 229 | for k in category_elements[i]: 230 | if k in config: 231 | if argument_type[k] == "list_str": 232 | fp.write(f"{k} : {' '.join(config[k])}\n") 233 | else: 234 | fp.write(f"{k} : {config[k]}\n") 235 | fp.write("\n") 236 | 237 | 238 | def get_default_config_single(): 239 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 240 | config = {**config_shared, **config_single} 241 | return config 242 | 243 | 244 | def get_default_config_joint(): 245 | config_shared, config_joint, config_single, argtype_shared, argtype_joint, argtype_single, _, _ = load_default_config() 246 | config = {**config_shared, **config_joint} 247 | return config 248 | 249 | 250 | def main(argv): 251 | template_configuration_file = argv[1] 252 | outputdir = argv[2] 253 | hmrf_seed_s = int(argv[3]) 254 | hmrf_seed_t = int(argv[4]) 255 | try: 256 | config = read_configuration_file(template_configuration_file) 257 | except: 258 | config = read_joint_configuration_file(template_configuration_file) 259 | 260 | for r in range(hmrf_seed_s, hmrf_seed_t): 261 | config["num_hmrf_initialization_start"] = r 262 | config["num_hmrf_initialization_end"] = r+1 263 | write_config_file(f"{outputdir}/configfile{r}", config) 264 | 265 | 266 | if __name__ == "__main__": 267 | if len(sys.argv) > 1: 268 | main(sys.argv) 269 | -------------------------------------------------------------------------------- /src/calicost/estimate_tumor_proportion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy 4 | import pandas as pd 5 | from pathlib import Path 6 | import logging 7 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 8 | logger = logging.getLogger() 9 | import copy 10 | import functools 11 | import subprocess 
import argparse  # used by the argparse.ArgumentParser in the __main__ block below
12 | from calicost.arg_parse import * 13 | from calicost.hmm_NB_BB_phaseswitch import * 14 | from calicost.parse_input import * 15 | from calicost.utils_hmrf import * 16 | from calicost.hmrf import * 17 | 18 | 19 | def main(configuration_file): 20 | try: 21 | config = read_configuration_file(configuration_file) 22 | except: 23 | config = read_joint_configuration_file(configuration_file) 24 | 25 | lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ 26 | barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts = run_parse_n_load(config) 27 | 28 | single_base_nb_mean[:,:] = 0 29 | 30 | n_states_for_tumorprop = 5 31 | n_clones_for_tumorprop = 3 32 | n_rdrclones_for_tumorprop = 3 #2 33 | max_outer_iter_for_tumorprop = 10 34 | max_iter_for_tumorprop = 20 35 | MIN_PROP_UNCERTAINTY = 0.05 36 | initial_clone_index = rectangle_initialize_initial_clone(coords, n_clones_for_tumorprop, random_state=0) 37 | # save clone initialization into npz file 38 | prefix = "initialhmm" 39 | if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): 40 | initial_assignment = np.zeros(single_X.shape[2], dtype=int) 41 | for c,idx in enumerate(initial_clone_index): 42 | initial_assignment[idx] = c 43 | allres = {"num_iterations":0, "round-1_assignment":initial_assignment} 44 | np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) 45 | 46 | hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X, lengths, single_base_nb_mean, single_total_bb_RD, initial_clone_index, n_states=n_states_for_tumorprop, \ 47 | log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat, adjacency_mat=adjacency_mat, sample_ids=sample_ids, max_iter_outer=max_outer_iter_for_tumorprop, nodepotential=config["nodepotential"], \ 48 | hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ 49 | fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ 50 | fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ 51 | is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) 52 | 53 | res = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") 54 | merging_groups, merged_res = merge_by_minspots(res["new_assignment"], res, single_total_bb_RD, min_spots_thresholds=config["min_spots_per_clone"], min_umicount_thresholds=config["min_avgumi_per_clone"]*single_X.shape[0]) 55 | 56 | # further refine clones 57 | combined_assignment = copy.copy(merged_res['new_assignment']) 58 | offset_clone = 0 59 | combined_p_binom = [] 60 | offset_state = 0 61 | combined_pred_cnv = [] 62 | for bafc in range(len(merging_groups)): 63 | prefix = f"initialhmm_clone{bafc}" 64 | idx_spots = np.where(merged_res['new_assignment'] == bafc)[0] 65 | total_allele_count = np.sum(single_total_bb_RD[:, idx_spots]) 66 | if total_allele_count < single_X.shape[0] * 50: # put a minimum B allele read count on pseudobulk to split clones 67 | combined_assignment[idx_spots] = offset_clone 68 | offset_clone += 1 69 | combined_p_binom.append(merged_res['new_p_binom']) 70 | combined_pred_cnv.append(merged_res['pred_cnv'] + offset_state) 71 | offset_state += merged_res['new_p_binom'].shape[0] 72 | continue 73 | # initialize clone 74 | initial_clone_index = 
rectangle_initialize_initial_clone(coords[idx_spots], n_rdrclones_for_tumorprop, random_state=0) 75 | # save clone initialization into npz file 76 | if not Path(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz").exists(): 77 | initial_assignment = np.zeros(len(idx_spots), dtype=int) 78 | for c,idx in enumerate(initial_clone_index): 79 | initial_assignment[idx] = c 80 | allres = {"barcodes":barcodes[idx_spots], "num_iterations":0, "round-1_assignment":initial_assignment} 81 | np.savez(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz", **allres) 82 | 83 | copy_slice_sample_ids = copy.copy(sample_ids[idx_spots]) 84 | hmrf_concatenate_pipeline(config['output_dir'], prefix, single_X[:,:,idx_spots], lengths, single_base_nb_mean[:,idx_spots], single_total_bb_RD[:,idx_spots], initial_clone_index, n_states=n_states_for_tumorprop, \ 85 | log_sitewise_transmat=log_sitewise_transmat, smooth_mat=smooth_mat[np.ix_(idx_spots,idx_spots)], adjacency_mat=adjacency_mat[np.ix_(idx_spots,idx_spots)], sample_ids=copy_slice_sample_ids, max_iter_outer=10, nodepotential=config["nodepotential"], \ 86 | hmmclass=hmm_nophasing_v2, params="sp", t=config["t"], random_state=config["gmm_random_state"], \ 87 | fix_NB_dispersion=config["fix_NB_dispersion"], shared_NB_dispersion=config["shared_NB_dispersion"], \ 88 | fix_BB_dispersion=config["fix_BB_dispersion"], shared_BB_dispersion=config["shared_BB_dispersion"], \ 89 | is_diag=True, max_iter=max_iter_for_tumorprop, tol=config["tol"], spatial_weight=config["spatial_weight"]) 90 | 91 | cloneres = load_hmrf_last_iteration(f"{config['output_dir']}/{prefix}_nstates{n_states_for_tumorprop}_sp.npz") 92 | combined_assignment[idx_spots] = cloneres['new_assignment'] + offset_clone 93 | offset_clone += np.max(cloneres['new_assignment']) + 1 94 | combined_p_binom.append(cloneres['new_p_binom']) 95 | combined_pred_cnv.append(cloneres['pred_cnv'] + offset_state) 96 | offset_state += cloneres['new_p_binom'].shape[0] 97 | combined_p_binom = np.vstack(combined_p_binom) 98 | combined_pred_cnv = np.concatenate(combined_pred_cnv) 99 | 100 | normal_candidate = identify_normal_spots(single_X, single_total_bb_RD, merged_res['new_assignment'], merged_res['pred_cnv'], merged_res['new_p_binom'], min_count=single_X.shape[0] * 200) 101 | loh_states, is_B_lost, rdr_values, clones_hightumor = identify_loh_per_clone(single_X, combined_assignment, combined_pred_cnv, combined_p_binom, normal_candidate, single_total_bb_RD) 102 | assignments = pd.DataFrame({'coarse':merged_res['new_assignment'], 'combined':combined_assignment}) 103 | # pool across adjacency spot to increase the UMIs covering LOH region 104 | _, tp_smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, 105 | across_slice_adjacency_mat=None, construct_adjacency_method=config['construct_adjacency_method'], 106 | maxspots_pooling=7, construct_adjacency_w=config['construct_adjacency_w']) 107 | single_tumor_prop, _ = estimator_tumor_proportion(single_X, single_total_bb_RD, assignments, combined_pred_cnv, loh_states, is_B_lost, rdr_values, clones_hightumor, smooth_mat=tp_smooth_mat) 108 | # post-processing to remove negative tumor proportions 109 | single_tumor_prop = np.where(single_tumor_prop < MIN_PROP_UNCERTAINTY, MIN_PROP_UNCERTAINTY, single_tumor_prop) 110 | single_tumor_prop[normal_candidate] = 0 111 | # save single_tumor_prop to file 112 | pd.DataFrame({"Tumor":single_tumor_prop}, 
index=barcodes).to_csv(f"{config['output_dir']}/loh_estimator_tumor_prop.tsv", header=True, sep="\t") 113 | 114 | 115 | if __name__ == "__main__": 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) 118 | args = parser.parse_args() 119 | 120 | main(args.configfile) -------------------------------------------------------------------------------- /src/calicost/hmm_NB_BB_nophasing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from numba import njit 4 | from scipy.stats import norm, multivariate_normal, poisson 5 | import scipy.special 6 | from scipy.optimize import minimize 7 | from scipy.optimize import Bounds 8 | from sklearn.mixture import GaussianMixture 9 | from tqdm import trange 10 | import statsmodels.api as sm 11 | from statsmodels.base.model import GenericLikelihoodModel 12 | import copy 13 | from calicost.utils_distribution_fitting import * 14 | from calicost.utils_hmm import * 15 | import networkx as nx 16 | 17 | 18 | ############################################################ 19 | # whole inference 20 | ############################################################ 21 | 22 | class hmm_nophasing(object): 23 | def __init__(self, params="stmp", t=1-1e-4): 24 | """ 25 | Attributes 26 | ---------- 27 | params : str 28 | Codes for parameters that need to be updated. The corresponding parameter can only be updated if it is included in this argument. "s" for start probability; "t" for transition probability; "m" for Negative Binomial RDR signal; "p" for Beta Binomial BAF signal. 29 | 30 | t : float 31 | Determine initial self transition probability to be 1-t. 32 | """ 33 | self.params = params 34 | self.t = t 35 | # 36 | @staticmethod 37 | def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus): 38 | """ 39 | Attributes 40 | ---------- 41 | X : array, shape (n_observations, n_components, n_spots) 42 | Observed expression UMI count and allele frequency UMI count. 43 | 44 | base_nb_mean : array, shape (n_observations, n_spots) 45 | Mean expression under diploid state. 46 | 47 | log_mu : array, shape (n_states, n_spots) 48 | Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. 49 | 50 | alphas : array, shape (n_states, n_spots) 51 | Over-dispersion of NB distributions in HMM per state per spot. 52 | 53 | total_bb_RD : array, shape (n_observations, n_spots) 54 | SNP-covering reads for both REF and ALT across genes along genome. 55 | 56 | p_binom : array, shape (n_states, n_spots) 57 | BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. 58 | 59 | taus : array, shape (n_states, n_spots) 60 | Over-dispersion of Beta Binomial distribution in HMM per state per spot. 61 | 62 | Returns 63 | ---------- 64 | log_emission : array, shape (n_states, n_obs, n_spots) 65 | Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
66 | """ 67 | n_obs = X.shape[0] 68 | n_comp = X.shape[1] 69 | n_spots = X.shape[2] 70 | n_states = log_mu.shape[0] 71 | # initialize log_emission 72 | log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) 73 | log_emission_baf = np.zeros((n_states, n_obs, n_spots)) 74 | for i in np.arange(n_states): 75 | for s in np.arange(n_spots): 76 | # expression from NB distribution 77 | idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] 78 | if len(idx_nonzero_rdr) > 0: 79 | nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) 80 | nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) 81 | n, p = convert_params(nb_mean, nb_std) 82 | log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) 83 | # AF from BetaBinom distribution 84 | idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] 85 | if len(idx_nonzero_baf) > 0: 86 | log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s]) 87 | return log_emission_rdr, log_emission_baf 88 | # 89 | @staticmethod 90 | def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs): 91 | """ 92 | Attributes 93 | ---------- 94 | X : array, shape (n_observations, n_components, n_spots) 95 | Observed expression UMI count and allele frequency UMI count. 96 | 97 | base_nb_mean : array, shape (n_observations, n_spots) 98 | Mean expression under diploid state. 99 | 100 | log_mu : array, shape (n_states, n_spots) 101 | Log of read depth change due to CNV. Mean of NB distributions in HMM per state per spot. 102 | 103 | alphas : array, shape (n_states, n_spots) 104 | Over-dispersion of NB distributions in HMM per state per spot. 105 | 106 | total_bb_RD : array, shape (n_observations, n_spots) 107 | SNP-covering reads for both REF and ALT across genes along genome. 108 | 109 | p_binom : array, shape (n_states, n_spots) 110 | BAF due to CNV. Mean of Beta Binomial distribution in HMM per state per spot. 111 | 112 | taus : array, shape (n_states, n_spots) 113 | Over-dispersion of Beta Binomial distribution in HMM per state per spot. 114 | 115 | Returns 116 | ---------- 117 | log_emission : array, shape (n_states, n_obs, n_spots) 118 | Log emission probability for each gene each spot (or sample) under each state. There is a common bag of states across all spots. 
119 | """ 120 | n_obs = X.shape[0] 121 | n_comp = X.shape[1] 122 | n_spots = X.shape[2] 123 | n_states = log_mu.shape[0] 124 | # initialize log_emission 125 | log_emission_rdr = np.zeros((n_states, n_obs, n_spots)) 126 | log_emission_baf = np.zeros((n_states, n_obs, n_spots)) 127 | for i in np.arange(n_states): 128 | for s in np.arange(n_spots): 129 | # expression from NB distribution 130 | idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0] 131 | if len(idx_nonzero_rdr) > 0: 132 | # nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[s]) 133 | nb_mean = base_nb_mean[idx_nonzero_rdr,s] * (tumor_prop[idx_nonzero_rdr,s] * np.exp(log_mu[i, s]) + 1 - tumor_prop[idx_nonzero_rdr,s]) 134 | nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2) 135 | n, p = convert_params(nb_mean, nb_std) 136 | log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p) 137 | # AF from BetaBinom distribution 138 | idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0] 139 | if len(idx_nonzero_baf) > 0: 140 | # mix_p_A = p_binom[i, s] * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) 141 | # mix_p_B = (1 - p_binom[i, s]) * tumor_prop[s] + 0.5 * (1 - tumor_prop[s]) 142 | mix_p_A = p_binom[i, s] * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) 143 | mix_p_B = (1 - p_binom[i, s]) * tumor_prop[idx_nonzero_baf,s] + 0.5 * (1 - tumor_prop[idx_nonzero_baf,s]) 144 | log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s]) 145 | return log_emission_rdr, log_emission_baf 146 | # 147 | @staticmethod 148 | @njit 149 | def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): 150 | ''' 151 | Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. 152 | Input 153 | lengths: sum of lengths = n_observations. 154 | log_transmat: n_states * n_states. Transition probability after log transformation. 155 | log_startprob: n_states. Start probability after log transformation. 156 | log_emission: n_states * n_observations * n_spots. Log probability. 157 | Output 158 | log_alpha: size n_states * n_observations. log alpha[j, t] = log P(o_1, ... o_t, q_t = j | lambda). 159 | ''' 160 | n_obs = log_emission.shape[1] 161 | n_states = log_emission.shape[0] 162 | assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" 163 | assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" 164 | # initialize log_alpha 165 | log_alpha = np.zeros((log_emission.shape[0], n_obs)) 166 | buf = np.zeros(log_emission.shape[0]) 167 | cumlen = 0 168 | for le in lengths: 169 | # start prob 170 | # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 171 | # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 
172 | log_alpha[:, cumlen] = log_startprob + np_sum_ax_squeeze(log_emission[:, cumlen, :], axis=1) 173 | for t in np.arange(1, le): 174 | for j in np.arange(log_emission.shape[0]): 175 | for i in np.arange(log_emission.shape[0]): 176 | buf[i] = log_alpha[i, (cumlen + t - 1)] + log_transmat[i, j] 177 | log_alpha[j, (cumlen + t)] = mylogsumexp(buf) + np.sum(log_emission[j, (cumlen + t), :]) 178 | cumlen += le 179 | return log_alpha 180 | # 181 | @staticmethod 182 | @njit 183 | def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat): 184 | ''' 185 | Note that n_states is the CNV states, and there are n_states of paired states for (CNV, phasing) pairs. 186 | Input 187 | X: size n_observations * n_components * n_spots. 188 | lengths: sum of lengths = n_observations. 189 | log_transmat: n_states * n_states. Transition probability after log transformation. 190 | log_startprob: n_states. Start probability after log transformation. 191 | log_emission: n_states * n_observations * n_spots. Log probability. 192 | Output 193 | log_beta: size 2*n_states * n_observations. log beta[i, t] = log P(o_{t+1}, ..., o_T | q_t = i, lambda). 194 | ''' 195 | n_obs = log_emission.shape[1] 196 | n_states = log_emission.shape[0] 197 | assert np.sum(lengths) == n_obs, "Sum of lengths must be equal to the first dimension of X!" 198 | assert len(log_startprob) == n_states, "Length of startprob_ must be equal to the first dimension of log_transmat!" 199 | # initialize log_beta 200 | log_beta = np.zeros((log_emission.shape[0], n_obs)) 201 | buf = np.zeros(log_emission.shape[0]) 202 | cumlen = 0 203 | for le in lengths: 204 | # start prob 205 | # ??? Theoretically, joint distribution across spots under iid is the prod (or sum) of individual (log) probabilities. 206 | # But adding too many spots may lead to a higher weight of the emission rather then transition prob. 207 | log_beta[:, (cumlen + le - 1)] = 0 208 | for t in np.arange(le-2, -1, -1): 209 | for i in np.arange(log_emission.shape[0]): 210 | for j in np.arange(log_emission.shape[0]): 211 | buf[j] = log_beta[j, (cumlen + t + 1)] + log_transmat[i, j] + np.sum(log_emission[j, (cumlen + t + 1), :]) 212 | log_beta[i, (cumlen + t)] = mylogsumexp(buf) 213 | cumlen += le 214 | return log_beta 215 | 216 | # 217 | def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, tp_weight_by_mu=None, \ 218 | fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \ 219 | is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs): 220 | ''' 221 | Input 222 | X: size n_observations * n_components * n_spots. 223 | lengths: sum of lengths = n_observations. 224 | base_nb_mean: size of n_observations * n_spots. 225 | In NB-BetaBinom model, n_components = 2 226 | Intermediate 227 | log_mu: size of n_states. Log of mean/exposure/base_prob of each HMM state. 228 | alpha: size of n_states. Dispersioon parameter of each HMM state. 
229 | ''' 230 | n_obs = X.shape[0] 231 | n_comp = X.shape[1] 232 | n_spots = X.shape[2] 233 | assert n_comp == 2 234 | # initialize NB logmean shift and BetaBinom prob 235 | log_mu = np.vstack([np.linspace(-0.1, 0.1, n_states) for r in range(n_spots)]).T if init_log_mu is None else init_log_mu 236 | p_binom = np.vstack([np.linspace(0.05, 0.45, n_states) for r in range(n_spots)]).T if init_p_binom is None else init_p_binom 237 | # initialize (inverse of) dispersion param in NB and BetaBinom 238 | alphas = 0.1 * np.ones((n_states, n_spots)) if init_alphas is None else init_alphas 239 | taus = 30 * np.ones((n_states, n_spots)) if init_taus is None else init_taus 240 | # initialize start probability and emission probability 241 | log_startprob = np.log( np.ones(n_states) / n_states ) 242 | if n_states > 1: 243 | transmat = np.ones((n_states, n_states)) * (1-self.t) / (n_states-1) 244 | np.fill_diagonal(transmat, self.t) 245 | log_transmat = np.log(transmat) 246 | else: 247 | log_transmat = np.zeros((1,1)) 248 | # a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count) 249 | unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean) 250 | unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD) 251 | # EM algorithm 252 | for r in trange(max_iter): 253 | # E step 254 | if tumor_prop is None: 255 | log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus) 256 | log_emission = log_emission_rdr + log_emission_baf 257 | else: 258 | log_emission_rdr, log_emission_baf = hmm_nophasing.compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop) 259 | log_emission = log_emission_rdr + log_emission_baf 260 | log_alpha = hmm_nophasing.forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) 261 | log_beta = hmm_nophasing.backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat) 262 | log_gamma = compute_posterior_obs(log_alpha, log_beta) 263 | log_xi = compute_posterior_transition_nophasing(log_alpha, log_beta, log_transmat, log_emission) 264 | # M step 265 | if "s" in self.params: 266 | new_log_startprob = update_startprob_nophasing(lengths, log_gamma) 267 | new_log_startprob = new_log_startprob.flatten() 268 | else: 269 | new_log_startprob = log_startprob 270 | if "t" in self.params: 271 | new_log_transmat = update_transition_nophasing(log_xi, is_diag=is_diag) 272 | else: 273 | new_log_transmat = log_transmat 274 | if "m" in self.params: 275 | if tumor_prop is None: 276 | new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, start_log_mu=log_mu, \ 277 | fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) 278 | else: 279 | new_log_mu, new_alphas = update_emission_params_nb_nophasing_uniqvalues_mix(unique_values_nb, mapping_matrices_nb, log_gamma, alphas, tumor_prop, start_log_mu=log_mu, \ 280 | fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion) 281 | else: 282 | new_log_mu = log_mu 283 | new_alphas = alphas 284 | if "p" in self.params: 285 | if tumor_prop is None: 286 | new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues(unique_values_bb, mapping_matrices_bb, log_gamma, taus, start_p_binom=p_binom, \ 287 | 
fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) 288 | else: 289 | new_p_binom, new_taus = update_emission_params_bb_nophasing_uniqvalues_mix(unique_values_bb, mapping_matrices_bb, log_gamma, taus, tumor_prop, start_p_binom=p_binom, \ 290 | fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion) 291 | else: 292 | new_p_binom = p_binom 293 | new_taus = taus 294 | # check convergence 295 | print( np.mean(np.abs( np.exp(new_log_startprob) - np.exp(log_startprob) )), \ 296 | np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )), \ 297 | np.mean(np.abs(new_log_mu - log_mu)),\ 298 | np.mean(np.abs(new_p_binom - p_binom)) ) 299 | print( np.hstack([new_log_mu, new_p_binom]) ) 300 | if np.mean(np.abs( np.exp(new_log_transmat) - np.exp(log_transmat) )) < tol and \ 301 | np.mean(np.abs(new_log_mu - log_mu)) < tol and np.mean(np.abs(new_p_binom - p_binom)) < tol: 302 | break 303 | log_startprob = new_log_startprob 304 | log_transmat = new_log_transmat 305 | log_mu = new_log_mu 306 | alphas = new_alphas 307 | p_binom = new_p_binom 308 | taus = new_taus 309 | return new_log_mu, new_alphas, new_p_binom, new_taus, new_log_startprob, new_log_transmat, log_gamma 310 | 311 | 312 | -------------------------------------------------------------------------------- /src/calicost/hmm_NB_sharedstates.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import inspect 3 | import logging 4 | 5 | import numpy as np 6 | import scipy 7 | from scipy import linalg, special 8 | from scipy.special import logsumexp 9 | from sklearn import cluster 10 | from sklearn.utils import check_random_state 11 | from hmmlearn.hmm import BaseHMM 12 | import statsmodels 13 | import statsmodels.api as sm 14 | from statsmodels.base.model import GenericLikelihoodModel 15 | 16 | 17 | def convert_params(mean, std): 18 | """ 19 | Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports 20 | 21 | See https://mathworld.wolfram.com/NegativeBinomialDistribution.html 22 | """ 23 | p = mean/std**2 24 | n = mean*p/(1.0 - p) 25 | return n, p 26 | 27 | 28 | class Weighted_NegativeBinomial(GenericLikelihoodModel): 29 | def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): 30 | super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) 31 | self.weights = weights 32 | self.exposure = exposure 33 | self.seed = seed 34 | # 35 | def nloglikeobs(self, params): 36 | nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure 37 | nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) 38 | n, p = convert_params(nb_mean, nb_std) 39 | llf = scipy.stats.nbinom.logpmf(self.endog, n, p) 40 | neg_sum_llf = -llf.dot(self.weights) 41 | return neg_sum_llf 42 | # 43 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 44 | self.exog_names.append('alpha') 45 | 46 | if start_params is None: 47 | if hasattr(self, 'start_params'): 48 | start_params = self.start_params 49 | else: 50 | start_params = np.append(0.1 * np.ones(self.nparams), 0.01) 51 | 52 | return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, 53 | maxiter=maxiter, maxfun=maxfun, 54 | **kwds) 55 | 56 | 57 | class ConstrainedNBHMM(BaseHMM): 58 | """ 59 | HMM model with NB emission probability and constraint of all cells have the shared hidden state vector. 60 | A degenerative case is to use pseudobulk UMI count matrix of size G genes by 1 cell. 
61 | 62 | Attributes 63 | ---------- 64 | base_nb_mean : array, shape (n_genes, n_cells) 65 | Mean expression under diploid state. 66 | 67 | startprob_ : array, shape (n_components) 68 | Initial state occupation distribution. 69 | 70 | transmat_ : array, shape (n_components, n_components) 71 | Matrix of transition probabilities between states. 72 | 73 | log_mu : array, shape (n_components) 74 | Shift in log of expression due to CNV. Each CNV states (components) has it's own shift value. 75 | 76 | params : str 77 | "s" for start probability, "t" for transition probability, "m" for log of expression shift due to CNV, "a" for inverse dispersion of NB distribution. 78 | 79 | Examples 80 | ---------- 81 | base_nb_mean = eta.reshape(-1,1) * np.sum(totalUMI) 82 | hmmmodel = ConstrainedNBHMM(n_components=3) 83 | X = np.vstack( [np.sum(count,axis=0), base_nb_mean] ).T 84 | hmmmodel.fit( X ) 85 | hmmmodel.predict( X ) 86 | """ 87 | def __init__(self, n_components=1, shared_dispersion=False, 88 | startprob_prior=1.0, transmat_prior=1.0, 89 | algorithm="viterbi", random_state=None, 90 | n_iter=10, tol=1e-2, verbose=False, 91 | params="stma", 92 | init_params=""): 93 | BaseHMM.__init__(self, n_components, 94 | startprob_prior=startprob_prior, 95 | transmat_prior=transmat_prior, algorithm=algorithm, 96 | random_state=random_state, n_iter=n_iter, 97 | tol=tol, params=params, verbose=verbose, 98 | init_params=init_params) 99 | self.shared_dispersion = shared_dispersion 100 | # initialize CNV's effect 101 | self.log_mu = np.linspace(-0.1, 0.1, self.n_components) 102 | # initialize inverse of dispersion 103 | self.alphas = np.array([0.01] * self.n_components) 104 | # self.alphas = 0.01 * np.ones(s(self.n_components, self.n_genes)) 105 | # initialize start probability and transition probability 106 | self.startprob_ = np.ones(self.n_components) / self.n_components 107 | t = 0.9 108 | self.transmat_ = np.ones((self.n_components, self.n_components)) * (1-t) / (self.n_components-1) 109 | np.fill_diagonal(self.transmat_, t) 110 | # 111 | def _compute_log_likelihood(self, X): 112 | """ 113 | Compute log likelihood of X. 114 | 115 | Attributes 116 | ---------- 117 | X : array_like, shape (n_genes, 2*n_cells) 118 | First (n_genes, n_cells) is the observed UMI count matrix; second (n_genes, n_cells) is base_nb_mean. 119 | 120 | Returns 121 | ------- 122 | lpr : array_like, shape (n_genes, n_components) 123 | Array containing the log probabilities of each data point in X. 124 | """ 125 | n_genes = X.shape[0] 126 | n_cells = int(X.shape[1] / 2) 127 | base_nb_mean = X[:, n_cells:] 128 | log_prob = np.zeros((n_genes, n_cells, self.n_components)) 129 | for i in range(self.n_components): 130 | nb_mean = base_nb_mean * np.exp(self.log_mu[i]) 131 | nb_std = np.sqrt(nb_mean + self.alphas[i] * nb_mean**2) 132 | # nb_std = np.sqrt(nb_mean + self.alphas[i,:].reshape(-1,1) * nb_mean**2) 133 | n, p = convert_params(nb_mean, nb_std) 134 | log_prob[:,:,i] = scipy.stats.nbinom.logpmf(X[:, :n_cells], n, p) 135 | return log_prob.mean(axis=1) 136 | # 137 | def _initialize_sufficient_statistics(self): 138 | stats = super()._initialize_sufficient_statistics() 139 | return stats 140 | # 141 | def _accumulate_sufficient_statistics(self, stats, X, lattice, posteriors, fwdlattice, bwdlattice): 142 | super()._accumulate_sufficient_statistics( 143 | stats, X, lattice, posteriors, fwdlattice, bwdlattice) 144 | """ 145 | Update sufficient statistics from a given sample. 
146 | Parameters 147 | ---------- 148 | stats : dict 149 | Sufficient statistics as returned by 150 | :meth:`~.BaseHMM._initialize_sufficient_statistics`. 151 | X : array, shape (n_genes, n_cells) 152 | Sample sequence. 153 | lattice : array, shape (n_genes, n_components) 154 | Probabilities OR Log Probabilities of each sample 155 | under each of the model states. Depends on the choice 156 | of implementation of the Forward-Backward algorithm 157 | posteriors : array, shape (n_genes, n_components) 158 | Posterior probabilities of each sample being generated by each 159 | of the model states. 160 | fwdlattice, bwdlattice : array, shape (n_genes, n_components) 161 | forward and backward probabilities. 162 | """ 163 | if 'm' in self.params or 'a' in self.params: 164 | stats['post'] = posteriors 165 | stats['obs'] = X 166 | if 't' in self.params: 167 | # for each ij, recover sum_t xi_ij from the inferred transition matrix 168 | bothlattice = fwdlattice + bwdlattice 169 | loggamma = (bothlattice.T - logsumexp(bothlattice, axis = 1)).T 170 | 171 | # denominator for each ij is the sum of gammas over i 172 | denoms = np.sum(np.exp(loggamma), axis = 0) 173 | # transpose to perform row-wise multiplication 174 | stats['denoms'] = denoms 175 | # 176 | def _do_mstep(self, stats): 177 | n_genes = stats['obs'].shape[0] 178 | n_cells = int(stats['obs'].shape[1] / 2) 179 | base_nb_mean = stats['obs'][:, n_cells:] 180 | super()._do_mstep(stats) 181 | if 'm' in self.params and 'a' in self.params: 182 | # NB regression fit dispersion and CNV's effect simultaneously 183 | if not self.shared_dispersion: 184 | for i in range(self.n_components): 185 | model = Weighted_NegativeBinomial(stats['obs'][:, :n_cells].flatten(), \ 186 | np.ones(n_genes*n_cells).reshape(-1,1), \ 187 | weights=np.repeat(stats['post'][:,i], n_cells), exposure=base_nb_mean.flatten()) 188 | res = model.fit(disp=0, maxiter=500) 189 | self.log_mu[i] = res.params[0] 190 | self.alphas[i] = res.params[-1] 191 | # self.alphas[i,:] = res.params[-1] 192 | else: 193 | all_states_nb_mean = np.tile(base_nb_mean.flatten(), self.n_components) 194 | all_states_y = np.tile(stats['obs'][:, :n_cells].flatten(), self.n_components) 195 | all_states_weights = np.concatenate([np.repeat(stats['post'][:,i], n_cells) for i in range(self.n_components)]) 196 | all_states_features = np.zeros((self.n_components*n_genes*n_cells, self.n_components)) 197 | for i in np.arange(self.n_components): 198 | all_states_features[(i*n_genes*n_cells):((i+1)*n_genes*n_cells), i] = 1 199 | model = Weighted_NegativeBinomial(all_states_y, all_states_features, weights=all_states_weights, exposure=all_states_nb_mean) 200 | res = model.fit(disp=0, maxiter=500) 201 | self.log_mu = res.params[:-1] 202 | self.alphas[:] = res.params[-1] 203 | # self.alphas[:,:] = res.params[-1] 204 | # print(res.params) 205 | elif 'm' in self.params: 206 | # NB regression fit CNV's effect only 207 | for i in range(self.n_components): 208 | model = sm.GLM(stats['obs'].flatten(), np.ones(self.n_genes*self.n_cells).reshape(-1,1), \ 209 | family=sm.families.NegativeBinomial(alpha=self.alphas[i]), \ 210 | exposure=base_nb_mean.flatten()) 211 | # model = sm.GLM(stats['obs'][:, :n_cells].flatten(), np.ones(n_genes*n_cells).reshape(-1,1), \ 212 | # family=sm.families.NegativeBinomial(alpha=np.repeat(self.alphas[i], n_cells)), \ 213 | # exposure=base_nb_mean.flatten(), var_weights=np.repeat(stats['post'][:,i], n_cells)) 214 | res = model.fit(disp=0, maxiter=500) 215 | self.log_mu[i] = res.params[0] 216 | if 't' in 
self.params:
217 |             # the following is copied from Matt's code
218 |             denoms = stats['denoms']
219 |             x = (self.transmat_.T * denoms).T
220 | 
221 |             # numerator is the sum of ii elements
222 |             num = np.sum(np.diag(x))
223 |             # denominator is the sum of all elements
224 |             denom = np.sum(x)
225 | 
226 |             # (this is the same as sum_i gamma_i)
227 |             #assert np.isclose(denom, np.sum(denoms))
228 | 
229 |             stats['diag'] = num / denom
230 |             self.transmat_ = self.form_transition_matrix(stats['diag'])
231 |     #
232 |     def form_transition_matrix(self, diag):
233 |         tol = 1e-10
234 |         diag = np.clip(diag, tol, 1 - tol)
235 | 
236 |         offdiag = (1 - diag) / (self.n_components - 1)
237 |         transmat_ = np.diag([diag - offdiag] * self.n_components)
238 |         transmat_ += offdiag
239 |         #assert np.all(transmat_ > 0), (diag, offdiag, transmat_)
240 |         return transmat_
--------------------------------------------------------------------------------
/src/calicost/hmrf_normalmixture.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numba import njit
3 | import scipy.special
4 | import scipy.sparse
5 | from sklearn.mixture import GaussianMixture
6 | from sklearn.cluster import KMeans
7 | from sklearn.metrics import adjusted_rand_score
8 | from tqdm import trange
9 | import copy
10 | from pathlib import Path
11 | from calicost.hmm_NB_BB_phaseswitch import *
12 | from calicost.utils_distribution_fitting import *
13 | from calicost.utils_IO import *
14 | from calicost.simple_sctransform import *
15 | 
16 | import warnings
17 | from statsmodels.tools.sm_exceptions import ValueWarning
18 | 
--------------------------------------------------------------------------------
/src/calicost/joint_allele_generateconfig.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import scipy
4 | import pandas as pd
5 | from pathlib import Path
6 | from sklearn.metrics import adjusted_rand_score
7 | import scanpy as sc
8 | import anndata
9 | import logging
10 | import copy
11 | from pathlib import Path
12 | import subprocess
13 | from calicost.hmm_NB_BB_phaseswitch import *
14 | from calicost.utils_distribution_fitting import *
15 | from calicost.hmrf import *
16 | from calicost.utils_IO import *
17 | 
18 | 
19 | def read_joint_configuration_file(filename):
20 |     ##### [Default settings] #####
21 |     config = {
22 |         "input_filelist" : None,
23 |         "snp_dir" : None,
24 |         "output_dir" : None,
25 |         # supporting files and preprocessing arguments
26 |         "hgtable_file" : None,
27 |         "normalidx_file" : None,
28 |         "tumorprop_file" : None,
29 |         "supervision_clone_file" : None,
30 |         "alignment_files" : [],
31 |         "filtergenelist_file" : None,
32 |         "filterregion_file" : None,
33 |         "binsize" : 1,
34 |         "rdrbinsize" : 1,
35 |         # "secondbinning_min_umi" : 500,
36 |         "max_nbins" : 1200,
37 |         "avg_umi_perbinspot" : 1.5,
38 |         "bafonly" : True,
39 |         # phase switch probability
40 |         "nu" : 1,
41 |         "logphase_shift" : 1,
42 |         "npart_phasing" : 2,
43 |         # HMRF configurations
44 |         "n_clones" : None,
45 |         "n_clones_rdr" : 2,
46 |         "min_spots_per_clone" : 100,
47 |         "min_avgumi_per_clone" : 10,
48 |         "maxspots_pooling" : 7,
49 |         "tumorprop_threshold" : 0.5,
50 |         "max_iter_outer" : 20,
51 |         "nodepotential" : "max", # max or weighted_sum
52 |         "initialization_method" : "rectangle", # rectangle or datadrive
53 |         "num_hmrf_initialization_start" : 0,
54 |         "num_hmrf_initialization_end" : 10,
55 |         "spatial_weight" : 2.0,
56 |         "construct_adjacency_method" : "hexagon",
57 |         "construct_adjacency_w" : 1.0,
58 |         # HMM configurations
59 |         "n_states"
: None, 60 | "params" : None, 61 | "t" : None, 62 | "t_phaseing" : 1-1e-4, 63 | "fix_NB_dispersion" : False, 64 | "shared_NB_dispersion" : True, 65 | "fix_BB_dispersion" : False, 66 | "shared_BB_dispersion" : True, 67 | "max_iter" : 30, 68 | "tol" : 1e-3, 69 | "gmm_random_state" : 0, 70 | "np_threshold" : 2.0, 71 | "np_eventminlen" : 10 72 | } 73 | 74 | argument_type = { 75 | "input_filelist" : "str", 76 | "snp_dir" : "str", 77 | "output_dir" : "str", 78 | # supporting files and preprocessing arguments 79 | "hgtable_file" : "str", 80 | "normalidx_file" : "str", 81 | "tumorprop_file" : "str", 82 | "supervision_clone_file" : "str", 83 | "alignment_files" : "list_str", 84 | "filtergenelist_file" : "str", 85 | "filterregion_file" : "str", 86 | "binsize" : "int", 87 | "rdrbinsize" : "int", 88 | # "secondbinning_min_umi" : "int", 89 | "max_nbins" : "int", 90 | "avg_umi_perbinspot" : "float", 91 | "bafonly" : "bool", 92 | # phase switch probability 93 | "nu" : "float", 94 | "logphase_shift" : "float", 95 | "npart_phasing" : "int", 96 | # HMRF configurations 97 | "n_clones" : "int", 98 | "n_clones_rdr" : "int", 99 | "min_spots_per_clone" : "int", 100 | "min_avgumi_per_clone" : "int", 101 | "maxspots_pooling" : "int", 102 | "tumorprop_threshold" : "float", 103 | "max_iter_outer" : "int", 104 | "nodepotential" : "str", 105 | "initialization_method" : "str", 106 | "num_hmrf_initialization_start" : "int", 107 | "num_hmrf_initialization_end" : "int", 108 | "spatial_weight" : "float", 109 | "construct_adjacency_method" : "str", 110 | "construct_adjacency_w" : "float", 111 | # HMM configurations 112 | "n_states" : "int", 113 | "params" : "str", 114 | "t" : "eval", 115 | "t_phaseing" : "eval", 116 | "fix_NB_dispersion" : "bool", 117 | "shared_NB_dispersion" : "bool", 118 | "fix_BB_dispersion" : "bool", 119 | "shared_BB_dispersion" : "bool", 120 | "max_iter" : "int", 121 | "tol" : "float", 122 | "gmm_random_state" : "int", 123 | "np_threshold" : "float", 124 | "np_eventminlen" : "int" 125 | } 126 | 127 | ##### [ read configuration file to update settings ] ##### 128 | with open(filename, 'r') as fp: 129 | for line in fp: 130 | if line.strip() == "" or line[0] == "#": 131 | continue 132 | strs = [x.strip() for x in line.strip().split(":") if x != ""] 133 | assert strs[0] in config.keys(), f"{strs[0]} is not a valid configuration parameter! Configuration parameters are: {list(config.keys())}" 134 | if len(strs) == 1: 135 | config[strs[0]] = [] 136 | elif strs[1].upper() == "NONE": 137 | config[strs[0]] = None 138 | elif argument_type[strs[0]] == "str": 139 | config[strs[0]] = strs[1] 140 | elif argument_type[strs[0]] == "int": 141 | config[strs[0]] = int(strs[1]) 142 | elif argument_type[strs[0]] == "float": 143 | config[strs[0]] = float(strs[1]) 144 | elif argument_type[strs[0]] == "eval": 145 | config[strs[0]] = eval(strs[1]) 146 | elif argument_type[strs[0]] == "bool": 147 | config[strs[0]] = (strs[1].upper() == "TRUE") 148 | elif argument_type[strs[0]] == "list_str": 149 | config[strs[0]] = strs[1].split(" ") 150 | # assertions 151 | assert not config["input_filelist"] is None, "No input file list!" 152 | assert not config["snp_dir"] is None, "No SNP directory!" 153 | assert not config["output_dir"] is None, "No output directory!" 
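    # Illustrative example (not taken from the repository): a configuration file
    # read by this parser is plain text with one "key : value" pair per line, e.g.
    #
    #   input_filelist : /path/to/example_input_filelist
    #   snp_dir : /path/to/snp_dir
    #   output_dir : /path/to/output
    #   n_clones : 3
    #   n_states : 7
    #   t : 1-1e-4
    #
    # Keys must appear in the `config` dictionary above; values typed as "eval"
    # in `argument_type` (such as "t") may be arithmetic expressions like 1-1e-4.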
154 | 
155 |     return config
156 | 
157 | 
158 | 
159 | def write_joint_config_file(outputfilename, config):
160 |     list_argument_io = ["input_filelist",
161 |         "snp_dir",
162 |         "output_dir"]
163 |     list_argument_sup = ["hgtable_file",
164 |         "normalidx_file",
165 |         "tumorprop_file",
166 |         "supervision_clone_file",
167 |         "alignment_files",
168 |         "filtergenelist_file",
169 |         "filterregion_file",
170 |         "binsize",
171 |         "rdrbinsize",
172 |         # "secondbinning_min_umi",
173 |         "max_nbins",
174 |         "avg_umi_perbinspot",
175 |         "bafonly"]
176 |     list_argument_phase = ["nu",
177 |         "logphase_shift",
178 |         "npart_phasing"]
179 |     list_argument_hmrf = ["n_clones",
180 |         "n_clones_rdr",
181 |         "min_spots_per_clone",
182 |         "min_avgumi_per_clone",
183 |         "maxspots_pooling",
184 |         "tumorprop_threshold",
185 |         "max_iter_outer",
186 |         "nodepotential",
187 |         "initialization_method",
188 |         "num_hmrf_initialization_start",
189 |         "num_hmrf_initialization_end",
190 |         "spatial_weight",
191 |         "construct_adjacency_method",
192 |         "construct_adjacency_w"]
193 |     list_argument_hmm = ["n_states",
194 |         "params",
195 |         "t",
196 |         "t_phaseing",
197 |         "fix_NB_dispersion",
198 |         "shared_NB_dispersion",
199 |         "fix_BB_dispersion",
200 |         "shared_BB_dispersion",
201 |         "max_iter",
202 |         "tol",
203 |         "gmm_random_state",
204 |         "np_threshold",
205 |         "np_eventminlen"]
206 |     with open(outputfilename, 'w') as fp:
207 |         #
208 |         for k in list_argument_io:
209 |             fp.write(f"{k} : {config[k]}\n")
210 |         #
211 |         fp.write("\n")
212 |         fp.write("# supporting files and preprocessing arguments\n")
213 |         for k in list_argument_sup:
214 |             if not isinstance(config[k], list):
215 |                 fp.write(f"{k} : {config[k]}\n")
216 |             else:
217 |                 fp.write(f"{k} : " + " ".join(config[k]) + "\n")
218 |         #
219 |         fp.write("\n")
220 |         fp.write("# phase switch probability\n")
221 |         for k in list_argument_phase:
222 |             fp.write(f"{k} : {config[k]}\n")
223 |         #
224 |         fp.write("\n")
225 |         fp.write("# HMRF configurations\n")
226 |         for k in list_argument_hmrf:
227 |             fp.write(f"{k} : {config[k]}\n")
228 |         #
229 |         fp.write("\n")
230 |         fp.write("# HMM configurations\n")
231 |         for k in list_argument_hmm:
232 |             fp.write(f"{k} : {config[k]}\n")
233 | 
234 | 
235 | def main(argv):
236 |     template_configuration_file = argv[1]
237 |     outputdir = argv[2]
238 |     hmrf_seed_s = int(argv[3])
239 |     hmrf_seed_t = int(argv[4])
240 |     config = read_joint_configuration_file(template_configuration_file)
241 |     for r in range(hmrf_seed_s, hmrf_seed_t):
242 |         config["num_hmrf_initialization_start"] = r
243 |         config["num_hmrf_initialization_end"] = r+1
244 |         write_joint_config_file(f"{outputdir}/configfile{r}", config)
245 | 
246 | 
247 | if __name__ == "__main__":
248 |     if len(sys.argv) == 1:
249 |         print("Usage: python joint_allele_generateconfig.py <template_configfile> <outputdir> <hmrf_seed_start> <hmrf_seed_end>")
250 |     if len(sys.argv) > 1:
251 |         main(sys.argv)
--------------------------------------------------------------------------------
/src/calicost/parse_input.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import scipy
4 | import pandas as pd
5 | from pathlib import Path
6 | from sklearn.metrics import adjusted_rand_score
7 | import scanpy as sc
8 | import anndata
9 | import logging
10 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
11 | logger = logging.getLogger()
12 | import copy
13 | from pathlib import Path
14 | import functools
15 | import subprocess
16 | import argparse
17 | from calicost.utils_IO import *
18 | from
calicost.phasing import * 19 | from calicost.arg_parse import * 20 | 21 | 22 | def genesnp_to_bininfo(df_gene_snp): 23 | table_bininfo = df_gene_snp[~df_gene_snp.bin_id.isnull()].groupby('bin_id').agg({"CHR":'first', 'START':'first', 'END':'last', 'gene':set, 'snp_id':set}).reset_index() 24 | table_bininfo['ARM'] = '.' 25 | table_bininfo['INCLUDED_GENES'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.gene.values ] 26 | table_bininfo['INCLUDED_SNP_IDS'] = [ " ".join([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] 27 | table_bininfo['NORMAL_COUNT'] = np.nan 28 | table_bininfo['N_SNPS'] = [ len([x for x in y if not x is None]) for y in table_bininfo.snp_id.values ] 29 | # drop the set columns 30 | table_bininfo.drop(columns=['gene', 'snp_id'], inplace=True) 31 | return table_bininfo 32 | 33 | 34 | def parse_visium(config): 35 | """ 36 | Read multiple 10X Visium SRT samples and SNP data and generate tables with counts and meta info. 37 | 38 | Attributes: 39 | ---------- 40 | config : dictionary 41 | Dictionary containing configuration parameters. Output from read_joint_configuration_file. 42 | 43 | Returns: 44 | ---------- 45 | table_bininfo : DataFrame 46 | DataFrame with columns [chr, arm, start, end, log_phase_transition, included_genes, normal count, n_snps]. 47 | 48 | table_rdrbaf : DataFrame 49 | DataFrame with columns [barcodes, exp_count, tot_count, b_count]. 50 | 51 | meta_info : DataFrame 52 | DataFrame with columns [barcodes, sample, x, y, tumor_proportion] 53 | 54 | expression : sparse matrix, (n_spots, n_genes) 55 | Gene expression UMI count matrix. 56 | 57 | adjacency_mat : array, (n_spots, n_spots) 58 | Adjacency matrix for evaluating label coherence in HMRF. 59 | 60 | smooth_mat : array, (n_spots, n_spots) 61 | KNN smoothing matrix. 
62 | """ 63 | if "input_filelist" in config: 64 | adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, across_slice_adjacency_mat = load_joint_data(config["input_filelist"], config["snp_dir"], config["alignment_files"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) 65 | sample_list = [adata.obs["sample"][0]] 66 | for i in range(1, adata.shape[0]): 67 | if adata.obs["sample"][i] != sample_list[-1]: 68 | sample_list.append( adata.obs["sample"][i] ) 69 | # convert sample name to index 70 | sample_ids = np.zeros(adata.shape[0], dtype=int) 71 | for s,sname in enumerate(sample_list): 72 | index = np.where(adata.obs["sample"] == sname)[0] 73 | sample_ids[index] = s 74 | else: 75 | adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = load_data(config["spaceranger_dir"], config["snp_dir"], config["filtergenelist_file"], config["filterregion_file"], config["normalidx_file"], config['min_snpumi_perspot'], config['min_percent_expressed_spots']) 76 | adata.obs["sample"] = "unique_sample" 77 | sample_list = [adata.obs["sample"][0]] 78 | sample_ids = np.zeros(adata.shape[0], dtype=int) 79 | across_slice_adjacency_mat = None 80 | 81 | coords = adata.obsm["X_pos"] 82 | 83 | if not config["tumorprop_file"] is None: 84 | df_tumorprop = pd.read_csv(config["tumorprop_file"], sep="\t", header=0, index_col=0) 85 | df_tumorprop = df_tumorprop[["Tumor"]] 86 | df_tumorprop.columns = ["tumor_proportion"] 87 | adata.obs = adata.obs.join(df_tumorprop) 88 | single_tumor_prop = adata.obs["tumor_proportion"] 89 | else: 90 | single_tumor_prop = None 91 | 92 | # read original data 93 | df_gene_snp = combine_gene_snps(unique_snp_ids, config['hgtable_file'], adata) 94 | df_gene_snp = create_haplotype_block_ranges(df_gene_snp, adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids) 95 | lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_blocks(df_gene_snp, \ 96 | adata, cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) 97 | # infer an initial phase using pseudobulk 98 | if not Path(f"{config['output_dir']}/initial_phase.npz").exists(): 99 | initial_clone_for_phasing = perform_partition(coords, sample_ids, x_part=config["npart_phasing"], y_part=config["npart_phasing"], single_tumor_prop=single_tumor_prop, threshold=config["tumorprop_threshold"]) 100 | phase_indicator, refined_lengths = initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_for_phasing, 5, log_sitewise_transmat, \ 101 | "sp", config["t_phaseing"], config["gmm_random_state"], config["fix_NB_dispersion"], config["shared_NB_dispersion"], config["fix_BB_dispersion"], config["shared_BB_dispersion"], 30, 1e-3, threshold=config["tumorprop_threshold"]) 102 | np.savez(f"{config['output_dir']}/initial_phase.npz", phase_indicator=phase_indicator, refined_lengths=refined_lengths) 103 | # map phase indicator to individual snps 104 | df_gene_snp['phase'] = np.where(df_gene_snp.snp_id.isnull(), None, df_gene_snp.block_id.map({i:x for i,x in enumerate(phase_indicator)}) ) 105 | else: 106 | tmp = dict(np.load(f"{config['output_dir']}/initial_phase.npz")) 107 | phase_indicator, refined_lengths = tmp["phase_indicator"], tmp["refined_lengths"] 108 | 109 | # binning 110 | df_gene_snp = create_bin_ranges(df_gene_snp, single_total_bb_RD, 
refined_lengths, config['secondary_min_umi']) 111 | lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat = summarize_counts_for_bins(df_gene_snp, \ 112 | adata, single_X, single_total_bb_RD, phase_indicator, nu=config['nu'], logphase_shift=config['logphase_shift'], geneticmap_file=config['geneticmap_file']) 113 | # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps = perform_binning_new(lengths, single_X, \ 114 | # single_base_nb_mean, single_total_bb_RD, sorted_chr_pos, sorted_chr_pos_last, x_gene_list, n_snps, phase_indicator, refined_lengths, config["binsize"], config["rdrbinsize"], config["nu"], config["logphase_shift"], secondary_min_umi=secondary_min_umi) 115 | 116 | # # remove bins where normal spots have imbalanced SNPs 117 | # if not config["tumorprop_file"] is None: 118 | # for prop_threshold in np.arange(0, 0.6, 0.05): 119 | # normal_candidate = (single_tumor_prop <= prop_threshold) 120 | # if np.sum(single_X[:, 0, (normal_candidate==True)]) > single_X.shape[0] * 200: 121 | # break 122 | # index_normal = np.where(normal_candidate)[0] 123 | # lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(df_gene_snp, \ 124 | # single_X, single_base_nb_mean, single_total_bb_RD, config["nu"], config["logphase_shift"], index_normal, config['geneticmap_file']) 125 | # assert np.sum(lengths) == single_X.shape[0] 126 | # assert single_X.shape[0] == single_total_bb_RD.shape[0] 127 | # assert single_X.shape[0] == len(log_sitewise_transmat) 128 | 129 | # expression count dataframe 130 | exp_counts = pd.DataFrame.sparse.from_spmatrix( scipy.sparse.csc_matrix(adata.layers["count"]), index=adata.obs.index, columns=adata.var.index) 131 | 132 | # smooth and adjacency matrix for each sample 133 | adjacency_mat, smooth_mat = multislice_adjacency(sample_ids, sample_list, coords, single_total_bb_RD, exp_counts, 134 | across_slice_adjacency_mat, construct_adjacency_method=config['construct_adjacency_method'], 135 | maxspots_pooling=config['maxspots_pooling'], construct_adjacency_w=config['construct_adjacency_w']) 136 | n_pooled = np.median(np.sum(smooth_mat > 0, axis=0).A.flatten()) 137 | print(f"Set up number of spots to pool in HMRF: {n_pooled}") 138 | 139 | # If adjacency matrix is only constructed using gene expression similarity (e.g. 
scRNA-seq data) 140 | # Then, directly replace coords by the umap of gene expression, to avoid potential inconsistency in HMRF initialization 141 | if config["construct_adjacency_method"] == "KNN" and config["construct_adjacency_w"] == 0: 142 | sc.pp.normalize_total(adata, target_sum=np.median(np.sum(exp_counts.values,axis=1)) ) 143 | sc.pp.log1p(adata) 144 | sc.tl.pca(adata) 145 | sc.pp.neighbors(adata) 146 | sc.tl.umap(adata) 147 | coords = adata.obsm["X_umap"] 148 | 149 | # create RDR-BAF table 150 | table_bininfo = genesnp_to_bininfo(df_gene_snp) 151 | table_bininfo['LOG_PHASE_TRANSITION'] = log_sitewise_transmat 152 | 153 | table_rdrbaf = [] 154 | for i in range(single_X.shape[2]): 155 | table_rdrbaf.append( pd.DataFrame({"BARCODES":adata.obs.index[i], "EXP":single_X[:,0,i], "TOT":single_total_bb_RD[:,i], "B":single_X[:,1,i]}) ) 156 | table_rdrbaf = pd.concat(table_rdrbaf, ignore_index=True) 157 | 158 | # create meta info table 159 | # note that table_meta.BARCODES is equal to the unique ones of table_rdrbaf.BARCODES in the original order 160 | table_meta = pd.DataFrame({"BARCODES":adata.obs.index, "SAMPLE":adata.obs["sample"], "X":coords[:,0], "Y":coords[:,1]}) 161 | if not single_tumor_prop is None: 162 | table_meta["TUMOR_PROPORTION"] = single_tumor_prop 163 | 164 | return table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp 165 | 166 | 167 | def load_tables_to_matrices(config): 168 | """ 169 | Load tables and adjacency from parse_visium_joint or parse_visium_single, and convert to HMM input matrices. 170 | """ 171 | table_bininfo = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=0, index_col=None, sep="\t") 172 | table_rdrbaf = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=0, index_col=None, sep="\t") 173 | table_meta = pd.read_csv(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=0, index_col=None, sep="\t") 174 | adjacency_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz" ) 175 | smooth_mat = scipy.sparse.load_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz" ) 176 | # 177 | df_gene_snp = pd.read_csv(f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=0, index_col=None, sep="\t") 178 | df_gene_snp = df_gene_snp.replace(np.nan, None) 179 | 180 | n_spots = table_meta.shape[0] 181 | n_bins = table_bininfo.shape[0] 182 | 183 | # construct single_X 184 | # single_X = np.zeros((n_bins, 2, n_spots), dtype=int) 185 | single_X = np.zeros((n_bins, 2, n_spots)) 186 | single_X[:, 0, :] = table_rdrbaf["EXP"].values.reshape((n_bins, n_spots), order="F") 187 | single_X[:, 1, :] = table_rdrbaf["B"].values.reshape((n_bins, n_spots), order="F") 188 | 189 | # construct single_base_nb_mean, lengths 190 | single_base_nb_mean = table_bininfo["NORMAL_COUNT"].values.reshape(-1,1) / np.sum(table_bininfo["NORMAL_COUNT"].values) @ np.sum(single_X[:,0,:], axis=0).reshape(1,-1) 191 | 192 | # construct single_total_bb_RD 193 | single_total_bb_RD = table_rdrbaf["TOT"].values.reshape((n_bins, n_spots), order="F") 194 | 195 | # construct log_sitewise_transmat 196 | log_sitewise_transmat = table_bininfo["LOG_PHASE_TRANSITION"].values 197 | 198 | # construct bin info and lengths and x_gene_list 199 | df_bininfo = table_bininfo 200 | lengths = np.array([ np.sum(table_bininfo.CHR == c) for c in df_bininfo.CHR.unique() ]) 201 | 202 | # construct barcodes 203 | barcodes = table_meta["BARCODES"] 204 | 205 | # construct coords 206 | 
coords = table_meta[["X", "Y"]].values 207 | 208 | # construct single_tumor_prop 209 | single_tumor_prop = table_meta["TUMOR_PROPORTION"].values if "TUMOR_PROPORTION" in table_meta.columns else None 210 | 211 | # construct sample_list and sample_ids 212 | sample_list = [table_meta["SAMPLE"].values[0]] 213 | for i in range(1, table_meta.shape[0]): 214 | if table_meta["SAMPLE"].values[i] != sample_list[-1]: 215 | sample_list.append( table_meta["SAMPLE"].values[i] ) 216 | sample_ids = np.zeros(table_meta.shape[0], dtype=int) 217 | for s,sname in enumerate(sample_list): 218 | index = np.where(table_meta["SAMPLE"].values == sname)[0] 219 | sample_ids[index] = s 220 | 221 | # expression UMI count matrix 222 | exp_counts = pd.read_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) 223 | 224 | return lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_bininfo, df_gene_snp, \ 225 | barcodes, coords, single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat, exp_counts 226 | 227 | 228 | def run_parse_n_load(config): 229 | file_exists = np.array([ Path(f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz").exists(), \ 230 | Path(f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz").exists(), \ 231 | Path(f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz").exists(), \ 232 | Path(f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz").exists(), \ 233 | Path(f"{config['output_dir']}/parsed_inputs/smooth_mat.npz").exists(), \ 234 | Path(f"{config['output_dir']}/parsed_inputs/exp_counts.pkl").exists() ]) 235 | if not np.all(file_exists): 236 | # process to tables 237 | table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat, df_gene_snp = parse_visium(config) 238 | # table_bininfo, table_rdrbaf, table_meta, exp_counts, adjacency_mat, smooth_mat = parse_hatchetblock(config, cellsnplite_dir, bb_file) 239 | 240 | # save file 241 | p = subprocess.Popen(f"mkdir -p {config['output_dir']}/parsed_inputs", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 242 | out,err = p.communicate() 243 | 244 | table_bininfo.to_csv( f"{config['output_dir']}/parsed_inputs/table_bininfo.csv.gz", header=True, index=False, sep="\t" ) 245 | table_rdrbaf.to_csv( f"{config['output_dir']}/parsed_inputs/table_rdrbaf.csv.gz", header=True, index=False, sep="\t" ) 246 | table_meta.to_csv( f"{config['output_dir']}/parsed_inputs/table_meta.csv.gz", header=True, index=False, sep="\t" ) 247 | exp_counts.to_pickle( f"{config['output_dir']}/parsed_inputs/exp_counts.pkl" ) 248 | scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/adjacency_mat.npz", adjacency_mat ) 249 | scipy.sparse.save_npz( f"{config['output_dir']}/parsed_inputs/smooth_mat.npz", smooth_mat ) 250 | # 251 | df_gene_snp.to_csv( f"{config['output_dir']}/parsed_inputs/gene_snp_info.csv.gz", header=True, index=False, sep="\t" ) 252 | 253 | # load and parse data 254 | return load_tables_to_matrices(config) 255 | 256 | 257 | if __name__ == "__main__": 258 | parser = argparse.ArgumentParser() 259 | parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str) 260 | args = parser.parse_args() 261 | 262 | try: 263 | config = read_configuration_file(args.configfile) 264 | except: 265 | config = read_joint_configuration_file(args.configfile) 266 | 267 | print("Configurations:") 268 | for k in sorted(list(config.keys())): 269 | print(f"\t{k} : {config[k]}") 270 | 271 | _ = run_parse_n_load(config) 272 | 
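# Usage sketch (illustrative, not part of the module). Assuming
# `read_joint_configuration_file` is provided by calicost.arg_parse, as the
# imports and the __main__ block above suggest, and a configuration file exists
# at a hypothetical path, the parsed HMM inputs can be obtained with:
#
#     from calicost.arg_parse import read_joint_configuration_file
#     from calicost.parse_input import run_parse_n_load
#
#     config = read_joint_configuration_file("/path/to/configfile")
#     (lengths, single_X, single_base_nb_mean, single_total_bb_RD,
#      log_sitewise_transmat, df_bininfo, df_gene_snp, barcodes, coords,
#      single_tumor_prop, sample_list, sample_ids, adjacency_mat, smooth_mat,
#      exp_counts) = run_parse_n_load(config)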
--------------------------------------------------------------------------------
/src/calicost/phasing.py:
--------------------------------------------------------------------------------
1 | import logging
3 | import numpy as np
4 | from numba import njit
5 | import scipy.special
6 | import scipy.sparse
7 | from sklearn.mixture import GaussianMixture
8 | from sklearn.cluster import KMeans
9 | from sklearn.metrics import adjusted_rand_score, silhouette_score
10 | from sklearn.neighbors import kneighbors_graph
11 | import networkx as nx
12 | from tqdm import trange
13 | import copy
14 | from pathlib import Path
15 | from calicost.hmm_NB_BB_phaseswitch import *
16 | from calicost.utils_distribution_fitting import *
17 | from calicost.utils_hmrf import *
18 | import warnings
19 | from statsmodels.tools.sm_exceptions import ValueWarning
20 | 
21 | 
22 | def infer_initial_phase(single_X, lengths, single_base_nb_mean, single_total_bb_RD, n_states, log_sitewise_transmat, \
23 |     params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol):
24 |     # pseudobulk HMM for phase_prob
25 |     res = pipeline_baum_welch(None, np.sum(single_X, axis=2, keepdims=True), lengths, n_states, \
26 |         np.sum(single_base_nb_mean, axis=1, keepdims=True), np.sum(single_total_bb_RD, axis=1, keepdims=True), log_sitewise_transmat, \
27 |         hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \
28 |         fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \
29 |         fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \
30 |         init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol)
31 |     # phase_prob = np.exp(scipy.special.logsumexp(res["log_gamma"][:n_states, :], axis=0))
32 |     # return phase_prob
33 |     pred = np.argmax(res["log_gamma"], axis=0)
34 |     pred_cnv = pred % n_states
35 |     phase_indicator = (pred < n_states)
36 |     refined_lengths = []
37 |     cumlen = 0
38 |     for le in lengths:
39 |         s = 0
40 |         for i, k in enumerate(pred_cnv[cumlen:(cumlen+le)]):
41 |             if i > 0 and k != pred_cnv[cumlen+i-1]:  # break whenever the CNV state changes between adjacent bins within this chromosome
42 |                 refined_lengths.append(i - s)
43 |                 s = i
44 |         refined_lengths.append(le - s)
45 |         cumlen += le
46 |     refined_lengths = np.array(refined_lengths)
47 |     return phase_indicator, refined_lengths
48 | 
49 | 
50 | def initial_phase_given_partition(single_X, lengths, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, initial_clone_index, n_states, log_sitewise_transmat, \
51 |     params, t, random_state, fix_NB_dispersion, shared_NB_dispersion, fix_BB_dispersion, shared_BB_dispersion, max_iter, tol, threshold, min_snpumi=2e3):
52 |     EPS_BAF = 0.05
53 |     if single_tumor_prop is None:
54 |         X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index)
55 |         tumor_prop = None
56 |     else:
57 |         X, base_nb_mean, total_bb_RD, tumor_prop = merge_pseudobulk_by_index_mix(single_X, single_base_nb_mean, single_total_bb_RD, initial_clone_index, single_tumor_prop, threshold=threshold)
58 | 
59 |     # pseudobulk HMM for phase_prob
60 |     baf_profiles = np.zeros((X.shape[2], X.shape[0]))
61 |     pred_cnv = np.zeros((X.shape[2], X.shape[0]))
62 |     for i in range(X.shape[2]):
63 |         if np.sum(total_bb_RD[:,i]) < min_snpumi:
64 |             baf_profiles[i,:] = 0.5
65 |         else:
66 |             res = pipeline_baum_welch(None, X[:,:,i:(i+1)], lengths, n_states, base_nb_mean[:,i:(i+1)], total_bb_RD[:,i:(i+1)],
log_sitewise_transmat, \
67 |                 hmmclass=hmm_sitewise, params=params, t=t, random_state=random_state, only_minor=True, \
68 |                 fix_NB_dispersion=fix_NB_dispersion, shared_NB_dispersion=shared_NB_dispersion, \
69 |                 fix_BB_dispersion=fix_BB_dispersion, shared_BB_dispersion=shared_BB_dispersion, is_diag=True, \
70 |                 init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=max_iter, tol=tol)
71 |             #
72 |             pred = np.argmax(res["log_gamma"], axis=0)
73 |             this_baf_profiles = np.where(pred < n_states, res["new_p_binom"][pred%n_states, 0], 1-res["new_p_binom"][pred%n_states, 0])
74 |             this_baf_profiles[np.abs(this_baf_profiles - 0.5) < EPS_BAF] = 0.5
75 |             baf_profiles[i,:] = this_baf_profiles
76 |             pred_cnv[i,:] = (pred % n_states)
77 | 
78 |     if single_tumor_prop is None:
79 |         n_total_spots = np.sum([ len(x) for x in initial_clone_index ])
80 |         population_baf = np.array([ 1.0*len(x)/n_total_spots for x in initial_clone_index]) @ baf_profiles
81 |     else:
82 |         n_total_spots = np.sum([ len(x) * tumor_prop[i] for i,x in enumerate(initial_clone_index) ])
83 |         population_baf = np.array([ 1.0*len(x)*tumor_prop[i]/n_total_spots for i,x in enumerate(initial_clone_index) ]) @ baf_profiles
84 |     adj_baf_profiles = np.where(baf_profiles < 0.5, baf_profiles, 1-baf_profiles)
85 |     phase_indicator = (population_baf < 0.5)
86 |     refined_lengths = []
87 |     cumlen = 0
88 |     for le in lengths:
89 |         s = 0
90 |         for i in range(le):
91 |             if i > s + 10 and np.any(np.abs(adj_baf_profiles[:,i+cumlen] - adj_baf_profiles[:,i+cumlen-1]) > 0.1):
92 |                 refined_lengths.append(i - s)
93 |                 s = i
94 |         refined_lengths.append(le - s)
95 |         cumlen += le
96 |     refined_lengths = np.array(refined_lengths)
97 |     return phase_indicator, refined_lengths
98 | 
99 | 
100 | def perform_partition(coords, sample_ids, x_part, y_part, single_tumor_prop, threshold):
101 |     initial_clone_index = []
102 |     for s in range(np.max(sample_ids)+1):
103 |         index = np.where(sample_ids == s)[0]
104 |         assert len(index) > 0
105 |         if single_tumor_prop is None:
106 |             tmp_clone_index = fixed_rectangle_initialization(coords[index,:], x_part, y_part)
107 |         else:
108 |             tmp_clone_index = fixed_rectangle_initialization_mix(coords[index,:], x_part, y_part, single_tumor_prop[index], threshold=threshold)
109 |         for x in tmp_clone_index:
110 |             initial_clone_index.append( index[x] )
111 |     return initial_clone_index
112 | 
--------------------------------------------------------------------------------
/src/calicost/phylogeny_startle.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import argparse
4 | import itertools
5 | import math
6 | import subprocess
7 | import numpy as np
8 | import seaborn as sns
9 | from matplotlib import pyplot as plt
10 | 
11 | import networkx as nx
13 | from collections import deque
15 | 
16 | 
17 | def get_LoH_for_phylogeny(df_seglevel_cnv, min_segments):
18 |     """
19 |     Treating LoH as irreversible point mutations, outputs a clone-by-mutation matrix for phylogeny reconstruction.
20 |     Mutation states: 0 for no LoH, 1 for losing the A allele, 2 for losing the B allele.
21 | 
22 |     Attributes
23 |     ----------
24 |     df_seglevel_cnv : pd.DataFrame, (n_obs, 3+2*n_clones)
25 |         Dataframe from cnv_*seglevel.tsv output.
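        Inferred from the parsing code below (not documented upstream): the first
        three columns are genomic coordinates, and the remaining columns come in
        A/B pairs per clone, with headers of the form "<clone_id> A", "<clone_id> B".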
26 | 
27 |     Returns
28 |     ----------
29 |     df_loh : pd.DataFrame, (n_clones, n_segments)
30 |     """
31 |     def get_shared_intervals(acn_profile):
32 |         '''
33 |         Takes in allele-specific copy numbers and outputs a segmentation of the genome such that all clones are in the same CN state within each segment.
34 | 
35 |         acn_profile : array, (n_obs, 2*n_clones)
36 |             Allele-specific integer copy numbers for each genomic bin (obs) across all clones.
37 |         '''
38 |         intervals = []
39 |         seg_acn = []
40 |         s = 0
41 |         while s < acn_profile.shape[0]:
42 |             t = np.where( ~np.all(acn_profile[s:,] == acn_profile[s,:], axis=1) )[0]
43 |             if len(t) == 0:
44 |                 intervals.append( (s, acn_profile.shape[0]) )
45 |                 seg_acn.append( acn_profile[s,:] )
46 |                 s = acn_profile.shape[0]
47 |             else:
48 |                 t = t[0]
49 |                 intervals.append( (s,s+t) )
50 |                 seg_acn.append( acn_profile[s,:] )
51 |                 s = s+t
52 |         return intervals, seg_acn
53 | 
54 |     clone_ids = [x.split(" ")[0] for x in df_seglevel_cnv.columns[ np.arange(3, df_seglevel_cnv.shape[1], 2) ] ]
55 | 
56 |     acn_profile = df_seglevel_cnv.iloc[:,3:].values
57 |     intervals, seg_acn = get_shared_intervals(acn_profile)
58 |     df_loh = []
59 |     for i, acn in enumerate(seg_acn):
60 |         if np.all(acn != 0):
61 |             continue
62 |         if intervals[i][1] - intervals[i][0] < min_segments:
63 |             continue
64 |         idx_zero = np.where(acn == 0)[0]
65 |         idx_clones = (idx_zero / 2).astype(int)
66 |         is_A = (idx_zero % 2 == 0)
67 |         # vector of mutation states
68 |         mut = np.zeros( int(len(acn) / 2), dtype=int )
69 |         mut[idx_clones] = np.where(is_A, 1, 2)
70 |         df_loh.append( pd.DataFrame(mut.reshape(1, -1), index=[f"bin_{intervals[i][0]}_{intervals[i][1]}"], columns=clone_ids) )
71 | 
72 |     df_loh = pd.concat(df_loh).T
73 |     return df_loh
74 | 
75 | 
76 | def get_binary_matrix(df_character_matrix):
77 | 
78 |     ncells = len(df_character_matrix)
79 |     binary_col_dict = {}
80 |     for column in df_character_matrix.columns:
81 |         state_list = list(df_character_matrix[column].unique())
82 |         for s in state_list:
83 |             if s != -1 and s != 0:
84 |                 state_col = np.zeros((ncells))
85 |                 state_col[df_character_matrix[column] == s] = 1
86 |                 state_col[df_character_matrix[column] == -1] = -1
87 | 
88 |                 binary_col_dict[f'{column}_{s}'] = state_col
89 | 
90 |     df_binary = pd.DataFrame(binary_col_dict, index = df_character_matrix.index, dtype=int)
91 |     return df_binary
92 | 
93 | 
94 | def generate_perfect_phylogeny(df_binary):
95 | 
96 |     solT_mut = nx.DiGraph()
97 |     solT_mut.add_node('root')
98 | 
99 |     solT_cell = nx.DiGraph()
100 |     solT_cell.add_node('root')
101 | 
102 |     df_binary = df_binary[df_binary.sum().sort_values(ascending=False).index]
103 | 
104 |     for cell_id, row in df_binary.iterrows():
105 |         if cell_id == 'root':
106 |             continue
107 | 
108 |         curr_node = 'root'
109 |         for column in df_binary.columns[row.values == 1]:
110 |             if column in solT_mut[curr_node]:
111 |                 curr_node = column
112 |             else:
113 |                 if column in solT_mut.nodes:
114 |                     raise NameError(f'{column} is being repeated')
115 |                 solT_mut.add_edge(curr_node, column)
116 |                 solT_cell.add_edge(curr_node, column)
117 |                 curr_node = column
118 | 
119 |         solT_cell.add_edge(curr_node, cell_id)
120 | 
121 |     return solT_mut, solT_cell
122 | 
123 | 
124 | def tree_to_newick(T, root=None):
125 |     if root is None:
126 |         roots = list(filter(lambda p: p[1] == 0, T.in_degree()))
127 |         assert 1 == len(roots)
128 |         root = roots[0][0]
129 |     subgs = []
130 |     while len(T[root]) == 1:
131 |         root = list(T[root])[0]
132 |     for child in T[root]:
133 |         pathlen = 0
134 |         while len(T[child]) == 1:
135 |             child = list(T[child])[0]
136 |             pathlen +=
1
137 |         if len(T[child]) > 0:
138 |             pathlen += 1
139 |             subgs.append(tree_to_newick(T, root=child) + f":{pathlen}")
140 |         else:
141 |             subgs.append( f"{child}:{pathlen}" )
142 |     return "(" + ','.join(map(str, subgs)) + ")"
143 | 
144 | 
145 | def output_startle_input_files(calicostdir, outdir, midfix="", startle_bin="startle", min_segments=3):
146 |     # get LoH data frame
147 |     # rows are clones, columns are bins, entries are 0 (no LoH), 1 (A allele LoH), or 2 (B allele LoH)
148 |     df_seglevel_cnv = pd.read_csv(f"{calicostdir}/cnv{midfix}_seglevel.tsv", header=0, sep="\t")
149 |     df_loh = get_LoH_for_phylogeny(df_seglevel_cnv, min_segments)
150 |     df_loh.to_csv(f"{outdir}/loh_matrix.tsv", header=True, index=True, sep="\t")
151 | 
152 |     # binarize
153 |     df_binary = get_binary_matrix(df_loh)
154 | 
155 |     cell_list = list(df_binary.index)
156 |     mutation_list = list(df_binary.columns)
157 |     mutation_to_index = {x: idx for idx, x in enumerate(mutation_list)}
158 | 
159 |     # one and missing indices
160 |     # one indices
161 |     one_cell_mut_list = []
162 |     for cell_idx, cell in enumerate(cell_list):
163 |         for mut_idx, mut in enumerate(mutation_list):
164 |             if df_binary.loc[cell][mut] == 1:
165 |                 one_cell_mut_list.append((cell_idx, mut_idx))
166 |     with open(f'{outdir}/loh_one_indices.txt', 'w') as out:
167 |         for cell_idx, mut_idx in one_cell_mut_list:
168 |             out.write(f'{cell_idx} {mut_idx}\n')
169 |     # missing indices
170 |     character_list = list(set(['_'.join(x.split('_')[:-1]) for x in df_binary.columns]))
171 |     missing_cell_character_list = []
172 |     for character_idx, character in enumerate(character_list):
173 |         for cell_idx, cell in enumerate(cell_list):
174 |             if df_loh.loc[cell][character] == -1:
175 |                 missing_cell_character_list.append((cell_idx, character_idx))
176 |     with open(f'{outdir}/loh_missing_indices.txt', 'w') as out:
177 |         for cell_idx, character_idx in missing_cell_character_list:
178 |             out.write(f'{cell_idx} {character_idx}\n')
179 | 
180 |     # character mutation mapping
181 |     with open(f'{outdir}/loh_character_mutation_mapping.txt', 'w') as out:
182 |         for _, character in enumerate(character_list):
183 |             character_mutation_list = [mutation_to_index[x] for x in mutation_list if x.startswith(f'{character}_')]
184 |             out.write(' '.join(map(str, character_mutation_list)) + '\n')
185 | 
186 |     # count of character states of mutations
187 |     max_allowed_homoplasy = {}
188 |     for mutation in mutation_list:
189 |         max_allowed_homoplasy[mutation] = 2
190 |     with open(f'{outdir}/loh_counts.txt', 'w') as out:
191 |         for mutation in mutation_list:
192 |             out.write(f'{max_allowed_homoplasy[mutation]}\n')
193 | 
194 |     # weights
195 |     with open(f'{outdir}/loh_weights.txt', 'w') as out:
196 |         for mutation in mutation_list:
197 |             out.write(f"1\n")
198 | 
199 |     ##### run startle #####
200 |     m_mutations = df_binary.shape[1]
201 |     n_clones = df_binary.shape[0]
202 |     command = f"{startle_bin} -m {m_mutations} -n {n_clones} {outdir}/loh_one_indices.txt {outdir}/loh_missing_indices.txt {outdir}/loh_counts.txt {outdir}/loh_character_mutation_mapping.txt {outdir}/loh_weights.txt {outdir}/loh_cpp_output.txt"
203 |     print( command )
204 |     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
205 |     out,err = p.communicate()
206 | 
207 |     # parse output
208 |     df_cpp_output = pd.read_csv(f'{outdir}/loh_cpp_output.txt', header=None, sep=' ')
209 |     df_cpp_output = df_cpp_output.rename(columns={0:'cell_idx', 1:'mut_idx', 2:'state_idx', 3:'entry'})
210 |     df_cpp_output['name'] = df_cpp_output.apply(lambda x:
f"{mutation_list[x['mut_idx']]}_{x['state_idx']}", axis =1) 211 | 212 | sol_columns = list(df_cpp_output['name'].unique()) 213 | nsol_columns = len(sol_columns) 214 | sol_entries = np.zeros((n_clones, nsol_columns), dtype=int) 215 | for mut_idx, mut in enumerate(sol_columns): 216 | for cell_idx in df_cpp_output[(df_cpp_output['entry'] == 1) & (df_cpp_output['name'] == mut)]['cell_idx']: 217 | sol_entries[cell_idx][mut_idx] = 1 218 | df_sol_binary = pd.DataFrame(sol_entries, columns=sol_columns, index=cell_list) 219 | 220 | solT_mut, solT_cell = generate_perfect_phylogeny(df_sol_binary) 221 | with open(f'{outdir}/loh_tree.newick', 'w') as out: 222 | out.write(f"{tree_to_newick(solT_cell)};") 223 | 224 | 225 | if __name__ == "__main__": 226 | parser = argparse.ArgumentParser() 227 | parser.add_argument("-c", "--calicost_dir", help="Directory of a specific random initialization of CalicoST", type=str) 228 | parser.add_argument("-s", "--startle_bin", help="The startle executable path", default="startle", type=str) 229 | parser.add_argument("-p", "--ploidy", help="Ploidy of allele-specific integer copy numbers.", default="", type=str) 230 | parser.add_argument("--min_segments", help="Minimum number of genome segment to keep an LOH event in phylogenetic tree reconstruction.", default=3, type=int) 231 | parser.add_argument("-o", "--outputdir", help="output directory", type=str) 232 | args = parser.parse_args() 233 | 234 | output_startle_input_files(args.calicost_dir, args.outputdir, midfix=args.ploidy, startle_bin=args.startle_bin, min_segments=args.min_segments) -------------------------------------------------------------------------------- /src/calicost/phylogeography.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import numpy as np 3 | import pandas as pd 4 | import copy 5 | from matplotlib import pyplot as plt 6 | import seaborn 7 | from ete3 import Tree 8 | import networkx as nx 9 | 10 | 11 | def clone_centers(coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None, tumorprop_threshold=0.6): 12 | df_centers = [] 13 | for l in np.unique(clone_label): 14 | # get spot indices of this clone 15 | index = np.where(clone_label == l)[0] if single_tumor_prop is None else np.where((clone_label == l) & (single_tumor_prop > tumorprop_threshold))[0] 16 | # if the index contains multiple slices, get the most abundance slice 17 | if not sample_ids is None: 18 | most_abundance_slice = pd.Series(sample_ids[index]).mode().values[0] 19 | index = index[ sample_ids[index] == most_abundance_slice ] 20 | # get clone cencer 21 | if single_tumor_prop is None: 22 | center = np.mean(coords[index], axis=0) 23 | else: 24 | center = single_tumor_prop[index].dot(coords[index]) / np.sum(single_tumor_prop[index]) 25 | df_centers.append( pd.DataFrame({'clone':l, 'x':center[0], 'y':center[1]}, index=[0]) ) 26 | df_centers = pd.concat(df_centers, ignore_index=True) 27 | return df_centers 28 | 29 | 30 | def project_phylogeneny_space(newick_file, coords, clone_label, single_tumor_prop=None, sample_list=None, sample_ids=None): 31 | # load tree 32 | with open(newick_file, 'r') as fp: 33 | t = Tree(fp.readline()) 34 | 35 | # get the 36 | list_leaf_nodes = [] 37 | list_internal_nodes = [] 38 | rootnode = np.sort( [leaf.name.replace('clone','') for leaf in t.iter_leaves() ] ) 39 | rootnode = "ancestor" + "_".join( rootnode ) 40 | for node in t.traverse(): 41 | leafnames = np.sort( [leaf.name.replace('clone','') for leaf in node.iter_leaves() ] ) 
42 | if node.name == "": 43 | node.name = "ancestor" + "_".join( leafnames ) 44 | 45 | if node.is_leaf(): 46 | list_leaf_nodes.append(node.name) 47 | else: 48 | list_internal_nodes.append(node.name) 49 | 50 | print(f"root node is {rootnode}") 51 | print(f"a list of leaf nodes: {list_leaf_nodes}") 52 | print(f"a list of internal nodes: {list_internal_nodes}") 53 | 54 | # set up multivariate Gaussian distribution to estimate internal node location 55 | N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) 56 | # pairwise distance 57 | G = nx.Graph() 58 | G.add_nodes_from( list_leaf_nodes + list_internal_nodes ) 59 | for nodename in list_leaf_nodes: 60 | node = t&f"{nodename}" 61 | while not node.is_root(): 62 | p = node.up 63 | G.add_edge(node.name, p.name, weight=node.dist) 64 | node = p 65 | 66 | G.edges(data=True) 67 | nx_pdc = dict( nx.all_pairs_dijkstra(G) ) 68 | 69 | # covariance matrix based on pairwise distance 70 | N_nodes = len(list_leaf_nodes) + len(list_internal_nodes) 71 | Sigma_square = np.zeros((N_nodes, N_nodes)) 72 | base_var = max( np.max(np.abs(coords[:,0])), np.max(np.abs(coords[:,1])) ) 73 | 74 | for n1, name1 in enumerate(list_leaf_nodes + list_internal_nodes): 75 | for n2, name2 in enumerate(list_leaf_nodes + list_internal_nodes): 76 | if n1 == n2: 77 | Sigma_square[n1, n2] = base_var + nx_pdc[rootnode][0][name1] 78 | else: 79 | lca_node = t.get_common_ancestor([name1, name2]) 80 | # print( name1, name2, lca_node.name ) 81 | if lca_node.name == rootnode: 82 | Sigma_square[n1, n2] = base_var 83 | else: 84 | Sigma_square[n1, n2] = base_var + nx_pdc[rootnode][0][lca_node.name] 85 | 86 | # mean position 87 | mu_1 = np.zeros(( len(list_leaf_nodes),2 )) 88 | mu_2 = np.zeros(( len(list_internal_nodes),2 )) 89 | 90 | # partition covariance matrix 91 | Sigma_11 = Sigma_square[:len(list_leaf_nodes), :len(list_leaf_nodes)] 92 | Sigma_12 = Sigma_square[:len(list_leaf_nodes), :][:, len(list_leaf_nodes):] 93 | Sigma_22 = Sigma_square[len(list_leaf_nodes):, len(list_leaf_nodes):] 94 | 95 | # get leaf node locations 96 | df_centers = clone_centers(coords, clone_label, single_tumor_prop=single_tumor_prop, 97 | sample_list=sample_list, sample_ids=sample_ids) 98 | obs_1 = df_centers.set_index('clone').loc[list_leaf_nodes].values 99 | 100 | # conditional expectation internal node position | leaf node position = mu_1 101 | expected_internal = mu_2 + Sigma_12.T @ (np.linalg.inv(Sigma_11) @ (obs_1 - mu_1)) 102 | df_centers = pd.concat([ df_centers, pd.DataFrame({'clone':list_internal_nodes, 'x':expected_internal[:,0], 'y':expected_internal[:,1]}) ]) 103 | 104 | # add to tree features 105 | for node in t.traverse(): 106 | i = np.where(df_centers.clone.values == node.name)[0][0] 107 | node.add_features( x=df_centers.x.values[i], y=df_centers.y.values[i] ) 108 | 109 | return t -------------------------------------------------------------------------------- /src/calicost/simple_sctransform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import statsmodels 4 | import statsmodels.api as sm 5 | from KDEpy import FFTKDE 6 | from scipy.special import psi, polygamma 7 | 8 | 9 | # copied from sctransformPy 10 | def theta_ml(y,mu): 11 | n = y.size 12 | weights = np.ones(n) 13 | limit = 10 14 | _EPS = np.finfo(float).eps 15 | eps = (_EPS)**0.25 16 | # inner function 17 | def score(n,th,mu,y,w): 18 | return sum(w*(psi(th + y) - psi(th) + np.log(th) + 1 - np.log(th + mu) - (y + th)/(mu + th))) 19 | # inner function 20 | 
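    # score() above is the first derivative of the weighted NB log-likelihood with
    # respect to theta; info() below is its negative second derivative (the observed
    # information). The loop further down therefore performs Newton-Raphson updates
    # t0 <- t0 + score/info.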
def info(n,th,mu,y,w):
20 |         return sum(w*( - polygamma(1,th + y) + polygamma(1,th) - 1/th + 2/(mu + th) - (y + th)/(mu + th)**2))
21 |     # initialize the Newton-Raphson iteration
22 |     t0 = n/sum(weights*(y/mu - 1)**2)
23 |     it = 0
24 |     de = 1
25 |     # Newton-Raphson iteration: step size is score / information
26 |     while(it + 1 < limit and abs(de) > eps):
27 |         it+=1
28 |         t0 = abs(t0)
29 |         i = info(n, t0, mu, y, weights)
30 |         de = score(n, t0, mu, y, weights)/i
31 |         t0 += de
32 |         t0 = max(t0,0)
33 |     # note that t0 is the dispersion parameter: var = mu + mu^2 / t0
34 |     return t0
35 | 
36 | 
37 | 
38 | def sample_gene_indices(log_geometric_mean, n_subsample, n_partitions=10):
39 |     bounds = np.linspace(np.min(log_geometric_mean), np.max(log_geometric_mean), n_partitions+1)
40 |     bounds[-1] += 1e-4
41 |     idx_subsample = []
42 |     for p in range(1, n_partitions):
43 |         tmpidx = np.where(np.logical_and(log_geometric_mean >= bounds[p-1], log_geometric_mean < bounds[p]))[0]
44 |         np.random.shuffle(tmpidx)
45 |         idx_subsample.append(tmpidx[:int(n_subsample/n_partitions)])
46 |     idx_subsample = np.sort(np.concatenate(idx_subsample))
47 |     if len(idx_subsample) < n_subsample:
48 |         mask = np.array([True] * len(log_geometric_mean))
49 |         mask[idx_subsample] = False
50 |         idx_rest = np.arange(len(log_geometric_mean))[mask]
51 |         np.random.shuffle(idx_rest)
52 |         n_rest = n_subsample - len(idx_subsample)
53 |         idx_subsample = np.sort(np.concatenate([idx_subsample, idx_rest[:n_rest]]))
54 |     return idx_subsample
55 | 
56 | 
57 | def estimate_logmu_dispersion(counts, bw=None):
58 |     '''
59 |     counts of size number spots * number genes.
60 |     '''
61 |     N = counts.shape[0]
62 |     G = counts.shape[1]
63 |     eps = 1
64 |     geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps
65 |     log_geometric_mean = np.log( geometric_mean )
66 |     spot_umi = counts.sum(axis=1)
67 |     # fitting logmu and theta (dispersion)
68 |     logmu = np.zeros(G)
69 |     theta = np.zeros(G)
70 |     for i in range(G):
71 |         y = counts[:,i]
72 |         logmu[i] = np.log( np.sum(y) / np.sum(spot_umi) )
73 |         mu = spot_umi * np.exp(logmu[i])
74 |         theta[i] = theta_ml(y, mu)
75 |     # ratio between geometric mean and dispersion parameter theta
76 |     log_ratio = np.log(1 + geometric_mean / theta)
77 |     # smoothing parameter for the kernel regression
78 |     if bw is None:
79 |         z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean)
80 |         z.evaluate()
81 |         bw_adjust = 3
82 |         bw = z.bw*bw_adjust
83 |     # local-linear kernel regression for log_ratio (the log ratio between geometric mean expression and dispersion)
84 |     kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[:,None], ['c'], reg_type='ll', bw=[bw])
85 |     pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0]
86 |     pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1)
87 |     return logmu, pred_theta
88 | 
89 | 
90 | def pearson_residual(counts, logmu, pred_theta):
91 |     '''
92 |     counts of size number spots * number genes.
93 |     '''
94 |     N = counts.shape[0]
95 |     G = counts.shape[1]
96 |     spot_umi = counts.sum(axis=1)
97 |     # predicted mean and variance under NB model
98 |     mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1)
99 |     vard = mud + mud**2 / pred_theta.reshape(1,-1)
100 |     X = (counts * 1.0 - mud) / vard**0.5
101 |     # clipping
102 |     clip = np.sqrt(counts.shape[0]/30)
103 |     X[X > clip] = clip
104 |     X[X < -clip] = -clip
105 |     return X
106 | 
107 | 
108 | def deviance_residual(counts, logmu, pred_theta):
109 |     '''
110 |     Equation is taken from the Analytic Pearson Residuals paper by Lause et al.
111 |     counts of size number spots * number genes.
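    In the notation of the code below, each residual is
        sign * sqrt( 2 * ( y*log(y/mu) - (y+theta)*log((y+theta)/(mu+theta)) ) )
    with mu = exp(logmu) * spot_umi, where the y*log(y/mu) term is set to 0 when
    y = 0. Note (a description of this implementation, not of the paper): sign is
    the boolean indicator (counts > mud), so under-expressed entries are zeroed
    rather than negated.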
112 | ''' 113 | N = counts.shape[0] 114 | G = counts.shape[1] 115 | spot_umi = counts.sum(axis=1) 116 | # predicted mean 117 | mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) 118 | sign = (counts > mud) 119 | part1 = counts * np.log(counts / mud) 120 | part1[counts==0] = 0 121 | part2 = (counts + pred_theta) * np.log( (counts + pred_theta) / (mud + pred_theta) ) 122 | X = sign * np.sqrt(2 * (part1 - part2)) 123 | return X 124 | 125 | 126 | def estimate_logmu_dispersion2(counts, n_subsample=None, bw=None): 127 | ''' 128 | counts of size number spots * number genes. 129 | ''' 130 | N = counts.shape[0] 131 | G = counts.shape[1] 132 | eps = 1 133 | geometric_mean = np.exp(np.log(counts+eps).mean(axis=0).flatten()) - eps 134 | log_geometric_mean = np.log( geometric_mean ) 135 | spot_umi = counts.sum(axis=1) 136 | logmu = np.log( np.sum(counts, axis=0) / np.sum(spot_umi) ) 137 | # fitting theta (dispersion) 138 | genes_subsample = np.array([i for i in range(G) if geometric_mean[i] > 0]) 139 | if not (n_subsample is None): 140 | np.random.seed(0) 141 | genes_subsample = sample_gene_indices(log_geometric_mean, n_subsample) 142 | theta = np.zeros(len(genes_subsample)) 143 | for idx,i in enumerate(genes_subsample): 144 | y = counts[:,i] 145 | mu = spot_umi * np.exp(logmu[i]) 146 | theta[idx] = theta_ml(y, mu) 147 | # ratio between geometric mean and dispersion parameter theta 148 | log_ratio = np.log(1 + geometric_mean[genes_subsample] / theta) 149 | # smoothing parameter for kernel ridge regression 150 | if bw is None: 151 | z = FFTKDE(kernel='gaussian', bw='ISJ').fit(log_geometric_mean[genes_subsample]) 152 | z.evaluate(); 153 | bw_adjust = 3 154 | bw = z.bw*bw_adjust 155 | # kernel ridge regression for log_ratio (the log ratio between geometric mean expression and dispersion) 156 | kr = statsmodels.nonparametric.kernel_regression.KernelReg(log_ratio, log_geometric_mean[genes_subsample][:,None], ['c'], reg_type='ll', bw=[bw]) 157 | pred_log_ratio = kr.fit(data_predict = log_geometric_mean[:,None])[0] 158 | pred_theta = geometric_mean / (np.exp(pred_log_ratio) - 1) 159 | return logmu, pred_theta 160 | 161 | 162 | def pearson_residual2(counts, logmu, pred_theta): 163 | ''' 164 | counts of size number spots * number genes. 
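    Identical to pearson_residual above except for the clipping threshold: residuals
    here are clipped at +/- sqrt(n_spots) rather than +/- sqrt(n_spots/30).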
165 | ''' 166 | N = counts.shape[0] 167 | G = counts.shape[1] 168 | spot_umi = counts.sum(axis=1) 169 | # predicted mean and variance under NB model 170 | mud = np.exp(logmu.reshape(1,-1)) * spot_umi.reshape(-1,1) 171 | vard = mud + mud**2 / pred_theta.reshape(1,-1) 172 | X = (counts * 1.0 - mud) / vard**0.5 173 | # clipping 174 | clip = np.sqrt(counts.shape[0]) 175 | X[X > clip] = clip 176 | X[X < -clip] = -clip 177 | return X 178 | -------------------------------------------------------------------------------- /src/calicost/utils_distribution_fitting.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import inspect 3 | import logging 4 | 5 | import numpy as np 6 | import scipy 7 | from scipy import linalg, special 8 | from scipy.special import logsumexp, loggamma 9 | import scipy.integrate 10 | import scipy.stats 11 | from numba import jit, njit 12 | from sklearn import cluster 13 | from sklearn.utils import check_random_state 14 | import statsmodels 15 | import statsmodels.api as sm 16 | from statsmodels.base.model import GenericLikelihoodModel 17 | import os 18 | 19 | os.environ["MKL_NUM_THREADS"] = "1" 20 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 21 | os.environ["OMP_NUM_THREADS"] = "1" 22 | 23 | 24 | def convert_params(mean, std): 25 | """ 26 | Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports 27 | 28 | See https://mathworld.wolfram.com/NegativeBinomialDistribution.html 29 | """ 30 | p = mean/std**2 31 | n = mean*p/(1.0 - p) 32 | return n, p 33 | 34 | 35 | class Weighted_NegativeBinomial(GenericLikelihoodModel): 36 | """ 37 | Negative Binomial model endog ~ NB(exposure * exp(exog @ params[:-1]), params[-1]), where exog is the design matrix, and params[-1] is 1 / overdispersion. 38 | This function fits the NB params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) 39 | 40 | Attributes 41 | ---------- 42 | endog : array, (n_samples,) 43 | Y values. 44 | 45 | exog : array, (n_samples, n_features) 46 | Design matrix. 47 | 48 | weights : array, (n_samples,) 49 | Sample weights. 50 | 51 | exposure : array, (n_samples,) 52 | Multiplication constant outside the exponential term. In scRNA-seq or SRT data, this term is the total UMI count per cell/spot. 
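
    Examples
    ----------
    A minimal sketch on synthetic data (all names below are illustrative, not from the pipeline):

    y = np.random.poisson(5, size=100)                    # observed counts
    exog = np.ones((100, 1))                              # intercept-only design
    model = Weighted_NegativeBinomial(y, exog, weights=np.ones(100), exposure=np.full(100, 100.0))
    res = model.fit(disp=0, maxiter=500)
    log_mu, alpha = res.params[:-1], res.params[-1]       # res.params[-1] is the fitted 'alpha' appended by fit()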
53 | """ 54 | def __init__(self, endog, exog, weights, exposure, seed=0, **kwds): 55 | super(Weighted_NegativeBinomial, self).__init__(endog, exog, **kwds) 56 | self.weights = weights 57 | self.exposure = exposure 58 | self.seed = seed 59 | # 60 | def nloglikeobs(self, params): 61 | nb_mean = np.exp(self.exog @ params[:-1]) * self.exposure 62 | nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) 63 | n, p = convert_params(nb_mean, nb_std) 64 | llf = scipy.stats.nbinom.logpmf(self.endog, n, p) 65 | neg_sum_llf = -llf.dot(self.weights) 66 | return neg_sum_llf 67 | # 68 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 69 | self.exog_names.append('alpha') 70 | if start_params is None: 71 | if hasattr(self, 'start_params'): 72 | start_params = self.start_params 73 | else: 74 | start_params = np.append(0.1 * np.ones(self.nparams), 0.01) 75 | 76 | return super(Weighted_NegativeBinomial, self).fit(start_params=start_params, 77 | maxiter=maxiter, maxfun=maxfun, 78 | **kwds) 79 | 80 | 81 | class Weighted_NegativeBinomial_mix(GenericLikelihoodModel): 82 | def __init__(self, endog, exog, weights, exposure, tumor_prop, seed=0, **kwds): 83 | super(Weighted_NegativeBinomial_mix, self).__init__(endog, exog, **kwds) 84 | self.weights = weights 85 | self.exposure = exposure 86 | self.seed = seed 87 | self.tumor_prop = tumor_prop 88 | # 89 | def nloglikeobs(self, params): 90 | nb_mean = self.exposure * (self.tumor_prop * np.exp(self.exog @ params[:-1]) + 1 - self.tumor_prop) 91 | nb_std = np.sqrt(nb_mean + params[-1] * nb_mean**2) 92 | n, p = convert_params(nb_mean, nb_std) 93 | llf = scipy.stats.nbinom.logpmf(self.endog, n, p) 94 | neg_sum_llf = -llf.dot(self.weights) 95 | return neg_sum_llf 96 | # 97 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 98 | self.exog_names.append('alpha') 99 | if start_params is None: 100 | if hasattr(self, 'start_params'): 101 | start_params = self.start_params 102 | else: 103 | start_params = np.append(0.1 * np.ones(self.nparams), 0.01) 104 | return super(Weighted_NegativeBinomial_mix, self).fit(start_params=start_params, 105 | maxiter=maxiter, maxfun=maxfun, 106 | **kwds) 107 | 108 | 109 | class Weighted_BetaBinom(GenericLikelihoodModel): 110 | """ 111 | Beta-binomial model endog ~ BetaBin(exposure, tau * p, tau * (1 - p)), where p = exog @ params[:-1] and tau = params[-1]. 112 | This function fits the BetaBin params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params) 113 | 114 | Attributes 115 | ---------- 116 | endog : array, (n_samples,) 117 | Y values. 118 | 119 | exog : array, (n_samples, n_features) 120 | Design matrix. 121 | 122 | weights : array, (n_samples,) 123 | Sample weights. 124 | 125 | exposure : array, (n_samples,) 126 | Total number of trials. In BAF case, this is the total number of SNP-covering UMIs. 
127 | """ 128 | def __init__(self, endog, exog, weights, exposure, **kwds): 129 | super(Weighted_BetaBinom, self).__init__(endog, exog, **kwds) 130 | self.weights = weights 131 | self.exposure = exposure 132 | # 133 | def nloglikeobs(self, params): 134 | a = (self.exog @ params[:-1]) * params[-1] 135 | b = (1 - self.exog @ params[:-1]) * params[-1] 136 | llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) 137 | neg_sum_llf = -llf.dot(self.weights) 138 | return neg_sum_llf 139 | # 140 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 141 | self.exog_names.append("tau") 142 | if start_params is None: 143 | if hasattr(self, 'start_params'): 144 | start_params = self.start_params 145 | else: 146 | start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) 147 | return super(Weighted_BetaBinom, self).fit(start_params=start_params, 148 | maxiter=maxiter, maxfun=maxfun, 149 | **kwds) 150 | 151 | 152 | class Weighted_BetaBinom_mix(GenericLikelihoodModel): 153 | def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds): 154 | super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds) 155 | self.weights = weights 156 | self.exposure = exposure 157 | self.tumor_prop = tumor_prop 158 | # 159 | def nloglikeobs(self, params): 160 | a = (self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] 161 | b = ((1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1] 162 | llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) 163 | neg_sum_llf = -llf.dot(self.weights) 164 | return neg_sum_llf 165 | # 166 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 167 | self.exog_names.append("tau") 168 | if start_params is None: 169 | if hasattr(self, 'start_params'): 170 | start_params = self.start_params 171 | else: 172 | start_params = np.append(0.5 / np.sum(self.exog.shape[1]) * np.ones(self.nparams), 1) 173 | return super(Weighted_BetaBinom_mix, self).fit(start_params=start_params, 174 | maxiter=maxiter, maxfun=maxfun, 175 | **kwds) 176 | 177 | 178 | class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel): 179 | def __init__(self, endog, exog, tau, weights, exposure, **kwds): 180 | super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds) 181 | self.tau = tau 182 | self.weights = weights 183 | self.exposure = exposure 184 | # 185 | def nloglikeobs(self, params): 186 | a = (self.exog @ params) * self.tau 187 | b = (1 - self.exog @ params) * self.tau 188 | llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b) 189 | neg_sum_llf = -llf.dot(self.weights) 190 | return neg_sum_llf 191 | # 192 | def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds): 193 | if start_params is None: 194 | if hasattr(self, 'start_params'): 195 | start_params = self.start_params 196 | else: 197 | start_params = 0.1 * np.ones(self.nparams) 198 | 199 | return super(Weighted_BetaBinom_fixdispersion, self).fit(start_params=start_params, 200 | maxiter=maxiter, maxfun=maxfun, 201 | **kwds) 202 | 203 | 204 | class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel): 205 | def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds): 206 | super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds) 207 | self.tau = tau 208 | self.weights = weights 209 | self.exposure = exposure 210 | self.tumor_prop = tumor_prop 211 | # 212 | def nloglikeobs(self, params): 213 | a = 
class Weighted_BetaBinom_mix(GenericLikelihoodModel):
    """
    Beta-binomial model for a tumor-normal mixture: the success probability is p * tumor_prop + 0.5 * (1 - tumor_prop) with p = exog @ params[:-1], and tau = params[-1] as in Weighted_BetaBinom.
    """
    def __init__(self, endog, exog, weights, exposure, tumor_prop, **kwds):
        super(Weighted_BetaBinom_mix, self).__init__(endog, exog, **kwds)
        self.weights = weights
        self.exposure = exposure
        self.tumor_prop = tumor_prop
    #
    def nloglikeobs(self, params):
        a = (self.exog @ params[:-1] * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1]
        b = ((1 - self.exog @ params[:-1]) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * params[-1]
        llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if "tau" not in self.exog_names:
            self.exog_names.append("tau")
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = np.append(0.5 / self.exog.shape[1] * np.ones(self.nparams), 1)
        return super(Weighted_BetaBinom_mix, self).fit(start_params=start_params,
                                                       maxiter=maxiter, maxfun=maxfun,
                                                       **kwds)


class Weighted_BetaBinom_fixdispersion(GenericLikelihoodModel):
    """
    Weighted_BetaBinom with the dispersion tau fixed; params are only the success-probability coefficients.
    """
    def __init__(self, endog, exog, tau, weights, exposure, **kwds):
        super(Weighted_BetaBinom_fixdispersion, self).__init__(endog, exog, **kwds)
        self.tau = tau
        self.weights = weights
        self.exposure = exposure
    #
    def nloglikeobs(self, params):
        a = (self.exog @ params) * self.tau
        b = (1 - self.exog @ params) * self.tau
        llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = 0.1 * np.ones(self.nparams)

        return super(Weighted_BetaBinom_fixdispersion, self).fit(start_params=start_params,
                                                                 maxiter=maxiter, maxfun=maxfun,
                                                                 **kwds)


class Weighted_BetaBinom_fixdispersion_mix(GenericLikelihoodModel):
    """
    Weighted_BetaBinom_mix with the dispersion tau fixed.
    """
    def __init__(self, endog, exog, tau, weights, exposure, tumor_prop, **kwds):
        super(Weighted_BetaBinom_fixdispersion_mix, self).__init__(endog, exog, **kwds)
        self.tau = tau
        self.weights = weights
        self.exposure = exposure
        self.tumor_prop = tumor_prop
    #
    def nloglikeobs(self, params):
        a = (self.exog @ params * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau
        b = ((1 - self.exog @ params) * self.tumor_prop + 0.5 * (1 - self.tumor_prop)) * self.tau
        llf = scipy.stats.betabinom.logpmf(self.endog, self.exposure, a, b)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = 0.1 * np.ones(self.nparams)

        return super(Weighted_BetaBinom_fixdispersion_mix, self).fit(start_params=start_params,
                                                                     maxiter=maxiter, maxfun=maxfun,
                                                                     **kwds)


class BAF_Binom(GenericLikelihoodModel):
    r"""
    Binomial model endog ~ Binom(exposure, p) with a scaled logistic link p = scaling / (1 + exp(-(exog @ params) + offset)).
    This model fits the binomial params when samples are weighted by weights: max_{params} \sum_{s} weights_s * log P(endog_s | exog_s; params)

    Attributes
    ----------
    endog : array, (n_samples,)
        Y values.

    exog : array, (n_samples, n_features)
        Design matrix.

    weights : array, (n_samples,)
        Sample weights.

    exposure : array, (n_samples,)
        Total number of trials. In the BAF case, this is the total number of SNP-covering UMIs.
    """
    def __init__(self, endog, exog, weights, exposure, offset, scaling, **kwds):
        super(BAF_Binom, self).__init__(endog, exog, **kwds)
        self.weights = weights
        self.exposure = exposure
        self.offset = offset
        self.scaling = scaling
    #
    def nloglikeobs(self, params):
        linear_term = self.exog @ params
        p = self.scaling / (1 + np.exp(-linear_term + self.offset))
        llf = scipy.stats.binom.logpmf(self.endog, self.exposure, p)
        neg_sum_llf = -llf.dot(self.weights)
        return neg_sum_llf
    #
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            if hasattr(self, 'start_params'):
                start_params = self.start_params
            else:
                start_params = 0.5 / self.exog.shape[1] * np.ones(self.nparams)
        return super(BAF_Binom, self).fit(start_params=start_params,
                                          maxiter=maxiter, maxfun=maxfun,
                                          **kwds)
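
# Small numeric check of BAF_Binom's scaled logistic link (hypothetical values): with
# scaling = 0.5 and offset = 0, the success probability stays in (0, 0.5).
if __name__ == "__main__":
    exog = np.array([[1.0], [0.0], [-2.0]])
    params = np.array([0.8])
    scaling, offset = 0.5, 0.0
    p = scaling / (1 + np.exp(-(exog @ params) + offset))
    print(p)  # approximately [0.345, 0.25, 0.084]; p[1] is exactly 0.25 since sigmoid(0) = 0.5
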
--------------------------------------------------------------------------------
/src/calicost/utils_phase_switch.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import trange
import scipy
import scipy.special


def get_position_cM_table(chr_pos_vector, geneticmap_file):
    """
    Parameters
    ----------
    chr_pos_vector : list of pairs
        List of (chr, pos) pairs of SNPs.

    geneticmap_file : str
        Path to the genetic map table with columns (chrom, pos, pos_cm).
    """
    df = pd.read_csv(geneticmap_file, header=0, sep="\t")
    # remove chrX
    df = df[df.chrom.isin( [f"chr{i}" for i in range(1,23)] )]
    # check the chromosome names
    if not ("chr" in str(chr_pos_vector[0][0])):
        df["chrom"] = [int(x[3:]) for x in df.chrom]
    df = df.sort_values(by=["chrom", "pos"])
    ref_chrom = np.array(df.chrom)
    ref_pos = np.array(df.pos)
    ref_cm = np.array(df.pos_cm)
    # also sort the input argument
    chr_pos_vector.sort()
    # find the centimorgan values (interpolate between (k-1)-th and k-th rows in centimorgan tables)
    position_cM = np.ones(len(chr_pos_vector)) * np.nan
    k = 0
    for i,x in enumerate(chr_pos_vector):
        chrname = x[0]
        pos = x[1]
        while k < len(ref_chrom) and (ref_chrom[k] < chrname or (ref_chrom[k] == chrname and ref_pos[k] < pos)):
            k += 1
        if k < len(ref_chrom) and ref_chrom[k] == chrname and ref_pos[k] >= pos:
            if k > 0 and ref_chrom[k-1] == chrname:
                position_cM[i] = ref_cm[k-1] + (pos - ref_pos[k-1]) / (ref_pos[k] - ref_pos[k-1]) * (ref_cm[k] - ref_cm[k-1])
            else:
                position_cM[i] = (pos - 0) / (ref_pos[k] - 0) * (ref_cm[k] - 0)
        else:
            position_cM[i] = ref_cm[k-1]
    return position_cM


def compute_phase_switch_probability_position(position_cM, chr_pos_vector, nu=1, min_prob=1e-20):
    """
    Parameters
    ----------
    position_cM : array, (number of SNP positions)
        Centimorgan of the SNP located at each entry.

    chr_pos_vector : list of pairs
        List of (chr, pos) pairs of SNPs. It is used to identify the start of a new chromosome.

    nu : float
        Scaling factor on the genetic distance in the switch-probability formula.

    min_prob : float
        Floor on the phase-switch probability.
    """
    phase_switch_prob = np.ones(len(position_cM)) * min_prob
    for i,cm in enumerate(position_cM[:-1]):
        cm_next = position_cM[i+1]
        if np.isnan(cm) or np.isnan(cm_next) or chr_pos_vector[i][0] != chr_pos_vector[i+1][0]:
            continue
        assert cm <= cm_next
        d = cm_next - cm
        phase_switch_prob[i] = (1 - np.exp(-2 * nu * d)) / 2
    phase_switch_prob[phase_switch_prob < min_prob] = min_prob
    return phase_switch_prob


def duplicate_RD(chr_baf, pos_baf, chr_rd, start_rd, end_rd, tumor_rd, normal_rd):
    tumor_reads = np.ones(len(chr_baf)) * np.nan
    normal_reads = np.ones(len(chr_baf)) * np.nan
    idx = 0
    for i in range(len(chr_baf)):
        while idx < len(chr_rd) and (chr_rd[idx] < chr_baf[i] or (chr_rd[idx] == chr_baf[i] and end_rd[idx] < pos_baf[i])):
            idx += 1
        if idx < len(chr_rd) and chr_rd[idx] == chr_baf[i] and end_rd[idx] >= pos_baf[i] and start_rd[idx] <= pos_baf[i]:
            tumor_reads[i] = tumor_rd[idx]
            normal_reads[i] = normal_rd[idx]
    return tumor_reads, normal_reads


def generate_input_from_HATCHet(hatchetdir, output_picklefile, geneticmap_file, rdrfile="abin/bulk.bb", baffile="baf/bulk.1bed", phasefile="phase/phased.vcf.gz", with_chr_prefix=True):
    """
    Read HATCHet RDR/BAF/phasing outputs and construct HMM inputs; geneticmap_file is needed by get_position_cM_table to compute site-wise phase-switch probabilities.
    """
    if with_chr_prefix:
        unique_chrs = [f"chr{i}" for i in range(1, 23)]
    else:
        unique_chrs = np.arange(1, 23)

    ### load hatchet outputs ###
    if Path(output_picklefile).exists():
        # RDR file
        df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t")
        df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True)
        df_all.sort_values(by=["#CHR", "START"], inplace=True)
        # samples
        unique_samples = np.unique(df_all["SAMPLE"])
        # allele counts
        df_baf = pd.read_pickle(output_picklefile)
    else:
        # RDR file
        df_all = pd.read_csv(f"{hatchetdir}/{rdrfile}", header=0, sep="\t")
        df_all.iloc[:,0] = pd.Categorical(df_all.iloc[:,0], categories=unique_chrs, ordered=True)
        df_all.sort_values(by=["#CHR", "START"], inplace=True)
        # samples
        unique_samples = np.unique(df_all["SAMPLE"])
        # allele counts for individual SNPs
        def load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples):
            tmpdf = pd.read_csv(f"{hatchetdir}/{baffile}", header=None, sep="\t", names=["CHR", "POS", "SAMPLE", "REF", "ALT"])
            df_baf = []
            for chrname in unique_chrs:
                tmp = tmpdf[tmpdf.CHR == chrname]
                list_pos =
[set(list(tmp[tmp["SAMPLE"] == s].POS)) for s in unique_samples] # SNP set of each individual sample 111 | shared_pos = set.intersection(*list_pos) # SNPs that are shared across samples 112 | index = np.array([i for i in range(tmp.shape[0]) if tmp.iloc[i,1] in shared_pos]) 113 | tmp = tmp.iloc[index,:] 114 | tmp.sort_values(by=["POS", "SAMPLE"], inplace=True) 115 | df_baf.append( tmp ) 116 | df_baf = pd.concat(df_baf, ignore_index=True) 117 | return df_baf 118 | df_baf = load_shared_BAF(hatchetdir, baffile, unique_chrs, unique_samples) 119 | # reference-based phasing results 120 | df_phase = pd.read_csv(f"{hatchetdir}/{phasefile}", comment="#", sep="\t", \ 121 | names=["CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLENAME"]) 122 | df_phase = df_phase[(df_phase.SAMPLENAME=="0|1") | (df_phase.SAMPLENAME=="1|0")] 123 | print("HATCHet dataframes loaded.") 124 | 125 | ### gather phased BAF info ### 126 | df_combined_baf = [] 127 | for chrname in unique_chrs: 128 | tmpdf_baf = df_baf[df_baf.CHR == chrname] 129 | tmpdf_phase = df_phase[df_phase.CHR == chrname][["POS", "SAMPLENAME"]] 130 | tmpdf_baf = tmpdf_baf.join( tmpdf_phase.set_index("POS"), on="POS") 131 | tmpdf_baf = tmpdf_baf[~tmpdf_baf.SAMPLENAME.isnull()] 132 | tmpdf_baf["B_count"] = np.where(tmpdf_baf.SAMPLENAME=="0|1", tmpdf_baf.REF, tmpdf_baf.ALT) 133 | tmpdf_baf["DP"] = tmpdf_baf.REF + tmpdf_baf.ALT 134 | df_combined_baf.append( tmpdf_baf ) 135 | df_combined_baf = pd.concat(df_combined_baf, ignore_index=True) 136 | df_combined_baf.iloc[:,0] = pd.Categorical(df_combined_baf.CHR, categories=unique_chrs, ordered=True) 137 | df_combined_baf.sort_values(by=["CHR", "POS"], inplace=True) 138 | df_baf = df_combined_baf 139 | 140 | ### duplicate RDR info for each SNP ### 141 | df_baf["TOTAL_READS"] = np.nan 142 | df_baf["NORMAL_READS"] = np.nan 143 | for s in unique_samples: 144 | index = np.where(df_baf["SAMPLE"] == s)[0] 145 | index_rd = np.where(df_all["SAMPLE"] == s)[0] 146 | tumor_reads, normal_reads = duplicate_RD(np.array(df_baf.iloc[index,:].CHR.cat.codes), np.array(df_baf.iloc[index,:].POS), \ 147 | np.array(df_all.iloc[index_rd,0].cat.codes), np.array(df_all.iloc[index_rd,:].START), np.array(df_all.iloc[index_rd,:].END), \ 148 | np.array(df_all.iloc[index_rd,:].TOTAL_READS), np.array(df_all.iloc[index_rd,:].NORMAL_READS)) 149 | df_baf.iloc[index, -2] = tumor_reads 150 | df_baf.iloc[index, -1] = normal_reads 151 | # remove SNP positions with TOTAL_READS=NAN (if NAN occurs in one sample, remove the corresponding SNPs for the other samples too) 152 | def remove_nan_RD(df_baf): 153 | idx_nan = np.where(np.logical_or( df_baf.TOTAL_READS.isnull(), df_baf.NORMAL_READS.isnull() ))[0] 154 | chr = np.array(df_baf.CHR) 155 | pos = np.array(df_baf.POS) 156 | chr_pos = np.array([f"{chr[i]}_{pos[i]}" for i in range(len(chr))]) 157 | nan_chr_pos = set(list(chr_pos[idx_nan])) 158 | idx_remain = np.array([i for i,snpid in enumerate(chr_pos) if not (snpid in nan_chr_pos)]) 159 | df_baf = df_baf.iloc[idx_remain, :] 160 | return df_baf 161 | df_baf = remove_nan_RD(df_baf) 162 | df_baf.to_pickle(output_picklefile) 163 | print("SNP-level BAF and bin-level RDR paired up.") 164 | 165 | ### from BAF, RDR table, generate HMM input ### 166 | lengths = np.array([ np.sum(np.logical_and(df_baf["CHR"]==chrname, df_baf["SAMPLE"]==unique_samples[0])) for chrname in unique_chrs ]) 167 | 168 | X = np.zeros(( np.sum(lengths), 2, len(unique_samples) )) 169 | base_nb_mean = np.zeros((np.sum(lengths), len(unique_samples) )) 170 | total_bb_RD = 
np.zeros((np.sum(lengths), len(unique_samples) ))

    for k,s in enumerate(unique_samples):
        df = df_baf[df_baf["SAMPLE"] == s]
        X[:,0,k] = df.TOTAL_READS
        X[:,1,k] = df.B_count
        total_bb_RD[:,k] = np.array(df.DP)
        df2 = df_all[df_all["SAMPLE"] == s]
        base_nb_mean[:,k] = np.array(df.NORMAL_READS / np.sum(df2.NORMAL_READS) * np.sum(df2.TOTAL_READS))

    # site-wise transition matrix
    chr_pos_vector = [(df_baf.CHR.iloc[i], df_baf.POS.iloc[i]) for i in np.where(df_baf["SAMPLE"]==unique_samples[0])[0]]
    position_cM = get_position_cM_table(chr_pos_vector, geneticmap_file)
    phase_switch_prob = compute_phase_switch_probability_position(position_cM, chr_pos_vector)
    log_sitewise_transmat = np.log(phase_switch_prob)

    return X, lengths, base_nb_mean, total_bb_RD, log_sitewise_transmat


def distance_between_p_binom(state_pred1, clone_pred1, p_binom1, state_pred2, clone_pred2, p_binom2):
    import networkx as nx

    # matching predicted CNV states
    n_states = len(np.unique(state_pred1))
    uniq_pred1 = np.sort(np.unique(state_pred1))
    uniq_pred2 = np.sort(np.unique(state_pred2))
    G = nx.Graph()
    G.add_nodes_from([f"A{i}" for i in uniq_pred1], bipartite=0)
    G.add_nodes_from([f"B{j}" for j in uniq_pred2], bipartite=1)
    # G.add_weighted_edges_from( [(f"A{i}", f"B{j}", np.sum(np.logical_and(state_pred1==uniq_pred1[i], state_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    # tmp = nx.max_weight_matching(G)
    # state_matching = {x[0]:x[1] for x in tmp}
    # state_matching.update( {x[1]:x[0] for x in tmp} )
    G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(state_pred1) - np.sum(np.logical_and(state_pred1==uniq_pred1[i], state_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    state_matching = nx.bipartite.minimum_weight_full_matching(G)

    # matching predicted clones
    n_clones = len(np.unique(clone_pred1))
    uniq_pred1 = np.sort(np.unique(clone_pred1))
    uniq_pred2 = np.sort(np.unique(clone_pred2))
    G = nx.Graph()
    G.add_nodes_from([f"A{i}" for i in uniq_pred1], bipartite=0)
    G.add_nodes_from([f"B{j}" for j in uniq_pred2], bipartite=1)
    # G.add_weighted_edges_from( [(f"A{i}", f"B{j}", np.sum(np.logical_and(clone_pred1==uniq_pred1[i], clone_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    # tmp = nx.max_weight_matching(G)
    # clone_matching = {x[0]:x[1] for x in tmp}
    # clone_matching.update( {x[1]:x[0] for x in tmp} )
    G.add_weighted_edges_from( [(f"A{i}", f"B{j}", len(clone_pred1) - np.sum(np.logical_and(clone_pred1==uniq_pred1[i], clone_pred2==uniq_pred2[j]))) for i in uniq_pred1 for j in uniq_pred2] )
    clone_matching = nx.bipartite.minimum_weight_full_matching(G)

    # l2 distance between corresponding CNV at corresponding clone
    # reorder p_binom2 based on state_matching and clone_matching
    reorder_p_binom2 = p_binom2[:, np.array([ int(clone_matching[f"A{i}"][1:]) for i in range(n_clones)])]
    reorder_p_binom2 = reorder_p_binom2[np.array([ int(state_matching[f"A{i}"][1:]) for i in range(n_states) ]), :]
    l2 = 0
    for i in range(p_binom1.shape[0]):
        l2 += min( np.sum(np.square(p_binom1[i,:] - reorder_p_binom2[i,:])), np.sum(np.square(p_binom1[i,:] - 1 + reorder_p_binom2[i,:])) )
    return l2


def get_intervals(pred_cnv):
    intervals = []
    labs = []
    s = 0
    while s < len(pred_cnv):
        t = np.where(pred_cnv[s:] != pred_cnv[s])[0]
        if len(t) == 0:
            intervals.append( (s, len(pred_cnv)) )
            labs.append( pred_cnv[s] )
            s = len(pred_cnv)
        else:
            t = t[0]
            intervals.append( (s,s+t) )
            labs.append( pred_cnv[s] )
            s = s+t
    return intervals, labs
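
# Worked example of the run-length encoding performed by get_intervals (hypothetical input):
if __name__ == "__main__":
    example_pred = np.array([0, 0, 0, 2, 2, 1])
    intervals, labs = get_intervals(example_pred)
    print(intervals)  # [(0, 3), (3, 5), (5, 6)]
    print(labs)       # [0, 2, 1]
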
def get_intervals_nd(pred_cnv):
    """
    pred_cnv : np.array of shape (n_bins, n_clones)
    """
    intervals = []
    labs = []
    s = 0
    while s < len(pred_cnv):
        t = np.where(np.any(pred_cnv[s:] != pred_cnv[s], axis=1))[0]
        if len(t) == 0:
            intervals.append( (s, len(pred_cnv)) )
            labs.append( pred_cnv[s] )
            s = len(pred_cnv)
        else:
            t = t[0]
            intervals.append( (s,s+t) )
            labs.append( pred_cnv[s] )
            s = s+t
    return intervals, labs


def postbinning_forvisual(X, base_nb_mean, total_bb_RD, lengths, res, binsize=2):
    # a list of intervals used in binning for transforming back to non-binned space
    intervals = []
    bin_lengths = []
    # variables for for-loop
    chrname = 0
    nextlen = lengths[chrname]
    s = 0
    while s < X.shape[0]:
        t = min(s+binsize, nextlen)
        intervals.append( [s,t] )
        s = t
        if s >= nextlen:
            if s < X.shape[0]:
                chrname += 1
                nextlen += lengths[chrname]
            bin_lengths.append( len(intervals) )
    bin_lengths = np.array(bin_lengths)
    bin_lengths[1:] = bin_lengths[1:] - bin_lengths[:-1]

    # binning based on previous intervals
    n_states = int(res["log_gamma"].shape[0] / 2)
    phase_prob = np.exp(scipy.special.logsumexp(res["log_gamma"][:n_states, :], axis=0))
    bin_X = np.zeros((len(intervals), X.shape[1], X.shape[2]), dtype=int)
    bin_base_nb_mean = np.zeros((len(intervals), base_nb_mean.shape[1]), dtype=int)
    bin_total_bb_RD = np.zeros((len(intervals), total_bb_RD.shape[1]), dtype=int)
    bin_pred_cnv = np.zeros(len(intervals), dtype=int)
    for i, intvl in enumerate(intervals):
        s,t = intvl
        bin_X[i,0,:] = np.sum(X[s:t, 0,:], axis=0)
        # expected phased B-allele counts per sample (the dot products already sum over the bin)
        bin_X[i,1,:] = phase_prob[s:t].dot(X[s:t, 1,:]) + (1-phase_prob[s:t]).dot(total_bb_RD[s:t,:] - X[s:t,1,:])
        bin_base_nb_mean[i,:] = np.sum(base_nb_mean[s:t,:], axis=0)
        bin_total_bb_RD[i,:] = np.sum(total_bb_RD[s:t,:], axis=0)
        bin_pred_cnv[i] = res["pred_cnv"][s]

    return bin_X, bin_base_nb_mean, bin_total_bb_RD, bin_pred_cnv, bin_lengths, intervals
--------------------------------------------------------------------------------
/utils/filter_snps_forphasing.py:
--------------------------------------------------------------------------------
#!/bin/python

import sys
import numpy as np
import pandas as pd
from pathlib import Path
import argparse


def main(cellsnplite_result_dir, eagle_out_dir, vaf_threshold=0.1):
    cellsnp_base = [str(x) for x in Path(cellsnplite_result_dir).glob("cellSNP.base*")][0]
    df_snp = pd.read_csv(cellsnp_base, comment="#", sep="\t", names=["tmpCHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
    df_snp["CHROM"] = [f"chr{x}" for x in df_snp.tmpCHR]
    df_snp["AD"] = [int(x.split(";")[0].split("=")[-1]) for x in df_snp.INFO]
    df_snp["DP"] = [int(x.split(";")[1].split("=")[-1]) for x in df_snp.INFO]
    df_snp["OTH"] = [int(x.split(";")[2].split("=")[-1]) for x in df_snp.INFO]
    # remove records with DP == 0
    df_snp = df_snp[df_snp.DP > 0]
    # keep het SNPs (AD and DP - AD both >= 2, with vaf_threshold <= AD/DP <= 1 - vaf_threshold) and hom SNPs (AD == DP >= 10, or AD == 0 with DP >= 10)
    # df_snp = df_snp[((df_snp.AD / df_snp.DP >= 0.1) & (df_snp.AD / df_snp.DP <= 0.9)) | ((df_snp.AD == df_snp.DP) & (df_snp.DP >= 10))]
    df_snp = df_snp[((df_snp.AD >= 2) & (df_snp.DP - df_snp.AD >= 2) & (df_snp.AD / df_snp.DP >= vaf_threshold) & (df_snp.AD / df_snp.DP <= 1-vaf_threshold)) | ((df_snp.AD == df_snp.DP) & (df_snp.DP >= 10)) | ((df_snp.AD == 0) & (df_snp.DP >= 10))]
    # add additional columns
    df_snp["FORMAT"] = "GT"
    # df_snp[f"{sample_id}"] = ["0/1" if row.AD < row.DP else "1/1" for i,row in df_snp.iterrows()]
    gt_column = np.array(["0/0"] * df_snp.shape[0])
    gt_column[ (df_snp.AD == df_snp.DP) ] = "1/1"
    gt_column[ (df_snp.AD > 0) & (df_snp.DP - df_snp.AD > 0) ] = "0/1"
    df_snp["SAMPLE_ID"] = gt_column
    # output one VCF per chromosome to the output folder
    for c in range(1, 23):
        df = df_snp[ (df_snp.tmpCHR == c) | (df_snp.tmpCHR == str(c)) ].copy()
        # remove records that have duplicated snp_id
        snp_id = [f"{row.tmpCHR}_{row.POS}_{row.REF}_{row.ALT}" for i,row in df.iterrows()]
        df["snp_id"] = snp_id
        df = df.groupby("snp_id").agg({"CHROM":"first", "POS":"first", "ID":"first", "REF":"first", "ALT":"first", "QUAL":"first", "FILTER":"first", \
            "INFO":"first", "FORMAT":"first", "SAMPLE_ID":"first", "AD":"sum", "DP":"sum", "OTH":"sum"})
        info = [f"AD={row.AD};DP={row.DP};OTH={row.OTH}" for i,row in df.iterrows()]
        df["INFO"] = info
        df = df[["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE_ID"]]
        df.sort_values(by="POS", inplace=True)
        fp = open(f"{eagle_out_dir}/chr{c}.vcf", 'w')
        fp.write("##fileformat=VCFv4.2\n")
        fp.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
        fp.write("#" + "\t".join(df.columns) + "\n")
        df.to_csv(fp, sep="\t", index=False, header=False)
        fp.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cellsnplite_result_dir", help="cellsnplite result directory", type=str)
    parser.add_argument("-o", "--eagle_out_dir", help="eagle output directory", type=str)
    parser.add_argument("-v", "--vaf_threshold", help="vaf threshold", default=0.1, type=float)
    args = parser.parse_args()
    main(args.cellsnplite_result_dir, args.eagle_out_dir, args.vaf_threshold)
--------------------------------------------------------------------------------
/utils/get_snp_matrix.py:
--------------------------------------------------------------------------------
#!/bin/python

import sys
import numpy as np
import pandas as pd
from scipy.special import logsumexp
import scipy.io
import scipy.sparse
from pathlib import Path
import json
import gzip
import pickle
from tqdm import trange
import copy
import argparse


def process_snp_phasing(cellsnp_folder, eagle_folder, outputfile):
    # create a (snp_id, GT) map from eagle2 output
    snp_gt_map = {}
    for c in range(1, 23):
        fname = [str(x) for x in Path(eagle_folder).glob("*chr{}.phased.vcf.gz".format(c))]
        assert len(fname) > 0
        fname = fname[0]
        tmpdf = pd.read_table(fname, compression = 'gzip', comment = '#', sep="\t", names=["CHR","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","PHASE"])
        this_snp_ids = [ "{}_{}_{}_{}".format(c, row.POS, row.REF, row.ALT) for i,row in tmpdf.iterrows() ]
        this_gt = list(tmpdf.iloc[:,-1])
        assert len(this_snp_ids) == len(this_gt)
        snp_gt_map.update( {this_snp_ids[i]:this_gt[i] for i in
range(len(this_gt))} ) 29 | # cellsnp DP (read depth) and AD (alternative allele depth) 30 | # first get a list of snp_id and spot barcodes 31 | tmpdf = pd.read_csv(cellsnp_folder + "/cellSNP.base.vcf.gz", header=1, sep="\t") 32 | snp_list = np.array([ "{}_{}_{}_{}".format(row["#CHROM"], row.POS, row.REF, row.ALT) for i,row in tmpdf.iterrows() ]) 33 | tmpdf = pd.read_csv(cellsnp_folder + "/cellSNP.samples.tsv", header=None) 34 | sample_list = np.array(list(tmpdf.iloc[:,0])) 35 | # then get the DP and AD matrix 36 | DP = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.DP.mtx").tocsr() 37 | AD = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.AD.mtx").tocsr() 38 | # remove SNPs that are not phased 39 | is_phased = np.array([ (x in snp_gt_map) for x in snp_list ]) 40 | DP = DP[is_phased,:] 41 | AD = AD[is_phased,:] 42 | snp_list = snp_list[is_phased] 43 | # generate a new dataframe with columns (cell, snp_id, DP, AD, CHROM, POS, GT) 44 | rows, cols = DP.nonzero() 45 | cell = sample_list[cols] 46 | snp_id = snp_list[rows] 47 | DP_df = DP[DP.nonzero()].A.flatten() 48 | AD_df = AD[DP.nonzero()].A.flatten() 49 | GT = [snp_gt_map[x] for x in snp_id] 50 | df = pd.DataFrame({"cell":cell, "snp_id":snp_id, "DP":DP_df, "AD":AD_df, \ 51 | "CHROM":[int(x.split("_")[0]) for x in snp_id], "POS":[int(x.split("_")[1]) for x in snp_id], "GT":GT}) 52 | df.to_csv(outputfile, sep="\t", index=False, header=True, compression={'method': 'gzip'}) 53 | return df 54 | 55 | 56 | def read_cell_by_snp(allele_counts_file): 57 | df = pd.read_csv(allele_counts_file, sep="\t", header=0) 58 | index = np.array([i for i,x in enumerate(df.GT) if x=="0|1" or x=="1|0"]) 59 | df = df.iloc[index, :] 60 | df.CHROM = df.CHROM.astype(int) 61 | return df 62 | 63 | 64 | def cell_by_gene_lefthap_counts(cellsnp_folder, eagle_folder, barcode_list): 65 | # create a (snp_id, GT) map from eagle2 output 66 | snp_gt_map = {} 67 | for c in range(1, 23): 68 | fname = [str(x) for x in Path(eagle_folder).glob("*chr{}.phased.vcf.gz".format(c))] 69 | assert len(fname) > 0 70 | fname = fname[0] 71 | tmpdf = pd.read_table(fname, compression = 'gzip', comment = '#', sep="\t", names=["CHR","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","PHASE"]) 72 | # only keep heterozygous SNPs 73 | tmpdf = tmpdf[ (tmpdf.PHASE=="0|1") | (tmpdf.PHASE=="1|0") ] 74 | this_snp_ids = (str(c) + "_" + tmpdf.POS.astype(str) +"_"+ tmpdf.REF +"_"+ tmpdf.ALT).values 75 | this_gt = tmpdf.PHASE.values 76 | assert len(this_snp_ids) == len(this_gt) 77 | snp_gt_map.update( {this_snp_ids[i]:this_gt[i] for i in range(len(this_gt))} ) 78 | 79 | # cellsnp-lite output 80 | cellsnp_base = [str(x) for x in Path(cellsnp_folder).glob("cellSNP.base*")][0] 81 | df_snp = pd.read_csv(cellsnp_base, comment="#", sep="\t", names=["tmpCHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]) 82 | df_snp['snp_id'] = df_snp.tmpCHR.astype(str) + "_" + df_snp.POS.astype(str) + "_" + df_snp.REF + "_" + df_snp.ALT 83 | tmpdf = pd.read_csv(cellsnp_folder + "/cellSNP.samples.tsv", header=None) 84 | sample_list = np.array(list(tmpdf.iloc[:,0])) 85 | barcode_mapper = {x:i for i,x in enumerate(sample_list)} 86 | # DP and AD 87 | DP = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.DP.mtx").tocsr() 88 | AD = scipy.io.mmread(cellsnp_folder + "/cellSNP.tag.AD.mtx").tocsr() 89 | # retain only SNPs that are phased 90 | is_phased = (df_snp.snp_id.isin(snp_gt_map)).values 91 | df_snp = df_snp[is_phased] 92 | df_snp['GT'] = [snp_gt_map[x] for x in df_snp.snp_id] 93 | DP = DP[is_phased,:] 94 | AD = AD[is_phased,:] 
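# Note (added for clarity): the block below converts ALT-allele counts into haplotype
# counts using the phased genotypes. For a 0|1 SNP the ALT reads come from the second
# haplotype, so phased_AD keeps AD; for a 1|0 SNP they come from the first haplotype,
# so phased_AD keeps DP - AD. The function finally returns (DP - phased_AD).T and
# phased_AD.T as the two per-cell haplotype count matrices.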
95 | 96 | # phasing 97 | phased_AD = np.where( (df_snp.GT.values == "0|1").reshape(-1,1), AD.A, (DP-AD).A ) 98 | phased_AD = scipy.sparse.csr_matrix(phased_AD) 99 | 100 | # re-order based on barcode_list 101 | index = np.array([barcode_mapper[x] for x in barcode_list if x in barcode_mapper]) 102 | DP = DP[:, index] 103 | phased_AD = phased_AD[:, index] 104 | 105 | # returned matrix has shape (N_cells, N_snps), which is the transpose of the original matrix 106 | return (DP-phased_AD).T, phased_AD.T, df_snp.snp_id.values 107 | 108 | 109 | def cell_by_gene_lefthap_counts_v2(df_cell_snp, hg_table_file, gene_list, barcode_list): 110 | # index of genes and barcodes in the current gene expression matrix 111 | barcode_mapper = {x:i for i,x in enumerate(barcode_list)} 112 | gene_mapper = {x:i for i,x in enumerate(gene_list)} 113 | # make an numpy array for CHROM and POS in df_cell_snp 114 | cell_snp_CHROM = np.array(df_cell_snp.CHROM) 115 | cell_snp_POS = np.array(df_cell_snp.POS) 116 | # read gene ranges in genome 117 | # NOTE THAT THE FOLLOWING CODE REQUIRES hg_table_file IS SORTED BY GENOMIC POSITION! 118 | df_genes = pd.read_csv(hg_table_file, header=0, index_col=0, sep="\t") 119 | index = np.array([ i for i in range(df_genes.shape[0]) if (not "_" in df_genes.chrom.iloc[i]) and \ 120 | (df_genes.chrom.iloc[i] != "chrX") and (df_genes.chrom.iloc[i] != "chrY") and (df_genes.chrom.iloc[i] != "chrM") and \ 121 | (not "GL" in df_genes.chrom.iloc[i]) and (not "KI" in df_genes.chrom.iloc[i]) ]) 122 | df_genes = df_genes.iloc[index, :] 123 | tmp_gene_ranges = {df_genes.name2.iloc[i]:(int(df_genes.chrom.iloc[i][3:]), df_genes.cdsStart.iloc[i], df_genes.cdsEnd.iloc[i]) for i in np.arange(df_genes.shape[0]) } 124 | gene_ranges = [(gname, tmp_gene_ranges[gname]) for gname in gene_list if gname in tmp_gene_ranges] 125 | del tmp_gene_ranges 126 | # aggregate snp counts to genes 127 | N = np.unique(df_cell_snp.cell).shape[0] 128 | G = len(gene_ranges) 129 | i = 0 130 | j = 0 131 | cell_gene_snp_counts = [] 132 | snp_ids = np.array(df_cell_snp.snp_id) 133 | unique_snp_ids = df_cell_snp.snp_id.unique() 134 | snp_id_mapper = {unique_snp_ids[i]:i for i in range(len(unique_snp_ids))} 135 | N_snps = len(unique_snp_ids) 136 | cell_snp_Aallele = np.zeros((len(barcode_list), N_snps)) 137 | cell_snp_Ballele = np.zeros((len(barcode_list), N_snps)) 138 | snp_gene_list = [""] * N_snps 139 | for i in trange(df_cell_snp.shape[0]): 140 | if df_cell_snp.GT.iloc[i] == "1|1" or df_cell_snp.GT.iloc[i] == "0|0": 141 | continue 142 | # check cell barcode 143 | if not df_cell_snp.cell.iloc[i] in barcode_mapper: 144 | continue 145 | cell_idx = barcode_mapper[df_cell_snp.cell.iloc[i]] 146 | # if the SNP is not within any genes 147 | if j < len(gene_ranges) and (cell_snp_CHROM[i] < gene_ranges[j][1][0] or \ 148 | (cell_snp_CHROM[i] == gene_ranges[j][1][0] and cell_snp_POS[i] < gene_ranges[j][1][1])): 149 | continue 150 | # if the SNP position passes gene j 151 | while j < len(gene_ranges) and (cell_snp_CHROM[i] > gene_ranges[j][1][0] or \ 152 | (cell_snp_CHROM[i] == gene_ranges[j][1][0] and cell_snp_POS[i] > gene_ranges[j][1][2])): 153 | j += 1 154 | # if the SNP is within gene j, add the corresponding gene ID 155 | if j < len(gene_ranges) and cell_snp_CHROM[i] == gene_ranges[j][1][0] and \ 156 | cell_snp_POS[i] >= gene_ranges[j][1][1] and cell_snp_POS[i] <= gene_ranges[j][1][2]: 157 | snp_gene_list[ snp_id_mapper[snp_ids[i]] ] = gene_ranges[j][0] 158 | # add the SNP UMI count to the corresponding cell and loci 159 | if 
df_cell_snp.GT.iloc[i] == "0|1":
            cell_snp_Aallele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.DP.iloc[i] - df_cell_snp.AD.iloc[i]
            cell_snp_Ballele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.AD.iloc[i]
        elif df_cell_snp.GT.iloc[i] == "1|0":
            cell_snp_Aallele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.AD.iloc[i]
            cell_snp_Ballele[cell_idx, snp_id_mapper[snp_ids[i]]] = df_cell_snp.DP.iloc[i] - df_cell_snp.AD.iloc[i]

    # keep SNPs with at least one covering UMI (fixed: np.logical_and requires two arguments)
    index = np.where( np.sum(cell_snp_Aallele + cell_snp_Ballele, axis=0) > 0 )[0]
    cell_snp_Aallele = cell_snp_Aallele[:, index].astype(int)
    cell_snp_Ballele = cell_snp_Ballele[:, index].astype(int)
    snp_gene_list = np.array(snp_gene_list)[index]
    unique_snp_ids = unique_snp_ids[index]
    return cell_snp_Aallele, cell_snp_Ballele, snp_gene_list, unique_snp_ids


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cellsnplite_result_dir", help="cellsnplite result directory", type=str)
    parser.add_argument("-e", "--eagle_out_dir", help="eagle output directory", type=str)
    parser.add_argument("-b", "--barcodefile", help="barcode file", type=str)
    parser.add_argument("-o", "--outputdir", help="output directory", type=str)
    args = parser.parse_args()

    barcode_list = list(pd.read_csv(args.barcodefile, header=None).iloc[:,0])
    cell_snp_Aallele, cell_snp_Ballele, unique_snp_ids = cell_by_gene_lefthap_counts(args.cellsnplite_result_dir, args.eagle_out_dir, barcode_list)

    scipy.sparse.save_npz(f"{args.outputdir}/cell_snp_Aallele.npz", cell_snp_Aallele)
    scipy.sparse.save_npz(f"{args.outputdir}/cell_snp_Ballele.npz", cell_snp_Ballele)
    np.save(f"{args.outputdir}/unique_snp_ids.npy", unique_snp_ids)
--------------------------------------------------------------------------------
/utils/merge_bamfile.py:
--------------------------------------------------------------------------------
#!/bin/python

import sys
import pysam
import pandas as pd
import subprocess
import argparse


def write_merged_bam(input_bamfile_list, suffix_list, output_bam):
    fpin = pysam.AlignmentFile(input_bamfile_list[0], "rb")
    fpout = pysam.AlignmentFile(output_bam, "wb", template=fpin)
    fpin.close()
    for i, fname in enumerate(input_bamfile_list):
        fpin = pysam.AlignmentFile(fname, "rb")
        suffix = suffix_list[i]
        for read in fpin:
            if read.has_tag("CB"):
                b = read.get_tag("CB")
                read.set_tag("CB", f"{b}_{suffix}")
            fpout.write(read)
        fpin.close()
    fpout.close()


def write_merged_deconvolution(input_deconvfile_list, suffix_list, output_deconv):
    df_combined = []
    for i, fname in enumerate(input_deconvfile_list):
        suffix = suffix_list[i]
        tmpdf = pd.read_csv(fname, header=0, index_col=0, sep="\t")
        tmpdf.index = [f"{x}_{suffix}" for x in tmpdf.index]
        df_combined.append(tmpdf)
    df_combined = pd.concat(df_combined, ignore_index=False)
    df_combined.to_csv(output_deconv, header=True, index=True, sep="\t")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--bamlistfile", help="tab-separated list of input BAMs: bamfilename, suffix, cellrangerdir[, deconv_filename]", type=str)
    parser.add_argument("-o", "--output_dir", help="output directory", type=str)
    args = parser.parse_args()

    df = pd.read_csv(args.bamlistfile, sep="\t", header=None, index_col=None)
    if df.shape[1] == 3:
        df.columns=["bamfilename", "suffix", "cellrangerdir"]
    else:
        df.columns=["bamfilename", "suffix", "cellrangerdir", "deconv_filename"]

    input_bamfile_list = df.bamfilename.values
    suffix_list = df.suffix.values
    write_merged_bam(input_bamfile_list, suffix_list, f"{args.output_dir}/unsorted_possorted_genome_bam.bam")

    if df.shape[1] == 4:
        # merge deconvolution file
        assert "deconv_filename" in df.columns
        input_deconvfile_list = df.deconv_filename.values
        suffix_list = df.suffix.values
        write_merged_deconvolution(input_deconvfile_list, suffix_list, f"{args.output_dir}/merged_deconvolution.tsv")
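
# Example bamlistfile (minimal sketch; paths are hypothetical). It is a tab-separated
# table without a header, and the fourth column is optional:
#
#   /data/sampleA/possorted_genome_bam.bam <TAB> A <TAB> /data/sampleA/cellranger/outs <TAB> /data/sampleA/deconv.tsv
#   /data/sampleB/possorted_genome_bam.bam <TAB> B <TAB> /data/sampleB/cellranger/outs <TAB> /data/sampleB/deconv.tsv
#
# Barcodes from sample A become "<barcode>_A" in the merged BAM's CB tags, matching the
# suffixed barcode list that process_snps_merged.sh builds.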
--------------------------------------------------------------------------------
/utils/process_snps.sh:
--------------------------------------------------------------------------------
#!/bin/bash

##### input and output data paths #####
# SAMPLE_ID is used for setting directory/file name
SAMPLE_ID="58408_Primary"
CELLRANGER_OUT="/u/congma/ragr-data/users/congma/Datasets/MM/58408_Primary/cellranger/outs/"
BAMFILE="/u/congma/ragr-data/users/congma/Datasets/MM/58408_Primary/scRNA.unsorted.58408_Primary.bam"
OUTDIR="/u/congma/ragr-data/users/congma/Datasets/MM/58408_Primary/numbatprep/"

NTHREADS=20

##### reference file paths #####
# PHASING_PANEL is downloaded as instructed in numbat "1000G Reference Panel" and then unzipped. Link to download: wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
PHASING_PANEL="/u/congma/ragr-data/users/congma/references/phasing_ref/1000G_hg38/"
# REGION_VCF serves the same purpose as the "1000G SNP reference file" in numbat, but uses a larger SNP set. Link to download: wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz
REGION_VCF="/u/congma/ragr-data/users/congma/references/snplist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz"
# HGTABLE_FILE specifies gene positions in the genome, for mapping SNPs to genes. Link to download: https://github.com/raphael-group/STARCH/blob/develop/hgTables_hg38_gencode.txt
HGTABLE_FILE="/u/congma/ragr-data/users/congma/Codes/STARCH_crazydev/hgTables_hg38_gencode.txt"
# the Eagle folder ships the genetic map reference file used below
eagledir="/u/congma/ragr-data/users/congma/environments/Eagle_v2.4.1/"


##### Following are commands for calling + phasing + processing SNPs #####
# index bam file
if [[ ! -e ${BAMFILE}.bai ]]; then
    samtools index ${BAMFILE}
fi
# write required barcode list file
mkdir -p ${OUTDIR}
gunzip -c ${CELLRANGER_OUT}/filtered_feature_bc_matrix/barcodes.tsv.gz > ${OUTDIR}/barcodes.txt

# run cellsnp-lite
mkdir -p ${OUTDIR}/pileup/${SAMPLE_ID}
cellsnp-lite -s ${BAMFILE} \
             -b ${OUTDIR}/barcodes.txt \
             -O ${OUTDIR}/pileup/${SAMPLE_ID} \
             -R ${REGION_VCF} \
             -p ${NTHREADS} \
             --minMAF 0 --minCOUNT 2 --UMItag Auto --cellTAG CB

# run phasing
mkdir -p ${OUTDIR}/phasing/
SCRIPTDIR=$(dirname "$0")
# filter_snps_forphasing.py takes the cellsnp-lite output dir (-c) and writes per-chromosome VCFs chr{1..22}.vcf to the phasing dir (-o)
python ${SCRIPTDIR}/filter_snps_forphasing.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -o ${OUTDIR}/phasing
for chr in {1..22}; do
    bgzip -f ${OUTDIR}/phasing/chr${chr}.vcf
    tabix ${OUTDIR}/phasing/chr${chr}.vcf.gz
    eagle --numThreads ${NTHREADS} \
          --vcfTarget ${OUTDIR}/phasing/chr${chr}.vcf.gz \
          --vcfRef ${PHASING_PANEL}/chr${chr}.genotypes.bcf \
          --geneticMapFile=${eagledir}/tables/genetic_map_hg38_withX.txt.gz \
          --outPrefix ${OUTDIR}/phasing/chr${chr}.phased
done


# run the python script to get the cell-by-SNP matrices of SNP-covering UMI counts
SCRIPTDIR=$(dirname "$0")
python ${SCRIPTDIR}/get_snp_matrix.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -e ${OUTDIR}/phasing -b ${OUTDIR}/barcodes.txt -o ${OUTDIR}
--------------------------------------------------------------------------------
/utils/process_snps_merged.sh:
--------------------------------------------------------------------------------
#!/bin/bash

##### input and output data paths #####
# SAMPLE_ID is used for setting directory/file name
SAMPLE_ID="joint_H1_245_H2_1"
INPUTLIST="/u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_snps/joint_H1_245_H2_1/bamfile_list.tsv"
BAMFILE="/u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_snps/joint_H1_245_H2_1/possorted_genome_bam.bam"
OUTDIR="/u/congma/ragr-data/datasets/spatial_cna/Lundeberg_organwide/P1_snps/joint_H1_245_H2_1/visium_snpnew"

NTHREADS=20

##### reference file paths #####
# PHASING_PANEL is downloaded as instructed in numbat "1000G Reference Panel" and then unzipped. Link to download: wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
PHASING_PANEL="/u/congma/ragr-data/users/congma/references/phasing_ref/1000G_hg38/"
# REGION_VCF serves the same purpose as the "1000G SNP reference file" in numbat, but uses a larger SNP set. Link to download: wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz
REGION_VCF="/u/congma/ragr-data/users/congma/references/snplist/nocpg.genome1K.phase3.SNP_AF5e4.chr1toX.hg38.vcf.gz"
# HGTABLE_FILE specifies gene positions in the genome, for mapping SNPs to genes. Link to download: https://github.com/raphael-group/STARCH/blob/develop/hgTables_hg38_gencode.txt
HGTABLE_FILE="/u/congma/ragr-data/users/congma/Codes/STARCH_crazydev/hgTables_hg38_gencode.txt"
# the Eagle folder ships the genetic map reference file used below
eagledir="/u/congma/ragr-data/users/congma/environments/Eagle_v2.4.1/"


##### Following are commands for calling + phasing + processing SNPs #####
# index bam file
if [[ ! -e ${BAMFILE}.bai ]]; then
    samtools index ${BAMFILE}
fi
# write required barcode list file
mkdir -p ${OUTDIR}
touch ${OUTDIR}/barcodes.txt
>${OUTDIR}/barcodes.txt
while read -r line; do
    CELLRANGER_OUT=$(echo ${line} | awk '{print $3}')
    suffix=$(echo ${line} | awk '{print $2}')
    gunzip -c ${CELLRANGER_OUT}/filtered_feature_bc_matrix/barcodes.tsv.gz | awk -v var=${suffix} '{print $0"_"var}' >> ${OUTDIR}/barcodes.txt
done < ${INPUTLIST}

# run cellsnp-lite
mkdir -p ${OUTDIR}/pileup/${SAMPLE_ID}
cellsnp-lite -s ${BAMFILE} \
             -b ${OUTDIR}/barcodes.txt \
             -O ${OUTDIR}/pileup/${SAMPLE_ID} \
             -R ${REGION_VCF} \
             -p ${NTHREADS} \
             --minMAF 0 --minCOUNT 2 --UMItag Auto --cellTAG CB

# run phasing
mkdir -p ${OUTDIR}/phasing/
SCRIPTDIR="/u/congma/ragr-data/users/congma/Codes/STARCH_crazydev/scripts"
# filter_snps_forphasing.py takes the cellsnp-lite output dir (-c) and writes per-chromosome VCFs chr{1..22}.vcf to the phasing dir (-o)
python ${SCRIPTDIR}/filter_snps_forphasing.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -o ${OUTDIR}/phasing
for chr in {1..22}; do
    bgzip -f ${OUTDIR}/phasing/chr${chr}.vcf
    tabix ${OUTDIR}/phasing/chr${chr}.vcf.gz
    ${eagledir}/eagle --numThreads ${NTHREADS} \
          --vcfTarget ${OUTDIR}/phasing/chr${chr}.vcf.gz \
          --vcfRef ${PHASING_PANEL}/chr${chr}.genotypes.bcf \
          --geneticMapFile=${eagledir}/tables/genetic_map_hg38_withX.txt.gz \
          --outPrefix ${OUTDIR}/phasing/chr${chr}.phased
done


# run the python script to get the cell-by-SNP matrices of SNP-covering UMI counts
#SCRIPTDIR=$(dirname "$0")
python ${SCRIPTDIR}/get_snp_matrix.py -c ${OUTDIR}/pileup/${SAMPLE_ID} -e ${OUTDIR}/phasing -b ${OUTDIR}/barcodes.txt -o ${OUTDIR}
--------------------------------------------------------------------------------
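
As a closing usage sketch (paths hypothetical), the matrices written by get_snp_matrix.py can be loaded downstream like this:

import numpy as np
import scipy.sparse

outdir = "/path/to/OUTDIR"  # hypothetical; the -o argument passed to get_snp_matrix.py
cell_snp_Aallele = scipy.sparse.load_npz(f"{outdir}/cell_snp_Aallele.npz")  # spots x SNPs, haplotype-1 UMI counts
cell_snp_Ballele = scipy.sparse.load_npz(f"{outdir}/cell_snp_Ballele.npz")  # spots x SNPs, haplotype-2 UMI counts
unique_snp_ids = np.load(f"{outdir}/unique_snp_ids.npy", allow_pickle=True)

# pseudobulk B-allele frequency per SNP
tot = np.asarray((cell_snp_Aallele + cell_snp_Ballele).sum(axis=0)).ravel()
baf = np.asarray(cell_snp_Ballele.sum(axis=0)).ravel() / np.maximum(tot, 1)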