├── .github └── workflows │ ├── python-package-release.yml │ └── python-package.yml ├── .gitignore ├── LICENSE ├── README.md ├── scanpy-scripts-tests.bats ├── scanpy_scripts ├── __init__.py ├── cli.py ├── click_utils.py ├── cmd_options.py ├── cmd_utils.py ├── cmds.py ├── lib │ ├── __init__.py │ ├── _bbknn.py │ ├── _combat.py │ ├── _diffexp.py │ ├── _diffmap.py │ ├── _dpt.py │ ├── _fdg.py │ ├── _filter.py │ ├── _hvg.py │ ├── _leiden.py │ ├── _louvain.py │ ├── _mnn.py │ ├── _neighbors.py │ ├── _norm.py │ ├── _paga.py │ ├── _pca.py │ ├── _read.py │ ├── _scrublet.py │ ├── _tsne.py │ └── _umap.py └── obj_utils.py ├── setup.py └── test-env.yaml /.github/workflows/python-package-release.yml: -------------------------------------------------------------------------------- 1 | name: Python package release 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Python 3 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.x' 20 | 21 | - name: Install dependencies 22 | run: | 23 | pip install -U setuptools>=40.1 24 | 25 | - name: Build dists 26 | run: | 27 | pip install wheel 28 | python setup.py sdist bdist_wheel 29 | 30 | - name: Publish to PyPI 31 | uses: pypa/gh-action-pypi-publish@master 32 | with: 33 | user: __token__ 34 | password: ${{ secrets.pypi_password }} 35 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [pull_request] 4 | 5 | defaults: 6 | run: 7 | # for conda env activation 8 | shell: bash -l {0} 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - name: Setup mamba 22 | uses: mamba-org/setup-micromamba@v2 23 
| with: 24 | environment-file: test-env.yaml 25 | cache-downloads: true 26 | condarc: | 27 | channels: 28 | - conda-forge 29 | - bioconda 30 | - defaults 31 | create-args: | 32 | python=${{ matrix.python-version }} 33 | 34 | - name: Run black manually 35 | run: | 36 | black --check --verbose ./ 37 | 38 | # - name: Install dependencies 39 | # run: | 40 | # sudo apt-get install libhdf5-dev 41 | # pip install -U setuptools>=40.1 wheel 'cmake<3.20' pytest 42 | # pip install $(pwd)/scanpy-scripts 43 | # # python -m pip install $(pwd)/scanpy --no-deps --ignore-installed -vv 44 | 45 | - name: Run unit tests 46 | run: | 47 | # needed for __version__ to be available 48 | pip install . --no-deps --ignore-installed 49 | pytest --doctest-modules -v ./ 50 | 51 | - name: Test with bats 52 | run: | 53 | ./scanpy-scripts-tests.bats 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /build/ 3 | /dist/ 4 | /*.egg-info/ 5 | /post_install_tests/ 6 | *cache*/ 7 | *.pyc 8 | /.*history 9 | /.*swp 10 | data 11 | compressed 12 | uncompressed 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scanpy-scripts [![Anaconda-Server Badge](https://anaconda.org/bioconda/scanpy-scripts/badges/installer/conda.svg)](https://anaconda.org/bioconda/scanpy-scripts) 2 | 3 | A command-line interface for functions of the Scanpy suite, to facilitate flexible construction of workflows, for example in Galaxy, Nextflow, Snakemake etc. 4 | 5 | ## Install 6 | 7 | The recommended way of using this package is through the latest container produced by Bioconda [here](https://quay.io/repository/biocontainers/scanpy-scripts?tab=tags). If you must, one can install scanpy-scripts via conda: 8 | 9 | ```bash 10 | conda install scanpy-scripts 11 | ``` 12 | 13 | pip installation is also possible, however the version of mnnpy is not patched as in the conda version, and so the `integrate` command will not work. 14 | 15 | ```bash 16 | pip install scanpy-scripts 17 | ``` 18 | 19 | For development installation, we suggest following the github actions python-package.yml file. 
20 | 21 | Currently, tests run on python 3.9, so those are the recommended versions if not installing via conda. BKNN doesn't currently install on Python 3.10 due to a skip in Bioconda. 22 | 23 | ## Test installation 24 | 25 | There is an example script included: 26 | 27 | ```bash 28 | scanpy-scripts-tests.bats 29 | ``` 30 | 31 | This requires the [bats](https://github.com/sstephenson/bats) testing framework to run. The script downloads [a well-known test 10X dataset]('https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz) and executes all of the commands described below. 32 | 33 | ## Commands 34 | 35 | Available commands are described below. Each has usage instructions available via `--help`, consult function documentation in scanpy for further details. 36 | 37 | ``` 38 | Usage: scanpy-cli [OPTIONS] COMMAND [ARGS]... 39 | 40 | Command line interface to [scanpy](https://github.com/theislab/scanpy) 41 | 42 | Options: 43 | --debug Print debug information 44 | --verbosity INTEGER Set scanpy verbosity 45 | --njobs INTEGER Set scanpy default number of jobs/CPUs, defaults to 1 46 | --version Show the version and exit. 47 | --help Show this message and exit. 48 | 49 | Commands: 50 | read Read 10x data and save in specified format. 51 | filter Filter data based on specified conditions. 52 | norm Normalise data per cell. 53 | hvg Find highly variable genes. 54 | scale Scale data per gene. 55 | regress Regress-out observation variables. 56 | pca Dimensionality reduction by PCA. 57 | neighbor Compute a neighbourhood graph of observations. 58 | embed Embed cells into two-dimensional space. 59 | cluster Cluster cells into sub-populations. 60 | diffexp Find markers for each clusters. 61 | paga Trajectory inference by abstract graph analysis. 62 | dpt Calculate diffusion pseudotime relative to the root cells. 63 | integrate Integrate cells from different experimental batches. 64 | multiplet Execute methods for multiplet removal. 
65 | plot Visualise data. 66 | ``` 67 | 68 | ## Versioning 69 | 70 | Major and major versions will follow the scanpy versions. The first digit of the patch should follow the scanpy patch version as well, subsequent digits in the patch are reserved for changes in this repository. 71 | -------------------------------------------------------------------------------- /scanpy-scripts-tests.bats: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bats 2 | 3 | # Extract the test data 4 | setup() { 5 | scanpy="scanpy-cli" 6 | test_dir="post_install_tests" 7 | data_dir="${test_dir}/data" 8 | output_dir="${test_dir}/outputs" 9 | raw_matrix="${data_dir}/matrix.mtx" 10 | raw_matrix_from_raw="${data_dir}/raw/matrix.mtx" 11 | raw_matrix_from_layer="${data_dir}/layer/matrix.mtx" 12 | singlet_obs="${data_dir}/singlet_obs.txt" 13 | batch_obs="${data_dir}/batch_obs.txt" 14 | read_opt="-x $data_dir --show-obj stdout" 15 | read_obj="${output_dir}/read.h5ad" 16 | filter_opt="--save-raw -p n_genes 200 2500 -p c:n_counts 0 50000 -p n_cells 3 inf -p pct_counts_mito 0 0.2 -c mito '!True' --show-obj stdout --export-mtx ${output_dir}/filtered --mtx-compression gzip" 17 | filter_obj="${output_dir}/filter.h5ad" 18 | filter_mtx_gz="${output_dir}/filtered_matrix.mtx.gz" 19 | test_clustering='louvain_k10_r0_5' 20 | scrublet_tsv="${output_dir}/scrublet.tsv" 21 | scrublet_png="${output_dir}/scrublet.png" 22 | scrublet_obj="${output_dir}/scrublet.h5ad" 23 | scrublet_batched_obj="${output_dir}/scrublet_batched.h5ad" 24 | scrublet_simulate_obj="${output_dir}/scrublet_simulate.h5ad" 25 | scrublet_opt="--input-obj-sim ${scrublet_simulate_obj} --filter --export-table ${scrublet_tsv}" 26 | scrublet_batched_opt="--filter --batch-key batch" 27 | norm_mtx="${output_dir}/norm" 28 | norm_opt="--save-layer filtered -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout" 29 | norm_obj="${output_dir}/norm.h5ad" 30 | hvg_opt="-m 0.0125 3 -d 0.5 inf -s --show-obj 
stdout" 31 | always_hvg="${data_dir}/always_hvg.txt" 32 | never_hvg="${data_dir}/never_hvg.txt" 33 | hvg_opt_always_never="--always-hv-genes-file ${always_hvg} --never-hv-genes-file ${never_hvg}" 34 | hvg_obj="${output_dir}/hvg.h5ad" 35 | hvg_obj_on_off="${output_dir}/hvg_on_off.h5ad" 36 | regress_opt="-k n_counts --show-obj stdout" 37 | regress_obj="${output_dir}/regress.h5ad" 38 | scale_opt="--save-layer normalised -m 10 --show-obj stdout" 39 | scale_obj="${output_dir}/scale.h5ad" 40 | pca_embed="${output_dir}/pca.tsv" 41 | pca_opt="--n-comps 50 -V auto --show-obj stdout -E ${pca_embed}" 42 | pca_obj="${output_dir}/pca.h5ad" 43 | neighbor_opt="-k 5,10,20 -n 25 -m umap -t euclidean --show-obj stdout" 44 | neighbor_obj="${output_dir}/neighbor.h5ad" 45 | tsne_embed="${output_dir}/tsne.tsv" 46 | tsne_opt="-n 25 --use-rep X_pca --learning-rate 200 -E ${tsne_embed}" 47 | tsne_obj="${output_dir}/tsne.h5ad" 48 | umap_embed="${output_dir}/umap.tsv" 49 | umap_opt="--neighbors-key k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}" 50 | umap_obj="${output_dir}/umap.h5ad" 51 | fdg_embed="${output_dir}/fdg.tsv" 52 | fdg_opt="--neighbors-key k10 --layout fr -E ${fdg_embed} --init-pos paga" 53 | fdg_obj="${output_dir}/fdg.h5ad" 54 | louvain_tsv="${output_dir}/louvain.tsv" 55 | louvain_opt="-r 0.1,0.5,1 --neighbors-key k10 --key-added k10 --export-cluster ${louvain_tsv}" 56 | louvain_obj="${output_dir}/louvain.h5ad" 57 | leiden_tsv="${output_dir}/leiden.tsv" 58 | leiden_opt="-r 0.3,0.7 --neighbors-key k10 --key-added k10 -F loom --loom-write-obsm-varm --export-cluster ${leiden_tsv}" 59 | leiden_obj="${output_dir}/leiden.loom" 60 | diffexp_tsv="${output_dir}/diffexp.tsv" 61 | diffexp_opt="-g ${test_clustering} --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv}" 62 | diffexp_obj="${output_dir}/diffexp.h5ad" 63 | test_singlet_clustering='groupby_with_singlet' 64 | diffexp_singlet_tsv="${output_dir}/diffexp_singlet.tsv" 65 | 
diffexp_singlet_opt="-g ${test_singlet_clustering} --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_singlet_tsv}" 66 | diffexp_singlet_obj="${output_dir}/diffexp_singlet.h5ad" 67 | paga_opt="--neighbors-key k10 --key-added ${test_clustering} --groups ${test_clustering} --model v1.2" 68 | paga_obj="${output_dir}/paga.h5ad" 69 | diffmap_embed="${output_dir}/diffmap.tsv" 70 | diffmap_opt="--neighbors-key k10 --n-comps 10 -E ${diffmap_embed}" 71 | diffmap_obj="${output_dir}/diffmap.h5ad" 72 | dpt_opt="--neighbors-key k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root ${test_clustering} 0" 73 | dpt_obj="${output_dir}/dpt.h5ad" 74 | plt_embed_opt="--projection 2d --color ${test_clustering} --title test" 75 | plt_embed_pdf="${output_dir}/umap_${test_clustering}.pdf" 76 | plt_paga_pdf="${output_dir}/paga_k10_r0_7.pdf" 77 | plt_paga_obj="${output_dir}/paga_k10_r0_7.h5ad" 78 | plt_paga_opt="--use-key paga_${test_clustering} --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff --output-obj $plt_paga_obj" 79 | test_markers='LDHB,CD3D,CD3E' 80 | diffexp_plot_opt="--var-names $test_markers --use-raw --dendrogram --groupby ${test_clustering}" 81 | plt_stacked_violin_opt="${diffexp_plot_opt} --no-jitter --swap-axes" 82 | plt_stacked_violin_pdf="${output_dir}/sviolin_${test_clustering}_LDHB_CD3D_CD3E.pdf" 83 | plt_dotplot_pdf="${output_dir}/dot_${test_clustering}_LDHB_CD3D_CD3E.pdf" 84 | plt_matrixplot_pdf="${output_dir}/matrix_${test_clustering}_LDHB_CD3D_CD3E.pdf" 85 | plt_heatmap_pdf="${output_dir}/heatmap_${test_clustering}_LDHB_CD3D_CD3E.pdf" 86 | plt_rank_genes_groups_opt="--rgg --groups 3,4" 87 | plt_rank_genes_groups_singlet_opt="--rgg" 88 | plt_rank_genes_groups_stacked_violin_pdf="${output_dir}/rggsviolin_${test_clustering}.pdf" 89 | plt_rank_genes_groups_matrix_pdf="${output_dir}/rggmatrix_${test_clustering}.pdf" 90 | 
plt_rank_genes_groups_dot_pdf="${output_dir}/rggdot_${test_clustering}.pdf" 91 | plt_rank_genes_groups_dot_singlet_pdf="${output_dir}/rggdot_${test_singlet_clustering}.pdf" 92 | plt_rank_genes_groups_heatmap_pdf="${output_dir}/rggheatmap_${test_clustering}.pdf" 93 | harmony_integrate_obj="${output_dir}/harmony_integrate.h5ad" 94 | harmony_integrate_opt="--batch-key ${test_clustering}" 95 | harmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings after harmony' --basis 'X_pca_harmony'" 96 | noharmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings before harmony' --basis 'X_pca'" 97 | harmony_integrated_pca_pdf="${output_dir}/harmony_pca_${test_clustering}.pdf" 98 | noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf" 99 | bbknn_obj="${output_dir}/bbknn.h5ad" 100 | bbknn_opt="--batch-key ${test_clustering} --key-added bbknn" 101 | mnn_obj="${output_dir}/mnn.h5ad" 102 | mnn_opt="--save-layer uncorrected --batch-key ${test_clustering}" 103 | combat_obj="${output_dir}/combat.h5ad" 104 | combat_opt="--batch-key ${test_clustering}" 105 | 106 | 107 | if [ ! -d "$data_dir" ]; then 108 | mkdir -p $data_dir 109 | fi 110 | 111 | if [ ! 
-d "$output_dir" ]; then 112 | mkdir -p $output_dir 113 | fi 114 | } 115 | 116 | @test "Extract test data from Scanpy" { 117 | if [ "$resume" = 'true' ] && [ -f "$raw_matrix" ]; then 118 | skip "$raw_matrix exists" 119 | fi 120 | 121 | run rm -rf ${data_dir}/* && eval "echo -e \"import scanpy as sc\nfrom scanpy_scripts.cmd_utils import write_mtx\nimport os\nos.makedirs('$data_dir', exist_ok=True)\nwrite_mtx(sc.datasets.pbmc3k(), '$data_dir/')\" | python" 122 | 123 | [ "$status" -eq 0 ] 124 | [ -f "$raw_matrix" ] 125 | } 126 | 127 | @test "Test MTX write from .raw" { 128 | if [ "$resume" = 'true' ] && [ -f "$raw_matrix_from_raw" ]; then 129 | skip "$raw_matrix exists" 130 | fi 131 | 132 | run rm -rf ${data_dir}/raw/* && eval "echo -e \"import scanpy as sc\nfrom scanpy_scripts.cmd_utils import write_mtx\nimport os\nos.makedirs('$data_dir/raw', exist_ok=True)\nadata=sc.datasets.pbmc3k();adata.raw=adata\nwrite_mtx(adata, '$data_dir/raw/', use_raw=True)\" | python" 133 | 134 | [ "$status" -eq 0 ] 135 | [ -f "$raw_matrix_from_raw" ] 136 | } 137 | 138 | @test "Add genes to be considered HVGs" { 139 | if [ "$resume" = 'true' ] && [ -f "$always_hvg" ]; then 140 | skip "$always_hvg exists" 141 | fi 142 | 143 | run eval "echo -e 'MIR1302-10\nFAM138A' > $always_hvg" 144 | } 145 | 146 | @test "Add genes not to be considered HVGs" { 147 | if [ "$resume" = 'true' ] && [ -f "$never_hvg" ]; then 148 | skip "$never_hvg exists" 149 | fi 150 | 151 | run eval "echo -e 'ISG15\nTNFRSF4' > $never_hvg" 152 | } 153 | 154 | @test "Test MTX write from layers" { 155 | if [ "$resume" = 'true' ] && [ -f "$raw_matrix_from_layer" ]; then 156 | skip "$raw_matrix exists" 157 | fi 158 | 159 | run rm -rf ${data_dir}/layer/* && eval "echo -e \"import scanpy as sc\nfrom scanpy_scripts.cmd_utils import write_mtx\nimport os\nos.makedirs('$data_dir/layer', exist_ok=True)\nadata=sc.datasets.pbmc3k();adata.layers['test']=adata.X\nwrite_mtx(adata, '$data_dir/layer/', use_layer='test')\" | python" 160 | 161 | 
[ "$status" -eq 0 ] 162 | [ -f "$raw_matrix_from_layer" ] 163 | } 164 | 165 | @test "Make .obs with a singlet cell group" { 166 | 167 | if [ "$resume" = 'true' ] && [ -f "$singlet_obs" ]; then 168 | skip "$singlet_obs exists" 169 | fi 170 | 171 | run rm -rf $singlet_obs && eval "echo -e \"index\tgroupby_with_singlet\" > $singlet_obs && head -n 1 $data_dir/barcodes.tsv | awk -v cluster='cluster1' '{print \$1\"\t\"cluster}' >> $singlet_obs && sed -n '2,100p;101q' $data_dir/barcodes.tsv | awk -v cluster='cluster3' '{print \$1\"\t\"cluster}' >> $singlet_obs && tail -n +101 $data_dir/barcodes.tsv | awk -v cluster='cluster2' '{print \$1\"\t\"cluster}' >> $singlet_obs" 172 | 173 | [ "$status" -eq 0 ] 174 | [ -f "$singlet_obs" ] 175 | } 176 | 177 | @test "Make a batch variable" { 178 | 179 | if [ "$resume" = 'true' ] && [ -f "$batch_obs" ]; then 180 | skip "$singlet_obs exists" 181 | fi 182 | 183 | run rm -rf $batch_obs && echo -e "batch\n$(printf "%0.sbatch1\n" {1..1350})\n$(printf "%0.sbatch2\n" {1..1350})" > $batch_obs 184 | 185 | [ "$status" -eq 0 ] 186 | [ -f "$batch_obs" ] 187 | } 188 | 189 | # Read 10x dataset 190 | 191 | @test "Scanpy object creation from 10x" { 192 | if [ "$resume" = 'true' ] && [ -f "$read_obj" ]; then 193 | skip "$read_obj exists and resume is set to 'true'" 194 | fi 195 | 196 | run rm -f $read_obj && eval "paste -d $'\t' $singlet_obs $batch_obs > obs.txt && $scanpy read --extra-obs obs.txt $read_opt $read_obj" 197 | 198 | [ "$status" -eq 0 ] 199 | [ -f "$read_obj" ] 200 | } 201 | 202 | # Filter 203 | 204 | @test "Filter cells and genes from a raw object" { 205 | if [ "$resume" = 'true' ] && [ -f "$filter_obj" ]; then 206 | skip "$filter_obj exists and resume is set to 'true'" 207 | fi 208 | 209 | run rm -f $filter_obj && eval "$scanpy filter $filter_opt $read_obj $filter_obj" 210 | 211 | [ "$status" -eq 0 ] 212 | [ -f "$filter_obj" ] 213 | [ -f "$filter_mtx_gz" ] 214 | } 215 | 216 | # Normalise 217 | 218 | @test "Normalise expression values per 
cell" { 219 | if [ "$resume" = 'true' ] && [ -f "$norm_obj" ]; then 220 | skip "$norm_obj exists and resume is set to 'true'" 221 | fi 222 | 223 | run rm -f $norm_obj && eval "$scanpy norm $norm_opt $filter_obj $norm_obj" 224 | 225 | [ "$status" -eq 0 ] 226 | [ -f "$norm_obj" ] && [ -f "${norm_mtx}_matrix.mtx" ] 227 | } 228 | 229 | # Find variable genes 230 | 231 | @test "Find variable genes" { 232 | if [ "$resume" = 'true' ] && [ -f "$hvg_obj" ]; then 233 | skip "$hvg_obj exists and resume is set to 'true'" 234 | fi 235 | 236 | run rm -f $hvg_obj $hvg_obj && eval "$scanpy hvg $hvg_opt $norm_obj $hvg_obj" 237 | 238 | [ "$status" -eq 0 ] 239 | [ -f "$hvg_obj" ] 240 | } 241 | 242 | @test "Find variable genes with optional turn on/off lists" { 243 | if [ "$resume" = 'true' ] && [ -f "$hvg_obj_on_off" ]; then 244 | skip "$hvg_obj_on_off exists and resume is set to 'true'" 245 | fi 246 | 247 | run rm -f $hvg_obj_on_off && eval "$scanpy hvg $hvg_opt_always_never $norm_obj $hvg_obj_on_off" 248 | } 249 | 250 | # Do separate doublet simulation step (normally we'd just let the main scrublet 251 | # process do this). 
252 | 253 | @test "Run Scrublet doublet simulation" { 254 | if [ "$resume" = 'true' ] && [ -f "$scrublet_simulate_obj" ]; then 255 | skip "$scrublet_simulate_obj exists and resume is set to 'true'" 256 | fi 257 | 258 | run rm -f $scrublet_simulate_obj && eval "$scanpy multiplet scrublet_simulate_doublets $hvg_obj $scrublet_simulate_obj" 259 | 260 | [ "$status" -eq 0 ] 261 | [ -f "$scrublet_simulate_obj" ] 262 | } 263 | 264 | # Detect multiplets with Scrublet 265 | 266 | @test "Run Scrublet for multiplet detection" { 267 | if [ "$resume" = 'true' ] && [ -f "$scrublet_obj" ]; then 268 | skip "$scrublet_obj exists and resume is set to 'true'" 269 | fi 270 | 271 | run rm -f $scrublet_obj && eval "$scanpy multiplet scrublet $scrublet_opt $hvg_obj $scrublet_obj" 272 | 273 | [ "$status" -eq 0 ] 274 | [ -f "$scrublet_obj" ] && [ -f "$scrublet_tsv" ] 275 | } 276 | 277 | # Run the doublet plot from Scrublet 278 | 279 | @test "Run Scrublet score distribution plot" { 280 | if [ "$resume" = 'true' ] && [ -f "$scrublet_png" ]; then 281 | skip "$scrublet_png exists and resume is set to 'true'" 282 | fi 283 | 284 | run rm -f $scrublet_png && eval "$scanpy plot scrublet $scrublet_obj $scrublet_png" 285 | 286 | [ "$status" -eq 0 ] 287 | [ -f "$scrublet_png" ] 288 | } 289 | 290 | # Detect multiplets with Scrublet (batched) 291 | 292 | @test "Run Scrublet for multiplet detection (batched)" { 293 | if [ "$resume" = 'true' ] && [ -f "$scrublet_batched_obj" ]; then 294 | skip "$scrublet_batched_obj exists and resume is set to 'true'" 295 | fi 296 | 297 | run rm -f $scrublet_batched_obj && eval "$scanpy multiplet scrublet $scrublet_batched_opt $read_obj $scrublet_batched_obj" 298 | 299 | [ "$status" -eq 0 ] 300 | [ -f "$scrublet_batched_obj" ] 301 | } 302 | 303 | 304 | # Regress out variables 305 | 306 | @test "Regress out unwanted variable" { 307 | if [ "$resume" = 'true' ] && [ -f "$regress_obj" ]; then 308 | skip "$regress_obj exists and resume is set to 'true'" 309 | fi 310 | 311 | run 
rm -f $regress_obj && eval "$scanpy regress $regress_opt $hvg_obj $regress_obj" 312 | 313 | [ "$status" -eq 0 ] 314 | [ -f "$regress_obj" ] 315 | } 316 | 317 | # Scale expression values 318 | 319 | @test "Scale expression values" { 320 | if [ "$resume" = 'true' ] && [ -f "$scale_obj" ]; then 321 | skip "$scale_obj exists and resume is set to 'true'" 322 | fi 323 | 324 | run rm -f $scale_obj && eval "$scanpy scale $scale_opt $hvg_obj $scale_obj" 325 | 326 | [ "$status" -eq 0 ] 327 | [ -f "$scale_obj" ] 328 | } 329 | 330 | # Run PCA 331 | 332 | @test "Run principal component analysis" { 333 | if [ "$resume" = 'true' ] && [ -f "$pca_obj" ]; then 334 | skip "$pca_obj exists and resume is set to 'true'" 335 | fi 336 | 337 | run rm -f $pca_obj && eval "$scanpy pca $pca_opt $scale_obj $pca_obj" 338 | 339 | [ "$status" -eq 0 ] 340 | [ -f "$pca_obj" ] 341 | } 342 | 343 | # Compute graph 344 | 345 | @test "Run compute neighbor graph" { 346 | if [ "$resume" = 'true' ] && [ -f "$neighbor_obj" ]; then 347 | skip "$neighbor_obj exists and resume is set to 'true'" 348 | fi 349 | 350 | run rm -f $neighbor_obj && eval "$scanpy neighbor $neighbor_opt $pca_obj $neighbor_obj" 351 | 352 | [ "$status" -eq 0 ] 353 | [ -f "$neighbor_obj" ] 354 | } 355 | 356 | # Run TSNE 357 | 358 | @test "Run TSNE analysis" { 359 | if [ "$resume" = 'true' ] && [ -f "$tsne_obj" ]; then 360 | skip "$tsne_obj exists and resume is set to 'true'" 361 | fi 362 | 363 | run rm -f $tsne_obj && eval "$scanpy embed tsne $tsne_opt $pca_obj $tsne_obj" 364 | 365 | [ "$status" -eq 0 ] 366 | [ -f "$tsne_obj" ] && [ -f "$tsne_embed" ] 367 | } 368 | 369 | # Run UMAP 370 | 371 | @test "Run UMAP analysis" { 372 | if [ "$resume" = 'true' ] && [ -f "$umap_obj" ]; then 373 | skip "$umap_obj exists and resume is set to 'true'" 374 | fi 375 | 376 | run rm -f $umap_obj && eval "$scanpy embed umap $umap_opt $neighbor_obj $umap_obj" 377 | 378 | [ "$status" -eq 0 ] 379 | [ -f "$umap_obj" ] && [ -f "$umap_embed" ] 380 | } 381 | 382 | 
# Find clusters Louvain 383 | 384 | @test "Run find cluster (louvain)" { 385 | if [ "$resume" = 'true' ] && [ -f "$louvain_obj" ]; then 386 | skip "$louvain_obj exists and resume is set to 'true'" 387 | fi 388 | 389 | run rm -f $louvain_obj && eval "$scanpy cluster louvain $louvain_opt $umap_obj $louvain_obj" 390 | 391 | [ "$status" -eq 0 ] 392 | [ -f "$louvain_obj" ] && [ -f "$louvain_tsv" ] 393 | } 394 | 395 | # Find clusters Leiden 396 | 397 | @test "Run find cluster (leiden)" { 398 | if [ "$resume" = 'true' ] && [ -f "$leiden_obj" ]; then 399 | skip "$leiden_obj exists and resume is set to 'true'" 400 | fi 401 | 402 | run rm -f $leiden_obj && eval "$scanpy cluster leiden $leiden_opt $umap_obj $leiden_obj" 403 | 404 | [ "$status" -eq 0 ] 405 | [ -f "$leiden_obj" ] && [ -f "$leiden_tsv" ] 406 | } 407 | 408 | # Find markers 409 | 410 | @test "Run find markers" { 411 | if [ "$resume" = 'true' ] && [ -f "$diffexp_obj" ]; then 412 | skip "$diffexp_obj exists and resume is set to 'true'" 413 | fi 414 | 415 | run rm -f $diffexp_obj $diffexp_tsv && eval "$scanpy diffexp $diffexp_opt $louvain_obj $diffexp_obj" 416 | 417 | [ "$status" -eq 0 ] 418 | [ -f "$diffexp_obj" ] && [ -f "$diffexp_tsv" ] 419 | } 420 | 421 | # Find markers, with singlet group 422 | 423 | @test "Run find markers, with singlet group ignored" { 424 | if [ "$resume" = 'true' ] && [ -f "$diffexp_singlet_obj" ]; then 425 | skip "$diffexp_singlet_obj exists and resume is set to 'true'" 426 | fi 427 | 428 | run rm -f $diffexp_singlet_obj $diffexp_singlet_tsv && eval "$scanpy diffexp $diffexp_singlet_opt $louvain_obj $diffexp_singlet_obj" 429 | 430 | [ "$status" -eq 0 ] 431 | [ -f "$diffexp_singlet_obj" ] && [ -f "$diffexp_singlet_tsv" ] 432 | } 433 | 434 | # Run PAGA 435 | 436 | @test "Run PAGA" { 437 | if [ "$resume" = 'true' ] && [ -f "$paga_obj" ]; then 438 | skip "$paga_obj exists and resume is set to 'true'" 439 | fi 440 | 441 | run rm -f $paga_obj && eval "$scanpy paga $paga_opt $louvain_obj 
$paga_obj" 442 | 443 | [ "$status" -eq 0 ] 444 | [ -f "$paga_obj" ] 445 | } 446 | 447 | # Run Diffmap 448 | 449 | @test "Run Diffmap" { 450 | if [ "$resume" = 'true' ] && [ -f "$diffmap_obj" ]; then 451 | skip "$diffmap_obj exists and resume is set to 'true'" 452 | fi 453 | 454 | run rm -f $diffmap_obj && eval "$scanpy embed diffmap $diffmap_opt $paga_obj $diffmap_obj" 455 | 456 | [ "$status" -eq 0 ] 457 | [ -f "$diffmap_obj" ] && [ -f "$diffmap_embed" ] 458 | } 459 | 460 | # Run DPT 461 | 462 | @test "Run DPT" { 463 | if [ "$resume" = 'true' ] && [ -f "$dpt_obj" ]; then 464 | skip "$dpt_obj exists and resume is set to 'true'" 465 | fi 466 | 467 | run rm -f $dpt_obj && eval "$scanpy dpt $dpt_opt $diffmap_obj $dpt_obj" 468 | 469 | [ "$status" -eq 0 ] 470 | [ -f "$dpt_obj" ] 471 | } 472 | 473 | # Run Plot embedding 474 | 475 | @test "Run Plot embedding" { 476 | if [ "$resume" = 'true' ] && [ -f "$plt_embed_pdf" ]; then 477 | skip "$plt_embed_pdf exists and resume is set to 'true'" 478 | fi 479 | 480 | run rm -f $plt_embed_pdf && eval "$scanpy plot embed $plt_embed_opt $louvain_obj $plt_embed_pdf" 481 | 482 | [ "$status" -eq 0 ] 483 | [ -f "$plt_embed_pdf" ] 484 | } 485 | 486 | # Run Plot paga 487 | 488 | @test "Run Plot trajectory" { 489 | if [ "$resume" = 'true' ] && [ -f "$plt_paga_pdf" ]; then 490 | skip "$plt_paga_pdf exists and resume is set to 'true'" 491 | fi 492 | 493 | run rm -f $plt_paga_pdf && eval "$scanpy plot paga $plt_paga_opt $dpt_obj $plt_paga_pdf" 494 | 495 | [ "$status" -eq 0 ] 496 | [ -f "$plt_paga_pdf" ] && [ -f "$plt_paga_obj" ] 497 | } 498 | 499 | # Run FDG, with initial coordinates from paga plotting 500 | 501 | @test "Run FDG analysis" { 502 | if [ "$resume" = 'true' ] && [ -f "$fdg_obj" ]; then 503 | skip "$fdg_obj exists and resume is set to 'true'" 504 | fi 505 | 506 | run rm -f $fdg_obj && eval "$scanpy embed fdg $fdg_opt $plt_paga_obj $fdg_obj" 507 | 508 | [ "$status" -eq 0 ] 509 | [ -f "$fdg_obj" ] && [ -f "$fdg_embed" ] 510 | } 511 | 
512 | 513 | # Plot a stacked violin plot for markers 514 | 515 | @test "Run Plot stacked violins" { 516 | if [ "$resume" = 'true' ] && [ -f "$plt_stacked_violin_pdf" ]; then 517 | skip "$plt_stacked_violin_pdf exists and resume is set to 'true'" 518 | fi 519 | 520 | run rm -f $plt_stacked_violin_pdf && eval "$scanpy plot sviol $plt_stacked_violin_opt $diffexp_obj $plt_stacked_violin_pdf" 521 | 522 | [ "$status" -eq 0 ] 523 | [ -f "$plt_stacked_violin_pdf" ] 524 | } 525 | 526 | # Plot ranking of genes using a stacked violin plot for markers 527 | 528 | @test "Run Plot ranking of genes using stacked_violin plot" { 529 | if [ "$resume" = 'true' ] && [ -f "$plt_rank_genes_groups_stacked_violin_pdf" ]; then 530 | skip "$plt_rank_genes_groups_stacked_violin_pdf exists and resume is set to 'true'" 531 | fi 532 | 533 | run rm -f $plt_rank_genes_groups_stacked_violin_pdf && eval "$scanpy plot sviol $plt_rank_genes_groups_opt $diffexp_obj $plt_rank_genes_groups_stacked_violin_pdf" 534 | 535 | [ "$status" -eq 0 ] 536 | [ -f "$plt_rank_genes_groups_stacked_violin_pdf" ] 537 | } 538 | 539 | # Plot a dot plot for markers 540 | 541 | @test "Run Plot dotplot" { 542 | if [ "$resume" = 'true' ] && [ -f "$plt_dotplot_pdf" ]; then 543 | skip "$plt_dotplot_pdf exists and resume is set to 'true'" 544 | fi 545 | 546 | run rm -f $plt_dotplot_pdf && eval "$scanpy plot dot $diffexp_plot_opt $diffexp_obj $plt_dotplot_pdf" 547 | 548 | [ "$status" -eq 0 ] 549 | [ -f "$plt_dotplot_pdf" ] 550 | } 551 | 552 | # Plot ranking of genes using a dot plot for markers 553 | 554 | @test "Run Plot ranking of genes using a dot plot" { 555 | if [ "$resume" = 'true' ] && [ -f "$plt_rank_genes_groups_dot_pdf" ]; then 556 | skip "$plt_rank_genes_groups_dot_pdf exists and resume is set to 'true'" 557 | fi 558 | 559 | run rm -f $plt_rank_genes_groups_dot_pdf && eval "$scanpy plot dot $plt_rank_genes_groups_opt $diffexp_obj $plt_rank_genes_groups_dot_pdf" 560 | 561 | [ "$status" -eq 0 ] 562 | [ -f 
"$plt_rank_genes_groups_dot_pdf" ] 563 | } 564 | 565 | # Plot ranking of genes using a dot plot for markers, high resolution clustering 566 | 567 | @test "Run Plot ranking of genes using a dot plot, high resolution clustering" { 568 | if [ "$resume" = 'true' ] && [ -f "$plt_rank_genes_groups_dot_singlet_pdf" ]; then 569 | skip "$plt_rank_genes_groups_dot_singlet_pdf exists and resume is set to 'true'" 570 | fi 571 | 572 | run rm -f $plt_rank_genes_groups_dot_singlet_pdf && eval "$scanpy plot dot $plt_rank_genes_groups_singlet_opt $diffexp_singlet_obj $plt_rank_genes_groups_dot_singlet_pdf" 573 | 574 | [ "$status" -eq 0 ] 575 | [ -f "$plt_rank_genes_groups_dot_singlet_pdf" ] 576 | } 577 | 578 | # Plot a matrix plot for markers 579 | 580 | @test "Run Plot matrix" { 581 | if [ "$resume" = 'true' ] && [ -f "$plt_matrixplot_pdf" ]; then 582 | skip "$plt_matrixplot_pdf exists and resume is set to 'true'" 583 | fi 584 | 585 | run rm -f $plt_matrixplot_pdf && eval "$scanpy plot matrix $diffexp_plot_opt $diffexp_obj $plt_matrixplot_pdf" 586 | 587 | [ "$status" -eq 0 ] 588 | [ -f "$plt_matrixplot_pdf" ] 589 | } 590 | 591 | # Plot ranking of genes using a matrix plot for markers 592 | 593 | @test "Run Plot ranking of genes using a matrix plot" { 594 | if [ "$resume" = 'true' ] && [ -f "$plt_rank_genes_groups_matrix_pdf" ]; then 595 | skip "$plt_rank_genes_groups_matrix_pdf exists and resume is set to 'true'" 596 | fi 597 | 598 | run rm -f $plt_rank_genes_groups_matrix_pdf && eval "$scanpy plot matrix $plt_rank_genes_groups_opt $diffexp_obj $plt_rank_genes_groups_matrix_pdf" 599 | 600 | [ "$status" -eq 0 ] 601 | [ -f "$plt_rank_genes_groups_matrix_pdf" ] 602 | } 603 | 604 | # Plot a matrix plot for markers 605 | 606 | @test "Run Heatmap" { 607 | if [ "$resume" = 'true' ] && [ -f "$plt_heatmap_pdf" ]; then 608 | skip "$plt_matrixplot_pdf exists and resume is set to 'true'" 609 | fi 610 | 611 | run rm -f $plt_heatmap_pdf && eval "$scanpy plot heat $diffexp_plot_opt $diffexp_obj 
$plt_heatmap_pdf" 612 | 613 | [ "$status" -eq 0 ] 614 | [ -f "$plt_heatmap_pdf" ] 615 | } 616 | 617 | # Plot ranking of genes using a matrix plot for markers 618 | 619 | @test "Run Plot ranking of genes using a heatmap" { 620 | if [ "$resume" = 'true' ] && [ -f "$plt_rank_genes_groups_heatmap_pdf" ]; then 621 | skip "$plt_rank_genes_groups_heatmap_pdf exists and resume is set to 'true'" 622 | fi 623 | 624 | run rm -f $plt_rank_genes_groups_heatmap_pdf && eval "$scanpy plot heat $plt_rank_genes_groups_opt $diffexp_obj $plt_rank_genes_groups_heatmap_pdf" 625 | 626 | [ "$status" -eq 0 ] 627 | [ -f "$plt_rank_genes_groups_matrix_pdf" ] 628 | } 629 | 630 | # Do harmony batch correction, using clustering as batch (just for test purposes) 631 | 632 | @test "Run Harmony batch integration using clustering as batch" { 633 | if [ "$resume" = 'true' ] && [ -f "$harmony_integrate_obj" ]; then 634 | skip "$harmony_integrate_obj exists and resume is set to 'true'" 635 | fi 636 | 637 | run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony $harmony_integrate_opt $louvain_obj $harmony_integrate_obj" 638 | 639 | [ "$status" -eq 0 ] 640 | [ -f "$plt_rank_genes_groups_matrix_pdf" ] 641 | 642 | } 643 | 644 | # Run Plot PCA embedding before harmony 645 | 646 | @test "Run Plot PCA embedding before Harmony" { 647 | if [ "$resume" = 'true' ] && [ -f "$noharmony_integrated_pca_pdf" ]; then 648 | skip "$noharmony_integrated_pca_pdf exists and resume is set to 'true'" 649 | fi 650 | 651 | run rm -f $noharmony_integrated_pca_pdf && eval "$scanpy plot embed $noharmony_plt_embed_opt $louvain_obj $noharmony_integrated_pca_pdf" 652 | 653 | [ "$status" -eq 0 ] 654 | [ -f "$noharmony_integrated_pca_pdf" ] 655 | } 656 | 657 | # Run Plot PCA embedding after harmony 658 | 659 | @test "Run Plot PCA embedding after Harmony" { 660 | if [ "$resume" = 'true' ] && [ -f "$harmony_integrated_pca_pdf" ]; then 661 | skip "$harmony_integrated_pca_pdf exists and resume is set to 'true'" 662 | fi 663 | 
664 | run rm -f $harmony_integrated_pca_pdf && eval "$scanpy plot embed $harmony_plt_embed_opt $harmony_integrate_obj $harmony_integrated_pca_pdf" 665 | 666 | [ "$status" -eq 0 ] 667 | [ -f "$harmony_integrated_pca_pdf" ] 668 | } 669 | 670 | # Do bbknn batch correction, using clustering as batch (just for test purposes) 671 | 672 | @test "Run BBKNN batch integration using clustering as batch" { 673 | if [ "$resume" = 'true' ] && [ -f "$bbknn_obj" ]; then 674 | skip "$bbknn_obj exists and resume is set to 'true'" 675 | fi 676 | 677 | run rm -f $bbknn_obj && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" 678 | 679 | [ "$status" -eq 0 ] 680 | [ -f "$bbknn_obj" ] 681 | } 682 | 683 | # Do MNN batch correction, using clustering as batch (just for test purposes) 684 | # Commented as it fails with scanpy 1.9.1 685 | # 686 | # @test "Run MNN batch integration using clustering as batch" { 687 | # if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then 688 | # skip "$mnn_obj exists and resume is set to 'true'" 689 | # fi 690 | # 691 | # run rm -f $mnn_obj && eval "$scanpy integrate mnn $mnn_opt $louvain_obj $mnn_obj" 692 | # 693 | # [ "$status" -eq 0 ] 694 | # [ -f "$mnn_obj" ] 695 | #} 696 | 697 | # Do ComBat batch correction, using clustering as batch (just for test purposes) 698 | 699 | @test "Run Combat batch integration using clustering as batch" { 700 | if [ "$resume" = 'true' ] && [ -f "$combat_obj" ]; then 701 | skip "$combat_obj exists and resume is set to 'true'" 702 | fi 703 | 704 | run rm -f $combat_obj && eval "$scanpy integrate combat $combat_opt $louvain_obj $combat_obj" 705 | 706 | [ "$status" -eq 0 ] 707 | [ -f "$combat_obj" ] 708 | } 709 | 710 | # Local Variables: 711 | # mode: sh 712 | # End: 713 | -------------------------------------------------------------------------------- /scanpy_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides version, 
author and exports 3 | """ 4 | import importlib.metadata 5 | 6 | __version__ = importlib.metadata.version("scanpy-scripts") 7 | 8 | __author__ = ", ".join( 9 | [ 10 | "Ni Huang", 11 | "Pablo Moreno", 12 | "Jonathan Manning", 13 | "Philipp Angerer", 14 | ] 15 | ) 16 | 17 | from . import lib 18 | -------------------------------------------------------------------------------- /scanpy_scripts/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy 3 | """ 4 | 5 | import logging 6 | import click 7 | import scanpy as sc 8 | from .click_utils import NaturalOrderGroup 9 | from .cmds import ( 10 | READ_CMD, 11 | FILTER_CMD, 12 | NORM_CMD, 13 | HVG_CMD, 14 | SCALE_CMD, 15 | REGRESS_CMD, 16 | PCA_CMD, 17 | NEIGHBOR_CMD, 18 | UMAP_CMD, 19 | TSNE_CMD, 20 | FDG_CMD, 21 | LOUVAIN_CMD, 22 | LEIDEN_CMD, 23 | DIFFEXP_CMD, 24 | PAGA_CMD, 25 | DIFFMAP_CMD, 26 | DPT_CMD, 27 | PLOT_EMBED_CMD, 28 | PLOT_PAGA_CMD, 29 | PLOT_STACKED_VIOLIN_CMD, 30 | PLOT_DOT_CMD, 31 | PLOT_MATRIX_CMD, 32 | PLOT_HEATMAP_CMD, 33 | HARMONY_INTEGRATE_CMD, 34 | SCRUBLET_MULTIPLET_CMD, 35 | SCRUBLET_MULTIPLET_SIMULATE_CMD, 36 | SCRUBLET_MULTIPLET_PLOT_CMD, 37 | BBKNN_CMD, 38 | MNN_CORRECT_CMD, 39 | COMBAT_CMD, 40 | ) 41 | 42 | 43 | @click.group(cls=NaturalOrderGroup) 44 | @click.option( 45 | "--debug", 46 | is_flag=True, 47 | default=False, 48 | help="Print debug information", 49 | ) 50 | @click.option( 51 | "--verbosity", 52 | type=click.INT, 53 | default=3, 54 | help="Set scanpy verbosity", 55 | ) 56 | @click.option( 57 | "--njobs", 58 | type=click.INT, 59 | default=1, 60 | help="Set scanpy default number of jobs/CPUs, defaults 1", 61 | ) 62 | @click.version_option( 63 | version="0.2.0", 64 | prog_name="scanpy", 65 | ) 66 | def cli(debug=False, verbosity=3, njobs=1): 67 | """ 68 | Command line interface to [scanpy](https://github.com/theislab/scanpy) 69 | """ 70 | log_level = logging.DEBUG if debug else logging.INFO 71 | logging.basicConfig( 72 | 
level=log_level, 73 | format=( 74 | "%(asctime)s; %(levelname)s; %(filename)s; " "%(funcName)s(): %(message)s" 75 | ), 76 | datefmt="%y-%m-%d %H:%M:%S", 77 | ) 78 | logging.debug("debugging") 79 | sc.settings.verbosity = verbosity 80 | sc.settings.n_jobs = njobs 81 | return 0 82 | 83 | 84 | cli.add_command(READ_CMD) 85 | cli.add_command(FILTER_CMD) 86 | cli.add_command(NORM_CMD) 87 | cli.add_command(HVG_CMD) 88 | cli.add_command(SCALE_CMD) 89 | cli.add_command(REGRESS_CMD) 90 | cli.add_command(PCA_CMD) 91 | cli.add_command(NEIGHBOR_CMD) 92 | 93 | 94 | @cli.group(cls=NaturalOrderGroup) 95 | def embed(): 96 | """Embed cells into two-dimensional space.""" 97 | 98 | 99 | embed.add_command(UMAP_CMD) 100 | embed.add_command(TSNE_CMD) 101 | embed.add_command(FDG_CMD) 102 | embed.add_command(DIFFMAP_CMD) 103 | 104 | 105 | @cli.group(cls=NaturalOrderGroup) 106 | def cluster(): 107 | """Cluster cells into sub-populations.""" 108 | 109 | 110 | cluster.add_command(LOUVAIN_CMD) 111 | cluster.add_command(LEIDEN_CMD) 112 | 113 | 114 | cli.add_command(DIFFEXP_CMD) 115 | cli.add_command(PAGA_CMD) 116 | cli.add_command(DPT_CMD) 117 | 118 | 119 | @cli.group(cls=NaturalOrderGroup) 120 | def integrate(): 121 | """Integrate cells from different experimental batches.""" 122 | 123 | 124 | integrate.add_command(HARMONY_INTEGRATE_CMD) 125 | integrate.add_command(BBKNN_CMD) 126 | integrate.add_command(MNN_CORRECT_CMD) 127 | integrate.add_command(COMBAT_CMD) 128 | 129 | 130 | @cli.group(cls=NaturalOrderGroup) 131 | def multiplet(): 132 | """Execute methods for multiplet removal.""" 133 | 134 | 135 | multiplet.add_command(SCRUBLET_MULTIPLET_CMD) 136 | multiplet.add_command(SCRUBLET_MULTIPLET_SIMULATE_CMD) 137 | 138 | 139 | @cli.group(cls=NaturalOrderGroup) 140 | def plot(): 141 | """Visualise data.""" 142 | 143 | 144 | plot.add_command(PLOT_EMBED_CMD) 145 | plot.add_command(PLOT_PAGA_CMD) 146 | plot.add_command(PLOT_STACKED_VIOLIN_CMD) 147 | plot.add_command(PLOT_DOT_CMD) 148 | 
plot.add_command(PLOT_MATRIX_CMD) 149 | plot.add_command(PLOT_HEATMAP_CMD) 150 | plot.add_command(SCRUBLET_MULTIPLET_PLOT_CMD) 151 | -------------------------------------------------------------------------------- /scanpy_scripts/click_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide helper functions for command line parsing with click 3 | """ 4 | 5 | import click 6 | import sys 7 | 8 | 9 | class NaturalOrderGroup(click.Group): 10 | """Command group trying to list subcommands in the order they were added. 11 | 12 | With decorator, use:: 13 | 14 | @click.group(cls=NaturalOrderGroup) 15 | """ 16 | 17 | def list_commands(self, ctx): 18 | """List command names as they are in commands dict. 19 | 20 | If the dict is OrderedDict, it will preserve the order commands 21 | were added. 22 | """ 23 | return self.commands.keys() 24 | 25 | 26 | class CommaSeparatedText(click.ParamType): 27 | """ 28 | Comma separated text 29 | """ 30 | 31 | def __init__(self, dtype=click.STRING, simplify=False, length=None): 32 | self.dtype = dtype 33 | self.dtype_name = _get_type_name(dtype) 34 | self.simplify = simplify 35 | self.length = length 36 | if length and length <= 3: 37 | self.name = ",".join([f"{self.dtype_name}"] * length) 38 | else: 39 | self.name = "{}[,{}...]".format(self.dtype_name, self.dtype_name) 40 | 41 | def convert(self, value, param, ctx): 42 | """ 43 | >>> @click.command() 44 | ... @click.option('--test-param') 45 | ... def test_cmd(): 46 | ... pass 47 | ... 48 | >>> ctx = click.Context(test_cmd) 49 | >>> param = test_cmd.params[0] 50 | >>> test_cst1 = CommaSeparatedText() 51 | >>> test_cst2 = CommaSeparatedText(click.INT, length=2) 52 | >>> test_cst3 = CommaSeparatedText(click.FLOAT, simplify=True) 53 | >>> 54 | >>> test_cst1.convert(None, param, ctx) 55 | >>> test_cst2.convert('7,2', param, ctx) 56 | [7, 2] 57 | >>> test_cst2.convert('7.2', param, ctx) 58 | Traceback (most recent call last): 59 | ... 
60 | click.exceptions.BadParameter: 7.2 is not a valid integer 61 | >>> test_cst2.convert('7', param, ctx) 62 | Traceback (most recent call last): 63 | ... 64 | click.exceptions.BadParameter: 7 is not a valid comma separated list of length 2 65 | >>> test_cst3.convert('7.2', param, ctx) 66 | 7.2 67 | """ 68 | try: 69 | if value is None: 70 | converted = None 71 | else: 72 | converted = list(map(self.dtype, str(value).split(","))) 73 | if self.simplify and len(converted) == 1: 74 | converted = converted[0] 75 | except ValueError: 76 | self.fail( 77 | "{} is not a valid comma separated list of {}".format( 78 | value, self.dtype_name 79 | ), 80 | param, 81 | ctx, 82 | ) 83 | if self.length: 84 | if len(converted) != self.length: 85 | self.fail( 86 | "{} is not a valid comma separated list of length {}".format( 87 | value, self.length 88 | ), 89 | param, 90 | ctx, 91 | ) 92 | return converted 93 | 94 | 95 | class Dictionary(click.ParamType): 96 | """ 97 | Text to be parsed as a python dict definition 98 | """ 99 | 100 | def __init__(self, keys=None): 101 | self.name = "TEXT:VAL[,TEXT:VAL...]" 102 | self.keys = keys 103 | 104 | def convert(self, value, param, ctx): 105 | """ 106 | >>> @click.command() 107 | ... @click.option('--my-param', type=Dictionary(keys=('abc', 'def', 'ghi', 'jkl', 'mno'))) 108 | ... def test_cmd(): 109 | ... pass 110 | ... 111 | >>> ctx = click.Context(test_cmd) 112 | >>> param = test_cmd.params[0] 113 | >>> dict_param = param.type 114 | >>> dict_str1 = 'abc:0.1,def:TRUE,ghi:False,jkl:None,mno:some_string' 115 | >>> dict_str2 = 'abc:0.1,def:TRUE,ghi:False,jkl:None,mnp:some_string' 116 | >>> dict_str3 = '' 117 | >>> dict_param.convert(dict_str1, param, ctx) 118 | {'abc': 0.1, 'def': True, 'ghi': False, 'jkl': None, 'mno': 'some_string'} 119 | >>> dict_param.convert(dict_str2, param, ctx) 120 | Traceback (most recent call last): 121 | ... 
122 | click.exceptions.BadParameter: mnp is not a valid key (('abc', 'def', 'ghi', 'jkl', 'mno')) 123 | >>> dict_param.convert(dict_str3, param, ctx) 124 | Traceback (most recent call last): 125 | ... 126 | click.exceptions.BadParameter: is not a valid python dict definition 127 | """ 128 | try: 129 | converted = dict() 130 | for token in value.split(","): 131 | if ":" not in token: 132 | raise ValueError 133 | key, _, value = token.partition(":") 134 | if not key: 135 | raise ValueError 136 | if isinstance(self.keys, (list, tuple)) and key not in self.keys: 137 | self.fail(f"{key} is not a valid key ({self.keys})") 138 | if value == "None": 139 | value = None 140 | elif value.lower() == "true": 141 | value = True 142 | elif value.lower() == "false": 143 | value = False 144 | else: 145 | try: 146 | value = float(value) 147 | except ValueError: 148 | pass 149 | converted[key] = value 150 | return converted 151 | except ValueError: 152 | self.fail(f"{value} is not a valid python dict definition", param, ctx) 153 | 154 | 155 | def _get_type_name(obj): 156 | name = "text" 157 | try: 158 | name = getattr(obj, "name") 159 | except AttributeError: 160 | name = getattr(obj, "__name__") 161 | return name 162 | 163 | 164 | def valid_limit(ctx, param, value): 165 | """ 166 | Callback function that checks order of numeric inputs 167 | 168 | >>> @click.command() 169 | ... @click.option('--test-param', help='Sample help') 170 | ... def test_cmd(): 171 | ... pass 172 | ... 173 | >>> ctx = click.Context(test_cmd) 174 | >>> param = test_cmd.params[0] 175 | >>> valid_limit(ctx, param, value=[0.0125, 3]) 176 | [0.0125, 3] 177 | >>> valid_limit(ctx, param, value=[0.0125, -0.0125]) 178 | Traceback (most recent call last): 179 | ... 
180 | click.exceptions.BadParameter: lower limit must not exceed upper limit 181 | >>> valid_limit(ctx, param, value=[0.0125, 0.0125]) 182 | [0.0125, 0.0125] 183 | """ 184 | if value[0] > value[1]: 185 | param.type.fail("lower limit must not exceed upper limit", param, ctx) 186 | return value 187 | 188 | 189 | def valid_parameter_limits(ctx, param, value): 190 | """ 191 | Callback function that checks order of multiple numeric inputs 192 | 193 | >>> @click.command() 194 | ... @click.option('--test-param', type=(click.STRING, click.FLOAT, click.FLOAT), multiple=True) 195 | ... def test_cmd(): 196 | ... pass 197 | ... 198 | >>> ctx = click.Context(test_cmd) 199 | >>> param = test_cmd.params[0] 200 | >>> valid_parameter_limits(ctx, param, [['a', 0.0, 2.0]]) 201 | [['a', 0.0, 2.0]] 202 | >>> valid_parameter_limits(ctx, param, [['b', 0.0, 0.0]]) 203 | [['b', 0.0, 0.0]] 204 | >>> valid_parameter_limits(ctx, param, [['c', 0.0, -1.0]]) 205 | Traceback (most recent call last): 206 | ... 207 | click.exceptions.BadParameter: lower limit must not exceed upper limit 208 | >>> valid_parameter_limits(ctx, param, [['a', 0.0, 2.0], ['c', 0.0, -1.0]]) 209 | Traceback (most recent call last): 210 | ... 
211 | click.exceptions.BadParameter: lower limit must not exceed upper limit 212 | """ 213 | for val in value: 214 | if val[1] > val[2]: 215 | param.type.fail("lower limit must not exceed upper limit", param, ctx) 216 | return value 217 | 218 | 219 | def mutually_exclusive_with(param_name): 220 | internal_name = param_name.strip("-").replace("-", "_").lower() 221 | 222 | def valid_mutually_exclusive(ctx, param, value): 223 | try: 224 | other_value = ctx.params[internal_name] 225 | except KeyError: 226 | return value 227 | if (value is None) == (other_value is None): 228 | param.type.fail( 229 | 'mutually exclusive with "{}", one and only one must be ' 230 | "specified.".format(param_name), 231 | param, 232 | ctx, 233 | ) 234 | return value 235 | 236 | return valid_mutually_exclusive 237 | 238 | 239 | def required_by(param_name): 240 | internal_name = param_name.strip("-").replace("-", "_").lower() 241 | 242 | def required(ctx, param, value): 243 | try: 244 | other_value = ctx.params[internal_name] 245 | except KeyError: 246 | return value 247 | if other_value and not value: 248 | param.type.fail( 249 | 'required by "{}".'.format(param_name), 250 | param, 251 | ctx, 252 | ) 253 | return value 254 | 255 | return required 256 | 257 | 258 | if __name__ == "__main__": 259 | import doctest 260 | 261 | sys.exit(doctest.testmod(verbose=True)[0]) 262 | -------------------------------------------------------------------------------- /scanpy_scripts/cmd_options.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide cmd options 3 | """ 4 | 5 | import click 6 | 7 | from .click_utils import ( 8 | CommaSeparatedText, 9 | Dictionary, 10 | mutually_exclusive_with, 11 | required_by, 12 | valid_limit, 13 | valid_parameter_limits, 14 | ) 15 | 16 | COMMON_OPTIONS = { 17 | "input": [ 18 | click.argument( 19 | "input_obj", 20 | metavar="", 21 | type=click.Path(exists=True, dir_okay=False), 22 | ), 23 | click.option( 24 | "--input-format", 
25 | "-f", 26 | type=click.Choice(["anndata", "loom"]), 27 | default="anndata", 28 | show_default=True, 29 | help="Input object format.", 30 | ), 31 | ], 32 | "output": [ 33 | click.argument( 34 | "output_obj", 35 | metavar="", 36 | type=click.Path(dir_okay=False, writable=True), 37 | ), 38 | click.option( 39 | "--output-format", 40 | "-F", 41 | type=click.Choice(["anndata", "loom", "zarr"]), 42 | default="anndata", 43 | show_default=True, 44 | help="Output object format.", 45 | ), 46 | click.option( 47 | "--zarr-chunk-size", 48 | "-z", 49 | type=click.INT, 50 | default=1000, 51 | show_default=True, 52 | help="Chunk size for writing output in zarr format.", 53 | ), 54 | click.option( 55 | "--loom-write-obsm-varm", 56 | "-b", 57 | is_flag=True, 58 | default=False, 59 | show_default=True, 60 | help="Write obsm and varm to the Loom file?", 61 | ), 62 | click.option( 63 | "--export-mtx", 64 | "-X", 65 | type=click.Path(dir_okay=True, writable=True), 66 | default=None, 67 | show_default=True, 68 | help="When specified, using it as prefix for exporting mtx files. 
" 69 | 'If not empty and not ending with "/" or "_", a "_" will be ' 70 | "appended.", 71 | ), 72 | click.option( 73 | "--mtx-compression", 74 | "-G", 75 | type=click.Choice(["zip", "gzip", "bz2", "zstd"]), 76 | default=None, 77 | show_default=True, 78 | help="Compression type for MTX output.", 79 | ), 80 | click.option( 81 | "--show-obj", 82 | type=click.Choice(["stdout", "stderr"]), 83 | default=None, 84 | show_default=True, 85 | help="Print output object summary info to specified stream.", 86 | ), 87 | ], 88 | "save": [ 89 | click.option( 90 | "--save-raw", 91 | "-r", 92 | is_flag=True, 93 | default=False, 94 | show_default=True, 95 | help="Save adata to adata.raw before processing.", 96 | ), 97 | click.option( 98 | "--save-layer", 99 | "-y", 100 | type=click.STRING, 101 | default=None, 102 | show_default=True, 103 | help="Save adata.X to the specified layer before processing.", 104 | ), 105 | ], 106 | "plot": [ 107 | click.argument( 108 | "output_fig", 109 | metavar="", 110 | type=click.Path(dir_okay=False, writable=True), 111 | ), 112 | click.option( 113 | "--fig-size", 114 | type=CommaSeparatedText(click.INT, length=2), 115 | default="7,7", 116 | show_default=True, 117 | help="Figure size.", 118 | ), 119 | click.option( 120 | "--fig-dpi", 121 | type=click.INT, 122 | default=80, 123 | show_default=True, 124 | help="Figure DPI.", 125 | ), 126 | click.option( 127 | "--fig-fontsize", 128 | type=click.INT, 129 | default=15, 130 | show_default=True, 131 | help="Figure font size.", 132 | ), 133 | ], 134 | "frame_title": [ 135 | click.option( 136 | "--frameon/--frameoff", 137 | "frameon", 138 | default=True, 139 | show_default=True, 140 | help="Draw a frame around the plot", 141 | ), 142 | click.option( 143 | "--title", 144 | type=CommaSeparatedText(simplify=True), 145 | default=None, 146 | show_default=True, 147 | help="Provide title for the plot or panels.", 148 | ), 149 | ], 150 | "use_pc": [ 151 | click.option( 152 | "--n-pcs", 153 | "-n", 154 | type=click.INT, 
155 | default=None, 156 | show_default=True, 157 | help="Use this many PCs. Use `.X` if --n-pcs is 0 when --use-rep is " 158 | "None.", 159 | ), 160 | click.option( 161 | "--use-rep", 162 | "-u", 163 | type=click.STRING, 164 | default=None, 165 | show_default=True, 166 | help="Use the indicated representation. If None, the representation is " 167 | "chosen automatically: for `.n_vars` < 50, `.X` is used, otherwise " 168 | "`X_pca` is used. If `X_pca` is not present, it's computed with " 169 | "default parameters.", 170 | ), 171 | ], 172 | "knn_graph": [ 173 | click.option( 174 | "--neighbors-key", 175 | type=click.STRING, 176 | default=None, 177 | show_default=False, 178 | help="If not specified, look in .uns[‘neighbors’] for neighbors " 179 | "settings and .obsp[‘connectivities’], .obsp[‘distances’] for connectivities and " 180 | "distances respectively (default storage places for pp.neighbors). If specified, " 181 | "look in .uns[neighbors_key] for neighbors settings and " 182 | ".obsp[.uns[neighbors_key][‘connectivities_key’]], " 183 | ".obsp[.uns[neighbors_key][‘distances_key’]] for connectivities and distances " 184 | "respectively.", 185 | ), 186 | click.option( 187 | "--obsp", 188 | type=click.STRING, 189 | default=None, 190 | show_default=True, 191 | help="Use .obsp[obsp] as adjacency. 
You can’t specify both obsp and " 192 | "neighbors_key at the same time.", 193 | ), 194 | click.option( 195 | "--directed/--undirected", 196 | "directed", 197 | default=True, 198 | show_default=True, 199 | help="Interpret the adjacency matrix as directed graph.", 200 | ), 201 | click.option( 202 | "--use-weights", 203 | is_flag=True, 204 | default=False, 205 | show_default=True, 206 | help="Use weights from KNN graph.", 207 | ), 208 | ], 209 | "neighbor_metric": click.option( 210 | "--metric", 211 | "-t", 212 | type=click.Choice( 213 | [ 214 | "cityblock", 215 | "cosine", 216 | "euclidean", 217 | "l1", 218 | "l2", 219 | "manhattan", 220 | "braycurtis", 221 | "canberra", 222 | "chebyshev", 223 | "correlation", 224 | "dice", 225 | "hamming", 226 | "jaccard", 227 | "kulsinski", 228 | "mahalanobis", 229 | "minkowski", 230 | "rogerstanimoto", 231 | "russellrao", 232 | "seuclidean", 233 | "sokalmichener", 234 | "sokalsneath", 235 | "sqeuclidean", 236 | "yule", 237 | ] 238 | ), 239 | default="euclidean", 240 | show_default=True, 241 | help="A known metric’s name.", 242 | ), 243 | "layer": click.option( 244 | "--layer", 245 | type=CommaSeparatedText(simplify=True), 246 | default=None, 247 | show_default=True, 248 | help="Name of the AnnData object layer that wants to be plotted. By " 249 | "default adata.raw.X is plotted. If use_raw=False is set, then adata.X " 250 | "is plotted. If layer is set to a valid layer name, then the layer is " 251 | "plotted. 
layer takes precedence over use_raw.", 252 | ), 253 | "n_comps": click.option( 254 | "--n-comps", 255 | type=click.INT, 256 | default=None, 257 | show_default=True, 258 | help="Number of components to compute", 259 | ), 260 | "key_added": click.option( 261 | "--key-added", 262 | type=CommaSeparatedText(simplify=True), 263 | default=None, 264 | show_default=True, 265 | help="Key under which to add the computed results", 266 | ), 267 | "random_state": click.option( 268 | "--random-state", 269 | "-S", 270 | type=click.INT, 271 | default=0, 272 | show_default=True, 273 | help="Seed for random number generator.", 274 | ), 275 | "use_raw": click.option( 276 | "--use-raw/--no-raw", 277 | "use_raw", 278 | default=None, 279 | show_default=True, 280 | help="Use expression values in `.raw` if present.", 281 | ), 282 | "zero_center": click.option( 283 | "--no-zero-center", 284 | "zero_center", 285 | is_flag=True, 286 | flag_value=False, 287 | default=True, 288 | help="When set, omit zero-centering variables to allow efficient " 289 | "handling of sparse input.", 290 | ), 291 | "n_jobs": click.option( 292 | "--n-jobs", 293 | "-J", 294 | type=click.INT, 295 | default=None, 296 | show_default=True, 297 | help="Number of jobs for parallel computation.", 298 | ), 299 | "restrict_to": click.option( 300 | "--restrict-to", 301 | type=(click.STRING, CommaSeparatedText()), 302 | default=(None, None), 303 | show_default=True, 304 | help="Restrict the clustering to the categories within the key for " 305 | 'sample annotation, in the form of "obs_key list_of_categories".', 306 | ), 307 | "export_embedding": click.option( 308 | "--export-embedding", 309 | "-E", 310 | type=click.Path(dir_okay=False, writable=True), 311 | default=None, 312 | show_default=True, 313 | help="Export embeddings in a tab-separated text table.", 314 | ), 315 | "export_cluster": click.option( 316 | "--export-cluster", 317 | type=click.Path(dir_okay=False, writable=True), 318 | default=None, 319 | show_default=True, 
320 | help="Export cluster labels in a tab-separated text table.", 321 | ), 322 | "var_names": click.option( 323 | "--var-names", 324 | type=(CommaSeparatedText()), 325 | show_default=True, 326 | help="var_names should be a valid subset of adata.var_names.", 327 | ), 328 | "gene_symbols": click.option( 329 | "--gene-symbols", 330 | type=CommaSeparatedText(simplify=True), 331 | default=None, 332 | show_default=True, 333 | help="Column name in .var DataFrame that stores gene symbols. By " 334 | "default this is assumed to be the index column of the .var " 335 | "DataFrame. Setting this option allows alternative names to be " 336 | "used.", 337 | ), 338 | "diffexp_plot": [ 339 | click.option( 340 | "--rgg", 341 | is_flag=True, 342 | default=False, 343 | show_default=True, 344 | help="When set, use the rank_genes_groups_ form of the function, " 345 | "where gene lists are automatically selected.", 346 | ), 347 | click.option( 348 | "--groupby", 349 | type=CommaSeparatedText(simplify=True), 350 | default=None, 351 | show_default=True, 352 | help="The key of the observation grouping to consider.", 353 | ), 354 | click.option( 355 | "--log", 356 | is_flag=True, 357 | default=False, 358 | show_default=True, 359 | help="Plot on logarithmic axis.", 360 | ), 361 | click.option( 362 | "--num-categories", 363 | type=click.INT, 364 | default=7, 365 | show_default=True, 366 | help="Only used if groupby observation is not categorical. This value " 367 | "determines the number of groups into which the groupby observation " 368 | "should be subdivided.", 369 | ), 370 | click.option( 371 | "--dendrogram", 372 | is_flag=True, 373 | default=False, 374 | show_default=False, 375 | help="If True, a dendrogram based on the hierarchical clustering " 376 | "between the groupby categories is added. The dendrogram information is " 377 | "computed using scanpy.tl.dendrogram(). 
If tl.dendrogram has not been " 378 | "called previously the function is called with default parameters.", 379 | ), 380 | click.option( 381 | "--standard-scale", 382 | type=click.Choice(["var", "obs"]), 383 | default=None, 384 | show_default=True, 385 | help="Whether or not to standardize that dimension between 0 and 1, " 386 | "meaning for each variable or group, subtract the minimum and divide " 387 | "each by its maximum.", 388 | ), 389 | ], 390 | "sviol": [ 391 | click.option( 392 | "--no-stripplot", 393 | "stripplot", 394 | is_flag=True, 395 | default=True, 396 | show_default=True, 397 | help="When set, do not add a stripplot on top of the violin plot.", 398 | ), 399 | click.option( 400 | "--no-jitter", 401 | "jitter", 402 | is_flag=True, 403 | default=True, 404 | show_default=True, 405 | help="Suppress jitter in the stripplot (only when stripplot is True)", 406 | ), 407 | click.option( 408 | "--size", 409 | type=click.INT, 410 | default=1, 411 | show_default=True, 412 | help="Size of the jitter points.", 413 | ), 414 | click.option( 415 | "--order", 416 | type=CommaSeparatedText(), 417 | default=None, 418 | show_default=True, 419 | help="Order in which to show the categories.", 420 | ), 421 | click.option( 422 | "--scale", 423 | type=click.Choice(["area", "count", "width"]), 424 | default="width", 425 | show_default=True, 426 | help="The method used to scale the width of each violin. If ‘area’, " 427 | "each violin will have the same area. If ‘count’, the width of the " 428 | "violins will be scaled by the number of observations in that bin. If " 429 | "‘width’, each violin will have the same width.", 430 | ), 431 | click.option( 432 | "--row-palette", 433 | type=CommaSeparatedText(simplify=True), 434 | default="muted", 435 | show_default=True, 436 | help="The row palette determines the colors to use in each of the " 437 | "stacked violin plots. 
The value should be a valid seaborn palette name " 438 | "or a valid matplotlib colormap (see " 439 | "https://seaborn.pydata.org/generated/seaborn.color_palette.html). " 440 | "Alternatively, a single color name or hex value can be passed. E.g. " 441 | "‘red’ or ‘#cc33ff’.", 442 | ), 443 | ], 444 | "dot": [ 445 | click.option( 446 | "--expression-cutoff", 447 | type=click.FLOAT, 448 | default=0, 449 | show_default=True, 450 | help="Expression cutoff that is used for binarizing the gene expression " 451 | "and determining the fraction of cells expressing given genes. A gene is " 452 | "expressed only if the expression value is greater than this threshold.", 453 | ), 454 | click.option( 455 | "--mean-only-expressed", 456 | is_flag=True, 457 | default=False, 458 | show_default=True, 459 | help="If True, gene expression is averaged only over the cells " 460 | "expressing the given genes.", 461 | ), 462 | click.option( 463 | "--color-map", 464 | type=CommaSeparatedText(simplify=True), 465 | default="Reds", 466 | show_default=True, 467 | help="String denoting matplotlib color map.", 468 | ), 469 | click.option( 470 | "--dot-max", 471 | type=click.FLOAT, 472 | default=None, 473 | show_default=True, 474 | help="If none, the maximum dot size is set to the maximum fraction " 475 | "value found (e.g. 0.6). If given, the value should be a number between " 476 | "0 and 1. All fractions larger than dot_max are clipped to this value.", 477 | ), 478 | click.option( 479 | "--dot-min", 480 | type=click.FLOAT, 481 | default=None, 482 | show_default=True, 483 | help="If none, the minimum dot size is set to 0. If given, the value " 484 | "should be a number between 0 and 1. All fractions smaller than dot_min " 485 | "are clipped to this value.", 486 | ), 487 | click.option( 488 | "--smallest-dot", 489 | type=click.FLOAT, 490 | default=0, 491 | show_default=True, 492 | help="If none, the smallest dot has size 0. 
All expression levels with " 493 | "dot_min are plotted with smallest_dot dot size.", 494 | ), 495 | ], 496 | "heat": [ 497 | click.option( 498 | "--show-gene-labels", 499 | is_flag=True, 500 | default=None, 501 | show_default=True, 502 | help="By default gene labels are shown when there are 50 or less " 503 | "genes. Otherwise the labels are removed.", 504 | ), 505 | ], 506 | "swap_axes": click.option( 507 | "--swap-axes", 508 | is_flag=True, 509 | default=False, 510 | show_default=True, 511 | help="By default, the x axis contains var_names (e.g. genes) and the y " 512 | "axis the groupby categories. By setting swap_axes then x are the " 513 | "groupby categories and y the var_names. When swapping axes " 514 | "var_group_positions are no longer used.", 515 | ), 516 | "rank_genes_groups_plots": [ 517 | click.option( 518 | "--groups", 519 | type=CommaSeparatedText(), 520 | default=None, 521 | show_default=True, 522 | help="The groups for which to show the gene ranking.", 523 | ), 524 | click.option( 525 | "--n-genes", 526 | "-n", 527 | type=click.INT, 528 | default=10, 529 | show_default=True, 530 | help="Number of genes to show.", 531 | ), 532 | ], 533 | "root": click.option( 534 | "--root", 535 | type=click.INT, 536 | default=0, 537 | show_default=True, 538 | help="If choosing a tree layout, this is the index of the root node.", 539 | ), 540 | "plot_embed": [ 541 | click.option( 542 | "--use-raw/--no-raw", 543 | default=None, 544 | show_default=True, 545 | help="Use `.raw` attribute for coloring with gene expression. If " 546 | "`None`, uses `.raw` if present.", 547 | ), 548 | click.option( 549 | "--groups", 550 | type=click.STRING, 551 | default=None, 552 | help="Key for categorical in `.obs`. 
You can pass your predefined " 553 | "groups by choosing any categorical annotation of observations.", 554 | ), 555 | ], 556 | "batch_key": click.option( 557 | "--batch-key", 558 | "key", 559 | type=click.STRING, 560 | required=True, 561 | help="The name of the column in adata.obs that differentiates among " 562 | "experiments/batches.", 563 | ), 564 | "batch_layer": click.option( 565 | "--layer", 566 | "-l", 567 | type=click.STRING, 568 | default=None, 569 | show_default=True, 570 | help="Layer to batch correct. By default corrects the contents of .X.", 571 | ), 572 | "scrublet": [ 573 | click.option( 574 | "--sim-doublet-ratio", 575 | type=click.FLOAT, 576 | default=2.0, 577 | show_default=True, 578 | help="Number of doublets to simulate relative to the number of " 579 | "observed transcriptomes.", 580 | ), 581 | click.option( 582 | "--synthetic-doublet-umi-subsampling", 583 | type=click.FLOAT, 584 | default=1.0, 585 | show_default=True, 586 | help="Where input_obj_sim not supplied, rate for sampling UMIs when " 587 | "creating synthetic doublets. If 1.0, each doublet is created by " 588 | "simply adding the UMI counts from two randomly sampled observed " 589 | "transcriptomes. 
For values less than 1, the UMI counts are added " 590 | "and then randomly sampled at the specified rate.", 591 | ), 592 | ], 593 | } 594 | 595 | COMMON_OPTIONS["opt_output"] = [ 596 | click.option( 597 | "--output-obj", 598 | type=click.Path(dir_okay=False, writable=True), 599 | help="Optionally output an object to the specified path.", 600 | ), 601 | *COMMON_OPTIONS["output"][1:], 602 | ] 603 | 604 | CMD_OPTIONS = { 605 | "read": [ 606 | click.option( 607 | "--input-10x-h5", 608 | "-i", 609 | type=click.Path(exists=True, dir_okay=False), 610 | callback=mutually_exclusive_with("--input-10x-mtx"), 611 | help="Input 10x data in Cell-Ranger hdf5 format.", 612 | ), 613 | click.option( 614 | "--input-10x-mtx", 615 | "-x", 616 | type=click.Path(exists=True, file_okay=False), 617 | callback=mutually_exclusive_with("--input-10x-h5"), 618 | help="Path of input folder containing 10x data in mtx format.", 619 | ), 620 | *COMMON_OPTIONS["output"], 621 | click.option( 622 | "--genome", 623 | "-g", 624 | callback=required_by("--input-10x-h5"), 625 | default="hg19", 626 | show_default=True, 627 | help="Name of the genome group in hdf5 file, required by " 628 | '"--input-10x-h5".', 629 | ), 630 | click.option( 631 | "--var-names", 632 | "-v", 633 | type=click.Choice(["gene_symbols", "gene_ids"]), 634 | callback=required_by("--input-10x-mtx"), 635 | default="gene_symbols", 636 | show_default=True, 637 | help="Attribute to be used as the index of the variable table, " 638 | 'required by "--input-10x-mtx".', 639 | ), 640 | click.option( 641 | "--extra-obs", 642 | type=click.Path(exists=True, dir_okay=False), 643 | default=None, 644 | show_default=True, 645 | help="Extra cell metadata table, must be tab-separated with a header " 646 | "row and an index column, and with matched dimension.", 647 | ), 648 | click.option( 649 | "--extra-var", 650 | type=click.Path(exists=True, dir_okay=False), 651 | default=None, 652 | show_default=True, 653 | help="Extra gene metadata table, must be 
tab-separated with a header " 654 | "row and an index column, and with matched dimension.", 655 | ), 656 | ], 657 | "filter": [ 658 | *COMMON_OPTIONS["input"], 659 | *COMMON_OPTIONS["output"], 660 | COMMON_OPTIONS["save"][0], # --save-raw 661 | click.option( 662 | "--gene-name", 663 | "-g", 664 | type=click.STRING, 665 | default="index", 666 | show_default=True, 667 | help="Name of the variable that contains gene names, used for flagging " 668 | 'mitochondria genes when column "mito" is absent from `.var`.', 669 | ), 670 | click.option( 671 | "--list-attr", 672 | "-l", 673 | is_flag=True, 674 | default=False, 675 | help="When set, list attributes that can be filtered on.", 676 | ), 677 | click.option( 678 | "--param", 679 | "-p", 680 | type=(click.STRING, click.FLOAT, click.FLOAT), 681 | multiple=True, 682 | callback=valid_parameter_limits, 683 | help="Numerical parameters used to filter the data, " 684 | 'in the format of "-p name min max". ' 685 | "Multiple -p entries allowed.", 686 | ), 687 | click.option( 688 | "--category", 689 | "-c", 690 | type=(click.STRING, CommaSeparatedText()), 691 | multiple=True, 692 | help="Categorical attributes used to filter the data, " 693 | 'in the format of "-c <name> <values>", ' 694 | "where entries with attribute <name> with value in <values> are kept. " 695 | 'If <values> is preceded by "!", entries with value in <values> are ' 696 | "removed. Multiple -c entries allowed.", 697 | ), 698 | click.option( 699 | "--subset", 700 | "-s", 701 | type=(click.STRING, click.File()), 702 | multiple=True, 703 | help='Similar to --category in the format of "-s <name> <file>", ' 704 | "but the <file> to be a one-column table that provides the values. 
" 705 | "Multiple -s entries allowed.", 706 | ), 707 | click.option( 708 | "--force-recalc", 709 | is_flag=True, 710 | default=False, 711 | help="When set, re-calculate `pct_counts_<qc_var>` and " 712 | "`pct_counts_in_top_<n>_genes` even if they exist.", 713 | ), 714 | ], 715 | "norm": [ 716 | *COMMON_OPTIONS["input"], 717 | *COMMON_OPTIONS["output"], 718 | *COMMON_OPTIONS["save"], 719 | COMMON_OPTIONS["key_added"], 720 | click.option( 721 | "--no-log-transform", 722 | "log_transform", 723 | is_flag=True, 724 | default=True, 725 | show_default=True, 726 | help="When set, do not apply (natural) log transform following normalisation.", 727 | ), 728 | click.option( 729 | "--normalize-to", 730 | "-t", 731 | "target_sum", 732 | type=float, 733 | default=10_000, 734 | show_default=True, 735 | help="Normalize per cell nUMI to this number.", 736 | ), 737 | click.option( 738 | "--exclude-highly-expressed", 739 | "-e", 740 | "exclude_highly_expressed", 741 | is_flag=True, 742 | default=False, 743 | show_default=True, 744 | help="Exclude (very) highly expressed genes for the computation of " 745 | "the normalization factor (size factor) for each cell. A gene is considered " 746 | "highly expressed, if it has more than max_fraction of the total counts in at " 747 | "least one cell. The not-excluded genes will sum up to the number " 748 | "specified by --normalize-to.", 749 | ), 750 | click.option( 751 | "--max-fraction", 752 | "-m", 753 | "max_fraction", 754 | type=float, 755 | default=0.05, 756 | show_default=True, 757 | help="If exclude_highly_expressed=True, consider cells as highly " 758 | "expressed that have more counts than max_fraction of the original total counts " 759 | "in at least one cell.", 760 | ), 761 | click.option( 762 | "--layers", 763 | "-l", 764 | type=CommaSeparatedText(simplify=True), 765 | default=None, 766 | show_default=True, 767 | help="List of layers to normalize. 
Set to 'all' to normalize all layers.", 768 | ), 769 | click.option( 770 | "--layer-norm", 771 | "-n", 772 | "layer_norm", 773 | type=click.Choice(["after", "X"]), 774 | default=None, 775 | show_default=True, 776 | help="Specifies how to normalize layers: 1) If None, after " 777 | "normalization, for each layer in layers each cell has a total count equal to " 778 | "the median of the counts_per_cell before normalization of the layer. 2) If " 779 | "'after', for each layer in layers each cell has a total count equal to " 780 | "target_sum. 3) If 'X', for each layer in layers each cell has a total count " 781 | "equal to the median of total counts for observations (cells) of adata.X before " 782 | "normalization.'", 783 | ), 784 | ], 785 | "hvg": [ 786 | *COMMON_OPTIONS["input"], 787 | *COMMON_OPTIONS["output"], 788 | click.option( 789 | "--mean-limits", 790 | "-m", 791 | type=(click.FLOAT, click.FLOAT), 792 | callback=valid_limit, 793 | default=(0.0125, 3), 794 | show_default=True, 795 | help="Cutoffs for the mean of expression" 'in the format of "-m min max".', 796 | ), 797 | click.option( 798 | "--disp-limits", 799 | "-d", 800 | type=(click.FLOAT, click.FLOAT), 801 | callback=valid_limit, 802 | default=(0.5, float("inf")), 803 | show_default=True, 804 | help="Cutoffs for the dispersion of expression" 805 | 'in the format of "-d min max".', 806 | ), 807 | click.option( 808 | "--span", 809 | type=click.FLOAT, 810 | default=0.3, 811 | show_default=True, 812 | help="The fraction of the data (cells) used when estimating the " 813 | "variance in the loess model fit if flavor='seurat_v3'.", 814 | ), 815 | click.option( 816 | "--n-bins", 817 | "-b", 818 | type=click.INT, 819 | default=20, 820 | show_default=True, 821 | help="Number of bins for binning the mean gene expression.", 822 | ), 823 | click.option( 824 | "--n-top-genes", 825 | "-t", 826 | type=click.INT, 827 | default=None, 828 | show_default=True, 829 | help="Number of highly-variable genes to keep.", 830 | ), 
831 | click.option( 832 | "--flavor", 833 | "-v", 834 | type=click.Choice(["seurat", "cell_ranger", "seurat_v3"]), 835 | default="seurat", 836 | show_default=True, 837 | help="Choose the flavor for computing normalized dispersion.", 838 | ), 839 | click.option( 840 | "--subset", 841 | "-s", 842 | is_flag=True, 843 | default=False, 844 | help="When set, inplace subset to highly-variable genes, otherwise " 845 | "only flag highly-variable genes.", 846 | ), 847 | click.option( 848 | "--batch-key", 849 | "batch_key", 850 | type=click.STRING, 851 | default=None, 852 | help="If specified, highly-variable genes are selected within each " 853 | "batch separately and merged. This simple process avoids the selection of " 854 | "batch-specific genes and acts as a lightweight batch correction method. For all " 855 | "flavors, genes are first sorted by how many batches they are a HVG. For " 856 | "dispersion-based flavors ties are broken by normalized dispersion. If flavor = " 857 | "'seurat_v3', ties are broken by the median (across batches) rank based on " 858 | "within-batch normalized variance.", 859 | ), 860 | click.option( 861 | "--always-hv-genes-file", 862 | "always_hv_genes_file", 863 | type=click.Path(exists=True), 864 | default=None, 865 | help="If specified, the gene identifiers in this file will be set as highly variable in the var dataframe after HVGs are computed.", 866 | ), 867 | click.option( 868 | "--never-hv-genes-file", 869 | "never_hv_genes_file", 870 | type=click.Path(exists=True), 871 | default=None, 872 | help="If specified, the gene identifiers in this file will be removed from highly variable in the var dataframe (set to false) after HVGs are computed.", 873 | ), 874 | ], 875 | "scale": [ 876 | *COMMON_OPTIONS["input"], 877 | *COMMON_OPTIONS["output"], 878 | *COMMON_OPTIONS["save"], 879 | COMMON_OPTIONS["zero_center"], 880 | click.option( 881 | "--max-value", 882 | "-m", 883 | type=click.FLOAT, 884 | default=None, 885 | show_default=True, 
help="When specified, clip to this value after scaling, otherwise do " 887 | "not clip", 888 | ), 889 | click.option( 890 | "--layer", 891 | "-l", 892 | type=CommaSeparatedText(simplify=True), 893 | default=None, 894 | help="If provided, which element of layers to scale.", 895 | ), 896 | ], 897 | "regress": [ 898 | *COMMON_OPTIONS["input"], 899 | *COMMON_OPTIONS["output"], 900 | *COMMON_OPTIONS["save"], 901 | COMMON_OPTIONS["n_jobs"], 902 | click.option( 903 | "--keys", 904 | "-k", 905 | type=CommaSeparatedText(simplify=True), 906 | default=None, 907 | show_default=True, 908 | help="Key(s) for observation annotation on which to regress.", 909 | ), 910 | ], 911 | "pca": [ 912 | *COMMON_OPTIONS["input"], 913 | *COMMON_OPTIONS["output"], 914 | COMMON_OPTIONS["zero_center"], 915 | COMMON_OPTIONS["random_state"], 916 | COMMON_OPTIONS["export_embedding"], 917 | COMMON_OPTIONS["n_comps"], 918 | click.option( 919 | "--svd-solver", 920 | "-V", 921 | type=click.Choice(["auto", "arpack", "randomized"]), 922 | default="auto", 923 | show_default=True, 924 | help="SVD solver to use.", 925 | ), 926 | click.option( 927 | "--use-all", 928 | "-a", 929 | "use_highly_variable", 930 | is_flag=True, 931 | flag_value=False, 932 | default=True, 933 | help="When set, use all genes for PCA, otherwise use " 934 | "highly-variable genes by default.", 935 | ), 936 | click.option( 937 | "--chunked", 938 | "-K", 939 | is_flag=True, 940 | default=False, 941 | help="When set, perform an incremental PCA on segments of " 942 | "--chunk-size, which automatically zero centers and ignore settings of " 943 | "--random-state and --svd-solver.", 944 | ), 945 | click.option( 946 | "--chunk-size", 947 | "-Z", 948 | type=click.INT, 949 | callback=required_by("--chunked"), 950 | default=None, 951 | show_default=True, 952 | help="Number of observations to include in each chunk, required by " 953 | "--chunked.", 954 | ), 955 | ], 956 | "neighbor": [ 957 | *COMMON_OPTIONS["input"], 958 | 
*COMMON_OPTIONS["output"], 959 | *COMMON_OPTIONS["use_pc"], 960 | COMMON_OPTIONS["key_added"], 961 | COMMON_OPTIONS["random_state"], 962 | click.option( 963 | "--n-neighbors", 964 | "-k", 965 | type=CommaSeparatedText(click.INT, simplify=True), 966 | default=15, 967 | show_default=True, 968 | help="The size of local neighborhood (in terms of number of " 969 | "neighboring data points) used for manifold approximation. Larger " 970 | "values result in more global views of the manifold, while smaller " 971 | "values result in more local data being preserved. In general values " 972 | "should be in the range 2 to 100. If --knn is set, number of nearest " 973 | "neighbors to be searched, otherwise a Gaussian kernel width is set to " 974 | "the distance of the --n-neighbors neighbor.", 975 | ), 976 | click.option( 977 | "--no-knn", 978 | "knn", 979 | is_flag=True, 980 | flag_value=False, 981 | default=True, 982 | show_default=True, 983 | help="When NOT set, use a hard threshold to restrict the number of " 984 | "neighbors to --n-neighbors. Otherwise, use a Gaussian kernel to " 985 | "assign low weights to neighbors more distant than the --n-neighbors " 986 | "nearest neighbor", 987 | ), 988 | click.option( 989 | "--method", 990 | "-m", 991 | type=click.Choice(["umap", "gauss", "rapids"]), 992 | default="umap", 993 | show_default=True, 994 | help="Use umap or gauss with adaptive width for computing " 995 | "connectivities. 
Use rapids for the RAPIDS implementation of UMAP " 996 | "(experimental, GPU only).", 997 | ), 998 | COMMON_OPTIONS["neighbor_metric"], 999 | ], 1000 | "umap": [ 1001 | *COMMON_OPTIONS["input"], 1002 | *COMMON_OPTIONS["output"], 1003 | COMMON_OPTIONS["knn_graph"][0], # --neighbors-key 1004 | COMMON_OPTIONS["random_state"], 1005 | COMMON_OPTIONS["key_added"], 1006 | COMMON_OPTIONS["export_embedding"], 1007 | click.option( 1008 | "--init-pos", 1009 | type=click.STRING, 1010 | default="spectral", 1011 | show_default=True, 1012 | help="How to initialize the low dimensional embedding. Can be " 1013 | '"spectral", "paga" or "random", or any key of `.obsm`.', 1014 | ), 1015 | click.option( 1016 | "--min-dist", 1017 | type=click.FLOAT, 1018 | default=0.5, 1019 | show_default=True, 1020 | help="The effective minimum distance between embedded points. Smaller " 1021 | "values will result in a more clustered embedding, while larger values " 1022 | "will results in a more even dispersal of points.", 1023 | ), 1024 | click.option( 1025 | "--spread", 1026 | type=click.FLOAT, 1027 | default=1.0, 1028 | show_default=True, 1029 | help="The effective scale of embedded points, which determines the " 1030 | "scale at which embedded points will be spread out.", 1031 | ), 1032 | click.option( 1033 | "--n-components", 1034 | type=click.INT, 1035 | default=2, 1036 | show_default=True, 1037 | help="The number of dimensions of the embedding.", 1038 | ), 1039 | click.option( 1040 | "--maxiter", 1041 | type=click.INT, 1042 | default=None, 1043 | show_default=True, 1044 | help="The number of iterations of the optimization.", 1045 | ), 1046 | click.option( 1047 | "--alpha", 1048 | type=click.FLOAT, 1049 | default=1.0, 1050 | show_default=True, 1051 | help="The initial learning rate for the embedding optimization.", 1052 | ), 1053 | click.option( 1054 | "--gamma", 1055 | type=click.FLOAT, 1056 | default=1.0, 1057 | show_default=True, 1058 | help="Weighting applied to negative samples in low 
dimensional " 1059 | "embedding optimization.", 1060 | ), 1061 | click.option( 1062 | "--negative-sample-rate", 1063 | type=click.INT, 1064 | default=5, 1065 | show_default=True, 1066 | help="The number of negative edge samples to use per positive edge " 1067 | "sample in optimizing the low dimensional embedding.", 1068 | ), 1069 | click.option( 1070 | "--method", 1071 | type=click.Choice(["umap", "rapids"]), 1072 | default="umap", 1073 | show_default=True, 1074 | help="Use the original ‘umap’ implementation, or ‘rapids’ " 1075 | "(experimental, GPU only).", 1076 | ), 1077 | ], 1078 | "tsne": [ 1079 | *COMMON_OPTIONS["input"], 1080 | *COMMON_OPTIONS["output"], 1081 | *COMMON_OPTIONS["use_pc"], 1082 | COMMON_OPTIONS["random_state"], 1083 | COMMON_OPTIONS["key_added"], 1084 | COMMON_OPTIONS["n_jobs"], 1085 | COMMON_OPTIONS["export_embedding"], 1086 | click.option( 1087 | "--perplexity", 1088 | type=click.FLOAT, 1089 | default=30, 1090 | show_default=True, 1091 | help="The perplexity is related to the number of nearest neighbors " 1092 | "that is used in other manifold learning algorithms. Larger datasets " 1093 | "usually require a larger perplexity. Consider selecting a value " 1094 | "between 5 and 50. The choice is not extremely critical since t-SNE " 1095 | "is quite insensitive to this parameter.", 1096 | ), 1097 | click.option( 1098 | "--early-exaggeration", 1099 | type=click.FLOAT, 1100 | default=12, 1101 | show_default=True, 1102 | help="Controls how tight natural clusters in the original space are in " 1103 | "the embedded space and how much space will be between them. For " 1104 | "larger values, the space between natural clusters will be larger in " 1105 | "the embedded space. Again, the choice of this parameter is not very " 1106 | "critical. 
If the cost function increases during initial optimization, " 1107 | "the early exaggeration factor or the learning rate might be too high.", 1108 | ), 1109 | click.option( 1110 | "--learning-rate", 1111 | type=click.FLOAT, 1112 | default=1000, 1113 | show_default=True, 1114 | help='Note that the R-package "Rtsne" uses a default of 200. The ' 1115 | "learning rate can be a critical parameter. It should be between 100 " 1116 | "and 1000. If the cost function increases during initial optimization, " 1117 | "the early exaggeration factor or the learning rate might be too high. " 1118 | "If the cost function gets stuck in a bad local minimum increasing the " 1119 | "learning rate helps sometimes.", 1120 | ), 1121 | click.option( 1122 | "--no-fast-tsne", 1123 | "use_fast_tsne", 1124 | is_flag=True, 1125 | flag_value=False, 1126 | default=True, 1127 | show_default=True, 1128 | help="When NOT set, use the MulticoreTSNE package by D. Ulyanov if " 1129 | "installed.", 1130 | ), 1131 | ], 1132 | "fdg": [ 1133 | *COMMON_OPTIONS["input"], 1134 | *COMMON_OPTIONS["output"], 1135 | COMMON_OPTIONS["random_state"], 1136 | COMMON_OPTIONS["export_embedding"], 1137 | COMMON_OPTIONS["root"], 1138 | click.option( 1139 | "--init-pos", 1140 | type=click.STRING, 1141 | default=None, 1142 | help="Use precomputed coordinates for initialization. 
Can be any key " 1143 | 'of `.obsm` or "paga" if .uns["paga"] is present', 1144 | ), 1145 | click.option( 1146 | "--layout", 1147 | type=click.Choice( 1148 | ["fa", "fr", "grid_fr", "kk", "lgl", "drl", "rt", "rt_circular"] 1149 | ), 1150 | default="fa", 1151 | show_default=True, 1152 | help='Name of any valid igraph layout, including "fa" (ForceAtlas2), ' 1153 | '"fr" (Fruchterman Reingold), "grid_fr" (Grid Fruchterman Reingold, ' 1154 | 'faster than "fr"), "kk" (Kamada Kawai, slower than "fr"), "lgl" ' 1155 | '(Large Graph Layout, very fast), "drl" (Distributed Recursive Layout, ' 1156 | 'pretty fast) and "rt" (Reingold Tilford tree layout).', 1157 | ), 1158 | click.option( 1159 | "--key-added-ext", 1160 | type=click.STRING, 1161 | default=None, 1162 | show_default=True, 1163 | help="By default, append 'layout'", 1164 | ), 1165 | click.option( 1166 | "--init-pos", 1167 | type=click.STRING, 1168 | default=None, 1169 | show_default=True, 1170 | help='How to initialize the low dimensional embedding. Can be "paga", ' 1171 | "or any valid key of `.obsm`.", 1172 | ), 1173 | COMMON_OPTIONS["knn_graph"][0], # --neighbors-key 1174 | COMMON_OPTIONS["knn_graph"][1], # --obsp 1175 | ], 1176 | "louvain": [ 1177 | *COMMON_OPTIONS["input"], 1178 | *COMMON_OPTIONS["output"], 1179 | COMMON_OPTIONS["export_cluster"], 1180 | *COMMON_OPTIONS["knn_graph"], 1181 | COMMON_OPTIONS["restrict_to"], 1182 | COMMON_OPTIONS["random_state"], 1183 | COMMON_OPTIONS["key_added"], 1184 | click.option( 1185 | "--flavor", 1186 | type=click.Choice(["vtraag", "igraph"]), 1187 | default="vtraag", 1188 | show_default=True, 1189 | help="Choose between two packages for computing the clustering. " 1190 | '"vtraag" is much more powerful, and the default.', 1191 | ), 1192 | click.option( 1193 | "--resolution", 1194 | "-r", 1195 | type=CommaSeparatedText(click.FLOAT, simplify=True), 1196 | default=1, 1197 | show_default=True, 1198 | help='For the default flavor "vtraag", you can provide a resolution. 
' 1199 | "Higher resolution means finding more and smaller clusters.", 1200 | ), 1201 | ], 1202 | "leiden": [ 1203 | *COMMON_OPTIONS["input"], 1204 | *COMMON_OPTIONS["output"], 1205 | COMMON_OPTIONS["export_cluster"], 1206 | *COMMON_OPTIONS["knn_graph"], 1207 | COMMON_OPTIONS["restrict_to"], 1208 | COMMON_OPTIONS["random_state"], 1209 | COMMON_OPTIONS["key_added"], 1210 | click.option( 1211 | "--resolution", 1212 | "-r", 1213 | type=CommaSeparatedText(click.FLOAT, simplify=True), 1214 | default=1, 1215 | show_default=True, 1216 | help="A parameter value controlling the coarseness of the clustering. " 1217 | 'Higher values lead to more clusters. Set to "None" if overriding ' 1218 | "--partition_type to one that doesn't accept `resolution_parameter`.", 1219 | ), 1220 | click.option( 1221 | "--n-iterations", 1222 | type=click.INT, 1223 | default=-1, 1224 | show_default=True, 1225 | help="How many iterations of the Leiden clustering algorithm to " 1226 | "perform. -1 has the algorithm run until it reaches its optimal " 1227 | "clustering.", 1228 | ), 1229 | ], 1230 | "diffexp": [ 1231 | *COMMON_OPTIONS["input"], 1232 | *COMMON_OPTIONS["output"], 1233 | COMMON_OPTIONS["use_raw"], 1234 | COMMON_OPTIONS["key_added"], 1235 | click.option( 1236 | "--layer", 1237 | "-l", 1238 | type=click.STRING, 1239 | default=None, 1240 | help="Key from adata.layers whose value will be used to perform tests on.", 1241 | ), 1242 | click.option( 1243 | "--groupby", 1244 | "-g", 1245 | type=click.STRING, 1246 | required=True, 1247 | help="The key of the observations grouping to consider.", 1248 | ), 1249 | click.option( 1250 | "--groups", 1251 | type=CommaSeparatedText(simplify=True), 1252 | default="all", 1253 | show_default=True, 1254 | help="Subset of groups to which comparison shall be restricted.", 1255 | ), 1256 | click.option( 1257 | "--reference", 1258 | type=click.STRING, 1259 | default="rest", 1260 | show_default=True, 1261 | help='If "rest", compare each group to the union of the 
rest of the ' 1262 | "groups. If a group identifier, compare with respect to this group.", 1263 | ), 1264 | click.option( 1265 | "--n-genes", 1266 | "-n", 1267 | type=click.INT, 1268 | default=None, 1269 | show_default=True, 1270 | help="The number of genes that appear in the retured tables. By " 1271 | "default return all available genes depending on the value of " 1272 | "--use-raw.", 1273 | ), 1274 | click.option( 1275 | "--method", 1276 | type=click.Choice(["logreg", "t-test", "wilcoxon", "t-test_overestim_var"]), 1277 | default="t-test_overestim_var", 1278 | show_default=True, 1279 | help="Method of performing differential expression analysis.", 1280 | ), 1281 | click.option( 1282 | "--corr-method", 1283 | type=click.Choice(["benjamini-hochberg", "bonferroni"]), 1284 | default="benjamini-hochberg", 1285 | show_default=True, 1286 | help='P-value correction method. Used only for "t-test", ' 1287 | '"t-test_overestim_var" and "wilcoxon".', 1288 | ), 1289 | click.option( 1290 | "--rankby-abs", 1291 | is_flag=True, 1292 | default=False, 1293 | show_default=True, 1294 | help="Rank genes by the absolute value of the score, not by the score. " 1295 | "The returned scores are never the absolute values.", 1296 | ), 1297 | click.option( 1298 | "--pts", 1299 | is_flag=True, 1300 | default=False, 1301 | show_default=True, 1302 | help="Compute the fraction of cells expressing the genes.", 1303 | ), 1304 | click.option( 1305 | "--tie-correct", 1306 | is_flag=True, 1307 | default=False, 1308 | show_default=True, 1309 | help="Use tie correction for 'wilcoxon' scores. 
Used only for " 1310 | "'wilcoxon'.", 1311 | ), 1312 | click.option( 1313 | "--filter-params", 1314 | type=Dictionary( 1315 | keys=[ 1316 | "min_in_group_fraction", 1317 | "max_out_group_fraction", 1318 | "min_fold_change", 1319 | ] 1320 | ), 1321 | default=None, 1322 | show_default=True, 1323 | help="Parameters for filtering DE results, valid parameters are: " 1324 | '"min_in_group_fraction" (float), "max_out_group_fraction" (float), ' 1325 | '"min_fold_change" (float).', 1326 | ), 1327 | click.option( 1328 | "--logreg-param", 1329 | type=Dictionary(), 1330 | default=None, 1331 | show_default=True, 1332 | help="Parameters passed to `sklearn.linear_model.LogisticRegression`.", 1333 | ), 1334 | click.option( 1335 | "--save", 1336 | type=click.Path(dir_okay=False, writable=True), 1337 | default=None, 1338 | show_default=True, 1339 | help="Tab-separated table to store results of differential expression " 1340 | "analysis.", 1341 | ), 1342 | ], 1343 | "paga": [ 1344 | *COMMON_OPTIONS["input"], 1345 | *COMMON_OPTIONS["output"], 1346 | COMMON_OPTIONS["knn_graph"][0], # --neighbors-key 1347 | COMMON_OPTIONS["key_added"], 1348 | click.option( 1349 | "--groups", 1350 | type=click.STRING, 1351 | required=True, 1352 | help="Key for categorical in `.obs`. You can pass your predefined " 1353 | "groups by choosing any categorical annotation of observations.", 1354 | ), 1355 | click.option( 1356 | "--model", 1357 | type=click.Choice(["v1.2", "v1.0"]), 1358 | default="v1.2", 1359 | show_default=True, 1360 | help="The PAGA connectivity model.", 1361 | ), 1362 | click.option( 1363 | "--use-rna-velocity", 1364 | is_flag=True, 1365 | default=False, 1366 | show_default=True, 1367 | help="Use RNA velocity to orient edges in the abstracted graph and " 1368 | "estimate transitions. Requires that adata.uns contains a directed single-cell " 1369 | "graph with key velocity_graph. 
This feature might be subject to change in the " 1370 | "future.", 1371 | ), 1372 | ], 1373 | "diffmap": [ 1374 | *COMMON_OPTIONS["input"], 1375 | *COMMON_OPTIONS["output"], 1376 | COMMON_OPTIONS["knn_graph"][0], # --neighbors-key 1377 | COMMON_OPTIONS["key_added"], 1378 | COMMON_OPTIONS["export_embedding"], 1379 | COMMON_OPTIONS["n_comps"], 1380 | ], 1381 | "dpt": [ 1382 | *COMMON_OPTIONS["input"], 1383 | *COMMON_OPTIONS["output"], 1384 | COMMON_OPTIONS["knn_graph"][0], # --neighbors-key 1385 | COMMON_OPTIONS["key_added"], 1386 | click.option( 1387 | "--root", 1388 | type=(click.STRING, click.STRING), 1389 | default=(None, None), 1390 | show_default=True, 1391 | help="Specify a categorical annotaion of observations (`.obs`) and a " 1392 | "value representing the root cells.", 1393 | ), 1394 | click.option( 1395 | "--n-dcs", 1396 | type=click.INT, 1397 | default=10, 1398 | show_default=True, 1399 | help="The number of diffusion components to use.", 1400 | ), 1401 | click.option( 1402 | "--n-branchings", 1403 | type=click.INT, 1404 | default=0, 1405 | show_default=True, 1406 | help="Number of branchings to detect.", 1407 | ), 1408 | click.option( 1409 | "--min-group-size", 1410 | type=click.FLOAT, 1411 | default=0.01, 1412 | show_default=True, 1413 | help="During recursive splitting of branches for --n-branchings > 1, " 1414 | "do not consider branches/groups that contain fewer than this fraction " 1415 | "of the total number of data points.", 1416 | ), 1417 | click.option( 1418 | "--disallow-kendall-tau-shift", 1419 | "allow_kendall_tau_shift", 1420 | is_flag=True, 1421 | default=True, 1422 | show_default=True, 1423 | help="By default: If a very small branch is detected upon " 1424 | "splitting, shift away from maximum correlation in Kendall tau criterion of " 1425 | "[Haghverdi16] to stabilize the splitting. 
Use flag to disable this.", 1426 | ), 1427 | ], 1428 | "combat": [ 1429 | *COMMON_OPTIONS["input"], 1430 | *COMMON_OPTIONS["output"], 1431 | COMMON_OPTIONS["batch_key"], 1432 | COMMON_OPTIONS["batch_layer"], 1433 | click.option( 1434 | "--key-added", 1435 | type=click.STRING, 1436 | default=None, 1437 | show_default=True, 1438 | help="Key under which to add the computed results. By default a new " 1439 | "layer will be created called 'combat', 'combat_{layer}' or " 1440 | "'combat_layer_{key_added}' where those parameters were specified. A value of 'X' " 1441 | "causes batch-corrected values to overwrite the original content of .X.", 1442 | ), 1443 | click.option( 1444 | "--covariates", 1445 | type=(CommaSeparatedText()), 1446 | default=None, 1447 | show_default=True, 1448 | help="Comma-separated list of additional covariates besides the " 1449 | "batch variable such as adjustment variables or biological condition. This " 1450 | "parameter refers to the design matrix X in Equation 2.1 in [Johnson07] and to " 1451 | "the mod argument in the original combat function in the sva R package. Note " 1452 | "that not including covariates may introduce bias or lead to the removal of " 1453 | "biological signal in unbalanced designs.", 1454 | ), 1455 | ], 1456 | "harmony": [ 1457 | *COMMON_OPTIONS["input"], 1458 | *COMMON_OPTIONS["output"], 1459 | COMMON_OPTIONS["batch_key"], 1460 | click.option( 1461 | "--basis", 1462 | type=click.STRING, 1463 | default="X_pca", 1464 | show_default=True, 1465 | help="The name of the field in adata.obsm where the PCA table is " 1466 | "stored. 
Defaults to 'X_pca', which is the default for sc.tl.pca().", 1467 | ), 1468 | click.option( 1469 | "--adjusted-basis", 1470 | type=click.STRING, 1471 | default="X_pca_harmony", 1472 | show_default=True, 1473 | help="The name of the field in adata.obsm where the adjusted PCA " 1474 | "table will be stored after running this function.", 1475 | ), 1476 | click.option( 1477 | "--theta", 1478 | type=click.FLOAT, 1479 | default=2, 1480 | show_default=True, 1481 | help="Diversity clustering penalty parameter. theta=0 does not encourage any " 1482 | "diversity. Larger values of theta result in more diverse clusters.", 1483 | ), 1484 | click.option( 1485 | "--lambda", 1486 | "lamb", 1487 | type=click.FLOAT, 1488 | default=1, 1489 | show_default=True, 1490 | help="Ridge regression penalty parameter. Lambda must be strictly " 1491 | "positive. Smaller values result in more aggressive correction.", 1492 | ), 1493 | click.option( 1494 | "--sigma", 1495 | type=click.FLOAT, 1496 | default=0.1, 1497 | show_default=True, 1498 | help="Width of soft kmeans clusters. Sigma scales the distance from " 1499 | "a cell to cluster centroids. Larger values of sigma result in cells assigned to " 1500 | "more clusters. Smaller values of sigma make soft kmeans cluster approach hard " 1501 | "clustering.", 1502 | ), 1503 | click.option( 1504 | "--n-clust", 1505 | "nclust", 1506 | type=click.INT, 1507 | default=None, 1508 | show_default=False, 1509 | help="Number of clusters in model. nclust=1 equivalent to simple " 1510 | "linear regression.", 1511 | ), 1512 | click.option( 1513 | "--tau", 1514 | type=click.INT, 1515 | default=0, 1516 | show_default=True, 1517 | help="Protection against overclustering small datasets with large ones. " 1518 | "tau is the expected number of cells per cluster.", 1519 | ), 1520 | click.option( 1521 | "--block-size", 1522 | type=click.FLOAT, 1523 | default=0.05, 1524 | show_default=True, 1525 | help="What proportion of cells to update during clustering. 
Between " 1526 | "0 to 1, default 0.05. Larger values may be faster but less accurate.", 1527 | ), 1528 | click.option( 1529 | "--max-iter-cluster", 1530 | "max_iter_kmeans", 1531 | type=click.INT, 1532 | default=20, 1533 | show_default=True, 1534 | help="Maximum number of rounds to run clustering at each round of " 1535 | "Harmony.", 1536 | ), 1537 | click.option( 1538 | "--max-iter-harmony", 1539 | type=click.INT, 1540 | default=10, 1541 | show_default=True, 1542 | help="Maximum number of rounds to run Harmony. One round of Harmony " 1543 | "involves one clustering and one correction step.", 1544 | ), 1545 | click.option( 1546 | "--epsilon-cluster", 1547 | type=click.FLOAT, 1548 | default=1e-5, 1549 | show_default=True, 1550 | help="Convergence tolerance for clustering round of Harmony Set to " 1551 | "-Inf to never stop early.", 1552 | ), 1553 | click.option( 1554 | "--epsilon-harmony", 1555 | type=click.FLOAT, 1556 | default=1e-5, 1557 | show_default=True, 1558 | help="Convergence tolerance for clustering round of Harmony Set to " 1559 | "-Inf to never stop early.", 1560 | ), 1561 | COMMON_OPTIONS["random_state"], 1562 | ], 1563 | "mnn": [ 1564 | *COMMON_OPTIONS["input"], 1565 | *COMMON_OPTIONS["output"], 1566 | *COMMON_OPTIONS["save"], 1567 | COMMON_OPTIONS["batch_key"], 1568 | COMMON_OPTIONS["batch_layer"], 1569 | click.option( 1570 | "--key-added", 1571 | type=click.STRING, 1572 | default=None, 1573 | show_default=True, 1574 | help="Key under which to add the computed results. By default a new " 1575 | "layer will be created called 'mnn', 'mnn_{layer}' or " 1576 | "'mnn_layer_{key_added}' where those parameters were specified. 
A value of 'X' " 1577 | "causes batch-corrected values to overwrite the original content of .X.", 1578 | ), 1579 | click.option( 1580 | "--var-subset", 1581 | type=(click.STRING, CommaSeparatedText()), 1582 | multiple=True, 1583 | help="The subset of vars (list of str) to be used when performing " 1584 | "MNN correction in the format of '--var-subset '. Typically, use " 1585 | "the highly variable genes (HVGs) like '--var-subset highly_variable True'. When " 1586 | "unset, uses all vars.", 1587 | ), 1588 | click.option( 1589 | "--n-neighbors", 1590 | "-k", 1591 | type=CommaSeparatedText(click.INT, simplify=True), 1592 | default=20, 1593 | show_default=True, 1594 | help="Number of mutual nearest neighbors.", 1595 | ), 1596 | click.option( 1597 | "--sigma", 1598 | type=click.FLOAT, 1599 | default=1.0, 1600 | show_default=True, 1601 | help="The bandwidth of the Gaussian smoothing kernel used to " 1602 | "compute the correction vectors.", 1603 | ), 1604 | click.option( 1605 | "--no-cos_norm_in", 1606 | "cos_norm_in", 1607 | is_flag=True, 1608 | default=True, 1609 | help="Default behaviour is to perform cosine normalization on the " 1610 | "input data prior to calculating distances between cells. Use this " 1611 | "flag to disable that behaviour.", 1612 | ), 1613 | click.option( 1614 | "--no-cos_norm_out", 1615 | "cos_norm_out", 1616 | is_flag=True, 1617 | default=True, 1618 | help="Default behaviour is to perform cosine normalization prior to " 1619 | "computing corrected expression values. Use this flag to disable that " 1620 | "behaviour.", 1621 | ), 1622 | click.option( 1623 | "--svd-dim", 1624 | type=click.INT, 1625 | default=None, 1626 | show_default=True, 1627 | help="The number of dimensions to use for summarizing biological " 1628 | "substructure within each batch. 
If not set, biological components " 1629 | "will not be removed from the correction vectors.", 1630 | ), 1631 | click.option( 1632 | "--no-var-adj", 1633 | is_flag=True, 1634 | default=True, 1635 | help="Default behaviour is to adjust variance of the correction " 1636 | "vectors. Use this flag to disable that behaviour. Note this step takes most " 1637 | "computing time.", 1638 | ), 1639 | click.option( 1640 | "--compute-angle", 1641 | is_flag=True, 1642 | default=False, 1643 | help="When set, compute the angle between each cell’s correction " 1644 | "vector and the biological subspace of the reference batch.", 1645 | ), 1646 | click.option( 1647 | "--svd-mode", 1648 | type=click.Choice(["svd", "rsvd", "irlb"]), 1649 | default="rsvd", 1650 | show_default=True, 1651 | help="'svd' computes SVD using a non-randomized SVD-via-ID " 1652 | "algorithm, while 'rsvd' uses a randomized version. 'irlb' performs truncated " 1653 | "SVD by implicitly restarted Lanczos bidiagonalization (forked from " 1654 | "https://github.com/airysen/irlbpy).", 1655 | ), 1656 | ], 1657 | "bbknn": [ 1658 | *COMMON_OPTIONS["input"], 1659 | *COMMON_OPTIONS["output"], 1660 | COMMON_OPTIONS["key_added"], 1661 | COMMON_OPTIONS["batch_key"], 1662 | click.option( 1663 | "--use-rep", 1664 | "-u", 1665 | type=click.STRING, 1666 | default="X_pca", 1667 | show_default=True, 1668 | help="The dimensionality reduction in .obsm to use for neighbour " 1669 | "detection.", 1670 | ), 1671 | COMMON_OPTIONS["use_pc"][0], # --n-pcs 1672 | click.option( 1673 | "--no-approx", 1674 | "approx", 1675 | is_flag=True, 1676 | default=True, 1677 | help="Default behaviour is to use annoy’s approximate neighbour " 1678 | "finding. This results in a quicker run time for large datasets while also " 1679 | "potentially increasing the degree of batch correction. 
Use this flag to disable " 1680 | "that behaviour.", 1681 | ), 1682 | COMMON_OPTIONS["neighbor_metric"], 1683 | click.option( 1684 | "--neighbors-within-batch", 1685 | type=click.INT, 1686 | default=3, 1687 | show_default=True, 1688 | help="How many top neighbours to report for each batch; total " 1689 | "number of neighbours will be this number times the number of batches.", 1690 | ), 1691 | click.option( 1692 | "--trim", 1693 | type=click.INT, 1694 | default=None, 1695 | show_default=True, 1696 | help="Trim the neighbours of each cell to these many top " 1697 | "connectivities. May help with population independence and improve the tidiness " 1698 | "of clustering. The lower the value the more independent the individual " 1699 | "populations, at the cost of more conserved batch effect. If None, sets the " 1700 | "parameter value automatically to 10 times the total number of neighbours for " 1701 | "each cell. Set to 0 to skip.", 1702 | ), 1703 | click.option( 1704 | "--annoy-n-trees", 1705 | type=click.INT, 1706 | default=10, 1707 | show_default=True, 1708 | help="Only used when approx=True. The number of trees to construct " 1709 | "in the annoy forest. More trees give higher precision when querying, at the " 1710 | "cost of increased run time and resource intensity.", 1711 | ), 1712 | click.option( 1713 | "--no-use-faiss", 1714 | "use_faiss", 1715 | is_flag=True, 1716 | default=True, 1717 | help="Default behaviour If approx=False and the metric is " 1718 | "“euclidean”, is to use the faiss package to compute nearest neighbours if " 1719 | "installed. This improves performance at a minor cost to numerical precision as " 1720 | "faiss operates on float32. 
Use this flag to disable that behaviour.", 1721 | ), 1722 | click.option( 1723 | "--set-op-mix-ratio", 1724 | type=click.FLOAT, 1725 | default=1, 1726 | show_default=True, 1727 | help="UMAP connectivity computation parameter, float between 0 and " 1728 | "1, controlling the blend between a connectivity matrix formed exclusively from " 1729 | "mutual nearest neighbour pairs (0) and a union of all observed neighbour " 1730 | "relationships with the mutual pairs emphasised (1).", 1731 | ), 1732 | click.option( 1733 | "--local-connectivity", 1734 | type=click.INT, 1735 | default=1, 1736 | show_default=True, 1737 | help="UMAP connectivity computation parameter, how many nearest " 1738 | "neighbors of each cell are assumed to be fully connected (and given a " 1739 | "connectivity value of 1)", 1740 | ), 1741 | ], 1742 | "scrublet": [ 1743 | *COMMON_OPTIONS["input"], 1744 | *COMMON_OPTIONS["output"], 1745 | click.option( 1746 | "--batch-key", 1747 | "batch_key", 1748 | type=click.STRING, 1749 | help="The name of the column in adata.obs that differentiates among " 1750 | "experiments/batches. Doublets will be detected in each batch separately.", 1751 | ), 1752 | click.option( 1753 | "--input-obj-sim", 1754 | "adata_sim", 1755 | type=click.Path(exists=True, dir_okay=False), 1756 | default=None, 1757 | help="(Advanced use case) Optional annData object generated by " 1758 | "sc.external.pp.scrublet_simulate_doublets(), with same number of " 1759 | "vars as adata. This should have been built from input_obj after " 1760 | "filtering genes and cells and selcting highly-variable genes.", 1761 | ), 1762 | click.option( 1763 | "--threshold", 1764 | type=click.FLOAT, 1765 | default=None, 1766 | show_default=True, 1767 | help="Doublet score threshold for calling a transcriptome a " 1768 | "doublet. If not set, this is set automatically by looking for the " 1769 | "minimum between the two modes of the doublet_scores_sim_ histogram. 
" 1770 | "It is best practice to check the threshold visually using the " 1771 | "doublet_scores_sim_ histogram and/or based on co-localization of " 1772 | "predicted doublets in a 2-D embedding.", 1773 | ), 1774 | *COMMON_OPTIONS["scrublet"], 1775 | click.option( 1776 | "--expected-doublet-rate", 1777 | type=click.FLOAT, 1778 | default=0.05, 1779 | show_default=True, 1780 | help="Where input_obj_sim not suplied, the estimated doublet rate " 1781 | "for the experiment.", 1782 | ), 1783 | click.option( 1784 | "--stdev-doublet-rate", 1785 | type=click.FLOAT, 1786 | default=0.02, 1787 | show_default=True, 1788 | help="Where input_obje_sim not suplied, uncertainty in the expected " 1789 | "doublet rate.", 1790 | ), 1791 | click.option( 1792 | "--knn-dist-metric", 1793 | "-t", 1794 | type=click.Choice( 1795 | [ 1796 | "cityblock", 1797 | "cosine", 1798 | "euclidean", 1799 | "l1", 1800 | "l2", 1801 | "manhattan", 1802 | "braycurtis", 1803 | "canberra", 1804 | "chebyshev", 1805 | "correlation", 1806 | "dice", 1807 | "hamming", 1808 | "jaccard", 1809 | "kulsinski", 1810 | "mahalanobis", 1811 | "minkowski", 1812 | "rogerstanimoto", 1813 | "russellrao", 1814 | "seuclidean", 1815 | "sokalmichener", 1816 | "sokalsneath", 1817 | "sqeuclidean", 1818 | "yule", 1819 | ] 1820 | ), 1821 | default="euclidean", 1822 | show_default=True, 1823 | help="A known metric’s name.", 1824 | ), 1825 | click.option( 1826 | "--no-normalize-variance", 1827 | "normalize_variance", 1828 | is_flag=True, 1829 | default=True, 1830 | help="Default is to normalize the data such that each gene has a " 1831 | "variance of 1. sklearn.decomposition.TruncatedSVD will be used for " 1832 | "dimensionality reduction, if --no-mean-center is set. 
Use this flag " 1833 | "to disable that behaviour.", 1834 | ), 1835 | click.option( 1836 | "--log-transform", 1837 | is_flag=True, 1838 | default=False, 1839 | show_default=True, 1840 | help="Whether to use :func:~scanpy.pp.log1p to log-transform the " 1841 | "data prior to PCA.", 1842 | ), 1843 | click.option( 1844 | "--no-mean-center", 1845 | "mean_center", 1846 | is_flag=True, 1847 | default=True, 1848 | help="If True, center the data such that each gene has a mean of 0. " 1849 | "sklearn.decomposition.PCA will be used for dimensionality " 1850 | "reduction.", 1851 | ), 1852 | click.option( 1853 | "--n-pcs", 1854 | "n_prin_comps", 1855 | type=click.INT, 1856 | default=30, 1857 | show_default=True, 1858 | help="Number of principal components used to embed the " 1859 | "transcriptomes prior to k-nearest-neighbor graph construction.", 1860 | ), 1861 | click.option( 1862 | "--no-approx", 1863 | "use_approx_neighbors", 1864 | is_flag=True, 1865 | default=True, 1866 | help="Default behaviour is to use the approximate nearest neighbor " 1867 | "method (annoy) for the KNN classifier. Use this flag to disable " 1868 | "that behaviour.", 1869 | ), 1870 | click.option( 1871 | "--get-doublet-neighbor-parents", 1872 | is_flag=True, 1873 | default=False, 1874 | show_default=True, 1875 | help="If set, return (in .uns) the parent transcriptomes that " 1876 | "generated the doublet neighbors of each observed transcriptome. " 1877 | "This information can be used to infer the cell states that " 1878 | "generated a given doublet state.", 1879 | ), 1880 | click.option( 1881 | "--n-neighbors", 1882 | "-k", 1883 | type=CommaSeparatedText(click.INT, simplify=True), 1884 | default=None, 1885 | show_default=True, 1886 | help="Number of neighbors used to construct the KNN graph of " 1887 | "observed transcriptomes and simulated doublets. 
If not set, this is " 1888 | "automatically set to np.round(0.5 * np.sqrt(n_obs)).", 1889 | ), 1890 | click.option( 1891 | "--filter", 1892 | "filter", 1893 | is_flag=True, 1894 | default=False, 1895 | help="By default, the output object is annotated but not filtered " 1896 | "according to the scrublet status. Setting this flag will cause " 1897 | "predicted multiplet elements to be removed.", 1898 | ), 1899 | click.option( 1900 | "--no-verbose", 1901 | "verbose", 1902 | is_flag=True, 1903 | default=True, 1904 | help="Default behaviour is to print progress updates. Use this flag " 1905 | "to disable that.", 1906 | ), 1907 | click.option( 1908 | "--export-table", 1909 | type=click.Path(dir_okay=False, writable=True), 1910 | default=None, 1911 | show_default=True, 1912 | help="Export a table of double scores and calls to the specified file.", 1913 | ), 1914 | COMMON_OPTIONS["random_state"], 1915 | ], 1916 | "plot_scrublet": [ 1917 | *COMMON_OPTIONS["input"], 1918 | *COMMON_OPTIONS["plot"], 1919 | click.option( 1920 | "--scale-hist-obs", 1921 | "-b", 1922 | type=click.Choice(["linear", "log", "symlog", "logit"]), 1923 | default="log", 1924 | show_default=True, 1925 | help="Set y axis scale transformation in matplotlib for the plot of observed transcriptomes.", 1926 | ), 1927 | click.option( 1928 | "--scale-hist-sim", 1929 | "-s", 1930 | type=click.Choice(["linear", "log", "symlog", "logit"]), 1931 | default="linear", 1932 | show_default=True, 1933 | help="Set y axis scale transformation in matplotlib for the plot of observed transcriptomes.", 1934 | ), 1935 | ], 1936 | "scrublet_simulate_doublets": [ 1937 | *COMMON_OPTIONS["input"], 1938 | *COMMON_OPTIONS["output"], 1939 | *COMMON_OPTIONS["scrublet"], 1940 | click.option( 1941 | "--layer", 1942 | "-l", 1943 | type=click.STRING, 1944 | default=None, 1945 | help="Layer of adata where raw values are stored, or ‘X’ if values " 1946 | "are in .X.", 1947 | ), 1948 | ], 1949 | "embed": [ 1950 | *COMMON_OPTIONS["input"], 1951 
| *COMMON_OPTIONS["plot"], 1952 | *COMMON_OPTIONS["frame_title"], 1953 | COMMON_OPTIONS["layer"], 1954 | click.option( 1955 | "--basis", 1956 | type=click.STRING, 1957 | default="umap", 1958 | show_default=True, 1959 | help="Name of the embedding to plot, must be a key of `.obsm` without " 1960 | 'the prefix "X_".', 1961 | ), 1962 | click.option( 1963 | "--color", 1964 | type=CommaSeparatedText(simplify=True), 1965 | default=None, 1966 | show_default=True, 1967 | help="Keys for annotations of observations/cells or variables/genes.", 1968 | ), 1969 | click.option( 1970 | "--legend-loc", 1971 | type=click.Choice(["right margin", "on data"]), 1972 | default="right margin", 1973 | show_default=True, 1974 | help='Location of legend, either "on data", "right margin" or valid ' 1975 | "keywords for `matplotlib.legend`.", 1976 | ), 1977 | click.option( 1978 | "--legend-fontsize", 1979 | type=click.INT, 1980 | default=15, 1981 | show_default=True, 1982 | help="Legend font size.", 1983 | ), 1984 | click.option( 1985 | "--size", 1986 | type=click.FLOAT, 1987 | default=None, 1988 | show_default=True, 1989 | help="Point size. Automatically computed if not specified.", 1990 | ), 1991 | COMMON_OPTIONS["gene_symbols"], 1992 | click.option( 1993 | "--edges", 1994 | is_flag=True, 1995 | default=False, 1996 | show_default=True, 1997 | help="Show edges.", 1998 | ), 1999 | click.option( 2000 | "--edges-width", 2001 | type=click.FLOAT, 2002 | default=0.1, 2003 | show_default=True, 2004 | help="Width of edges.", 2005 | ), 2006 | click.option( 2007 | "--edges-color", 2008 | type=click.STRING, 2009 | default=None, 2010 | show_default=True, 2011 | help="Color of edges. 
See draw_networkx_edges().", 2012 | ), 2013 | COMMON_OPTIONS["knn_graph"][0], # --neighbors-key 2014 | click.option( 2015 | "--no-sort-order", 2016 | "sort_order", 2017 | is_flag=True, 2018 | default=True, 2019 | show_default=True, 2020 | help="Disable default behaviour: for continuous annotations used as " 2021 | "color parameter, plot data points with higher values on top of others.", 2022 | ), 2023 | *COMMON_OPTIONS["plot_embed"], 2024 | click.option( 2025 | "--components", 2026 | type=click.STRING, 2027 | default=None, 2028 | show_default=True, 2029 | help="For instance, ['1,2', '2,3']. To plot all available components use 'all'.", 2030 | ), 2031 | click.option( 2032 | "--projection", 2033 | type=click.Choice(["2d", "3d"]), 2034 | default="2d", 2035 | show_default=True, 2036 | help="Projection of plot.", 2037 | ), 2038 | ], 2039 | "plot_paga": [ 2040 | *COMMON_OPTIONS["input"], 2041 | *COMMON_OPTIONS["plot"], 2042 | *COMMON_OPTIONS["frame_title"], 2043 | *COMMON_OPTIONS["plot_embed"], 2044 | COMMON_OPTIONS["random_state"], 2045 | click.option( 2046 | "--use-key", 2047 | type=click.STRING, 2048 | default="paga", 2049 | show_default=True, 2050 | help="The key in `.uns` that contains trajectory information.", 2051 | ), 2052 | click.option( 2053 | "--layout", 2054 | type=click.Choice(["fa", "fr", "grid_fr", "kk", "lgl", "drl", "rt"]), 2055 | default="fr", 2056 | show_default=True, 2057 | help="Plotting layout that computes positions.", 2058 | ), 2059 | click.option( 2060 | "--init-pos", 2061 | type=click.STRING, 2062 | default=None, 2063 | show_default=True, 2064 | help="Plotting layout that computes positions.", 2065 | ), 2066 | click.option( 2067 | "--threshold", 2068 | type=click.FLOAT, 2069 | default=0.01, 2070 | show_default=True, 2071 | help="Do not draw edges for weights below this threshold. 
Set to 0 to " 2072 | "include all edges.", 2073 | ), 2074 | COMMON_OPTIONS["root"], 2075 | click.option( 2076 | "--root", 2077 | type=click.INT, 2078 | default=0, 2079 | show_default=True, 2080 | help="If choosing a tree layout, this is the index of the root node.", 2081 | ), 2082 | click.option( 2083 | "--transitions", 2084 | type=click.STRING, 2085 | default=None, 2086 | show_default=True, 2087 | help='Key for `.uns["paga"]` that specifies the matrix, e.g. ' 2088 | "`transition_confidence`, that stores the arrows.", 2089 | ), 2090 | click.option( 2091 | "--single-component", 2092 | is_flag=True, 2093 | default=False, 2094 | show_default=True, 2095 | help="Restrict to largest connected component", 2096 | ), 2097 | click.option( 2098 | "--solid-edges", 2099 | type=click.Choice(["connectivities", "connectivities_tree"]), 2100 | default="connectivities", 2101 | show_default=True, 2102 | help='Key for `.uns["paga"]` that specifies the matrix that stores the ' 2103 | "edges to be drawn solid black.", 2104 | ), 2105 | click.option( 2106 | "--basis", 2107 | type=click.STRING, 2108 | default=None, 2109 | show_default=True, 2110 | help="Name of the embedding to plot, must be a key of `.obsm` without " 2111 | 'the prefix "X_".', 2112 | ), 2113 | click.option( 2114 | "--color", 2115 | type=CommaSeparatedText(simplify=True), 2116 | default=None, 2117 | show_default=True, 2118 | help="Key(s) for annotation of observations/cells or variables/genes. Comma-separated if more than one", 2119 | ), 2120 | click.option( 2121 | "--legend-loc", 2122 | type=click.Choice(["right margin", "on data"]), 2123 | default="right margin", 2124 | show_default=True, 2125 | help='Location of legend, either "on data", "right margin" or valid ' 2126 | "keywords for `matplotlib.legend`.", 2127 | ), 2128 | click.option( 2129 | "--size", 2130 | type=click.FLOAT, 2131 | default=None, 2132 | show_default=True, 2133 | help="Point size. 
Automatically computed if not specified.", 2134 | ), 2135 | click.option( 2136 | "--node-size-scale", 2137 | type=click.FLOAT, 2138 | default=1.0, 2139 | show_default=True, 2140 | help="Increase of decrease the size of the nodes.", 2141 | ), 2142 | click.option( 2143 | "--fontsize", 2144 | type=click.INT, 2145 | default=None, 2146 | show_default=True, 2147 | help="Font size for node labels.", 2148 | ), 2149 | click.option( 2150 | "--edge-width-scale", 2151 | type=click.FLOAT, 2152 | default=1.0, 2153 | show_default=True, 2154 | help="Increase of decrease the width of the edges.", 2155 | ), 2156 | click.option( 2157 | "--arrowsize", 2158 | type=click.INT, 2159 | default=30, 2160 | show_default=True, 2161 | help="For directed graphs, specify the length and width of the arrowhead.", 2162 | ), 2163 | *COMMON_OPTIONS["opt_output"], 2164 | ], 2165 | "sviol": [ 2166 | *COMMON_OPTIONS["input"], 2167 | *COMMON_OPTIONS["plot"], 2168 | COMMON_OPTIONS["use_raw"], 2169 | COMMON_OPTIONS["var_names"], 2170 | *COMMON_OPTIONS["rank_genes_groups_plots"], 2171 | COMMON_OPTIONS["layer"], 2172 | *COMMON_OPTIONS["diffexp_plot"], 2173 | COMMON_OPTIONS["gene_symbols"], 2174 | *COMMON_OPTIONS["sviol"], 2175 | COMMON_OPTIONS["swap_axes"], 2176 | ], 2177 | "dot": [ 2178 | *COMMON_OPTIONS["input"], 2179 | *COMMON_OPTIONS["plot"], 2180 | COMMON_OPTIONS["use_raw"], 2181 | COMMON_OPTIONS["var_names"], 2182 | *COMMON_OPTIONS["rank_genes_groups_plots"], 2183 | COMMON_OPTIONS["layer"], 2184 | *COMMON_OPTIONS["diffexp_plot"], 2185 | COMMON_OPTIONS["gene_symbols"], 2186 | *COMMON_OPTIONS["dot"], 2187 | ], 2188 | "matrix": [ 2189 | *COMMON_OPTIONS["input"], 2190 | *COMMON_OPTIONS["plot"], 2191 | COMMON_OPTIONS["use_raw"], 2192 | COMMON_OPTIONS["var_names"], 2193 | *COMMON_OPTIONS["rank_genes_groups_plots"], 2194 | COMMON_OPTIONS["layer"], 2195 | *COMMON_OPTIONS["diffexp_plot"], 2196 | COMMON_OPTIONS["gene_symbols"], 2197 | ], 2198 | "heat": [ 2199 | *COMMON_OPTIONS["input"], 2200 | 
*COMMON_OPTIONS["plot"], 2201 | COMMON_OPTIONS["use_raw"], 2202 | COMMON_OPTIONS["var_names"], 2203 | *COMMON_OPTIONS["rank_genes_groups_plots"], 2204 | COMMON_OPTIONS["layer"], 2205 | *COMMON_OPTIONS["diffexp_plot"], 2206 | COMMON_OPTIONS["gene_symbols"], 2207 | *COMMON_OPTIONS["heat"], 2208 | COMMON_OPTIONS["swap_axes"], 2209 | ], 2210 | } 2211 | -------------------------------------------------------------------------------- /scanpy_scripts/cmd_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide helper functions for constructing sub-commands 3 | """ 4 | 5 | import click 6 | import pandas as pd 7 | import scanpy as sc 8 | import scanpy.external as sce 9 | 10 | from .cmd_options import CMD_OPTIONS 11 | from .lib._paga import plot_paga 12 | from .lib._scrublet import plot_scrublet 13 | from .obj_utils import _save_matrix 14 | 15 | 16 | def make_subcmd(cmd_name, func, cmd_desc, arg_desc, opt_set=None): 17 | """ 18 | Factory function that returns a sub-command function 19 | """ 20 | opt_set = opt_set if opt_set else cmd_name 21 | options = CMD_OPTIONS[opt_set] 22 | option_spec = [click.command(cmd_name)] 23 | option_spec.extend(options) 24 | 25 | def add_docstring(cmd_desc, arg_desc): 26 | def docstring_dec(obj): 27 | obj.__doc__ = obj.__doc__.format(cmd_desc=cmd_desc, arg_desc=arg_desc) 28 | return obj 29 | 30 | return docstring_dec 31 | 32 | @add_options(option_spec) 33 | @add_docstring(cmd_desc, arg_desc) 34 | def cmd( 35 | input_obj=None, 36 | output_obj=None, 37 | input_format=None, 38 | output_format=None, 39 | zarr_chunk_size=None, 40 | loom_write_obsm_varm=False, 41 | export_mtx=None, 42 | mtx_compression=None, 43 | show_obj=None, 44 | **kwargs, 45 | ): 46 | """{cmd_desc}\n\n\b\n{arg_desc}""" 47 | if input_obj: 48 | adata = _read_obj(input_obj, input_format=input_format) 49 | func(adata, **kwargs) 50 | else: 51 | adata = func(**kwargs) 52 | 53 | if output_obj: 54 | _write_obj( 55 | adata, 56 | 
output_obj, 57 | output_format=output_format, 58 | chunk_size=zarr_chunk_size, 59 | write_obsm_varm=loom_write_obsm_varm, 60 | export_mtx=export_mtx, 61 | mtx_compression=mtx_compression, 62 | show_obj=show_obj, 63 | ) 64 | return 0 65 | 66 | return cmd 67 | 68 | 69 | def add_options(options): 70 | """ 71 | Returns a decorator to group multiple click decorators 72 | """ 73 | 74 | def _add_options(func): 75 | for option in reversed(options): 76 | func = option(func) 77 | return func 78 | 79 | return _add_options 80 | 81 | 82 | def _fix_booleans(df): 83 | for var in df.columns: 84 | if ( 85 | df[var].dtype.kind == "O" 86 | and df[var].dtype.name == "object" 87 | and set(pd.Categorical(df[var])).issubset(set(["True", "False", "nan"])) 88 | ): 89 | d = {"False": True, "False": False, "nan": False} 90 | df[var] = df[var].map(d).astype(bool) 91 | return df 92 | 93 | 94 | def _read_obj(input_obj, input_format="anndata", **kwargs): 95 | if input_format == "anndata": 96 | adata = sc.read_h5ad(input_obj, **kwargs) 97 | elif input_format == "loom": 98 | adata = sc.read_loom(input_obj, **kwargs) 99 | else: 100 | raise NotImplementedError("Unsupported input format: {}".format(input_format)) 101 | adata.var = _fix_booleans(adata.var) 102 | adata.obs = _fix_booleans(adata.obs) 103 | 104 | return adata 105 | 106 | 107 | def _write_obj( 108 | adata, 109 | output_obj, 110 | output_format="anndata", 111 | chunk_size=None, 112 | export_mtx=None, 113 | mtx_compression=None, 114 | show_obj=None, 115 | write_obsm_varm=False, 116 | **kwargs, 117 | ): 118 | if output_format == "anndata": 119 | adata.write(output_obj, compression="gzip") 120 | elif output_format == "loom": 121 | adata.write_loom(output_obj, write_obsm_varm=write_obsm_varm) 122 | elif output_format == "zarr": 123 | adata.write_zarr(output_obj, chunk_size=chunk_size, **kwargs) 124 | else: 125 | raise NotImplementedError("Unsupported output format: {}".format(output_format)) 126 | if export_mtx: 127 | compression = None 128 | 
def write_mtx(
    adata,
    fname_prefix="",
    var=None,
    obs=None,
    use_raw=False,
    use_layer=None,
    compression=None,
):
    """Export AnnData object to mtx format
    * Parameters
    + adata : AnnData
    An AnnData object
    + fname_prefix : str
    Prefix of the exported files. If not empty and not ending with '/' or '_',
    a '_' will be appended. Full names will be matrix.mtx,
    genes.tsv, barcodes.tsv
    + var : list
    A list of column names to be exported to gene table
    + obs : list
    A list of column names to be exported to barcode/cell table
    + use_raw : bool
    Take the matrix from .raw.X?
    + use_layer: str
    Specify a layer to use instead of .X (non-raw only)
    + compression: None, str or dict
    Compression parameter for Pandas' to_csv(). For compression, a dict
    with a 'method' key, e.g. {'method': 'gzip', 'compresslevel': 1,
    'mtime': 1}

    >>> import os
    >>> from pathlib import Path
    >>> adata = sc.datasets.pbmc3k()
    >>> # Test uncompressed write
    >>> Path("uncompressed").mkdir(parents=True, exist_ok=True)
    >>> write_mtx(adata, fname_prefix = 'uncompressed/', use_raw = False, use_layer = None, var = ['gene_name'])
    >>> sorted(os.listdir('uncompressed'))
    ['barcodes.tsv', 'genes.tsv', 'matrix.mtx']
    >>> # Test that the matrix is the same when we read it back
    >>> test_readable = sc.read_10x_mtx('uncompressed')
    >>> if any(test_readable.obs_names != adata.obs_names) or any(test_readable.var_names != adata.var_names) or (test_readable.X[1].sum() - adata.X[1].sum()) > 1e-5:
    ...     print("Re-read matrix is different to the one we stored, something is wrong with the writing")
    >>> # Test compressed write
    >>> Path("compressed").mkdir(parents=True, exist_ok=True)
    >>> write_mtx(adata, fname_prefix = 'compressed/', use_raw = False, use_layer = None, var = ['gene_name'], compression = {'method': 'gzip'})
    >>> sorted(os.listdir('compressed'))
    ['barcodes.tsv.gz', 'genes.tsv.gz', 'matrix.mtx.gz']
    """
    if fname_prefix and not (fname_prefix.endswith("/") or fname_prefix.endswith("_")):
        fname_prefix = fname_prefix + "_"
    if var is None:
        var = []
    if obs is None:
        obs = []

    import scipy.sparse as sp

    # Choose the matrix source: .raw.X, a named layer, or .X.
    if use_raw:
        var_source = adata.raw.var
        mat = sp.coo_matrix(adata.raw.X)
    else:
        var_source = adata.var
        if use_layer is not None:
            mat = sp.coo_matrix(adata.layers[use_layer])
        else:
            mat = sp.coo_matrix(adata.X)

    # Keep only requested metadata columns that actually exist.
    obs = list(set(obs) & set(adata.obs.columns))
    var = list(set(var) & set(var_source.columns))

    n_obs, n_var = mat.shape
    n_entry = len(mat.data)

    # Define the header lines as a Pandas DataFrame so we can use the same compression
    header = pd.DataFrame(
        ["%%MatrixMarket matrix coordinate real general", f"{n_var} {n_obs} {n_entry}"]
    )
    # MatrixMarket is 1-based; 10x convention stores genes as rows.
    df = pd.DataFrame({"col": mat.col + 1, "row": mat.row + 1, "data": mat.data})

    # Define outputs
    mtx_fname = fname_prefix + "matrix.mtx"
    gene_fname = fname_prefix + "genes.tsv"
    barcode_fname = fname_prefix + "barcodes.tsv"

    # Write matrix with Pandas CSV and use its compression where requested
    if (
        compression is not None
        and type(compression) is dict
        and "method" in compression
    ):
        compressed_exts = {"zip": "zip", "gzip": "gz", "bz2": "bz2", "zstd": "zst"}
        # BUGFIX: the original used .get(..., "None") (the string), so the
        # `ext is None` check below never fired and an unknown method silently
        # produced files named e.g. "matrix.mtx.None".
        ext = compressed_exts.get(compression["method"])

        if ext is None:
            raise ValueError(
                "Invalid compression method: {}".format(compression["method"])
            )

        mtx_fname += f".{ext}"
        gene_fname += f".{ext}"
        barcode_fname += f".{ext}"
    else:
        compression = None

    header.to_csv(mtx_fname, header=False, index=False, compression=compression)
    df.to_csv(
        mtx_fname, sep=" ", header=False, index=False, compression=compression, mode="a"
    )

    # Now write the obs and var, also with compression if appropriate
    obs_df = adata.obs[obs].reset_index(level=0)
    obs_df.to_csv(
        barcode_fname, sep="\t", header=False, index=False, compression=compression
    )
    var_df = var_source[var].reset_index(level=0)
    if not var:
        var_df["gene"] = var_df["index"]
    var_df.to_csv(
        gene_fname, sep="\t", header=False, index=False, compression=compression
    )
| is_rgg = True 298 | func = plot_funcs["rgg_" + func_name] 299 | kwargs.pop("var_names", None) 300 | else: 301 | func = plot_funcs[func_name] 302 | kwargs.pop("groups", None) 303 | kwargs.pop("n_genes", None) 304 | 305 | kwargs.pop("rgg") 306 | else: 307 | func = plot_funcs[func_name] 308 | else: 309 | func = globals()[func_name] 310 | 311 | # Generate the output file name 312 | 313 | figname = False 314 | showfig = True 315 | if output_fig: 316 | import os 317 | 318 | import matplotlib.pyplot as plt 319 | 320 | sc.settings.figdir = os.path.dirname(output_fig) or "." 321 | 322 | figname = os.path.basename(output_fig) 323 | showfig = False 324 | 325 | # Run the selected function 326 | 327 | func(adata, save=figname, show=showfig, **kwargs) 328 | 329 | # Rename output to the spefied file name. We need to work out what 330 | # prefix the function will have used for its output files. 331 | 332 | if output_fig: 333 | prefix = "" 334 | if func_name == "scatter" or func_name == "embedding": 335 | prefix = kwargs.get("basis", func.__name__) 336 | elif kind: 337 | prefix = kind 338 | elif func_name in plot_funcs: 339 | prefix = plot_funcs[func_name].__name__.split(".")[-1] 340 | if func_name in [ 341 | "sviol", 342 | "rgg_sviol", 343 | "dot", 344 | "rgg_dot", 345 | "matrix", 346 | "rgg_matrix", 347 | ]: 348 | prefix = prefix + "_" 349 | 350 | os.rename(os.path.join(sc.settings.figdir, prefix + figname), output_fig) 351 | plt.close() 352 | 353 | return plot_function 354 | 355 | 356 | # Wrap matrix-processing functions in logic to back up .X or specified input 357 | # layers prior to processing 358 | 359 | 360 | def make_matrix_function(func): 361 | def matrix_function( 362 | adata, 363 | save_raw=True, 364 | save_layer=None, 365 | **kwargs, 366 | ): 367 | 368 | # For the subset of matrix functions that allow layer specification, 369 | # pass that as the thing to save. 
LANG = os.environ.get("LANG", None)

# Scanpy prints unicode characters, so refuse to run under a non-UTF-8 locale
# rather than crash with UnicodeEncodeError part-way through a pipeline.
# Accepts any casing/hyphenation of "UTF-8" (UTF-8, utf8, Utf-8, ...).
if LANG is None or not LANG.upper().replace("-", "").endswith("UTF8"):
    print("This programme requires a UTF-8 locale, please check your $LANG setting.")
    # BUGFIX: the original called sys.exit(0), reporting success to the shell
    # even though the program refused to run; exit non-zero so pipelines fail.
    sys.exit(1)
_P_DESC]) 52 | 53 | 54 | READ_CMD = make_subcmd( 55 | "read", 56 | read_10x, 57 | cmd_desc="Read 10x data and save in specified format.", 58 | arg_desc=_O_DESC, 59 | ) 60 | 61 | 62 | FILTER_CMD = make_subcmd( 63 | "filter", 64 | make_matrix_function(filter_anndata), 65 | cmd_desc="Filter data based on specified conditions.", 66 | arg_desc=_IO_DESC, 67 | ) 68 | 69 | 70 | NORM_CMD = make_subcmd( 71 | "norm", 72 | make_matrix_function(normalize), 73 | cmd_desc="Normalise data per cell.", 74 | arg_desc=_IO_DESC, 75 | ) 76 | 77 | 78 | HVG_CMD = make_subcmd( 79 | "hvg", 80 | hvg, 81 | cmd_desc="Find highly variable genes.", 82 | arg_desc=_IO_DESC, 83 | ) 84 | 85 | 86 | SCALE_CMD = make_subcmd( 87 | "scale", 88 | make_matrix_function(sc.pp.scale), 89 | cmd_desc="Scale data per gene.", 90 | arg_desc=_IO_DESC, 91 | ) 92 | 93 | 94 | REGRESS_CMD = make_subcmd( 95 | "regress", 96 | make_matrix_function(sc.pp.regress_out), 97 | cmd_desc="Regress-out observation variables.", 98 | arg_desc=_IO_DESC, 99 | ) 100 | 101 | 102 | PCA_CMD = make_subcmd( 103 | "pca", 104 | pca, 105 | cmd_desc="Dimensionality reduction by PCA.", 106 | arg_desc=_IO_DESC, 107 | ) 108 | 109 | NEIGHBOR_CMD = make_subcmd( 110 | "neighbor", 111 | neighbors, 112 | cmd_desc="Compute a neighbourhood graph of observations.", 113 | arg_desc=_IO_DESC, 114 | ) 115 | 116 | UMAP_CMD = make_subcmd( 117 | "umap", 118 | umap, 119 | cmd_desc="Embed the neighborhood graph using UMAP.", 120 | arg_desc=_IO_DESC, 121 | ) 122 | 123 | TSNE_CMD = make_subcmd( 124 | "tsne", 125 | tsne, 126 | cmd_desc="Embed the cells using t-SNE.", 127 | arg_desc=_IO_DESC, 128 | ) 129 | 130 | FDG_CMD = make_subcmd( 131 | "fdg", 132 | fdg, 133 | cmd_desc="Embed the neighborhood graph using force-directed graph.", 134 | arg_desc=_IO_DESC, 135 | ) 136 | 137 | DIFFMAP_CMD = make_subcmd( 138 | "diffmap", 139 | diffmap, 140 | cmd_desc="Embed the neighborhood graph using diffusion map.", 141 | arg_desc=_IO_DESC, 142 | ) 143 | 144 | LOUVAIN_CMD = 
make_subcmd( 145 | "louvain", 146 | louvain, 147 | cmd_desc="Find clusters by Louvain algorithm.", 148 | arg_desc=_IO_DESC, 149 | ) 150 | 151 | LEIDEN_CMD = make_subcmd( 152 | "leiden", 153 | leiden, 154 | cmd_desc="Find clusters by Leiden algorithm.", 155 | arg_desc=_IO_DESC, 156 | ) 157 | 158 | DIFFEXP_CMD = make_subcmd( 159 | "diffexp", 160 | diffexp, 161 | cmd_desc="Find markers for each clusters.", 162 | arg_desc=_IO_DESC, 163 | ) 164 | 165 | PAGA_CMD = make_subcmd( 166 | "paga", 167 | paga, 168 | cmd_desc="Trajectory inference by abstract graph analysis.", 169 | arg_desc=_IO_DESC, 170 | ) 171 | 172 | DPT_CMD = make_subcmd( 173 | "dpt", 174 | dpt, 175 | cmd_desc="Calculate diffusion pseudotime relative to the root cells.", 176 | arg_desc=_IO_DESC, 177 | ) 178 | 179 | PLOT_EMBED_CMD = make_subcmd( 180 | "embed", 181 | make_plot_function("embedding"), 182 | cmd_desc="Plot cell embeddings.", 183 | arg_desc=_IP_DESC, 184 | ) 185 | 186 | PLOT_STACKED_VIOLIN_CMD = make_subcmd( 187 | "sviol", 188 | make_plot_function("sviol"), 189 | cmd_desc="Plot stacked violin plots.", 190 | arg_desc=_IP_DESC, 191 | ) 192 | 193 | PLOT_DOT_CMD = make_subcmd( 194 | "dot", 195 | make_plot_function("dot"), 196 | cmd_desc="Plot a dot plot of expression values.", 197 | arg_desc=_IP_DESC, 198 | ) 199 | 200 | PLOT_MATRIX_CMD = make_subcmd( 201 | "matrix", 202 | make_plot_function("matrix"), 203 | cmd_desc="Plot a heatmap of the mean expression values per cluster.", 204 | arg_desc=_IP_DESC, 205 | ) 206 | 207 | PLOT_HEATMAP_CMD = make_subcmd( 208 | "heat", 209 | make_plot_function("heat"), 210 | cmd_desc="Plot a heatmap of the expression values of genes.", 211 | arg_desc=_IP_DESC, 212 | ) 213 | 214 | PLOT_PAGA_CMD = make_subcmd( 215 | "paga", 216 | make_plot_function("plot_paga", kind="paga"), 217 | cmd_desc="Plot PAGA trajectories.", 218 | arg_desc=_IP_DESC, 219 | opt_set="plot_paga", 220 | ) 221 | 222 | COMBAT_CMD = make_subcmd( 223 | "combat", 224 | combat, 225 | cmd_desc="ComBat function 
for batch effect correction", 226 | arg_desc=_IO_DESC, 227 | ) 228 | 229 | HARMONY_INTEGRATE_CMD = make_subcmd( 230 | "harmony", 231 | sce.pp.harmony_integrate, 232 | cmd_desc="Use harmonypy [Korunsky19] to integrate different experiments.", 233 | arg_desc=_IO_DESC, 234 | ) 235 | 236 | BBKNN_CMD = make_subcmd( 237 | "bbknn", 238 | bbknn, 239 | cmd_desc="Batch balanced kNN [Polanski19].", 240 | arg_desc=_IO_DESC, 241 | ) 242 | 243 | MNN_CORRECT_CMD = make_subcmd( 244 | "mnn", 245 | make_matrix_function(mnn_correct), 246 | cmd_desc="Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].", 247 | arg_desc=_IO_DESC, 248 | ) 249 | 250 | SCRUBLET_MULTIPLET_CMD = make_subcmd( 251 | "scrublet", 252 | scrublet, 253 | cmd_desc="Filter out likely multiplets from droplet data using Scrublet [Wolock2019].", 254 | arg_desc=_IO_DESC, 255 | ) 256 | 257 | SCRUBLET_MULTIPLET_SIMULATE_CMD = make_subcmd( 258 | "scrublet_simulate_doublets", 259 | scrublet_simulate_doublets, 260 | cmd_desc="Simulate doublets with random transcriptome pairs for Scrublet [Wolock2019].", 261 | arg_desc=_IO_DESC, 262 | ) 263 | 264 | SCRUBLET_MULTIPLET_PLOT_CMD = make_subcmd( 265 | "scrublet", 266 | make_plot_function("plot_scrublet", "scrublet_score_distribution"), 267 | cmd_desc="Plot histogram of doublet scores for observed transcriptomes and simulated doublets..", 268 | arg_desc=_IP_DESC, 269 | opt_set="plot_scrublet", 270 | ) 271 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides exported functions 3 | """ 4 | 5 | from ._read import read_10x 6 | from ._filter import filter_anndata 7 | from ._norm import normalize 8 | from ._hvg import hvg 9 | from ._neighbors import neighbors 10 | from ._umap import umap 11 | from ._fdg import fdg 12 | from ._tsne import tsne 13 | from ._louvain import louvain 14 | from ._leiden 
def bbknn(adata, key=None, key_added=None, **kwargs):
    """
    Wrapper function for sce.pp.bbknn(), for supporting non-standard neighbors slot
    """

    # (store, default slot name, renamed slot name when key_added is given)
    managed_slots = [
        (adata.uns, "neighbors", "{0}"),
        (adata.obsp, "distances", "{0}_distances"),
        (adata.obsp, "connectivities", "{0}_connectivities"),
    ]

    # Preserve any pre-existing default-slot results before bbknn overwrites them.
    for store, slot, _ in managed_slots:
        _backup_default_key(store, slot)

    sce.pp.bbknn(adata, batch_key=key, **kwargs)

    if key_added:
        # Move the fresh results into the requested named slots; the backups
        # restore the previous defaults.
        for store, slot, template in managed_slots:
            _rename_default_key(store, slot, template.format(key_added))
    else:
        # Results stay in the default slots; discard the stale backups.
        for store, slot, _ in managed_slots:
            _delete_backup_key(store, slot)

    return adata
def diffexp(
    adata,
    use_raw=None,
    n_genes=None,
    key_added="rank_genes_groups",
    layer=None,
    logreg_param=None,
    filter_params=None,
    save=None,
    groupby=None,
    groups=None,
    **kwargs,
):
    """
    Wrapper function for sc.tl.rank_genes_groups.

    Test that we can load a single group.
    >>> import os
    >>> from pathlib import Path
    >>> adata = sc.datasets.krumsiek11()
    >>> tbl = diffexp(adata, groupby='cell_type', groups='Mo', reference='progenitor')
    >>> # get the size of the data frame
    >>> tbl.shape
    (11, 8)
    """
    # .raw is the only raw source; without it use_raw must be off.
    if adata.raw is None:
        use_raw = False

    if n_genes is None:
        n_genes = adata.raw.shape[1] if use_raw else adata.shape[1]

    if logreg_param and isinstance(logreg_param, dict):
        # BUGFIX: iterating a dict yields keys only, so the original
        # `for key, val in logreg_param:` raised on unpacking. Use .items().
        for key, val in logreg_param.items():
            kwargs[key] = val

    key_added = key_added if key_added else "rank_genes_groups"
    diff_key = (key_added + f"_{layer}") if layer else key_added

    if groups == "all":

        # Avoid divisions by zeros for singlet groups. See
        # https://github.com/theislab/scanpy/pull/1490#issuecomment-726031442.

        groups_to_test = list(
            adata.obs[groupby].value_counts().loc[lambda x: x > 1].index
        )

        if len(groups_to_test) < len(adata.obs[groupby].cat.categories):
            groups = groups_to_test
            logging.warning(
                "Singlet groups removed before passing to rank_genes_groups()"
            )

    # avoid issue when groups is a single group as a string simplified by click
    # https://github.com/ebi-gene-expression-group/scanpy-scripts/issues/123
    if groups != "all" and isinstance(groups, str):
        groups = [groups]

    sc.tl.rank_genes_groups(
        adata,
        use_raw=use_raw,
        n_genes=n_genes,
        key_added=diff_key,
        groupby=groupby,
        groups=groups,
        **kwargs,
    )

    de_tbl = extract_de_table(adata.uns[diff_key])

    if isinstance(filter_params, dict):
        key_filtered = diff_key + "_filtered"
        sc.tl.filter_rank_genes_groups(
            adata,
            key=diff_key,
            key_added=key_filtered,
            use_raw=use_raw,
            **filter_params,
        )

        # there are non strings on recarray object at this point, in
        # adata.uns['rank_genes_groups_filtered']['names']
        # for instance:
        # adata.uns['rank_genes_groups_filtered']['names'][0]
        # (nan, nan, 'NKG7', nan, nan, 'PPBP')
        # this now upsets h5py > 3.0
        de_tbl = extract_de_table(adata.uns[key_filtered])
        de_tbl = de_tbl.loc[de_tbl.genes.astype(str) != "nan", :]

        # change nan for strings in adata.uns['rank_genes_groups_filtered']['names']
        # TODO on scanpy updates, check if this is not done within scanpy so that we can remove this
        for row in range(0, len(adata.uns[key_filtered]["names"])):
            for col in range(0, len(adata.uns[key_filtered]["names"][row])):
                element = adata.uns[key_filtered]["names"][row][col]
                if isinstance(element, float) and math.isnan(element):
                    adata.uns[key_filtered]["names"][row][col] = "nan"

    if save:
        de_tbl.to_csv(save, sep="\t", header=True, index=False)

    return de_tbl
gene_df[["cluster", "ref", "rank", "genes"]] 160 | de_df = pd.DataFrame( 161 | { 162 | field: _recarray_to_dataframe(de_dict[field], field)[field] 163 | for field in requested_fields 164 | if field in de_dict 165 | } 166 | ) 167 | return gene_df.merge(de_df, left_index=True, right_index=True) 168 | 169 | 170 | def _recarray_to_dataframe(array, field_name): 171 | return ( 172 | pd.DataFrame(array) 173 | .reset_index() 174 | .rename(columns={"index": "rank"}) 175 | .melt(id_vars="rank", var_name="cluster", value_name=field_name) 176 | ) 177 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_diffmap.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy diffmap 3 | """ 4 | 5 | import scanpy as sc 6 | from ..obj_utils import ( 7 | _rename_obsm_key, 8 | write_embedding, 9 | ) 10 | 11 | 12 | def diffmap( 13 | adata, 14 | key_added=None, 15 | export_embedding=None, 16 | **kwargs, 17 | ): 18 | """ 19 | Wrapper function for sc.tl.diffmap, for supporting named slot 20 | """ 21 | sc.tl.diffmap(adata, **kwargs) 22 | 23 | diffmap_key = "X_diffmap" 24 | if key_added: 25 | diffmap_key = f"{diffmap_key}_{key_added}" 26 | _rename_obsm_key(adata, "X_diffmap", diffmap_key) 27 | 28 | if export_embedding is not None: 29 | write_embedding(adata, diffmap_key, export_embedding, key_added=key_added) 30 | return adata 31 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_dpt.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy dpt 3 | """ 4 | 5 | import numpy as np 6 | import scanpy as sc 7 | from ..obj_utils import ( 8 | _rename_default_key, 9 | ) 10 | 11 | 12 | def dpt( 13 | adata, 14 | root=None, 15 | use_diffmap="X_diffmap", 16 | key_added=None, 17 | **kwargs, 18 | ): 19 | """ 20 | Wrapper function for sc.tl.dpt 21 | """ 22 | if root is None or not (isinstance(root, (list, tuple)) and 
len(root) == 2): 23 | root = (None, None) 24 | if "iroot" not in adata.uns.keys() and root[0] is None: 25 | raise ValueError( 26 | "Annotate your data with root cell first, i.e. " 27 | 'boolean vector `.uns["iroot"]` is required.' 28 | ) 29 | if root[0] is not None: 30 | adata.uns["iroot"] = np.random.choice( 31 | np.flatnonzero(adata.obs[root[0]] == root[1]) 32 | ) 33 | 34 | sc.tl.dpt(adata, **kwargs) 35 | if key_added: 36 | dpt_key = f"dpt_pseudotime_{key_added}" 37 | _rename_default_key(adata.obs, "dpt_pseudotime", dpt_key) 38 | 39 | return adata 40 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_fdg.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy fdg 3 | """ 4 | 5 | import scanpy as sc 6 | from ..obj_utils import ( 7 | _backup_obsm_key, 8 | _delete_obsm_backup_key, 9 | _rename_obsm_key, 10 | write_embedding, 11 | ) 12 | 13 | 14 | def fdg( 15 | adata, 16 | layout="fa", 17 | key_added_ext=None, 18 | random_state=0, 19 | export_embedding=None, 20 | **kwargs, 21 | ): 22 | """ 23 | Wrapper function for sc.tl.draw_graph, for supporting named slot of fdg 24 | embeddings. 
25 | """ 26 | sc.tl.draw_graph( 27 | adata, 28 | layout=layout, 29 | key_added_ext=key_added_ext, 30 | random_state=random_state, 31 | **kwargs, 32 | ) 33 | 34 | fdg_key = f"X_draw_graph_{key_added_ext or layout}" 35 | 36 | if export_embedding is not None: 37 | write_embedding(adata, fdg_key, export_embedding, key_added=key_added_ext) 38 | return adata 39 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_filter.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy filter 3 | """ 4 | 5 | import logging 6 | import re 7 | import click 8 | import numpy as np 9 | import scanpy as sc 10 | 11 | 12 | def filter_anndata( 13 | adata, 14 | gene_name="index", 15 | list_attr=False, 16 | param=None, 17 | category=None, 18 | subset=None, 19 | force_recalc=False, 20 | ): 21 | """ 22 | Wrapper function for sc.pp.filter_cells() and sc.pp.filter_genes(), mainly 23 | for supporting arbitrary filtering 24 | """ 25 | param = [] if param is None else param 26 | category = [] if category is None else category 27 | subset = [] if subset is None else subset 28 | 29 | logging.debug("--gene-name=%s", gene_name) 30 | logging.debug("--param=%s", param) 31 | logging.debug("--category=%s", category) 32 | logging.debug("--subset=%s", subset) 33 | 34 | if "mito" not in adata.var.keys() and gene_name: 35 | try: 36 | gene_names = getattr(adata.var, gene_name) 37 | k_mito = gene_names.str.startswith("MT-") 38 | if k_mito.sum() > 0: 39 | adata.var["mito"] = k_mito 40 | # adata.var["mito"] = adata.var["mito"].astype("category") 41 | else: 42 | logging.warning( 43 | "No MT genes found, skip calculating " 44 | "expression of mitochondria genes" 45 | ) 46 | except AttributeError: 47 | logging.warning( 48 | "Specified gene column [%s] not found, skip calculating " 49 | "expression of mitochondria genes", 50 | gene_name, 51 | ) 52 | 53 | attributes = _get_attributes(adata) 54 | if list_attr: 55 | 
click.echo(_repr_obj(attributes)) 56 | return 0 57 | 58 | conditions, qc_vars, pct_top = _get_filter_conditions( 59 | attributes, param, category, subset 60 | ) 61 | 62 | layer = "counts" if "counts" in adata.layers.keys() else None 63 | obs_columns = adata.obs.columns 64 | for qv in qc_vars: 65 | if f"pct_counts_{qv}" in obs_columns and not force_recalc: 66 | logging.warning( 67 | "`pct_counts_%s` exists, not overwriting " "without --force-recalc", qv 68 | ) 69 | qc_vars.remove(qv) 70 | for pt in pct_top: 71 | if f"pct_counts_in_top_{pt}_genes" in obs_columns and not force_recalc: 72 | logging.warning( 73 | "`pct_counts_%s` exists, not overwriting " "without --force-recalc", pt 74 | ) 75 | pct_top.remove(pt) 76 | 77 | # Calculate mito stats if we can, even if we're not filtering by them 78 | 79 | if "mito" not in qc_vars and "mito" in adata.var.keys(): 80 | qc_vars.append("mito") 81 | 82 | sc.pp.calculate_qc_metrics( 83 | adata, layer=layer, qc_vars=qc_vars, percent_top=pct_top, inplace=True 84 | ) 85 | 86 | adata.obs["n_counts"] = adata.obs["total_counts"] 87 | adata.obs["n_genes"] = adata.obs["n_genes_by_counts"] 88 | adata.var["n_counts"] = adata.var["total_counts"] 89 | adata.var["n_cells"] = adata.var["n_cells_by_counts"] 90 | 91 | k_cell = np.ones(len(adata.obs)).astype(bool) 92 | for cond in conditions["c"]["numerical"]: 93 | name, vmin, vmax = cond 94 | attr = adata.obs[name] 95 | k_cell = k_cell & (attr >= vmin) & (attr <= vmax) 96 | 97 | for cond in conditions["c"]["categorical"]: 98 | name, values = cond 99 | attr = getattr(adata.obs, name).astype(str) 100 | if values[0].startswith("!"): 101 | values[0] = values[0][1:] 102 | k_cell = k_cell & (~attr.isin(values)) 103 | else: 104 | k_cell = k_cell & attr.isin(values) 105 | 106 | k_gene = np.ones(len(adata.var)).astype(bool) 107 | for cond in conditions["g"]["numerical"]: 108 | name, vmin, vmax = cond 109 | attr = adata.var[name] 110 | k_gene = k_gene & (attr >= vmin) & (attr <= vmax) 111 | 112 | for cond 
in conditions["g"]["categorical"]: 113 | name, values = cond 114 | attr = getattr(adata.var, name).astype(str) 115 | if values[0].startswith("!"): 116 | values[0] = values[0][1:] 117 | k_gene = k_gene & ~(attr.isin(values)) 118 | else: 119 | k_gene = k_gene & attr.isin(values) 120 | 121 | adata._inplace_subset_obs(k_cell) 122 | adata._inplace_subset_var(k_gene) 123 | 124 | return adata 125 | 126 | 127 | def _get_attributes(adata): 128 | attributes = { 129 | "c": { 130 | "numerical": [], 131 | "categorical": ["index"], 132 | "bool": [], 133 | }, 134 | "g": { 135 | "numerical": [], 136 | "categorical": ["index"], 137 | "bool": [], 138 | }, 139 | } 140 | 141 | for attr, dtype in adata.obs.dtypes.to_dict().items(): 142 | typ = dtype.kind 143 | if typ == "O": 144 | if dtype.name == "category" and dtype.categories.is_boolean(): 145 | attributes["c"]["bool"].append(attr) 146 | attributes["c"]["categorical"].append(attr) 147 | elif typ in ("i", "f", "u"): 148 | attributes["c"]["numerical"].append(attr) 149 | elif typ == "b": 150 | attributes["c"]["bool"].append(attr) 151 | attributes["c"]["categorical"].append(attr) 152 | 153 | for attr, dtype in adata.var.dtypes.to_dict().items(): 154 | typ = dtype.kind 155 | if typ == "O": 156 | if dtype.name == "category" and dtype.categories.is_boolean(): 157 | attributes["g"]["bool"].append(attr) 158 | attributes["g"]["categorical"].append(attr) 159 | elif typ in ("i", "f", "u"): 160 | attributes["g"]["numerical"].append(attr) 161 | elif typ == "b": 162 | attributes["g"]["bool"].append(attr) 163 | attributes["g"]["categorical"].append(attr) 164 | 165 | attributes["c"]["numerical"].extend( 166 | [ 167 | "n_genes", 168 | "n_counts", 169 | ] 170 | ) 171 | 172 | for attr in attributes["g"]["bool"]: 173 | attr2 = "pct_counts_" + attr 174 | if attr2 not in adata.obs.columns: 175 | attr2 += "*" 176 | attributes["c"]["numerical"].append(attr2) 177 | 178 | attributes["g"]["numerical"].extend( 179 | [ 180 | "n_cells", 181 | "n_counts", 182 | 
"mean_counts", 183 | "pct_dropout_by_counts", 184 | ] 185 | ) 186 | logging.debug(attributes) 187 | return attributes 188 | 189 | 190 | def _attributes_exists(name, attributes, dtype): 191 | cond_cat = "" 192 | if name.startswith("c:") or name.startswith("g:"): 193 | cond_cat, _, cond_name = name.partition(":") 194 | found = int(cond_name in attributes[cond_cat][dtype]) 195 | else: 196 | cond_name = name 197 | if cond_name in attributes["c"][dtype]: 198 | cond_cat += "c" 199 | if cond_name in attributes["g"][dtype]: 200 | cond_cat += "g" 201 | found = len(cond_cat) 202 | return found, cond_cat, cond_name 203 | 204 | 205 | def _get_filter_conditions(attributes, param, category, subset): 206 | conditions = { 207 | "c": { 208 | "numerical": [], 209 | "categorical": [], 210 | "bool": [], 211 | }, 212 | "g": { 213 | "numerical": [], 214 | "categorical": [], 215 | "bool": [], 216 | }, 217 | } 218 | percent_top_pattern = re.compile(r"^pct_counts_in_top_(?P\d+)_genes$") 219 | pct_top = [] 220 | qc_vars_pattern = re.compile(r"^pct_counts_(?P\S+)$") 221 | qc_vars = [] 222 | 223 | for name, vmin, vmax in param: 224 | found, cond_cat, cond_name = _attributes_exists(name, attributes, "numerical") 225 | pt_match = percent_top_pattern.match(cond_name) 226 | qv_match = qc_vars_pattern.match(cond_name) 227 | if found > 1: 228 | raise click.ClickException( 229 | f'Ambiguous parameter "{name}" found in ' "both cell and gene table" 230 | ) 231 | if found < 1: 232 | if pt_match: 233 | pct_top.append(int(pt_match["n"])) 234 | cond_cat = "c" 235 | elif qv_match and qv_match["qc_var"] in attributes["g"]["bool"]: 236 | qc_vars.append(qv_match["qc_var"]) 237 | cond_cat = "c" 238 | else: 239 | raise click.ClickException(f'Parameter "{name}" unavailable') 240 | if pt_match or qv_match: 241 | vmin *= 100 242 | vmax *= 100 243 | conditions[cond_cat]["numerical"].append([cond_name, vmin, vmax]) 244 | 245 | for name, values in category + subset: 246 | found, cond_cat, cond_name = 
_attributes_exists(name, attributes, "categorical") 247 | if found > 1: 248 | raise click.ClickException( 249 | f'Ambiguous attribute "{name}" found in ' "both cell and gene table" 250 | ) 251 | if found < 1: 252 | raise click.ClickException(f'Attribute "{name}" unavailable') 253 | if not isinstance(values, (list, tuple)): 254 | fh = values 255 | values = fh.read().rstrip().split("\n") 256 | fh.close() 257 | conditions[cond_cat]["categorical"].append((cond_name, values)) 258 | 259 | logging.debug((conditions, qc_vars, pct_top)) 260 | return conditions, qc_vars, sorted(pct_top) 261 | 262 | 263 | def _repr_obj(obj, padding=" ", level=0): 264 | if isinstance(obj, dict): 265 | obj_str = "\n".join( 266 | [ 267 | "\n".join([padding * level + k + ":", _repr_obj(v, level=level + 1)]) 268 | for k, v in obj.items() 269 | ] 270 | ) 271 | elif isinstance(obj, (tuple, list, set)): 272 | obj_str = "\n".join([_repr_obj(elm, level=level) for elm in obj]) 273 | else: 274 | obj_str = padding * level + repr(obj) 275 | return obj_str 276 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_hvg.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy hvg 3 | """ 4 | 5 | import numpy as np 6 | import scanpy as sc 7 | 8 | 9 | def hvg( 10 | adata, 11 | mean_limits=(0.0125, 3), 12 | disp_limits=(0.5, float("inf")), 13 | **kwargs, 14 | ): 15 | """ 16 | Wrapper function for sc.highly_variable_genes() 17 | """ 18 | 19 | # Check for n_top_genes beeing greater than the total genes 20 | 21 | if "n_top_genes" in kwargs and kwargs["n_top_genes"] is not None: 22 | kwargs["n_top_genes"] = min(adata.n_vars, kwargs["n_top_genes"]) 23 | 24 | always_hv_genes = None 25 | if "always_hv_genes_file" in kwargs and kwargs["always_hv_genes_file"] is not None: 26 | with open(kwargs["always_hv_genes_file"], "r") as f: 27 | always_hv_genes = f.read().splitlines() 28 | 29 | never_hv_genes = None 30 | if 
"never_hv_genes_file" in kwargs and kwargs["never_hv_genes_file"] is not None: 31 | with open(kwargs["never_hv_genes_file"], "r") as f: 32 | never_hv_genes = f.read().splitlines() 33 | 34 | # to avoid upsetting the scanpy function with unexpected keyword arguments 35 | del kwargs["always_hv_genes_file"] 36 | del kwargs["never_hv_genes_file"] 37 | 38 | sc.pp.highly_variable_genes( 39 | adata, 40 | min_mean=mean_limits[0], 41 | max_mean=mean_limits[1], 42 | min_disp=disp_limits[0], 43 | max_disp=disp_limits[1], 44 | **kwargs, 45 | ) 46 | 47 | return switch_hvgs(adata, always_hv_genes, never_hv_genes) 48 | 49 | 50 | def switch_hvgs(adata, always_hv_genes=None, never_hv_genes=None): 51 | """ 52 | Function to switch on/off highly variable genes based on a list of genes. 53 | 54 | >>> adata = sc.datasets.pbmc3k() 55 | >>> sc.pp.normalize_total(adata) 56 | >>> sc.pp.log1p(adata) 57 | >>> sc.pp.highly_variable_genes(adata) 58 | >>> adata = switch_hvgs(adata, always_hv_genes=['MIR1302-10', 'FAM138A'], never_hv_genes=['ISG15', 'TNFRSF4']) 59 | >>> adata.var.loc['ISG15'].highly_variable 60 | False 61 | >>> adata.var.loc['TNFRSF4'].highly_variable 62 | False 63 | >>> adata.var.loc['MIR1302-10'].highly_variable 64 | True 65 | >>> adata.var.loc['CPSF3L'].highly_variable 66 | True 67 | """ 68 | if always_hv_genes is not None: 69 | adata.var.highly_variable = np.logical_or( 70 | adata.var.highly_variable, adata.var_names.isin(always_hv_genes) 71 | ) 72 | 73 | if never_hv_genes is not None: 74 | adata.var.highly_variable = np.logical_and( 75 | adata.var.highly_variable, ~adata.var_names.isin(never_hv_genes) 76 | ) 77 | 78 | return adata 79 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_leiden.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy leiden 3 | """ 4 | 5 | import scanpy as sc 6 | from ..obj_utils import write_obs 7 | 8 | 9 | def leiden( 10 | adata, 11 | resolution, 
12 | neighbors_key=None, 13 | obsp=None, 14 | key_added=None, 15 | export_cluster=None, 16 | **kwargs, 17 | ): 18 | """ 19 | Wrapper function for sc.tl.leiden, for supporting multiple resolutions. 20 | """ 21 | keys = [] 22 | if kwargs.get("restrict_to", None) and not kwargs["restrict_to"][0]: 23 | kwargs["restrict_to"] = None 24 | 25 | if not isinstance(resolution, (list, tuple)): 26 | if key_added is not None and not key_added.startswith("leiden_"): 27 | key_added = f"leiden_{key_added}" 28 | elif key_added is None: 29 | key_added = "leiden" 30 | sc.tl.leiden( 31 | adata, 32 | resolution=resolution, 33 | neighbors_key=neighbors_key, 34 | obsp=obsp, 35 | key_added=key_added, 36 | **kwargs, 37 | ) 38 | keys.append(key_added) 39 | else: 40 | for i, res in enumerate(resolution): 41 | res_key = str(res).replace(".", "_") 42 | if key_added is None: 43 | graph_key = ( 44 | ("_" + f"{neighbors_key or obsp}") if neighbors or obsp else "" 45 | ) 46 | key = f"leiden{graph_key}_r{res_key}" 47 | elif not isinstance(key_added, (list, tuple)): 48 | key = f"leiden_{key_added}_r{res_key}" 49 | elif len(key_added) == len(resolution): 50 | key = key_added[i] 51 | else: 52 | raise ValueError( 53 | "`key_added` can only be None, a scalar, or an " 54 | "iterable of the same length as `resolution`." 
55 | ) 56 | keys.extend( 57 | leiden( 58 | adata, 59 | resolution=res, 60 | neighbors_key=neighbors_key, 61 | obsp=obsp, 62 | key_added=key, 63 | **kwargs, 64 | ) 65 | ) 66 | 67 | if export_cluster: 68 | write_obs(adata, keys, export_cluster) 69 | 70 | return keys 71 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_louvain.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy louvain 3 | """ 4 | 5 | import scanpy as sc 6 | 7 | from ..obj_utils import write_obs 8 | 9 | 10 | def louvain( 11 | adata, 12 | resolution, 13 | neighbors_key=None, 14 | obsp=None, 15 | key_added=None, 16 | export_cluster=None, 17 | **kwargs, 18 | ): 19 | """ 20 | Wrapper function for sc.tl.louvain, for supporting multiple resolutions. 21 | """ 22 | keys = [] 23 | if kwargs["restrict_to"] and not kwargs["restrict_to"][0]: 24 | kwargs["restrict_to"] = None 25 | 26 | if not isinstance(resolution, (list, tuple)): 27 | if key_added is not None and not key_added.startswith("louvain_"): 28 | key_added = f"louvain_{key_added}" 29 | elif key_added is None: 30 | key_added = "louvain" 31 | sc.tl.louvain( 32 | adata, 33 | resolution=resolution, 34 | key_added=key_added, 35 | neighbors_key=neighbors_key, 36 | obsp=obsp, 37 | **kwargs, 38 | ) 39 | keys.append(key_added) 40 | else: 41 | for i, res in enumerate(resolution): 42 | 43 | res_key = str(res).replace(".", "_") 44 | 45 | if key_added is None: 46 | graph_key = ( 47 | ("_" + f"{neighbors_key or obsp}") if neighbors or obsp else "" 48 | ) 49 | key = f"louvain{graph_key}_r{res_key}" 50 | elif not isinstance(key_added, (list, tuple)): 51 | key = f"louvain_{key_added}_r{res_key}" 52 | elif len(key_added) == len(resolution): 53 | key = key_added[i] 54 | else: 55 | raise ValueError( 56 | "`key_added` can only be None, a scalar, or an " 57 | "iterable of the same length as `resolution`." 
58 | ) 59 | keys.extend( 60 | louvain( 61 | adata, 62 | resolution=res, 63 | neighbors_key=neighbors_key, 64 | obsp=obsp, 65 | key_added=key, 66 | **kwargs, 67 | ) 68 | ) 69 | 70 | if export_cluster: 71 | write_obs(adata, keys, export_cluster) 72 | 73 | return keys 74 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_mnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy external mnn 3 | """ 4 | 5 | import click 6 | import numpy as np 7 | import scanpy.external as sce 8 | import logging 9 | 10 | # Wrapper for mnn allowing use of non-standard slot 11 | 12 | 13 | def mnn_correct(adata, key=None, key_added=None, var_subset=None, layer=None, **kwargs): 14 | """ 15 | Wrapper function for sce.pp.mnn_correct(), for supporting non-standard neighbors slot 16 | """ 17 | 18 | # mnn will use .X, so we need to put other layers there for processing 19 | 20 | logging.warning( 21 | "Use mnn_correct at your own risk, environment installation seems faulty for this module." 
22 | ) 23 | 24 | if layer: 25 | adata.layers["X_backup"] = adata.X 26 | adata.X = adata.layers[layer] 27 | 28 | # mnn_correct() wants batches in separate adatas 29 | 30 | batches = np.unique(adata.obs[key]) 31 | alldata = [] 32 | for batch in batches: 33 | alldata.append( 34 | adata[ 35 | adata.obs[key] == batch, 36 | ] 37 | ) 38 | 39 | # Process var_subset into a list of strings that can be provided to 40 | # mnn_correct() 41 | 42 | if var_subset is not None and len(var_subset) > 0 and var_subset[0] is not None: 43 | 44 | subset = [] 45 | 46 | for name, values in var_subset: 47 | if name in adata.var: 48 | if adata.var[name].dtype == "bool": 49 | values = [True if x.lower() == "true" else x for x in values] 50 | else: 51 | raise click.ClickException(f'Var "{name}" unavailable') 52 | 53 | ind = [x in values for x in adata.var[name]] 54 | subset = subset + adata.var.index[ind].to_list() 55 | 56 | var_subset = set(subset) 57 | print("Will use %d selected genes for MNN" % len(var_subset)) 58 | 59 | else: 60 | var_subset = None 61 | 62 | # Here's the main bit 63 | 64 | cdata = sce.pp.mnn_correct( 65 | *alldata, 66 | var_subset=var_subset, 67 | do_concatenate=True, 68 | index_unique=None, 69 | **kwargs, 70 | ) 71 | 72 | # If user has specified key_added = X then they want us to overwrite .X, 73 | # othwerwise copy the .X to a named layer of the original object. In either 74 | # case make sure obs and var are the same as the original. 75 | 76 | if key_added is None or key_added != "X": 77 | 78 | mnn_key = "mnn" 79 | if layer: 80 | mnn_key = f"{mnn_key}_{layer}" 81 | 82 | # Layers is set (so we're not storing computed results in the .X, 83 | # and we had to overwrite .X to run mnn), and key_added shows we're 84 | # not storing in the .X, so we need to restore from the backup. 
85 | 86 | adata.X = adata.layers["X_backup"] 87 | 88 | if key_added: 89 | mnn_key = f"{mnn_key}_{key_added}" 90 | 91 | adata.layers[mnn_key] = cdata[0][adata.obs.index, adata.var.index].X 92 | 93 | else: 94 | adata.X = cdata[0][adata.obs.index, adata.var.index].X 95 | 96 | # Delete the backup of .X if we needed one 97 | 98 | if layer: 99 | del adata.layers["X_backup"] 100 | 101 | return adata 102 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_neighbors.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy neighbors 3 | """ 4 | 5 | import scanpy as sc 6 | from ..obj_utils import ( 7 | _backup_default_key, 8 | _delete_backup_key, 9 | _rename_default_key, 10 | ) 11 | 12 | 13 | def neighbors(adata, n_neighbors=15, key_added=None, **kwargs): 14 | """ 15 | Wrapper function for sc.pp.neighbors(), for supporting multiple n_neighbors 16 | """ 17 | if not isinstance(n_neighbors, (list, tuple)): 18 | sc.pp.neighbors(adata, n_neighbors=n_neighbors, key_added=key_added, **kwargs) 19 | else: 20 | for i, n_nb in enumerate(n_neighbors): 21 | if key_added is None: 22 | graph_key = f"k{n_nb}" 23 | elif not isinstance(key_added, (list, tuple)): 24 | graph_key = f"{key_added}_k{n_nb}" 25 | elif len(key_added) == len(n_neighbors): 26 | graph_key = key_added[i] 27 | else: 28 | raise ValueError( 29 | "`key_added` can only be None, a scalar, or an " 30 | "iterable of the same length as `n_neighbors`." 
31 | ) 32 | neighbors( 33 | adata, 34 | n_neighbors=n_nb, 35 | key_added=graph_key, 36 | **kwargs, 37 | ) 38 | return adata 39 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_norm.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy norm 3 | """ 4 | 5 | import scanpy as sc 6 | import math 7 | 8 | 9 | def normalize(adata, log_transform=True, **kwargs): 10 | """ 11 | Wrapper function for sc.pp.normalize_per_cell() and sc.pp.log1p(), mainly 12 | for supporting different ways of saving raw data. 13 | """ 14 | sc.pp.normalize_total(adata, **kwargs) 15 | if log_transform: 16 | # Natural logarithm is the default by scanpy, if base is not set 17 | base = math.e 18 | sc.pp.log1p(adata, base=base) 19 | # scanpy is not setting base in uns['log1p'] keys, but later on asking for it 20 | if "log1p" in adata.uns_keys() and "base" not in adata.uns["log1p"]: 21 | # Note that setting base to None doesn't solve the problem at other modules that check for base later on 22 | # as adata.uns["log1p"]["base"] = None gets dropped at either anndata write or read. 
23 | adata.uns["log1p"]["base"] = base 24 | 25 | return adata 26 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_paga.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy paga 3 | """ 4 | 5 | import numpy as np 6 | import scanpy as sc 7 | from ..obj_utils import ( 8 | _backup_default_key, 9 | _delete_backup_key, 10 | _rename_default_key, 11 | _set_default_key, 12 | _restore_default_key, 13 | ) 14 | 15 | 16 | def paga( 17 | adata, 18 | key_added=None, 19 | **kwargs, 20 | ): 21 | """ 22 | Wrapper function for sc.tl.paga, for supporting named slot 23 | """ 24 | sc.tl.paga(adata, **kwargs) 25 | 26 | if key_added: 27 | paga_key = f"paga_{key_added}" 28 | _rename_default_key(adata.uns, "paga", paga_key) 29 | else: 30 | _delete_backup_key(adata.uns, "paga") 31 | 32 | return adata 33 | 34 | 35 | def plot_paga( 36 | adata, 37 | use_key="paga", 38 | basis=None, 39 | layout=None, 40 | init_pos=None, 41 | legend_loc="on data", 42 | color=None, 43 | size=None, 44 | title=None, 45 | show=None, 46 | save=None, 47 | **kwargs, 48 | ): 49 | """Make PAGA plot""" 50 | if basis is not None and f"X_{basis}" in adata.obsm.keys(): 51 | ax = sc.pl.embedding( 52 | adata, 53 | basis=basis, 54 | color=color, 55 | legend_loc=legend_loc, 56 | size=size, 57 | title=None, 58 | save=False, 59 | show=False, 60 | ) 61 | 62 | grouping = adata.uns[use_key]["groups"] 63 | categories = list(adata.obs[grouping].cat.categories) 64 | obsm = adata.obsm[f"X_{basis}"] 65 | group_pos = np.zeros((len(categories), 2)) 66 | for i, label in enumerate(categories): 67 | offset = 1 if basis.startswith("diffmap") else 0 68 | _scatter = obsm[adata.obs[grouping] == label, (0 + offset) : (2 + offset)] 69 | x_pos, y_pos = np.median(_scatter, axis=0) 70 | group_pos[i] = [x_pos, y_pos] 71 | 72 | _set_default_key(adata.uns, "paga", use_key) 73 | kwargs["node_size_scale"] = 0 74 | kwargs["fontsize"] = 1 75 | 
kwargs["pos"] = group_pos 76 | kwargs["color"] = None 77 | try: 78 | sc.pl.paga( 79 | adata, 80 | ax=ax, 81 | title=title, 82 | show=show, 83 | save=save, 84 | **kwargs, 85 | ) 86 | finally: 87 | _restore_default_key(adata.uns, "paga", use_key) 88 | else: 89 | _set_default_key(adata.uns, "paga", use_key) 90 | try: 91 | sc.pl.paga( 92 | adata, 93 | layout=layout, 94 | init_pos=init_pos, 95 | color=color, 96 | title=title, 97 | show=show, 98 | save=save, 99 | **kwargs, 100 | ) 101 | finally: 102 | _restore_default_key(adata.uns, "paga", use_key) 103 | 104 | return adata 105 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy pca 3 | """ 4 | 5 | import logging 6 | import scanpy as sc 7 | from ..obj_utils import write_embedding 8 | 9 | 10 | def pca(adata, key_added=None, export_embedding=None, **kwargs): 11 | """ 12 | Wrapper function for sc.pp.pca, for supporting named slot 13 | """ 14 | 15 | # omit "svd_solver" to let scanpy choose automatically 16 | if "svd_solver" in kwargs and kwargs["svd_solver"] == "auto": 17 | del kwargs["svd_solver"] 18 | 19 | if key_added: 20 | if "X_pca" in adata.obsm.keys(): 21 | adata.obsm["X_pca_bkup"] = adata.obsm["X_pca"] 22 | sc.pp.pca(adata, **kwargs) 23 | pca_key = f"X_pca_{key_added}" 24 | adata.obsm[pca_key] = adata.obsm["X_pca"] 25 | del adata.obsm["X_pca"] 26 | if "X_pca_bkup" in adata.obsm.keys(): 27 | adata.obsm["X_pca"] = adata.obsm["X_pca_bkup"] 28 | del adata.obsm["X_pca_bkup"] 29 | else: 30 | sc.pp.pca(adata, **kwargs) 31 | pca_key = "X_pca" 32 | 33 | if export_embedding is not None: 34 | write_embedding(adata, pca_key, export_embedding, key_added=key_added) 35 | return adata 36 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_read.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Provides read_10x() 3 | """ 4 | 5 | import pandas as pd 6 | import scanpy as sc 7 | 8 | 9 | def read_10x( 10 | input_10x_h5, 11 | input_10x_mtx, 12 | genome="hg19", 13 | var_names="gene_symbols", 14 | extra_obs=None, 15 | extra_var=None, 16 | ): 17 | """ 18 | Wrapper function for sc.read_10x_h5() and sc.read_10x_mtx(), mainly to 19 | support adding extra metadata 20 | """ 21 | if input_10x_h5 is not None: 22 | adata = sc.read_10x_h5(input_10x_h5, genome=genome) 23 | elif input_10x_mtx is not None: 24 | adata = sc.read_10x_mtx(input_10x_mtx, var_names=var_names) 25 | 26 | if extra_obs: 27 | obs_tbl = pd.read_csv(extra_obs, sep="\t", header=0, index_col=0) 28 | adata.obs = adata.obs.merge( 29 | obs_tbl, 30 | how="left", 31 | left_index=True, 32 | right_index=True, 33 | suffixes=(False, False), 34 | ) 35 | 36 | if extra_var: 37 | var_tbl = pd.read_csv(extra_var, sep="\t", header=0, index_col=0) 38 | adata.var = adata.var.merge( 39 | var_tbl, 40 | how="left", 41 | left_index=True, 42 | right_index=True, 43 | suffixes=(False, False), 44 | ) 45 | return adata 46 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_scrublet.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy external scrublet 3 | """ 4 | 5 | import anndata 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | import scanpy.external as sce 10 | 11 | from ..obj_utils import write_obs 12 | 13 | # Wrapper for scrublet allowing text export and filtering 14 | 15 | 16 | def scrublet(adata, adata_sim=None, filter=False, export_table=None, **kwargs): 17 | """ 18 | Wrapper function for sce.pp.scrublet(), to allow filtering of resulting object 19 | """ 20 | 21 | # Do we need to read an object with the doublet simulations? 
22 | 23 | if adata_sim: 24 | adata_sim = sc.read_h5ad(adata_sim) 25 | 26 | sce.pp.scrublet(adata, adata_sim=adata_sim, **kwargs) 27 | 28 | # Do any export before optional filtering 29 | 30 | if export_table: 31 | write_obs(adata, ["doublet_score", "predicted_doublet"], export_table) 32 | 33 | # Filter out predited doublets 34 | 35 | if filter: 36 | adata._inplace_subset_obs(np.invert(adata.obs["predicted_doublet"])) 37 | 38 | return adata 39 | 40 | 41 | # Run the doublet simulation. 42 | 43 | 44 | def scrublet_simulate_doublets(adata, **kwargs): 45 | adata_sim = sce.pp.scrublet_simulate_doublets(adata, **kwargs) 46 | adata._init_as_actual( 47 | X=adata_sim.X, obs=adata_sim.obs, obsm=adata_sim.obsm, uns=adata.uns 48 | ) 49 | 50 | 51 | # Just absorb the extra plotting args before passing to 52 | # scanpy.external.pl.scrublet_score_distribution 53 | 54 | 55 | def plot_scrublet( 56 | adata, scale_hist_obs="log", scale_hist_sim="linear", fig_size=(8, 3), **kwargs 57 | ): 58 | """ 59 | Wrapper function for sce.pl.scrublet_score_distribution(), to allow 60 | plotting of score distribution 61 | """ 62 | sce.pl.scrublet_score_distribution( 63 | adata, 64 | scale_hist_obs=scale_hist_obs, 65 | scale_hist_sim=scale_hist_sim, 66 | figsize=fig_size, 67 | **kwargs 68 | ) 69 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_tsne.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy tsne 3 | """ 4 | 5 | import scanpy as sc 6 | from ..obj_utils import ( 7 | _backup_obsm_key, 8 | _rename_obsm_key, 9 | _delete_obsm_backup_key, 10 | write_embedding, 11 | ) 12 | 13 | 14 | def tsne( 15 | adata, 16 | key_added=None, 17 | random_state=0, 18 | export_embedding=None, 19 | **kwargs, 20 | ): 21 | """ 22 | Wrapper function for sc.tl.tsne, for supporting named slot of tsne embeddings 23 | """ 24 | if not isinstance(random_state, (list, tuple)): 25 | _backup_obsm_key(adata, "X_tsne") 26 | 27 | 
sc.tl.tsne(adata, random_state=random_state, **kwargs) 28 | 29 | tsne_key = "X_tsne" 30 | if key_added: 31 | tsne_key = f"X_tsne_{key_added}" 32 | _rename_obsm_key(adata, "X_tsne", tsne_key) 33 | else: 34 | _delete_obsm_backup_key(adata, "X_tsne") 35 | 36 | if export_embedding is not None: 37 | write_embedding(adata, tsne_key, export_embedding, key_added=key_added) 38 | else: 39 | for i, rseed in enumerate(random_state): 40 | if key_added is None: 41 | tsne_key = f"r{rseed}" 42 | elif not isinstance(key_added, (list, tuple)): 43 | tsne_key = f"{key_added}_r{rseed}" 44 | elif len(key_added) == len(random_state): 45 | tsne_key = key_added[i] 46 | else: 47 | raise ValueError( 48 | "`key_added` can only be None, a scalar, or " 49 | "an iterable of the same length as " 50 | "`random_state`." 51 | ) 52 | tsne( 53 | adata, 54 | key_added=tsne_key, 55 | random_state=rseed, 56 | **kwargs, 57 | ) 58 | return adata 59 | -------------------------------------------------------------------------------- /scanpy_scripts/lib/_umap.py: -------------------------------------------------------------------------------- 1 | """ 2 | scanpy umap 3 | """ 4 | 5 | import scanpy as sc 6 | from ..obj_utils import ( 7 | _set_default_key, 8 | _restore_default_key, 9 | _backup_obsm_key, 10 | _rename_obsm_key, 11 | _delete_obsm_backup_key, 12 | write_embedding, 13 | ) 14 | 15 | 16 | def umap( 17 | adata, 18 | key_added=None, 19 | random_state=0, 20 | export_embedding=None, 21 | **kwargs, 22 | ): 23 | """ 24 | Wrapper function for sc.tl.umap, for supporting named slot of umap embeddings 25 | """ 26 | if not isinstance(random_state, (list, tuple)): 27 | _backup_obsm_key(adata, "X_umap") 28 | 29 | sc.tl.umap(adata, random_state=random_state, **kwargs) 30 | 31 | umap_key = "X_umap" 32 | if key_added: 33 | umap_key = f"X_umap_{key_added}" 34 | _rename_obsm_key(adata, "X_umap", umap_key) 35 | else: 36 | _delete_obsm_backup_key(adata, "X_umap") 37 | 38 | if export_embedding is not None: 39 | 
write_embedding(adata, umap_key, export_embedding, key_added=key_added) 40 | else: 41 | for i, rseed in enumerate(random_state): 42 | if key_added is None: 43 | umap_key = f"r{rseed}" 44 | elif not isinstance(key_added, (list, tuple)): 45 | umap_key = f"{key_added}_r{rseed}" 46 | elif len(key_added) == len(random_state): 47 | umap_key = key_added[i] 48 | else: 49 | raise ValueError( 50 | "`key_added` can only be None, a scalar, or an " 51 | "iterable of the same length as `random_state`." 52 | ) 53 | umap( 54 | adata, 55 | key_added=umap_key, 56 | random_state=rseed, 57 | **kwargs, 58 | ) 59 | return adata 60 | -------------------------------------------------------------------------------- /scanpy_scripts/obj_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide helper functions for constructing sub-commands 3 | """ 4 | 5 | import scanpy as sc 6 | import pandas as pd 7 | 8 | 9 | def write_obs(adata, keys, obs_fn, sep="\t"): 10 | """Export cell clustering as a text table""" 11 | if not isinstance(keys, (list, tuple)): 12 | keys = [keys] 13 | for key in keys: 14 | if key not in adata.obs.keys(): 15 | raise KeyError(f"{key} is not a valid `.uns` key") 16 | adata.obs[keys].reset_index(level=0).rename(columns={"index": "cells"}).to_csv( 17 | obs_fn, sep=sep, header=True, index=False 18 | ) 19 | 20 | 21 | def write_embedding(adata, key, embed_fn, n_comp=None, sep="\t", key_added=None): 22 | """Export cell embeddings as a txt table""" 23 | if key_added: 24 | if embed_fn.endswith(".tsv"): 25 | embed_fn = embed_fn[0:-4] 26 | embed_fn = f"{embed_fn}_{key_added}.tsv" 27 | if key not in adata.obsm.keys(): 28 | raise KeyError(f"{key} is not a valid `.obsm` key") 29 | mat = adata.obsm[key].copy() 30 | if n_comp is not None and mat.shape[1] >= n_comp: 31 | mat = mat[:, 0:n_comp] 32 | pd.DataFrame(mat, index=adata.obs_names).to_csv( 33 | embed_fn, sep=sep, header=False, index=True 34 | ) 35 | 36 | 37 | # The functions below 
handles slot key. 38 | # 39 | # Default keys are those read and written by scanpy functions by default, e.g 40 | # "X_pca", "neighbors", "louvain", etc. 41 | # 42 | # Of them, `obsm_key` specifically refers to those used for embedding, e.g 43 | # "X_pca", "X_tsne", "X_umap", etc. 44 | # 45 | # The approach for supplying a non-standard key to a function as input is: 46 | # if the function only reads the value in the default key, we first backup the 47 | # value in the default key, then write the value of the non-standard key into 48 | # the standard key, run the funtion, and finally restore the value of the 49 | # default key from backup and delete the backup. 50 | # 51 | # The approach for writting the results of a function to a non-standard key is: 52 | # if the function only writes to the default key, we first backup the value in 53 | # the default key, run the function, copy the value of the default key to the 54 | # desired non-standard key, and finally restore the value of the default key 55 | # from backup and delete the backup. 56 | # 57 | # Specical treatment for obsm_key is needed, as the underlying data type is not 58 | # a python dictionary but a numpy array. 
59 | 60 | 61 | def _backup_default_key(slot, default): 62 | if default in slot.keys(): 63 | bkup_key = f"{default}_bkup" 64 | if bkup_key in slot.keys(): 65 | sc.logging.warn(f"overwrite existing {bkup_key}") 66 | slot[bkup_key] = slot[default] 67 | 68 | 69 | def _restore_default_key(slot, default, key=None): 70 | if key != default: 71 | bkup_key = f"{default}_bkup" 72 | if bkup_key in slot.keys(): 73 | slot[default] = slot[bkup_key] 74 | del slot[bkup_key] 75 | 76 | 77 | def _delete_backup_key(slot, default): 78 | bkup_key = f"{default}_bkup" 79 | if bkup_key in slot.keys(): 80 | del slot[bkup_key] 81 | 82 | 83 | def _set_default_key(slot, default, key): 84 | if key != default: 85 | if key not in slot.keys(): 86 | raise KeyError(f"{key} does not exist") 87 | _backup_default_key(slot, default) 88 | slot[default] = slot[key] 89 | 90 | 91 | def _rename_default_key(slot, default, key): 92 | if not default in slot.keys(): 93 | raise KeyError(f"{default} does not exist") 94 | slot[key] = slot[default] 95 | del slot[default] 96 | _restore_default_key(slot, default) 97 | 98 | 99 | def _backup_obsm_key(adata, key): 100 | if key in adata.obsm_keys(): 101 | bkup_key = f"{key}_bkup" 102 | if bkup_key in adata.obsm_keys(): 103 | sc.logging.warn(f"overwrite existing {bkup_key}") 104 | adata.obsm[bkup_key] = adata.obsm[key] 105 | 106 | 107 | def _restore_obsm_key(adata, key, new_key=None): 108 | if new_key != key: 109 | bkup_key = f"{key}_bkup" 110 | if bkup_key in adata.obsm_keys(): 111 | adata.obsm[key] = adata.obsm[bkup_key] 112 | del adata.obsm[bkup_key] 113 | 114 | 115 | def _delete_obsm_backup_key(adata, key): 116 | bkup_key = f"{key}_bkup" 117 | if bkup_key in adata.obsm_keys(): 118 | del adata.obsm[bkup_key] 119 | 120 | 121 | def _set_obsm_key(adata, key, new_key): 122 | if new_key != key: 123 | if new_key not in adata.obsm_keys(): 124 | raise KeyError(f"{new_key} does not exist") 125 | _backup_obsm_key(adata, key) 126 | adata.obsm[key] = adata.obsm[new_key] 127 | 128 | 
129 | def _rename_obsm_key(adata, from_key, to_key): 130 | if not from_key in adata.obsm_keys(): 131 | raise KeyError(f"{from_key} does not exist") 132 | adata.obsm[to_key] = adata.obsm[from_key] 133 | del adata.obsm[from_key] 134 | _restore_obsm_key(adata, from_key) 135 | 136 | 137 | # Place the content of .X or specified layer in a specified backup location. 138 | 139 | 140 | def _save_matrix(adata, save_raw=False, save_layer=None, layer=None): 141 | if save_raw: 142 | adata.raw = adata 143 | if save_layer is not None: 144 | if layer is not None: 145 | if layer not in adata.layers(): 146 | raise KeyError(f"Layer {layer} does not exist") 147 | adata.layers[save_layer] = adata.layers[layer] 148 | else: 149 | adata.layers[save_layer] = adata.X 150 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="scanpy-scripts", 8 | version="1.9.301", 9 | author="nh3", 10 | author_email="nh3@users.noreply.github.com", 11 | description="Scripts for using scanpy from the command line", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ebi-gene-expression-group/scanpy-scripts", 15 | packages=find_packages(), 16 | scripts=[ 17 | "scanpy-scripts-tests.bats", 18 | ], 19 | entry_points=dict( 20 | console_scripts=[ 21 | "scanpy-cli=scanpy_scripts.cli:cli", 22 | "scanpy-read-10x=scanpy_scripts.cmds:READ_CMD", 23 | "scanpy-filter-cells=scanpy_scripts.cmds:FILTER_CMD", 24 | "scanpy-filter-genes=scanpy_scripts.cmds:FILTER_CMD", 25 | "scanpy-normalise-data=scanpy_scripts.cmds:NORM_CMD", 26 | "scanpy-find-variable-genes=scanpy_scripts.cmds:HVG_CMD", 27 | "scanpy-scale-data=scanpy_scripts.cmds:SCALE_CMD", 28 | 
"scanpy-regress=scanpy_scripts.cmds:REGRESS_CMD", 29 | "scanpy-run-pca=scanpy_scripts.cmds:PCA_CMD", 30 | "scanpy-neighbors=scanpy_scripts.cmds:NEIGHBOR_CMD", 31 | "scanpy-run-tsne=scanpy_scripts.cmds:TSNE_CMD", 32 | "scanpy-run-umap=scanpy_scripts.cmds:UMAP_CMD", 33 | "scanpy-find-cluster=scanpy_scripts.cli:cluster", 34 | "scanpy-find-markers=scanpy_scripts.cmds:DIFFEXP_CMD", 35 | ] 36 | ), 37 | install_requires=[ 38 | # "packaging", 39 | # "anndata", 40 | # "scipy", 41 | # "matplotlib", 42 | # "pandas", 43 | # "h5py<3.0.0", 44 | "scanpy==1.9.3", 45 | "louvain", 46 | "igraph", 47 | "leidenalg", 48 | "loompy", 49 | "Click<8", 50 | # "umap-learn", 51 | "harmonypy>=0.0.5", 52 | "bbknn>=1.5.0,<1.6.0", 53 | "mnnpy>=0.1.9.5", 54 | "scipy<1.9.0", 55 | "scikit-learn<1.3.0", 56 | "scrublet", 57 | "fa2", 58 | ], 59 | ) 60 | -------------------------------------------------------------------------------- /test-env.yaml: -------------------------------------------------------------------------------- 1 | name: scanpy-scripts 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - scanpy=1.9.3 8 | - louvain 9 | - igraph 10 | - leidenalg 11 | - loompy 12 | - Click <8 13 | - harmonypy>=0.0.5 14 | - bbknn>=1.5.0,<1.6.0 15 | - mnnpy>=0.1.9.5 16 | # for mnnpy using n_jobs 17 | - scipy <1.9.0 18 | - scikit-learn <1.3.0 19 | - scrublet 20 | - fa2 21 | # for testing 22 | - bats 23 | - black 24 | - pytest 25 | --------------------------------------------------------------------------------