├── .history
│   ├── Cell type Annotation
│   │   ├── finetune cell type classification cross_20230826230604.py
│   │   └── finetune cell type classification cross_20250115140910.py
│   ├── installation_baselines
│   │   ├── Dockerfile_20241228134847
│   │   └── Dockerfile_20241228134950
│   ├── readme_20231228192429.md
│   ├── readme_20240311112645.md
│   ├── readme_20240311112739.md
│   ├── readme_20240311112740.md
│   ├── readme_20240311112848.md
│   ├── readme_20240311112907.md
│   ├── readme_20240311112908.md
│   ├── readme_20241130154021.md
│   ├── readme_20241130154116.md
│   ├── readme_20241210100101.md
│   ├── readme_20241210100332.md
│   ├── readme_20241228135005.md
│   ├── readme_20241228135006.md
│   ├── sceval_lib_20240311171801.py
│   ├── sceval_lib_20240317220609.py
│   ├── sceval_lib_20240317220632.py
│   ├── sceval_method_20240311113014.py
│   ├── sceval_method_20240311113104.py
│   ├── sceval_method_20240311113148.py
│   ├── sceval_method_20240311113334.py
│   ├── sceval_method_20240311113451.py
│   ├── sceval_method_20240311113518.py
│   ├── sceval_method_20240311113519.py
│   ├── sceval_method_20240311113920.py
│   ├── sceval_method_20240311113921.py
│   ├── sceval_method_20240312125931.py
│   ├── sceval_method_20240312130054.py
│   ├── sceval_method_20240312130115.py
│   ├── sceval_method_20240312130116.py
│   ├── sceval_method_20240312130143.py
│   ├── sceval_method_20240312130144.py
│   ├── sceval_method_20240312130145.py
│   ├── sceval_method_20240317220612.py
│   └── sceval_method_20240317220624.py
├── Batch Effect Correction
│   ├── bec_cellplm.py
│   ├── bec_geneformer.py
│   ├── bec_scf.sh
│   ├── bec_scim.py
│   ├── bec_tgpt.py
│   ├── bec_uce.sh
│   ├── finetune batch effect correction official.ipynb
│   ├── finetune batch effect correction.ipynb
│   ├── sceval_batcheffect.py
│   ├── sceval_batcheffect.sh
│   ├── sceval_batcheffect_official.py
│   └── sceval_batcheffect_scgpt.py
├── Cell type Annotation
│   ├── cta_cellm.sh
│   ├── cta_geneformer.py
│   ├── cta_scbert.py
│   ├── cta_scf.py
│   ├── cta_scim.py
│   ├── cta_tgpt_uce.py
│   ├── finetune cell type classification cross.py
│   ├── finetune cell-type annotation official scgpt.py
│   ├── finetune cell-type annotation official.ipynb
│   └── finetune cell-type annotation official.py
├── Dockerfile
├── Gene Network Analysis
│   ├── GRN Inference example scGPT.ipynb
│   ├── gna_geneformer.py
│   ├── gna_scf.sh
│   ├── grn.py
│   ├── sceval_gna1.py
│   ├── sceval_gna2.py
│   └── sceval_gna_selfdefineEval.py
├── Gene function preiction
│   ├── Gene Function Prediction.ipynb
│   ├── gfp_geneformer.py
│   ├── sceval_gfp.py
│   └── sceval_gfp_scgpt.py
├── Imputation
│   ├── imp_cellplm.py
│   ├── sceval_singlecell_imputation.py
│   ├── sceval_spatial_imputation_finetuning.py
│   ├── sceval_spatial_imputation_zero_shots.py
│   ├── spatial imputation mouse zeroshots.ipynb
│   └── spatial imputation mouse.ipynb
├── Multi-omic data integration
│   ├── finetune multiomics official.py
│   ├── finetune multiomics.ipynb
│   ├── finetune multiomics.py
│   └── multiomic integration official.ipynb
├── Perturbation Prediction
│   ├── finetune perturb seq.ipynb
│   ├── finetune perturbation prediction official.py
│   ├── perturbation prediction official.ipynb
│   ├── peturbation prediction.py
│   ├── pp_scf.sh
│   └── pp_uce_tgpt_scim.py
├── Scaling
│   ├── emergent_ability.md
│   └── vanilla_NN.py
├── Simulation
│   ├── Test simulation.ipynb
│   └── sceval_simulation.py
├── installation_baselines
│   ├── Dockerfile
│   ├── GeneCompass.yml
│   ├── cellm.yml
│   ├── cellplm.yml
│   ├── geneformer.yml
│   ├── scFoundation.yml
│   ├── scbert.yml
│   ├── scgpt.yml
│   ├── scimilarity.yml
│   ├── tgpt.yml
│   └── uce.yml
├── readme.md
├── sceval_lib.py
├── sceval_method.py
├── scgpt.yml
├── scgpt_bench.yml
├── scib
│   ├── __init__.py
│   ├── _package_tools.py
│   ├── exceptions.py
│   ├── integration.py
│   ├── knn_graph
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── knn_graph.cpp
│   │   └── makefile
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── ari.py
│   │   ├── cell_cycle.py
│   │   ├── clustering.py
│   │   ├── graph_connectivity.py
│   │   ├── highly_variable_genes.py
│   │   ├── isolated_labels.py
│   │   ├── kbet.py
│   │   ├── lisi.py
│   │   ├── metrics.py
│   │   ├── nmi.py
│   │   ├── pcr.py
│   │   ├── silhouette.py
│   │   ├── trajectory.py
│   │   └── utils.py
│   ├── preprocessing.py
│   ├── resources
│   │   ├── g2m_genes_tirosh.txt
│   │   ├── g2m_genes_tirosh_hm.txt
│   │   ├── s_genes_tirosh.txt
│   │   └── s_genes_tirosh_hm.txt
│   ├── trajectory_inference.py
│   └── utils.py
└── scjoint.py

/.history/installation_baselines/Dockerfile_20241228134847:
--------------------------------------------------------------------------------
# Start from a Miniconda image
FROM continuumio/miniconda3:latest

# Create a working directory
WORKDIR /app

# Copy the environment file into the container
COPY scgpt_bench.yml .

# Create the environment.
# mamba (installed first) gives much faster dependency solving.
# `-n scgpt` forces the environment name so the PATH below is predictable,
# and `conda clean` runs in the same layer so the package cache does not
# inflate the image.
RUN conda install -n base -c conda-forge mamba && \
    mamba env create -n scgpt -f scgpt_bench.yml && \
    conda clean --all --yes

# Put the environment on PATH so it is active by default
ENV PATH=/opt/conda/envs/scgpt/bin:$PATH

# (Optional) Set a default command to start a shell
CMD ["/bin/bash"]
--------------------------------------------------------------------------------
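For reference, a minimal sketch of building and running this image, assuming it is built from a directory that also contains `scgpt_bench.yml`; the `sceval-bench` tag and the mounted data path are illustrative placeholders, not names used by the repo:

```
# Build the image (-f points at this history snapshot of the Dockerfile)
docker build -f Dockerfile_20241228134847 -t sceval-bench .

# Run with GPU access (requires the NVIDIA Container Toolkit) and mount local data
docker run --gpus all -it -v /path/to/data:/app/data sceval-bench
```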
/.history/readme_20241210100101.md:
--------------------------------------------------------------------------------
# scEval😈: An evaluation platform for single-cell Foundation Models (FMs)

This is the repo for our benchmarking and analysis project. All benchmarked methods were collected as of Dec 1st, 2024.

News: We are collaborating with [OpenProblems](https://openproblems.bio/) to keep this benchmark alive! Stay tuned, and we will update the benchmarking results soon!

# Install

To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment from this yml file on your own machine:
```
conda env create -n scgpt --file scgpt_bench.yml
```

If you run into version conflicts, you can comment out the problematic packages and update the environment in place:

```
conda activate scgpt
conda env update --file scgpt_bench.yml
```

For the other methods we used, please refer to their original project websites for instructions. We recommend creating a separate environment for each method. Because installing the different scFMs can be difficult, we provide the yml files we used to install these models in the folder **installation_baselines**.

These methods include:

[tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are single-cell FMs.

And

[TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.


We need scIB for evaluation. Please use pip to install it:
```
pip install scib
```
We also provide a scib version with our new functions in this repo. Please make sure you have **scib >= 1.0.4** to run kBET correctly.
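For orientation, here is a minimal sketch of the kind of scib call the evaluation builds on. The file name, the `X_emb` embedding key, and the `batch`/`celltype` columns are placeholders for your own data, and the exact keyword set varies between scib versions:

```
import scanpy as sc
import scib

# Output saved by one of the benchmarked models, with the integrated
# embedding stored in .obsm["X_emb"] (all names here are placeholders).
adata = sc.read_h5ad("model_output.h5ad")

# scib compares an integrated object against an unintegrated reference;
# passing the same object twice is the common single-file setup.
results = scib.metrics.metrics(
    adata,
    adata,
    batch_key="batch",
    label_key="celltype",
    embed="X_emb",
    ari_=True,
    nmi_=True,
    silhouette_=True,
    kBET_=True,  # needs scib >= 1.0.4, as noted above
)
print(results)
```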

# Install

To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment from this yml file on your own machine:
```
conda env create -n scgpt --file scgpt_bench.yml
```

If you face any issues due to version conflicts, you can comment out the problematic packages in the yml file and then run:

```
conda activate scgpt
conda env update --file scgpt_bench.yml
```

For the other methods we used, please refer to their original project websites for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide the list of yml files we used to install these models in the folder **installation_baselines**.

These methods include:

[tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are single-cell FMs.

And

[TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.


We need scIB for evaluation. Please use pip to install it:
```
pip install scib
```
We also provide a scib version with our new functions in this repo. Please make sure you have **scib >= 1.0.4** to run kBET correctly.

We will release a version of scEval with more functions in the future!


# Pre-training weights

Most of our experiments were performed using the weights from [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [CellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM).

scFoundation relies on its APIs or a local server for access; please refer to [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompass can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main).

# Benchmarking information

Please refer to the individual folders for the scEval code and the metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and the corresponding metrics here:

| Tasks                                                                 | Metrics                                                |
|-----------------------------------------------------------------------|--------------------------------------------------------|
| Batch Effect Correction, Multi-omics Data Integration and Simulation  | [scIB](https://github.com/theislab/scib)               |
| Cell-type Annotation and Gene Function Prediction                     | Accuracy, Precision, Recall and F1 score               |
| Imputation                                                            | [scIB](https://github.com/theislab/scib), Correlation  |
| Perturbation Prediction                                               | Correlation                                            |
| Gene Network Analysis                                                 | Jaccard similarity                                     |

The file **sceval_lib.py** includes all of the metrics we used in this project.

To run the code for the different tasks, please use (here we take batch effect correction with scGPT as an example):

```
python sceval_batcheffect.py
```

We recommend directly evaluating the methods based on their outputs (saved as .h5ad files), which can easily be done with the code in **sceval_method.py**.

We offer demo datasets for batch effect correction and cell-type annotation. These datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).

To avoid using wandb, please set:

```
os.environ["WANDB_MODE"] = "offline"
```

We will upload our codes for benchmarking different foundation models soon.

# Devices

We recommend using a server to run the benchmarked methods and the scEval platform. To run single-cell Foundation Models, GPUs (A100 or a higher version) and 40+ GB of memory are required. To run scEval (the evaluation only), 40+ GB of memory is recommended.

# Results

We have an official website summarizing our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access.

# Contact

Please contact tianyu.liu@yale.edu if you have any questions about this project.

# Citation

```
@article{liu2023evaluating,
  title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
  author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
  journal={bioRxiv},
  pages={2023--09},
  year={2023},
  publisher={Cold Spring Harbor Laboratory}
}
```
-------------------------------------------------------------------------------- /.history/readme_20241210100332.md: --------------------------------------------------------------------------------
# scEval😈: An evaluation platform for single-cell Foundation Models (FMs)

This is the repo for our benchmarking and analysis project. All methods were collected as of Dec 1st, 2024.

News: We are collaborating with [OpenProblems](https://openproblems.bio/) to keep this benchmark alive! Stay tuned; we will update the benchmarking results soon!

# Install

To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment from this yml file on your own machine:
```
conda env create -n scgpt --file scgpt_bench.yml
```

If you face any issues due to version conflicts, you can comment out the problematic packages in the yml file and then run:

```
conda activate scgpt
conda env update --file scgpt_bench.yml
```

For the other methods we used, please refer to their original project websites for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide the list of yml files we used to install these models in the folder **installation_baselines**.

These methods include:

[tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are single-cell FMs.

And

[TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.


We need scIB for evaluation. Please use pip to install it:
```
pip install scib
```
We also provide a scib version with our new functions in this repo. Please make sure you have **scib >= 1.0.4** to run kBET correctly.

We will release a version of scEval with more functions in the future!


# Pre-training weights

Most of our experiments were performed using the weights from [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [CellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM).

scFoundation relies on its APIs or a local server for access; please refer to [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompass can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main).

# Benchmarking information

Please refer to the individual folders for the scEval code and the metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and the corresponding metrics here:

| Tasks                                                                 | Metrics                                                |
|-----------------------------------------------------------------------|--------------------------------------------------------|
| Batch Effect Correction, Multi-omics Data Integration and Simulation  | [scIB](https://github.com/theislab/scib)               |
| Cell-type Annotation and Gene Function Prediction                     | Accuracy, Precision, Recall and F1 score               |
| Imputation                                                            | [scIB](https://github.com/theislab/scib), Correlation  |
| Perturbation Prediction                                               | Correlation, Mean Squared Error                        |
| Gene Network Analysis                                                 | Jaccard similarity                                     |

The file **sceval_lib.py** includes all of the metrics we used in this project.

To run the code for the different tasks, please use (here we take batch effect correction with scGPT as an example):

```
python sceval_batcheffect.py
```

We recommend directly evaluating the methods based on their outputs (saved as .h5ad files), which can easily be done with the code in **sceval_method.py**.

We offer demo datasets for batch effect correction and cell-type annotation. These datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew).

To avoid using wandb, please set:

```
os.environ["WANDB_MODE"] = "offline"
```

We will upload our codes for benchmarking different foundation models soon.

# Devices

We recommend using a server to run the benchmarked methods and the scEval platform. To run single-cell Foundation Models, GPUs (A100 or a higher version) and 40+ GB of memory are required. To run scEval (the evaluation only), 40+ GB of memory is recommended.

# Results

We have an official website summarizing our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access.

# Contact

Please contact tianyu.liu@yale.edu if you have any questions about this project.

# Citation

```
@article{liu2023evaluating,
  title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
  author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
  journal={bioRxiv},
  pages={2023--09},
  year={2023},
  publisher={Cold Spring Harbor Laboratory}
}
```
-------------------------------------------------------------------------------- /.history/readme_20241228135005.md: --------------------------------------------------------------------------------
# scEval😈: An evaluation platform for single-cell Foundation Models (FMs)

This is the repo for our benchmarking and analysis project. All methods were collected as of Dec 1st, 2024.

News: We are collaborating with [OpenProblems](https://openproblems.bio/) to keep this benchmark alive! Stay tuned; we will update the benchmarking results soon!

# Install

To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment from this yml file on your own machine:
```
conda env create -n scgpt --file scgpt_bench.yml
```

If you face any issues due to version conflicts, you can comment out the problematic packages in the yml file and then run:

```
conda activate scgpt
conda env update --file scgpt_bench.yml
```

We also provide a Docker-based installation (a GPU is needed). To build the image, please use:

```
docker build -t my-conda-image .
```

To start a container, please use:

```
docker run -it --rm my-conda-image
```

For the other methods we used, please refer to their original project websites for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide the list of yml files and an example Dockerfile we used to install these models in the folder **installation_baselines**.

These methods include:

[tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are single-cell FMs.

And

[TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.


We need scIB for evaluation. Please use pip to install it:
```
pip install scib
```
We also provide a scib version with our new functions in this repo. Please make sure you have **scib >= 1.0.4** to run kBET correctly.

We will release a version of scEval with more functions in the future!


# Pre-training weights

Most of our experiments were performed using the weights from [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [CellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE).
Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM).

scFoundation relies on its APIs or a local server for access; please refer to [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompass can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main).

# Benchmarking information

Please refer to the individual folders for the scEval code and the metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and the corresponding metrics here:

| Tasks                                                                 | Metrics                                                |
|-----------------------------------------------------------------------|--------------------------------------------------------|
| Batch Effect Correction, Multi-omics Data Integration and Simulation  | [scIB](https://github.com/theislab/scib)               |
| Cell-type Annotation and Gene Function Prediction                     | Accuracy, Precision, Recall and F1 score               |
| Imputation                                                            | [scIB](https://github.com/theislab/scib), Correlation  |
| Perturbation Prediction                                               | Correlation, Mean Squared Error                        |
| Gene Network Analysis                                                 | Jaccard similarity                                     |

The file **sceval_lib.py** includes all of the metrics we used in this project.

To run the code for the different tasks, please use (here we take batch effect correction with scGPT as an example):

```
python sceval_batcheffect.py
```

We recommend directly evaluating the methods based on their outputs (saved as .h5ad files), which can easily be done with the code in **sceval_method.py**.

We offer demo datasets for batch effect correction and cell-type annotation. These datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew).

To avoid using wandb, please set:

```
os.environ["WANDB_MODE"] = "offline"
```

We will upload our codes for benchmarking different foundation models soon.

# Devices

We recommend using a server to run the benchmarked methods and the scEval platform. To run single-cell Foundation Models, GPUs (A100 or a higher version) and 40+ GB of memory are required. To run scEval (the evaluation only), 40+ GB of memory is recommended.

# Results

We have an official website summarizing our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access.

# Contact

Please contact tianyu.liu@yale.edu if you have any questions about this project.

# Citation

```
@article{liu2023evaluating,
  title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
  author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
  journal={bioRxiv},
  pages={2023--09},
  year={2023},
  publisher={Cold Spring Harbor Laboratory}
}
```
-------------------------------------------------------------------------------- /.history/sceval_lib_20240311171801.py: --------------------------------------------------------------------------------
from anndata import AnnData
import torch
import numpy as np
import scib
import scanpy as sc
import scipy
import scipy.stats
from scgpt.utils import set_seed
from sklearn.metrics import classification_report
from typing import List, Tuple, Dict, Union, Optional

set_seed(0)


def eval_scib_metrics(
    adata: AnnData,
    batch_key: str = "batch",
    label_key: str = "celltype",
    emb_name: str = "X_scGPT",
    notes: Optional[str] = None,
) -> Dict:
    results = scib.metrics.metrics(
        adata,
        adata_int=adata,
        batch_key=batch_key,
        label_key=label_key,
        embed=emb_name,
        isolated_labels_asw_=False,
        silhouette_=True,
        hvg_score_=False,
        graph_conn_=True,
        pcr_=True,
        isolated_labels_f1_=False,
        trajectory_=False,
        nmi_=True,
        ari_=True,
        cell_cycle_=False,
        kBET_=True,
        ilisi_=False,
        clisi_=False,
    )

    result_dict = results[0].to_dict()

    result_dict["avg_bio"] = np.mean(
        [
            result_dict["NMI_cluster/label"],
            result_dict["ARI_cluster/label"],
            result_dict["ASW_label"],
        ]
    )

    # remove nan values in result_dict
    result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}

    print(results)
    return result_dict


def eval_scib_metrics_onlybio(
    adata: AnnData,
    batch_key: str = "batch",
    label_key: str = "celltype",
    emb_name: str = "X_scGPT",
    notes: Optional[str] = None,
) -> Dict:
    # metrics_onlybio is provided by the modified scib version shipped in this repo
    results = scib.metrics.metrics_onlybio(
        adata,
        adata_int=adata,
        batch_key=batch_key,
        label_key=label_key,
        embed=emb_name,
        isolated_labels_asw_=False,
        silhouette_=True,
        hvg_score_=False,
        graph_conn_=True,
        pcr_=True,
        isolated_labels_f1_=False,
        trajectory_=False,
        nmi_=True,
        ari_=True,
        cell_cycle_=False,
        kBET_=False,
        ilisi_=False,
        clisi_=False,
    )

    result_dict = results[0].to_dict()
    result_dict["avg_bio"] = np.mean(
        [
            result_dict["NMI_cluster/label"],
            result_dict["ARI_cluster/label"],
            result_dict["ASW_label"],
        ]
    )

    # remove nan values in result_dict
    result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}

    print(results)
    return result_dict


def calculate_correlation_metric(y1, y2):
    # Sum of Pearson correlations across paired rows of two tensors.
    cor = 0.0
    y1 = y1.float()
    y2 = y2.float()
    for id1, id2 in zip(y1, y2):
        cor_cal, _ = scipy.stats.pearsonr(id1, id2)
        cor += cor_cal.item()
    return cor


class scEval(object):

    def __init__(self, adata):
        self.label = 'scGPT'
        self.adata = adata  # adata is the output of the model you plan to benchmark.
        self.pvalue = 0.005

    def evaluation_bec(self, batch_key='batch', label_key='celltype', emb_name='X_scGPT'):
        results = eval_scib_metrics(self.adata, batch_key, label_key, emb_name)
        return results

    def evaluation_cta_gfp(self, pred_label, true_label):
        results = classification_report(pred_label, true_label, digits=4)
        return results

    def evaluation_perturb_pred(self, pred_model, true_result):
        # Assume the outputs are both in AnnData format. Rows are cells while columns are genes.
        cor_total = calculate_correlation_metric(pred_model.X.T, true_result.X.T)
        return {"correlation": cor_total / len(pred_model.X.T)}

    def evaluation_perturb_pred_gearsofficial(self, gears_model, pred_model):
        from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis
        test_res = evaluate(gears_model.dataloader['test_loader'], pred_model)
        test_metrics, test_pert_res = compute_metrics(test_res)
        return test_metrics

    def evaluation_imputation_scrna(self, batch_key='batch', label_key='celltype', emb_name='X_scGPT'):
        results = eval_scib_metrics_onlybio(self.adata, batch_key, label_key, emb_name)
        return results

    def evaluation_imputation_spatial(self, adata_sp):
        adata_imp_new = self.adata[:, adata_sp.var_names]
        cor_list = []
        pval_list = []
        for item in adata_sp.var_names:
            adata1 = adata_sp[:, item]
            adata2 = adata_imp_new[:, item]
            cor, pval = scipy.stats.pearsonr(np.array(adata1.X.todense().T)[0], np.array(adata2.X.T)[0])  # for this step, please check the data form
            cor_list.append(cor)
            pval_list.append(pval)

        adata_imp_new.var['cor'] = cor_list
        adata_imp_new.var['pval'] = pval_list

        mean_cor = np.mean(adata_imp_new.var['cor'].values)

        avg_sig = np.sum(adata_imp_new.var['pval'].values < self.pvalue)  # number of genes imputed with a significant correlation
        return {"mean_correlation": mean_cor, "significant_genes": avg_sig}
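

# Minimal usage sketch, assuming the benchmarked model saved its output as an .h5ad
# file with batch labels in .obs["batch"], cell-type labels in .obs["celltype"], and
# its embedding in .obsm["X_scGPT"]; the file name below is a placeholder, not a file
# shipped with this repo.
if __name__ == "__main__":
    demo_adata = sc.read_h5ad("model_output.h5ad")  # hypothetical model output
    evaluator = scEval(demo_adata)
    bec_results = evaluator.evaluation_bec(batch_key="batch", label_key="celltype", emb_name="X_scGPT")
    print(bec_results)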
-------------------------------------------------------------------------------- /Cell type Annotation/cta_tgpt_uce.py: --------------------------------------------------------------------------------
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import mode
import scanpy as sc
import sklearn
import warnings

sys.path.insert(0, "../")
import scgpt as scg

# extra dependency for similarity search
try:
    import faiss

    faiss_imported = True
except ImportError:
    faiss_imported = False
    print(
        "faiss not installed! We highly recommend installing it for fast similarity search."
    )
    print("To install it, see https://github.com/facebookresearch/faiss/wiki/Installing-Faiss")

warnings.filterwarnings("ignore", category=ResourceWarning)

ref_embed_adata = sc.read_h5ad("./tgpt_out/spaital_mouse_slideseqv2_tgpt_all.h5ad")  # you can change it accordingly

ref_embed_adata

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_recall_fscore_support, classification_report

train_obs, test_obs = train_test_split(
    ref_embed_adata.obs_names, random_state=42
)

adata_train = ref_embed_adata[train_obs]
adata_test = ref_embed_adata[test_obs]

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(adata_train.X, adata_train.obs.celltype)  # or adata.obsm['emb'] for UCE
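# Note: this is a linear-probe evaluation; the foundation-model embeddings above stay
# frozen, and only this logistic-regression head is trained on the reference split.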

pred_label = clf.predict(adata_test.X)
true_label = adata_test.obs.celltype

print(classification_report(true_label, pred_label, digits=4))
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
# Start from a Miniconda image
FROM continuumio/miniconda3:latest

# Create a working directory
WORKDIR /app

# Copy the environment.yml file into the container
COPY scgpt_bench.yml .

# Create the environment
# Using `mamba` here for faster installations; it's included in newer images.
RUN conda install -n base -c conda-forge mamba && \
    mamba env create -f scgpt_bench.yml

# Put the environment on PATH by default
# NOTE: the path below must match the `name:` field inside scgpt_bench.yml;
# replace `myenv` with that name if it differs.
ENV PATH /opt/conda/envs/myenv/bin:$PATH

# Clean up conda cache to reduce image size
RUN conda clean --all --yes

# (Optional) Set a default command to start a shell
CMD ["/bin/bash"]
-------------------------------------------------------------------------------- /Gene Network Analysis/gna_geneformer.py: --------------------------------------------------------------------------------
from geneformer import EmbExtractor
import time

# initialize EmbExtractor
t1 = time.time()
embex = EmbExtractor(
    emb_mode='gene',
    forward_batch_size=20,
    nproc=16
)

# extract embeddings from the input data
# input data is tokenized rank value encodings generated by the Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)
# example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset

# pip install tdigest

embs = embex.extract_embs("./",
                          "./data/datasets/immune_all_human.dataset/",
                          "./humanpbmc/",
                          "output_prefix")


t2 = time.time()
print(t2 - t1)
import resource
print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / (1e6))
import torch
print(torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024)


embs.to_csv("immune_all_human.csv")
-------------------------------------------------------------------------------- /Gene Network Analysis/gna_scf.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Gene module embedding extraction
python get_embedding_h5ad.py --task_name ihatest --input_type singlecell --output_type gene --pool_type all --tgthighres f1 --data_path "/gpfs/gibbs/pi/zhao/tl688/scgpt_dataset/Immune_ALL_human.h5ad" --save_path ./examples/genemodule/ --pre_normalized F --demo
-------------------------------------------------------------------------------- /Gene Network Analysis/sceval_gna_selfdefineEval.py: --------------------------------------------------------------------------------
import scanpy as sc
import numpy as np

import pandas as pd
from grn import GeneEmbedding
import seaborn as sns
import gseapy as gp

adata = sc.read_h5ad("pbmc_tissue_gene_embeddings.h5ad")

# Marker genes defined by the original paper, filtered based on expression profiles.
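# Below, each gene embedding is labeled with the cell type whose marker list contains
# it (genes matching no marker list get None), so the marker structure can later be
# inspected on the UMAP of gene embeddings.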
mkr_set = {'Erythrocytes': ['CST3'],
           'Erythroid progenitors': ['GATA2'],
           'CD10+ B cells': ['MME'],
           'Megakaryocyte progenitors': ['PF4', 'ITGA2B', 'PPBP'],
           'HSPCs': ['CD34', 'PROCR'],
           'Monocyte progenitors': ['IRF8', 'CSF1R', 'LY86'],
           'Plasmacytoid dendritic cells': ['GZMB', 'IL3RA'],
           'CD20+ B cells': ['MS4A1'],
           'Plasma cells': [],
           'Monocyte-derived dendritic cells': ['CD1C', 'FCER1A'],
           'CD14+ Monocytes': ['CD14'],
           'CD16+ Monocytes': ['FCGR3A'],
           'CD4+ T cells': ['CD4'],
           'CD8+ T cells': ['CD8B', 'CD8A'],
           'NK cells': ['NKG7', 'GNLY'],
           }

makerlist = []

for i in adata.obs['gene_name']:
    count = 0
    for ctp in mkr_set.keys():
        if i in mkr_set[ctp]:
            makerlist.append(ctp)
            count = 1
    if count == 0:
        makerlist.append(None)

adata.obs['new_marker'] = makerlist

sc.pl.umap(adata, color='new_marker', edges=True)


# specific pathway from scGPT suggestions
mole_list = pd.read_table("Participating Molecules [R-HSA-168256].tsv")

mole_list_dnarna = mole_list[mole_list["MoleculeType"] == 'DNA/RNA']

adata_new = adata
cofunction_gene = []
for i in mole_list_dnarna["MoleculeName"].values:
    gene = i.split(' ')[1]
    cofunction_gene.append(gene)

adata_HLA = adata_new[[True if ('HLA' in i) else False for i in adata_new.obs['gene_name'].values]]
adata_CD = adata_new[[True if ('CD' in i) else False for i in adata_new.obs['gene_name'].values]]

CD_genes = adata_new.obs['gene_name'].values

# Meta info about the number of terms (tests) in the databases
df_database = pd.DataFrame(
    data=[['GO_Biological_Process_2021', 6036],
          ['GO_Molecular_Function_2021', 1274],
          ['Reactome_2022', 1818]],
    columns=['dataset', 'term'])

# Select the desired database for the query; here we use Reactome as an example
databases = ['Reactome_2022']
m = df_database[df_database['dataset'].isin(databases)]['term'].sum()
# p-value correction for the total number of tests done
p_thresh = 0.05 / m

# Perform pathway enrichment analysis using the gseapy package in the Reactome database
df = pd.DataFrame()
enr_Reactome = gp.enrichr(gene_list=CD_genes,
                          gene_sets=databases,
                          organism='Human',
                          outdir='test/enr_Reactome',
                          cutoff=0.5)
out = enr_Reactome.results
out = out[out['P-value'] < p_thresh]
df = pd.concat([df, out], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
print(df)
-------------------------------------------------------------------------------- /Imputation/imp_cellplm.py: --------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

import hdf5plugin
import numpy as np
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
from CellPLM.utils import set_seed
from CellPLM.utils.data import stratified_sample_genes_by_sparsity
from CellPLM.pipeline.imputation import ImputationPipeline, ImputationDefaultPipelineConfig, ImputationDefaultModelConfig
from CellPLM.pipeline.experimental import symbol_to_ensembl

## Specify important parameters before getting started

DATASET = 'Liver'  # 'Lung'
PRETRAIN_VERSION = '20230926_85M'
DEVICE = 'cuda:0'

## Load Downstream Dataset
set_seed(11)

ref_data = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/mouse_scrnaseq.h5ad")
query_data = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/mouse_spatial.h5ad")

ref_data.var_names = symbol_to_ensembl(ref_data.var_names)
query_data.var_names = symbol_to_ensembl(query_data.var_names)

ref_data.var_names_make_unique()
query_data.var_names_make_unique()

ref_data.obs['batch'] = ref_data.obs_names
query_data.obs['batch'] = query_data.obs_names

query_data.var_names

# ref_data.var_names = [i.upper() for i in ref_data.var_names]
# query_data.var_names = [i.upper() for i in query_data.var_names]
# target_genes = query_data.var_names
target_genes = ['ENSG00000206579']
query_data.obsm['truth'] = query_data[:, target_genes].X.toarray()
query_data[:, target_genes].X = 0
train_data = query_data.concatenate(ref_data, join='outer', batch_key=None, index_unique=None)

train_data.obs['split'] = 'train'
# use .loc to avoid pandas chained-assignment issues
train_data.obs.loc[train_data.obs['batch'] == query_data.obs['batch'][-1], 'split'] = 'valid'
train_data.obs.loc[train_data.obs['batch'] == ref_data.obs['batch'][-1], 'split'] = 'valid'


query_data.obs['platform'] = 'merfish'

query_data.obsm['spatial'][:, 0]

query_data.obs['x_FOV_px'] = query_data.obsm['spatial'][:, 0]
query_data.obs['y_FOV_px'] = query_data.obsm['spatial'][:, 1]

query_data.var.index

ref_data.var.index

query_var_new = []
for i in query_data.var.index:
    if "ENSG" in i:
        query_var_new.append(i)
ref_var_new = []
for i in ref_data.var.index:
    if "ENSG" in i:
        ref_var_new.append(i)

query_data = query_data[:, query_var_new]
ref_data = ref_data[:, ref_var_new]

## Specify gene to impute

query_genes = [g for g in query_data.var.index if g not in ['MRPL15']]
query_batches = list(query_data.obs['batch'].unique())
ref_batches = list(ref_data.obs['batch'].unique())
batch_gene_list = dict(zip(list(query_batches) + list(ref_batches),
                           [query_genes]*len(query_batches) + [ref_data.var.index.tolist()]*len(ref_batches)))

## Overwrite parts of the default config
pipeline_config = ImputationDefaultPipelineConfig.copy()
model_config = ImputationDefaultModelConfig.copy()

pipeline_config, model_config

## Fine-tuning

pipeline = ImputationPipeline(pretrain_prefix=PRETRAIN_VERSION,  # Specify the pretrain checkpoint to load
                              overwrite_config=model_config,  # This is for overwriting part of the pretrain config
                              pretrain_directory='/gpfs/gibbs/pi/zhao/tl688/CellPLM_cta/ckpt/')
pipeline.model

# batch_gene_list
pipeline.fit(train_data,  # An AnnData object
             pipeline_config,  # The config dictionary we created previously, optional
             split_field='split',  # Specify a column in .obs that contains split information
             train_split='train',
             valid_split='valid',
             batch_gene_list=batch_gene_list,  # Specify genes that are measured in each batch, see previous section for more details
             device=DEVICE,
             )
-------------------------------------------------------------------------------- /Perturbation Prediction/pp_scf.sh: --------------------------------------------------------------------------------
# GEARS
python get_embedding.py --task_name GEARS_demo_batch --input_type singlecell --output_type gene_batch --pool_type all --tgthighres f1 --data_path ./examples/GEARS/pre_in.npy --save_path ./examples/GEARS/ --pre_normalized A
-------------------------------------------------------------------------------- /Perturbation Prediction/pp_uce_tgpt_scim.py: --------------------------------------------------------------------------------
from torch_geometric.loader import DataLoader
from gears_001 import PertData, GEARS
from gears_001.inference import compute_metrics, deeper_analysis, non_dropout_analysis
from gears_001.utils import create_cell_graph_dataset_for_prediction

import scanpy as sc
import numpy as np
import sklearn

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


def model_training(adata_train, emb_name='X_uce'):
    # Linear-regression baseline: frozen FM embeddings plus a perturbation flag as input.
    model = LinearRegression()
    train_data = np.concatenate([adata_train.obsm[emb_name], adata_train.obs['pert_condition'].values.reshape(-1, 1)], axis=1)
    pred_data = adata_train.obsm['ground_truth']
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    model.fit(train_data, pred_data)
    return model, scaler


adata = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/tgpt_out/adata_train_adamson_tgpt_all.h5ad")  # can replace it with other embeddings
adata.obsm['ground_truth'] = adata.layers['ground_truth'].copy()
model, scaler = model_training(adata, emb_name='X_tgpt')

from gears import PertData, GEARS  # note: shadows the gears_001 imports above; used here only for data loading

# get data
pert_data = PertData('./data')
# pert_data = PertData('./data_folder')
# load dataset in paper: norman, adamson, dixit.
pert_data.load(data_name='adamson')
# specify data split
pert_data.prepare_split(split='simulation', seed=1)
# get dataloader with batch size
pert_data.get_dataloader(batch_size=1024, test_batch_size=1024)

adata = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/tgpt_out/adata_test_adamson_tgpt_all.h5ad")

adata.obsm['ground_truth'] = adata.layers['ground_truth'].copy()


import torch

def eval_perturb(
    loader: DataLoader, adata, model, scaler, obsm_name='X_uce'
):
    """
    Run model in inference mode using a given data loader
    """

    pert_cat = []
    pred = []
    truth = []
    pred_de = []
    truth_de = []
    results = {}
    logvar = []

    for itr, batch in enumerate(loader):
        pert_cat.extend(batch.pert)

        adata_filter = adata[itr*1024:(itr+1)*1024]
        test_data = np.concatenate([adata_filter.obsm[obsm_name], adata_filter.obs['pert_condition'].values.reshape(-1, 1)], axis=1)
        test_data = scaler.transform(test_data)
        p = model.predict(test_data)
        # print(p)
        t = batch.y.numpy()
        pred.extend(p)
        truth.extend(t)
        # Differentially expressed genes
        for j, de_idx in enumerate(batch.de_idx):
            pred_de.append(p[j, de_idx])
            truth_de.append(t[j, de_idx])

    # all genes
    results["pert_cat"] = np.array(pert_cat)
    pred = np.stack(pred)
    truth = np.stack(truth)
    results["pred"] = pred
    results["truth"] = truth

    pred_de = np.stack(pred_de)
    truth_de = np.stack(truth_de)
    results["pred_de"] = pred_de
    results["truth_de"] = truth_de

    return results

results = eval_perturb(pert_data.dataloader['test_loader'], adata, model, scaler, obsm_name='X_tgpt')

test_metrics, test_pert_res = compute_metrics(results)
print(test_metrics)
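

# Quick sanity check: a sketch assuming `results["pred"]` and `results["truth"]` are
# the cell-by-gene arrays assembled in eval_perturb above; the mean per-cell Pearson
# correlation computed here should roughly track the correlation metrics reported by
# compute_metrics.
from scipy.stats import pearsonr

per_cell_cor = [pearsonr(p, t)[0] for p, t in zip(results["pred"], results["truth"])]
print("mean per-cell Pearson r:", np.mean(per_cell_cor))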
-------------------------------------------------------------------------------- /Scaling/emergent_ability.md: --------------------------------------------------------------------------------
# Emergent Ability analysis

Here we discuss our experimental design for analyzing the emergent ability of single-cell LLMs. All the results and pipelines here are related to Figure 21 in the main text.

# Cross-data cell-type annotation

We compared the performance of scGPT to a vanilla NN on cross-data cell-type annotation. The datasets here are "demo_train.h5ad" and "demo_test.h5ad"; both are from pancreas tissue. The code here is related to "Cell type Annotation".

# Cross-species cell-type annotation

We compared the performance of scGPT to a vanilla NN on cell-type prediction for 1. spatial transcriptomics and 2. the mouse cell atlas separated by batch. The code here is related to "Cell type Annotation".

# Spatial transcriptomics batch effect correction

We collect spatial transcriptomics data from human brain without cell labels and reduce the batch effect of the two datasets based on scGPT. The code here is related to "Batch Effect Correction".


-------------------------------------------------------------------------------- /installation_baselines/Dockerfile: --------------------------------------------------------------------------------
# Start from a Miniconda image
FROM continuumio/miniconda3:latest

# Create a working directory
WORKDIR /app

# Copy the environment.yml file into the container
COPY scgpt_bench.yml .

# Create the environment
# Using `mamba` here for faster installations; it's included in newer images.
RUN conda install -n base -c conda-forge mamba && \
    mamba env create -f scgpt_bench.yml

# Put the environment on PATH by default
# NOTE: the path below must match the `name:` field inside scgpt_bench.yml;
# replace `myenv` with that name if it differs.
ENV PATH /opt/conda/envs/myenv/bin:$PATH

# Clean up conda cache to reduce image size
RUN conda clean --all --yes

# (Optional) Set a default command to start a shell
CMD ["/bin/bash"]
-------------------------------------------------------------------------------- /installation_baselines/cellm.yml: --------------------------------------------------------------------------------
name: OpenBioMed
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - boost=1.78.0=py38h4e30db6_4
  - boost-cpp=1.78.0=h6582d0a_3
  - brotli=1.0.9=h166bdaf_9
  - brotli-bin=1.0.9=h166bdaf_9
  - bzip2=1.0.8=h7f98852_4
  - ca-certificates=2023.7.22=hbcca054_0
  - cairo=1.16.0=hbbf8b49_1016
  - certifi=2023.7.22=pyhd8ed1ab_0
  - contourpy=1.1.0=py38h7f3f72f_0
  - cycler=0.11.0=pyhd8ed1ab_0
  - expat=2.5.0=hcb278e6_1
  - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
  - font-ttf-inconsolata=3.000=h77eed37_0
  - font-ttf-source-code-pro=2.038=h77eed37_0
  - font-ttf-ubuntu=0.83=hab24e00_0
  - fontconfig=2.14.2=h14ed4e7_0
  - fonts-conda-ecosystem=1=0
  - fonts-conda-forge=1=0
  - fonttools=4.41.1=py38h01eb140_0
  - freetype=2.12.1=hca18f0e_1
  - freetype-py=2.3.0=pyhd8ed1ab_0
  - gettext=0.21.1=h27087fc_0
  - greenlet=2.0.2=py38h17151c0_1
  - icu=72.1=hcb278e6_0
  - importlib-resources=6.0.0=pyhd8ed1ab_1
  - importlib_resources=6.0.0=pyhd8ed1ab_1
  -
kiwisolver=1.4.4=py38h43d8883_1 36 | - lcms2=2.15=haa2dc70_1 37 | - ld_impl_linux-64=2.40=h41732ed_0 38 | - lerc=4.0.0=h27087fc_0 39 | - libblas=3.9.0=17_linux64_openblas 40 | - libbrotlicommon=1.0.9=h166bdaf_9 41 | - libbrotlidec=1.0.9=h166bdaf_9 42 | - libbrotlienc=1.0.9=h166bdaf_9 43 | - libcblas=3.9.0=17_linux64_openblas 44 | - libdeflate=1.18=h0b41bf4_0 45 | - libexpat=2.5.0=hcb278e6_1 46 | - libffi=3.4.2=h7f98852_5 47 | - libgcc-ng=13.1.0=he5830b7_0 48 | - libgfortran-ng=13.1.0=h69a702a_0 49 | - libgfortran5=13.1.0=h15d22d2_0 50 | - libglib=2.76.4=hebfc3b9_0 51 | - libgomp=13.1.0=he5830b7_0 52 | - libiconv=1.17=h166bdaf_0 53 | - libjpeg-turbo=2.1.5.1=h0b41bf4_0 54 | - liblapack=3.9.0=17_linux64_openblas 55 | - libnsl=2.0.0=h7f98852_0 56 | - libopenblas=0.3.23=pthreads_h80387f5_0 57 | - libpng=1.6.39=h753d276_0 58 | - libsqlite=3.42.0=h2797004_0 59 | - libstdcxx-ng=13.1.0=hfd8a6a1_0 60 | - libtiff=4.5.1=h8b53f26_0 61 | - libuuid=2.38.1=h0b41bf4_0 62 | - libwebp-base=1.3.1=hd590300_0 63 | - libxcb=1.15=h0b41bf4_0 64 | - libzlib=1.2.13=hd590300_5 65 | - matplotlib-base=3.7.2=py38hf5b0b65_0 66 | - munkres=1.1.4=pyh9f0ad1d_0 67 | - ncurses=6.4=hcb278e6_0 68 | - numpy=1.24.4=py38h59b608b_0 69 | - openjpeg=2.5.0=hfec8fc6_2 70 | - openssl=3.1.1=hd590300_1 71 | - packaging=23.1=pyhd8ed1ab_0 72 | - pandas=2.0.3=py38h01efb38_1 73 | - pcre2=10.40=hc3806b6_0 74 | - pillow=10.0.0=py38h885162f_0 75 | - pip=23.2.1=pyhd8ed1ab_0 76 | - pixman=0.40.0=h36c2ea0_0 77 | - pthread-stubs=0.4=h36c2ea0_1001 78 | - pycairo=1.24.0=py38h1a1917b_0 79 | - pyparsing=3.0.9=pyhd8ed1ab_0 80 | - python=3.8.17=he550d4f_0_cpython 81 | - python-dateutil=2.8.2=pyhd8ed1ab_0 82 | - python-tzdata=2023.3=pyhd8ed1ab_0 83 | - python_abi=3.8=3_cp38 84 | - pytz=2023.3=pyhd8ed1ab_0 85 | - rdkit=2023.03.2=py38h36d2b2f_0 86 | - readline=8.2=h8228510_1 87 | - reportlab=4.0.4=py38h01eb140_0 88 | - rlpycairo=0.2.0=pyhd8ed1ab_0 89 | - setuptools=68.0.0=pyhd8ed1ab_0 90 | - six=1.16.0=pyh6c4a22f_0 91 | - sqlalchemy=2.0.19=py38h01eb140_0 92 | - tk=8.6.12=h27826a3_0 93 | - typing-extensions=4.7.1=hd8ed1ab_0 94 | - typing_extensions=4.7.1=pyha770c72_0 95 | - unicodedata2=15.0.0=py38h0a891b7_0 96 | - wheel=0.41.0=pyhd8ed1ab_0 97 | - xorg-kbproto=1.0.7=h7f98852_1002 98 | - xorg-libice=1.1.1=hd590300_0 99 | - xorg-libsm=1.2.4=h7391055_0 100 | - xorg-libx11=1.8.6=h8ee46fc_0 101 | - xorg-libxau=1.0.11=hd590300_0 102 | - xorg-libxdmcp=1.1.3=h7f98852_0 103 | - xorg-libxext=1.3.4=h0b41bf4_2 104 | - xorg-libxrender=0.9.11=hd590300_0 105 | - xorg-renderproto=0.11.1=h7f98852_1002 106 | - xorg-xextproto=7.3.0=h0b41bf4_1003 107 | - xorg-xproto=7.0.31=h7f98852_1007 108 | - xz=5.2.6=h166bdaf_0 109 | - zipp=3.16.2=pyhd8ed1ab_0 110 | - zlib=1.2.13=hd590300_5 111 | - zstd=1.5.2=hfc55251_7 112 | - pip: 113 | - anndata==0.9.2 114 | - charset-normalizer==3.2.0 115 | - cmake==3.27.0 116 | - einops==0.6.1 117 | - filelock==3.12.2 118 | - fsspec==2023.6.0 119 | - h5py==3.9.0 120 | - huggingface-hub==0.16.4 121 | - idna==3.4 122 | - importlib-metadata==6.8.0 123 | - jinja2==3.1.2 124 | - joblib==1.3.1 125 | - lit==16.0.6 126 | - littleutils==0.2.2 127 | - llvmlite==0.40.1 128 | - local-attention==1.8.6 129 | - markupsafe==2.1.3 130 | - mhfp==1.9.6 131 | - mpmath==1.3.0 132 | - natsort==8.4.0 133 | - networkx==3.1 134 | - numba==0.57.1 135 | - nvidia-cublas-cu11==11.10.3.66 136 | - nvidia-cuda-cupti-cu11==11.7.101 137 | - nvidia-cuda-nvrtc-cu11==11.7.99 138 | - nvidia-cuda-runtime-cu11==11.7.99 139 | - nvidia-cudnn-cu11==8.5.0.96 140 | - nvidia-cufft-cu11==10.9.0.58 
141 | - nvidia-curand-cu11==10.2.10.91 142 | - nvidia-cusolver-cu11==11.4.0.1 143 | - nvidia-cusparse-cu11==11.7.4.91 144 | - nvidia-nccl-cu11==2.14.3 145 | - nvidia-nvtx-cu11==11.7.91 146 | - ogb==1.3.6 147 | - outdated==0.2.2 148 | - patsy==0.5.3 149 | - psutil==5.9.5 150 | - pyg-lib==0.2.0+pt20cu117 151 | - pynndescent==0.5.10 152 | - pyyaml==6.0.1 153 | - regex==2023.6.3 154 | - requests==2.31.0 155 | - safetensors==0.3.1 156 | - scanpy==1.9.3 157 | - scikit-learn==1.3.0 158 | - scipy==1.10.1 159 | - seaborn==0.12.2 160 | - session-info==1.0.0 161 | - statsmodels==0.14.0 162 | - stdlib-list==0.9.0 163 | - sympy==1.12 164 | - threadpoolctl==3.2.0 165 | - tokenizers==0.13.3 166 | - torch==2.0.1 167 | - torch-cluster==1.6.1+pt20cu117 168 | - torch-geometric==2.3.1 169 | - torch-scatter==2.1.1+pt20cu117 170 | - torch-sparse==0.6.17+pt20cu117 171 | - torch-spline-conv==1.2.2+pt20cu117 172 | - tqdm==4.65.0 173 | - transformers==4.31.0 174 | - triton==2.0.0 175 | - umap-learn==0.5.3 176 | - urllib3==2.0.4 177 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/OpenBioMed 178 | -------------------------------------------------------------------------------- /installation_baselines/scbert.yml: -------------------------------------------------------------------------------- 1 | name: scbert 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_gnu 9 | - bzip2=1.0.8=h7f98852_4 10 | - ca-certificates=2022.12.7=ha878542_0 11 | - ld_impl_linux-64=2.40=h41732ed_0 12 | - libffi=3.4.2=h7f98852_5 13 | - libgcc-ng=12.2.0=h65d4601_19 14 | - libgomp=12.2.0=h65d4601_19 15 | - libnsl=2.0.0=h7f98852_0 16 | - libsqlite=3.40.0=h753d276_1 17 | - libuuid=2.38.1=h0b41bf4_0 18 | - libzlib=1.2.13=h166bdaf_4 19 | - ncurses=6.3=h27087fc_1 20 | - openssl=3.1.0=hd590300_3 21 | - pip=23.1.2=pyhd8ed1ab_0 22 | - python=3.8.16=he550d4f_1_cpython 23 | - readline=8.2=h8228510_1 24 | - setuptools=67.7.2=pyhd8ed1ab_0 25 | - tk=8.6.12=h27826a3_0 26 | - wheel=0.40.0=pyhd8ed1ab_0 27 | - xz=5.2.6=h166bdaf_0 28 | - pip: 29 | - anndata==0.9.1 30 | - anyio==3.6.2 31 | - argon2-cffi==21.3.0 32 | - argon2-cffi-bindings==21.2.0 33 | - arrow==1.2.3 34 | - asttokens==2.2.1 35 | - attrs==23.1.0 36 | - axial-positional-embedding==0.2.1 37 | - backcall==0.2.0 38 | - beautifulsoup4==4.12.2 39 | - bleach==6.0.0 40 | - blosc2==2.0.0 41 | - certifi==2022.12.7 42 | - cffi==1.15.1 43 | - charset-normalizer==3.1.0 44 | - click==8.1.3 45 | - cmake==3.26.3 46 | - comm==0.1.3 47 | - contourpy==1.0.7 48 | - cycler==0.11.0 49 | - cython==0.29.34 50 | - debugpy==1.6.7 51 | - decorator==5.1.1 52 | - defusedxml==0.7.1 53 | - dunamai==1.16.0 54 | - einops==0.6.1 55 | - executing==1.2.0 56 | - fastjsonschema==2.16.3 57 | - filelock==3.12.0 58 | - fonttools==4.39.3 59 | - fqdn==1.5.1 60 | - get-version==3.5.4 61 | - h5py==3.8.0 62 | - huggingface-hub==0.0.8 63 | - idna==3.4 64 | - importlib-metadata==6.6.0 65 | - importlib-resources==5.12.0 66 | - ipykernel==6.23.1 67 | - ipython==8.12.2 68 | - ipython-genutils==0.2.0 69 | - ipywidgets==8.0.6 70 | - isoduration==20.11.0 71 | - jedi==0.18.2 72 | - jinja2==3.1.2 73 | - joblib==1.2.0 74 | - jsonpointer==2.3 75 | - jsonschema==4.17.3 76 | - jupyter==1.0.0 77 | - jupyter-client==8.2.0 78 | - jupyter-console==6.6.3 79 | - jupyter-core==5.3.0 80 | - jupyter-events==0.6.3 81 | - jupyter-server==2.5.0 82 | - jupyter-server-terminals==0.4.4 83 | - jupyterlab-pygments==0.2.2 84 | - jupyterlab-widgets==3.0.7 85 | - kiwisolver==1.4.4 86 
| - legacy-api-wrap==1.2 87 | - lit==16.0.5 88 | - llvmlite==0.39.1 89 | - local-attention==1.8.6 90 | - loompy==3.0.7 91 | - markupsafe==2.1.2 92 | - matplotlib==3.6.3 93 | - matplotlib-inline==0.1.6 94 | - mistune==2.0.5 95 | - mpmath==1.3.0 96 | - msgpack==1.0.5 97 | - natsort==8.3.1 98 | - nbclassic==1.0.0 99 | - nbclient==0.7.4 100 | - nbconvert==7.4.0 101 | - nbformat==5.8.0 102 | - nest-asyncio==1.5.6 103 | - networkx==3.1 104 | - notebook==6.5.4 105 | - notebook-shim==0.2.3 106 | - numba==0.56.4 107 | - numexpr==2.8.4 108 | - numpy==1.19.2 109 | - numpy-groupies==0.9.22 110 | - nvidia-cublas-cu11==11.10.3.66 111 | - nvidia-cuda-cupti-cu11==11.7.101 112 | - nvidia-cuda-nvrtc-cu11==11.7.99 113 | - nvidia-cuda-runtime-cu11==11.7.99 114 | - nvidia-cudnn-cu11==8.5.0.96 115 | - nvidia-cufft-cu11==10.9.0.58 116 | - nvidia-curand-cu11==10.2.10.91 117 | - nvidia-cusolver-cu11==11.4.0.1 118 | - nvidia-cusparse-cu11==11.7.4.91 119 | - nvidia-nccl-cu11==2.14.3 120 | - nvidia-nvtx-cu11==11.7.91 121 | - packaging==23.1 122 | - pandas==1.1.5 123 | - pandocfilters==1.5.0 124 | - parso==0.8.3 125 | - patsy==0.5.3 126 | - pexpect==4.8.0 127 | - pickleshare==0.7.5 128 | - pillow==9.5.0 129 | - pkgutil-resolve-name==1.3.10 130 | - platformdirs==3.5.1 131 | - portalocker==2.7.0 132 | - prometheus-client==0.16.0 133 | - prompt-toolkit==3.0.38 134 | - psutil==5.9.5 135 | - ptyprocess==0.7.0 136 | - pure-eval==0.2.2 137 | - py-cpuinfo==9.0.0 138 | - pycparser==2.21 139 | - pygments==2.15.1 140 | - pynndescent==0.5.10 141 | - pyparsing==3.0.9 142 | - pyrsistent==0.19.3 143 | - python-dateutil==2.8.2 144 | - python-json-logger==2.0.7 145 | - pytz==2023.3 146 | - pyyaml==6.0 147 | - pyzmq==25.0.2 148 | - qtconsole==5.4.3 149 | - qtpy==2.3.1 150 | - regex==2023.5.5 151 | - requests==2.30.0 152 | - rfc3339-validator==0.1.4 153 | - rfc3986-validator==0.1.1 154 | - sacremoses==0.0.53 155 | - scanpy==1.7.2 156 | - scikit-learn==0.24.2 157 | - scipy==1.5.4 158 | - seaborn==0.12.2 159 | - send2trash==1.8.2 160 | - sinfo==0.3.4 161 | - six==1.16.0 162 | - sniffio==1.3.0 163 | - soupsieve==2.4.1 164 | - stack-data==0.6.2 165 | - statsmodels==0.14.0rc0 166 | - stdlib-list==0.8.0 167 | - sympy==1.12 168 | - tables==3.8.0 169 | - terminado==0.17.1 170 | - threadpoolctl==3.1.0 171 | - tinycss2==1.2.1 172 | - tokenizers==0.10.3 173 | - torch==2.0.1 174 | - torchdata==0.6.1 175 | - torchtext==0.15.2 176 | - torchvision==0.9.1 177 | - tornado==6.3.2 178 | - tqdm==4.65.0 179 | - traitlets==5.9.0 180 | - transformers==4.6.1 181 | - triton==2.0.0 182 | - typing-extensions==4.5.0 183 | - umap-learn==0.5.3 184 | - uri-template==1.2.0 185 | - urllib3==2.0.2 186 | - wcwidth==0.2.6 187 | - webcolors==1.13 188 | - webencodings==0.5.1 189 | - websocket-client==1.5.1 190 | - widgetsnbextension==4.0.7 191 | - zipp==3.15.0 192 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/scbert 193 | -------------------------------------------------------------------------------- /installation_baselines/scimilarity.yml: -------------------------------------------------------------------------------- 1 | name: scimilarity 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_gnu 9 | - bzip2=1.0.8=hd590300_5 10 | - ca-certificates=2023.11.17=hbcca054_0 11 | - ld_impl_linux-64=2.40=h41732ed_0 12 | - libffi=3.4.2=h7f98852_5 13 | - libgcc-ng=13.2.0=h807b86a_3 14 | - libgomp=13.2.0=h807b86a_3 15 | - libnsl=2.0.1=hd590300_0 16 | - libsqlite=3.44.2=h2797004_0 17 | - 
libuuid=2.38.1=h0b41bf4_0 18 | - libzlib=1.2.13=hd590300_5 19 | - ncurses=6.4=h59595ed_2 20 | - openssl=3.2.0=hd590300_1 21 | - pip=23.3.2=pyhd8ed1ab_0 22 | - python=3.8.18=hd12c33a_0_cpython 23 | - readline=8.2=h8228510_1 24 | - setuptools=68.2.2=pyhd8ed1ab_0 25 | - tk=8.6.13=noxft_h4845f30_101 26 | - wheel=0.42.0=pyhd8ed1ab_0 27 | - xz=5.2.6=h166bdaf_0 28 | - pip: 29 | - adjusttext==0.8 30 | - aiohttp==3.9.1 31 | - aiosignal==1.3.1 32 | - anndata==0.9.2 33 | - anyio==4.2.0 34 | - argon2-cffi==23.1.0 35 | - argon2-cffi-bindings==21.2.0 36 | - arrow==1.3.0 37 | - asciitree==0.3.3 38 | - asttokens==2.4.1 39 | - async-lru==2.0.4 40 | - async-timeout==4.0.3 41 | - attrs==23.1.0 42 | - babel==2.14.0 43 | - backcall==0.2.0 44 | - beautifulsoup4==4.12.2 45 | - bleach==6.1.0 46 | - captum==0.7.0 47 | - certifi==2023.11.17 48 | - cffi==1.16.0 49 | - charset-normalizer==3.3.2 50 | - circlify==0.15.0 51 | - click==8.1.7 52 | - comm==0.2.0 53 | - contourpy==1.1.1 54 | - cycler==0.12.1 55 | - cython==3.0.7 56 | - debugpy==1.8.0 57 | - decorator==5.1.1 58 | - defusedxml==0.7.1 59 | - demuxem==0.1.7 60 | - docopt==0.6.2 61 | - exceptiongroup==1.2.0 62 | - executing==2.0.1 63 | - fasteners==0.19 64 | - fastjsonschema==2.19.0 65 | - filelock==3.13.1 66 | - fonttools==4.47.0 67 | - fqdn==1.5.1 68 | - frozenlist==1.4.1 69 | - fsspec==2023.12.2 70 | - get-annotations==0.1.2 71 | - h5py==3.10.0 72 | - hnswlib==0.8.0 73 | - idna==3.6 74 | - igraph==0.10.8 75 | - importlib-metadata==7.0.0 76 | - importlib-resources==6.1.1 77 | - ipykernel==6.27.1 78 | - ipython==8.12.3 79 | - ipywidgets==8.1.1 80 | - isoduration==20.11.0 81 | - jedi==0.19.1 82 | - jinja2==3.1.2 83 | - joblib==1.3.2 84 | - json5==0.9.14 85 | - jsonpointer==2.4 86 | - jsonschema==4.20.0 87 | - jsonschema-specifications==2023.11.2 88 | - jupyter==1.0.0 89 | - jupyter-client==8.6.0 90 | - jupyter-console==6.6.3 91 | - jupyter-core==5.5.1 92 | - jupyter-events==0.9.0 93 | - jupyter-lsp==2.2.1 94 | - jupyter-server==2.12.1 95 | - jupyter-server-terminals==0.5.0 96 | - jupyterlab==4.0.9 97 | - jupyterlab-pygments==0.3.0 98 | - jupyterlab-server==2.25.2 99 | - jupyterlab-widgets==3.0.9 100 | - kiwisolver==1.4.5 101 | - leidenalg==0.10.1 102 | - lightgbm==4.1.0 103 | - lightning-utilities==0.10.0 104 | - llvmlite==0.41.1 105 | - loompy==3.0.7 106 | - louvain==0.8.1 107 | - markupsafe==2.1.3 108 | - matplotlib==3.7.4 109 | - matplotlib-inline==0.1.6 110 | - mistune==3.0.2 111 | - mpmath==1.3.0 112 | - multidict==6.0.4 113 | - natsort==8.4.0 114 | - nbclient==0.9.0 115 | - nbconvert==7.13.0 116 | - nbformat==5.9.2 117 | - nest-asyncio==1.5.8 118 | - networkx==3.1 119 | - notebook==7.0.6 120 | - notebook-shim==0.2.3 121 | - numba==0.58.1 122 | - numcodecs==0.12.1 123 | - numpy==1.24.4 124 | - numpy-groupies==0.9.22 125 | - nvidia-cublas-cu12==12.1.3.1 126 | - nvidia-cuda-cupti-cu12==12.1.105 127 | - nvidia-cuda-nvrtc-cu12==12.1.105 128 | - nvidia-cuda-runtime-cu12==12.1.105 129 | - nvidia-cudnn-cu12==8.9.2.26 130 | - nvidia-cufft-cu12==11.0.2.54 131 | - nvidia-curand-cu12==10.3.2.106 132 | - nvidia-cusolver-cu12==11.4.5.107 133 | - nvidia-cusparse-cu12==12.1.0.106 134 | - nvidia-nccl-cu12==2.18.1 135 | - nvidia-nvjitlink-cu12==12.3.101 136 | - nvidia-nvtx-cu12==12.1.105 137 | - obonet==1.0.0 138 | - overrides==7.4.0 139 | - packaging==23.2 140 | - pandas==2.0.3 141 | - pandocfilters==1.5.0 142 | - parso==0.8.3 143 | - patsy==0.5.4 144 | - pegasusio==0.8.1 145 | - pegasuspy==1.7.1 146 | - pexpect==4.9.0 147 | - pickleshare==0.7.5 148 | - pillow==10.1.0 149 
| - pkgutil-resolve-name==1.3.10 150 | - platformdirs==4.1.0 151 | - prometheus-client==0.19.0 152 | - prompt-toolkit==3.0.43 153 | - psutil==5.9.7 154 | - ptyprocess==0.7.0 155 | - pure-eval==0.2.2 156 | - pyarrow==14.0.2 157 | - pybind11==2.11.1 158 | - pycparser==2.21 159 | - pygments==2.17.2 160 | - pynndescent==0.5.11 161 | - pyparsing==3.1.1 162 | - python-dateutil==2.8.2 163 | - python-igraph==0.10.8 164 | - python-json-logger==2.0.7 165 | - pytorch-lightning==2.1.2 166 | - pytz==2023.3.post1 167 | - pyyaml==6.0.1 168 | - pyzmq==25.1.2 169 | - qtconsole==5.5.1 170 | - qtpy==2.4.1 171 | - referencing==0.32.0 172 | - requests==2.31.0 173 | - rfc3339-validator==0.1.4 174 | - rfc3986-validator==0.1.1 175 | - rpds-py==0.15.2 176 | - scanpy==1.9.6 177 | - scikit-learn==1.3.2 178 | - scikit-misc==0.2.0 179 | - scimilarity==0.1.0.post1.dev1+g683b129 180 | - scipy==1.10.1 181 | - seaborn==0.12.2 182 | - send2trash==1.8.2 183 | - session-info==1.0.0 184 | - six==1.16.0 185 | - sniffio==1.3.0 186 | - soupsieve==2.5 187 | - stack-data==0.6.3 188 | - statsmodels==0.14.1 189 | - stdlib-list==0.10.0 190 | - sympy==1.12 191 | - terminado==0.18.0 192 | - texttable==1.7.0 193 | - threadpoolctl==3.2.0 194 | - tiledb==0.24.0 195 | - tinycss2==1.2.1 196 | - tomli==2.0.1 197 | - torch==2.1.2 198 | - torchmetrics==1.2.1 199 | - tornado==6.4 200 | - tqdm==4.66.1 201 | - traitlets==5.14.0 202 | - triton==2.1.0 203 | - types-python-dateutil==2.8.19.14 204 | - typing-extensions==4.9.0 205 | - tzdata==2023.3 206 | - umap-learn==0.5.5 207 | - uri-template==1.3.0 208 | - urllib3==2.1.0 209 | - wcwidth==0.2.12 210 | - webcolors==1.13 211 | - webencodings==0.5.1 212 | - websocket-client==1.7.0 213 | - widgetsnbextension==4.0.9 214 | - wordcloud==1.9.3 215 | - xlsxwriter==3.1.9 216 | - yarl==1.9.4 217 | - zarr==2.16.1 218 | - zipp==3.17.0 219 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/scimilarity 220 | -------------------------------------------------------------------------------- /installation_baselines/uce.yml: -------------------------------------------------------------------------------- 1 | name: uce 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_gnu 9 | - bzip2=1.0.8=hd590300_5 10 | - ca-certificates=2023.11.17=hbcca054_0 11 | - ld_impl_linux-64=2.40=h41732ed_0 12 | - libffi=3.4.2=h7f98852_5 13 | - libgcc-ng=13.2.0=h807b86a_3 14 | - libgomp=13.2.0=h807b86a_3 15 | - libnsl=2.0.1=hd590300_0 16 | - libsqlite=3.44.2=h2797004_0 17 | - libuuid=2.38.1=h0b41bf4_0 18 | - libzlib=1.2.13=hd590300_5 19 | - ncurses=6.4=h59595ed_2 20 | - openssl=3.2.0=hd590300_1 21 | - pip=23.3.1=pyhd8ed1ab_0 22 | - python=3.8.18=hd12c33a_0_cpython 23 | - readline=8.2=h8228510_1 24 | - setuptools=68.2.2=pyhd8ed1ab_0 25 | - tk=8.6.13=noxft_h4845f30_101 26 | - wheel=0.42.0=pyhd8ed1ab_0 27 | - xz=5.2.6=h166bdaf_0 28 | - pip: 29 | - accelerate==0.25.0 30 | - accelerator==2023.11.3.dev1 31 | - anndata==0.9.2 32 | - bottle==0.12.25 33 | - certifi==2023.11.17 34 | - charset-normalizer==3.3.2 35 | - contourpy==1.1.1 36 | - cycler==0.12.1 37 | - filelock==3.13.1 38 | - fonttools==4.46.0 39 | - fsspec==2023.12.1 40 | - get-annotations==0.1.2 41 | - h5py==3.10.0 42 | - huggingface-hub==0.19.4 43 | - idna==3.6 44 | - importlib-metadata==7.0.0 45 | - importlib-resources==6.1.1 46 | - jinja2==3.1.2 47 | - joblib==1.3.2 48 | - kiwisolver==1.4.5 49 | - llvmlite==0.41.1 50 | - markupsafe==2.1.3 51 | - matplotlib==3.7.4 52 | - mpmath==1.3.0 53 | - 
natsort==8.4.0 54 | - networkx==3.1 55 | - numba==0.58.1 56 | - numpy==1.24.4 57 | - nvidia-cublas-cu12==12.1.3.1 58 | - nvidia-cuda-cupti-cu12==12.1.105 59 | - nvidia-cuda-nvrtc-cu12==12.1.105 60 | - nvidia-cuda-runtime-cu12==12.1.105 61 | - nvidia-cudnn-cu12==8.9.2.26 62 | - nvidia-cufft-cu12==11.0.2.54 63 | - nvidia-curand-cu12==10.3.2.106 64 | - nvidia-cusolver-cu12==11.4.5.107 65 | - nvidia-cusparse-cu12==12.1.0.106 66 | - nvidia-nccl-cu12==2.18.1 67 | - nvidia-nvjitlink-cu12==12.3.101 68 | - nvidia-nvtx-cu12==12.1.105 69 | - packaging==23.2 70 | - pandas==2.0.3 71 | - patsy==0.5.4 72 | - pillow==10.1.0 73 | - psutil==5.9.6 74 | - pynndescent==0.5.11 75 | - pyparsing==3.1.1 76 | - python-dateutil==2.8.2 77 | - pytz==2023.3.post1 78 | - pyyaml==6.0.1 79 | - requests==2.31.0 80 | - safetensors==0.4.1 81 | - scanpy==1.9.6 82 | - scikit-learn==1.3.2 83 | - scipy==1.10.1 84 | - seaborn==0.12.2 85 | - session-info==1.0.0 86 | - setproctitle==1.3.3 87 | - six==1.16.0 88 | - statsmodels==0.14.0 89 | - stdlib-list==0.10.0 90 | - sympy==1.12 91 | - threadpoolctl==3.2.0 92 | - torch==2.1.1 93 | - tqdm==4.66.1 94 | - triton==2.1.0 95 | - typing-extensions==4.8.0 96 | - tzdata==2023.3 97 | - umap-learn==0.5.5 98 | - urllib3==1.26.6 99 | - waitress==2.1.2 100 | - zipp==3.17.0 101 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/uce 102 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs) 2 | 3 | This is the repo for our benchmarking and analysis project. All methods were collected up to Dec 1st, 2024. 4 | 5 | News: We are collaborating with [OpenProblems](https://openproblems.bio/) to keep this benchmark alive! Stay tuned; we will update the benchmarking results soon! 6 | 7 | # Install 8 | 9 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment from this yml file on your own machine: 10 | ``` 11 | conda env create -n scgpt --file scgpt_bench.yml 12 | ``` 13 | 14 | If you face any issues due to version conflicts, you can comment out the problematic packages and then run: 15 | 16 | ``` 17 | conda activate scgpt 18 | conda env update --file scgpt_bench.yml 19 | ``` 20 | 21 | We also provide a Docker installation (a GPU is needed); please use: 22 | 23 | ``` 24 | docker build -t my-conda-image . 25 | ``` 26 | 27 | To activate it, please use: 28 | 29 | ``` 30 | docker run -it --rm my-conda-image 31 | ``` 32 | 33 | For other methods we used, please refer to their original project websites for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files and an example Dockerfile we used to install these models in the folder **installation_baselines**. 34 | 35 | These methods include: 36 | 37 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). 
All of these are single-cell FMs. 38 | 39 | And 40 | 41 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models. 42 | 43 | 44 | We need scIB for evaluation. Please use pip to install it: 45 | ``` 46 | pip install scib 47 | ``` 48 | We also provide a scib version with our new functions in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly. 49 | 50 | We will release a version of scEval with more functions in the future! 51 | 52 | 53 | # Pre-training weights 54 | 55 | Most of our experiments were performed with the weights from [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of the other models can be found in their respective repositories: [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [SCimilarity](https://github.com/Genentech/scimilarity), [UCE](https://github.com/snap-stanford/UCE), [tGPT](https://github.com/deeplearningplus/tGPT), and [CellPLM](https://github.com/OmicsML/CellPLM). 56 | 57 | scFoundation relies on APIs or a local server for access; please refer to [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompass can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). 58 | 59 | # Benchmarking information 60 | 61 | Please refer to the different folders for the scEval code and the metrics we used to evaluate single-cell LLMs on different tasks. In general, we list the tasks and corresponding metrics here: 62 | 63 | | Tasks | Metrics | 64 | |-------------------------------------------------------|------------------------------------------| 65 | | Batch Effect Correction, Multi-omics Data Integration, and Simulation | [scIB](https://github.com/theislab/scib) | 66 | | Cell-type Annotation and Gene Function Prediction | Accuracy, Precision, Recall and F1 score | 67 | | Imputation | [scIB](https://github.com/theislab/scib), Correlation | 68 | | Perturbation Prediction | Correlation, Mean Squared Error | 69 | | Gene Network Analysis | Jaccard similarity | 70 | 71 | The file 'sceval_lib.py' includes all of the metrics we used in this project. 
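72 | 73 | For example, here is a minimal sketch of evaluating a method's output with the `scEval` class from 'sceval_lib.py' (here "output.h5ad" is a placeholder for your own output file, with the corrected embedding stored in `adata.obsm`): 74 | 75 | ``` 76 | import scanpy as sc 77 | from sceval_lib import scEval 78 | 79 | # "output.h5ad" is a placeholder for the .h5ad file produced by the benchmarked method 80 | adata = sc.read_h5ad("output.h5ad") 81 | evaluator = scEval(adata) 82 | # batch effect correction metrics, computed on the embedding in adata.obsm['X_scGPT'] 83 | results = evaluator.evaluation_bec(batch_key="batch", label_key="celltype", emb_name="X_scGPT") 84 | print(results) 85 | ```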
86 | 87 | To run the codes for the different tasks, please use (we choose batch effect correction of scGPT as an example here): 88 | 89 | ``` 90 | python sceval_batcheffect.py 91 | ``` 92 | 93 | We recommend directly evaluating the methods based on their outputs (as .h5ad files), which can easily be done with the code in **sceval_method.py**. 94 | 95 | We offer demo datasets for batch effect correction and cell type annotation. These datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew). 96 | 97 | To avoid using wandb, please set: 98 | 99 | ``` 100 | os.environ["WANDB_MODE"] = "offline" 101 | 102 | ``` 103 | 104 | We will upload our codes for benchmarking different foundation models soon. 105 | 106 | # Devices 107 | 108 | We recommend using a server to run the benchmarked methods and the scEval platform. To run single-cell Foundation Models, GPU cores (A100 or newer) and 40+ GB of memory are required. To run scEval (the evaluation only), 40+ GB of memory is recommended. 109 | 110 | # Results 111 | 112 | We have an official website summarizing our work. Please use this [link](https://sites.google.com/yale.edu/sceval) to access it. 113 | 114 | # Contact 115 | 116 | Please contact tianyu.liu@yale.edu if you have any questions about this project. 117 | 118 | # Citation 119 | 120 | ``` 121 | @article{liu2023evaluating, 122 | title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis}, 123 | author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu}, 124 | journal={bioRxiv}, 125 | pages={2023--09}, 126 | year={2023}, 127 | publisher={Cold Spring Harbor Laboratory} 128 | } 129 | ``` -------------------------------------------------------------------------------- /sceval_lib.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import scib 4 | import scanpy as sc 5 | import scipy 6 | import scipy.stats 7 | from scgpt.utils import set_seed 8 | from anndata import AnnData 9 | from sklearn.metrics import classification_report 10 | from typing import List, Tuple, Dict, Union, Optional 11 | 12 | set_seed(0) 13 | def eval_scib_metrics( 14 | adata: AnnData, 15 | batch_key: str = "batch", 16 | label_key: str = "celltype", 17 | emb_name: str = "X_scGPT", 18 | notes: Optional[str] = None, 19 | ) -> Dict: 20 | results = scib.metrics.metrics( 21 | adata, 22 | adata_int=adata, 23 | batch_key=batch_key, 24 | label_key=label_key, 25 | embed=emb_name, 26 | isolated_labels_asw_=False, 27 | silhouette_=True, 28 | hvg_score_=False, 29 | graph_conn_=True, 30 | pcr_=True, 31 | isolated_labels_f1_=False, 32 | trajectory_=False, 33 | nmi_=True, 34 | ari_=True, 35 | cell_cycle_=False, 36 | kBET_=True, 37 | ilisi_=False, 38 | clisi_=False, 39 | ) 40 | 41 | result_dict = results[0].to_dict() 42 | 43 | result_dict["avg_bio"] = np.mean( 44 | [ 45 | result_dict["NMI_cluster/label"], 46 | result_dict["ARI_cluster/label"], 47 | result_dict["ASW_label"], 48 | ] 49 | ) 50 | 51 | # remove nan value in result_dict 52 | result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)} 53 | 54 | print(results) 55 | return result_dict 56 | 57 | 58 | def eval_scib_metrics_onlybio( 59 | adata: AnnData, 60 | batch_key: str = "batch", 61 | label_key: str = "celltype", 62 | 63 | emb_name: str = "X_scGPT", 64 | notes: Optional[str] = None, 65 | ) -> Dict: 66 | results = scib.metrics.metrics_onlybio( 67 | adata, 68 | 
adata_int=adata, 69 | batch_key=batch_key, 70 | label_key=label_key, 71 | embed=emb_name, 72 | isolated_labels_asw_=False, 73 | silhouette_=True, 74 | hvg_score_=False, 75 | graph_conn_=True, 76 | pcr_=True, 77 | isolated_labels_f1_=False, 78 | trajectory_=False, 79 | nmi_=True, 80 | ari_=True, 81 | cell_cycle_=False, 82 | kBET_=False, 83 | ilisi_=False, 84 | clisi_=False, 85 | ) 86 | 87 | result_dict = results[0].to_dict() 88 | result_dict["avg_bio"] = np.mean( 89 | [ 90 | result_dict["NMI_cluster/label"], 91 | result_dict["ARI_cluster/label"], 92 | result_dict["ASW_label"], 93 | ] 94 | ) 95 | 96 | # remove nan value in result_dict 97 | result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)} 98 | 99 | print(results) 100 | return result_dict 101 | 102 | def calculate_correlation_metric(y1, y2): 103 | cor = 0.0 104 | y1 = y1.float() 105 | y2 = y2.float() 106 | for id1, id2 in zip(y1, y2): 107 | 108 | cor_cal,_ = scipy.stats.pearsonr(id1,id2) 109 | cor += cor_cal.item() 110 | return cor 111 | 112 | 113 | class scEval(object): 114 | 115 | def __init__(self, adata): 116 | self.label = 'scGPT' 117 | self.adata = adata # adata is the output of the model you plan to benchmark. 118 | self.pvalue = 0.005 119 | 120 | def evaluation_bec(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'): 121 | results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name) 122 | return results 123 | 124 | 125 | def evaluation_cta_gfp(self, pred_label, true_label): 126 | results = classification_report(true_label, pred_label, digits=4) 127 | return results 128 | 129 | def evaluation_perturb_pred(self, pred_model, true_result): # assume the outputs are both in AnnData format. Rows are cells while columns are genes. 130 | cor_total = calculate_correlation_metric(pred_model.X.T, true_result.X.T) 131 | return {"correlation":cor_total / len(pred_model.X.T)} 132 | 133 | def evaluation_perturb_pred_gearsofficial(self, gears_model, pred_model ): 134 | from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis 135 | test_res = evaluate(gears_model.dataloader['test_loader'], pred_model) 136 | test_metrics, test_pert_res = compute_metrics(test_res) 137 | return test_metrics 138 | 139 | def evaluation_imputation_scrna(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'): 140 | results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name) 141 | return results 142 | 143 | def evaluation_imputation_spatial(self, adata_sp): 144 | adata_imp_new = self.adata[:, adata_sp.var_names] 145 | cor_list = [] 146 | pval_list = [] 147 | for item in adata_sp.var_names: 148 | adata1 = adata_sp[:,item] 149 | adata2 = adata_imp_new[:,item] 150 | cor, pval = scipy.stats.pearsonr(np.array(adata1.X.todense().T)[0], np.array(adata2.X.T)[0]) # for this step, please check the data form 151 | cor_list.append(cor) 152 | pval_list.append(pval) 153 | 154 | adata_imp_new.var['cor'] = cor_list 155 | adata_imp_new.var['pval'] = pval_list 156 | 157 | mean_cor = np.mean(adata_imp_new.var['cor'].values) 158 | 159 | avg_sig = np.sum(adata_imp_new.var['pval'].values < self.pvalue)  # count genes with significant imputed-vs-observed correlation -------------------------------------------------------------------------------- /scib/metrics/trajectory.py: -------------------------------------------------------------------------------- 71 | adata_post_ti2.obs.loc[ 72 | adata_post_ti2.obs["dpt_pseudotime"] > 1, "dpt_pseudotime" 73 | ] = 0 74 | adata_post_ti.obs["dpt_pseudotime"] = 0 75 | adata_post_ti.obs["dpt_pseudotime"] = adata_post_ti2.obs["dpt_pseudotime"] 76 | adata_post_ti.obs["dpt_pseudotime"].fillna(0, inplace=True) 77 | 78 | if batch_key is None: 79 | pseudotime_before = adata_pre_ti.obs[pseudotime_key] 80 | pseudotime_after = adata_post_ti.obs["dpt_pseudotime"] 81 | correlation = 
pseudotime_before.corr(pseudotime_after, "spearman") 82 | return (correlation + 1) / 2 # scaled 83 | else: 84 | check_batch(batch_key, adata_pre.obs) 85 | check_batch(batch_key, adata_post.obs) 86 | 87 | # check if batches match 88 | if not np.array_equal( 89 | adata_post_ti.obs[batch_key], adata_pre_ti.obs[batch_key] 90 | ): 91 | raise ValueError( 92 | "Batch columns do not match\n" 93 | f"adata_post_ti.obs['batch']:\n {adata_post_ti.obs[batch_key]}\n" 94 | f"adata_pre_ti.obs['batch']:\n {adata_pre_ti.obs[batch_key]}\n" 95 | ) 96 | 97 | corr = pd.Series(dtype=float) 98 | for i in adata_pre_ti.obs[batch_key].unique(): 99 | pseudotime_before = adata_pre_ti.obs[adata_pre_ti.obs[batch_key] == i][ 100 | pseudotime_key 101 | ] 102 | pseudotime_after = adata_post_ti.obs[adata_post_ti.obs[batch_key] == i][ 103 | "dpt_pseudotime" 104 | ] 105 | corr[i] = pseudotime_before.corr(pseudotime_after, "spearman") 106 | 107 | return (corr.mean() + 1) / 2 # scaled 108 | 109 | 110 | def get_root(adata_pre, adata_post, ct_key, pseudotime_key="dpt_pseudotime", dpt_dim=3): 111 | """Determine root cell for integrated adata based on unintegrated adata 112 | 113 | :param adata_pre: unintegrated adata 114 | :param adata_post: integrated adata 115 | :param ct_key: column in ``adata_pre.obs`` of the groups used to precompute the trajectory 116 | :param pseudotime_key: column in ``adata_pre.obs`` in which the pseudotime is saved. 117 | Column can contain empty entries, the dataset will be subset to the cells with scores. 118 | :param dpt_dim: number of diffmap dimensions used to determine root 119 | """ 120 | n_components, adata_post.obs["neighborhood"] = connected_components( 121 | csgraph=adata_post.obsp["connectivities"], directed=False, return_labels=True 122 | ) 123 | 124 | start_clust = adata_pre.obs.groupby([ct_key]).mean()[pseudotime_key].idxmin() 125 | min_dpt = adata_pre.obs[adata_pre.obs[ct_key] == start_clust].index 126 | which_max_neigh = ( 127 | adata_post.obs["neighborhood"] 128 | == adata_post.obs["neighborhood"].value_counts().idxmax() 129 | ) 130 | min_dpt = [ 131 | value for value in min_dpt if value in adata_post.obs[which_max_neigh].index 132 | ] 133 | 134 | adata_post_ti = adata_post[which_max_neigh] 135 | 136 | min_dpt = [adata_post_ti.obs_names.get_loc(i) for i in min_dpt] 137 | 138 | # compute Diffmap for adata_post 139 | sc.tl.diffmap(adata_post_ti) 140 | 141 | # determine most extreme cell in adata_post Diffmap 142 | min_dpt_cell = np.zeros(len(min_dpt)) 143 | for dim in np.arange(dpt_dim): 144 | 145 | diffmap_mean = adata_post_ti.obsm["X_diffmap"][:, dim].mean() 146 | diffmap_min_dpt = adata_post_ti.obsm["X_diffmap"][min_dpt, dim] 147 | 148 | # ensure root candidates exist in the largest component 149 | if len(diffmap_min_dpt) == 0: 150 | raise RootCellError("No root cell in largest component") 151 | 152 | # choose optimum function 153 | if len(diffmap_min_dpt) > 0 and diffmap_min_dpt.mean() < diffmap_mean: 154 | opt = np.argmin 155 | else: 156 | opt = np.argmax 157 | 158 | min_dpt_cell[opt(diffmap_min_dpt)] += 1 159 | 160 | # root cell is cell with max vote 161 | return min_dpt[np.argmax(min_dpt_cell)], adata_post_ti 162 | -------------------------------------------------------------------------------- /scib/metrics/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy import sparse 4 | 5 | # Errors 6 | 7 | 8 | class RootCellError(Exception): 9 | def __init__(self, message): 10 | self.message = message 11 | 12 | 13 | class 
NeighborsError(Exception): 14 | def __init__(self, message): 15 | self.message = message 16 | 17 | 18 | # Diffusion 19 | 20 | 21 | def diffusion_conn(adata, min_k=50, copy=True, max_iterations=26): 22 | """ 23 | Diffusion for connectivities matrix extension 24 | This function performs graph diffusion on the connectivities matrix until a 25 | minimum number `min_k` of entries per row are non-zero. 26 | 27 | Note: 28 | Due to self-loops, min_k-1 non-zero connectivities entries is actually the stopping 29 | criterion. This is equivalent to `sc.pp.neighbors`. 30 | 31 | Returns: 32 | The diffusion-enhanced connectivities matrix, or (if ``copy``) a copy of the 33 | AnnData object with the diffusion-enhanced connectivities matrix stored in 34 | `adata.uns["neighbors"]["diffusion_connectivities"]` 35 | """ 36 | if "neighbors" not in adata.uns: 37 | raise ValueError( 38 | "`neighbors` not in adata object. " "Please compute a neighbourhood graph!" 39 | ) 40 | 41 | if "connectivities" not in adata.obsp: 42 | raise ValueError( 43 | "`connectivities` not in `adata.obsp`. " 44 | "Please pass an object with connectivities computed!" 45 | ) 46 | 47 | T = adata.obsp["connectivities"] 48 | 49 | # Normalize T with max row sum 50 | # Note: This keeps the matrix symmetric and ensures |M| doesn't keep growing 51 | T = sparse.diags(1 / np.array([T.sum(1).max()] * T.shape[0])) * T 52 | 53 | M = T 54 | 55 | # Check for disconnected component 56 | n_comp, labs = sparse.csgraph.connected_components( 57 | adata.obsp["connectivities"], connection="strong" 58 | ) 59 | 60 | if n_comp > 1: 61 | tab = pd.value_counts(labs) 62 | small_comps = tab.index[tab < min_k] 63 | large_comp_mask = np.array(~pd.Series(labs).isin(small_comps)) 64 | else: 65 | large_comp_mask = np.array([True] * M.shape[0]) 66 | 67 | T_agg = T 68 | i = 2 69 | while ((M[large_comp_mask, :][:, large_comp_mask] > 0).sum(1).min() < min_k) and ( 70 | i < max_iterations 71 | ): 72 | print(f"Adding diffusion to step {i}") 73 | T_agg *= T 74 | M += T_agg 75 | i += 1 76 | 77 | if (M[large_comp_mask, :][:, large_comp_mask] > 0).sum(1).min() < min_k: 78 | raise ValueError( 79 | "could not create diffusion connectivities matrix " 80 | f"with at least {min_k} non-zero entries in " 81 | f"{max_iterations} iterations.\n Please increase the " 82 | "value of max_iterations or reduce min_k.\n" 83 | ) 84 | 85 | M.setdiag(0) 86 | 87 | if copy: 88 | adata_tmp = adata.copy() 89 | adata_tmp.uns["neighbors"].update({"diffusion_connectivities": M}) 90 | return adata_tmp 91 | 92 | else: 93 | return M 94 | 95 | 96 | def diffusion_nn(adata, k, max_iterations=26): 97 | """ 98 | Diffusion neighbourhood score 99 | This function generates a nearest neighbour list from a connectivities matrix 100 | as supplied by BBKNN or Conos. This allows us to select a consistent number 101 | of nearest neighbours across all methods. 102 | 103 | Returns: 104 | `k_indices` a numpy.ndarray of the indices of the k-nearest neighbors. 105 | """ 106 | if "neighbors" not in adata.uns: 107 | raise ValueError( 108 | "`neighbors` not in adata object. " "Please compute a neighbourhood graph!" 109 | ) 110 | 111 | if "connectivities" not in adata.obsp: 112 | raise ValueError( 113 | "`connectivities` not in `adata.obsp`. " 114 | "Please pass an object with connectivities computed!" 
115 | ) 116 | 117 | T = adata.obsp["connectivities"] 118 | 119 | # Row-normalize T 120 | T = sparse.diags(1 / T.sum(1).A.ravel()) * T 121 | 122 | T_agg = T**3 123 | M = T + T**2 + T_agg 124 | i = 4 125 | 126 | while ((M > 0).sum(1).min() < (k + 1)) and (i < max_iterations): 127 | # note: k+1 is used as diag is non-zero (self-loops) 128 | print(f"Adding diffusion to step {i}") 129 | T_agg *= T 130 | M += T_agg 131 | i += 1 132 | 133 | if (M > 0).sum(1).min() < (k + 1): 134 | raise NeighborsError( 135 | f"could not find {k} nearest neighbors in {max_iterations} " 136 | "diffusion steps.\n Please increase max_iterations or reduce" 137 | " k.\n" 138 | ) 139 | 140 | M.setdiag(0) 141 | k_indices = np.argpartition(M.A, -k, axis=1)[:, -k:] 142 | 143 | return k_indices 144 | 145 | 146 | # Not used 147 | 148 | 149 | def get_hvg_indices(adata, verbose=True): 150 | if "highly_variable" not in adata.var.columns: 151 | if verbose: 152 | print( 153 | f"No highly variable genes computed, continuing with full matrix {adata.shape}" 154 | ) 155 | return np.array(range(adata.n_vars)) 156 | return np.where(adata.var["highly_variable"].values)[0]  # element-wise boolean mask, not an identity check 157 | 158 | 159 | def select_hvg(adata, select=True): 160 | if select and "highly_variable" in adata.var: 161 | return adata[:, adata.var["highly_variable"]].copy() 162 | else: 163 | return adata 164 | -------------------------------------------------------------------------------- /scib/resources/g2m_genes_tirosh.txt: -------------------------------------------------------------------------------- 1 | Hmgb2 2 | Cdk1 3 | Nusap1 4 | Ube2c 5 | Birc5 6 | Tpx2 7 | Top2a 8 | Ndc80 9 | Cks2 10 | Nuf2 11 | Cks1b 12 | Mki67 13 | Tmpo 14 | Cenpf 15 | Tacc3 16 | Fam64a 17 | Smc4 18 | Ccnb2 19 | Ckap2l 20 | Ckap2 21 | Aurkb 22 | Bub1 23 | Kif11 24 | Anp32e 25 | Tubb4b 26 | Gtse1 27 | Kif20b 28 | Hjurp 29 | Cdca3 30 | Hn1 31 | Cdc20 32 | Ttk 33 | Cdc25c 34 | Kif2c 35 | Rangap1 36 | Ncapd2 37 | Dlgap5 38 | Cdca2 39 | Cdca8 40 | Ect2 41 | Kif23 42 | Hmmr 43 | Aurka 44 | Psrc1 45 | Anln 46 | Lbr 47 | Ckap5 48 | Cenpe 49 | Ctcf 50 | Nek2 51 | G2e3 52 | Gas2l3 53 | Cbx5 54 | Cenpa 55 | -------------------------------------------------------------------------------- /scib/resources/g2m_genes_tirosh_hm.txt: -------------------------------------------------------------------------------- 1 | HMGB2 2 | CDK1 3 | NUSAP1 4 | UBE2C 5 | BIRC5 6 | TPX2 7 | TOP2A 8 | NDC80 9 | CKS2 10 | NUF2 11 | CKS1B 12 | MKI67 13 | TMPO 14 | CENPF 15 | TACC3 16 | FAM64A 17 | SMC4 18 | CCNB2 19 | CKAP2L 20 | CKAP2 21 | AURKB 22 | BUB1 23 | KIF11 24 | ANP32E 25 | TUBB4B 26 | GTSE1 27 | KIF20B 28 | HJURP 29 | CDCA3 30 | HN1 31 | CDC20 32 | TTK 33 | CDC25C 34 | KIF2C 35 | RANGAP1 36 | NCAPD2 37 | DLGAP5 38 | CDCA2 39 | CDCA8 40 | ECT2 41 | KIF23 42 | HMMR 43 | AURKA 44 | PSRC1 45 | ANLN 46 | LBR 47 | CKAP5 48 | CENPE 49 | CTCF 50 | NEK2 51 | G2E3 52 | GAS2L3 53 | CBX5 54 | CENPA 55 | -------------------------------------------------------------------------------- /scib/resources/s_genes_tirosh.txt: -------------------------------------------------------------------------------- 1 | Mcm5 2 | Pcna 3 | Tyms 4 | Fen1 5 | Mcm2 6 | Mcm4 7 | Rrm1 8 | Ung 9 | Gins2 10 | Mcm6 11 | Cdca7 12 | Dtl 13 | Prim1 14 | Uhrf1 15 | Mlf1ip 16 | Hells 17 | Rfc2 18 | Rpa2 19 | Nasp 20 | Rad51ap1 21 | Gmnn 22 | Wdr76 23 | Slbp 24 | Ccne2 25 | Ubr7 26 | Pold3 27 | Msh2 28 | Atad2 29 | Rad51 30 | Rrm2 31 | Cdc45 32 | Cdc6 33 | Exo1 34 | Tipin 35 | Dscc1 36 | Blm 37 | Casp8ap2 38 | Usp1 39 | Clspn 40 | Pola1 41 | Chaf1b 42 | Brip1 43 | 
E2f8 44 | -------------------------------------------------------------------------------- /scib/resources/s_genes_tirosh_hm.txt: -------------------------------------------------------------------------------- 1 | MCM5 2 | PCNA 3 | TYMS 4 | FEN1 5 | MCM2 6 | MCM4 7 | RRM1 8 | UNG 9 | GINS2 10 | MCM6 11 | CDCA7 12 | DTL 13 | PRIM1 14 | UHRF1 15 | MLF1IP 16 | HELLS 17 | RFC2 18 | RPA2 19 | NASP 20 | RAD51AP1 21 | GMNN 22 | WDR76 23 | SLBP 24 | CCNE2 25 | UBR7 26 | POLD3 27 | MSH2 28 | ATAD2 29 | RAD51 30 | RRM2 31 | CDC45 32 | CDC6 33 | EXO1 34 | TIPIN 35 | DSCC1 36 | BLM 37 | CASP8AP2 38 | USP1 39 | CLSPN 40 | POLA1 41 | CHAF1B 42 | BRIP1 43 | E2F8 44 | -------------------------------------------------------------------------------- /scib/trajectory_inference.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import scanpy as sc 4 | 5 | from . import utils 6 | 7 | 8 | def paga(adata, groups="louvain"): 9 | """Compute PAGA for ``groups`` and plot the PAGA graph on top of a UMAP embedding""" 10 | utils.check_adata(adata) 11 | 12 | sc.pp.neighbors(adata) 13 | sc.tl.paga(adata, groups=groups) 14 | _ = sc.pl.paga_compare(adata, show=False) 15 | 16 | fig1, ax1 = plt.subplots() 17 | sc.pl.umap(adata, size=40, ax=ax1, show=False) 18 | sc.pl.paga( 19 | adata, 20 | pos=adata.uns["paga"]["pos"], 21 | show=False, 22 | node_size_scale=10, 23 | node_size_power=1, 24 | ax=ax1, 25 | text_kwds={"alpha": 0}, 26 | ) 27 | plt.show() 28 | 29 | 30 | def dpt(adata, group, root, opt="min", comp=0): 31 | utils.check_adata(adata) 32 | 33 | # TODO compute diffmap before 34 | 35 | # get root 36 | stem_mask = np.isin(adata.obs[group], root) 37 | if opt == "min": 38 | opt_stem_id = np.argmin(adata.obsm["X_diffmap"][stem_mask, comp]) 39 | elif opt == "max": 40 | opt_stem_id = np.argmax(adata.obsm["X_diffmap"][stem_mask, comp]) 41 | else: 42 | raise ValueError(f"invalid optimum: {opt}") 43 | root_id = np.arange(len(stem_mask))[stem_mask][opt_stem_id] 44 | adata.uns["iroot"] = root_id 45 | # compute pseudotime 46 | sc.tl.dpt(adata) 47 | -------------------------------------------------------------------------------- /scib/utils.py: -------------------------------------------------------------------------------- 1 | import anndata 2 | 3 | 4 | # checker functions for data sanity 5 | def check_adata(adata): 6 | if type(adata) is not anndata.AnnData: 7 | raise TypeError("Input is not a valid AnnData object") 8 | 9 | 10 | def check_batch(batch, obs, verbose=False): 11 | if batch not in obs: 12 | raise ValueError(f"column {batch} is not in obs") 13 | elif verbose: 14 | print(f"Object contains {obs[batch].nunique()} batches.") 15 | 16 | 17 | def check_hvg(hvg, adata_var): 18 | if type(hvg) is not list: 19 | raise TypeError("HVG list is not a list") 20 | else: 21 | if not all(i in adata_var.index for i in hvg): 22 | raise ValueError("Not all HVGs are in the adata object") 23 | 24 | 25 | def check_sanity(adata, batch, hvg): 26 | check_adata(adata) 27 | check_batch(batch, adata.obs) 28 | if hvg is not None: 29 | check_hvg(hvg, adata.var) 30 | 31 | 32 | def split_batches(adata, batch, hvg=None, return_categories=False): 33 | """Split batches and preserve category information 34 | 35 | :param adata: AnnData object to split 36 | :param batch: name of column in ``adata.obs``. The data type of the column must be of ``Category``. 
37 | :param hvg: list of highly variable genes 38 | :param return_categories: whether to return the categories object of ``batch`` 39 | """ 40 | split = [] 41 | batch_categories = adata.obs[batch].cat.categories 42 | if hvg is not None: 43 | adata = adata[:, hvg] 44 | for i in batch_categories: 45 | split.append(adata[adata.obs[batch] == i].copy()) 46 | if return_categories: 47 | return split, batch_categories 48 | return split 49 | 50 | 51 | def merge_adata(*adata_list, **kwargs): 52 | """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns 53 | 54 | :param adata_list: ``anndata`` objects to be concatenated 55 | :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate`` 56 | """ 57 | 58 | if len(adata_list) == 1: 59 | return adata_list[0] 60 | 61 | # Make sure that adatas do not contain duplicate columns 62 | for _adata in adata_list: 63 | for attr in ("obs", "var"): 64 | df = getattr(_adata, attr) 65 | dup_mask = df.columns.duplicated() 66 | if dup_mask.any(): 67 | print( 68 | f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`." 69 | ) 70 | setattr(_adata, attr, df.loc[:, ~dup_mask]) 71 | 72 | return anndata.AnnData.concatenate(*adata_list, **kwargs) 73 | 74 | 75 | def todense(adata): 76 | import scipy 77 | 78 | if isinstance(adata.X, scipy.sparse.csr_matrix): 79 | adata.X = adata.X.todense() 80 | --------------------------------------------------------------------------------