├── .history
    ├── Cell type Annotation
    │   ├── finetune cell type classification cross_20230826230604.py
    │   └── finetune cell type classification cross_20250115140910.py
    ├── installation_baselines
    │   ├── Dockerfile_20241228134847
    │   └── Dockerfile_20241228134950
    ├── readme_20231228192429.md
    ├── readme_20240311112645.md
    ├── readme_20240311112739.md
    ├── readme_20240311112740.md
    ├── readme_20240311112848.md
    ├── readme_20240311112907.md
    ├── readme_20240311112908.md
    ├── readme_20241130154021.md
    ├── readme_20241130154116.md
    ├── readme_20241210100101.md
    ├── readme_20241210100332.md
    ├── readme_20241228135005.md
    ├── readme_20241228135006.md
    ├── sceval_lib_20240311171801.py
    ├── sceval_lib_20240317220609.py
    ├── sceval_lib_20240317220632.py
    ├── sceval_method_20240311113014.py
    ├── sceval_method_20240311113104.py
    ├── sceval_method_20240311113148.py
    ├── sceval_method_20240311113334.py
    ├── sceval_method_20240311113451.py
    ├── sceval_method_20240311113518.py
    ├── sceval_method_20240311113519.py
    ├── sceval_method_20240311113920.py
    ├── sceval_method_20240311113921.py
    ├── sceval_method_20240312125931.py
    ├── sceval_method_20240312130054.py
    ├── sceval_method_20240312130115.py
    ├── sceval_method_20240312130116.py
    ├── sceval_method_20240312130143.py
    ├── sceval_method_20240312130144.py
    ├── sceval_method_20240312130145.py
    ├── sceval_method_20240317220612.py
    └── sceval_method_20240317220624.py
├── Batch Effect Correction
    ├── bec_cellplm.py
    ├── bec_geneformer.py
    ├── bec_scf.sh
    ├── bec_scim.py
    ├── bec_tgpt.py
    ├── bec_uce.sh
    ├── finetune batch effect correction official.ipynb
    ├── finetune batch effect correction.ipynb
    ├── sceval_batcheffect.py
    ├── sceval_batcheffect.sh
    ├── sceval_batcheffect_official.py
    └── sceval_batcheffect_scgpt.py
├── Cell type Annotation
    ├── cta_cellm.sh
    ├── cta_geneformer.py
    ├── cta_scbert.py
    ├── cta_scf.py
    ├── cta_scim.py
    ├── cta_tgpt_uce.py
    ├── finetune cell type classification cross.py
    ├── finetune cell-type annotation official scgpt.py
    ├── finetune cell-type annotation official.ipynb
    └── finetune cell-type annotation official.py
├── Dockerfile
├── Gene Network Analysis
    ├── GRN Inference example scGPT.ipynb
    ├── gna_geneformer.py
    ├── gna_scf.sh
    ├── grn.py
    ├── sceval_gna1.py
    ├── sceval_gna2.py
    └── sceval_gna_selfdefineEval.py
├── Gene function preiction
    ├── Gene Function Prediction.ipynb
    ├── gfp_geneformer.py
    ├── sceval_gfp.py
    └── sceval_gfp_scgpt.py
├── Imputation
    ├── imp_cellplm.py
    ├── sceval_singlecell_imputation.py
    ├── sceval_spatial_imputation_finetuning.py
    ├── sceval_spatial_imputation_zero_shots.py
    ├── spatial imputation mouse zeroshots.ipynb
    └── spatial imputation mouse.ipynb
├── Multi-omic data integration
    ├── finetune multiomics official.py
    ├── finetune multiomics.ipynb
    ├── finetune multiomics.py
    └── multiomic integration official.ipynb
├── Perturbation Prediction
    ├── finetune perturb seq.ipynb
    ├── finetune perturbation prediction official.py
    ├── perturbation prediction official.ipynb
    ├── peturbation prediction.py
    ├── pp_scf.sh
    └── pp_uce_tgpt_scim.py
├── Scaling
    ├── emergent_ability.md
    └── vanilla_NN.py
├── Simulation
    ├── Test simulation.ipynb
    └── sceval_simulation.py
├── installation_baselines
    ├── Dockerfile
    ├── GeneCompass.yml
    ├── cellm.yml
    ├── cellplm.yml
    ├── geneformer.yml
    ├── scFoundation.yml
    ├── scbert.yml
    ├── scgpt.yml
    ├── scimilarity.yml
    ├── tgpt.yml
    └── uce.yml
├── readme.md
├── sceval_lib.py
├── sceval_method.py
├── scgpt.yml
├── scgpt_bench.yml
├── scib
    ├── __init__.py
    ├── _package_tools.py
    ├── exceptions.py
    ├── integration.py
    ├── knn_graph
    │   ├── .gitignore
    │   ├── README.md
    │   ├── knn_graph.cpp
    │   └── makefile
    ├── metrics
    │   ├── __init__.py
    │   ├── ari.py
    │   ├── cell_cycle.py
    │   ├── clustering.py
    │   ├── graph_connectivity.py
    │   ├── highly_variable_genes.py
    │   ├── isolated_labels.py
    │   ├── kbet.py
    │   ├── lisi.py
    │   ├── metrics.py
    │   ├── nmi.py
    │   ├── pcr.py
    │   ├── silhouette.py
    │   ├── trajectory.py
    │   └── utils.py
    ├── preprocessing.py
    ├── resources
    │   ├── g2m_genes_tirosh.txt
    │   ├── g2m_genes_tirosh_hm.txt
    │   ├── s_genes_tirosh.txt
    │   └── s_genes_tirosh_hm.txt
    ├── trajectory_inference.py
    └── utils.py
└── scjoint.py


/.history/installation_baselines/Dockerfile_20241228134847:
--------------------------------------------------------------------------------
 1 | # Start from a Miniconda image
 2 | FROM continuumio/miniconda3:latest
 3 | 
 4 | # Create a working directory
 5 | WORKDIR /app
 6 | 
 7 | # Copy the environment.yml file into the container
 8 | COPY scgpt_bench.yml .
 9 | 
10 | # Create the environment
11 | # Using `mamba` here for faster installations; it's included in newer images.
12 | RUN conda install -n base -c conda-forge mamba && \
13 |     mamba env create -f scgpt_bench.yml
14 | 
15 | # Activate the environment by default
16 | # The conda environment will be located at /opt/conda/envs/myenv
17 | ENV PATH /opt/conda/envs/myenv/bin:$PATH
18 | 
19 | # Clean up conda cache to reduce image size
20 | RUN conda clean --all --yes
21 | 
22 | # (Optional) Set a default command to start a shell
23 | CMD ["/bin/bash"]
24 | 


--------------------------------------------------------------------------------
/.history/installation_baselines/Dockerfile_20241228134950:
--------------------------------------------------------------------------------
 1 | # Start from a Miniconda image
 2 | FROM continuumio/miniconda3:latest
 3 | 
 4 | # Create a working directory
 5 | WORKDIR /app
 6 | 
 7 | # Copy the environment.yml file into the container
 8 | COPY scgpt_bench.yml .
 9 | 
10 | # Create the environment
11 | # Using `mamba` here for faster installations; it's included in newer images.
12 | RUN conda install -n base -c conda-forge mamba && \
13 |     mamba env create -f scgpt_bench.yml
14 | 
15 | # Activate the environment by default
16 | # The conda environment will be located at /opt/conda/envs/myenv
17 | ENV PATH /opt/conda/envs/myenv/bin:$PATH
18 | 
19 | # Clean up conda cache to reduce image size
20 | RUN conda clean --all --yes
21 | 
22 | # (Optional) Set a default command to start a shell
23 | CMD ["/bin/bash"]
24 | 


--------------------------------------------------------------------------------
/.history/readme_20231228192429.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until Dec 1st, 2023. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | 
30 | # Pre-training weights
31 | 
32 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
33 | 
34 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
35 | 
36 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
37 | 
38 | # Benchmarking information
39 | 
40 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
41 | 
42 | | Tasks                                                 | Metrics                                  |
43 | |-------------------------------------------------------|------------------------------------------|
44 | | Batch Effect Correction, Multi-omics Data Integration |
45 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
46 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
47 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
48 | | Perturbation Prediction                               | Correlation                              |
49 | | Gene Network Analysis                                 | Jaccard similarity                       |
50 | 
51 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
52 | 
53 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
54 | 
55 | ```
56 | python sceval_batcheffect.py
57 | ```
58 | 
59 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
60 | 
61 | To avoid using wandb, please set:
62 | 
63 | ```
64 | os.environ["WANDB_MODE"] = "offline"
65 | ```
66 | 
67 | # Results
68 | 
69 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
70 | 
71 | # Contact
72 | 
73 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
74 | 
75 | # Citation
76 | 
77 | ```
78 | @article {Liu2023.09.08.555192,
79 | 	author = {Tianyu Liu and Kexing Li and Yuge Wang and Hongyu Li and Hongyu Zhao},
80 | 	title = {Evaluating the Utilities of Large Language Models in Single-cell Data Analysis},
81 | 	elocation-id = {2023.09.08.555192},
82 | 	year = {2023},
83 | 	doi = {10.1101/2023.09.08.555192},
84 | 	publisher = {Cold Spring Harbor Laboratory},
85 | 	URL = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192},
86 | 	eprint = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192.full.pdf},
87 | 	journal = {bioRxiv}
88 | }
89 | ```


--------------------------------------------------------------------------------
/.history/readme_20240311112645.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | 
30 | # Pre-training weights
31 | 
32 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
33 | 
34 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
35 | 
36 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
37 | 
38 | # Benchmarking information
39 | 
40 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
41 | 
42 | | Tasks                                                 | Metrics                                  |
43 | |-------------------------------------------------------|------------------------------------------|
44 | | Batch Effect Correction, Multi-omics Data Integration |
45 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
46 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
47 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
48 | | Perturbation Prediction                               | Correlation                              |
49 | | Gene Network Analysis                                 | Jaccard similarity                       |
50 | 
51 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
52 | 
53 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
54 | 
55 | ```
56 | python sceval_batcheffect.py
57 | ```
58 | 
59 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
60 | 
61 | To avoid using wandb, please set:
62 | 
63 | ```
64 | os.environ["WANDB_MODE"] = "offline"
65 | ```
66 | 
67 | # Results
68 | 
69 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
70 | 
71 | # Contact
72 | 
73 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
74 | 
75 | # Citation
76 | 
77 | ```
78 | @article {Liu2023.09.08.555192,
79 | 	author = {Tianyu Liu and Kexing Li and Yuge Wang and Hongyu Li and Hongyu Zhao},
80 | 	title = {Evaluating the Utilities of Large Language Models in Single-cell Data Analysis},
81 | 	elocation-id = {2023.09.08.555192},
82 | 	year = {2023},
83 | 	doi = {10.1101/2023.09.08.555192},
84 | 	publisher = {Cold Spring Harbor Laboratory},
85 | 	URL = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192},
86 | 	eprint = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192.full.pdf},
87 | 	journal = {bioRxiv}
88 | }
89 | ```


--------------------------------------------------------------------------------
/.history/readme_20240311112739.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | We will release a version of sceval with more functions in the future!
30 | 
31 | 
32 | # Pre-training weights
33 | 
34 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
35 | 
36 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
37 | 
38 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
39 | 
40 | # Benchmarking information
41 | 
42 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
43 | 
44 | | Tasks                                                 | Metrics                                  |
45 | |-------------------------------------------------------|------------------------------------------|
46 | | Batch Effect Correction, Multi-omics Data Integration |
47 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
48 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
49 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
50 | | Perturbation Prediction                               | Correlation                              |
51 | | Gene Network Analysis                                 | Jaccard similarity                       |
52 | 
53 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
54 | 
55 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
56 | 
57 | ```
58 | python sceval_batcheffect.py
59 | ```
60 | 
61 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
62 | 
63 | To avoid using wandb, please set:
64 | 
65 | ```
66 | os.environ["WANDB_MODE"] = "offline"
67 | ```
68 | 
69 | # Results
70 | 
71 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
72 | 
73 | # Contact
74 | 
75 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
76 | 
77 | # Citation
78 | 
79 | ```
80 | @article {Liu2023.09.08.555192,
81 | 	author = {Tianyu Liu and Kexing Li and Yuge Wang and Hongyu Li and Hongyu Zhao},
82 | 	title = {Evaluating the Utilities of Large Language Models in Single-cell Data Analysis},
83 | 	elocation-id = {2023.09.08.555192},
84 | 	year = {2023},
85 | 	doi = {10.1101/2023.09.08.555192},
86 | 	publisher = {Cold Spring Harbor Laboratory},
87 | 	URL = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192},
88 | 	eprint = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192.full.pdf},
89 | 	journal = {bioRxiv}
90 | }
91 | ```


--------------------------------------------------------------------------------
/.history/readme_20240311112740.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | We will release a version of sceval with more functions in the future!
30 | 
31 | 
32 | # Pre-training weights
33 | 
34 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
35 | 
36 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
37 | 
38 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
39 | 
40 | # Benchmarking information
41 | 
42 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
43 | 
44 | | Tasks                                                 | Metrics                                  |
45 | |-------------------------------------------------------|------------------------------------------|
46 | | Batch Effect Correction, Multi-omics Data Integration |
47 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
48 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
49 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
50 | | Perturbation Prediction                               | Correlation                              |
51 | | Gene Network Analysis                                 | Jaccard similarity                       |
52 | 
53 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
54 | 
55 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
56 | 
57 | ```
58 | python sceval_batcheffect.py
59 | ```
60 | 
61 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
62 | 
63 | To avoid using wandb, please set:
64 | 
65 | ```
66 | os.environ["WANDB_MODE"] = "offline"
67 | ```
68 | 
69 | # Results
70 | 
71 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
72 | 
73 | # Contact
74 | 
75 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
76 | 
77 | # Citation
78 | 
79 | ```
80 | @article {Liu2023.09.08.555192,
81 | 	author = {Tianyu Liu and Kexing Li and Yuge Wang and Hongyu Li and Hongyu Zhao},
82 | 	title = {Evaluating the Utilities of Large Language Models in Single-cell Data Analysis},
83 | 	elocation-id = {2023.09.08.555192},
84 | 	year = {2023},
85 | 	doi = {10.1101/2023.09.08.555192},
86 | 	publisher = {Cold Spring Harbor Laboratory},
87 | 	URL = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192},
88 | 	eprint = {https://www.biorxiv.org/content/early/2023/09/08/2023.09.08.555192.full.pdf},
89 | 	journal = {bioRxiv}
90 | }
91 | ```


--------------------------------------------------------------------------------
/.history/readme_20240311112848.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | We will release a version of sceval with more functions in the future!
30 | 
31 | 
32 | # Pre-training weights
33 | 
34 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
35 | 
36 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
37 | 
38 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
39 | 
40 | # Benchmarking information
41 | 
42 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
43 | 
44 | | Tasks                                                 | Metrics                                  |
45 | |-------------------------------------------------------|------------------------------------------|
46 | | Batch Effect Correction, Multi-omics Data Integration |
47 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
48 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
49 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
50 | | Perturbation Prediction                               | Correlation                              |
51 | | Gene Network Analysis                                 | Jaccard similarity                       |
52 | 
53 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
54 | 
55 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
56 | 
57 | ```
58 | python sceval_batcheffect.py
59 | ```
60 | 
61 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
62 | 
63 | To avoid using wandb, please set:
64 | 
65 | ```
66 | os.environ["WANDB_MODE"] = "offline"
67 | ```
68 | 
69 | # Results
70 | 
71 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
72 | 
73 | # Contact
74 | 
75 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
76 | 
77 | # Citation
78 | 
79 | ```
80 | @article{liu2023evaluating,
81 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
82 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
83 |   journal={bioRxiv},
84 |   pages={2023--09},
85 |   year={2023},
86 |   publisher={Cold Spring Harbor Laboratory}
87 | }
88 | ```


--------------------------------------------------------------------------------
/.history/readme_20240311112907.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | We will release a version of sceval with more functions in the future!
30 | 
31 | 
32 | # Pre-training weights
33 | 
34 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
35 | 
36 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
37 | 
38 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
39 | 
40 | # Benchmarking information
41 | 
42 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
43 | 
44 | | Tasks                                                 | Metrics                                  |
45 | |-------------------------------------------------------|------------------------------------------|
46 | | Batch Effect Correction, Multi-omics Data Integration |
47 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
48 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
49 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
50 | | Perturbation Prediction                               | Correlation                              |
51 | | Gene Network Analysis                                 | Jaccard similarity                       |
52 | 
53 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
54 | 
55 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
56 | 
57 | ```
58 | python sceval_batcheffect.py
59 | ```
60 | 
61 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
62 | 
63 | To avoid using wandb, please set:
64 | 
65 | ```
66 | os.environ["WANDB_MODE"] = "offline"
67 | ```
68 | 
69 | # Results
70 | 
71 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
72 | 
73 | # Contact
74 | 
75 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
76 | 
77 | # Citation
78 | 
79 | ```
80 | @article{liu2023evaluating,
81 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
82 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
83 |   journal={bioRxiv},
84 |   pages={2023--09},
85 |   year={2023},
86 |   publisher={Cold Spring Harbor Laboratory}
87 | }
88 | ```


--------------------------------------------------------------------------------
/.history/readme_20240311112908.md:
--------------------------------------------------------------------------------
 1 | # scEval😈: A evaluation platform for single-cell Foundation Models (FMs)
 2 | 
 3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
 4 | 
 5 | # Install
 6 | 
 7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create a environment based on this yml file in your own machine:
 8 | ```
 9 | conda env create -n scgpt --file scgpt_bench.yml
10 | ```
11 | 
12 | For other methods we used, please refer their original project website for instruction. We recommend creating different environment for different methods.
13 | 
14 | These methods include: 
15 | 
16 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE). These are also single-cell FMs.
17 | 
18 | And
19 | 
20 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
21 | 
22 | 
23 | We need scIB for evaluation. Please use pip to install it:
24 | ```
25 | pip install scib
26 | ```
27 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correclty.
28 | 
29 | We will release a version of sceval with more functions in the future!
30 | 
31 | 
32 | # Pre-training weights
33 | 
34 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation.
35 | 
36 | Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity).
37 | 
38 | scFoundation relies on the APIs for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details.
39 | 
40 | # Benchmarking information
41 | 
42 | Please refer different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
43 | 
44 | | Tasks                                                 | Metrics                                  |
45 | |-------------------------------------------------------|------------------------------------------|
46 | | Batch Effect Correction, Multi-omics Data Integration |
47 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
48 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
49 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
50 | | Perturbation Prediction                               | Correlation                              |
51 | | Gene Network Analysis                                 | Jaccard similarity                       |
52 | 
53 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
54 | 
55 | To run the codes in different tasks, please use (we choose batch effect correction as an example here):
56 | 
57 | ```
58 | python sceval_batcheffect.py
59 | ```
60 | 
61 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
62 | 
63 | To avoid using wandb, please set:
64 | 
65 | ```
66 | os.environ["WANDB_MODE"] = "offline"
67 | ```
68 | 
69 | # Results
70 | 
71 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
72 | 
73 | # Contact
74 | 
75 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
76 | 
77 | # Citation
78 | 
79 | ```
80 | @article{liu2023evaluating,
81 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
82 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
83 |   journal={bioRxiv},
84 |   pages={2023--09},
85 |   year={2023},
86 |   publisher={Cold Spring Harbor Laboratory}
87 | }
88 | ```


--------------------------------------------------------------------------------
/.history/readme_20241130154021.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
  4 | 
  5 | # Install
  6 | 
  7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
  8 | ```
  9 | conda env create -n scgpt --file scgpt_bench.yml
 10 | ```
 11 | 
 12 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 13 | 
 14 | ```
 15 | conda activate scgpt
 16 | conda env update --file scgpt_bench.yml
 17 | ```
 18 | 
 19 | For other methods we used, please refer to their original project website for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files we used to install these models in the folder **installation_baselines**.
 20 | 
 21 | These methods include: 
 22 | 
 23 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 24 | 
 25 | And
 26 | 
 27 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 28 | 
 29 | 
 30 | We need scIB for evaluation. Please use pip to install it:
 31 | ```
 32 | pip install scib
 33 | ```
 34 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 35 | 
 36 | We will release a version of scEval with more functions in the future!
 37 | 
 38 | 
 39 | # Pre-training weights
 40 | 
 41 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 42 | 
 43 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 44 | 
 45 | # Benchmarking information
 46 | 
 47 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 48 | 
 49 | | Tasks                                                 | Metrics                                  |
 50 | |-------------------------------------------------------|------------------------------------------|
 51 | | Batch Effect Correction, Multi-omics Data Integration |
 52 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 53 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 54 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 55 | | Perturbation Prediction                               | Correlation                              |
 56 | | Gene Network Analysis                                 | Jaccard similarity                       |
 57 | 
 58 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 59 | 
 60 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 61 | 
 62 | ```
 63 | python sceval_batcheffect.py
 64 | ```
 65 | 
 66 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 67 | 
 68 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
 69 | 
 70 | To avoid using wandb, please set:
 71 | 
 72 | ```
 73 | os.environ["WANDB_MODE"] = "offline"
 74 | 
 75 | ```
 76 | 
 77 | We will upload our codes for benchmarking different foundation models soon.
 78 | 
 79 | # Devices
 80 | 
 81 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 82 | 
 83 | # Results
 84 | 
 85 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
 86 | 
 87 | # Contact
 88 | 
 89 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
 90 | 
 91 | # Citation
 92 | 
 93 | ```
 94 | @article{liu2023evaluating,
 95 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
 96 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
 97 |   journal={bioRxiv},
 98 |   pages={2023--09},
 99 |   year={2023},
100 |   publisher={Cold Spring Harbor Laboratory}
101 | }
102 | ```


--------------------------------------------------------------------------------
/.history/readme_20241130154116.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until March 1st, 2024. 
  4 | 
  5 | # Install
  6 | 
  7 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
  8 | ```
  9 | conda env create -n scgpt --file scgpt_bench.yml
 10 | ```
 11 | 
 12 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 13 | 
 14 | ```
 15 | conda activate scgpt
 16 | conda env update --file scgpt_bench.yml
 17 | ```
 18 | 
 19 | For other methods we used, please refer to their original project website for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files we used to install these models in the folder **installation_baselines**.
 20 | 
 21 | These methods include: 
 22 | 
 23 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 24 | 
 25 | And
 26 | 
 27 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 28 | 
 29 | 
 30 | We need scIB for evaluation. Please use pip to install it:
 31 | ```
 32 | pip install scib
 33 | ```
 34 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 35 | 
 36 | We will release a version of scEval with more functions in the future!
 37 | 
 38 | 
 39 | # Pre-training weights
 40 | 
 41 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 42 | 
 43 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 44 | 
 45 | # Benchmarking information
 46 | 
 47 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 48 | 
 49 | | Tasks                                                 | Metrics                                  |
 50 | |-------------------------------------------------------|------------------------------------------|
 51 | | Batch Effect Correction, Multi-omics Data Integration |
 52 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 53 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 54 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 55 | | Perturbation Prediction                               | Correlation                              |
 56 | | Gene Network Analysis                                 | Jaccard similarity                       |
 57 | 
 58 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 59 | 
 60 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 61 | 
 62 | ```
 63 | python sceval_batcheffect.py
 64 | ```
 65 | 
 66 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 67 | 
 68 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
 69 | 
 70 | To avoid using wandb, please set:
 71 | 
 72 | ```
 73 | os.environ["WANDB_MODE"] = "offline"
 74 | 
 75 | ```
 76 | 
 77 | We will upload our codes for benchmarking different foundation models soon.
 78 | 
 79 | # Devices
 80 | 
 81 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 82 | 
 83 | # Results
 84 | 
 85 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
 86 | 
 87 | # Contact
 88 | 
 89 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
 90 | 
 91 | # Citation
 92 | 
 93 | ```
 94 | @article{liu2023evaluating,
 95 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
 96 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
 97 |   journal={bioRxiv},
 98 |   pages={2023--09},
 99 |   year={2023},
100 |   publisher={Cold Spring Harbor Laboratory}
101 | }
102 | ```


--------------------------------------------------------------------------------
/.history/readme_20241210100101.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until Dec 1st, 2024. 
  4 | 
  5 | News: We are collaborating with [OpenProblems](https://openproblems.bio/) to make this benchmark alive! Stay tuned and we will update the benchmarking results soon!
  6 | 
  7 | # Install
  8 | 
  9 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
 10 | ```
 11 | conda env create -n scgpt --file scgpt_bench.yml
 12 | ```
 13 | 
 14 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 15 | 
 16 | ```
 17 | conda activate scgpt
 18 | conda env update --file scgpt_bench.yml
 19 | ```
 20 | 
 21 | For other methods we used, please refer to their original project website for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files we used to install these models in the folder **installation_baselines**.
 22 | 
 23 | These methods include: 
 24 | 
 25 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 26 | 
 27 | And
 28 | 
 29 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 30 | 
 31 | 
 32 | We need scIB for evaluation. Please use pip to install it:
 33 | ```
 34 | pip install scib
 35 | ```
 36 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 37 | 
 38 | We will release a version of scEval with more functions in the future!
 39 | 
 40 | 
 41 | # Pre-training weights
 42 | 
 43 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 44 | 
 45 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 46 | 
 47 | # Benchmarking information
 48 | 
 49 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 50 | 
 51 | | Tasks                                                 | Metrics                                  |
 52 | |-------------------------------------------------------|------------------------------------------|
 53 | | Batch Effect Correction, Multi-omics Data Integration |
 54 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 55 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 56 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 57 | | Perturbation Prediction                               | Correlation                              |
 58 | | Gene Network Analysis                                 | Jaccard similarity                       |
 59 | 
 60 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 61 | 
 62 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 63 | 
 64 | ```
 65 | python sceval_batcheffect.py
 66 | ```
 67 | 
 68 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 69 | 
 70 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://drive.google.com/drive/folders/1YvBQ44H_jzhS8B35mPjpCMwQserLLhZs?usp=sharing).
 71 | 
 72 | To avoid using wandb, please set:
 73 | 
 74 | ```
 75 | os.environ["WANDB_MODE"] = "offline"
 76 | 
 77 | ```
 78 | 
 79 | We will upload our codes for benchmarking different foundation models soon.
 80 | 
 81 | # Devices
 82 | 
 83 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 84 | 
 85 | # Results
 86 | 
 87 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
 88 | 
 89 | # Contact
 90 | 
 91 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
 92 | 
 93 | # Citation
 94 | 
 95 | ```
 96 | @article{liu2023evaluating,
 97 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
 98 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
 99 |   journal={bioRxiv},
100 |   pages={2023--09},
101 |   year={2023},
102 |   publisher={Cold Spring Harbor Laboratory}
103 | }
104 | ```


--------------------------------------------------------------------------------
/.history/readme_20241210100332.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until Dec 1st, 2024. 
  4 | 
  5 | News: We are collaborating with [OpenProblems](https://openproblems.bio/) to make this benchmark alive! Stay tuned and we will update the benchmarking results soon!
  6 | 
  7 | # Install
  8 | 
  9 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
 10 | ```
 11 | conda env create -n scgpt --file scgpt_bench.yml
 12 | ```
 13 | 
 14 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 15 | 
 16 | ```
 17 | conda activate scgpt
 18 | conda env update --file scgpt_bench.yml
 19 | ```
 20 | 
 21 | For other methods we used, please refer to their original project website for instructions. We recommend creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files we used to install these models in the folder **installation_baselines**.
 22 | 
 23 | These methods include: 
 24 | 
 25 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 26 | 
 27 | And
 28 | 
 29 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 30 | 
 31 | 
 32 | We need scIB for evaluation. Please use pip to install it:
 33 | ```
 34 | pip install scib
 35 | ```
 36 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 37 | 
 38 | We will release a version of scEval with more functions in the future!
 39 | 
 40 | 
 41 | # Pre-training weights
 42 | 
 43 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 44 | 
 45 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 46 | 
 47 | # Benchmarking information
 48 | 
 49 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 50 | 
 51 | | Tasks                                                 | Metrics                                  |
 52 | |-------------------------------------------------------|------------------------------------------|
 53 | | Batch Effect Correction, Multi-omics Data Integration |
 54 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 55 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 56 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 57 | | Perturbation Prediction                               | Correlation, Mean Squared Error                              |
 58 | | Gene Network Analysis                                 | Jaccard similarity                       |
 59 | 
 60 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 61 | 
 62 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 63 | 
 64 | ```
 65 | python sceval_batcheffect.py
 66 | ```
 67 | 
 68 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 69 | 
 70 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew).
 71 | 
 72 | To avoid using wandb, please set:
 73 | 
 74 | ```
 75 | os.environ["WANDB_MODE"] = "offline"
 76 | 
 77 | ```
 78 | 
 79 | We will upload our codes for benchmarking different foundation models soon.
 80 | 
 81 | # Devices
 82 | 
 83 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 84 | 
 85 | # Results
 86 | 
 87 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
 88 | 
 89 | # Contact
 90 | 
 91 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
 92 | 
 93 | # Citation
 94 | 
 95 | ```
 96 | @article{liu2023evaluating,
 97 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
 98 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
 99 |   journal={bioRxiv},
100 |   pages={2023--09},
101 |   year={2023},
102 |   publisher={Cold Spring Harbor Laboratory}
103 | }
104 | ```


--------------------------------------------------------------------------------
/.history/readme_20241228135005.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until Dec 1st, 2024. 
  4 | 
  5 | News: We are collaborating with [OpenProblems](https://openproblems.bio/) to make this benchmark alive! Stay tuned and we will update the benchmarking results soon!
  6 | 
  7 | # Install
  8 | 
  9 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
 10 | ```
 11 | conda env create -n scgpt --file scgpt_bench.yml
 12 | ```
 13 | 
 14 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 15 | 
 16 | ```
 17 | conda activate scgpt
 18 | conda env update --file scgpt_bench.yml
 19 | ```
 20 | 
 21 | We also provide docker installation, please use (need gpu):
 22 | 
 23 | ```
 24 | docker build -t my-conda-image .
 25 | ```
 26 | 
 27 | To activate it, please use:
 28 | 
 29 | ```
 30 | docker run -it --rm my-conda-image
 31 | ```
 32 | 
 33 | For other methods we used, please refer to their original project website for instructions. We recommend  creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files and an example of Dockerfile we used to install these models in the folder **installation_baselines**.
 34 | 
 35 | These methods include: 
 36 | 
 37 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 38 | 
 39 | And
 40 | 
 41 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 42 | 
 43 | 
 44 | We need scIB for evaluation. Please use pip to install it:
 45 | ```
 46 | pip install scib
 47 | ```
 48 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 49 | 
 50 | We will release a version of scEval with more functions in the future!
 51 | 
 52 | 
 53 | # Pre-training weights
 54 | 
 55 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 56 | 
 57 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 58 | 
 59 | # Benchmarking information
 60 | 
 61 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 62 | 
 63 | | Tasks                                                 | Metrics                                  |
 64 | |-------------------------------------------------------|------------------------------------------|
 65 | | Batch Effect Correction, Multi-omics Data Integration |
 66 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 67 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 68 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 69 | | Perturbation Prediction                               | Correlation, Mean Squared Error                              |
 70 | | Gene Network Analysis                                 | Jaccard similarity                       |
 71 | 
 72 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 73 | 
 74 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 75 | 
 76 | ```
 77 | python sceval_batcheffect.py
 78 | ```
 79 | 
 80 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 81 | 
 82 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew).
 83 | 
 84 | To avoid using wandb, please set:
 85 | 
 86 | ```
 87 | os.environ["WANDB_MODE"] = "offline"
 88 | 
 89 | ```
 90 | 
 91 | We will upload our codes for benchmarking different foundation models soon.
 92 | 
 93 | # Devices
 94 | 
 95 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 96 | 
 97 | # Results
 98 | 
 99 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
100 | 
101 | # Contact
102 | 
103 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
104 | 
105 | # Citation
106 | 
107 | ```
108 | @article{liu2023evaluating,
109 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
110 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
111 |   journal={bioRxiv},
112 |   pages={2023--09},
113 |   year={2023},
114 |   publisher={Cold Spring Harbor Laboratory}
115 | }
116 | ```


--------------------------------------------------------------------------------
/.history/readme_20241228135006.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until Dec 1st, 2024. 
  4 | 
  5 | News: We are collaborating with [OpenProblems](https://openproblems.bio/) to make this benchmark alive! Stay tuned and we will update the benchmarking results soon!
  6 | 
  7 | # Install
  8 | 
  9 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
 10 | ```
 11 | conda env create -n scgpt --file scgpt_bench.yml
 12 | ```
 13 | 
 14 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 15 | 
 16 | ```
 17 | conda activate scgpt
 18 | conda env update --file scgpt_bench.yml
 19 | ```
 20 | 
 21 | We also provide docker installation, please use (need gpu):
 22 | 
 23 | ```
 24 | docker build -t my-conda-image .
 25 | ```
 26 | 
 27 | To activate it, please use:
 28 | 
 29 | ```
 30 | docker run -it --rm my-conda-image
 31 | ```
 32 | 
 33 | For other methods we used, please refer to their original project website for instructions. We recommend  creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files and an example of Dockerfile we used to install these models in the folder **installation_baselines**.
 34 | 
 35 | These methods include: 
 36 | 
 37 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 38 | 
 39 | And
 40 | 
 41 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 42 | 
 43 | 
 44 | We need scIB for evaluation. Please use pip to install it:
 45 | ```
 46 | pip install scib
 47 | ```
 48 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 49 | 
 50 | We will release a version of scEval with more functions in the future!
 51 | 
 52 | 
 53 | # Pre-training weights
 54 | 
 55 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 56 | 
 57 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 58 | 
 59 | # Benchmarking information
 60 | 
 61 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 62 | 
 63 | | Tasks                                                 | Metrics                                  |
 64 | |-------------------------------------------------------|------------------------------------------|
 65 | | Batch Effect Correction, Multi-omics Data Integration |
 66 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 67 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 68 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 69 | | Perturbation Prediction                               | Correlation, Mean Squared Error                              |
 70 | | Gene Network Analysis                                 | Jaccard similarity                       |
 71 | 
 72 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 73 | 
 74 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 75 | 
 76 | ```
 77 | python sceval_batcheffect.py
 78 | ```
 79 | 
 80 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 81 | 
 82 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew).
 83 | 
 84 | To avoid using wandb, please set:
 85 | 
 86 | ```
 87 | os.environ["WANDB_MODE"] = "offline"
 88 | 
 89 | ```
 90 | 
 91 | We will upload our codes for benchmarking different foundation models soon.
 92 | 
 93 | # Devices
 94 | 
 95 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 96 | 
 97 | # Results
 98 | 
 99 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
100 | 
101 | # Contact
102 | 
103 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
104 | 
105 | # Citation
106 | 
107 | ```
108 | @article{liu2023evaluating,
109 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
110 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
111 |   journal={bioRxiv},
112 |   pages={2023--09},
113 |   year={2023},
114 |   publisher={Cold Spring Harbor Laboratory}
115 | }
116 | ```


--------------------------------------------------------------------------------
/.history/sceval_lib_20240311171801.py:
--------------------------------------------------------------------------------
  1 | import AnnData
  2 | import torch
  3 | import numpy as np
  4 | import scib
  5 | import scanpy as sc
  6 | import scipy
  7 | import scipy.stats
  8 | from scgpt.utils import set_seed
  9 | from sklearn.metrics import classification_report
 10 | from typing import List, Tuple, Dict, Union, Optional
 11 | 
 12 | set_seed(0)
 13 | def eval_scib_metrics(
 14 |     adata: AnnData,
 15 |     batch_key: str = "batch",
 16 |     label_key: str = "celltype",
 17 |     emb_name: str = "X_scGPT",
 18 |     notes: Optional[str] = None,
 19 | ) -> Dict:
 20 |     results = scib.metrics.metrics(
 21 |         adata,
 22 |         adata_int=adata,
 23 |         batch_key=batch_key,
 24 |         label_key=label_key,
 25 |         embed=emb_name,
 26 |         isolated_labels_asw_=False,
 27 |         silhouette_=True,
 28 |         hvg_score_=False,
 29 |         graph_conn_=True,
 30 |         pcr_=True,
 31 |         isolated_labels_f1_=False,
 32 |         trajectory_=False,
 33 |         nmi_=True,  
 34 |         ari_=True, 
 35 |         cell_cycle_=False,
 36 |         kBET_=True,  
 37 |         ilisi_=False,
 38 |         clisi_=False,
 39 |     )
 40 | 
 41 |     result_dict = results[0].to_dict()
 42 | 
 43 |     result_dict["avg_bio"] = np.mean(
 44 |         [
 45 |             result_dict["NMI_cluster/label"],
 46 |             result_dict["ARI_cluster/label"],
 47 |             result_dict["ASW_label"],
 48 |         ]
 49 |     )
 50 | 
 51 |     # remove nan value in result_dict
 52 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 53 | 
 54 |     print(results)
 55 |     return result_dict
 56 | 
 57 | 
 58 | def eval_scib_metrics_onlybio(
 59 |     adata: AnnData,
 60 |     batch_key: str = "batch",
 61 |     label_key: str = "celltype",
 62 |     emb_name: str = "X_scGPT",
 63 |     notes: Optional[str] = None,
 64 | ) -> Dict:
 65 |     results = scib.metrics.metrics_onlybio(
 66 |         adata,
 67 |         adata_int=adata,
 68 |         batch_key=batch_key,
 69 |         label_key=label_key,
 70 |         embed=emb_name,
 71 |         isolated_labels_asw_=False,
 72 |         silhouette_=True,
 73 |         hvg_score_=False,
 74 |         graph_conn_=True,
 75 |         pcr_=True,
 76 |         isolated_labels_f1_=False,
 77 |         trajectory_=False,
 78 |         nmi_=True,  
 79 |         ari_=True,  
 80 |         cell_cycle_=False,
 81 |         kBET_=False,  
 82 |         ilisi_=False,
 83 |         clisi_=False,
 84 |     )
 85 | 
 86 |     result_dict = results[0].to_dict()
 87 |     result_dict["avg_bio"] = np.mean(
 88 |         [
 89 |             result_dict["NMI_cluster/label"],
 90 |             result_dict["ARI_cluster/label"],
 91 |             result_dict["ASW_label"],
 92 |         ]
 93 |     )
 94 | 
 95 |     # remove nan value in result_dict
 96 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 97 | 
 98 |     print(results)
 99 |     return result_dict
100 | 
101 | def calculate_correlation_metric(y1, y2):
102 |     cor = 0.0
103 |     y1 = y1.float()
104 |     y2 = y2.float()
105 |     for id1, id2 in zip(y1, y2):
106 |         
107 |         cor_cal,_ = scipy.stats.pearsonr(id1,id2)
108 |         cor += cor_cal.item()
109 |     return cor
110 | 
111 | 
112 | class scEval(object):
113 | 
114 |     def __init__(self, adata):
115 |         self.label = 'scGPT'
116 |         self.adata = adata # adata is the output of the model you plan to benchmark.
117 |         self.pvalue = 0.005
118 | 
119 |     def evaluation_bec(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
120 |         results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
121 |         return results
122 |     
123 | 
124 |     def evaluation_cta_gfp(self, pred_label, true_label):
125 |         results = classification_report(pred_label, true_label, digits=4)
126 |         return results
127 |     
128 |     def evaluation_perturb_pred(self, pred_model, true_result): #assume the outputs are both in AnnData format. Rows are cells while columns are genes.
129 |         cor_total = calculate_correlation_metric(pred_model.X.T, true_result.X.T)
130 |         return {"correlation":cor_total / len(pred_model.X.T)}
131 |     
132 |     def evaluation_perturb_pred_gearsofficial(self, gears_model, pred_model ):
133 |         from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis
134 |         test_res = evaluate(gears_model.dataloader['test_loader'], pred_model)
135 |         test_metrics, test_pert_res = compute_metrics(test_res)
136 |         return test_metrics
137 |     
138 |     def evaluation_imputation_scrna(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
139 |         results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
140 |         return results
141 |     
142 |     def evaluation_imputation_spatial(self, adata_sp):
143 |         adata_imp_new = self.adata[:, adata_sp.var_names]
144 |         cor_list = []
145 |         pval_list = []
146 |         for item in adata_sp.var_names:
147 |             adata1 = adata_sp[:,item]
148 |             adata2 = adata_imp_new[:,item]
149 |             cor, pval = scipy.stats.pearsonr(np.array(adata1.X.todense().T)[0], np.array(adata2.X.T)[0]) # for this step, please check the data form
150 |             cor_list.append(cor)
151 |             pval_list.append(pval)
152 | 
153 |         adata_imp_new.var['cor'] = cor_list 
154 |         adata_imp_new.var['pval'] = pval_list
155 | 
156 |         mean_cor = np.mean(adata_imp_new.var['cor'].values)
157 | 
158 |         avg_sig = np.sum(adata_imp_new.var['pval'].values<self.pvalue)/len(adata_imp_new.var['pval'].values)
159 |         return {"mean_cor":mean_cor, "avg_sign":avg_sig} 
160 |     
161 |     def evaluation_simulation(self, batch_key = 'batch',label_key = 'celltype', isbatch = True, emb_name = 'X_scGPT'):
162 | 
163 |         if isbatch:
164 |             results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
165 |             return results 
166 |         else:
167 |             results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
168 |             return results             
169 | 
170 | 
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/.history/sceval_lib_20240317220609.py:
--------------------------------------------------------------------------------
  1 | from anndata import AnnData
  2 | import torch
  3 | import numpy as np
  4 | import scib
  5 | import scanpy as sc
  6 | import scipy
  7 | import scipy.stats
  8 | from scgpt.utils import set_seed
  9 | from sklearn.metrics import classification_report
 10 | from typing import List, Tuple, Dict, Union, Optional
 11 | 
 12 | set_seed(0)
 13 | def eval_scib_metrics(
 14 |     adata: AnnData,
 15 |     batch_key: str = "batch",
 16 |     label_key: str = "celltype",
 17 |     emb_name: str = "X_scGPT",
 18 |     notes: Optional[str] = None,
 19 | ) -> Dict:
 20 |     results = scib.metrics.metrics(
 21 |         adata,
 22 |         adata_int=adata,
 23 |         batch_key=batch_key,
 24 |         label_key=label_key,
 25 |         embed=emb_name,
 26 |         isolated_labels_asw_=False,
 27 |         silhouette_=True,
 28 |         hvg_score_=False,
 29 |         graph_conn_=True,
 30 |         pcr_=True,
 31 |         isolated_labels_f1_=False,
 32 |         trajectory_=False,
 33 |         nmi_=True,  
 34 |         ari_=True, 
 35 |         cell_cycle_=False,
 36 |         kBET_=True,  
 37 |         ilisi_=False,
 38 |         clisi_=False,
 39 |     )
 40 | 
 41 |     result_dict = results[0].to_dict()
 42 | 
 43 |     result_dict["avg_bio"] = np.mean(
 44 |         [
 45 |             result_dict["NMI_cluster/label"],
 46 |             result_dict["ARI_cluster/label"],
 47 |             result_dict["ASW_label"],
 48 |         ]
 49 |     )
 50 | 
 51 |     # remove nan value in result_dict
 52 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 53 | 
 54 |     print(results)
 55 |     return result_dict
 56 | 
 57 | 
 58 | def eval_scib_metrics_onlybio(
 59 |     adata: AnnData,
 60 |     batch_key: str = "batch",
 61 |     label_key: str = "celltype",
 62 |     emb_name: str = "X_scGPT",
 63 |     notes: Optional[str] = None,
 64 | ) -> Dict:
 65 |     results = scib.metrics.metrics_onlybio(
 66 |         adata,
 67 |         adata_int=adata,
 68 |         batch_key=batch_key,
 69 |         label_key=label_key,
 70 |         embed=emb_name,
 71 |         isolated_labels_asw_=False,
 72 |         silhouette_=True,
 73 |         hvg_score_=False,
 74 |         graph_conn_=True,
 75 |         pcr_=True,
 76 |         isolated_labels_f1_=False,
 77 |         trajectory_=False,
 78 |         nmi_=True,  
 79 |         ari_=True,  
 80 |         cell_cycle_=False,
 81 |         kBET_=False,  
 82 |         ilisi_=False,
 83 |         clisi_=False,
 84 |     )
 85 | 
 86 |     result_dict = results[0].to_dict()
 87 |     result_dict["avg_bio"] = np.mean(
 88 |         [
 89 |             result_dict["NMI_cluster/label"],
 90 |             result_dict["ARI_cluster/label"],
 91 |             result_dict["ASW_label"],
 92 |         ]
 93 |     )
 94 | 
 95 |     # remove nan value in result_dict
 96 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 97 | 
 98 |     print(results)
 99 |     return result_dict
100 | 
101 | def calculate_correlation_metric(y1, y2):
102 |     cor = 0.0
103 |     y1 = y1.float()
104 |     y2 = y2.float()
105 |     for id1, id2 in zip(y1, y2):
106 |         
107 |         cor_cal,_ = scipy.stats.pearsonr(id1,id2)
108 |         cor += cor_cal.item()
109 |     return cor
110 | 
111 | 
112 | class scEval(object):
113 | 
114 |     def __init__(self, adata):
115 |         self.label = 'scGPT'
116 |         self.adata = adata # adata is the output of the model you plan to benchmark.
117 |         self.pvalue = 0.005
118 | 
119 |     def evaluation_bec(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
120 |         results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
121 |         return results
122 |     
123 | 
124 |     def evaluation_cta_gfp(self, pred_label, true_label):
125 |         results = classification_report(pred_label, true_label, digits=4)
126 |         return results
127 |     
128 |     def evaluation_perturb_pred(self, pred_model, true_result): #assume the outputs are both in AnnData format. Rows are cells while columns are genes.
129 |         cor_total = calculate_correlation_metric(pred_model.X.T, true_result.X.T)
130 |         return {"correlation":cor_total / len(pred_model.X.T)}
131 |     
132 |     def evaluation_perturb_pred_gearsofficial(self, gears_model, pred_model ):
133 |         from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis
134 |         test_res = evaluate(gears_model.dataloader['test_loader'], pred_model)
135 |         test_metrics, test_pert_res = compute_metrics(test_res)
136 |         return test_metrics
137 |     
138 |     def evaluation_imputation_scrna(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
139 |         results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
140 |         return results
141 |     
142 |     def evaluation_imputation_spatial(self, adata_sp):
143 |         adata_imp_new = self.adata[:, adata_sp.var_names]
144 |         cor_list = []
145 |         pval_list = []
146 |         for item in adata_sp.var_names:
147 |             adata1 = adata_sp[:,item]
148 |             adata2 = adata_imp_new[:,item]
149 |             cor, pval = scipy.stats.pearsonr(np.array(adata1.X.todense().T)[0], np.array(adata2.X.T)[0]) # for this step, please check the data form
150 |             cor_list.append(cor)
151 |             pval_list.append(pval)
152 | 
153 |         adata_imp_new.var['cor'] = cor_list 
154 |         adata_imp_new.var['pval'] = pval_list
155 | 
156 |         mean_cor = np.mean(adata_imp_new.var['cor'].values)
157 | 
158 |         avg_sig = np.sum(adata_imp_new.var['pval'].values<self.pvalue)/len(adata_imp_new.var['pval'].values)
159 |         return {"mean_cor":mean_cor, "avg_sign":avg_sig} 
160 |     
161 |     def evaluation_simulation(self, batch_key = 'batch',label_key = 'celltype', isbatch = True, emb_name = 'X_scGPT'):
162 | 
163 |         if isbatch:
164 |             results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
165 |             return results 
166 |         else:
167 |             results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
168 |             return results             
169 | 
170 | 
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/.history/sceval_lib_20240317220632.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | import scib
  4 | import scanpy as sc
  5 | import scipy
  6 | import scipy.stats
  7 | from scgpt.utils import set_seed
  8 | from anndata import AnnData
  9 | from sklearn.metrics import classification_report
 10 | from typing import List, Tuple, Dict, Union, Optional
 11 | 
 12 | set_seed(0)
 13 | def eval_scib_metrics(
 14 |     adata: AnnData,
 15 |     batch_key: str = "batch",
 16 |     label_key: str = "celltype",
 17 |     emb_name: str = "X_scGPT",
 18 |     notes: Optional[str] = None,
 19 | ) -> Dict:
 20 |     results = scib.metrics.metrics(
 21 |         adata,
 22 |         adata_int=adata,
 23 |         batch_key=batch_key,
 24 |         label_key=label_key,
 25 |         embed=emb_name,
 26 |         isolated_labels_asw_=False,
 27 |         silhouette_=True,
 28 |         hvg_score_=False,
 29 |         graph_conn_=True,
 30 |         pcr_=True,
 31 |         isolated_labels_f1_=False,
 32 |         trajectory_=False,
 33 |         nmi_=True,  
 34 |         ari_=True, 
 35 |         cell_cycle_=False,
 36 |         kBET_=True,  
 37 |         ilisi_=False,
 38 |         clisi_=False,
 39 |     )
 40 | 
 41 |     result_dict = results[0].to_dict()
 42 | 
 43 |     result_dict["avg_bio"] = np.mean(
 44 |         [
 45 |             result_dict["NMI_cluster/label"],
 46 |             result_dict["ARI_cluster/label"],
 47 |             result_dict["ASW_label"],
 48 |         ]
 49 |     )
 50 | 
 51 |     # remove nan value in result_dict
 52 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 53 | 
 54 |     print(results)
 55 |     return result_dict
 56 | 
 57 | 
 58 | def eval_scib_metrics_onlybio(
 59 |     adata: AnnData,
 60 |     batch_key: str = "batch",
 61 |     label_key: str = "celltype",
 62 |     emb_name: str = "X_scGPT",
 63 |     notes: Optional[str] = None,
 64 | ) -> Dict:
 65 |     results = scib.metrics.metrics_onlybio(
 66 |         adata,
 67 |         adata_int=adata,
 68 |         batch_key=batch_key,
 69 |         label_key=label_key,
 70 |         embed=emb_name,
 71 |         isolated_labels_asw_=False,
 72 |         silhouette_=True,
 73 |         hvg_score_=False,
 74 |         graph_conn_=True,
 75 |         pcr_=True,
 76 |         isolated_labels_f1_=False,
 77 |         trajectory_=False,
 78 |         nmi_=True,  
 79 |         ari_=True,  
 80 |         cell_cycle_=False,
 81 |         kBET_=False,  
 82 |         ilisi_=False,
 83 |         clisi_=False,
 84 |     )
 85 | 
 86 |     result_dict = results[0].to_dict()
 87 |     result_dict["avg_bio"] = np.mean(
 88 |         [
 89 |             result_dict["NMI_cluster/label"],
 90 |             result_dict["ARI_cluster/label"],
 91 |             result_dict["ASW_label"],
 92 |         ]
 93 |     )
 94 | 
 95 |     # remove nan value in result_dict
 96 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 97 | 
 98 |     print(results)
 99 |     return result_dict
100 | 
101 | def calculate_correlation_metric(y1, y2):
102 |     cor = 0.0
103 |     y1 = y1.float()
104 |     y2 = y2.float()
105 |     for id1, id2 in zip(y1, y2):
106 |         
107 |         cor_cal,_ = scipy.stats.pearsonr(id1,id2)
108 |         cor += cor_cal.item()
109 |     return cor
110 | 
111 | 
112 | class scEval(object):
113 | 
114 |     def __init__(self, adata):
115 |         self.label = 'scGPT'
116 |         self.adata = adata # adata is the output of the model you plan to benchmark.
117 |         self.pvalue = 0.005
118 | 
119 |     def evaluation_bec(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
120 |         results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
121 |         return results
122 |     
123 | 
124 |     def evaluation_cta_gfp(self, pred_label, true_label):
125 |         results = classification_report(pred_label, true_label, digits=4)
126 |         return results
127 |     
128 |     def evaluation_perturb_pred(self, pred_model, true_result): #assume the outputs are both in AnnData format. Rows are cells while columns are genes.
129 |         cor_total = calculate_correlation_metric(pred_model.X.T, true_result.X.T)
130 |         return {"correlation":cor_total / len(pred_model.X.T)}
131 |     
132 |     def evaluation_perturb_pred_gearsofficial(self, gears_model, pred_model ):
133 |         from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis
134 |         test_res = evaluate(gears_model.dataloader['test_loader'], pred_model)
135 |         test_metrics, test_pert_res = compute_metrics(test_res)
136 |         return test_metrics
137 |     
138 |     def evaluation_imputation_scrna(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
139 |         results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
140 |         return results
141 |     
142 |     def evaluation_imputation_spatial(self, adata_sp):
143 |         adata_imp_new = self.adata[:, adata_sp.var_names]
144 |         cor_list = []
145 |         pval_list = []
146 |         for item in adata_sp.var_names:
147 |             adata1 = adata_sp[:,item]
148 |             adata2 = adata_imp_new[:,item]
149 |             cor, pval = scipy.stats.pearsonr(np.array(adata1.X.todense().T)[0], np.array(adata2.X.T)[0]) # for this step, please check the data form
150 |             cor_list.append(cor)
151 |             pval_list.append(pval)
152 | 
153 |         adata_imp_new.var['cor'] = cor_list 
154 |         adata_imp_new.var['pval'] = pval_list
155 | 
156 |         mean_cor = np.mean(adata_imp_new.var['cor'].values)
157 | 
158 |         avg_sig = np.sum(adata_imp_new.var['pval'].values<self.pvalue)/len(adata_imp_new.var['pval'].values)
159 |         return {"mean_cor":mean_cor, "avg_sign":avg_sig} 
160 |     
161 |     def evaluation_simulation(self, batch_key = 'batch',label_key = 'celltype', isbatch = True, emb_name = 'X_scGPT'):
162 | 
163 |         if isbatch:
164 |             results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
165 |             return results 
166 |         else:
167 |             results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
168 |             return results             
169 | 
170 | 
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113014.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloWorldLTY/scEval/46bd453c54d53fbc73408032fdcb45ea32255401/.history/sceval_method_20240311113014.py


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113104.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | 
12 | class scEval_bench(object):
13 | 
14 |     def __init__(self, adata):
15 |         self.label = 'scGPT'
16 |         self.adata = adata # adata is the output of the model you plan to benchmark.
17 |         self.pvalue = 0.005
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113148.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | 
14 | class scEval_bench(object):
15 | 
16 |     def __init__(self, adata):
17 |         self.label = 'scGPT'
18 |         self.adata = adata # adata is raw data file for evaluating.
19 |         self.pvalue = 0.005
20 |     
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113334.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | 
14 | class scEval_bench(object):
15 | 
16 |     def __init__(self, adata):
17 |         self.label = 'scGPT'
18 |         self.adata = adata # adata is raw data file for evaluating.
19 |         self.pvalue = 0.005
20 | 
21 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
22 |         pass
23 | 
24 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
25 |         pass
26 | 
27 |     def mdi_glue(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
28 |         pass
29 | 
30 |     def mdi_scjoint(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
31 |         pass
32 | 
33 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
34 |         pass
35 | 
36 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
37 |         pass
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113451.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | 
14 | class scEval_bench(object):
15 | 
16 |     def __init__(self, adata):
17 |         self.label = 'scGPT'
18 |         self.adata = adata # adata is raw data file for evaluating.
19 |         self.pvalue = 0.005
20 | 
21 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
22 |         pass
23 | 
24 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
25 |         pass
26 | 
27 |     def mdi_glue(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
28 |         pass
29 | 
30 |     def mdi_scjoint(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
31 |         pass
32 | 
33 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
34 |         pass
35 | 
36 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
37 |         pass
38 | 
39 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
40 |         pass
41 | 
42 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
43 |         pass
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113518.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | 
14 | class scEval_bench(object):
15 | 
16 |     def __init__(self, adata):
17 |         self.label = 'scGPT'
18 |         self.adata = adata # adata is raw data file for evaluating.
19 |         self.pvalue = 0.005
20 | 
21 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
22 |         pass
23 | 
24 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
25 |         pass
26 | 
27 |     def mdi_glue(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
28 |         pass
29 | 
30 |     def mdi_scjoint(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
31 |         pass
32 | 
33 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
34 |         pass
35 | 
36 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
37 |         pass
38 | 
39 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
40 |         pass
41 | 
42 |     def pert_gears(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
43 |         pass
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113519.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | 
14 | class scEval_bench(object):
15 | 
16 |     def __init__(self, adata):
17 |         self.label = 'scGPT'
18 |         self.adata = adata # adata is raw data file for evaluating.
19 |         self.pvalue = 0.005
20 | 
21 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
22 |         pass
23 | 
24 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
25 |         pass
26 | 
27 |     def mdi_glue(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
28 |         pass
29 | 
30 |     def mdi_scjoint(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
31 |         pass
32 | 
33 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
34 |         pass
35 | 
36 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
37 |         pass
38 | 
39 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
40 |         pass
41 | 
42 |     def pert_gears(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
43 |         pass
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113920.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | '''
14 | To evaluate simulation, we used scDesign3 and Splatter, which are both designed in R. Please install R and refer their tutorials to run their codes.
15 | '''
16 | class scEval_bench(object):
17 | 
18 |     def __init__(self, adata):
19 |         self.label = 'scGPT'
20 |         self.adata = adata # adata is raw data file for evaluating.
21 |         self.pvalue = 0.005
22 | 
23 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
24 |         pass
25 | 
26 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
27 |         pass
28 | 
29 |     def mdi_glue(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
30 |         pass
31 | 
32 |     def mdi_scjoint(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
33 |         pass
34 | 
35 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
36 |         pass
37 | 
38 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
39 |         pass
40 | 
41 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
42 |         pass
43 | 
44 |     def pert_gears(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
45 |         pass
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240311113921.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict, Union, Optional
 2 | import AnnData
 3 | import torch
 4 | import numpy as np
 5 | import scib
 6 | import scanpy as sc
 7 | import scipy
 8 | from scgpt.utils import set_seed
 9 | from sklearn.metrics import classification_report
10 | import scipy.stats
11 | import scvi 
12 | 
13 | '''
14 | To evaluate simulation, we used scDesign3 and Splatter, which are both designed in R. Please install R and refer their tutorials to run their codes.
15 | '''
16 | class scEval_bench(object):
17 | 
18 |     def __init__(self, adata):
19 |         self.label = 'scGPT'
20 |         self.adata = adata # adata is raw data file for evaluating.
21 |         self.pvalue = 0.005
22 | 
23 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
24 |         pass
25 | 
26 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
27 |         pass
28 | 
29 |     def mdi_glue(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
30 |         pass
31 | 
32 |     def mdi_scjoint(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
33 |         pass
34 | 
35 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
36 |         pass
37 | 
38 |     def cta_tosica(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
39 |         pass
40 | 
41 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
42 |         pass
43 | 
44 |     def pert_gears(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
45 |         pass
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/.history/sceval_method_20240312125931.py:
--------------------------------------------------------------------------------
  1 | import AnnData
  2 | import torch
  3 | import numpy as np
  4 | import scib
  5 | import scanpy as sc
  6 | import scipy
  7 | import scipy.stats
  8 | import scvi 
  9 | import scglue
 10 | import networkx as nx
 11 | import TOSICA
 12 | import scanpy as sc
 13 | import numpy as np
 14 | import pandas as pd
 15 | import tangram as tg
 16 | 
 17 | from typing import List, Tuple, Dict, Union, Optional
 18 | from ResPAN import run_respan
 19 | from itertools import chain
 20 | from anndata import AnnData
 21 | from gears import PertData, GEARS
 22 | from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis
 23 | 
 24 | '''
 25 | To evaluate simulation, we used scDesign3 and Splatter, which are both designed in R. Please install R and refer their tutorials to run their codes.
 26 | '''
 27 | class scEval_bench(object):
 28 | 
 29 |     def __init__(self, adata):
 30 |         self.label = 'scGPT'
 31 |         self.adata = adata # adata is raw data file for evaluating.
 32 |         self.pvalue = 0.005
 33 | 
 34 |     def bec_scvi(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scvi'):
 35 |         adata = self.adata
 36 |         adata.raw = adata  # keep full dimension safe
 37 |         sc.pp.highly_variable_genes(
 38 |             adata,
 39 |             flavor="seurat_v3",
 40 |             n_top_genes=2000,
 41 |             layer="counts",
 42 |             batch_key="batch",
 43 |             subset=True,
 44 |         )
 45 |         scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")
 46 |         model = scvi.model.SCVI(adata, n_layers=2, n_latent=30, gene_likelihood="nb")
 47 |         model.train()
 48 |         adata.obsm[emb_name] = model.get_latent_representation()
 49 |         return adata
 50 | 
 51 | 
 52 | 
 53 |     def bec_respan(self, batch_key = 'batch',label_key = 'celltype'):
 54 | 
 55 |         adata = self.adata
 56 |         # pre-processing
 57 |         sc.pp.filter_cells(adata, min_genes=200)
 58 |         sc.pp.filter_genes(adata, min_cells=3)
 59 |         adata.X = adata.X.toarray().astype('float')
 60 |         sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
 61 |         sc.pp.log1p(adata)
 62 |         sc.pp.highly_variable_genes(adata, n_top_genes=2000, batch_key='batch')
 63 |         adata = adata[:, adata.var['highly_variable']]
 64 |         # check if data is in sparse format
 65 |         if isinstance(adata.X, scipy.sparse.csr.csr_matrix): 
 66 |             adata_new = sc.AnnData(adata.X.todense())
 67 |             adata_new.obs = adata.obs.copy()
 68 |             adata_new.obs_names = adata.obs_names
 69 |             adata_new.var_names = adata.var_names
 70 |             adata_new.obs_names.name = 'CellID'
 71 |             adata_new.var_names.name = 'Gene'
 72 |             del adata
 73 |             adata = adata_new
 74 | 
 75 |         adata_new = run_respan(adata, batch_key=batch_key, epoch=300, batch=1024, reduction='pca', subsample=3000, seed=999)
 76 |         return adata_new
 77 | 
 78 |     # here atac data should contain peak infromation.
 79 |     def mdi_glue(self, rna, atac, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_glue'):
 80 |         result_folder = './'
 81 |         # Data preprocessing
 82 |         sc.pp.filter_cells(rna, min_genes=200)
 83 |         sc.pp.filter_genes(rna, min_cells=3)
 84 |         rna.layers["counts"] = rna.X.copy()
 85 |         sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 86 |         sc.pp.normalize_total(rna)
 87 |         sc.pp.log1p(rna)
 88 |         sc.pp.scale(rna)
 89 |         sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 90 |         sc.pp.filter_cells(atac, min_genes=200)
 91 |         sc.pp.filter_genes(atac, min_cells=3)
 92 |         scglue.data.lsi(atac, n_components=100, n_iter=15)
 93 | 
 94 |         # Graph construction
 95 |         guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 96 | 
 97 |         # Configure data
 98 |         scglue.models.configure_dataset(
 99 |             rna, "NB", use_highly_variable=True,
100 |             use_layer="counts", use_rep="X_pca"
101 |         )
102 |         scglue.models.configure_dataset(
103 |             atac, "NB", use_highly_variable=True,
104 |             use_rep="X_lsi"
105 |         )
106 | 
107 |         guidance_hvf = guidance.subgraph(chain(
108 |             rna.var.query("highly_variable").index,
109 |             atac.var.query("highly_variable").index
110 |         )).copy()
111 | 
112 |         # Run GLUE
113 |         glue = scglue.models.fit_SCGLUE(
114 |             {"rna": rna, "atac": atac}, guidance_hvf,
115 |             fit_kws={"directory": result_folder}
116 |         )
117 |         glue.save("%s/glue.dill" % result_folder)
118 | 
119 |         # Check integration consistency
120 |         dx = scglue.models.integration_consistency(
121 |             glue, {"rna": rna, "atac": atac}, guidance_hvf
122 |         )
123 |         print(dx)
124 | 
125 |         # KNN classifier
126 |         rna.obsm[emb_name] = glue.encode_data("rna", rna)
127 |         atac.obsm[emb_name] = glue.encode_data("atac", atac)
128 | 
129 |         return rna, atac
130 | 
131 |     def mdi_scjoint(self, rna_path, atac_path, result_folder, subset_rna, subset_atac, rna_new_annot, stage1_lr, stage3_lr, nepoch):
132 |         from scjoint import run_scJoint
133 |         run_scJoint(rna_path, atac_path, result_folder, subset_rna, subset_atac, rna_new_annot, stage1_lr, stage3_lr, nepoch)
134 | 
135 | 
136 |     def cta_tosica(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype'):
137 |         TOSICA.train(ref_data, gmt_path='human_gobp', label_name=label_key, epochs=3, project = "./")
138 |         model_weight_path = './model-0.pth'
139 |         new_adata = TOSICA.pre(query_data, model_weight_path = model_weight_path, project = "./")
140 |         return new_adata
141 |     
142 |     def imp_tangram(self, ref_data, query_data, batch_key = 'batch',label_key = 'celltype'):
143 | 
144 |         #ensure ref_data has more genes than query_data.
145 |         adata_sc = ref_data 
146 |         adata_st = query_data
147 |         sc.tl.rank_genes_groups(adata_sc, groupby=label_key, use_raw=False)
148 |         markers_df = pd.DataFrame(adata_sc.uns["rank_genes_groups"]["names"]).iloc[0:100, :]
149 |         markers = list(np.unique(markers_df.melt().value.values))
150 |         tg.pp_adatas(adata_sc, adata_st, genes=markers)
151 |         ad_map = tg.map_cells_to_space(adata_sc, adata_st,
152 |             mode="cells",
153 |         #     mode="clusters",
154 |         #     cluster_label='cell_subclass',  # .obs field w cell types
155 |             density_prior='rna_count_based',
156 |             num_epochs=500,
157 |             # device="cuda:0",
158 |             device='cpu',
159 |         )
160 |         ad_ge = tg.project_genes(adata_map=ad_map, adata_sc=adata_sc)
161 |         return ad_ge
162 | 
163 |     def pert_gears(self, batch_key = 'batch',label_key = 'celltype', device='cuda"0', save_path = './'):
164 |         pert_data = self.adata 
165 |         # set up and train a model
166 |         gears_model = GEARS(pert_data, device = device)
167 |         gears_model.model_initialize(hidden_size = 64)
168 |         gears_model.train(epochs = 20)
169 |         gears_model.save_model(save_path)
170 |         gears_model.load_pretrained(save_path)
171 |         test_res = evaluate(gears_model.dataloader['test_loader'], gears_model.model, gears_model.config['uncertainty'], gears_model.device)
172 |         test_metrics, test_pert_res = compute_metrics(test_res)
173 |         return test_metrics
174 | 
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/Batch Effect Correction/bec_cellplm.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | import os
 4 | 
 5 | import hdf5plugin
 6 | import numpy as np
 7 | import anndata as ad
 8 | from scipy.sparse import csr_matrix
 9 | from CellPLM.utils import set_seed
10 | from CellPLM.pipeline.cell_embedding import CellEmbeddingPipeline
11 | import scanpy as sc
12 | import matplotlib.pyplot as plt
13 | import resource
14 | # import rapids_singlecell as rsc  # For faster evaluation, we recommend the installation of rapids_singlecell.
15 | 
16 | ## Specify important parameters before getting started
17 | 
18 | PRETRAIN_VERSION = '20230926_85M'
19 | DEVICE = 'cuda'
20 | 
21 | set_seed(42)
22 | filename = 'benchmark_trajectory'
23 | data = ad.read_h5ad(f"/gpfs/gibbs/pi/zhao/tl688/trajectory_data/{filename}.h5ad")
24 | data.obs_names_make_unique()
25 | 
26 | ## Set up the pipeline
27 | 
28 | pipeline = CellEmbeddingPipeline(pretrain_prefix=PRETRAIN_VERSION, # Specify the pretrain checkpoint to load
29 |                                  pretrain_directory='/gpfs/gibbs/pi/zhao/tl688/CellPLM_cta/ckpt/')
30 | pipeline.model
31 | 
32 | ## Evaluation and Inference
33 | 
34 | embedding = pipeline.predict(data, # An AnnData object
35 |                 device=DEVICE) # Specify a gpu or cpu for model inference
36 | 
37 | data.obsm['emb'] = embedding.cpu().numpy()
38 | print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  / (1e6) )
39 | # resource.getrusage(resource.RUSAGE_SELF).ru_utime
40 | data.write_h5ad(f"./bec_cellplm/{filename}_cellplm.h5ad")
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/Batch Effect Correction/bec_geneformer.py:
--------------------------------------------------------------------------------
 1 | from geneformer import EmbExtractor
 2 | import time
 3 | 
 4 | # initiate EmbExtractor
 5 | embex = EmbExtractor(model_type="CellClassifier",
 6 |                      num_classes=0,
 7 |                      max_ncells=None,
 8 |                      emb_layer=0,
 9 |                      emb_label=["celltype"],
10 |                      forward_batch_size=200,
11 |                      nproc=16)
12 | 
13 | 
14 | # extracts embedding from input data
15 | # input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)
16 | # example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset
17 | embs = embex.extract_embs("./",
18 |                           "./humanpbmc.dataset/",
19 |                           "./",
20 |                           "output_prefix")
21 | 
22 | 
23 | import scanpy as sc
24 | 
25 | adata = sc.read_h5ad("../scGPT/examples/HumanPBMC_raw.h5ad")
26 | 
27 | del embs['celltype']
28 | 
29 | adata.obsm['X_geneformer'] = embs.values
30 | 
31 | adata.write_h5ad("./HumanPBMC_geneformer.h5ad")
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/Batch Effect Correction/bec_scf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # bec
3 | python get_embedding_h5ad.py --task_name benchmark_trajectory --input_type singlecell --output_type cell --pool_type all --tgthighres a5 --data_path "/gpfs/gibbs/pi/zhao/tl688/trajectory_data/benchmark_trajectory.h5ad" --save_path ./examples/bec/ --pre_normalized F --version rde


--------------------------------------------------------------------------------
/Batch Effect Correction/bec_scim.py:
--------------------------------------------------------------------------------
 1 | # Environment settings
 2 | import scanpy as sc
 3 | sc.set_figure_params(dpi=100)
 4 | 
 5 | import warnings
 6 | warnings.filterwarnings('ignore')
 7 | 
 8 | from src.scimilarity.utils import lognorm_counts
 9 | from src.scimilarity import CellAnnotation, align_dataset, CellQuery
10 | 
11 | from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_recall_fscore_support, classification_report
12 | 
13 | # import CellQuery object
14 | annotation_path = './models/annotation_model_v1'
15 | query_path = './models/query_model_v1'
16 | cq = CellQuery(model_path=annotation_path,
17 |                cellsearch_path=query_path)
18 | 
19 | # Load the tutorial data.
20 | # Replace data_path with your local file path.
21 | data_path = "/gpfs/gibbs/pi/zhao/tl688/largedataset/singlecellimmune_covid.h5ad"
22 | adams = sc.read(data_path)
23 | adams.layers['counts'] = adams.X
24 | # adams.var_names = [i.upper() for i in adams.var_names]
25 | 
26 | adams.var_names = list(adams.var['feature_name'])
27 | 
28 | adams = align_dataset(adams, cq.gene_order)
29 | 
30 | adams = lognorm_counts(adams)
31 | 
32 | adams.obsm['X_scimilarity'] = cq.get_embeddings(adams.X)
33 | 
34 | adams.write_h5ad("/gpfs/gibbs/pi/zhao/tl688/scimilarity/bec_result/singlecellimmune_covid_sim.h5ad")


--------------------------------------------------------------------------------
/Batch Effect Correction/bec_tgpt.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import torch
 3 | import numpy as np
 4 | import pandas as pd
 5 | import scanpy as sc
 6 | from tqdm import tqdm
 7 | from torch.utils.data import DataLoader, Dataset
 8 | from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
 9 | 
10 | # Setting parameter and file path
11 | 
12 | device = "cuda" if torch.cuda.is_available() else "cpu" 
13 | tokenizer_file = "lixiangchun/transcriptome-gpt-1024-8-16-64" 
14 | checkpoint = "lixiangchun/transcriptome-gpt-1024-8-16-64" ## Pretrained model
15 | celltype_path = "./data/Muris_cell_labels.txt.gz" ## Cell type annotation
16 | max_len = 64 ## Number of top genes used for analysis
17 | text_file = "./data/Muris_gene_rankings.txt.gz"  ## Gene symbols ranked by exprssion
18 | 
19 | adata = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/largedataset/singlecellimmune_covid.h5ad")
20 | 
21 | adata.var_names = list(adata.var.feature_name)
22 | 
23 | def get_gene_token(adata):
24 |     lines = []
25 |     for i in adata.obs_names:
26 |         adata_t = adata[i,:]
27 |         reverse_index = np.argsort(adata_t.X.toarray()[0])[::-1]
28 |         reverse_index = reverse_index[0:256]
29 |         gene_list = adata_t.var_names.values[reverse_index]
30 |         raw_gene = ''
31 |         for index, gene in enumerate(gene_list):
32 |             raw_gene += gene
33 |             if index != len(gene_list)-1:
34 |                 raw_gene += ' '
35 |         lines.append(raw_gene)
36 |     return lines
37 |     
38 | 
39 | lines = get_gene_token(adata)
40 | 
41 | # Extract features
42 | 
43 | class LineDataset(Dataset):
44 |     def __init__(self, lines):
45 |         self.lines = lines
46 |         self.regex = re.compile(r'\-|\.')
47 |     def __getitem__(self, i):
48 |         return self.regex.sub('_', self.lines[i])
49 |     def __len__(self):
50 |         return len(self.lines)
51 | 
52 | tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_file)
53 | model = GPT2LMHeadModel.from_pretrained(checkpoint,output_hidden_states = True).transformer
54 | model = model.to(device)
55 | model.eval()
56 | 
57 | ds = LineDataset(lines)
58 | dl = DataLoader(ds, batch_size=64)
59 | 
60 | Xs = []
61 | for a in tqdm(dl, total=len(dl)):
62 |     batch = tokenizer(a, max_length= max_len, truncation=True, padding=True, return_tensors="pt")
63 | 
64 |     for k, v in batch.items():
65 |         batch[k] = v.to(device)
66 | 
67 |     with torch.no_grad():
68 |         x = model(**batch)
69 |     
70 |     eos_idxs = batch.attention_mask.sum(dim=1) - 1
71 |     xx = x.last_hidden_state
72 |        
73 |     result_list = [[] for i in range(len(xx))]
74 | 
75 |     for j, item in enumerate(xx):
76 |         result_list[j] = item[1:int(eos_idxs[j]),:].mean(dim =0).tolist()
77 |         
78 |     Xs.extend(result_list)
79 |     
80 | features = np.stack(Xs)
81 | 
82 | adata_test=sc.AnnData(features)
83 | adata_test.obs["batch"] = list(adata.obs["Site"])
84 | adata_test.obs["batch"] = adata_test.obs["batch"].astype("category")
85 | 
86 | adata_test.obs["str_batch"] = adata_test.obs["batch"].astype("category")
87 | adata_test.obs["celltype"] = list(adata.obs["cell_type"])
88 | adata_test.obs["celltype"] = adata_test.obs["celltype"].astype("category")
89 | 
90 | adata_test.write_h5ad('immuneatlas_tgpt_all.h5ad')


--------------------------------------------------------------------------------
/Batch Effect Correction/bec_uce.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #SBATCH --job-name=uce_getemb
4 | #SBATCH --output=uce_getemb.txt
5 | 
6 | #change batch size does not handle large-scale dataset.
7 | 
8 | python eval_single_anndata.py --adata_path "/gpfs/gibbs/pi/zhao/tl688/scgpt_dataset/PBMC368k_raw.h5ad" --dir ./uce_emb_data/ --batch_size 50


--------------------------------------------------------------------------------
/Batch Effect Correction/sceval_batcheffect.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #SBATCH --job-name=ft_job_bec_sceval
4 | #SBATCH --output=ft_job_bec_sceval.txt
5 | python sceval_batcheffect.py --dataset "/gpfs/ysm/pi/zhao/tl688/datasets/HumanPBMC_raw.loom"


--------------------------------------------------------------------------------
/Cell type Annotation/cta_cellm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=celllm_cross_MBSpatial
 3 | #SBATCH --output=celllm_cross_MBSpatial.txt
 4 | DEVICE=0
 5 | python tasks/cell_task/ctc.py \
 6 | --device ${DEVICE} \
 7 | --config_path ./configs/ctc/cellLM.json \
 8 | --dataset MBSpatial_raw \
 9 | --dataset_path ../../scgpt_dataset \
10 | --output_path ../ckpts/finetune_ckpts/celllm_cl_fintune.pth \
11 | --mode train \
12 | --epochs 10 \
13 | --batch_size 3 \
14 | --logging_steps 1000 \
15 | --gradient_accumulation_steps 4 \
16 | --patience 10
17 | 


--------------------------------------------------------------------------------
/Cell type Annotation/cta_scim.py:
--------------------------------------------------------------------------------
 1 | # Environment settings
 2 | import scanpy as sc
 3 | sc.set_figure_params(dpi=100)
 4 | import time
 5 | import warnings
 6 | warnings.filterwarnings('ignore')
 7 | 
 8 | import src.scimilarity
 9 | 
10 | from src.scimilarity.utils import lognorm_counts
11 | from src.scimilarity import CellAnnotation, align_dataset
12 | 
13 | from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_recall_fscore_support, classification_report
14 | 
15 | annotation_path = './models/annotation_model_v1'
16 | ca = CellAnnotation(model_path=annotation_path)
17 | 
18 | 
19 | def get_parameter_number(model):
20 |     total_num = sum(p.numel() for p in model.parameters())
21 |     trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
22 |     return {'Total':total_num, 'Trainable':trainable_num}
23 | 
24 | get_parameter_number(ca.model)
25 | 
26 | 
27 | # Load the tutorial data.
28 | # Replace data_path with your local file path.
29 | # sc.read("/gpfs/gibbs/pi/zhao/tl688/scgpt_dataset/aizarani_liver.h5ad")
30 | t1 = time.time()
31 | data_path = "/gpfs/gibbs/pi/zhao/tl688/scgpt_dataset/aizarani_liver.h5ad"
32 | adams = sc.read(data_path)
33 | adams.layers['counts'] = adams.X
34 | # adams.var_names =[i.upper() for i in adams.var_names]
35 | adams = align_dataset(adams, ca.gene_order)
36 | adams = lognorm_counts(adams)
37 | 
38 | print(adams)
39 | 
40 | adams.obsm['X_scimilarity'] = ca.get_embeddings(adams.X)
41 | 
42 | predictions, nn_idxs, nn_dists, nn_stats = ca.get_predictions_kNN(adams.obsm['X_scimilarity'])
43 | adams.obs['predictions_unconstrained'] = predictions.values
44 | 
45 | celltype_counts = adams.obs.predictions_unconstrained.value_counts()
46 | well_represented_celltypes = celltype_counts[celltype_counts>20].index
47 | t2 = time.time()
48 | 
49 | 
50 | print("time", t2-t1)
51 | import resource
52 | print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  / (1e6) )
53 | import torch
54 | print(torch.cuda.max_memory_allocated()/1024/1024/1024)
55 | 


--------------------------------------------------------------------------------
/Cell type Annotation/cta_tgpt_uce.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from pathlib import Path
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | from scipy.stats import mode
 7 | import scanpy as sc
 8 | import sklearn
 9 | import warnings
10 | 
11 | sys.path.insert(0, "../")
12 | import scgpt as scg
13 | 
14 | # extra dependency for similarity search
15 | try:
16 |     import faiss
17 | 
18 |     faiss_imported = True
19 | except ImportError:
20 |     faiss_imported = False
21 |     print(
22 |         "faiss not installed! We highly recommend installing it for fast similarity search."
23 |     )
24 |     print("To install it, see https://github.com/facebookresearch/faiss/wiki/Installing-Faiss")
25 | 
26 | warnings.filterwarnings("ignore", category=ResourceWarning)
27 | 
28 | ref_embed_adata = sc.read_h5ad("./tgpt_out/spaital_mouse_slideseqv2_tgpt_all.h5ad") # you can change it accordingly
29 | 
30 | ref_embed_adata
31 | 
32 | from sklearn.model_selection import train_test_split
33 | from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_recall_fscore_support, classification_report
34 | 
35 | train_obs,test_obs = train_test_split(
36 |     ref_embed_adata.obs_names, random_state=42
37 | )
38 | 
39 | adata_train = ref_embed_adata[train_obs]
40 | adata_test = ref_embed_adata[test_obs]
41 | 
42 | from sklearn.linear_model import LogisticRegression
43 | clf = LogisticRegression(random_state=0).fit(adata_train.X, adata_train.obs.celltype) # or adata.obsm['emb'] for uce.
44 | 
45 | pred_label = clf.predict(adata_test.X)
46 | true_label = adata_test.obs.celltype
47 | 
48 | print(classification_report(true_label, pred_label, digits=4))


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Start from a Miniconda image
 2 | FROM continuumio/miniconda3:latest
 3 | 
 4 | # Create a working directory
 5 | WORKDIR /app
 6 | 
 7 | # Copy the environment.yml file into the container
 8 | COPY scgpt_bench.yml .
 9 | 
10 | # Create the environment
11 | # Using `mamba` here for faster installations; it's included in newer images.
12 | RUN conda install -n base -c conda-forge mamba && \
13 |     mamba env create -f scgpt_bench.yml
14 | 
15 | # Activate the environment by default
16 | # The conda environment will be located at /opt/conda/envs/myenv
17 | ENV PATH /opt/conda/envs/myenv/bin:$PATH
18 | 
19 | # Clean up conda cache to reduce image size
20 | RUN conda clean --all --yes
21 | 
22 | # (Optional) Set a default command to start a shell
23 | CMD ["/bin/bash"]
24 | 


--------------------------------------------------------------------------------
/Gene Network Analysis/gna_geneformer.py:
--------------------------------------------------------------------------------
 1 | from geneformer import EmbExtractor
 2 | import time
 3 | 
 4 | # initiate EmbExtractor
 5 | t1 = time.time()
 6 | embex = EmbExtractor(
 7 |                      emb_mode='gene',
 8 |                      forward_batch_size=20,
 9 |                      nproc=16
10 | )
11 | 
12 | # extracts embedding from input data
13 | # input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)
14 | # example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset
15 | 
16 | # pip install tdigest
17 | 
18 | embs = embex.extract_embs("./",
19 |                           "./data/datasets/immune_all_human.dataset/",
20 |                           "./humanpbmc/",
21 |                           "output_prefix")
22 | 
23 | 
24 | t2 = time.time()
25 | print(t2 - t1)
26 | import resource
27 | print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  / (1e6) )
28 | import torch
29 | print(torch.cuda.max_memory_allocated()/1024/1024/1024)
30 | 
31 | 
32 | 
33 | embs.to_csv("immune_all_human.csv")


--------------------------------------------------------------------------------
/Gene Network Analysis/gna_scf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Genemodule
3 | python get_embedding_h5ad.py --task_name ihatest --input_type singlecell --output_type gene --pool_type all --tgthighres f1 --data_path "/gpfs/gibbs/pi/zhao/tl688/scgpt_dataset/Immune_ALL_human.h5ad" --save_path ./examples/genemodule/ --pre_normalized F --demo
4 | 


--------------------------------------------------------------------------------
/Gene Network Analysis/sceval_gna_selfdefineEval.py:
--------------------------------------------------------------------------------
 1 | import scanpy as sc
 2 | import numpy as np
 3 | 
 4 | import pandas as pd
 5 | from grn import GeneEmbedding
 6 | import seaborn as sns
 7 | import gseapy as gp
 8 | adata = sc.read_h5ad("pbmc_tissue_gene_embeddings.h5ad")
 9 | 
10 | #marker genes defined by the original paper filtered based on expression profiles.
11 | mkr_set = {'Erythrocytes': ['CST3'],
12 |  'Erythroid progenitors': ['GATA2'],
13 |  'CD10+ B cells': ['MME'],
14 |  'Megakaryocyte progenitors': ['PF4',	'ITGA2B',	'PPBP'],
15 |  'HSPCs': ['CD34',	'PROCR'],
16 |  'Monocyte progenitors': ['IRF8',	'CSF1R',	'LY86'],
17 |  'Plasmacytoid dendritic cells': ['GZMB',	'IL3RA'],
18 |  'CD20+ B cells': ['MS4A1'],
19 |  'Plasma cells': [],
20 |  'Monocyte-derived dendritic cells': ['CD1C','FCER1A'],
21 |  'CD14+ Monocytes': ['CD14'],
22 |  'CD16+ Monocytes': ['FCGR3A'],
23 |  'CD4+ T cells': ['CD4'],
24 |  'CD8+ T cells': ['CD8B', 'CD8A'],
25 |  'NK cells': ['NKG7','GNLY'],
26 |           }
27 | 
28 | makerlist = []
29 | 
30 | for i in adata.obs['gene_name']:
31 |     count = 0
32 |     for ctp in mkr_set.keys():
33 |         if i in mkr_set[ctp]:
34 |             makerlist.append(ctp)
35 |             count = 1
36 |     if count ==0:
37 |         makerlist.append(None)
38 | 
39 | adata.obs['new_marker'] = makerlist
40 | 
41 | sc.pl.umap(adata, color='new_marker', edges=True)
42 | 
43 | 
44 | # specific pathway from scGPT suggestions
45 | mole_list = pd.read_table("Participating Molecules [R-HSA-168256].tsv")
46 | 
47 | mole_list_dnarna = mole_list[ mole_list["MoleculeType"]  == 'DNA/RNA' ]
48 | 
49 | adata_new = adata
50 | cofunction_gene = []
51 | for i in mole_list_dnarna["MoleculeName"].values:
52 |     gene = i.split(' ')[1]
53 |     cofunction_gene.append(gene)
54 | 
55 | adata_HLA = adata_new[[True if ('HLA' in i )  else False for i in adata_new.obs['gene_name'].values]]
56 | adata_CD = adata_new[[True if ('CD' in i) else False for i in adata_new.obs['gene_name'].values]]
57 | 
58 | CD_genes = adata_new.obs['gene_name'].values
59 | 
60 | # Meta info about the number of terms (tests) in the databases
61 | df_database = pd.DataFrame(
62 | data = [['GO_Biological_Process_2021', 6036],
63 | ['GO_Molecular_Function_2021', 1274],
64 | ['Reactome_2022', 1818]],
65 | columns = ['dataset', 'term'])
66 | 
67 | # Select desired database for query; here use Reactome as an example
68 | databases = ['Reactome_2022']
69 | m = df_database[df_database['dataset'].isin(databases)]['term'].sum()
70 | # p-value correction for total number of tests done
71 | p_thresh = 0.05/m
72 | 
73 | # Perform pathway enrichment analysis using the gseapy package in the Reactome database
74 | df = pd.DataFrame()
75 | enr_Reactome = gp.enrichr(gene_list=CD_genes,
76 |                           gene_sets=databases,
77 |                           organism='Human',
78 |                           outdir='test/enr_Reactome',
79 |                           cutoff=0.5)
80 | out = enr_Reactome.results
81 | out = out[out['P-value'] < p_thresh]
82 | df = df.append(out, ignore_index=True)
83 | df
84 | 
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/Imputation/imp_cellplm.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | warnings.filterwarnings("ignore")
  3 | 
  4 | import hdf5plugin
  5 | import numpy as np
  6 | import anndata as ad
  7 | import scanpy as sc
  8 | from scipy.sparse import csr_matrix
  9 | from CellPLM.utils import set_seed
 10 | from CellPLM.utils.data import stratified_sample_genes_by_sparsity
 11 | from CellPLM.pipeline.imputation import ImputationPipeline, ImputationDefaultPipelineConfig, ImputationDefaultModelConfig
 12 | from CellPLM.pipeline.experimental import symbol_to_ensembl
 13 | 
 14 | # !kill -9 3485071
 15 | 
 16 | ## Specify important parameters before getting started
 17 | 
 18 | DATASET = 'Liver' # 'Lung'
 19 | PRETRAIN_VERSION = '20230926_85M'
 20 | DEVICE = 'cuda:0'
 21 | 
 22 | ## Load Downstream Dataset
 23 | set_seed(11)
 24 | 
 25 | ref_data = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/mouse_scrnaseq.h5ad")
 26 | query_data = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/mouse_spatial.h5ad")
 27 | 
 28 | ref_data.var_names = symbol_to_ensembl(ref_data.var_names )
 29 | query_data.var_names = symbol_to_ensembl(query_data.var_names )
 30 | 
 31 | ref_data.var_names_make_unique()
 32 | query_data.var_names_make_unique()
 33 | 
 34 | ref_data.obs['batch'] = ref_data.obs_names
 35 | query_data.obs['batch'] = query_data.obs_names
 36 | 
 37 | query_data.var_names
 38 | 
 39 | # ref_data.var_names = [i.upper() for i in ref_data.var_names]
 40 | # query_data.var_names = [i.upper() for i in query_data.var_names]
 41 | # target_genes = query_data.var_names
 42 | target_genes = ['ENSG00000206579']
 43 | query_data.obsm['truth'] = query_data[:, target_genes].X.toarray()
 44 | query_data[:, target_genes].X = 0
 45 | train_data = query_data.concatenate(ref_data, join='outer', batch_key=None, index_unique=None)
 46 | 
 47 | train_data.obs['split'] = 'train'
 48 | train_data.obs['split'][train_data.obs['batch']==query_data.obs['batch'][-1]] = 'valid'
 49 | train_data.obs['split'][train_data.obs['batch']==ref_data.obs['batch'][-1]] = 'valid'
 50 | 
 51 | 
 52 | query_data.obs['platform'] = 'merfish'
 53 | 
 54 | query_data.obsm['spatial'][:,0]
 55 | 
 56 | query_data.obs['x_FOV_px'] = query_data.obsm['spatial'][:,0]
 57 | query_data.obs['y_FOV_px'] = query_data.obsm['spatial'][:,1]
 58 | 
 59 | query_data.var.index
 60 | 
 61 | ref_data.var.index
 62 | 
 63 | query_var_new = []
 64 | for i in query_data.var.index:
 65 |     if "ENSG" in i:
 66 |         query_var_new.append(i)
 67 | ref_var_new = []
 68 | for i in ref_data.var.index:
 69 |     if "ENSG" in i:
 70 |         ref_var_new.append(i)
 71 | 
 72 | query_data = query_data[:,query_var_new]
 73 | ref_data = ref_data[:,ref_var_new]
 74 | ## Specify gene to impute
 75 | 
 76 | query_genes = [g for g in query_data.var.index if g not in ['MRPL15']]
 77 | query_batches = list(query_data.obs['batch'].unique())
 78 | ref_batches = list(ref_data.obs['batch'].unique())
 79 | batch_gene_list = dict(zip(list(query_batches) + list(ref_batches),
 80 |     [query_genes]*len(query_batches) + [ref_data.var.index.tolist()]*len(ref_batches)))
 81 | 
 82 | ## Overwrite parts of the default config
 83 | pipeline_config = ImputationDefaultPipelineConfig.copy()
 84 | model_config = ImputationDefaultModelConfig.copy()
 85 | 
 86 | pipeline_config, model_config
 87 | 
 88 | ## Fine-tuning
 89 | 
 90 | pipeline = ImputationPipeline(pretrain_prefix=PRETRAIN_VERSION, # Specify the pretrain checkpoint to load
 91 |                                       overwrite_config=model_config,  # This is for overwriting part of the pretrain config
 92 |                                       pretrain_directory='/gpfs/gibbs/pi/zhao/tl688/CellPLM_cta/ckpt/')
 93 | pipeline.model
 94 | 
 95 | # batch_gene_list
 96 | pipeline.fit(train_data, # An AnnData object
 97 |             pipeline_config, # The config dictionary we created previously, optional
 98 |             split_field = 'split', #  Specify a column in .obs that contains split information
 99 |             train_split = 'train',
100 |             valid_split = 'valid',
101 |             batch_gene_list = batch_gene_list, # Specify genes that are measured in each batch, see previous section for more details
102 |             device = DEVICE,
103 |             ) 
104 | 


--------------------------------------------------------------------------------
/Perturbation Prediction/pp_scf.sh:
--------------------------------------------------------------------------------
1 | # GEARS
2 | python get_embedding.py --task_name GEARS_demo_batch --input_type singlecell --output_type gene_batch --pool_type all --tgthighres f1 --data_path ./examples/GEARS/pre_in.npy --save_path ./examples/GEARS/ --pre_normalized A


--------------------------------------------------------------------------------
/Perturbation Prediction/pp_uce_tgpt_scim.py:
--------------------------------------------------------------------------------
 1 | from torch_geometric.loader import DataLoader
 2 | from gears_001 import PertData, GEARS
 3 | from gears_001.inference import compute_metrics, deeper_analysis, non_dropout_analysis
 4 | from gears_001.utils import create_cell_graph_dataset_for_prediction
 5 | 
 6 | import scanpy as sc
 7 | import numpy as np
 8 | import sklearn
 9 | 
10 | 
11 | 
12 | from sklearn.preprocessing import StandardScaler
13 | def model_training(adata_train, emb_name = 'X_uce'):
14 |     model = sklearn.linear_model.LinearRegression()
15 |     train_data = np.concatenate([adata_train.obsm[emb_name], adata_train.obs['pert_condition'].values.reshape(-1,1)], axis=1)
16 |     pred_data = adata_train.obsm['ground_truth']
17 |     scaler = StandardScaler()
18 |     train_data = scaler.fit_transform(train_data)
19 |     model.fit(train_data, pred_data)
20 |     return model,scaler
21 | 
22 | 
23 | 
24 | adata = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/tgpt_out/adata_train_adamson_tgpt_all.h5ad") #can replace it with other embeddings
25 | adata.obsm['ground_truth'] = adata.layers['ground_truth'].copy()
26 | model,scaler = model_training(adata, emb_name = 'X_tgpt')
27 | 
28 | from gears import PertData, GEARS
29 | 
30 | # get data
31 | pert_data = PertData('./data')
32 | # pert_data = PertData('./data_folder')
33 | # load dataset in paper: norman, adamson, dixit.
34 | pert_data.load(data_name = 'adamson')
35 | # specify data split
36 | pert_data.prepare_split(split = 'simulation', seed = 1)
37 | # get dataloader with batch size
38 | pert_data.get_dataloader(batch_size = 1024, test_batch_size = 1024)
39 | 
40 | adata = sc.read_h5ad("/gpfs/gibbs/pi/zhao/tl688/scGPT/examples/tgpt_out/adata_test_adamson_tgpt_all.h5ad")
41 | 
42 | adata.obsm['ground_truth'] = adata.layers['ground_truth'].copy()
43 | 
44 | 
45 | 
46 | import torch
47 | 
48 | def eval_perturb(
49 |     loader: DataLoader, adata, model, scaler,obsm_name = 'X_uce'
50 | ):
51 |     """
52 |     Run model in inference mode using a given data loader
53 |     """
54 | 
55 |     pert_cat = []
56 |     pred = []
57 |     truth = []
58 |     pred_de = []
59 |     truth_de = []
60 |     results = {}
61 |     logvar = []
62 | 
63 |     for itr, batch in enumerate(loader):
64 |         pert_cat.extend(batch.pert)
65 |         
66 |         adata_filter = adata[itr*1024:(itr+1)*1024]
67 |         test_data = np.concatenate([adata_filter.obsm[obsm_name], adata_filter.obs['pert_condition'].values.reshape(-1,1)], axis=1)
68 |         test_data = scaler.transform(test_data)
69 |         p = model.predict(test_data)
70 | #         print(p)
71 |         t = batch.y.numpy()
72 |         pred.extend(p)
73 |         truth.extend(t)
74 |         # Differentially expressed genes
75 |         for itr, de_idx in enumerate(batch.de_idx):
76 |             pred_de.append(p[itr, de_idx])
77 |             truth_de.append(t[itr, de_idx])
78 | 
79 |     # all genes
80 |     results["pert_cat"] = np.array(pert_cat)
81 |     pred = np.stack(pred)
82 |     truth = np.stack(truth)
83 |     results["pred"] = pred
84 |     results["truth"] = truth
85 | 
86 |     pred_de = np.stack(pred_de)
87 |     truth_de = np.stack(truth_de)
88 |     results["pred_de"] = pred_de
89 |     results["truth_de"] = truth_de
90 | 
91 |     return results
92 | 
93 | results = eval_perturb(pert_data.dataloader['test_loader'],adata,model,scaler, obsm_name = 'X_tgpt')
94 | 
95 | test_metrics, test_pert_res = compute_metrics(results)
96 | print(test_metrics)
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/Scaling/emergent_ability.md:
--------------------------------------------------------------------------------
 1 | # Emergent Ability analysis
 2 | 
 3 | Here we discuss out experiment design to analyze emergent ability of single-cell LLMs. All the results and pipelines here are related to Figure 21 in the main text.
 4 | 
 5 | # Cross-data cell-type annotation
 6 | 
 7 | We compared the performance of scGPT to vanilla NN based on the crossing-data cell type annotation. The datasets here include "demo_train.h5ad" and "demo_test.h5ad". They are from Pancreas. Codes here are related to "Cell type Annotation".
 8 | 
 9 | # Cross-specises cell-type annotation
10 | 
11 | We compared the performance of scGPT to vanilla NN based on cell type prediction for 1. spatial transcriptomics and 2. mouse cell atlas seperated by batch. Codes here are related to "Cell type Annotation".
12 | 
13 | # Spatial transcriptomics batch effect correction.
14 | 
15 | We colelct spatial transcriptomics from human brain without cell labels and reduce the batch effect of the two datasets based on scGPT. Codes here are related to "Batch Effect Correction".
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/installation_baselines/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Start from a Miniconda image
 2 | FROM continuumio/miniconda3:latest
 3 | 
 4 | # Create a working directory
 5 | WORKDIR /app
 6 | 
 7 | # Copy the environment.yml file into the container
 8 | COPY scgpt_bench.yml .
 9 | 
10 | # Create the environment
11 | # Using `mamba` here for faster installations; it's included in newer images.
12 | RUN conda install -n base -c conda-forge mamba && \
13 |     mamba env create -f scgpt_bench.yml
14 | 
15 | # Activate the environment by default
16 | # The conda environment will be located at /opt/conda/envs/myenv
17 | ENV PATH /opt/conda/envs/myenv/bin:$PATH
18 | 
19 | # Clean up conda cache to reduce image size
20 | RUN conda clean --all --yes
21 | 
22 | # (Optional) Set a default command to start a shell
23 | CMD ["/bin/bash"]
24 | 


--------------------------------------------------------------------------------
/installation_baselines/cellm.yml:
--------------------------------------------------------------------------------
  1 | name: OpenBioMed
  2 | channels:
  3 |   - conda-forge
  4 |   - bioconda
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - boost=1.78.0=py38h4e30db6_4
 10 |   - boost-cpp=1.78.0=h6582d0a_3
 11 |   - brotli=1.0.9=h166bdaf_9
 12 |   - brotli-bin=1.0.9=h166bdaf_9
 13 |   - bzip2=1.0.8=h7f98852_4
 14 |   - ca-certificates=2023.7.22=hbcca054_0
 15 |   - cairo=1.16.0=hbbf8b49_1016
 16 |   - certifi=2023.7.22=pyhd8ed1ab_0
 17 |   - contourpy=1.1.0=py38h7f3f72f_0
 18 |   - cycler=0.11.0=pyhd8ed1ab_0
 19 |   - expat=2.5.0=hcb278e6_1
 20 |   - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
 21 |   - font-ttf-inconsolata=3.000=h77eed37_0
 22 |   - font-ttf-source-code-pro=2.038=h77eed37_0
 23 |   - font-ttf-ubuntu=0.83=hab24e00_0
 24 |   - fontconfig=2.14.2=h14ed4e7_0
 25 |   - fonts-conda-ecosystem=1=0
 26 |   - fonts-conda-forge=1=0
 27 |   - fonttools=4.41.1=py38h01eb140_0
 28 |   - freetype=2.12.1=hca18f0e_1
 29 |   - freetype-py=2.3.0=pyhd8ed1ab_0
 30 |   - gettext=0.21.1=h27087fc_0
 31 |   - greenlet=2.0.2=py38h17151c0_1
 32 |   - icu=72.1=hcb278e6_0
 33 |   - importlib-resources=6.0.0=pyhd8ed1ab_1
 34 |   - importlib_resources=6.0.0=pyhd8ed1ab_1
 35 |   - kiwisolver=1.4.4=py38h43d8883_1
 36 |   - lcms2=2.15=haa2dc70_1
 37 |   - ld_impl_linux-64=2.40=h41732ed_0
 38 |   - lerc=4.0.0=h27087fc_0
 39 |   - libblas=3.9.0=17_linux64_openblas
 40 |   - libbrotlicommon=1.0.9=h166bdaf_9
 41 |   - libbrotlidec=1.0.9=h166bdaf_9
 42 |   - libbrotlienc=1.0.9=h166bdaf_9
 43 |   - libcblas=3.9.0=17_linux64_openblas
 44 |   - libdeflate=1.18=h0b41bf4_0
 45 |   - libexpat=2.5.0=hcb278e6_1
 46 |   - libffi=3.4.2=h7f98852_5
 47 |   - libgcc-ng=13.1.0=he5830b7_0
 48 |   - libgfortran-ng=13.1.0=h69a702a_0
 49 |   - libgfortran5=13.1.0=h15d22d2_0
 50 |   - libglib=2.76.4=hebfc3b9_0
 51 |   - libgomp=13.1.0=he5830b7_0
 52 |   - libiconv=1.17=h166bdaf_0
 53 |   - libjpeg-turbo=2.1.5.1=h0b41bf4_0
 54 |   - liblapack=3.9.0=17_linux64_openblas
 55 |   - libnsl=2.0.0=h7f98852_0
 56 |   - libopenblas=0.3.23=pthreads_h80387f5_0
 57 |   - libpng=1.6.39=h753d276_0
 58 |   - libsqlite=3.42.0=h2797004_0
 59 |   - libstdcxx-ng=13.1.0=hfd8a6a1_0
 60 |   - libtiff=4.5.1=h8b53f26_0
 61 |   - libuuid=2.38.1=h0b41bf4_0
 62 |   - libwebp-base=1.3.1=hd590300_0
 63 |   - libxcb=1.15=h0b41bf4_0
 64 |   - libzlib=1.2.13=hd590300_5
 65 |   - matplotlib-base=3.7.2=py38hf5b0b65_0
 66 |   - munkres=1.1.4=pyh9f0ad1d_0
 67 |   - ncurses=6.4=hcb278e6_0
 68 |   - numpy=1.24.4=py38h59b608b_0
 69 |   - openjpeg=2.5.0=hfec8fc6_2
 70 |   - openssl=3.1.1=hd590300_1
 71 |   - packaging=23.1=pyhd8ed1ab_0
 72 |   - pandas=2.0.3=py38h01efb38_1
 73 |   - pcre2=10.40=hc3806b6_0
 74 |   - pillow=10.0.0=py38h885162f_0
 75 |   - pip=23.2.1=pyhd8ed1ab_0
 76 |   - pixman=0.40.0=h36c2ea0_0
 77 |   - pthread-stubs=0.4=h36c2ea0_1001
 78 |   - pycairo=1.24.0=py38h1a1917b_0
 79 |   - pyparsing=3.0.9=pyhd8ed1ab_0
 80 |   - python=3.8.17=he550d4f_0_cpython
 81 |   - python-dateutil=2.8.2=pyhd8ed1ab_0
 82 |   - python-tzdata=2023.3=pyhd8ed1ab_0
 83 |   - python_abi=3.8=3_cp38
 84 |   - pytz=2023.3=pyhd8ed1ab_0
 85 |   - rdkit=2023.03.2=py38h36d2b2f_0
 86 |   - readline=8.2=h8228510_1
 87 |   - reportlab=4.0.4=py38h01eb140_0
 88 |   - rlpycairo=0.2.0=pyhd8ed1ab_0
 89 |   - setuptools=68.0.0=pyhd8ed1ab_0
 90 |   - six=1.16.0=pyh6c4a22f_0
 91 |   - sqlalchemy=2.0.19=py38h01eb140_0
 92 |   - tk=8.6.12=h27826a3_0
 93 |   - typing-extensions=4.7.1=hd8ed1ab_0
 94 |   - typing_extensions=4.7.1=pyha770c72_0
 95 |   - unicodedata2=15.0.0=py38h0a891b7_0
 96 |   - wheel=0.41.0=pyhd8ed1ab_0
 97 |   - xorg-kbproto=1.0.7=h7f98852_1002
 98 |   - xorg-libice=1.1.1=hd590300_0
 99 |   - xorg-libsm=1.2.4=h7391055_0
100 |   - xorg-libx11=1.8.6=h8ee46fc_0
101 |   - xorg-libxau=1.0.11=hd590300_0
102 |   - xorg-libxdmcp=1.1.3=h7f98852_0
103 |   - xorg-libxext=1.3.4=h0b41bf4_2
104 |   - xorg-libxrender=0.9.11=hd590300_0
105 |   - xorg-renderproto=0.11.1=h7f98852_1002
106 |   - xorg-xextproto=7.3.0=h0b41bf4_1003
107 |   - xorg-xproto=7.0.31=h7f98852_1007
108 |   - xz=5.2.6=h166bdaf_0
109 |   - zipp=3.16.2=pyhd8ed1ab_0
110 |   - zlib=1.2.13=hd590300_5
111 |   - zstd=1.5.2=hfc55251_7
112 |   - pip:
113 |       - anndata==0.9.2
114 |       - charset-normalizer==3.2.0
115 |       - cmake==3.27.0
116 |       - einops==0.6.1
117 |       - filelock==3.12.2
118 |       - fsspec==2023.6.0
119 |       - h5py==3.9.0
120 |       - huggingface-hub==0.16.4
121 |       - idna==3.4
122 |       - importlib-metadata==6.8.0
123 |       - jinja2==3.1.2
124 |       - joblib==1.3.1
125 |       - lit==16.0.6
126 |       - littleutils==0.2.2
127 |       - llvmlite==0.40.1
128 |       - local-attention==1.8.6
129 |       - markupsafe==2.1.3
130 |       - mhfp==1.9.6
131 |       - mpmath==1.3.0
132 |       - natsort==8.4.0
133 |       - networkx==3.1
134 |       - numba==0.57.1
135 |       - nvidia-cublas-cu11==11.10.3.66
136 |       - nvidia-cuda-cupti-cu11==11.7.101
137 |       - nvidia-cuda-nvrtc-cu11==11.7.99
138 |       - nvidia-cuda-runtime-cu11==11.7.99
139 |       - nvidia-cudnn-cu11==8.5.0.96
140 |       - nvidia-cufft-cu11==10.9.0.58
141 |       - nvidia-curand-cu11==10.2.10.91
142 |       - nvidia-cusolver-cu11==11.4.0.1
143 |       - nvidia-cusparse-cu11==11.7.4.91
144 |       - nvidia-nccl-cu11==2.14.3
145 |       - nvidia-nvtx-cu11==11.7.91
146 |       - ogb==1.3.6
147 |       - outdated==0.2.2
148 |       - patsy==0.5.3
149 |       - psutil==5.9.5
150 |       - pyg-lib==0.2.0+pt20cu117
151 |       - pynndescent==0.5.10
152 |       - pyyaml==6.0.1
153 |       - regex==2023.6.3
154 |       - requests==2.31.0
155 |       - safetensors==0.3.1
156 |       - scanpy==1.9.3
157 |       - scikit-learn==1.3.0
158 |       - scipy==1.10.1
159 |       - seaborn==0.12.2
160 |       - session-info==1.0.0
161 |       - statsmodels==0.14.0
162 |       - stdlib-list==0.9.0
163 |       - sympy==1.12
164 |       - threadpoolctl==3.2.0
165 |       - tokenizers==0.13.3
166 |       - torch==2.0.1
167 |       - torch-cluster==1.6.1+pt20cu117
168 |       - torch-geometric==2.3.1
169 |       - torch-scatter==2.1.1+pt20cu117
170 |       - torch-sparse==0.6.17+pt20cu117
171 |       - torch-spline-conv==1.2.2+pt20cu117
172 |       - tqdm==4.65.0
173 |       - transformers==4.31.0
174 |       - triton==2.0.0
175 |       - umap-learn==0.5.3
176 |       - urllib3==2.0.4
177 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/OpenBioMed
178 | 


--------------------------------------------------------------------------------
/installation_baselines/scbert.yml:
--------------------------------------------------------------------------------
  1 | name: scbert
  2 | channels:
  3 |   - conda-forge
  4 |   - bioconda
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - bzip2=1.0.8=h7f98852_4
 10 |   - ca-certificates=2022.12.7=ha878542_0
 11 |   - ld_impl_linux-64=2.40=h41732ed_0
 12 |   - libffi=3.4.2=h7f98852_5
 13 |   - libgcc-ng=12.2.0=h65d4601_19
 14 |   - libgomp=12.2.0=h65d4601_19
 15 |   - libnsl=2.0.0=h7f98852_0
 16 |   - libsqlite=3.40.0=h753d276_1
 17 |   - libuuid=2.38.1=h0b41bf4_0
 18 |   - libzlib=1.2.13=h166bdaf_4
 19 |   - ncurses=6.3=h27087fc_1
 20 |   - openssl=3.1.0=hd590300_3
 21 |   - pip=23.1.2=pyhd8ed1ab_0
 22 |   - python=3.8.16=he550d4f_1_cpython
 23 |   - readline=8.2=h8228510_1
 24 |   - setuptools=67.7.2=pyhd8ed1ab_0
 25 |   - tk=8.6.12=h27826a3_0
 26 |   - wheel=0.40.0=pyhd8ed1ab_0
 27 |   - xz=5.2.6=h166bdaf_0
 28 |   - pip:
 29 |       - anndata==0.9.1
 30 |       - anyio==3.6.2
 31 |       - argon2-cffi==21.3.0
 32 |       - argon2-cffi-bindings==21.2.0
 33 |       - arrow==1.2.3
 34 |       - asttokens==2.2.1
 35 |       - attrs==23.1.0
 36 |       - axial-positional-embedding==0.2.1
 37 |       - backcall==0.2.0
 38 |       - beautifulsoup4==4.12.2
 39 |       - bleach==6.0.0
 40 |       - blosc2==2.0.0
 41 |       - certifi==2022.12.7
 42 |       - cffi==1.15.1
 43 |       - charset-normalizer==3.1.0
 44 |       - click==8.1.3
 45 |       - cmake==3.26.3
 46 |       - comm==0.1.3
 47 |       - contourpy==1.0.7
 48 |       - cycler==0.11.0
 49 |       - cython==0.29.34
 50 |       - debugpy==1.6.7
 51 |       - decorator==5.1.1
 52 |       - defusedxml==0.7.1
 53 |       - dunamai==1.16.0
 54 |       - einops==0.6.1
 55 |       - executing==1.2.0
 56 |       - fastjsonschema==2.16.3
 57 |       - filelock==3.12.0
 58 |       - fonttools==4.39.3
 59 |       - fqdn==1.5.1
 60 |       - get-version==3.5.4
 61 |       - h5py==3.8.0
 62 |       - huggingface-hub==0.0.8
 63 |       - idna==3.4
 64 |       - importlib-metadata==6.6.0
 65 |       - importlib-resources==5.12.0
 66 |       - ipykernel==6.23.1
 67 |       - ipython==8.12.2
 68 |       - ipython-genutils==0.2.0
 69 |       - ipywidgets==8.0.6
 70 |       - isoduration==20.11.0
 71 |       - jedi==0.18.2
 72 |       - jinja2==3.1.2
 73 |       - joblib==1.2.0
 74 |       - jsonpointer==2.3
 75 |       - jsonschema==4.17.3
 76 |       - jupyter==1.0.0
 77 |       - jupyter-client==8.2.0
 78 |       - jupyter-console==6.6.3
 79 |       - jupyter-core==5.3.0
 80 |       - jupyter-events==0.6.3
 81 |       - jupyter-server==2.5.0
 82 |       - jupyter-server-terminals==0.4.4
 83 |       - jupyterlab-pygments==0.2.2
 84 |       - jupyterlab-widgets==3.0.7
 85 |       - kiwisolver==1.4.4
 86 |       - legacy-api-wrap==1.2
 87 |       - lit==16.0.5
 88 |       - llvmlite==0.39.1
 89 |       - local-attention==1.8.6
 90 |       - loompy==3.0.7
 91 |       - markupsafe==2.1.2
 92 |       - matplotlib==3.6.3
 93 |       - matplotlib-inline==0.1.6
 94 |       - mistune==2.0.5
 95 |       - mpmath==1.3.0
 96 |       - msgpack==1.0.5
 97 |       - natsort==8.3.1
 98 |       - nbclassic==1.0.0
 99 |       - nbclient==0.7.4
100 |       - nbconvert==7.4.0
101 |       - nbformat==5.8.0
102 |       - nest-asyncio==1.5.6
103 |       - networkx==3.1
104 |       - notebook==6.5.4
105 |       - notebook-shim==0.2.3
106 |       - numba==0.56.4
107 |       - numexpr==2.8.4
108 |       - numpy==1.19.2
109 |       - numpy-groupies==0.9.22
110 |       - nvidia-cublas-cu11==11.10.3.66
111 |       - nvidia-cuda-cupti-cu11==11.7.101
112 |       - nvidia-cuda-nvrtc-cu11==11.7.99
113 |       - nvidia-cuda-runtime-cu11==11.7.99
114 |       - nvidia-cudnn-cu11==8.5.0.96
115 |       - nvidia-cufft-cu11==10.9.0.58
116 |       - nvidia-curand-cu11==10.2.10.91
117 |       - nvidia-cusolver-cu11==11.4.0.1
118 |       - nvidia-cusparse-cu11==11.7.4.91
119 |       - nvidia-nccl-cu11==2.14.3
120 |       - nvidia-nvtx-cu11==11.7.91
121 |       - packaging==23.1
122 |       - pandas==1.1.5
123 |       - pandocfilters==1.5.0
124 |       - parso==0.8.3
125 |       - patsy==0.5.3
126 |       - pexpect==4.8.0
127 |       - pickleshare==0.7.5
128 |       - pillow==9.5.0
129 |       - pkgutil-resolve-name==1.3.10
130 |       - platformdirs==3.5.1
131 |       - portalocker==2.7.0
132 |       - prometheus-client==0.16.0
133 |       - prompt-toolkit==3.0.38
134 |       - psutil==5.9.5
135 |       - ptyprocess==0.7.0
136 |       - pure-eval==0.2.2
137 |       - py-cpuinfo==9.0.0
138 |       - pycparser==2.21
139 |       - pygments==2.15.1
140 |       - pynndescent==0.5.10
141 |       - pyparsing==3.0.9
142 |       - pyrsistent==0.19.3
143 |       - python-dateutil==2.8.2
144 |       - python-json-logger==2.0.7
145 |       - pytz==2023.3
146 |       - pyyaml==6.0
147 |       - pyzmq==25.0.2
148 |       - qtconsole==5.4.3
149 |       - qtpy==2.3.1
150 |       - regex==2023.5.5
151 |       - requests==2.30.0
152 |       - rfc3339-validator==0.1.4
153 |       - rfc3986-validator==0.1.1
154 |       - sacremoses==0.0.53
155 |       - scanpy==1.7.2
156 |       - scikit-learn==0.24.2
157 |       - scipy==1.5.4
158 |       - seaborn==0.12.2
159 |       - send2trash==1.8.2
160 |       - sinfo==0.3.4
161 |       - six==1.16.0
162 |       - sniffio==1.3.0
163 |       - soupsieve==2.4.1
164 |       - stack-data==0.6.2
165 |       - statsmodels==0.14.0rc0
166 |       - stdlib-list==0.8.0
167 |       - sympy==1.12
168 |       - tables==3.8.0
169 |       - terminado==0.17.1
170 |       - threadpoolctl==3.1.0
171 |       - tinycss2==1.2.1
172 |       - tokenizers==0.10.3
173 |       - torch==2.0.1
174 |       - torchdata==0.6.1
175 |       - torchtext==0.15.2
176 |       - torchvision==0.9.1
177 |       - tornado==6.3.2
178 |       - tqdm==4.65.0
179 |       - traitlets==5.9.0
180 |       - transformers==4.6.1
181 |       - triton==2.0.0
182 |       - typing-extensions==4.5.0
183 |       - umap-learn==0.5.3
184 |       - uri-template==1.2.0
185 |       - urllib3==2.0.2
186 |       - wcwidth==0.2.6
187 |       - webcolors==1.13
188 |       - webencodings==0.5.1
189 |       - websocket-client==1.5.1
190 |       - widgetsnbextension==4.0.7
191 |       - zipp==3.15.0
192 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/scbert
193 | 


--------------------------------------------------------------------------------
/installation_baselines/scimilarity.yml:
--------------------------------------------------------------------------------
  1 | name: scimilarity
  2 | channels:
  3 |   - conda-forge
  4 |   - bioconda
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - bzip2=1.0.8=hd590300_5
 10 |   - ca-certificates=2023.11.17=hbcca054_0
 11 |   - ld_impl_linux-64=2.40=h41732ed_0
 12 |   - libffi=3.4.2=h7f98852_5
 13 |   - libgcc-ng=13.2.0=h807b86a_3
 14 |   - libgomp=13.2.0=h807b86a_3
 15 |   - libnsl=2.0.1=hd590300_0
 16 |   - libsqlite=3.44.2=h2797004_0
 17 |   - libuuid=2.38.1=h0b41bf4_0
 18 |   - libzlib=1.2.13=hd590300_5
 19 |   - ncurses=6.4=h59595ed_2
 20 |   - openssl=3.2.0=hd590300_1
 21 |   - pip=23.3.2=pyhd8ed1ab_0
 22 |   - python=3.8.18=hd12c33a_0_cpython
 23 |   - readline=8.2=h8228510_1
 24 |   - setuptools=68.2.2=pyhd8ed1ab_0
 25 |   - tk=8.6.13=noxft_h4845f30_101
 26 |   - wheel=0.42.0=pyhd8ed1ab_0
 27 |   - xz=5.2.6=h166bdaf_0
 28 |   - pip:
 29 |       - adjusttext==0.8
 30 |       - aiohttp==3.9.1
 31 |       - aiosignal==1.3.1
 32 |       - anndata==0.9.2
 33 |       - anyio==4.2.0
 34 |       - argon2-cffi==23.1.0
 35 |       - argon2-cffi-bindings==21.2.0
 36 |       - arrow==1.3.0
 37 |       - asciitree==0.3.3
 38 |       - asttokens==2.4.1
 39 |       - async-lru==2.0.4
 40 |       - async-timeout==4.0.3
 41 |       - attrs==23.1.0
 42 |       - babel==2.14.0
 43 |       - backcall==0.2.0
 44 |       - beautifulsoup4==4.12.2
 45 |       - bleach==6.1.0
 46 |       - captum==0.7.0
 47 |       - certifi==2023.11.17
 48 |       - cffi==1.16.0
 49 |       - charset-normalizer==3.3.2
 50 |       - circlify==0.15.0
 51 |       - click==8.1.7
 52 |       - comm==0.2.0
 53 |       - contourpy==1.1.1
 54 |       - cycler==0.12.1
 55 |       - cython==3.0.7
 56 |       - debugpy==1.8.0
 57 |       - decorator==5.1.1
 58 |       - defusedxml==0.7.1
 59 |       - demuxem==0.1.7
 60 |       - docopt==0.6.2
 61 |       - exceptiongroup==1.2.0
 62 |       - executing==2.0.1
 63 |       - fasteners==0.19
 64 |       - fastjsonschema==2.19.0
 65 |       - filelock==3.13.1
 66 |       - fonttools==4.47.0
 67 |       - fqdn==1.5.1
 68 |       - frozenlist==1.4.1
 69 |       - fsspec==2023.12.2
 70 |       - get-annotations==0.1.2
 71 |       - h5py==3.10.0
 72 |       - hnswlib==0.8.0
 73 |       - idna==3.6
 74 |       - igraph==0.10.8
 75 |       - importlib-metadata==7.0.0
 76 |       - importlib-resources==6.1.1
 77 |       - ipykernel==6.27.1
 78 |       - ipython==8.12.3
 79 |       - ipywidgets==8.1.1
 80 |       - isoduration==20.11.0
 81 |       - jedi==0.19.1
 82 |       - jinja2==3.1.2
 83 |       - joblib==1.3.2
 84 |       - json5==0.9.14
 85 |       - jsonpointer==2.4
 86 |       - jsonschema==4.20.0
 87 |       - jsonschema-specifications==2023.11.2
 88 |       - jupyter==1.0.0
 89 |       - jupyter-client==8.6.0
 90 |       - jupyter-console==6.6.3
 91 |       - jupyter-core==5.5.1
 92 |       - jupyter-events==0.9.0
 93 |       - jupyter-lsp==2.2.1
 94 |       - jupyter-server==2.12.1
 95 |       - jupyter-server-terminals==0.5.0
 96 |       - jupyterlab==4.0.9
 97 |       - jupyterlab-pygments==0.3.0
 98 |       - jupyterlab-server==2.25.2
 99 |       - jupyterlab-widgets==3.0.9
100 |       - kiwisolver==1.4.5
101 |       - leidenalg==0.10.1
102 |       - lightgbm==4.1.0
103 |       - lightning-utilities==0.10.0
104 |       - llvmlite==0.41.1
105 |       - loompy==3.0.7
106 |       - louvain==0.8.1
107 |       - markupsafe==2.1.3
108 |       - matplotlib==3.7.4
109 |       - matplotlib-inline==0.1.6
110 |       - mistune==3.0.2
111 |       - mpmath==1.3.0
112 |       - multidict==6.0.4
113 |       - natsort==8.4.0
114 |       - nbclient==0.9.0
115 |       - nbconvert==7.13.0
116 |       - nbformat==5.9.2
117 |       - nest-asyncio==1.5.8
118 |       - networkx==3.1
119 |       - notebook==7.0.6
120 |       - notebook-shim==0.2.3
121 |       - numba==0.58.1
122 |       - numcodecs==0.12.1
123 |       - numpy==1.24.4
124 |       - numpy-groupies==0.9.22
125 |       - nvidia-cublas-cu12==12.1.3.1
126 |       - nvidia-cuda-cupti-cu12==12.1.105
127 |       - nvidia-cuda-nvrtc-cu12==12.1.105
128 |       - nvidia-cuda-runtime-cu12==12.1.105
129 |       - nvidia-cudnn-cu12==8.9.2.26
130 |       - nvidia-cufft-cu12==11.0.2.54
131 |       - nvidia-curand-cu12==10.3.2.106
132 |       - nvidia-cusolver-cu12==11.4.5.107
133 |       - nvidia-cusparse-cu12==12.1.0.106
134 |       - nvidia-nccl-cu12==2.18.1
135 |       - nvidia-nvjitlink-cu12==12.3.101
136 |       - nvidia-nvtx-cu12==12.1.105
137 |       - obonet==1.0.0
138 |       - overrides==7.4.0
139 |       - packaging==23.2
140 |       - pandas==2.0.3
141 |       - pandocfilters==1.5.0
142 |       - parso==0.8.3
143 |       - patsy==0.5.4
144 |       - pegasusio==0.8.1
145 |       - pegasuspy==1.7.1
146 |       - pexpect==4.9.0
147 |       - pickleshare==0.7.5
148 |       - pillow==10.1.0
149 |       - pkgutil-resolve-name==1.3.10
150 |       - platformdirs==4.1.0
151 |       - prometheus-client==0.19.0
152 |       - prompt-toolkit==3.0.43
153 |       - psutil==5.9.7
154 |       - ptyprocess==0.7.0
155 |       - pure-eval==0.2.2
156 |       - pyarrow==14.0.2
157 |       - pybind11==2.11.1
158 |       - pycparser==2.21
159 |       - pygments==2.17.2
160 |       - pynndescent==0.5.11
161 |       - pyparsing==3.1.1
162 |       - python-dateutil==2.8.2
163 |       - python-igraph==0.10.8
164 |       - python-json-logger==2.0.7
165 |       - pytorch-lightning==2.1.2
166 |       - pytz==2023.3.post1
167 |       - pyyaml==6.0.1
168 |       - pyzmq==25.1.2
169 |       - qtconsole==5.5.1
170 |       - qtpy==2.4.1
171 |       - referencing==0.32.0
172 |       - requests==2.31.0
173 |       - rfc3339-validator==0.1.4
174 |       - rfc3986-validator==0.1.1
175 |       - rpds-py==0.15.2
176 |       - scanpy==1.9.6
177 |       - scikit-learn==1.3.2
178 |       - scikit-misc==0.2.0
179 |       - scimilarity==0.1.0.post1.dev1+g683b129
180 |       - scipy==1.10.1
181 |       - seaborn==0.12.2
182 |       - send2trash==1.8.2
183 |       - session-info==1.0.0
184 |       - six==1.16.0
185 |       - sniffio==1.3.0
186 |       - soupsieve==2.5
187 |       - stack-data==0.6.3
188 |       - statsmodels==0.14.1
189 |       - stdlib-list==0.10.0
190 |       - sympy==1.12
191 |       - terminado==0.18.0
192 |       - texttable==1.7.0
193 |       - threadpoolctl==3.2.0
194 |       - tiledb==0.24.0
195 |       - tinycss2==1.2.1
196 |       - tomli==2.0.1
197 |       - torch==2.1.2
198 |       - torchmetrics==1.2.1
199 |       - tornado==6.4
200 |       - tqdm==4.66.1
201 |       - traitlets==5.14.0
202 |       - triton==2.1.0
203 |       - types-python-dateutil==2.8.19.14
204 |       - typing-extensions==4.9.0
205 |       - tzdata==2023.3
206 |       - umap-learn==0.5.5
207 |       - uri-template==1.3.0
208 |       - urllib3==2.1.0
209 |       - wcwidth==0.2.12
210 |       - webcolors==1.13
211 |       - webencodings==0.5.1
212 |       - websocket-client==1.7.0
213 |       - widgetsnbextension==4.0.9
214 |       - wordcloud==1.9.3
215 |       - xlsxwriter==3.1.9
216 |       - yarl==1.9.4
217 |       - zarr==2.16.1
218 |       - zipp==3.17.0
219 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/scimilarity
220 | 


--------------------------------------------------------------------------------
/installation_baselines/uce.yml:
--------------------------------------------------------------------------------
  1 | name: uce
  2 | channels:
  3 |   - conda-forge
  4 |   - bioconda
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - bzip2=1.0.8=hd590300_5
 10 |   - ca-certificates=2023.11.17=hbcca054_0
 11 |   - ld_impl_linux-64=2.40=h41732ed_0
 12 |   - libffi=3.4.2=h7f98852_5
 13 |   - libgcc-ng=13.2.0=h807b86a_3
 14 |   - libgomp=13.2.0=h807b86a_3
 15 |   - libnsl=2.0.1=hd590300_0
 16 |   - libsqlite=3.44.2=h2797004_0
 17 |   - libuuid=2.38.1=h0b41bf4_0
 18 |   - libzlib=1.2.13=hd590300_5
 19 |   - ncurses=6.4=h59595ed_2
 20 |   - openssl=3.2.0=hd590300_1
 21 |   - pip=23.3.1=pyhd8ed1ab_0
 22 |   - python=3.8.18=hd12c33a_0_cpython
 23 |   - readline=8.2=h8228510_1
 24 |   - setuptools=68.2.2=pyhd8ed1ab_0
 25 |   - tk=8.6.13=noxft_h4845f30_101
 26 |   - wheel=0.42.0=pyhd8ed1ab_0
 27 |   - xz=5.2.6=h166bdaf_0
 28 |   - pip:
 29 |       - accelerate==0.25.0
 30 |       - accelerator==2023.11.3.dev1
 31 |       - anndata==0.9.2
 32 |       - bottle==0.12.25
 33 |       - certifi==2023.11.17
 34 |       - charset-normalizer==3.3.2
 35 |       - contourpy==1.1.1
 36 |       - cycler==0.12.1
 37 |       - filelock==3.13.1
 38 |       - fonttools==4.46.0
 39 |       - fsspec==2023.12.1
 40 |       - get-annotations==0.1.2
 41 |       - h5py==3.10.0
 42 |       - huggingface-hub==0.19.4
 43 |       - idna==3.6
 44 |       - importlib-metadata==7.0.0
 45 |       - importlib-resources==6.1.1
 46 |       - jinja2==3.1.2
 47 |       - joblib==1.3.2
 48 |       - kiwisolver==1.4.5
 49 |       - llvmlite==0.41.1
 50 |       - markupsafe==2.1.3
 51 |       - matplotlib==3.7.4
 52 |       - mpmath==1.3.0
 53 |       - natsort==8.4.0
 54 |       - networkx==3.1
 55 |       - numba==0.58.1
 56 |       - numpy==1.24.4
 57 |       - nvidia-cublas-cu12==12.1.3.1
 58 |       - nvidia-cuda-cupti-cu12==12.1.105
 59 |       - nvidia-cuda-nvrtc-cu12==12.1.105
 60 |       - nvidia-cuda-runtime-cu12==12.1.105
 61 |       - nvidia-cudnn-cu12==8.9.2.26
 62 |       - nvidia-cufft-cu12==11.0.2.54
 63 |       - nvidia-curand-cu12==10.3.2.106
 64 |       - nvidia-cusolver-cu12==11.4.5.107
 65 |       - nvidia-cusparse-cu12==12.1.0.106
 66 |       - nvidia-nccl-cu12==2.18.1
 67 |       - nvidia-nvjitlink-cu12==12.3.101
 68 |       - nvidia-nvtx-cu12==12.1.105
 69 |       - packaging==23.2
 70 |       - pandas==2.0.3
 71 |       - patsy==0.5.4
 72 |       - pillow==10.1.0
 73 |       - psutil==5.9.6
 74 |       - pynndescent==0.5.11
 75 |       - pyparsing==3.1.1
 76 |       - python-dateutil==2.8.2
 77 |       - pytz==2023.3.post1
 78 |       - pyyaml==6.0.1
 79 |       - requests==2.31.0
 80 |       - safetensors==0.4.1
 81 |       - scanpy==1.9.6
 82 |       - scikit-learn==1.3.2
 83 |       - scipy==1.10.1
 84 |       - seaborn==0.12.2
 85 |       - session-info==1.0.0
 86 |       - setproctitle==1.3.3
 87 |       - six==1.16.0
 88 |       - statsmodels==0.14.0
 89 |       - stdlib-list==0.10.0
 90 |       - sympy==1.12
 91 |       - threadpoolctl==3.2.0
 92 |       - torch==2.1.1
 93 |       - tqdm==4.66.1
 94 |       - triton==2.1.0
 95 |       - typing-extensions==4.8.0
 96 |       - tzdata==2023.3
 97 |       - umap-learn==0.5.5
 98 |       - urllib3==1.26.6
 99 |       - waitress==2.1.2
100 |       - zipp==3.17.0
101 | prefix: /gpfs/gibbs/project/zhao/tl688/conda_envs/uce
102 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # scEval😈: An evaluation platform for single-cell Foundation Models (FMs)
  2 | 
  3 | This is the repo for our benchmarking and analysis project. All methods are collected until Dec 1st, 2024. 
  4 | 
  5 | News: We are collaborating with [OpenProblems](https://openproblems.bio/) to make this benchmark alive! Stay tuned and we will update the benchmarking results soon!
  6 | 
  7 | # Install
  8 | 
  9 | To install our benchmarking environment based on [scGPT](https://scgpt.readthedocs.io/en/latest/), please use conda to create an environment based on this yml file in your own machine:
 10 | ```
 11 | conda env create -n scgpt --file scgpt_bench.yml
 12 | ```
 13 | 
 14 | If you face any issues due to version conflicts, you can try to comment the problematic packages and try:
 15 | 
 16 | ```
 17 | conda activate scgpt
 18 | conda env update --file scgpt_bench.yml
 19 | ```
 20 | 
 21 | We also provide docker installation, please use (need gpu):
 22 | 
 23 | ```
 24 | docker build -t my-conda-image .
 25 | ```
 26 | 
 27 | To activate it, please use:
 28 | 
 29 | ```
 30 | docker run -it --rm my-conda-image
 31 | ```
 32 | 
 33 | For other methods we used, please refer to their original project website for instructions. We recommend  creating different environments for different methods. Considering the difficulties of installing different scFMs, we provide a list of yml files and an example of Dockerfile we used to install these models in the folder **installation_baselines**.
 34 | 
 35 | These methods include: 
 36 | 
 37 | [tGPT](https://github.com/deeplearningplus/tGPT), [Geneformer](https://huggingface.co/ctheodoris/Geneformer), [scBERT](https://github.com/TencentAILabHealthcare/scBERT), [CellLM](https://github.com/BioFM/OpenBioMed/tree/main), [SCimilarity](https://github.com/Genentech/scimilarity), [scFoundation](https://github.com/biomap-research/scFoundation), [CellPLM](https://github.com/OmicsML/CellPLM), [UCE](https://github.com/snap-stanford/UCE), [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main). These are also single-cell FMs.
 38 | 
 39 | And
 40 | 
 41 | [TOSICA](https://github.com/JackieHanLab/TOSICA/tree/main), [scJoint](https://github.com/SydneyBioX/scJoint), [GLUE](https://github.com/gao-lab/GLUE), [ResPAN](https://github.com/AprilYuge/ResPAN/tree/main), [Harmony](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html), [scDesign3](https://github.com/SONGDONGYUAN1994/scDesign3), [Splatter](https://github.com/Oshlack/splatter), [scVI](https://scvi-tools.org/), [Tangram](https://github.com/broadinstitute/Tangram), [GEARS](https://github.com/snap-stanford/GEARS). These are task-specific models.
 42 | 
 43 | 
 44 | We need scIB for evaluation. Please use pip to install it:
 45 | ```
 46 | pip install scib
 47 | ```
 48 | We also provide a scib version with our new function in this repo. Please make sure you have **scib >=1.0.4** to run kBET correctly.
 49 | 
 50 | We will release a version of scEval with more functions in the future!
 51 | 
 52 | 
 53 | # Pre-training weights
 54 | 
 55 | Most of our experiments were finished based on weights under [scGPT_bc](https://drive.google.com/drive/folders/1S9B2QUvBAh_FxUNrWrLfsvsds1thF9ad?usp=share_link). [scGPT_full](https://drive.google.com/drive/folders/1eNdHu45uXDHOF4u0J1sYiBLZYN55yytS?usp=share_link) from scGPT v2 was also used in the batch effect correction evaluation. Pre-training weights of scBERT can be found in [scBERT](https://github.com/TencentAILabHealthcare/scBERT). Pre-training weights of CellLM can be found in [cellLM](https://github.com/BioFM/OpenBioMed/tree/main). Pre-training weights of Geneformer can be found in [Geneformer](https://huggingface.co/ctheodoris/Geneformer). Pre-training weights of SCimilarity can be found in [SCimilarity](https://github.com/Genentech/scimilarity). Pre-training weights of UCE can be found in [UCE](https://github.com/snap-stanford/UCE). Pre-training weights of tGPT can be found in [tGPT](https://github.com/deeplearningplus/tGPT). Pre-training weights of CellPLM can be found in [CellPLM](https://github.com/OmicsML/CellPLM). 
 56 | 
 57 | scFoundation relies on the APIs or local sever for access, please refer [scFoundation](https://github.com/biomap-research/scFoundation) for details. Details of GeneCompas can be found in [GeneCompass](https://github.com/xCompass-AI/GeneCompass/tree/main)
 58 | 
 59 | # Benchmarking information
 60 | 
 61 | Please refer to different folders for the codes of scEval and metrics we used to evaluate single-cell LLMs under different tasks. In general, we list the tasks and corresponding metrics here:
 62 | 
 63 | | Tasks                                                 | Metrics                                  |
 64 | |-------------------------------------------------------|------------------------------------------|
 65 | | Batch Effect Correction, Multi-omics Data Integration |
 66 | | and Simulation                                        | [scIB](https://github.com/theislab/scib)                                     |
 67 | | Cell-type Annotation and Gene Function Prediction     | Accuracy, Precision, Recall and F1 score |
 68 | | Imputation                                            | [scIB](https://github.com/theislab/scib), Correlation                        |
 69 | | Perturbation Prediction                               | Correlation, Mean Squared Error                              |
 70 | | Gene Network Analysis                                 | Jaccard similarity                       |
 71 | 
 72 | The file 'sceval_lib.py' includes all of the metrics we used in this project.
 73 | 
 74 | To run the codes in different tasks, please use (we choose batch effect correction of scGPT as an example here):
 75 | 
 76 | ```
 77 | python sceval_batcheffect.py
 78 | ```
 79 | 
 80 | We recommend directly evaluating the methods based on their outputs (as .h5ad file), which can be easily performed based on the codes in **sceval_method.py**.
 81 | 
 82 | We offer demo datasets for batch effect correction and cell type annotation. Such datasets can be found [here](https://yaleedu-my.sharepoint.com/:f:/g/personal/tianyu_liu_yale_edu/Eiqs78qeqwBNiy6zoI_JDnABfz7e2w4Gpj0F4t4l5S-oCw?e=0xSnew).
 83 | 
 84 | To avoid using wandb, please set:
 85 | 
 86 | ```
 87 | os.environ["WANDB_MODE"] = "offline"
 88 | 
 89 | ```
 90 | 
 91 | We will upload our codes for benchmarking different foundation models soon.
 92 | 
 93 | # Devices
 94 | 
 95 | We recommend using sever to run benchmarked methods and scEval platform. To run single-cell Foundation Models, GPU cores (A100 or higher version) and 40+ GB memory are required. To run scEval (only the evaluation), 40+ GB memory is recommended.
 96 | 
 97 | # Results
 98 | 
 99 | We have an official website as the summary of our work. Please use this [link](https://sites.google.com/yale.edu/sceval) for access. 
100 | 
101 | # Contact
102 | 
103 | Please contact tianyu.liu@yale.edu if you have any questions about this project.
104 | 
105 | # Citation
106 | 
107 | ```
108 | @article{liu2023evaluating,
109 |   title={Evaluating the Utilities of Foundation Models in Single-cell Data Analysis},
110 |   author={Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu},
111 |   journal={bioRxiv},
112 |   pages={2023--09},
113 |   year={2023},
114 |   publisher={Cold Spring Harbor Laboratory}
115 | }
116 | ```


--------------------------------------------------------------------------------
/sceval_lib.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | import scib
  4 | import scanpy as sc
  5 | import scipy
  6 | import scipy.stats
  7 | from scgpt.utils import set_seed
  8 | from anndata import AnnData
  9 | from sklearn.metrics import classification_report
 10 | from typing import List, Tuple, Dict, Union, Optional
 11 | 
 12 | set_seed(0)
 13 | def eval_scib_metrics(
 14 |     adata: AnnData,
 15 |     batch_key: str = "batch",
 16 |     label_key: str = "celltype",
 17 |     emb_name: str = "X_scGPT",
 18 |     notes: Optional[str] = None,
 19 | ) -> Dict:
 20 |     results = scib.metrics.metrics(
 21 |         adata,
 22 |         adata_int=adata,
 23 |         batch_key=batch_key,
 24 |         label_key=label_key,
 25 |         embed=emb_name,
 26 |         isolated_labels_asw_=False,
 27 |         silhouette_=True,
 28 |         hvg_score_=False,
 29 |         graph_conn_=True,
 30 |         pcr_=True,
 31 |         isolated_labels_f1_=False,
 32 |         trajectory_=False,
 33 |         nmi_=True,  
 34 |         ari_=True, 
 35 |         cell_cycle_=False,
 36 |         kBET_=True,  
 37 |         ilisi_=False,
 38 |         clisi_=False,
 39 |     )
 40 | 
 41 |     result_dict = results[0].to_dict()
 42 | 
 43 |     result_dict["avg_bio"] = np.mean(
 44 |         [
 45 |             result_dict["NMI_cluster/label"],
 46 |             result_dict["ARI_cluster/label"],
 47 |             result_dict["ASW_label"],
 48 |         ]
 49 |     )
 50 | 
 51 |     # remove nan value in result_dict
 52 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 53 | 
 54 |     print(results)
 55 |     return result_dict
 56 | 
 57 | 
 58 | def eval_scib_metrics_onlybio(
 59 |     adata: AnnData,
 60 |     batch_key: str = "batch",
 61 |     label_key: str = "celltype",
 62 |     
 63 |     emb_name: str = "X_scGPT",
 64 |     notes: Optional[str] = None,
 65 | ) -> Dict:
 66 |     results = scib.metrics.metrics_onlybio(
 67 |         adata,
 68 |         adata_int=adata,
 69 |         batch_key=batch_key,
 70 |         label_key=label_key,
 71 |         embed=emb_name,
 72 |         isolated_labels_asw_=False,
 73 |         silhouette_=True,
 74 |         hvg_score_=False,
 75 |         graph_conn_=True,
 76 |         pcr_=True,
 77 |         isolated_labels_f1_=False,
 78 |         trajectory_=False,
 79 |         nmi_=True,  
 80 |         ari_=True,  
 81 |         cell_cycle_=False,
 82 |         kBET_=False,  
 83 |         ilisi_=False,
 84 |         clisi_=False,
 85 |     )
 86 | 
 87 |     result_dict = results[0].to_dict()
 88 |     result_dict["avg_bio"] = np.mean(
 89 |         [
 90 |             result_dict["NMI_cluster/label"],
 91 |             result_dict["ARI_cluster/label"],
 92 |             result_dict["ASW_label"],
 93 |         ]
 94 |     )
 95 | 
 96 |     # remove nan value in result_dict
 97 |     result_dict = {k: v for k, v in result_dict.items() if not np.isnan(v)}
 98 | 
 99 |     print(results)
100 |     return result_dict
101 | 
102 | def calculate_correlation_metric(y1, y2):
103 |     cor = 0.0
104 |     y1 = y1.float()
105 |     y2 = y2.float()
106 |     for id1, id2 in zip(y1, y2):
107 |         
108 |         cor_cal,_ = scipy.stats.pearsonr(id1,id2)
109 |         cor += cor_cal.item()
110 |     return cor
111 | 
112 | 
113 | class scEval(object):
114 | 
115 |     def __init__(self, adata):
116 |         self.label = 'scGPT'
117 |         self.adata = adata # adata is the output of the model you plan to benchmark.
118 |         self.pvalue = 0.005
119 | 
120 |     def evaluation_bec(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
121 |         results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
122 |         return results
123 |     
124 | 
125 |     def evaluation_cta_gfp(self, pred_label, true_label):
126 |         results = classification_report(true_label, pred_label, digits=4)
127 |         return results
128 |     
129 |     def evaluation_perturb_pred(self, pred_model, true_result): #assume the outputs are both in AnnData format. Rows are cells while columns are genes.
130 |         cor_total = calculate_correlation_metric(pred_model.X.T, true_result.X.T)
131 |         return {"correlation":cor_total / len(pred_model.X.T)}
132 |     
133 |     def evaluation_perturb_pred_gearsofficial(self, gears_model, pred_model ):
134 |         from gears.inference import evaluate, compute_metrics, deeper_analysis, non_dropout_analysis
135 |         test_res = evaluate(gears_model.dataloader['test_loader'], pred_model)
136 |         test_metrics, test_pert_res = compute_metrics(test_res)
137 |         return test_metrics
138 |     
139 |     def evaluation_imputation_scrna(self, batch_key = 'batch',label_key = 'celltype', emb_name = 'X_scGPT'):
140 |         results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
141 |         return results
142 |     
143 |     def evaluation_imputation_spatial(self, adata_sp):
144 |         adata_imp_new = self.adata[:, adata_sp.var_names]
145 |         cor_list = []
146 |         pval_list = []
147 |         for item in adata_sp.var_names:
148 |             adata1 = adata_sp[:,item]
149 |             adata2 = adata_imp_new[:,item]
150 |             cor, pval = scipy.stats.pearsonr(np.array(adata1.X.todense().T)[0], np.array(adata2.X.T)[0]) # for this step, please check the data form
151 |             cor_list.append(cor)
152 |             pval_list.append(pval)
153 | 
154 |         adata_imp_new.var['cor'] = cor_list 
155 |         adata_imp_new.var['pval'] = pval_list
156 | 
157 |         mean_cor = np.mean(adata_imp_new.var['cor'].values)
158 | 
159 |         avg_sig = np.sum(adata_imp_new.var['pval'].values<self.pvalue)/len(adata_imp_new.var['pval'].values)
160 |         return {"mean_cor":mean_cor, "avg_sign":avg_sig} 
161 |     
162 |     def evaluation_simulation(self, batch_key = 'batch',label_key = 'celltype', isbatch = True, emb_name = 'X_scGPT'):
163 | 
164 |         if isbatch:
165 |             results = eval_scib_metrics(self.adata,batch_key,label_key, emb_name)
166 |             return results 
167 |         else:
168 |             results = eval_scib_metrics_onlybio(self.adata,batch_key,label_key, emb_name)
169 |             return results             
170 | 
171 | 
172 | 
173 | 
174 | 


--------------------------------------------------------------------------------
/scib/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     from importlib import metadata
 3 | except ImportError:  # for Python<3.8
 4 |     import importlib_metadata as metadata
 5 | 
 6 | __version__ = metadata.version("scib")
 7 | 
 8 | from . import integration, metrics, preprocessing, utils
 9 | from ._package_tools import rename_func
10 | from .metrics import clustering
11 | 
12 | alias_func_map = {
13 |     "runScanorama": integration.scanorama,
14 |     "runTrVae": integration.trvae,
15 |     "runTrVaep": integration.trvaep,
16 |     "runScGen": integration.scgen,
17 |     "runScvi": integration.scvi,
18 |     "runScanvi": integration.scanvi,
19 |     "runMNN": integration.mnn,
20 |     "runBBKNN": integration.bbknn,
21 |     "runSaucie": integration.saucie,
22 |     "runCombat": integration.combat,
23 |     "runDESC": integration.desc,
24 |     "readConos": preprocessing.read_conos,
25 |     "readSeurat": preprocessing.read_seurat,
26 |     "saveSeurat": preprocessing.save_seurat,
27 | }
28 | 
29 | for alias, func in alias_func_map.items():
30 |     rename_func(func, alias)
31 | 
32 | pp = preprocessing
33 | ig = integration
34 | me = metrics
35 | cl = clustering
36 | 


--------------------------------------------------------------------------------
/scib/_package_tools.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import warnings
 3 | from functools import wraps
 4 | 
 5 | warnings.simplefilter("default")  # or 'always'
 6 | 
 7 | 
 8 | def wrap_func_naming(func, name):
 9 |     """
10 |     Decorator that adds a `DeprecationWarning` and a name to `func`.
11 |     """
12 | 
13 |     @wraps(func)
14 |     def wrapper(*args, **kwargs):
15 |         warnings.warn(
16 |             f"Mixed case function naming is deprecated for '{name}'. "
17 |             f"Please use '{func.__name__}' instead.",
18 |             DeprecationWarning,
19 |             stacklevel=2,
20 |         )
21 |         return func(*args, **kwargs)
22 | 
23 |     wrapper.__name__ = name
24 |     return wrapper
25 | 
26 | 
27 | def rename_func(function, new_name):
28 |     if callable(function):
29 |         function = wrap_func_naming(function, new_name)
30 |     setattr(inspect.getmodule(function), new_name, function)
31 | 


--------------------------------------------------------------------------------
/scib/exceptions.py:
--------------------------------------------------------------------------------
 1 | class OptionalDependencyNotInstalled(ModuleNotFoundError):
 2 |     def __init__(self, exception, module_name=None):
 3 |         if module_name is None:
 4 |             module_name = exception.name
 5 |         self.message = (
 6 |             f"\n'{module_name}' is an optional dependency and not installed by default. "
 7 |             f"Please make sure you install it manually."
 8 |         )
 9 |         super().__init__(self.message)
10 | 
11 | 
12 | class RLibraryNotFound(ModuleNotFoundError):
13 |     def __init__(self, exception):
14 |         self.message = f"\nproblem loading library: {exception}"
15 |         super().__init__(self.message)
16 | 


--------------------------------------------------------------------------------
/scib/knn_graph/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | 


--------------------------------------------------------------------------------
/scib/knn_graph/README.md:
--------------------------------------------------------------------------------
 1 | # knn-graph function written in C++
 2 | 
 3 | The function `knn_graph.cpp` was written by Thomas Neumann and adapted by Maren Büttner for graph-LISI.
 4 | Manual compilation:
 5 | 
 6 | ```
 7 | g++ -std=c++11 -O3 knn_graph.cpp -o knn_graph.o
 8 | ```
 9 | 
10 | A more modern version with `C++17` features is planned.
11 | 


--------------------------------------------------------------------------------
/scib/knn_graph/makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | GCC_FLAGS="-O3 -std=c++11"
 3 | CC="g++"
 4 | SRC_DIR = scIB/knn_graph
 5 | SRC = $(foreach sdir,$(SRC_DIR),$(wildcard $(sdir)/*.cpp))
 6 | OBJS = $(patsubst src/%.cpp,$(SRC_DIR)/%.o,$(SRC))
 7 | 
 8 | all:
 9 |    $CC $GCC_FLAGS ${SRC} -o ${OBJS}
10 | 


--------------------------------------------------------------------------------
/scib/metrics/__init__.py:
--------------------------------------------------------------------------------
 1 | # fmt: off
 2 | from .ari import ari
 3 | from .cell_cycle import cell_cycle
 4 | from .clustering import (cluster_optimal_resolution, get_resolutions,
 5 |                          opt_louvain)
 6 | from .graph_connectivity import graph_connectivity
 7 | from .highly_variable_genes import hvg_overlap
 8 | from .isolated_labels import (isolated_labels, isolated_labels_asw,
 9 |                               isolated_labels_f1)
10 | from .kbet import kBET
11 | from .lisi import clisi_graph, ilisi_graph, lisi_graph
12 | from .metrics import metrics, metrics_all, metrics_fast, metrics_slim
13 | from .nmi import nmi
14 | from .pcr import pc_regression, pcr, pcr_comparison
15 | from .silhouette import silhouette, silhouette_batch
16 | from .trajectory import trajectory_conservation
17 | 


--------------------------------------------------------------------------------
/scib/metrics/ari.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scipy.special
 4 | from scanpy._utils import deprecated_arg_names
 5 | from sklearn.metrics.cluster import adjusted_rand_score
 6 | 
 7 | from ..utils import check_adata, check_batch
 8 | 
 9 | 
10 | @deprecated_arg_names({"group1": "cluster_key", "group2": "label_key"})
11 | def ari(adata, cluster_key, label_key, implementation=None):
12 |     """Adjusted Rand Index
13 | 
14 |     The adjusted rand index is a chance-adjusted rand index, which evaluates the pair-wise accuracy of clustering vs.
15 |     ground truth label assignments.
16 |     The score ranges between 0 and 1 with larger values indicating better conservation of the data-driven cell identity
17 |     discovery after integration compared to annotated labels.
18 | 
19 |     :param adata: anndata object with cluster assignments in ``adata.obs[cluster_key]``
20 |     :param cluster_key: string of column in adata.obs containing cluster assignments
21 |     :param label_key: string of column in adata.obs containing labels
22 |     :param implementation: if set to 'sklearn', uses sklearn's implementation,
23 |         otherwise native implementation is taken
24 | 
25 |     This function can be applied to all integration output types.
26 |     The ``adata`` must contain cluster assignments that are based off the knn graph given or derived from the integration
27 |     method output.
28 |     For this metric you need to include all steps that are needed for clustering.
29 |     See :ref:`preprocessing` for more information on preprocessing.
30 | 
31 |     **Examples**
32 | 
33 |     .. code-block:: python
34 | 
35 |         # feature output
36 |         scib.pp.reduce_data(
37 |             adata, n_top_genes=2000, batch_key="batch", pca=True, neighbors=True
38 |         )
39 |         scib.me.cluster_optimal_resolution(adata, cluster_key="cluster", label_key="celltype")
40 |         scib.me.ari(adata, cluster_key="cluster", label_key="celltype")
41 | 
42 |         # embedding output
43 |         sc.pp.neighbors(adata, use_rep="X_emb")
44 |         scib.me.cluster_optimal_resolution(adata, cluster_key="cluster", label_key="celltype")
45 |         scib.me.ari(adata, cluster_key="cluster", label_key="celltype")
46 | 
47 |         # knn output
48 |         scib.me.cluster_optimal_resolution(adata, cluster_key="cluster", label_key="celltype")
49 |         scib.me.ari(adata, cluster_key="cluster", label_key="celltype")
50 | 
51 |     """
52 | 
53 |     check_adata(adata)
54 |     check_batch(cluster_key, adata.obs)
55 |     check_batch(label_key, adata.obs)
56 | 
57 |     cluster_key = adata.obs[cluster_key].to_numpy()
58 |     label_key = adata.obs[label_key].to_numpy()
59 | 
60 |     if len(cluster_key) != len(label_key):
61 |         raise ValueError(
62 |             f"different lengths in cluster_key ({len(cluster_key)}) and label_key ({len(label_key)})"
63 |         )
64 | 
65 |     if implementation == "sklearn":
66 |         return adjusted_rand_score(cluster_key, label_key)
67 | 
68 |     def binom_sum(x, k=2):
69 |         return scipy.special.binom(x, k).sum()
70 | 
71 |     n = len(cluster_key)
72 |     contingency = pd.crosstab(cluster_key, label_key)
73 | 
74 |     ai_sum = binom_sum(contingency.sum(axis=0))
75 |     bi_sum = binom_sum(contingency.sum(axis=1))
76 | 
77 |     index = binom_sum(np.ravel(contingency))
78 |     expected_index = ai_sum * bi_sum / binom_sum(n, 2)
79 |     max_index = 0.5 * (ai_sum + bi_sum)
80 | 
81 |     return (index - expected_index) / (max_index - expected_index)
82 | 


--------------------------------------------------------------------------------
/scib/metrics/graph_connectivity.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from scipy.sparse.csgraph import connected_components
 4 | 
 5 | 
 6 | def graph_connectivity(adata, label_key):
 7 |     """Graph Connectivity
 8 | 
 9 |     Quantify the connectivity of the subgraph per cell type label.
10 |     The final score is the average for all cell type labels :math:`C`, according to the equation:
11 | 
12 |     .. math::
13 | 
14 |         GC = \\frac {1} {|C|} \\sum_{c \\in C} \\frac {|{LCC(subgraph_c)}|} {|c|}
15 | 
16 |     where :math:`|LCC(subgraph_c)|` stands for all cells in the largest connected component and :math:`|c|` stands for all cells of
17 |     cell type :math:`c`.
18 | 
19 |     :param adata: integrated adata with computed neighborhood graph
20 |     :param label_key: name in adata.obs containing the cell identity labels
21 | 
22 |     This function can be applied to all integration output types.
23 |     The integrated object (``adata``) needs to have a kNN graph based on the integration output.
24 |     See :ref:`preprocessing` for more information on preprocessing.
25 | 
26 |     **Examples**
27 | 
28 |     .. code-block:: python
29 | 
30 |         # feature output
31 |         scib.pp.reduce_data(
32 |             adata, n_top_genes=2000, batch_key="batch", pca=True, neighbors=True
33 |         )
34 |         scib.me.graph_connectivity(adata, label_key="celltype")
35 | 
36 |         # embedding output
37 |         sc.pp.neighbors(adata, use_rep="X_emb")
38 |         scib.me.graph_connectivity(adata, label_key="celltype")
39 | 
40 |         # knn output
41 |         scib.me.graph_connectivity(adata, label_key="celltype")
42 | 
43 |     """
44 |     if "neighbors" not in adata.uns:
45 |         raise KeyError(
46 |             "Please compute the neighborhood graph before running this function!"
47 |         )
48 | 
49 |     clust_res = []
50 | 
51 |     for label in adata.obs[label_key].cat.categories:
52 |         adata_sub = adata[adata.obs[label_key].isin([label])]
53 |         _, labels = connected_components(
54 |             adata_sub.obsp["connectivities"], connection="strong"
55 |         )
56 |         tab = pd.value_counts(labels)
57 |         clust_res.append(tab.max() / sum(tab))
58 | 
59 |     return np.mean(clust_res)
60 | 


--------------------------------------------------------------------------------
/scib/metrics/highly_variable_genes.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scanpy as sc
 3 | from scanpy._utils import deprecated_arg_names
 4 | 
 5 | from ..utils import split_batches
 6 | 
 7 | 
 8 | def precompute_hvg_batch(adata, batch, features, n_hvg=500, save_hvg=False):
 9 |     """
10 |     Compute HVGs per batch
11 | 
12 |     :param adata: anndata object
13 |     :param batch: key in adata.obs
14 |     :param features: features to subset to
15 |     :param n_hvg: maximum number of HVGs to compute
16 |     :save_hvg: whether to add hvg per batch information to adata object
17 |     :return:
18 |         dictionary of batch to HVG list
19 |     """
20 |     adata_list = split_batches(adata, batch, hvg=features)
21 |     hvg_dir = {}
22 |     for i in adata_list:
23 |         sc.pp.filter_genes(i, min_cells=1)
24 |         n_hvg_tmp = np.minimum(n_hvg, int(0.5 * i.n_vars))
25 |         if n_hvg_tmp < n_hvg:
26 |             print(i.obs[batch][0] + " has less than the specified number of genes")
27 |             print("Number of genes: " + str(i.n_vars))
28 |         hvg = sc.pp.highly_variable_genes(
29 |             i, flavor="cell_ranger", n_top_genes=n_hvg_tmp, inplace=False
30 |         )
31 |         hvg_dir[i.obs[batch][0]] = i.var.index[hvg["highly_variable"]]
32 | 
33 |     if save_hvg:
34 |         adata.uns["hvg_before"] = hvg_dir
35 |     else:
36 |         return hvg_dir
37 | 
38 | 
39 | @deprecated_arg_names({"batch": "batch_key"})
40 | def hvg_overlap(adata_pre, adata_post, batch_key, n_hvg=500, verbose=False):
41 |     """Highly variable gene overlap
42 | 
43 |     Metric that computes the average percentage of overlapping highly variable genes per batch pre post integration.
44 | 
45 |     :param adata_pre: Unintegrated anndata object
46 |     :param adata_post: Integrated anndata object
47 |     :param batch_key: Batch variable in ``adata_post.obs``
48 |     :param n_hvg: Number of hvgs to compute per batch
49 |     :return:
50 |         Average percentage of overlapping highly variable genes
51 | 
52 |     The score can only be computed on feature spaces.
53 |     No preprocessing is needed, as the function will perform highly variable gene selection.
54 | 
55 |     **Example**
56 | 
57 |     .. code-block:: python
58 | 
59 |         # full feature output
60 |         scib.me.hvg_overlap(adata_unintegrated, adata, batch_key="batch")
61 |     """
62 |     hvg_post = adata_post.var_names
63 | 
64 |     adata_post_list = split_batches(adata_post, batch_key)
65 |     overlap = []
66 | 
67 |     hvg_pre_list = precompute_hvg_batch(adata_pre, batch_key, hvg_post, n_hvg=n_hvg)
68 | 
69 |     for ad_post in adata_post_list:  # range(len(adata_pre_list)):
70 |         # remove genes unexpressed (otherwise hvg might break)
71 |         sc.pp.filter_genes(ad_post, min_cells=1)
72 |         batch_var = ad_post.obs[batch_key][0]
73 |         n_hvg_tmp = len(hvg_pre_list[batch_var])
74 | 
75 |         if verbose:
76 |             print(n_hvg_tmp)
77 | 
78 |         tmp_pre = hvg_pre_list[batch_var]
79 |         hvg_post = sc.pp.highly_variable_genes(
80 |             ad_post, flavor="cell_ranger", n_top_genes=n_hvg_tmp, inplace=False
81 |         )
82 |         tmp_post = ad_post.var.index[hvg_post["highly_variable"]]
83 |         n_hvg_real = np.minimum(len(tmp_pre), len(tmp_post))
84 |         overlap.append((len(set(tmp_pre).intersection(set(tmp_post)))) / n_hvg_real)
85 |     return np.mean(overlap)
86 | 


--------------------------------------------------------------------------------
/scib/metrics/silhouette.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from scanpy._utils import deprecated_arg_names
  4 | from sklearn.metrics.cluster import silhouette_samples, silhouette_score
  5 | 
  6 | 
  7 | @deprecated_arg_names({"group_key": "label_key"})
  8 | def silhouette(adata, label_key, embed, metric="euclidean", scale=True):
  9 |     """Average silhouette width (ASW)
 10 | 
 11 |     Wrapper for sklearn silhouette function values range from [-1, 1] with
 12 | 
 13 |         * 1 indicates distinct, compact clusters
 14 |         * 0 indicates overlapping clusters
 15 |         * -1 indicates core-periphery (non-cluster) structure
 16 | 
 17 |     By default, the score is scaled between 0 and 1 (``scale=True``).
 18 | 
 19 |     :param label_key: key in adata.obs of cell labels
 20 |     :param embed: embedding key in adata.obsm, default: 'X_pca'
 21 |     :param metric: type of distance metric to use for the silhouette scores
 22 |     :param scale: default True, scale between 0 (worst) and 1 (best)
 23 | 
 24 |     The function requires an embedding to be stored in ``adata.obsm`` and can only be applied to feature and embedding
 25 |     integration outputs.
 26 |     Please note, that the metric cannot be used to evaluate kNN graph outputs.
 27 |     See :ref:`preprocessing` for more information on preprocessing.
 28 | 
 29 |     **Examples**
 30 | 
 31 |     .. code-block:: python
 32 | 
 33 |         # full feature output
 34 |         scib.pp.reduce_data(
 35 |             adata, n_top_genes=2000, batch_key="batch", pca=True, neighbors=False
 36 |         )
 37 |         scib.me.silhouette(adata, label_key="celltype", embed="X_pca")
 38 | 
 39 |         # embedding output
 40 |         scib.me.silhouette(adata, label_key="celltype", embed="X_emb")
 41 |     """
 42 |     if embed not in adata.obsm.keys():
 43 |         print(adata.obsm.keys())
 44 |         raise KeyError(f"{embed} not in obsm")
 45 |     asw = silhouette_score(
 46 |         X=adata.obsm[embed], labels=adata.obs[label_key], metric=metric
 47 |     )
 48 |     if scale:
 49 |         asw = (asw + 1) / 2
 50 |     return asw
 51 | 
 52 | 
 53 | @deprecated_arg_names({"group_key": "label_key"})
 54 | def silhouette_batch(
 55 |     adata,
 56 |     batch_key,
 57 |     label_key,
 58 |     embed,
 59 |     metric="euclidean",
 60 |     return_all=False,
 61 |     scale=True,
 62 |     verbose=True,
 63 | ):
 64 |     """Batch ASW
 65 | 
 66 |     Modified average silhouette width (ASW) of batch
 67 | 
 68 |     This metric measures the silhouette of a given batch.
 69 |     It assumes that a silhouette width close to 0 represents perfect overlap of the batches, thus the absolute value of
 70 |     the silhouette width is used to measure how well batches are mixed.
 71 |     For all cells :math:`i` of a cell type :math:`C_j`, the batch ASW of that cell type is:
 72 | 
 73 |     .. math::
 74 | 
 75 |         batch \\, ASW_j = \\frac{1}{|C_j|} \\sum_{i \\in C_j} |silhouette(i)|
 76 | 
 77 |     The final score is the average of the absolute silhouette widths computed per cell type :math:`M`.
 78 | 
 79 |     .. math::
 80 | 
 81 |         batch \\, ASW = \\frac{1}{|M|} \\sum_{i \\in M} batch \\, ASW_j
 82 | 
 83 |     For a scaled metric (which is the default), the absolute ASW per group is subtracted from 1 before averaging, so that
 84 |     0 indicates suboptimal label representation and 1 indicates optimal label representation.
 85 | 
 86 |     .. math::
 87 | 
 88 |         batch \\, ASW_j = \\frac{1}{|C_j|} \\sum_{i \\in C_j} 1 - |silhouette(i)|
 89 | 
 90 |     :param batch_key: batch labels to be compared against
 91 |     :param label_key: group labels to be subset by e.g. cell type
 92 |     :param embed: name of column in adata.obsm
 93 |     :param metric: see sklearn silhouette score
 94 |     :param scale: if True, scale between 0 and 1
 95 |     :param return_all: if True, return all silhouette scores and label means
 96 |         default False: return average width silhouette (ASW)
 97 |     :param verbose: print silhouette score per group
 98 |     :return:
 99 |         Batch ASW  (always)
100 |         Mean silhouette per group in pd.DataFrame (additionally, if return_all=True)
101 |         Absolute silhouette scores per group label (additionally, if return_all=True)
102 | 
103 |     The function requires an embedding to be stored in ``adata.obsm`` and can only be applied to feature and embedding
104 |     integration outputs.
105 |     Please note, that the metric cannot be used to evaluate kNN graph outputs.
106 |     See :ref:`preprocessing` for more information on preprocessing.
107 | 
108 |     **Examples**
109 | 
110 |     .. code-block:: python
111 | 
112 |         # feature output
113 |         scib.pp.reduce_data(
114 |             adata, n_top_genes=2000, batch_key="batch", pca=True, neighbors=False
115 |         )
116 |         scib.me.silhouette_batch(adata, batch_key="batch", label_key="celltype", embed="X_pca")
117 | 
118 |         # embedding output
119 |         scib.me.silhouette_batch(adata, batch_key="batch", label_key="celltype", embed="X_emb")
120 | 
121 |     """
122 |     if embed not in adata.obsm.keys():
123 |         print(adata.obsm.keys())
124 |         raise KeyError(f"{embed} not in obsm")
125 | 
126 |     sil_per_label = []
127 |     for group in adata.obs[label_key].unique():
128 |         adata_group = adata[adata.obs[label_key] == group]
129 |         n_batches = adata_group.obs[batch_key].nunique()
130 | 
131 |         if (n_batches == 1) or (n_batches == adata_group.shape[0]):
132 |             continue
133 | 
134 |         sil = silhouette_samples(
135 |             adata_group.obsm[embed], adata_group.obs[batch_key], metric=metric
136 |         )
137 | 
138 |         # take only absolute value
139 |         sil = [abs(i) for i in sil]
140 | 
141 |         if scale:
142 |             # scale s.t. highest number is optimal
143 |             sil = [1 - i for i in sil]
144 | 
145 |         sil_per_label.extend([(group, score) for score in sil])
146 | 
147 |     sil_df = pd.DataFrame.from_records(
148 |         sil_per_label, columns=["group", "silhouette_score"]
149 |     )
150 | 
151 |     if len(sil_per_label) == 0:
152 |         sil_means = np.nan
153 |         asw = np.nan
154 |     else:
155 |         sil_means = sil_df.groupby("group").mean()
156 |         asw = sil_means["silhouette_score"].mean()
157 | 
158 |     if verbose:
159 |         print(f"mean silhouette per group: {sil_means}")
160 | 
161 |     if return_all:
162 |         return asw, sil_means, sil_df
163 | 
164 |     return asw
165 | 


--------------------------------------------------------------------------------
/scib/metrics/trajectory.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import scanpy as sc
  4 | from scipy.sparse.csgraph import connected_components
  5 | 
  6 | from ..utils import check_batch
  7 | from .utils import RootCellError
  8 | 
  9 | 
 10 | def trajectory_conservation(
 11 |     adata_pre, adata_post, label_key, pseudotime_key="dpt_pseudotime", batch_key=None
 12 | ):
 13 |     """Trajectory conservation score
 14 | 
 15 |     Trajectory conservation is measured by  spearman’s rank correlation coefficient :math:`s`, between the pseudotime
 16 |     values before and after integration.
 17 |     The final score was scaled to a value between 0 and 1 using the equation
 18 | 
 19 |      .. math::
 20 | 
 21 |         trajectory \\, conservation = \\frac {s + 1} {2}
 22 | 
 23 |     :param adata_pre: unintegrated adata
 24 |     :param adata_post: integrated adata
 25 |     :param label_key: column in ``adata_pre.obs`` of the groups used to precompute the trajectory
 26 |     :param pseudotime_key: column in ``adata_pre.obs`` in which the pseudotime is saved in.
 27 |         Column can contain empty entries, the dataset will be subset to the cells with scores.
 28 |     :param batch_key: set to batch key if you want to compute the trajectory metric by batch. By default the batch
 29 |         information will be ignored (``batch_key=None``)
 30 | 
 31 |     This function requires pseudotime values in ``.obs`` of the unintegrated object (``adata_pre``) computed per batch
 32 |     and can be applied to all integration output types.
 33 |     The input trajectories should be curated manually as the quality of the metric depends on the quality of the metric
 34 |     depends on the quality of the annotation.
 35 |     The integrated object (``adata_post``) needs to have a kNN graph based on the integration output.
 36 |     See :ref:`preprocessing` for more information on preprocessing.
 37 | 
 38 |     **Examples**
 39 | 
 40 |     .. code-block:: python
 41 | 
 42 |         # feature output
 43 |         scib.pp.reduce_data(
 44 |             adata, n_top_genes=2000, batch_key="batch", pca=True, neighbors=True
 45 |         )
 46 |         scib.me.trajectory_conservation(adata_unintegrated, adata, label_key="cell_type")
 47 | 
 48 |         # embedding output
 49 |         sc.pp.neighbors(adata, use_rep="X_emb")
 50 |         scib.me.trajectory_conservation(adata_unintegrated, adata, label_key="celltype")
 51 | 
 52 |         # knn output
 53 |         scib.me.trajectory_conservation(adata_unintegrated, adata, label_key="celltype")
 54 | 
 55 |     """
 56 |     # subset to cells for which pseudotime has been computed
 57 |     cell_subset = adata_pre.obs.index[adata_pre.obs[pseudotime_key].notnull()]
 58 |     adata_pre_ti = adata_pre[cell_subset]
 59 |     adata_post_ti = adata_post[cell_subset]
 60 |     try:
 61 |         iroot, adata_post_ti2 = get_root(
 62 |             adata_pre_ti, adata_post_ti, label_key, pseudotime_key
 63 |         )
 64 |     except RootCellError:
 65 |         print("No root cell found, setting trajectory conservation metric to 0.")
 66 |         return 0  # failure to find root cell means no TI conservation
 67 | 
 68 |     adata_post_ti2.uns["iroot"] = iroot
 69 | 
 70 |     sc.tl.dpt(adata_post_ti2)  # stored in 'dpt_pseudotime'
 71 |     adata_post_ti2.obs.loc[
 72 |         adata_post_ti2.obs["dpt_pseudotime"] > 1, "dpt_pseudotime"
 73 |     ] = 0
 74 |     adata_post_ti.obs["dpt_pseudotime"] = 0
 75 |     adata_post_ti.obs["dpt_pseudotime"] = adata_post_ti2.obs["dpt_pseudotime"]
 76 |     adata_post_ti.obs["dpt_pseudotime"].fillna(0, inplace=True)
 77 | 
 78 |     if batch_key is None:
 79 |         pseudotime_before = adata_pre_ti.obs[pseudotime_key]
 80 |         pseudotime_after = adata_post_ti.obs["dpt_pseudotime"]
 81 |         correlation = pseudotime_before.corr(pseudotime_after, "spearman")
 82 |         return (correlation + 1) / 2  # scaled
 83 |     else:
 84 |         check_batch(batch_key, adata_pre.obs)
 85 |         check_batch(batch_key, adata_post.obs)
 86 | 
 87 |         # check if batches match
 88 |         if not np.array_equal(
 89 |             adata_post_ti.obs[batch_key], adata_pre_ti.obs[batch_key]
 90 |         ):
 91 |             raise ValueError(
 92 |                 "Batch columns do not match\n"
 93 |                 f"adata_post_ti.obs['batch']:\n {adata_post_ti.obs[batch_key]}\n"
 94 |                 f"adata_pre_ti.obs['batch']:\n {adata_pre_ti.obs[batch_key]}\n"
 95 |             )
 96 | 
 97 |         corr = pd.Series()
 98 |         for i in adata_pre_ti.obs[batch_key].unique():
 99 |             pseudotime_before = adata_pre_ti.obs[adata_pre_ti.obs[batch_key] == i][
100 |                 pseudotime_key
101 |             ]
102 |             pseudotime_after = adata_post_ti.obs[adata_post_ti.obs[batch_key] == i][
103 |                 "dpt_pseudotime"
104 |             ]
105 |             corr[i] = pseudotime_before.corr(pseudotime_after, "spearman")
106 | 
107 |         return (corr.mean() + 1) / 2  # scaled
108 | 
109 | 
110 | def get_root(adata_pre, adata_post, ct_key, pseudotime_key="dpt_pseudotime", dpt_dim=3):
111 |     """Determine root cell for integrated adata based on unintegrated adata
112 | 
113 |     :param adata_pre: unintegrated adata
114 |     :param adata_post: integrated adata
115 |     :param label_key: column in ``adata_pre.obs`` of the groups used to precompute the trajectory
116 |     :param pseudotime_key: column in ``adata_pre.obs`` in which the pseudotime is saved in.
117 |         Column can contain empty entries, the dataset will be subset to the cells with scores.
118 |     :param dpt_dim: number of diffmap dimensions used to determine root
119 |     """
120 |     n_components, adata_post.obs["neighborhood"] = connected_components(
121 |         csgraph=adata_post.obsp["connectivities"], directed=False, return_labels=True
122 |     )
123 | 
124 |     start_clust = adata_pre.obs.groupby([ct_key]).mean()[pseudotime_key].idxmin()
125 |     min_dpt = adata_pre.obs[adata_pre.obs[ct_key] == start_clust].index
126 |     which_max_neigh = (
127 |         adata_post.obs["neighborhood"]
128 |         == adata_post.obs["neighborhood"].value_counts().idxmax()
129 |     )
130 |     min_dpt = [
131 |         value for value in min_dpt if value in adata_post.obs[which_max_neigh].index
132 |     ]
133 | 
134 |     adata_post_ti = adata_post[which_max_neigh]
135 | 
136 |     min_dpt = [adata_post_ti.obs_names.get_loc(i) for i in min_dpt]
137 | 
138 |     # compute Diffmap for adata_post
139 |     sc.tl.diffmap(adata_post_ti)
140 | 
141 |     # determine most extreme cell in adata_post Diffmap
142 |     min_dpt_cell = np.zeros(len(min_dpt))
143 |     for dim in np.arange(dpt_dim):
144 | 
145 |         diffmap_mean = adata_post_ti.obsm["X_diffmap"][:, dim].mean()
146 |         diffmap_min_dpt = adata_post_ti.obsm["X_diffmap"][min_dpt, dim]
147 | 
148 |         # count opt cell
149 |         if len(diffmap_min_dpt) == 0:
150 |             raise RootCellError("No root cell in largest component")
151 | 
152 |         # choose optimum function
153 |         if len(diffmap_min_dpt) > 0 and diffmap_min_dpt.mean() < diffmap_mean:
154 |             opt = np.argmin
155 |         else:
156 |             opt = np.argmax
157 | 
158 |         min_dpt_cell[opt(diffmap_min_dpt)] += 1
159 | 
160 |     # root cell is cell with max vote
161 |     return min_dpt[np.argmax(min_dpt_cell)], adata_post_ti
162 | 


--------------------------------------------------------------------------------
/scib/metrics/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from scipy import sparse
  4 | 
  5 | # Errors
  6 | 
  7 | 
  8 | class RootCellError(Exception):
  9 |     def __init__(self, message):
 10 |         self.message = message
 11 | 
 12 | 
 13 | class NeighborsError(Exception):
 14 |     def __init__(self, message):
 15 |         self.message = message
 16 | 
 17 | 
 18 | # Diffusion
 19 | 
 20 | 
 21 | def diffusion_conn(adata, min_k=50, copy=True, max_iterations=26):
 22 |     """
 23 |     Diffusion for connectivites matrix extension
 24 |     This function performs graph diffusion on the connectivities matrix until a
 25 |     minimum number `min_k` of entries per row are non-zero.
 26 | 
 27 |     Note:
 28 |     Due to self-loops min_k-1 non-zero connectivies entries is actually the stopping
 29 |     criterion. This is equivalent to `sc.pp.neighbors`.
 30 | 
 31 |     Returns:
 32 |        The diffusion-enhanced connectivities matrix of a copy of the AnnData object
 33 |        with the diffusion-enhanced connectivities matrix is in
 34 |        `adata.uns["neighbors"]["conectivities"]`
 35 |     """
 36 |     if "neighbors" not in adata.uns:
 37 |         raise ValueError(
 38 |             "`neighbors` not in adata object. " "Please compute a neighbourhood graph!"
 39 |         )
 40 | 
 41 |     if "connectivities" not in adata.obsp:
 42 |         raise ValueError(
 43 |             "`connectivities` not in `adata.obsp`. "
 44 |             "Please pass an object with connectivities computed!"
 45 |         )
 46 | 
 47 |     T = adata.obsp["connectivities"]
 48 | 
 49 |     # Normalize T with max row sum
 50 |     # Note: This keeps the matrix symmetric and ensures |M| doesn't keep growing
 51 |     T = sparse.diags(1 / np.array([T.sum(1).max()] * T.shape[0])) * T
 52 | 
 53 |     M = T
 54 | 
 55 |     # Check for disconnected component
 56 |     n_comp, labs = sparse.csgraph.connected_components(
 57 |         adata.obsp["connectivities"], connection="strong"
 58 |     )
 59 | 
 60 |     if n_comp > 1:
 61 |         tab = pd.value_counts(labs)
 62 |         small_comps = tab.index[tab < min_k]
 63 |         large_comp_mask = np.array(~pd.Series(labs).isin(small_comps))
 64 |     else:
 65 |         large_comp_mask = np.array([True] * M.shape[0])
 66 | 
 67 |     T_agg = T
 68 |     i = 2
 69 |     while ((M[large_comp_mask, :][:, large_comp_mask] > 0).sum(1).min() < min_k) and (
 70 |         i < max_iterations
 71 |     ):
 72 |         print(f"Adding diffusion to step {i}")
 73 |         T_agg *= T
 74 |         M += T_agg
 75 |         i += 1
 76 | 
 77 |     if (M[large_comp_mask, :][:, large_comp_mask] > 0).sum(1).min() < min_k:
 78 |         raise ValueError(
 79 |             "could not create diffusion connectivities matrix"
 80 |             f"with at least {min_k} non-zero entries in"
 81 |             f"{max_iterations} iterations.\n Please increase the"
 82 |             "value of max_iterations or reduce k_min.\n"
 83 |         )
 84 | 
 85 |     M.setdiag(0)
 86 | 
 87 |     if copy:
 88 |         adata_tmp = adata.copy()
 89 |         adata_tmp.uns["neighbors"].update({"diffusion_connectivities": M})
 90 |         return adata_tmp
 91 | 
 92 |     else:
 93 |         return M
 94 | 
 95 | 
 96 | def diffusion_nn(adata, k, max_iterations=26):
 97 |     """
 98 |     Diffusion neighbourhood score
 99 |     This function generates a nearest neighbour list from a connectivities matrix
100 |     as supplied by BBKNN or Conos. This allows us to select a consistent number
101 |     of nearest neighbours across all methods.
102 | 
103 |     Return:
104 |        `k_indices` a numpy.ndarray of the indices of the k-nearest neighbors.
105 |     """
106 |     if "neighbors" not in adata.uns:
107 |         raise ValueError(
108 |             "`neighbors` not in adata object. " "Please compute a neighbourhood graph!"
109 |         )
110 | 
111 |     if "connectivities" not in adata.obsp:
112 |         raise ValueError(
113 |             "`connectivities` not in `adata.obsp`. "
114 |             "Please pass an object with connectivities computed!"
115 |         )
116 | 
117 |     T = adata.obsp["connectivities"]
118 | 
119 |     # Row-normalize T
120 |     T = sparse.diags(1 / T.sum(1).A.ravel()) * T
121 | 
122 |     T_agg = T**3
123 |     M = T + T**2 + T_agg
124 |     i = 4
125 | 
126 |     while ((M > 0).sum(1).min() < (k + 1)) and (i < max_iterations):
127 |         # note: k+1 is used as diag is non-zero (self-loops)
128 |         print(f"Adding diffusion to step {i}")
129 |         T_agg *= T
130 |         M += T_agg
131 |         i += 1
132 | 
133 |     if (M > 0).sum(1).min() < (k + 1):
134 |         raise NeighborsError(
135 |             f"could not find {k} nearest neighbors in {max_iterations}"
136 |             "diffusion steps.\n Please increase max_iterations or reduce"
137 |             " k.\n"
138 |         )
139 | 
140 |     M.setdiag(0)
141 |     k_indices = np.argpartition(M.A, -k, axis=1)[:, -k:]
142 | 
143 |     return k_indices
144 | 
145 | 
146 | # Not used
147 | 
148 | 
149 | def get_hvg_indices(adata, verbose=True):
150 |     if "highly_variable" not in adata.var.columns:
151 |         if verbose:
152 |             print(
153 |                 f"No highly variable genes computed, continuing with full matrix {adata.shape}"
154 |             )
155 |         return np.array(range(adata.n_vars))
156 |     return np.where(adata.var["highly_variable"] is True)[0]
157 | 
158 | 
159 | def select_hvg(adata, select=True):
160 |     if select and "highly_variable" in adata.var:
161 |         return adata[:, adata.var["highly_variable"]].copy()
162 |     else:
163 |         return adata
164 | 


--------------------------------------------------------------------------------
/scib/resources/g2m_genes_tirosh.txt:
--------------------------------------------------------------------------------
 1 | Hmgb2
 2 | Cdk1
 3 | Nusap1
 4 | Ube2c
 5 | Birc5
 6 | Tpx2
 7 | Top2a
 8 | Ndc80
 9 | Cks2
10 | Nuf2
11 | Cks1b
12 | Mki67
13 | Tmpo
14 | Cenpf
15 | Tacc3
16 | Fam64a
17 | Smc4
18 | Ccnb2
19 | Ckap2l
20 | Ckap2
21 | Aurkb
22 | Bub1
23 | Kif11
24 | Anp32e
25 | Tubb4b
26 | Gtse1
27 | Kif20b
28 | Hjurp
29 | Cdca3
30 | Hn1
31 | Cdc20
32 | Ttk
33 | Cdc25c
34 | Kif2c
35 | Rangap1
36 | Ncapd2
37 | Dlgap5
38 | Cdca2
39 | Cdca8
40 | Ect2
41 | Kif23
42 | Hmmr
43 | Aurka
44 | Psrc1
45 | Anln
46 | Lbr
47 | Ckap5
48 | Cenpe
49 | Ctcf
50 | Nek2
51 | G2e3
52 | Gas2l3
53 | Cbx5
54 | Cenpa
55 | 


--------------------------------------------------------------------------------
/scib/resources/g2m_genes_tirosh_hm.txt:
--------------------------------------------------------------------------------
 1 | HMGB2
 2 | CDK1
 3 | NUSAP1
 4 | UBE2C
 5 | BIRC5
 6 | TPX2
 7 | TOP2A
 8 | NDC80
 9 | CKS2
10 | NUF2
11 | CKS1B
12 | MKI67
13 | TMPO
14 | CENPF
15 | TACC3
16 | FAM64A
17 | SMC4
18 | CCNB2
19 | CKAP2L
20 | CKAP2
21 | AURKB
22 | BUB1
23 | KIF11
24 | ANP32E
25 | TUBB4B
26 | GTSE1
27 | KIF20B
28 | HJURP
29 | CDCA3
30 | HN1
31 | CDC20
32 | TTK
33 | CDC25C
34 | KIF2C
35 | RANGAP1
36 | NCAPD2
37 | DLGAP5
38 | CDCA2
39 | CDCA8
40 | ECT2
41 | KIF23
42 | HMMR
43 | AURKA
44 | PSRC1
45 | ANLN
46 | LBR
47 | CKAP5
48 | CENPE
49 | CTCF
50 | NEK2
51 | G2E3
52 | GAS2L3
53 | CBX5
54 | CENPA
55 | 


--------------------------------------------------------------------------------
/scib/resources/s_genes_tirosh.txt:
--------------------------------------------------------------------------------
 1 | Mcm5
 2 | Pcna
 3 | Tyms
 4 | Fen1
 5 | Mcm2
 6 | Mcm4
 7 | Rrm1
 8 | Ung
 9 | Gins2
10 | Mcm6
11 | Cdca7
12 | Dtl
13 | Prim1
14 | Uhrf1
15 | Mlf1ip
16 | Hells
17 | Rfc2
18 | Rpa2
19 | Nasp
20 | Rad51ap1
21 | Gmnn
22 | Wdr76
23 | Slbp
24 | Ccne2
25 | Ubr7
26 | Pold3
27 | Msh2
28 | Atad2
29 | Rad51
30 | Rrm2
31 | Cdc45
32 | Cdc6
33 | Exo1
34 | Tipin
35 | Dscc1
36 | Blm
37 | Casp8ap2
38 | Usp1
39 | Clspn
40 | Pola1
41 | Chaf1b
42 | Brip1
43 | E2f8
44 | 


--------------------------------------------------------------------------------
/scib/resources/s_genes_tirosh_hm.txt:
--------------------------------------------------------------------------------
 1 | MCM5
 2 | PCNA
 3 | TYMS
 4 | FEN1
 5 | MCM2
 6 | MCM4
 7 | RRM1
 8 | UNG
 9 | GINS2
10 | MCM6
11 | CDCA7
12 | DTL
13 | PRIM1
14 | UHRF1
15 | MLF1IP
16 | HELLS
17 | RFC2
18 | RPA2
19 | NASP
20 | RAD51AP1
21 | GMNN
22 | WDR76
23 | SLBP
24 | CCNE2
25 | UBR7
26 | POLD3
27 | MSH2
28 | ATAD2
29 | RAD51
30 | RRM2
31 | CDC45
32 | CDC6
33 | EXO1
34 | TIPIN
35 | DSCC1
36 | BLM
37 | CASP8AP2
38 | USP1
39 | CLSPN
40 | POLA1
41 | CHAF1B
42 | BRIP1
43 | E2F8
44 | 


--------------------------------------------------------------------------------
/scib/trajectory_inference.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | import scanpy as sc
 4 | 
 5 | from . import utils
 6 | 
 7 | 
 8 | def paga(adata, groups="louvain"):
 9 |     """ """
10 |     utils.check_adata(adata)
11 | 
12 |     sc.pp.neighbors(adata)
13 |     sc.tl.paga(adata, groups=groups)
14 |     _ = sc.pl.paga_compare(adata, show=False)
15 | 
16 |     fig1, ax1 = plt.subplots()
17 |     sc.pl.umap(adata, size=40, ax=ax1, show=False)
18 |     sc.pl.paga(
19 |         adata,
20 |         pos=adata.uns["paga"]["pos"],
21 |         show=False,
22 |         node_size_scale=10,
23 |         node_size_power=1,
24 |         ax=ax1,
25 |         text_kwds={"alpha": 0},
26 |     )
27 |     plt.show()
28 | 
29 | 
30 | def dpt(adata, group, root, opt="min", comp=0):
31 |     utils.check_adata()
32 | 
33 |     # TODO compute diffmap before
34 | 
35 |     # get root
36 |     stem_mask = np.isin(adata.obs[group], root)
37 |     if opt == "min":
38 |         opt_stem_id = np.argmin(adata.obsm["X_diffmap"][stem_mask, comp])
39 |     elif opt == "max":
40 |         opt_stem_id = np.argmax(adata.obsm["X_diffmap"][stem_mask, comp])
41 |     else:
42 |         raise ("invalid optimum", opt)
43 |     root_id = np.arange(len(stem_mask))[stem_mask][opt_stem_id]
44 |     adata.uns["iroot"] = root_id
45 |     # compute pseudotime
46 |     sc.tl.dpt(adata)
47 | 


--------------------------------------------------------------------------------
/scib/utils.py:
--------------------------------------------------------------------------------
 1 | import anndata
 2 | 
 3 | 
 4 | # checker functions for data sanity
 5 | def check_adata(adata):
 6 |     if type(adata) is not anndata.AnnData:
 7 |         raise TypeError("Input is not a valid AnnData object")
 8 | 
 9 | 
10 | def check_batch(batch, obs, verbose=False):
11 |     if batch not in obs:
12 |         raise ValueError(f"column {batch} is not in obs")
13 |     elif verbose:
14 |         print(f"Object contains {obs[batch].nunique()} batches.")
15 | 
16 | 
17 | def check_hvg(hvg, adata_var):
18 |     if type(hvg) is not list:
19 |         raise TypeError("HVG list is not a list")
20 |     else:
21 |         if not all(i in adata_var.index for i in hvg):
22 |             raise ValueError("Not all HVGs are in the adata object")
23 | 
24 | 
25 | def check_sanity(adata, batch, hvg):
26 |     check_adata(adata)
27 |     check_batch(batch, adata.obs)
28 |     if hvg is not None:
29 |         check_hvg(hvg, adata.var)
30 | 
31 | 
32 | def split_batches(adata, batch, hvg=None, return_categories=False):
33 |     """Split batches and preserve category information
34 | 
35 |     :param adata:
36 |     :param batch: name of column in ``adata.obs``. The data type of the column must be of ``Category``.
37 |     :param hvg: list of highly variable genes
38 |     :param return_categories: whether to return the categories object of ``batch``
39 |     """
40 |     split = []
41 |     batch_categories = adata.obs[batch].cat.categories
42 |     if hvg is not None:
43 |         adata = adata[:, hvg]
44 |     for i in batch_categories:
45 |         split.append(adata[adata.obs[batch] == i].copy())
46 |     if return_categories:
47 |         return split, batch_categories
48 |     return split
49 | 
50 | 
51 | def merge_adata(*adata_list, **kwargs):
52 |     """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns
53 | 
54 |     :param adata_list: ``anndata`` objects to be concatenated
55 |     :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate``
56 |     """
57 | 
58 |     if len(adata_list) == 1:
59 |         return adata_list[0]
60 | 
61 |     # Make sure that adatas do not contain duplicate columns
62 |     for _adata in adata_list:
63 |         for attr in ("obs", "var"):
64 |             df = getattr(_adata, attr)
65 |             dup_mask = df.columns.duplicated()
66 |             if dup_mask.any():
67 |                 print(
68 |                     f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`."
69 |                 )
70 |                 setattr(_adata, attr, df.loc[:, ~dup_mask])
71 | 
72 |     return anndata.AnnData.concatenate(*adata_list, **kwargs)
73 | 
74 | 
75 | def todense(adata):
76 |     import scipy
77 | 
78 |     if isinstance(adata.X, scipy.sparse.csr_matrix):
79 |         adata.X = adata.X.todense()
80 | 


--------------------------------------------------------------------------------