├── tissue
│   ├── SpaGE
│   │   ├── __init__.py
│   │   ├── dimensionality_reduction.py
│   │   ├── main.py
│   │   └── principal_vectors.py
│   ├── __init__.py
│   ├── utils.py
│   ├── experiments.py
│   ├── downstream.py
│   └── main.py
├── pipeline.png
├── .gitignore
├── dist
│   ├── tissue-0.0.1.tar.gz
│   ├── tissue_sc-0.0.1.tar.gz
│   ├── tissue_sc-0.0.2.tar.gz
│   ├── tissue_sc-1.0.0.tar.gz
│   ├── tissue-0.0.1-py2.py3-none-any.whl
│   ├── tissue_sc-0.0.1-py2.py3-none-any.whl
│   ├── tissue_sc-0.0.2-py2.py3-none-any.whl
│   └── tissue_sc-1.0.0-py2.py3-none-any.whl
├── README_files
│   ├── README_13_0.png
│   ├── README_22_0.png
│   ├── README_24_0.png
│   ├── README_25_0.png
│   ├── README_26_0.png
│   ├── README_28_1.png
│   ├── README_29_1.png
│   ├── README_51_0.png
│   ├── README_52_0.png
│   ├── README_54_0.png
│   ├── README_55_0.png
│   ├── README_58_0.png
│   ├── README_60_0.png
│   ├── README_62_0.png
│   └── README_63_0.png
├── requirements.txt
├── pyproject.toml
├── LICENSE
├── test.py
└── README.md

--------------------------------------------------------------------------------
/tissue/SpaGE/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/pipeline.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints/
tissue/__pycache__/
tissue/SpaGE/__pycache__/
README.ipynb
--------------------------------------------------------------------------------
/dist/tissue-0.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue-0.0.1.tar.gz
--------------------------------------------------------------------------------
/README_files/README_13_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_13_0.png
--------------------------------------------------------------------------------
/README_files/README_22_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_22_0.png
--------------------------------------------------------------------------------
/README_files/README_24_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_24_0.png
--------------------------------------------------------------------------------
/README_files/README_25_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_25_0.png
--------------------------------------------------------------------------------
/README_files/README_26_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_26_0.png
--------------------------------------------------------------------------------
/README_files/README_28_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_28_1.png
--------------------------------------------------------------------------------
/README_files/README_29_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_29_1.png
--------------------------------------------------------------------------------
/README_files/README_51_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_51_0.png
--------------------------------------------------------------------------------
/README_files/README_52_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_52_0.png
--------------------------------------------------------------------------------
/README_files/README_54_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_54_0.png
--------------------------------------------------------------------------------
/README_files/README_55_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_55_0.png
--------------------------------------------------------------------------------
/README_files/README_58_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_58_0.png
--------------------------------------------------------------------------------
/README_files/README_60_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_60_0.png
--------------------------------------------------------------------------------
/README_files/README_62_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_62_0.png
--------------------------------------------------------------------------------
/README_files/README_63_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_63_0.png
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.1.tar.gz
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.2.tar.gz
--------------------------------------------------------------------------------
/dist/tissue_sc-1.0.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-1.0.0.tar.gz
--------------------------------------------------------------------------------
/dist/tissue-0.0.1-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue-0.0.1-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.1-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.1-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.2-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.2-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/dist/tissue_sc-1.0.0-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-1.0.0-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
squidpy==1.2.3
#wpca==0.1
#tangram-sc==1.0.3
#harmonypy==0.0.6
#scvi-tools==0.19.0
#spatialde==1.1.3
--------------------------------------------------------------------------------
/tissue/__init__.py:
--------------------------------------------------------------------------------
'''TISSUE (Transcript Imputation with Spatial Single-cell Uncertainty Estimation) provides tools for estimating well-calibrated uncertainty measures for gene expression predictions in single-cell spatial transcriptomics datasets, and for utilizing these uncertainties in downstream analyses.'''

__version__ = "1.0.1"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[project]
name = "tissue-sc"
authors = [{name = "Eric Sun", email = "edsun97@gmail.com"}]
readme = "README.md"
license = {file = "LICENSE"}
classifiers = ["License :: OSI Approved :: MIT License"]
dynamic = ["version", "description"]
dependencies = [
    "squidpy>=1.2.3"
]

[project.urls]
Home = "https://github.com/sunericd/tissue"

[tool.flit.module]
name = "tissue"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2022 Eric David Sun

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/tissue/SpaGE/dimensionality_reduction.py:
--------------------------------------------------------------------------------
""" Dimensionality Reduction
@author: Soufiane Mourragui
This module extracts the domain-specific factors from a high-dimensional omics
dataset. Several methods are implemented here and can be called directly by
string name from the main method. All the methods use the scikit-learn
implementation.
Notes
-------
-

References
-------
[1] Pedregosa, Fabian, et al. (2011) Scikit-learn: Machine learning in Python.
Journal of Machine Learning Research
"""

import numpy as np
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, NMF, SparsePCA
from sklearn.cross_decomposition import PLSRegression


def process_dim_reduction(method='pca', n_dim=10):
    """
    Default linear dimensionality reduction method. For each method, return a
    BaseEstimator instance corresponding to the method given as input.
    Parameters
    -------
    method: str, default to 'pca'
        Method used for dimensionality reduction.
        Implemented: 'pca', 'ica', 'fa' (Factor Analysis),
        'nmf' (Non-negative matrix factorisation), 'sparsepca' (Sparse PCA),
        'pls' (PLS regression).

    n_dim: int, default to 10
        Number of domain-specific factors to compute.
    Return values
    -------
    Estimator, i.e. BaseEstimator instance
    """

    if method.lower() == 'pca':
        clf = PCA(n_components=n_dim)

    elif method.lower() == 'ica':
        print('ICA')
        clf = FastICA(n_components=n_dim)

    elif method.lower() == 'fa':
        clf = FactorAnalysis(n_components=n_dim)

    elif method.lower() == 'nmf':
        clf = NMF(n_components=n_dim)

    elif method.lower() == 'sparsepca':
        clf = SparsePCA(n_components=n_dim, alpha=10., tol=1e-4, verbose=10, n_jobs=1)

    elif method.lower() == 'pls':
        clf = PLS(n_components=n_dim)

    else:
        raise NameError('%s is not an implemented method'%(method))

    return clf


class PLS():
    """
    Wrap PLS to make it compliant with the other dimensionality
    reduction methods (simple class rewriting).
    """
    def __init__(self, n_components=10):
        self.clf = PLSRegression(n_components)

    def get_components_(self):
        return self.clf.x_weights_.transpose()

    def set_components_(self, x):
        pass

    components_ = property(get_components_, set_components_)

    def fit(self, X, y):
        self.clf.fit(X,y)
        return self

    def transform(self, X):
        return self.clf.transform(X)

    def predict(self, X):
        return self.clf.predict(X)
--------------------------------------------------------------------------------
/tissue/SpaGE/main.py:
--------------------------------------------------------------------------------
""" SpaGE [1]
@author: Tamim Abdelaal
This function integrates two single-cell datasets, spatial and scRNA-seq, and
enhances the spatial data by predicting the expression of the spatially
unmeasured genes from the scRNA-seq data.
The integration is performed using the domain adaptation method PRECISE [2]

References
-------
[1] Abdelaal T., Mourragui S., Mahfouz A., Reinders M.J.T. (2020)
SpaGE: Spatial Gene Enhancement using scRNA-seq
[2] Mourragui S., Loog M., Reinders M.J.T., Wessels L.F.A. (2019)
PRECISE: A domain adaptation approach to transfer predictors of drug response
from pre-clinical models to tumors
"""

import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.neighbors import NearestNeighbors
#from tissue.SpaGE.principal_vectors import PVComputation
from .principal_vectors import PVComputation

def SpaGE(Spatial_data,RNA_data,n_pv,genes_to_predict=None):
    """
    @author: Tamim Abdelaal
    This function integrates two single-cell datasets, spatial and scRNA-seq,
    and enhances the spatial data by predicting the expression of the spatially
    unmeasured genes from the scRNA-seq data.

    Parameters
    -------
    Spatial_data : Dataframe
        Normalized Spatial data matrix (cells X genes).
    RNA_data : Dataframe
        Normalized scRNA-seq data matrix (cells X genes).
    n_pv : int
        Number of principal vectors to find from the independently computed
        principal components, and used to align both datasets. This should
        be <= number of shared genes between the two datasets.
    genes_to_predict : str array
        List of gene names missing from the spatial data, to be predicted
        from the scRNA-seq data. Default is the set of genes (columns) present
        in the scRNA-seq data but not in the spatial data.

    Return
    -------
    Imp_Genes: Dataframe
        Matrix containing the predicted gene expressions for the spatial
        cells. Rows are equal to the number of spatial data rows (cells),
        and columns are equal to genes_to_predict.
    """

    if genes_to_predict is None:
        genes_to_predict = np.setdiff1d(RNA_data.columns,Spatial_data.columns)

    # z-score both datasets gene-wise
    RNA_data_scaled = pd.DataFrame(data=st.zscore(RNA_data,axis=0),
                                   index = RNA_data.index,columns=RNA_data.columns)
    Spatial_data_scaled = pd.DataFrame(data=st.zscore(Spatial_data,axis=0),
                                       index = Spatial_data.index,columns=Spatial_data.columns)
    Common_data = RNA_data_scaled[np.intersect1d(Spatial_data_scaled.columns,RNA_data_scaled.columns)]

    Imp_Genes = pd.DataFrame(np.zeros((Spatial_data.shape[0],len(genes_to_predict))),
                             columns=genes_to_predict)

    # align the two datasets with principal vectors (PRECISE)
    pv_Spatial_RNA = PVComputation(
        n_factors = n_pv,
        n_pv = n_pv,
        dim_reduction = 'pca',
        dim_reduction_target = 'pca'
    )

    pv_Spatial_RNA.fit(Common_data,Spatial_data_scaled[Common_data.columns])

    S = pv_Spatial_RNA.source_components_.T

    # keep only well-aligned principal vectors (cosine similarity > 0.3)
    Effective_n_pv = sum(np.diag(pv_Spatial_RNA.cosine_similarity_matrix_) > 0.3)
    S = S[:,0:Effective_n_pv]

    Common_data_projected = Common_data.dot(S)
    Spatial_data_projected = Spatial_data_scaled[Common_data.columns].dot(S)

    # predict each spatial cell's expression as a weighted average of its
    # 50 nearest scRNA-seq neighbors in the aligned space
    nbrs = NearestNeighbors(n_neighbors=50, algorithm='auto',
                            metric = 'cosine').fit(Common_data_projected)
    distances, indices = nbrs.kneighbors(Spatial_data_projected)

    for j in range(0,Spatial_data.shape[0]):

        weights = 1-(distances[j,:][distances[j,:]<1])/(np.sum(distances[j,:][distances[j,:]<1]))
        weights = weights/(len(weights)-1)
        Imp_Genes.iloc[j,:] = np.dot(weights,RNA_data[genes_to_predict].iloc[indices[j,:][distances[j,:] < 1]])

    return Imp_Genes
--------------------------------------------------------------------------------
/tissue/utils.py:
--------------------------------------------------------------------------------
# Contains utility functions for TISSUE

import numpy as np
import pandas as pd
import anndata as ad
import os


def large_save(adata, dirpath):
    '''
    Saves an AnnData object by writing each obsm value to {key}.csv as a pandas dataframe,
    writing each uns value that is a dataframe to uns/{key}.csv as a pandas dataframe,
    and then saving the AnnData object with obsm removed.

    Parameters
    ----------
    adata [AnnData] - AnnData object to save

    dirpath [str] - path to directory where the h5ad and csv files will be saved; will be created if it does not exist
        adata will be saved as {dirpath}/adata.h5ad
        obsm will be saved as {dirpath}/{key}.csv

    Returns
    -------
    Saves the AnnData object in "large" folder format
    '''
    # check if dirpath exists; else create it
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    # extract the obsm metadata and save it as separate csv files
    for key, value in adata.obsm.items():
        df = pd.DataFrame(value)
        df.to_csv(os.path.join(dirpath, f"{key}.csv"), index=False)

    # remove the obsm metadata from the anndata object
    adatac = adata.copy()
    adatac.obsm = {}

    # extract the uns metadata and save it as separate csv files
    del_keys = []
    for key, value in adatac.uns.items():
        if isinstance(value, pd.DataFrame):
            if not os.path.exists(os.path.join(dirpath,"uns")):
                os.makedirs(os.path.join(dirpath,"uns"))
            df = pd.DataFrame(value)
            df.to_csv(os.path.join(dirpath,"uns",f"{key}.csv"), index=False)
            del_keys.append(key)

    # remove uns metadata from the anndata object
    for key in del_keys:
        del adatac.uns[key]

    # save the new anndata object
    adatac.write(os.path.join(dirpath, "adata.h5ad"))



def large_load(dirpath, skipfiles=[]):
    '''
    Loads an AnnData object and its associated pandas dataframe csv files, which are added back to the obsm and uns metadata.
    Input is the directory path to the output directory of large_save()

    Parameters
    ----------
    dirpath [str] - path to directory where outputs of large_save() are located
    skipfiles [list] - list of filenames to exclude from the anndata object

    Returns
    -------
    adata - AnnData object loaded from dirpath along with all obsm and uns key values added to metadata
    '''
    # read h5ad anndata object
    adata = ad.read_h5ad(os.path.join(dirpath, "adata.h5ad"))

    # read and load in obsm from CSV files
    for fn in os.listdir(dirpath):
        if (".csv" in fn) and (fn not in skipfiles):
            df = pd.read_csv(os.path.join(dirpath, fn))
            df.index = adata.obs_names
            key = fn.split(".")[0]
            adata.obsm[key] = df

    # read and load any uns metadata from CSV files
    if os.path.isdir(os.path.join(dirpath,"uns")):
        for fn in os.listdir(os.path.join(dirpath,"uns")):
            if (".csv" in fn) and (fn not in skipfiles):
                df = pd.read_csv(os.path.join(dirpath,"uns",fn))
                key = fn.split(".")[0]
                adata.uns[key] = df

    return(adata)


def convert_adata_to_dataupload (adata, savedir):
    '''
    Saves AnnData object into TISSUE input directory

    Parameters
    ----------
    adata - AnnData object to be saved with all metadata in adata.obs and spatial coordinates in adata.obsm['spatial']
    savedir [str] - path to existing directory to save the files for TISSUE loading

    Returns
    -------
    Saves all TISSUE input files into the specified directory for the given AnnData object

    NOTE: You will need to independently include scRNA_count.txt in savedir for TISSUE inputs to be complete
    '''
    locations = pd.DataFrame(adata.obsm['spatial'], columns=['x','y'])
    locations.to_csv(os.path.join(savedir,"Locations.txt"), sep="\t", index=False)

    df = pd.DataFrame(adata.X,
                      columns=adata.var_names)
    df.to_csv(os.path.join(savedir,"Spatial_count.txt"), sep="\t", index=False)

    meta = pd.DataFrame(adata.obs)
    meta.to_csv(os.path.join(savedir,"Metadata.txt"))
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
# TEST FILE FOR BASIC TISSUE FUNCTIONALITIES


# import packages

import tissue.main, tissue.downstream

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import anndata as ad
import os

import warnings
warnings.filterwarnings("ignore")

#################################################################################################################
print ("Testing TISSUE data loading...")
try:
    adata, RNAseq_adata = tissue.main.load_paired_datasets("tests/data/Spatial_count.txt",
                                                           "tests/data/Locations.txt",
                                                           "tests/data/scRNA_count.txt")
except:
    raise Exception ("Failed data loading from tests/data/ with tissue.main.load_paired_datasets()")

#################################################################################################################
print ("Testing TISSUE preprocessing...")
adata.var_names = [x.lower() for x in adata.var_names]
RNAseq_adata.var_names = [x.lower() for x in RNAseq_adata.var_names]
try:
    tissue.main.preprocess_data(RNAseq_adata, standardize=False, normalize=True)
except:
    raise Exception ("Failed TISSUE preprocessing. Make sure all dependencies are installed.")
gene_names = np.intersect1d(adata.var_names, RNAseq_adata.var_names)
adata = adata[:, gene_names].copy()
target_gene = "plp1"
target_expn = adata[:, target_gene].X.copy()
adata = adata[:, [gene for gene in gene_names if gene != target_gene]].copy()

#################################################################################################################
print("Testing TISSUE spatial gene expression prediction...")
try:
    tissue.main.predict_gene_expression (adata, RNAseq_adata, [target_gene],
                                         method="spage", n_folds=3, n_pv=10)
except:
    raise Exception("TISSUE prediction failed for SpaGE at tissue.main.predict_gene_expression()")

#################################################################################################################
print("Testing TISSUE calibration...")
try:
    tissue.main.build_spatial_graph(adata, method="fixed_radius", n_neighbors=15)
except:
    raise Exception ("Failed TISSUE spatial graph building at tissue.main.build_spatial_graph()")
try:
    tissue.main.conformalize_spatial_uncertainty(adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                                 grouping_method="kmeans_gene_cell", k=4, k2=2)
except:
    raise Exception ("Failed TISSUE cell-centric variability and calibration scores processing at tissue.main.conformalize_spatial_uncertainty()")
try:
    tissue.main.conformalize_prediction_interval (adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                                  alpha_level=0.23, compute_wasserstein=True)
except:
    raise Exception ("Failed TISSUE prediction interval calibration at tissue.main.conformalize_prediction_interval()")

#################################################################################################################
print ("Testing TISSUE multiple imputation t-test...")
("Testing TISSUE multiple imputation t-test...") 68 | adata.obs['condition'] = ['A' if i < round(adata.shape[0]/2) else 'B' for i in range(adata.shape[0])] 69 | try: 70 | tissue.downstream.multiple_imputation_testing(adata, "spage_predicted_expression", 71 | calib_genes=adata.var_names, 72 | condition='condition', 73 | group1 = "A", # use None to compute for all conditions, condition vs all 74 | group2 = "B", # use None to compute for group1 vs all 75 | n_imputations=2) 76 | except: 77 | raise Exception ("Failed TISSUE MI t-test at tissue.downstream.multiple_imputation_testing()") 78 | 79 | ################################################################################################################# 80 | print("Testing TISSUE cell filtering") 81 | X_uncertainty = adata.obsm["spage_predicted_expression_hi"].values - adata.obsm["spage_predicted_expression_lo"].values 82 | try: 83 | keep_idxs = tissue.downstream.detect_uncertain_cells (X_uncertainty, 84 | proportion="otsu", 85 | stratification=adata.obs['condition'].values) 86 | except: 87 | raise Exception ("Failed TISSUE cell filtering at tissue.downstream.detect_uncertain_cells()") 88 | try: 89 | keep_idxs = tissue.downstream.filtered_PCA (adata, # anndata object 90 | "spage", # prediction method 91 | proportion="otsu", 92 | stratification=adata.obs['condition'].values, 93 | return_keep_idxs=True) 94 | except: 95 | raise Exception ("Failed TISSUE-filtered PCA at tissue.downstream.filtered_PCA()") 96 | 97 | print("TISSUE tests passed!") -------------------------------------------------------------------------------- /tissue/SpaGE/principal_vectors.py: -------------------------------------------------------------------------------- 1 | """ Principal Vectors 2 | @author: Soufiane Mourragui 3 | This module computes the principal vectors from two datasets, i.e.: 4 | - perform linear dimensionality reduction independently for both dataset, resulting 5 | in set of domain-specific factors. 6 | - find the common factors using principal vectors [1] 7 | This result in set of pairs of vectors. Each pair has one vector from the source and one 8 | from the target. For each pair, a similarity score (cosine similarity) can be computed 9 | between the principal vectors and the pairs are naturally ordered by decreasing order 10 | of this similarity measure. 11 | Example 12 | ------- 13 | Examples are given in the vignettes. 14 | Notes 15 | ------- 16 | Examples are given in the vignette 17 | 18 | References 19 | ------- 20 | [1] Golub, G.H. and Van Loan, C.F., 2012. "Matrix computations" (Vol. 3). JHU Press. 21 | [2] Mourragui, S., Loog, M., Reinders, M.J.T., Wessels, L.F.A. (2019) 22 | PRECISE: A domain adaptation approach to transfer predictors of drug response 23 | from pre-clinical models to tumors 24 | """ 25 | 26 | import numpy as np 27 | import pandas as pd 28 | import scipy 29 | from pathlib import Path 30 | from sklearn.preprocessing import normalize 31 | 32 | #from tissue.SpaGE.dimensionality_reduction import process_dim_reduction 33 | from .dimensionality_reduction import process_dim_reduction 34 | 35 | class PVComputation: 36 | """ 37 | Attributes 38 | ------- 39 | n_factors: int 40 | Number of domain-specific factors to compute. 41 | n_pv: int 42 | Number of principal vectors. 
    dim_reduction_method_source: str
        Dimensionality reduction method used for the source data.
    dim_reduction_method_target: str
        Dimensionality reduction method used for the target data.
    source_components_ : numpy.ndarray, shape (n_pv, n_features)
        Loadings of the source principal vectors ranked by similarity to the
        target. Components are in the rows.
    source_explained_variance_ratio_: numpy.ndarray, shape (n_pv)
        Explained variance of the source on each source principal vector.
    target_components_ : numpy.ndarray, shape (n_pv, n_features)
        Loadings of the target principal vectors ranked by similarity to the
        source. Components are in the rows.
    target_explained_variance_ratio_: numpy.ndarray, shape (n_pv)
        Explained variance of the target on each target principal vector.
    cosine_similarity_matrix_: numpy.ndarray, shape (n_pv, n_pv)
        Scalar product between the source and the target principal vectors. Source
        principal vectors are in the rows while the target's are in the columns. If
        the domain adaptation is sensible, a diagonal matrix should be obtained.
    """

    def __init__(self, n_factors,n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 project_on=0):
        """
        Parameters
        -------
        n_factors : int
            Number of domain-specific factors to extract from the data (e.g. using PCA, ICA).
        n_pv : int
            Number of principal vectors to find from the independently computed factors.
        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'. If None, set to dim_reduction.
        project_on: int or bool, default to 0
            Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means
            both PVs.
        """
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.project_on = project_on # store default projection target for transform()
        self.dim_reduction_method_source = dim_reduction
        self.dim_reduction_method_target = dim_reduction_target or dim_reduction
        self.dim_reduction_source = self._process_dim_reduction(self.dim_reduction_method_source)
        self.dim_reduction_target = self._process_dim_reduction(self.dim_reduction_method_target)

        self.source_components_ = None
        self.source_explained_variance_ratio_ = None
        self.target_components_ = None
        self.target_explained_variance_ratio_ = None
        self.cosine_similarity_matrix_ = None

    def _process_dim_reduction(self, dim_reduction):
        if type(dim_reduction) == str:
            return process_dim_reduction(method=dim_reduction, n_dim=self.n_factors)
        else:
            return dim_reduction

    def fit(self, X_source, X_target, y_source=None):
        """
        Compute the common factors between two sets of data.
        IMPORTANT: The same genes have to be given for source and target, and in the same order
        Parameters
        -------
        X_source : np.ndarray, shape (n_samples, n_genes)
            Source dataset
        X_target : np.ndarray, shape (n_samples, n_genes)
            Target dataset
        y_source : np.ndarray, shape (n_samples, 1) (optional, default to None)
            Optional output, in case one wants to provide an output (for instance for PLS)
        Return values
        -------
        self: returns an instance of self.
        """
        # Compute factors independently for source and target, then orthogonalize the bases
        Ps = self.dim_reduction_source.fit(X_source, y_source).components_
        Ps = scipy.linalg.orth(Ps.transpose()).transpose()

        Pt = self.dim_reduction_target.fit(X_target, y_source).components_
        Pt = scipy.linalg.orth(Pt.transpose()).transpose()

        # Compute the principal vectors
        self.compute_principal_vectors(Ps, Pt)

        # Compute variance explained (per-PV variance over the total per-gene variance)
        self.source_explained_variance_ratio_ = np.var(self.source_components_.dot(X_source.transpose()), axis=1)/\
                                                np.sum(np.var(X_source, axis=0))
        self.target_explained_variance_ratio_ = np.var(self.target_components_.dot(X_target.transpose()), axis=1)/\
                                                np.sum(np.var(X_target, axis=0))

        return self

    def compute_principal_vectors(self, source_factors, target_factors):
        """
        Compute the principal vectors between the already computed sets of domain-specific
        factors, using the approach presented in [1,2].
        IMPORTANT: The same genes have to be given for source and target, and in the same order
        Parameters
        -------
        source_factors: np.ndarray, shape (n_components, n_genes)
            Source domain-specific factors.
        target_factors: np.ndarray, shape (n_components, n_genes)
            Target domain-specific factors.
        Return values
        -------
        self: returns an instance of self.
        """

        # Find principal vectors using SVD
        u,sigma,v = np.linalg.svd(source_factors.dot(target_factors.transpose()))
        self.source_components_ = u.transpose().dot(source_factors)[:self.n_pv]
        self.target_components_ = v.dot(target_factors)[:self.n_pv]
        # Normalize to make sure that vectors are unitary
        self.source_components_ = normalize(self.source_components_, axis = 1)
        self.target_components_ = normalize(self.target_components_, axis = 1)

        # Compute cosine similarity matrix
        self.initial_cosine_similarity_matrix_ = source_factors.dot(target_factors.transpose())
        self.cosine_similarity_matrix_ = self.source_components_.dot(self.target_components_.transpose())

        # Compute angles
        self.angles_ = np.arccos(np.diag(self.cosine_similarity_matrix_))

        return self


    def transform(self, X, project_on=None):
        """
        Projects data onto the principal vectors.
        Parameters
        -------
        X : numpy.ndarray, shape (n_samples, n_genes)
            Data to project.
        project_on: int or bool, default to None
            Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means
            both PVs. If None, set to the class instance value.
        Return values
        -------
        Projected data as a numpy.ndarray of shape (n_samples, n_factors)
        """

        # explicit None check so that an explicit project_on=0 is not overridden
        if project_on is None:
            project_on = self.project_on

        # Project on source
        if project_on == 0:
            return X.dot(self.source_components_.transpose())

        # Project on target
        elif project_on == -1:
            return X.dot(self.target_components_.transpose())

        # Project on both
        elif project_on == 1:
            return X.dot(np.concatenate([self.source_components_.transpose(), self.target_components_.transpose()]))

        else:
            raise ValueError('project_on should be 0 (source), -1 (target) or 1 (both). %s is not a correct value'%(project_on))
--------------------------------------------------------------------------------
/tissue/experiments.py:
--------------------------------------------------------------------------------
# Contains compound functions for generating results for experiments with TISSUE
# These are unlikely to be used for general applications but were used in our development/testing of TISSUE

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import squidpy as sq
import anndata as ad
import warnings
import os
import gc

#from tissue.utils import large_save, large_load
from .utils import large_save, large_load
from .main import load_spatial_data, conformalize_prediction_interval, get_spatial_uncertainty_scores_from_metadata
from .downstream import multiple_imputation_testing


def group_conformalize_from_intermediate(dataset_name, methods, symmetric, alpha_levels,
                                         save_alpha=[0.05], savedir="SCPI", type_dataset="DataUpload"):
    '''
    Function for taking intermediate fold predictions and running group conformalization for all different alpha values.

    Returns a results dictionary with calibration quality (res_dict) and the AnnData with prediction intervals
    for all folds at each alpha in save_alpha.

    Parameters
    ----------
    dataset_name [str] - name of folder in DataUpload/
    methods [list of str] - list of method keys to use for prediction_sets
    symmetric [bool] - whether to use symmetric prediction intervals
    alpha_levels [array] - alpha levels to calibrate over
    save_alpha [list of float] - alphas to save prediction intervals into adata.obsm
    savedir [str] - folder where the intermediate results are saved (independent folds)
    type_dataset [str] - default to "DataUpload" but may have additional options in the future

    Returns
    -------
    res_dict [dict] - dictionary of calibration statistics / coverage statistics across the alpha levels
    adata [AnnData] - anndata with calibration results added to metadata
    '''
    # read in spatial data
    if type_dataset == "DataUpload":
        if os.path.isfile("DataUpload/"+dataset_name+"/Metadata.txt"):
            adata = load_spatial_data("DataUpload/"+dataset_name+"/Spatial_count.txt",
                                      "DataUpload/"+dataset_name+"/Locations.txt",
                                      spatial_metadata = "DataUpload/"+dataset_name+"/Metadata.txt")
        else:
            adata = load_spatial_data("DataUpload/"+dataset_name+"/Spatial_count.txt",
                                      "DataUpload/"+dataset_name+"/Locations.txt")
    else:
        adata = sc.read_h5ad(os.path.join("additional_data",dataset_name,"spatial.h5ad"))
    adata.var_names = [x.lower() for x in adata.var_names]

    # results dict
    res_dict = {}

    for method in methods:

        res_dict[method] = {}
        res_dict[method]['ind_gene_results'] = {}

        calibration_weight = 0 # for computing weighted average
        test_weight = 0

        dirpath = savedir+"/"+dataset_name+"_intermediate/"+method

        folds = np.load(os.path.join(savedir+"/"+dataset_name+"_intermediate/"+method,"folds.npy"), allow_pickle=True)

        # subset spatial data into shared genes
        gene_names = np.concatenate(folds)
        adata = adata[:, gene_names]

        for i, fold in enumerate(folds):

            # load adata within fold
            sub_adata = large_load(os.path.join(dirpath, "fold"+str(i)))
            target_genes = list(fold)

            # subset data
            predicted = method+"_predicted_expression"
method+"_predicted_expression" 82 | test_genes = target_genes.copy() 83 | calib_genes = [gene for gene in gene_names if gene not in test_genes] 84 | test_idxs = [np.where(sub_adata.obsm[predicted].columns==gene)[0][0] for gene in test_genes] 85 | calib_idxs = [np.where(sub_adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 86 | 87 | # get uncertainties and scores from saved adata 88 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (sub_adata, predicted) 89 | 90 | # init dict for individual gene results 91 | for g in test_genes: 92 | if g not in res_dict[method]['ind_gene_results'].keys(): 93 | res_dict[method]['ind_gene_results'][g] = {} 94 | res_dict[method]['ind_gene_results'][g]['1-alpha'] = 1-alpha_levels 95 | res_dict[method]['ind_gene_results'][g]['test'] = [] 96 | 97 | # iterate over different alphas for conformalization 98 | test_perc = [] 99 | calib_perc = [] 100 | 101 | for alpha_level in alpha_levels: 102 | sub_adatac = sub_adata.copy() 103 | conformalize_prediction_interval (sub_adatac, predicted, calib_genes, alpha_level=alpha_level, 104 | symmetric=symmetric, return_scores_dict=False) 105 | 106 | prediction_sets = (sub_adatac.obsm[predicted+"_lo"].values, sub_adatac.obsm[predicted+"_hi"].values) 107 | 108 | test_perc.append(np.nanmean(((adata[:,test_genes].X>prediction_sets[0][:,test_idxs]) & (adata[:,test_genes].Xprediction_sets[0][:,calib_idxs]) & (adata[:,calib_genes].X==` inside the same Conda environment as TISSUE (preferably before TISSUE installation). 17 | 18 | For TISSUE installation, we provide two options: (A) PyPI installation with pip or (B) local installation. We recommend starting with option A and only going to option B if option A fails in your environment. 19 | 20 | ### Option A: PyPI 21 | 22 | Install the package through PyPI with `pip`. We recommend setting up a conda environment (https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or another virtual environment first since `tissue-sc` currently relies on specific versions for its dependencies (although it should generally work for other environment versions, but this hasn't been thoroughly tested): 23 | 24 | ``` 25 | conda create -n myenv python=3.8 26 | conda activate myenv 27 | 28 | 29 | pip install tissue-sc 30 | ``` 31 | 32 | Note that you will want to separately download the data from this repository (`tests/data/`) to run our TISSUE tutorials. 33 | 34 | 35 | ### Option B: Local installation 36 | 37 | An alternative way to install the package along with associated test and tutorial files is to clone the directory and then install the requirements for using the package. To do this, first clone the repository using git (you can install git following the instructions [here](https://github.com/git-guides/install-git)): 38 | 39 | ``` 40 | git clone https://github.com/sunericd/TISSUE.git 41 | ``` 42 | 43 | We recommend setting up a conda environment to install the requirements for the package (instructions for installing conda and what conda environment can do can be found [here](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)). Installation of requirements can then be done with the following commands: 44 | 45 | ``` 46 | conda create -n tissue python=3.8 47 | conda activate tissue 48 | 49 | cd TISSUE 50 | pip install -r requirements.txt 51 | ``` 52 | 53 | To keep the requirements light, we have only included packages that are necessary for the core functionalities of TISSUE. 
For additional utilities such as gene prediction with Tangram, please install those packages separately (or uncomment those lines in `requirements.txt`).

To test that the installation is working correctly, you can run `python test.py` in the cloned directory.


# TISSUE Tutorials

Below we include several mini-tutorials to highlight the main TISSUE pipeline and downstream applications. These tutorials rely on a small test dataset (a subset of one of the datasets used in the original publication) for fast processing, but the approaches can readily be extended to other datasets. For larger-scale examples, please refer to the code repository corresponding to the figures and analyses generated for the TISSUE manuscript: https://github.com/sunericd/tissue-figures-and-analyses.git


```python
# import packages

import tissue.main, tissue.downstream

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import anndata as ad
import os
```

## Upstream TISSUE analyses: spatial gene expression prediction and uncertainty calibration

The first part of the TISSUE pipeline involves making predictions of spatial gene expression profiles using paired spatial transcriptomics and scRNAseq datasets, and then computing and calibrating uncertainties for these predictions that are translated into prediction intervals. All functions for upstream analysis are in the `tissue.main` module. These include:

- `tissue.main.load_paired_datasets()` for loading paired datasets from formatted directories

- `tissue.main.predict_gene_expression()` for predicting spatial gene expression from paired datasets using a specified prediction method and number of cross-validation folds

- `tissue.main.build_spatial_graph()` for building a cell-cell spatial graph to define neighborhoods for computing the cell-centric variability

- `tissue.main.conformalize_spatial_uncertainty()` for computing cell-centric variability, stratified grouping, and the TISSUE calibration scores for predicted gene expression

- `tissue.main.conformalize_prediction_interval()` for building prediction intervals from uncertainty measures

### Tutorial 1: Predicting spatial gene expression

First, we load a minimal subset of osmFISH spatial transcriptomics data of mouse somatosensory cortex published by Codeluppi et al., 2018: https://doi.org/10.1038/s41592-018-0175-z.

Note that we are using the TISSUE methods for building an AnnData object from tab-delimited text files for the spatial counts, scRNAseq counts, spatial locations, and spatial metadata. If you already have an AnnData object for the spatial data and another object for the scRNAseq data, you can skip this step.


```python
# load in spatial and scRNAseq datasets

adata, RNAseq_adata = tissue.main.load_paired_datasets("tests/data/Spatial_count.txt",
                                                       "tests/data/Locations.txt",
                                                       "tests/data/scRNA_count.txt")
```

    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/anndata/_core/anndata.py:117: ImplicitModificationWarning: Transforming to str index.
      warnings.warn("Transforming to str index.", ImplicitModificationWarning)
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/anndata/_core/anndata.py:856: UserWarning:
    AnnData expects .obs.index to contain strings, but got values like:
        [0, 1, 2, 3, 4]

        Inferred to be: integer

      names = self._prep_dim_index(names, "obs")


Now we can impute any genes of interest that are found in the scRNAseq dataset but not in the spatial dataset. In this case, we will hold out a target gene from the spatial data and apply an imputation method to predict its expression using the scRNAseq dataset.

First, we preprocess the data and make sure that the gene names are matchable across the two datasets:


```python
# make genes lowercase
adata.var_names = [x.lower() for x in adata.var_names]
RNAseq_adata.var_names = [x.lower() for x in RNAseq_adata.var_names]

# preprocess RNAseq data
tissue.main.preprocess_data(RNAseq_adata, standardize=False, normalize=True)

# subset spatial data into shared genes
gene_names = np.intersect1d(adata.var_names, RNAseq_adata.var_names)
adata = adata[:, gene_names].copy()

# hold out target gene
target_gene = "plp1"
target_expn = adata[:, target_gene].X.copy()
adata = adata[:, [gene for gene in gene_names if gene != target_gene]].copy()
```


```python
# dimensions of spatial transcriptomics dataset (number of cells x number of genes)
adata.shape
```




    (3405, 31)




```python
# dimensions of RNAseq dataset (number of cells x number of genes)
RNAseq_adata.shape
```




    (1000, 32)



Now, we can make predictions of the target gene expression. In TISSUE, we currently have several methods for gene imputation including SpaGE, Tangram, and Harmony-kNN. We will be using SpaGE in this example:


```python
# SpaGE spatial gene expression prediction

tissue.main.predict_gene_expression (adata, RNAseq_adata, [target_gene],
                                     method="spage", n_folds=10, n_pv=10)
```

How good is the imputation? Since we left out this gene from the spatial data, we can plot the predicted and actual expression and visually inspect the agreement.
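
As a quick numeric companion to the visual check below (our addition to the tutorial, using only quantities already defined above), we can also correlate the held-out expression with the SpaGE prediction:


```python
# correlate held-out (measured) expression with the SpaGE prediction
from scipy.stats import pearsonr

predicted = adata.obsm['spage_predicted_expression'][target_gene].values
r, p = pearsonr(target_expn.flatten(), predicted)
print(f"Pearson r = {r:.3f} (P = {p:.3g})")
```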


```python
# Visualizing predicted and actual expression side by side

fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.axis('off')
cmap = target_expn.copy() # copy so that target_expn is not clipped in place
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax1.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)
ax1.set_title('Actual', fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

ax2.axis('off')
cmap = adata.obsm['spage_predicted_expression'][target_gene].values
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax2.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)
ax2.set_title('Predicted', fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

plt.suptitle("SpaGE Prediction", fontsize=16)
plt.tight_layout()
plt.show()
```



![png](README_files/README_13_0.png)



Not too bad, especially considering that we used a downsampled scRNAseq dataset for this imputation.

### Tutorial 2: Using TISSUE to calibrate uncertainties and obtain prediction intervals

Note that when we ran `tissue.main.predict_gene_expression()` in the previous tutorial, we obtained cross-validated predictions for all genes in the existing spatial dataset (e.g. 10 folds). These will now come in handy when we compute and calibrate TISSUE uncertainties for the predicted gene expression.

First, we build spatial graphs using TISSUE (this can also be done with native Scanpy functions):


```python
# build spatial graph and calculate adjacency weights

tissue.main.build_spatial_graph(adata, method="fixed_radius", n_neighbors=15)
```

The entire TISSUE spatial uncertainty generation pipeline can be launched with one line of code using `tissue.main.conformalize_spatial_uncertainty()`. Here, the first two arguments are the spatial AnnData object and a string specifier for the key in `obsm` corresponding to the predicted gene expression. We can specify which of the genes to use in the calibration set (generally this is all genes in the spatial data). There are other arguments for the grouping setup and weighting schemes, but we will use the default settings in this tutorial.


```python
# build calibration scores

tissue.main.conformalize_spatial_uncertainty(adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                             grouping_method="kmeans_gene_cell", k=4, k2=2)
```

    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(


Finally, we can calibrate the spatial uncertainty measures to get calibration scores and then use those to extract prediction intervals for any confidence level $1-\alpha$. This is done with `tissue.main.conformalize_prediction_interval()`


```python
# get prediction interval for 77% coverage (1 - alpha_level)

tissue.main.conformalize_prediction_interval (adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                              alpha_level=0.23)

# set `compute_wasserstein=True` to compute a measure indicating the distance between the cell-centric variability values
# of the predicted genes and the values of their support (i.e. calibration group) in the original data.
# The lower this value, the better supported the predicted gene.
```

Now let's visualize what these prediction intervals look like for the target (unseen) gene and how they compare to the actual prediction errors.
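
Before plotting, a quick sanity check that we add here (not part of the original tutorial): since the target gene was held out of calibration, the fraction of its measured values falling inside the prediction intervals should be close to $1-\alpha$:


```python
# empirical coverage: fraction of cells whose measured expression
# falls inside the calibrated prediction interval for the held-out gene
lo = adata.obsm["spage_predicted_expression_lo"][target_gene].values
hi = adata.obsm["spage_predicted_expression_hi"][target_gene].values
truth = target_expn.flatten()
print(f"Empirical coverage: {np.mean((truth >= lo) & (truth <= hi)):.2%} (target: 77%)")
```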


```python
m = "spage"

i = np.where(adata.var_names == target_gene)[0]

# define consistent color map
all_vals = np.concatenate((np.abs(target_expn.flatten()-adata.obsm[m+"_predicted_expression"][target_gene].values),
                           adata.obsm[m+"_predicted_expression_hi"][target_gene].values-adata.obsm[m+"_predicted_expression_lo"][target_gene].values))
all_vals[all_vals<0]=0
vmin = np.percentile(np.log1p(all_vals), 0)
vmax = np.percentile(np.log1p(all_vals), 95)


fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.axis('off')
cmap = np.abs(target_expn.flatten()-adata.obsm[m+"_predicted_expression"][target_gene].values)
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax1.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)#,vmin=vmin,vmax=vmax)
ax1.set_title('Imputation Error ' + target_gene, fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

ax2.axis('off')
cmap = adata.obsm[m+"_predicted_expression_hi"][target_gene].values-adata.obsm[m+"_predicted_expression_lo"][target_gene].values
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax2.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)#,vmin=vmin,vmax=vmax)
ax2.set_title('PI Width ' + target_gene, fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

plt.suptitle(m, fontsize=16)
plt.tight_layout()
plt.show()
```



![png](README_files/README_22_0.png)



The TISSUE prediction intervals are decent and match the distribution of imputation errors especially well in the bottom portions of the section. On the full dataset, which has much richer reference scRNAseq data, the calibration quality is further improved (see Figure 2 of the TISSUE manuscript).

## Downstream TISSUE analyses: Hypothesis testing, Clustering/Visualization, Prediction

TISSUE provides additional functionalities for leveraging these uncertainty estimates and prediction intervals in common downstream single-cell spatial transcriptomics analyses. All functions for downstream analysis are in the `tissue.downstream` module. These include:

- `tissue.downstream.multiple_imputation_testing()` for hypothesis testing using multiple imputations drawn from the calibration score sets

- `tissue.downstream.weighted_PCA()` for computing weighted principal components where weights correspond to a transform of the inverse prediction interval width (see the sketch below)

- `tissue.downstream.detect_uncertain_cells()` for filtering low-confidence cells from data before training and evaluation of machine learning models, which generally improves performance

Below we include a few example uses of these modules. For more examples (including those for the experiments in the associated manuscript), please see the Github repository: https://github.com/sunericd/tissue-figures-and-analyses.git.
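
To give intuition for the weighting in `tissue.downstream.weighted_PCA()`, here is a minimal sketch of one inverse-width transform (our illustration only; the exact transform used inside the package may differ):


```python
# illustrative inverse prediction-interval-width weights (sketch only):
# wider intervals (more uncertainty) receive smaller weights
width = (adata.obsm["spage_predicted_expression_hi"].values
         - adata.obsm["spage_predicted_expression_lo"].values)
weights = 1.0 / (width + 1e-8)            # invert, guarding against zero widths
weights = weights / weights.mean(axis=0)  # rescale per gene (illustrative choice)
```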
335 | 336 | ### Tutorial 3: Hypothesis testing with TISSUE multiple imputation framework 337 | 338 | This tutorial uses the TISSUE calibration scores to generate multiple imputations and then performs hypothesis testing by aggregating statistics across these imputations. The default and recommended statistical test for this framework is the Student's t-test, but spatially variable gene detection with SpatialDE and non-parametric one-sided Mann-Whitney/Wilcoxon tests can also be performed by specifying `test="spatialde"`, `test="wilcoxon_greater"`, or `test="wilcoxon_less"` in `tissue.downstream.multiple_imputation_testing()`. 339 | 340 | Please run the code in Tutorials 1-2 to generate predictions and TISSUE uncertainty measures before starting this tutorial. 341 | 342 | Next, we construct some binary labels for the cells in the dataset: 343 | 344 | 345 | ```python 346 | # split into two groups based on indices 347 | adata.obs['condition'] = ['A' if i < round(adata.shape[0]/2) else 'B' for i in range(adata.shape[0])] 348 | 349 | # plot conditions 350 | plt.scatter(adata[adata.obs.condition=="A"].obsm['spatial'][:,0], 351 | adata[adata.obs.condition=="A"].obsm['spatial'][:,1], 352 | c='tab:red', s=3, label="A") 353 | plt.scatter(adata[adata.obs.condition=="B"].obsm['spatial'][:,0], 354 | adata[adata.obs.condition=="B"].obsm['spatial'][:,1], 355 | c='tab:blue', s=3, label="B") 356 | plt.legend(loc='best') 357 | plt.show() 358 | ``` 359 | 360 | 361 | 362 | ![png](README_files/README_26_0.png) 363 | 364 | 365 | 366 | As we can see, the cells in group A primarily belong to the medial layers of the section, while the cells in group B correspond to the upper pia layer and the bottom layers of the section. As such, when we perform differential gene expression analysis, we should expect some differentially expressed markers between these two labels. 367 | 368 | Now, we perform differential gene expression analysis using TISSUE multiple imputation hypothesis testing, which uses the TISSUE calibration scores to sample multiple "imputations" (alternative predictions) and then aggregates the statistics afterwards. Here, we set `group1="A"` and `group2="B"` to find genes that are differentially expressed between the two groups using `n_imputations=10` imputations (higher values are better but take more time to compute): 369 | 370 | 371 | ```python 372 | # multiple imputation hypothesis testing 373 | 374 | tissue.downstream.multiple_imputation_testing(adata, "spage_predicted_expression", 375 | calib_genes=adata.var_names, 376 | condition='condition', 377 | group1 = "A", # use None to compute for all conditions, condition vs all 378 | group2 = "B", # use None to compute for group1 vs all 379 | n_imputations=10) 380 | ``` 381 | 382 | TISSUE multiple imputation testing saves the results directly within the `adata.uns` metadata, where they can be read out by the compared groups and the name of the statistic: 383 | 384 | 385 | ```python 386 | # extract statistics for target_gene 387 | print("t-statistic = "+str(round(adata.uns['spage_A_B_tstat'][target_gene].values[0],5))) 388 | print("P = "+str(round(adata.uns['spage_A_B_pvalue'][target_gene].values[0],5))) 389 | ``` 390 | 391 | t-statistic = -2.38565 392 | P = 0.02371 393 | 394 | 395 | Testing of our target gene (Plp1) reveals significant under-expression in group A compared to group B, suggesting that Plp1 could be a marker gene for the cell types / regions in group B.
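The non-parametric variants share this interface; only the `test` argument changes. Here is a hedged sketch of the one-sided Mann-Whitney version of the same comparison. Per the `multiple_imputation_testing()` signature, this test reports only pooled p-values (no t-statistics), and it writes to the same `adata.uns['spage_A_B_pvalue']` key as the t-test above, so it will overwrite those results:

```python
# one-sided test for greater expression in group A relative to group B
tissue.downstream.multiple_imputation_testing(adata, "spage_predicted_expression",
                                              calib_genes=adata.var_names,
                                              condition='condition',
                                              group1="A", group2="B",
                                              test="wilcoxon_greater",
                                              n_imputations=10)
```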
396 | 397 | **NOTE: Many TISSUE modules rely on stochastic sampling, so the printed metrics may vary slightly from run to run.** 398 | 399 | ### Tutorial 4: TISSUE cell filtering for supervised learning 400 | 401 | TISSUE cell filtering removes cells with the greatest average uncertainty in predicted gene expression, which generally improves the performance of supervised learning models (i.e. classifiers) trained and evaluated on the filtered predicted expression data. 402 | 403 | In this tutorial, we will filter out the uncertain cells (using automatic Otsu thresholding) and then train and evaluate a logistic regression classifier to predict the two cell groups (A vs B) from Tutorial 3. 404 | 405 | To start, we need to compute the TISSUE prediction interval width as a proxy for uncertainty. We do this by subtracting the lower bound from the upper bound: 406 | 407 | 408 | ```python 409 | # get uncertainty (PI width) for filtering 410 | 411 | X_uncertainty = adata.obsm["spage_predicted_expression_hi"].values - adata.obsm["spage_predicted_expression_lo"].values 412 | ``` 413 | 414 | Then we can filter using the TISSUE prediction interval width. We perform filtering within each stratum (i.e. cell group label "A" or "B"), but this can also be done across other groupings or across the entire population of cells if desired. Here we use Otsu thresholding to automatically determine the proportion of cells to filter out within each stratum, but you can set a hard threshold if desired. 415 | 416 | 417 | ```python 418 | # uncertainty-based cell filtering 419 | 420 | keep_idxs = tissue.downstream.detect_uncertain_cells(X_uncertainty, 421 | proportion="otsu", 422 | stratification=adata.obs['condition'].values) 423 | 424 | adata_filtered = adata[adata.obs_names[keep_idxs],:].copy() 425 | ``` 426 | 427 | Now that we have an object with filtered predicted gene expression, we can check how the dimensions of our data have changed with TISSUE filtering: 428 | 429 | 430 | ```python 431 | # examine dimensions of data before/after TISSUE filtering 432 | 433 | print("Before TISSUE cell filtering:") 434 | print(adata.shape) 435 | print("\nAfter TISSUE cell filtering:") 436 | print(adata_filtered.shape) 437 | ``` 438 | 439 | Before TISSUE cell filtering: 440 | (3405, 31) 441 | 442 | After TISSUE cell filtering: 443 | (2862, 31) 444 | 445 | 446 | And similarly, we can check the balance of the two cell groups after filtering: 447 | 448 | 449 | ```python 450 | # print balance of labels in the filtered dataset 451 | 452 | pd.DataFrame(np.unique(adata_filtered.obs['condition'], return_counts=True),index=["Group","Number of Cells"]) 453 | ``` 454 | 455 | 456 | 457 | 458 |
459 | |                 | 0    | 1    |
460 | |-----------------|------|------|
461 | | Group           | A    | B    |
462 | | Number of Cells | 1250 | 1612 |
493 |
494 | 495 | 496 | 497 | As we can see, TISSUE automatically filters out a large number of cells with uncertain gene predictions. In the filtered dataset, the balance between group A and group B is relatively preserved. 498 | 499 | Now, we will move on to training a logistic regression classifier on the filtered data. Given the modular nature of TISSUE filtering (i.e. the output is a cell-by-gene matrix), integrating TISSUE with other supervised learning models is essentially plug-and-play. 500 | 501 | First, we split into train (80%) and test (20%) sets: 502 | 503 | 504 | ```python 505 | # split train and test randomly (80%-20%) 506 | np.random.seed(444) 507 | train_idxs = np.random.choice(np.arange(adata_filtered.shape[0]), round(adata_filtered.shape[0]*0.8), replace=False) 508 | test_idxs = np.array([idx for idx in np.arange(adata_filtered.shape[0]) if idx not in train_idxs]) 509 | 510 | train_data = adata_filtered.obsm["spage_predicted_expression"].values[train_idxs,:] 511 | train_labels = adata_filtered.obs["condition"][train_idxs] 512 | 513 | test_data = adata_filtered.obsm["spage_predicted_expression"].values[test_idxs,:] 514 | test_labels = adata_filtered.obs["condition"][test_idxs] 515 | ``` 516 | 517 | Then, we train a logistic regression classifier on the filtered and split data: 518 | 519 | 520 | ```python 521 | from sklearn.linear_model import LogisticRegression 522 | from sklearn.preprocessing import StandardScaler 523 | from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score 524 | 525 | # initialize scaler and standardize the training data 526 | scaler = StandardScaler() 527 | train_data = scaler.fit_transform(train_data) 528 | 529 | # fit model on scaled data 530 | model = LogisticRegression(penalty='l1', solver='liblinear').fit(train_data, train_labels) 531 | ``` 532 | 533 | Now that we have trained the model, we can evaluate its performance on the unseen test data: 534 | 535 | 536 | ```python 537 | # make predictions on test data 538 | pred_test = model.predict(scaler.transform(test_data)) 539 | pred_test_scores = model.predict_proba(scaler.transform(test_data)) 540 | 541 | # print metrics 542 | test_labels_num = [0 if x=="A" else 1 for x in test_labels] 543 | print(f"Accuracy Score: {accuracy_score(test_labels, pred_test)}") 544 | print(f"ROC-AUC Score: {roc_auc_score(test_labels_num, pred_test_scores[:,1])}") 545 | ``` 546 | 547 | Accuracy Score: 0.8024475524475524 548 | ROC-AUC Score: 0.8765594181459566 549 | 550 | 551 | The model performs quite well! It has high accuracy and high ROC-AUC for a relatively balanced binary classification problem. A similar approach can be taken to leverage TISSUE uncertainties when training and evaluating other model architectures (e.g. linear regression, random forest, neural nets). 552 | 553 | ### Tutorial 5: TISSUE cell filtering for PCA (clustering and visualization) 554 | 555 | Downstream clustering and data visualization tasks in transcriptomics data analysis generally rely on dimensionality reduction via principal component analysis (PCA). To incorporate TISSUE uncertainties into these downstream tasks, we perform TISSUE cell filtering before fitting the PCA model and reducing dimensionality. 556 | 557 | In this tutorial, we will apply TISSUE cell filtering to the dataset to generate principal components. This can be done with `tissue.downstream.filtered_PCA()`, which is a wrapper around the direct TISSUE cell filtering.
From these principal components, we can make a two-dimensional PCA plot and perform clustering on the top 15 principal components using K-Means. 558 | 559 | We start from the AnnData object `adata` obtained after running Tutorials 1-3 and apply TISSUE-filtered PCA: 560 | 561 | 562 | ```python 563 | # uncertainty-based cell filtering for PCA 564 | 565 | keep_idxs = tissue.downstream.filtered_PCA(adata, # anndata object 566 | "spage", # prediction method 567 | proportion="otsu", 568 | stratification=adata.obs['condition'].values, 569 | return_keep_idxs=True) 570 | 571 | # filter to keep track of labels 572 | adata_filtered = adata[adata.obs_names[keep_idxs],:].copy() 573 | ``` 574 | 575 | Here we use the same `otsu` threshold-based automatic filtering as before and stratify the filtering by condition. We use the default `n_components=15`. 576 | 577 | TISSUE-filtered PCA produces two outputs, saved into `adata.uns` and `adata.obsm` respectively. The first is the standard principal components obtained on the TISSUE-filtered data, which can be found in `adata.uns['{name of prediction method}_predicted_expression_PC15_filtered_']`. The second is the PCA fitted on the TISSUE-filtered data but then applied to the entire (unfiltered) dataset, which can be found in `adata.obsm['{name of prediction method}_predicted_expression_PC15_']`. We will use the first (and recommended) option in this tutorial. 578 | 579 | 580 | ```python 581 | # retrieve filtered PCA 582 | 583 | PC_reduced = adata.uns['spage_predicted_expression_PC15_filtered_'].copy() 584 | print(PC_reduced.shape) 585 | ``` 586 | 587 | (2862, 15) 588 | 589 | 590 | We now have a reduced representation of our original data that is filtered by TISSUE and has 15 principal components. We can visualize the first two principal components: 591 | 592 | 593 | ```python 594 | # make 2D PCA plot labeled by group 595 | 596 | plt.title("TISSUE-Filtered PCA") 597 | plt.scatter(PC_reduced[adata_filtered.obs['condition']=='A',0], 598 | PC_reduced[adata_filtered.obs['condition']=='A',1], 599 | c="tab:red", s=3, label="A") 600 | plt.scatter(PC_reduced[adata_filtered.obs['condition']=='B',0], 601 | PC_reduced[adata_filtered.obs['condition']=='B',1], 602 | c="tab:blue", s=3, label="B") 603 | plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) 604 | plt.xlabel("PC 1") 605 | plt.ylabel("PC 2") 606 | plt.show() 607 | ``` 608 | 609 | 610 | 611 | ![png](README_files/README_52_0.png) 612 | 613 | 614 | 615 | Visually, there is some separation between the two groups along the first two principal components (although with substantial overlap). 616 | 617 | Next, we can try K-Means clustering using all 15 principal components and evaluate the clustering with the adjusted Rand index (ARI): 618 | 619 | 620 | ```python 621 | from sklearn.cluster import KMeans 622 | 623 | # K-Means clustering 624 | kmeans = KMeans(n_clusters=2).fit(PC_reduced) 625 | clusters = kmeans.labels_ 626 | 627 | # evaluate ARI 628 | from sklearn.metrics import adjusted_rand_score 629 | print(adjusted_rand_score(adata_filtered.obs['condition'], clusters)) 630 | ``` 631 | 632 | /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4.
Set the value of `n_init` explicitly to suppress the warning 633 | warnings.warn( 634 | 635 | 636 | 0.218865537222824 637 | 638 | 639 | Evidently, clustering with TISSUE-filtered principal components can provide some degree of separation between the two cell groups that we defined previously. 640 | 641 | ### Tutorial 6: TISSUE-WPCA (weighted principal component analysis) 642 | 643 | An alternative to TISSUE cell filtering for PCA is TISSUE-WPCA, which weights each value in the predicted gene expression matrix during PCA, allowing a softer approach that can leverage more of the predicted expression data. In practice, however, TISSUE-WPCA generally yields fewer changes to the resulting principal components (relative to standard PCA) than TISSUE cell filtering does. 644 | 645 | TISSUE-WPCA is highly customizable, mostly in the definition of the weights (see the source code for more details and documentation). We use one of the implementations highlighted in the TISSUE manuscript, which takes the inverse TISSUE prediction interval width and binarizes it into a high weight and a low weight separated by an order of magnitude: 646 | 647 | 648 | ```python 649 | # weighted PCA 650 | 651 | tissue.downstream.weighted_PCA(adata, "spage", pca_method="wpca", weighting="inverse_pi_width", 652 | replace_inf="max", binarize=0.2, binarize_ratio=10, 653 | n_components=15) 654 | ``` 655 | 656 | Here we used the `inverse_pi_width` option, which uses the inverse prediction interval width as the initial weight. We replace all `inf` values with the maximum weight. We specify `binarize=0.2`, the quantile of the weight distribution at which the high/low weight split is drawn (the lowest 20% of weights receive the low weight). The `binarize_ratio` is the fold change between the high and low weight values, and we use `n_components=15`. 657 | 658 | Now that we have performed TISSUE-WPCA, we can access the resulting principal components from `adata.obsm['{prediction method name}_predicted_expression_PC15_']` and use them as we would any other reduced representation of the data. For example, we can visualize the two cell groups along the first two principal components: 659 | 660 | 661 | ```python 662 | # make PC plot 663 | 664 | X_pc = adata.obsm['spage_predicted_expression_PC15_'] 665 | 666 | plt.title("TISSUE Weighted PCA") 667 | plt.scatter(X_pc[adata.obs['condition']=='A',0], X_pc[adata.obs['condition']=='A',1], 668 | c="tab:red", s=3, label="A") 669 | plt.scatter(X_pc[adata.obs['condition']=='B',0], X_pc[adata.obs['condition']=='B',1], 670 | c="tab:blue", s=3, label="B") 671 | plt.xlabel("PC 1") 672 | plt.ylabel("PC 2") 673 | plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) 674 | plt.show() 675 | ``` 676 | 677 | 678 | 679 | ![png](README_files/README_60_0.png) 680 | 681 | 682 | 683 | As with TISSUE-filtered PCA, we see that TISSUE-WPCA can visually separate the two cell groups in the PCA plot. 684 | 685 | # Additional considerations 686 | 687 | ## Hyperparameter selection: 688 | 689 | At various points in the TISSUE pipeline, the user can select different hyperparameters. Here we outline some guiding principles for selecting them reasonably: 690 | 691 | - `n_neighbors` in `tissue.main.build_spatial_graph()` - the approximate number of neighbors to use for computing TISSUE cell-centric variability. Generally, we recommend setting this to a value close to 15 to ensure reliable cell-centric variability estimates. Values from 5-30 also work comparably well.
Alternatively, you can try other spatial graph construction methods among the options in `tissue.main.build_spatial_graph()`, or load your own spatial graph adjacency matrix from a .npz file using `tissue.main.load_spatial_graph()`. 692 | 693 | - `alpha_level` in `tissue.main.conformalize_prediction_interval()` - the confidence parameter corresponding to (1-alpha) TISSUE prediction interval coverage. Generally, we recommend 0.23 to retrieve the 67% TISSUE prediction interval (approx. one standard error), but downstream results are largely robust to the exact choice. Values very close to 0 or very close to 1 are less likely to provide informative calibrations. 694 | 695 | - `k`, `k2` in `tissue.main.conformalize_spatial_uncertainty()` - the gene and cell stratified group numbers, respectively. You can try different values for each, but we recommend staying at or below 4 for either parameter. In the manuscript, we primarily used `k=4` and `k2=1`. If you don't want to choose, TISSUE can automatically select these hyperparameters when you set `k='auto'` and `k2='auto'`. 696 | 697 | 698 | ## Computational runtime and speed ups: 699 | 700 | If you are experiencing slow runtimes with TISSUE, there are several things to check or change for faster runtime (also refer to Extended Data Figure 9 in our publication for a runtime breakdown of the first version of TISSUE on different-sized datasets): 701 | 702 | 703 | **Large number of cells** 704 | - We have tested TISSUE on datasets of up to 20K cells. If your data contains substantially more cells, we recommend downsampling the cells (e.g. random uniform sampling) or subsetting to cell groups of interest. 705 | - If the prediction step takes a long time, we suggest decreasing the number of cross-validation folds by setting `n_folds` in `tissue.main.predict_gene_expression()`. 706 | 707 | **Large number of genes** 708 | - Generally, TISSUE is robust to the number of genes, but for further speedup we recommend predicting only the genes that are necessary, or using the scRNAseq reference dataset to identify a set of highly variable genes to predict beforehand (for example with `scanpy.pp.highly_variable_genes`, if whole-transcriptome coverage is desired). 709 | - Set `weight='exp_cos_pca'` and `weight_n_pc` to some integer (e.g. 15) in `tissue.main.conformalize_spatial_uncertainty()` to compute cosine similarity weights in a low-dimensional PCA space (for better runtime and reduced high-dimensional distortion). 710 | - Set `n_pc` and `n_pc2` to some integer (e.g. 15) in `tissue.main.conformalize_spatial_uncertainty()` so that k-means clustering is performed in a lower-dimensional space (for better runtime and performance). 711 | 712 | **Other runtime tips**: 713 | 714 | - Turn off the Wasserstein calculation in `tissue.main.conformalize_prediction_interval()` by setting `compute_wasserstein=False` (the default). 715 | 716 | 717 | 718 | ## Memory usage: 719 | 720 | We have optimized TISSUE to be memory-efficient with respect to the size of the original dataset.
Since spatial transcriptomics datasets can be very large, and TISSUE requires additional overhead for some of its operations, here are some suggestions for dealing with memory issues: 721 | 722 | - To downsize datasets with many cells or many genes (to predict), refer to the previous section on runtime for downsampling approaches that improve both runtime and memory usage. 723 | 724 | - In `tissue.main.build_spatial_graph()`, consider setting `radius` to a value (default is None) for any of the radius-based methods. This is more important in older versions of TISSUE, which were not optimized. 725 | 726 | - In `tissue.downstream.multiple_imputation_testing()`, make sure that `save_mi=False` (the default), which avoids accumulating each multiple imputation in memory. 727 | 728 | 729 | 730 | 731 | # UNDER DEVELOPMENT: 732 | - Multi-threading for making cross-validation predictions in `tissue.main.predict_gene_expression()`. 733 | - Gene filtering guidelines / strategy 734 | - Suppress warning printouts 735 | 736 | # Citation 737 | 738 | If you find this code useful, we would appreciate it if you cite the following publications: 739 | 740 | --- 741 | Sun, E.D., Ma, R., Navarro Negredo, P. et al. TISSUE: uncertainty-calibrated prediction of single-cell spatial transcriptomics improves downstream analyses. Nat Methods (2024). https://doi.org/10.1038/s41592-024-02184-y 742 | 743 | --- 744 | **Preprint:** 745 | 746 | Sun ED, Ma R, Navarro Negredo P, Brunet A, Zou J. TISSUE: uncertainty-calibrated prediction of single-cell spatial transcriptomics improves downstream analyses. Preprint at https://doi.org/10.1101/2023.04.25.538326 (2023). 747 | 748 | For Jupyter notebooks and Python scripts associated with our original publication, please refer to https://github.com/sunericd/tissue-figures-and-analyses.git.
**NOTE: For the original publication, we used TISSUE version 0.0.2** 749 | 750 | 754 | -------------------------------------------------------------------------------- /tissue/downstream.py: -------------------------------------------------------------------------------- 1 | # Contains functions for all downstream applications of TISSUE calibration scores and prediction intervals 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import scanpy as sc 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | import anndata as ad 10 | import os 11 | import sys 12 | 13 | #from tissue.main import build_calibration_scores, get_spatial_uncertainty_scores_from_metadata 14 | from .main import build_calibration_scores, get_spatial_uncertainty_scores_from_metadata 15 | 16 | 17 | def multiple_imputation_testing (adata, predicted, calib_genes, condition, test="ttest", n_imputations=100, 18 | group1=None, group2=None, symmetric=False, return_keys=False, save_mi=False): 19 | ''' 20 | Uses multiple imputation with the score distributions to perform hypothesis testing 21 | 22 | Parameters 23 | ---------- 24 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 25 | predicted [str] - key in adata.obsm that corresponds to predicted gene expression 26 | calib_genes [list or arr of str] - names of the genes in adata.var_names that are used in the calibration set 27 | condition [str] - key in adata.obs for which to compute the hypothesis test 28 | group1 [value] - value in adata.obs[condition] identifying the first comparison group 29 | if None, will perform group vs all comparisons for all unique values in adata.obs[condition] 30 | group2 [value] - value in adata.obs[condition] identifying the second comparison group 31 | if None, will compare against all values that are not group1 32 | test [str] - statistical test to use: 33 | "ttest" - two-sample t-test using Rubin's rules (best theoretical support/guarantee) 34 | "wilcoxon_greater" - one-sided Wilcoxon rank-sum (Mann-Whitney U) test for greater expression using p-value transformation 35 | "wilcoxon_less" - one-sided Wilcoxon rank-sum (Mann-Whitney U) test for lesser expression using p-value transformation 36 | "spatialde" - SpatialDE test using p-value transformation 37 | n_imputations [int] - number of imputations to use 38 | symmetric [bool] - whether to have symmetric (or non-symmetric) prediction intervals 39 | return_keys [bool] - whether to return the keys for which to access the results from adata 40 | save_mi [False or str] - multiple imputation saving (only used for multiple_imputation_ttest()) 41 | 42 | Returns 43 | ------- 44 | Modifies adata in-place to add the statistics and test results to metadata 45 | Optionally returns the keys to access the results from adata 46 | 47 | ''' 48 | ##################################################################### 49 | # T-test (default) - this is the option with best theoretical support 50 | ##################################################################### 51 | if test == "ttest": 52 | keys = multiple_imputation_ttest (adata, predicted, calib_genes, condition, n_imputations=n_imputations, 53 | group1=group1, group2=group2, symmetric=symmetric, save_mi=save_mi) 54 | 55 | ##################################################################### 56 | # One-sided ("less"/"greater") Wilcoxon test 57 | #####################################################################
58 | elif test == "wilcoxon_less": 59 | keys = multiple_imputation_wilcoxon (adata, predicted, calib_genes, condition, n_imputations=n_imputations, 60 | group1=group1, group2=group2, symmetric=symmetric, direction='less') 61 | elif test == "wilcoxon_greater": 62 | keys = multiple_imputation_wilcoxon (adata, predicted, calib_genes, condition, n_imputations=n_imputations, 63 | group1=group1, group2=group2, symmetric=symmetric, direction='greater') 64 | 65 | ##################################################################### 66 | # SpatialDE (spatially variable genes) test 67 | ##################################################################### 68 | elif test == "spatialde": 69 | keys = multiple_imputation_spatialde (adata, predicted, calib_genes, n_imputations=n_imputations, symmetric=symmetric) 70 | 71 | # raise exception if test does not match options 72 | else: 73 | raise Exception ("Specified test not recognized") 74 | 75 | if return_keys is True: 76 | 77 | return(keys) 78 | 79 | 80 | def multiple_imputation_spatialde (adata, predicted, calib_genes, n_imputations=100, symmetric=False): 81 | ''' 82 | Runs TISSUE multiple imputation SpatialDE test using p-value transformation 83 | 84 | See multiple_imputation_testing() for details on parameters 85 | ''' 86 | import SpatialDE 87 | 88 | # get uncertainties and scores from saved adata 89 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 90 | 91 | ### Building calibration sets for scores 92 | 93 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 94 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 95 | 96 | ### Multiple imputation 97 | 98 | # init dictionary to hold results 99 | stat_dict = {} 100 | stat_dict["pvalue"] = {} 101 | 102 | for m in range(n_imputations): 103 | 104 | # generate new imputation 105 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 106 | 107 | key = "spatialde" 108 | 109 | if m == 0: # init list 110 | stat_dict["pvalue"][key] = [] 111 | 112 | # get spatialDE p-values 113 | normalized_matrix = new_G/(1+np.sum(new_G,axis=1)[:,None]) 114 | normalized_matrix = np.log1p((normalized_matrix-np.min(normalized_matrix)) * 100) 115 | sp_df = pd.DataFrame(normalized_matrix, 116 | columns=adata.obsm[predicted].columns, 117 | index=adata.obsm[predicted].index) 118 | 119 | results = SpatialDE.run(adata.obsm['spatial'], sp_df) 120 | 121 | # sort by gene name order 122 | results.drop_duplicates(subset = ['g'], keep = 'first', inplace = True) # workaround duplication SpatialDE bug 123 | results.g = results.g.astype("category") 124 | results.g = results.g.cat.set_categories(adata.obsm[predicted].columns) 125 | results = results.sort_values(["g"]) 126 | 127 | # get pvalues 128 | pval = list(results["pval"]) 129 | stat_dict["pvalue"][key].append(pval) 130 | 131 | # pool statistics 132 | pooled_results_dict = {} 133 | pooled_results_dict['pvalue'] = {} 134 | # for each test grouping 135 | for key in stat_dict['pvalue'].keys(): 136 | pooled_results_dict['pvalue'][key] = [] 137 | pval_arr = np.vstack(stat_dict['pvalue'][key]) 138 | # for each gene, get mi pvalue 139 | for ci in range(pval_arr.shape[1]): 140 | mi_pval = multiply_imputed_pvalue (pval_arr[:,ci], method="licht_rubin") 141 | pooled_results_dict['pvalue'][key].append(mi_pval) 142 | 143 | # add stats to adata 144 | keys_list = [] 145 | for key_measure in pooled_results_dict.keys(): 146 | 
for key_comparison in pooled_results_dict[key_measure].keys(): 147 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(np.array(pooled_results_dict[key_measure][key_comparison])[None,:], 148 | columns=adata.obsm[predicted].columns) 149 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 150 | 151 | return(keys_list) 152 | 153 | 154 | def multiple_imputation_wilcoxon (adata, predicted, calib_genes, condition, n_imputations=100, 155 | group1=None, group2=None, symmetric=False, direction="greater"): 156 | ''' 157 | Runs TISSUE multiple imputation one-sided Wilcoxon rank-sum (greater/less) test using p-value transformation 158 | 159 | See multiple_imputation_testing() for details on parameters 160 | ''' 161 | from scipy.stats import mannwhitneyu 162 | 163 | # get uncertainties and scores from saved adata 164 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 165 | 166 | ### Building calibration sets for scores 167 | 168 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 169 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 170 | 171 | ### Multiple imputation 172 | 173 | # init dictionary to hold results 174 | stat_dict = {} 175 | stat_dict["pvalue"] = {} 176 | 177 | # cast condition to str 178 | condition = str(condition) 179 | 180 | for m in range(n_imputations): 181 | 182 | # generate new imputation 183 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 184 | 185 | if group1 is None: # pairwise comparisons against all 186 | 187 | for g1 in np.unique(adata.obs[condition]): 188 | 189 | key = str(g1)+"_all" 190 | 191 | if m == 0: # init list 192 | stat_dict["pvalue"][key] = [] 193 | 194 | g1_bool = (adata.obs[condition] == g1) # g1 195 | g2_bool = (adata.obs[condition] != g1) # all other 196 | 197 | # get wilcoxon p-values 198 | pval = [] 199 | for ci in range(new_G.shape[1]): 200 | u,p = mannwhitneyu(new_G[g1_bool,ci], new_G[g2_bool,ci], alternative=direction) 201 | pval.append(p) 202 | 203 | stat_dict["pvalue"][key].append(pval) 204 | 205 | elif group2 is None: # group1 vs all 206 | 207 | key = str(group1)+"_all" 208 | 209 | if m == 0: # init list 210 | stat_dict["pvalue"][key] = [] 211 | 212 | g1_bool = (adata.obs[condition] == group1) # g1 213 | g2_bool = (adata.obs[condition] != group1) # all other 214 | 215 | # get wilcoxon p-values 216 | pval = [] 217 | for ci in range(new_G.shape[1]): 218 | u,p = mannwhitneyu(new_G[g1_bool,ci], new_G[g2_bool,ci], alternative=direction) 219 | pval.append(p) 220 | 221 | stat_dict["pvalue"][key].append(pval) 222 | 223 | else: # group1 vs group2 224 | 225 | key = str(group1)+"_"+str(group2) 226 | 227 | if m == 0: # init list 228 | stat_dict["pvalue"][key] = [] 229 | 230 | g1_bool = (adata.obs[condition] == group1) # g1 231 | g2_bool = (adata.obs[condition] == group2) # g2 232 | 233 | # get wilcoxon p-values 234 | pval = [] 235 | for ci in range(new_G.shape[1]): 236 | u,p = mannwhitneyu(new_G[g1_bool,ci], new_G[g2_bool,ci], alternative=direction) 237 | pval.append(p) 238 | 239 | stat_dict["pvalue"][key].append(pval) 240 | 241 | # pool statistics 242 | pooled_results_dict = {} 243 | pooled_results_dict['pvalue'] = {} 244 | # for each test grouping 245 | for key in stat_dict['pvalue'].keys(): 246 | pooled_results_dict['pvalue'][key] = [] 247 | pval_arr = np.vstack(stat_dict['pvalue'][key]) 248 | # for each gene,
get mi pvalue 249 | for ci in range(pval_arr.shape[1]): 250 | mi_pval = multiply_imputed_pvalue (pval_arr[:,ci], method="licht_rubin") 251 | pooled_results_dict['pvalue'][key].append(mi_pval) 252 | 253 | # add stats to adata 254 | keys_list = [] 255 | for key_measure in pooled_results_dict.keys(): 256 | for key_comparison in pooled_results_dict[key_measure].keys(): 257 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(np.array(pooled_results_dict[key_measure][key_comparison])[None,:], 258 | columns=adata.obsm[predicted].columns) 259 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 260 | 261 | return(keys_list) 262 | 263 | 264 | def multiply_imputed_pvalue (pvalues, method="licht_rubin"): 265 | ''' 266 | Computes a multiply imputed p-value from a list of p-values according to Licht-Rubin procedure or median procedure 267 | 268 | Parameters 269 | ---------- 270 | pvalues [array-like] - array of p-values from multiple imputation tests 271 | method [str] - which method for p-value calculation to use: "licht_rubin" or "median" 272 | 273 | Returns 274 | ------- 275 | mi_pvalue [float] - p-value modified for multiple imputation 276 | 277 | See reference for technical details: https://stefvanbuuren.name/fimd/sec-multiparameter.html#sec:chi 278 | ''' 279 | from scipy.stats import norm 280 | 281 | if method == "licht_rubin": 282 | z = norm.ppf(pvalues) # transform to z-scale 283 | num = np.nanmean(z) 284 | den = np.sqrt(1 + np.nanvar(z)) 285 | mi_pvalue = norm.cdf( num / den) # average and transform back 286 | 287 | elif method == "median": 288 | mi_pvalue = np.nanmedian(pvalues) 289 | 290 | else: 291 | raise Exception ("method for multiply_imputed_pvalue() not recognized") 292 | 293 | return(mi_pvalue) 294 | 295 | 296 | 297 | def multiple_imputation_ttest (adata, predicted, calib_genes, condition, n_imputations=100, 298 | group1=None, group2=None, symmetric=False, save_mi=False): 299 | ''' 300 | Runs TISSUE multiple imputation two-sample t-test using Rubin's rules 301 | 302 | See multiple_imputation_testing() for details on parameters 303 | 304 | Additional Parameters 305 | --------------------- 306 | save_mi [False or str] - if not False, then saves a "{predicted}.npy" stacked matrix of imputed gene expression at the save_mi path -- NOTE: this requires large memory 307 | ''' 308 | 309 | # get uncertainties and scores from saved adata 310 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 311 | 312 | ### Building calibration sets for scores 313 | 314 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 315 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 316 | 317 | ### Multiple imputation 318 | 319 | # init dictionary to hold results (for independent two-sample t-test) 320 | stat_dict = {} 321 | stat_dict["mean_difference"] = {} 322 | stat_dict["standard_deviation"] = {} 323 | 324 | # cast condition to str 325 | condition = str(condition) 326 | 327 | new_G_list = [] # for saving multiple imputations 328 | 329 | for m in range(n_imputations): 330 | 331 | # generate new imputation 332 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 333 | if save_mi is not False: 334 | new_G_list.append(new_G) 335 | 336 | # calculate statistics for the imputation using approach from Palmer & Peer, 2016 337 | 338 | if group1 is None: # pairwise comparisons
against all 339 | 340 | for g1 in np.unique(adata.obs[condition]): 341 | 342 | key = str(g1)+"_all" 343 | 344 | if m == 0: # init list 345 | stat_dict["mean_difference"][key] = [] 346 | stat_dict["standard_deviation"][key] = [] 347 | 348 | g1_bool = (adata.obs[condition] == g1) # g1 349 | g2_bool = (adata.obs[condition] != g1) # all other 350 | 351 | mean_diff, pooled_sd = get_ttest_stats(new_G, g1_bool, g2_bool) # get ttest stats 352 | stat_dict["mean_difference"][key].append(mean_diff) 353 | stat_dict["standard_deviation"][key].append(pooled_sd) 354 | 355 | elif group2 is None: # group1 vs all 356 | 357 | key = str(group1)+"_all" 358 | 359 | if m == 0: # init list 360 | stat_dict["mean_difference"][key] = [] 361 | stat_dict["standard_deviation"][key] = [] 362 | 363 | g1_bool = (adata.obs[condition] == group1) # g1 364 | g2_bool = (adata.obs[condition] != group1) # all other 365 | 366 | mean_diff, pooled_sd = get_ttest_stats(new_G, g1_bool, g2_bool) # get ttest stats 367 | stat_dict["mean_difference"][key].append(mean_diff) 368 | stat_dict["standard_deviation"][key].append(pooled_sd) 369 | 370 | else: # group1 vs group2 371 | 372 | key = str(group1)+"_"+str(group2) 373 | 374 | if m == 0: # init list 375 | stat_dict["mean_difference"][key] = [] 376 | stat_dict["standard_deviation"][key] = [] 377 | 378 | g1_bool = (adata.obs[condition] == group1) # g1 379 | g2_bool = (adata.obs[condition] == group2) # g2 380 | 381 | mean_diff, pooled_sd = get_ttest_stats(new_G, g1_bool, g2_bool) # get ttest stats 382 | stat_dict["mean_difference"][key].append(mean_diff) 383 | stat_dict["standard_deviation"][key].append(pooled_sd) 384 | 385 | # pool statistics and perform t-test 386 | pooled_results_dict = pool_multiple_stats(stat_dict) 387 | 388 | # add stats to adata 389 | keys_list = [] 390 | for key_measure in pooled_results_dict.keys(): 391 | for key_comparison in pooled_results_dict[key_measure].keys(): 392 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(pooled_results_dict[key_measure][key_comparison][None,:], 393 | columns=adata.obsm[predicted].columns) 394 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 395 | 396 | # save multiple imputations 397 | if save_mi is not False: 398 | # stack all imputations and save 399 | stacked_mi = np.dstack(new_G_list) 400 | np.save(os.path.join(save_mi,f"{predicted}.npy"), stacked_mi) 401 | 402 | return(keys_list) 403 | 404 | 405 | def multiple_imputation_gene_signature (sig_dirpath, adata, predicted, calib_genes, condition, n_imputations=100, 406 | group1=None, group2=None, symmetric=False, return_keys=False, load_mi=False): 407 | ''' 408 | Uses multiple imputation with the score distributions to perform hypothesis testing on gene signatures 409 | 410 | Parameters 411 | ---------- 412 | sig_dirpath [str] - path to the directory containing the gene signatures organized as: 413 | sig_dirpath/ 414 | {name of signature 1}/ 415 | {name of signature N}/ 416 | genes.txt - text file with each row being a gene name 417 | coefficients.txt - optional text file with each row being a float weight for corresponding gene 418 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 419 | predicted [str] - key in adata.obsm that corresponds to predicted gene expression 420 | calib_genes [list or arr of str] - names of the genes in adata.var_names that are used in the calibration set 421 | condition [str] - key in adata.obs for which to compute the hypothesis test 422 | 
group1 [value] - value in adata.obs[condition] identifying the first comparison group 423 | if None, will perform group vs all comparisons for all unique values in adata.obs[condition] 424 | group2 [value] - value in adata.obs[condition] identifying the second comparison group 425 | if None, will compare against all values that are not group1 426 | n_imputations [int] - number of imputations to use 427 | symmetric [bool] - whether to have symmetric (or non-symmetric) prediction intervals 428 | return_keys [bool] - whether to return the keys for which to access the results from adata 429 | load_mi [bool] - whether to load the saved "{predicted}.npy" stacked matrix of all multiple imputations from sig_dirpath (written by multiple_imputation_ttest() via save_mi) 430 | 431 | Returns 432 | ------- 433 | Modifies adata in-place to add the statistics and test results to metadata 434 | Optionally returns the keys to access the results from adata 435 | 436 | ''' 437 | ##################################################################### 438 | # T-test (default) - this is the only option currently for signatures 439 | ##################################################################### 440 | 441 | if load_mi is False: 442 | # get uncertainties and scores from saved adata 443 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 444 | 445 | ### Building calibration sets for scores 446 | 447 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 448 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 449 | else: # load in saved multiple imputations 450 | mi_path = os.path.join(sig_dirpath,f"{predicted}.npy") # path to saved multiple imputations 451 | mi_stacked = np.load(mi_path) 452 | 453 | ### Multiple imputation 454 | 455 | # init dictionary to hold results (for independent two-sample t-test) 456 | stat_dict = {} 457 | stat_dict["mean_difference"] = {} 458 | stat_dict["standard_deviation"] = {} 459 | 460 | # cast condition to str 461 | condition = str(condition) 462 | 463 | for m in range(n_imputations): 464 | 465 | # generate new imputation 466 | if load_mi is False: 467 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 468 | else: 469 | new_G = mi_stacked[:,:,m].copy() # take the m-th multiple imputation 470 | 471 | # compute all signatures 472 | imputed_sigs = [] 473 | sig_names = [] 474 | 475 | for sigdir in next(os.walk(sig_dirpath))[1]: # iterate all top-level signature directories 476 | # read in genes 477 | with open(os.path.join(sig_dirpath,sigdir,"genes.txt")) as f: 478 | signature_genes = [line.rstrip() for line in f] 479 | signature_genes = np.array([x.lower() for x in signature_genes]) 480 | # load coefficients (if any) 481 | if os.path.isfile(os.path.join(sig_dirpath,sigdir,"coefficients.txt")): 482 | signature_coefficients = np.loadtxt(os.path.join(sig_dirpath,sigdir,"coefficients.txt")) 483 | else: 484 | signature_coefficients = np.ones(len(signature_genes)) 485 | # subset into shared genes 486 | shared_gene_idxs = [ii for ii in range(len(signature_genes)) if signature_genes[ii] in adata.obsm[predicted].columns] 487 | signature_genes = signature_genes[shared_gene_idxs] 488 | signature_coefficients = signature_coefficients[shared_gene_idxs] 489 | # if non-empty signature, then compute 490 | if len(signature_genes) > 0: 491 | # compute signature 492 | subset_new_G = pd.DataFrame(new_G, columns = adata.obsm[predicted].columns)[signature_genes].values 493 | sig_value
= np.nansum(subset_new_G*signature_coefficients, axis=1) 494 | # append signature value and name 495 | imputed_sigs.append(sig_value) 496 | sig_names.append(sigdir) 497 | 498 | # construct gene signature matrix 499 | imputed_sigs = np.vstack(imputed_sigs).T 500 | 501 | # keep running average of imputed gene signatures 502 | if m == 0: 503 | mean_imputed_sigs = imputed_sigs * 1/n_imputations 504 | else: 505 | mean_imputed_sigs += imputed_sigs * 1/n_imputations 506 | 507 | # calculate statistics for the imputation using approach from Palmer & Peer, 2016 508 | 509 | if group1 is None: # pairwise comparisons against all 510 | 511 | for g1 in np.unique(adata.obs[condition]): 512 | 513 | key = str(g1)+"_all" 514 | 515 | if m == 0: # init list 516 | stat_dict["mean_difference"][key] = [] 517 | stat_dict["standard_deviation"][key] = [] 518 | 519 | g1_bool = (adata.obs[condition] == g1) # g1 520 | g2_bool = (adata.obs[condition] != g1) # all other 521 | 522 | mean_diff, pooled_sd = get_ttest_stats(imputed_sigs, g1_bool, g2_bool) # get ttest stats 523 | stat_dict["mean_difference"][key].append(mean_diff) 524 | stat_dict["standard_deviation"][key].append(pooled_sd) 525 | 526 | elif group2 is None: # group1 vs all 527 | 528 | key = str(group1)+"_all" 529 | 530 | if m == 0: # init list 531 | stat_dict["mean_difference"][key] = [] 532 | stat_dict["standard_deviation"][key] = [] 533 | 534 | g1_bool = (adata.obs[condition] == group1) # g1 535 | g2_bool = (adata.obs[condition] != group1) # all other 536 | 537 | mean_diff, pooled_sd = get_ttest_stats(imputed_sigs, g1_bool, g2_bool) # get ttest stats 538 | stat_dict["mean_difference"][key].append(mean_diff) 539 | stat_dict["standard_deviation"][key].append(pooled_sd) 540 | 541 | else: # group1 vs group2 542 | 543 | key = str(group1)+"_"+str(group2) 544 | 545 | if m == 0: # init list 546 | stat_dict["mean_difference"][key] = [] 547 | stat_dict["standard_deviation"][key] = [] 548 | 549 | g1_bool = (adata.obs[condition] == group1) # g1 550 | g2_bool = (adata.obs[condition] == group2) # g2 551 | 552 | mean_diff, pooled_sd = get_ttest_stats(imputed_sigs, g1_bool, g2_bool) # get ttest stats 553 | stat_dict["mean_difference"][key].append(mean_diff) 554 | stat_dict["standard_deviation"][key].append(pooled_sd) 555 | 556 | # pool statistics and perform t-test 557 | pooled_results_dict = pool_multiple_stats(stat_dict) 558 | 559 | # add stats to adata 560 | keys_list = [] 561 | for key_measure in pooled_results_dict.keys(): 562 | for key_comparison in pooled_results_dict[key_measure].keys(): 563 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(pooled_results_dict[key_measure][key_comparison][None,:], 564 | columns=sig_names) 565 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 566 | 567 | # add gene sigs to adata 568 | adata.obsm[predicted+"_gene_signatures"] = pd.DataFrame(mean_imputed_sigs, columns=sig_names, index=adata.obs_names) 569 | 570 | if return_keys is True: 571 | 572 | return(keys_list) 573 | 574 | 575 | 576 | def sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=False): 577 | ''' 578 | Creates a new imputation by sampling from scores and adding to G 579 | 580 | Parameters 581 | ---------- 582 | G, G_stdev, groups - outputs of get_spatial_uncertainty_scores_from_metadata() 583 | scores_flattened_dict - output of build_calibration_scores() 584 | 585 | See multiple_imputation_testing() for more details of arguments 586 | 587 | Returns 588 | ------- 589 | 
new_G - array of the new sampled predicted gene expression (same dimensions as G: cells x genes) 590 | ''' 591 | new_scores = np.zeros(G.shape) # init array for sampled scores 592 | new_add_sub = np.zeros(G.shape) # init array for add/subtract coefs 593 | 594 | # for each group, sample calibration score and corresponding imputations 595 | unique_groups, unique_counts = np.unique(groups[~np.isnan(groups)], return_counts=True) 596 | 597 | for ui, group in enumerate(unique_groups): 598 | count = unique_counts[ui] # get number of values in group 599 | 600 | # sample scores and add/sub indicators 601 | if symmetric is True: 602 | scores_flattened = scores_flattened_dict[str(group)] # get scores 603 | if len(scores_flattened) < 100: # default to full set if <100 in group 604 | scores_flattened = scores_flattened_dict[str(np.nan)] 605 | sampled_scores = np.random.choice(scores_flattened, count, replace=True) # with replacement, sample scores 606 | add_sub = np.random.choice([-1,1], count, replace=True) # add or subtract 607 | else: 608 | scores_lo_flattened = scores_flattened_dict[str(group)][0] 609 | scores_hi_flattened = scores_flattened_dict[str(group)][1] 610 | if (len(scores_lo_flattened) < 100) or (len(scores_hi_flattened) < 100): # default to full set if <100 in group 611 | scores_lo_flattened = scores_flattened_dict[str(np.nan)][0] 612 | scores_hi_flattened = scores_flattened_dict[str(np.nan)][1] 613 | scores_flattened = np.concatenate((scores_lo_flattened, scores_hi_flattened)) 614 | lo_hi_indicators = np.concatenate(([-1]*len(scores_lo_flattened), [1]*len(scores_hi_flattened))) 615 | # sample indices 616 | sampled_idxs = np.random.choice(np.arange(len(scores_flattened)), count, replace=True) # with replacement 617 | sampled_scores = scores_flattened[sampled_idxs] 618 | add_sub = lo_hi_indicators[sampled_idxs] 619 | 620 | # append to new_scores and new_add_sub 621 | new_scores[groups==group] = sampled_scores 622 | new_add_sub[groups==group] = add_sub 623 | 624 | # calculate new imputation 625 | new_G = G + new_add_sub*(new_scores*G_stdev) 626 | 627 | return (new_G) 628 | 629 | 630 | def get_ttest_stats(G, g1_bool, g2_bool): 631 | ''' 632 | Computes mean_diff and pooled SD for each column of G independently 633 | 634 | Parameters 635 | ---------- 636 | G [array] - 2D array with columns as genes and rows as cells 637 | g1_bool [bool array] - 1D array with length equal to number of rows in G; labels group1 638 | g2_bool [bool array] - 1D array with length equal to number of rows in G; labels group2 639 | 640 | Returns 641 | ------- 642 | mean_diff - mean difference for t-test 643 | pooled_sd - pooled standard deviation for t-test 644 | ''' 645 | mean_diff = np.nanmean(G[g1_bool,:], axis=0) - np.nanmean(G[g2_bool,:], axis=0) 646 | n1 = np.count_nonzero(~np.isnan(G[g1_bool,:]), axis=0) 647 | n2 = np.count_nonzero(~np.isnan(G[g2_bool,:]), axis=0) 648 | sp = np.sqrt( ( (n1-1)*(np.nanvar(G[g1_bool,:],axis=0)) + (n2-1)*(np.nanvar(G[g2_bool,:],axis=0)) ) / (n1+n2-2) ) 649 | pooled_sd = np.sqrt(1/n1 + 1/n2) * sp 650 | 651 | return(mean_diff, pooled_sd) 652 | 653 | 654 | def two_sample_ttest (G, g1_bool, g2_bool): 655 | ''' 656 | Computes two-sample t-test for unequal sample sizes using get_ttest_stats() 657 | 658 | Parameters 659 | ---------- 660 | G [array] - 2D array with columns as genes and rows as cells 661 | g1_bool [bool array] - 1D array with length equal to number of rows in G; labels group1 662 | g2_bool [bool array] - 1D array with length equal to number of rows in G; labels group2
663 | 664 | Returns 665 | ------- 666 | tt - t-statistic 667 | pp - p-value 668 | ''' 669 | from scipy import stats 670 | # calculate t-stat 671 | mean_diff, pooled_sd = get_ttest_stats(G, g1_bool, g2_bool) 672 | tt = mean_diff/pooled_sd 673 | # calculate dof 674 | n1 = np.count_nonzero(~np.isnan(G[g1_bool,:]), axis=0) 675 | n2 = np.count_nonzero(~np.isnan(G[g2_bool,:]), axis=0) 676 | dof = n1+n2-2 677 | # calculate p-value 678 | pp = 2*(1 - stats.t.cdf(np.abs(tt), dof)) 679 | 680 | return(tt, pp) 681 | 682 | 683 | def pool_multiple_stats(stat_dict): 684 | ''' 685 | Pool stats across multiple imputations for t-test 686 | 687 | Parameters 688 | ---------- 689 | stat_dict [dict] - dictionary containing statistical testing results (generated in multiple_imputation_ttest()) 690 | 691 | Returns 692 | ------- 693 | results_dict [dict] - dictionary containing the pooled statistics from using Rubin's rules 694 | ''' 695 | from scipy import stats 696 | 697 | # init results_dict 698 | results_dict = {} 699 | results_dict["tstat"] = {} 700 | results_dict["pvalue"] = {} 701 | 702 | results_dict["varw"] = {} 703 | results_dict["varb"] = {} 704 | results_dict["poolmean"] = {} 705 | 706 | for key in stat_dict["mean_difference"].keys(): 707 | 708 | d = len(stat_dict["mean_difference"][key]) 709 | 710 | # compute pooled terms 711 | pooled_mean = np.mean(np.vstack(stat_dict["mean_difference"][key]), axis=0) 712 | var_w = np.mean(np.vstack(stat_dict["standard_deviation"][key])**2, axis=0) # within-draw sample variance 713 | var_b = 1/(d-1) * np.sum((np.vstack(stat_dict["mean_difference"][key])-pooled_mean)**2, axis=0) # between-draw sample variance 714 | var_MI = var_w + (1+1/d)*var_b # multiple imputation variance 715 | 716 | test_stat = pooled_mean / np.sqrt(var_MI) # pooled t statistic 717 | 718 | # compute pvalue from T distribution 719 | dof = (d-1)*(1+(d*var_w)/((d+1)*var_b))**2 # degrees of freedom for T distribution 720 | pval = 2*(1 - stats.t.cdf(np.abs(test_stat), dof)) 721 | 722 | # Add test statistic and pvalue 723 | results_dict["tstat"][key] = test_stat 724 | results_dict["pvalue"][key] = pval 725 | 726 | # Add intermediate stats (for debugging, etc) 727 | results_dict["varw"][key] = var_w 728 | results_dict["varb"][key] = var_b 729 | results_dict["poolmean"][key] = pooled_mean 730 | 731 | return(results_dict) 732 | 733 | 734 | 735 | def weighted_PCA(adata, imp_method, pca_method="wpca", weighting="inverse_norm_pi_width", quantile_cutoff=None, 736 | n_components=15, replace_inf=None, binarize=0.2, binarize_ratio=10, log_transform=False, 737 | scale=True, tag="", return_weights=False,): 738 | ''' 739 | Runs weighted PCA using the "wpca" package: https://github.com/jakevdp/wpca 740 | 741 | Parameters 742 | ---------- 743 | adata [AnnData] - should be the AnnData after running conformalize_prediction_interval() 744 | - must include in obsm: {imp_method}_predicted_expression, 745 | {imp_method}_predicted_expression_lo, 746 | {imp_method}_predicted_expression_hi 747 | imp_method [str] - specifies which imputation method to return PCA for (e.g. 
'knn', 'spage', 'tangram') 748 | pca_method [str] - "wpca" for WPCA (Delchambre, 2014), "empca" for EMPCA (Bailey, 2012), "pca" for PCA 749 | weighting [str] - "uniform" (regular PCA) 750 | "inverse_pi_width" (weights are 1/(prediction interval width)) 751 | "inverse_norm_pi_width" (weights are 1/(prediction interval width), normalized per gene by the mean weight) 752 | quantile_cutoff [None or float] - quantile (between 0 and 1) for which to set a ceiling for the weights 753 | n_components [int] - number of principal components 754 | replace_inf [None, str, float] - what to replace np.inf with (after all other weight transforms); if None, keeps np.inf 755 | can also be "max", "min", "mean", or "median" to replace with the corresponding statistic of the finite weights 756 | binarize [bool or float] - if True, binarizes the weights with an Otsu threshold (weights at or above the threshold are set to 1, weights below to 1/binarize_ratio); if a float, uses that weight quantile as the threshold instead 757 | binarize_ratio [int or float] - fold change between the high and low binarized weight values 758 | log_transform [bool] - whether to log1p transform weights (will be done before binarization if binarize=True) 759 | scale [bool] - whether to scale data with StandardScaler() before running WPCA 760 | tag [str] - additional tag to append to the obsm key for storing the PCs 761 | return_weights [bool] - whether to return weights used in WPCA 762 | 763 | Returns 764 | ------- 765 | Stores the result in adata.obsm["{imp_method}_predicted_expression_PC{n_components}_{tag}"] 766 | Optionally returns the array of weights used in WPCA 767 | 768 | Refer to postprocess_weights() for the order of weight transformations 769 | ''' 770 | from wpca import PCA, WPCA, EMPCA 771 | 772 | predicted = f"{imp_method}_predicted_expression" 773 | 774 | # get gene names/order 775 | genes = adata.obsm[predicted].columns 776 | 777 | # determine weights 778 | if weighting == "inverse_pi_width": 779 | weights = 1/(adata.obsm[predicted+'_hi'][genes].values-adata.obsm[predicted+'_lo'][genes].values) 780 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 781 | elif weighting == "inverse_norm_pi_width": 782 | weights = 1/(adata.obsm[predicted+'_hi'][genes].values-adata.obsm[predicted+'_lo'][genes].values) 783 | weights = weights / np.nanmean(weights, axis=0) 784 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 785 | elif weighting == "uniform": 786 | weights = np.ones(adata.obsm[predicted].shape) 787 | elif weighting == "inverse_residual": 788 | weights = 1/np.abs(adata.obsm[predicted][genes].values - np.array(adata[:,genes].X)) 789 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 790 | elif weighting == "inverse_norm_residual": 791 | weights = 1/np.abs(adata.obsm[predicted][genes].values - np.array(adata[:,genes].X)) 792 | weights = weights / np.nanmean(weights, axis=0) 793 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 794 | else: 795 | raise Exception("weighting not recognized") 796 | 797 | # scaling 798 | if scale is True: 799 | X = StandardScaler().fit_transform(adata.obsm[predicted].values) 800 | else: 801 | X = adata.obsm[predicted].values 802 | 803 | # run weighted PCA 804 | if pca_method == "wpca": 805 | X_red = WPCA(n_components=n_components).fit_transform(X, weights=weights) 806 | elif pca_method == "empca": 807 | X_red = EMPCA(n_components=n_components).fit_transform(X, weights=weights) 808 | elif pca_method == "pca": 809 | X_red =
PCA(n_components=n_components).fit_transform(X) 810 | elif pca_method == "gwpca": # gene-weighted PCA 811 | weights = np.nanmean(weights, axis=0) 812 | X_red = PCA(n_components=n_components).fit_transform(X * weights) 813 | else: 814 | raise Exception("pca_method not recognized") 815 | 816 | # add PCs to adata 817 | adata.obsm[predicted+f"_PC{n_components}_{tag}"] = X_red 818 | 819 | if return_weights is True: 820 | return(weights) 821 | 822 | 823 | def postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform): 824 | ''' 825 | Method for postprocessing weights (filter with cutoff, replace inf, etc) for weighted_PCA() 826 | 827 | Refer to weighted_pca() for details on arguments 828 | ''' 829 | # cutoff weights 830 | if quantile_cutoff is not None: 831 | cutoff = np.nanquantile(weights, quantile_cutoff) 832 | weights[np.isfinite(weights) & (weights >= cutoff)] = cutoff 833 | 834 | # log-transform 835 | if log_transform is True: 836 | weights = np.log1p(weights) 837 | 838 | # binarize weights 839 | if binarize is True: 840 | from skimage.filters import threshold_otsu 841 | cutoff = threshold_otsu(weights[np.isfinite(weights)]) 842 | weights[np.isfinite(weights) & (weights >= cutoff)] = 1 843 | weights[np.isfinite(weights) & (weights < cutoff)] = 1/binarize_ratio 844 | elif binarize is False: 845 | pass 846 | elif isinstance(binarize, float) or isinstance(binarize, int): 847 | cutoff = np.nanquantile(weights, binarize) 848 | weights[np.isfinite(weights) & (weights >= cutoff)] = 1 849 | weights[np.isfinite(weights) & (weights < cutoff)] = 1/binarize_ratio 850 | 851 | # deal with infs (from division by zero) 852 | if replace_inf == "max": 853 | weights[~np.isfinite(weights)] = np.nanmax(weights[np.isfinite(weights)]) 854 | elif replace_inf == "min": 855 | weights[~np.isfinite(weights)] = np.nanmin(weights[np.isfinite(weights)]) 856 | elif replace_inf == "mean": 857 | weights[~np.isfinite(weights)] = np.nanmean(weights[np.isfinite(weights)]) 858 | elif replace_inf == "median": 859 | weights[~np.isfinite(weights)] = np.nanmedian(weights[np.isfinite(weights)]) 860 | elif isinstance(replace_inf, float) or isinstance(replace_inf, int): 861 | weights[~np.isfinite(weights)] = replace_inf 862 | 863 | return(weights) 864 | 865 | 866 | def filtered_PCA(adata, imp_method, proportion=0.05, stratification=None, n_components=15, scale=True, normalize=False, 867 | tag="", return_keep_idxs=False): 868 | ''' 869 | Runs filtered PCA using the TISSUE cell filtering approach 870 | 871 | Parameters 872 | ---------- 873 | adata [AnnData] - should be the AnnData after running conformalize_prediction_interval() 874 | - must include in obsm: {imp_method}_predicted_expression, 875 | {imp_method}_predicted_expression_lo, 876 | {imp_method}_predicted_expression_hi 877 | imp_method [str] - specifies which imputation method to return PCA for (e.g. 
'knn', 'spage', 'tangram') 878 | proportion [float] - between 0 and 1; proportion of most uncertain cells to drop 879 | stratification [None or 1d numpy array] - array of values to stratify the drop by 880 | - same length as number of rows in X 881 | - if None, no stratification 882 | n_components [int] - number of principal components 883 | scale [bool] - whether to scale data with StandardScaler() before running PCA 884 | normalize [bool] - whether to normalize prediction interval width by the absolute predicted expression value 885 | tag [str] - additional tag to append to the obsm key for storing the PCs 886 | return_keep_idxs [bool] - whether to return the keep_idxs for filtering 887 | 888 | Returns 889 | ------- 890 | Stores the result in adata.obsm["{imp_method}_predicted_expression_PC{n_components}_{tag}"] 891 | Optionally returns the indices corresponding to the observations to keep after filtering 892 | ''' 893 | predicted = f"{imp_method}_predicted_expression" 894 | 895 | # get predicted expression matrices 896 | X = adata.obsm[predicted].values.copy() 897 | 898 | # get uncertainty (PI width) for filtering 899 | X_uncertainty = adata.obsm[f'{predicted}_hi'].values - adata.obsm[f'{predicted}_lo'].values 900 | if normalize is True: 901 | X_uncertainty = X_uncertainty / (1+np.abs(adata.obsm[f'{predicted}'].values)) 902 | 903 | # filter cells 904 | keep_idxs = detect_uncertain_cells(X_uncertainty, proportion=proportion, stratification=stratification) 905 | X_filtered = X[keep_idxs,:].copy() 906 | 907 | # scaling 908 | if scale is True: 909 | scaler = StandardScaler().fit(X_filtered) 910 | X = scaler.transform(X) 911 | X_filtered = scaler.transform(X_filtered) 912 | 913 | # run PCA 914 | pca = PCA(n_components=n_components).fit(X_filtered) 915 | X_red = pca.transform(X) 916 | X_red_filtered = pca.transform(X_filtered) 917 | 918 | # add PCs to adata 919 | adata.obsm[predicted+f"_PC{n_components}_{tag}"] = X_red 920 | adata.uns[predicted+f"_PC{n_components}_filtered_{tag}"] = X_red_filtered 921 | 922 | if return_keep_idxs is True: 923 | return (keep_idxs) 924 | 925 | 926 | 927 | def detect_uncertain_cells (X, proportion=0.05, stratification=None): 928 | ''' 929 | Method for dropping a portion of the most uncertain cells from the input. 
930 | 931 | Parameters 932 | ---------- 933 | X [2d numpy array] - array of uncertainty values 934 | proportion [float or "otsu"] - if a float between 0 and 1, proportion of most uncertain cells to drop; if "otsu", drops cells whose average z-score exceeds the Otsu threshold 935 | stratification [None or 1d numpy array] - array of values to stratify the drop by 936 | - same length as number of rows in X 937 | - if None, no stratification 938 | 939 | Returns 940 | ------- 941 | keep_idxs [list] - row indices to keep after dropping the most uncertain cells 942 | ''' 943 | from scipy.stats import zscore 944 | 945 | if stratification is not None: # drop cells within each strata independently 946 | 947 | drop_idxs = [] 948 | 949 | for strata in np.unique(stratification): 950 | 951 | # compute scores 952 | X_strat = X[stratification==strata,:].copy() # calc gene z-scores 953 | orig_idxs = np.arange(X.shape[0])[stratification==strata] 954 | cell_scores = np.nanmean(zscore(X_strat, axis=0), axis=1) # average z-score for each cell 955 | 956 | # determine cutoff score and indices to drop 957 | if (isinstance(proportion, float)) or (isinstance(proportion, int)): 958 | cutoff_idx = int(np.ceil(proportion*len(cell_scores))) # number of cells to drop 959 | strata_drop_idxs = np.argsort(cell_scores)[::-1][:cutoff_idx] 960 | elif proportion == "otsu": 961 | from skimage.filters import threshold_otsu 962 | cutoff = threshold_otsu(cell_scores) 963 | strata_drop_idxs = [i for i in range(len(cell_scores)) if cell_scores[i] > cutoff] 964 | else: 965 | raise Exception("proportion specified not valid") 966 | 967 | drop_idxs.append(orig_idxs[strata_drop_idxs]) # get idxs of highest scores 968 | 969 | drop_idxs = list(np.concatenate(drop_idxs)) 970 | 971 | else: 972 | 973 | # compute scores 974 | cell_scores = zscore(X, axis=0).mean(axis=1) # average z-score for each cell 975 | 976 | # determine cutoff score and indices to drop 977 | if (isinstance(proportion, float)) or (isinstance(proportion, int)): 978 | cutoff_idx = int(np.ceil(proportion*len(cell_scores))) # number of cells to drop 979 | drop_idxs = list(np.argsort(cell_scores)[::-1][:cutoff_idx]) # get idxs of highest scores 980 | elif proportion == "otsu": 981 | from skimage.filters import threshold_otsu 982 | cutoff = threshold_otsu(cell_scores) 983 | drop_idxs = [i for i in range(len(cell_scores)) if cell_scores[i] > cutoff] 984 | else: 985 | raise Exception("proportion specified not valid") 986 | 987 | # return keep indices (determined as indices not in drop indices) 988 | keep_idxs = [i for i in range(X.shape[0]) if i not in drop_idxs] 989 | 990 | return (keep_idxs) -------------------------------------------------------------------------------- /tissue/main.py: -------------------------------------------------------------------------------- 1 | # Contains main functions for core TISSUE pipeline: computing cell-centric variability and calibrated prediction intervals 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import scanpy as sc 7 | import squidpy as sq 8 | from sklearn.decomposition import PCA 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.cluster import KMeans 11 | from sklearn.model_selection import KFold, StratifiedKFold 12 | import anndata as ad 13 | import warnings 14 | import os 15 | 16 | 17 | def load_paired_datasets (spatial_counts, spatial_loc, RNAseq_counts, spatial_metadata = None, 18 | min_cell_prevalence_spatial = 0.0, min_cell_prevalence_RNAseq = 0.01, 19 | min_gene_prevalence_spatial = 0.0, min_gene_prevalence_RNAseq = 0.0): 20 | ''' 21 | Uses datasets in the 
format specified by Li et al. (2022) 22 | See: https://drive.google.com/drive/folders/1pHmE9cg_tMcouV1LFJFtbyBJNp7oQo9J 23 | 24 | Parameters 25 | ---------- 26 | spatial_counts [str] - path to spatial counts file; rows are cells 27 | spatial_loc [str] - path to spatial locations file; rows are cells 28 | RNAseq_counts [str] - path to RNAseq counts file; rows are genes 29 | spatial_metadata [None or str] - if not None, then path to spatial metadata file (will be read into spatial_adata.obs) 30 | min_cell_prevalence_spatial [float between 0 and 1] - minimum prevalence among cells to include gene in spatial anndata object, default=0 31 | min_cell_prevalence_RNAseq [float between 0 and 1] - minimum prevalence among cells to include gene in RNAseq anndata object, default=0.01 32 | min_gene_prevalence_spatial [float between 0 and 1] - minimum prevalence among genes to include cell in spatial anndata object, default=0 33 | min_gene_prevalence_RNAseq [float between 0 and 1] - minimum prevalence among genes to include cell in RNAseq anndata object, default=0 34 | 35 | Returns 36 | ------- 37 | spatial_adata, RNAseq_adata - AnnData objects with counts and location (if applicable) in metadata 38 | ''' 39 | # Spatial data loading 40 | spatial_adata = load_spatial_data (spatial_counts, 41 | spatial_loc, 42 | spatial_metadata = spatial_metadata, 43 | min_cell_prevalence_spatial = min_cell_prevalence_spatial, 44 | min_gene_prevalence_spatial = min_gene_prevalence_spatial) 45 | 46 | # RNAseq data loading 47 | RNAseq_adata = load_rnaseq_data (RNAseq_counts, 48 | min_cell_prevalence_RNAseq = min_cell_prevalence_RNAseq, 49 | min_gene_prevalence_RNAseq = min_gene_prevalence_RNAseq) 50 | 51 | return(spatial_adata, RNAseq_adata) 52 | 53 | 54 | def load_spatial_data (spatial_counts, spatial_loc, spatial_metadata=None, 55 | min_cell_prevalence_spatial = 0.0, min_gene_prevalence_spatial = 0.0): 56 | ''' 57 | Loads in spatial data from text files. 
58 | 59 | See load_paired_datasets() for details on arguments 60 | ''' 61 | # read in spatial counts 62 | df = pd.read_csv(spatial_counts,header=0,sep="\t") 63 | 64 | # filter lowly expressed genes 65 | cells_prevalence = np.mean(df.values>0, axis=0) 66 | df = df.loc[:,cells_prevalence > min_cell_prevalence_spatial] 67 | 68 | # filter sparse cells 69 | genes_prevalence = np.mean(df.values>0, axis=1) 70 | df = df.loc[genes_prevalence > min_gene_prevalence_spatial,:] 71 | 72 | # create AnnData 73 | spatial_adata = ad.AnnData(X=df, dtype='float64') 74 | spatial_adata.obs_names = df.index.values 75 | spatial_adata.obs_names = spatial_adata.obs_names.astype(str) 76 | spatial_adata.var_names = df.columns 77 | del df 78 | 79 | # add spatial locations 80 | locations = pd.read_csv(spatial_loc,header=0,delim_whitespace=True) 81 | spatial_adata.obsm["spatial"] = locations.loc[genes_prevalence > min_gene_prevalence_spatial, :].values 82 | 83 | # add metadata 84 | if spatial_metadata is not None: 85 | metadata_df = pd.read_csv(spatial_metadata) 86 | metadata_df = metadata_df.loc[genes_prevalence > min_gene_prevalence_spatial, :] 87 | metadata_df.index = spatial_adata.obs_names 88 | spatial_adata.obs = metadata_df 89 | 90 | # remove genes with nan values 91 | spatial_adata = spatial_adata[:,np.isnan(spatial_adata.X).sum(axis=0)==0].copy() 92 | 93 | # make unique obs_names and var_names 94 | spatial_adata.obs_names_make_unique() 95 | spatial_adata.var_names_make_unique() 96 | 97 | return (spatial_adata) 98 | 99 | 100 | def load_rnaseq_data (RNAseq_counts, min_cell_prevalence_RNAseq = 0.0, min_gene_prevalence_RNAseq = 0.0): 101 | ''' 102 | Loads in scRNAseq data from text files. 103 | 104 | See load_paired_datasets() for details on arguments 105 | ''' 106 | # read in RNAseq counts 107 | df = pd.read_csv(RNAseq_counts,header=0,index_col=0,sep="\t") 108 | 109 | # filter lowly expressed genes -- note that df is transposed gene x cell 110 | cells_prevalence = np.mean(df>0, axis=1) 111 | df = df.loc[cells_prevalence > min_cell_prevalence_RNAseq,:] 112 | del cells_prevalence 113 | 114 | # filter sparse cells 115 | genes_prevalence = np.mean(df>0, axis=0) 116 | df = df.loc[:,genes_prevalence > min_gene_prevalence_RNAseq] 117 | del genes_prevalence 118 | 119 | # create AnnData 120 | RNAseq_adata = ad.AnnData(X=df.T, dtype='float64') 121 | RNAseq_adata.obs_names = df.T.index.values 122 | RNAseq_adata.var_names = df.T.columns 123 | del df 124 | 125 | # remove genes with nan values 126 | RNAseq_adata = RNAseq_adata[:,np.isnan(RNAseq_adata.X).sum(axis=0)==0].copy() 127 | 128 | # make unique obs_names and var_names 129 | RNAseq_adata.obs_names_make_unique() 130 | RNAseq_adata.var_names_make_unique() 131 | 132 | return (RNAseq_adata) 133 | 134 | 135 | 136 | def preprocess_data (adata, standardize=False, normalize=False): 137 | ''' 138 | Preprocesses adata inplace: 139 | 1. sc.pp.normalize_total() if normalize is True 140 | 2. sc.pp.log1p() if normalize is True 141 | 3. 
Not recommended: standardize each gene (subtract mean, divide by standard deviation) 142 | 143 | Parameters 144 | ---------- 145 | standardize [Boolean] - whether to standardize genes; default is False 146 | normalize [Boolean] - whether to normalize data; default is False (based on finding by Li et al., 2022) 147 | 148 | Returns 149 | ------- 150 | Modifies adata in-place 151 | 152 | NOTE: Under current default settings for TISSUE, this method does nothing to adata 153 | ''' 154 | # normalize data 155 | if normalize is True: 156 | sc.pp.normalize_total(adata) 157 | sc.pp.log1p(adata) 158 | 159 | # standardize data 160 | if standardize is True: 161 | adata.X = np.divide(adata.X - np.mean(adata.X, axis=0), np.std(adata.X, axis=0)) 162 | 163 | 164 | def build_spatial_graph (adata, method="fixed_radius", spatial="spatial", radius=None, n_neighbors=20, set_diag=True): 165 | ''' 166 | Builds a spatial graph from AnnData according to specifications. Uses Squidpy implementations for building spatial graphs. 167 | 168 | Parameters 169 | ---------- 170 | adata [AnnData] - spatial data, must include adata.obsm[spatial] 171 | method [str]: 172 | - "radius" (all cells within radius are neighbors) 173 | - "delaunay" (triangulation) 174 | - "delaunay_radius" (triangulation with pruning by max radius) 175 | - "fixed" (the k-nearest cells are neighbors determined by n_neighbors) 176 | - "fixed_radius" (knn by n_neighbors with pruning by max radius; DEFAULT) 177 | spatial [str] - column name for adata.obsm to retrieve spatial coordinates 178 | radius [None or float/int] - radius around cell centers for which to detect neighbor cells; defaults to Q3+1.5*IQR of delaunay (or fixed for fixed_radius) neighbor distances 179 | n_neighbors [None or int] - number of neighbors to get for each cell (if method is "fixed" or "fixed_radius"); defaults to 20 180 | set_diag [True or False] - whether to have diagonal of 1 in adjacency (before normalization); False is identical to theory and True is more robust; defaults to True 181 | 182 | Returns 183 | ------- 184 | Modifies adata in-place 185 | ''' 186 | # delaunay graph 187 | if method == "delaunay": # triangulation only 188 | sq.gr.spatial_neighbors(adata, delaunay=True, coord_type="generic", set_diag=set_diag) 189 | 190 | # neighborhoods determined by fixed radius 191 | elif method == "radius": 192 | if radius is None: # compute Q3+1.5*IQR of delaunay triangulation distances as default radius 193 | sq.gr.spatial_neighbors(adata, delaunay=True, coord_type="generic") 194 | if isinstance(adata.obsp["spatial_distances"],np.ndarray): # numpy array 195 | dists = adata.obsp['spatial_distances'][np.nonzero(adata.obsp['spatial_distances'])] # get nonzero array 196 | else: # sparse matrix 197 | adata.obsp['spatial_distances'].eliminate_zeros() # remove hard-set zeros 198 | dists = adata.obsp['spatial_distances'].data # get non-zero values in sparse matrix 199 | radius = np.percentile(dists, 75) + 1.5*(np.percentile(dists, 75) - np.percentile(dists, 25)) 200 | # build graph 201 | sq.gr.spatial_neighbors(adata, radius=radius, coord_type="generic", set_diag=set_diag) 202 | 203 | # delaunay graph with removal of outlier edges with distance > radius 204 | elif method == "delaunay_radius": 205 | # build initial graph 206 | sq.gr.spatial_neighbors(adata, delaunay=True, coord_type="generic", set_diag=set_diag) 207 | if radius is None: # compute default radius as 75th percentile + 1.5*IQR 208 | if isinstance(adata.obsp["spatial_distances"],np.ndarray): # numpy array 209 | dists = adata.obsp['spatial_distances'][np.nonzero(adata.obsp['spatial_distances'])] # get nonzero array 210 | else: # sparse matrix 211 | adata.obsp['spatial_distances'].eliminate_zeros() # remove hard-set zeros 212 | dists = adata.obsp['spatial_distances'].data # get non-zero values in sparse matrix 213 | radius = np.percentile(dists, 75) + 1.5*(np.percentile(dists, 75) - np.percentile(dists, 25)) 214 | # prune edges by radius 215 | adata.obsp['spatial_connectivities'][adata.obsp['spatial_distances']>radius] = 0 216 | adata.obsp['spatial_distances'][adata.obsp['spatial_distances']>radius] = 0 217 | 218 | # fixed neighborhood size with removal of outlier edges with distance > radius 219 | elif method == "fixed_radius": 220 | # build initial graph 221 | sq.gr.spatial_neighbors(adata, n_neighs=n_neighbors, coord_type="generic", set_diag=set_diag) 222 | if radius is None: # compute default radius as 75th percentile + 1.5*IQR 223 | if isinstance(adata.obsp["spatial_distances"],np.ndarray): # numpy array 224 | dists = adata.obsp['spatial_distances'][np.nonzero(adata.obsp['spatial_distances'])] # get nonzero array 225 | else: # sparse matrix 226 | adata.obsp['spatial_distances'].eliminate_zeros() # remove hard-set zeros 227 | dists = adata.obsp['spatial_distances'].data # get non-zero values in sparse matrix 228 | radius = np.percentile(dists, 75) + 1.5*(np.percentile(dists, 75) - np.percentile(dists, 25)) 229 | # prune edges by radius 230 | adata.obsp['spatial_connectivities'][adata.obsp['spatial_distances']>radius] = 0 231 | adata.obsp['spatial_distances'][adata.obsp['spatial_distances']>radius] = 0 232 | 233 | # fixed neighborhood size 234 | elif method == "fixed": 235 | sq.gr.spatial_neighbors(adata, n_neighs=n_neighbors, coord_type="generic", set_diag=set_diag) 236 | 237 | else: 238 | raise Exception ("method not recognized") 239 | 240 | 
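# The default pruning radius computed above is the Tukey upper fence
# (Q3 + 1.5*IQR) of the observed neighbor distances. A minimal numeric sketch
# (values are made up):
#
#     import numpy as np
#     dists = np.array([1.0, 2.0, 2.5, 3.0, 10.0])   # hypothetical edge lengths
#     q1, q3 = np.percentile(dists, 25), np.percentile(dists, 75)
#     radius = q3 + 1.5 * (q3 - q1)                  # = 3.0 + 1.5*1.0 = 4.5
#     # the outlier edge of length 10.0 would be pruned from the graph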
241 | def load_spatial_graph(adata, npz_filepath, add_identity=True): 242 | ''' 243 | Reads in scipy sparse adjacency matrix from the specified npz_filepath and adds it to adata.obsp["spatial_connectivities"] 244 | 245 | Parameters 246 | ---------- 247 | add_identity [bool] - whether to add a diagonal of 1's to ensure compatibility with TISSUE (i.e. 
every cell has a self-connection) 248 | 249 | Returns 250 | ------- 251 | Modifies adata in-place 252 | 253 | If graph is weighted, then you should set weight="spatial_connectivities" in downstream TISSUE calls for cell-centric variability calculation 254 | ''' 255 | from scipy import sparse 256 | a = sparse.load_npz(npz_filepath) 257 | 258 | if add_identity is True: 259 | a += sparse.identity(a.shape[0]) # add identity matrix 260 | 261 | adata.obsp["spatial_connectivities"] = a 262 | 263 | print("If graph is weighted, then you should set weight='spatial_connectivities' in downstream call of conformalize_spatial_uncertainty()") 264 | 265 | 266 | def predict_gene_expression (spatial_adata, RNAseq_adata, 267 | target_genes, conf_genes=None, 268 | method="spage", n_folds=None, random_seed=444, **kwargs): 269 | ''' 270 | Leverages one of several methods to predict spatial gene expression from a paired spatial and scRNAseq dataset 271 | 272 | Parameters 273 | ---------- 274 | spatial_adata [AnnData] = spatial data 275 | RNAseq_adata [AnnData] = RNAseq data, RNAseq_adata.var_names should be superset of spatial_adata.var_names 276 | target_genes [list of str] = genes to predict spatial expression for; must be a subset of RNAseq_adata.var_names 277 | conf_genes [list of str] = genes in spatial_adata.var_names to use for confidence measures; Default is to use all genes in spatial_adata.var_names 278 | method [str] = baseline imputation method 279 | "knn" (uses average of k-nearest neighbors in RNAseq data on Harmony joint space) 280 | "spage" (SpaGE imputation by Abdelaal et al., 2020) 281 | "tangram" (Tangram cell positioning by Biancalani et al., 2021) 282 | "gimvi" (gimVI imputation via scvi-tools) 283 | n_folds [None or int] = number of cv folds to use for conf_genes, cannot exceed number of conf_genes, None is keeping each gene in its own fold 284 | random_seed [int] = used to seed the shuffling of conf_genes into folds (defaults to 444) 285 | 286 | Returns 287 | ------- 288 | Adds spatial_adata.obsm["{method}_predicted_expression"] [pandas DataFrame] 289 | - matrix of predicted gene expressions (same number of rows as spatial_adata; columns are target_genes followed by conf_genes); also records spatial_adata.uns["conf_genes_used"] and spatial_adata.uns["target_genes_used"] 290 | ''' 291 | # change all genes to lower 292 | target_genes = [t.lower() for t in target_genes] 293 | spatial_adata.var_names = [v.lower() for v in spatial_adata.var_names] 294 | RNAseq_adata.var_names = [v.lower() for v in RNAseq_adata.var_names] 295 | 296 | # drop duplicates if any (happens in Dataset14) 297 | if RNAseq_adata.var_names.duplicated().sum() > 0: 298 | RNAseq_adata = RNAseq_adata[:,~RNAseq_adata.var_names.duplicated()].copy() 299 | if spatial_adata.var_names.duplicated().sum() > 0: 300 | spatial_adata = spatial_adata[:,~spatial_adata.var_names.duplicated()].copy() 301 | 302 | # raise warning if any target_genes in spatial data already 303 | if any(x in target_genes for x in spatial_adata.var_names): 304 | warnings.warn("Some target_genes are already measured in the spatial_adata object!") 305 | 306 | # First pass over all genes using specified method 307 | if method == "knn": 308 | predicted_expression_target = knn_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 309 | elif method == "spage": 310 | predicted_expression_target = spage_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 311 | elif method == "gimvi": 312 | predicted_expression_target = gimvi_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 313 | elif method == "tangram": 314 | predicted_expression_target = 
tangram_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 315 | else: 316 | raise Exception ("method not recognized") 317 | 318 | # Second pass over conf_genes with the specified method, using cross-validation 319 | 320 | if conf_genes is None: 321 | conf_genes = list(spatial_adata.var_names) 322 | conf_genes = [c.lower() for c in conf_genes] 323 | conf_genes_unique = [c for c in conf_genes if c not in target_genes] # removes any conf_genes also in target_genes 324 | if len(conf_genes_unique) < len(conf_genes): 325 | print("Found "+str(len(conf_genes)-len(conf_genes_unique))+" duplicate conf_genes in target_genes.") 326 | conf_genes_RNA = [c for c in conf_genes_unique if c in RNAseq_adata.var_names] # remove any conf genes not in RNAseq 327 | if len(conf_genes_RNA) < len(conf_genes_unique): 328 | print("Found "+str(len(conf_genes_unique)-len(conf_genes_RNA))+" conf_genes not in RNAseq_adata.") 329 | conf_genes = conf_genes_RNA 330 | 331 | # raise error if no conf_genes 332 | if len(conf_genes) == 0: 333 | raise Exception ("No suitable conf_genes specified!") 334 | 335 | # create folds if needed 336 | if n_folds is None: 337 | n_folds = len(conf_genes) 338 | elif n_folds > len(conf_genes): 339 | warnings.warn("n_folds in predict_gene_expression() is greater than length of conf_genes; using len(conf_genes) instead") 340 | n_folds = len(conf_genes) 341 | 342 | np.random.seed(random_seed) 343 | np.random.shuffle(conf_genes) 344 | folds = np.array_split(conf_genes, n_folds) 345 | 346 | # run prediction on each fold 347 | for gi, fold in enumerate(folds): 348 | if method == "knn": 349 | loo_expression = knn_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 350 | elif method == "spage": 351 | loo_expression = spage_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 352 | elif method == "gimvi": 353 | loo_expression = gimvi_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 354 | elif method == "tangram": 355 | loo_expression = tangram_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 356 | else: 357 | raise Exception ("method not recognized") 358 | 359 | # Update (sum expression across folds) 360 | if gi == 0: 361 | predicted_expression_conf = loo_expression.copy() 362 | else: 363 | predicted_expression_conf['index'] = range(predicted_expression_conf.shape[0]) 364 | loo_expression['index'] = range(loo_expression.shape[0]) 365 | # note: no set_index needed here; the groupby below operates on the temporary 366 | # 'index' column added above, keeping rows positionally aligned across folds 367 | predicted_expression_conf = pd.concat((predicted_expression_conf,loo_expression)).groupby(by="index").sum().reset_index().drop(columns=['index']) 368 | 369 | # Take average of target_genes (later overwritten by "all genes"-predicted) 370 | predicted_expression_conf[target_genes] = predicted_expression_conf[target_genes]/(len(conf_genes)) 371 | 372 | # Update spatial_adata 373 | predicted_expression_target.index = spatial_adata.obs_names 374 | predicted_expression_conf.index = spatial_adata.obs_names 375 | 376 | # gets predictions for target genes followed by conf genes 377 | predicted_expression_target[conf_genes] = predicted_expression_conf[conf_genes].copy() 378 | spatial_adata.obsm[method+"_predicted_expression"] = predicted_expression_target 379 | 380 | spatial_adata.uns["conf_genes_used"] = conf_genes 381 | spatial_adata.uns["target_genes_used"] = target_genes 382 | 383 | 
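# Usage sketch for predict_gene_expression() (illustrative; the gene names are
# placeholders, and the AnnData objects are assumed to come from
# load_paired_datasets() with shared, lowercase-compatible var_names):
#
#     predict_gene_expression(spatial_adata, RNAseq_adata,
#                             target_genes=["gene1", "gene2"],
#                             method="spage", n_folds=10)
#     pred = spatial_adata.obsm["spage_predicted_expression"]   # cells x (target + conf genes)
#     calib = spatial_adata.uns["conf_genes_used"]              # genes available for calibration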
384 | def knn_impute (spatial_adata, RNAseq_adata, genes_to_predict, n_neighbors, **kwargs): 385 | ''' 386 | Runs basic kNN imputation using Harmony subspace 387 | 388 | See predict_gene_expression() for details on arguments 389 | ''' 390 | from scanpy.external.pp import harmony_integrate 391 | from scipy.spatial.distance import cdist 392 | 393 | # combine anndatas 394 | intersection = np.intersect1d(spatial_adata.var_names, RNAseq_adata.var_names) 395 | subRNA = RNAseq_adata[:, intersection] 396 | subspatial = spatial_adata[:, intersection] 397 | joint_adata = ad.AnnData(X=np.vstack((subRNA.X,subspatial.X)), dtype='float32') 398 | joint_adata.obs_names = np.concatenate((subRNA.obs_names.values,subspatial.obs_names.values)) 399 | joint_adata.var_names = subspatial.var_names.values 400 | joint_adata.obs["batch"] = ["rna"]*len(subRNA.obs_names.values)+["spatial"]*len(spatial_adata.obs_names.values) 401 | 402 | # run Harmony 403 | sc.tl.pca(joint_adata) 404 | harmony_integrate(joint_adata, 'batch', verbose=False) 405 | 406 | # kNN imputation (distances between spatial and RNAseq cells in up to 30 Harmony PCs) 407 | knn_mat = cdist(joint_adata[joint_adata.obs["batch"] == "spatial"].obsm['X_pca_harmony'][:,:np.min([30,joint_adata.obsm['X_pca_harmony'].shape[1]])], 408 | joint_adata[joint_adata.obs["batch"] == "rna"].obsm['X_pca_harmony'][:,:np.min([30,joint_adata.obsm['X_pca_harmony'].shape[1]])]) 409 | k_dist_threshold = np.sort(knn_mat)[:, n_neighbors-1] 410 | knn_mat[knn_mat > k_dist_threshold[:,np.newaxis]] = 0 # sets all dist > thresh to 0 411 | knn_mat[knn_mat > 0] = 1 # 1 for connection to a nn 412 | row_sums = knn_mat.sum(axis=1) 413 | knn_mat = knn_mat / row_sums[:,np.newaxis] 414 | predicted_expression = knn_mat @ RNAseq_adata.X 415 | 416 | predicted_expression = pd.DataFrame(predicted_expression, columns=RNAseq_adata.var_names.values) 417 | predicted_expression = predicted_expression[genes_to_predict] 418 | 419 | return(predicted_expression) 420 | 421 | 422 | def spage_impute (spatial_adata, RNAseq_adata, genes_to_predict, **kwargs): 423 | ''' 424 | Runs SpaGE gene imputation 425 | 426 | See predict_gene_expression() for details on arguments 427 | ''' 428 | # absolute-import alternative: from tissue.SpaGE.main import SpaGE 429 | from .SpaGE.main import SpaGE 430 | 431 | # transform adata into SpaGE input data format 432 | if isinstance(spatial_adata.X,np.ndarray): 433 | spatial_data = pd.DataFrame(spatial_adata.X.T) 434 | else: 435 | spatial_data = pd.DataFrame(spatial_adata.X.T.toarray()) 436 | spatial_data.index = spatial_adata.var_names.values 437 | if isinstance(RNAseq_adata.X,np.ndarray): # convert to array if needed 438 | RNAseq_data = pd.DataFrame(RNAseq_adata.X.T) 439 | else: 440 | RNAseq_data = pd.DataFrame(RNAseq_adata.X.T.toarray()) 441 | RNAseq_data.index = RNAseq_adata.var_names.values 442 | 443 | # predict with SpaGE 444 | predicted_expression = SpaGE(spatial_data.T,RNAseq_data.T,genes_to_predict=genes_to_predict,**kwargs) 445 | 446 | return(predicted_expression) 447 | 448 | 449 | def tangram_impute (spatial_adata, RNAseq_adata, genes_to_predict, **kwargs): 450 | ''' 451 | Run Tangram gene imputation (positioning) using the more efficient cluster-level approach with Leiden clustering 452 | 453 | See predict_gene_expression() for details on arguments 454 | ''' 455 | import torch 456 | from torch.nn.functional import softmax, cosine_similarity, sigmoid 457 | import tangram as tg 458 | 459 | # clustering and preprocessing (on a copy, so the caller's RNAseq_adata is not altered) 460 | RNAseq_adata_label = RNAseq_adata.copy() 461 | 
sc.pp.highly_variable_genes(RNAseq_adata_label) 462 | RNAseq_adata_label = RNAseq_adata[:, RNAseq_adata_label.var.highly_variable].copy() 463 | sc.pp.scale(RNAseq_adata_label, max_value=10) 464 | sc.tl.pca(RNAseq_adata_label) 465 | sc.pp.neighbors(RNAseq_adata_label) 466 | sc.tl.leiden(RNAseq_adata_label, resolution = 0.5) 467 | RNAseq_adata.obs['leiden'] = RNAseq_adata_label.obs.leiden 468 | del RNAseq_adata_label 469 | tg.pp_adatas(RNAseq_adata, spatial_adata) # genes=None default uses all genes shared between the two datasets 470 | 471 | # gene projection onto spatial 472 | ad_map = tg.map_cells_to_space(RNAseq_adata, spatial_adata, mode='clusters', cluster_label='leiden', density_prior='rna_count_based', verbose=False) 473 | ad_ge = tg.project_genes(ad_map, RNAseq_adata, cluster_label='leiden') 474 | predicted_expression = pd.DataFrame(ad_ge[:,genes_to_predict].X, index=ad_ge[:,genes_to_predict].obs_names, columns=ad_ge[:,genes_to_predict].var_names) 475 | 476 | return(predicted_expression) 477 | 478 | 479 | def gimvi_impute (spatial_adata, RNAseq_adata, genes_to_predict, **kwargs): 480 | ''' 481 | Run gimVI gene imputation 482 | 483 | See predict_gene_expression() for details on arguments 484 | ''' 485 | import scvi 486 | from scvi.external import GIMVI 487 | 488 | # preprocessing of data 489 | spatial_adata = spatial_adata[:, spatial_adata.var_names.isin(RNAseq_adata.var_names)].copy() 490 | predict_idxs = [list(RNAseq_adata.var_names).index(gene) for gene in genes_to_predict] 491 | spatial_dim0 = spatial_adata.shape[0] 492 | 493 | # indices for filtering out zero-expression cells 494 | filtered_cells_spatial = (spatial_adata.X.sum(axis=1) > 1) 495 | filtered_cells_RNAseq = (RNAseq_adata.X.sum(axis=1) > 1) 496 | 497 | # make copies of subsets 498 | spatial_adata = spatial_adata[filtered_cells_spatial,:].copy() 499 | RNAseq_adata = RNAseq_adata[filtered_cells_RNAseq,:].copy() 500 | 501 | # setup anndata for scvi 502 | GIMVI.setup_anndata(spatial_adata) 503 | GIMVI.setup_anndata(RNAseq_adata) 504 | 505 | # train gimVI model 506 | model = GIMVI(RNAseq_adata, spatial_adata, generative_distributions=['nb', 'nb'], **kwargs) # 'nb' tends to be less buggy 507 | model.train(200) 508 | 509 | # apply trained model for imputation 510 | _, imputation = model.get_imputed_values(normalized=False) 511 | imputed = imputation[:, predict_idxs] 512 | predicted_expression = np.zeros((spatial_dim0, imputed.shape[1])) 513 | predicted_expression[filtered_cells_spatial,:] = imputed 514 | predicted_expression = pd.DataFrame(predicted_expression, columns=genes_to_predict) 515 | 516 | return(predicted_expression) 517 | 518 | 
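# The cell-centric variability computed by the next two functions weights each
# graph neighbor j of cell i by exp(cosine_similarity(x_i, x_j)), evaluated on
# the predicted expression profiles (or their top PCs when weight_n_pc is set).
# A minimal sketch of that weighting for one cell with two neighbors (values are
# made up):
#
#     import numpy as np
#     from sklearn.metrics.pairwise import cosine_similarity
#     center = np.array([[1.0, 0.0, 2.0]])       # predicted profile of cell i
#     neighbors = np.array([[0.9, 0.1, 2.1],     # similar neighbor
#                           [0.0, 3.0, 0.1]])    # dissimilar neighbor
#     weights = np.exp(cosine_similarity(center, neighbors)).flatten()
#     # similar neighbors approach e^1 ~ 2.72; near-orthogonal ones approach e^0 = 1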
519 | def conformalize_spatial_uncertainty (adata, predicted, calib_genes, weight='exp_cos', add_one=True, 520 | grouping_method=None, k='auto', k2='auto', n_pc=None, n_pc2=None, weight_n_pc=10): 521 | ''' 522 | Generates cell-centric variability and then performs stratified grouping and conformal score calculation 523 | 524 | Parameters 525 | ---------- 526 | adata - AnnData object with adata.obsm[predicted] and adata.obsp['spatial_connectivities'] 527 | predicted [str] - string corresponding to key in adata.obsm that contains the predicted transcript expression 528 | calib_genes [list or np.1darray] - strings corresponding to the genes to use in calibration 529 | weight [str] - weights to use when computing spatial variability (either 'exp_cos' or 'spatial_connectivities') 530 | add_one [bool] - whether to add an intercept term of one to the spatial standard deviation 531 | weight_n_pc [None or int] - if not None, then specifies number of top principal components to use for weight calculation if weight is 'exp_cos' (default is 10) 532 | For grouping_method [str], k [int>0 or 'auto'], k2 [None or int>0 or 'auto'], n_pc [None or int>0], n_pc2 [None or int>0]; refer to get_grouping() 533 | 534 | Returns 535 | ------- 536 | Saves the uncertainty in adata.obsm[predicted+"_uncertainty"] 537 | Saves the scores in adata.obsm[predicted+"_score"] 538 | Saves an upper and lower bound in adata.obsm[predicted+"_lo"/"_hi"] 539 | ''' 540 | # get spatial uncertainty and add to annotations 541 | scores, residuals, G_stdev, G = get_spatial_uncertainty_scores(adata, predicted, calib_genes, 542 | weight=weight, 543 | add_one=add_one, 544 | weight_n_pc=weight_n_pc) 545 | 546 | adata.obsm[predicted+"_uncertainty"] = pd.DataFrame(G_stdev, 547 | columns=adata.obsm[predicted].columns, 548 | index=adata.obsm[predicted].index) 549 | adata.obsm[predicted+"_score"] = pd.DataFrame(scores, 550 | columns=calib_genes, 551 | index=adata.obsm[predicted].index) 552 | adata.obsm[predicted+"_error"] = pd.DataFrame(residuals, 553 | columns=calib_genes, 554 | index=adata.obsm[predicted].index) 555 | 556 | # define group 557 | if grouping_method is None: 558 | groups = np.zeros(G.shape); k_final = k2_final = 1 # single stratum (keeps the uns entries below defined) 559 | else: 560 | groups, k_final, k2_final = get_grouping(G, method=grouping_method, k=k, k2=k2, n_pc=n_pc, n_pc2=n_pc2) 561 | 562 | # add grouping and k-values to anndata 563 | adata.obsm[predicted+"_groups"] = groups 564 | adata.uns[predicted+"_kg"] = k_final 565 | adata.uns[predicted+"_kc"] = k2_final 566 | 567 | 568 | def get_spatial_uncertainty_scores (adata, predicted, calib_genes, weight='exp_cos', 569 | add_one=True, weight_n_pc=None): 570 | ''' 571 | Computes spatial uncertainty scores (i.e. 
cell-centric variability) 572 | 573 | Parameters 574 | ---------- 575 | adata - AnnData object with adata.obsm[predicted] and adata.obsp['spatial_connectivities'] 576 | predicted [str] - string corresponding to key in adata.obsm that contains the predicted transcript expression 577 | calib_genes [list or np.1darray] - strings corresponding to the genes to use in calibration 578 | weight [str] - weights to use when computing spatial variability (either 'exp_cos' or 'spatial_connectivities') 579 | - 'spatial_connectivities' will use values in adata.obsp['spatial_connectivities'] 580 | add_one [bool] - whether to add one to the uncertainty 581 | weight_n_pc [None or int] - if not None, then specifies number of top principal components to use for weight calculation if weight is 'exp_cos' (default is None) 582 | 583 | Returns 584 | ------- 585 | scores - spatial uncertainty scores for all calib_genes 586 | residuals - prediction errors matching scores dimensions 587 | G_stdev - spatial standard deviations measured; same shape as adata.obsm[predicted] 588 | G - adata.obsm[predicted].values 589 | ''' 590 | if weight not in ["exp_cos", "spatial_connectivities"]: 591 | raise Exception('weight not recognized') 592 | 593 | if 'spatial_connectivities' not in adata.obsp.keys(): 594 | raise Exception ("'spatial_connectivities' not found in adata.obsp and is required") 595 | 596 | # init prediction array and uncertainties array 597 | A = adata.obsp['spatial_connectivities'] 598 | A.eliminate_zeros() 599 | G = adata.obsm[predicted].values.copy() 600 | G_stdev = np.zeros_like(G) 601 | 602 | # init for exp_cos weighting 603 | if weight == "exp_cos": 604 | from sklearn.metrics.pairwise import cosine_similarity 605 | if weight_n_pc is not None: # perform PCA first and then compute cosine weights from PCs 606 | G_pca = StandardScaler().fit_transform(G) 607 | G_pca = PCA(n_components=weight_n_pc, random_state=444).fit_transform(G_pca) 608 | 609 | # compute cell-centric variability 610 | for i in range(G.shape[0]): # iterate cells 611 | 612 | # get its neighbors only 613 | cell_idxs = np.nonzero(A[i,:])[1] 614 | c_idx = np.where(cell_idxs==i)[0][0] # center idx in subsetted array (requires a self-edge, i.e. set_diag=True in build_spatial_graph) 615 | 616 | # compute weights for cell neighbors 617 | if weight == "exp_cos": # use TISSUE cosine similarity weighting 618 | if weight_n_pc is not None: # perform PCA first and then compute cosine weights from PCs 619 | cos_weights = cosine_similarity(G_pca[i,:].reshape(1,-1), G_pca[cell_idxs,:]) 620 | else: # compute cosine weights from gene expression 621 | cos_weights = cosine_similarity(G[i,:].reshape(1,-1), G[cell_idxs,:]) 622 | weights = np.exp(cos_weights).flatten() 623 | 624 | elif weight == "spatial_connectivities": # use preset weights 625 | weights = A[i,cell_idxs].toarray().flatten() 626 | weights[np.isnan(weights)] = 0 627 | 628 | else: # set uniform weights 629 | weights = np.ones(len(cell_idxs)) 630 | 631 | # compute CCV for each gene 632 | nA_std = [] 633 | for j in range(G.shape[1]): # iterate genes 634 | 635 | # get expression of gene for cell and neighbors 636 | expression_vec = G[cell_idxs,j] 637 | 638 | # compute CCV for cell 639 | nA_std.append(cell_centered_variability(expression_vec, weights=weights, c_idx=c_idx)) 640 | 641 | nA_std = np.array(nA_std) 642 | 643 | # add one if specified 644 | if add_one is True: 645 | nA_std += 1 646 | 647 | # update G_stdev with uncertainties 648 | G_stdev[i,:] = nA_std 649 | 650 | # compute scores based on confidence genes (prediction residuals) 651 | calib_idxs = [np.where(adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 652 | residuals = adata[:, calib_genes].X - adata.obsm[predicted][calib_genes].values # Y-G 653 | 654 | warnings.filterwarnings("ignore", category=RuntimeWarning) # suppress RuntimeWarning for division by zero 655 | scores = np.abs(residuals) / G_stdev[:, calib_idxs] # scores 656 | warnings.filterwarnings("default", category=RuntimeWarning) 657 | 658 | return(scores, residuals, G_stdev, G) 659 | 660 | 
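# The calibration score above is |Y - G| / CCV per cell and confidence gene
# (with the +1 intercept already folded into G_stdev when add_one=True).
# A minimal numeric sketch for one cell and two genes (values are made up):
#
#     import numpy as np
#     y_true = np.array([3.0, 0.5])            # measured expression
#     y_pred = np.array([2.0, 1.5])            # predicted expression
#     ccv = np.array([2.0, 4.0])               # cell-centric variability (+1 included)
#     scores = np.abs(y_true - y_pred) / ccv   # -> array([0.5, 0.25])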
661 | def cell_centered_variability (values, weights, c_idx): 662 | ''' 663 | Takes in an array and weights to compute cell-centric variability: 664 | 665 | Parameters 666 | ---------- 667 | values [1d arr] - array with cell's masked neighborhood expression (non-neighbors are nan) 668 | weights [1d arr] - same dim as values; contains weights for computing CCV_c 669 | c_idx [int] - index for which element of nA corresponds to center cell 670 | 671 | Returns 672 | ------- 673 | ccv [float] - cell-centric variability 674 | ''' 675 | values_f = values[np.isfinite(values)] 676 | weights_f = weights[np.isfinite(values)] 677 | average = values[c_idx] # "average" is simply the center cell value 678 | variance = np.average((values_f-average)**2, weights=weights_f) 679 | ccv = np.sqrt(variance) 680 | 681 | return(ccv) 682 | 683 | 684 | def get_spatial_uncertainty_scores_from_metadata(adata, predicted): 685 | ''' 686 | Returns scores, residuals, G_stdev, G (outputs of get_spatial_uncertainty_scores) from precomputed entries 687 | in the AnnData (adata) object. Note, these must have been computed and saved in the same way as in 688 | conformalize_spatial_uncertainty(). 689 | 690 | Parameters 691 | ---------- 692 | adata [AnnData] - object that has saved results in obsm 693 | predicted [str] - key for predictions in obsm 694 | 695 | Returns 696 | ------- 697 | scores - array of calibration scores [cell x gene] 698 | residuals - prediction error [cell x gene] 699 | G_stdev - array of cell-centric variability measures [cell x gene] 700 | G - array of predicted expression [cell x gene]; groups - array of indices for group assignment [cell x gene] 701 | ''' 702 | scores = np.array(adata.obsm[predicted+"_score"]).copy() 703 | residuals = np.array(adata.obsm[predicted+"_error"]).copy() 704 | G_stdev = np.array(adata.obsm[predicted+"_uncertainty"]).copy() 705 | G = np.array(adata.obsm[predicted]).copy() 706 | groups = np.array(adata.obsm[predicted+"_groups"]).copy() 707 | 708 | return(scores, residuals, G_stdev, G, groups) 709 | 710 | 711 | def get_grouping(G, method, k='auto', k2='auto', min_samples=5, n_pc=None, n_pc2=None): 712 | ''' 713 | Given the predicted gene expression matrix G (rows=cells, cols=genes), 714 | creates a grouping of the different genes (or cells) determined by: 715 | 716 | Parameters 717 | ---------- 718 | G [numpy matrix/array] - predicted gene expression; columns are genes 719 | method [str] - 'kmeans_gene_cell' to separate by genes and then by cells via k-means clustering (currently the only implemented method) 720 | k [int or 'auto'] - number of gene groups 721 | if <=1 then defaults to one group including all values 722 | k2 [int or 'auto'] - second number of groups (cell groups within each gene group) for kmeans_gene_cell 723 | if <=1 then defaults to one group including all values 724 | min_samples [int] - min number of samples; reserved for dbscan-style clustering (unused by 'kmeans_gene_cell') 725 | n_pc and n_pc2 [None or int] - number of PCs to use before KMeans clustering 726 | - NOTE: It is recommended to do this for "kmeans_gene_cell" 727 | 728 | Returns 729 | ------- 730 | groups [numpy array] - same 
dimension as G with values corresponding to group number (integer) 731 | ''' 732 | # for auto k searches 733 | k_list = [2,3,4] 734 | 735 | # grouping by genes then by cells 736 | if method == "kmeans_gene_cell": 737 | 738 | ### Gene grouping 739 | X = StandardScaler().fit_transform(G.T) 740 | if n_pc is not None: 741 | X = PCA(n_components=n_pc, random_state=444).fit_transform(X) 742 | # if "auto", then select best k (k_gene) 743 | if k == 'auto': 744 | k = get_best_k(X, k_list) 745 | # group genes 746 | if k > 1: 747 | kmeans_genes = KMeans(n_clusters=k, random_state=444).fit(X) 748 | cluster_genes = kmeans_genes.labels_ 749 | else: 750 | cluster_genes = np.zeros(X.shape[0]) 751 | 752 | # set up groups 753 | groups = np.ones(G.shape)*np.nan # init groups array 754 | counter = 0 # to index new groups with integers 755 | 756 | ### Cell grouping 757 | # if "auto", then select best k2 (k_cell) 758 | if k2 == 'auto': 759 | X = StandardScaler().fit_transform(G) 760 | if n_pc2 is not None: 761 | X = PCA(n_components=n_pc2, random_state=444).fit_transform(X) 762 | k2 = get_best_k(X, k_list) 763 | # within each gene group, group cells 764 | for cg in np.unique(cluster_genes): 765 | if k2 > 1: # group if more than one cell group needed 766 | G_group = G[:, cluster_genes==cg] 767 | X_group = StandardScaler().fit_transform(G_group) 768 | if n_pc2 is not None: 769 | X_group = PCA(n_components=n_pc2, random_state=444).fit_transform(X_group) 770 | kmeans_cells = KMeans(n_clusters=k2, random_state=444).fit(X_group) 771 | cluster_cells = kmeans_cells.labels_ 772 | else: # set same labels for all cells 773 | cluster_cells = np.zeros(G.shape[0]) 774 | # assign cell-gene stratified groupings 775 | for cc in np.unique(cluster_cells): 776 | groups[np.ix_(cluster_cells==cc, cluster_genes==cg)] = counter 777 | counter += 1 778 | 779 | else: 780 | raise Exception("method for get_grouping() is not recognized") 781 | 782 | return(groups, k, k2) 783 | 784 | 785 | def get_best_k (X, k_list): 786 | ''' 787 | Given a matrix X to perform KMeans clustering and list of k parameter values, 788 | searches for the best k value 789 | 790 | k_list should be in ascending order since get_best_k will terminate once the 791 | silhouette score decreases 792 | 793 | Parameters 794 | ---------- 795 | X - array to perform K-means clustering on 796 | k_list - list of positive integers for number of clusters to use 797 | 798 | Returns 799 | ------- 800 | best_k [int] - k value that returns the highest silhouette score 801 | ''' 802 | from sklearn.metrics import silhouette_score 803 | 804 | # init search 805 | current_best = -np.inf 806 | best_k = 1 807 | 808 | # search along k_list 809 | for k in k_list: 810 | kmeans = KMeans(n_clusters=k, random_state=444).fit(X) 811 | score = silhouette_score(X, kmeans.labels_) 812 | if score > current_best: # update if score increases 813 | current_best = score 814 | best_k = k 815 | else: # stop if score decreases 816 | break 817 | 818 | return(best_k) 819 | 820 | 821 | 822 | def conformalize_prediction_interval (adata, predicted, calib_genes, alpha_level=0.33, symmetric=True, return_scores_dict=False, compute_wasserstein=False): 823 | ''' 824 | Builds conformal prediction interval sets for the predicted gene expression 825 | 826 | Parameters 827 | ---------- 828 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 829 | predicted [str] - key in adata.obsm that corresponds to predicted gene expression 830 | calib_genes [list or arr of str] - names of the 
genes in adata.var_names that are used in the calibration set 831 | alpha_level [float] - between 0 and 1; determines the alpha level; the CI will span the (1-alpha_level) interval 832 | default value is alpha_level = 0.33 corresponding to 67% CI 833 | symmetric [bool] - whether to report symmetric prediction intervals or non-symmetric intervals; default is True (symmetric) 834 | return_scores_dict [bool] - whether to return the scores dictionary 835 | compute_wasserstein [bool] - whether to compute the Wasserstein distance of the score distributions between each subgroup and its calibration set 836 | - added to adata.obsm["{predicted}_wasserstein"] 837 | 838 | Returns 839 | ------- 840 | Modifies adata in-place 841 | Optionally returns the scores_flattened_dict (dictionary containing calibration scores and group assignments) 842 | ''' 843 | # get uncertainties and scores from saved adata 844 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 845 | 846 | ### Building calibration sets for scores 847 | 848 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric) 849 | 850 | ### Building prediction intervals 851 | 852 | prediction_sets = (np.zeros(G.shape), np.zeros(G.shape)) # init prediction sets 853 | 854 | if compute_wasserstein is True: # set up matrix to store Wasserstein distances 855 | from scipy.stats import wasserstein_distance 856 | score_dist_wasserstein = np.ones(G.shape).astype(G.dtype)*np.nan 857 | 858 | # conformalize independently within groups of genes 859 | for group in np.unique(groups[~np.isnan(groups)]): 860 | 861 | # for symmetric intervals (default) 862 | if symmetric is True: 863 | scores_flattened = scores_flattened_dict[str(group)] # flatten scores 864 | n = len(scores_flattened) 865 | if (n < 100): # if fewer than 100 samples in the group set, then use the full calibration set 866 | scores_flattened = scores_flattened_dict[str(np.nan)] 867 | n = len(scores_flattened)-np.isnan(scores_flattened).sum() 868 | try: 869 | qhat = np.nanquantile(scores_flattened, np.ceil((n+1)*(1-alpha_level))/n) 870 | except Exception: 871 | qhat = np.nan 872 | prediction_sets[0][groups==group] = (G-G_stdev*qhat)[groups==group] # lower bound 873 | prediction_sets[1][groups==group] = (G+G_stdev*qhat)[groups==group] # upper bound 874 | 875 | # for asymmetric intervals 876 | else: 877 | scores_lo_flattened = scores_flattened_dict[str(group)][0] 878 | scores_hi_flattened = scores_flattened_dict[str(group)][1] 879 | n_lo = len(scores_lo_flattened)-np.isnan(scores_lo_flattened).sum() 880 | n_hi = len(scores_hi_flattened)-np.isnan(scores_hi_flattened).sum() 881 | # compute qhat for lower and upper bounds 882 | if (n_lo < 100) or (n_hi < 100): # if fewer than 100 samples in either set, then use the full calibration set 883 | scores_lo_flattened = scores_flattened_dict[str(np.nan)][0] 884 | scores_hi_flattened = scores_flattened_dict[str(np.nan)][1] 885 | n_lo = len(scores_lo_flattened)-np.isnan(scores_lo_flattened).sum() 886 | n_hi = len(scores_hi_flattened)-np.isnan(scores_hi_flattened).sum() 887 | try: 888 | qhat_lo = np.nanquantile(scores_lo_flattened, np.ceil((n_lo+1)*(1-alpha_level))/n_lo) 889 | qhat_hi = np.nanquantile(scores_hi_flattened, np.ceil((n_hi+1)*(1-alpha_level))/n_hi) 890 | except Exception: 891 | qhat_lo = np.nan 892 | qhat_hi = np.nan 893 | # compute bounds of prediction interval 894 | prediction_sets[0][groups==group] = (G-G_stdev*qhat_lo)[groups==group] # lower bound 895 | prediction_sets[1][groups==group] = 
(G+G_stdev*qhat_hi)[groups==group] # upper bound 896 | 897 | # Wasserstein distances 898 | if compute_wasserstein is True: 899 | # set up mask for calibration genes 900 | calib_idxs = [np.where(adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 901 | calib_mask = np.full(G_stdev.shape, False) 902 | calib_mask[:,calib_idxs] = True 903 | # get CCV measures 904 | v = G_stdev[(groups==group)&~(calib_mask)].flatten() # group CCV 905 | if len(v) > 0: # skip if no observations in group 906 | if symmetric is True: 907 | if n < 100: 908 | u = G_stdev[calib_mask].flatten() # calibration CCV 909 | else: 910 | u = G_stdev[(groups==group)&(calib_mask)].flatten() # calibration CCV 911 | else: 912 | if (n_lo < 100) or (n_hi < 100): 913 | u = G_stdev[calib_mask].flatten() # calibration CCV 914 | else: 915 | u = G_stdev[(groups==group)&(calib_mask)].flatten() # calibration CCV 916 | # calculate wasserstein distance for the CCV distributions 917 | score_dist_wasserstein[groups==group] = wasserstein_distance(u, v).astype(G.dtype) 918 | 919 | # add prediction intervals to adata 920 | adata.uns['alpha'] = alpha_level 921 | adata.obsm[predicted+"_lo"] = pd.DataFrame(prediction_sets[0], 922 | columns=adata.obsm[predicted].columns, 923 | index=adata.obsm[predicted].index) 924 | adata.obsm[predicted+"_hi"] = pd.DataFrame(prediction_sets[1], 925 | columns=adata.obsm[predicted].columns, 926 | index=adata.obsm[predicted].index) 927 | # add wasserstein distances to adata 928 | if compute_wasserstein is True: 929 | adata.obsm[predicted+"_wasserstein"] = pd.DataFrame(score_dist_wasserstein, 930 | columns=adata.obsm[predicted].columns, 931 | index=adata.obsm[predicted].index) 932 | 933 | 934 | if return_scores_dict is True: 935 | 936 | return(scores_flattened_dict) 937 | 938 | 939 | 
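# conformalize_prediction_interval() above takes qhat as the
# ceil((n+1)*(1-alpha))/n empirical quantile of the calibration scores and sets
# the interval to G +/- CCV*qhat. A numeric sketch with n=9 scores and alpha=0.33:
#
#     import numpy as np
#     scores = np.arange(1.0, 10.0)              # 9 calibration scores: 1..9
#     n, alpha = len(scores), 0.33
#     q = np.ceil((n + 1) * (1 - alpha)) / n     # = ceil(6.7)/9 = 7/9
#     qhat = np.nanquantile(scores, q)           # ~ 7.22 with linear interpolation
#     # lower, upper = G - ccv * qhat, G + ccv * qhat   (elementwise per cell/gene)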
940 | def build_calibration_scores (adata, predicted, calib_genes, symmetric=False, include_zero_scores=False, 941 | trim_quantiles=[None,None]): 942 | ''' 943 | Builds calibration score sets 944 | 945 | Parameters 946 | ---------- 947 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 948 | predicted [str] - key in adata.obsm with predicted gene expression values 949 | calib_genes [list or arr of str] - names of the genes in adata.var_names that are used in the calibration set 950 | symmetric [bool] - whether to have symmetric (or non-symmetric) prediction intervals 951 | include_zero_scores [bool] - whether to include zero-residual scores (if False, they are excluded) 952 | trim_quantiles [list of len 2; None or float between 0 and 1] - specifies what quantile range of scores to trim to; None implies no bounds 953 | 954 | Returns 955 | ------- 956 | scores_flattened_dict - dictionary containing the calibration scores for each stratified group 957 | ''' 958 | 959 | # get uncertainties and scores from saved adata 960 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 961 | 962 | scores_flattened_dict = {} 963 | 964 | # get calibration genes 965 | calib_idxs = [np.where(adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 966 | 967 | # iterate groups and build conformal sets of calibration scores 968 | for group in np.unique(groups[~np.isnan(groups)]): 969 | if (np.isnan(group)) or (group not in groups[:, calib_idxs]): # defer to using full calibration set 970 | scores_group = scores.copy() 971 | residuals_group = residuals.copy() 972 | else: # for groups that are found in the calibration set, build group-specific sets 973 | scores_group = scores.copy()[groups[:, calib_idxs]==group] 974 | residuals_group = residuals.copy()[groups[:, calib_idxs]==group] 975 | if symmetric is True: # symmetric calibration set 976 | if include_zero_scores is False: 977 | scores_flattened = scores_group[residuals_group != 0].flatten() # exclude zeros -- empirically this way is fastest 978 | else: 979 | scores_flattened = scores_group.flatten() 980 | scores_flattened_dict[str(group)] = scores_flattened[np.isfinite(scores_flattened)] # add to dict 981 | else: # separate into hi/lo non-symmetric calibration sets 982 | if include_zero_scores is False: 983 | scores_lo_flattened = scores_group[residuals_group < 0].flatten() 984 | scores_hi_flattened = scores_group[residuals_group > 0].flatten() 985 | else: 986 | scores_lo_flattened = scores_group[residuals_group <= 0].flatten() 987 | scores_hi_flattened = scores_group[residuals_group >= 0].flatten() 988 | scores_flattened_dict[str(group)] = (scores_lo_flattened[np.isfinite(scores_lo_flattened)], 989 | scores_hi_flattened[np.isfinite(scores_hi_flattened)]) # add to dict 990 | 991 | # build nan group consisting of all scores 992 | if symmetric is True: # symmetric calibration set 993 | if include_zero_scores is False: 994 | scores_flattened = scores[residuals != 0].flatten() # exclude zeros 995 | else: 996 | scores_flattened = scores.flatten() 997 | scores_flattened_dict[str(np.nan)] = scores_flattened[np.isfinite(scores_flattened)] # add to dict 998 | else: # separate into hi/lo non-symmetric calibration sets 999 | if include_zero_scores is False: 1000 | scores_lo_flattened = scores[residuals < 0].flatten() 1001 | scores_hi_flattened = scores[residuals > 0].flatten() 1002 | else: 1003 | scores_lo_flattened = scores[residuals <= 0].flatten() 1004 | scores_hi_flattened = scores[residuals >= 0].flatten() 1005 | scores_flattened_dict[str(np.nan)] = (scores_lo_flattened[np.isfinite(scores_lo_flattened)], 1006 | scores_hi_flattened[np.isfinite(scores_hi_flattened)]) # add to dict 1007 | 1008 | # trim all scores if specified 1009 | for key in scores_flattened_dict.keys(): 1010 | 1011 | # determine quantiles from original scores 1012 | if symmetric is True: 1013 | if trim_quantiles[0] is not None: 1014 | lower_bound = np.nanquantile(scores_flattened_dict[key], trim_quantiles[0]) 1015 | if trim_quantiles[1] is not None: 1016 | upper_bound = np.nanquantile(scores_flattened_dict[key], trim_quantiles[1]) 1017 | else: 1018 | if trim_quantiles[0] is not None: 1019 | lower_bound_lo = np.nanquantile(scores_flattened_dict[key][0], trim_quantiles[0]) 1020 | lower_bound_hi = np.nanquantile(scores_flattened_dict[key][1], trim_quantiles[0]) 1021 | if trim_quantiles[1] is not None: 1022 | upper_bound_lo = np.nanquantile(scores_flattened_dict[key][0], trim_quantiles[1]) 1023 | upper_bound_hi = np.nanquantile(scores_flattened_dict[key][1], trim_quantiles[1]) 1024 | 1025 | # trim based on quantiles 1026 | if symmetric is True: 1027 | if trim_quantiles[0] is not None: 1028 | scores_flattened_dict[key] = scores_flattened_dict[key][scores_flattened_dict[key]>lower_bound] 1029 | if trim_quantiles[1] is not None: 1030 | scores_flattened_dict[key] = scores_flattened_dict[key][scores_flattened_dict[key]<upper_bound] 1031 | else: 1032 | if trim_quantiles[0] is not None: 1033 | scores_flattened_dict[key] = (scores_flattened_dict[key][0][scores_flattened_dict[key][0]>lower_bound_lo], 1034 | scores_flattened_dict[key][1][scores_flattened_dict[key][1]>lower_bound_hi]) 1035 | if trim_quantiles[1] is not None: 1036 | scores_flattened_dict[key] = (scores_flattened_dict[key][0][scores_flattened_dict[key][0]<upper_bound_lo], 1037 | scores_flattened_dict[key][1][scores_flattened_dict[key][1]<upper_bound_hi]) 1038 | 1039 | return(scores_flattened_dict)
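# End-to-end usage sketch of the core TISSUE pipeline in this module
# (illustrative; the file paths and gene names are placeholders, and the import
# assumes the package layout shown in this repository):
#
#     from tissue import main as tissue_main
#     spatial_adata, RNAseq_adata = tissue_main.load_paired_datasets(
#         "Spatial_count.txt", "Locations.txt", "scRNA_count.txt")
#     tissue_main.preprocess_data(spatial_adata)   # no-op under current defaults
#     tissue_main.build_spatial_graph(spatial_adata, method="fixed_radius")
#     tissue_main.predict_gene_expression(spatial_adata, RNAseq_adata,
#                                         target_genes=["gene1"], method="spage",
#                                         n_folds=10)
#     pred_key = "spage_predicted_expression"
#     calib_genes = spatial_adata.uns["conf_genes_used"]
#     tissue_main.conformalize_spatial_uncertainty(spatial_adata, pred_key, calib_genes,
#                                                  grouping_method="kmeans_gene_cell")
#     tissue_main.conformalize_prediction_interval(spatial_adata, pred_key, calib_genes,
#                                                  alpha_level=0.33)
#     lo = spatial_adata.obsm[pred_key + "_lo"]    # 67% prediction-interval bounds
#     hi = spatial_adata.obsm[pred_key + "_hi"]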