├── tissue
│   ├── SpaGE
│   │   ├── __init__.py
│   │   ├── dimensionality_reduction.py
│   │   ├── main.py
│   │   └── principal_vectors.py
│   ├── __init__.py
│   ├── utils.py
│   ├── experiments.py
│   ├── downstream.py
│   └── main.py
├── pipeline.png
├── .gitignore
├── dist
│   ├── tissue-0.0.1.tar.gz
│   ├── tissue_sc-0.0.1.tar.gz
│   ├── tissue_sc-0.0.2.tar.gz
│   ├── tissue_sc-1.0.0.tar.gz
│   ├── tissue-0.0.1-py2.py3-none-any.whl
│   ├── tissue_sc-0.0.1-py2.py3-none-any.whl
│   ├── tissue_sc-0.0.2-py2.py3-none-any.whl
│   └── tissue_sc-1.0.0-py2.py3-none-any.whl
├── README_files
│   ├── README_13_0.png
│   ├── README_22_0.png
│   ├── README_24_0.png
│   ├── README_25_0.png
│   ├── README_26_0.png
│   ├── README_28_1.png
│   ├── README_29_1.png
│   ├── README_51_0.png
│   ├── README_52_0.png
│   ├── README_54_0.png
│   ├── README_55_0.png
│   ├── README_58_0.png
│   ├── README_60_0.png
│   ├── README_62_0.png
│   └── README_63_0.png
├── requirements.txt
├── pyproject.toml
├── LICENSE
├── test.py
└── README.md

--------------------------------------------------------------------------------
/tissue/SpaGE/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/pipeline.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints/
tissue/__pycache__/
tissue/SpaGE/__pycache__/
README.ipynb
--------------------------------------------------------------------------------
/dist/tissue-0.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue-0.0.1.tar.gz
--------------------------------------------------------------------------------
/README_files/README_13_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_13_0.png
--------------------------------------------------------------------------------
/README_files/README_22_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_22_0.png
--------------------------------------------------------------------------------
/README_files/README_24_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_24_0.png
--------------------------------------------------------------------------------
/README_files/README_25_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_25_0.png
--------------------------------------------------------------------------------
/README_files/README_26_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_26_0.png
--------------------------------------------------------------------------------
/README_files/README_28_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_28_1.png
--------------------------------------------------------------------------------
/README_files/README_29_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_29_1.png
--------------------------------------------------------------------------------
/README_files/README_51_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_51_0.png
--------------------------------------------------------------------------------
/README_files/README_52_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_52_0.png
--------------------------------------------------------------------------------
/README_files/README_54_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_54_0.png
--------------------------------------------------------------------------------
/README_files/README_55_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_55_0.png
--------------------------------------------------------------------------------
/README_files/README_58_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_58_0.png
--------------------------------------------------------------------------------
/README_files/README_60_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_60_0.png
--------------------------------------------------------------------------------
/README_files/README_62_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_62_0.png
--------------------------------------------------------------------------------
/README_files/README_63_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/README_files/README_63_0.png
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.1.tar.gz
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.2.tar.gz
--------------------------------------------------------------------------------
/dist/tissue_sc-1.0.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-1.0.0.tar.gz
--------------------------------------------------------------------------------
/dist/tissue-0.0.1-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue-0.0.1-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.1-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.1-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/dist/tissue_sc-0.0.2-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-0.0.2-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/dist/tissue_sc-1.0.0-py2.py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunericd/TISSUE/HEAD/dist/tissue_sc-1.0.0-py2.py3-none-any.whl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
squidpy==1.2.3
#wpca==0.1
#tangram-sc==1.0.3
#harmonypy==0.0.6
#scvi-tools==0.19.0
#spatialde==1.1.3
--------------------------------------------------------------------------------
/tissue/__init__.py:
--------------------------------------------------------------------------------
'''TISSUE (Transcript Imputation with Spatial Single-cell Uncertainty Estimation) provides tools for estimating well-calibrated uncertainty measures for gene expression predictions in single-cell spatial transcriptomics datasets, and for utilizing these uncertainties in downstream analyses.'''

__version__ = "1.0.1"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[project]
name = "tissue-sc"
authors = [{name = "Eric Sun", email = "edsun97@gmail.com"}]
readme = "README.md"
license = {file = "LICENSE"}
classifiers = ["License :: OSI Approved :: MIT License"]
dynamic = ["version", "description"]
dependencies = [
    "squidpy>=1.2.3"
]

[project.urls]
Home = "https://github.com/sunericd/tissue"

[tool.flit.module]
name = "tissue"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2022 Eric David Sun

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/tissue/SpaGE/dimensionality_reduction.py:
--------------------------------------------------------------------------------
""" Dimensionality Reduction
@author: Soufiane Mourragui
This module extracts the domain-specific factors from a high-dimensional omics
dataset. Several methods are implemented here and can be called directly by
string name from the main method. All the methods use the scikit-learn
implementation.
Notes
-------
-

References
-------
[1] Pedregosa, Fabian, et al. (2011) Scikit-learn: Machine learning in Python.
Journal of Machine Learning Research
"""

import numpy as np
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, NMF, SparsePCA
from sklearn.cross_decomposition import PLSRegression


def process_dim_reduction(method='pca', n_dim=10):
    """
    Default linear dimensionality reduction method. For each method, return a
    BaseEstimator instance corresponding to the method given as input.
    Parameters
    -------
    method: str, default to 'pca'
        Method used for dimensionality reduction.
        Implemented: 'pca', 'ica', 'fa' (Factor Analysis),
        'nmf' (Non-negative matrix factorisation), 'sparsepca' (Sparse PCA),
        'pls' (PLS regression).

    n_dim: int, default to 10
        Number of domain-specific factors to compute.
    Return values
    -------
    Estimator, i.e. BaseEstimator instance
    """

    if method.lower() == 'pca':
        clf = PCA(n_components=n_dim)

    elif method.lower() == 'ica':
        print('ICA')
        clf = FastICA(n_components=n_dim)

    elif method.lower() == 'fa':
        clf = FactorAnalysis(n_components=n_dim)

    elif method.lower() == 'nmf':
        clf = NMF(n_components=n_dim)

    elif method.lower() == 'sparsepca':
        clf = SparsePCA(n_components=n_dim, alpha=10., tol=1e-4, verbose=10, n_jobs=1)

    elif method.lower() == 'pls':
        clf = PLS(n_components=n_dim)

    else:
        raise NameError('%s is not an implemented method'%(method))

    return clf


class PLS():
    """
    Wrap PLS to make it compliant with the other dimensionality
    reduction methods (simple class rewriting).
    """
    def __init__(self, n_components=10):
        self.clf = PLSRegression(n_components)

    def get_components_(self):
        return self.clf.x_weights_.transpose()

    def set_components_(self, x):
        pass

    components_ = property(get_components_, set_components_)

    def fit(self, X, y):
        self.clf.fit(X,y)
        return self

    def transform(self, X):
        return self.clf.transform(X)

    def predict(self, X):
        return self.clf.predict(X)
--------------------------------------------------------------------------------
/tissue/SpaGE/main.py:
--------------------------------------------------------------------------------
""" SpaGE [1]
@author: Tamim Abdelaal
This function integrates two single-cell datasets, spatial and scRNA-seq, and
enhances the spatial data by predicting the expression of the spatially
unmeasured genes from the scRNA-seq data.
The integration is performed using the domain adaptation method PRECISE [2]

References
-------
[1] Abdelaal T., Mourragui S., Mahfouz A., Reinders M.J.T. (2020)
SpaGE: Spatial Gene Enhancement using scRNA-seq
[2] Mourragui S., Loog M., Reinders M.J.T., Wessels L.F.A. (2019)
PRECISE: A domain adaptation approach to transfer predictors of drug response
from pre-clinical models to tumors
"""

import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.neighbors import NearestNeighbors
#from tissue.SpaGE.principal_vectors import PVComputation
from .principal_vectors import PVComputation

def SpaGE(Spatial_data,RNA_data,n_pv,genes_to_predict=None):
    """
    @author: Tamim Abdelaal
    This function integrates two single-cell datasets, spatial and scRNA-seq,
    and enhances the spatial data by predicting the expression of the spatially
    unmeasured genes from the scRNA-seq data.

    Parameters
    -------
    Spatial_data : Dataframe
        Normalized Spatial data matrix (cells X genes).
    RNA_data : Dataframe
        Normalized scRNA-seq data matrix (cells X genes).
    n_pv : int
        Number of principal vectors to find from the independently computed
        principal components, and used to align both datasets. This should
        be <= number of shared genes between the two datasets.
    genes_to_predict : str array
        List of gene names missing from the spatial data, to be predicted
        from the scRNA-seq data. Default is the set of genes (columns) present
        in the scRNA-seq data but not in the spatial data.

    Return
    -------
    Imp_Genes: Dataframe
        Matrix containing the predicted gene expressions for the spatial
        cells. Rows are equal to the number of spatial data rows (cells),
        and columns are equal to genes_to_predict.
    """

    if genes_to_predict is None:
        genes_to_predict = np.setdiff1d(RNA_data.columns,Spatial_data.columns)

    # z-score both datasets gene-wise
    RNA_data_scaled = pd.DataFrame(data=st.zscore(RNA_data,axis=0),
                                   index = RNA_data.index,columns=RNA_data.columns)
    Spatial_data_scaled = pd.DataFrame(data=st.zscore(Spatial_data,axis=0),
                                       index = Spatial_data.index,columns=Spatial_data.columns)
    Common_data = RNA_data_scaled[np.intersect1d(Spatial_data_scaled.columns,RNA_data_scaled.columns)]

    Imp_Genes = pd.DataFrame(np.zeros((Spatial_data.shape[0],len(genes_to_predict))),
                             columns=genes_to_predict)

    # align the two datasets with principal vectors (PRECISE)
    pv_Spatial_RNA = PVComputation(
        n_factors = n_pv,
        n_pv = n_pv,
        dim_reduction = 'pca',
        dim_reduction_target = 'pca'
    )

    pv_Spatial_RNA.fit(Common_data,Spatial_data_scaled[Common_data.columns])

    S = pv_Spatial_RNA.source_components_.T

    # keep only well-aligned principal vectors (cosine similarity > 0.3)
    Effective_n_pv = sum(np.diag(pv_Spatial_RNA.cosine_similarity_matrix_) > 0.3)
    S = S[:,0:Effective_n_pv]

    Common_data_projected = Common_data.dot(S)
    Spatial_data_projected = Spatial_data_scaled[Common_data.columns].dot(S)

    # predict each spatial cell's expression as a weighted average of its
    # 50 nearest scRNA-seq neighbors in the aligned space
    nbrs = NearestNeighbors(n_neighbors=50, algorithm='auto',
                            metric = 'cosine').fit(Common_data_projected)
    distances, indices = nbrs.kneighbors(Spatial_data_projected)

    for j in range(0,Spatial_data.shape[0]):

        weights = 1-(distances[j,:][distances[j,:]<1])/(np.sum(distances[j,:][distances[j,:]<1]))
        weights = weights/(len(weights)-1)
        Imp_Genes.iloc[j,:] = np.dot(weights,RNA_data[genes_to_predict].iloc[indices[j,:][distances[j,:] < 1]])

    return Imp_Genes
--------------------------------------------------------------------------------
/tissue/utils.py:
--------------------------------------------------------------------------------
# Contains utility functions for TISSUE

import numpy as np
import pandas as pd
import anndata as ad
import os


def large_save(adata, dirpath):
    '''
    Saves an AnnData object by writing each obsm value to {key}.csv as a pandas dataframe,
    writing each uns value that is a dataframe to uns/{key}.csv as a pandas dataframe,
    and then saving the AnnData object with obsm removed.

    Parameters
    ----------
    adata [AnnData] - AnnData object to save

    dirpath [str] - path to directory where the h5ad and csv files will be saved; will be created if it does not exist
        adata will be saved as {dirpath}/adata.h5ad
        obsm will be saved as {dirpath}/{key}.csv

    Returns
    -------
    Saves the AnnData object in "large" folder format
    '''
    # check if dirpath exists; else create it
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    # extract the obsm metadata and save it as separate csv files
    for key, value in adata.obsm.items():
        df = pd.DataFrame(value)
        df.to_csv(os.path.join(dirpath, f"{key}.csv"), index=False)

    # remove the obsm metadata from the anndata object
    adatac = adata.copy()
    adatac.obsm = {}

    # extract the uns metadata and save it as separate csv files
    del_keys = []
    for key, value in adatac.uns.items():
        if isinstance(value, pd.DataFrame):
            if not os.path.exists(os.path.join(dirpath,"uns")):
                os.makedirs(os.path.join(dirpath,"uns"))
            df = pd.DataFrame(value)
            df.to_csv(os.path.join(dirpath,"uns",f"{key}.csv"), index=False)
            del_keys.append(key)

    # remove uns metadata from the anndata object
    for key in del_keys:
        del adatac.uns[key]

    # save the new anndata object
    adatac.write(os.path.join(dirpath, "adata.h5ad"))



def large_load(dirpath, skipfiles=[]):
    '''
    Loads an AnnData object and its associated pandas dataframe csv files, which are added back to the obsm and uns metadata.
    Input is the directory path to the output directory of large_save()

    Parameters
    ----------
    dirpath [str] - path to directory where outputs of large_save() are located
    skipfiles [list] - list of filenames to exclude from the anndata object

    Returns
    -------
    adata - AnnData object loaded from dirpath along with all obsm and uns key values added to metadata
    '''
    # read h5ad anndata object
    adata = ad.read_h5ad(os.path.join(dirpath, "adata.h5ad"))

    # read and load in obsm from CSV files
    for fn in os.listdir(dirpath):
        if (".csv" in fn) and (fn not in skipfiles):
            df = pd.read_csv(os.path.join(dirpath, fn))
            df.index = adata.obs_names
            key = fn.split(".")[0]
            adata.obsm[key] = df

    # read and load any uns metadata from CSV files
    if os.path.isdir(os.path.join(dirpath,"uns")):
        for fn in os.listdir(os.path.join(dirpath,"uns")):
            if (".csv" in fn) and (fn not in skipfiles):
                df = pd.read_csv(os.path.join(dirpath,"uns",fn))
                key = fn.split(".")[0]
                adata.uns[key] = df

    return(adata)


def convert_adata_to_dataupload (adata, savedir):
    '''
    Saves AnnData object into TISSUE input directory

    Parameters
    ----------
    adata - AnnData object to be saved with all metadata in adata.obs and spatial coordinates in adata.obsm['spatial']
    savedir [str] - path to existing directory to save the files for TISSUE loading

    Returns
    -------
    Saves all TISSUE input files into the specified directory for the given AnnData object

    NOTE: You will need to independently include scRNA_count.txt in savedir for TISSUE inputs to be complete
    '''
    locations = pd.DataFrame(adata.obsm['spatial'], columns=['x','y'])
    locations.to_csv(os.path.join(savedir,"Locations.txt"), sep="\t", index=False)

    df = pd.DataFrame(adata.X,
                      columns=adata.var_names)
    df.to_csv(os.path.join(savedir,"Spatial_count.txt"), sep="\t", index=False)

    meta = pd.DataFrame(adata.obs)
    meta.to_csv(os.path.join(savedir,"Metadata.txt"))
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
# TEST FILE FOR BASIC TISSUE FUNCTIONALITIES


# import packages

import tissue.main, tissue.downstream

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import anndata as ad
import os

import warnings
warnings.filterwarnings("ignore")

#################################################################################################################
print ("Testing TISSUE data loading...")
try:
    adata, RNAseq_adata = tissue.main.load_paired_datasets("tests/data/Spatial_count.txt",
                                                           "tests/data/Locations.txt",
                                                           "tests/data/scRNA_count.txt")
except:
    raise Exception ("Failed data loading from tests/data/ with tissue.main.load_paired_datasets()")

#################################################################################################################
print ("Testing TISSUE preprocessing...")
adata.var_names = [x.lower() for x in adata.var_names]
RNAseq_adata.var_names = [x.lower() for x in RNAseq_adata.var_names]
try:
    tissue.main.preprocess_data(RNAseq_adata, standardize=False, normalize=True)
except:
    raise Exception ("Failed TISSUE preprocessing. Make sure all dependencies are installed.")
gene_names = np.intersect1d(adata.var_names, RNAseq_adata.var_names)
adata = adata[:, gene_names].copy()
target_gene = "plp1"
target_expn = adata[:, target_gene].X.copy()
adata = adata[:, [gene for gene in gene_names if gene != target_gene]].copy()

#################################################################################################################
print("Testing TISSUE spatial gene expression prediction...")
try:
    tissue.main.predict_gene_expression (adata, RNAseq_adata, [target_gene],
                                         method="spage", n_folds=3, n_pv=10)
except:
    raise Exception("TISSUE prediction failed for SpaGE at tissue.main.predict_gene_expression()")

#################################################################################################################
print("Testing TISSUE calibration...")
try:
    tissue.main.build_spatial_graph(adata, method="fixed_radius", n_neighbors=15)
except:
    raise Exception ("Failed TISSUE spatial graph building at tissue.main.build_spatial_graph()")
try:
    tissue.main.conformalize_spatial_uncertainty(adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                                 grouping_method="kmeans_gene_cell", k=4, k2=2)
except:
    raise Exception ("Failed TISSUE cell-centric variability and calibration scores processing at tissue.main.conformalize_spatial_uncertainty()")
try:
    tissue.main.conformalize_prediction_interval (adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                                  alpha_level=0.23, compute_wasserstein=True)
except:
    raise Exception ("Failed TISSUE prediction interval calibration at tissue.main.conformalize_prediction_interval()")

#################################################################################################################
print ("Testing TISSUE multiple imputation t-test...")
("Testing TISSUE multiple imputation t-test...") 68 | adata.obs['condition'] = ['A' if i < round(adata.shape[0]/2) else 'B' for i in range(adata.shape[0])] 69 | try: 70 | tissue.downstream.multiple_imputation_testing(adata, "spage_predicted_expression", 71 | calib_genes=adata.var_names, 72 | condition='condition', 73 | group1 = "A", # use None to compute for all conditions, condition vs all 74 | group2 = "B", # use None to compute for group1 vs all 75 | n_imputations=2) 76 | except: 77 | raise Exception ("Failed TISSUE MI t-test at tissue.downstream.multiple_imputation_testing()") 78 | 79 | ################################################################################################################# 80 | print("Testing TISSUE cell filtering") 81 | X_uncertainty = adata.obsm["spage_predicted_expression_hi"].values - adata.obsm["spage_predicted_expression_lo"].values 82 | try: 83 | keep_idxs = tissue.downstream.detect_uncertain_cells (X_uncertainty, 84 | proportion="otsu", 85 | stratification=adata.obs['condition'].values) 86 | except: 87 | raise Exception ("Failed TISSUE cell filtering at tissue.downstream.detect_uncertain_cells()") 88 | try: 89 | keep_idxs = tissue.downstream.filtered_PCA (adata, # anndata object 90 | "spage", # prediction method 91 | proportion="otsu", 92 | stratification=adata.obs['condition'].values, 93 | return_keep_idxs=True) 94 | except: 95 | raise Exception ("Failed TISSUE-filtered PCA at tissue.downstream.filtered_PCA()") 96 | 97 | print("TISSUE tests passed!") -------------------------------------------------------------------------------- /tissue/SpaGE/principal_vectors.py: -------------------------------------------------------------------------------- 1 | """ Principal Vectors 2 | @author: Soufiane Mourragui 3 | This module computes the principal vectors from two datasets, i.e.: 4 | - perform linear dimensionality reduction independently for both dataset, resulting 5 | in set of domain-specific factors. 6 | - find the common factors using principal vectors [1] 7 | This result in set of pairs of vectors. Each pair has one vector from the source and one 8 | from the target. For each pair, a similarity score (cosine similarity) can be computed 9 | between the principal vectors and the pairs are naturally ordered by decreasing order 10 | of this similarity measure. 11 | Example 12 | ------- 13 | Examples are given in the vignettes. 14 | Notes 15 | ------- 16 | Examples are given in the vignette 17 | 18 | References 19 | ------- 20 | [1] Golub, G.H. and Van Loan, C.F., 2012. "Matrix computations" (Vol. 3). JHU Press. 21 | [2] Mourragui, S., Loog, M., Reinders, M.J.T., Wessels, L.F.A. (2019) 22 | PRECISE: A domain adaptation approach to transfer predictors of drug response 23 | from pre-clinical models to tumors 24 | """ 25 | 26 | import numpy as np 27 | import pandas as pd 28 | import scipy 29 | from pathlib import Path 30 | from sklearn.preprocessing import normalize 31 | 32 | #from tissue.SpaGE.dimensionality_reduction import process_dim_reduction 33 | from .dimensionality_reduction import process_dim_reduction 34 | 35 | class PVComputation: 36 | """ 37 | Attributes 38 | ------- 39 | n_factors: int 40 | Number of domain-specific factors to compute. 41 | n_pv: int 42 | Number of principal vectors. 
    dim_reduction_method_source: str
        Dimensionality reduction method used for the source data.
    dim_reduction_method_target: str
        Dimensionality reduction method used for the target data.
    source_components_ : numpy.ndarray, shape (n_pv, n_features)
        Loadings of the source principal vectors ranked by similarity to the
        target. Components are in the rows.
    source_explained_variance_ratio_: numpy.ndarray, shape (n_pv)
        Explained variance of the source on each source principal vector.
    target_components_ : numpy.ndarray, shape (n_pv, n_features)
        Loadings of the target principal vectors ranked by similarity to the
        source. Components are in the rows.
    target_explained_variance_ratio_: numpy.ndarray, shape (n_pv)
        Explained variance of the target on each target principal vector.
    cosine_similarity_matrix_: numpy.ndarray, shape (n_pv, n_pv)
        Scalar product between the source and the target principal vectors. Source
        principal vectors are in the rows while the target's are in the columns. If
        the domain adaptation is sensible, a diagonal matrix should be obtained.
    """

    def __init__(self, n_factors,n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 project_on=0):
        """
        Parameters
        -------
        n_factors : int
            Number of domain-specific factors to extract from the data (e.g. using PCA, ICA).
        n_pv : int
            Number of principal vectors to find from the independently computed factors.
        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'. If None, set to dim_reduction.
        project_on: int or bool, default to 0
            Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means
            both PVs.
        """
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.project_on = project_on # store default projection target for transform()
        self.dim_reduction_method_source = dim_reduction
        self.dim_reduction_method_target = dim_reduction_target or dim_reduction
        self.dim_reduction_source = self._process_dim_reduction(self.dim_reduction_method_source)
        self.dim_reduction_target = self._process_dim_reduction(self.dim_reduction_method_target)

        self.source_components_ = None
        self.source_explained_variance_ratio_ = None
        self.target_components_ = None
        self.target_explained_variance_ratio_ = None
        self.cosine_similarity_matrix_ = None

    def _process_dim_reduction(self, dim_reduction):
        if type(dim_reduction) == str:
            return process_dim_reduction(method=dim_reduction, n_dim=self.n_factors)
        else:
            return dim_reduction

    def fit(self, X_source, X_target, y_source=None):
        """
        Compute the common factors between two sets of data.
        IMPORTANT: The same genes have to be given for source and target, and in the same order
        Parameters
        -------
        X_source : np.ndarray, shape (n_samples, n_genes)
            Source dataset
        X_target : np.ndarray, shape (n_samples, n_genes)
            Target dataset
        y_source : np.ndarray, shape (n_samples, 1) (optional, default to None)
            Optional output, in case one wants to provide an output (for instance for PLS)
        Return values
        -------
        self: returns an instance of self.
        """
        # Compute factors independently for source and target, then orthogonalize the bases
        Ps = self.dim_reduction_source.fit(X_source, y_source).components_
        Ps = scipy.linalg.orth(Ps.transpose()).transpose()

        Pt = self.dim_reduction_target.fit(X_target, y_source).components_
        Pt = scipy.linalg.orth(Pt.transpose()).transpose()

        # Compute the principal vectors
        self.compute_principal_vectors(Ps, Pt)

        # Compute variance explained (per-PV variance over the total per-gene variance)
        self.source_explained_variance_ratio_ = np.var(self.source_components_.dot(X_source.transpose()), axis=1)/\
                                                np.sum(np.var(X_source, axis=0))
        self.target_explained_variance_ratio_ = np.var(self.target_components_.dot(X_target.transpose()), axis=1)/\
                                                np.sum(np.var(X_target, axis=0))

        return self

    def compute_principal_vectors(self, source_factors, target_factors):
        """
        Compute the principal vectors between the already computed sets of domain-specific
        factors, using the approach presented in [1,2].
        IMPORTANT: The same genes have to be given for source and target, and in the same order
        Parameters
        -------
        source_factors: np.ndarray, shape (n_components, n_genes)
            Source domain-specific factors.
        target_factors: np.ndarray, shape (n_components, n_genes)
            Target domain-specific factors.
        Return values
        -------
        self: returns an instance of self.
        """

        # Find principal vectors using SVD
        u,sigma,v = np.linalg.svd(source_factors.dot(target_factors.transpose()))
        self.source_components_ = u.transpose().dot(source_factors)[:self.n_pv]
        self.target_components_ = v.dot(target_factors)[:self.n_pv]
        # Normalize to make sure that vectors are unitary
        self.source_components_ = normalize(self.source_components_, axis = 1)
        self.target_components_ = normalize(self.target_components_, axis = 1)

        # Compute cosine similarity matrix
        self.initial_cosine_similarity_matrix_ = source_factors.dot(target_factors.transpose())
        self.cosine_similarity_matrix_ = self.source_components_.dot(self.target_components_.transpose())

        # Compute angles
        self.angles_ = np.arccos(np.diag(self.cosine_similarity_matrix_))

        return self


    def transform(self, X, project_on=None):
        """
        Projects data onto the principal vectors.
        Parameters
        -------
        X : numpy.ndarray, shape (n_samples, n_genes)
            Data to project.
        project_on: int or bool, default to None
            Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means
            both PVs. If None, set to the class instance value.
        Return values
        -------
        Projected data as a numpy.ndarray of shape (n_samples, n_factors)
        """

        # explicit None check so that an explicit project_on=0 is not overridden
        if project_on is None:
            project_on = self.project_on

        # Project on source
        if project_on == 0:
            return X.dot(self.source_components_.transpose())

        # Project on target
        elif project_on == -1:
            return X.dot(self.target_components_.transpose())

        # Project on both
        elif project_on == 1:
            return X.dot(np.concatenate([self.source_components_.transpose(), self.target_components_.transpose()]))

        else:
            raise ValueError('project_on should be 0 (source), -1 (target) or 1 (both). %s is not a correct value'%(project_on))
--------------------------------------------------------------------------------
/tissue/experiments.py:
--------------------------------------------------------------------------------
# Contains compound functions for generating results for experiments with TISSUE
# These are unlikely to be used for general applications but were used in our development/testing of TISSUE

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import squidpy as sq
import anndata as ad
import warnings
import os
import gc

#from tissue.utils import large_save, large_load
from .utils import large_save, large_load
from .main import load_spatial_data, conformalize_prediction_interval, get_spatial_uncertainty_scores_from_metadata
from .downstream import multiple_imputation_testing


def group_conformalize_from_intermediate(dataset_name, methods, symmetric, alpha_levels,
                                         save_alpha=[0.05], savedir="SCPI", type_dataset="DataUpload"):
    '''
    Function for taking intermediate fold predictions and running group conformalization for all different alpha values.

    Returns a results dictionary with calibration quality (res_dict) and the AnnData with prediction intervals
    for all folds at each alpha in save_alpha.

    Parameters
    ----------
    dataset_name [str] - name of folder in DataUpload/
    methods [list of str] - list of method keys to use for prediction_sets
    symmetric [bool] - whether to use symmetric prediction intervals
    alpha_levels [array] - alpha levels to calibrate over
    save_alpha [list of float] - alphas to save prediction intervals into adata.obsm
    savedir [str] - folder where the intermediate results are saved (independent folds)
    type_dataset [str] - default to "DataUpload" but may have additional options in the future

    Returns
    -------
    res_dict [dict] - dictionary of calibration statistics / coverage statistics across the alpha levels
    adata [AnnData] - anndata with calibration results added to metadata
    '''
    # read in spatial data
    if type_dataset == "DataUpload":
        if os.path.isfile("DataUpload/"+dataset_name+"/Metadata.txt"):
            adata = load_spatial_data("DataUpload/"+dataset_name+"/Spatial_count.txt",
                                      "DataUpload/"+dataset_name+"/Locations.txt",
                                      spatial_metadata = "DataUpload/"+dataset_name+"/Metadata.txt")
        else:
            adata = load_spatial_data("DataUpload/"+dataset_name+"/Spatial_count.txt",
                                      "DataUpload/"+dataset_name+"/Locations.txt")
    else:
        adata = sc.read_h5ad(os.path.join("additional_data",dataset_name,"spatial.h5ad"))
    adata.var_names = [x.lower() for x in adata.var_names]

    # results dict
    res_dict = {}

    for method in methods:

        res_dict[method] = {}
        res_dict[method]['ind_gene_results'] = {}

        calibration_weight = 0 # for computing weighted average
        test_weight = 0

        dirpath = savedir+"/"+dataset_name+"_intermediate/"+method

        folds = np.load(os.path.join(savedir+"/"+dataset_name+"_intermediate/"+method,"folds.npy"), allow_pickle=True)

        # subset spatial data into shared genes
        gene_names = np.concatenate(folds)
        adata = adata[:, gene_names]

        for i, fold in enumerate(folds):

            # load adata within fold
            sub_adata = large_load(os.path.join(dirpath, "fold"+str(i)))
            target_genes = list(fold)

            # subset data
            predicted = method+"_predicted_expression"
method+"_predicted_expression" 82 | test_genes = target_genes.copy() 83 | calib_genes = [gene for gene in gene_names if gene not in test_genes] 84 | test_idxs = [np.where(sub_adata.obsm[predicted].columns==gene)[0][0] for gene in test_genes] 85 | calib_idxs = [np.where(sub_adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 86 | 87 | # get uncertainties and scores from saved adata 88 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (sub_adata, predicted) 89 | 90 | # init dict for individual gene results 91 | for g in test_genes: 92 | if g not in res_dict[method]['ind_gene_results'].keys(): 93 | res_dict[method]['ind_gene_results'][g] = {} 94 | res_dict[method]['ind_gene_results'][g]['1-alpha'] = 1-alpha_levels 95 | res_dict[method]['ind_gene_results'][g]['test'] = [] 96 | 97 | # iterate over different alphas for conformalization 98 | test_perc = [] 99 | calib_perc = [] 100 | 101 | for alpha_level in alpha_levels: 102 | sub_adatac = sub_adata.copy() 103 | conformalize_prediction_interval (sub_adatac, predicted, calib_genes, alpha_level=alpha_level, 104 | symmetric=symmetric, return_scores_dict=False) 105 | 106 | prediction_sets = (sub_adatac.obsm[predicted+"_lo"].values, sub_adatac.obsm[predicted+"_hi"].values) 107 | 108 | test_perc.append(np.nanmean(((adata[:,test_genes].X>prediction_sets[0][:,test_idxs]) & (adata[:,test_genes].Xprediction_sets[0][:,calib_idxs]) & (adata[:,calib_genes].X==` inside the same Conda environment as TISSUE (preferably before TISSUE installation). 17 | 18 | For TISSUE installation, we provide two options: (A) PyPI installation with pip or (B) local installation. We recommend starting with option A and only going to option B if option A fails in your environment. 19 | 20 | ### Option A: PyPI 21 | 22 | Install the package through PyPI with `pip`. We recommend setting up a conda environment (https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or another virtual environment first since `tissue-sc` currently relies on specific versions for its dependencies (although it should generally work for other environment versions, but this hasn't been thoroughly tested): 23 | 24 | ``` 25 | conda create -n myenv python=3.8 26 | conda activate myenv 27 | 28 | 29 | pip install tissue-sc 30 | ``` 31 | 32 | Note that you will want to separately download the data from this repository (`tests/data/`) to run our TISSUE tutorials. 33 | 34 | 35 | ### Option B: Local installation 36 | 37 | An alternative way to install the package along with associated test and tutorial files is to clone the directory and then install the requirements for using the package. To do this, first clone the repository using git (you can install git following the instructions [here](https://github.com/git-guides/install-git)): 38 | 39 | ``` 40 | git clone https://github.com/sunericd/TISSUE.git 41 | ``` 42 | 43 | We recommend setting up a conda environment to install the requirements for the package (instructions for installing conda and what conda environment can do can be found [here](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)). Installation of requirements can then be done with the following commands: 44 | 45 | ``` 46 | conda create -n tissue python=3.8 47 | conda activate tissue 48 | 49 | cd TISSUE 50 | pip install -r requirements.txt 51 | ``` 52 | 53 | To keep the requirements light, we have only included packages that are necessary for the core functionalities of TISSUE. 
For additional utilities such as gene prediction with Tangram, please install those packages separately (or uncomment those lines in `requirements.txt`).

To test that the installation is working correctly, you can run `python test.py` in the cloned directory.


# TISSUE Tutorials

Below we include several mini-tutorials to highlight the main TISSUE pipeline and downstream applications. These tutorials rely on a small test dataset (a subset of one of the datasets used in the original publication) for fast processing, but the approaches can readily be extended to other datasets. For larger-scale examples, please refer to the code repository corresponding to the figures and analyses generated for the TISSUE manuscript: https://github.com/sunericd/tissue-figures-and-analyses.git


```python
# import packages

import tissue.main, tissue.downstream

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import anndata as ad
import os
```

## Upstream TISSUE analyses: spatial gene expression prediction and uncertainty calibration

The first part of the TISSUE pipeline involves making predictions of spatial gene expression profiles using paired spatial transcriptomics and scRNAseq datasets, and then computing and calibrating uncertainties for these predictions that are translated into prediction intervals. All functions for upstream analysis are in the `tissue.main` module. These include:

- `tissue.main.load_paired_datasets()` for loading paired datasets from formatted directories

- `tissue.main.predict_gene_expression()` for predicting spatial gene expression from paired datasets using a specified prediction method and number of cross-validation folds

- `tissue.main.build_spatial_graph()` for building a cell-cell spatial graph to define neighborhoods for computing the cell-centric variability

- `tissue.main.conformalize_spatial_uncertainty()` for computing cell-centric variability, stratified grouping, and the TISSUE calibration scores for predicted gene expression

- `tissue.main.conformalize_prediction_interval()` for building prediction intervals from uncertainty measures

### Tutorial 1: Predicting spatial gene expression

First, we load a minimal subset of osmFISH spatial transcriptomics data of mouse somatosensory cortex published by Codeluppi et al., 2018: https://doi.org/10.1038/s41592-018-0175-z.

Note that we are using the TISSUE methods for building an AnnData object from tab-delimited text files for the spatial counts, scRNAseq counts, spatial locations, and spatial metadata. If you already have an AnnData object for the spatial data and another object for the scRNAseq data, you can skip this step.


```python
# load in spatial and scRNAseq datasets

adata, RNAseq_adata = tissue.main.load_paired_datasets("tests/data/Spatial_count.txt",
                                                       "tests/data/Locations.txt",
                                                       "tests/data/scRNA_count.txt")
```

    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/anndata/_core/anndata.py:117: ImplicitModificationWarning: Transforming to str index.
      warnings.warn("Transforming to str index.", ImplicitModificationWarning)
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/anndata/_core/anndata.py:856: UserWarning:
    AnnData expects .obs.index to contain strings, but got values like:
        [0, 1, 2, 3, 4]

        Inferred to be: integer

      names = self._prep_dim_index(names, "obs")


Now we can impute any genes of interest that are found in the scRNAseq dataset but not in the spatial dataset. In this case, we will hold out a target gene from the spatial data and apply an imputation method to predict its expression using the scRNAseq dataset.

First, we preprocess the data and make sure that the gene names are matchable across the two datasets:


```python
# make genes lowercase
adata.var_names = [x.lower() for x in adata.var_names]
RNAseq_adata.var_names = [x.lower() for x in RNAseq_adata.var_names]

# preprocess RNAseq data
tissue.main.preprocess_data(RNAseq_adata, standardize=False, normalize=True)

# subset spatial data into shared genes
gene_names = np.intersect1d(adata.var_names, RNAseq_adata.var_names)
adata = adata[:, gene_names].copy()

# hold out target gene
target_gene = "plp1"
target_expn = adata[:, target_gene].X.copy()
adata = adata[:, [gene for gene in gene_names if gene != target_gene]].copy()
```


```python
# dimensions of spatial transcriptomics dataset (number of cells x number of genes)
adata.shape
```




    (3405, 31)




```python
# dimensions of RNAseq dataset (number of cells x number of genes)
RNAseq_adata.shape
```




    (1000, 32)



Now, we can make predictions of the target gene expression. In TISSUE, we currently have several methods for gene imputation including SpaGE, Tangram, and Harmony-kNN. We will be using SpaGE in this example:


```python
# SpaGE spatial gene expression prediction

tissue.main.predict_gene_expression (adata, RNAseq_adata, [target_gene],
                                     method="spage", n_folds=10, n_pv=10)
```

How good is the imputation? Since we left out this gene from the spatial data, we can plot the predicted and actual expression and visually inspect the agreement.
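
As a quick numeric companion to the visual check below (our addition to the tutorial, using only quantities already defined above), we can also correlate the held-out expression with the SpaGE prediction:


```python
# correlate held-out (measured) expression with the SpaGE prediction
from scipy.stats import pearsonr

predicted = adata.obsm['spage_predicted_expression'][target_gene].values
r, p = pearsonr(target_expn.flatten(), predicted)
print(f"Pearson r = {r:.3f} (P = {p:.3g})")
```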


```python
# Visualizing predicted and actual expression side by side

fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.axis('off')
cmap = target_expn.copy() # copy so that target_expn is not clipped in place
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax1.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)
ax1.set_title('Actual', fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

ax2.axis('off')
cmap = adata.obsm['spage_predicted_expression'][target_gene].values
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax2.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)
ax2.set_title('Predicted', fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

plt.suptitle("SpaGE Prediction", fontsize=16)
plt.tight_layout()
plt.show()
```



![png](README_files/README_13_0.png)



Not too bad, especially considering that we used a downsampled scRNAseq dataset for this imputation.

### Tutorial 2: Using TISSUE to calibrate uncertainties and obtain prediction intervals

Note that when we ran `tissue.main.predict_gene_expression()` in the previous tutorial, we obtained cross-validated predictions for all genes in the existing spatial dataset (e.g. 10 folds). These will now come in handy when we compute and calibrate TISSUE uncertainties for the predicted gene expression.

First, we build spatial graphs using TISSUE (this can also be done with native Scanpy functions):


```python
# build spatial graph and calculate adjacency weights

tissue.main.build_spatial_graph(adata, method="fixed_radius", n_neighbors=15)
```

The entire TISSUE spatial uncertainty generation pipeline can be launched with one line of code using `tissue.main.conformalize_spatial_uncertainty()`. Here, the first two arguments are the spatial AnnData object and a string specifier for the key in `obsm` corresponding to the predicted gene expression. We can specify which of the genes to use in the calibration set (generally this is all genes in the spatial data). There are other arguments for the grouping setup and weighting schemes, but we will use the default settings in this tutorial.


```python
# build calibration scores

tissue.main.conformalize_spatial_uncertainty(adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                             grouping_method="kmeans_gene_cell", k=4, k2=2)
```

    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(
    /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
      warnings.warn(


Finally, we can calibrate the spatial uncertainty measures to get calibration scores and then use those to extract prediction intervals for any confidence level $1-\alpha$. This is done with `tissue.main.conformalize_prediction_interval()`


```python
# get prediction interval for 77% coverage (1 - alpha_level)

tissue.main.conformalize_prediction_interval (adata, "spage_predicted_expression", calib_genes=adata.var_names,
                                              alpha_level=0.23)

# set `compute_wasserstein=True` to compute a measure indicating the distance between the cell-centric variability values
# of the predicted genes and the values of their support (i.e. calibration group) in the original data.
# The lower this value, the better supported the predicted gene.
```

Now let's visualize what these prediction intervals look like for the target (unseen) gene and how they compare to the actual prediction errors.
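
Before plotting, a quick sanity check that we add here (not part of the original tutorial): since the target gene was held out of calibration, the fraction of its measured values falling inside the prediction intervals should be close to $1-\alpha$:


```python
# empirical coverage: fraction of cells whose measured expression
# falls inside the calibrated prediction interval for the held-out gene
lo = adata.obsm["spage_predicted_expression_lo"][target_gene].values
hi = adata.obsm["spage_predicted_expression_hi"][target_gene].values
truth = target_expn.flatten()
print(f"Empirical coverage: {np.mean((truth >= lo) & (truth <= hi)):.2%} (target: 77%)")
```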


```python
m = "spage"

i = np.where(adata.var_names == target_gene)[0]

# define consistent color map
all_vals = np.concatenate((np.abs(target_expn.flatten()-adata.obsm[m+"_predicted_expression"][target_gene].values),
                           adata.obsm[m+"_predicted_expression_hi"][target_gene].values-adata.obsm[m+"_predicted_expression_lo"][target_gene].values))
all_vals[all_vals<0]=0
vmin = np.percentile(np.log1p(all_vals), 0)
vmax = np.percentile(np.log1p(all_vals), 95)


fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.axis('off')
cmap = np.abs(target_expn.flatten()-adata.obsm[m+"_predicted_expression"][target_gene].values)
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax1.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)#,vmin=vmin,vmax=vmax)
ax1.set_title('Imputation Error ' + target_gene, fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

ax2.axis('off')
cmap = adata.obsm[m+"_predicted_expression_hi"][target_gene].values-adata.obsm[m+"_predicted_expression_lo"][target_gene].values
cmap[cmap<0] = 0
cmap = np.log1p(cmap)
cmap[cmap > np.percentile(cmap,95)] = np.percentile(cmap,95)
im = ax2.scatter(adata.obsm['spatial'][:,0],adata.obsm['spatial'][:,1],s=1,c=cmap,rasterized=True)#,vmin=vmin,vmax=vmax)
ax2.set_title('PI Width ' + target_gene, fontsize = 12)

cbar = fig.colorbar(im)
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Log Expression', rotation=270)

plt.suptitle(m, fontsize=16)
plt.tight_layout()
plt.show()
```



![png](README_files/README_22_0.png)



The TISSUE prediction intervals are decent and match the distribution of imputation errors especially well in the bottom portions of the section. On the full dataset, which has much richer reference scRNAseq data, the calibration quality is further improved (see Figure 2 of the TISSUE manuscript).

## Downstream TISSUE analyses: Hypothesis testing, Clustering/Visualization, Prediction

TISSUE provides additional functionalities for leveraging these uncertainty estimates and prediction intervals in common downstream single-cell spatial transcriptomics analyses. All functions for downstream analysis are in the `tissue.downstream` module. These include:

- `tissue.downstream.multiple_imputation_testing()` for hypothesis testing using multiple imputations drawn from the calibration score sets

- `tissue.downstream.weighted_PCA()` for computing weighted principal components where weights correspond to a transform of the inverse prediction interval width (see the sketch below)

- `tissue.downstream.detect_uncertain_cells()` for filtering low-confidence cells from data before training and evaluation of machine learning models, which generally improves performance

Below we include a few example uses of these modules. For more examples (including those for the experiments in the associated manuscript), please see the Github repository: https://github.com/sunericd/tissue-figures-and-analyses.git.
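
To give intuition for the weighting in `tissue.downstream.weighted_PCA()`, here is a minimal sketch of one inverse-width transform (our illustration only; the exact transform used inside the package may differ):


```python
# illustrative inverse prediction-interval-width weights (sketch only):
# wider intervals (more uncertainty) receive smaller weights
width = (adata.obsm["spage_predicted_expression_hi"].values
         - adata.obsm["spage_predicted_expression_lo"].values)
weights = 1.0 / (width + 1e-8)            # invert, guarding against zero widths
weights = weights / weights.mean(axis=0)  # rescale per gene (illustrative choice)
```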
335 | 336 | ### Tutorial 3: Hypothesis testing with TISSUE multiple imputation framework 337 | 338 | This tutorial uses the TISSUE calibration scores to generate multiple imputations and then performs hypothesis testing by aggregating statistics across these imputations. The default and recommended statistical test for this framework is the Student's t-test, but spatially variable gene detection with SpatialDE and non-parametric one-sided Mann-Whitney/Wilcoxon tests can also be performed by specifying `test="spatialde"`, `test="wilcoxon_greater"`, or `test="wilcoxon_less"` in `tissue.downstream.multiple_imputation_testing()`. 339 | 340 | Please run the code in Tutorials 1-2 to generate predictions and TISSUE uncertainty measures before starting this tutorial. 341 | 342 | Next, we construct some binary labels for the cells in the dataset: 343 | 344 | 345 | ```python 346 | # split into two groups based on indices 347 | adata.obs['condition'] = ['A' if i < round(adata.shape[0]/2) else 'B' for i in range(adata.shape[0])] 348 | 349 | # plot conditions 350 | plt.scatter(adata[adata.obs.condition=="A"].obsm['spatial'][:,0], 351 | adata[adata.obs.condition=="A"].obsm['spatial'][:,1], 352 | c='tab:red', s=3, label="A") 353 | plt.scatter(adata[adata.obs.condition=="B"].obsm['spatial'][:,0], 354 | adata[adata.obs.condition=="B"].obsm['spatial'][:,1], 355 | c='tab:blue', s=3, label="B") 356 | plt.legend(loc='best') 357 | plt.show() 358 | ``` 359 | 360 | 361 | 362 | ![png](README_files/README_26_0.png) 363 | 364 | 365 | 366 | As we can see, the cells in group A primarily belong to the medial layers of the section, while the cells in group B correspond to the upper pia layer and the bottom layers of the section. As such, when we perform differential gene expression analysis, we should expect some differentially expressed markers between these two labels. 367 | 368 | Now, we perform differential gene expression analysis using TISSUE multiple imputation hypothesis testing, which uses the TISSUE calibration scores to sample multiple "imputations" (alternative predictions) and then aggregates the statistics afterwards. Here, we set `group1="A"` and `group2="B"` to find genes that are differentially expressed between the two groups using `n_imputations=10` imputations (higher values are better but take more time to compute): 369 | 370 | 371 | ```python 372 | # multiple imputation hypothesis testing 373 | 374 | tissue.downstream.multiple_imputation_testing(adata, "spage_predicted_expression", 375 | calib_genes=adata.var_names, 376 | condition='condition', 377 | group1 = "A", # use None to compute for all conditions, condition vs all 378 | group2 = "B", # use None to compute for group1 vs all 379 | n_imputations=10) 380 | ``` 381 | 382 | TISSUE multiple imputation testing saves the results directly within the `adata.uns` metadata, where they can be read out by the compared groups and the name of the statistic: 383 | 384 | 385 | ```python 386 | # extract statistics for target_gene 387 | print("t-statistic = "+str(round(adata.uns['spage_A_B_tstat'][target_gene].values[0],5))) 388 | print("P = "+str(round(adata.uns['spage_A_B_pvalue'][target_gene].values[0],5))) 389 | ``` 390 | 391 | t-statistic = -2.38565 392 | P = 0.02371 393 | 394 | 395 | Testing of our target gene (Plp1) reveals significant under-expression in group A compared to group B, suggesting that Plp1 could be a marker gene for the cell types / regions in group B.
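The non-parametric variants share this interface; only the `test` argument changes. Here is a hedged sketch of the one-sided Mann-Whitney version of the same comparison. Per the `multiple_imputation_testing()` signature, this test reports only pooled p-values (no t-statistics), and it writes to the same `adata.uns['spage_A_B_pvalue']` key as the t-test above, so it will overwrite those results:

```python
# one-sided test for greater expression in group A relative to group B
tissue.downstream.multiple_imputation_testing(adata, "spage_predicted_expression",
                                              calib_genes=adata.var_names,
                                              condition='condition',
                                              group1="A", group2="B",
                                              test="wilcoxon_greater",
                                              n_imputations=10)
```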
396 | 397 | **NOTE: Many TISSUE modules rely on stochastic sampling, so the printed metrics may vary slightly from run to run.** 398 | 399 | ### Tutorial 4: TISSUE cell filtering for supervised learning 400 | 401 | TISSUE cell filtering removes cells with the greatest average uncertainty in predicted gene expression, which generally improves the performance of supervised learning models (i.e. classifiers) trained and evaluated on the filtered predicted expression data. 402 | 403 | In this tutorial, we will filter out the uncertain cells (using automatic Otsu thresholding) and then train and evaluate a logistic regression classifier to predict the two cell groups (A vs B) from Tutorial 3. 404 | 405 | To start, we need to compute the TISSUE prediction interval width as a proxy for uncertainty. We do this by subtracting the lower bound from the upper bound: 406 | 407 | 408 | ```python 409 | # get uncertainty (PI width) for filtering 410 | 411 | X_uncertainty = adata.obsm["spage_predicted_expression_hi"].values - adata.obsm["spage_predicted_expression_lo"].values 412 | ``` 413 | 414 | Then we can filter using the TISSUE prediction interval width. We perform filtering within each stratum (i.e. cell group label "A" or "B"), but this can also be done across other groupings or across the entire population of cells if desired. Here we use Otsu thresholding to automatically determine the proportion of cells to filter out within each stratum, but you can set a hard threshold if desired. 415 | 416 | 417 | ```python 418 | # uncertainty-based cell filtering 419 | 420 | keep_idxs = tissue.downstream.detect_uncertain_cells(X_uncertainty, 421 | proportion="otsu", 422 | stratification=adata.obs['condition'].values) 423 | 424 | adata_filtered = adata[adata.obs_names[keep_idxs],:].copy() 425 | ``` 426 | 427 | Now that we have an object with filtered predicted gene expression, we can check how the dimensions of our data have changed with TISSUE filtering: 428 | 429 | 430 | ```python 431 | # examine dimensions of data before/after TISSUE filtering 432 | 433 | print("Before TISSUE cell filtering:") 434 | print(adata.shape) 435 | print("\nAfter TISSUE cell filtering:") 436 | print(adata_filtered.shape) 437 | ``` 438 | 439 | Before TISSUE cell filtering: 440 | (3405, 31) 441 | 442 | After TISSUE cell filtering: 443 | (2862, 31) 444 | 445 | 446 | And similarly, we can check the balance of the two cell groups after filtering: 447 | 448 | 449 | ```python 450 | # print balance of labels in the filtered dataset 451 | 452 | pd.DataFrame(np.unique(adata_filtered.obs['condition'], return_counts=True),index=["Group","Number of Cells"]) 453 | ``` 454 | 455 | 456 | 457 | 458 |
459 | |                 | 0    | 1    |
460 | |-----------------|------|------|
461 | | Group           | A    | B    |
462 | | Number of Cells | 1250 | 1612 |
493 |
494 | 495 | 496 | 497 | As we can see, TISSUE automatically filters out a large number of cells with uncertain gene predictions. In the filtered dataset, the balance between group A and group B is relatively preserved. 498 | 499 | Now, we will move on to training a logistic regression classifier on the filtered data. Given the modular nature of TISSUE filtering (i.e. the output is a cell-by-gene matrix), integrating TISSUE with other supervised learning models is essentially plug-and-play. 500 | 501 | First, we split into train (80%) and test (20%) sets: 502 | 503 | 504 | ```python 505 | # split train and test randomly (80%-20%) 506 | np.random.seed(444) 507 | train_idxs = np.random.choice(np.arange(adata_filtered.shape[0]), round(adata_filtered.shape[0]*0.8), replace=False) 508 | test_idxs = np.array([idx for idx in np.arange(adata_filtered.shape[0]) if idx not in train_idxs]) 509 | 510 | train_data = adata_filtered.obsm["spage_predicted_expression"].values[train_idxs,:] 511 | train_labels = adata_filtered.obs["condition"][train_idxs] 512 | 513 | test_data = adata_filtered.obsm["spage_predicted_expression"].values[test_idxs,:] 514 | test_labels = adata_filtered.obs["condition"][test_idxs] 515 | ``` 516 | 517 | Then, we train a logistic regression classifier on the filtered and split data: 518 | 519 | 520 | ```python 521 | from sklearn.linear_model import LogisticRegression 522 | from sklearn.preprocessing import StandardScaler 523 | from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score 524 | 525 | # initialize scaler and standardize the training data 526 | scaler = StandardScaler() 527 | train_data = scaler.fit_transform(train_data) 528 | 529 | # fit model on scaled data 530 | model = LogisticRegression(penalty='l1', solver='liblinear').fit(train_data, train_labels) 531 | ``` 532 | 533 | Now that we have trained the model, we can evaluate its performance on the unseen test data: 534 | 535 | 536 | ```python 537 | # make predictions on test data 538 | pred_test = model.predict(scaler.transform(test_data)) 539 | pred_test_scores = model.predict_proba(scaler.transform(test_data)) 540 | 541 | # print metrics 542 | test_labels_num = [0 if x=="A" else 1 for x in test_labels] 543 | print(f"Accuracy Score: {accuracy_score(test_labels, pred_test)}") 544 | print(f"ROC-AUC Score: {roc_auc_score(test_labels_num, pred_test_scores[:,1])}") 545 | ``` 546 | 547 | Accuracy Score: 0.8024475524475524 548 | ROC-AUC Score: 0.8765594181459566 549 | 550 | 551 | The model performs quite well! It has high accuracy and high ROC-AUC for a relatively balanced binary classification problem. A similar approach can be taken to leverage TISSUE uncertainties when training and evaluating other model architectures (e.g. linear regression, random forest, neural nets). 552 | 553 | ### Tutorial 5: TISSUE cell filtering for PCA (clustering and visualization) 554 | 555 | Downstream clustering and data visualization tasks in transcriptomics data analysis generally rely on dimensionality reduction via principal component analysis (PCA). To incorporate TISSUE uncertainties into these downstream tasks, we perform TISSUE cell filtering before fitting the PCA model and reducing dimensionality. 556 | 557 | In this tutorial, we will apply TISSUE cell filtering to the dataset to generate principal components. This can be done with `tissue.downstream.filtered_PCA()`, which is a wrapper around the direct TISSUE cell filtering.
From these principal components, we can make a two-dimensional PCA plot and perform clustering on the top 15 principal components using K-Means. 558 | 559 | We start from the AnnData object `adata` obtained after running Tutorials 1-3 and apply TISSUE-filtered PCA: 560 | 561 | 562 | ```python 563 | # uncertainty-based cell filtering for PCA 564 | 565 | keep_idxs = tissue.downstream.filtered_PCA(adata, # anndata object 566 | "spage", # prediction method 567 | proportion="otsu", 568 | stratification=adata.obs['condition'].values, 569 | return_keep_idxs=True) 570 | 571 | # filter to keep track of labels 572 | adata_filtered = adata[adata.obs_names[keep_idxs],:].copy() 573 | ``` 574 | 575 | Here we use the same `otsu` threshold-based automatic filtering as before and stratify the filtering by condition. We use the default `n_components=15`. 576 | 577 | TISSUE-filtered PCA produces two outputs, saved into `adata.uns` and `adata.obsm` respectively. The first is the standard principal components obtained on the TISSUE-filtered data, which can be found in `adata.uns['{name of prediction method}_predicted_expression_PC15_filtered_']`. The second is the PCA fitted on the TISSUE-filtered data but then applied to the entire (unfiltered) dataset, which can be found in `adata.obsm['{name of prediction method}_predicted_expression_PC15_']`. We will use the first (and recommended) option in this tutorial. 578 | 579 | 580 | ```python 581 | # retrieve filtered PCA 582 | 583 | PC_reduced = adata.uns['spage_predicted_expression_PC15_filtered_'].copy() 584 | print(PC_reduced.shape) 585 | ``` 586 | 587 | (2862, 15) 588 | 589 | 590 | We now have a reduced representation of our original data that is filtered by TISSUE and has 15 principal components. We can visualize the first two principal components: 591 | 592 | 593 | ```python 594 | # make 2D PCA plot labeled by group 595 | 596 | plt.title("TISSUE-Filtered PCA") 597 | plt.scatter(PC_reduced[adata_filtered.obs['condition']=='A',0], 598 | PC_reduced[adata_filtered.obs['condition']=='A',1], 599 | c="tab:red", s=3, label="A") 600 | plt.scatter(PC_reduced[adata_filtered.obs['condition']=='B',0], 601 | PC_reduced[adata_filtered.obs['condition']=='B',1], 602 | c="tab:blue", s=3, label="B") 603 | plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) 604 | plt.xlabel("PC 1") 605 | plt.ylabel("PC 2") 606 | plt.show() 607 | ``` 608 | 609 | 610 | 611 | ![png](README_files/README_52_0.png) 612 | 613 | 614 | 615 | Visually, there is some separation between the two groups along the first two principal components (although with substantial overlap). 616 | 617 | Next, we can try K-Means clustering using all 15 principal components and evaluate the clustering with the adjusted Rand index (ARI): 618 | 619 | 620 | ```python 621 | from sklearn.cluster import KMeans 622 | 623 | # K-Means clustering 624 | kmeans = KMeans(n_clusters=2).fit(PC_reduced) 625 | clusters = kmeans.labels_ 626 | 627 | # evaluate ARI 628 | from sklearn.metrics import adjusted_rand_score 629 | print(adjusted_rand_score(adata_filtered.obs['condition'], clusters)) 630 | ``` 631 | 632 | /home/edsun/anaconda3/envs/tissue/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4.
Set the value of `n_init` explicitly to suppress the warning 633 | warnings.warn( 634 | 635 | 636 | 0.218865537222824 637 | 638 | 639 | Evidently, clustering with TISSUE-filtered principal components can provide some degree of separation between the two cell groups that we defined previously. 640 | 641 | ### Tutorial 6: TISSUE-WPCA (weighted principal component analysis) 642 | 643 | An alternative to TISSUE cell filtering for PCA is TISSUE-WPCA, which weights each value in the predicted gene expression matrix during PCA, allowing a softer approach that can leverage more of the predicted expression data. In practice, however, TISSUE-WPCA generally yields fewer changes to the resulting principal components (relative to standard PCA) than TISSUE cell filtering does. 644 | 645 | TISSUE-WPCA is highly customizable, mostly in the definition of the weights (see the source code for more details and documentation). We use one of the implementations highlighted in the TISSUE manuscript, which takes the inverse TISSUE prediction interval width and binarizes it into a high weight and a low weight separated by an order of magnitude: 646 | 647 | 648 | ```python 649 | # weighted PCA 650 | 651 | tissue.downstream.weighted_PCA(adata, "spage", pca_method="wpca", weighting="inverse_pi_width", 652 | replace_inf="max", binarize=0.2, binarize_ratio=10, 653 | n_components=15) 654 | ``` 655 | 656 | Here we used the `inverse_pi_width` option, which uses the inverse prediction interval width as the initial weight. We replace all `inf` values with the maximum weight. We specify `binarize=0.2`, the quantile of the weight distribution at which the high/low weight split is drawn (the lowest 20% of weights receive the low weight). The `binarize_ratio` is the fold change between the high and low weight values, and we use `n_components=15`. 657 | 658 | Now that we have performed TISSUE-WPCA, we can access the resulting principal components from `adata.obsm['{prediction method name}_predicted_expression_PC15_']` and use them as we would any other reduced representation of the data. For example, we can visualize the two cell groups along the first two principal components: 659 | 660 | 661 | ```python 662 | # make PC plot 663 | 664 | X_pc = adata.obsm['spage_predicted_expression_PC15_'] 665 | 666 | plt.title("TISSUE Weighted PCA") 667 | plt.scatter(X_pc[adata.obs['condition']=='A',0], X_pc[adata.obs['condition']=='A',1], 668 | c="tab:red", s=3, label="A") 669 | plt.scatter(X_pc[adata.obs['condition']=='B',0], X_pc[adata.obs['condition']=='B',1], 670 | c="tab:blue", s=3, label="B") 671 | plt.xlabel("PC 1") 672 | plt.ylabel("PC 2") 673 | plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) 674 | plt.show() 675 | ``` 676 | 677 | 678 | 679 | ![png](README_files/README_60_0.png) 680 | 681 | 682 | 683 | As with TISSUE-filtered PCA, we see that TISSUE-WPCA can visually separate the two cell groups in the PCA plot. 684 | 685 | # Additional considerations 686 | 687 | ## Hyperparameter selection: 688 | 689 | At various points in the TISSUE pipeline, the user can select different hyperparameters. Here we outline some guiding principles for selecting them reasonably: 690 | 691 | - `n_neighbors` in `tissue.main.build_spatial_graph()` - the approximate number of neighbors to use for computing TISSUE cell-centric variability. Generally, we recommend setting this to a value close to 15 to ensure reliable cell-centric variability estimates. Values from 5-30 also work comparably well.
Alternatively, you can try other spatial graph construction methods among the options in `tissue.main.build_spatial_graph()`, or load your own spatial graph adjacency matrix from a .npz file using `tissue.main.load_spatial_graph()`. 692 | 693 | - `alpha_level` in `tissue.main.conformalize_prediction_interval()` - the confidence parameter corresponding to (1-alpha) TISSUE prediction interval coverage. Generally, we recommend 0.23 to retrieve the 67% TISSUE prediction interval (approx. one standard error), but downstream results are largely robust to the exact choice. Values very close to 0 or very close to 1 are less likely to provide informative calibrations. 694 | 695 | - `k`, `k2` in `tissue.main.conformalize_spatial_uncertainty()` - the gene and cell stratified group numbers, respectively. You can try different values for each, but we recommend staying at or below 4 for either parameter. In the manuscript, we primarily used `k=4` and `k2=1`. If you don't want to choose, TISSUE can automatically select these hyperparameters when you set `k='auto'` and `k2='auto'`. 696 | 697 | 698 | ## Computational runtime and speed ups: 699 | 700 | If you are experiencing slow runtimes with TISSUE, there are several things to check or change for faster runtime (also refer to Extended Data Figure 9 in our publication for a runtime breakdown of the first version of TISSUE on different-sized datasets): 701 | 702 | 703 | **Large number of cells** 704 | - We have tested TISSUE on datasets of up to 20K cells. If your data contains substantially more cells, we recommend downsampling the cells (e.g. random uniform sampling) or subsetting to cell groups of interest. 705 | - If the prediction step takes a long time, we suggest decreasing the number of cross-validation folds by setting `n_folds` in `tissue.main.predict_gene_expression()`. 706 | 707 | **Large number of genes** 708 | - Generally, TISSUE is robust to the number of genes, but for further speedup we recommend predicting only the genes that are necessary, or using the scRNAseq reference dataset to identify a set of highly variable genes to predict beforehand (for example with `scanpy.pp.highly_variable_genes`, if whole-transcriptome coverage is desired). 709 | - Set `weight='exp_cos_pca'` and `weight_n_pc` to some integer (e.g. 15) in `tissue.main.conformalize_spatial_uncertainty()` to compute cosine similarity weights in a low-dimensional PCA space (for better runtime and reduced high-dimensional distortion). 710 | - Set `n_pc` and `n_pc2` to some integer (e.g. 15) in `tissue.main.conformalize_spatial_uncertainty()` so that k-means clustering is performed in a lower-dimensional space (for better runtime and performance). 711 | 712 | **Other runtime tips**: 713 | 714 | - Turn off the Wasserstein calculation in `tissue.main.conformalize_prediction_interval()` by setting `compute_wasserstein=False` (the default). 715 | 716 | 717 | 718 | ## Memory usage: 719 | 720 | We have optimized TISSUE to be memory-efficient with respect to the size of the original dataset.
Since spatial transcriptomics datasets can be very large, and TISSUE requires additional overhead for some of its operations, here are some suggestions for dealing with memory issues: 721 | 722 | - To downsize datasets with many cells or many genes (to predict), refer to the previous section on runtime for downsampling approaches that improve both runtime and memory usage. 723 | 724 | - In `tissue.main.build_spatial_graph()`, consider setting `radius` to a value (default is None) for any of the radius-based methods. This is more important in older versions of TISSUE, which were not optimized. 725 | 726 | - In `tissue.downstream.multiple_imputation_testing()`, make sure that `save_mi=False` (the default), which avoids accumulating each multiple imputation in memory. 727 | 728 | 729 | 730 | 731 | # UNDER DEVELOPMENT: 732 | - Multi-threading for making cross-validation predictions in `tissue.main.predict_gene_expression()`. 733 | - Gene filtering guidelines / strategy 734 | - Suppress warning printouts 735 | 736 | # Citation 737 | 738 | If you find this code useful, we would appreciate it if you cite the following publications: 739 | 740 | --- 741 | Sun, E.D., Ma, R., Navarro Negredo, P. et al. TISSUE: uncertainty-calibrated prediction of single-cell spatial transcriptomics improves downstream analyses. Nat Methods (2024). https://doi.org/10.1038/s41592-024-02184-y 742 | 743 | --- 744 | **Preprint:** 745 | 746 | Sun ED, Ma R, Navarro Negredo P, Brunet A, Zou J. TISSUE: uncertainty-calibrated prediction of single-cell spatial transcriptomics improves downstream analyses. Preprint at https://doi.org/10.1101/2023.04.25.538326 (2023). 747 | 748 | For Jupyter notebooks and Python scripts associated with our original publication, please refer to https://github.com/sunericd/tissue-figures-and-analyses.git.
**NOTE: For the original publication, we used TISSUE version 0.0.2** 749 | 750 | 754 | -------------------------------------------------------------------------------- /tissue/downstream.py: -------------------------------------------------------------------------------- 1 | # Contains functions for all downstream applications of TISSUE calibration scores and prediction intervals 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import scanpy as sc 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.decomposition import PCA 9 | import anndata as ad 10 | import os 11 | import sys 12 | 13 | #from tissue.main import build_calibration_scores, get_spatial_uncertainty_scores_from_metadata 14 | from .main import build_calibration_scores, get_spatial_uncertainty_scores_from_metadata 15 | 16 | 17 | def multiple_imputation_testing (adata, predicted, calib_genes, condition, test="ttest", n_imputations=100, 18 | group1=None, group2=None, symmetric=False, return_keys=False, save_mi=False): 19 | ''' 20 | Uses multiple imputation with the score distributions to perform hypothesis testing 21 | 22 | Parameters 23 | ---------- 24 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 25 | predicted [str] - key in adata.obsm that corresponds to predicted gene expression 26 | calib_genes [list or arr of str] - names of the genes in adata.var_names that are used in the calibration set 27 | condition [str] - key in adata.obs for which to compute the hypothesis test 28 | group1 [value] - value in adata.obs[condition] identifying the first comparison group 29 | if None, will perform group vs all comparisons for all unique values in adata.obs[condition] 30 | group2 [value] - value in adata.obs[condition] identifying the second comparison group 31 | if None, will compare against all values that are not group1 32 | test [str] - statistical test to use: 33 | "ttest" - two-sample t-test using Rubin's rules (best theoretical support/guarantee) 34 | "wilcoxon_greater" - one-sided Wilcoxon rank-sum (Mann-Whitney U) test for greater expression using p-value transformation 35 | "wilcoxon_less" - one-sided Wilcoxon rank-sum (Mann-Whitney U) test for lesser expression using p-value transformation 36 | "spatialde" - SpatialDE test using p-value transformation 37 | n_imputations [int] - number of imputations to use 38 | symmetric [bool] - whether to have symmetric (or non-symmetric) prediction intervals 39 | return_keys [bool] - whether to return the keys for which to access the results from adata 40 | save_mi [False or str] - multiple imputation saving (only used for multiple_imputation_ttest()) 41 | 42 | Returns 43 | ------- 44 | Modifies adata in-place to add the statistics and test results to metadata 45 | Optionally returns the keys to access the results from adata 46 | 47 | ''' 48 | ##################################################################### 49 | # T-test (default) - this is the option with best theoretical support 50 | ##################################################################### 51 | if test == "ttest": 52 | keys = multiple_imputation_ttest (adata, predicted, calib_genes, condition, n_imputations=n_imputations, 53 | group1=group1, group2=group2, symmetric=symmetric, save_mi=save_mi) 54 | 55 | ##################################################################### 56 | # One-sided ("less"/"greater") Wilcoxon test 57 | #####################################################################
58 | elif test == "wilcoxon_less": 59 | keys = multiple_imputation_wilcoxon (adata, predicted, calib_genes, condition, n_imputations=n_imputations, 60 | group1=group1, group2=group2, symmetric=symmetric, direction='less') 61 | elif test == "wilcoxon_greater": 62 | keys = multiple_imputation_wilcoxon (adata, predicted, calib_genes, condition, n_imputations=n_imputations, 63 | group1=group1, group2=group2, symmetric=symmetric, direction='greater') 64 | 65 | ##################################################################### 66 | # SpatialDE (spatially variable genes) test 67 | ##################################################################### 68 | elif test == "spatialde": 69 | keys = multiple_imputation_spatialde (adata, predicted, calib_genes, n_imputations=n_imputations, symmetric=symmetric) 70 | 71 | # raise exception if test does not match options 72 | else: 73 | raise Exception ("Specified test not recognized") 74 | 75 | if return_keys is True: 76 | 77 | return(keys) 78 | 79 | 80 | def multiple_imputation_spatialde (adata, predicted, calib_genes, n_imputations=100, symmetric=False): 81 | ''' 82 | Runs TISSUE multiple imputation SpatialDE test using p-value transformation 83 | 84 | See multiple_imputation_testing() for details on parameters 85 | ''' 86 | import SpatialDE 87 | 88 | # get uncertainties and scores from saved adata 89 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 90 | 91 | ### Building calibration sets for scores 92 | 93 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 94 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 95 | 96 | ### Multiple imputation 97 | 98 | # init dictionary to hold results 99 | stat_dict = {} 100 | stat_dict["pvalue"] = {} 101 | 102 | for m in range(n_imputations): 103 | 104 | # generate new imputation 105 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 106 | 107 | key = "spatialde" 108 | 109 | if m == 0: # init list 110 | stat_dict["pvalue"][key] = [] 111 | 112 | # get spatialDE p-values 113 | normalized_matrix = new_G/(1+np.sum(new_G,axis=1)[:,None]) 114 | normalized_matrix = np.log1p((normalized_matrix-np.min(normalized_matrix)) * 100) 115 | sp_df = pd.DataFrame(normalized_matrix, 116 | columns=adata.obsm[predicted].columns, 117 | index=adata.obsm[predicted].index) 118 | 119 | results = SpatialDE.run(adata.obsm['spatial'], sp_df) 120 | 121 | # sort by gene name order 122 | results.drop_duplicates(subset = ['g'], keep = 'first', inplace = True) # workaround duplication SpatialDE bug 123 | results.g = results.g.astype("category") 124 | results.g = results.g.cat.set_categories(adata.obsm[predicted].columns) 125 | results = results.sort_values(["g"]) 126 | 127 | # get pvalues 128 | pval = list(results["pval"]) 129 | stat_dict["pvalue"][key].append(pval) 130 | 131 | # pool statistics 132 | pooled_results_dict = {} 133 | pooled_results_dict['pvalue'] = {} 134 | # for each test grouping 135 | for key in stat_dict['pvalue'].keys(): 136 | pooled_results_dict['pvalue'][key] = [] 137 | pval_arr = np.vstack(stat_dict['pvalue'][key]) 138 | # for each gene, get mi pvalue 139 | for ci in range(pval_arr.shape[1]): 140 | mi_pval = multiply_imputed_pvalue (pval_arr[:,ci], method="licht_rubin") 141 | pooled_results_dict['pvalue'][key].append(mi_pval) 142 | 143 | # add stats to adata 144 | keys_list = [] 145 | for key_measure in pooled_results_dict.keys(): 146 | 
for key_comparison in pooled_results_dict[key_measure].keys(): 147 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(np.array(pooled_results_dict[key_measure][key_comparison])[None,:], 148 | columns=adata.obsm[predicted].columns) 149 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 150 | 151 | return(keys_list) 152 | 153 | 154 | def multiple_imputation_wilcoxon (adata, predicted, calib_genes, condition, n_imputations=100, 155 | group1=None, group2=None, symmetric=False, direction="greater"): 156 | ''' 157 | Runs TISSUE multiple imputation one-sided Wilcoxon rank-sum (greater/less) test using p-value transformation 158 | 159 | See multiple_imputation_testing() for details on parameters 160 | ''' 161 | from scipy.stats import mannwhitneyu 162 | 163 | # get uncertainties and scores from saved adata 164 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 165 | 166 | ### Building calibration sets for scores 167 | 168 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 169 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 170 | 171 | ### Multiple imputation 172 | 173 | # init dictionary to hold results 174 | stat_dict = {} 175 | stat_dict["pvalue"] = {} 176 | 177 | # cast condition to str 178 | condition = str(condition) 179 | 180 | for m in range(n_imputations): 181 | 182 | # generate new imputation 183 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 184 | 185 | if group1 is None: # pairwise comparisons against all 186 | 187 | for g1 in np.unique(adata.obs[condition]): 188 | 189 | key = str(g1)+"_all" 190 | 191 | if m == 0: # init list 192 | stat_dict["pvalue"][key] = [] 193 | 194 | g1_bool = (adata.obs[condition] == g1) # g1 195 | g2_bool = (adata.obs[condition] != g1) # all other 196 | 197 | # get wilcoxon p-values 198 | pval = [] 199 | for ci in range(new_G.shape[1]): 200 | u,p = mannwhitneyu(new_G[g1_bool,ci], new_G[g2_bool,ci], alternative=direction) 201 | pval.append(p) 202 | 203 | stat_dict["pvalue"][key].append(pval) 204 | 205 | elif group2 is None: # group1 vs all 206 | 207 | key = str(group1)+"_all" 208 | 209 | if m == 0: # init list 210 | stat_dict["pvalue"][key] = [] 211 | 212 | g1_bool = (adata.obs[condition] == group1) # g1 213 | g2_bool = (adata.obs[condition] != group1) # all other 214 | 215 | # get wilcoxon p-values 216 | pval = [] 217 | for ci in range(new_G.shape[1]): 218 | u,p = mannwhitneyu(new_G[g1_bool,ci], new_G[g2_bool,ci], alternative=direction) 219 | pval.append(p) 220 | 221 | stat_dict["pvalue"][key].append(pval) 222 | 223 | else: # group1 vs group2 224 | 225 | key = str(group1)+"_"+str(group2) 226 | 227 | if m == 0: # init list 228 | stat_dict["pvalue"][key] = [] 229 | 230 | g1_bool = (adata.obs[condition] == group1) # g1 231 | g2_bool = (adata.obs[condition] == group2) # g2 232 | 233 | # get wilcoxon p-values 234 | pval = [] 235 | for ci in range(new_G.shape[1]): 236 | u,p = mannwhitneyu(new_G[g1_bool,ci], new_G[g2_bool,ci], alternative=direction) 237 | pval.append(p) 238 | 239 | stat_dict["pvalue"][key].append(pval) 240 | 241 | # pool statistics 242 | pooled_results_dict = {} 243 | pooled_results_dict['pvalue'] = {} 244 | # for each test grouping 245 | for key in stat_dict['pvalue'].keys(): 246 | pooled_results_dict['pvalue'][key] = [] 247 | pval_arr = np.vstack(stat_dict['pvalue'][key]) 248 | # for each gene,
get mi pvalue 249 | for ci in range(pval_arr.shape[1]): 250 | mi_pval = multiply_imputed_pvalue (pval_arr[:,ci], method="licht_rubin") 251 | pooled_results_dict['pvalue'][key].append(mi_pval) 252 | 253 | # add stats to adata 254 | keys_list = [] 255 | for key_measure in pooled_results_dict.keys(): 256 | for key_comparison in pooled_results_dict[key_measure].keys(): 257 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(np.array(pooled_results_dict[key_measure][key_comparison])[None,:], 258 | columns=adata.obsm[predicted].columns) 259 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 260 | 261 | return(keys_list) 262 | 263 | 264 | def multiply_imputed_pvalue (pvalues, method="licht_rubin"): 265 | ''' 266 | Computes a multiply imputed p-value from a list of p-values according to Licht-Rubin procedure or median procedure 267 | 268 | Parameters 269 | ---------- 270 | pvalues [array-like] - array of p-values from multiple imputation tests 271 | method [str] - which method for p-value calculation to use: "licht_rubin" or "median" 272 | 273 | Returns 274 | ------- 275 | mi_pvalue [float] - p-value modified for multiple imputation 276 | 277 | See reference for technical details: https://stefvanbuuren.name/fimd/sec-multiparameter.html#sec:chi 278 | ''' 279 | from scipy.stats import norm 280 | 281 | if method == "licht_rubin": 282 | z = norm.ppf(pvalues) # transform to z-scale 283 | num = np.nanmean(z) 284 | den = np.sqrt(1 + np.nanvar(z)) 285 | mi_pvalue = norm.cdf( num / den) # average and transform back 286 | 287 | elif method == "median": 288 | mi_pvalue = np.nanmedian(pvalues) 289 | 290 | else: 291 | raise Exception ("method for multiply_imputed_pvalue() not recognized") 292 | 293 | return(mi_pvalue) 294 | 295 | 296 | 297 | def multiple_imputation_ttest (adata, predicted, calib_genes, condition, n_imputations=100, 298 | group1=None, group2=None, symmetric=False, save_mi=False): 299 | ''' 300 | Runs TISSUE multiple imputation two-sample t-test using Rubin's rules 301 | 302 | See multiple_imputation_testing() for details on parameters 303 | 304 | Additional Parameters 305 | --------------------- 306 | save_mi [False or str] - if not False, then saves a "{predicted}.npy" stacked matrix of imputed gene expression at the save_mi path -- NOTE: this requires large memory 307 | ''' 308 | 309 | # get uncertainties and scores from saved adata 310 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 311 | 312 | ### Building calibration sets for scores 313 | 314 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 315 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 316 | 317 | ### Multiple imputation 318 | 319 | # init dictionary to hold results (for independent two-sample t-test) 320 | stat_dict = {} 321 | stat_dict["mean_difference"] = {} 322 | stat_dict["standard_deviation"] = {} 323 | 324 | # cast condition to str 325 | condition = str(condition) 326 | 327 | new_G_list = [] # for saving multiple imputations 328 | 329 | for m in range(n_imputations): 330 | 331 | # generate new imputation 332 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 333 | if save_mi is not False: 334 | new_G_list.append(new_G) 335 | 336 | # calculate statistics for the imputation using approach from Palmer & Peer, 2016 337 | 338 | if group1 is None: # pairwise comparisons
against all 339 | 340 | for g1 in np.unique(adata.obs[condition]): 341 | 342 | key = str(g1)+"_all" 343 | 344 | if m == 0: # init list 345 | stat_dict["mean_difference"][key] = [] 346 | stat_dict["standard_deviation"][key] = [] 347 | 348 | g1_bool = (adata.obs[condition] == g1) # g1 349 | g2_bool = (adata.obs[condition] != g1) # all other 350 | 351 | mean_diff, pooled_sd = get_ttest_stats(new_G, g1_bool, g2_bool) # get ttest stats 352 | stat_dict["mean_difference"][key].append(mean_diff) 353 | stat_dict["standard_deviation"][key].append(pooled_sd) 354 | 355 | elif group2 is None: # group1 vs all 356 | 357 | key = str(group1)+"_all" 358 | 359 | if m == 0: # init list 360 | stat_dict["mean_difference"][key] = [] 361 | stat_dict["standard_deviation"][key] = [] 362 | 363 | g1_bool = (adata.obs[condition] == group1) # g1 364 | g2_bool = (adata.obs[condition] != group1) # all other 365 | 366 | mean_diff, pooled_sd = get_ttest_stats(new_G, g1_bool, g2_bool) # get ttest stats 367 | stat_dict["mean_difference"][key].append(mean_diff) 368 | stat_dict["standard_deviation"][key].append(pooled_sd) 369 | 370 | else: # group1 vs group2 371 | 372 | key = str(group1)+"_"+str(group2) 373 | 374 | if m == 0: # init list 375 | stat_dict["mean_difference"][key] = [] 376 | stat_dict["standard_deviation"][key] = [] 377 | 378 | g1_bool = (adata.obs[condition] == group1) # g1 379 | g2_bool = (adata.obs[condition] == group2) # g2 380 | 381 | mean_diff, pooled_sd = get_ttest_stats(new_G, g1_bool, g2_bool) # get ttest stats 382 | stat_dict["mean_difference"][key].append(mean_diff) 383 | stat_dict["standard_deviation"][key].append(pooled_sd) 384 | 385 | # pool statistics and perform t-test 386 | pooled_results_dict = pool_multiple_stats(stat_dict) 387 | 388 | # add stats to adata 389 | keys_list = [] 390 | for key_measure in pooled_results_dict.keys(): 391 | for key_comparison in pooled_results_dict[key_measure].keys(): 392 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(pooled_results_dict[key_measure][key_comparison][None,:], 393 | columns=adata.obsm[predicted].columns) 394 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 395 | 396 | # save multiple imputations 397 | if save_mi is not False: 398 | # stack all imputations and save 399 | stacked_mi = np.dstack(new_G_list) 400 | np.save(os.path.join(save_mi,f"{predicted}.npy"), stacked_mi) 401 | 402 | return(keys_list) 403 | 404 | 405 | def multiple_imputation_gene_signature (sig_dirpath, adata, predicted, calib_genes, condition, n_imputations=100, 406 | group1=None, group2=None, symmetric=False, return_keys=False, load_mi=False): 407 | ''' 408 | Uses multiple imputation with the score distributions to perform hypothesis testing on gene signatures 409 | 410 | Parameters 411 | ---------- 412 | sig_dirpath [str] - path to the directory containing the gene signatures organized as: 413 | sig_dirpath/ 414 | {name of signature 1}/ 415 | {name of signature N}/ 416 | genes.txt - text file with each row being a gene name 417 | coefficients.txt - optional text file with each row being a float weight for corresponding gene 418 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 419 | predicted [str] - key in adata.obsm that corresponds to predicted gene expression 420 | calib_genes [list or arr of str] - names of the genes in adata.var_names that are used in the calibration set 421 | condition [str] - key in adata.obs for which to compute the hypothesis test 422 | 
group1 [value] - value in adata.obs[condition] identifying the first comparison group 423 | if None, will perform group vs all comparisons for all unique values in adata.obs[condition] 424 | group2 [value] - value in adata.obs[condition] identifying the second comparison group 425 | if None, will compare against all values that are not group1 426 | n_imputations [int] - number of imputations to use 427 | symmetric [bool] - whether to have symmetric (or non-symmetric) prediction intervals 428 | return_keys [bool] - whether to return the keys for which to access the results from adata 429 | load_mi [bool] - whether to load the saved "{predicted}.npy" stacked matrix of all multiple imputations from sig_dirpath (written by multiple_imputation_ttest() via save_mi) 430 | 431 | Returns 432 | ------- 433 | Modifies adata in-place to add the statistics and test results to metadata 434 | Optionally returns the keys to access the results from adata 435 | 436 | ''' 437 | ##################################################################### 438 | # T-test (default) - this is the only option currently for signatures 439 | ##################################################################### 440 | 441 | if load_mi is False: 442 | # get uncertainties and scores from saved adata 443 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 444 | 445 | ### Building calibration sets for scores 446 | 447 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric, 448 | include_zero_scores=True, trim_quantiles=[None, 0.8]) # trim top 20% scores 449 | else: # load in saved multiple imputations 450 | mi_path = os.path.join(sig_dirpath,f"{predicted}.npy") # path to saved multiple imputations 451 | mi_stacked = np.load(mi_path) 452 | 453 | ### Multiple imputation 454 | 455 | # init dictionary to hold results (for independent two-sample t-test) 456 | stat_dict = {} 457 | stat_dict["mean_difference"] = {} 458 | stat_dict["standard_deviation"] = {} 459 | 460 | # cast condition to str 461 | condition = str(condition) 462 | 463 | for m in range(n_imputations): 464 | 465 | # generate new imputation 466 | if load_mi is False: 467 | new_G = sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=symmetric) 468 | else: 469 | new_G = mi_stacked[:,:,m].copy() # take the m-th multiple imputation 470 | 471 | # compute all signatures 472 | imputed_sigs = [] 473 | sig_names = [] 474 | 475 | for sigdir in next(os.walk(sig_dirpath))[1]: # iterate all top-level signature directories 476 | # read in genes 477 | with open(os.path.join(sig_dirpath,sigdir,"genes.txt")) as f: 478 | signature_genes = [line.rstrip() for line in f] 479 | signature_genes = np.array([x.lower() for x in signature_genes]) 480 | # load coefficients (if any) 481 | if os.path.isfile(os.path.join(sig_dirpath,sigdir,"coefficients.txt")): 482 | signature_coefficients = np.loadtxt(os.path.join(sig_dirpath,sigdir,"coefficients.txt")) 483 | else: 484 | signature_coefficients = np.ones(len(signature_genes)) 485 | # subset into shared genes 486 | shared_gene_idxs = [ii for ii in range(len(signature_genes)) if signature_genes[ii] in adata.obsm[predicted].columns] 487 | signature_genes = signature_genes[shared_gene_idxs] 488 | signature_coefficients = signature_coefficients[shared_gene_idxs] 489 | # if non-empty signature, then compute 490 | if len(signature_genes) > 0: 491 | # compute signature 492 | subset_new_G = pd.DataFrame(new_G, columns = adata.obsm[predicted].columns)[signature_genes].values 493 | sig_value
= np.nansum(subset_new_G*signature_coefficients, axis=1) 494 | # append signature value and name 495 | imputed_sigs.append(sig_value) 496 | sig_names.append(sigdir) 497 | 498 | # construct gene signature matrix 499 | imputed_sigs = np.vstack(imputed_sigs).T 500 | 501 | # keep running average of imputed gene signatures 502 | if m == 0: 503 | mean_imputed_sigs = imputed_sigs * 1/n_imputations 504 | else: 505 | mean_imputed_sigs += imputed_sigs * 1/n_imputations 506 | 507 | # calculate statistics for the imputation using approach from Palmer & Peer, 2016 508 | 509 | if group1 is None: # pairwise comparisons against all 510 | 511 | for g1 in np.unique(adata.obs[condition]): 512 | 513 | key = str(g1)+"_all" 514 | 515 | if m == 0: # init list 516 | stat_dict["mean_difference"][key] = [] 517 | stat_dict["standard_deviation"][key] = [] 518 | 519 | g1_bool = (adata.obs[condition] == g1) # g1 520 | g2_bool = (adata.obs[condition] != g1) # all other 521 | 522 | mean_diff, pooled_sd = get_ttest_stats(imputed_sigs, g1_bool, g2_bool) # get ttest stats 523 | stat_dict["mean_difference"][key].append(mean_diff) 524 | stat_dict["standard_deviation"][key].append(pooled_sd) 525 | 526 | elif group2 is None: # group1 vs all 527 | 528 | key = str(group1)+"_all" 529 | 530 | if m == 0: # init list 531 | stat_dict["mean_difference"][key] = [] 532 | stat_dict["standard_deviation"][key] = [] 533 | 534 | g1_bool = (adata.obs[condition] == group1) # g1 535 | g2_bool = (adata.obs[condition] != group1) # all other 536 | 537 | mean_diff, pooled_sd = get_ttest_stats(imputed_sigs, g1_bool, g2_bool) # get ttest stats 538 | stat_dict["mean_difference"][key].append(mean_diff) 539 | stat_dict["standard_deviation"][key].append(pooled_sd) 540 | 541 | else: # group1 vs group2 542 | 543 | key = str(group1)+"_"+str(group2) 544 | 545 | if m == 0: # init list 546 | stat_dict["mean_difference"][key] = [] 547 | stat_dict["standard_deviation"][key] = [] 548 | 549 | g1_bool = (adata.obs[condition] == group1) # g1 550 | g2_bool = (adata.obs[condition] == group2) # g2 551 | 552 | mean_diff, pooled_sd = get_ttest_stats(imputed_sigs, g1_bool, g2_bool) # get ttest stats 553 | stat_dict["mean_difference"][key].append(mean_diff) 554 | stat_dict["standard_deviation"][key].append(pooled_sd) 555 | 556 | # pool statistics and perform t-test 557 | pooled_results_dict = pool_multiple_stats(stat_dict) 558 | 559 | # add stats to adata 560 | keys_list = [] 561 | for key_measure in pooled_results_dict.keys(): 562 | for key_comparison in pooled_results_dict[key_measure].keys(): 563 | adata.uns[predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure] = pd.DataFrame(pooled_results_dict[key_measure][key_comparison][None,:], 564 | columns=sig_names) 565 | keys_list.append(predicted.split("_")[0]+"_"+key_comparison+"_"+key_measure) 566 | 567 | # add gene sigs to adata 568 | adata.obsm[predicted+"_gene_signatures"] = pd.DataFrame(mean_imputed_sigs, columns=sig_names, index=adata.obs_names) 569 | 570 | if return_keys is True: 571 | 572 | return(keys_list) 573 | 574 | 575 | 576 | def sample_new_imputation_from_scores (G, G_stdev, groups, scores_flattened_dict, symmetric=False): 577 | ''' 578 | Creates a new imputation by sampling from scores and adding to G 579 | 580 | Parameters 581 | ---------- 582 | G, G_stdev, groups - outputs of get_spatial_uncertainty_scores_from_metadata() 583 | scores_flattened_dict - output of build_calibration_scores() 584 | 585 | See multiple_imputation_testing() for more details of arguments 586 | 587 | Returns 588 | ------- 589 | 
new_G - array of the new sampled predicted gene expression (same dimensions as G: cells x genes) 590 | ''' 591 | new_scores = np.zeros(G.shape) # init array for sampled scores 592 | new_add_sub = np.zeros(G.shape) # init array for add/subtract coefs 593 | 594 | # for each group, sample calibration score and corresponding imputations 595 | unique_groups, unique_counts = np.unique(groups[~np.isnan(groups)], return_counts=True) 596 | 597 | for ui, group in enumerate(unique_groups): 598 | count = unique_counts[ui] # get number of values in group 599 | 600 | # sample scores and add/sub indicators 601 | if symmetric is True: 602 | scores_flattened = scores_flattened_dict[str(group)] # get scores 603 | if len(scores_flattened) < 100: # default to full set if <100 in group 604 | scores_flattened = scores_flattened_dict[str(np.nan)] 605 | sampled_scores = np.random.choice(scores_flattened, count, replace=True) # with replacement, sample scores 606 | add_sub = np.random.choice([-1,1], count, replace=True) # add or subtract 607 | else: 608 | scores_lo_flattened = scores_flattened_dict[str(group)][0] 609 | scores_hi_flattened = scores_flattened_dict[str(group)][1] 610 | if (len(scores_lo_flattened) < 100) or (len(scores_hi_flattened) < 100): # default to full set if <100 in group 611 | scores_lo_flattened = scores_flattened_dict[str(np.nan)][0] 612 | scores_hi_flattened = scores_flattened_dict[str(np.nan)][1] 613 | scores_flattened = np.concatenate((scores_lo_flattened, scores_hi_flattened)) 614 | lo_hi_indicators = np.concatenate(([-1]*len(scores_lo_flattened), [1]*len(scores_hi_flattened))) 615 | # sample indices 616 | sampled_idxs = np.random.choice(np.arange(len(scores_flattened)), count, replace=True) # with replacement 617 | sampled_scores = scores_flattened[sampled_idxs] 618 | add_sub = lo_hi_indicators[sampled_idxs] 619 | 620 | # append to new_scores and new_add_sub 621 | new_scores[groups==group] = sampled_scores 622 | new_add_sub[groups==group] = add_sub 623 | 624 | # calculate new imputation 625 | new_G = G + new_add_sub*(new_scores*G_stdev) 626 | 627 | return (new_G) 628 | 629 | 630 | def get_ttest_stats(G, g1_bool, g2_bool): 631 | ''' 632 | Computes mean_diff and pooled SD for each column of G independently 633 | 634 | Parameters 635 | ---------- 636 | G [array] - 2D array with columns as genes and rows as cells 637 | g1_bool [bool array] - 1D array with length equal to number of rows in G; labels group1 638 | g2_bool [bool array] - 1D array with length equal to number of rows in G; labels group2 639 | 640 | Returns 641 | ------- 642 | mean_diff - mean difference for t-test 643 | pooled_sd - pooled standard deviation for t-test 644 | ''' 645 | mean_diff = np.nanmean(G[g1_bool,:], axis=0) - np.nanmean(G[g2_bool,:], axis=0) 646 | n1 = np.count_nonzero(~np.isnan(G[g1_bool,:]), axis=0) 647 | n2 = np.count_nonzero(~np.isnan(G[g2_bool,:]), axis=0) 648 | sp = np.sqrt( ( (n1-1)*(np.nanvar(G[g1_bool,:],axis=0)) + (n2-1)*(np.nanvar(G[g2_bool,:],axis=0)) ) / (n1+n2-2) ) 649 | pooled_sd = np.sqrt(1/n1 + 1/n2) * sp 650 | 651 | return(mean_diff, pooled_sd) 652 | 653 | 654 | def two_sample_ttest (G, g1_bool, g2_bool): 655 | ''' 656 | Computes two-sample t-test for unequal sample sizes using get_ttest_stats() 657 | 658 | Parameters 659 | ---------- 660 | G [array] - 2D array with columns as genes and rows as cells 661 | g1_bool [bool array] - 1D array with length equal to number of rows in G; labels group1 662 | g2_bool [bool array] - 1D array with length equal to number of rows in G; labels group2
663 | 664 | Returns 665 | ------- 666 | tt - t-statistic 667 | pp - p-value 668 | ''' 669 | from scipy import stats 670 | # calculate t-stat 671 | mean_diff, pooled_sd = get_ttest_stats(G, g1_bool, g2_bool) 672 | tt = mean_diff/pooled_sd 673 | # calculate dof 674 | n1 = np.count_nonzero(~np.isnan(G[g1_bool,:]), axis=0) 675 | n2 = np.count_nonzero(~np.isnan(G[g2_bool,:]), axis=0) 676 | dof = n1+n2-2 677 | # calculate p-value 678 | pp = 2*(1 - stats.t.cdf(np.abs(tt), dof)) 679 | 680 | return(tt, pp) 681 | 682 | 683 | def pool_multiple_stats(stat_dict): 684 | ''' 685 | Pool stats across multiple imputations for t-test 686 | 687 | Parameters 688 | ---------- 689 | stat_dict [dict] - dictionary containing statistical testing results (generated in multiple_imputation_ttest()) 690 | 691 | Returns 692 | ------- 693 | results_dict [dict] - dictionary containing the pooled statistics from using Rubin's rules 694 | ''' 695 | from scipy import stats 696 | 697 | # init results_dict 698 | results_dict = {} 699 | results_dict["tstat"] = {} 700 | results_dict["pvalue"] = {} 701 | 702 | results_dict["varw"] = {} 703 | results_dict["varb"] = {} 704 | results_dict["poolmean"] = {} 705 | 706 | for key in stat_dict["mean_difference"].keys(): 707 | 708 | d = len(stat_dict["mean_difference"][key]) 709 | 710 | # compute pooled terms 711 | pooled_mean = np.mean(np.vstack(stat_dict["mean_difference"][key]), axis=0) 712 | var_w = np.mean(np.vstack(stat_dict["standard_deviation"][key])**2, axis=0) # within-draw sample variance 713 | var_b = 1/(d-1) * np.sum((np.vstack(stat_dict["mean_difference"][key])-pooled_mean)**2, axis=0) # between-draw sample variance 714 | var_MI = var_w + (1+1/d)*var_b # multiple imputation variance 715 | 716 | test_stat = pooled_mean / np.sqrt(var_MI) # pooled t statistic 717 | 718 | # compute pvalue from T distribution 719 | dof = (d-1)*(1+(d*var_w)/((d+1)*var_b))**2 # degrees of freedom for T distribution 720 | pval = 2*(1 - stats.t.cdf(np.abs(test_stat), dof)) 721 | 722 | # Add test statistic and pvalue 723 | results_dict["tstat"][key] = test_stat 724 | results_dict["pvalue"][key] = pval 725 | 726 | # Add intermediate stats (for debugging, etc) 727 | results_dict["varw"][key] = var_w 728 | results_dict["varb"][key] = var_b 729 | results_dict["poolmean"][key] = pooled_mean 730 | 731 | return(results_dict) 732 | 733 | 734 | 735 | def weighted_PCA(adata, imp_method, pca_method="wpca", weighting="inverse_norm_pi_width", quantile_cutoff=None, 736 | n_components=15, replace_inf=None, binarize=0.2, binarize_ratio=10, log_transform=False, 737 | scale=True, tag="", return_weights=False,): 738 | ''' 739 | Runs weighted PCA using the "wpca" package: https://github.com/jakevdp/wpca 740 | 741 | Parameters 742 | ---------- 743 | adata [AnnData] - should be the AnnData after running conformalize_prediction_interval() 744 | - must include in obsm: {imp_method}_predicted_expression, 745 | {imp_method}_predicted_expression_lo, 746 | {imp_method}_predicted_expression_hi 747 | imp_method [str] - specifies which imputation method to return PCA for (e.g. 
'knn', 'spage', 'tangram') 748 | pca_method [str] - "wpca" for WPCA (Delchambre, 2014), "empca" for EMPCA (Bailey, 2012), "pca" for PCA 749 | weighting [str] - "uniform" (regular PCA) 750 | "inverse_pi_width" (weights are 1/(prediction interval width)) 751 | "inverse_norm_pi_width" (weights are 1/(prediction interval width), normalized per gene by the mean weight) 752 | quantile_cutoff [None or float] - quantile (between 0 and 1) for which to set a ceiling for the weights 753 | n_components [int] - number of principal components 754 | replace_inf [None, str, float] - what to replace np.inf with (after all other weight transforms); if None, keeps np.inf 755 | can also be "max", "min", "mean", or "median" to replace with the corresponding statistic of the finite weights 756 | binarize [bool or float] - if True, binarizes the weights with an Otsu threshold (weights at or above the threshold are set to 1, weights below to 1/binarize_ratio); if a float, uses that weight quantile as the threshold instead 757 | binarize_ratio [int or float] - fold change between the high and low binarized weight values 758 | log_transform [bool] - whether to log1p transform weights (will be done before binarization if binarize=True) 759 | scale [bool] - whether to scale data with StandardScaler() before running WPCA 760 | tag [str] - additional tag to append to the obsm key for storing the PCs 761 | return_weights [bool] - whether to return weights used in WPCA 762 | 763 | Returns 764 | ------- 765 | Stores the result in adata.obsm["{imp_method}_predicted_expression_PC{n_components}_{tag}"] 766 | Optionally returns the array of weights used in WPCA 767 | 768 | Refer to postprocess_weights() for the order of weight transformations 769 | ''' 770 | from wpca import PCA, WPCA, EMPCA 771 | 772 | predicted = f"{imp_method}_predicted_expression" 773 | 774 | # get gene names/order 775 | genes = adata.obsm[predicted].columns 776 | 777 | # determine weights 778 | if weighting == "inverse_pi_width": 779 | weights = 1/(adata.obsm[predicted+'_hi'][genes].values-adata.obsm[predicted+'_lo'][genes].values) 780 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 781 | elif weighting == "inverse_norm_pi_width": 782 | weights = 1/(adata.obsm[predicted+'_hi'][genes].values-adata.obsm[predicted+'_lo'][genes].values) 783 | weights = weights / np.nanmean(weights, axis=0) 784 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 785 | elif weighting == "uniform": 786 | weights = np.ones(adata.obsm[predicted].shape) 787 | elif weighting == "inverse_residual": 788 | weights = 1/np.abs(adata.obsm[predicted][genes].values - np.array(adata[:,genes].X)) 789 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 790 | elif weighting == "inverse_norm_residual": 791 | weights = 1/np.abs(adata.obsm[predicted][genes].values - np.array(adata[:,genes].X)) 792 | weights = weights / np.nanmean(weights, axis=0) 793 | weights = postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform) 794 | else: 795 | raise Exception("weighting not recognized") 796 | 797 | # scaling 798 | if scale is True: 799 | X = StandardScaler().fit_transform(adata.obsm[predicted].values) 800 | else: 801 | X = adata.obsm[predicted].values 802 | 803 | # run weighted PCA 804 | if pca_method == "wpca": 805 | X_red = WPCA(n_components=n_components).fit_transform(X, weights=weights) 806 | elif pca_method == "empca": 807 | X_red = EMPCA(n_components=n_components).fit_transform(X, weights=weights) 808 | elif pca_method == "pca": 809 | X_red =
PCA(n_components=n_components).fit_transform(X) 810 | elif pca_method == "gwpca": # gene-weighted PCA 811 | weights = np.nanmean(weights, axis=0) 812 | X_red = PCA(n_components=n_components).fit_transform(X * weights) 813 | else: 814 | raise Exception("pca_method not recognized") 815 | 816 | # add PCs to adata 817 | adata.obsm[predicted+f"_PC{n_components}_{tag}"] = X_red 818 | 819 | if return_weights is True: 820 | return(weights) 821 | 822 | 823 | def postprocess_weights(weights, quantile_cutoff, replace_inf, binarize, binarize_ratio, log_transform): 824 | ''' 825 | Method for postprocessing weights (filter with cutoff, replace inf, etc) for weighted_PCA() 826 | 827 | Refer to weighted_pca() for details on arguments 828 | ''' 829 | # cutoff weights 830 | if quantile_cutoff is not None: 831 | cutoff = np.nanquantile(weights, quantile_cutoff) 832 | weights[np.isfinite(weights) & (weights >= cutoff)] = cutoff 833 | 834 | # log-transform 835 | if log_transform is True: 836 | weights = np.log1p(weights) 837 | 838 | # binarize weights 839 | if binarize is True: 840 | from skimage.filters import threshold_otsu 841 | cutoff = threshold_otsu(weights[np.isfinite(weights)]) 842 | weights[np.isfinite(weights) & (weights >= cutoff)] = 1 843 | weights[np.isfinite(weights) & (weights < cutoff)] = 1/binarize_ratio 844 | elif binarize is False: 845 | pass 846 | elif isinstance(binarize, float) or isinstance(binarize, int): 847 | cutoff = np.nanquantile(weights, binarize) 848 | weights[np.isfinite(weights) & (weights >= cutoff)] = 1 849 | weights[np.isfinite(weights) & (weights < cutoff)] = 1/binarize_ratio 850 | 851 | # deal with infs (from division by zero) 852 | if replace_inf == "max": 853 | weights[~np.isfinite(weights)] = np.nanmax(weights[np.isfinite(weights)]) 854 | elif replace_inf == "min": 855 | weights[~np.isfinite(weights)] = np.nanmin(weights[np.isfinite(weights)]) 856 | elif replace_inf == "mean": 857 | weights[~np.isfinite(weights)] = np.nanmean(weights[np.isfinite(weights)]) 858 | elif replace_inf == "median": 859 | weights[~np.isfinite(weights)] = np.nanmedian(weights[np.isfinite(weights)]) 860 | elif isinstance(replace_inf, float) or isinstance(replace_inf, int): 861 | weights[~np.isfinite(weights)] = replace_inf 862 | 863 | return(weights) 864 | 865 | 866 | def filtered_PCA(adata, imp_method, proportion=0.05, stratification=None, n_components=15, scale=True, normalize=False, 867 | tag="", return_keep_idxs=False): 868 | ''' 869 | Runs filtered PCA using the TISSUE cell filtering approach 870 | 871 | Parameters 872 | ---------- 873 | adata [AnnData] - should be the AnnData after running conformalize_prediction_interval() 874 | - must include in obsm: {imp_method}_predicted_expression, 875 | {imp_method}_predicted_expression_lo, 876 | {imp_method}_predicted_expression_hi 877 | imp_method [str] - specifies which imputation method to return PCA for (e.g. 
'knn', 'spage', 'tangram') 878 | proportion [float] - between 0 and 1; proportion of most uncertain cells to drop 879 | stratification [None or 1d numpy array] - array of values to stratify the drop by 880 | - same length as number of rows in X 881 | - if None, no stratification 882 | n_components [int] - number of principal components 883 | scale [bool] - whether to scale data with StandardScaler() before running PCA 884 | normalize [bool] - whether to normalize prediction interval width by the absolute predicted expression value 885 | tag [str] - additional tag to append to the obsm key for storing the PCs 886 | return_keep_idxs [bool] - whether to return the keep_idxs for filtering 887 | 888 | Returns 889 | ------- 890 | Stores the result in adata.obsm["{imp_method}_predicted_expression_PC{n_components}_{tag}"] 891 | Optionally returns the indices corresponding to the observations to keep after filtering 892 | ''' 893 | predicted = f"{imp_method}_predicted_expression" 894 | 895 | # get predicted expression matrices 896 | X = adata.obsm[predicted].values.copy() 897 | 898 | # get uncertainty (PI width) for filtering 899 | X_uncertainty = adata.obsm[f'{predicted}_hi'].values - adata.obsm[f'{predicted}_lo'].values 900 | if normalize is True: 901 | X_uncertainty = X_uncertainty / (1+np.abs(adata.obsm[f'{predicted}'].values)) 902 | 903 | # filter cells 904 | keep_idxs = detect_uncertain_cells(X_uncertainty, proportion=proportion, stratification=stratification) 905 | X_filtered = X[keep_idxs,:].copy() 906 | 907 | # scaling 908 | if scale is True: 909 | scaler = StandardScaler().fit(X_filtered) 910 | X = scaler.transform(X) 911 | X_filtered = scaler.transform(X_filtered) 912 | 913 | # run PCA 914 | pca = PCA(n_components=n_components).fit(X_filtered) 915 | X_red = pca.transform(X) 916 | X_red_filtered = pca.transform(X_filtered) 917 | 918 | # add PCs to adata 919 | adata.obsm[predicted+f"_PC{n_components}_{tag}"] = X_red 920 | adata.uns[predicted+f"_PC{n_components}_filtered_{tag}"] = X_red_filtered 921 | 922 | if return_keep_idxs is True: 923 | return (keep_idxs) 924 | 925 | 926 | 927 | def detect_uncertain_cells (X, proportion=0.05, stratification=None): 928 | ''' 929 | Method for dropping a portion of the most uncertain cells from the input. 
930 | 931 | Parameters 932 | ---------- 933 | X [2d numpy array] - array of uncertainty values 934 | proportion [float or "otsu"] - if a float between 0 and 1, proportion of most uncertain cells to drop; if "otsu", drops cells whose average z-score exceeds the Otsu threshold 935 | stratification [None or 1d numpy array] - array of values to stratify the drop by 936 | - same length as number of rows in X 937 | - if None, no stratification 938 | 939 | Returns 940 | ------- 941 | keep_idxs [list] - row indices to keep after dropping the most uncertain cells 942 | ''' 943 | from scipy.stats import zscore 944 | 945 | if stratification is not None: # drop cells within each strata independently 946 | 947 | drop_idxs = [] 948 | 949 | for strata in np.unique(stratification): 950 | 951 | # compute scores 952 | X_strat = X[stratification==strata,:].copy() # calc gene z-scores 953 | orig_idxs = np.arange(X.shape[0])[stratification==strata] 954 | cell_scores = np.nanmean(zscore(X_strat, axis=0), axis=1) # average z-score for each cell 955 | 956 | # determine cutoff score and indices to drop 957 | if (isinstance(proportion, float)) or (isinstance(proportion, int)): 958 | cutoff_idx = int(np.ceil(proportion*len(cell_scores))) # number of cells to drop 959 | strata_drop_idxs = np.argsort(cell_scores)[::-1][:cutoff_idx] 960 | elif proportion == "otsu": 961 | from skimage.filters import threshold_otsu 962 | cutoff = threshold_otsu(cell_scores) 963 | strata_drop_idxs = [i for i in range(len(cell_scores)) if cell_scores[i] > cutoff] 964 | else: 965 | raise Exception("proportion specified not valid") 966 | 967 | drop_idxs.append(orig_idxs[strata_drop_idxs]) # get idxs of highest scores 968 | 969 | drop_idxs = list(np.concatenate(drop_idxs)) 970 | 971 | else: 972 | 973 | # compute scores 974 | cell_scores = zscore(X, axis=0).mean(axis=1) # average z-score for each cell 975 | 976 | # determine cutoff score and indices to drop 977 | if (isinstance(proportion, float)) or (isinstance(proportion, int)): 978 | cutoff_idx = int(np.ceil(proportion*len(cell_scores))) # number of cells to drop 979 | drop_idxs = list(np.argsort(cell_scores)[::-1][:cutoff_idx]) # get idxs of highest scores 980 | elif proportion == "otsu": 981 | from skimage.filters import threshold_otsu 982 | cutoff = threshold_otsu(cell_scores) 983 | drop_idxs = [i for i in range(len(cell_scores)) if cell_scores[i] > cutoff] 984 | else: 985 | raise Exception("proportion specified not valid") 986 | 987 | # return keep indices (determined as indices not in drop indices) 988 | keep_idxs = [i for i in range(X.shape[0]) if i not in drop_idxs] 989 | 990 | return (keep_idxs) -------------------------------------------------------------------------------- /tissue/main.py: -------------------------------------------------------------------------------- 1 | # Contains main functions for core TISSUE pipeline: computing cell-centric variability and calibrated prediction intervals 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import scanpy as sc 7 | import squidpy as sq 8 | from sklearn.decomposition import PCA 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.cluster import KMeans 11 | from sklearn.model_selection import KFold, StratifiedKFold 12 | import anndata as ad 13 | import warnings 14 | import os 15 | 16 | 17 | def load_paired_datasets (spatial_counts, spatial_loc, RNAseq_counts, spatial_metadata = None, 18 | min_cell_prevalence_spatial = 0.0, min_cell_prevalence_RNAseq = 0.01, 19 | min_gene_prevalence_spatial = 0.0, min_gene_prevalence_RNAseq = 0.0): 20 | ''' 21 | Uses datasets in the 
format specified by Li et al. (2022) 22 | See: https://drive.google.com/drive/folders/1pHmE9cg_tMcouV1LFJFtbyBJNp7oQo9J 23 | 24 | Parameters 25 | ---------- 26 | spatial_counts [str] - path to spatial counts file; rows are cells 27 | spatial_loc [str] - path to spatial locations file; rows are cells 28 | RNAseq_counts [str] - path to RNAseq counts file; rows are genes 29 | spatial_metadata [None or str] - if not None, then path to spatial metadata file (will be read into spatial_adata.obs) 30 | min_cell_prevalence_spatial [float between 0 and 1] - minimum prevalence among cells to include gene in spatial anndata object, default=0 31 | min_cell_prevalence_RNAseq [float between 0 and 1] - minimum prevalence among cells to include gene in RNAseq anndata object, default=0.01 32 | min_gene_prevalence_spatial [float between 0 and 1] - minimum prevalence among genes to include cell in spatial anndata object, default=0 33 | min_gene_prevalence_RNAseq [float between 0 and 1] - minimum prevalence among genes to include cell in RNAseq anndata object, default=0 34 | 35 | Returns 36 | ------- 37 | spatial_adata, RNAseq_adata - AnnData objects with counts and location (if applicable) in metadata 38 | ''' 39 | # Spatial data loading 40 | spatial_adata = load_spatial_data (spatial_counts, 41 | spatial_loc, 42 | spatial_metadata = spatial_metadata, 43 | min_cell_prevalence_spatial = min_cell_prevalence_spatial, 44 | min_gene_prevalence_spatial = min_gene_prevalence_spatial) 45 | 46 | # RNAseq data loading 47 | RNAseq_adata = load_rnaseq_data (RNAseq_counts, 48 | min_cell_prevalence_RNAseq = min_cell_prevalence_RNAseq, 49 | min_gene_prevalence_RNAseq = min_gene_prevalence_RNAseq) 50 | 51 | return(spatial_adata, RNAseq_adata) 52 | 53 | 54 | def load_spatial_data (spatial_counts, spatial_loc, spatial_metadata=None, 55 | min_cell_prevalence_spatial = 0.0, min_gene_prevalence_spatial = 0.0): 56 | ''' 57 | Loads in spatial data from text files. 
58 | 59 | See load_paired_datasets() for details on arguments 60 | ''' 61 | # read in spatial counts 62 | df = pd.read_csv(spatial_counts,header=0,sep="\t") 63 | 64 | # filter lowly expressed genes 65 | cells_prevalence = np.mean(df.values>0, axis=0) 66 | df = df.loc[:,cells_prevalence > min_cell_prevalence_spatial] 67 | 68 | # filter sparse cells 69 | genes_prevalence = np.mean(df.values>0, axis=1) 70 | df = df.loc[genes_prevalence > min_gene_prevalence_spatial,:] 71 | 72 | # create AnnData 73 | spatial_adata = ad.AnnData(X=df, dtype='float64') 74 | spatial_adata.obs_names = df.index.values 75 | spatial_adata.obs_names = spatial_adata.obs_names.astype(str) 76 | spatial_adata.var_names = df.columns 77 | del df 78 | 79 | # add spatial locations 80 | locations = pd.read_csv(spatial_loc,header=0,delim_whitespace=True) 81 | spatial_adata.obsm["spatial"] = locations.loc[genes_prevalence > min_gene_prevalence_spatial, :].values 82 | 83 | # add metadata 84 | if spatial_metadata is not None: 85 | metadata_df = pd.read_csv(spatial_metadata) 86 | metadata_df = metadata_df.loc[genes_prevalence > min_gene_prevalence_spatial, :] 87 | metadata_df.index = spatial_adata.obs_names 88 | spatial_adata.obs = metadata_df 89 | 90 | # remove genes with nan values 91 | spatial_adata = spatial_adata[:,np.isnan(spatial_adata.X).sum(axis=0)==0].copy() 92 | 93 | # make unique obs_names and var_names 94 | spatial_adata.obs_names_make_unique() 95 | spatial_adata.var_names_make_unique() 96 | 97 | return (spatial_adata) 98 | 99 | 100 | def load_rnaseq_data (RNAseq_counts, min_cell_prevalence_RNAseq = 0.0, min_gene_prevalence_RNAseq = 0.0): 101 | ''' 102 | Loads in scRNAseq data from text files. 103 | 104 | See load_paired_datasets() for details on arguments 105 | ''' 106 | # read in RNAseq counts 107 | df = pd.read_csv(RNAseq_counts,header=0,index_col=0,sep="\t") 108 | 109 | # filter lowly expressed genes -- note that df is transposed gene x cell 110 | cells_prevalence = np.mean(df>0, axis=1) 111 | df = df.loc[cells_prevalence > min_cell_prevalence_RNAseq,:] 112 | del cells_prevalence 113 | 114 | # filter sparse cells 115 | genes_prevalence = np.mean(df>0, axis=0) 116 | df = df.loc[:,genes_prevalence > min_gene_prevalence_RNAseq] 117 | del genes_prevalence 118 | 119 | # create AnnData 120 | RNAseq_adata = ad.AnnData(X=df.T, dtype='float64') 121 | RNAseq_adata.obs_names = df.T.index.values 122 | RNAseq_adata.var_names = df.T.columns 123 | del df 124 | 125 | # remove genes with nan values 126 | RNAseq_adata = RNAseq_adata[:,np.isnan(RNAseq_adata.X).sum(axis=0)==0].copy() 127 | 128 | # make unique obs_names and var_names 129 | RNAseq_adata.obs_names_make_unique() 130 | RNAseq_adata.var_names_make_unique() 131 | 132 | return (RNAseq_adata) 133 | 134 | 135 | 136 | def preprocess_data (adata, standardize=False, normalize=False): 137 | ''' 138 | Preprocesses adata inplace: 139 | 1. sc.pp.normalize_total() if normalize is True 140 | 2. sc.pp.log1p() if normalize is True 141 | 3. 
Not recommended: standardize each gene (subtract mean, divide by standard deviation) 142 | 143 | Parameters 144 | ---------- 145 | standardize [Boolean] - whether to standardize genes; default is False 146 | normalize [Boolean] - whether to normalize data; default is False (based on finding by Li et al., 2022) 147 | 148 | Returns 149 | ------- 150 | Modifies adata in-place 151 | 152 | NOTE: Under current default settings for TISSUE, this method does nothing to adata 153 | ''' 154 | # normalize data 155 | if normalize is True: 156 | sc.pp.normalize_total(adata) 157 | sc.pp.log1p(adata) 158 | 159 | # standardize data 160 | if standardize is True: 161 | adata.X = np.divide(adata.X - np.mean(adata.X, axis=0), np.std(adata.X, axis=0)) 162 | 163 | 164 | def build_spatial_graph (adata, method="fixed_radius", spatial="spatial", radius=None, n_neighbors=20, set_diag=True): 165 | ''' 166 | Builds a spatial graph from AnnData according to specifications. Uses Squidpy implementations for building spatial graphs. 167 | 168 | Parameters 169 | ---------- 170 | adata [AnnData] - spatial data, must include adata.obsm[spatial] 171 | method [str]: 172 | - "radius" (all cells within radius are neighbors) 173 | - "delaunay" (triangulation) 174 | - "delaunay_radius" (triangulation with pruning by max radius) 175 | - "fixed" (the k-nearest cells are neighbors determined by n_neighbors) 176 | - "fixed_radius" (knn by n_neighbors with pruning by max radius; DEFAULT) 177 | spatial [str] - column name for adata.obsm to retrieve spatial coordinates 178 | radius [None or float/int] - radius around cell centers for which to detect neighbor cells; defaults to Q3+1.5*IQR of delaunay (or fixed for fixed_radius) neighbor distances 179 | n_neighbors [None or int] - number of neighbors to get for each cell (if method is "fixed" or "fixed_radius"); defaults to 20 180 | set_diag [True or False] - whether to have diagonal of 1 in adjacency (before normalization); False is identical to theory and True is more robust; defaults to True 181 | 182 | Returns 183 | ------- 184 | Modifies adata in-place 185 | ''' 186 | # delaunay graph 187 | if method == "delaunay": # triangulation only 188 | sq.gr.spatial_neighbors(adata, delaunay=True, coord_type="generic", set_diag=set_diag) 189 | 190 | # neighborhoods determined by fixed radius 191 | elif method == "radius": 192 | if radius is None: # compute Q3+1.5*IQR of delaunay triangulation distances as default radius 193 | sq.gr.spatial_neighbors(adata, delaunay=True, coord_type="generic") 194 | if isinstance(adata.obsp["spatial_distances"],np.ndarray): # numpy array 195 | dists = adata.obsp['spatial_distances'][np.nonzero(adata.obsp['spatial_distances'])] # get nonzero array 196 | else: # sparse matrix 197 | adata.obsp['spatial_distances'].eliminate_zeros() # remove hard-set zeros 198 | dists = adata.obsp['spatial_distances'].data # get non-zero values in sparse matrix 199 | radius = np.percentile(dists, 75) + 1.5*(np.percentile(dists, 75) - np.percentile(dists, 25)) 200 | # build graph 201 | sq.gr.spatial_neighbors(adata, radius=radius, coord_type="generic", set_diag=set_diag) 202 | 203 | # delaunay graph with removal of outlier edges with distance > radius 204 | elif method == "delaunay_radius": 205 | # build initial graph 206 | sq.gr.spatial_neighbors(adata, delaunay=True, coord_type="generic", set_diag=set_diag) 207 | if radius is None: # compute default radius as 75th percentile + 1.5*IQR 208 | if isinstance(adata.obsp["spatial_distances"],np.ndarray): # numpy array 209 | dists = adata.obsp['spatial_distances'][np.nonzero(adata.obsp['spatial_distances'])] # get nonzero array 210 | else: # sparse matrix 211 | adata.obsp['spatial_distances'].eliminate_zeros() # remove hard-set zeros 212 | dists = adata.obsp['spatial_distances'].data # get non-zero values in sparse matrix 213 | radius = np.percentile(dists, 75) + 1.5*(np.percentile(dists, 75) - np.percentile(dists, 25)) 214 | # prune edges by radius 215 | adata.obsp['spatial_connectivities'][adata.obsp['spatial_distances']>radius] = 0 216 | adata.obsp['spatial_distances'][adata.obsp['spatial_distances']>radius] = 0 217 | 218 | # fixed neighborhood size with removal of outlier edges with distance > radius 219 | elif method == "fixed_radius": 220 | # build initial graph 221 | sq.gr.spatial_neighbors(adata, n_neighs=n_neighbors, coord_type="generic", set_diag=set_diag) 222 | if radius is None: # compute default radius as 75th percentile + 1.5*IQR 223 | if isinstance(adata.obsp["spatial_distances"],np.ndarray): # numpy array 224 | dists = adata.obsp['spatial_distances'][np.nonzero(adata.obsp['spatial_distances'])] # get nonzero array 225 | else: # sparse matrix 226 | adata.obsp['spatial_distances'].eliminate_zeros() # remove hard-set zeros 227 | dists = adata.obsp['spatial_distances'].data # get non-zero values in sparse matrix 228 | radius = np.percentile(dists, 75) + 1.5*(np.percentile(dists, 75) - np.percentile(dists, 25)) 229 | # prune edges by radius 230 | adata.obsp['spatial_connectivities'][adata.obsp['spatial_distances']>radius] = 0 231 | adata.obsp['spatial_distances'][adata.obsp['spatial_distances']>radius] = 0 232 | 233 | # fixed neighborhood size 234 | elif method == "fixed": 235 | sq.gr.spatial_neighbors(adata, n_neighs=n_neighbors, coord_type="generic", set_diag=set_diag) 236 | 237 | else: 238 | raise Exception ("method not recognized") 239 | 240 | 
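# The default pruning radius computed above is the Tukey upper fence
# (Q3 + 1.5*IQR) of the observed neighbor distances. A minimal numeric sketch
# (values are made up):
#
#     import numpy as np
#     dists = np.array([1.0, 2.0, 2.5, 3.0, 10.0])   # hypothetical edge lengths
#     q1, q3 = np.percentile(dists, 25), np.percentile(dists, 75)
#     radius = q3 + 1.5 * (q3 - q1)                  # = 3.0 + 1.5*1.0 = 4.5
#     # the outlier edge of length 10.0 would be pruned from the graph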
241 | def load_spatial_graph(adata, npz_filepath, add_identity=True): 242 | ''' 243 | Reads in scipy sparse adjacency matrix from the specified npz_filepath and adds it to adata.obsp["spatial_connectivities"] 244 | 245 | Parameters 246 | ---------- 247 | add_identity [bool] - whether to add a diagonal of 1's to ensure compatibility with TISSUE (i.e. 
every cell has a self-connection) 248 | 249 | Returns 250 | ------- 251 | Modifies adata in-place 252 | 253 | If graph is weighted, then you should set weight="spatial_connectivities" in downstream TISSUE calls for cell-centric variability calculation 254 | ''' 255 | from scipy import sparse 256 | a = sparse.load_npz(npz_filepath) 257 | 258 | if add_identity is True: 259 | a += sparse.identity(a.shape[0]) # add identity matrix 260 | 261 | adata.obsp["spatial_connectivities"] = a 262 | 263 | print("If graph is weighted, then you should set weight='spatial_connectivities' in downstream call of conformalize_spatial_uncertainty()") 264 | 265 | 266 | def predict_gene_expression (spatial_adata, RNAseq_adata, 267 | target_genes, conf_genes=None, 268 | method="spage", n_folds=None, random_seed=444, **kwargs): 269 | ''' 270 | Leverages one of several methods to predict spatial gene expression from a paired spatial and scRNAseq dataset 271 | 272 | Parameters 273 | ---------- 274 | spatial_adata [AnnData] = spatial data 275 | RNAseq_adata [AnnData] = RNAseq data, RNAseq_adata.var_names should be superset of spatial_adata.var_names 276 | target_genes [list of str] = genes to predict spatial expression for; must be a subset of RNAseq_adata.var_names 277 | conf_genes [list of str] = genes in spatial_adata.var_names to use for confidence measures; Default is to use all genes in spatial_adata.var_names 278 | method [str] = baseline imputation method 279 | "knn" (uses average of k-nearest neighbors in RNAseq data on Harmony joint space) 280 | "spage" (SpaGE imputation by Abdelaal et al., 2020) 281 | "tangram" (Tangram cell positioning by Biancalani et al., 2021) 282 | "gimvi" (gimVI imputation via scvi-tools) 283 | n_folds [None or int] = number of cv folds to use for conf_genes, cannot exceed number of conf_genes, None is keeping each gene in its own fold 284 | random_seed [int] = used to seed the shuffling of conf_genes into folds (defaults to 444) 285 | 286 | Returns 287 | ------- 288 | Adds spatial_adata.obsm["{method}_predicted_expression"] [pandas DataFrame] 289 | - matrix of predicted gene expressions (same number of rows as spatial_adata; columns are target_genes followed by conf_genes); also records spatial_adata.uns["conf_genes_used"] and spatial_adata.uns["target_genes_used"] 290 | ''' 291 | # change all genes to lower 292 | target_genes = [t.lower() for t in target_genes] 293 | spatial_adata.var_names = [v.lower() for v in spatial_adata.var_names] 294 | RNAseq_adata.var_names = [v.lower() for v in RNAseq_adata.var_names] 295 | 296 | # drop duplicates if any (happens in Dataset14) 297 | if RNAseq_adata.var_names.duplicated().sum() > 0: 298 | RNAseq_adata = RNAseq_adata[:,~RNAseq_adata.var_names.duplicated()].copy() 299 | if spatial_adata.var_names.duplicated().sum() > 0: 300 | spatial_adata = spatial_adata[:,~spatial_adata.var_names.duplicated()].copy() 301 | 302 | # raise warning if any target_genes in spatial data already 303 | if any(x in target_genes for x in spatial_adata.var_names): 304 | warnings.warn("Some target_genes are already measured in the spatial_adata object!") 305 | 306 | # First pass over all genes using specified method 307 | if method == "knn": 308 | predicted_expression_target = knn_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 309 | elif method == "spage": 310 | predicted_expression_target = spage_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 311 | elif method == "gimvi": 312 | predicted_expression_target = gimvi_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 313 | elif method == "tangram": 314 | predicted_expression_target = 
tangram_impute(spatial_adata,RNAseq_adata,genes_to_predict=target_genes,**kwargs) 315 | else: 316 | raise Exception ("method not recognized") 317 | 318 | # Second pass over conf_genes with the specified method, using cross-validation 319 | 320 | if conf_genes is None: 321 | conf_genes = list(spatial_adata.var_names) 322 | conf_genes = [c.lower() for c in conf_genes] 323 | conf_genes_unique = [c for c in conf_genes if c not in target_genes] # removes any conf_genes also in target_genes 324 | if len(conf_genes_unique) < len(conf_genes): 325 | print("Found "+str(len(conf_genes)-len(conf_genes_unique))+" duplicate conf_genes in target_genes.") 326 | conf_genes_RNA = [c for c in conf_genes_unique if c in RNAseq_adata.var_names] # remove any conf genes not in RNAseq 327 | if len(conf_genes_RNA) < len(conf_genes_unique): 328 | print("Found "+str(len(conf_genes_unique)-len(conf_genes_RNA))+" conf_genes not in RNAseq_adata.") 329 | conf_genes = conf_genes_RNA 330 | 331 | # raise error if no conf_genes 332 | if len(conf_genes) == 0: 333 | raise Exception ("No suitable conf_genes specified!") 334 | 335 | # create folds if needed 336 | if n_folds is None: 337 | n_folds = len(conf_genes) 338 | elif n_folds > len(conf_genes): 339 | warnings.warn("n_folds in predict_gene_expression() is greater than length of conf_genes; using len(conf_genes) instead") 340 | n_folds = len(conf_genes) 341 | 342 | np.random.seed(random_seed) 343 | np.random.shuffle(conf_genes) 344 | folds = np.array_split(conf_genes, n_folds) 345 | 346 | # run prediction on each fold 347 | for gi, fold in enumerate(folds): 348 | if method == "knn": 349 | loo_expression = knn_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 350 | elif method == "spage": 351 | loo_expression = spage_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 352 | elif method == "gimvi": 353 | loo_expression = gimvi_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 354 | elif method == "tangram": 355 | loo_expression = tangram_impute(spatial_adata[:,~spatial_adata.var_names.isin(fold)],RNAseq_adata,genes_to_predict=list(fold)+target_genes,**kwargs) 356 | else: 357 | raise Exception ("method not recognized") 358 | 359 | # Update (sum expression across folds) 360 | if gi == 0: 361 | predicted_expression_conf = loo_expression.copy() 362 | else: 363 | predicted_expression_conf['index'] = range(predicted_expression_conf.shape[0]) 364 | loo_expression['index'] = range(loo_expression.shape[0]) 365 | # note: no set_index needed here; the groupby below operates on the temporary 366 | # 'index' column added above, keeping rows positionally aligned across folds 367 | predicted_expression_conf = pd.concat((predicted_expression_conf,loo_expression)).groupby(by="index").sum().reset_index().drop(columns=['index']) 368 | 369 | # Take average of target_genes (later overwritten by "all genes"-predicted) 370 | predicted_expression_conf[target_genes] = predicted_expression_conf[target_genes]/(len(conf_genes)) 371 | 372 | # Update spatial_adata 373 | predicted_expression_target.index = spatial_adata.obs_names 374 | predicted_expression_conf.index = spatial_adata.obs_names 375 | 376 | # gets predictions for target genes followed by conf genes 377 | predicted_expression_target[conf_genes] = predicted_expression_conf[conf_genes].copy() 378 | spatial_adata.obsm[method+"_predicted_expression"] = predicted_expression_target 379 | 380 | spatial_adata.uns["conf_genes_used"] = conf_genes 381 | spatial_adata.uns["target_genes_used"] = target_genes 382 | 383 | 
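# Usage sketch for predict_gene_expression() (illustrative; the gene names are
# placeholders, and the AnnData objects are assumed to come from
# load_paired_datasets() with shared, lowercase-compatible var_names):
#
#     predict_gene_expression(spatial_adata, RNAseq_adata,
#                             target_genes=["gene1", "gene2"],
#                             method="spage", n_folds=10)
#     pred = spatial_adata.obsm["spage_predicted_expression"]   # cells x (target + conf genes)
#     calib = spatial_adata.uns["conf_genes_used"]              # genes available for calibration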
384 | def knn_impute (spatial_adata, RNAseq_adata, genes_to_predict, n_neighbors, **kwargs): 385 | ''' 386 | Runs basic kNN imputation using Harmony subspace 387 | 388 | See predict_gene_expression() for details on arguments 389 | ''' 390 | from scanpy.external.pp import harmony_integrate 391 | from scipy.spatial.distance import cdist 392 | 393 | # combine anndatas 394 | intersection = np.intersect1d(spatial_adata.var_names, RNAseq_adata.var_names) 395 | subRNA = RNAseq_adata[:, intersection] 396 | subspatial = spatial_adata[:, intersection] 397 | joint_adata = ad.AnnData(X=np.vstack((subRNA.X,subspatial.X)), dtype='float32') 398 | joint_adata.obs_names = np.concatenate((subRNA.obs_names.values,subspatial.obs_names.values)) 399 | joint_adata.var_names = subspatial.var_names.values 400 | joint_adata.obs["batch"] = ["rna"]*len(subRNA.obs_names.values)+["spatial"]*len(spatial_adata.obs_names.values) 401 | 402 | # run Harmony 403 | sc.tl.pca(joint_adata) 404 | harmony_integrate(joint_adata, 'batch', verbose=False) 405 | 406 | # kNN imputation (distances between spatial and RNAseq cells in up to 30 Harmony PCs) 407 | knn_mat = cdist(joint_adata[joint_adata.obs["batch"] == "spatial"].obsm['X_pca_harmony'][:,:np.min([30,joint_adata.obsm['X_pca_harmony'].shape[1]])], 408 | joint_adata[joint_adata.obs["batch"] == "rna"].obsm['X_pca_harmony'][:,:np.min([30,joint_adata.obsm['X_pca_harmony'].shape[1]])]) 409 | k_dist_threshold = np.sort(knn_mat)[:, n_neighbors-1] 410 | knn_mat[knn_mat > k_dist_threshold[:,np.newaxis]] = 0 # sets all dist > thresh to 0 411 | knn_mat[knn_mat > 0] = 1 # 1 for connection to a nn 412 | row_sums = knn_mat.sum(axis=1) 413 | knn_mat = knn_mat / row_sums[:,np.newaxis] 414 | predicted_expression = knn_mat @ RNAseq_adata.X 415 | 416 | predicted_expression = pd.DataFrame(predicted_expression, columns=RNAseq_adata.var_names.values) 417 | predicted_expression = predicted_expression[genes_to_predict] 418 | 419 | return(predicted_expression) 420 | 421 | 422 | def spage_impute (spatial_adata, RNAseq_adata, genes_to_predict, **kwargs): 423 | ''' 424 | Runs SpaGE gene imputation 425 | 426 | See predict_gene_expression() for details on arguments 427 | ''' 428 | # absolute-import alternative: from tissue.SpaGE.main import SpaGE 429 | from .SpaGE.main import SpaGE 430 | 431 | # transform adata into SpaGE input data format 432 | if isinstance(spatial_adata.X,np.ndarray): 433 | spatial_data = pd.DataFrame(spatial_adata.X.T) 434 | else: 435 | spatial_data = pd.DataFrame(spatial_adata.X.T.toarray()) 436 | spatial_data.index = spatial_adata.var_names.values 437 | if isinstance(RNAseq_adata.X,np.ndarray): # convert to array if needed 438 | RNAseq_data = pd.DataFrame(RNAseq_adata.X.T) 439 | else: 440 | RNAseq_data = pd.DataFrame(RNAseq_adata.X.T.toarray()) 441 | RNAseq_data.index = RNAseq_adata.var_names.values 442 | 443 | # predict with SpaGE 444 | predicted_expression = SpaGE(spatial_data.T,RNAseq_data.T,genes_to_predict=genes_to_predict,**kwargs) 445 | 446 | return(predicted_expression) 447 | 448 | 449 | def tangram_impute (spatial_adata, RNAseq_adata, genes_to_predict, **kwargs): 450 | ''' 451 | Run Tangram gene imputation (positioning) using the more efficient cluster-level approach with Leiden clustering 452 | 453 | See predict_gene_expression() for details on arguments 454 | ''' 455 | import torch 456 | from torch.nn.functional import softmax, cosine_similarity, sigmoid 457 | import tangram as tg 458 | 459 | # clustering and preprocessing (on a copy, so the caller's RNAseq_adata is not altered) 460 | RNAseq_adata_label = RNAseq_adata.copy() 461 | 
sc.pp.highly_variable_genes(RNAseq_adata_label) 462 | RNAseq_adata_label = RNAseq_adata[:, RNAseq_adata_label.var.highly_variable].copy() 463 | sc.pp.scale(RNAseq_adata_label, max_value=10) 464 | sc.tl.pca(RNAseq_adata_label) 465 | sc.pp.neighbors(RNAseq_adata_label) 466 | sc.tl.leiden(RNAseq_adata_label, resolution = 0.5) 467 | RNAseq_adata.obs['leiden'] = RNAseq_adata_label.obs.leiden 468 | del RNAseq_adata_label 469 | tg.pp_adatas(RNAseq_adata, spatial_adata) # genes=None default uses all genes shared between the two datasets 470 | 471 | # gene projection onto spatial 472 | ad_map = tg.map_cells_to_space(RNAseq_adata, spatial_adata, mode='clusters', cluster_label='leiden', density_prior='rna_count_based', verbose=False) 473 | ad_ge = tg.project_genes(ad_map, RNAseq_adata, cluster_label='leiden') 474 | predicted_expression = pd.DataFrame(ad_ge[:,genes_to_predict].X, index=ad_ge[:,genes_to_predict].obs_names, columns=ad_ge[:,genes_to_predict].var_names) 475 | 476 | return(predicted_expression) 477 | 478 | 479 | def gimvi_impute (spatial_adata, RNAseq_adata, genes_to_predict, **kwargs): 480 | ''' 481 | Run gimVI gene imputation 482 | 483 | See predict_gene_expression() for details on arguments 484 | ''' 485 | import scvi 486 | from scvi.external import GIMVI 487 | 488 | # preprocessing of data 489 | spatial_adata = spatial_adata[:, spatial_adata.var_names.isin(RNAseq_adata.var_names)].copy() 490 | predict_idxs = [list(RNAseq_adata.var_names).index(gene) for gene in genes_to_predict] 491 | spatial_dim0 = spatial_adata.shape[0] 492 | 493 | # indices for filtering out zero-expression cells 494 | filtered_cells_spatial = (spatial_adata.X.sum(axis=1) > 1) 495 | filtered_cells_RNAseq = (RNAseq_adata.X.sum(axis=1) > 1) 496 | 497 | # make copies of subsets 498 | spatial_adata = spatial_adata[filtered_cells_spatial,:].copy() 499 | RNAseq_adata = RNAseq_adata[filtered_cells_RNAseq,:].copy() 500 | 501 | # setup anndata for scvi 502 | GIMVI.setup_anndata(spatial_adata) 503 | GIMVI.setup_anndata(RNAseq_adata) 504 | 505 | # train gimVI model 506 | model = GIMVI(RNAseq_adata, spatial_adata, generative_distributions=['nb', 'nb'], **kwargs) # 'nb' tends to be less buggy 507 | model.train(200) 508 | 509 | # apply trained model for imputation 510 | _, imputation = model.get_imputed_values(normalized=False) 511 | imputed = imputation[:, predict_idxs] 512 | predicted_expression = np.zeros((spatial_dim0, imputed.shape[1])) 513 | predicted_expression[filtered_cells_spatial,:] = imputed 514 | predicted_expression = pd.DataFrame(predicted_expression, columns=genes_to_predict) 515 | 516 | return(predicted_expression) 517 | 518 | 
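# The cell-centric variability computed by the next two functions weights each
# graph neighbor j of cell i by exp(cosine_similarity(x_i, x_j)), evaluated on
# the predicted expression profiles (or their top PCs when weight_n_pc is set).
# A minimal sketch of that weighting for one cell with two neighbors (values are
# made up):
#
#     import numpy as np
#     from sklearn.metrics.pairwise import cosine_similarity
#     center = np.array([[1.0, 0.0, 2.0]])       # predicted profile of cell i
#     neighbors = np.array([[0.9, 0.1, 2.1],     # similar neighbor
#                           [0.0, 3.0, 0.1]])    # dissimilar neighbor
#     weights = np.exp(cosine_similarity(center, neighbors)).flatten()
#     # similar neighbors approach e^1 ~ 2.72; near-orthogonal ones approach e^0 = 1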
519 | def conformalize_spatial_uncertainty (adata, predicted, calib_genes, weight='exp_cos', add_one=True, 520 | grouping_method=None, k='auto', k2='auto', n_pc=None, n_pc2=None, weight_n_pc=10): 521 | ''' 522 | Generates cell-centric variability and then performs stratified grouping and conformal score calculation 523 | 524 | Parameters 525 | ---------- 526 | adata - AnnData object with adata.obsm[predicted] and adata.obsp['spatial_connectivities'] 527 | predicted [str] - string corresponding to key in adata.obsm that contains the predicted transcript expression 528 | calib_genes [list or np.1darray] - strings corresponding to the genes to use in calibration 529 | weight [str] - weights to use when computing spatial variability (either 'exp_cos' or 'spatial_connectivities') 530 | add_one [bool] - whether to add an intercept term of one to the spatial standard deviation 531 | weight_n_pc [None or int] - if not None, then specifies number of top principal components to use for weight calculation if weight is 'exp_cos' (default is 10) 532 | For grouping_method [str], k [int>0 or 'auto'], k2 [None or int>0 or 'auto'], n_pc [None or int>0], n_pc2 [None or int>0]; refer to get_grouping() 533 | 534 | Returns 535 | ------- 536 | Saves the uncertainty in adata.obsm[predicted+"_uncertainty"] 537 | Saves the scores in adata.obsm[predicted+"_score"] 538 | Saves an upper and lower bound in adata.obsm[predicted+"_lo"/"_hi"] 539 | ''' 540 | # get spatial uncertainty and add to annotations 541 | scores, residuals, G_stdev, G = get_spatial_uncertainty_scores(adata, predicted, calib_genes, 542 | weight=weight, 543 | add_one=add_one, 544 | weight_n_pc=weight_n_pc) 545 | 546 | adata.obsm[predicted+"_uncertainty"] = pd.DataFrame(G_stdev, 547 | columns=adata.obsm[predicted].columns, 548 | index=adata.obsm[predicted].index) 549 | adata.obsm[predicted+"_score"] = pd.DataFrame(scores, 550 | columns=calib_genes, 551 | index=adata.obsm[predicted].index) 552 | adata.obsm[predicted+"_error"] = pd.DataFrame(residuals, 553 | columns=calib_genes, 554 | index=adata.obsm[predicted].index) 555 | 556 | # define group 557 | if grouping_method is None: 558 | groups = np.zeros(G.shape); k_final = k2_final = 1 # single stratum (keeps the uns entries below defined) 559 | else: 560 | groups, k_final, k2_final = get_grouping(G, method=grouping_method, k=k, k2=k2, n_pc=n_pc, n_pc2=n_pc2) 561 | 562 | # add grouping and k-values to anndata 563 | adata.obsm[predicted+"_groups"] = groups 564 | adata.uns[predicted+"_kg"] = k_final 565 | adata.uns[predicted+"_kc"] = k2_final 566 | 567 | 568 | def get_spatial_uncertainty_scores (adata, predicted, calib_genes, weight='exp_cos', 569 | add_one=True, weight_n_pc=None): 570 | ''' 571 | Computes spatial uncertainty scores (i.e. 
cell-centric variability) 572 | 573 | Parameters 574 | ---------- 575 | adata - AnnData object with adata.obsm[predicted] and adata.obsp['spatial_connectivities'] 576 | predicted [str] - string corresponding to key in adata.obsm that contains the predicted transcript expression 577 | calib_genes [list or np.1darray] - strings corresponding to the genes to use in calibration 578 | weight [str] - weights to use when computing spatial variability (either 'exp_cos' or 'spatial_connectivities') 579 | - 'spatial_connectivities' will use values in adata.obsp['spatial_connectivities'] 580 | add_one [bool] - whether to add one to the uncertainty 581 | weight_n_pc [None or int] - if not None, then specifies number of top principal components to use for weight calculation if weight is 'exp_cos' (default is None) 582 | 583 | Returns 584 | ------- 585 | scores - spatial uncertainty scores for all calib_genes 586 | residuals - prediction errors matching scores dimensions 587 | G_stdev - spatial standard deviations measured; same shape as adata.obsm[predicted] 588 | G - adata.obsm[predicted].values 589 | ''' 590 | if weight not in ["exp_cos", "spatial_connectivities"]: 591 | raise Exception('weight not recognized') 592 | 593 | if 'spatial_connectivities' not in adata.obsp.keys(): 594 | raise Exception ("'spatial_connectivities' not found in adata.obsp and is required") 595 | 596 | # init prediction array and uncertainties array 597 | A = adata.obsp['spatial_connectivities'] 598 | A.eliminate_zeros() 599 | G = adata.obsm[predicted].values.copy() 600 | G_stdev = np.zeros_like(G) 601 | 602 | # init for exp_cos weighting 603 | if weight == "exp_cos": 604 | from sklearn.metrics.pairwise import cosine_similarity 605 | if weight_n_pc is not None: # perform PCA first and then compute cosine weights from PCs 606 | G_pca = StandardScaler().fit_transform(G) 607 | G_pca = PCA(n_components=weight_n_pc, random_state=444).fit_transform(G_pca) 608 | 609 | # compute cell-centric variability 610 | for i in range(G.shape[0]): # iterate cells 611 | 612 | # get its neighbors only 613 | cell_idxs = np.nonzero(A[i,:])[1] 614 | c_idx = np.where(cell_idxs==i)[0][0] # center idx in subsetted array (requires a self-edge, i.e. set_diag=True in build_spatial_graph) 615 | 616 | # compute weights for cell neighbors 617 | if weight == "exp_cos": # use TISSUE cosine similarity weighting 618 | if weight_n_pc is not None: # perform PCA first and then compute cosine weights from PCs 619 | cos_weights = cosine_similarity(G_pca[i,:].reshape(1,-1), G_pca[cell_idxs,:]) 620 | else: # compute cosine weights from gene expression 621 | cos_weights = cosine_similarity(G[i,:].reshape(1,-1), G[cell_idxs,:]) 622 | weights = np.exp(cos_weights).flatten() 623 | 624 | elif weight == "spatial_connectivities": # use preset weights 625 | weights = A[i,cell_idxs].toarray().flatten() 626 | weights[np.isnan(weights)] = 0 627 | 628 | else: # set uniform weights 629 | weights = np.ones(len(cell_idxs)) 630 | 631 | # compute CCV for each gene 632 | nA_std = [] 633 | for j in range(G.shape[1]): # iterate genes 634 | 635 | # get expression of gene for cell and neighbors 636 | expression_vec = G[cell_idxs,j] 637 | 638 | # compute CCV for cell 639 | nA_std.append(cell_centered_variability(expression_vec, weights=weights, c_idx=c_idx)) 640 | 641 | nA_std = np.array(nA_std) 642 | 643 | # add one if specified 644 | if add_one is True: 645 | nA_std += 1 646 | 647 | # update G_stdev with uncertainties 648 | G_stdev[i,:] = nA_std 649 | 650 | # compute scores based on confidence genes (prediction residuals) 651 | calib_idxs = [np.where(adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 652 | residuals = adata[:, calib_genes].X - adata.obsm[predicted][calib_genes].values # Y-G 653 | 654 | warnings.filterwarnings("ignore", category=RuntimeWarning) # suppress RuntimeWarning for division by zero 655 | scores = np.abs(residuals) / G_stdev[:, calib_idxs] # scores 656 | warnings.filterwarnings("default", category=RuntimeWarning) 657 | 658 | return(scores, residuals, G_stdev, G) 659 | 660 | 
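# The calibration score above is |Y - G| / CCV per cell and confidence gene
# (with the +1 intercept already folded into G_stdev when add_one=True).
# A minimal numeric sketch for one cell and two genes (values are made up):
#
#     import numpy as np
#     y_true = np.array([3.0, 0.5])            # measured expression
#     y_pred = np.array([2.0, 1.5])            # predicted expression
#     ccv = np.array([2.0, 4.0])               # cell-centric variability (+1 included)
#     scores = np.abs(y_true - y_pred) / ccv   # -> array([0.5, 0.25])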
661 | def cell_centered_variability (values, weights, c_idx): 662 | ''' 663 | Takes in an array and weights to compute cell-centric variability: 664 | 665 | Parameters 666 | ---------- 667 | values [1d arr] - array with cell's masked neighborhood expression (non-neighbors are nan) 668 | weights [1d arr] - same dim as values; contains weights for computing CCV_c 669 | c_idx [int] - index for which element of nA corresponds to center cell 670 | 671 | Returns 672 | ------- 673 | ccv [float] - cell-centric variability 674 | ''' 675 | values_f = values[np.isfinite(values)] 676 | weights_f = weights[np.isfinite(values)] 677 | average = values[c_idx] # "average" is simply the center cell value 678 | variance = np.average((values_f-average)**2, weights=weights_f) 679 | ccv = np.sqrt(variance) 680 | 681 | return(ccv) 682 | 683 | 684 | def get_spatial_uncertainty_scores_from_metadata(adata, predicted): 685 | ''' 686 | Returns scores, residuals, G_stdev, G (outputs of get_spatial_uncertainty_scores) from precomputed entries 687 | in the AnnData (adata) object. Note, these must have been computed and saved in the same way as in 688 | conformalize_spatial_uncertainty(). 689 | 690 | Parameters 691 | ---------- 692 | adata [AnnData] - object that has saved results in obsm 693 | predicted [str] - key for predictions in obsm 694 | 695 | Returns 696 | ------- 697 | scores - array of calibration scores [cell x gene] 698 | residuals - prediction error [cell x gene] 699 | G_stdev - array of cell-centric variability measures [cell x gene] 700 | G - array of predicted expression [cell x gene]; groups - array of indices for group assignment [cell x gene] 701 | ''' 702 | scores = np.array(adata.obsm[predicted+"_score"]).copy() 703 | residuals = np.array(adata.obsm[predicted+"_error"]).copy() 704 | G_stdev = np.array(adata.obsm[predicted+"_uncertainty"]).copy() 705 | G = np.array(adata.obsm[predicted]).copy() 706 | groups = np.array(adata.obsm[predicted+"_groups"]).copy() 707 | 708 | return(scores, residuals, G_stdev, G, groups) 709 | 710 | 711 | def get_grouping(G, method, k='auto', k2='auto', min_samples=5, n_pc=None, n_pc2=None): 712 | ''' 713 | Given the predicted gene expression matrix G (rows=cells, cols=genes), 714 | creates a grouping of the different genes (or cells) determined by: 715 | 716 | Parameters 717 | ---------- 718 | G [numpy matrix/array] - predicted gene expression; columns are genes 719 | method [str] - 'kmeans_gene_cell' to separate by genes and then by cells via k-means clustering (currently the only implemented method) 720 | k [int or 'auto'] - number of gene groups 721 | if <=1 then defaults to one group including all values 722 | k2 [int or 'auto'] - second number of groups (cell groups within each gene group) for kmeans_gene_cell 723 | if <=1 then defaults to one group including all values 724 | min_samples [int] - min number of samples; reserved for dbscan-style clustering (unused by 'kmeans_gene_cell') 725 | n_pc and n_pc2 [None or int] - number of PCs to use before KMeans clustering 726 | - NOTE: It is recommended to do this for "kmeans_gene_cell" 727 | 728 | Returns 729 | ------- 730 | groups [numpy array] - same 
dimension as G with values corresponding to group number (integer) 731 | ''' 732 | # for auto k searches 733 | k_list = [2,3,4] 734 | 735 | # grouping by genes then by cells 736 | if method == "kmeans_gene_cell": 737 | 738 | ### Gene grouping 739 | X = StandardScaler().fit_transform(G.T) 740 | if n_pc is not None: 741 | X = PCA(n_components=n_pc, random_state=444).fit_transform(X) 742 | # if "auto", then select best k (k_gene) 743 | if k == 'auto': 744 | k = get_best_k(X, k_list) 745 | # group genes 746 | if k > 1: 747 | kmeans_genes = KMeans(n_clusters=k, random_state=444).fit(X) 748 | cluster_genes = kmeans_genes.labels_ 749 | else: 750 | cluster_genes = np.zeros(X.shape[0]) 751 | 752 | # set up groups 753 | groups = np.ones(G.shape)*np.nan # init groups array 754 | counter = 0 # to index new groups with integers 755 | 756 | ### Cell grouping 757 | # if "auto", then select best k2 (k_cell) 758 | if k2 == 'auto': 759 | X = StandardScaler().fit_transform(G) 760 | if n_pc2 is not None: 761 | X = PCA(n_components=n_pc2, random_state=444).fit_transform(X) 762 | k2 = get_best_k(X, k_list) 763 | # within each gene group, group cells 764 | for cg in np.unique(cluster_genes): 765 | if k2 > 1: # group if more than one cell group needed 766 | G_group = G[:, cluster_genes==cg] 767 | X_group = StandardScaler().fit_transform(G_group) 768 | if n_pc2 is not None: 769 | X_group = PCA(n_components=n_pc2, random_state=444).fit_transform(X_group) 770 | kmeans_cells = KMeans(n_clusters=k2, random_state=444).fit(X_group) 771 | cluster_cells = kmeans_cells.labels_ 772 | else: # set same labels for all cells 773 | cluster_cells = np.zeros(G.shape[0]) 774 | # assign cell-gene stratified groupings 775 | for cc in np.unique(cluster_cells): 776 | groups[np.ix_(cluster_cells==cc, cluster_genes==cg)] = counter 777 | counter += 1 778 | 779 | else: 780 | raise Exception("method for get_grouping() is not recognized") 781 | 782 | return(groups, k, k2) 783 | 784 | 785 | def get_best_k (X, k_list): 786 | ''' 787 | Given a matrix X to perform KMeans clustering and list of k parameter values, 788 | searches for the best k value 789 | 790 | k_list should be in ascending order since get_best_k will terminate once the 791 | silhouette score decreases 792 | 793 | Parameters 794 | ---------- 795 | X - array to perform K-means clustering on 796 | k_list - list of positive integers for number of clusters to use 797 | 798 | Returns 799 | ------- 800 | best_k [int] - k value that returns the highest silhouette score 801 | ''' 802 | from sklearn.metrics import silhouette_score 803 | 804 | # init search 805 | current_best = -np.inf 806 | best_k = 1 807 | 808 | # search along k_list 809 | for k in k_list: 810 | kmeans = KMeans(n_clusters=k, random_state=444).fit(X) 811 | score = silhouette_score(X, kmeans.labels_) 812 | if score > current_best: # update if score increases 813 | current_best = score 814 | best_k = k 815 | else: # stop if score decreases 816 | break 817 | 818 | return(best_k) 819 | 820 | 821 | 822 | def conformalize_prediction_interval (adata, predicted, calib_genes, alpha_level=0.33, symmetric=True, return_scores_dict=False, compute_wasserstein=False): 823 | ''' 824 | Builds conformal prediction interval sets for the predicted gene expression 825 | 826 | Parameters 827 | ---------- 828 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 829 | predicted [str] - key in adata.obsm that corresponds to predicted gene expression 830 | calib_genes [list or arr of str] - names of the 
genes in adata.var_names that are used in the calibration set 831 | alpha_level [float] - between 0 and 1; determines the alpha level; the CI will span the (1-alpha_level) interval 832 | default value is alpha_level = 0.33 corresponding to 67% CI 833 | symmetric [bool] - whether to report symmetric prediction intervals or non-symmetric intervals; default is True (symmetric) 834 | return_scores_dict [bool] - whether to return the scores dictionary 835 | compute_wasserstein [bool] - whether to compute the Wasserstein distance of the score distributions between each subgroup and its calibration set 836 | - added to adata.obsm["{predicted}_wasserstein"] 837 | 838 | Returns 839 | ------- 840 | Modifies adata in-place 841 | Optionally returns the scores_flattened_dict (dictionary containing calibration scores and group assignments) 842 | ''' 843 | # get uncertainties and scores from saved adata 844 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 845 | 846 | ### Building calibration sets for scores 847 | 848 | scores_flattened_dict = build_calibration_scores(adata, predicted, calib_genes, symmetric=symmetric) 849 | 850 | ### Building prediction intervals 851 | 852 | prediction_sets = (np.zeros(G.shape), np.zeros(G.shape)) # init prediction sets 853 | 854 | if compute_wasserstein is True: # set up matrix to store Wasserstein distances 855 | from scipy.stats import wasserstein_distance 856 | score_dist_wasserstein = np.ones(G.shape).astype(G.dtype)*np.nan 857 | 858 | # conformalize independently within groups of genes 859 | for group in np.unique(groups[~np.isnan(groups)]): 860 | 861 | # for symmetric intervals (default) 862 | if symmetric is True: 863 | scores_flattened = scores_flattened_dict[str(group)] # flatten scores 864 | n = len(scores_flattened) 865 | if (n < 100): # if fewer than 100 samples in the group set, then use the full calibration set 866 | scores_flattened = scores_flattened_dict[str(np.nan)] 867 | n = len(scores_flattened)-np.isnan(scores_flattened).sum() 868 | try: 869 | qhat = np.nanquantile(scores_flattened, np.ceil((n+1)*(1-alpha_level))/n) 870 | except Exception: 871 | qhat = np.nan 872 | prediction_sets[0][groups==group] = (G-G_stdev*qhat)[groups==group] # lower bound 873 | prediction_sets[1][groups==group] = (G+G_stdev*qhat)[groups==group] # upper bound 874 | 875 | # for asymmetric intervals 876 | else: 877 | scores_lo_flattened = scores_flattened_dict[str(group)][0] 878 | scores_hi_flattened = scores_flattened_dict[str(group)][1] 879 | n_lo = len(scores_lo_flattened)-np.isnan(scores_lo_flattened).sum() 880 | n_hi = len(scores_hi_flattened)-np.isnan(scores_hi_flattened).sum() 881 | # compute qhat for lower and upper bounds 882 | if (n_lo < 100) or (n_hi < 100): # if fewer than 100 samples in either set, then use the full calibration set 883 | scores_lo_flattened = scores_flattened_dict[str(np.nan)][0] 884 | scores_hi_flattened = scores_flattened_dict[str(np.nan)][1] 885 | n_lo = len(scores_lo_flattened)-np.isnan(scores_lo_flattened).sum() 886 | n_hi = len(scores_hi_flattened)-np.isnan(scores_hi_flattened).sum() 887 | try: 888 | qhat_lo = np.nanquantile(scores_lo_flattened, np.ceil((n_lo+1)*(1-alpha_level))/n_lo) 889 | qhat_hi = np.nanquantile(scores_hi_flattened, np.ceil((n_hi+1)*(1-alpha_level))/n_hi) 890 | except Exception: 891 | qhat_lo = np.nan 892 | qhat_hi = np.nan 893 | # compute bounds of prediction interval 894 | prediction_sets[0][groups==group] = (G-G_stdev*qhat_lo)[groups==group] # lower bound 895 | prediction_sets[1][groups==group] = 
(G+G_stdev*qhat_hi)[groups==group] # upper bound 896 | 897 | # Wasserstein distances 898 | if compute_wasserstein is True: 899 | # set up mask for calibration genes 900 | calib_idxs = [np.where(adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 901 | calib_mask = np.full(G_stdev.shape, False) 902 | calib_mask[:,calib_idxs] = True 903 | # get CCV measures 904 | v = G_stdev[(groups==group)&~(calib_mask)].flatten() # group CCV 905 | if len(v) > 0: # skip if no observations in group 906 | if symmetric is True: 907 | if n < 100: 908 | u = G_stdev[calib_mask].flatten() # calibration CCV 909 | else: 910 | u = G_stdev[(groups==group)&(calib_mask)].flatten() # calibration CCV 911 | else: 912 | if (n_lo < 100) or (n_hi < 100): 913 | u = G_stdev[calib_mask].flatten() # calibration CCV 914 | else: 915 | u = G_stdev[(groups==group)&(calib_mask)].flatten() # calibration CCV 916 | # calculate wasserstein distance for the CCV distributions 917 | score_dist_wasserstein[groups==group] = wasserstein_distance(u, v).astype(G.dtype) 918 | 919 | # add prediction intervals to adata 920 | adata.uns['alpha'] = alpha_level 921 | adata.obsm[predicted+"_lo"] = pd.DataFrame(prediction_sets[0], 922 | columns=adata.obsm[predicted].columns, 923 | index=adata.obsm[predicted].index) 924 | adata.obsm[predicted+"_hi"] = pd.DataFrame(prediction_sets[1], 925 | columns=adata.obsm[predicted].columns, 926 | index=adata.obsm[predicted].index) 927 | # add wasserstein distances to adata 928 | if compute_wasserstein is True: 929 | adata.obsm[predicted+"_wasserstein"] = pd.DataFrame(score_dist_wasserstein, 930 | columns=adata.obsm[predicted].columns, 931 | index=adata.obsm[predicted].index) 932 | 933 | 934 | if return_scores_dict is True: 935 | 936 | return(scores_flattened_dict) 937 | 938 | 939 | 
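# conformalize_prediction_interval() above takes qhat as the
# ceil((n+1)*(1-alpha))/n empirical quantile of the calibration scores and sets
# the interval to G +/- CCV*qhat. A numeric sketch with n=9 scores and alpha=0.33:
#
#     import numpy as np
#     scores = np.arange(1.0, 10.0)              # 9 calibration scores: 1..9
#     n, alpha = len(scores), 0.33
#     q = np.ceil((n + 1) * (1 - alpha)) / n     # = ceil(6.7)/9 = 7/9
#     qhat = np.nanquantile(scores, q)           # ~ 7.22 with linear interpolation
#     # lower, upper = G - ccv * qhat, G + ccv * qhat   (elementwise per cell/gene)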
940 | def build_calibration_scores (adata, predicted, calib_genes, symmetric=False, include_zero_scores=False, 941 | trim_quantiles=[None,None]): 942 | ''' 943 | Builds calibration score sets 944 | 945 | Parameters 946 | ---------- 947 | adata [AnnData] - contains adata.obsm[predicted] corresponding to the predicted gene expression 948 | predicted [str] - key in adata.obsm with predicted gene expression values 949 | calib_genes [list or arr of str] - names of the genes in adata.var_names that are used in the calibration set 950 | symmetric [bool] - whether to have symmetric (or non-symmetric) prediction intervals 951 | include_zero_scores [bool] - whether to include zero-residual scores (if False, they are excluded) 952 | trim_quantiles [list of len 2; None or float between 0 and 1] - specifies what quantile range of scores to trim to; None implies no bounds 953 | 954 | Returns 955 | ------- 956 | scores_flattened_dict - dictionary containing the calibration scores for each stratified group 957 | ''' 958 | 959 | # get uncertainties and scores from saved adata 960 | scores, residuals, G_stdev, G, groups = get_spatial_uncertainty_scores_from_metadata (adata, predicted) 961 | 962 | scores_flattened_dict = {} 963 | 964 | # get calibration genes 965 | calib_idxs = [np.where(adata.obsm[predicted].columns==gene)[0][0] for gene in calib_genes] 966 | 967 | # iterate groups and build conformal sets of calibration scores 968 | for group in np.unique(groups[~np.isnan(groups)]): 969 | if (np.isnan(group)) or (group not in groups[:, calib_idxs]): # defer to using full calibration set 970 | scores_group = scores.copy() 971 | residuals_group = residuals.copy() 972 | else: # for groups that are found in the calibration set, build group-specific sets 973 | scores_group = scores.copy()[groups[:, calib_idxs]==group] 974 | residuals_group = residuals.copy()[groups[:, calib_idxs]==group] 975 | if symmetric is True: # symmetric calibration set 976 | if include_zero_scores is False: 977 | scores_flattened = scores_group[residuals_group != 0].flatten() # exclude zeros -- empirically this way is fastest 978 | else: 979 | scores_flattened = scores_group.flatten() 980 | scores_flattened_dict[str(group)] = scores_flattened[np.isfinite(scores_flattened)] # add to dict 981 | else: # separate into hi/lo non-symmetric calibration sets 982 | if include_zero_scores is False: 983 | scores_lo_flattened = scores_group[residuals_group < 0].flatten() 984 | scores_hi_flattened = scores_group[residuals_group > 0].flatten() 985 | else: 986 | scores_lo_flattened = scores_group[residuals_group <= 0].flatten() 987 | scores_hi_flattened = scores_group[residuals_group >= 0].flatten() 988 | scores_flattened_dict[str(group)] = (scores_lo_flattened[np.isfinite(scores_lo_flattened)], 989 | scores_hi_flattened[np.isfinite(scores_hi_flattened)]) # add to dict 990 | 991 | # build nan group consisting of all scores 992 | if symmetric is True: # symmetric calibration set 993 | if include_zero_scores is False: 994 | scores_flattened = scores[residuals != 0].flatten() # exclude zeros 995 | else: 996 | scores_flattened = scores.flatten() 997 | scores_flattened_dict[str(np.nan)] = scores_flattened[np.isfinite(scores_flattened)] # add to dict 998 | else: # separate into hi/lo non-symmetric calibration sets 999 | if include_zero_scores is False: 1000 | scores_lo_flattened = scores[residuals < 0].flatten() 1001 | scores_hi_flattened = scores[residuals > 0].flatten() 1002 | else: 1003 | scores_lo_flattened = scores[residuals <= 0].flatten() 1004 | scores_hi_flattened = scores[residuals >= 0].flatten() 1005 | scores_flattened_dict[str(np.nan)] = (scores_lo_flattened[np.isfinite(scores_lo_flattened)], 1006 | scores_hi_flattened[np.isfinite(scores_hi_flattened)]) # add to dict 1007 | 1008 | # trim all scores if specified 1009 | for key in scores_flattened_dict.keys(): 1010 | 1011 | # determine quantiles from original scores 1012 | if symmetric is True: 1013 | if trim_quantiles[0] is not None: 1014 | lower_bound = np.nanquantile(scores_flattened_dict[key], trim_quantiles[0]) 1015 | if trim_quantiles[1] is not None: 1016 | upper_bound = np.nanquantile(scores_flattened_dict[key], trim_quantiles[1]) 1017 | else: 1018 | if trim_quantiles[0] is not None: 1019 | lower_bound_lo = np.nanquantile(scores_flattened_dict[key][0], trim_quantiles[0]) 1020 | lower_bound_hi = np.nanquantile(scores_flattened_dict[key][1], trim_quantiles[0]) 1021 | if trim_quantiles[1] is not None: 1022 | upper_bound_lo = np.nanquantile(scores_flattened_dict[key][0], trim_quantiles[1]) 1023 | upper_bound_hi = np.nanquantile(scores_flattened_dict[key][1], trim_quantiles[1]) 1024 | 1025 | # trim based on quantiles 1026 | if symmetric is True: 1027 | if trim_quantiles[0] is not None: 1028 | scores_flattened_dict[key] = scores_flattened_dict[key][scores_flattened_dict[key]>lower_bound] 1029 | if trim_quantiles[1] is not None: 1030 | scores_flattened_dict[key] = scores_flattened_dict[key][scores_flattened_dict[key]<upper_bound] 1031 | else: 1032 | if trim_quantiles[0] is not None: 1033 | scores_flattened_dict[key] = (scores_flattened_dict[key][0][scores_flattened_dict[key][0]>lower_bound_lo], 1034 | scores_flattened_dict[key][1][scores_flattened_dict[key][1]>lower_bound_hi]) 1035 | if trim_quantiles[1] is not None: 1036 | scores_flattened_dict[key] = (scores_flattened_dict[key][0][scores_flattened_dict[key][0]<upper_bound_lo], 1037 | scores_flattened_dict[key][1][scores_flattened_dict[key][1]<upper_bound_hi]) 1038 | 1039 | return(scores_flattened_dict)
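# End-to-end usage sketch of the core TISSUE pipeline in this module
# (illustrative; the file paths and gene names are placeholders, and the import
# assumes the package layout shown in this repository):
#
#     from tissue import main as tissue_main
#     spatial_adata, RNAseq_adata = tissue_main.load_paired_datasets(
#         "Spatial_count.txt", "Locations.txt", "scRNA_count.txt")
#     tissue_main.preprocess_data(spatial_adata)   # no-op under current defaults
#     tissue_main.build_spatial_graph(spatial_adata, method="fixed_radius")
#     tissue_main.predict_gene_expression(spatial_adata, RNAseq_adata,
#                                         target_genes=["gene1"], method="spage",
#                                         n_folds=10)
#     pred_key = "spage_predicted_expression"
#     calib_genes = spatial_adata.uns["conf_genes_used"]
#     tissue_main.conformalize_spatial_uncertainty(spatial_adata, pred_key, calib_genes,
#                                                  grouping_method="kmeans_gene_cell")
#     tissue_main.conformalize_prediction_interval(spatial_adata, pred_key, calib_genes,
#                                                  alpha_level=0.33)
#     lo = spatial_adata.obsm[pred_key + "_lo"]    # 67% prediction-interval bounds
#     hi = spatial_adata.obsm[pred_key + "_hi"]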