├── concept_fig.png
├── images
    ├── conceptFig.jpg
    ├── top_genes_tcr.png
    └── top_pathways_img.png
├── utils.py
├── datasets.py
├── LICENSE
├── top_pathways.py
├── figures
    ├── make_fig2.py
    └── supplementary_figures
    │   ├── mcfarland_cond_top_pathways.py
    │   ├── .ipynb_checkpoints
    │       ├── mcfarland_cond_top_pathways-checkpoint.py
    │       ├── drop_g-checkpoint.ipynb
    │       └── g_enrichments-checkpoint.ipynb
    │   ├── drop_g.ipynb
    │   └── g_enrichments.ipynb
├── README.md
├── benchmark_intercode.py
├── summary.py
├── get_top_pathways.py
├── pathexplainer.py
├── standard_VAE_impute_benchmark.py
└── benchmark_pmvae.py


/concept_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/suinleelab/PAUSE/HEAD/concept_fig.png


--------------------------------------------------------------------------------
/images/conceptFig.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/suinleelab/PAUSE/HEAD/images/conceptFig.jpg


--------------------------------------------------------------------------------
/images/top_genes_tcr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/suinleelab/PAUSE/HEAD/images/top_genes_tcr.png


--------------------------------------------------------------------------------
/images/top_pathways_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/suinleelab/PAUSE/HEAD/images/top_pathways_img.png


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | def parse_gmt(path, symbols=None, min_genes=10):
 4 |     lut = dict()
 5 |     for line in open(path, 'r'):
 6 |         key, _, *genes = line.strip().split()
 7 |         if symbols is not None:
 8 |             genes = symbols.intersection(genes).tolist()
 9 |         if len(genes) < min_genes:
10 |             continue
11 |         lut[key] = genes
12 | 
13 |     return lut
14 | 
def load_annotations(gmt, genes, min_genes=10):
    """Build a boolean gene-by-gene-set membership table from a GMT file.

    Args:
        gmt: Path of the GMT file, forwarded to ``parse_gmt``.
        genes: Gene symbols used both to filter the gene sets and as the
            row index of the returned table.
        min_genes: Minimum gene-set size, forwarded to ``parse_gmt``.

    Returns:
        ``pandas.DataFrame`` with one row per gene and one column per
        retained gene set; a cell is True where the gene is in the set.
    """
    members_by_set = parse_gmt(gmt, genes, min_genes)
    table = pd.DataFrame(False, index=genes, columns=members_by_set.keys())
    for set_name in members_by_set:
        # Flag every member gene of this set in its column.
        table.loc[members_by_set[set_name], set_name] = True

    return table


--------------------------------------------------------------------------------
/datasets.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data import Dataset, DataLoader
 2 | import torch
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | class RNASeqData(Dataset):
 7 |     
 8 |     def __init__(self, X, c=None, y=None, transform=None):
 9 |         self.X = X
10 |         self.y = y
11 |         self.c = c
12 |         self.transform = transform
13 |         
14 |     def __len__(self):
15 |         return self.X.shape[0]
16 |     
17 |     def __getitem__(self, index):
18 |         sample = self.X[index,:]
19 |         
20 |         if self.transform is not None:
21 |             sample = self.transform(sample)
22 |             
23 |         if self.y is not None and self.c is not None:
24 |             return sample, self.y[index], self.c[index]
25 |         if self.y is None and self.c is not None:
26 |             return sample, self.c[index]
27 |         else:
28 |             return sample


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Lee Lab @ UW Allen School
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/top_pathways.py:
--------------------------------------------------------------------------------
  1 | import anndata
  2 | import numpy as np
  3 | import pandas as pd
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import os
  7 | 
  8 | from utils import load_annotations
  9 | from sklearn.model_selection import train_test_split
 10 | 
 11 | from torch.utils.data import Dataset, DataLoader
 12 | from datasets import RNASeqData
 13 | 
 14 | from pathexplainer import PathExplainerTorch
 15 | from sklearn.linear_model import LogisticRegression
 16 | 
 17 | from models import pmVAEModel
 18 | import os
# Restrict the CUDA devices visible to this process to device index 1.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
 20 | 
 21 | 
 22 | 
def main():
    """Rank pmVAE pathway latents on the Kang dataset over 10 replicates.

    For each random seed the data is split, a pmVAE model is trained, and
    two scores are computed per latent dimension:

    * IG: mean ``PathExplainerTorch`` attribution of the per-sample MSE
      reconstruction loss with respect to the latent code of the
      validation cells, using an all-zero baseline.
    * LR: negated validation accuracy of a one-feature logistic regression
      predicting the ``'stimulated'`` condition from that latent dimension.

    Scores are stored one column per seed and written to ``kang_ig.csv`` and
    ``kang_lr.csv`` after every replicate and once more at the end.
    """

    # load data 
    
    # kang dataset 
    data = anndata.read('data/kang_count.h5ad')
    
    # haber dataset 
    #data = anndata.read('/projects/leelab/data/single-cell/haber_2017/preprocessed/adata_top_2000_genes.h5ad')    
    #data = data[data.obs['condition'] != 'Salmonella'].copy()
    
    symbols = data.var_names
    
    number_of_replicates = 10

    # The result frames are created lazily on the first replicate because
    # their index comes from the model's latent-space names.
    first_run = True 

    # for 10 experimental replicates
    for rand_seed in range(number_of_replicates):

        print("replicate number " + str(rand_seed))

        # split data: 25% held out, then 25% of the remainder as validation
        # NOTE(review): test_data is not used anywhere below.

        train_data, test_data = train_test_split(data,
                                                test_size=0.25,
                                                shuffle=True,
                                                random_state=rand_seed)
        tr_data, val_data = train_test_split(train_data,
                                            test_size=0.25,
                                            shuffle=True,
                                            random_state=rand_seed)

        tr_ds = RNASeqData(np.array(tr_data.X))
        val_ds = RNASeqData(np.array(val_data.X))

        # load annotations; transposed so rows are gene sets, columns genes
        membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
                                            symbols,
                                            min_genes=13
                                        ).astype(bool).T

        ##
        ## train base model
        ##

        # initialize base model
        basePMVAE = pmVAEModel(membership_mask.values,
                                [12],
                                1,
                                beta=1e-05,
                                terms=membership_mask.index,
                                add_auxiliary_module=True
                            )

        
        if first_run: # first run 
            top_ig = pd.DataFrame(index=basePMVAE.latent_space_names())
            top_lr = pd.DataFrame(index=basePMVAE.latent_space_names())
            first_run = False 
        
        
        # train
        # NOTE(review): every replicate uses the same checkpoint path.
        basePMVAE.train(tr_ds, val_ds, 
                        checkpoint_path='top_kang.pkl',
                        max_epochs=100)

        basePMVAE.set_gpu(False)


        # IG pathway rankings
        print("Calc IG score")
        # Per-sample reconstruction loss as a function of the latent code;
        # `ground_truth` is resolved at call time from the assignment below.
        def model_loss_wrapper(z):
            module_outputs = basePMVAE.model.decoder_net(z)
            global_recon = basePMVAE.model.merge(module_outputs)
            return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)

        ground_truth = torch.tensor(np.array(val_data.X)).float()
        outs = basePMVAE.model(ground_truth)
        
        # Attribute from an all-zero latent baseline to the encoded cells.
        input_data = outs.z
        baseline_data = torch.zeros(outs.z.shape[1])
        baseline_data.requires_grad = True

        explainer = PathExplainerTorch(model_loss_wrapper)
        attributions = explainer.attributions(input_data,
                                              baseline=baseline_data,
                                              num_samples=200,
                                              use_expectation=False)

        # Average the attributions over cells: one score per latent dimension.
        np_attribs = attributions.detach().numpy()
        top_ig[rand_seed] = np_attribs.mean(0)

        # Save progress so far.
        # NOTE(review): index=False omits the latent-space names from the CSV.
        top_ig.to_csv('kang_ig.csv', index=False)
        
        
        
        
        # LR pathway rankings
        print("Calc LR score")
        
        y_tr = tr_data.obs['condition']
        y_val = val_data.obs['condition']

        train_labels = (y_tr == 'stimulated').values
        val_labels = (y_val == 'stimulated').values
        
        
        train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float()).z.detach().numpy()
        val_embedding = basePMVAE.model(torch.tensor(val_data.X).float()).z.detach().numpy()

        # One single-feature classifier per latent dimension, scored on the
        # validation split.
        lr_scores = []
        for pathway in range(train_embedding.shape[1]):
            clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
            lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
            
        
        # Accuracies are stored negated.
        # NOTE(review): presumably so that lower means more important, like
        # the IG loss attributions -- confirm against the consumer.
        top_lr[rand_seed] = lr_scores
        top_lr[rand_seed] = -1.*top_lr[rand_seed]


        # Save progress so far.
        top_lr.to_csv('kang_lr.csv', index=False)
        
        
    
    top_ig.to_csv('kang_ig.csv', index=False)
    top_lr.to_csv('kang_lr.csv', index=False)

    
    
if __name__ == '__main__':
    main()    


--------------------------------------------------------------------------------
/figures/make_fig2.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from scipy import stats
  4 | import matplotlib.pyplot as plt
  5 | import seaborn as sb
  6 | from statannotations.Annotator import Annotator
  7 | 
# Dataset keys for which the benchmark box plots are generated.
DATASETS = ['kang', 'haber', 'datlinger']

# Benchmark types; used in result file names and as plot titles.
METHODS = ['impute', 'retrain'] 
# NOTE(review): PAL is not referenced by any function visible in this file.
PAL = 'colorblind'

# Font sizes passed to the matplotlib rc settings and the legend.
LABEL_SIZE = 18
TITLE_SIZE = 18
AXES_SIZE = 18
LEG_SIZE = 14
 17 | 
 18 | 
 19 | def get_arrays(dataset, method): 
 20 |     ig_results = np.load('complete_results/'+ dataset + '_' + method + '_ig.npy')
 21 |     logvar_results = np.load('complete_results/'+ dataset + '_' + method + '_logvar.npy')
 22 |     lr_results = np.load('complete_results/'+ dataset + '_' + method + '_lr.npy')
 23 |     kld_results = np.load('complete_results/'+ dataset + '_' + method + '_kld.npy')
 24 |     rand_results = np.load('complete_results/'+ dataset + '_' + method + '_rand.npy')
 25 | 
 26 |     return ig_results, logvar_results, lr_results, kld_results, rand_results 
 27 | 
 28 | # load results for single dataset and benchmark method 
 29 | def load_res(dataset, method): 
 30 |     
 31 |     ig_results, logvar_results, lr_results, kld_results, rand_results = get_arrays(dataset, method)
 32 |     
 33 |     # get AUCs
 34 |     ig_aucs = np.trapz(ig_results, axis=1)
 35 |     lr_aucs = np.trapz(lr_results, axis=1)
 36 |     kld_aucs = np.trapz(kld_results, axis=1)
 37 |     rand_aucs = np.trapz(rand_results, axis=1)
 38 |     logvar_aucs = np.trapz(logvar_results, axis=1)
 39 |     
 40 |     auc_stack = np.concatenate((ig_aucs, lr_aucs, kld_aucs, rand_aucs, logvar_aucs))
 41 |     
 42 |     num_trials = 10 
 43 |     #rankings_methods = np.concatenate((['Loss Attribution']*num_trials, ['LR Score']*num_trials, ['KL Divergence']*num_trials, ['Random']*num_trials, ['LS Variance']*num_trials))
 44 |     
 45 |     rankings_methods = np.concatenate((['PAUSE']*num_trials, ['LR']*num_trials, ['KLD']*num_trials, ['Random']*num_trials, ['LSV']*num_trials))
 46 | 
 47 |     
 48 |     
 49 |     results = pd.DataFrame(index=list(range(0,50)))
 50 |     results['methods'] = rankings_methods
 51 |     results['aucs'] = auc_stack
 52 |     
 53 |     return results
 54 | 
 55 | 
 56 | def get_subplot(dataset, method): 
 57 | 
 58 |     plt.rc('axes', titlesize=TITLE_SIZE)     # fontsize title
 59 |     plt.rc('axes', labelsize=AXES_SIZE)    # fontsize of the x and y axis labels
 60 |     plt.rc('xtick', labelsize=LABEL_SIZE)    # fontsize of the (method) tick labels
 61 |     
 62 |     results = load_res(dataset, method) 
 63 |     
 64 |     plt.style.use('seaborn-colorblind')
 65 | 
 66 |     fig, ax = plt.subplots(figsize=(6,4))
 67 | 
 68 |     bp = sb.boxplot(ax=ax,
 69 |              data=results,x='methods',y='aucs',dodge=True,
 70 |              color='white', fliersize=0, 
 71 |            )
 72 | 
 73 |     sb.stripplot(ax=ax,
 74 |              data=results,x='methods',y='aucs',
 75 |              dodge=True,
 76 |              s=4)
 77 |     
 78 |     """
 79 |     pairs=[("Loss Attribution", "LR Score")]
 80 |     annotator = Annotator(ax, pairs, data=results, x='methods',y='aucs')
 81 |     annotator.set_custom_annotations(['**'])
 82 |     annotator.annotate()
 83 |     """
 84 |     
 85 |     # for ** position
 86 |     top = [results[results['methods'] == "LR"].max()['aucs'], 
 87 |             results[results['methods'] == "KLD"].max()['aucs'], 
 88 |             results[results['methods'] == "Random"].max()['aucs'], 
 89 |             results[results['methods'] == "LSV"].max()['aucs']]
 90 | 
 91 |     
 92 |     for i in range(4):
 93 |         plt.text(x=bp.get_xticks()[i+1] - 0.07, y=top[i] + 0.001, s='**', fontdict={'size':12, 'color':'black'})
 94 | 
 95 | 
 96 |     ax.set_ylabel('AUC')
 97 |     
 98 |     if method == "retrain": # not for bottom row
 99 |         ax.set_xlabel('Pathway Ranking Method')
100 |     else: 
101 |          ax.set_xlabel('')
102 |     
103 |     #plt.title(get_title(dataset) + ' ' + method.capitalize() + ' Benchmark')
104 |     plt.title(method.capitalize())
105 | 
106 |     
107 |     plt.savefig('figs/dataset=%s-method=%s.pdf' % (dataset, method), bbox_inches='tight')
108 | 
109 |     plt.show()
110 |     
111 |     
112 |     
def get_title(dataset): 
    """Map a dataset key to its display title ('' for unknown keys)."""
    titles = {
        'kang': 'PBMC',
        'haber': 'Intestinal',
        'datlinger': 'Jurkat',
        'grubman': 'Entorhinal',
    }
    return titles.get(dataset, '')
124 |     
125 |     
126 | 
127 | # get single line graph 
def get_lines(dataset, method): 
    """Draw, save and show the mean result curve of every ranking method.

    Each array from ``get_arrays`` is averaged over axis 0 and drawn as one
    line. The figure is written to
    ``figs/lines-dataset=<dataset>-method=<method>.pdf``.

    Args:
        dataset: Dataset key forwarded to ``get_arrays``.
        method: Benchmark type; selects the x-axis label (``'impute'`` or
            ``'retrain'``) and is used as the plot title.
    """
    ig_results, logvar_results, lr_results, kld_results, rand_results = get_arrays(dataset, method)

    # Matplotlib 3.6 renamed the bundled seaborn styles and 3.8 removed the
    # old names; an unknown style name raises OSError.
    try:
        plt.style.use('seaborn-colorblind')
    except OSError:
        plt.style.use('seaborn-v0_8-colorblind')

    fig, ax = plt.subplots(figsize=(6,4))

    # Drawing order fixes the legend order and the colour of each method.
    labelled_curves = (
        ('PAUSE', ig_results),
        ('LR', lr_results),
        ('KLD', kld_results),
        ('Random', rand_results),
        ('LSV', logvar_results),
    )
    for label, curves in labelled_curves:
        sb.lineplot(data=curves.mean(0), label=label)

    if method == 'impute':
        ax.set_xlabel('Number of Top Pathways Ablated')

    if method == 'retrain':
        ax.set_xlabel('Number of Top Pathways Included')

    plt.legend(fontsize=LEG_SIZE)

    ax.set_ylabel('Reconstruction Error')

    plt.title(method.capitalize())

    plt.savefig('figs/lines-dataset=%s-method=%s.pdf' % (dataset, method),bbox_inches='tight')

    plt.show()
159 | 
160 | 
if __name__ == '__main__':

    # One AUC box plot per (dataset, benchmark) pair, datasets outermost.
    for ds_key, benchmark in [(d, m) for d in DATASETS for m in METHODS]:
        get_subplot(ds_key, benchmark)

    # Line plots are produced for the haber dataset only.
    for benchmark in ('impute', 'retrain'):
        get_lines('haber', benchmark)
169 |     
170 | 


--------------------------------------------------------------------------------
/figures/supplementary_figures/mcfarland_cond_top_pathways.py:
--------------------------------------------------------------------------------
  1 | # get Mcfarland top pathways, condition on cell lines 
  2 | 
  3 | import anndata
  4 | import numpy as np
  5 | import pandas as pd
  6 | import torch
  7 | import torch.nn.functional as F
  8 | import os
  9 | 
 10 | from utils import load_annotations
 11 | from sklearn.model_selection import train_test_split
 12 | 
 13 | from torch.utils.data import Dataset, DataLoader
 14 | from datasets import RNASeqData
 15 | 
 16 | from pathexplainer import PathExplainerTorch
 17 | from sklearn.linear_model import LogisticRegression
 18 | from sklearn.preprocessing import OneHotEncoder
 19 | import argparse
 20 | 
 21 | 
 22 | from models import pmVAEModel
 23 | import os
 24 | import time 
 25 | 
# Prefix of every result CSV written by main(); it is concatenated directly
# with the file name, so the trailing slash is required.
save_path = 'new_for_revision/new_res/'
 27 | 
 28 | 
 29 | def main():
 30 | 
 31 |     ig_times = []
 32 |     lr_times = []
 33 |     train_times = []
 34 |     
 35 |     parser = argparse.ArgumentParser()
 36 |     parser.add_argument('dataset', action="store", default='kang')
 37 |     parser.add_argument('which_gpu', action="store", default='0')
 38 | 
 39 |     args = parser.parse_args()
 40 |     
 41 |     os.environ["CUDA_VISIBLE_DEVICES"]=args.which_gpu
 42 |     dataset =args.dataset
 43 | 
 44 |     # load data 
 45 |     
 46 |     # load mcfarland data
 47 |     data = anndata.read('/projects/leelab/data/single-cell/mcfarland_2020_Idasanutlin/preprocessed/adata_top_2000_genes_tc.h5ad')
 48 |     
 49 |     data = data[data.obs['condition'] == 'Idasanutlin'].copy() 
 50 |     symbols = data.var_names
 51 |     
 52 |     conditions = np.array(data.obs['cell_line']).reshape(-1,1)
 53 |     enc = OneHotEncoder()
 54 |     enc.fit(conditions)
 55 |     pre_processed_conditions = enc.transform(conditions).toarray()
 56 |     
 57 |     number_of_replicates = 10
 58 | 
 59 |     first_run = True 
 60 |     
 61 |     # for 10 experimental replicates
 62 |     for rand_seed in range(number_of_replicates):
 63 | 
 64 |         print("replicate number " + str(rand_seed))
 65 | 
 66 |         # split data
 67 |         
 68 |         train_data, test_data, train_c, test_c = train_test_split(data,pre_processed_conditions,
 69 |                                                 test_size=0.25,
 70 |                                                 shuffle=True,
 71 |                                                 random_state=rand_seed)
 72 |         tr_data, val_data, tr_c, val_c = train_test_split(train_data,train_c,
 73 |                                             test_size=0.25,
 74 |                                             shuffle=True,
 75 |                                             random_state=rand_seed)
 76 |         
 77 |         tr_ds = RNASeqData(np.array(tr_data.X), c=tr_c)
 78 |         val_ds = RNASeqData(np.array(val_data.X), c=val_c)
 79 | 
 80 |         # load annotations
 81 |         membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
 82 |                                             symbols,
 83 |                                             min_genes=13
 84 |                                         ).astype(bool).T
 85 | 
 86 |         ##
 87 |         ## train model
 88 |         ##
 89 | 
 90 |         # initialize base model
 91 |         basePMVAE = pmVAEModel(membership_mask.values,
 92 |                                 [12],
 93 |                                 1,
 94 |                                 cdim = tr_c.shape[1],
 95 |                                 beta=1e-05,
 96 |                                 terms=membership_mask.index,
 97 |                                 add_auxiliary_module=True
 98 |                             )
 99 | 
100 |         
101 |         if first_run: # first run 
102 |             top_ig = pd.DataFrame(index=basePMVAE.latent_space_names())
103 |             top_lr = pd.DataFrame(index=basePMVAE.latent_space_names())
104 |             first_run = False 
105 |         
106 |         
107 |         # train
108 |         
109 |         start_train = time.time()
110 |         basePMVAE.train(tr_ds, val_ds, 
111 |                         checkpoint_path='saved_models/seed_' + str(rand_seed) + 'cell_lines_cond_top_' + dataset + '.pkl',
112 |                         max_epochs=100)
113 |         
114 |         end_train = time.time()
115 |         train_times.append(end_train - start_train)
116 |         
117 | 
118 |         basePMVAE.set_gpu(False)
119 | 
120 | 
121 |         # IG pathway rankings
122 |         print("Calc IG score")
123 |         
124 |         start_ig = time.time()
125 |         
126 |         def model_loss_wrapper(z):
127 |             latent_input = torch.cat([z, c_full], 1)
128 |             module_outputs = basePMVAE.model.decoder_net(latent_input)
129 |             global_recon = basePMVAE.model.merge(module_outputs)
130 |             return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
131 | 
132 |         ground_truth = torch.tensor(data.X).float()
133 |         c_full = torch.tensor(pre_processed_conditions).float()
134 |         outs = basePMVAE.model(ground_truth,c_full)
135 |         
136 |         input_data = outs.z
137 |         baseline_data = torch.zeros(outs.z.shape[1])
138 |         baseline_data.requires_grad = True
139 | 
140 |         explainer = PathExplainerTorch(model_loss_wrapper)
141 |         attributions = explainer.attributions(input_data,
142 |                                       baseline=baseline_data,
143 |                                       num_samples=200,
144 |                                       use_expectation=False)
145 | 
146 |         np_attribs = attributions.detach().numpy()
147 |         top_ig[rand_seed] = np_attribs.mean(0)
148 |         
149 |         end_ig = time.time()
150 |         ig_times.append(end_ig - start_ig)
151 |         
152 | 
153 |         # so far! 
154 |         top_ig.to_csv(save_path + dataset + '_cell_lines_cond_ig.csv', index=False)
155 |         
156 |         
157 |         # LR pathway rankings
158 |         print("Calc LR score")
159 |         start_lr = time.time()
160 | 
161 |         if args.dataset == 'mcfarland':
162 | 
163 |             y_tr = tr_data.obs['TP53_mutation_status']
164 |             y_val = val_data.obs['TP53_mutation_status']
165 | 
166 |             train_labels = (y_tr == 'Wild Type').values
167 |             val_labels = (y_val == 'Wild Type').values
168 |             
169 |         
170 |         train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float(), torch.tensor(tr_c).float()).z.detach().numpy()
171 |         val_embedding = basePMVAE.model(torch.tensor(val_data.X).float(), torch.tensor(val_c).float()).z.detach().numpy()
172 |         
173 |         
174 |         lr_scores = []
175 |         for pathway in range(train_embedding.shape[1]):
176 |             clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
177 |             lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
178 |             
179 |         
180 |         top_lr[rand_seed] = lr_scores
181 |         top_lr[rand_seed] = -1.*top_lr[rand_seed]
182 |         
183 |         end_lr = time.time()
184 |         lr_times.append(end_lr - start_lr)
185 | 
186 | 
187 |         # so far! 
188 |         top_lr.to_csv(save_path + dataset + '_cell_lines_cond_lr.csv', index=False)
189 | 
190 |         times = pd.DataFrame()
191 |         times['ig_times'] = ig_times
192 |         times['lr_times'] = lr_times
193 |         times['train_times'] = train_times
194 | 
195 |         times.to_csv(save_path + args.dataset + '_cell_lines_cond_times.csv')
196 |     
197 |     
198 | if __name__ == '__main__':
199 |     main()    


--------------------------------------------------------------------------------
/figures/supplementary_figures/.ipynb_checkpoints/mcfarland_cond_top_pathways-checkpoint.py:
--------------------------------------------------------------------------------
  1 | # get Mcfarland top pathways, condition on cell lines 
  2 | 
  3 | import anndata
  4 | import numpy as np
  5 | import pandas as pd
  6 | import torch
  7 | import torch.nn.functional as F
  8 | import os
  9 | 
 10 | from utils import load_annotations
 11 | from sklearn.model_selection import train_test_split
 12 | 
 13 | from torch.utils.data import Dataset, DataLoader
 14 | from datasets import RNASeqData
 15 | 
 16 | from pathexplainer import PathExplainerTorch
 17 | from sklearn.linear_model import LogisticRegression
 18 | from sklearn.preprocessing import OneHotEncoder
 19 | import argparse
 20 | 
 21 | 
 22 | from models import pmVAEModel
 23 | import os
 24 | import time 
 25 | 
# Prefix of every result CSV written by main(); it is concatenated directly
# with the file name, so the trailing slash is required.
save_path = 'new_for_revision/new_res/'
 27 | 
 28 | 
 29 | def main():
 30 | 
 31 |     ig_times = []
 32 |     lr_times = []
 33 |     train_times = []
 34 |     
 35 |     parser = argparse.ArgumentParser()
 36 |     parser.add_argument('dataset', action="store", default='kang')
 37 |     parser.add_argument('which_gpu', action="store", default='0')
 38 | 
 39 |     args = parser.parse_args()
 40 |     
 41 |     os.environ["CUDA_VISIBLE_DEVICES"]=args.which_gpu
 42 |     dataset =args.dataset
 43 | 
 44 |     # load data 
 45 |     
 46 |     # load mcfarland data
 47 |     data = anndata.read('/projects/leelab/data/single-cell/mcfarland_2020_Idasanutlin/preprocessed/adata_top_2000_genes_tc.h5ad')
 48 |     
 49 |     data = data[data.obs['condition'] == 'Idasanutlin'].copy() 
 50 |     symbols = data.var_names
 51 |     
 52 |     conditions = np.array(data.obs['cell_line']).reshape(-1,1)
 53 |     enc = OneHotEncoder()
 54 |     enc.fit(conditions)
 55 |     pre_processed_conditions = enc.transform(conditions).toarray()
 56 |     
 57 |     number_of_replicates = 10
 58 | 
 59 |     first_run = True 
 60 |     
 61 |     # for 10 experimental replicates
 62 |     for rand_seed in range(number_of_replicates):
 63 | 
 64 |         print("replicate number " + str(rand_seed))
 65 | 
 66 |         # split data
 67 |         
 68 |         train_data, test_data, train_c, test_c = train_test_split(data,pre_processed_conditions,
 69 |                                                 test_size=0.25,
 70 |                                                 shuffle=True,
 71 |                                                 random_state=rand_seed)
 72 |         tr_data, val_data, tr_c, val_c = train_test_split(train_data,train_c,
 73 |                                             test_size=0.25,
 74 |                                             shuffle=True,
 75 |                                             random_state=rand_seed)
 76 |         
 77 |         tr_ds = RNASeqData(np.array(tr_data.X), c=tr_c)
 78 |         val_ds = RNASeqData(np.array(val_data.X), c=val_c)
 79 | 
 80 |         # load annotations
 81 |         membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
 82 |                                             symbols,
 83 |                                             min_genes=13
 84 |                                         ).astype(bool).T
 85 | 
 86 |         ##
 87 |         ## train model
 88 |         ##
 89 | 
 90 |         # initialize base model
 91 |         basePMVAE = pmVAEModel(membership_mask.values,
 92 |                                 [12],
 93 |                                 1,
 94 |                                 cdim = tr_c.shape[1],
 95 |                                 beta=1e-05,
 96 |                                 terms=membership_mask.index,
 97 |                                 add_auxiliary_module=True
 98 |                             )
 99 | 
100 |         
101 |         if first_run: # first run 
102 |             top_ig = pd.DataFrame(index=basePMVAE.latent_space_names())
103 |             top_lr = pd.DataFrame(index=basePMVAE.latent_space_names())
104 |             first_run = False 
105 |         
106 |         
107 |         # train
108 |         
109 |         start_train = time.time()
110 |         basePMVAE.train(tr_ds, val_ds, 
111 |                         checkpoint_path='saved_models/seed_' + str(rand_seed) + 'cell_lines_cond_top_' + dataset + '.pkl',
112 |                         max_epochs=100)
113 |         
114 |         end_train = time.time()
115 |         train_times.append(end_train - start_train)
116 |         
117 | 
118 |         basePMVAE.set_gpu(False)
119 | 
120 | 
121 |         # IG pathway rankings
122 |         print("Calc IG score")
123 |         
124 |         start_ig = time.time()
125 |         
126 |         def model_loss_wrapper(z):
127 |             latent_input = torch.cat([z, c_full], 1)
128 |             module_outputs = basePMVAE.model.decoder_net(latent_input)
129 |             global_recon = basePMVAE.model.merge(module_outputs)
130 |             return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
131 | 
132 |         ground_truth = torch.tensor(data.X).float()
133 |         c_full = torch.tensor(pre_processed_conditions).float()
134 |         outs = basePMVAE.model(ground_truth,c_full)
135 |         
136 |         input_data = outs.z
137 |         baseline_data = torch.zeros(outs.z.shape[1])
138 |         baseline_data.requires_grad = True
139 | 
140 |         explainer = PathExplainerTorch(model_loss_wrapper)
141 |         attributions = explainer.attributions(input_data,
142 |                                       baseline=baseline_data,
143 |                                       num_samples=200,
144 |                                       use_expectation=False)
145 | 
146 |         np_attribs = attributions.detach().numpy()
147 |         top_ig[rand_seed] = np_attribs.mean(0)
148 |         
149 |         end_ig = time.time()
150 |         ig_times.append(end_ig - start_ig)
151 |         
152 | 
153 |         # so far! 
154 |         top_ig.to_csv(save_path + dataset + '_cell_lines_cond_ig.csv', index=False)
155 |         
156 |         
157 |         # LR pathway rankings
158 |         print("Calc LR score")
159 |         start_lr = time.time()
160 | 
161 |         if args.dataset == 'mcfarland':
162 | 
163 |             y_tr = tr_data.obs['TP53_mutation_status']
164 |             y_val = val_data.obs['TP53_mutation_status']
165 | 
166 |             train_labels = (y_tr == 'Wild Type').values
167 |             val_labels = (y_val == 'Wild Type').values
168 |             
169 |         
170 |         train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float(), torch.tensor(tr_c).float()).z.detach().numpy()
171 |         val_embedding = basePMVAE.model(torch.tensor(val_data.X).float(), torch.tensor(val_c).float()).z.detach().numpy()
172 |         
173 |         
174 |         lr_scores = []
175 |         for pathway in range(train_embedding.shape[1]):
176 |             clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
177 |             lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
178 |             
179 |         
180 |         top_lr[rand_seed] = lr_scores
181 |         top_lr[rand_seed] = -1.*top_lr[rand_seed]
182 |         
183 |         end_lr = time.time()
184 |         lr_times.append(end_lr - start_lr)
185 | 
186 | 
187 |         # so far! 
188 |         top_lr.to_csv(save_path + dataset + '_cell_lines_cond_lr.csv', index=False)
189 | 
190 |         times = pd.DataFrame()
191 |         times['ig_times'] = ig_times
192 |         times['lr_times'] = lr_times
193 |         times['train_times'] = train_times
194 | 
195 |         times.to_csv(save_path + args.dataset + '_cell_lines_cond_times.csv')
196 |     
197 |     
198 | if __name__ == '__main__':
199 |     main()    


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PAUSE
  2 | 
  3 | ![Main Concept Fig](/images/conceptFig.jpg)
  4 | 
  5 | Code for the paper "Principled feature attribution for unsupervised gene expression analysis" (PAUSE). 
  6 | For more information, see our preprint: https://www.biorxiv.org/content/10.1101/2022.05.03.490535v1.
  7 | 
  8 | ## Examples
  9 | 
 10 | ### Identify most important pathways from an interpretable autoencoder
 11 | This first example demonstrates how the PAUSE framework can be used to identify the most important pathways for an interpretable autoencoder.
 12 | 
 13 | ```python
 14 | import anndata
 15 | # and other import statements...
 16 | 
 17 | ## load a single cell dataset
 18 | data = anndata.read('data/kang_count.h5ad')
 19 | 
 20 | ## load a pathway gene set file 
 21 | ## more examples can be found here (http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp)
 22 | data.varm['annotations'] = load_annotations(
 23 |     'data/c2.cp.reactome.v7.4.symbols.gmt',
 24 |     data.var_names,
 25 |     min_genes=13
 26 | )
 27 | # binary matrix mapping from genes to pathways
 28 | membership_mask = data.varm['annotations'].astype(bool).T.values
 29 | ```
 30 | 
 31 | After loading the RNA-seq dataset you want to analyze, you can then initialize and train a model on the dataset. In this case, we use our PyTorch implementation of the [pmVAE architecture](https://www.biorxiv.org/content/10.1101/2021.01.28.428664v1), which is a variational autoencoder composed of a set of subnetworks (pathway modules) that are factorized according to the gene sets defined above. In this model, each latent node in the bottleneck layer only contains information about the genes belonging to its corresponding pathway.
 32 | 
 33 | ```python
 34 | from models import pmVAEModel 
 35 | 
 36 | # initialize pmVAE model. 
 37 | # positional arguments are 1) the binary gene set membership matrix, 
 38 | # 2) a list containing the number of nodes in each hidden layer, and 
 39 | # 3) an integer indicating the number of nodes in each module's bottleneck.
 40 | pmvae = pmVAEModel(
 41 |     membership_mask,
 42 |     [12], # This indicates that there will be one intermediate layer before the bottleneck with 12 nodes in each module. To have 2 intermediate layers of 6 nodes, you could write [6, 6]
 43 |     4, # number of nodes in each module bottleneck 
 44 |     terms=membership_mask.index, # a list of the names of the pathway modules
 45 |     add_auxiliary_module=True # whether or not to include a densely connected auxiliary module
 46 | )
 47 | 
 48 | # train pmVAE model
 49 | pmvae.train(train_dataset, # a PyTorch dataset object containing the training expression samples
 50 |               val_dataset, # a PyTorch dataset object containing the val expression samples
 51 |               max_epochs=200, # Maximum number of epochs to train
 52 |               lr=0.001, # learning rate of the adam optimizer used to train the model
 53 |               beta=1e-5, # weight multiplier of KL loss term
 54 |               batch_size=256, # samples per batch
 55 |               pathway_dropout=True, # whether or not to train with pathway dropout scheme as defined in pmVAE paper
 56 |               checkpoint_path='pmvae_checkpoint.pkl' # path of model checkpoint
 57 |               )
 58 | ```
 59 | 
 60 | Once the model is trained, we can use the [Path Explain software](https://github.com/suinleelab/path_explain) (also provided in this repository in the `pathexplainer.py` file) to *identify the top pathways* in the dataset by explaining the trained model's reconstruction error with respect to the learned latent pathways.
 61 | 
 62 | ```python
 63 | from pathexplainer import PathExplainerTorch
 64 | import torch
 65 | import torch.nn.functional as F
 66 | 
 67 | # define a wrapper function that outputs the reconstruction error of the model given the latent codes
 68 | def model_loss_wrapper(z):
 69 |     module_outputs = pmvae.model.decoder_net(z)
 70 |     global_recon = pmvae.model.merge(module_outputs)
 71 |     return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
 72 |     
 73 | # define a tensor to hold the original data, which gets used as an argument in the reconstruction error in the wrapper above
 74 | ground_truth = torch.tensor(data.X).float()
 75 | 
 76 | # get the latent codes to use as input to the model loss wrapper
 77 | outs = pmvae.model(ground_truth)
 78 | input_data = outs.z
 79 | baseline_data = torch.zeros(outs.z.shape[1]) # define a baseline, in this case the zeros vector
 80 | baseline_data.requires_grad = True
 81 | 
 82 | # calculate the pathway attributions
 83 | explainer = PathExplainerTorch(model_loss_wrapper)
 84 | attributions = explainer.attributions(input_data,
 85 |                                       baseline=baseline_data,
 86 |                                       num_samples=200, # number of samples to use when calculating the path integral
 87 |                                       use_expectation=False)
 88 | 
 89 | ```
 90 | 
 91 | Once you have calculated the pathway attributions, you can average them over all samples in the dataset to identify and plot the most important pathways.
 92 | 
 93 | ```python
 94 | # move attributions to numpy, make a df w/ index as latent space names
 95 | np_attribs = attributions.detach().numpy()
 96 | top_features = pd.DataFrame(index=pmvae.latent_space_names())
 97 | top_features['global_attribs'] = np_attribs.mean(0) # in this case, global attributions are the mean over the dataset
 98 | 
 99 | # Loss explanation
100 | top_features.sort_values('global_attribs',ascending=True).iloc[:30,0].plot.bar()
101 | ```
102 | 
103 | ![Showing pathway attributions](/images/top_pathways_img.png)
104 | 
105 | ### Identify most important genes contributing to a particular latent pathway
106 | The first example demonstrated how the PAUSE framework can be used to identify the most important pathways for an interpretable autoencoder. However, as you can see above, these interpretable autoencoders often have multiple bottleneck nodes for each pathway, raising the question of what the difference between these bottleneck nodes is. Additionally, sometimes the most important pathways are the "uninterpretable" densely-connected auxiliary pathways. How can we identify the most important genes contributing to these latent pathways, and interpret their biological meaning? By using gene-level attributions. This example uses another pmVAE model, as in the above example. This time, however, instead of getting attributions of the loss to the latent pathways, we can pick a latent pathway and explain it in terms of its input genes.
107 | 
108 | ```python
109 | from summary import summary_plot
110 | 
111 | # explain tcr in terms of genes
112 | def model_latent_wrapper(x):
113 |     outs = pmvae.model(x)
114 |     z = outs.mu
115 |     return z[:,316].reshape(-1,1) # 316 is the latent node number corresponding to the pathway of interest here
116 |     
117 | input_data = torch.tensor(data.X).float()
118 | input_data.requires_grad = True
119 | baseline_data = torch.zeros(data.X.shape[1])
120 | baseline_data.requires_grad = True
121 | 
122 | explainer = PathExplainerTorch(model_latent_wrapper) # this time, use explanation software with latent output wrapper
123 | attributions = explainer.attributions(input_data,
124 |                                       baseline=baseline_data,
125 |                                       num_samples=200, # again use 200 interpolation points to numerically approximate the path integral
126 |                                       use_expectation=False)
127 | 
128 | np_attribs = attributions.detach().numpy()
129 | top_features = pd.DataFrame(index=membership_mask.columns)
130 | top_features['global_attribs'] = np.abs(np_attribs).mean(0) # to find top genes, we take the average MAGNITUDE of attribs across all samples
131 | 
132 | summary_plot(np_attribs,
133 |              data.X,
134 |              feature_names=membership_mask.columns,
135 |              plot_top_k=10,
136 |              standardize_features=False,
137 |              scale_x_ind=False,
138 |              scale_y_ind=False,
139 |              figsize=(4, 4),
140 |              dpi=300,
141 |              cmap=coolwarm)
142 | ```
143 | 
144 | ![Showing gene attributions](/images/top_genes_tcr.png)
145 | 
146 | ## Reproducing experiments and figures from paper
147 | 
148 | For code to generate the models used, see "models.py". Pathway attributions and gene attributions are generated using code from "pathexplainer.py". Benchmarking of pathway attributions against other methods for ranking pathway importance is done using the files "benchmark_pmvae.py", "benchmark_intercode.py", and "top_pathways.py". For code to generate the figures in the paper, see the folder `figures`.
149 | 
150 | 


--------------------------------------------------------------------------------
/benchmark_intercode.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # benchmark_intercode.py
  3 | 
  4 | import anndata
  5 | import numpy as np
  6 | import pandas as pd
  7 | import torch
  8 | import torch.nn.functional as F
  9 | import os
 10 | 
 11 | from utils import load_annotations
 12 | from sklearn.model_selection import train_test_split
 13 | 
 14 | from torch.utils.data import Dataset, DataLoader
 15 | from datasets import RNASeqData
 16 | 
 17 | import argparse
 18 | 
 19 | from pathexplainer import PathExplainerTorch
 20 | from sklearn.linear_model import LogisticRegression
 21 | 
 22 | from models import pmVAEModel
 23 | 
 24 | from intercode import AutoencoderLinearDecoder, train_autoencoder
 25 | 
 26 | import os
 27 | os.environ["CUDA_VISIBLE_DEVICES"]="4"
 28 | 
 29 | def main():
 30 |     
 31 |     # get dataset, removal method
 32 |     parser = argparse.ArgumentParser()
 33 | #     parser.add_argument('split', action="store", default='0')
 34 |     parser.add_argument('dataset', action="store", default='kang')
 35 |     parser.add_argument('removal', action="store", default='impute')
 36 |     
 37 |     args = parser.parse_args()
 38 |     
 39 |     # load data
 40 |     if args.dataset == 'kang':
 41 |         
 42 |         data = anndata.read('data/kang_count.h5ad')
 43 |         data.varm['I'] = load_annotations(
 44 |             'data/c2.cp.reactome.v7.4.symbols.gmt',
 45 |             data.var_names,
 46 |             min_genes=13
 47 |         ).values
 48 |         data.uns['terms'] = list(load_annotations(
 49 |             'data/c2.cp.reactome.v7.4.symbols.gmt',
 50 |             data.var_names,
 51 |             min_genes=13
 52 |         ).columns)
 53 |         
 54 |     number_of_pathways = 20
 55 |     number_of_replicates = 10
 56 |     
 57 |     l2_results = np.zeros((number_of_replicates,number_of_pathways))
 58 |     ig_results = np.zeros((number_of_replicates,number_of_pathways))
 59 | #     lr_results = np.zeros((number_of_replicates,number_of_pathways))
 60 | #     kld_results = np.zeros((number_of_replicates,number_of_pathways))
 61 |     rand_results = np.zeros((number_of_replicates,number_of_pathways))
 62 |     
 63 |     # for 10 experimental replicates
 64 |     for rand_seed in range(number_of_replicates):
 65 |         
 66 |         print("replicate number " + str(rand_seed))
 67 |         
 68 |         # split data
 69 |         
 70 |         train_data, test_data = train_test_split(data,
 71 |                                                 test_size=0.25,
 72 |                                                 shuffle=True,
 73 |                                                 random_state=rand_seed)
 74 |         tr_data, val_data = train_test_split(train_data,
 75 |                                             test_size=0.25,
 76 |                                             shuffle=True,
 77 |                                             random_state=rand_seed)
 78 |         
 79 |         tr_ds = RNASeqData(np.array(tr_data.X))
 80 |         val_ds = RNASeqData(np.array(val_data.X))
 81 |         
 82 |         # load annotations
 83 |         membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
 84 |                                             data.var_names,
 85 |                                             min_genes=13
 86 |                                         ).astype(bool).T
 87 |         
 88 |         ##
 89 |         ## train base model
 90 |         ##
 91 |         
 92 |         LR = 0.001
 93 |         BATCH_SIZE = 62
 94 |         N_EPOCHS = 30
 95 | 
 96 |         # regularization hyperparameters
 97 |         # lambda0 - page 19 of presentation
 98 |         # lambdas 1-3 - last term on page 20
 99 | 
100 |         LAMBDA0 = 0.1
101 | 
102 |         LAMBDA1 = 0.93*LR
103 |         LAMBDA2 = 0.43*LR
104 |         LAMBDA3 = 0.57*LR
105 |         
106 |         # initialize base model
107 |         autoencoder = AutoencoderLinearDecoder(tr_data.n_vars, n_ann=len(tr_data.uns['terms']))
108 |         autoencoder.cuda()
109 |         
110 |         # train
111 |         train_autoencoder(tr_data, autoencoder, LR, BATCH_SIZE, N_EPOCHS,
112 |                   l2_reg_lambda0=LAMBDA0, lambda1=LAMBDA1, lambda2=LAMBDA2, lambda3=LAMBDA3)
113 |         
114 |         ##
115 |         ## get pathway rankings
116 |         ##
117 |         top_features = pd.DataFrame(index=data.uns['terms'])
118 |         
119 |         ## get L2
120 |         top_features['l2'] = -1.*autoencoder.decoder.weight_dict['annotated'].data.norm(p=2, dim=0).detach().cpu().numpy()
121 |         
122 |         print("Calc IG score")
123 |         # IG pathway rankings
124 |         ground_truth = torch.tensor(val_data.X).float()
125 |         autoencoder.cpu()
126 | 
127 |         def intercode_loss_wrapper(z):
128 |             global_recon = autoencoder.decoder(z)
129 |             return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
130 | 
131 |         
132 |         input_data = autoencoder.encoder(torch.tensor(val_data.X).float())
133 |         baseline_data = torch.zeros(input_data.shape[1])
134 |         baseline_data.requires_grad = True
135 |         
136 |         explainer = PathExplainerTorch(intercode_loss_wrapper)
137 |         attributions = explainer.attributions(input_data,
138 |                                               baseline=baseline_data,
139 |                                               num_samples=200,
140 |                                               use_expectation=False)
141 |         
142 |         top_features['IG'] = attributions.detach().numpy().mean(0)
143 |         
144 | #         # LR pathway rankings
145 | #         print("Calc LR score")
146 | #         y_tr = tr_data.obs['condition']
147 | #         y_val = val_data.obs['condition']
148 |         
149 | #         train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float()).z.detach().numpy()
150 | #         val_embedding = basePMVAE.model(torch.tensor(val_data.X).float()).z.detach().numpy()
151 |         
152 | #         lr_scores = []
153 | #         for pathway in range(train_embedding.shape[1]):
154 | #             train_labels = (y_tr == 'stimulated').values
155 | #             val_labels = (y_val == 'stimulated').values
156 | #             clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
157 | #             lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
158 |             
159 | #         top_features['lr_score'] = lr_scores
160 | #         top_features['lr_score'] = -1.*top_features['lr_score']
161 |         
162 | #         # KLD pathway rankings
163 | #         print("Calc KLD")
164 | #         pathway_kld = (-0.5 * (1 + outs.logvar - outs.mu.pow(2) - outs.logvar.exp()).mean(0)).detach().numpy()
165 | #         top_features['kld'] = -1.*pathway_kld
166 |         
167 |         # Random pathway rankings
168 |         print("Calc Random")
169 |         np.random.seed(rand_seed)
170 |         top_features['rand'] = np.random.randn(top_features.shape[0])
171 | 
172 |         # impute or retrain
173 |         def impute_benchmark(method,n_pathways=20):
174 |             method_recons_errors = []
175 | 
176 |             # for top 10 pathways 
177 |             for i in range(1,1+n_pathways):
178 | 
179 |                 # set pathways = 0.
180 |                 test_matrix = torch.tensor(test_data.X).float()
181 |                 test_matrix_embedded = autoencoder.encoder(test_matrix)
182 |                 for x in top_features.sort_values(method).index[:i]:
183 |                     index_to_zero = list(top_features.index).index(x)
184 |                     test_matrix_embedded[:,index_to_zero] = 0.
185 | 
186 |                 global_recon = autoencoder.decoder(test_matrix_embedded)
187 |                 recons_error = F.mse_loss(global_recon, test_matrix).detach().item()
188 |                 method_recons_errors.append(recons_error)
189 |             return method_recons_errors
190 |         
191 |         print("Impute L2")
192 |         l2_results[rand_seed,:] = impute_benchmark('l2')
193 |         print("Impute IG")
194 |         ig_results[rand_seed,:] = impute_benchmark('IG')
195 | #         print("Impute LR")
196 | #         lr_results[rand_seed,:] = impute_benchmark('lr_score')
197 | #         print("Impute KLD")
198 | #         kld_results[rand_seed,:] = impute_benchmark('kld')
199 |         print("Impute RAND")
200 |         rand_results[rand_seed,:] = impute_benchmark('rand')
201 | 
202 |     # save results
203 |     with open('results/intercode_kang_impute_l2.npy', 'wb') as f:
204 |         np.save(f, l2_results)
205 |     with open('results/intercode_kang_impute_ig.npy', 'wb') as f:
206 |         np.save(f, ig_results)
207 | #     with open('results/intercode_kang_impute_lr.npy', 'wb') as f:
208 | #         np.save(f, lr_results)
209 | #     with open('results/intercode_kang_impute_kld.npy', 'wb') as f:
210 | #         np.save(f, kld_results)
211 |     with open('results/intercode_kang_impute_rand.npy', 'wb') as f:
212 |         np.save(f, rand_results)
213 |     
214 | if __name__ == '__main__':
215 |     main()    


--------------------------------------------------------------------------------
/summary.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Defines a function to plot individual feature-level importances
  3 | in a summary plot.
  4 | """
  5 | import pandas as pd
  6 | import numpy as np
  7 | import matplotlib.pyplot as plt
  8 | import matplotlib as mpl
  9 | from scatter import _get_bounds, _color_bar, _get_shared_limits, _set_axis_config
 10 | import colors
 11 | 
 12 | def _get_jitter_array(feature_values,
 13 |                       select_attributions):
 14 |     """
 15 |     Helper function to get jitter in a summary plot.
 16 |     Args:
 17 |         feature_values: see summary_plot
 18 |         select_attributions: see summary_plot
 19 |     """
 20 |     jitter_array = np.zeros(feature_values.shape)
 21 |     for i in range(select_attributions.shape[1]):
 22 |         feature_attr = select_attributions[:, i]
 23 |         num_samples = feature_attr.shape[0]
 24 |         nbins = 100
 25 |         quant = np.round(nbins * (feature_attr - np.min(feature_attr)) / \
 26 |                          (np.max(feature_attr) - \
 27 |                           np.min(feature_attr) + 1e-8))
 28 |         inds = np.argsort(quant + np.random.randn(num_samples) * 1e-6)
 29 |         layer = 0
 30 |         last_bin = -1
 31 |         jitter_values = np.zeros(num_samples)
 32 |         for ind in inds:
 33 |             if quant[ind] != last_bin:
 34 |                 layer = 0
 35 |             jitter_values[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
 36 |             layer += 1
 37 |             last_bin = quant[ind]
 38 |         jitter_values *= 0.9 * (1.0 / np.max(jitter_values + 1))
 39 |         jitter_array[:, i] = jitter_values
 40 |     return jitter_array
 41 | 
 42 | def _get_jitter_df(interactions, feature_values,
 43 |                    select_attributions, attributions,
 44 |                    interaction_feature, feature_order):
 45 |     """
 46 |     Helper function to call the jitter matrix function.
 47 |     """
 48 |     if interactions is None:
 49 |         jitter_array = _get_jitter_array(feature_values, select_attributions)
 50 |         jitter_df = pd.DataFrame(jitter_array)
 51 |     else:
 52 |         if interactions.shape == attributions.shape:
 53 |             select_interactions = interactions[:, feature_order]
 54 |         else:
 55 |             if interaction_feature is None:
 56 |                 raise ValueError('Argument interaction was specified ' + \
 57 |                                  'but argument interaction_feature was not.')
 58 |             select_interactions = interactions[:, feature_order, interaction_feature]
 59 |         jitter_df = pd.DataFrame(select_interactions)
 60 |     return jitter_df
 61 | 
 62 | def summary_plot(attributions,
 63 |                  feature_values,
 64 |                  interactions=None,
 65 |                  interaction_feature=None,
 66 |                  feature_names=None,
 67 |                  plot_top_k=None,
 68 |                  standardize_features=True,
 69 |                  scale_x_ind=False,
 70 |                  scale_y_ind=False,
 71 |                  figsize=(8, 4),
 72 |                  dpi=150,
 73 |                  **kwargs):
 74 |     """
 75 |     Function to draw an interactive scatter plot of
 76 |     attribution values. Since this is built on top
 77 |     of altair, this function works best when the
 78 |     number of points is small (< 5000).
 79 |     Args:
 80 |         attributions: A matrix of attributions.
 81 |                       Should be of shape [batch_size, feature_dims].
 82 |         feature_values: A matrix of feature values.
 83 |                         Should the same shape as the attributions.
 84 |         interactions:  Either a matrix of the same shape as attributions representing
 85 |                        the interaction between interaction_feature and all other features,
 86 |                        or a matrix that can be indexed as
 87 |                        interactions[:, :, interaction_feature].
 88 |         interaction_feature: A feature to use for interactions if interactions
 89 |                              are provided as all pairwise interactions.
 90 |         feature_names: An optional list of length attributions.shape[1]. Each
 91 |                        entry should be a string representing the name of a feature.
 92 |         plot_top_k: The number of features to plot. If none, will plot all features.
 93 |                     This might take a while, depending on how many features you have.
 94 |         scale_x_ind: Set to True to scale the x axes of each plot independently.
 95 |                      Defaults to False.
 96 |         scale_y_ind: Set to True to scale the y axes of each plot independently.
 97 |                      Defaults to False.
 98 |         figsize: Figure size in matplotlib units. Each figure will be square.
 99 |         dpi: Resolution of each plot.
100 |         kwargs: Passed to plt.scatter
101 |     """
102 |     if plot_top_k is None:
103 |         plot_top_k = attributions.shape[1]
104 |     mean_abs_attr = np.mean(np.abs(attributions), axis=0)
105 |     max_order = np.argsort(mean_abs_attr)
106 |     feature_order = max_order[::-1][:plot_top_k]
107 | 
108 |     if feature_names is None:
109 |         feature_names = ['Feature {}'.format(i) for i in range(feature_values.shape[1])]
110 | 
111 |     feature_values = feature_values[:, feature_order]
112 |     select_attributions = attributions[:, feature_order]
113 |     feature_names = [feature_names[i] for i in feature_order]
114 | 
115 |     if standardize_features:
116 |         standardized_feature_values = (feature_values - np.mean(feature_values,
117 |                                                                 axis=0,
118 |                                                                 keepdims=True))
119 |         standardized_feature_values = standardized_feature_values / \
120 |                                       (np.std(standardized_feature_values,
121 |                                               axis=0,
122 |                                               keepdims=True) + 1e7)
123 |     else:
124 |         standardized_feature_values = feature_values
125 | 
126 |     vmin, vmax = _get_bounds(standardized_feature_values)
127 |     standardized_feature_values = np.clip(standardized_feature_values, vmin, vmax)
128 | 
129 |     attribution_names = ['Attribution to {}'.format(feature_names[i]) for \
130 |                              i in range(len(feature_names))]
131 |     feature_df = pd.DataFrame(standardized_feature_values)
132 |     attribution_df = pd.DataFrame(select_attributions)
133 |     feature_df.columns = feature_names
134 |     attribution_df.columns = attribution_names
135 | 
136 |     feature_df = pd.melt(feature_df, var_name='Feature', value_name='Normalized Feature Value')
137 |     attribution_df = pd.melt(attribution_df, var_name='Attribution', value_name='Attribution Value')
138 |     attribution_df = attribution_df.drop(columns=['Attribution'])
139 | 
140 |     jitter_df = _get_jitter_df(interactions, feature_values,
141 |                                select_attributions, attributions,
142 |                                interaction_feature, feature_order)
143 |     jitter_df = pd.melt(jitter_df, var_name='Variable', value_name='Jitter')
144 |     jitter_df = jitter_df.drop(columns=['Variable'])
145 |     melted_df = pd.concat([feature_df, attribution_df, jitter_df], axis=1)
146 | 
147 |     if 's' not in kwargs:
148 |         kwargs['s'] = 4
149 |     if 'cmap' not in kwargs:
150 |         kwargs['cmap'] = colors.green_gold()
151 | 
152 |     x_limits, y_limits = _get_shared_limits(melted_df['Attribution Value'],
153 |                                             melted_df['Jitter'],
154 |                                             scale_x_ind,
155 |                                             scale_y_ind)
156 | 
157 |     fig, axs = plt.subplots(plot_top_k, 1, figsize=figsize, dpi=dpi)
158 |     fig.subplots_adjust(left=0.2, hspace=0)
159 |     for i in range(plot_top_k - 1):
160 |         axis = axs[i]
161 |         _set_axis_config(axis,
162 |                          clear_x_ticks=True,
163 |                          clear_y_ticks=True)
164 |         trans = mpl.transforms.blended_transform_factory(axis.transData, axis.transAxes)
165 |         axis.plot([0.0, 1.0], [0.5, 0.5], transform=axis.transAxes,
166 |                   linewidth=0.5, color='black', alpha=0.3, zorder=1)
167 |         axis.plot([0.0, 0.0], [-1.0, 1.0], transform=trans, clip_on=False,
168 |                   linewidth=0.5, color='black', alpha=0.3, zorder=1)
169 | 
170 |     axis = axs[-1]
171 |     _set_axis_config(axis,
172 |                      [0.0, 0.0, 0.0, 0.5],
173 |                      clear_x_ticks=False,
174 |                      clear_y_ticks=True)
175 |     trans = mpl.transforms.blended_transform_factory(axis.transData, axis.transAxes)
176 |     axis.plot([0.0, 1.0], [0.5, 0.5], transform=axis.transAxes,
177 |               linewidth=0.5, color='black', alpha=0.3, zorder=1)
178 |     axis.plot([0.0, 0.0], [0.0, 1.0], transform=trans,
179 |               linewidth=0.5, color='black', alpha=0.3, zorder=1)
180 |     axis.tick_params(length=4, labelsize=8)
181 |     axis.set_xlabel('Attribution Value')
182 | 
183 |     for i in range(plot_top_k):
184 |         axis = axs[i]
185 |         selected_df = melted_df.loc[melted_df['Feature'] == feature_names[i]]
186 |         trans = mpl.transforms.blended_transform_factory(axis.transAxes, axis.transAxes)
187 |         axis.text(-0.02, 0.5, feature_names[i],
188 |                   horizontalalignment='right',
189 |                   verticalalignment='center',
190 |                   fontsize=8,
191 |                   transform=trans)
192 |         axis.scatter(x=selected_df['Attribution Value'],
193 |                      y=selected_df['Jitter'],
194 |                      c=selected_df['Normalized Feature Value'],
195 |                      zorder=2,
196 |                      **kwargs)
197 |         if x_limits is not None:
198 |             axis.set_xlim(x_limits)
199 |         if y_limits is not None:
200 |             axis.set_ylim(y_limits)
201 | 
202 |     _color_bar(fig, vmin, vmax, 'Feature Value', ticks=False, label_size=8, **kwargs)


--------------------------------------------------------------------------------
/get_top_pathways.py:
--------------------------------------------------------------------------------
  1 | import anndata
  2 | import numpy as np
  3 | import pandas as pd
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import os
  7 | 
  8 | from utils import load_annotations
  9 | from sklearn.model_selection import train_test_split
 10 | 
 11 | from torch.utils.data import Dataset, DataLoader
 12 | from datasets import RNASeqData
 13 | 
 14 | from pathexplainer import PathExplainerTorch
 15 | from sklearn.linear_model import LogisticRegression
 16 | import argparse
 17 | 
 18 | 
 19 | from models import pmVAEModel
 20 | import mygene
 21 | import os
 22 | import time 
 23 | 
 24 | save_path = 'new_for_revision/new_res/'
 25 | 
 26 | def main():
 27 | 
 28 |     ig_times = []
 29 |     lr_times = []
 30 |     train_times = []
 31 |     
 32 |     # get dataset, removal method
 33 |     parser = argparse.ArgumentParser()
 34 |     parser.add_argument('dataset', action="store", default='kang')
 35 |     parser.add_argument('which_gpu', action="store", default='0')
 36 |     parser.add_argument('gene_prog', action="store", default='Ctrl')
 37 | 
 38 |     args = parser.parse_args()
 39 |     
 40 |     os.environ["CUDA_VISIBLE_DEVICES"]=args.which_gpu
 41 |     dataset =args.dataset
 42 | 
 43 | 
 44 |     # load data 
 45 |     
 46 |     # load datlinger data 
 47 |     if args.dataset == 'datlinger':
 48 |         
 49 |         data = anndata.read('data/datlinger_pp.h5ad')
 50 |         symbols = data.var_names
 51 |     
 52 |     
 53 |      # load kang data
 54 |     if args.dataset == 'kang':
 55 |         
 56 |         data = anndata.read('data/kang_count.h5ad')
 57 |         symbols = data.var_names
 58 |         
 59 |     
 60 |     # load mcfarland data
 61 |     if args.dataset == 'mcfarland':
 62 |         
 63 |         data = anndata.read('/projects/leelab/data/single-cell/mcfarland_2020_Idasanutlin/preprocessed/adata_top_2000_genes_tc.h5ad')
 64 |         data = data[data.obs['condition'] == 'Idasanutlin'].copy() 
 65 |         symbols = data.var_names
 66 |                    
 67 |             
 68 |     # load zheng data 
 69 |     if args.dataset == 'zheng':
 70 |         data = anndata.read('/projects/leelab/data/single-cell/zheng_2017/preprocessed/adata_top_2000_genes.h5ad')
 71 | 
 72 |         # convert ENSG IDs to gene symbols: 
 73 |         
 74 |         mg = mygene.MyGeneInfo()
 75 |         geneList = data.var_names
 76 |         geneSyms = mg.querymany(geneList , scopes='ensembl.gene', fields='symbol', species='human', returnall=True)
 77 | 
 78 |         symbols = []
 79 |         not_in = []
 80 |         is_in = []
 81 |         for k in range(2000):
 82 |             if ('symbol' in geneSyms['out'][k]):  
 83 |                 symbols += [geneSyms['out'][k]['symbol']]
 84 |                 is_in += [geneSyms['out'][k]['query']]
 85 |             else:
 86 |                 not_in += [geneSyms['out'][k]['query']]
 87 |         symbols = pd.Index(symbols)
 88 |         
 89 |         symbols = pd.Index(set(symbols.to_numpy()))
 90 | 
 91 |         # filter out post transplant
 92 |         data = data[data.obs['condition'] != 'post_transplant'][:,is_in].copy() 
 93 |         
 94 |             
 95 |     # load haber data
 96 |     if args.dataset == 'haber':
 97 |         
 98 |         data = anndata.read('/projects/leelab/data/single-cell/haber_2017/preprocessed/adata_top_2000_genes.h5ad')
 99 |         
100 |         # filter out H poly 
101 |         data = data[data.obs['condition'] != 'Salmonella'].copy()
102 |        
103 |         symbols = data.var_names
104 |     
105 | 
106 |     # load grubman data 
107 |     if args.dataset == 'grubman':
108 |         
109 |         data = anndata.read('/projects/leelab/data/single-cell/grubman_2019/preprocessed/adata_top_2000_genes.h5ad')
110 |        
111 |         symbols = data.var_names  
112 |     
113 |     
114 |     if args.dataset == 'norman': 
115 |                 
116 |         data = anndata.read('/projects/leelab/data/single-cell/norman_2019/preprocessed/adata_top_2000_genes_tc.h5ad')
117 |         
118 |         if args.gene_prog == 'erythroid': 
119 |             data = data[(data.obs['gene_program'] == 'Ctrl') | (data.obs['gene_program'] == 'Erythroid')].copy()
120 |                         
121 |         if args.gene_prog == 'granulocyte-apoptosis': 
122 |             data = data[(data.obs['gene_program'] == 'Ctrl') | (data.obs['gene_program'] == 'Granulocyte/apoptosis')].copy()
123 | 
124 |         if args.gene_prog == 'megakaryocyte': 
125 |             data = data[(data.obs['gene_program'] == 'Ctrl') | (data.obs['gene_program'] == 'Megakaryocyte')].copy()
126 |             
127 |         if args.gene_prog == 'pro-growth': 
128 |             data = data[(data.obs['gene_program'] == 'Ctrl') | (data.obs['gene_program'] == 'Pro-growth')].copy()
129 |             
130 |         test_df = pd.DataFrame(index=data.var['gene_name'])
131 |         symbols = test_df.index
132 |         
133 | 
134 |         
135 |     number_of_replicates = 10
136 |     first_run = True 
137 |     
138 |     # for 10 experimental replicates
139 |     for rand_seed in range(number_of_replicates):
140 | 
141 |         print("replicate number " + str(rand_seed))
142 | 
143 |         # split data
144 | 
145 |         train_data, test_data = train_test_split(data,
146 |                                                 test_size=0.25,
147 |                                                 shuffle=True,
148 |                                                 random_state=rand_seed)
149 |         tr_data, val_data = train_test_split(train_data,
150 |                                             test_size=0.25,
151 |                                             shuffle=True,
152 |                                             random_state=rand_seed)
153 | 
154 |         tr_ds = RNASeqData(np.array(tr_data.X))
155 |         val_ds = RNASeqData(np.array(val_data.X))
156 | 
157 |         # load annotations
158 |         membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
159 |                                             symbols,
160 |                                             min_genes=13
161 |                                         ).astype(bool).T
162 | 
163 |         ##
164 |         ## train model
165 |         ##
166 | 
167 |         # initialize base model
168 |         basePMVAE = pmVAEModel(membership_mask.values,
169 |                                 [12],
170 |                                 1,
171 |                                 beta=1e-05,
172 |                                 terms=membership_mask.index,
173 |                                 add_auxiliary_module=True
174 |                             )
175 | 
176 |         
177 |         if first_run: # first run 
178 |                         
179 |             top_ig = pd.DataFrame(index=basePMVAE.latent_space_names())
180 |             top_lr = pd.DataFrame(index=basePMVAE.latent_space_names())
181 |             first_run = False 
182 |         
183 |         # train
184 |         
185 |         start_train = time.time()
186 |         basePMVAE.train(tr_ds, val_ds, 
187 |                         checkpoint_path='saved_models/' + dataset + '_' + args.gene_prog + '.pkl',
188 |                         max_epochs=100)
189 |         
190 |         end_train = time.time()
191 |         train_times.append(end_train - start_train)
192 | 
193 |         basePMVAE.set_gpu(False)
194 | 
195 | 
196 |         # IG pathway rankings
197 |         print("Calc IG score")
198 |         
199 |         start_ig = time.time()
200 |         
201 |         def model_loss_wrapper(z):
202 |             module_outputs = basePMVAE.model.decoder_net(z)
203 |             global_recon = basePMVAE.model.merge(module_outputs)
204 |             return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
205 | 
206 |         ground_truth = torch.tensor(np.array(val_data.X)).float()
207 |         outs = basePMVAE.model(ground_truth)
208 |         
209 |         input_data = outs.z
210 |         baseline_data = torch.zeros(outs.z.shape[1])
211 |         baseline_data.requires_grad = True
212 | 
213 |         explainer = PathExplainerTorch(model_loss_wrapper)
214 |         attributions = explainer.attributions(input_data,
215 |                                               baseline=baseline_data,
216 |                                               num_samples=200,
217 |                                               use_expectation=False)
218 | 
219 |         np_attribs = attributions.detach().numpy()
220 |         top_ig[rand_seed] = np_attribs.mean(0)
221 |         
222 |         end_ig = time.time()
223 |         ig_times.append(end_ig - start_ig)
224 | 
225 |         # so far! 
226 |         top_ig.to_csv(save_path + dataset + '_ig.csv', index=False)
227 |         
228 |         
229 |         # LR pathway rankings
230 |         print("Calc LR score")
231 |         start_lr = time.time()
232 | 
233 |         
234 |         if args.dataset == 'kang' or args.dataset == 'datlinger':
235 |             y_tr = tr_data.obs['condition']
236 |             y_val = val_data.obs['condition']
237 | 
238 |             train_labels = (y_tr == 'stimulated').values
239 |             val_labels = (y_val == 'stimulated').values
240 | 
241 | 
242 |         if args.dataset == 'mcfarland':
243 | 
244 |             y_tr = tr_data.obs['TP53_mutation_status']
245 |             y_val = val_data.obs['TP53_mutation_status']
246 | 
247 |             train_labels = (y_tr == 'Wild Type').values
248 |             val_labels = (y_val == 'Wild Type').values
249 | 
250 | 
251 |         if args.dataset == 'haber':
252 |             y_tr = tr_data.obs['condition']
253 |             y_val = val_data.obs['condition']
254 | 
255 |             train_labels = (y_tr == 'Control').values
256 |             val_labels = (y_val == 'Control').values
257 | 
258 |         if args.dataset == 'grubman': 
259 |             y_tr = tr_data.obs['batchCond']
260 |             y_val = val_data.obs['batchCond']
261 | 
262 |             train_labels = (y_tr == 'ct').values
263 |             val_labels = (y_val == 'ct').values
264 | 
265 | 
266 |         if args.dataset == 'zheng': 
267 |             y_tr = tr_data.obs['condition']
268 |             y_val = val_data.obs['condition']
269 | 
270 |             train_labels = (y_tr == 'healthy').values
271 |             val_labels = (y_val == 'healthy').values
272 |             
273 |         
274 |         if args.dataset == 'norman': 
275 |             y_tr = tr_data.obs['gene_program']
276 |             y_val = val_data.obs['gene_program']
277 | 
278 |             train_labels = (y_tr == 'Ctrl').values
279 |             val_labels = (y_val == 'Ctrl').values
280 |         
281 |         train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float()).z.detach().numpy()
282 |         val_embedding = basePMVAE.model(torch.tensor(val_data.X).float()).z.detach().numpy()
283 | 
284 |         lr_scores = []
285 |         for pathway in range(train_embedding.shape[1]):
286 |             clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
287 |             lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
288 |             
289 |         
290 |         top_lr[rand_seed] = lr_scores
291 |         top_lr[rand_seed] = -1.*top_lr[rand_seed]
292 |         
293 |         end_lr = time.time()
294 |         lr_times.append(end_lr - start_lr)
295 | 
296 | 
297 |         # so far! 
298 |         top_lr.to_csv(save_path + dataset + '_lr.csv', index=False)
299 | 
300 |         times = pd.DataFrame()
301 |         times['ig_times'] = ig_times
302 |         times['lr_times'] = lr_times
303 |         times['train_times'] = train_times
304 | 
305 |         times.to_csv(save_path + args.dataset + '_times.csv')
306 | 
# Run the experiment only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()    


--------------------------------------------------------------------------------
/pathexplainer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import functools
  3 | import operator
  4 | import torch
  5 | from torch.autograd import grad
  6 | import numpy as np
  7 | from tqdm import *
  8 | 
  9 | def gather_nd(params, indices):
 10 |     """
 11 |     Args:
 12 |         params: Tensor to index
 13 |         indices: k-dimension tensor of integers. 
 14 |     Returns:
 15 |         output: 1-dimensional tensor of elements of ``params``, where
 16 |             output[i] = params[i][indices[i]]
 17 |             
 18 |             params   indices   output
 19 |             1 2       1 1       4
 20 |             3 4       2 0 ----> 5
 21 |             5 6       0 0       1
 22 |     """
 23 |     max_value = functools.reduce(operator.mul, list(params.size())) - 1
 24 |     indices = indices.t().long()
 25 |     ndim = indices.size(0)
 26 |     idx = torch.zeros_like(indices[0]).long()
 27 |     m = 1
 28 | 
 29 |     for i in range(ndim)[::-1]:
 30 |         idx += indices[i]*m
 31 |         m *= params.size(i)
 32 | 
 33 |     idx[idx < 0] = 0
 34 |     idx[idx > max_value] = 0
 35 |     return torch.take(params, idx)
 36 | 
 37 | class PathExplainerTorch(object):
 38 |     def __init__(self, model):
 39 |         self.model = model
 40 |         return
 41 |     
 42 |     def _get_ref_tensor(self,baseline,batch_size,num_samples):
 43 |         number_to_draw = num_samples * batch_size
 44 |         replace = baseline.shape[0] < number_to_draw
 45 |         sample_indices = np.random.choice(baseline.shape[0],
 46 |                                           size=number_to_draw,
 47 |                                           replace=replace)
 48 |         ref_tensor = baseline[sample_indices,:]
 49 |         
 50 |         return ref_tensor
 51 | 
 52 |     def _get_samples_input(self, input_tensor, baseline, 
 53 |                            num_samples, use_expectation):
 54 |         '''
 55 |         calculate interpolation points
 56 |         Args:
 57 |             input_tensor: Tensor of shape (batch, ...), where ... indicates
 58 |                           the input dimensions. 
 59 |             reference_tensor: A tensor of shape (batch, k, ...) where ... 
 60 |                 indicates dimensions, and k represents the number of background 
 61 |                 reference samples to draw per input in the batch.
 62 |         Returns: 
 63 |             samples_input: A tensor of shape (batch, k, ...) with the 
 64 |                 interpolated points between input and ref.
 65 |             samples_delta: A tensor of shape (batch, 1, ...) with the
 66 |                 difference between input and reference for each sample
 67 |         '''
 68 |         input_dims = list(input_tensor.size())[1:]
 69 |         num_input_dims = len(input_dims)
 70 |         batch_size = input_tensor.size()[0]
 71 |         
 72 |         if use_expectation:
 73 |             reference_tensor = self._get_ref_tensor(baseline,batch_size,num_samples)
 74 |             shape = reference_tensor.shape
 75 |             reference_tensor = reference_tensor.view(
 76 |                     batch_size, 
 77 |                     num_samples, 
 78 |                     *(shape[1:]))
 79 |             
 80 |             # Grab a [batch_size, k]-sized interpolation sample
 81 |             t_tensor = torch.FloatTensor(batch_size, num_samples).uniform_(0,1).to(reference_tensor.device)
 82 |             shape = [batch_size, num_samples] + [1] * num_input_dims
 83 |             interp_coef = t_tensor.view(*shape)
 84 | 
 85 |             # Evaluate the end points
 86 |             end_point_ref = (1.0 - interp_coef) * reference_tensor
 87 | 
 88 |             input_expand_mult = input_tensor.unsqueeze(1)
 89 |             end_point_input = interp_coef * input_expand_mult
 90 | 
 91 |             # Affine Combine
 92 |             samples_input = end_point_input + end_point_ref
 93 |             
 94 |         else:
 95 |             batch_size = input_tensor.size()[0]
 96 |             input_expand = input_tensor.unsqueeze(1)
 97 |             reps = np.ones(len(baseline.shape)).astype(int)
 98 |             reps[0] = batch_size
 99 |             reference_tensor = baseline.repeat(list(reps)).unsqueeze(1)
100 | #             reference_tensor = torch.as_tensor(sampled_baseline).unsqueeze(1).to(baseline.device)
101 |             scaled_inputs = [reference_tensor + (float(i)/num_samples)*(input_expand - reference_tensor) \
102 |                              for i in range(0,num_samples+1)]
103 |             samples_input = torch.cat(scaled_inputs,dim=1)
104 |         
105 |         samples_delta = self._get_samples_delta(input_tensor, reference_tensor)
106 |         samples_delta = samples_delta.to(samples_input.device)
107 |         
108 |         return samples_input, samples_delta
109 |     
110 |     def _get_samples_delta(self, input_tensor, reference_tensor):
111 |         input_expand_mult = input_tensor.unsqueeze(1)
112 |         sd = input_expand_mult - reference_tensor
113 |         return sd
114 |     
115 |     def _get_grads(self, samples_input, output_indices=None):
116 | 
117 |         grad_tensor = torch.zeros(samples_input.shape).float().to(samples_input.device)
118 |         
119 |         k_ = samples_input.shape[1]
120 | 
121 |         for i in range(k_):
122 |             particular_slice = samples_input[:,i]
123 |             batch_output = self.model(particular_slice)
124 |             # should check that users pass in sparse labels
125 |             # Only look at the user-specified label
126 |             if batch_output.size(1) > 1:
127 |                 sample_indices = torch.arange(0,batch_output.size(0)).to(samples_input.device)
128 |                 indices_tensor = torch.cat([
129 |                         sample_indices.unsqueeze(1), 
130 |                         output_indices.unsqueeze(1)], dim=1)
131 |                 batch_output = gather_nd(batch_output, indices_tensor)
132 | 
133 |             model_grads = grad(
134 |                     outputs=batch_output,
135 |                     inputs=particular_slice,
136 |                     grad_outputs=torch.ones_like(batch_output).to(samples_input.device),
137 |                     create_graph=True)
138 |             grad_tensor[:,i,:] = model_grads[0]
139 |         return grad_tensor
140 |            
141 |     def attributions(self, input_tensor, baseline,
142 |                      num_samples = 50, use_expectation=True, 
143 |                      output_indices=None):
144 |         """
145 |         Calculate either Expected or Integrated Gradients approximation of 
146 |         Aumann-Shapley values for the sample ``input_tensor``.
147 |         Args:
148 |             model (torch.nn.Module): Pytorch neural network model for which the
149 |                 output should be explained.
150 |             input_tensor (torch.Tensor): Pytorch tensor representing the input
151 |                 to be explained.
152 |             baseline (torch.Tensor): Pytorch tensor representing the baseline.
153 |                 If use_expectation is true, then baseline should be shape
154 |                 (num_refs, ...) where ... indicates the dimensionality
155 |                 of the input. Otherwise, baseline should be shape (1, ...).
156 |             output_indices (optional, default=None): For multi-class prediction
157 |         """
158 |         equal_dims = baseline.shape[1:] == input_tensor.shape[1:]
159 |         almost_equal_dims = baseline.shape == input_tensor.shape[1:]
160 |         
161 |         dev = input_tensor.device
162 |         baseline = baseline.to(dev)
163 |         
164 |         input_tensor.requires_grad_ = True
165 |         
166 |         if use_expectation and not equal_dims:
167 |             raise ValueError('baseline should be shape (num_refs, ...) \
168 |                               where ... indicates the dimensionality   \
169 |                               of the input')
170 |             
171 |         if not use_expectation and baseline.shape[0] != 1:
172 |             if almost_equal_dims:
173 |                 baseline = baseline.unsqueeze(0)
174 |             else:
175 |                 raise ValueError('baseline should be shape (...)           \
176 |                                   where ... indicates the dimensionality   \
177 |                                   of the input')
178 |         
179 |         samples_input, samples_delta = self._get_samples_input(input_tensor, baseline,
180 |                                                                num_samples, use_expectation)
181 |         grad_tensor = self._get_grads(samples_input, output_indices)
182 |         mult_grads = samples_delta * grad_tensor
183 |         attributions = mult_grads.mean(1)
184 |         
185 |         return attributions
186 |     
187 |     def interactions(self, input_tensor, baseline,
188 |                      num_samples=50, use_expectation=True,
189 |                      output_indices=None, interaction_index=None,
190 |                      verbose=True):
191 |         """
192 |         samples_input: A tensor of shape (batch, k, features)
193 |         ig_tensor: also size (batch, k, features), but contains IG values
194 |         
195 |         """
196 |         
197 |         if len(input_tensor.shape) != 2:
198 |             raise ValueError('PyTorch Explainer only supports ' + \
199 |                              'interaction for 2D input tensors!')
200 |         
201 |         equal_dims = baseline.shape[1:] == input_tensor.shape[1:]
202 |         almost_equal_dims = baseline.shape == input_tensor.shape[1:]
203 |         
204 |         if use_expectation and not equal_dims:
205 |             raise ValueError('baseline should be shape (num_refs, ...) \
206 |                               where ... indicates the dimensionality   \
207 |                               of the input')
208 |             
209 |         if not use_expectation and baseline.shape[0] != 1:
210 |             if almost_equal_dims:
211 |                 baseline = baseline.unsqueeze(0)
212 |             else:
213 |                 raise ValueError('baseline should be shape (...)           \
214 |                                   where ... indicates the dimensionality   \
215 |                                   of the input')
216 |         
217 |         inner_loop_nsamples = int(round(np.sqrt(num_samples)))
218 |         
219 |         samples_input, samples_delta = self._get_samples_input(input_tensor, baseline,
220 |                                                                inner_loop_nsamples, use_expectation)
221 |         
222 |         if interaction_index is not None:
223 |             interaction_mult_tensor = torch.zeros([input_tensor.size(0), samples_input.size(1), input_tensor.size(1)])
224 |         else:
225 |             interaction_mult_tensor = torch.zeros([input_tensor.size(0), samples_input.size(1), 
226 |                                                    input_tensor.size(1), input_tensor.size(1)])
227 |             
228 |         ig_tensor = torch.zeros(samples_input.shape).float()
229 |         
230 |         if use_expectation:
231 |             loop_num = inner_loop_nsamples
232 |         else:
233 |             loop_num = inner_loop_nsamples + 1
234 |         
235 |         if verbose:
236 |             iterable = tqdm(range(loop_num))
237 |         else:
238 |             iterable = range(loop_num)
239 |         
240 |         for i in iterable:
241 |             
242 |             particular_slice = samples_input[:,i]
243 |             ig_tensor[:,i,:] = self.attributions(particular_slice, baseline,
244 |                                                  num_samples=inner_loop_nsamples, use_expectation=use_expectation,
245 |                                                  output_indices=output_indices)
246 |             
247 |             if interaction_index is not None:
248 |                 second_grads = grad(
249 |                         outputs=ig_tensor[:,i,interaction_index],
250 |                         inputs=particular_slice,
251 |                         grad_outputs=torch.ones_like(ig_tensor[:,i,interaction_index]),
252 |                         create_graph=True)[0]
253 |                 interaction_mult_tensor[:,i,:] = second_grads
254 | 
255 |             else:
256 |                 for feature in range(input_tensor.size(1)):
257 |                     second_grads = grad(
258 |                         outputs=ig_tensor[:,i,feature],
259 |                         inputs=particular_slice,
260 |                         grad_outputs=torch.ones_like(ig_tensor[:,i,feature]),
261 |                         create_graph=True)[0]
262 |                     interaction_mult_tensor[:,i,feature,:] = second_grads
263 | 
264 |         interaction_mult_tensor = interaction_mult_tensor.to(samples_delta.device)
265 |         if interaction_index is not None:
266 |             interaction_tensor = interaction_mult_tensor * samples_delta
267 |         else:
268 |             interaction_tensor = interaction_mult_tensor * samples_delta.unsqueeze(2)
269 |         interactions = interaction_tensor.mean(1)
270 |         
271 |         return interactions


--------------------------------------------------------------------------------
/standard_VAE_impute_benchmark.py:
--------------------------------------------------------------------------------
  1 | # impute benchmark on standard VAE
  2 | 
  3 | import anndata
  4 | import numpy as np
  5 | import pandas as pd
  6 | import torch
  7 | import torch.nn.functional as F
  8 | import os
  9 | import mygene
 10 | 
 11 | from utils import load_annotations
 12 | from sklearn.model_selection import train_test_split
 13 | 
 14 | from torch.utils.data import Dataset, DataLoader
 15 | from datasets import RNASeqData
 16 | 
 17 | import argparse
 18 | 
 19 | from pathexplainer import PathExplainerTorch
 20 | from sklearn.linear_model import LogisticRegression
 21 | 
 22 | from models import VAEModel
 23 | import time 
 24 | 
 25 | import os
 26 | 
# NOTE(review): presumably the output path prefix for this benchmark's result
# files -- no use of it is visible in this part of the file; confirm in main().
save_path = 'new_for_revision/new_res/dense/'
 28 | 
 29 | def main():
 30 |     
 31 |     # get dataset, removal method
 32 |     parser = argparse.ArgumentParser()
 33 |     parser.add_argument('dataset', action="store", default='kang')
 34 |     parser.add_argument('removal', action="store", default='impute')
 35 |     parser.add_argument('which_gpu', action="store", default='0')
 36 | 
 37 |     args = parser.parse_args()
 38 |     
 39 |     os.environ["CUDA_VISIBLE_DEVICES"]=args.which_gpu
 40 | 
 41 |     
 42 |     # load datlinger data 
 43 |     if args.dataset == 'datlinger':
 44 |         
 45 |         data = anndata.read('data/datlinger_pp.h5ad')
 46 |         symbols = data.var_names
 47 |     
 48 |     
 49 |      # load kang data
 50 |     if args.dataset == 'kang':
 51 |         
 52 |         data = anndata.read('data/kang_count.h5ad')
 53 |         symbols = data.var_names
 54 |                 
 55 |     
 56 |     # load mcfarland data
 57 |     if args.dataset == 'mcfarland':
 58 |         
 59 |         data = anndata.read('/projects/leelab/data/single-cell/mcfarland_2020_Idasanutlin/preprocessed/adata_top_2000_genes_tc.h5ad')
 60 |         data = data[data.obs['condition'] == 'Idasanutlin'].copy() 
 61 |         symbols = data.var_names
 62 |             
 63 | 
 64 |     # load zheng data 
 65 |     if args.dataset == 'zheng':
 66 |         data = anndata.read('/projects/leelab/data/single-cell/zheng_2017/preprocessed/adata_top_2000_genes.h5ad')
 67 | 
 68 |         # convert ENSG IDs to gene symbols: 
 69 |         
 70 |         mg = mygene.MyGeneInfo()
 71 |         geneList = data.var_names
 72 |         geneSyms = mg.querymany(geneList , scopes='ensembl.gene', fields='symbol', species='human', returnall=True)
 73 | 
 74 |         symbols = []
 75 |         not_in = []
 76 |         is_in = []
 77 |         for k in range(2000):
 78 |             if ('symbol' in geneSyms['out'][k]):  
 79 |                 symbols += [geneSyms['out'][k]['symbol']]
 80 |                 is_in += [geneSyms['out'][k]['query']]
 81 |             else:
 82 |                 not_in += [geneSyms['out'][k]['query']]
 83 |         symbols = pd.Index(symbols)
 84 |         
 85 |         symbols = pd.Index(set(symbols.to_numpy()))
 86 | 
 87 |         # filter out post transplant
 88 |         data = data[data.obs['condition'] != 'post_transplant'][:,is_in].copy() 
 89 |         
 90 |             
 91 |     # load haber data
 92 |     if args.dataset == 'haber':
 93 |         
 94 |         data = anndata.read('/projects/leelab/data/single-cell/haber_2017/preprocessed/adata_top_2000_genes.h5ad')
 95 |         
 96 |         # filter out H poly 
 97 |         data = data[data.obs['condition'] != 'Salmonella'].copy()
 98 |        
 99 |         symbols = data.var_names
100 |     
101 |         
102 | 
103 |     # load grubman data 
104 |     if args.dataset == 'grubman':
105 |         
106 |         data = anndata.read('/projects/leelab/data/single-cell/grubman_2019/preprocessed/adata_top_2000_genes.h5ad')
107 |        
108 |         symbols = data.var_names
109 |     
110 |       # for all datasets 
111 |     data.varm['I'] = load_annotations(
112 |         'data/c2.cp.reactome.v7.4.symbols.gmt',
113 |         symbols,
114 |         min_genes=33
115 |     ).values
116 |     data.uns['terms'] = list(load_annotations(
117 |         'data/c2.cp.reactome.v7.4.symbols.gmt',
118 |         symbols,
119 |         min_genes=33
120 |     ).columns)
121 |     
122 |     number_of_pathways = 20
123 |     number_of_replicates = 10
124 |     
125 |     logvar_results = np.zeros((number_of_replicates,number_of_pathways))
126 |     ig_results = np.zeros((number_of_replicates,number_of_pathways))
127 |     lr_results = np.zeros((number_of_replicates,number_of_pathways))
128 |     kld_results = np.zeros((number_of_replicates,number_of_pathways))
129 |     rand_results = np.zeros((number_of_replicates,number_of_pathways))
130 |     
131 |     logvar_times = []
132 |     ig_times = []
133 |     lr_times = []
134 |     kld_times = []
135 |     rand_times = []
136 |     
137 |     # for 10 experimental replicates
138 |     for rand_seed in range(number_of_replicates):
139 |         
140 |         print("replicate number " + str(rand_seed))
141 |         
142 |         # split data
143 |         
144 |         train_data, test_data = train_test_split(data,
145 |                                                 test_size=0.25,
146 |                                                 shuffle=True,
147 |                                                 random_state=rand_seed)
148 |         tr_data, val_data = train_test_split(train_data,
149 |                                             test_size=0.25,
150 |                                             shuffle=True,
151 |                                             random_state=rand_seed)
152 |         
153 |         tr_ds = RNASeqData(np.array(tr_data.X))
154 |         val_ds = RNASeqData(np.array(val_data.X))
155 |         
156 |         # load annotations
157 |        
158 |     
159 |         membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
160 |                                             symbols,
161 |                                             min_genes=13
162 |         
163 |         ##
164 |         ## train base model
165 |         ##
166 |         
167 | 
168 |         # initialize base model
169 |         basePMVAE = VAEModel(n_features=tr_data.X.shape[1],
170 |                                 hidden_layers=[12*n_pathways, n_pathways],
171 |                                 beta=1e-05,
172 |                                 add_auxiliary_module=False
173 |                             )
174 |         
175 |         
176 |         print(basePMVAE.model)
177 |         
178 |         # train
179 |         basePMVAE.train(tr_ds, val_ds, 
180 |                         checkpoint_path='saved_models/dense/'+args.dataset + '_' + args.removal +'_baseModel.pkl',
181 |                         max_epochs=100)
182 |         
183 |         basePMVAE.set_gpu(False)
184 |         
185 |         ##
186 |         ## get pathway rankings
187 |         ##
188 |         top_features = pd.DataFrame(index=data.uns['terms'])
189 |         
190 |         ## get max val logvar
191 |         
192 |         print("Calc max val score")
193 |         
194 |         ground_truth = torch.tensor(np.array(val_data.X)).float()
195 |         outs = basePMVAE.model(ground_truth)
196 |         
197 |         start_logvar= time.time()
198 |         
199 |         top_features['logvar'] = -1.*outs.logvar.mean(0).detach().numpy()
200 |         
201 |         end_logvar= time.time()
202 |         logvar_times.append(end_logvar-start_logvar)
203 | 
204 |         # IG pathway rankings
205 |         print("Calc IG score")
206 |         start_ig = time.time()
207 |         
208 |         def model_loss_wrapper(z):
209 |             module_outputs = basePMVAE.model.decoder_net(z)
210 |             
211 |             global_recon = module_outputs
212 |             #global_recon = basePMVAE.model.merge(module_outputs)
213 |             
214 |             return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
215 |         
216 |         input_data = outs.z
217 |         baseline_data = torch.zeros(outs.z.shape[1])
218 |         baseline_data.requires_grad = True
219 |         
220 |         explainer = PathExplainerTorch(model_loss_wrapper)
221 |         attributions = explainer.attributions(input_data,
222 |                                               baseline=baseline_data,
223 |                                               num_samples=200, #200
224 |                                               use_expectation=False)
225 |         
226 |         np_attribs = attributions.detach().numpy()
227 |         top_features['IG'] = np_attribs.mean(0)
228 |                 
229 |         end_ig = time.time()
230 |         ig_times.append(end_ig - start_ig)
231 |         
232 |         
233 |         # LR pathway rankings
234 |         print("Calc LR score")
235 |         start_lr = time.time()
236 |                             
237 |         if args.dataset == 'kang':
238 |             print('here')
239 |             y_tr = tr_data.obs['condition']
240 |             y_val = val_data.obs['condition']
241 |                             
242 |             train_labels = (y_tr == b'stimulated').values
243 |             val_labels = (y_val == b'stimulated').values
244 |             
245 |             print(train_labels.shape)
246 |             print(train_labels.sum())
247 |             
248 |             print(val_labels.shape)
249 |             print(val_labels.sum())
250 |             
251 |             print(tr_data.obs['condition'])
252 |             
253 |             
254 |         if args.dataset == 'datlinger':
255 |             y_tr = tr_data.obs['condition']
256 |             y_val = val_data.obs['condition']
257 |                             
258 |             train_labels = (y_tr == 'stimulated').values
259 |             val_labels = (y_val == 'stimulated').values
260 |        
261 |                             
262 |         if args.dataset == 'mcfarland':
263 |             
264 |             y_tr = tr_data.obs['TP53_mutation_status']
265 |             y_val = val_data.obs['TP53_mutation_status']
266 |                             
267 |             train_labels = (y_tr == 'Wild Type').values
268 |             val_labels = (y_val == 'Wild Type').values
269 |             
270 |             
271 |         if args.dataset == 'haber':
272 |             y_tr = tr_data.obs['condition']
273 |             y_val = val_data.obs['condition']
274 |                             
275 |             train_labels = (y_tr == 'Control').values
276 |             val_labels = (y_val == 'Control').values
277 |             
278 |         if args.dataset == 'grubman': 
279 |             y_tr = tr_data.obs['batchCond']
280 |             y_val = val_data.obs['batchCond']
281 |                             
282 |             train_labels = (y_tr == 'ct').values
283 |             val_labels = (y_val == 'ct').values
284 |             
285 | 
286 |         if args.dataset == 'zheng': 
287 |             y_tr = tr_data.obs['condition']
288 |             y_val = val_data.obs['condition']
289 |                             
290 |             train_labels = (y_tr == 'healthy').values
291 |             val_labels = (y_val == 'healthy').values
292 |         
293 |         train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float()).z.detach().numpy()
294 |         val_embedding = basePMVAE.model(torch.tensor(val_data.X).float()).z.detach().numpy()
295 |         
296 |         lr_scores = []
297 |         for pathway in range(train_embedding.shape[1]):
298 |             clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
299 |             lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
300 |             
301 |         top_features['lr_score'] = lr_scores
302 |         top_features['lr_score'] = -1.*top_features['lr_score']
303 |         
304 |         end_lr = time.time()
305 |         lr_times.append(end_lr - start_lr)
306 |         
307 |         
308 |         # KLD pathway rankings
309 |         print("Calc KLD")
310 |         start_kld = time.time()
311 |         
312 |         pathway_kld = (-0.5 * (1 + outs.logvar - outs.mu.pow(2) - outs.logvar.exp()).mean(0)).detach().numpy()
313 |         top_features['kld'] = -1.*pathway_kld
314 |         
315 |         end_kld = time.time()
316 |         kld_times.append(end_kld - start_kld)
317 |         
318 |         # Random pathway rankings
319 |         print("Calc Random")
320 |         np.random.seed(rand_seed)
321 |         top_features['rand'] = np.random.randn(top_features.shape[0])
322 |         
323 |              
324 |         times = pd.DataFrame()
325 |         times['logvar_times'] = logvar_times
326 |         times['ig_times'] = ig_times
327 |         times['lr_times'] = lr_times
328 |         times['kld_times'] = kld_times
329 | 
330 |         times.to_csv(save_path + args.dataset + '_times.csv')
331 | 
332 |         
333 |         # impute 
334 |         def impute_benchmark(method,n_pathways=20):
335 |             method_recons_errors = []
336 | 
337 |             # for top 20 pathways 
338 |             for i in range(1,1+n_pathways):
339 | 
340 |                 # set pathways = 0.
341 |                 test_matrix = torch.tensor(test_data.X).float()
342 |                 test_matrix_embedded = basePMVAE.model(test_matrix).z
343 |                 for x in top_features.sort_values(method).index[:i]:
344 |                     index_to_zero = list(top_features.index).index(x)
345 |                     test_matrix_embedded[:,index_to_zero] = 0.
346 | 
347 |                 module_outputs = basePMVAE.model.decoder_net(test_matrix_embedded)
348 |                 
349 |                 global_recon = module_outputs
350 |                                            
351 |                 recons_error = F.mse_loss(global_recon, test_matrix).detach().item()
352 |                 method_recons_errors.append(recons_error)
353 |             return method_recons_errors
354 | 
355 |         # run impute 
356 |         if args.removal == "impute": 
357 |             print("Impute Logvar")
358 |             logvar_results[rand_seed,:] = impute_benchmark('logvar')
359 |             print("Impute IG")
360 |             ig_results[rand_seed,:] = impute_benchmark('IG')
361 |             print("Impute LR")
362 |             lr_results[rand_seed,:] = impute_benchmark('lr_score')
363 |             print("Impute KLD")
364 |             kld_results[rand_seed,:] = impute_benchmark('kld')
365 |             print("Impute RAND")
366 |             rand_results[rand_seed,:] = impute_benchmark('rand')
367 |             
368 |                     
369 |         # save results every iteration so that if it crashes
370 |         # there's at least some progress
371 |         with open('{}/{}_{}_logvar.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
372 |             np.save(f, logvar_results)
373 |         with open('{}/{}_{}_ig.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
374 |             np.save(f, ig_results)
375 |         with open('{}/{}_{}_lr.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
376 |             np.save(f, lr_results)
377 |         with open('{}/{}_{}_kld.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
378 |             np.save(f, kld_results)
379 |         with open('{}/{}_{}_rand.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
380 |             np.save(f, rand_results)
381 |     
382 | if __name__ == '__main__':
383 |     main()    
384 | 


--------------------------------------------------------------------------------
/benchmark_pmvae.py:
--------------------------------------------------------------------------------
  1 | import anndata
  2 | import numpy as np
  3 | import pandas as pd
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import os
  7 | import mygene
  8 | 
  9 | from utils import load_annotations
 10 | from sklearn.model_selection import train_test_split
 11 | 
 12 | from torch.utils.data import Dataset, DataLoader
 13 | from datasets import RNASeqData
 14 | 
 15 | import argparse
 16 | 
 17 | from pathexplainer import PathExplainerTorch
 18 | from sklearn.linear_model import LogisticRegression
 19 | 
 20 | from models import pmVAEModel
 21 | import time 
 22 | 
 23 | import os
 24 | 
 25 | save_path = 'new_for_revision/new_res/'
 26 | 
 27 | def main():
 28 |     
 29 |     # get dataset, removal method
 30 |     parser = argparse.ArgumentParser()
 31 |     parser.add_argument('dataset', action="store", default='kang')
 32 |     parser.add_argument('removal', action="store", default='impute')
 33 |     parser.add_argument('which_gpu', action="store", default='0')
 34 | 
 35 |     args = parser.parse_args()
 36 |     
 37 |     os.environ["CUDA_VISIBLE_DEVICES"]=args.which_gpu
 38 | 
 39 |     # load datlinger data 
 40 |     if args.dataset == 'datlinger':
 41 |         
 42 |         data = anndata.read('data/datlinger_pp.h5ad')
 43 |         symbols = data.var_names
 44 |         
 45 |     # load norman data  
 46 |     if args.dataset == 'norman':
 47 |         data = anndata.read('/projects/leelab/data/single-cell/norman_2019/preprocessed/adata_top_2000_genes_tc.h5ad')
 48 |         data = data[(data.obs['gene_program'] == 'Ctrl') | (data.obs['gene_program'] == 'Granulocyte/apoptosis')].copy()
 49 |     
 50 |         test_df = pd.DataFrame(index=data.var['gene_name'])
 51 |         symbols = test_df.index
 52 |     
 53 |      # load kang data
 54 |     if args.dataset == 'kang':
 55 |         
 56 |         data = anndata.read('data/kang_count.h5ad')
 57 |         symbols = data.var_names
 58 |         
 59 |     
 60 |     # load mcfarland data
 61 |     if args.dataset == 'mcfarland':
 62 | 
 63 |         data = anndata.read('/projects/leelab/data/single-cell/mcfarland_2020_Idasanutlin/preprocessed/adata_top_2000_genes_tc.h5ad')
 64 |         data = data[data.obs['condition'] == 'Idasanutlin'].copy() 
 65 |         symbols = data.var_names
 66 |  
 67 |     # load zheng data 
 68 |     if args.dataset == 'zheng':
 69 |         data = anndata.read('/projects/leelab/data/single-cell/zheng_2017/preprocessed/adata_top_2000_genes.h5ad')
 70 | 
 71 |         # convert ENSG IDs to gene symbols: 
 72 |         
 73 |         mg = mygene.MyGeneInfo()
 74 |         geneList = data.var_names
 75 |         geneSyms = mg.querymany(geneList , scopes='ensembl.gene', fields='symbol', species='human', returnall=True)
 76 | 
 77 |         symbols = []
 78 |         not_in = []
 79 |         is_in = []
 80 |         for k in range(2000):
 81 |             if ('symbol' in geneSyms['out'][k]):  
 82 |                 symbols += [geneSyms['out'][k]['symbol']]
 83 |                 is_in += [geneSyms['out'][k]['query']]
 84 |             else:
 85 |                 not_in += [geneSyms['out'][k]['query']]
 86 |         symbols = pd.Index(symbols)
 87 |         
 88 |         symbols = pd.Index(set(symbols.to_numpy()))
 89 | 
 90 |         # filter out post transplant
 91 |         data = data[data.obs['condition'] != 'post_transplant'][:,is_in].copy()  
 92 |             
 93 |     # load haber data
 94 |     if args.dataset == 'haber':
 95 |         
 96 |         data = anndata.read('/projects/leelab/data/single-cell/haber_2017/preprocessed/adata_top_2000_genes.h5ad')
 97 |         
 98 |         # filter out H poly 
 99 |         data = data[data.obs['condition'] != 'Salmonella'].copy()
100 |        
101 |         symbols = data.var_names
102 | 
103 | 
104 |     # load grubman data 
105 |     if args.dataset == 'grubman':
106 |         
107 |         data = anndata.read('/projects/leelab/data/single-cell/grubman_2019/preprocessed/adata_top_2000_genes.h5ad')
108 |        
109 |         symbols = data.var_names
110 |          
111 |         
112 |     # for all datasets 
113 |     data.varm['I'] = load_annotations(
114 |         'data/c2.cp.reactome.v7.4.symbols.gmt',
115 |         symbols,
116 |         min_genes=13
117 |     ).values
118 |     data.uns['terms'] = list(load_annotations(
119 |         'data/c2.cp.reactome.v7.4.symbols.gmt',
120 |         symbols,
121 |         min_genes=13
122 |     ).columns)
123 |     
124 |     top_ig = pd.DataFrame(index=data.uns['terms'])
125 |     top_lr = pd.DataFrame(index=data.uns['terms'])
126 |     
127 |     number_of_pathways = 20
128 |     number_of_replicates = 10
129 |     
130 |     logvar_results = np.zeros((number_of_replicates,number_of_pathways))
131 |     ig_results = np.zeros((number_of_replicates,number_of_pathways))
132 |     lr_results = np.zeros((number_of_replicates,number_of_pathways))
133 |     kld_results = np.zeros((number_of_replicates,number_of_pathways))
134 |     rand_results = np.zeros((number_of_replicates,number_of_pathways))
135 |     
136 |     logvar_times = []
137 |     ig_times = []
138 |     lr_times = []
139 |     kld_times = []
140 |     rand_times = []
141 |     
142 |     # for 10 experimental replicates
143 |     for rand_seed in range(number_of_replicates):
144 |         
145 |         print("replicate number " + str(rand_seed))
146 |         
147 |         # split data
148 |         
149 |         train_data, test_data = train_test_split(data,
150 |                                                 test_size=0.25,
151 |                                                 shuffle=True,
152 |                                                 random_state=rand_seed)
153 |         tr_data, val_data = train_test_split(train_data,
154 |                                             test_size=0.25,
155 |                                             shuffle=True,
156 |                                             random_state=rand_seed)
157 |         
158 |         tr_ds = RNASeqData(np.array(tr_data.X))
159 |         val_ds = RNASeqData(np.array(val_data.X))
160 |         
161 |         # load annotations
162 |         membership_mask = load_annotations('data/c2.cp.reactome.v7.4.symbols.gmt',
163 |                                             symbols,
164 |                                             min_genes=13
165 |                                         ).astype(bool).T
166 |         
167 |         ##
168 |         ## train base model
169 |         ##
170 |         
171 |         # initialize base model
172 |         basePMVAE = pmVAEModel(membership_mask.values,
173 |                                 [12],
174 |                                 1,
175 |                                 beta=1e-05,
176 |                                 terms=membership_mask.index,
177 |                                 add_auxiliary_module=False
178 |                             )
179 |         
180 |         # train
181 |         basePMVAE.train(tr_ds, val_ds, 
182 |                         checkpoint_path=args.dataset + '_' + args.removal +'_baseModel.pkl',
183 |                         max_epochs=100)
184 |         
185 |         basePMVAE.set_gpu(False)
186 |         
187 |         ##
188 |         ## get pathway rankings
189 |         ##
190 |         top_features = pd.DataFrame(index=data.uns['terms'])
191 |         
192 |         ## get max val logvar
193 |         
194 |         print("Calc max val score")
195 |         
196 |         ground_truth = torch.tensor(np.array(val_data.X)).float()
197 |         outs = basePMVAE.model(ground_truth)
198 |         
199 |         start_logvar= time.time()
200 |         
201 |         top_features['logvar'] = -1.*outs.logvar.mean(0).detach().numpy()
202 |         
203 |         end_logvar= time.time()
204 |         logvar_times.append(end_logvar-start_logvar)
205 | 
206 |         
207 |         # IG pathway rankings
208 |         print("Calc IG score")
209 |         start_ig = time.time()
210 |         
211 |         def model_loss_wrapper(z):
212 |             module_outputs = basePMVAE.model.decoder_net(z)
213 |             global_recon = basePMVAE.model.merge(module_outputs)
214 |             return F.mse_loss(global_recon, ground_truth, reduction='none').mean(1).view(-1,1)
215 |         
216 |         input_data = outs.z
217 |         baseline_data = torch.zeros(outs.z.shape[1])
218 |         baseline_data.requires_grad = True
219 |         
220 |         explainer = PathExplainerTorch(model_loss_wrapper)
221 |         attributions = explainer.attributions(input_data,
222 |                                               baseline=baseline_data,
223 |                                               num_samples=200,
224 |                                               use_expectation=False)
225 |         
226 |         np_attribs = attributions.detach().numpy()
227 |         top_features['IG'] = np_attribs.mean(0)
228 |         
229 |         top_ig[rand_seed] = np_attribs.mean(0)
230 |         
231 |         end_ig = time.time()
232 |         ig_times.append(end_ig - start_ig)
233 |         
234 |         
235 |         
236 |         # LR pathway rankings
237 |         print("Calc LR score")
238 |         start_lr = time.time()
239 |                             
240 |         if args.dataset == 'kang' or args.dataset == 'datlinger':
241 |             y_tr = tr_data.obs['condition']
242 |             y_val = val_data.obs['condition']
243 |                             
244 |             train_labels = (y_tr == 'stimulated').values
245 |             val_labels = (y_val == 'stimulated').values
246 |                             
247 |                             
248 |         if args.dataset == 'mcfarland':
249 |             
250 |             y_tr = tr_data.obs['TP53_mutation_status']
251 |             y_val = val_data.obs['TP53_mutation_status']
252 |                             
253 |             train_labels = (y_tr == 'Wild Type').values
254 |             val_labels = (y_val == 'Wild Type').values
255 |             
256 |             
257 |         if args.dataset == 'haber':
258 |             y_tr = tr_data.obs['condition']
259 |             y_val = val_data.obs['condition']
260 |                             
261 |             train_labels = (y_tr == 'Control').values
262 |             val_labels = (y_val == 'Control').values
263 |             
264 |         if args.dataset == 'grubman': 
265 |             y_tr = tr_data.obs['batchCond']
266 |             y_val = val_data.obs['batchCond']
267 |                             
268 |             train_labels = (y_tr == 'ct').values
269 |             val_labels = (y_val == 'ct').values
270 |             
271 | 
272 |         if args.dataset == 'zheng': 
273 |             y_tr = tr_data.obs['condition']
274 |             y_val = val_data.obs['condition']
275 |                             
276 |             train_labels = (y_tr == 'healthy').values
277 |             val_labels = (y_val == 'healthy').values
278 |             
279 |             
280 |         if args.dataset == 'norman': 
281 |             y_tr = tr_data.obs['gene_program']
282 |             y_val = val_data.obs['gene_program']
283 |                             
284 |             train_labels = (y_tr == 'Ctrl').values
285 |             val_labels = (y_val == 'Ctrl').values
286 |         
287 |     
288 |         train_embedding = basePMVAE.model(torch.tensor(tr_data.X).float()).z.detach().numpy()
289 |         val_embedding = basePMVAE.model(torch.tensor(val_data.X).float()).z.detach().numpy()
290 |         
291 |         lr_scores = []
292 |         for pathway in range(train_embedding.shape[1]):
293 |             clf = LogisticRegression(random_state=0).fit(train_embedding[:,pathway].reshape(-1,1), train_labels)
294 |             lr_scores.append(clf.score(val_embedding[:,pathway].reshape(-1,1), val_labels))
295 |             
296 |         top_features['lr_score'] = lr_scores
297 |         top_features['lr_score'] = -1.*top_features['lr_score']
298 |         
299 |         end_lr = time.time()
300 |         lr_times.append(end_lr - start_lr)
301 |         
302 |         
303 |         # KLD pathway rankings
304 |         print("Calc KLD")
305 |         start_kld = time.time()
306 |         
307 |         pathway_kld = (-0.5 * (1 + outs.logvar - outs.mu.pow(2) - outs.logvar.exp()).mean(0)).detach().numpy()
308 |         top_features['kld'] = -1.*pathway_kld
309 |         
310 |         end_kld = time.time()
311 |         kld_times.append(end_kld - start_kld)
312 |         
313 |         # Random pathway rankings
314 |         print("Calc Random")
315 |         np.random.seed(rand_seed)
316 |         top_features['rand'] = np.random.randn(top_features.shape[0])
317 |              
318 |         times = pd.DataFrame()
319 |         times['logvar_times'] = logvar_times
320 |         times['ig_times'] = ig_times
321 |         times['lr_times'] = lr_times
322 |         times['kld_times'] = kld_times
323 | 
324 |         times.to_csv(save_path + args.dataset + '_times.csv')
325 |         
326 |         # impute or retrain
327 |         def impute_benchmark(method,n_pathways=20):
328 |             method_recons_errors = []
329 | 
330 |             # for top 20 pathways 
331 |             for i in range(1,1+n_pathways):
332 | 
333 |                 # set pathways = 0.
334 |                 test_matrix = torch.tensor(test_data.X).float()
335 |                 test_matrix_embedded = basePMVAE.model(test_matrix).z
336 |                 for x in top_features.sort_values(method).index[:i]:
337 |                     index_to_zero = list(top_features.index).index(x)
338 |                     test_matrix_embedded[:,index_to_zero] = 0.
339 | 
340 |                 module_outputs = basePMVAE.model.decoder_net(test_matrix_embedded)
341 |                 global_recon = basePMVAE.model.merge(module_outputs)
342 |                 recons_error = F.mse_loss(global_recon, test_matrix).detach().item()
343 |                 method_recons_errors.append(recons_error)
344 |             return method_recons_errors
345 |         
346 |         def retrain_benchmark(method,n_pathways=20):
347 |             method_recons_errors = []
348 |             # for top 20 pathways 
349 |             for i in range(1,21):
350 | 
351 |                 # get cumulative pathways
352 |                 A_new=[]
353 |                 for x in top_features.sort_values(method).index[:i]:
354 |                     A_new.append(membership_mask.loc[x,:].values.reshape(1,-1))
355 |                 A_new = np.concatenate(A_new,axis=0)
356 | 
357 |                 reducedVAE = pmVAEModel(
358 |                                 A_new,
359 |                                 [12],
360 |                                 1,
361 |                                 beta=1e-05,
362 |                                 terms=list(range(A_new.shape[0])),
363 |                                 add_auxiliary_module=False
364 |                             )
365 |                 
366 |                 reducedVAE.train(tr_ds, val_ds, checkpoint_path= args.dataset + '_' + args.removal +'_reducedVAE.pkl', max_epochs=50)
367 | 
368 |                 test_matrix = torch.tensor(test_data.X).float().cuda()
369 |                 global_recon = reducedVAE.model(test_matrix).global_recon
370 | 
371 |                 recons_error = F.mse_loss(global_recon, test_matrix).detach().item()
372 |                 method_recons_errors.append(recons_error)
373 |             return method_recons_errors    
374 |         
375 | 
376 |         # run impute or retrain 
377 |         if args.removal == "impute": 
378 |             print("Impute Logvar")
379 |             logvar_results[rand_seed,:] = impute_benchmark('logvar')
380 |             print("Impute IG")
381 |             ig_results[rand_seed,:] = impute_benchmark('IG')
382 |             print("Impute LR")
383 |             lr_results[rand_seed,:] = impute_benchmark('lr_score')
384 |             print("Impute KLD")
385 |             kld_results[rand_seed,:] = impute_benchmark('kld')
386 |             print("Impute RAND")
387 |             rand_results[rand_seed,:] = impute_benchmark('rand')
388 |             
389 |         if args.removal == "retrain":
390 |             print("Retrain Logvar")
391 |             logvar_results[rand_seed,:] = retrain_benchmark('logvar')
392 |             print("Retrain IG")
393 |             ig_results[rand_seed,:] = retrain_benchmark('IG')
394 |             print("Retrain LR")
395 |             lr_results[rand_seed,:] = retrain_benchmark('lr_score')
396 |             print("Retrain KLD")
397 |             kld_results[rand_seed,:] = retrain_benchmark('kld')
398 |             print("Retrain RAND")
399 |             rand_results[rand_seed,:] = retrain_benchmark('rand')
400 |                       
401 |                     
402 |         # save results every iteration so that if it crashes
403 |         # there's at least some progress
404 |         with open('{}/{}_{}_logvar.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
405 |             np.save(f, logvar_results)
406 |         with open('{}/{}_{}_ig.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
407 |             np.save(f, ig_results)
408 |         with open('{}/{}_{}_lr.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
409 |             np.save(f, lr_results)
410 |         with open('{}/{}_{}_kld.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
411 |             np.save(f, kld_results)
412 |         with open('{}/{}_{}_rand.npy'.format(save_path, args.dataset, args.removal), 'wb') as f:
413 |             np.save(f, rand_results)
414 |     
415 |     
416 | if __name__ == '__main__':
417 |     main()    
418 | 


--------------------------------------------------------------------------------
/figures/supplementary_figures/drop_g.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import anndata\n",
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd\n",
 12 |     "\n",
 13 |     "import torch\n",
 14 |     "\n",
 15 |     "import os\n",
 16 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 2,
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "from utils import load_annotations"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "from sklearn.model_selection import train_test_split"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# load data"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 4,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "data = anndata.read('data/kang_count.h5ad')"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 5,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "pathway_ann_matrix = load_annotations(\n",
 60 |     "    'data/c2.cp.reactome.v7.4.symbols.gmt',\n",
 61 |     "    data.var_names,\n",
 62 |     "    min_genes=13\n",
 63 |     ")"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 6,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "['REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION']"
 75 |       ]
 76 |      },
 77 |      "execution_count": 6,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "[x for x in pathway_ann_matrix.columns if 'G2_M_TRANSITION' in x or 'PLK1' in x]"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 7,
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "data": {
 93 |       "text/html": [
 94 |        "<div>\n",
 95 |        "<style scoped>\n",
 96 |        "    .dataframe tbody tr th:only-of-type {\n",
 97 |        "        vertical-align: middle;\n",
 98 |        "    }\n",
 99 |        "\n",
100 |        "    .dataframe tbody tr th {\n",
101 |        "        vertical-align: top;\n",
102 |        "    }\n",
103 |        "\n",
104 |        "    .dataframe thead th {\n",
105 |        "        text-align: right;\n",
106 |        "    }\n",
107 |        "</style>\n",
108 |        "<table border=\"1\" class=\"dataframe\">\n",
109 |        "  <thead>\n",
110 |        "    <tr style=\"text-align: right;\">\n",
111 |        "      <th></th>\n",
112 |        "      <th>REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION</th>\n",
113 |        "    </tr>\n",
114 |        "    <tr>\n",
115 |        "      <th>index</th>\n",
116 |        "      <th></th>\n",
117 |        "    </tr>\n",
118 |        "  </thead>\n",
119 |        "  <tbody>\n",
120 |        "    <tr>\n",
121 |        "      <th>PPP1CB</th>\n",
122 |        "      <td>True</td>\n",
123 |        "    </tr>\n",
124 |        "    <tr>\n",
125 |        "      <th>CLASP1</th>\n",
126 |        "      <td>True</td>\n",
127 |        "    </tr>\n",
128 |        "    <tr>\n",
129 |        "      <th>TUBA4A</th>\n",
130 |        "      <td>True</td>\n",
131 |        "    </tr>\n",
132 |        "    <tr>\n",
133 |        "      <th>CCNB1</th>\n",
134 |        "      <td>True</td>\n",
135 |        "    </tr>\n",
136 |        "    <tr>\n",
137 |        "      <th>TUBB</th>\n",
138 |        "      <td>True</td>\n",
139 |        "    </tr>\n",
140 |        "    <tr>\n",
141 |        "      <th>CUL1</th>\n",
142 |        "      <td>True</td>\n",
143 |        "    </tr>\n",
144 |        "    <tr>\n",
145 |        "      <th>CDK5RAP2</th>\n",
146 |        "      <td>True</td>\n",
147 |        "    </tr>\n",
148 |        "    <tr>\n",
149 |        "      <th>TUBB4B</th>\n",
150 |        "      <td>True</td>\n",
151 |        "    </tr>\n",
152 |        "    <tr>\n",
153 |        "      <th>CDK1</th>\n",
154 |        "      <td>True</td>\n",
155 |        "    </tr>\n",
156 |        "    <tr>\n",
157 |        "      <th>ACTR1A</th>\n",
158 |        "      <td>True</td>\n",
159 |        "    </tr>\n",
160 |        "    <tr>\n",
161 |        "      <th>TUBA1A</th>\n",
162 |        "      <td>True</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>DCTN2</th>\n",
166 |        "      <td>True</td>\n",
167 |        "    </tr>\n",
168 |        "    <tr>\n",
169 |        "      <th>CENPJ</th>\n",
170 |        "      <td>True</td>\n",
171 |        "    </tr>\n",
172 |        "    <tr>\n",
173 |        "      <th>HSP90AA1</th>\n",
174 |        "      <td>True</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>CCNB2</th>\n",
178 |        "      <td>True</td>\n",
179 |        "    </tr>\n",
180 |        "    <tr>\n",
181 |        "      <th>AURKA</th>\n",
182 |        "      <td>True</td>\n",
183 |        "    </tr>\n",
184 |        "  </tbody>\n",
185 |        "</table>\n",
186 |        "</div>"
187 |       ],
188 |       "text/plain": [
189 |        "          REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION\n",
190 |        "index                                                            \n",
191 |        "PPP1CB                                                 True      \n",
192 |        "CLASP1                                                 True      \n",
193 |        "TUBA4A                                                 True      \n",
194 |        "CCNB1                                                  True      \n",
195 |        "TUBB                                                   True      \n",
196 |        "CUL1                                                   True      \n",
197 |        "CDK5RAP2                                               True      \n",
198 |        "TUBB4B                                                 True      \n",
199 |        "CDK1                                                   True      \n",
200 |        "ACTR1A                                                 True      \n",
201 |        "TUBA1A                                                 True      \n",
202 |        "DCTN2                                                  True      \n",
203 |        "CENPJ                                                  True      \n",
204 |        "HSP90AA1                                               True      \n",
205 |        "CCNB2                                                  True      \n",
206 |        "AURKA                                                  True      "
207 |       ]
208 |      },
209 |      "execution_count": 7,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "pathway_ann_matrix[pathway_ann_matrix['REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION']][['REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION']]"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 8,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "true_pathways_list = [x for x in pathway_ann_matrix.columns if 'G2_M_TRANSITION' in x or 'PLK1' in x]\n",
225 |     "drop_pathway_ann_matrix = pathway_ann_matrix.loc[:,~pathway_ann_matrix.columns.isin(true_pathways_list)]"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 9,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "data.varm['annotations'] = drop_pathway_ann_matrix"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 10,
240 |    "metadata": {},
241 |    "outputs": [
242 |     {
243 |      "data": {
244 |       "text/html": [
245 |        "<div>\n",
246 |        "<style scoped>\n",
247 |        "    .dataframe tbody tr th:only-of-type {\n",
248 |        "        vertical-align: middle;\n",
249 |        "    }\n",
250 |        "\n",
251 |        "    .dataframe tbody tr th {\n",
252 |        "        vertical-align: top;\n",
253 |        "    }\n",
254 |        "\n",
255 |        "    .dataframe thead th {\n",
256 |        "        text-align: right;\n",
257 |        "    }\n",
258 |        "</style>\n",
259 |        "<table border=\"1\" class=\"dataframe\">\n",
260 |        "  <thead>\n",
261 |        "    <tr style=\"text-align: right;\">\n",
262 |        "      <th></th>\n",
263 |        "      <th>REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM</th>\n",
264 |        "      <th>REACTOME_INTERFERON_ALPHA_BETA_SIGNALING</th>\n",
265 |        "      <th>REACTOME_INTERFERON_SIGNALING</th>\n",
266 |        "    </tr>\n",
267 |        "    <tr>\n",
268 |        "      <th>index</th>\n",
269 |        "      <th></th>\n",
270 |        "      <th></th>\n",
271 |        "      <th></th>\n",
272 |        "    </tr>\n",
273 |        "  </thead>\n",
274 |        "  <tbody>\n",
275 |        "    <tr>\n",
276 |        "      <th>ISG15</th>\n",
277 |        "      <td>True</td>\n",
278 |        "      <td>True</td>\n",
279 |        "      <td>True</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>MIB2</th>\n",
283 |        "      <td>False</td>\n",
284 |        "      <td>False</td>\n",
285 |        "      <td>False</td>\n",
286 |        "    </tr>\n",
287 |        "    <tr>\n",
288 |        "      <th>PRKCZ</th>\n",
289 |        "      <td>False</td>\n",
290 |        "      <td>False</td>\n",
291 |        "      <td>False</td>\n",
292 |        "    </tr>\n",
293 |        "    <tr>\n",
294 |        "      <th>KCNAB2</th>\n",
295 |        "      <td>False</td>\n",
296 |        "      <td>False</td>\n",
297 |        "      <td>False</td>\n",
298 |        "    </tr>\n",
299 |        "    <tr>\n",
300 |        "      <th>CTNNBIP1</th>\n",
301 |        "      <td>False</td>\n",
302 |        "      <td>False</td>\n",
303 |        "      <td>False</td>\n",
304 |        "    </tr>\n",
305 |        "    <tr>\n",
306 |        "      <th>...</th>\n",
307 |        "      <td>...</td>\n",
308 |        "      <td>...</td>\n",
309 |        "      <td>...</td>\n",
310 |        "    </tr>\n",
311 |        "    <tr>\n",
312 |        "      <th>CYP19A1</th>\n",
313 |        "      <td>False</td>\n",
314 |        "      <td>False</td>\n",
315 |        "      <td>False</td>\n",
316 |        "    </tr>\n",
317 |        "    <tr>\n",
318 |        "      <th>RAP1GAP2</th>\n",
319 |        "      <td>False</td>\n",
320 |        "      <td>False</td>\n",
321 |        "      <td>False</td>\n",
322 |        "    </tr>\n",
323 |        "    <tr>\n",
324 |        "      <th>SSTR2</th>\n",
325 |        "      <td>False</td>\n",
326 |        "      <td>False</td>\n",
327 |        "      <td>False</td>\n",
328 |        "    </tr>\n",
329 |        "    <tr>\n",
330 |        "      <th>BIRC5</th>\n",
331 |        "      <td>True</td>\n",
332 |        "      <td>False</td>\n",
333 |        "      <td>False</td>\n",
334 |        "    </tr>\n",
335 |        "    <tr>\n",
336 |        "      <th>PLCB4</th>\n",
337 |        "      <td>False</td>\n",
338 |        "      <td>False</td>\n",
339 |        "      <td>False</td>\n",
340 |        "    </tr>\n",
341 |        "  </tbody>\n",
342 |        "</table>\n",
343 |        "<p>979 rows × 3 columns</p>\n",
344 |        "</div>"
345 |       ],
346 |       "text/plain": [
347 |        "          REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM  \\\n",
348 |        "index                                                    \n",
349 |        "ISG15                                             True   \n",
350 |        "MIB2                                             False   \n",
351 |        "PRKCZ                                            False   \n",
352 |        "KCNAB2                                           False   \n",
353 |        "CTNNBIP1                                         False   \n",
354 |        "...                                                ...   \n",
355 |        "CYP19A1                                          False   \n",
356 |        "RAP1GAP2                                         False   \n",
357 |        "SSTR2                                            False   \n",
358 |        "BIRC5                                             True   \n",
359 |        "PLCB4                                            False   \n",
360 |        "\n",
361 |        "          REACTOME_INTERFERON_ALPHA_BETA_SIGNALING  \\\n",
362 |        "index                                                \n",
363 |        "ISG15                                         True   \n",
364 |        "MIB2                                         False   \n",
365 |        "PRKCZ                                        False   \n",
366 |        "KCNAB2                                       False   \n",
367 |        "CTNNBIP1                                     False   \n",
368 |        "...                                            ...   \n",
369 |        "CYP19A1                                      False   \n",
370 |        "RAP1GAP2                                     False   \n",
371 |        "SSTR2                                        False   \n",
372 |        "BIRC5                                        False   \n",
373 |        "PLCB4                                        False   \n",
374 |        "\n",
375 |        "          REACTOME_INTERFERON_SIGNALING  \n",
376 |        "index                                    \n",
377 |        "ISG15                              True  \n",
378 |        "MIB2                              False  \n",
379 |        "PRKCZ                             False  \n",
380 |        "KCNAB2                            False  \n",
381 |        "CTNNBIP1                          False  \n",
382 |        "...                                 ...  \n",
383 |        "CYP19A1                           False  \n",
384 |        "RAP1GAP2                          False  \n",
385 |        "SSTR2                             False  \n",
386 |        "BIRC5                             False  \n",
387 |        "PLCB4                             False  \n",
388 |        "\n",
389 |        "[979 rows x 3 columns]"
390 |       ]
391 |      },
392 |      "execution_count": 10,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "drop_pathway_ann_matrix.iloc[:,drop_pathway_ann_matrix.loc['IFITM3',:].values == True]"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 11,
404 |    "metadata": {},
405 |    "outputs": [],
406 |    "source": [
407 |     "membership_mask = data.varm['annotations'].astype(bool).T\n",
408 |     "X_train, X_test = train_test_split(\n",
409 |     "    data.X,\n",
410 |     "    test_size=0.25,\n",
411 |     "    shuffle=True,\n",
412 |     "    random_state=0,\n",
413 |     "    \n",
414 |     ")"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "markdown",
419 |    "metadata": {},
420 |    "source": [
421 |     "# initialize model"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 12,
427 |    "metadata": {},
428 |    "outputs": [],
429 |    "source": [
430 |     "from models import pmVAEModel"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 13,
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": [
439 |     "kangVAE = pmVAEModel(\n",
440 |     "    membership_mask.values,\n",
441 |     "    [12],\n",
442 |     "    4,\n",
443 |     "    beta=1e-05,\n",
444 |     "    terms=membership_mask.index,\n",
445 |     "    add_auxiliary_module=True\n",
446 |     ")"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": 14,
452 |    "metadata": {},
453 |    "outputs": [
454 |     {
455 |      "data": {
456 |       "text/plain": [
457 |        "pmVAE(\n",
458 |        "  (encoder_net): pmEncoder(\n",
459 |        "    (encoder_dense_1): CustomizedLinear(input_features=979, output_features=2400, bias=True)\n",
460 |        "    (encoder_norm_1): BatchNorm1d(2400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
461 |        "    (encoder_elu_1): ELU(alpha=1.0, inplace=True)\n",
462 |        "    (encoder_dense_2): CustomizedLinear(input_features=2400, output_features=1600, bias=True)\n",
463 |        "    (encoder_norm_2): BatchNorm1d(1600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
464 |        "  )\n",
465 |        "  (decoder_net): pmDecoder(\n",
466 |        "    (decoder_dense_1): CustomizedLinear(input_features=800, output_features=2400, bias=True)\n",
467 |        "    (decoder_norm_1): BatchNorm1d(2400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
468 |        "    (decoder_elu_1): ELU(alpha=1.0, inplace=True)\n",
469 |        "  )\n",
470 |        "  (merge_layer): CustomizedLinear(input_features=2400, output_features=979, bias=False)\n",
471 |        ")"
472 |       ]
473 |      },
474 |      "execution_count": 14,
475 |      "metadata": {},
476 |      "output_type": "execute_result"
477 |     }
478 |    ],
479 |    "source": [
480 |     "kangVAE.model"
481 |    ]
482 |   },
483 |   {
484 |    "cell_type": "markdown",
485 |    "metadata": {},
486 |    "source": [
487 |     "# train model"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "code",
492 |    "execution_count": null,
493 |    "metadata": {},
494 |    "outputs": [],
495 |    "source": [
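496 |     "kangVAE.train(train_ds, test_ds, checkpoint_path='pmvae_dropG2M_checkpoint.pkl')  # NOTE(review): train_ds/test_ds are not created in the cells shown (only X_train/X_test are) - confirm they are built before running this cell"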
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "markdown",
501 |    "metadata": {},
502 |    "source": [
503 |     "# explain model"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": 15,
509 |    "metadata": {},
510 |    "outputs": [],
511 |    "source": [
512 |     "kangVAE.load_checkpoint('saved_models/pmvae_dropG2M_checkpoint.pkl.best_loss')"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 16,
518 |    "metadata": {},
519 |    "outputs": [],
520 |    "source": [
521 |     "kangVAE.set_gpu(False)"
522 |    ]
523 |   },
524 |   {
525 |    "cell_type": "code",
526 |    "execution_count": 17,
527 |    "metadata": {},
528 |    "outputs": [
529 |     {
530 |      "data": {
531 |       "text/plain": [
532 |        "800"
533 |       ]
534 |      },
535 |      "execution_count": 17,
536 |      "metadata": {},
537 |      "output_type": "execute_result"
538 |     }
539 |    ],
540 |    "source": [
541 |     "len(kangVAE.latent_space_names())"
542 |    ]
543 |   },
544 |   {
545 |    "cell_type": "code",
546 |    "execution_count": 18,
547 |    "metadata": {},
548 |    "outputs": [
549 |     {
550 |      "data": {
551 |       "text/plain": [
552 |        "796"
553 |       ]
554 |      },
555 |      "execution_count": 18,
556 |      "metadata": {},
557 |      "output_type": "execute_result"
558 |     }
559 |    ],
560 |    "source": [
561 |     "kangVAE.latent_space_names().index('AUXILIARY-0')"
562 |    ]
563 |   },
564 |   {
565 |    "cell_type": "code",
566 |    "execution_count": 19,
567 |    "metadata": {},
568 |    "outputs": [
569 |     {
570 |      "data": {
571 |       "text/plain": [
572 |        "'AUXILIARY-0'"
573 |       ]
574 |      },
575 |      "execution_count": 19,
576 |      "metadata": {},
577 |      "output_type": "execute_result"
578 |     }
579 |    ],
580 |    "source": [
581 |     "kangVAE.latent_space_names()[-4]"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": 20,
587 |    "metadata": {},
588 |    "outputs": [
589 |     {
590 |      "data": {
591 |       "text/plain": [
592 |        "'AUXILIARY-1'"
593 |       ]
594 |      },
595 |      "execution_count": 20,
596 |      "metadata": {},
597 |      "output_type": "execute_result"
598 |     }
599 |    ],
600 |    "source": [
601 |     "kangVAE.latent_space_names()[-3]"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": 21,
607 |    "metadata": {},
608 |    "outputs": [
609 |     {
610 |      "data": {
611 |       "text/plain": [
612 |        "'AUXILIARY-2'"
613 |       ]
614 |      },
615 |      "execution_count": 21,
616 |      "metadata": {},
617 |      "output_type": "execute_result"
618 |     }
619 |    ],
620 |    "source": [
621 |     "kangVAE.latent_space_names()[-2]"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": 22,
627 |    "metadata": {},
628 |    "outputs": [
629 |     {
630 |      "data": {
631 |       "text/plain": [
632 |        "'AUXILIARY-3'"
633 |       ]
634 |      },
635 |      "execution_count": 22,
636 |      "metadata": {},
637 |      "output_type": "execute_result"
638 |     }
639 |    ],
640 |    "source": [
641 |     "kangVAE.latent_space_names()[-1]"
642 |    ]
643 |   },
644 |   {
645 |    "cell_type": "code",
646 |    "execution_count": 23,
647 |    "metadata": {},
648 |    "outputs": [],
649 |    "source": [
650 |     "def model_latent_wrapper(x):\n",
651 |     "    outs = kangVAE.model(x)\n",
652 |     "    z = outs.mu\n",
653 |     "    return z[:,-4].reshape(-1,1) # latent dimension to explain: index -4 is 'AUXILIARY-0'"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": 24,
659 |    "metadata": {},
660 |    "outputs": [],
661 |    "source": [
662 |     "from pathexplainer import PathExplainerTorch"
663 |    ]
664 |   },
665 |   {
666 |    "cell_type": "code",
667 |    "execution_count": 25,
668 |    "metadata": {},
669 |    "outputs": [],
670 |    "source": [
671 |     "input_data = torch.tensor(data.X)\n",
672 |     "input_data.requires_grad = True\n",
673 |     "baseline_data = torch.zeros(data.X.shape[1])\n",
674 |     "baseline_data.requires_grad = True"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 26,
680 |    "metadata": {},
681 |    "outputs": [],
682 |    "source": [
683 |     "explainer = PathExplainerTorch(model_latent_wrapper)\n",
684 |     "attributions = explainer.attributions(input_data,\n",
685 |     "                                      baseline=baseline_data,\n",
686 |     "                                      num_samples=200,\n",
687 |     "                                      use_expectation=False)"
688 |    ]
689 |   },
690 |   {
691 |    "cell_type": "code",
692 |    "execution_count": 27,
693 |    "metadata": {},
694 |    "outputs": [],
695 |    "source": [
696 |     "np_attribs = attributions.detach().numpy()"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": 28,
702 |    "metadata": {},
703 |    "outputs": [],
704 |    "source": [
705 |     "top = pd.DataFrame(index=membership_mask.columns)\n",
706 |     "top['means'] = np.abs(np_attribs).mean(0)\n",
707 |     "top['stds'] = np.abs(np_attribs).std(0)\n"
708 |    ]
709 |   },
710 |   {
711 |    "cell_type": "code",
712 |    "execution_count": 29,
713 |    "metadata": {},
714 |    "outputs": [
715 |     {
716 |      "data": {
717 |       "text/html": [
718 |        "<div>\n",
719 |        "<style scoped>\n",
720 |        "    .dataframe tbody tr th:only-of-type {\n",
721 |        "        vertical-align: middle;\n",
722 |        "    }\n",
723 |        "\n",
724 |        "    .dataframe tbody tr th {\n",
725 |        "        vertical-align: top;\n",
726 |        "    }\n",
727 |        "\n",
728 |        "    .dataframe thead th {\n",
729 |        "        text-align: right;\n",
730 |        "    }\n",
731 |        "</style>\n",
732 |        "<table border=\"1\" class=\"dataframe\">\n",
733 |        "  <thead>\n",
734 |        "    <tr style=\"text-align: right;\">\n",
735 |        "      <th></th>\n",
736 |        "      <th>means</th>\n",
737 |        "      <th>stds</th>\n",
738 |        "    </tr>\n",
739 |        "    <tr>\n",
740 |        "      <th>index</th>\n",
741 |        "      <th></th>\n",
742 |        "      <th></th>\n",
743 |        "    </tr>\n",
744 |        "  </thead>\n",
745 |        "  <tbody>\n",
746 |        "    <tr>\n",
747 |        "      <th>H2AFZ</th>\n",
748 |        "      <td>1.558621</td>\n",
749 |        "      <td>0.690636</td>\n",
750 |        "    </tr>\n",
751 |        "    <tr>\n",
752 |        "      <th>IL8</th>\n",
753 |        "      <td>0.588597</td>\n",
754 |        "      <td>0.379918</td>\n",
755 |        "    </tr>\n",
756 |        "    <tr>\n",
757 |        "      <th>PLA2G7</th>\n",
758 |        "      <td>0.433617</td>\n",
759 |        "      <td>0.340465</td>\n",
760 |        "    </tr>\n",
761 |        "    <tr>\n",
762 |        "      <th>SSB</th>\n",
763 |        "      <td>0.398044</td>\n",
764 |        "      <td>0.208317</td>\n",
765 |        "    </tr>\n",
766 |        "    <tr>\n",
767 |        "      <th>HIST1H2AC</th>\n",
768 |        "      <td>0.234484</td>\n",
769 |        "      <td>0.173549</td>\n",
770 |        "    </tr>\n",
771 |        "    <tr>\n",
772 |        "      <th>...</th>\n",
773 |        "      <td>...</td>\n",
774 |        "      <td>...</td>\n",
775 |        "    </tr>\n",
776 |        "    <tr>\n",
777 |        "      <th>IFNB1</th>\n",
778 |        "      <td>0.000011</td>\n",
779 |        "      <td>0.000189</td>\n",
780 |        "    </tr>\n",
781 |        "    <tr>\n",
782 |        "      <th>PELI3</th>\n",
783 |        "      <td>0.000010</td>\n",
784 |        "      <td>0.000337</td>\n",
785 |        "    </tr>\n",
786 |        "    <tr>\n",
787 |        "      <th>AURKB</th>\n",
788 |        "      <td>0.000010</td>\n",
789 |        "      <td>0.000136</td>\n",
790 |        "    </tr>\n",
791 |        "    <tr>\n",
792 |        "      <th>SRGAP3</th>\n",
793 |        "      <td>0.000010</td>\n",
794 |        "      <td>0.000202</td>\n",
795 |        "    </tr>\n",
796 |        "    <tr>\n",
797 |        "      <th>ATP6V0A4</th>\n",
798 |        "      <td>0.000005</td>\n",
799 |        "      <td>0.000120</td>\n",
800 |        "    </tr>\n",
801 |        "  </tbody>\n",
802 |        "</table>\n",
803 |        "<p>979 rows × 2 columns</p>\n",
804 |        "</div>"
805 |       ],
806 |       "text/plain": [
807 |        "              means      stds\n",
808 |        "index                        \n",
809 |        "H2AFZ      1.558621  0.690636\n",
810 |        "IL8        0.588597  0.379918\n",
811 |        "PLA2G7     0.433617  0.340465\n",
812 |        "SSB        0.398044  0.208317\n",
813 |        "HIST1H2AC  0.234484  0.173549\n",
814 |        "...             ...       ...\n",
815 |        "IFNB1      0.000011  0.000189\n",
816 |        "PELI3      0.000010  0.000337\n",
817 |        "AURKB      0.000010  0.000136\n",
818 |        "SRGAP3     0.000010  0.000202\n",
819 |        "ATP6V0A4   0.000005  0.000120\n",
820 |        "\n",
821 |        "[979 rows x 2 columns]"
822 |       ]
823 |      },
824 |      "execution_count": 29,
825 |      "metadata": {},
826 |      "output_type": "execute_result"
827 |     }
828 |    ],
829 |    "source": [
830 |     "top.sort_values('means',ascending=False)"
831 |    ]
832 |   },
833 |   {
834 |    "cell_type": "code",
835 |    "execution_count": 30,
836 |    "metadata": {},
837 |    "outputs": [],
838 |    "source": [
839 |     "top.to_csv('kang_remove_g/aux_0.csv')"
840 |    ]
841 |   }
842 |  ],
843 |  "metadata": {
844 |   "kernelspec": {
845 |    "display_name": "newenv",
846 |    "language": "python",
847 |    "name": "newenv"
848 |   },
849 |   "language_info": {
850 |    "codemirror_mode": {
851 |     "name": "ipython",
852 |     "version": 3
853 |    },
854 |    "file_extension": ".py",
855 |    "mimetype": "text/x-python",
856 |    "name": "python",
857 |    "nbconvert_exporter": "python",
858 |    "pygments_lexer": "ipython3",
859 |    "version": "3.9.7"
860 |   }
861 |  },
862 |  "nbformat": 4,
863 |  "nbformat_minor": 4
864 | }
865 | 


--------------------------------------------------------------------------------
/figures/supplementary_figures/.ipynb_checkpoints/drop_g-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import anndata\n",
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd\n",
 12 |     "\n",
 13 |     "import torch\n",
 14 |     "\n",
 15 |     "import os\n",
 16 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 2,
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "from utils import load_annotations"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "from sklearn.model_selection import train_test_split"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# load data"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 4,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "data = anndata.read('data/kang_count.h5ad')"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 5,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "pathway_ann_matrix = load_annotations(\n",
 60 |     "    'data/c2.cp.reactome.v7.4.symbols.gmt',\n",
 61 |     "    data.var_names,\n",
 62 |     "    min_genes=13\n",
 63 |     ")"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 6,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "['REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION']"
 75 |       ]
 76 |      },
 77 |      "execution_count": 6,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "[x for x in pathway_ann_matrix.columns if 'G2_M_TRANSITION' in x or 'PLK1' in x]"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 7,
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "data": {
 93 |       "text/html": [
 94 |        "<div>\n",
 95 |        "<style scoped>\n",
 96 |        "    .dataframe tbody tr th:only-of-type {\n",
 97 |        "        vertical-align: middle;\n",
 98 |        "    }\n",
 99 |        "\n",
100 |        "    .dataframe tbody tr th {\n",
101 |        "        vertical-align: top;\n",
102 |        "    }\n",
103 |        "\n",
104 |        "    .dataframe thead th {\n",
105 |        "        text-align: right;\n",
106 |        "    }\n",
107 |        "</style>\n",
108 |        "<table border=\"1\" class=\"dataframe\">\n",
109 |        "  <thead>\n",
110 |        "    <tr style=\"text-align: right;\">\n",
111 |        "      <th></th>\n",
112 |        "      <th>REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION</th>\n",
113 |        "    </tr>\n",
114 |        "    <tr>\n",
115 |        "      <th>index</th>\n",
116 |        "      <th></th>\n",
117 |        "    </tr>\n",
118 |        "  </thead>\n",
119 |        "  <tbody>\n",
120 |        "    <tr>\n",
121 |        "      <th>PPP1CB</th>\n",
122 |        "      <td>True</td>\n",
123 |        "    </tr>\n",
124 |        "    <tr>\n",
125 |        "      <th>CLASP1</th>\n",
126 |        "      <td>True</td>\n",
127 |        "    </tr>\n",
128 |        "    <tr>\n",
129 |        "      <th>TUBA4A</th>\n",
130 |        "      <td>True</td>\n",
131 |        "    </tr>\n",
132 |        "    <tr>\n",
133 |        "      <th>CCNB1</th>\n",
134 |        "      <td>True</td>\n",
135 |        "    </tr>\n",
136 |        "    <tr>\n",
137 |        "      <th>TUBB</th>\n",
138 |        "      <td>True</td>\n",
139 |        "    </tr>\n",
140 |        "    <tr>\n",
141 |        "      <th>CUL1</th>\n",
142 |        "      <td>True</td>\n",
143 |        "    </tr>\n",
144 |        "    <tr>\n",
145 |        "      <th>CDK5RAP2</th>\n",
146 |        "      <td>True</td>\n",
147 |        "    </tr>\n",
148 |        "    <tr>\n",
149 |        "      <th>TUBB4B</th>\n",
150 |        "      <td>True</td>\n",
151 |        "    </tr>\n",
152 |        "    <tr>\n",
153 |        "      <th>CDK1</th>\n",
154 |        "      <td>True</td>\n",
155 |        "    </tr>\n",
156 |        "    <tr>\n",
157 |        "      <th>ACTR1A</th>\n",
158 |        "      <td>True</td>\n",
159 |        "    </tr>\n",
160 |        "    <tr>\n",
161 |        "      <th>TUBA1A</th>\n",
162 |        "      <td>True</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>DCTN2</th>\n",
166 |        "      <td>True</td>\n",
167 |        "    </tr>\n",
168 |        "    <tr>\n",
169 |        "      <th>CENPJ</th>\n",
170 |        "      <td>True</td>\n",
171 |        "    </tr>\n",
172 |        "    <tr>\n",
173 |        "      <th>HSP90AA1</th>\n",
174 |        "      <td>True</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>CCNB2</th>\n",
178 |        "      <td>True</td>\n",
179 |        "    </tr>\n",
180 |        "    <tr>\n",
181 |        "      <th>AURKA</th>\n",
182 |        "      <td>True</td>\n",
183 |        "    </tr>\n",
184 |        "  </tbody>\n",
185 |        "</table>\n",
186 |        "</div>"
187 |       ],
188 |       "text/plain": [
189 |        "          REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION\n",
190 |        "index                                                            \n",
191 |        "PPP1CB                                                 True      \n",
192 |        "CLASP1                                                 True      \n",
193 |        "TUBA4A                                                 True      \n",
194 |        "CCNB1                                                  True      \n",
195 |        "TUBB                                                   True      \n",
196 |        "CUL1                                                   True      \n",
197 |        "CDK5RAP2                                               True      \n",
198 |        "TUBB4B                                                 True      \n",
199 |        "CDK1                                                   True      \n",
200 |        "ACTR1A                                                 True      \n",
201 |        "TUBA1A                                                 True      \n",
202 |        "DCTN2                                                  True      \n",
203 |        "CENPJ                                                  True      \n",
204 |        "HSP90AA1                                               True      \n",
205 |        "CCNB2                                                  True      \n",
206 |        "AURKA                                                  True      "
207 |       ]
208 |      },
209 |      "execution_count": 7,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "pathway_ann_matrix[pathway_ann_matrix['REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION']][['REACTOME_REGULATION_OF_PLK1_ACTIVITY_AT_G2_M_TRANSITION']]"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 8,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "true_pathways_list = [x for x in pathway_ann_matrix.columns if 'G2_M_TRANSITION' in x or 'PLK1' in x]\n",
225 |     "drop_pathway_ann_matrix = pathway_ann_matrix.loc[:,~pathway_ann_matrix.columns.isin(true_pathways_list)]"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 9,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "data.varm['annotations'] = drop_pathway_ann_matrix"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 10,
240 |    "metadata": {},
241 |    "outputs": [
242 |     {
243 |      "data": {
244 |       "text/html": [
245 |        "<div>\n",
246 |        "<style scoped>\n",
247 |        "    .dataframe tbody tr th:only-of-type {\n",
248 |        "        vertical-align: middle;\n",
249 |        "    }\n",
250 |        "\n",
251 |        "    .dataframe tbody tr th {\n",
252 |        "        vertical-align: top;\n",
253 |        "    }\n",
254 |        "\n",
255 |        "    .dataframe thead th {\n",
256 |        "        text-align: right;\n",
257 |        "    }\n",
258 |        "</style>\n",
259 |        "<table border=\"1\" class=\"dataframe\">\n",
260 |        "  <thead>\n",
261 |        "    <tr style=\"text-align: right;\">\n",
262 |        "      <th></th>\n",
263 |        "      <th>REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM</th>\n",
264 |        "      <th>REACTOME_INTERFERON_ALPHA_BETA_SIGNALING</th>\n",
265 |        "      <th>REACTOME_INTERFERON_SIGNALING</th>\n",
266 |        "    </tr>\n",
267 |        "    <tr>\n",
268 |        "      <th>index</th>\n",
269 |        "      <th></th>\n",
270 |        "      <th></th>\n",
271 |        "      <th></th>\n",
272 |        "    </tr>\n",
273 |        "  </thead>\n",
274 |        "  <tbody>\n",
275 |        "    <tr>\n",
276 |        "      <th>ISG15</th>\n",
277 |        "      <td>True</td>\n",
278 |        "      <td>True</td>\n",
279 |        "      <td>True</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>MIB2</th>\n",
283 |        "      <td>False</td>\n",
284 |        "      <td>False</td>\n",
285 |        "      <td>False</td>\n",
286 |        "    </tr>\n",
287 |        "    <tr>\n",
288 |        "      <th>PRKCZ</th>\n",
289 |        "      <td>False</td>\n",
290 |        "      <td>False</td>\n",
291 |        "      <td>False</td>\n",
292 |        "    </tr>\n",
293 |        "    <tr>\n",
294 |        "      <th>KCNAB2</th>\n",
295 |        "      <td>False</td>\n",
296 |        "      <td>False</td>\n",
297 |        "      <td>False</td>\n",
298 |        "    </tr>\n",
299 |        "    <tr>\n",
300 |        "      <th>CTNNBIP1</th>\n",
301 |        "      <td>False</td>\n",
302 |        "      <td>False</td>\n",
303 |        "      <td>False</td>\n",
304 |        "    </tr>\n",
305 |        "    <tr>\n",
306 |        "      <th>...</th>\n",
307 |        "      <td>...</td>\n",
308 |        "      <td>...</td>\n",
309 |        "      <td>...</td>\n",
310 |        "    </tr>\n",
311 |        "    <tr>\n",
312 |        "      <th>CYP19A1</th>\n",
313 |        "      <td>False</td>\n",
314 |        "      <td>False</td>\n",
315 |        "      <td>False</td>\n",
316 |        "    </tr>\n",
317 |        "    <tr>\n",
318 |        "      <th>RAP1GAP2</th>\n",
319 |        "      <td>False</td>\n",
320 |        "      <td>False</td>\n",
321 |        "      <td>False</td>\n",
322 |        "    </tr>\n",
323 |        "    <tr>\n",
324 |        "      <th>SSTR2</th>\n",
325 |        "      <td>False</td>\n",
326 |        "      <td>False</td>\n",
327 |        "      <td>False</td>\n",
328 |        "    </tr>\n",
329 |        "    <tr>\n",
330 |        "      <th>BIRC5</th>\n",
331 |        "      <td>True</td>\n",
332 |        "      <td>False</td>\n",
333 |        "      <td>False</td>\n",
334 |        "    </tr>\n",
335 |        "    <tr>\n",
336 |        "      <th>PLCB4</th>\n",
337 |        "      <td>False</td>\n",
338 |        "      <td>False</td>\n",
339 |        "      <td>False</td>\n",
340 |        "    </tr>\n",
341 |        "  </tbody>\n",
342 |        "</table>\n",
343 |        "<p>979 rows × 3 columns</p>\n",
344 |        "</div>"
345 |       ],
346 |       "text/plain": [
347 |        "          REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM  \\\n",
348 |        "index                                                    \n",
349 |        "ISG15                                             True   \n",
350 |        "MIB2                                             False   \n",
351 |        "PRKCZ                                            False   \n",
352 |        "KCNAB2                                           False   \n",
353 |        "CTNNBIP1                                         False   \n",
354 |        "...                                                ...   \n",
355 |        "CYP19A1                                          False   \n",
356 |        "RAP1GAP2                                         False   \n",
357 |        "SSTR2                                            False   \n",
358 |        "BIRC5                                             True   \n",
359 |        "PLCB4                                            False   \n",
360 |        "\n",
361 |        "          REACTOME_INTERFERON_ALPHA_BETA_SIGNALING  \\\n",
362 |        "index                                                \n",
363 |        "ISG15                                         True   \n",
364 |        "MIB2                                         False   \n",
365 |        "PRKCZ                                        False   \n",
366 |        "KCNAB2                                       False   \n",
367 |        "CTNNBIP1                                     False   \n",
368 |        "...                                            ...   \n",
369 |        "CYP19A1                                      False   \n",
370 |        "RAP1GAP2                                     False   \n",
371 |        "SSTR2                                        False   \n",
372 |        "BIRC5                                        False   \n",
373 |        "PLCB4                                        False   \n",
374 |        "\n",
375 |        "          REACTOME_INTERFERON_SIGNALING  \n",
376 |        "index                                    \n",
377 |        "ISG15                              True  \n",
378 |        "MIB2                              False  \n",
379 |        "PRKCZ                             False  \n",
380 |        "KCNAB2                            False  \n",
381 |        "CTNNBIP1                          False  \n",
382 |        "...                                 ...  \n",
383 |        "CYP19A1                           False  \n",
384 |        "RAP1GAP2                          False  \n",
385 |        "SSTR2                             False  \n",
386 |        "BIRC5                             False  \n",
387 |        "PLCB4                             False  \n",
388 |        "\n",
389 |        "[979 rows x 3 columns]"
390 |       ]
391 |      },
392 |      "execution_count": 10,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "drop_pathway_ann_matrix.iloc[:,drop_pathway_ann_matrix.loc['IFITM3',:].values == True]"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 11,
404 |    "metadata": {},
405 |    "outputs": [],
406 |    "source": [
407 |     "membership_mask = data.varm['annotations'].astype(bool).T\n",
408 |     "X_train, X_test = train_test_split(\n",
409 |     "    data.X,\n",
410 |     "    test_size=0.25,\n",
411 |     "    shuffle=True,\n",
412 |     "    random_state=0,\n",
413 |     "    \n",
414 |     ")"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "markdown",
419 |    "metadata": {},
420 |    "source": [
421 |     "# initialize model"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 12,
427 |    "metadata": {},
428 |    "outputs": [],
429 |    "source": [
430 |     "from models import pmVAEModel"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 13,
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": [
439 |     "kangVAE = pmVAEModel(\n",
440 |     "    membership_mask.values,\n",
441 |     "    [12],\n",
442 |     "    4,\n",
443 |     "    beta=1e-05,\n",
444 |     "    terms=membership_mask.index,\n",
445 |     "    add_auxiliary_module=True\n",
446 |     ")"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": 14,
452 |    "metadata": {},
453 |    "outputs": [
454 |     {
455 |      "data": {
456 |       "text/plain": [
457 |        "pmVAE(\n",
458 |        "  (encoder_net): pmEncoder(\n",
459 |        "    (encoder_dense_1): CustomizedLinear(input_features=979, output_features=2400, bias=True)\n",
460 |        "    (encoder_norm_1): BatchNorm1d(2400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
461 |        "    (encoder_elu_1): ELU(alpha=1.0, inplace=True)\n",
462 |        "    (encoder_dense_2): CustomizedLinear(input_features=2400, output_features=1600, bias=True)\n",
463 |        "    (encoder_norm_2): BatchNorm1d(1600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
464 |        "  )\n",
465 |        "  (decoder_net): pmDecoder(\n",
466 |        "    (decoder_dense_1): CustomizedLinear(input_features=800, output_features=2400, bias=True)\n",
467 |        "    (decoder_norm_1): BatchNorm1d(2400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
468 |        "    (decoder_elu_1): ELU(alpha=1.0, inplace=True)\n",
469 |        "  )\n",
470 |        "  (merge_layer): CustomizedLinear(input_features=2400, output_features=979, bias=False)\n",
471 |        ")"
472 |       ]
473 |      },
474 |      "execution_count": 14,
475 |      "metadata": {},
476 |      "output_type": "execute_result"
477 |     }
478 |    ],
479 |    "source": [
480 |     "kangVAE.model"
481 |    ]
482 |   },
483 |   {
484 |    "cell_type": "markdown",
485 |    "metadata": {},
486 |    "source": [
487 |     "# train model"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "code",
492 |    "execution_count": null,
493 |    "metadata": {},
494 |    "outputs": [],
495 |    "source": [
496 |     "kangVAE.train(train_ds, test_ds, checkpoint_path='pmvae_dropG2M_checkpoint.pkl')"
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "markdown",
501 |    "metadata": {},
502 |    "source": [
503 |     "# explain model"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": 15,
509 |    "metadata": {},
510 |    "outputs": [],
511 |    "source": [
512 |     "kangVAE.load_checkpoint('saved_models/pmvae_dropG2M_checkpoint.pkl.best_loss')"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 16,
518 |    "metadata": {},
519 |    "outputs": [],
520 |    "source": [
521 |     "kangVAE.set_gpu(False)"
522 |    ]
523 |   },
524 |   {
525 |    "cell_type": "code",
526 |    "execution_count": 17,
527 |    "metadata": {},
528 |    "outputs": [
529 |     {
530 |      "data": {
531 |       "text/plain": [
532 |        "800"
533 |       ]
534 |      },
535 |      "execution_count": 17,
536 |      "metadata": {},
537 |      "output_type": "execute_result"
538 |     }
539 |    ],
540 |    "source": [
541 |     "len(kangVAE.latent_space_names())"
542 |    ]
543 |   },
544 |   {
545 |    "cell_type": "code",
546 |    "execution_count": 18,
547 |    "metadata": {},
548 |    "outputs": [
549 |     {
550 |      "data": {
551 |       "text/plain": [
552 |        "796"
553 |       ]
554 |      },
555 |      "execution_count": 18,
556 |      "metadata": {},
557 |      "output_type": "execute_result"
558 |     }
559 |    ],
560 |    "source": [
561 |     "kangVAE.latent_space_names().index('AUXILIARY-0')"
562 |    ]
563 |   },
564 |   {
565 |    "cell_type": "code",
566 |    "execution_count": 19,
567 |    "metadata": {},
568 |    "outputs": [
569 |     {
570 |      "data": {
571 |       "text/plain": [
572 |        "'AUXILIARY-0'"
573 |       ]
574 |      },
575 |      "execution_count": 19,
576 |      "metadata": {},
577 |      "output_type": "execute_result"
578 |     }
579 |    ],
580 |    "source": [
581 |     "kangVAE.latent_space_names()[-4]"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": 20,
587 |    "metadata": {},
588 |    "outputs": [
589 |     {
590 |      "data": {
591 |       "text/plain": [
592 |        "'AUXILIARY-1'"
593 |       ]
594 |      },
595 |      "execution_count": 20,
596 |      "metadata": {},
597 |      "output_type": "execute_result"
598 |     }
599 |    ],
600 |    "source": [
601 |     "kangVAE.latent_space_names()[-3]"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": 21,
607 |    "metadata": {},
608 |    "outputs": [
609 |     {
610 |      "data": {
611 |       "text/plain": [
612 |        "'AUXILIARY-2'"
613 |       ]
614 |      },
615 |      "execution_count": 21,
616 |      "metadata": {},
617 |      "output_type": "execute_result"
618 |     }
619 |    ],
620 |    "source": [
621 |     "kangVAE.latent_space_names()[-2]"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": 22,
627 |    "metadata": {},
628 |    "outputs": [
629 |     {
630 |      "data": {
631 |       "text/plain": [
632 |        "'AUXILIARY-3'"
633 |       ]
634 |      },
635 |      "execution_count": 22,
636 |      "metadata": {},
637 |      "output_type": "execute_result"
638 |     }
639 |    ],
640 |    "source": [
641 |     "kangVAE.latent_space_names()[-1]"
642 |    ]
643 |   },
644 |   {
645 |    "cell_type": "code",
646 |    "execution_count": 23,
647 |    "metadata": {},
648 |    "outputs": [],
649 |    "source": [
650 |     "def model_latent_wrapper(x):\n",
651 |     "    outs = kangVAE.model(x)\n",
652 |     "    z = outs.mu\n",
653 |     "    return z[:,-4].reshape(-1,1) # latent dimension to explain: index -4 is 'AUXILIARY-0'"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": 24,
659 |    "metadata": {},
660 |    "outputs": [],
661 |    "source": [
662 |     "from pathexplainer import PathExplainerTorch"
663 |    ]
664 |   },
665 |   {
666 |    "cell_type": "code",
667 |    "execution_count": 25,
668 |    "metadata": {},
669 |    "outputs": [],
670 |    "source": [
671 |     "input_data = torch.tensor(data.X)\n",
672 |     "input_data.requires_grad = True\n",
673 |     "baseline_data = torch.zeros(data.X.shape[1])\n",
674 |     "baseline_data.requires_grad = True"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 26,
680 |    "metadata": {},
681 |    "outputs": [],
682 |    "source": [
683 |     "explainer = PathExplainerTorch(model_latent_wrapper)\n",
684 |     "attributions = explainer.attributions(input_data,\n",
685 |     "                                      baseline=baseline_data,\n",
686 |     "                                      num_samples=200,\n",
687 |     "                                      use_expectation=False)"
688 |    ]
689 |   },
690 |   {
691 |    "cell_type": "code",
692 |    "execution_count": 27,
693 |    "metadata": {},
694 |    "outputs": [],
695 |    "source": [
696 |     "np_attribs = attributions.detach().numpy()"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": 28,
702 |    "metadata": {},
703 |    "outputs": [],
704 |    "source": [
705 |     "top = pd.DataFrame(index=membership_mask.columns)\n",
706 |     "top['means'] = np.abs(np_attribs).mean(0)\n",
707 |     "top['stds'] = np.abs(np_attribs).std(0)\n"
708 |    ]
709 |   },
710 |   {
711 |    "cell_type": "code",
712 |    "execution_count": 29,
713 |    "metadata": {},
714 |    "outputs": [
715 |     {
716 |      "data": {
717 |       "text/html": [
718 |        "<div>\n",
719 |        "<style scoped>\n",
720 |        "    .dataframe tbody tr th:only-of-type {\n",
721 |        "        vertical-align: middle;\n",
722 |        "    }\n",
723 |        "\n",
724 |        "    .dataframe tbody tr th {\n",
725 |        "        vertical-align: top;\n",
726 |        "    }\n",
727 |        "\n",
728 |        "    .dataframe thead th {\n",
729 |        "        text-align: right;\n",
730 |        "    }\n",
731 |        "</style>\n",
732 |        "<table border=\"1\" class=\"dataframe\">\n",
733 |        "  <thead>\n",
734 |        "    <tr style=\"text-align: right;\">\n",
735 |        "      <th></th>\n",
736 |        "      <th>means</th>\n",
737 |        "      <th>stds</th>\n",
738 |        "    </tr>\n",
739 |        "    <tr>\n",
740 |        "      <th>index</th>\n",
741 |        "      <th></th>\n",
742 |        "      <th></th>\n",
743 |        "    </tr>\n",
744 |        "  </thead>\n",
745 |        "  <tbody>\n",
746 |        "    <tr>\n",
747 |        "      <th>H2AFZ</th>\n",
748 |        "      <td>1.558621</td>\n",
749 |        "      <td>0.690636</td>\n",
750 |        "    </tr>\n",
751 |        "    <tr>\n",
752 |        "      <th>IL8</th>\n",
753 |        "      <td>0.588597</td>\n",
754 |        "      <td>0.379918</td>\n",
755 |        "    </tr>\n",
756 |        "    <tr>\n",
757 |        "      <th>PLA2G7</th>\n",
758 |        "      <td>0.433617</td>\n",
759 |        "      <td>0.340465</td>\n",
760 |        "    </tr>\n",
761 |        "    <tr>\n",
762 |        "      <th>SSB</th>\n",
763 |        "      <td>0.398044</td>\n",
764 |        "      <td>0.208317</td>\n",
765 |        "    </tr>\n",
766 |        "    <tr>\n",
767 |        "      <th>HIST1H2AC</th>\n",
768 |        "      <td>0.234484</td>\n",
769 |        "      <td>0.173549</td>\n",
770 |        "    </tr>\n",
771 |        "    <tr>\n",
772 |        "      <th>...</th>\n",
773 |        "      <td>...</td>\n",
774 |        "      <td>...</td>\n",
775 |        "    </tr>\n",
776 |        "    <tr>\n",
777 |        "      <th>IFNB1</th>\n",
778 |        "      <td>0.000011</td>\n",
779 |        "      <td>0.000189</td>\n",
780 |        "    </tr>\n",
781 |        "    <tr>\n",
782 |        "      <th>PELI3</th>\n",
783 |        "      <td>0.000010</td>\n",
784 |        "      <td>0.000337</td>\n",
785 |        "    </tr>\n",
786 |        "    <tr>\n",
787 |        "      <th>AURKB</th>\n",
788 |        "      <td>0.000010</td>\n",
789 |        "      <td>0.000136</td>\n",
790 |        "    </tr>\n",
791 |        "    <tr>\n",
792 |        "      <th>SRGAP3</th>\n",
793 |        "      <td>0.000010</td>\n",
794 |        "      <td>0.000202</td>\n",
795 |        "    </tr>\n",
796 |        "    <tr>\n",
797 |        "      <th>ATP6V0A4</th>\n",
798 |        "      <td>0.000005</td>\n",
799 |        "      <td>0.000120</td>\n",
800 |        "    </tr>\n",
801 |        "  </tbody>\n",
802 |        "</table>\n",
803 |        "<p>979 rows × 2 columns</p>\n",
804 |        "</div>"
805 |       ],
806 |       "text/plain": [
807 |        "              means      stds\n",
808 |        "index                        \n",
809 |        "H2AFZ      1.558621  0.690636\n",
810 |        "IL8        0.588597  0.379918\n",
811 |        "PLA2G7     0.433617  0.340465\n",
812 |        "SSB        0.398044  0.208317\n",
813 |        "HIST1H2AC  0.234484  0.173549\n",
814 |        "...             ...       ...\n",
815 |        "IFNB1      0.000011  0.000189\n",
816 |        "PELI3      0.000010  0.000337\n",
817 |        "AURKB      0.000010  0.000136\n",
818 |        "SRGAP3     0.000010  0.000202\n",
819 |        "ATP6V0A4   0.000005  0.000120\n",
820 |        "\n",
821 |        "[979 rows x 2 columns]"
822 |       ]
823 |      },
824 |      "execution_count": 29,
825 |      "metadata": {},
826 |      "output_type": "execute_result"
827 |     }
828 |    ],
829 |    "source": [
830 |     "top.sort_values('means',ascending=False)"
831 |    ]
832 |   },
833 |   {
834 |    "cell_type": "code",
835 |    "execution_count": 30,
836 |    "metadata": {},
837 |    "outputs": [],
838 |    "source": [
839 |     "top.to_csv('kang_remove_g/aux_0.csv')"
840 |    ]
841 |   }
842 |  ],
843 |  "metadata": {
844 |   "kernelspec": {
845 |    "display_name": "newenv",
846 |    "language": "python",
847 |    "name": "newenv"
848 |   },
849 |   "language_info": {
850 |    "codemirror_mode": {
851 |     "name": "ipython",
852 |     "version": 3
853 |    },
854 |    "file_extension": ".py",
855 |    "mimetype": "text/x-python",
856 |    "name": "python",
857 |    "nbconvert_exporter": "python",
858 |    "pygments_lexer": "ipython3",
859 |    "version": "3.9.7"
860 |   }
861 |  },
862 |  "nbformat": 4,
863 |  "nbformat_minor": 4
864 | }
865 | 


--------------------------------------------------------------------------------
/figures/supplementary_figures/g_enrichments.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 61,
  6 |    "id": "23720687-2ad7-4d94-8df1-39b834c5e456",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd \n",
 12 |     "import anndata \n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "import shap as shap \n",
 15 |     "import seaborn as sns\n",
 16 |     "import math as math"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 62,
 22 |    "id": "654a8332-3a7a-4e98-9e86-9a9f44072a58",
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
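 26 |     "# drop G: per-gene attribution summaries (means, stds) from the model trained with the G2/M-transition/PLK1 pathways dropped; aux_1..aux_3 assumed analogous to aux_0\n",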
 27 |     "aux_0 = pd.read_csv('kang_remove_g/aux_0.csv')\n",
 28 |     "aux_1 = pd.read_csv('kang_remove_g/aux_1.csv')\n",
 29 |     "aux_2 = pd.read_csv('kang_remove_g/aux_2.csv')\n",
 30 |     "aux_3 = pd.read_csv('kang_remove_g/aux_3.csv')"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 63,
 36 |    "id": "8a1707e6-79f5-434c-b6de-d41a31658f88",
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "data": {
 41 |       "text/html": [
 42 |        "<div>\n",
 43 |        "<style scoped>\n",
 44 |        "    .dataframe tbody tr th:only-of-type {\n",
 45 |        "        vertical-align: middle;\n",
 46 |        "    }\n",
 47 |        "\n",
 48 |        "    .dataframe tbody tr th {\n",
 49 |        "        vertical-align: top;\n",
 50 |        "    }\n",
 51 |        "\n",
 52 |        "    .dataframe thead th {\n",
 53 |        "        text-align: right;\n",
 54 |        "    }\n",
 55 |        "</style>\n",
 56 |        "<table border=\"1\" class=\"dataframe\">\n",
 57 |        "  <thead>\n",
 58 |        "    <tr style=\"text-align: right;\">\n",
 59 |        "      <th></th>\n",
 60 |        "      <th>index</th>\n",
 61 |        "      <th>means</th>\n",
 62 |        "      <th>stds</th>\n",
 63 |        "    </tr>\n",
 64 |        "  </thead>\n",
 65 |        "  <tbody>\n",
 66 |        "    <tr>\n",
 67 |        "      <th>233</th>\n",
 68 |        "      <td>IL8</td>\n",
 69 |        "      <td>1.472220</td>\n",
 70 |        "      <td>1.072127</td>\n",
 71 |        "    </tr>\n",
 72 |        "    <tr>\n",
 73 |        "      <th>250</th>\n",
 74 |        "      <td>H2AFZ</td>\n",
 75 |        "      <td>0.660040</td>\n",
 76 |        "      <td>0.297694</td>\n",
 77 |        "    </tr>\n",
 78 |        "    <tr>\n",
 79 |        "      <th>707</th>\n",
 80 |        "      <td>SQRDL</td>\n",
 81 |        "      <td>0.541798</td>\n",
 82 |        "      <td>0.295089</td>\n",
 83 |        "    </tr>\n",
 84 |        "    <tr>\n",
 85 |        "      <th>347</th>\n",
 86 |        "      <td>PLA2G7</td>\n",
 87 |        "      <td>0.439908</td>\n",
 88 |        "      <td>0.348958</td>\n",
 89 |        "    </tr>\n",
 90 |        "    <tr>\n",
 91 |        "      <th>320</th>\n",
 92 |        "      <td>HIST1H2AC</td>\n",
 93 |        "      <td>0.385302</td>\n",
 94 |        "      <td>0.294543</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>...</th>\n",
 98 |        "      <td>...</td>\n",
 99 |        "      <td>...</td>\n",
100 |        "      <td>...</td>\n",
101 |        "    </tr>\n",
102 |        "    <tr>\n",
103 |        "      <th>342</th>\n",
104 |        "      <td>TREM2</td>\n",
105 |        "      <td>0.000012</td>\n",
106 |        "      <td>0.000478</td>\n",
107 |        "    </tr>\n",
108 |        "    <tr>\n",
109 |        "      <th>960</th>\n",
110 |        "      <td>RRM2</td>\n",
111 |        "      <td>0.000011</td>\n",
112 |        "      <td>0.000294</td>\n",
113 |        "    </tr>\n",
114 |        "    <tr>\n",
115 |        "      <th>797</th>\n",
116 |        "      <td>PTRF</td>\n",
117 |        "      <td>0.000010</td>\n",
118 |        "      <td>0.000238</td>\n",
119 |        "    </tr>\n",
120 |        "    <tr>\n",
121 |        "      <th>12</th>\n",
122 |        "      <td>ALDH4A1</td>\n",
123 |        "      <td>0.000009</td>\n",
124 |        "      <td>0.000227</td>\n",
125 |        "    </tr>\n",
126 |        "    <tr>\n",
127 |        "      <th>969</th>\n",
128 |        "      <td>ABCC2</td>\n",
129 |        "      <td>0.000005</td>\n",
130 |        "      <td>0.000174</td>\n",
131 |        "    </tr>\n",
132 |        "  </tbody>\n",
133 |        "</table>\n",
134 |        "<p>979 rows × 3 columns</p>\n",
135 |        "</div>"
136 |       ],
137 |       "text/plain": [
138 |        "         index     means      stds\n",
139 |        "233        IL8  1.472220  1.072127\n",
140 |        "250      H2AFZ  0.660040  0.297694\n",
141 |        "707      SQRDL  0.541798  0.295089\n",
142 |        "347     PLA2G7  0.439908  0.348958\n",
143 |        "320  HIST1H2AC  0.385302  0.294543\n",
144 |        "..         ...       ...       ...\n",
145 |        "342      TREM2  0.000012  0.000478\n",
146 |        "960       RRM2  0.000011  0.000294\n",
147 |        "797       PTRF  0.000010  0.000238\n",
148 |        "12     ALDH4A1  0.000009  0.000227\n",
149 |        "969      ABCC2  0.000005  0.000174\n",
150 |        "\n",
151 |        "[979 rows x 3 columns]"
152 |       ]
153 |      },
154 |      "execution_count": 63,
155 |      "metadata": {},
156 |      "output_type": "execute_result"
157 |     }
158 |    ],
159 |    "source": [
160 |     "aux_0.sort_values('means',ascending=False)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 64,
166 |    "id": "b85cd0f4-6b0c-4a1b-82e7-24e0b207a830",
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "combined_means = pd.DataFrame(index = aux_0.index)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 65,
176 |    "id": "c9a0bd21-db5e-4109-9fdd-c6c585f99f52",
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "combined_means[0] = aux_0['means']\n",
181 |     "combined_means[1] = aux_1['means']\n",
182 |     "combined_means[2] = aux_2['means']\n",
183 |     "combined_means[3] = aux_3['means']"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 66,
189 |    "id": "a69cab5f-5575-4543-8c05-e3a7dabfe983",
190 |    "metadata": {},
191 |    "outputs": [
192 |     {
193 |      "data": {
194 |       "text/html": [
195 |        "<div>\n",
196 |        "<style scoped>\n",
197 |        "    .dataframe tbody tr th:only-of-type {\n",
198 |        "        vertical-align: middle;\n",
199 |        "    }\n",
200 |        "\n",
201 |        "    .dataframe tbody tr th {\n",
202 |        "        vertical-align: top;\n",
203 |        "    }\n",
204 |        "\n",
205 |        "    .dataframe thead th {\n",
206 |        "        text-align: right;\n",
207 |        "    }\n",
208 |        "</style>\n",
209 |        "<table border=\"1\" class=\"dataframe\">\n",
210 |        "  <thead>\n",
211 |        "    <tr style=\"text-align: right;\">\n",
212 |        "      <th></th>\n",
213 |        "      <th>0</th>\n",
214 |        "      <th>1</th>\n",
215 |        "      <th>2</th>\n",
216 |        "      <th>3</th>\n",
217 |        "    </tr>\n",
218 |        "  </thead>\n",
219 |        "  <tbody>\n",
220 |        "    <tr>\n",
221 |        "      <th>0</th>\n",
222 |        "      <td>0.003762</td>\n",
223 |        "      <td>0.007630</td>\n",
224 |        "      <td>0.005264</td>\n",
225 |        "      <td>0.014630</td>\n",
226 |        "    </tr>\n",
227 |        "    <tr>\n",
228 |        "      <th>1</th>\n",
229 |        "      <td>0.001084</td>\n",
230 |        "      <td>0.002251</td>\n",
231 |        "      <td>0.002387</td>\n",
232 |        "      <td>0.000991</td>\n",
233 |        "    </tr>\n",
234 |        "    <tr>\n",
235 |        "      <th>2</th>\n",
236 |        "      <td>0.000217</td>\n",
237 |        "      <td>0.001149</td>\n",
238 |        "      <td>0.000799</td>\n",
239 |        "      <td>0.000178</td>\n",
240 |        "    </tr>\n",
241 |        "    <tr>\n",
242 |        "      <th>3</th>\n",
243 |        "      <td>0.000471</td>\n",
244 |        "      <td>0.000526</td>\n",
245 |        "      <td>0.001402</td>\n",
246 |        "      <td>0.000415</td>\n",
247 |        "    </tr>\n",
248 |        "    <tr>\n",
249 |        "      <th>4</th>\n",
250 |        "      <td>0.002461</td>\n",
251 |        "      <td>0.001258</td>\n",
252 |        "      <td>0.000957</td>\n",
253 |        "      <td>0.000706</td>\n",
254 |        "    </tr>\n",
255 |        "    <tr>\n",
256 |        "      <th>...</th>\n",
257 |        "      <td>...</td>\n",
258 |        "      <td>...</td>\n",
259 |        "      <td>...</td>\n",
260 |        "      <td>...</td>\n",
261 |        "    </tr>\n",
262 |        "    <tr>\n",
263 |        "      <th>974</th>\n",
264 |        "      <td>0.000141</td>\n",
265 |        "      <td>0.000167</td>\n",
266 |        "      <td>0.000026</td>\n",
267 |        "      <td>0.000326</td>\n",
268 |        "    </tr>\n",
269 |        "    <tr>\n",
270 |        "      <th>975</th>\n",
271 |        "      <td>0.000030</td>\n",
272 |        "      <td>0.000044</td>\n",
273 |        "      <td>0.000069</td>\n",
274 |        "      <td>0.000031</td>\n",
275 |        "    </tr>\n",
276 |        "    <tr>\n",
277 |        "      <th>976</th>\n",
278 |        "      <td>0.000287</td>\n",
279 |        "      <td>0.000068</td>\n",
280 |        "      <td>0.000032</td>\n",
281 |        "      <td>0.000397</td>\n",
282 |        "    </tr>\n",
283 |        "    <tr>\n",
284 |        "      <th>977</th>\n",
285 |        "      <td>0.000135</td>\n",
286 |        "      <td>0.000182</td>\n",
287 |        "      <td>0.000036</td>\n",
288 |        "      <td>0.000130</td>\n",
289 |        "    </tr>\n",
290 |        "    <tr>\n",
291 |        "      <th>978</th>\n",
292 |        "      <td>0.000183</td>\n",
293 |        "      <td>0.000211</td>\n",
294 |        "      <td>0.000104</td>\n",
295 |        "      <td>0.000023</td>\n",
296 |        "    </tr>\n",
297 |        "  </tbody>\n",
298 |        "</table>\n",
299 |        "<p>979 rows × 4 columns</p>\n",
300 |        "</div>"
301 |       ],
302 |       "text/plain": [
303 |        "            0         1         2         3\n",
304 |        "0    0.003762  0.007630  0.005264  0.014630\n",
305 |        "1    0.001084  0.002251  0.002387  0.000991\n",
306 |        "2    0.000217  0.001149  0.000799  0.000178\n",
307 |        "3    0.000471  0.000526  0.001402  0.000415\n",
308 |        "4    0.002461  0.001258  0.000957  0.000706\n",
309 |        "..        ...       ...       ...       ...\n",
310 |        "974  0.000141  0.000167  0.000026  0.000326\n",
311 |        "975  0.000030  0.000044  0.000069  0.000031\n",
312 |        "976  0.000287  0.000068  0.000032  0.000397\n",
313 |        "977  0.000135  0.000182  0.000036  0.000130\n",
314 |        "978  0.000183  0.000211  0.000104  0.000023\n",
315 |        "\n",
316 |        "[979 rows x 4 columns]"
317 |       ]
318 |      },
319 |      "execution_count": 66,
320 |      "metadata": {},
321 |      "output_type": "execute_result"
322 |     }
323 |    ],
324 |    "source": [
325 |     "combined_means"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 67,
331 |    "id": "f2040385-8056-4c9b-bdee-ee1ab665c26f",
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/html": [
337 |        "<div>\n",
338 |        "<style scoped>\n",
339 |        "    .dataframe tbody tr th:only-of-type {\n",
340 |        "        vertical-align: middle;\n",
341 |        "    }\n",
342 |        "\n",
343 |        "    .dataframe tbody tr th {\n",
344 |        "        vertical-align: top;\n",
345 |        "    }\n",
346 |        "\n",
347 |        "    .dataframe thead th {\n",
348 |        "        text-align: right;\n",
349 |        "    }\n",
350 |        "</style>\n",
351 |        "<table border=\"1\" class=\"dataframe\">\n",
352 |        "  <thead>\n",
353 |        "    <tr style=\"text-align: right;\">\n",
354 |        "      <th></th>\n",
355 |        "      <th>0</th>\n",
356 |        "      <th>1</th>\n",
357 |        "      <th>2</th>\n",
358 |        "      <th>3</th>\n",
359 |        "    </tr>\n",
360 |        "  </thead>\n",
361 |        "  <tbody>\n",
362 |        "    <tr>\n",
363 |        "      <th>0</th>\n",
364 |        "      <td>1.000000</td>\n",
365 |        "      <td>0.690757</td>\n",
366 |        "      <td>0.703157</td>\n",
367 |        "      <td>0.723252</td>\n",
368 |        "    </tr>\n",
369 |        "    <tr>\n",
370 |        "      <th>1</th>\n",
371 |        "      <td>0.690757</td>\n",
372 |        "      <td>1.000000</td>\n",
373 |        "      <td>0.698095</td>\n",
374 |        "      <td>0.634515</td>\n",
375 |        "    </tr>\n",
376 |        "    <tr>\n",
377 |        "      <th>2</th>\n",
378 |        "      <td>0.703157</td>\n",
379 |        "      <td>0.698095</td>\n",
380 |        "      <td>1.000000</td>\n",
381 |        "      <td>0.589254</td>\n",
382 |        "    </tr>\n",
383 |        "    <tr>\n",
384 |        "      <th>3</th>\n",
385 |        "      <td>0.723252</td>\n",
386 |        "      <td>0.634515</td>\n",
387 |        "      <td>0.589254</td>\n",
388 |        "      <td>1.000000</td>\n",
389 |        "    </tr>\n",
390 |        "  </tbody>\n",
391 |        "</table>\n",
392 |        "</div>"
393 |       ],
394 |       "text/plain": [
395 |        "          0         1         2         3\n",
396 |        "0  1.000000  0.690757  0.703157  0.723252\n",
397 |        "1  0.690757  1.000000  0.698095  0.634515\n",
398 |        "2  0.703157  0.698095  1.000000  0.589254\n",
399 |        "3  0.723252  0.634515  0.589254  1.000000"
400 |       ]
401 |      },
402 |      "execution_count": 67,
403 |      "metadata": {},
404 |      "output_type": "execute_result"
405 |     }
406 |    ],
407 |    "source": [
408 |     "combined_means.corr()"
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": 68,
414 |    "id": "7aa9b78a-9769-49f2-ab16-e30ef48a91a1",
415 |    "metadata": {},
416 |    "outputs": [],
417 |    "source": [
418 |     "corr_mat = combined_means.corr().abs()\n",
419 |     "mask = np.tril(np.ones_like(corr_mat, dtype=bool)) "
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 69,
425 |    "id": "8c5f0225-033b-4559-bb16-6500ed35ed8f",
426 |    "metadata": {},
427 |    "outputs": [
428 |     {
429 |      "data": {
430 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdAAAAFpCAYAAAAsmHm9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAp80lEQVR4nO3dd5gc1Znv8d87PVGjUUJplCMggQRIQiSTk5C9lu1rewETDNgC2+wa4yRfXweW595l7V17HTAwa4PFYsDYJmhtQAQjk4wiAkko51HOk0cT3vvHtETPqCdV18T6fp6nHnVX1ak6fRjm7fecU2fM3QUAAFonraMrAABAV0QABQAgAAIoAAABEEABAAiAAAoAQAAEUAAAAiCAAgC6BDN72Mz2mtnKRo6bmf3czDaY2ftmNiXh2AwzWxs/NieM+hBAAQBdxW8lzWji+DWSxse32ZIekCQzi0m6P358oqTrzGxiqpUhgAIAugR3f13SwSZOmSXpUa/zjqQ+ZpYvabqkDe6+yd2PSnoyfm5KCKAAgO5iqKTtCe8L4/sa25+S9FQv0AKsFdjGpv1gfkdXIRJqqms7ugrdXm0Nvy7a2nv3zbS2unbmWbem9B+wavkjt6uu6/WYAncvaMUlkn02b2J/StojgAIAIsDSYimVjwfL1gTMhgolDU94P0zSTkmZjexPCV24AIBQWFospS0E8yTdFJ+Ne66kI+6+S9JiSePNbLSZZUq6Nn5uSshAAQChCCkINn59syckXSKpv5kVSvqBpAxJcvcHJT0vaaakDZLKJN0SP1ZtZndKmi8pJulhd1+Van0IoACALsHdr2vmuEv6SiPHnlddgA0NARQAEIq2zkA7GwIoACAUFiOAAgDQamkRy0CZhQsAQABkoACAUDAGCgBAAARQAAACsLRojQoSQAEAoYhaBhqtrwsAAISEDBQAEIqoZaAEUABAKAigAAAEwEpEAAAEELUMlElEAAAEQAYKAAhF1DJQAigAIBRRW0yeAAoACEXUMlDGQAEACIAMFAAQiqhloARQAEAoCKAAAARAAAUAIICoBVAmEQEAEAAZKAAgFKyFCwBAAFHrwiWAAgBCQQAFACCAqAVQJhEBABAAGSgCGz0gV9+cOUGTh/VRcUWVnl22Q/+1YINqvfmyl04YqM9fOEZjB/ZURVWNPthRpG/9frkqqmqOn3PrRWP0qanD1Dc3U5v3leqXr6zTOxsPtOEn6jrGDOipb/3DBE0e3lclFVV6ZkmhHvrr+ibb/vbLxumOy8cnPfaL+Wv18Oub2qi2nd+YgT015+MTNXlEXxVXVOmZxdv14CtNt+cdV4zXl65I3p4/e3GtHl6wUWkm3XzRGF106kCNGdhTkrR6xxH94qV1WlV4pC0+SodKS7OOrkK7IoAikLzsdP3qpmnatK9UX3/iXQ3rl6O7rj5FaSY98NcNTZadNWWovjVzgh59a4t+/tJa5WVn6Owx/RRL+J/v8xeO1hcuHquHXtugtbuKNPOMIfrp9VN0228W6oOdRW398Tq1vOx0PXjr2dq0t0Rfe2yphp/UQ3dfc6rMpF+9sr7Rcs8sKdTb6/fX23fphIG65eKxenP9vraudqeVl5Ouh74wXZv2lOiuR+va8+sfPVVmpvtfWtdouacXbddba+u322WnDdKtl4zVW2v3SpKyMmK69ZKxem5JoX6zYKPcpWvPH6nf3nGubnrg71q9o3v9LBsBFGje/zp7uLIyYvrW799VaWWNFm6ScrPSNfuScXr0rc0qraxJWq53jwzdPeNU/fiFNXp2aeHx/QvW7D3+Oj1m+vxHxmjum5s1983NkqR3Nh7Q6AG5+uIlY/W1x99t2w/XyX1m+ghlZcT09cffVWlltRZuPKDcrHTdftl4zX1js0orq5OW21tUob1FFfX2ffHSsdq0t0TrdhW3R9U7pc+cM1LZGTHd/dgylVZW650NdT/Ld1wxXr/926ZWte
fsy8dp094SrY23Z2VVjWb+6DUVl394jYUb92veNy7WdeeN0vf/+H7bfbAOYBatAMoYKAI5f1x/vbNhf71AOX/lbmVnxjRlZL9Gy1152mBJ0p+X72j0nGF9e6hndroWbarfXbtw4wGdM7a/0mPR+p+0oQtOHqC/r99X7xf7/Pd3KSczpqmjG2/7hnrlZOjcsf01//1dbVHNLuMjpwzQ2+vqt+eL7+1UTmZM08a0rj3PG9dfL7638/i+Wle94ClJ1TWujXtK1LdnZuqVR4cigCKQUf1ztWV/ab19e45UqPxotUYNyG203OnDemvr/lLNmjJMf7n7Yr3z/Sv12y+eo8nD+xw/Jyu97seyqqa2XtmjNbXKTE/T0L49wvsgXdCoAbnavK9+2+8+1vb9G2/7hq44fbAy0tP04vs7mz+5GxvdVHs28bPc0JWT4u35XtNfSDJiaZo4tLc27SkJVN/OLC3NUtpawsxmmNlaM9tgZnOSHP+mmS2PbyvNrMbM+sWPbTGzFfFjS1L+vKleANHUKydDxRUndm0VlVcrLzuj0XIn9czSyP65uu2iMfrFy+v0tcffVfnRGv38hqnql1v3jbzwULlqa12nDe1dr+yx971zGr9+FOQ10fa9WtE2V0/K1wc7jmjbgbIwq9fl5OVkqLi86oT9ReVVrWrPGWfk64PCI9ra4ItlQ1+8bKx65aTr6cXbW13Xzs7SLKWt2eubxSTdL+kaSRMlXWdmExPPcfcfu/uZ7n6mpO9I+pu7H0w45dL48Wmpft5mA6iZnWpm3zazn5vZz+KvJ6R6Y3R9rhOnKNYNgTQ+dTHN6saX7n1ulV5csUt/37Bf33jyXdW667PnjJAklVZWa/7KXbrlwjGaOqqfeuVk6B/PGaFzxpwkSappyTTfbs49edsn+2+STP+8LE0d3U8vRrz79phkrWYyJWnmpOra8yS98F7T2fyFpwzQFy4dp/98YW2zgbYrausAKmm6pA3uvsndj0p6UtKsJs6/TtITIXy0pJoMoGb2bdVV0CQtkrQ4/vqJZKlzQrnZZrbEzJYUFBSEWV90EkXlVUkzzZ5Z6Umzo2OOxL/pL93y4RfC0soard5ZpDEDeh7f9x8vrNHmfSV66Jaz9dc5l+nG80fpN/HHLA6WVob1Mbqk4vIq5SXJjHpmpZ8w3taYK08fLJP00goCaHF5lfKyT5xP2TM7XcUVJ2amyVw1OV8mNTmefNqw3vrR9Wfpjwu36XdvbQlY284tzSylLTF2xLfZDW4xVFJi6l4Y33cCM+shaYakPyXsdkkvmdnSJNduteZm4d4m6TR3r/dTZGY/kbRK0n3JCrl7gaRjkZN0oRvasr/0hPG2Qb2y1SMrXVv2Nf7Nesu+UtXWuhp+1zSTahO+7h8uq9KX5i7RwF5Z6pmVrq0HynTduSO1v7hSuw5XKMq27CvV6IZt3zve9i3MamZMztfyrYe050i021KSNu8r1eiEL29SQns28bOcaMbkfL3bRHuO7J+rX35+mhZuPKD75q1Kuc7dVYPYkUyyNLWxGPMPkt5q0H17gbvvNLOBkl42szXu/nrA6jbbhVsraUiS/fnxY4iotzfs17lj+6tH5odLd115+mBVHK3Rsq0HGy33xrp9SkszTUuYLZqbla4J+b20fveJj1LsLarUpn2liqWZPn7WUM17t/CEc6LmrXX7dN74+m1/1aR8lR+t0dLNjbf9Mfl9cjR5RF+6b+PeXLtP559cvz2vnlzXnks2Nd+eQ/rm6IyRffXC8uTdt/3zsvTArWdr+4EyzXni3RYtNNJVtUMXbqGk4Qnvh0lqrN/8WjXovnX3nfF/90p6RnVdwoE1l4HeJelVM1uvD9PmEZLGSbozlRuja/vT4u269pyR+vG1Z2rum5s1tG8Pzb5krH739y31Hm155p8v1LKtB3Xvc3XfulfvLNKC1Xv0vVmn6xcvr9PhsqO66SOjVV3remrRtuPlZk7OV3osTTsOlWlw7xxdf95I1b
rrkTc2t/tn7Wz+sGibrj1/pP7jc1P029c3aWi/HrrjsnF67K36z4A+d/dFWrb5oO55ZmW98jMm56uqplavrNzd3lXvlP6wcKuuP3+kfnLjVD3yt40a1q+HvnTFeP33m/Xb83++cbGWbj6oH/5pRb3yTbVnVnqa7r/lbOXlZOhfn1ul8YN7HT9WVVOrNd1sUZB2WEhhsaTxZjZa0g7VBcnrT6iHWW9JF0u6IWFfrqQ0dy+Ov75K0r+kUpkmA6i7v2hmJ6suSg9VXfpcKGmxuyd/Uh6RUFxRrS/NXaxvzZygn1w/RSUV1Xr8na0qeK3+KkSxtLqxjUTfe3qFvnrVyfrajFOUnRHTe9sO647fLq43dmpmuvkjozW4d7ZKKqv1tzV7df8r61V+lB+74opq3fGbxfr2P0zUf944VcUVVfrd21v04Kv1VyFKb+TRgKsn52vRxgM6VHa0varcqRWXV2v2rxfpO7Mm6uc3T1NxeZUee3OzHmiwqlMs1kh7njFEizYc0KHSE9vzpLwsnTqkLmj+8paz6x3bcahMM/9tQXgfpBNo66X83L3azO6UNF9STNLD7r7KzO6IH38wfuonJb3k7ol98IMkPRNf7CFd0uPu/mIq9bFks/lC1o07LDqHaT+Y39FViISaakYt2lptDb8u2tp7981ssyh3xpznU/oP2JZ1awss5QcACIVFbGUBAigAIBRRWwuXAAoACAV/zgwAgACi9ufMItZjDQBAOMhAAQChiFoGSgAFAISi4TPf3R0BFAAQCjJQAAACiFoAZRIRAAABkIECAELBc6AAAATASkQAAAQQtbVwI/ZxAQAIBxkoACAUjIECABBA1B5jIYACAELBJCIAAAKIWhcuk4gAAAiADBQAEArGQAEACCBGAAUAoPUIoAAABBC1AMokIgAAAiADBQCEImoZKAEUABAKAigAAAGkRyyAMgYKAEAAZKAAgFDQhQsAQAAEUAAAAoilRWtUkAAKAAhF1DLQaH1dAAB0aWY2w8zWmtkGM5uT5PglZnbEzJbHt++3tGxrkYECAELR1hmomcUk3S/pSkmFkhab2Tx3/6DBqW+4+8cClm0xAigAIBTt0IU7XdIGd98kSWb2pKRZkloSBFMpmxQBtBtYcs/VHV2FSJj6vRc7ugrdXs8+2R1dBaQgZqkFUDObLWl2wq4Cdy9IeD9U0vaE94WSzklyqfPM7D1JOyV9w91XtaJsixFAAQChSDUDjQfLgiZOSXYDb/B+maSR7l5iZjMlPStpfAvLtgqTiAAAXUWhpOEJ74epLss8zt2L3L0k/vp5SRlm1r8lZVuLDBQAEIp2GANdLGm8mY2WtEPStZKuTzzBzAZL2uPubmbTVZcoHpB0uLmyrUUABQCEoq0Xk3f3ajO7U9J8STFJD7v7KjO7I378QUmflvQlM6uWVC7pWnd3SUnLplIfAigAIBTtsZBCvFv2+Qb7Hkx4/UtJv2xp2VQwBgoAQABkoACAUERtKT8CKAAgFARQAAACIIACABBA1AIok4gAAAiADBQAEIqoZaAEUABAKAigAAAEQAAFACCAqAVQJhEBABAAGSgAIBRRy0AJoACAUMSMAAoAQKulRSyAMgYKAEAAZKAAgFDEopWAEkABAOFIYxIRAACtxyQiAAACYBIRAABoFhkoACAUTCICACAAJhEBABBA1MZACaAAgFBErQuXSUQAAARABgoACAVduAAABMCfMwMAIAAyUACd3ugBufrWRydq8vA+Kq6o0rNLC1Xw2gbVeuNlZl86TrdfNi7psV++vE6PvL6pjWrb+Y06qYfuuvxknTakl0oqq/Xn93fpkbc3N9mex1w0vr9uOGekxvTPVUV1rdbsLtL/eW6lKqpqJUm3XjBKF40foMG9smUmbTtYpicWbddf1+5t40+FtkYABbqYvOx0PfD5s7VpX4nufnyZhvXroa/NOEVmpgdeXd9ouWeXbtfb6/fV23fphEH6/EVj9Na6fY2U6v56ZqXrp5
89U1sOlOo7z6zQ0D45+sol42Qm/frNzU2W/dikfN11xXg9sWi7fvW3jcrLTtfUEX3rrQmbm5muF1bu1pYDpap11yUnD9Q9Hz9Ntc+5FnSzdo/aLFwCKNDFfHr6CGVlxPTNJ95VaWWNFm48oNysdN1+6Tg9+uYmlVbWJC23t6hSe4sq6+374iVjtXlfidbtLm6PqndKnzhziLLS0/TdZ1eq7GiNlmw9pNysdN1y/ig9vmibyo4mb8/eORn6p8vG6Wevrtf/vL/r+P431u+vd94vXttQ7/3iLYc0un+urj5tcLcLoFHrwuUxFqCLOX98f/19w/56gfKlFbuUnRnTlFH9WnydXjkZOmdsf81P+OUfReeMPkmLNh+sFyhfWb1H2RkxnTm8T6PlLj1lgCTphZW7W33PI+VVyuiG6VoszVLauhoCKNDFjOqfqy37Suvt232kQuVHqzWqf26Lr3P5aYOUkZ6m+SuiHUBH9uuhrQfL6u3bW1yp8qM1GtmvR6PlJub30raDZfrY5Hz96Y7z9NrdF+uhz03V6UN6JT0/ZqaeWem6csIgnT2qr55bvjPUz9EZpFlqW0uY2QwzW2tmG8xsTpLjnzOz9+Pb22Z2RsKxLWa2wsyWm9mSVD8vXbhAF9MrJ0PFFVUn7C8qr1avnIwWX+fqSflaveOIth0oa/7kbiwvO10lldUn7C+urFJeduPteVJulkb066Gbzh2lB/62UUfKq3T99BH690+foet+/Y4OlX3432hifi89dMNUSVJ1Ta1++up6vbFhf2OXRiPMLCbpfklXSiqUtNjM5rn7BwmnbZZ0sbsfMrNrJBVIOifh+KXuHkrjE0CBrijJ7FCzpLuT6t8zS1NG9dMvXlobarW6Kk/WnjJ5sgPHjpvUIzNd33tulRZtOShJWrnziP54+3n61FnD9Ju3PpyAtGl/ib7w6BL1zE7X+WNO0tcuH6/Symq9uqZ7zcRthz+oPV3SBnffJElm9qSkWZKOB1B3fzvh/HckDWurygTuwjWzW5o4NtvMlpjZkoKCgqC3AJBEUXmVeuac+N23Z1a6istPzEyTufL0wTJJL61o/fhdd1NcUa2e2Se2Z25WLGlmekxRvBdg+fbDx/eVHa3R2j3FGtW/ftdvRVWt1u4p1tKth/SL1zZo/gd79KWLx4bzATqRNLOUtsTYEd9mN7jFUEnbE94Xxvc15jZJLyS8d0kvmdnSJNdutVQy0HskPZLsgLsXqC5tllr+pRhAC2zZX6pR/XvW2zeoV7Z6ZKVry/7SRkrVd9WkfC3fdkh7iiraoopdytaDZSeMdQ7My1KPzPQTxkbrlTtQplp3qUHSVZe5Nn3PdXuK9dFJ+YqlmWpa8rBpFxFLcVZNg9iRTLIUN2kDmtmlqgugH0nYfYG77zSzgZJeNrM17v560Po2+XETBmIbbiskDQp6UwDBvb1+v84bd5J6ZMaO77tq0mBVHK3RsnhXYlPy++Ro8og+kZ99e8zCzQc0fVQ/5WR82J6XnTpQFVU19bLLht7eeEBpZpqSMFM3NzOmkwf11Ia9JU3ec9LQ3tpTVNGtgqeUegbaAoWShie8HybphNlYZjZZ0q8lzXL3A8f2u/vO+L97JT2jui7hwJrLQAdJulrSoYb1k/T2iacDaGt/XLRN1547Qj++7izNfWOzhvbL0exLx+mxt7fUe7Tl2bsu1NIth3Tvsyvrlb960mBV19TqlVV030rSs8t36tNThun/fuJ0/W7RNg3pna1bzh+l3y/ZXu/Rlie+cI6Wbz+sf5tfN268dk+x3li/T3NmnKoHX990fBJRTa3r6Xd3SJIG9crSd2ZM0Cur92jnkXLlZMR00fgBumLCIP07489BLJY03sxGS9oh6VpJ1yeeYGYjJD0t6UZ3X5ewP1dSmrsXx19fJelfUqlMcwH0z5J6uvvyhgfMbEEqNwYQTHFFte54ZLG+/bGJ+ukNU1RSUaXH/75FD/21/gP7sbS0pJM6rpqUr0WbDu
hwWcvGS7u7kspq3fXUct11+cn6t09OUklltZ5aUqhH3q6/ClGyZxXv/ctqffmSsbrz0nHKTk/Tip1H9NXfLz8+dlpSUa0DpZW66byR6pebqZLKam3ZX6Zv/vE9vbO5+d6CrqatJxG5e7WZ3SlpvqSYpIfdfZWZ3RE//qCk70s6SdKvrK4+1e4+TXUJ4TPxfemSHnf3F1OpjzU1yywk3auPApE19Xsp/b+GFujRK6ujq9DtvfHNS9ssyi3dfjil3/dTh/fpUqsp8BgLACAUqU4i6moi9nEBAAgHGSgAIBRRW0yeAAoACEXE4icBFAAQjrSk6xx0XwRQAEAoopaBMokIAIAAyEABAKHogn8TOyUEUABAKKLWhUsABQCEImqTiBgDBQAgADJQAEAo6MIFACAAJhEBABBAxOInARQAEI6orYXLJCIAAAIgAwUAhCJiCSgBFAAQjqh1aRJAAQChsIiloARQAEAoovYYS9QybgAAQkEGCgAIRcR6cAmgAIBwRK1LkwAKAAhF1CYRRe0LAwAAoSADBQCEImqzcAmgAIBQRCx+EkABAOEgAwUAIAAmEQEAgGaRgQIAQhG1LlwyUABAKCzFrUX3MJthZmvNbIOZzUly3Mzs5/Hj75vZlJaWbS0CKAAgFGlmKW3NMbOYpPslXSNpoqTrzGxig9OukTQ+vs2W9EAryrbu86ZSGACAY8xS21pguqQN7r7J3Y9KelLSrAbnzJL0qNd5R1IfM8tvYdlWIYACALqKoZK2J7wvjO9ryTktKdsqTCICWmjpvTM6ugrd3tW/equjq4AUmHtq5c1mq67b9ZgCdy9IPCVJsYY3beyclpRtFQIoACAcXpta8bpgWdDEKYWShie8HyZpZwvPyWxB2VahCxcAEArz2pS2FlgsabyZjTazTEnXSprX4Jx5km6Kz8Y9V9IRd9/VwrKtQgYKAOgS3L3azO6UNF9STNLD7r7KzO6IH39Q0vOSZkraIKlM0i1NlU2lPgRQAEA4UuzCbdEt3J9XXZBM3PdgwmuX9JWWlk0FARQAEI4UJxF1NQRQAEA42iED7UwIoACAULRwIlC3wSxcAAACIAMFAIQjYhkoARQAEA4CKAAAARBAAQAIoDZaAZRJRAAABEAGCgAIRdQeYyGAAgDCQQAFACCAiC3lxxgoAAABkIECAMJBFy4AAK3HJCIAAIIggAIAEEDEAiiTiAAACIAMFAAQjohloARQAEAomEQEAEAQEVtMngAKAAgHKxEBAIDmkIECAMLBGCgAAK3HJCIAAIKIWABlDBQAgADIQAEA4YhYBkoABQCEo7amo2vQrgigACJvRN8cffnCMZowKE+lR2v04gd79NiSbapt4rHGQXlZevTGaSfsX7B+n/715XX19l03dZhmThysPjkZ2nqoTI+8s1VLtx8O+VN0PGchBQCIjp5ZMd338dO17WCZ7nlhtfJ752j2+aNkJs1dtK3Z8gVvbdaq3UXH3xeVV9c7/o9Thur6acP134u2aeP+Ul128gDdM3OC7n5mhdbtLQn983QoMlAAiI6PnjZYmbE0/cuLa1RWVSMVHlGPjJhuOHu4/vDujrp9TSg8XK41e5IHwvQ00z9OGaY/LNuhp97dIUlauv2wRvbtoRumDdf3n18d+udB+2EWLoBIO3tEXy3dfqheoFywYZ+yM2KaNKRXStfO752t3Mx0LSs8XG//ssLDOmt4H6WnWUrX73Rqa1LbUmBm/czsZTNbH/+3b5JzhpvZa2a22sxWmdlXE4790Mx2mNny+DazuXuSgQKItOF9crR8x5F6+/aVHFVFVY2G9+2hhVsPNVn+7svGKy8rXYfLq7Rg/T79duE2Ha2pGwvMjNXlKNUNxgaramqVGUtTfq9sbT9cHuKn6Vhe06FduHMkveru95nZnPj7bzc4p1rS1919mZnlSVpqZi+7+wfx4z91939v6Q2bDaBmdqqkoZIWuntJwv4Z7v5iS28EAJ1Rz6x0lVZWn7C/uLJaeV
mxRstV1dRq3opdWrr9sMqOVmvy0N767FlDNaR3tn74whpJ0q6iCtW66+SBefW6eU8ZmCdJysvuZjlMx04imiXpkvjruZIWqEEAdfddknbFXxeb2WrVxbcPFECTXbhm9s+SnpP0T5JWmtmshMP/r4lys81siZktKSgoCFIvAGg3ySbbWiP7jzlYVqX739ikd7Yc1Ps7i/TY4u0qeGuLzht9ksaclCtJKjtaowXr9+u6qcN0xpDeystK18cn5eusYb0lSTVNTfPtilLswk2MHfFtdivuPigeII8FyoFNnWxmoySdJWlhwu47zex9M3s4WRdwQ819/fmipKnuXhK/2R/NbJS7/0x1P19JuXuBpGORs5v9hADoTkoqq5WbeeKvwtysdJVUtq5L8o2N+/VPF4/VuAG52nSgVJL04Jub9L+vOkU/+sTpkqS9xZV6fGmhbpo+QofLq1L/AN1Ig9hxAjN7RdLgJIe+25r7mFlPSX+SdJe7H5tC/YCke1UXs+6V9B+Sbm3qOs0F0Nixblt332Jml6guiI5UEwEUALqK7YfLNbxvTr19A3pmKicjpu2Hylp1rWTZwpGKan173ir1z81UbmZM2w+X61NnDNGB0qPaU1yZQs07H2/jx1jc/YrGjpnZHjPLd/ddZpYvaW8j52WoLnj+zt2fTrj2noRz/kvSn5urT3OzcHeb2ZkJNyiR9DFJ/SVNau7iANDZLd52SNOG91FOxofjnReP66+Kqhqt2FnURMkTXTi2vyRp/b4TH2vZX3pUWw+VK5ZmuurUQXppzZ4TzunyamtT21IzT9LN8dc3q274sR4zM0m/kbTa3X/S4Fh+wttPSlrZ3A2by0BvUt2spePcvVrSTWb2UHMXB4DO7i+rdmvWpCH6/oxT9dS7hRrcK1s3nD1CT7+3s96jLY98bore31mkn762QZJ0w9nD1SMjplW7i1R2tEaT8nvr02cN0Zsb92vzgQ8z18tPHqBYmml3UYUG5mXpk5OHqNZdTy4tbPfP2tbaOgNtxn2SnjKz2yRtk/QZSTKzIZJ+7e4zJV0g6UZJK8xsebzc/3b35yX9KJ4wuqQtkm5v7oZNBlB3b/S/sLu/1dzFAaCzK6ms0Zx5K/WVC8fonpkTVFJZo6ff26nHFtdfhSjNTImPbW4/VK5PnzlUMyYMUmZ6mvaVVOqP7+7UE0u31ytnJn32rGEalJel0qPVenvzQT3yzlZVVEdr2bu25u4HJF2eZP9OSTPjr99UI8OP7n5ja+9p7m0+x4dJRABa5Opf8b28rc3/8gVtNn/l6FtPpfT7PvOCz3apuTXd7CEkAECHYTF5AABar4NXImp3BFAAQDgi9tdYWEweAIAAyEABAOGIWAZKAAUAhMKZRAQAQAARy0AZAwUAIAAyUABAOCKWgRJAAQChYAwUAIAgyEABAAggYgGUSUQAAARABgoACAVr4QIAEASTiAAACCBiY6AEUABAKDxiAZRJRAAABEAGCgAIBQspAAAQgNcQQAEAaLWoBVDGQAEACIAMFAAQCsZAAQAIIGpduARQAEAoCKAAAARQG7G1cJlEBABAAGSgAIBQMIkIAIAAGAMFACAAAigAAAFErQuXSUQAgC7PzPqZ2ctmtj7+b99GzttiZivMbLmZLWlt+UQEUABAKGpralPaUjRH0qvuPl7Sq/H3jbnU3c9092kBy0sigAIAQuI1tSltKZolaW789VxJn2jr8oyBAug05n/5go6uAlKQahA0s9mSZifsKnD3ghYWH+TuuyTJ3XeZ2cBGznNJL5mZS3oo4fotLX8cARQA0CnEg1mjAdPMXpE0OMmh77biNhe4+854gHzZzNa4++utrKokAigAICRtPQvX3a9o7JiZ7TGz/Hj2mC9pbyPX2Bn/d6+ZPSNpuqTXJbWofCLGQAEAoejgMdB5km6Ov75Z0nMNTzCzXDPLO/Za0lWSVra0fENkoACAUHTwQgr3SXrKzG6TtE3SZyTJzIZI+rW7z5Q0SN
IzZibVxb/H3f3Fpso3hQAKAAhFbQcupODuByRdnmT/Tkkz4683STqjNeWbQhcuAAABkIECAELBWrgAAATgEfuD2gRQAEAooraYPAEUABCKqHXhMokIAIAAyEABAKGIWgZKAAUAhCKEP0nWpRBAAQChiNokIsZAAQAIgAwUABAKxkABAAjAa7yjq9CuCKAAgFAwiQgAgAC8NloZKJOIAAAIgAwUABCKWsZAAQBoPWbhAgAQALNwAQAIIGpduEwiAgAgADJQAEAoGAMFACCA2og9B0oABQCEImqTiBgDBQAgADJQAEAoWAsXAIAAotaFSwAFAISCAAoAQABR68JlEhEAAAGQgQIAQhG1vwdKAAUAhCJqa+ESQAEAoYjaUn6MgQIAQuE1ntKWCjPrZ2Yvm9n6+L99k5xzipktT9iKzOyu+LEfmtmOhGMzm7snARQA0B3MkfSqu4+X9Gr8fT3uvtbdz3T3MyVNlVQm6ZmEU3567Li7P9/cDZvtwjWz6XX39cVmNlHSDElrWnJxAEB0dPAY6CxJl8Rfz5W0QNK3mzj/ckkb3X1r0Bs2mYGa2Q8k/VzSA2b2r5J+KamnpDlm9t0mys02syVmtqSgoCBo3QAAXYjX1qa0JcaO+Da7Fbcf5O67JCn+78Bmzr9W0hMN9t1pZu+b2cPJuoAbMvfGvzGY2QpJZ0rKkrRb0jB3LzKzHEkL3X1yczeQFK1pWQDQuVlbXXj+yVNS+n1/9bplTdbNzF6RNDjJoe9KmuvufRLOPeTuSYOgmWVK2inpNHffE983SNJ+1cWseyXlu/utTdWnuS7canevkVRmZhvdvUiS3L3czKI13QoA0KHc/YrGjpnZHjPLd/ddZpYvaW8Tl7pG0rJjwTN+7eOvzey/JP25ufo0N4noqJn1iL+emnDx3pIIoACA4zpyFq6keZJujr++WdJzTZx7nRp038aD7jGflLSyuRs2l4Fe5O6VkuTuiQEzI6GiAAB09HOg90l6ysxuk7RN0mckycyGSPq1u8+Mv+8h6UpJtzco/yMzO1N1Xbhbkhw/QZNjoCFhDBQAOo82GwP9y/DJKf2+/+j299usbm2BlYgAAKGI2p8zYyEFAAACIAMFAISitu2HBDsVAigAIBQ1BFAAAFovYkOgBFAAQDiiloEyiQgAgADIQAEAoaALFwCAAKLWhUsABQCEImoZKGOgAAAEQAYKAAgFXbgAAAQQtS5cAigAIBQEUAAAAohaFy6TiAAACIAMFAAQCrpwAQAIIGpduARQAEAoopaBMgYKAEAAZKAAgFDQhQsAQABR68IlgAIAQkEGCgBAALUdXYF2xiQiAAACIAMFAISCLlwAAAJgEhEAAAGQgQIAEEDUMlAmEQEAEAAZKAAgFFHrwiUDBQCEosZT21JhZp8xs1VmVmtm05o4b4aZrTWzDWY2J2F/PzN72czWx//t29w9CaAAgFDUuKe0pWilpE9Jer2xE8wsJul+SddImijpOjObGD88R9Kr7j5e0qvx900igAIAujx3X+3ua5s5bbqkDe6+yd2PSnpS0qz4sVmS5sZfz5X0iebuyRgoACAUXWAW7lBJ2xPeF0o6J/56kLvvkiR332VmA5u7WHsEUGuHe4TKzGa7e0FH16M7o43bHm3cPmjnDz3oW1L6fW9msyXNTthVkNi2ZvaKpMFJin7X3Z9ryS2S7Asc9slAk5stif8h2hZt3PZo4/ZBO4ckHiwbbUt3vyLFWxRKGp7wfpiknfHXe8wsP5595kva29zFGAMFAETFYknjzWy0mWVKulbSvPixeZJujr++WVKzGS0BFADQ5ZnZJ82sUNJ5kv5iZvPj+4eY2fOS5O7Vku6UNF/SaklPufuq+CXuk3Slma2XdGX8fdP39Ig9+NoSjGm0Pdq47dHG7YN2ji4CKAAAAdCFCwBAAATQBI0t8YTwmNnDZrbXzFZ2dF26KzMbbmavmdnq+NJmX+3oOnU3ZpZtZovM7L14G9/T0XVC+6MLNy
6+xNM61Q0eF6puttZ17v5Bh1asmzGziySVSHrU3U/v6Pp0R/Ep+PnuvszM8iQtlfQJfpbDY2YmKdfdS8wsQ9Kbkr7q7u90cNXQjshAP9TUEk8Iibu/LulgR9ejO3P3Xe6+LP66WHWzDYd2bK26F69TEn+bEd/IRiKGAPqhZEs88UsHXZqZjZJ0lqSFHVyVbsfMYma2XHUP3L/s7rRxxBBAPxTqEk9ARzOznpL+JOkudy/q6Pp0N+5e4+5nqm41m+lmxpBExBBAP9TUEk9AlxIfl/uTpN+5+9MdXZ/uzN0PS1ogaUbH1gTtjQD6oaaWeAK6jPgEl99IWu3uP+no+nRHZjbAzPrEX+dIukLSmg6tFNodATSumSWeEBIze0LS3yWdYmaFZnZbR9epG7pA0o2SLjOz5fFtZkdXqpvJl/Samb2vui/fL7v7nzu4TmhnPMYCAEAAZKAAAARAAAUAIAACKAAAARBAAQAIgAAKAEAABFAAAAIggAIAEAABFACAAP4/bXVxjNw17FEAAAAASUVORK5CYII=\n",
431 |       "text/plain": [
432 |        "<Figure size 576x432 with 2 Axes>"
433 |       ]
434 |      },
435 |      "metadata": {
436 |       "needs_background": "light"
437 |      },
438 |      "output_type": "display_data"
439 |     }
440 |    ],
441 |    "source": [
442 |     "plt.figure(figsize=(8,6))\n",
443 |     "sns.heatmap(combined_means.corr(),cmap='RdBu',\n",
444 |     "           mask=mask,vmax=1,vmin=-1,annot=True,annot_kws={\"size\": 15})\n",
445 |     "plt.savefig('remove_g_corr.pdf')\n",
446 |     "plt.show()"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": null,
452 |    "id": "801e7e7d-55b0-488f-a4c9-6c81d8b6ef73",
453 |    "metadata": {},
454 |    "outputs": [],
455 |    "source": [
456 |     "# no statistically significant enrichments found, so no plots "
457 |    ]
458 |   }
459 |  ],
460 |  "metadata": {
461 |   "kernelspec": {
462 |    "display_name": "plot",
463 |    "language": "python",
464 |    "name": "plot"
465 |   },
466 |   "language_info": {
467 |    "codemirror_mode": {
468 |     "name": "ipython",
469 |     "version": 3
470 |    },
471 |    "file_extension": ".py",
472 |    "mimetype": "text/x-python",
473 |    "name": "python",
474 |    "nbconvert_exporter": "python",
475 |    "pygments_lexer": "ipython3",
476 |    "version": "3.9.10"
477 |   }
478 |  },
479 |  "nbformat": 4,
480 |  "nbformat_minor": 5
481 | }
482 | 


--------------------------------------------------------------------------------
/figures/supplementary_figures/.ipynb_checkpoints/g_enrichments-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 61,
  6 |    "id": "23720687-2ad7-4d94-8df1-39b834c5e456",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd \n",
 12 |     "import anndata \n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "import shap as shap \n",
 15 |     "import seaborn as sns\n",
 16 |     "import math as math"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 62,
 22 |    "id": "654a8332-3a7a-4e98-9e86-9a9f44072a58",
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# drop G\n",
 27 |     "aux_0 = pd.read_csv('kang_remove_g/aux_0.csv')\n",
 28 |     "aux_1 = pd.read_csv('kang_remove_g/aux_1.csv')\n",
 29 |     "aux_2 = pd.read_csv('kang_remove_g/aux_2.csv')\n",
 30 |     "aux_3 = pd.read_csv('kang_remove_g/aux_3.csv')"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 63,
 36 |    "id": "8a1707e6-79f5-434c-b6de-d41a31658f88",
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "data": {
 41 |       "text/html": [
 42 |        "<div>\n",
 43 |        "<style scoped>\n",
 44 |        "    .dataframe tbody tr th:only-of-type {\n",
 45 |        "        vertical-align: middle;\n",
 46 |        "    }\n",
 47 |        "\n",
 48 |        "    .dataframe tbody tr th {\n",
 49 |        "        vertical-align: top;\n",
 50 |        "    }\n",
 51 |        "\n",
 52 |        "    .dataframe thead th {\n",
 53 |        "        text-align: right;\n",
 54 |        "    }\n",
 55 |        "</style>\n",
 56 |        "<table border=\"1\" class=\"dataframe\">\n",
 57 |        "  <thead>\n",
 58 |        "    <tr style=\"text-align: right;\">\n",
 59 |        "      <th></th>\n",
 60 |        "      <th>index</th>\n",
 61 |        "      <th>means</th>\n",
 62 |        "      <th>stds</th>\n",
 63 |        "    </tr>\n",
 64 |        "  </thead>\n",
 65 |        "  <tbody>\n",
 66 |        "    <tr>\n",
 67 |        "      <th>233</th>\n",
 68 |        "      <td>IL8</td>\n",
 69 |        "      <td>1.472220</td>\n",
 70 |        "      <td>1.072127</td>\n",
 71 |        "    </tr>\n",
 72 |        "    <tr>\n",
 73 |        "      <th>250</th>\n",
 74 |        "      <td>H2AFZ</td>\n",
 75 |        "      <td>0.660040</td>\n",
 76 |        "      <td>0.297694</td>\n",
 77 |        "    </tr>\n",
 78 |        "    <tr>\n",
 79 |        "      <th>707</th>\n",
 80 |        "      <td>SQRDL</td>\n",
 81 |        "      <td>0.541798</td>\n",
 82 |        "      <td>0.295089</td>\n",
 83 |        "    </tr>\n",
 84 |        "    <tr>\n",
 85 |        "      <th>347</th>\n",
 86 |        "      <td>PLA2G7</td>\n",
 87 |        "      <td>0.439908</td>\n",
 88 |        "      <td>0.348958</td>\n",
 89 |        "    </tr>\n",
 90 |        "    <tr>\n",
 91 |        "      <th>320</th>\n",
 92 |        "      <td>HIST1H2AC</td>\n",
 93 |        "      <td>0.385302</td>\n",
 94 |        "      <td>0.294543</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>...</th>\n",
 98 |        "      <td>...</td>\n",
 99 |        "      <td>...</td>\n",
100 |        "      <td>...</td>\n",
101 |        "    </tr>\n",
102 |        "    <tr>\n",
103 |        "      <th>342</th>\n",
104 |        "      <td>TREM2</td>\n",
105 |        "      <td>0.000012</td>\n",
106 |        "      <td>0.000478</td>\n",
107 |        "    </tr>\n",
108 |        "    <tr>\n",
109 |        "      <th>960</th>\n",
110 |        "      <td>RRM2</td>\n",
111 |        "      <td>0.000011</td>\n",
112 |        "      <td>0.000294</td>\n",
113 |        "    </tr>\n",
114 |        "    <tr>\n",
115 |        "      <th>797</th>\n",
116 |        "      <td>PTRF</td>\n",
117 |        "      <td>0.000010</td>\n",
118 |        "      <td>0.000238</td>\n",
119 |        "    </tr>\n",
120 |        "    <tr>\n",
121 |        "      <th>12</th>\n",
122 |        "      <td>ALDH4A1</td>\n",
123 |        "      <td>0.000009</td>\n",
124 |        "      <td>0.000227</td>\n",
125 |        "    </tr>\n",
126 |        "    <tr>\n",
127 |        "      <th>969</th>\n",
128 |        "      <td>ABCC2</td>\n",
129 |        "      <td>0.000005</td>\n",
130 |        "      <td>0.000174</td>\n",
131 |        "    </tr>\n",
132 |        "  </tbody>\n",
133 |        "</table>\n",
134 |        "<p>979 rows × 3 columns</p>\n",
135 |        "</div>"
136 |       ],
137 |       "text/plain": [
138 |        "         index     means      stds\n",
139 |        "233        IL8  1.472220  1.072127\n",
140 |        "250      H2AFZ  0.660040  0.297694\n",
141 |        "707      SQRDL  0.541798  0.295089\n",
142 |        "347     PLA2G7  0.439908  0.348958\n",
143 |        "320  HIST1H2AC  0.385302  0.294543\n",
144 |        "..         ...       ...       ...\n",
145 |        "342      TREM2  0.000012  0.000478\n",
146 |        "960       RRM2  0.000011  0.000294\n",
147 |        "797       PTRF  0.000010  0.000238\n",
148 |        "12     ALDH4A1  0.000009  0.000227\n",
149 |        "969      ABCC2  0.000005  0.000174\n",
150 |        "\n",
151 |        "[979 rows x 3 columns]"
152 |       ]
153 |      },
154 |      "execution_count": 63,
155 |      "metadata": {},
156 |      "output_type": "execute_result"
157 |     }
158 |    ],
159 |    "source": [
160 |     "aux_0.sort_values('means',ascending=False)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 64,
166 |    "id": "b85cd0f4-6b0c-4a1b-82e7-24e0b207a830",
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "combined_means = pd.DataFrame(index = aux_0.index)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 65,
176 |    "id": "c9a0bd21-db5e-4109-9fdd-c6c585f99f52",
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "combined_means[0] = aux_0['means']\n",
181 |     "combined_means[1] = aux_1['means']\n",
182 |     "combined_means[2] = aux_2['means']\n",
183 |     "combined_means[3] = aux_3['means']"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 66,
189 |    "id": "a69cab5f-5575-4543-8c05-e3a7dabfe983",
190 |    "metadata": {},
191 |    "outputs": [
192 |     {
193 |      "data": {
194 |       "text/html": [
195 |        "<div>\n",
196 |        "<style scoped>\n",
197 |        "    .dataframe tbody tr th:only-of-type {\n",
198 |        "        vertical-align: middle;\n",
199 |        "    }\n",
200 |        "\n",
201 |        "    .dataframe tbody tr th {\n",
202 |        "        vertical-align: top;\n",
203 |        "    }\n",
204 |        "\n",
205 |        "    .dataframe thead th {\n",
206 |        "        text-align: right;\n",
207 |        "    }\n",
208 |        "</style>\n",
209 |        "<table border=\"1\" class=\"dataframe\">\n",
210 |        "  <thead>\n",
211 |        "    <tr style=\"text-align: right;\">\n",
212 |        "      <th></th>\n",
213 |        "      <th>0</th>\n",
214 |        "      <th>1</th>\n",
215 |        "      <th>2</th>\n",
216 |        "      <th>3</th>\n",
217 |        "    </tr>\n",
218 |        "  </thead>\n",
219 |        "  <tbody>\n",
220 |        "    <tr>\n",
221 |        "      <th>0</th>\n",
222 |        "      <td>0.003762</td>\n",
223 |        "      <td>0.007630</td>\n",
224 |        "      <td>0.005264</td>\n",
225 |        "      <td>0.014630</td>\n",
226 |        "    </tr>\n",
227 |        "    <tr>\n",
228 |        "      <th>1</th>\n",
229 |        "      <td>0.001084</td>\n",
230 |        "      <td>0.002251</td>\n",
231 |        "      <td>0.002387</td>\n",
232 |        "      <td>0.000991</td>\n",
233 |        "    </tr>\n",
234 |        "    <tr>\n",
235 |        "      <th>2</th>\n",
236 |        "      <td>0.000217</td>\n",
237 |        "      <td>0.001149</td>\n",
238 |        "      <td>0.000799</td>\n",
239 |        "      <td>0.000178</td>\n",
240 |        "    </tr>\n",
241 |        "    <tr>\n",
242 |        "      <th>3</th>\n",
243 |        "      <td>0.000471</td>\n",
244 |        "      <td>0.000526</td>\n",
245 |        "      <td>0.001402</td>\n",
246 |        "      <td>0.000415</td>\n",
247 |        "    </tr>\n",
248 |        "    <tr>\n",
249 |        "      <th>4</th>\n",
250 |        "      <td>0.002461</td>\n",
251 |        "      <td>0.001258</td>\n",
252 |        "      <td>0.000957</td>\n",
253 |        "      <td>0.000706</td>\n",
254 |        "    </tr>\n",
255 |        "    <tr>\n",
256 |        "      <th>...</th>\n",
257 |        "      <td>...</td>\n",
258 |        "      <td>...</td>\n",
259 |        "      <td>...</td>\n",
260 |        "      <td>...</td>\n",
261 |        "    </tr>\n",
262 |        "    <tr>\n",
263 |        "      <th>974</th>\n",
264 |        "      <td>0.000141</td>\n",
265 |        "      <td>0.000167</td>\n",
266 |        "      <td>0.000026</td>\n",
267 |        "      <td>0.000326</td>\n",
268 |        "    </tr>\n",
269 |        "    <tr>\n",
270 |        "      <th>975</th>\n",
271 |        "      <td>0.000030</td>\n",
272 |        "      <td>0.000044</td>\n",
273 |        "      <td>0.000069</td>\n",
274 |        "      <td>0.000031</td>\n",
275 |        "    </tr>\n",
276 |        "    <tr>\n",
277 |        "      <th>976</th>\n",
278 |        "      <td>0.000287</td>\n",
279 |        "      <td>0.000068</td>\n",
280 |        "      <td>0.000032</td>\n",
281 |        "      <td>0.000397</td>\n",
282 |        "    </tr>\n",
283 |        "    <tr>\n",
284 |        "      <th>977</th>\n",
285 |        "      <td>0.000135</td>\n",
286 |        "      <td>0.000182</td>\n",
287 |        "      <td>0.000036</td>\n",
288 |        "      <td>0.000130</td>\n",
289 |        "    </tr>\n",
290 |        "    <tr>\n",
291 |        "      <th>978</th>\n",
292 |        "      <td>0.000183</td>\n",
293 |        "      <td>0.000211</td>\n",
294 |        "      <td>0.000104</td>\n",
295 |        "      <td>0.000023</td>\n",
296 |        "    </tr>\n",
297 |        "  </tbody>\n",
298 |        "</table>\n",
299 |        "<p>979 rows × 4 columns</p>\n",
300 |        "</div>"
301 |       ],
302 |       "text/plain": [
303 |        "            0         1         2         3\n",
304 |        "0    0.003762  0.007630  0.005264  0.014630\n",
305 |        "1    0.001084  0.002251  0.002387  0.000991\n",
306 |        "2    0.000217  0.001149  0.000799  0.000178\n",
307 |        "3    0.000471  0.000526  0.001402  0.000415\n",
308 |        "4    0.002461  0.001258  0.000957  0.000706\n",
309 |        "..        ...       ...       ...       ...\n",
310 |        "974  0.000141  0.000167  0.000026  0.000326\n",
311 |        "975  0.000030  0.000044  0.000069  0.000031\n",
312 |        "976  0.000287  0.000068  0.000032  0.000397\n",
313 |        "977  0.000135  0.000182  0.000036  0.000130\n",
314 |        "978  0.000183  0.000211  0.000104  0.000023\n",
315 |        "\n",
316 |        "[979 rows x 4 columns]"
317 |       ]
318 |      },
319 |      "execution_count": 66,
320 |      "metadata": {},
321 |      "output_type": "execute_result"
322 |     }
323 |    ],
324 |    "source": [
325 |     "combined_means"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 67,
331 |    "id": "f2040385-8056-4c9b-bdee-ee1ab665c26f",
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/html": [
337 |        "<div>\n",
338 |        "<style scoped>\n",
339 |        "    .dataframe tbody tr th:only-of-type {\n",
340 |        "        vertical-align: middle;\n",
341 |        "    }\n",
342 |        "\n",
343 |        "    .dataframe tbody tr th {\n",
344 |        "        vertical-align: top;\n",
345 |        "    }\n",
346 |        "\n",
347 |        "    .dataframe thead th {\n",
348 |        "        text-align: right;\n",
349 |        "    }\n",
350 |        "</style>\n",
351 |        "<table border=\"1\" class=\"dataframe\">\n",
352 |        "  <thead>\n",
353 |        "    <tr style=\"text-align: right;\">\n",
354 |        "      <th></th>\n",
355 |        "      <th>0</th>\n",
356 |        "      <th>1</th>\n",
357 |        "      <th>2</th>\n",
358 |        "      <th>3</th>\n",
359 |        "    </tr>\n",
360 |        "  </thead>\n",
361 |        "  <tbody>\n",
362 |        "    <tr>\n",
363 |        "      <th>0</th>\n",
364 |        "      <td>1.000000</td>\n",
365 |        "      <td>0.690757</td>\n",
366 |        "      <td>0.703157</td>\n",
367 |        "      <td>0.723252</td>\n",
368 |        "    </tr>\n",
369 |        "    <tr>\n",
370 |        "      <th>1</th>\n",
371 |        "      <td>0.690757</td>\n",
372 |        "      <td>1.000000</td>\n",
373 |        "      <td>0.698095</td>\n",
374 |        "      <td>0.634515</td>\n",
375 |        "    </tr>\n",
376 |        "    <tr>\n",
377 |        "      <th>2</th>\n",
378 |        "      <td>0.703157</td>\n",
379 |        "      <td>0.698095</td>\n",
380 |        "      <td>1.000000</td>\n",
381 |        "      <td>0.589254</td>\n",
382 |        "    </tr>\n",
383 |        "    <tr>\n",
384 |        "      <th>3</th>\n",
385 |        "      <td>0.723252</td>\n",
386 |        "      <td>0.634515</td>\n",
387 |        "      <td>0.589254</td>\n",
388 |        "      <td>1.000000</td>\n",
389 |        "    </tr>\n",
390 |        "  </tbody>\n",
391 |        "</table>\n",
392 |        "</div>"
393 |       ],
394 |       "text/plain": [
395 |        "          0         1         2         3\n",
396 |        "0  1.000000  0.690757  0.703157  0.723252\n",
397 |        "1  0.690757  1.000000  0.698095  0.634515\n",
398 |        "2  0.703157  0.698095  1.000000  0.589254\n",
399 |        "3  0.723252  0.634515  0.589254  1.000000"
400 |       ]
401 |      },
402 |      "execution_count": 67,
403 |      "metadata": {},
404 |      "output_type": "execute_result"
405 |     }
406 |    ],
407 |    "source": [
408 |     "combined_means.corr()"
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": 68,
414 |    "id": "7aa9b78a-9769-49f2-ab16-e30ef48a91a1",
415 |    "metadata": {},
416 |    "outputs": [],
417 |    "source": [
418 |     "corr_mat = combined_means.corr().abs()\n",
419 |     "mask = np.tril(np.ones_like(corr_mat, dtype=bool)) "
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 69,
425 |    "id": "8c5f0225-033b-4559-bb16-6500ed35ed8f",
426 |    "metadata": {},
427 |    "outputs": [
428 |     {
429 |      "data": {
430 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdAAAAFpCAYAAAAsmHm9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAp80lEQVR4nO3dd5gc1Znv8d87PVGjUUJplCMggQRIQiSTk5C9lu1rewETDNgC2+wa4yRfXweW595l7V17HTAwa4PFYsDYJmhtQAQjk4wiAkko51HOk0cT3vvHtETPqCdV18T6fp6nHnVX1ak6fRjm7fecU2fM3QUAAFonraMrAABAV0QABQAgAAIoAAABEEABAAiAAAoAQAAEUAAAAiCAAgC6BDN72Mz2mtnKRo6bmf3czDaY2ftmNiXh2AwzWxs/NieM+hBAAQBdxW8lzWji+DWSxse32ZIekCQzi0m6P358oqTrzGxiqpUhgAIAugR3f13SwSZOmSXpUa/zjqQ+ZpYvabqkDe6+yd2PSnoyfm5KCKAAgO5iqKTtCe8L4/sa25+S9FQv0AKsFdjGpv1gfkdXIRJqqms7ugrdXm0Nvy7a2nv3zbS2unbmWbem9B+wavkjt6uu6/WYAncvaMUlkn02b2J/StojgAIAIsDSYimVjwfL1gTMhgolDU94P0zSTkmZjexPCV24AIBQWFospS0E8yTdFJ+Ne66kI+6+S9JiSePNbLSZZUq6Nn5uSshAAQChCCkINn59syckXSKpv5kVSvqBpAxJcvcHJT0vaaakDZLKJN0SP1ZtZndKmi8pJulhd1+Van0IoACALsHdr2vmuEv6SiPHnlddgA0NARQAEIq2zkA7GwIoACAUFiOAAgDQamkRy0CZhQsAQABkoACAUDAGCgBAAARQAAACsLRojQoSQAEAoYhaBhqtrwsAAISEDBQAEIqoZaAEUABAKAigAAAEwEpEAAAEELUMlElEAAAEQAYKAAhF1DJQAigAIBRRW0yeAAoACEXUMlDGQAEACIAMFAAQiqhloARQAEAoCKAAAARAAAUAIICoBVAmEQEAEAAZKAAgFKyFCwBAAFHrwiWAAgBCQQAFACCAqAVQJhEBABAAGSgCGz0gV9+cOUGTh/VRcUWVnl22Q/+1YINqvfmyl04YqM9fOEZjB/ZURVWNPthRpG/9frkqqmqOn3PrRWP0qanD1Dc3U5v3leqXr6zTOxsPtOEn6jrGDOipb/3DBE0e3lclFVV6ZkmhHvrr+ibb/vbLxumOy8cnPfaL+Wv18Oub2qi2nd+YgT015+MTNXlEXxVXVOmZxdv14CtNt+cdV4zXl65I3p4/e3GtHl6wUWkm3XzRGF106kCNGdhTkrR6xxH94qV1WlV4pC0+SodKS7OOrkK7IoAikLzsdP3qpmnatK9UX3/iXQ3rl6O7rj5FaSY98NcNTZadNWWovjVzgh59a4t+/tJa5WVn6Owx/RRL+J/v8xeO1hcuHquHXtugtbuKNPOMIfrp9VN0228W6oOdRW398Tq1vOx0PXjr2dq0t0Rfe2yphp/UQ3dfc6rMpF+9sr7Rcs8sKdTb6/fX23fphIG65eKxenP9vraudqeVl5Ouh74wXZv2lOiuR+va8+sfPVVmpvtfWtdouacXbddba+u322WnDdKtl4zVW2v3SpKyMmK69ZKxem5JoX6zYKPcpWvPH6nf3nGubnrg71q9o3v9LBsBFGje/zp7uLIyYvrW799VaWWNFm6ScrPSNfuScXr0rc0qraxJWq53jwzdPeNU/fiFNXp2aeHx/QvW7D3+Oj1m+vxHxmjum5s1983NkqR3Nh7Q6AG5+uIlY/W1x99t2w/XyX1m+ghlZcT09cffVWlltRZuPKDcrHTdftl4zX1js0orq5OW21tUob1FFfX2ffHSsdq0t0TrdhW3R9U7pc+cM1LZGTHd/dgylVZW650NdT/Ld1wxXr/926ZWte
fsy8dp094SrY23Z2VVjWb+6DUVl394jYUb92veNy7WdeeN0vf/+H7bfbAOYBatAMoYKAI5f1x/vbNhf71AOX/lbmVnxjRlZL9Gy1152mBJ0p+X72j0nGF9e6hndroWbarfXbtw4wGdM7a/0mPR+p+0oQtOHqC/r99X7xf7/Pd3KSczpqmjG2/7hnrlZOjcsf01//1dbVHNLuMjpwzQ2+vqt+eL7+1UTmZM08a0rj3PG9dfL7638/i+Wle94ClJ1TWujXtK1LdnZuqVR4cigCKQUf1ztWV/ab19e45UqPxotUYNyG203OnDemvr/lLNmjJMf7n7Yr3z/Sv12y+eo8nD+xw/Jyu97seyqqa2XtmjNbXKTE/T0L49wvsgXdCoAbnavK9+2+8+1vb9G2/7hq44fbAy0tP04vs7mz+5GxvdVHs28bPc0JWT4u35XtNfSDJiaZo4tLc27SkJVN/OLC3NUtpawsxmmNlaM9tgZnOSHP+mmS2PbyvNrMbM+sWPbTGzFfFjS1L+vKleANHUKydDxRUndm0VlVcrLzuj0XIn9czSyP65uu2iMfrFy+v0tcffVfnRGv38hqnql1v3jbzwULlqa12nDe1dr+yx971zGr9+FOQ10fa9WtE2V0/K1wc7jmjbgbIwq9fl5OVkqLi86oT9ReVVrWrPGWfk64PCI9ra4ItlQ1+8bKx65aTr6cXbW13Xzs7SLKWt2eubxSTdL+kaSRMlXWdmExPPcfcfu/uZ7n6mpO9I+pu7H0w45dL48Wmpft5mA6iZnWpm3zazn5vZz+KvJ6R6Y3R9rhOnKNYNgTQ+dTHN6saX7n1ulV5csUt/37Bf33jyXdW667PnjJAklVZWa/7KXbrlwjGaOqqfeuVk6B/PGaFzxpwkSappyTTfbs49edsn+2+STP+8LE0d3U8vRrz79phkrWYyJWnmpOra8yS98F7T2fyFpwzQFy4dp/98YW2zgbYrausAKmm6pA3uvsndj0p6UtKsJs6/TtITIXy0pJoMoGb2bdVV0CQtkrQ4/vqJZKlzQrnZZrbEzJYUFBSEWV90EkXlVUkzzZ5Z6Umzo2OOxL/pL93y4RfC0soard5ZpDEDeh7f9x8vrNHmfSV66Jaz9dc5l+nG80fpN/HHLA6WVob1Mbqk4vIq5SXJjHpmpZ8w3taYK08fLJP00goCaHF5lfKyT5xP2TM7XcUVJ2amyVw1OV8mNTmefNqw3vrR9Wfpjwu36XdvbQlY284tzSylLTF2xLfZDW4xVFJi6l4Y33cCM+shaYakPyXsdkkvmdnSJNduteZm4d4m6TR3r/dTZGY/kbRK0n3JCrl7gaRjkZN0oRvasr/0hPG2Qb2y1SMrXVv2Nf7Nesu+UtXWuhp+1zSTahO+7h8uq9KX5i7RwF5Z6pmVrq0HynTduSO1v7hSuw5XKMq27CvV6IZt3zve9i3MamZMztfyrYe050i021KSNu8r1eiEL29SQns28bOcaMbkfL3bRHuO7J+rX35+mhZuPKD75q1Kuc7dVYPYkUyyNLWxGPMPkt5q0H17gbvvNLOBkl42szXu/nrA6jbbhVsraUiS/fnxY4iotzfs17lj+6tH5odLd115+mBVHK3Rsq0HGy33xrp9SkszTUuYLZqbla4J+b20fveJj1LsLarUpn2liqWZPn7WUM17t/CEc6LmrXX7dN74+m1/1aR8lR+t0dLNjbf9Mfl9cjR5RF+6b+PeXLtP559cvz2vnlzXnks2Nd+eQ/rm6IyRffXC8uTdt/3zsvTArWdr+4EyzXni3RYtNNJVtUMXbqGk4Qnvh0lqrN/8WjXovnX3nfF/90p6RnVdwoE1l4HeJelVM1uvD9PmEZLGSbozlRuja/vT4u269pyR+vG1Z2rum5s1tG8Pzb5krH739y31Hm155p8v1LKtB3Xvc3XfulfvLNKC1Xv0vVmn6xcvr9PhsqO66SOjVV3remrRtuPlZk7OV3osTTsOlWlw7xxdf95I1b
rrkTc2t/tn7Wz+sGibrj1/pP7jc1P029c3aWi/HrrjsnF67K36z4A+d/dFWrb5oO55ZmW98jMm56uqplavrNzd3lXvlP6wcKuuP3+kfnLjVD3yt40a1q+HvnTFeP33m/Xb83++cbGWbj6oH/5pRb3yTbVnVnqa7r/lbOXlZOhfn1ul8YN7HT9WVVOrNd1sUZB2WEhhsaTxZjZa0g7VBcnrT6iHWW9JF0u6IWFfrqQ0dy+Ov75K0r+kUpkmA6i7v2hmJ6suSg9VXfpcKGmxuyd/Uh6RUFxRrS/NXaxvzZygn1w/RSUV1Xr8na0qeK3+KkSxtLqxjUTfe3qFvnrVyfrajFOUnRHTe9sO647fLq43dmpmuvkjozW4d7ZKKqv1tzV7df8r61V+lB+74opq3fGbxfr2P0zUf944VcUVVfrd21v04Kv1VyFKb+TRgKsn52vRxgM6VHa0varcqRWXV2v2rxfpO7Mm6uc3T1NxeZUee3OzHmiwqlMs1kh7njFEizYc0KHSE9vzpLwsnTqkLmj+8paz6x3bcahMM/9tQXgfpBNo66X83L3azO6UNF9STNLD7r7KzO6IH38wfuonJb3k7ol98IMkPRNf7CFd0uPu/mIq9bFks/lC1o07LDqHaT+Y39FViISaakYt2lptDb8u2tp7981ssyh3xpznU/oP2JZ1awss5QcACIVFbGUBAigAIBRRWwuXAAoACAV/zgwAgACi9ufMItZjDQBAOMhAAQChiFoGSgAFAISi4TPf3R0BFAAQCjJQAAACiFoAZRIRAAABkIECAELBc6AAAATASkQAAAQQtbVwI/ZxAQAIBxkoACAUjIECABBA1B5jIYACAELBJCIAAAKIWhcuk4gAAAiADBQAEArGQAEACCBGAAUAoPUIoAAABBC1AMokIgAAAiADBQCEImoZKAEUABAKAigAAAGkRyyAMgYKAEAAZKAAgFDQhQsAQAAEUAAAAoilRWtUkAAKAAhF1DLQaH1dAAB0aWY2w8zWmtkGM5uT5PglZnbEzJbHt++3tGxrkYECAELR1hmomcUk3S/pSkmFkhab2Tx3/6DBqW+4+8cClm0xAigAIBTt0IU7XdIGd98kSWb2pKRZkloSBFMpmxQBtBtYcs/VHV2FSJj6vRc7ugrdXs8+2R1dBaQgZqkFUDObLWl2wq4Cdy9IeD9U0vaE94WSzklyqfPM7D1JOyV9w91XtaJsixFAAQChSDUDjQfLgiZOSXYDb/B+maSR7l5iZjMlPStpfAvLtgqTiAAAXUWhpOEJ74epLss8zt2L3L0k/vp5SRlm1r8lZVuLDBQAEIp2GANdLGm8mY2WtEPStZKuTzzBzAZL2uPubmbTVZcoHpB0uLmyrUUABQCEoq0Xk3f3ajO7U9J8STFJD7v7KjO7I378QUmflvQlM6uWVC7pWnd3SUnLplIfAigAIBTtsZBCvFv2+Qb7Hkx4/UtJv2xp2VQwBgoAQABkoACAUERtKT8CKAAgFARQAAACIIACABBA1AIok4gAAAiADBQAEIqoZaAEUABAKAigAAAEQAAFACCAqAVQJhEBABAAGSgAIBRRy0AJoACAUMSMAAoAQKulRSyAMgYKAEAAZKAAgFDEopWAEkABAOFIYxIRAACtxyQiAAACYBIRAABoFhkoACAUTCICACAAJhEBABBA1MZACaAAgFBErQuXSUQAAARABgoACAVduAAABMCfMwMAIAAyUACd3ugBufrWRydq8vA+Kq6o0rNLC1Xw2gbVeuNlZl86TrdfNi7psV++vE6PvL6pjWrb+Y06qYfuuvxknTakl0oqq/Xn93fpkbc3N9mex1w0vr9uOGekxvTPVUV1rdbsLtL/eW6lKqpqJUm3XjBKF40foMG9smUmbTtYpicWbddf1+5t40+FtkYABbqYvOx0PfD5s7VpX4nufnyZhvXroa/NOEVmpgdeXd9ouWeXbtfb6/fV23fphEH6/EVj9Na6fY2U6v56ZqXrp5
89U1sOlOo7z6zQ0D45+sol42Qm/frNzU2W/dikfN11xXg9sWi7fvW3jcrLTtfUEX3rrQmbm5muF1bu1pYDpap11yUnD9Q9Hz9Ntc+5FnSzdo/aLFwCKNDFfHr6CGVlxPTNJ95VaWWNFm48oNysdN1+6Tg9+uYmlVbWJC23t6hSe4sq6+374iVjtXlfidbtLm6PqndKnzhziLLS0/TdZ1eq7GiNlmw9pNysdN1y/ig9vmibyo4mb8/eORn6p8vG6Wevrtf/vL/r+P431u+vd94vXttQ7/3iLYc0un+urj5tcLcLoFHrwuUxFqCLOX98f/19w/56gfKlFbuUnRnTlFH9WnydXjkZOmdsf81P+OUfReeMPkmLNh+sFyhfWb1H2RkxnTm8T6PlLj1lgCTphZW7W33PI+VVyuiG6VoszVLauhoCKNDFjOqfqy37Suvt232kQuVHqzWqf26Lr3P5aYOUkZ6m+SuiHUBH9uuhrQfL6u3bW1yp8qM1GtmvR6PlJub30raDZfrY5Hz96Y7z9NrdF+uhz03V6UN6JT0/ZqaeWem6csIgnT2qr55bvjPUz9EZpFlqW0uY2QwzW2tmG8xsTpLjnzOz9+Pb22Z2RsKxLWa2wsyWm9mSVD8vXbhAF9MrJ0PFFVUn7C8qr1avnIwWX+fqSflaveOIth0oa/7kbiwvO10lldUn7C+urFJeduPteVJulkb066Gbzh2lB/62UUfKq3T99BH690+foet+/Y4OlX3432hifi89dMNUSVJ1Ta1++up6vbFhf2OXRiPMLCbpfklXSiqUtNjM5rn7BwmnbZZ0sbsfMrNrJBVIOifh+KXuHkrjE0CBrijJ7FCzpLuT6t8zS1NG9dMvXlobarW6Kk/WnjJ5sgPHjpvUIzNd33tulRZtOShJWrnziP54+3n61FnD9Ju3PpyAtGl/ib7w6BL1zE7X+WNO0tcuH6/Symq9uqZ7zcRthz+oPV3SBnffJElm9qSkWZKOB1B3fzvh/HckDWurygTuwjWzW5o4NtvMlpjZkoKCgqC3AJBEUXmVeuac+N23Z1a6istPzEyTufL0wTJJL61o/fhdd1NcUa2e2Se2Z25WLGlmekxRvBdg+fbDx/eVHa3R2j3FGtW/ftdvRVWt1u4p1tKth/SL1zZo/gd79KWLx4bzATqRNLOUtsTYEd9mN7jFUEnbE94Xxvc15jZJLyS8d0kvmdnSJNdutVQy0HskPZLsgLsXqC5tllr+pRhAC2zZX6pR/XvW2zeoV7Z6ZKVry/7SRkrVd9WkfC3fdkh7iiraoopdytaDZSeMdQ7My1KPzPQTxkbrlTtQplp3qUHSVZe5Nn3PdXuK9dFJ+YqlmWpa8rBpFxFLcVZNg9iRTLIUN2kDmtmlqgugH0nYfYG77zSzgZJeNrM17v560Po2+XETBmIbbiskDQp6UwDBvb1+v84bd5J6ZMaO77tq0mBVHK3RsnhXYlPy++Ro8og+kZ99e8zCzQc0fVQ/5WR82J6XnTpQFVU19bLLht7eeEBpZpqSMFM3NzOmkwf11Ia9JU3ec9LQ3tpTVNGtgqeUegbaAoWShie8HybphNlYZjZZ0q8lzXL3A8f2u/vO+L97JT2jui7hwJrLQAdJulrSoYb1k/T2iacDaGt/XLRN1547Qj++7izNfWOzhvbL0exLx+mxt7fUe7Tl2bsu1NIth3Tvsyvrlb960mBV19TqlVV030rSs8t36tNThun/fuJ0/W7RNg3pna1bzh+l3y/ZXu/Rlie+cI6Wbz+sf5tfN268dk+x3li/T3NmnKoHX990fBJRTa3r6Xd3SJIG9crSd2ZM0Cur92jnkXLlZMR00fgBumLCIP07489BLJY03sxGS9oh6VpJ1yeeYGYjJD0t6UZ3X5ewP1dSmrsXx19fJelfUqlMcwH0z5J6uvvyhgfMbEEqNwYQTHFFte54ZLG+/bGJ+ukNU1RSUaXH/75FD/21/gP7sbS0pJM6rpqUr0WbDu
hwWcvGS7u7kspq3fXUct11+cn6t09OUklltZ5aUqhH3q6/ClGyZxXv/ctqffmSsbrz0nHKTk/Tip1H9NXfLz8+dlpSUa0DpZW66byR6pebqZLKam3ZX6Zv/vE9vbO5+d6CrqatJxG5e7WZ3SlpvqSYpIfdfZWZ3RE//qCk70s6SdKvrK4+1e4+TXUJ4TPxfemSHnf3F1OpjzU1yywk3auPApE19Xsp/b+GFujRK6ujq9DtvfHNS9ssyi3dfjil3/dTh/fpUqsp8BgLACAUqU4i6moi9nEBAAgHGSgAIBRRW0yeAAoACEXE4icBFAAQjrSk6xx0XwRQAEAoopaBMokIAIAAyEABAKHogn8TOyUEUABAKKLWhUsABQCEImqTiBgDBQAgADJQAEAo6MIFACAAJhEBABBAxOInARQAEI6orYXLJCIAAAIgAwUAhCJiCSgBFAAQjqh1aRJAAQChsIiloARQAEAoovYYS9QybgAAQkEGCgAIRcR6cAmgAIBwRK1LkwAKAAhF1CYRRe0LAwAAoSADBQCEImqzcAmgAIBQRCx+EkABAOEgAwUAIAAmEQEAgGaRgQIAQhG1LlwyUABAKCzFrUX3MJthZmvNbIOZzUly3Mzs5/Hj75vZlJaWbS0CKAAgFGlmKW3NMbOYpPslXSNpoqTrzGxig9OukTQ+vs2W9EAryrbu86ZSGACAY8xS21pguqQN7r7J3Y9KelLSrAbnzJL0qNd5R1IfM8tvYdlWIYACALqKoZK2J7wvjO9ryTktKdsqTCICWmjpvTM6ugrd3tW/equjq4AUmHtq5c1mq67b9ZgCdy9IPCVJsYY3beyclpRtFQIoACAcXpta8bpgWdDEKYWShie8HyZpZwvPyWxB2VahCxcAEArz2pS2FlgsabyZjTazTEnXSprX4Jx5km6Kz8Y9V9IRd9/VwrKtQgYKAOgS3L3azO6UNF9STNLD7r7KzO6IH39Q0vOSZkraIKlM0i1NlU2lPgRQAEA4UuzCbdEt3J9XXZBM3PdgwmuX9JWWlk0FARQAEI4UJxF1NQRQAEA42iED7UwIoACAULRwIlC3wSxcAAACIAMFAIQjYhkoARQAEA4CKAAAARBAAQAIoDZaAZRJRAAABEAGCgAIRdQeYyGAAgDCQQAFACCAiC3lxxgoAAABkIECAMJBFy4AAK3HJCIAAIIggAIAEEDEAiiTiAAACIAMFAAQjohloARQAEAomEQEAEAQEVtMngAKAAgHKxEBAIDmkIECAMLBGCgAAK3HJCIAAIKIWABlDBQAgADIQAEA4YhYBkoABQCEo7amo2vQrgigACJvRN8cffnCMZowKE+lR2v04gd79NiSbapt4rHGQXlZevTGaSfsX7B+n/715XX19l03dZhmThysPjkZ2nqoTI+8s1VLtx8O+VN0PGchBQCIjp5ZMd338dO17WCZ7nlhtfJ752j2+aNkJs1dtK3Z8gVvbdaq3UXH3xeVV9c7/o9Thur6acP134u2aeP+Ul128gDdM3OC7n5mhdbtLQn983QoMlAAiI6PnjZYmbE0/cuLa1RWVSMVHlGPjJhuOHu4/vDujrp9TSg8XK41e5IHwvQ00z9OGaY/LNuhp97dIUlauv2wRvbtoRumDdf3n18d+udB+2EWLoBIO3tEXy3dfqheoFywYZ+yM2KaNKRXStfO752t3Mx0LSs8XG//ssLDOmt4H6WnWUrX73Rqa1LbUmBm/czsZTNbH/+3b5JzhpvZa2a22sxWmdlXE4790Mx2mNny+DazuXuSgQKItOF9crR8x5F6+/aVHFVFVY2G9+2hhVsPNVn+7svGKy8rXYfLq7Rg/T79duE2Ha2pGwvMjNXlKNUNxgaramqVGUtTfq9sbT9cHuKn6Vhe06FduHMkveru95nZnPj7bzc4p1rS1919mZnlSVpqZi+7+wfx4z91939v6Q2bDaBmdqqkoZIWuntJwv4Z7v5iS28EAJ1Rz6x0lVZWn7C/uLJaeV
mxRstV1dRq3opdWrr9sMqOVmvy0N767FlDNaR3tn74whpJ0q6iCtW66+SBefW6eU8ZmCdJysvuZjlMx04imiXpkvjruZIWqEEAdfddknbFXxeb2WrVxbcPFECTXbhm9s+SnpP0T5JWmtmshMP/r4lys81siZktKSgoCFIvAGg3ySbbWiP7jzlYVqX739ikd7Yc1Ps7i/TY4u0qeGuLzht9ksaclCtJKjtaowXr9+u6qcN0xpDeystK18cn5eusYb0lSTVNTfPtilLswk2MHfFtdivuPigeII8FyoFNnWxmoySdJWlhwu47zex9M3s4WRdwQ819/fmipKnuXhK/2R/NbJS7/0x1P19JuXuBpGORs5v9hADoTkoqq5WbeeKvwtysdJVUtq5L8o2N+/VPF4/VuAG52nSgVJL04Jub9L+vOkU/+sTpkqS9xZV6fGmhbpo+QofLq1L/AN1Ig9hxAjN7RdLgJIe+25r7mFlPSX+SdJe7H5tC/YCke1UXs+6V9B+Sbm3qOs0F0Nixblt332Jml6guiI5UEwEUALqK7YfLNbxvTr19A3pmKicjpu2Hylp1rWTZwpGKan173ir1z81UbmZM2w+X61NnDNGB0qPaU1yZQs07H2/jx1jc/YrGjpnZHjPLd/ddZpYvaW8j52WoLnj+zt2fTrj2noRz/kvSn5urT3OzcHeb2ZkJNyiR9DFJ/SVNau7iANDZLd52SNOG91FOxofjnReP66+Kqhqt2FnURMkTXTi2vyRp/b4TH2vZX3pUWw+VK5ZmuurUQXppzZ4TzunyamtT21IzT9LN8dc3q274sR4zM0m/kbTa3X/S4Fh+wttPSlrZ3A2by0BvUt2spePcvVrSTWb2UHMXB4DO7i+rdmvWpCH6/oxT9dS7hRrcK1s3nD1CT7+3s96jLY98bore31mkn762QZJ0w9nD1SMjplW7i1R2tEaT8nvr02cN0Zsb92vzgQ8z18tPHqBYmml3UYUG5mXpk5OHqNZdTy4tbPfP2tbaOgNtxn2SnjKz2yRtk/QZSTKzIZJ+7e4zJV0g6UZJK8xsebzc/3b35yX9KJ4wuqQtkm5v7oZNBlB3b/S/sLu/1dzFAaCzK6ms0Zx5K/WVC8fonpkTVFJZo6ff26nHFtdfhSjNTImPbW4/VK5PnzlUMyYMUmZ6mvaVVOqP7+7UE0u31ytnJn32rGEalJel0qPVenvzQT3yzlZVVEdr2bu25u4HJF2eZP9OSTPjr99UI8OP7n5ja+9p7m0+x4dJRABa5Opf8b28rc3/8gVtNn/l6FtPpfT7PvOCz3apuTXd7CEkAECHYTF5AABar4NXImp3BFAAQDgi9tdYWEweAIAAyEABAOGIWAZKAAUAhMKZRAQAQAARy0AZAwUAIAAyUABAOCKWgRJAAQChYAwUAIAgyEABAAggYgGUSUQAAARABgoACAVr4QIAEASTiAAACCBiY6AEUABAKDxiAZRJRAAABEAGCgAIBQspAAAQgNcQQAEAaLWoBVDGQAEACIAMFAAQCsZAAQAIIGpduARQAEAoCKAAAARQG7G1cJlEBABAAGSgAIBQMIkIAIAAGAMFACAAAigAAAFErQuXSUQAgC7PzPqZ2ctmtj7+b99GzttiZivMbLmZLWlt+UQEUABAKGpralPaUjRH0qvuPl7Sq/H3jbnU3c9092kBy0sigAIAQuI1tSltKZolaW789VxJn2jr8oyBAug05n/5go6uAlKQahA0s9mSZifsKnD3ghYWH+TuuyTJ3XeZ2cBGznNJL5mZS3oo4fotLX8cARQA0CnEg1mjAdPMXpE0OMmh77biNhe4+854gHzZzNa4++utrKokAigAICRtPQvX3a9o7JiZ7TGz/Hj2mC9pbyPX2Bn/d6+ZPSNpuqTXJbWofCLGQAEAoejgMdB5km6Ov75Z0nMNTzCzXDPLO/Za0lWSVra0fENkoACAUHTwQgr3SXrKzG6TtE3SZyTJzIZI+rW7z5Q0SN
IzZibVxb/H3f3Fpso3hQAKAAhFbQcupODuByRdnmT/Tkkz4683STqjNeWbQhcuAAABkIECAELBWrgAAATgEfuD2gRQAEAooraYPAEUABCKqHXhMokIAIAAyEABAKGIWgZKAAUAhCKEP0nWpRBAAQChiNokIsZAAQAIgAwUABAKxkABAAjAa7yjq9CuCKAAgFAwiQgAgAC8NloZKJOIAAAIgAwUABCKWsZAAQBoPWbhAgAQALNwAQAIIGpduEwiAgAgADJQAEAoGAMFACCA2og9B0oABQCEImqTiBgDBQAgADJQAEAoWAsXAIAAotaFSwAFAISCAAoAQABR68JlEhEAAAGQgQIAQhG1vwdKAAUAhCJqa+ESQAEAoYjaUn6MgQIAQuE1ntKWCjPrZ2Yvm9n6+L99k5xzipktT9iKzOyu+LEfmtmOhGMzm7snARQA0B3MkfSqu4+X9Gr8fT3uvtbdz3T3MyVNlVQm6ZmEU3567Li7P9/cDZvtwjWz6XX39cVmNlHSDElrWnJxAEB0dPAY6CxJl8Rfz5W0QNK3mzj/ckkb3X1r0Bs2mYGa2Q8k/VzSA2b2r5J+KamnpDlm9t0mys02syVmtqSgoCBo3QAAXYjX1qa0JcaO+Da7Fbcf5O67JCn+78Bmzr9W0hMN9t1pZu+b2cPJuoAbMvfGvzGY2QpJZ0rKkrRb0jB3LzKzHEkL3X1yczeQFK1pWQDQuVlbXXj+yVNS+n1/9bplTdbNzF6RNDjJoe9KmuvufRLOPeTuSYOgmWVK2inpNHffE983SNJ+1cWseyXlu/utTdWnuS7canevkVRmZhvdvUiS3L3czKI13QoA0KHc/YrGjpnZHjPLd/ddZpYvaW8Tl7pG0rJjwTN+7eOvzey/JP25ufo0N4noqJn1iL+emnDx3pIIoACA4zpyFq6keZJujr++WdJzTZx7nRp038aD7jGflLSyuRs2l4Fe5O6VkuTuiQEzI6GiAAB09HOg90l6ysxuk7RN0mckycyGSPq1u8+Mv+8h6UpJtzco/yMzO1N1Xbhbkhw/QZNjoCFhDBQAOo82GwP9y/DJKf2+/+j299usbm2BlYgAAKGI2p8zYyEFAAACIAMFAISitu2HBDsVAigAIBQ1BFAAAFovYkOgBFAAQDiiloEyiQgAgADIQAEAoaALFwCAAKLWhUsABQCEImoZKGOgAAAEQAYKAAgFXbgAAAQQtS5cAigAIBQEUAAAAohaFy6TiAAACIAMFAAQCrpwAQAIIGpduARQAEAoopaBMgYKAEAAZKAAgFDQhQsAQABR68IlgAIAQkEGCgBAALUdXYF2xiQiAAACIAMFAISCLlwAAAJgEhEAAAGQgQIAEEDUMlAmEQEAEAAZKAAgFFHrwiUDBQCEosZT21JhZp8xs1VmVmtm05o4b4aZrTWzDWY2J2F/PzN72czWx//t29w9CaAAgFDUuKe0pWilpE9Jer2xE8wsJul+SddImijpOjObGD88R9Kr7j5e0qvx900igAIAujx3X+3ua5s5bbqkDe6+yd2PSnpS0qz4sVmS5sZfz5X0iebuyRgoACAUXWAW7lBJ2xPeF0o6J/56kLvvkiR332VmA5u7WHsEUGuHe4TKzGa7e0FH16M7o43bHm3cPmjnDz3oW1L6fW9msyXNTthVkNi2ZvaKpMFJin7X3Z9ryS2S7Asc9slAk5stif8h2hZt3PZo4/ZBO4ckHiwbbUt3vyLFWxRKGp7wfpiknfHXe8wsP5595kva29zFGAMFAETFYknjzWy0mWVKulbSvPixeZJujr++WVKzGS0BFADQ5ZnZJ82sUNJ5kv5iZvPj+4eY2fOS5O7Vku6UNF/SaklPufuq+CXuk3Slma2XdGX8fdP39Ig9+NoSjGm0Pdq47dHG7YN2ji4CKAAAAdCFCwBAAATQBI0t8YTwmNnDZrbXzFZ2dF26KzMbbmavmdnq+NJmX+3oOnU3ZpZtZovM7L14G9/T0XVC+6MLNy
6+xNM61Q0eF6puttZ17v5Bh1asmzGziySVSHrU3U/v6Pp0R/Ep+PnuvszM8iQtlfQJfpbDY2YmKdfdS8wsQ9Kbkr7q7u90cNXQjshAP9TUEk8Iibu/LulgR9ejO3P3Xe6+LP66WHWzDYd2bK26F69TEn+bEd/IRiKGAPqhZEs88UsHXZqZjZJ0lqSFHVyVbsfMYma2XHUP3L/s7rRxxBBAPxTqEk9ARzOznpL+JOkudy/q6Pp0N+5e4+5nqm41m+lmxpBExBBAP9TUEk9AlxIfl/uTpN+5+9MdXZ/uzN0PS1ogaUbH1gTtjQD6oaaWeAK6jPgEl99IWu3uP+no+nRHZjbAzPrEX+dIukLSmg6tFNodATSumSWeEBIze0LS3yWdYmaFZnZbR9epG7pA0o2SLjOz5fFtZkdXqpvJl/Samb2vui/fL7v7nzu4TmhnPMYCAEAAZKAAAARAAAUAIAACKAAAARBAAQAIgAAKAEAABFAAAAIggAIAEAABFACAAP4/bXVxjNw17FEAAAAASUVORK5CYII=\n",
431 |       "text/plain": [
432 |        "<Figure size 576x432 with 2 Axes>"
433 |       ]
434 |      },
435 |      "metadata": {
436 |       "needs_background": "light"
437 |      },
438 |      "output_type": "display_data"
439 |     }
440 |    ],
441 |    "source": [
442 |     "plt.figure(figsize=(8,6))\n",
443 |     "sns.heatmap(combined_means.corr(),cmap='RdBu',\n",
444 |     "           mask=mask,vmax=1,vmin=-1,annot=True,annot_kws={\"size\": 15})\n",
445 |     "plt.savefig('remove_g_corr.pdf')\n",
446 |     "plt.show()"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": null,
452 |    "id": "801e7e7d-55b0-488f-a4c9-6c81d8b6ef73",
453 |    "metadata": {},
454 |    "outputs": [],
455 |    "source": [
456 |     "# no statistically significant enrichments found, so no plots "
457 |    ]
458 |   }
459 |  ],
460 |  "metadata": {
461 |   "kernelspec": {
462 |    "display_name": "plot",
463 |    "language": "python",
464 |    "name": "plot"
465 |   },
466 |   "language_info": {
467 |    "codemirror_mode": {
468 |     "name": "ipython",
469 |     "version": 3
470 |    },
471 |    "file_extension": ".py",
472 |    "mimetype": "text/x-python",
473 |    "name": "python",
474 |    "nbconvert_exporter": "python",
475 |    "pygments_lexer": "ipython3",
476 |    "version": "3.9.10"
477 |   }
478 |  },
479 |  "nbformat": 4,
480 |  "nbformat_minor": 5
481 | }
482 | 


--------------------------------------------------------------------------------