├── CeLEry_package ├── CeLEry │ ├── ClusterVAE.py │ ├── DNN.py │ ├── TrainerExe.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── ClusterVAE.cpython-36.pyc │ │ ├── ClusterVAE.cpython-38.pyc │ │ ├── DNN.cpython-36.pyc │ │ ├── DNN.cpython-38.pyc │ │ ├── SpaCluster.cpython-36.pyc │ │ ├── TrainerExe.cpython-38.pyc │ │ ├── VanillaVAE.cpython-36.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── autoencoder.cpython-36.pyc │ │ ├── data_augmentation.cpython-38.pyc │ │ ├── datasetgenemap.cpython-36.pyc │ │ ├── datasetgenemap.cpython-38.pyc │ │ ├── fit_functions.cpython-38.pyc │ │ ├── type_.cpython-36.pyc │ │ ├── types_.cpython-36.pyc │ │ ├── types_.cpython-38.pyc │ │ ├── util.cpython-36.pyc │ │ ├── util.cpython-38.pyc │ │ ├── util_Mouse.cpython-36.pyc │ │ └── util_Mouse.cpython-38.pyc │ ├── data_augmentation.py │ ├── datasetgenemap.py │ ├── fit_functions.py │ ├── layers.py │ ├── types_.py │ ├── util.py │ └── util_Mouse.py ├── CeLEryPy.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── LICENSE ├── README.md ├── dist │ ├── CeLEryPy-1.2.1-py3-none-any.whl │ └── CeLEryPy-1.2.1.tar.gz ├── pyproject.toml └── setup.py ├── LICENSE ├── README.md ├── code_paper ├── 1_LIBD │ ├── CeLEry_train_Scenario1and2.py │ ├── CeLEry_train_Scenario3and4.py │ ├── LIBDVisual.R │ ├── LIBDacc.R │ ├── README.md │ ├── prediction.py │ └── preprocess.py ├── 2_Alzheimer │ ├── CeLEry_train.py │ ├── README.md │ ├── preprocess.py │ └── test.py ├── 3_Mouse_10x_Visium │ ├── Mouse_CeLEry.ipynb │ ├── Mouse_Tangram.ipynb │ ├── Mouse_novosparc.ipynb │ └── Mouse_spotOTsc.ipynb ├── 4_Mouse_brain_MERFISH │ ├── CeLEry_brain.ipynb │ ├── CeLEry_figure 6_scenario 2.ipynb │ ├── CeLEry_figure 6_scenario 3.ipynb │ ├── README.md │ ├── SpaOTsc_brain.ipynb │ ├── Tangram_brain.ipynb │ ├── brain_result.ipynb │ └── novoSpaRc_brain.ipynb ├── 5_liver_MERFISH │ ├── CeLEry_liver.ipynb │ ├── README.md │ ├── SpaOTsc_liver.ipynb │ ├── Tangram_liver.ipynb │ ├── liver_result.ipynb │ └── novoSpaRc_liver.ipynb ├── 6_breast_cancer_10x_Xenium │ ├── 2D_locationRecovery │ │ ├── Xenium_BreastCancer_CELERY_Rep1_Scheme4_2DRecovery.ipynb │ │ ├── Xenium_BreastCancer_Tangram_Rep1_Scheme4_2DRecovery.ipynb │ │ ├── Xenium_BreastCancer_novosparc_Rep1_Scheme4_2DRecovery.ipynb │ │ └── Xenium_BreastCancer_spaOTsc_Rep1_Scheme4_2DRecovery.ipynb │ └── Domain_prediction │ │ ├── Xenium_BreastCancer_CELEREY_Scheme2_domainPred.ipynb │ │ ├── Xenium_BreastCancer_novosparc_Rep1_Scheme2_domainPred.ipynb │ │ ├── Xenium_BreastCancer_spaOTsc_Rep1_Scheme2_domainPred.ipynb │ │ └── Xenium_BreastCancer_tangram_Scheme2_domainPred.ipynb ├── 7_data_augmentation │ └── CeLEry-data-agumentation.ipynb └── 8_mouse_single_cell_prediction │ ├── Mouse_sc_analysis.py │ ├── Mouse_sc_analysis_spaOTsc_novosparc.ipynb │ ├── analysis-results.py │ └── preprocessing.py ├── docs └── asserts │ └── images │ └── workflow.png ├── pretrainmodel └── Biogen │ ├── Pretrained_model_075B.obj │ ├── Pretrained_model_075B_probmat.csv │ └── Reference_genes_8_075B.obj └── tutorial ├── BiogenPretrain.ipynb ├── BiogenPretrain.md ├── data ├── AlzheimerToy.h5ad ├── DataLayerToy.h5ad ├── Mouse2D │ └── MP1_SVG.py ├── MousePosteriorToy.h5ad └── MouseSCToy.h5ad ├── figures ├── Density_plot_BiogenExample.png └── segementation_8_075B.png ├── tutorial.ipynb └── tutorial.md /CeLEry_package/CeLEry/ClusterVAE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from models import BaseVAE 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from . types_ import * 6 | 7 | 8 | class ClusterVAE(nn.Module): 9 | 10 | def __init__(self, 11 | # in_channels: int, 12 | latent_dim: int, 13 | total_cluster: int, 14 | hidden: List = None, 15 | fgx = 2, fgy = 2, 16 | **kwargs) -> None: 17 | super(ClusterVAE, self).__init__() 18 | 19 | self.latent_dim = latent_dim 20 | self.total_cluster = total_cluster 21 | 22 | scanx = fgx % 4 + 3 23 | scany = fgy % 4 + 3 24 | 25 | if hidden is None: 26 | hidden = [16, 8, 4, 8, 8] 27 | 28 | self.hidden = hidden 29 | # encoder 30 | self.encoderl1 = nn.Sequential( # like the Composition layer you built 31 | nn.Conv2d(1, hidden[0], [scanx,scany]), # 76, 116 178 208 # 80, 86 32 | nn.ReLU()) 33 | # self.encoderl2 = nn.Sequential(nn.MaxPool2d(2, stride=2)) #38, 58 # 78, 82 34 | self.encoderl3 = nn.Sequential( 35 | nn.Conv2d(hidden[0], hidden[1], 4, stride=2), 36 | nn.ReLU()) # 18, 28 # 38, 40 37 | self.encoderl4 = nn.Sequential( 38 | nn.Conv2d(hidden[1], hidden[2], 4, stride=2), #18, 19 39 | nn.ReLU()) # 15, 25 40 | # decoder 41 | self.decoderl4 = nn.Sequential( 42 | nn.ConvTranspose2d(hidden[2], hidden[3], 4, stride=2), 43 | nn.ReLU()) # 35, 54 44 | self.decoderl3 = nn.Sequential( 45 | nn.ConvTranspose2d(hidden[3], hidden[4], 4, stride=2), 46 | nn.ReLU()) # 38,57 47 | # self.decoderl2 = nn.Sequential( 48 | # nn.ConvTranspose2d(16, 8, 2, stride=2), 49 | # nn.ReLU()) #76, 114 50 | self.decoderl1 = nn.Sequential( 51 | nn.ConvTranspose2d(hidden[4], 1, [scanx,scany]) #, 52 | #nn.ReLU() 53 | #nn.Sigmoid() 54 | ) 55 | 56 | self.enbedimx = int(((fgx - scanx + 1)/2-1)/2 -1) 57 | self.enbedimy = int(((fgy - scany + 1)/2-1)/2 -1) 58 | node_int = int(self.enbedimx * self.enbedimy * hidden[2]) 59 | self.fc_mu = nn.Linear(node_int, latent_dim) 60 | self.fc_var = nn.Linear(node_int, latent_dim) 61 | self.decoder_input = nn.Linear(self.latent_dim + self.total_cluster + 1, node_int) 62 | 63 | 64 | if 'KLDw' in kwargs: 65 | self.kld_weight = kwargs['KLDw'] 66 | else: 67 | self.kld_weight = 1 68 | 69 | self.seed = 0 70 | 71 | def encode(self, input: Tensor) -> List[Tensor]: 72 | """ 73 | Encodes the input by passing through the encoder network 74 | and returns the latent codes. 75 | :param input: (Tensor) Input tensor to encoder [N x C x H x W] 76 | :return: (Tensor) List of latent codes 77 | """ 78 | result = self.encoderl1(input) 79 | # result = self.encoderl2(result) 80 | result = self.encoderl3(result) 81 | result = self.encoderl4(result) 82 | result = torch.flatten(result, start_dim=1) 83 | 84 | # Split the result into mu and var components 85 | # of the latent Gaussian distribution 86 | mu = self.fc_mu(result) 87 | log_var = self.fc_var(result) 88 | 89 | return [mu, log_var] 90 | 91 | def decode(self, z: Tensor) -> Tensor: 92 | """ 93 | Maps the given latent codes 94 | onto the image space. 95 | :param z: (Tensor) [B x D] 96 | :return: (Tensor) [B x C x H x W] 97 | """ 98 | result = self.decoder_input(z) 99 | result = result.view(-1, self.hidden[2], self.enbedimx, self.enbedimy) 100 | result = self.decoderl4(result) 101 | result = self.decoderl3(result) 102 | # result = self.decoderl2(result) 103 | result = self.decoderl1(result) 104 | return result 105 | 106 | def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor: 107 | """ 108 | Reparameterization trick to sample from N(mu, var) from 109 | N(0,1). 110 | :param mu: (Tensor) Mean of the latent Gaussian [B x D] 111 | :param logvar: (Tensor) Standard deviation of the latent Gaussian [B x D] 112 | :return: (Tensor) [B x D] 113 | """ 114 | std = torch.exp(0.5 * logvar) 115 | torch.manual_seed(self.seed) 116 | eps = torch.randn_like(std) 117 | return eps * std + mu 118 | 119 | def forward(self, input: Tensor, **kwargs) -> List[Tensor]: 120 | mu, log_var = self.encode(input[0]) 121 | z = self.reparameterize(mu, log_var) 122 | zplus = torch.cat((z, input[1]), dim = 1) 123 | return [self.decode(zplus), input, mu, log_var] 124 | 125 | def loss_function(self, 126 | *args, 127 | **kwargs) -> dict: 128 | """ 129 | Computes the VAE loss function. 130 | KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2} 131 | :param args: 132 | :param kwargs: 133 | :return: 134 | """ 135 | recons = args[0] 136 | input = args[1] 137 | mu = args[2] 138 | log_var = args[3] 139 | 140 | kld_weight = self.kld_weight # Account for the minibatch samples from the dataset 141 | 142 | 143 | recons_loss = F.mse_loss(recons, input[0]) 144 | 145 | 146 | kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0) 147 | 148 | loss = recons_loss + kld_weight * kld_loss 149 | return {'loss': loss, 'Reconstruction_Loss':recons_loss, 'KLD':-kld_loss} 150 | 151 | 152 | 153 | class ClusterVAEmask(ClusterVAE): 154 | def __init__(self, 155 | # in_channels: int, 156 | latent_dim: int, 157 | total_cluster: int, 158 | hidden: List = None, 159 | fgx = 2, fgy = 2, 160 | **kwargs) -> None: 161 | super(ClusterVAEmask, self).__init__(latent_dim, total_cluster, hidden, fgx, fgy, **kwargs) 162 | 163 | def forward(self, input: Tensor, **kwargs) -> List[Tensor]: 164 | mu, log_var = self.encode(input[0]) 165 | z = self.reparameterize(mu, log_var) 166 | zplus = torch.cat((z, input[1]), dim = 1) 167 | mask = (input[0] != 0) * 1 168 | return [self.decode(zplus), input, mu, log_var, mask.float()] 169 | 170 | def loss_function(self, 171 | *args, 172 | **kwargs) -> dict: 173 | """ 174 | Computes the VAE loss function. 175 | KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2} 176 | :param args: 177 | :param kwargs: 178 | :return: 179 | """ 180 | recons = args[0] 181 | input = args[1] 182 | mu = args[2] 183 | log_var = args[3] 184 | mask = args[4] 185 | 186 | kld_weight = self.kld_weight # Account for the minibatch samples from the dataset 187 | 188 | 189 | recons_loss = F.mse_loss(recons * mask, input[0] * mask) 190 | 191 | 192 | kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0) 193 | 194 | loss = recons_loss + kld_weight * kld_loss 195 | return {'loss': loss, 'Reconstruction_Loss':recons_loss, 'KLD':-kld_loss} 196 | 197 | 198 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/TrainerExe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | class TrainerExe(object): 8 | def __init__(self): 9 | super(TrainerExe, self).__init__() 10 | self.l=None 11 | 12 | def set_l(self, l): 13 | self.l=l 14 | 15 | def train(self, model, train_loader, 16 | num_epochs=5, learning_rate=1e-3, annealing = False, KLDwinc = 0.02, n_incr =50, RCcountMax = 40): 17 | self.learning_rate = 1e-2 18 | if (self.learning_rate > learning_rate): 19 | self.learning_rate = learning_rate 20 | self.model = model 21 | 22 | optimizer = optim.Adam(self.model.parameters(), 23 | lr=self.learning_rate, 24 | weight_decay=1e-5) 25 | RCcount = 0 26 | loss_min = 99999999 27 | for epoch in range(num_epochs): 28 | total_loss = 0 29 | for i, img in enumerate(tqdm(train_loader)): 30 | recon = self.model(img) 31 | loss = self.model.loss_function(*recon) 32 | loss.get("loss").backward() 33 | optimizer.step() 34 | optimizer.zero_grad() 35 | total_loss += loss.get("loss").data 36 | print('Epoch:{}, Loss:{:.4f}'.format(epoch+1, float(total_loss))) 37 | if (total_loss>loss_min): 38 | RCcount = RCcount + 1 39 | if (RCcount == RCcountMax): 40 | RCcount = 0 41 | self.learning_rate = self.learning_rate/2 42 | optimizer.param_groups[0]['lr'] = self.learning_rate 43 | loss_min = loss_min + 10 44 | print('New learning rate:{}'.format(float(self.learning_rate))) 45 | else: 46 | loss_min = total_loss 47 | if annealing: 48 | self.model.seed = epoch 49 | if epoch % n_incr == (n_incr-1): 50 | self.model.kld_weight = self.model.kld_weight + KLDwinc 51 | print('KLD weight annealing: increase {}. Now is :{:.4f}'.format(KLDwinc, float(self.model.kld_weight))) 52 | loss_min = loss_min + 500 53 | if (self.learning_rate < 1e-7): 54 | break 55 | 56 | def get_predict(self,train_loader): 57 | output = [] 58 | for i, img in enumerate(train_loader): 59 | recon = self.model(img) 60 | output.append(recon[0].detach().numpy()[0,0,:,:]) 61 | return(np.stack(output)) 62 | 63 | def get_hidecode(self,train_loader): 64 | output = [] 65 | for i, img in enumerate(tqdm(train_loader)): 66 | embedding1 = self.model.encoderl1(img.float()) 67 | embedding2 = self.model.encoderl2(embedding1) 68 | embedding3 = self.model.encoderl3(embedding2) 69 | embedding4 = self.model.encoderl4(embedding3) 70 | output.append(embedding4) 71 | return(output) 72 | 73 | def deep_reshape(self, data, refer): 74 | """ 75 | Given generated data for a sample and a reference coordinates data, reshape the data by (location) X (Gene) 76 | :param data: (Numpy) [nsample X Gene X location_x X location_y] 77 | :return: (Numpy) [nsample X Gene X location(x X y filtered)] 78 | """ 79 | x = refer.iloc[:,0] 80 | y = refer.iloc[:,1] 81 | xmin = x.min() 82 | xmax = x.max() 83 | ymin = y.min() 84 | ymax = y.max() 85 | xlen = xmax - xmin + 1 86 | ylen = ymax - ymin + 1 87 | marker = np.zeros(xlen*ylen, dtype = bool) 88 | for i in range(refer.shape[0]): 89 | marker[(refer.iloc[i,0]-xmin)*ylen + refer.iloc[i,1] - ymin] = True 90 | final = data[:,:,marker] 91 | return(final) 92 | 93 | def fast_generation(self,train_loader, nsample): 94 | """ 95 | Given original gene-image data and the number of samples to be sampled 96 | :param train_loader 97 | nsample: (Int) the number of samples 98 | :return: (Numpy) [nsample X Gene X location(x X y filtered)] 99 | """ 100 | output = [] 101 | for i, img in enumerate(tqdm(train_loader)): 102 | outputinside = [] 103 | self.model.seed = 0 104 | mu, log_var = self.model(img)[2:4] 105 | for j in range(nsample): 106 | self.model.seed = j 107 | z = self.model.reparameterize(mu, log_var) 108 | zplus = torch.cat((z, img[1]), dim = 1) 109 | outputi = self.model.decode(zplus) 110 | outputinside.append(outputi.detach().numpy()[0,0,:,:]) 111 | output.append(np.stack(outputinside)) 112 | final = np.stack(output) 113 | final2 = np.swapaxes( final,0,1) 114 | final3 = final2.reshape((final2.shape[0], final2.shape[1],-1) ) 115 | return(final3) 116 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2.1' 2 | from . util import * 3 | from . datasetgenemap import * 4 | from . DNN import * 5 | from . util_Mouse import * 6 | from . fit_functions import * 7 | from . data_augmentation import * 8 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/DNN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/DNN.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/DNN.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/DNN.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/SpaCluster.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/SpaCluster.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/TrainerExe.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/TrainerExe.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/VanillaVAE.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/VanillaVAE.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/autoencoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/autoencoder.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/data_augmentation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/data_augmentation.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/fit_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/fit_functions.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/type_.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/type_.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/types_.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/types_.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/types_.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/types_.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-36.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-38.pyc -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/data_augmentation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import pickle 7 | 8 | from .util import * 9 | 10 | from sklearn.cluster import KMeans 11 | from . datasetgenemap import datagenemapclust 12 | from . ClusterVAE import ClusterVAEmask 13 | from . TrainerExe import TrainerExe 14 | from . datasetgenemap import wrap_gene_domain 15 | from . DNN import DNN 16 | from . DNN import DNNordinal 17 | from . DNN import DNNdomain 18 | 19 | def seed_worker(worker_id): 20 | worker_seed = torch.initial_seed() % 2**32 21 | np.random.seed(worker_seed) 22 | random.seed(worker_seed) 23 | 24 | def DataAugmentation (RefDataOrigin, obs_location = ['x_cord','y_cord'], path = "output/Project", filename = "SpatialTranscript", clusterready = False, n_clusters=100, beta = 1e-5, nrep = 2, generateplot = True): 25 | #Prepare 26 | RefDataOriginsort = RefDataOrigin.obs.sort_values (by = obs_location) 27 | RefDataOrigin = RefDataOrigin[RefDataOriginsort.index] 28 | cdata = RefDataOrigin.copy() 29 | getGeneImg(cdata, emptypixel = 0, obsset = obs_location) 30 | cdataexpand = np.expand_dims(cdata.GeneImg, axis=1) 31 | #Clustering 32 | try: 33 | os.makedirs("{path}/DataAugmentation".format(path = path)) 34 | except FileExistsError: 35 | print("Folder already exists") 36 | if clusterready: 37 | kmeansresults = np.load("{path}/DataAugmentation/{filename}_cluster.npy".format(path = path, filename = filename)) 38 | else: 39 | kmeansmodel = KMeans(n_clusters, random_state=0) 40 | cdatacentral = centralize(cdataexpand.copy()) 41 | direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])] 42 | direflat = [x.flat for x in direclust] 43 | direflatnp = np.stack(direflat) 44 | kmeans = kmeansmodel.fit(direflatnp) 45 | kmeansresults = kmeans.labels_ 46 | np.save("{path}/DataAugmentation/{filename}_cluster.npy".format(path = path, filename = filename), kmeansresults) 47 | # 48 | full_RefData = datagenemapclust(cdataexpand, kmeansresults) 49 | CVAEmodel, clg = FitGenModel(path = path, filename = filename, traindata = full_RefData, cdataexpand = cdataexpand, Kmeans_cluster = kmeansresults, beta = beta) 50 | CVAEmodel, clg = FitGenModel_continue(path = path, filename = filename, model = CVAEmodel, clg = clg, traindata = full_RefData, beta = beta) 51 | if generateplot: 52 | print("Now generating the plots for the augmented data...") 53 | GeneratePlot(path, filename, beta = beta, traindata = full_RefData) 54 | Data_Generation(path, filename, obs_location = obs_location, beta= beta, dataSection1 = RefDataOrigin, traindata = full_RefData, nrep = nrep) 55 | 56 | 57 | def FitGenModel (path, filename, traindata, cdataexpand, Kmeans_cluster, beta, hidden = [8,4,2,4,4], learning_rate = 1e-3, number_error_try = 30): 58 | random.seed(2021) 59 | torch.manual_seed(2021) 60 | np.random.seed(2021) 61 | # 62 | trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 1, shuffle = True, worker_init_fn=seed_worker) 63 | ## Set up Autoencoder 64 | CVAEmodel = ClusterVAEmask(latent_dim = 511-Kmeans_cluster.max(), total_cluster = Kmeans_cluster.max(), fgx = cdataexpand.shape[2], fgy = cdataexpand.shape[3], KLDw = 0, hidden = hidden) 65 | CVAEmodel = CVAEmodel.float() 66 | file = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta) 67 | # 68 | ## Run Autoencoder 69 | clg = TrainerExe() 70 | clg.train(model = CVAEmodel, train_loader = trainloader, num_epochs= 249, annealing = True, KLDwinc = beta/4, n_incr =50, RCcountMax = number_error_try, learning_rate = learning_rate) 71 | # Save the model to a local folder 72 | filehandler = open(file, 'wb') 73 | pickle.dump(CVAEmodel, filehandler) 74 | print('save model to: {filename}'.format(filename = file)) 75 | CVAEmodel.filename = file 76 | return CVAEmodel, clg 77 | 78 | ## if still converging 79 | def FitGenModel_continue (path, filename, model, clg, traindata, beta): 80 | trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 1, shuffle = True, worker_init_fn=seed_worker) 81 | # 82 | file = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta) 83 | clg.train(model = model, train_loader = trainloader, num_epochs= 200, annealing = False, RCcountMax = 5, learning_rate = clg.learning_rate) 84 | # Save the model to a local folder 85 | filehandler = open(file, 'wb') 86 | pickle.dump(model, filehandler) 87 | print('save model to: {filename}'.format(filename=file)) 88 | model.filename = file 89 | return model, clg 90 | 91 | def GeneratePlot(path, filename, beta, traindata, sigma = 0): 92 | trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4) 93 | file = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta) 94 | # 95 | filehandler = open(file, 'rb') 96 | CVAEmodel = pickle.load(filehandler) 97 | # 98 | clg=TrainerExe() 99 | clg.model = CVAEmodel 100 | try: 101 | os.makedirs("{path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta}".format(path = path, file = filename, beta = beta)) 102 | except FileExistsError: 103 | print("Folder {path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta} already exists".format(path = path, file = filename, beta = beta)) 104 | for j, img in enumerate(trainloader): 105 | # img = next(dataloader_iterator) 106 | plotGeneImg(img[0][0,0,:,:], filename = "{path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta}/img{j}".format(path = path, file = filename, beta = beta, j = j)) 107 | omin = img[0].min() 108 | omax = img[0].max() 109 | if sigma == 0: 110 | sigma = (omax-omin)/6 111 | for i in range(10): 112 | CVAEmodel.seed = i 113 | result = CVAEmodel(img) 114 | outputraw = result[0][0,0,:,:].detach().numpy() 115 | outputimg = (outputraw + np.random.normal(0,sigma,outputraw.shape)) * result[4][0,0,:,:].detach().numpy() 116 | plotGeneImg( outputimg , filename = "{path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta}/img{j}var{i}".format(path = path, file = filename, beta = beta, j = j, i = i), range = (-3, 3)) 117 | 118 | 119 | def Data_Generation(path, filename, beta, dataSection1, traindata, nrep, obs_location = ['x_cord','y_cord']): 120 | trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4) 121 | random.seed(2021) 122 | torch.manual_seed(2021) 123 | np.random.seed(2021) 124 | # 125 | fileto = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta) 126 | filehandler = open(fileto, 'rb') 127 | CVAEmodel = pickle.load(filehandler) 128 | # 129 | clg= TrainerExe() 130 | clg.model = CVAEmodel 131 | data_gen=clg.fast_generation(trainloader, nrep) 132 | # data_gen=np.load("../output/{folder}/data_gen.npy".format(folder = folder)) 133 | data_gen_rs = clg.deep_reshape (data = data_gen, refer = dataSection1.obs[obs_location]) 134 | try: 135 | os.makedirs("{path}/DataAugmentation/DataGen".format(path = path)) 136 | except FileExistsError: 137 | print("Folder already exists") 138 | np.save("{path}/DataAugmentation/DataGen/{filename}_data_gen_{beta}_n{nrep}.npy".format(path = path, filename = filename, beta = beta, nrep = nrep), data_gen_rs) 139 | 140 | 141 | def AugFit_domain (RefDataOrigin, domain_weights, domain_data = None, domainkey = "layer", hidden_dims = [50, 10, 5], num_epochs_max = 500, beta = 1e-5, nrep = 2, path = "../output/Biogene", filename = "SpatialTranscript", batch_size = 4, num_workers = 4, number_error_try = 15, initial_learning_rate = 0.0001, seednum = 2021): 142 | random.seed(seednum) 143 | torch.manual_seed(seednum) 144 | np.random.seed(seednum) 145 | if domain_data is None: 146 | domain_data = RefDataOrigin.obs 147 | # 148 | # Original Version 149 | data_gen_rs = np.load("{path}/DataAugmentation/DataGen/{filename}_data_gen_{beta}_n{nrep}.npy".format(path = path, filename = filename, beta = beta, nrep = nrep)) 150 | # Attach the original 151 | tdatax = np.expand_dims(RefDataOrigin.X, axis = 0) 152 | tdata_rs = np.swapaxes(tdatax, 1, 2) 153 | datacomp = np.concatenate((data_gen_rs, tdata_rs), axis=0) 154 | # 155 | dataDNN = wrap_gene_domain(datacomp, domain_data, domainkey) 156 | CoReg_loader = torch.utils.data.DataLoader(dataDNN, batch_size=batch_size, num_workers = num_workers, shuffle = True, worker_init_fn=seed_worker) 157 | # Create Deep Neural Network for Coordinate Regression 158 | DNNmodel = DNNdomain( in_channels = data_gen_rs.shape[1], num_classes = domain_weights.shape[0], hidden_dims = hidden_dims, importance_weights = domain_weights) 159 | DNNmodel = DNNmodel.float() 160 | # 161 | CoReg = TrainerExe() 162 | CoReg.train(model = DNNmodel, train_loader = CoReg_loader, num_epochs= num_epochs_max, RCcountMax = number_error_try, learning_rate = initial_learning_rate) 163 | # 164 | try: 165 | os.makedirs("{path}/DataAugmentation/PredictionModel".format(path = path)) 166 | except FileExistsError: 167 | print("Note: Folder {path}/DataAugmentation/PredictionModel already exists".format(path = path)) 168 | filename2 = "{path}/DataAugmentation/PredictionModel/{filename}_domain_{beta}_n{nrep}.obj".format(filename = filename, path = path, beta = beta, nrep = nrep) 169 | filehandler2 = open(filename2, 'wb') 170 | pickle.dump(DNNmodel, filehandler2) 171 | 172 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/datasetgenemap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.utils.data import TensorDataset 4 | 5 | 6 | 7 | class datasetgenemap(TensorDataset): 8 | """Dataset wrapping unlabeled data tensors. 9 | No longer used. 10 | Each sample will be retrieved by indexing tensors along the first 11 | dimension. 12 | 13 | Arguments: 14 | datainput (numpy array): contains sample data. 15 | """ 16 | def __init__(self, datainput): 17 | self.data_tensor = torch.from_numpy(datainput).float() 18 | 19 | def __getitem__(self, index): 20 | return self.data_tensor[index].astype(np.float32) 21 | 22 | def __len__(self): 23 | return len(self.data_tensor) 24 | 25 | 26 | 27 | class datagenemapclust(TensorDataset): 28 | """Dataset wrapping labeled (cluster label) data tensors with cluster information. 29 | Used in data augmentation models 30 | Each sample will be retrieved by indexing tensors along the first 31 | dimension. 32 | 33 | Arguments: 34 | datainput (numpy array): contains sample data. 35 | """ 36 | def __init__(self, datainput, label): 37 | self.data_tensor = torch.from_numpy(datainput).float() 38 | self.maxnum = label.max() 39 | self.clustempty = np.zeros(self.maxnum + 1,'float32') 40 | self.label = label 41 | 42 | def __getitem__(self, index): 43 | image = self.data_tensor[index] 44 | cluster = self.clustempty.copy() 45 | cluster[self.label[index]] = 1 46 | return image, torch.from_numpy(cluster).float() 47 | 48 | def __len__(self): 49 | return len(self.data_tensor) 50 | 51 | 52 | 53 | class wrap_gene_location(TensorDataset): 54 | """Dataset wrapping labeled (cluster label) data tensors with cluster information. 55 | Used in data prediction models 56 | Each sample will be retrieved by indexing tensors along the first 57 | dimension. 58 | 59 | Arguments: 60 | datainput (numpy array): contains sample data. 61 | """ 62 | def __init__(self, datainput, label): 63 | self.data_tensor = torch.from_numpy(datainput).float() 64 | cord = label.to_numpy().astype('float32') 65 | cordx = cord[:,0] 66 | cordy = cord[:,1] 67 | self.xmin = cordx.min()-1 68 | self.ymin = cordy.min()-1 69 | self.xmax = cordx.max()+1 70 | self.ymax = cordy.max()+1 71 | self.cordx_norm = (cordx - self.xmin)/(self.xmax-self.xmin) 72 | self.cordy_norm = (cordy - self.ymin)/(self.ymax-self.ymin) 73 | self.imagedimension = self.data_tensor.shape 74 | def __getitem__(self, index): 75 | indexsample = index // self.imagedimension[2] 76 | indexspot = index % self.imagedimension[2] 77 | geneseq = self.data_tensor[indexsample,:,indexspot] 78 | cordinates = torch.tensor([self.cordx_norm[indexspot],self.cordy_norm[indexspot]]) 79 | return geneseq, cordinates 80 | def __len__(self): 81 | return self.imagedimension[0] * self.imagedimension[2] 82 | 83 | 84 | class wrap_gene_layer(TensorDataset): 85 | """Dataset wrapping labeled (cluster label) data tensors with cluster information. 86 | Used in data prediction models 87 | Each sample will be retrieved by indexing tensors along the first 88 | dimension. 89 | 90 | Arguments: 91 | datainput (numpy array): contains sample data. 92 | layer (boolean): T if layer information is contained 93 | layerkey: the keyword for layer. Default is "Layer" 94 | """ 95 | def __init__(self, datainput, label, layerkey = "layer"): 96 | self.data_tensor = torch.from_numpy(datainput).float() 97 | getlayer = label[layerkey].to_numpy() 98 | self.layer = getlayer.astype('float32') 99 | self.layersunq = np.sort(np.unique(self.layer)) 100 | self.nlayers = len(self.layersunq) 101 | self.imagedimension = self.data_tensor.shape 102 | def __getitem__(self, index): 103 | indexsample = index // self.imagedimension[2] 104 | indexspot = index % self.imagedimension[2] 105 | geneseq = self.data_tensor[indexsample,:,indexspot] 106 | layeri = int(self.layer[indexspot]) - 1 107 | layerv = np.zeros(self.nlayers-1) 108 | layerv[:layeri] = 1 109 | return geneseq, layerv 110 | def __len__(self): 111 | return self.imagedimension[0] * self.imagedimension[2] 112 | 113 | 114 | class wrap_gene_domain(TensorDataset): 115 | """Dataset wrapping labeled (cluster label) data tensors with cluster information. 116 | Used in data prediction models 117 | Each sample will be retrieved by indexing tensors along the first 118 | dimension. 119 | 120 | Arguments: 121 | datainput (numpy array): contains sample data. 122 | layer (boolean): T if layer information is contained 123 | layerkey: the keyword for layer. Default is "Layer" 124 | """ 125 | def __init__(self, datainput, label, layerkey = "layer"): 126 | self.data_tensor = torch.from_numpy(datainput).float() 127 | getlayer = label[layerkey].to_numpy() 128 | self.layer = getlayer.astype('float32') 129 | self.layersunq = np.sort(np.unique(self.layer)) 130 | self.nlayers = len(self.layersunq) 131 | self.imagedimension = self.data_tensor.shape 132 | def __getitem__(self, index): 133 | indexsample = index // self.imagedimension[2] 134 | indexspot = index % self.imagedimension[2] 135 | geneseq = self.data_tensor[indexsample,:,indexspot] 136 | layeri = self.layer[indexspot].astype('int64') 137 | return geneseq, layeri 138 | def __len__(self): 139 | return self.imagedimension[0] * self.imagedimension[2] 140 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/layers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy 4 | 5 | import math 6 | import torch 7 | from torch.nn.parameter import Parameter 8 | from torch.nn.modules.module import Module 9 | from torch.utils.data import DataLoader 10 | 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | class ConvolutionNN(Module): 15 | """ 16 | Simple CNN layer 17 | """ 18 | def __init__(self, , ) 19 | 20 | def getGeneImg (adata, geneset = None): 21 | # Transform the AnnData file into Genes of images 22 | # adata: the input data of AnnData object 23 | # geneset: the set of gene considered 24 | if geneset is None: 25 | x = adata.obs[["x2"]] 26 | y = adata.obs[["x3"]] 27 | xmin = x.min().iloc[0] 28 | xmax = x.max().iloc[0] 29 | ymin = y.min().iloc[0] 30 | ymax = y.max().iloc[0] 31 | # i = 12 32 | for i in range(adata.X.shape[1]): 33 | z = adata.X[:,i] 34 | zmin = z.min() 35 | zmax = z.max() 36 | # create array for image : zmax+1 is the default value 37 | shape = (xmax-xmin+1,ymax-ymin+1) 38 | img = np.ma.array(np.ones(shape)*0) 39 | for inp in range(x.shape[0]): 40 | img[x.iloc[inp,0]-xmin,y.iloc[inp,0]-ymin]=z[inp,0] 41 | # set mask on default value 42 | img.mask = (img==0) 43 | # set a gray background for test 44 | img_bg_test = np.zeros(shape) 45 | cmap_bg_test = plt.get_cmap('gray') 46 | plt.imshow(img_bg_test,cmap=cmap_bg_test,interpolation='none') 47 | # plot 48 | cmap = plt.get_cmap('jet') 49 | plt.imshow(img,cmap=cmap,interpolation='none',vmin=zmin,vmax=zmax) 50 | plt.colorbar() 51 | plt.show() 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEry/types_.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Union, Any, TypeVar, Tuple 2 | from torch import tensor as Tensor 3 | 4 | Tensor = TypeVar('torch.tensor') 5 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEryPy.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: CeLEryPy 3 | Version: 1.2.1 4 | Summary: Leverage spatial transcriptomics data to recover cell locations in single-cell RNA RNA-seq 5 | Author-email: Qihuang Zhang 6 | License: Copyright (c) 2022 The Python Packaging Authority 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | Project-URL: Homepage, https://github.com/QihuangZhang/CeLEry 26 | Keywords: CeLEry,spatial transcriptomics,scRNA-seq 27 | Classifier: License :: OSI Approved :: MIT License 28 | Classifier: Programming Language :: Python 29 | Classifier: Programming Language :: Python :: 3 30 | Requires-Python: >=3.8 31 | Description-Content-Type: text/markdown 32 | License-File: LICENSE 33 | Requires-Dist: torch>=1.8 34 | Requires-Dist: pandas>=1.4 35 | Requires-Dist: numpy>=1.20 36 | Requires-Dist: scipy 37 | Requires-Dist: tqdm 38 | Requires-Dist: scanpy>=1.5 39 | Requires-Dist: scikit-image 40 | Requires-Dist: anndata 41 | Requires-Dist: scikit-learn 42 | 43 | # CeLEry 44 | ## Leveraging spatial transcriptomics data to recover cell locationsin single-cell RNA-seq with CeLEry 45 | 46 | ### Qihuang Zhang, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li* 47 | 48 | Single-cell RNA sequencing provides resourceful information to study the cells systematically. However, their locational information is usually unavailable. We present CeLEry, a supervised deep learning algorithm to recover the origin of tissues in assist of spatial transcriptomic data, integrating a data augmentation procedure via variational autoencoder to improve the robustness of methods in the overfitting and the data contamination. CeLEry provides a generic framework and can be implemented in multiple tasks depending on the research objectives, including the spatial coordinates discovery as well as the layer discovery. It can make use of the information of multiple tissues of spatial transcriptomics data. Thorough assessments exhibit that CeLEry achieves a leading performance compared to the state-of-art methods. We illustrated the usage of CeLEry in the discovery of neuron cell layers to study the development of Alzheimer's disease. The identified cell location information is valuable in many downstream analyses and can be indicative of the spatial organization of the tissues. 49 | 50 | ## System Requirements 51 | Python support packages: torch>1.8, pandas>1.4, numpy>1.20, scipy, tqdm, scanpy>1.5, anndata, sklearn 52 | 53 | ## To install package 54 | In the command, input 55 | ``` 56 | pip install CeLEryPy 57 | ``` 58 | 59 | 60 | To load the package, input 61 | ``` 62 | import CeLEry 63 | ``` 64 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEryPy.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | pyproject.toml 4 | setup.py 5 | CeLEry/ClusterVAE.py 6 | CeLEry/DNN.py 7 | CeLEry/TrainerExe.py 8 | CeLEry/__init__.py 9 | CeLEry/data_augmentation.py 10 | CeLEry/datasetgenemap.py 11 | CeLEry/fit_functions.py 12 | CeLEry/layers.py 13 | CeLEry/types_.py 14 | CeLEry/util.py 15 | CeLEry/util_Mouse.py 16 | CeLEryPy.egg-info/PKG-INFO 17 | CeLEryPy.egg-info/SOURCES.txt 18 | CeLEryPy.egg-info/dependency_links.txt 19 | CeLEryPy.egg-info/requires.txt 20 | CeLEryPy.egg-info/top_level.txt -------------------------------------------------------------------------------- /CeLEry_package/CeLEryPy.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEryPy.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | torch>=1.8 2 | pandas>=1.4 3 | numpy>=1.20 4 | scipy 5 | tqdm 6 | scanpy>=1.5 7 | scikit-image 8 | anndata 9 | scikit-learn 10 | -------------------------------------------------------------------------------- /CeLEry_package/CeLEryPy.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | CeLEry 2 | -------------------------------------------------------------------------------- /CeLEry_package/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /CeLEry_package/README.md: -------------------------------------------------------------------------------- 1 | # CeLEry 2 | ## Leveraging spatial transcriptomics data to recover cell locationsin single-cell RNA-seq with CeLEry 3 | 4 | ### Qihuang Zhang, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li* 5 | 6 | Single-cell RNA sequencing provides resourceful information to study the cells systematically. However, their locational information is usually unavailable. We present CeLEry, a supervised deep learning algorithm to recover the origin of tissues in assist of spatial transcriptomic data, integrating a data augmentation procedure via variational autoencoder to improve the robustness of methods in the overfitting and the data contamination. CeLEry provides a generic framework and can be implemented in multiple tasks depending on the research objectives, including the spatial coordinates discovery as well as the layer discovery. It can make use of the information of multiple tissues of spatial transcriptomics data. Thorough assessments exhibit that CeLEry achieves a leading performance compared to the state-of-art methods. We illustrated the usage of CeLEry in the discovery of neuron cell layers to study the development of Alzheimer's disease. The identified cell location information is valuable in many downstream analyses and can be indicative of the spatial organization of the tissues. 7 | 8 | ## System Requirements 9 | Python support packages: torch>1.8, pandas>1.4, numpy>1.20, scipy, tqdm, scanpy>1.5, anndata, sklearn 10 | 11 | ## To install package 12 | In the command, input 13 | ``` 14 | pip install CeLEryPy 15 | ``` 16 | 17 | 18 | To load the package, input 19 | ``` 20 | import CeLEry 21 | ``` -------------------------------------------------------------------------------- /CeLEry_package/dist/CeLEryPy-1.2.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/dist/CeLEryPy-1.2.1-py3-none-any.whl -------------------------------------------------------------------------------- /CeLEry_package/dist/CeLEryPy-1.2.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/dist/CeLEryPy-1.2.1.tar.gz -------------------------------------------------------------------------------- /CeLEry_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "CeLEryPy" 7 | version = "1.2.1" 8 | description = "Leverage spatial transcriptomics data to recover cell locations in single-cell RNA RNA-seq" 9 | readme = "README.md" 10 | authors = [{ name = "Qihuang Zhang", email = "qihuang.zh@gmail.com"}] 11 | license = { file = "LICENSE" } 12 | classifiers = [ 13 | "License :: OSI Approved :: MIT License", 14 | "Programming Language :: Python", 15 | "Programming Language :: Python :: 3", 16 | ] 17 | keywords = ["CeLEry", "spatial transcriptomics", "scRNA-seq"] 18 | dependencies = [ 19 | "torch >= 1.8", 20 | "pandas >= 1.4", 21 | "numpy >= 1.20", 22 | "scipy", 23 | "tqdm", 24 | "scanpy >= 1.5", 25 | "scikit-image", 26 | "anndata", 27 | "scikit-learn", 28 | ] 29 | requires-python = ">=3.8" 30 | 31 | [project.urls] 32 | Homepage = "https://github.com/QihuangZhang/CeLEry" 33 | 34 | -------------------------------------------------------------------------------- /CeLEry_package/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Qihuang Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CeLEry 2 | ## Leveraging spatial transcriptomics data to recover cell locationsin single-cell RNA-seq with CeLEry 3 | 4 | ### Qihuang Zhang*, Shunzhou Jiang, Amelia Schroeder, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li* 5 | 6 | Single-cell RNA sequencing (scRNA-seq) has transformed our understanding of cellular heterogeneity in health and disease, but the lack of physical relationships among dissociated cells has limited its applications. Here we present CeLEry, a supervised deep learning algorithm to recover the spatial origins of cells in scRNA-seq by leveraging gene expression and spatial location information learned from spatial transcriptomics (ST) data. CeLEry has a data augmentation procedure via variational autoencoder to improve the robustness of the method and overcome noise in scRNA-seq. CeLEry can infer the spatial origins of cells in scRNA-seq at multiple levels, including 2D location as well as the spatial domain or tissue layer of a cell. CeLEry also provides uncertainty estimates for the recovered location information. Comprehensive evaluations on multiple datasets generated from mouse and human brains show that CeLEry can reliably recover the spatial location information for cells in scRNA-seq. 7 | 8 | ![CeLEry workflow](docs/asserts/images/workflow.png) 9 | 10 | *The implmentation procedure of CeLEry*: 11 | - CeLEry takes spatial transcriptomic data as input for the training data and the scRNA-seq as testing data set. 12 | - CeLEry optionally generates replicates of the spatial transcriptomic data via variational autoencoder then includes them as the training data together with original spatial transcriptomic data. 13 | - A deep neural network is trained to learn the relationship between the spotwise gene expression and location information, minimizing the loss functions that are specified according to the specific problem. 14 | 15 | 16 | 17 | ## Usage 18 | 19 | The [**CeLEry**](https://github.com/QihuangZhang/CeLEry) package is an implementation of a deep neural network in discovering location information for single cell RNA data. With CeLEry, you can: 20 | 21 | - Preprocess spatial transcriptomics data from various formats. 22 | - Build a deep neural network to predict cell locations. 23 | - Generate synthetic spatial transcriptomic data. 24 | 25 | 26 | 27 | ## Tutorial 28 | 29 | 30 | A Jupyter Notebook of the tutorial is accessible from : 31 |
32 | https://github.com/QihuangZhang/CeLEry/blob/main/tutorial/tutorial.md 33 |
34 | 35 | 36 | The tutorial of the Biogen pretrain model can be accessible from : 37 |
38 | https://github.com/QihuangZhang/CeLEry/blob/main/tutorial/BiogenPretrain.md 39 |
40 | 41 | # System Requirements 42 | 43 | ## Hardware Requirements 44 | 45 | The `CeLEry` package requires only a standard computer with enough RAM to support the operations defined by a user. For minimal performance, this will be a computer with about 2 GB of RAM. For optimal performance, we recommend a computer with the following specs: 46 | 47 | RAM: 16+ GB 48 | CPU: 4+ cores, 3.3+ GHz/core 49 | 50 | ## Software Requirements 51 | 52 | ### OS Requirements 53 | 54 | The package development version is tested on *Linux* operating systems. The developmental version of the package has been tested on the following systems: 55 | 56 | Linux: kernel 3.10.0 57 | Mac OSX: 58 | Windows: 59 | 60 | ## System Requirements 61 | Python (>3.8) support packages: torch>=1.8, pandas>=1.4, numpy>=1.20, scipy, tqdm, scanpy>=1.5, anndata, sklearn, scikit-image 62 | 63 | 64 | # Install packages 65 | In the command, input 66 | ``` 67 | pip install CeLEryPy 68 | ``` 69 | 70 | The installation of CeLEry python package takes approximately 5 minumtes. 71 | -------------------------------------------------------------------------------- /code_paper/1_LIBD/CeLEry_train_Scenario1and2.py: -------------------------------------------------------------------------------- 1 | #!-### Note: Need to run "preprocess.py" first to obtain the available datasets. 2 | 3 | 4 | ## In this version of Cell Location discovEry (LIBD) we consider region of a tissue under Scenarios 1 and 2 5 | 6 | # Application to LIBD data 7 | 8 | import os,csv,re 9 | import pandas as pd 10 | import numpy as np 11 | import scanpy as sc 12 | import math 13 | 14 | from skimage import io, color 15 | from sklearn.cluster import KMeans 16 | 17 | from scipy.sparse import issparse 18 | import random, torch 19 | import warnings 20 | warnings.filterwarnings("ignore") 21 | import matplotlib.colors as clr 22 | import matplotlib.pyplot as plt 23 | import pickle 24 | 25 | #Read original data and save it to h5ad 26 | from scanpy import read_10x_h5 27 | #import SpaGCN as spg 28 | import CeLEry as cel 29 | 30 | from data.LIBD.LIBD_gene_select import d_g 31 | 32 | # import tangram as tg 33 | 34 | ## 1. Data Preperation -------------------------------------------------------------------------- 35 | ### Load MouseBarin Data Section 1: Regarded as Spatial Transcriptomic Data 36 | dataSection1 = sc.read("../data/LIBD/data_151673.h5ad") 37 | 38 | 39 | ## Conduct clustering 40 | cdata = dataSection1.copy() 41 | cel.getGeneImg(cdata,emptypixel = 0) 42 | cdataexpand = np.expand_dims(cdata.GeneImg, axis=1) 43 | 44 | cdatacentral = cel.centralize(cdataexpand.copy()) 45 | direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])] 46 | direflat = [x.flat for x in direclust] 47 | direflatnp = np.stack(direflat) 48 | 49 | # implementing k-means clustering 50 | kmeansmodel = KMeans(n_clusters=100, random_state=0) 51 | kmeans = kmeansmodel.fit(direflatnp) 52 | np.save("../output/LIBD/cluster_673.npy", kmeans.labels_) 53 | 54 | 55 | ## Calculating z-score 56 | cel.get_zscore(dataSection1) 57 | 58 | # get sorted indeces 59 | 60 | dataSection1sort = dataSection1.obs.sort_values (by = ['x2','x3']) 61 | dataSection1 = dataSection1[dataSection1sort.index] 62 | 63 | def seed_worker(worker_id): 64 | worker_seed = torch.initial_seed() % 2**32 65 | np.random.seed(worker_seed) 66 | random.seed(worker_seed) 67 | 68 | ## 2. Data Augmentation -------------------------------------------------------------------------- 69 | cdata = dataSection1.copy() 70 | cel.getGeneImg(cdata,emptypixel = 0) 71 | cdataexpand = np.expand_dims(cdata.GeneImg, axis=1) 72 | np.save("../output/LIBD/full_geneimg.npy", cdataexpand) 73 | 74 | # Read in gene expression and spatial location 75 | cdataexp_full = np.load("../output/LIBD/full_geneimg.npy") 76 | 77 | 78 | # Load Clustering Results 79 | Kmeans_cluster = np.load("../output/LIBD/cluster_673.npy") 80 | 81 | full = cel.datagenemapclust(cdataexp_full,Kmeans_cluster) 82 | 83 | 84 | ## Step 1: Model Fitting of CAVE------------------------------------------------------------------------------------ 85 | 86 | def FitGenModel (cdataexpand, beta, learning_rate = 1e-3): 87 | g = torch.Generator() 88 | g.manual_seed(2020) 89 | trainloader = torch.utils.data.DataLoader(full, batch_size=1, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g) 90 | random.seed(2020) 91 | torch.manual_seed(2020) 92 | np.random.seed(2020) 93 | # 94 | ## Set up Autoencoder 95 | CVAEmodel = cel.ClusterVAEmask(latent_dim = 511-Kmeans_cluster.max(), total_cluster = Kmeans_cluster.max(), fgx = cdataexpand.shape[2], fgy = cdataexpand.shape[3], KLDw = 0, hidden = [8,4,2,4,4]) 96 | CVAEmodel = CVAEmodel.float() 97 | filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta) 98 | # 99 | ## Run Autoencoder 100 | clg=cel.SpaCluster() 101 | clg.train(model = CVAEmodel, train_loader = trainloader, num_epochs= 249, annealing = True, KLDwinc = beta/4, n_incr =50, RCcountMax = 30, learning_rate = 0.001) 102 | # Save the model to a local folder 103 | filehandler = open(filename, 'wb') 104 | pickle.dump(CVAEmodel, filehandler) 105 | print('save model to: {filename}'.format(filename=filename)) 106 | CVAEmodel.filename = filename 107 | return CVAEmodel, clg 108 | 109 | CVAEmodel_e5, clg_e5 = FitGenModel(cdataexpand = cdataexp_full, beta = 1e-5) 110 | # CVAEmodel_e2, clg_e2 = FitGenModel(cdataexpand = cdataexp_full, beta = 1e-2) 111 | 112 | 113 | # ## if still converging 114 | def FitGenModel_continue (model, clg, cdataexpand, beta): 115 | g = torch.Generator() 116 | g.manual_seed(2020) 117 | trainloader= torch.utils.data.DataLoader(full, batch_size=1, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g) 118 | filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta) 119 | clg.train(model = model, train_loader = trainloader, num_epochs= 150, annealing = False, RCcountMax = 30, learning_rate = clg.learning_rate) 120 | # Save the model to a local folder 121 | filehandler = open(filename, 'wb') 122 | pickle.dump(model, filehandler) 123 | print('save model to: {filename}'.format(filename=filename)) 124 | model.filename = filename 125 | return model, clg 126 | 127 | CVAEmodel_e5, clg_e5 = FitGenModel_continue(model = CVAEmodel_e5, clg = clg_e5, cdataexpand = cdataexp_full, beta = 1e-5) 128 | 129 | 130 | ## Step 2: Data Generation ------------------------------------------------------------------------------------ 131 | 132 | ## Glimpse of generate model 133 | def GeneratePlot(beta, traindata): 134 | trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4) 135 | filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta) 136 | # 137 | filehandler = open(filename, 'rb') 138 | CVAEmodel = pickle.load(filehandler) 139 | # 140 | clg=cel.SpaCluster() 141 | clg.model = CVAEmodel 142 | try: 143 | os.makedirs("../output/LIBD/Generation/Glimps/Gen{beta}".format(beta = beta)) 144 | except FileExistsError: 145 | print("Folder already exists") 146 | for j, img in enumerate(trainloader): 147 | # img = next(dataloader_iterator) 148 | cel.plotGeneImg(img[0][0,0,:,:], filename = "../output/LIBD/Generation/Glimps/Gen{beta}/img{j}".format(beta = beta, j = j)) 149 | omin = img[0].min() 150 | omax = img[0].max() 151 | for i in range(10): 152 | result = CVAEmodel(img) 153 | outputimg = result[0][0,0,:,:].detach().numpy() * result[4][0,0,:,:].detach().numpy() 154 | cel.plotGeneImg( outputimg , filename = "../output/LIBD/Generation/Glimps/Gen{beta}/img{j}var{i}".format(beta = beta, j = j, i = i), range = (omin.item(), omax.item())) 155 | 156 | GeneratePlot(beta = 1e-5, traindata = full) 157 | GeneratePlot(beta = 1e-2, traindata = full) 158 | 159 | 160 | def Data_Generation(beta, dataSection1, traindata, nrep): 161 | trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4) 162 | random.seed(2021) 163 | torch.manual_seed(2021) 164 | np.random.seed(2021) 165 | # 166 | filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta) 167 | filehandler = open(filename, 'rb') 168 | CVAEmodel = pickle.load(filehandler) 169 | # 170 | clg=cel.SpaCluster() 171 | clg.model = CVAEmodel 172 | data_gen=clg.fast_generation(trainloader, nrep) 173 | # data_gen=np.load("../output/{folder}/data_gen.npy".format(folder = folder)) 174 | data_gen_rs = clg.deep_reshape (data = data_gen, refer = dataSection1.obs) 175 | try: 176 | os.makedirs("../output/LIBD/DataGen") 177 | except FileExistsError: 178 | print("Folder already exists") 179 | np.save("../output/LIBD/DataGen/data_gen_{beta}_n{nrep}.npy".format(beta = beta, nrep = nrep), data_gen_rs) 180 | 181 | 182 | Data_Generation(beta = 1e-5, nrep = 2, dataSection1 = dataSection1, traindata = full) 183 | Data_Generation(beta = 1e-5, nrep = 4, dataSection1 = dataSection1, traindata = full) 184 | Data_Generation(beta = 1e-5, nrep = 6, dataSection1 = dataSection1, traindata = full) 185 | Data_Generation(beta = 1e-5, nrep = 8, dataSection1 = dataSection1, traindata = full) 186 | Data_Generation(beta = 1e-5, nrep = 10, dataSection1 = dataSection1, traindata = full) 187 | 188 | 189 | 190 | ## Step 3** (weighted regression model): Prediction Model ------------------------------------------------------------------------------------ 191 | ## Count the number of spots on each layer 192 | layer_count = dataSection1.obs["Layer"].value_counts().sort_index() 193 | layer_weight = layer_count[7]/layer_count[0:7] 194 | layer_weights = torch.tensor(layer_weight.to_numpy()) 195 | 196 | 197 | 198 | def FitPredModel (beta, nrep, dataSection1): 199 | # 200 | random.seed(2020) 201 | torch.manual_seed(2020) 202 | np.random.seed(2020) 203 | g = torch.Generator() 204 | g.manual_seed(2021) 205 | # Original Version 206 | data_gen_rs = np.load("../output/LIBD/DataGen/data_gen_{beta}_n{nrep}.npy".format(beta = beta, nrep = nrep)) 207 | # Attach the original 208 | tdatax = np.expand_dims(dataSection1.X, axis = 0) 209 | tdata_rs = np.swapaxes(tdatax, 1, 2) 210 | datacomp = np.concatenate((data_gen_rs, tdata_rs), axis=0) 211 | # 212 | dataDNN = cel.wrap_gene_layer(datacomp, dataSection1.obs, "Layer") 213 | CoReg_loader = torch.utils.data.DataLoader(dataDNN, batch_size=4, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g) 214 | # Create Deep Neural Network for Coordinate Regression 215 | DNNmodel = cel.DNNordinal( in_channels = data_gen_rs.shape[1], num_classes = 7, hidden_dims = [50, 10, 5], importance_weights = layer_weights ) 216 | DNNmodel = DNNmodel.float() 217 | # 218 | CoReg=cel.SpaCluster() 219 | CoReg.train(model = DNNmodel, train_loader = CoReg_loader, num_epochs= 250, RCcountMax = 5, learning_rate = 0.001) 220 | # 221 | filename2 = "../output/LIBD/Prediction/data_gen_layer_{beta}_n{nrep}.obj".format(beta = beta, nrep = nrep) 222 | filehandler2 = open(filename2, 'wb') 223 | pickle.dump(DNNmodel, filehandler2) 224 | 225 | # temp 226 | # beta = 1e-5 227 | # nrep = 10 228 | # dataSection1 = dataSection1 229 | 230 | FitPredModel(beta = 1e-5, nrep = 2, dataSection1 = dataSection1) 231 | FitPredModel(beta = 1e-5, nrep = 4, dataSection1 = dataSection1) 232 | FitPredModel(beta = 1e-5, nrep = 6, dataSection1 = dataSection1) 233 | FitPredModel(beta = 1e-5, nrep = 8, dataSection1 = dataSection1) 234 | FitPredModel(beta = 1e-5, nrep = 10, dataSection1 = dataSection1) 235 | 236 | def FitPredModel_continue (holdoff, beta, nrep, dataSection1, learning_rate): 237 | filename2 = "../output/LIBD/Prediction/data_gen_layer_{holdoff}_{beta}_n{nrep}.obj".format(holdoff = holdoff, beta = beta, nrep = nrep) 238 | filehandler2 = open(filename2, 'rb') 239 | DNNmodel = pickle.load(filehandler2) 240 | # 241 | data_gen_rs = np.load("../output/LIBD/DataGen/data_gen_layer_{holdoff}_{beta}_n{nrep}.npy".format(holdoff = holdoff, beta = beta, nrep = nrep)) 242 | # 243 | dataDNN = cel.wrap_gene_layer(data_gen_rs, dataSection1.obs) 244 | CoReg_loader = torch.utils.data.DataLoader(dataDNN, batch_size=1, num_workers = 4, shuffle = True) 245 | # Create Deep Neural Network for Coordinate Regression 246 | # 247 | random.seed(2021) 248 | torch.manual_seed(2021) 249 | np.random.seed(2021) 250 | # 251 | CoReg=cel.SpaCluster() 252 | CoReg.train(model = DNNmodel, train_loader = CoReg_loader, num_epochs= 60, RCcountMax = 1, learning_rate = learning_rate) 253 | # 254 | filehandler2 = open(filename2, 'wb') 255 | pickle.dump(DNNmodel, filehandler2) 256 | 257 | 258 | FitPredModel_continue(holdoff = 50 , beta = 1e-5, nrep = 10, dataSection1 = Section1train50, learning_rate = 3.125e-06) 259 | 260 | FitPredModel_continue(holdoff = 50 , beta = 1e-9, nrep = 10, dataSection1 = Section1train50, learning_rate = 1.220703125e-08) 261 | 262 | 263 | ## Step 3**.2: Prediction Model of the case without data augmentation ------------------------------------------------------------------------------------ 264 | 265 | def FitPredModelNE (dataSection1): 266 | tdatax = np.expand_dims(dataSection1.X, axis = 0) 267 | tdata_rs = np.swapaxes(tdatax, 1, 2) 268 | DataTra = cel.wrap_gene_layer(tdata_rs, dataSection1.obs, "Layer") 269 | t_loader= torch.utils.data.DataLoader(DataTra, batch_size=1, num_workers = 4, shuffle = True) 270 | # Create Deep Neural Network for Coordinate Regression # 10, 4, 2 271 | DNNmodel = cel.DNNordinal( in_channels = DataTra[1][0].shape[0], num_classes = 7, hidden_dims = [10, 4, 2], importance_weights = layer_weights ) # [100,50,25] ) 272 | DNNmodel = DNNmodel.float() 273 | # 274 | CoOrg=cel.SpaCluster() 275 | CoOrg.train(model = DNNmodel, train_loader = t_loader, num_epochs= 150, RCcountMax = 15, learning_rate = 0.001) 276 | # 277 | filename3 = "../output/LIBD/Prediction/layer_PreOrg.obj" 278 | filehandler2 = open(filename3, 'wb') 279 | pickle.dump(DNNmodel, filehandler2) 280 | 281 | 282 | FitPredModelNE (dataSection1 = dataSection1) 283 | 284 | -------------------------------------------------------------------------------- /code_paper/1_LIBD/LIBDVisual.R: -------------------------------------------------------------------------------- 1 | ### This file trying to visual the hodge results using the histology information 2 | 3 | ## 0. Global Parameters and Packages 4 | 5 | library(dplyr) 6 | library(ggplot2) 7 | library(tidyr) 8 | library(png) 9 | 10 | outputdir <- "output/LIBD/plots/" 11 | 12 | 13 | 14 | # Functions --------------------------------------------------------------- 15 | 16 | Density_plot_data <- function(pred_result, Study){ 17 | dataplot <- cbind(background, pred_result) 18 | data_long <- gather(dataplot, column, prob, V3:V9, factor_key = TRUE) 19 | data_long_new <- data_long %>% 20 | mutate(TargetLayer = factor(column, labels = c("L1","L2","L3","L4","L5","L6","WM"), levels = paste0("V",3:9))) %>% 21 | mutate(study = Study) 22 | return(data_long_new) 23 | } 24 | 25 | 26 | labelvector <- c("L1","L2","L3","L4","L5","L6","WM") 27 | 28 | 29 | 30 | Density_plot <- function(dataall, LayerSet = labelvector){ 31 | png(file = paste0(outputdir,"LIBD_Density_plot.png"), height = 1600, width = 1400) 32 | Boxplot <- ggplot(dataall %>% 33 | filter (TargetLayer %in% LayerSet) %>% 34 | mutate (TargetLayerfull = factor(TargetLayer, 35 | labels = paste0("predicted to ",LayerSet), 36 | levels = LayerSet)), 37 | aes(x = ycord, y = xcord) ) + 38 | theme_bw() + 39 | geom_point(aes(fill = prob), shape = 21, color = "black", size = 1.5, stroke = 0.3) + 40 | annotation_raster(Histology, ymin = 0, ymax= 1, xmin = 0, xmax = 1) + 41 | # scale_y_reverse() + 42 | facet_grid(TargetLayerfull~study, switch = "y") + 43 | # scale_fill_gradient(low = "#08121b", high = "#9fd3fa") + 44 | scale_fill_gradient2(low = "#08121b", mid = "#56b1f7", high = "#EB9486",midpoint = 0.5) + 45 | theme(text=element_text(size=25, family="URWHelvetica"), axis.text = element_text(size = 30, family="URWHelvetica"), 46 | panel.spacing = unit(1, "lines")) + 47 | theme(strip.background =element_rect(fill="#17202A",color="#17202A"))+ 48 | theme(strip.text = element_text(colour = 'white'),axis.text=element_blank()) + #, strip.position = "left" 49 | theme(panel.border = element_rect(colour = "#17202A"), legend.position="none") + # 50 | labs(x = NULL, y = NULL, fill = "Probability") 51 | print(Boxplot) 52 | dev.off() 53 | 54 | png(file = paste0(outputdir,"LIBD_Density_plot_legend.png"), height = 1600, width = 1400) 55 | Boxplot2 <- Boxplot + 56 | theme(legend.position="right") 57 | 58 | print(Boxplot2) 59 | dev.off() 60 | } 61 | 62 | # 1. Prepare the LIBD backgound data -------------------------------------- 63 | 64 | ## Using data of 507 as the background 65 | 66 | tissue_pos <- read.csv("data/LIBD/visualization_151507.csv") 67 | # Histology <- readPNG("data/LIBD/151507_tissue_lowres_image.png") 68 | Histology <- readPNG("data/LIBD/RegionReference_bw2.png") 69 | # Histology_tsp <- matrix(rgb(Histology[,,1],Histology[,,2],Histology[,,3], 0.7), nrow=dim(Histology)[1]) 70 | 71 | Histology_maxx <- max(tissue_pos["x2"])+1 72 | Histology_minx <- min(tissue_pos["x2"]) 73 | Histology_maxy <- max(tissue_pos["x3"])+1 74 | Histology_miny <- min(tissue_pos["x3"]) 75 | 76 | 77 | background <- tissue_pos %>% 78 | mutate (xcord = 0.92-x2/(1.2*Histology_maxx+Histology_minx-5)) %>% 79 | mutate (ycord = 0.12+x3/(1.22*Histology_maxy+Histology_miny)) 80 | 81 | # 2. Apply the Hodge results on to the LIBD background -------------------- 82 | 83 | ## Load the results information 84 | 85 | pred_CeLEry <- read.csv("output/LIBD/Prediction151507/layer_PreOrg_probmat.csv", header = F) 86 | data_CeLEry <- Density_plot_data(pred_CeLEry, "CeLEry") 87 | 88 | pred_CeLEryn2 <- read.csv("output/LIBD/Prediction151507/data_gen_layer_1e-05_n2_probmat.csv", header = F) 89 | data_CeLEryn2 <- Density_plot_data(pred_CeLEryn2, "CeLEry (Augmentation)") 90 | 91 | pred_Tangram <- read.csv("output/LIBD/Prediction151507/Tangram_probmat_151507.csv", header = F) 92 | pred_Tangram_full <- cbind(0,0,pred_Tangram) 93 | names(pred_Tangram_full) <- paste0("V",1:9) 94 | data_Tangram <- Density_plot_data(pred_Tangram_full, "Tangram") 95 | 96 | pred_Multiple <- read.csv("output/LIBDmultiple/Prediction151507/layer_PreOrgv2_probmat.csv", header = F) 97 | data_Multiple <- Density_plot_data(pred_Multiple, "CeLEry (Multiple)") 98 | 99 | 100 | 101 | pred_spaOTsc <- read.csv("output/LIBD/Prediction151507/spaOTsc_probmat.csv", header = F) 102 | pred_spaOTsc_prop <- pred_spaOTsc/rowSums(pred_spaOTsc) 103 | pred_spaOTsc2 <- read.csv("output/LIBD/Prediction151507/spaOTsc_decisionmat.csv", header = F) 104 | pred_spaOTsc_full <- cbind(pred_spaOTsc2,pred_spaOTsc_prop) 105 | names(pred_spaOTsc_full) <- paste0("V",1:9) 106 | data_spaOTsc <- Density_plot_data(pred_spaOTsc_full, "spaOTsc") 107 | 108 | 109 | pred_novosparc <- read.csv("output/LIBD/Prediction151507/novosparc_probmat.csv", header = F) 110 | pred_novosparc_prop <- pred_novosparc/rowSums(pred_novosparc) 111 | pred_novosparc2 <- read.csv("output/LIBD/Prediction151507/novosparc_decisionmat.csv", header = F) 112 | pred_novosparc_full <- cbind(pred_novosparc2,pred_novosparc_prop) 113 | names(pred_novosparc_full) <- paste0("V",1:9) 114 | data_novosparc <- Density_plot_data(pred_novosparc_full, "novosparc") 115 | 116 | 117 | data_all <- rbind(data_CeLEry, data_CeLEryn2, data_Tangram, data_Multiple, data_spaOTsc, data_novosparc) 118 | 119 | Density_plot(data_all) 120 | 121 | # 3. PredictionHodge: Other methods (Discarded) -------------------- 122 | # ## Tangram 123 | # pred_Tangram <- read.csv("output/Hodge/PredictionHodge/Tangram_probmat.csv", header = F) 124 | # 125 | # for (i in 1:6){ 126 | # Density_plot(pred_Tangram, "Tangram", i) 127 | # } 128 | # 129 | # ## ClusterBased 130 | # pred_ClusterBased <- read.csv("output/Hodge/PredictionHodge/ClusterBased_probmat.csv", header = F) 131 | # 132 | # for (i in 1:6){ 133 | # Density_plot(pred_ClusterBased, "ClusterBased", i) 134 | # } 135 | 136 | -------------------------------------------------------------------------------- /code_paper/1_LIBD/LIBDacc.R: -------------------------------------------------------------------------------- 1 | ## 0. Global Parameters and Packages 2 | 3 | library(dplyr) 4 | library(tidyr) 5 | library(ggplot2) 6 | ## 1. Data Processing 7 | 8 | # classresults <- read.csv("output/LIBD/Multiple/data_gene_All_layerv2_1e-05_n2_probmat.csv", header = F) 9 | # classresults <- read.csv("output/Hodge/layer_PreOrgv2_probmat.csv", header = F) 10 | # classresults <- read.csv("output/Hodge/data_gene_All_layerv2_1e-05_n10_probmat.csv", header = F) 11 | 12 | OverallAccSummary <- function (path) { 13 | classresults <- read.csv(path, header = F) 14 | 15 | classresults_new <- classresults %>% 16 | mutate(Type = case_when( 17 | V1 == V2 ~ "Same", 18 | abs(V1-V2) == 1 ~ "Neighbour", 19 | T ~ "Other")) 20 | 21 | summaries <- table(classresults_new$Type) 22 | 23 | exact_acc <- summaries["Same"]/sum(summaries) 24 | cat(exact_acc) 25 | Neighbor_acc <- exact_acc + summaries["Neighbour"]/sum(summaries) 26 | cat(Neighbor_acc) 27 | 28 | # for (i in 1:7) { 29 | # data_curremt <- 30 | # 31 | # } 32 | 33 | return( c(exact_acc, Neighbor_acc) ) 34 | } 35 | 36 | 37 | ordinary507 <- OverallAccSummary("output/LIBD/Prediction151507/layer_PreOrg_probmat.csv") 38 | ordinary676 <- OverallAccSummary("output/LIBD/Prediction151676/layer_PreOrg_probmat.csv") 39 | 40 | 41 | aug507 <- OverallAccSummary("output/LIBD/Prediction151507/data_gen_layer_1e-05_n2_probmat.csv") 42 | aug676 <- OverallAccSummary("output/LIBD/Prediction151676/data_gen_layer_1e-05_n2_probmat.csv") 43 | 44 | # OverallAccSummary("output/LIBD/Prediction151507/data_gen_layer_1e-05_n2_probmat.csv") 45 | # OverallAccSummary("output/LIBD/Prediction151676/data_gen_layer_1e-05_n2_probmat.csv") 46 | 47 | 48 | multiple507 <- OverallAccSummary("output/LIBDmultiple/Prediction151507/layer_PreOrgv2_probmat.csv") 49 | multiple676 <- OverallAccSummary("output/LIBDmultiple/Prediction151676/layer_PreOrgv2_probmat.csv") 50 | 51 | multipleaug507 <- OverallAccSummary("output/LIBDmultiple/Prediction151507/data_gene_All_layerv2_1e-05_n2_probmat.csv") 52 | multipleaug676 <- OverallAccSummary("output/LIBDmultiple/Prediction151676/data_gene_All_layerv2_1e-05_n2_probmat.csv") 53 | 54 | Tangram507 <- OverallAccSummary("output/LIBD/Prediction151507/Tangram_decisionmat.csv") 55 | Tangram676 <- OverallAccSummary("output/LIBD/Prediction151676/Tangram_decisionmat.csv") 56 | 57 | 58 | spaOTsc507 <- OverallAccSummary("output/LIBD/Prediction151507/spaOTsc_decisionmat.csv") 59 | spaOTsc676 <- OverallAccSummary("output/LIBD/Prediction151676/spaOTsc_decisionmat.csv") 60 | 61 | 62 | novosparc507 <- OverallAccSummary("output/LIBD/Prediction151507/novosparc_decisionmat.csv") 63 | novosparc676 <- OverallAccSummary("output/LIBD/Prediction151676/novosparc_decisionmat.csv") 64 | 65 | 66 | accuracy_table <- rbind(ordinary507, ordinary676, aug507, aug676, multiple507, multiple676, multipleaug507, multipleaug676, 67 | Tangram507, Tangram676, spaOTsc507, spaOTsc676, novosparc507, novosparc676) 68 | colnames (accuracy_table) <- c("top1", "top2") 69 | 70 | accuracy_table_long <- data.frame(accuracy_table) %>% 71 | add_rownames(var = "method") %>% 72 | pivot_longer(cols = top1:top2, names_to = "type", values_to = "accuracy") %>% 73 | mutate( tissue = gsub('[A-Za-z]+', '', method)) %>% 74 | mutate( method = gsub('[0-9]+', '', method)) %>% 75 | mutate( Scenario = case_when( 76 | (!method %in% c("multiple", "multipleaug")) & (tissue == 676) ~ 1, 77 | (!method %in% c("multiple", "multipleaug")) & (tissue == 507) ~ 2, 78 | (method %in% c("multiple", "multipleaug")) & (tissue == 676) ~ 3, 79 | (method %in% c("multiple", "multipleaug")) & (tissue == 507) ~ 4 80 | )) %>% 81 | mutate( method = factor (method, levels = unique(method), 82 | labels = c("CeLEry", "CeLEry(aug)", "CeLEry", "CeLEry(aug)", "Tangram", "spaOTsc", "novosparc") )) %>% 83 | mutate(type = factor(type, levels = unique(type), labels = c("top-1", "top-2"))) %>% 84 | mutate(Scenario = factor (Scenario, levels = c(1, 3, 2, 4), labels = paste("Scenario", c(1, 3, 2, 4)))) %>% 85 | data.frame() 86 | 87 | 88 | pdf(file = "output/LIBD/plots/LIBD_barplot.pdf", width = 9, height = 9) 89 | 90 | color_palatte <-c( "#CAE7B9", "#F3DE8A","#EB9486", "#7E7F9A", "#97A7B3") 91 | strip_color <- "#0A1D37" 92 | 93 | barplot <- ggplot(accuracy_table_long, aes(fill = method, x = type, y = accuracy)) + 94 | geom_bar(stat = "identity", position="dodge") + 95 | scale_fill_manual(values=color_palatte[c(3,2,1,4,5)]) + 96 | facet_wrap(~Scenario) + 97 | # scale_y_continuous(trans=scales::pseudo_log_trans(base = 10), breaks = c(0, 1, 10, 100, 500)) + 98 | theme_bw() + 99 | theme(text=element_text(size=25, family="URWHelvetica"), axis.text = element_text(size = 25, family="URWHelvetica"), panel.spacing = unit(1, "lines") ) + 100 | theme(strip.background =element_rect(fill=strip_color,color=strip_color))+ # #535b44 101 | theme(strip.text = element_text(colour = 'white')) + # , axis.text.x = element_text(angle = 45, vjust = 1, hjust=1) 102 | theme(panel.border = element_rect(colour = strip_color), legend.position = "bottom") + 103 | labs(fill = "Method", x = "Type", y = "Accuracy") 104 | 105 | print(barplot) 106 | 107 | dev.off() 108 | 109 | 110 | pdf(file = "output/LIBD/plots/LIBD_barplot_legend.pdf", width = 12, height = 8) 111 | 112 | print(barplot) 113 | 114 | dev.off() 115 | 116 | 117 | # ### Further exploration 118 | # 119 | # OverallAccCeLEry <- function (path, probmat, truth) { 120 | # classresults <- read.csv(paste0(path,probmat), header = F) 121 | # truthresults <- read.csv(paste0(path,truth), header = F) 122 | # names(truthresults) <- "truth" 123 | # 124 | # classresults_new <- data.frame(classresults, truth = truthresults) %>% 125 | # mutate(Type = case_when( 126 | # V1 == truth ~ "Same", 127 | # abs(V1-truth) == 1 ~ "Neighbour", 128 | # T ~ "Other")) 129 | # 130 | # summaries <- table(classresults_new$Type) 131 | # 132 | # exact_acc <- summaries["Same"]/sum(summaries) 133 | # cat(exact_acc) 134 | # Neighbor_acc <- exact_acc + summaries["Neighbour"]/sum(summaries) 135 | # cat(Neighbor_acc) 136 | # 137 | # return( c(exact_acc, Neighbor_acc) ) 138 | # } 139 | # 140 | # 141 | # OverallAccCeLEry(path = "output/LIBD/PredictionEmbd/", 142 | # probmat = "Emd_model_151673_151507_probmat.csv", 143 | # truth = "Emd_model_151673_151507_truth.csv") 144 | # 145 | # OverallAccCeLEry(path = "output/LIBD/PredictionEmbd/", 146 | # probmat = "Emd_model_151673_151676_probmat.csv", 147 | # truth = "Emd_model_151673_151676_truth.csv") 148 | -------------------------------------------------------------------------------- /code_paper/1_LIBD/README.md: -------------------------------------------------------------------------------- 1 | # Guide for LIBD study 2 | ## Pipeline 3 | 4 | preprocess.py -> 5 | CeLEry_train_Scenario1and2.py (or CeLEry_train_Scenario3and4.py) -> 6 | prediction.py 7 | 8 | 9 | ## Datasets 10 | http://spatial.libd.org/spatialLIBD/ -------------------------------------------------------------------------------- /code_paper/1_LIBD/prediction.py: -------------------------------------------------------------------------------- 1 | ## In this version of Cell Location discovEry (LIBD) we consider region of a tissue and we hold off a partial 2 | 3 | # Application to LIBD data 4 | 5 | import os,csv,re 6 | import pandas as pd 7 | import numpy as np 8 | import scanpy as sc 9 | import math 10 | 11 | from skimage import io, color 12 | from sklearn.cluster import KMeans 13 | 14 | from scipy.sparse import issparse 15 | import random, torch 16 | import warnings 17 | warnings.filterwarnings("ignore") 18 | import matplotlib.colors as clr 19 | import matplotlib.pyplot as plt 20 | import pickle 21 | 22 | #Read original data and save it to h5ad 23 | from scanpy import read_10x_h5 24 | os.chdir("SpaClusterPython") 25 | #import SpaGCN as spg 26 | import CeLEry as cel 27 | 28 | from data.LIBD.LIBD_gene_select import d_g 29 | 30 | # import tangram as tg 31 | 32 | ## 1. Data Preperation -------------------------------------------------------------------------- 33 | ### Load MouseBarin Data Section 1: Regarded as Spatial Transcriptomic Data 34 | dataSection1 = sc.read("../data/LIBD/data_151673.h5ad") 35 | dataSection2 = sc.read("../data/LIBD/data_151676.h5ad") 36 | dataSection3 = sc.read("../data/LIBD/data_151507.h5ad") 37 | 38 | # Obtain the number of counts in each layer 39 | layer_count = dataSection2.obs["Layer"].value_counts().sort_index() 40 | layer_count = dataSection3.obs["Layer"].value_counts().sort_index() 41 | 42 | 43 | ## Conduct clustering 44 | # cdata = dataSection1.copy() 45 | # cel.getGeneImg(cdata,emptypixel = 0) 46 | #cdataexpand = np.expand_dims(cdata.GeneImg, axis=1) 47 | 48 | #cdatacentral = cel.centralize(cdataexpand.copy()) 49 | #direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])] 50 | #direflat = [x.flat for x in direclust] 51 | #direflatnp = np.stack(direflat) 52 | 53 | ## implementing k-means clustering 54 | #kmeansmodel = KMeans(n_clusters=20, random_state=0) 55 | #kmeans = kmeansmodel.fit(direflatnp) 56 | #np.save("../output/LIBD/cluster.npy", kmeans.labels_) 57 | 58 | 59 | ## Calculating z-score 60 | cel.get_zscore(dataSection1) 61 | cel.get_zscore(dataSection2) 62 | cel.get_zscore(dataSection3) 63 | 64 | class_num = 7 65 | 66 | ## 2*. Test (layer ordinal logistic regression) -------------------------------------------------------------------------- 67 | 68 | def report_prop_method_LIBD (folder, tissueID, name, dataSection2, traindata, Val_loader, coloruse, outname = ""): 69 | """ 70 | Report the results of the proposed methods in comparison to the other method 71 | :folder: string: specified the folder that keep the proposed DNN method 72 | :name: string: specified the name of the DNN method, also will be used to name the output files 73 | :dataSection2: AnnData: the data of Section 2 74 | :traindata: AnnData: the data used in training data. This is only needed for compute SSIM 75 | :Val_loader: Dataload: the validation data from dataloader 76 | :outname: string: specified the name of the output, default is the same as the name 77 | :ImageSec2: Numpy: the image data that are refering to 78 | """ 79 | if outname == "": 80 | outname = name 81 | filename2 = "{folder}/{name}.obj".format(folder = folder, name = name) 82 | filehandler = open(filename2, 'rb') 83 | DNNmodel = pickle.load(filehandler) 84 | # 85 | coords_predict = np.zeros(dataSection2.obs.shape[0]) 86 | payer_prob = np.zeros((dataSection2.obs.shape[0],class_num+2)) 87 | for i, img in enumerate(Val_loader): 88 | recon = DNNmodel(img) 89 | logitsvalue = np.squeeze(torch.sigmoid(recon[0]).detach().numpy(), axis = 0) 90 | if (logitsvalue[class_num-2] == 1): 91 | coords_predict[i] = class_num 92 | payer_prob[i,(class_num + 1)] = 1 93 | else: 94 | logitsvalue_min = np.insert(logitsvalue, 0, 1, axis=0) 95 | logitsvalue_max = np.insert(logitsvalue_min, class_num, 0, axis=0) 96 | prb = np.diff(logitsvalue_max) 97 | # prbfull = np.insert(-prb[0], 0, 1 -logitsvalue[0,0], axis=0) 98 | prbfull = -prb.copy() 99 | coords_predict[i] = np.where(prbfull == prbfull.max())[0].max() + 1 100 | payer_prob[i,2:] = prbfull 101 | # 102 | dataSection2.obs["pred_layer"] = coords_predict.astype(int) 103 | payer_prob[:,0] = dataSection2.obs["Layer"] 104 | payer_prob[:,1] = dataSection2.obs["pred_layer"] 105 | dataSection2.obs["pred_layer_str"] = coords_predict.astype(int).astype('str') 106 | cel.plot_layer(adata = dataSection2, folder = "{folder}{tissueID}".format(folder = folder, tissueID = tissueID), name = name, coloruse = coloruse) 107 | cel.plot_confusion_matrix ( referadata = dataSection2, filename = "{folder}{tissueID}/{name}conf_mat_fig".format(folder = folder, tissueID = tissueID, name = name)) 108 | np.savetxt("{folder}{tissueID}/{name}_probmat.csv".format(folder = folder, tissueID = tissueID, name = name), payer_prob, delimiter=',') 109 | 110 | 111 | 112 | 113 | def Evaluate (testdata, tissueID, traindata, beta, nrep, coloruse = None): 114 | ## Wrap up Validation data in to dataloader 115 | vdatax = np.expand_dims(testdata.X, axis = 0) 116 | vdata_rs = np.swapaxes(vdatax, 1, 2) 117 | DataVal = cel.wrap_gene_layer(vdata_rs, testdata.obs, "Layer") 118 | Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4) 119 | # 120 | report_prop_method_LIBD(folder = "../output/LIBD/Prediction", tissueID = tissueID, 121 | name = "data_gen_layer_{beta}_n{nrep}".format(beta = beta, nrep = nrep), 122 | dataSection2 = testdata, traindata = traindata, 123 | Val_loader = Val_loader, coloruse = coloruse) 124 | 125 | 126 | Evaluate(testdata = dataSection2, tissueID = 151676, traindata = dataSection1, beta = 1e-5, nrep = 2) 127 | Evaluate(testdata = dataSection2, tissueID = 151676, traindata = dataSection1, beta = 1e-5, nrep = 4) 128 | Evaluate(testdata = dataSection2, tissueID = 151676, traindata = dataSection1, beta = 1e-5, nrep = 6) 129 | Evaluate(testdata = dataSection2, tissueID = 151676, traindata = dataSection1, beta = 1e-5, nrep = 8) 130 | Evaluate(testdata = dataSection2, tissueID = 151676, traindata = dataSection1, beta = 1e-5, nrep = 10) 131 | 132 | 133 | Evaluate(testdata = dataSection3, tissueID = 151507, traindata = dataSection1, beta = 1e-5, nrep = 2) 134 | Evaluate(testdata = dataSection3, tissueID = 151507, traindata = dataSection1, beta = 1e-5, nrep = 4) 135 | Evaluate(testdata = dataSection3, tissueID = 151507, traindata = dataSection1, beta = 1e-5, nrep = 6) 136 | Evaluate(testdata = dataSection3, tissueID = 151507, traindata = dataSection1, beta = 1e-5, nrep = 8) 137 | Evaluate(testdata = dataSection3, tissueID = 151507, traindata = dataSection1, beta = 1e-5, nrep = 10) 138 | 139 | 140 | def EvaluateOrg (testdata, tissueID, traindata, coloruse = None): 141 | ## Wrap up Validation data in to dataloader 142 | vdatax = np.expand_dims(testdata.X, axis = 0) 143 | vdata_rs = np.swapaxes(vdatax, 1, 2) 144 | DataVal = cel.wrap_gene_layer(vdata_rs, testdata.obs, "Layer") 145 | Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4) 146 | # 147 | report_prop_method_LIBD(folder = "../output/LIBD/Prediction", tissueID = tissueID, 148 | name = "layer_PreOrg", 149 | dataSection2 = testdata, traindata = traindata, 150 | Val_loader = Val_loader, coloruse = coloruse) 151 | 152 | EvaluateOrg (testdata = dataSection2, tissueID = 151676, traindata = dataSection1) 153 | EvaluateOrg (testdata = dataSection3, tissueID = 151507, traindata = dataSection1) 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /code_paper/1_LIBD/preprocess.py: -------------------------------------------------------------------------------- 1 | ### Datasets of this study can be download from http://spatial.libd.org/spatialLIBD/ 2 | 3 | 4 | import os,csv,re 5 | import pandas as pd 6 | import numpy as np 7 | import scanpy as sc 8 | import math 9 | from skimage import io, color 10 | 11 | from scipy.sparse import issparse 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | import matplotlib.colors as clr 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | #Read original data and save it to h5ad 19 | from scanpy import read_10x_h5 20 | import CeLEry as cel 21 | from data.LIBD.LIBD_gene_select import d_g 22 | import json 23 | 24 | 25 | 26 | 27 | 28 | ### ------------------------------------------------------------------------------------------------------- ### 29 | ### Process the genelist 30 | ### ------------------------------------------------------------------------------------------------------- ### 31 | def get_LIBD_top_DEgenes (studyID): 32 | """ 33 | Preprocess the spatial transcriptomic raw data and obtain the optimal DE genes 34 | Parameters 35 | ----------- 36 | studyID : string. the study ID of the LIBD datasets 37 | Returns 38 | ----------- 39 | gene_topDE_list: the list of gene set that contains the highest DE genes between layers 40 | """ 41 | adata = read_10x_h5("../data/LIBD/{studyID}/{studyID}_raw_feature_bc_matrix.h5".format(studyID = studyID)) 42 | spatial = pd.read_csv("../data/LIBD/{studyID}/tissue_positions_list.txt".format(studyID = studyID),sep=",", header = None, na_filter = False, index_col = 0) 43 | adata.obs["x1"] = spatial[1] 44 | adata.obs["x2"] = spatial[2] 45 | adata.obs["x3"] = spatial[3] 46 | # Select captured samples 47 | adata = adata[adata.obs["x1"] == 1] 48 | adata.var_names = [i.upper() for i in list(adata.var_names)] 49 | adata.var["genename"] = adata.var.index.astype("str") 50 | # 51 | del adata.obs["x1"] 52 | # 53 | adata.obs["Layer"] = 0 54 | LayerName =["L1","L2","L3","L4","L5","L6","WM"] # 55 | for i in range(7): 56 | Layer = pd.read_csv("../data/LIBD/{studyID}/{studyID}_{Lname}_barcodes.txt".format(studyID = studyID, Lname = LayerName[i]), sep=",", header = None, na_filter = False, index_col = 0) 57 | adata.obs.loc[Layer.index, "Layer"] = int(i+1) 58 | adata.obs.loc[Layer.index, "Layer_character"] = LayerName[i] 59 | data = adata[adata.obs["Layer"]!=0] # Newly added on May 25 #Remove the spots without any layer label 60 | # 61 | # Preprocessing 62 | adata.var_names_make_unique() 63 | cel.prefilter_genes(adata,min_cells=3) # avoiding all genes are zeros 64 | cel.prefilter_specialgenes(adata) 65 | #Normalize and take log for UMI------- 66 | sc.pp.normalize_per_cell(adata) 67 | sc.pp.log1p(adata) 68 | # 69 | sc.tl.rank_genes_groups(adata, 'Layer_character', method = 'wilcoxon', key_added = "wilcoxon") 70 | # sc.pl.rank_genes_groups(adata, n_genes = 200, sharey = False, key="wilcoxon", save = '{studyID}.pdf'.format(studyID = studyID)) 71 | gene_topDE_list = [] 72 | for layer_i in ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'WM']: 73 | gene_rank = sc.get.rank_genes_groups_df (adata, group = layer_i, key = 'wilcoxon') 74 | top_gene_list = list(gene_rank["names"].iloc[0:200]) 75 | gene_topDE_list.append( top_gene_list ) 76 | return gene_topDE_list 77 | 78 | genelist73 = get_LIBD_top_DEgenes (151673) 79 | genelist74 = get_LIBD_top_DEgenes (151674) 80 | genelist75 = get_LIBD_top_DEgenes (151675) 81 | 82 | 83 | # Get the gene list from the pre-screening 84 | genelistlist = genelist73 + genelist74 + genelist75 85 | genelist = sum(genelistlist, []) # merge the list of lists 86 | genelistuni = list( dict.fromkeys(genelist) ) # remove duplicates 87 | 88 | 89 | 90 | 91 | ### ------------------------------------------------------------------------------------------------------- ### 92 | ### Preprocessing for spatial transcriptomics data 93 | ### ------------------------------------------------------------------------------------------------------- ### 94 | 95 | def Preprocess_SpTrans (studyID): 96 | """ 97 | Preprocess the spatial transcriptomic raw data and obtain the optimal DE genes 98 | Parameters: 99 | studyID (string): the study ID of the LIBD datasets 100 | Returns: 101 | gene_topDE_list (list): the list of gene set that contains the highest DE genes between layers 102 | """ 103 | # Read in the spatial transcriptomic data from a 10x Genomics-formatted HDF5 file 104 | adata = sc.read_10x_h5(f"../data/LIBD/{studyID}/{studyID}_raw_feature_bc_matrix.h5") 105 | 106 | # Read in the spatial coordinates of the tissue samples from a CSV file 107 | spatial = pd.read_csv(f"../data/LIBD/{studyID}/tissue_positions_list.txt", sep=",", header=None, na_filter=False, index_col=0) 108 | 109 | # Add the spatial coordinates to the data object 110 | adata.obs["x1"] = spatial[1] 111 | adata.obs["x2"] = spatial[2] 112 | adata.obs["x3"] = spatial[3] 113 | 114 | # Select only the samples that were captured 115 | adata = adata[adata.obs["x1"] == 1] 116 | 117 | # Ensure that all gene names are uppercase 118 | adata.var_names = [i.upper() for i in list(adata.var_names)] 119 | 120 | # Add the gene names as a column in the `var` attribute of the `adata` object 121 | adata.var["genename"] = adata.var.index.astype("str") 122 | 123 | # Remove the "x1" column from the `obs` attribute of the `adata` object 124 | del adata.obs["x1"] 125 | 126 | # Add a "Layer" column to the `obs` attribute of the `adata` object and initialize it to 0 127 | adata.obs["Layer"] = 0 128 | 129 | # Define a list of layer names 130 | LayerName =["L1", "L2", "L3", "L4", "L5", "L6", "WM"] 131 | 132 | # Loop through each layer and add the corresponding layer number to the `Layer` column of the `adata` object 133 | # Also add the layer name as a separate column for convenience 134 | for i in range(7): 135 | Layer = pd.read_csv(f"../data/LIBD/{studyID}/{studyID}_{LayerName[i]}_barcodes.txt", sep=",", header=None, na_filter=False, index_col=0) 136 | adata.obs.loc[Layer.index, "Layer"] = int(i+1) 137 | adata.obs.loc[Layer.index, "Layer_character"] = LayerName[i] 138 | 139 | # Remove the spots without any layer label 140 | data = adata[adata.obs["Layer"] != 0] 141 | 142 | # Make the gene names unique 143 | adata.var_names_make_unique() 144 | 145 | # Filter out genes that are expressed in fewer than 3 cells 146 | cel.prefilter_genes(adata, min_cells=3) 147 | 148 | # Filter out special genes (e.g. mitochondrial genes) 149 | cel.prefilter_specialgenes(adata) 150 | 151 | # Normalize the data by cell and take the log of the UMI counts 152 | sc.pp.normalize_per_cell(adata) 153 | sc.pp.log1p(adata) 154 | # 155 | sc.tl.rank_genes_groups(adata, 'Layer_character', method = 'wilcoxon', key_added = "wilcoxon") 156 | sc.pl.rank_genes_groups(adata, n_genes = 50, sharey = False, key="wilcoxon", save = '{studyID}.pdf'.format(studyID = studyID)) 157 | # 158 | # Filter the Genes that are selected by SpaGCN 159 | genename = adata.var['genename'] 160 | genelistindex = [genename[genename == i].index[0] for i in genelistuni if len(genename[genename == i])>0] # only keep the genes that exists in SpT data 161 | # Filter the genelist and output the results 162 | bdata = adata[:,genelistindex] 163 | cdata = sc.AnnData(X = bdata.X.toarray(), obs = bdata.obs, var = bdata.var, uns =bdata.uns, obsm = bdata.obsm) 164 | cdata.write_h5ad("../data/LIBD/data_{studyID}.h5ad".format(studyID = studyID)) 165 | return genelistindex 166 | 167 | ## Training Data 168 | genelistuni = Preprocess_SpTrans(151673) 169 | genelistuni = Preprocess_SpTrans(151674) 170 | genelistuni = Preprocess_SpTrans(151675) 171 | genelistuni = Preprocess_SpTrans(151676) 172 | genelistuni = Preprocess_SpTrans(151507) 173 | 174 | Preprocess_SpTrans(151673) 175 | Preprocess_SpTrans(151674) 176 | Preprocess_SpTrans(151675) 177 | 178 | 179 | ## Testing Data 180 | Preprocess_SpTrans(151676) 181 | Preprocess_SpTrans(151507) 182 | 183 | 184 | ### ------------------------------------------------------------------------------------------------------- ### 185 | ### Create a merged data set for 73, 74 and 75 186 | ### ------------------------------------------------------------------------------------------------------- ### 187 | 188 | dataSection1 = sc.read("../data/LIBD/data_{studyID}.h5ad".format(studyID = 151673)) 189 | print(dataSection1) 190 | dataSection2 = sc.read("../data/LIBD/data_{studyID}.h5ad".format(studyID = 151674)) 191 | print(dataSection2) 192 | dataSection3 = sc.read("../data/LIBD/data_{studyID}.h5ad".format(studyID = 151675)) 193 | print(dataSection3) 194 | 195 | dataSection = dataSection1.concatenate(dataSection2, dataSection3) 196 | print(dataSection) 197 | 198 | dataSection.write_h5ad("../data/LIBD/MergeTrains737475.h5ad") 199 | 200 | 201 | 202 | datakankan = sc.read("../data/LIBD/MergeTrains737475.h5ad") 203 | 204 | 205 | 206 | 207 | dataSection1 = sc.read("../data/LIBD/data_{studyID}.h5ad".format(studyID = 151673)) 208 | print(dataSection1) 209 | 210 | 211 | dataSection4 = sc.read("../data/LIBD/data_{studyID}.h5ad".format(studyID = 151676)) 212 | dataSection4.obs.to_csv ("../data/LIBD/visualization_151676.csv", sep = ",") 213 | 214 | dataSection5 = sc.read("../data/LIBD/data_{studyID}.h5ad".format(studyID = 151507)) 215 | dataSection5.obs.to_csv ("../data/LIBD/visualization_151507.csv", sep = ",") 216 | 217 | print(dataSection4) 218 | -------------------------------------------------------------------------------- /code_paper/2_Alzheimer/README.md: -------------------------------------------------------------------------------- 1 | # Guide for Alzheimer's disease study 2 | ## Pipeline 3 | 4 | preprocess.py -> 5 | CeLEry_train_.py -> 6 | test.py 7 | 8 | 9 | ## Datasets 10 | Data for training: http://spatial.libd.org/spatialLIBD/ 11 | Quary data: https://upenn.app.box.com/s/e8nf4b384s7oi3o09pj5s8jfdu11swim -------------------------------------------------------------------------------- /code_paper/2_Alzheimer/test.py: -------------------------------------------------------------------------------- 1 | ## In this study, we use LIBD data as the training set and evaluate the performance of the results on the Alzheimer data 2 | 3 | import os,csv,re 4 | import pandas as pd 5 | import numpy as np 6 | import scanpy as sc 7 | import math 8 | 9 | from skimage import io, color 10 | from sklearn.cluster import KMeans 11 | 12 | from scipy.sparse import issparse 13 | import random, torch 14 | import warnings 15 | warnings.filterwarnings("ignore") 16 | import matplotlib.colors as clr 17 | import matplotlib.pyplot as plt 18 | import pickle 19 | 20 | #Read original data and save it to h5ad 21 | from scanpy import read_10x_h5 22 | os.chdir("SpaClusterPython") 23 | #import SpaGCN as spg 24 | import CeLEry as cel 25 | from sklearn.decomposition import PCA 26 | from sklearn.manifold import TSNE 27 | 28 | # from data.LIBD.LIBD_gene_select import d_g 29 | 30 | # import tangram as tg 31 | 32 | ## 1. Data Preperation -------------------------------------------------------------------------- 33 | ### Load MouseBarin Data Section 1: Regarded as Spatial Transcriptomic Data 34 | dataSection1 = sc.read("../data/Alzheimer/MergeTrains73747576.h5ad") 35 | dataSection2 = sc.read("../data/Alzheimer/Alzheimer_spa_DE_snRNA_py.h5ad") 36 | 37 | ## Conduct clustering 38 | #cdata = dataSection1.copy() 39 | #cel.getGeneImg(cdata,emptypixel = 0) 40 | #cdataexpand = np.expand_dims(cdata.GeneImg, axis=1) 41 | 42 | #cdatacentral = cel.centralize(cdataexpand.copy()) 43 | #direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])] 44 | #direflat = [x.flat for x in direclust] 45 | #direflatnp = np.stack(direflat) 46 | 47 | ## implementing k-means clustering 48 | #kmeansmodel = KMeans(n_clusters=20, random_state=0) 49 | #kmeans = kmeansmodel.fit(direflatnp) 50 | #np.save("../output/Alzheimer/cluster.npy", kmeans.labels_) 51 | 52 | 53 | ## Calculating z-score 54 | cel.get_zscore(dataSection1) 55 | cel.get_zscore(dataSection2) 56 | 57 | ## global parameters 58 | class_num = 7 59 | pca = PCA(n_components=50) 60 | 61 | 62 | ## Compute PCA of cells 63 | principalComponents = pca.fit_transform(dataSection2.X) 64 | PCs = ['PC_{i}'.format(i=i) for i in range(1,51)] 65 | principalDf = pd.DataFrame(data = principalComponents, columns = PCs) 66 | principalDf.to_csv("../output/Alzheimer/PCA_selectedGenes.csv") 67 | 68 | 69 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300) 70 | tsne_results = tsne.fit_transform(principalDf) 71 | tSNEDf = pd.DataFrame(data = tsne_results) 72 | tSNEDf.to_csv("../output/Alzheimer/tSNEDf.csv") 73 | 74 | ## PCA of subcategories 75 | # neuron 76 | cellneuron = dataSection2[[(i in ["In", "Ex"]) for i in dataSection2.obs["final_celltype"] ] ] 77 | principalComponents = pca.fit_transform(cellneuron.X) 78 | PCs = ['PC_{i}'.format(i=i) for i in range(1,51)] 79 | principalDf = pd.DataFrame(data = principalComponents, columns = PCs) 80 | principalDf["names"] = cellneuron.obs["cellname"] 81 | principalDf.to_csv("../output/Alzheimer/PCA_neuron.csv") 82 | 83 | 84 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300) 85 | tsne_results = tsne.fit_transform(principalDf) 86 | tSNEDf = pd.DataFrame(data = tsne_results) 87 | tSNEDf.to_csv("../output/Alzheimer/tSNEDf_neuron.csv") 88 | 89 | # oli 90 | celloli = dataSection2[dataSection2.obs["final_celltype"] == "Oli" ] 91 | principalComponents = pca.fit_transform(celloli.X) 92 | PCs = ['PC_{i}'.format(i=i) for i in range(1,51)] 93 | principalDf = pd.DataFrame(data = principalComponents, columns = PCs) 94 | principalDf.to_csv("../output/Alzheimer/PCA_oli.csv") 95 | 96 | 97 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300) 98 | tsne_results = tsne.fit_transform(principalDf) 99 | tSNEDf = pd.DataFrame(data = tsne_results) 100 | tSNEDf.to_csv("../output/Alzheimer/tSNEDf_oli.csv") 101 | 102 | 103 | ## 2**. Test (layer 2-Stage ordinal logistic regression ) -------------------------------------------------------------------------- 104 | 105 | 106 | def report_prop_method_Alzheimer (folder, name, dataSection2, traindata, Val_loader, outname = ""): 107 | """ 108 | Report the results of the proposed methods in comparison to the other method 109 | :folder: string: specified the folder that keep the proposed DNN method 110 | :name: string: specified the name of the DNN method, also will be used to name the output files 111 | :dataSection2: AnnData: the data of Section 2 112 | :traindata: AnnData: the data used in training data. This is only needed for compute SSIM 113 | :Val_loader: Dataload: the validation data from dataloader 114 | :outname: string: specified the name of the output, default is the same as the name 115 | :ImageSec2: Numpy: the image data that are refering to 116 | """ 117 | if outname == "": 118 | outname = name 119 | filename2 = "{folder}/{name}.obj".format(folder = folder, name = name) 120 | filehandler = open(filename2, 'rb') 121 | DNNmodel = pickle.load(filehandler) 122 | # 123 | coords_predict = np.zeros(dataSection2.obs.shape[0]) 124 | payer_prob = np.zeros((dataSection2.obs.shape[0],class_num+1)) 125 | for i, img in enumerate(Val_loader): 126 | recon = DNNmodel(img) 127 | logitsvalue = np.squeeze(torch.sigmoid(recon[0]).detach().numpy(), axis = 0) 128 | if (logitsvalue[class_num-2] == 1): 129 | coords_predict[i] = class_num 130 | payer_prob[i,(class_num + 1)] = 1 131 | else: 132 | logitsvalue_min = np.insert(logitsvalue, 0, 1, axis=0) 133 | logitsvalue_max = np.insert(logitsvalue_min, class_num, 0, axis=0) 134 | prb = np.diff(logitsvalue_max) 135 | # prbfull = np.insert(-prb[0], 0, 1 -logitsvalue[0,0], axis=0) 136 | prbfull = -prb.copy() 137 | coords_predict[i] = np.where(prbfull == prbfull.max())[0].max() + 1 138 | payer_prob[i,1:] = prbfull 139 | # 140 | dataSection2.obs["pred_layer"] = coords_predict.astype(int) 141 | dataSection2.obs["pred_layer_str"] = coords_predict.astype(int).astype('str') 142 | payer_prob[:,0] = dataSection2.obs["pred_layer"] 143 | np.savetxt("{folder}/{name}_probmat.csv".format(folder = folder, name = name), payer_prob, delimiter=',') 144 | sc.tl.rank_genes_groups(dataSection2, 'pred_layer_str', method = 'wilcoxon', key_added = "wilcoxon") 145 | sc.pl.rank_genes_groups(dataSection2, n_genes = 50, sharey = False, key="wilcoxon", save = 'Alzheimer_DE.pdf') 146 | 147 | 148 | def Evaluate (testdata, traindata, beta, nrep): 149 | ## Wrap up Validation data in to dataloader 150 | vdatax = np.expand_dims(testdata.X, axis = 0) 151 | vdata_rs = np.swapaxes(vdatax, 1, 2) 152 | DataVal = cel.wrap_gene_layer(vdata_rs, testdata.obs) 153 | Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4) 154 | # 155 | report_prop_method_Alzheimer(folder = "../output/Alzheimer/Prediction", 156 | name = "data_gene_All_layerv2_{beta}_n{nrep}".format(beta = beta, nrep = nrep), 157 | dataSection2 = testdata, traindata = traindata, 158 | Val_loader = Val_loader) 159 | 160 | 161 | ## Assing a dummy layers for the cells since it is not known 162 | dataSection2.obs["layer"] = 0 163 | dataSection2.obs["layer"][0:7] = [0,1,2,3,4,5,6] 164 | Evaluate(testdata = dataSection2, traindata = dataSection1, beta = 1e-5, nrep = 2) 165 | 166 | 167 | 168 | 169 | 170 | 171 | Evaluate(testdata = dataSection2, traindata = dataSection1, beta = 1e-5, nrep = 10) 172 | 173 | def EvaluateOrg (testdata, traindata, coloruse = None): 174 | ## Wrap up Validation data in to dataloader 175 | vdatax = np.expand_dims(testdata.X, axis = 0) 176 | vdata_rs = np.swapaxes(vdatax, 1, 2) 177 | DataVal = cel.wrap_gene_layer(vdata_rs, testdata.obs) 178 | Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4) 179 | # 180 | report_prop_method_Alzheimer(folder = "../output/Alzheimer/Prediction", 181 | name = "layer_PreOrgv2", 182 | dataSection2 = testdata, traindata = traindata, 183 | Val_loader = Val_loader, coloruse = coloruse) 184 | 185 | EvaluateOrg (testdata = dataSection2, traindata = dataSection1) 186 | 187 | -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/CeLEry_brain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import CeLEry as cel\n", 19 | "\n", 20 | "import os,csv,re\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import scanpy as sc\n", 24 | "import math\n", 25 | "from skimage import io, color\n", 26 | "\n", 27 | "from scipy.sparse import issparse\n", 28 | "import random, torch\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings(\"ignore\")\n", 31 | "import pickle\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from anndata import AnnData, read_h5ad\n", 34 | "\n", 35 | "import json" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0) \n", 45 | "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n", 46 | "\n", 47 | "data_merfish = AnnData(data_merfish_raw)\n", 48 | "\n", 49 | "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n", 50 | "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n", 51 | "data_merfish_raw = data_merfish.copy()\n", 52 | "\n", 53 | "sc.pp.filter_cells(data_merfish, min_counts=500)\n", 54 | "sc.pp.filter_cells(data_merfish, min_genes=100)\n", 55 | "\n", 56 | "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n", 57 | "sc.tl.louvain(data_merfish, 0.4, random_state=1)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n", 67 | "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "cel.get_zscore(Qdata)\n", 77 | "cel.get_zscore(Rdata)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "100%|██████████| 2293/2293 [00:08<00:00, 284.61it/s]\n" 90 | ] 91 | }, 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Epoch:1, Loss:75.2848\n" 97 | ] 98 | }, 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "100%|██████████| 2293/2293 [00:07<00:00, 311.46it/s]" 104 | ] 105 | }, 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Epoch:2, Loss:35.4318\n", 111 | "Folder already exists\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "## right brain as training, left brain as testing\n", 124 | "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [100, 50, 20], num_epochs_max = 2000, path = \"output/brain\", filename = \"brain_left\")" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 9, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "pred_cord = cel.Predict_cord (data_test = Qdata, path = \"output/brain\", filename = \"brain_left\")\n", 134 | "pred_cord[:,0] = 1-pred_cord[:,0]\n", 135 | "\n", 136 | "data_train = Qdata.copy()\n", 137 | "traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n", 138 | "tdatax = np.expand_dims(traindata, axis = 0)\n", 139 | "tdata_rs = np.swapaxes(tdatax, 1, 2)\n", 140 | "test_cord = cel.wrap_gene_location(tdata_rs, data_train.obs[['x_cord', 'y_cord']])\n", 141 | "\n", 142 | "pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n", 143 | "pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n", 144 | "pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n", 145 | "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n", 146 | "pred_cord_transform.index = Qdata.obs.index\n", 147 | "pred_cord_transform.columns = ['x', 'y']\n", 148 | "\n", 149 | "pred_cord_transform.to_csv(\"output/brain/celery_brain_left.csv\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 51, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "## left brain as training, right brain as testing\n", 159 | "model_train = cel.Fit_cord (data_train = Qdata, hidden_dims = [100, 50, 20], num_epochs_max = 2000, path = \"output/brain\", filename = \"brain_right\")" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 57, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "pred_cord = cel.Predict_cord (data_test = Qdata, path = \"output/brain\", filename = \"brain_right\")\n", 169 | "pred_cord[:,0] = 1-pred_cord[:,0]\n", 170 | "\n", 171 | "data_train = Rdata.copy()\n", 172 | "traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n", 173 | "tdatax = np.expand_dims(traindata, axis = 0)\n", 174 | "tdata_rs = np.swapaxes(tdatax, 1, 2)\n", 175 | "test_cord = cel.wrap_gene_location(tdata_rs, data_train.obs[['x_cord', 'y_cord']])\n", 176 | "\n", 177 | "pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n", 178 | "pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n", 179 | "pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n", 180 | "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n", 181 | "pred_cord_transform.index = Rdata.obs.index\n", 182 | "pred_cord_transform.columns = ['x', 'y']\n", 183 | "\n", 184 | "pred_cord_transform.to_csv(\"output/brain/celery_brain_right.csv\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3.8.15 64-bit", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]" 212 | }, 213 | "orig_nbformat": 4, 214 | "vscode": { 215 | "interpreter": { 216 | "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608" 217 | } 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 2 222 | } 223 | -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/CeLEry_figure 6_scenario 2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 19, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import CeLEry as cel\n", 10 | "\n", 11 | "import os\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "import scanpy as sc\n", 15 | "import scipy\n", 16 | "\n", 17 | "from scipy.sparse import issparse\n", 18 | "from anndata import concat\n", 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")\n", 21 | "from anndata import AnnData\n", 22 | "from tqdm import tqdm" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stderr", 32 | "output_type": "stream", 33 | "text": [ 34 | "100%|██████████| 2/2 [00:00<00:00, 4.81it/s]\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "d11 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0) \n", 40 | "d11_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n", 41 | "d12 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_by_gene_S1R2.csv\", index_col=0) \n", 42 | "d12_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_metadata_S1R2.csv\", index_col=0)\n", 43 | "\n", 44 | "d11 = AnnData(d11)\n", 45 | "d11.obs['x_cord'] = d11_meta['center_x'].tolist()\n", 46 | "d11.obs['y_cord'] = d11_meta['center_y'].tolist()\n", 47 | "d11.obs['source'] = \"S1R1\"\n", 48 | "\n", 49 | "d12 = AnnData(d12)\n", 50 | "d12.obs['x_cord'] = d12_meta['center_x'].tolist()\n", 51 | "d12.obs['y_cord'] = d12_meta['center_y'].tolist()\n", 52 | "d12.obs['source'] = \"S1R2\"\n", 53 | "\n", 54 | "data = [d11, d12]\n", 55 | "for d in tqdm(data):\n", 56 | " sc.pp.filter_cells(d, min_counts=500)\n", 57 | " sc.pp.filter_cells(d, min_genes=100)\n", 58 | "\n", 59 | "d_tot = concat([d11, d12])\n", 60 | "sc.pp.neighbors(d_tot, n_neighbors = 15, use_rep=\"X\")\n", 61 | "sc.tl.louvain(d_tot, 0.4, random_state=1)\n", 62 | "\n", 63 | "d11 = d_tot[d_tot.obs['source'] == \"S1R1\"].copy()\n", 64 | "d12 = d_tot[d_tot.obs['source'] == \"S1R2\"].copy()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "In this scenario, We choose the right half of replicates S1R2 as training set, and predict locations of S1R1 cells.\n", 72 | "\n", 73 | "The cutting line for separating right half of S1R2 is x*6/11 + 2436.36 - y = 0. It is mannually defined, a rough separation." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "d12_right = d12[d12.obs['x_cord']*6/11 + 2436.36 - d12.obs['y_cord'] > 0].copy()\n", 83 | "\n", 84 | "d11_left = d11[d11.obs['x_cord'] < np.quantile(d11.obs['x_cord'], 0.5)].copy()\n", 85 | "d11_right = d11[d11.obs['x_cord'] >= np.quantile(d11.obs['x_cord'], 0.5)].copy()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "Rdata = d12_right.copy()\n", 95 | "\n", 96 | "cel.get_zscore(Rdata)\n", 97 | "Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]\n", 98 | "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [150, 100, 70, 50, 20], num_epochs_max = 500, path = \"output/fig6\", filename = \"fig6_2\")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 12, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "def pred_transform(pred_cord):\n", 108 | " data_train = Rdata.copy()\n", 109 | " traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n", 110 | " tdatax = np.expand_dims(traindata, axis = 0)\n", 111 | " tdata_rs = np.swapaxes(tdatax, 1, 2)\n", 112 | " test_cord = cel.wrap_gene_location(tdata_rs, Rdata.obs[['x_cord', 'y_cord']])\n", 113 | "\n", 114 | " pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n", 115 | " pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n", 116 | " pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n", 117 | " return pred_cord_transform" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "cel.get_zscore(d11_left)\n", 127 | "cel.get_zscore(d11_right)\n", 128 | "\n", 129 | "pred_cord_left = cel.Predict_cord (data_test = d11_left, path = \"output/fig6\", filename = \"fig6_2\")\n", 130 | "pred_cord_transform_left = pred_transform(pred_cord_left)\n", 131 | "\n", 132 | "pred_cord_right = cel.Predict_cord (data_test = d11_right, path = \"output/fig6\", filename = \"fig6_2\")\n", 133 | "pred_cord_transform_right = pred_transform(pred_cord_right)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "os.makedirs(\"output/fig6\", exist_ok=True)\n", 143 | "np.save(\"output/fig6/fig6_2_left_celery.npy\", pred_cord_transform_left)\n", 144 | "np.save(\"output/fig6/fig6_2_right_celery.npy\", pred_cord_transform_right)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Till now, it is fine to compare the correlation between true and predicted pairwise distance." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "d11_left.obs['x_celery'] = pred_cord_transform_left[:,0]\n", 161 | "d11_left.obs['y_celery'] = pred_cord_transform_left[:,1]\n", 162 | "\n", 163 | "d11_right.obs['x_celery'] = pred_cord_transform_right[:,0]\n", 164 | "d11_right.obs['y_celery'] = pred_cord_transform_right[:,1]\n", 165 | "\n", 166 | "def distCompute(data_merfish):\n", 167 | " celery_dist = []\n", 168 | " true_dist = []\n", 169 | " Qdata_loc = np.array(data_merfish.obs[['x_cord', 'y_cord']])\n", 170 | " celery_pred = np.array(data_merfish.obs[['x_celery', 'y_celery']])\n", 171 | "\n", 172 | " for i in tqdm(range(Qdata_loc.shape[0])):\n", 173 | " celery_i = celery_pred[i, :]\n", 174 | " celery_points = celery_pred[i+1:, :]\n", 175 | " celery_dist.extend(np.sqrt(np.sum((celery_points - celery_i)**2, axis=1)))\n", 176 | "\n", 177 | "\n", 178 | " true_i = Qdata_loc[i, :]\n", 179 | " true_points = Qdata_loc[i+1:, :]\n", 180 | " true_dist.extend(np.sqrt(np.sum((true_points - true_i)**2, axis=1)))\n", 181 | " return celery_dist, true_dist\n", 182 | "\n", 183 | "celery_dist, true_dist = distCompute(d11_left)\n", 184 | "celery_dist_r, true_dist_r = distCompute(d11_right)\n", 185 | "\n", 186 | "celery_dist.extend(celery_dist_r)\n", 187 | "true_dist.extend(true_dist_r)\n", 188 | "\n", 189 | "print(scipy.stats.pearsonr(true_dist, celery_dist))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "However, the predicted locations of testing set is in the domain of training set locations. If we would like to compare the Euclidean distance between true locations and predicted locations, mannually matching is required to first roughly align the domain and rotation between training set and testing set. This step is imperfect, but it can help us compare the performance of different methods within a single scenario." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 21, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "## Based on our separation line, project the predicted locations of left brain to the left side.\n", 206 | "\n", 207 | "A = 6/11\n", 208 | "C = 2436.36\n", 209 | "B = -1\n", 210 | "def pointTrans(celery_pred, left, xname, yname):\n", 211 | " x = celery_pred[:, 0]\n", 212 | " y = celery_pred[:, 1]\n", 213 | " x1 = x - 2*A*((A*x + B*y + C)/(A*A + B*B))\n", 214 | " y1 = y - 2*B*((A*x + B*y + C)/(A*A + B*B))\n", 215 | " left.obs[xname] = x1\n", 216 | " left.obs[yname] = y1\n", 217 | " # return x1, y1\n", 218 | "\n", 219 | "pointTrans(pred_cord_transform_left, d11_left, \"x_celery\", \"y_celery\")\n", 220 | "Qdata = concat([d11_left, d11_right])" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 22, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "## Mannually matching\n", 230 | "\n", 231 | "def rotateMatrix(a):\n", 232 | " return np.array([[np.cos(a), -np.sin(a)], [np.sin(a), np.cos(a)]])\n", 233 | "\n", 234 | "x0=np.quantile(d11.obs['x_cord'], 0.5)\n", 235 | "y0=5000\n", 236 | "\n", 237 | "\n", 238 | "def anim(xy, i):\n", 239 | " newxy=(xy-[x0,y0]) @ rotateMatrix(-2*i*np.pi/180) + [x0,y0]\n", 240 | " return newxy\n", 241 | "\n", 242 | "\n", 243 | "newxy = anim(np.array(Qdata.obs[['x_cord', 'y_cord']]), -30)\n", 244 | "Qdata.obs['x_rotate'] = newxy[:, 0]\n", 245 | "Qdata.obs['y_rotate'] = newxy[:, 1]\n", 246 | "Qdata.obs['y_rotate'] = Qdata.obs['y_rotate'] + 500\n", 247 | "Qdata.obs['x_rotate'] = Qdata.obs['x_rotate'] + 800" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "sq = lambda x, y: (x - y)**2\n", 257 | "pred_dist_celery = np.sqrt(np.sum(sq(np.array(Qdata.obs[['x_rotate', 'y_rotate']]), np.array(Qdata.obs[['x_celery', 'y_celery']])), axis=1))\n", 258 | "print(np.median(pred_dist_celery))" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.8.15" 286 | }, 287 | "orig_nbformat": 4, 288 | "vscode": { 289 | "interpreter": { 290 | "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608" 291 | } 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 2 296 | } 297 | -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/CeLEry_figure 6_scenario 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import CeLEry as cel\n", 10 | "\n", 11 | "import os\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "import scanpy as sc\n", 15 | "import scipy\n", 16 | "\n", 17 | "from scipy.sparse import issparse\n", 18 | "from anndata import concat\n", 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")\n", 21 | "from anndata import AnnData\n", 22 | "from tqdm import tqdm" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stderr", 32 | "output_type": "stream", 33 | "text": [ 34 | "100%|██████████| 3/3 [00:00<00:00, 4.54it/s]\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "d13 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate3_cell_by_gene_S1R3.csv\", index_col=0) \n", 40 | "d13_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate3_cell_metadata_S1R3.csv\", index_col=0)\n", 41 | "d12 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_by_gene_S1R2.csv\", index_col=0) \n", 42 | "d12_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_metadata_S1R2.csv\", index_col=0)\n", 43 | "d11 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0) \n", 44 | "d11_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n", 45 | "\n", 46 | "d11 = AnnData(d11)\n", 47 | "d11.obs['x_cord'] = d11_meta['center_x'].tolist()\n", 48 | "d11.obs['y_cord'] = d11_meta['center_y'].tolist()\n", 49 | "d11.obs['source'] = \"S1R1\"\n", 50 | "\n", 51 | "d12 = AnnData(d12)\n", 52 | "d12.obs['x_cord'] = d12_meta['center_x'].tolist()\n", 53 | "d12.obs['y_cord'] = d12_meta['center_y'].tolist()\n", 54 | "d12.obs['source'] = \"S1R2\"\n", 55 | "\n", 56 | "d13 = AnnData(d13)\n", 57 | "d13.obs['x_cord'] = d13_meta['center_x'].tolist()\n", 58 | "d13.obs['y_cord'] = d13_meta['center_y'].tolist()\n", 59 | "d13.obs['source'] = \"S1R3\"\n", 60 | "\n", 61 | "data = [d11, d12, d13]\n", 62 | "for d in tqdm(data):\n", 63 | " sc.pp.filter_cells(d, min_counts=500)\n", 64 | " sc.pp.filter_cells(d, min_genes=100)\n", 65 | "\n", 66 | "d_tot = concat([d11, d12, d13])\n", 67 | "sc.pp.neighbors(d_tot, n_neighbors = 15, use_rep=\"X\")\n", 68 | "sc.tl.louvain(d_tot, 0.2, random_state=1)\n", 69 | "\n", 70 | "d11 = d_tot[d_tot.obs['source'] == \"S1R1\"].copy()\n", 71 | "d12 = d_tot[d_tot.obs['source'] == \"S1R2\"].copy()\n", 72 | "d13 = d_tot[d_tot.obs['source'] == \"S1R3\"].copy()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "In this scenario, We choose the right halves of replicates S1R2 and S1R3 as training set, and predict locations of S1R1 cells. Since the right half of S1R2 and S1R3 are already matched, no further alignment is needed.\n", 80 | "\n", 81 | "The cutting line for separating right halves of S1R2 and S1R3 is x*6/11 + 2436.36 - y = 0. It is mannually defined, a rough separation." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "d11_left = d11[d11.obs['x_cord'] < np.quantile(d11.obs['x_cord'], 0.5)]\n", 91 | "d11_right = d11[d11.obs['x_cord'] >= np.quantile(d11.obs['x_cord'], 0.5)]\n", 92 | "\n", 93 | "d12 = d12[d12.obs['x_cord']*6/11 + 2436.36 - d12.obs['y_cord'] > 0]\n", 94 | "d13 = d13[d13.obs['x_cord']*6/11 + 2436.36 - d13.obs['y_cord'] > 0]\n", 95 | "d_training = concat([d12, d13])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "Rdata = d_training.copy()\n", 105 | "\n", 106 | "cel.get_zscore(Rdata)\n", 107 | "Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]\n", 108 | "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [200, 120, 70, 50, 20], num_epochs_max = 500, path = \"output/fig6\", filename = \"fig6_3\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def pred_transform(pred_cord):\n", 118 | " data_train = Rdata.copy()\n", 119 | " traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n", 120 | " tdatax = np.expand_dims(traindata, axis = 0)\n", 121 | " tdata_rs = np.swapaxes(tdatax, 1, 2)\n", 122 | " test_cord = cel.wrap_gene_location(tdata_rs, Rdata.obs[['x_cord', 'y_cord']])\n", 123 | "\n", 124 | " pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n", 125 | " pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n", 126 | " pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n", 127 | " return pred_cord_transform" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "cel.get_zscore(d11_left)\n", 137 | "cel.get_zscore(d11_right)\n", 138 | "\n", 139 | "pred_cord_left = cel.Predict_cord (data_test = d11_left, path = \"output/fig6\", filename = \"fig6_3\")\n", 140 | "pred_cord_transform_left = pred_transform(pred_cord_left)\n", 141 | "\n", 142 | "pred_cord_right = cel.Predict_cord (data_test = d11_right, path = \"output/fig6\", filename = \"fig6_3\")\n", 143 | "pred_cord_transform_right = pred_transform(pred_cord_right) \n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "os.makedirs(\"output/fig6\", exist_ok=True)\n", 153 | "np.save(\"output/fig6/fig6_3_left_celery.npy\", pred_cord_transform_left)\n", 154 | "np.save(\"output/fig6/fig6_3_right_celery.npy\", pred_cord_transform_right)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Till now, it is fine to compare the correlation between true and predicted pairwise distance." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "d11_left.obs['x_celery'] = pred_cord_transform_left[:,0]\n", 171 | "d11_left.obs['y_celery'] = pred_cord_transform_left[:,1]\n", 172 | "\n", 173 | "d11_right.obs['x_celery'] = pred_cord_transform_right[:,0]\n", 174 | "d11_right.obs['y_celery'] = pred_cord_transform_right[:,1]\n", 175 | "\n", 176 | "def distCompute(data_merfish):\n", 177 | " celery_dist = []\n", 178 | " true_dist = []\n", 179 | " Qdata_loc = np.array(data_merfish.obs[['x_cord', 'y_cord']])\n", 180 | " celery_pred = np.array(data_merfish.obs[['x_celery', 'y_celery']])\n", 181 | "\n", 182 | " for i in tqdm(range(Qdata_loc.shape[0])):\n", 183 | " celery_i = celery_pred[i, :]\n", 184 | " celery_points = celery_pred[i+1:, :]\n", 185 | " celery_dist.extend(np.sqrt(np.sum((celery_points - celery_i)**2, axis=1)))\n", 186 | "\n", 187 | "\n", 188 | " true_i = Qdata_loc[i, :]\n", 189 | " true_points = Qdata_loc[i+1:, :]\n", 190 | " true_dist.extend(np.sqrt(np.sum((true_points - true_i)**2, axis=1)))\n", 191 | " return celery_dist, true_dist\n", 192 | "\n", 193 | "celery_dist, true_dist = distCompute(d11_left)\n", 194 | "celery_dist_r, true_dist_r = distCompute(d11_right)\n", 195 | "\n", 196 | "celery_dist.extend(celery_dist_r)\n", 197 | "true_dist.extend(true_dist_r)\n", 198 | "\n", 199 | "print(scipy.stats.pearsonr(true_dist, celery_dist))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "However, the predicted locations of testing set is in the domain of training set locations. If we would like to compare the Euclidean distance between true locations and predicted locations, mannually matching is required to first roughly align the domain and rotation between training set and testing set. This step is imperfect, but it can help us compare the performance of different methods within a single scenario." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 8, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "## Based on our separation line, project the predicted locations of left brain to the left side.\n", 216 | "\n", 217 | "A = 6/11\n", 218 | "C = 2436.36\n", 219 | "B = -1\n", 220 | "def pointTrans(celery_pred, left, xname, yname):\n", 221 | " x = celery_pred[:, 0]\n", 222 | " y = celery_pred[:, 1]\n", 223 | " x1 = x - 2*A*((A*x + B*y + C)/(A*A + B*B))\n", 224 | " y1 = y - 2*B*((A*x + B*y + C)/(A*A + B*B))\n", 225 | " left.obs[xname] = x1\n", 226 | " left.obs[yname] = y1\n", 227 | " # return x1, y1\n", 228 | "\n", 229 | "pointTrans(pred_cord_transform_left, d11_left, \"x_celery\", \"y_celery\")\n", 230 | "Qdata = concat([d11_left, d11_right])" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 15, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "## Mannually matching\n", 240 | "\n", 241 | "def rotateMatrix(a):\n", 242 | " return np.array([[np.cos(a), -np.sin(a)], [np.sin(a), np.cos(a)]])\n", 243 | "\n", 244 | "x0=np.quantile(d11.obs['x_cord'], 0.5)\n", 245 | "y0=5000\n", 246 | "\n", 247 | "\n", 248 | "def anim(xy, i):\n", 249 | " newxy=(xy-[x0,y0]) @ rotateMatrix(-2*i*np.pi/180) + [x0,y0]\n", 250 | " return newxy\n", 251 | "\n", 252 | "\n", 253 | "newxy = anim(np.array(Qdata.obs[['x_cord', 'y_cord']]), -30)\n", 254 | "Qdata.obs['x_rotate'] = newxy[:, 0]\n", 255 | "Qdata.obs['y_rotate'] = newxy[:, 1]\n", 256 | "Qdata.obs['y_rotate'] = Qdata.obs['y_rotate'] + 500\n", 257 | "Qdata.obs['x_rotate'] = Qdata.obs['x_rotate'] + 800" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "sq = lambda x, y: (x - y)**2\n", 267 | "pred_dist_celery = np.sqrt(np.sum(sq(np.array(Qdata.obs[['x_rotate', 'y_rotate']]), np.array(Qdata.obs[['x_celery', 'y_celery']])), axis=1))\n", 268 | "print(np.median(pred_dist_celery))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [] 277 | } 278 | ], 279 | "metadata": { 280 | "kernelspec": { 281 | "display_name": "Python 3", 282 | "language": "python", 283 | "name": "python3" 284 | }, 285 | "language_info": { 286 | "codemirror_mode": { 287 | "name": "ipython", 288 | "version": 3 289 | }, 290 | "file_extension": ".py", 291 | "mimetype": "text/x-python", 292 | "name": "python", 293 | "nbconvert_exporter": "python", 294 | "pygments_lexer": "ipython3", 295 | "version": "3.8.15" 296 | }, 297 | "orig_nbformat": 4, 298 | "vscode": { 299 | "interpreter": { 300 | "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608" 301 | } 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 2 306 | } 307 | -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/README.md: -------------------------------------------------------------------------------- 1 | # Note for Brain MERFISH data 2 | 3 | Datasets is available from https://app.box.com/s/6nz5vlp0hjmuq9xruxog96p2woyt5fpm -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/SpaOTsc_brain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import os\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import scanpy as sc\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from scipy.spatial.distance import cdist, squareform, pdist\n", 24 | "from scipy.stats import ks_2samp\n", 25 | "from scipy.stats import pearsonr\n", 26 | "\n", 27 | "import os,csv,re\n", 28 | "import math\n", 29 | "from skimage import io, color\n", 30 | "\n", 31 | "from scipy.sparse import issparse\n", 32 | "import random, torch\n", 33 | "import warnings\n", 34 | "warnings.filterwarnings(\"ignore\")\n", 35 | "import pickle\n", 36 | "from sklearn.model_selection import train_test_split\n", 37 | "from anndata import AnnData, read_h5ad\n", 38 | "import seaborn as sns\n", 39 | "\n", 40 | "import json\n", 41 | "\n", 42 | "from spaotsc import SpaOTsc" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0) \n", 52 | "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n", 53 | "\n", 54 | "data_merfish = AnnData(data_merfish_raw)\n", 55 | "\n", 56 | "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n", 57 | "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n", 58 | "data_merfish_raw = data_merfish.copy()\n", 59 | "\n", 60 | "sc.pp.filter_cells(data_merfish, min_counts=500)\n", 61 | "sc.pp.filter_cells(data_merfish, min_genes=100)\n", 62 | "\n", 63 | "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n", 64 | "sc.tl.louvain(data_merfish, 0.4, random_state=1)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 6, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n", 74 | "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "## right brain as training, left brain as testing\n", 84 | "datatest = Qdata.copy()\n", 85 | "datatrain = Rdata.copy()\n", 86 | "random.seed(2021)\n", 87 | "torch.manual_seed(2021)\n", 88 | "np.random.seed(2021)\n", 89 | "## Running spaOTsc\n", 90 | "df_sc = pd.DataFrame(datatest.X)\n", 91 | "is_dmat = cdist(np.array(datatrain.obs.iloc[:,0:2]), np.array(datatrain.obs.iloc[:,0:2]), 'euclidean')\n", 92 | "sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')\n", 93 | "\n", 94 | "spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)\n", 95 | "cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')\n", 96 | "location_pred = spsc.transport_plan(cost_matrix)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "location_sum = np.sum(location_pred, axis=1)\n", 106 | "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n", 107 | "\n", 108 | "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 109 | "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n", 110 | "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n", 111 | "pred_cord_transform.index = Qdata.obs.index\n", 112 | "pred_cord_transform.columns = ['x', 'y']\n", 113 | "\n", 114 | "pred_cord_transform.to_csv(\"output/brain/spaotsc_brain_left.csv\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "## left brain as training, right brain as testing\n", 131 | "datatest = Rdata.copy()\n", 132 | "datatrain = Qdata.copy()\n", 133 | "random.seed(2021)\n", 134 | "torch.manual_seed(2021)\n", 135 | "np.random.seed(2021)\n", 136 | "## Running spaOTsc\n", 137 | "df_sc = pd.DataFrame(datatest.X)\n", 138 | "is_dmat = cdist(np.array(datatrain.obs.iloc[:,0:2]), np.array(datatrain.obs.iloc[:,0:2]), 'euclidean')\n", 139 | "sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')\n", 140 | "\n", 141 | "spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)\n", 142 | "cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')\n", 143 | "location_pred = spsc.transport_plan(cost_matrix)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "location_sum = np.sum(location_pred, axis=1)\n", 153 | "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n", 154 | "\n", 155 | "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 156 | "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n", 157 | "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n", 158 | "pred_cord_transform.index = Qdata.obs.index\n", 159 | "pred_cord_transform.columns = ['x', 'y']\n", 160 | "\n", 161 | "pred_cord_transform.to_csv(\"output/brain/spaotsc_brain_right.csv\")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3.10.8 64-bit", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]" 189 | }, 190 | "orig_nbformat": 4, 191 | "vscode": { 192 | "interpreter": { 193 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 194 | } 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/Tangram_brain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import os, sys\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import seaborn as sns\n", 23 | "import scanpy as sc\n", 24 | "import torch\n", 25 | "import tangram as tg\n", 26 | "from sklearn.model_selection import train_test_split\n", 27 | "from anndata import AnnData, read_h5ad\n", 28 | "import random" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0) \n", 38 | "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n", 39 | "\n", 40 | "data_merfish = AnnData(data_merfish_raw)\n", 41 | "\n", 42 | "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n", 43 | "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n", 44 | "data_merfish_raw = data_merfish.copy()\n", 45 | "\n", 46 | "sc.pp.filter_cells(data_merfish, min_counts=500)\n", 47 | "sc.pp.filter_cells(data_merfish, min_genes=100)\n", 48 | "\n", 49 | "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n", 50 | "sc.tl.louvain(data_merfish, 0.4, random_state=1)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "## right brain as training, left brain as testing\n", 60 | "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n", 61 | "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 7, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stderr", 71 | "output_type": "stream", 72 | "text": [ 73 | "INFO:root:649 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.\n", 74 | "INFO:root:649 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.\n", 75 | "INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.\n", 76 | "INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "tg.pp_adatas(Qdata, Rdata, genes=Rdata.var.index)\n", 82 | "\n", 83 | "assert Qdata.uns['training_genes'] == Rdata.uns['training_genes']" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 11, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | "INFO:root:Allocate tensors for mapping.\n", 96 | "INFO:root:Begin training with 649 genes and rna_count_based density_prior in cells mode...\n", 97 | "INFO:root:Printing scores every 100 epochs.\n" 98 | ] 99 | }, 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Score: 0.260, KL reg: 0.043\n", 105 | "Score: 0.822, KL reg: 0.002\n", 106 | "Score: 0.835, KL reg: 0.002\n", 107 | "Score: 0.838, KL reg: 0.002\n", 108 | "Score: 0.840, KL reg: 0.002\n", 109 | "Score: 0.841, KL reg: 0.002\n", 110 | "Score: 0.841, KL reg: 0.002\n", 111 | "Score: 0.842, KL reg: 0.002\n", 112 | "Score: 0.842, KL reg: 0.002\n", 113 | "Score: 0.842, KL reg: 0.002\n" 114 | ] 115 | }, 116 | { 117 | "name": "stderr", 118 | "output_type": "stream", 119 | "text": [ 120 | "INFO:root:Saving results..\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "ad_map = tg.map_cells_to_space(\n", 126 | " adata_sc=Qdata,\n", 127 | " adata_sp=Rdata,\n", 128 | " device='cpu',\n", 129 | " # device='cuda:0',\n", 130 | ")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 12, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "Rdata_location_pred = ad_map.X.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 140 | "Rdata_location_pred[:, 0] = -Rdata_location_pred[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n", 141 | "Rdata_location_pred = pd.DataFrame(Rdata_location_pred)\n", 142 | "Rdata_location_pred.index = Qdata.obs.index\n", 143 | "Rdata_location_pred.columns = ['x', 'y']\n", 144 | "\n", 145 | "Rdata_location_pred.to_csv(\"output/brain/tangram_brain_left.csv\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 20, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stderr", 162 | "output_type": "stream", 163 | "text": [ 164 | "INFO:root:649 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.\n", 165 | "INFO:root:649 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.\n", 166 | "INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.\n", 167 | "INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "## left brain as training, right brain as testing\n", 173 | "Rdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n", 174 | "Qdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]\n", 175 | "\n", 176 | "tg.pp_adatas(Qdata, Rdata, genes=Rdata.var.index)\n", 177 | "assert Qdata.uns['training_genes'] == Rdata.uns['training_genes']" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 21, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stderr", 187 | "output_type": "stream", 188 | "text": [ 189 | "INFO:root:Allocate tensors for mapping.\n", 190 | "INFO:root:Begin training with 649 genes and rna_count_based density_prior in cells mode...\n", 191 | "INFO:root:Printing scores every 100 epochs.\n" 192 | ] 193 | }, 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "Score: 0.260, KL reg: 0.046\n", 199 | "Score: 0.820, KL reg: 0.002\n", 200 | "Score: 0.833, KL reg: 0.002\n", 201 | "Score: 0.837, KL reg: 0.002\n", 202 | "Score: 0.838, KL reg: 0.002\n", 203 | "Score: 0.839, KL reg: 0.002\n", 204 | "Score: 0.840, KL reg: 0.002\n", 205 | "Score: 0.840, KL reg: 0.002\n", 206 | "Score: 0.841, KL reg: 0.002\n", 207 | "Score: 0.841, KL reg: 0.002\n" 208 | ] 209 | }, 210 | { 211 | "name": "stderr", 212 | "output_type": "stream", 213 | "text": [ 214 | "INFO:root:Saving results..\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "ad_map = tg.map_cells_to_space(\n", 220 | " adata_sc=Qdata,\n", 221 | " adata_sp=Rdata,\n", 222 | " device='cpu',\n", 223 | " # device='cuda:0',\n", 224 | ")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 22, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "Rdata_location_pred = ad_map.X.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 234 | "Rdata_location_pred[:, 0] = -Rdata_location_pred[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n", 235 | "Rdata_location_pred = pd.DataFrame(Rdata_location_pred)\n", 236 | "Rdata_location_pred.index = Qdata.obs.index\n", 237 | "Rdata_location_pred.columns = ['x', 'y']\n", 238 | "\n", 239 | "Rdata_location_pred.to_csv(\"output/brain/tangram_brain_right.csv\")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 3.8.15 64-bit", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]" 267 | }, 268 | "orig_nbformat": 4, 269 | "vscode": { 270 | "interpreter": { 271 | "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608" 272 | } 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 2 277 | } 278 | -------------------------------------------------------------------------------- /code_paper/4_Mouse_brain_MERFISH/novoSpaRc_brain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import novosparc as ns\n", 19 | "\n", 20 | "import os\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import scanpy as sc\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import altair as alt\n", 26 | "from scipy.spatial.distance import cdist, squareform, pdist\n", 27 | "from scipy.stats import ks_2samp\n", 28 | "from scipy.stats import pearsonr\n", 29 | "\n", 30 | "import random\n", 31 | "random.seed(0)\n", 32 | "\n", 33 | "from skimage import io, color\n", 34 | "import torch\n", 35 | "from torch.nn import functional as F\n", 36 | "import json\n", 37 | "\n", 38 | "from sklearn.model_selection import train_test_split\n", 39 | "from anndata import AnnData, read_h5ad\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "import seaborn as sns" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stderr", 51 | "output_type": "stream", 52 | "text": [ 53 | "/var/folders/mc/kqfjr86j5gz9cdyb9w1kfhn40000gp/T/ipykernel_34616/3711195478.py:5: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", 54 | " data_merfish = AnnData(data_merfish_raw)\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0) \n", 60 | "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n", 61 | "\n", 62 | "data_merfish = AnnData(data_merfish_raw)\n", 63 | "\n", 64 | "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n", 65 | "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n", 66 | "data_merfish_raw = data_merfish.copy()\n", 67 | "\n", 68 | "sc.pp.filter_cells(data_merfish, min_counts=500)\n", 69 | "sc.pp.filter_cells(data_merfish, min_genes=100)\n", 70 | "\n", 71 | "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n", 72 | "sc.tl.louvain(data_merfish, 0.4, random_state=1)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n", 82 | "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "## right brain as training, left brain as testing\n", 92 | "datatrain = Rdata.copy()\n", 93 | "datatest = Qdata.copy()\n", 94 | "\n", 95 | "random.seed(2021)\n", 96 | "torch.manual_seed(2021)\n", 97 | "np.random.seed(2021)\n", 98 | "## Running novosparc\n", 99 | "locations_apriori = datatrain.obs[['x_cord', 'y_cord']].values\n", 100 | "tissue = ns.cm.Tissue(dataset=datatest, locations=locations_apriori)\n", 101 | "num_neighbors_s = num_neighbors_t = 5\n", 102 | "\n", 103 | "# params for linear cost\n", 104 | "atlas_genes = datatrain.var\n", 105 | "markers = list(atlas_genes.index)\n", 106 | "num_genes = len(markers)\n", 107 | "atlas_matrix = datatrain.to_df().values\n", 108 | "markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=markers)\n", 109 | "markers_to_use = np.concatenate(markers_idx.loc[markers].values)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "Setting up for reconstruction ... done ( 384.1 seconds )\n", 122 | "Reconstructing spatial information with 649 markers: 21578 cells and 21578 locations ... \n", 123 | "Trying with epsilon: 5.00e-03\n" 124 | ] 125 | }, 126 | { 127 | "name": "stderr", 128 | "output_type": "stream", 129 | "text": [ 130 | "/usr/local/lib/python3.10/site-packages/ot/bregman.py:517: UserWarning: Sinkhorn did not converge. You might want to increase the number of iterations `numItermax` or the regularization parameter `reg`.\n", 131 | " warnings.warn(\"Sinkhorn did not converge. You might want to \"\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "tissue.setup_reconstruction(atlas_matrix=atlas_matrix, \n", 137 | " markers_to_use=markers_to_use, \n", 138 | " num_neighbors_s=num_neighbors_s, \n", 139 | " num_neighbors_t=num_neighbors_t)\n", 140 | " \n", 141 | "tissue.reconstruct(alpha_linear=0.8, epsilon=5e-3)\n", 142 | "\n", 143 | "location_pred = tissue.gw" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "location_sum = np.sum(location_pred, axis=1)\n", 153 | "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n", 154 | "\n", 155 | "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 156 | "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n", 157 | "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n", 158 | "pred_cord_transform.index = Qdata.obs.index\n", 159 | "pred_cord_transform.columns = ['x', 'y']\n", 160 | "\n", 161 | "pred_cord_transform.to_csv(\"output/brain/novosparc_brain_left.csv\")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "## left brain as training, right brain as testing\n", 178 | "datatrain = Rdata.copy()\n", 179 | "datatest = Qdata.copy()\n", 180 | "\n", 181 | "random.seed(2021)\n", 182 | "torch.manual_seed(2021)\n", 183 | "np.random.seed(2021)\n", 184 | "## Running novosparc\n", 185 | "locations_apriori = datatrain.obs[['x_cord', 'y_cord']].values\n", 186 | "tissue = ns.cm.Tissue(dataset=datatest, locations=locations_apriori)\n", 187 | "num_neighbors_s = num_neighbors_t = 5\n", 188 | "\n", 189 | "# params for linear cost\n", 190 | "atlas_genes = datatrain.var\n", 191 | "markers = list(atlas_genes.index)\n", 192 | "num_genes = len(markers)\n", 193 | "atlas_matrix = datatrain.to_df().values\n", 194 | "markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=markers)\n", 195 | "markers_to_use = np.concatenate(markers_idx.loc[markers].values)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "tissue.setup_reconstruction(atlas_matrix=atlas_matrix, \n", 205 | " markers_to_use=markers_to_use, \n", 206 | " num_neighbors_s=num_neighbors_s, \n", 207 | " num_neighbors_t=num_neighbors_t)\n", 208 | " \n", 209 | "tissue.reconstruct(alpha_linear=0.8, epsilon=5e-3)\n", 210 | "\n", 211 | "location_pred = tissue.gw" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "location_sum = np.sum(location_pred, axis=1)\n", 221 | "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n", 222 | "\n", 223 | "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 224 | "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n", 225 | "\n", 226 | "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n", 227 | "pred_cord_transform.index = Qdata.obs.index\n", 228 | "pred_cord_transform.columns = ['x', 'y']\n", 229 | "\n", 230 | "pred_cord_transform.to_csv(\"output/brain/novosparc_brain_right.csv\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 3", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]" 258 | }, 259 | "orig_nbformat": 4, 260 | "vscode": { 261 | "interpreter": { 262 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 263 | } 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 2 268 | } 269 | -------------------------------------------------------------------------------- /code_paper/5_liver_MERFISH/CeLEry_liver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import CeLEry as cel\n", 19 | "\n", 20 | "import os,csv,re\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import scanpy as sc\n", 24 | "import math\n", 25 | "from skimage import io, color\n", 26 | "\n", 27 | "from scipy.sparse import issparse\n", 28 | "import random, torch\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings(\"ignore\")\n", 31 | "import pickle\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from anndata import AnnData, read_h5ad\n", 34 | "\n", 35 | "import json" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "data_merfish_raw = pd.read_csv(\"data/HumanLiverCancerPatient2_cell_by_gene.csv\", index_col=0) \n", 45 | "meta_data = pd.read_csv(\"data/HumanLiverCancerPatient2_cell_metadata.csv\", index_col=0)\n", 46 | "meta_data = meta_data.sort_index()\n", 47 | "\n", 48 | "data_merfish = AnnData(data_merfish_raw)\n", 49 | "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n", 50 | "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n", 51 | "data_merfish_raw = data_merfish.copy()\n", 52 | "\n", 53 | "def findBlank(name):\n", 54 | " return \"Blank\" in name\n", 55 | "\n", 56 | "blank_lst = np.array(list(map(findBlank, data_merfish.var.index)))\n", 57 | "data_merfish = data_merfish[:, blank_lst == False]\n", 58 | "\n", 59 | "sc.pp.filter_cells(data_merfish, min_genes=100)\n", 60 | "sc.pp.filter_cells(data_merfish, min_counts=500)\n", 61 | "\n", 62 | "sc.pp.neighbors(data_merfish, n_neighbors = 15, use_rep=\"X\")\n", 63 | "sc.tl.louvain(data_merfish, 0.3, random_state=1)\n", 64 | "\n", 65 | "data_merfish.write_h5ad(\"data/liver_merfish.h5ad\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n", 84 | "\n", 85 | "Rdata = data_merfish[np.sort(Rdata_ind), :]\n", 86 | "Qdata = data_merfish[np.sort(Qdata_ind), :]\n", 87 | "print((Rdata.shape, Qdata.shape))" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "cel.get_zscore(Qdata)\n", 97 | "cel.get_zscore(Rdata)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 10, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]\n", 107 | "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [50, 20, 10], num_epochs_max = 3000, number_error_try=50, batch_size = 128, path = \"output/liver\", filename = \"liver_merfish\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "pred_cord = cel.Predict_cord (data_test = Qdata, path = \"output/liver\", filename = \"liver_merfish\")\n", 117 | "\n", 118 | "data_train = Qdata.copy()\n", 119 | "traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n", 120 | "tdatax = np.expand_dims(traindata, axis = 0)\n", 121 | "tdata_rs = np.swapaxes(tdatax, 1, 2)\n", 122 | "test_cord = cel.wrap_gene_location(tdata_rs, data_train.obs[['x_cord', 'y_cord']])\n", 123 | "\n", 124 | "pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n", 125 | "pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n", 126 | "pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n", 127 | "\n", 128 | "np.save(\"output/liver/celery_liver.npy\", pred_cord_transform)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]" 156 | }, 157 | "orig_nbformat": 4, 158 | "vscode": { 159 | "interpreter": { 160 | "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608" 161 | } 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /code_paper/5_liver_MERFISH/README.md: -------------------------------------------------------------------------------- 1 | # Note for Liver MERFISH data 2 | 3 | Datasets is available from https://app.box.com/s/6nz5vlp0hjmuq9xruxog96p2woyt5fpm -------------------------------------------------------------------------------- /code_paper/5_liver_MERFISH/SpaOTsc_liver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import os\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import scanpy as sc\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from scipy.spatial.distance import cdist, squareform, pdist\n", 24 | "from scipy.stats import ks_2samp\n", 25 | "from scipy.stats import pearsonr\n", 26 | "\n", 27 | "\n", 28 | "\n", 29 | "import os,csv,re\n", 30 | "import math\n", 31 | "from skimage import io, color\n", 32 | "\n", 33 | "from scipy.sparse import issparse\n", 34 | "import random, torch\n", 35 | "import warnings\n", 36 | "warnings.filterwarnings(\"ignore\")\n", 37 | "import pickle\n", 38 | "from sklearn.model_selection import train_test_split\n", 39 | "from anndata import AnnData, read_h5ad\n", 40 | "import seaborn as sns\n", 41 | "\n", 42 | "import json\n", 43 | "\n", 44 | "from spaotsc import SpaOTsc" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "((78222, 500), (78223, 500))\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n", 71 | "\n", 72 | "Rdata = data_merfish[np.sort(Rdata_ind), :]\n", 73 | "Qdata = data_merfish[np.sort(Qdata_ind), :]\n", 74 | "\n", 75 | "## Cannot run on the entire dataset, downsampling is needed\n", 76 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Rdata.shape[0]), Rdata.obs['louvain'], test_size=0.25,random_state=1,stratify=Rdata.obs['louvain'])\n", 77 | "Rdata = Rdata[np.sort(Qdata_ind), :]\n", 78 | "\n", 79 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Qdata.shape[0]), Qdata.obs['louvain'], test_size=0.25,random_state=1,stratify=Qdata.obs['louvain'])\n", 80 | "Qdata = Qdata[np.sort(Qdata_ind), :]\n", 81 | "\n", 82 | "print((Rdata.shape, Qdata.shape))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "datatest = Qdata.copy()\n", 92 | "datatrain = Rdata.copy()\n", 93 | "random.seed(2021)\n", 94 | "torch.manual_seed(2021)\n", 95 | "np.random.seed(2021)\n", 96 | "\n", 97 | "## Running spaOTsc\n", 98 | "df_sc = pd.DataFrame(datatest.X)\n", 99 | "is_dmat = cdist(np.array(datatrain.obs.iloc[:,0:2]), np.array(datatrain.obs.iloc[:,0:2]), 'euclidean')\n", 100 | "sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 6, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)\n", 110 | "cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')\n", 111 | " \n", 112 | "location_pred = spsc.transport_plan(cost_matrix)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 9, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "location_sum = np.sum(location_pred, axis=1)\n", 122 | "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n", 123 | "\n", 124 | "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 125 | "np.save(\"output/liver/spaotsc_liver.npy\", pred_cord_transform)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]" 153 | }, 154 | "orig_nbformat": 4, 155 | "vscode": { 156 | "interpreter": { 157 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 158 | } 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /code_paper/5_liver_MERFISH/Tangram_liver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import os, sys\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import seaborn as sns\n", 23 | "import scanpy as sc\n", 24 | "import torch\n", 25 | "import tangram as tg\n", 26 | "from sklearn.model_selection import train_test_split\n", 27 | "from anndata import AnnData, read_h5ad" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n", 46 | "\n", 47 | "Rdata = data_merfish[np.sort(Rdata_ind), :]\n", 48 | "Qdata = data_merfish[np.sort(Qdata_ind), :]\n", 49 | "\n", 50 | "## Cannot run on the entire dataset, downsampling is needed\n", 51 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Rdata.shape[0]), Rdata.obs['louvain'], test_size=0.5,random_state=1,stratify=Rdata.obs['louvain'])\n", 52 | "Rdata = Rdata[np.sort(Qdata_ind), :]\n", 53 | "\n", 54 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Qdata.shape[0]), Qdata.obs['louvain'], test_size=0.5,random_state=1,stratify=Qdata.obs['louvain'])\n", 55 | "Qdata = Qdata[np.sort(Qdata_ind), :]\n", 56 | "\n", 57 | "print((Rdata.shape, Qdata.shape))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "INFO:root:500 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.\n", 70 | "INFO:root:500 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.\n", 71 | "INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.\n", 72 | "INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "tg.pp_adatas(Qdata, Rdata, genes=Rdata.var.index)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "INFO:root:Allocate tensors for mapping.\n", 90 | "INFO:root:Begin training with 500 genes and rna_count_based density_prior in cells mode...\n", 91 | "INFO:root:Printing scores every 100 epochs.\n" 92 | ] 93 | }, 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Score: 0.386, KL reg: 0.056\n", 99 | "Score: 0.716, KL reg: 0.003\n", 100 | "Score: 0.750, KL reg: 0.002\n", 101 | "Score: 0.756, KL reg: 0.002\n", 102 | "Score: 0.759, KL reg: 0.001\n", 103 | "Score: 0.760, KL reg: 0.001\n", 104 | "Score: 0.761, KL reg: 0.001\n", 105 | "Score: 0.762, KL reg: 0.001\n", 106 | "Score: 0.763, KL reg: 0.001\n", 107 | "Score: 0.763, KL reg: 0.001\n" 108 | ] 109 | }, 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | "INFO:root:Saving results..\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "ad_map = tg.map_cells_to_space(\n", 120 | " adata_sc=Qdata,\n", 121 | " adata_sp=Rdata,\n", 122 | " device='cpu',\n", 123 | " # device='cuda:0',\n", 124 | ")" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 8, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "Rdata_location_pred = ad_map.X.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 134 | "np.save(\"output/liver/tangram_liver.npy\", Rdata_location_pred)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]" 162 | }, 163 | "orig_nbformat": 4, 164 | "vscode": { 165 | "interpreter": { 166 | "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608" 167 | } 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 2 172 | } 173 | -------------------------------------------------------------------------------- /code_paper/5_liver_MERFISH/novoSpaRc_liver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import novosparc as ns\n", 10 | "\n", 11 | "import os\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import scanpy as sc\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "import altair as alt\n", 17 | "from scipy.spatial.distance import cdist, squareform, pdist\n", 18 | "from scipy.stats import ks_2samp\n", 19 | "from scipy.stats import pearsonr\n", 20 | "\n", 21 | "import random\n", 22 | "random.seed(0)\n", 23 | "\n", 24 | "from skimage import io, color\n", 25 | "import torch\n", 26 | "from torch.nn import functional as F\n", 27 | "import json\n", 28 | "\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from anndata import AnnData, read_h5ad" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n", 49 | "\n", 50 | "Rdata = data_merfish[np.sort(Rdata_ind), :]\n", 51 | "Qdata = data_merfish[np.sort(Qdata_ind), :]\n", 52 | "\n", 53 | "## Cannot run on the entire dataset, downsampling is needed\n", 54 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Rdata.shape[0]), Rdata.obs['louvain'], test_size=0.3,random_state=1,stratify=Rdata.obs['louvain'])\n", 55 | "Rdata = Rdata[np.sort(Qdata_ind), :]\n", 56 | "\n", 57 | "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Qdata.shape[0]), Qdata.obs['louvain'], test_size=0.3,random_state=1,stratify=Qdata.obs['louvain'])\n", 58 | "Qdata = Qdata[np.sort(Qdata_ind), :]\n", 59 | "\n", 60 | "print((Rdata.shape, Qdata.shape))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "datatrain = Rdata.copy()\n", 70 | "datatest = Qdata.copy()\n", 71 | "\n", 72 | "random.seed(2021)\n", 73 | "torch.manual_seed(2021)\n", 74 | "np.random.seed(2021)\n", 75 | "## Running novosparc\n", 76 | "locations_apriori = datatrain.obs[['x_cord', 'y_cord']].values\n", 77 | "tissue = ns.cm.Tissue(dataset=datatest, locations=locations_apriori)\n", 78 | "num_neighbors_s = num_neighbors_t = 5\n", 79 | "\n", 80 | "# params for linear cost\n", 81 | "atlas_genes = datatrain.var\n", 82 | "markers = list(atlas_genes.index)\n", 83 | "num_genes = len(markers)\n", 84 | "atlas_matrix = datatrain.to_df().values\n", 85 | "markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=markers)\n", 86 | "markers_to_use = np.concatenate(markers_idx.loc[markers].values)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "tissue.setup_reconstruction(atlas_matrix=atlas_matrix, \n", 96 | " markers_to_use=markers_to_use, \n", 97 | " num_neighbors_s=num_neighbors_s, \n", 98 | " num_neighbors_t=num_neighbors_t)\n", 99 | " \n", 100 | "tissue.reconstruct(alpha_linear=0.8, epsilon=5e-3)\n", 101 | "\n", 102 | "location_pred = tissue.gw" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "location_sum = np.sum(location_pred, axis=1)\n", 112 | "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n", 113 | "\n", 114 | "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n", 115 | "np.save(\"output/liver/novosparc_liver.npy\", pred_cord_transform)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "name": "python", 134 | "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]" 135 | }, 136 | "orig_nbformat": 4, 137 | "vscode": { 138 | "interpreter": { 139 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 140 | } 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /code_paper/8_mouse_single_cell_prediction/Mouse_sc_analysis.py: -------------------------------------------------------------------------------- 1 | import os,csv,re 2 | import pandas as pd 3 | import numpy as np 4 | import scanpy as sc 5 | import math 6 | from skimage import io, color 7 | 8 | from scipy.sparse import issparse 9 | import random, torch 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | import matplotlib.colors as clr 13 | import matplotlib.pyplot as plt 14 | import pickle 15 | 16 | #Read original data and save it to h5ad 17 | from scanpy import read_10x_h5 18 | # import SpaGCN as spg 19 | 20 | import CeLEry as cel 21 | from data.MouseBrain.MP1_SVG import d_g 22 | import json 23 | import cv2 as cv 24 | 25 | 26 | ### ------------------------------------------------------------------------------------------------------- ### 27 | ### Preprocessing for MouseSC Data 28 | ### ------------------------------------------------------------------------------------------------------- ### 29 | 30 | MouseSC = sc.read("../data/MouseBrain/MouseSC_scRNA.h5ad") 31 | 32 | dataSection1full = sc.read("../data/MouseBrain/MP1_sudo.h5ad") 33 | genename = dataSection1full.var['genename'] 34 | 35 | 36 | # Get the gene list from the pre-screening 37 | genelistlist = [d_g[i] for i in range(len(d_g))] # transform dictionary to a list of lists 38 | genelist = sum(genelistlist, []) # merge the list of lists 39 | genelistuni = list( dict.fromkeys(genelist) ) # remove duplicates 40 | 41 | genelistindex = [genename[genename == i].index[0] for i in genelistuni if len(genename[genename == i])>0] 42 | 43 | #Read in hitology image 44 | ImageSec1=io.imread("../data/MouseBrain/V1_Mouse_Brain_Sagittal_Posterior_image.tif") 45 | ImageSec1sub = ImageSec1[3000:7000,6200:10500,:] 46 | # cel.printimage (ImageSec1sub, "../output/CeLEry/imageselect") 47 | 48 | imgray = cv.cvtColor(ImageSec1sub, cv.COLOR_BGR2GRAY) 49 | imgray2 = imgray.copy() 50 | imgray2[imgray2<160] = 0 51 | imgray2[imgray2>160] = 255 52 | 53 | ## Take the subset of dataSection1 54 | xcords = dataSection1full.obs["x"].to_numpy() 55 | ycords = dataSection1full.obs["y"].to_numpy() 56 | 57 | Section1Sub = dataSection1full[(xcords>=3000) & (xcords<7000) & (ycords>=6200) & (ycords<10500), genelistindex] 58 | Section1Sub.obs = Section1Sub.obs/50 59 | Section1Sub.obs = Section1Sub.obs.astype(int) 60 | Section1Sub.obs["inner"] = 0 61 | 62 | ## Quality Control 63 | 64 | for i in range(Section1Sub.obs.shape[0]): 65 | xi = Section1Sub.obs["x"][i] 66 | yi = Section1Sub.obs["y"][i] 67 | subarea = np.mean(imgray2[(xi*50-3000):(xi*50+50-3000), (yi*50-6200):(yi*50+50-6200)]) 68 | if subarea<140 or xi*50>6000: 69 | Section1Sub.obs["inner"].iloc[i] = 1 70 | if yi*50>10200 or xi*50<1000: 71 | Section1Sub.obs["inner"].iloc[i] = 0 72 | 73 | Section1Sub = Section1Sub[Section1Sub.obs["inner"] == 1] 74 | 75 | ## Calculating z-score 76 | cel.get_zscore(Section1Sub) 77 | cel.get_zscore(MouseSC) 78 | 79 | ### ------------------------------------------------------------------------------------------------------- ### 80 | ### Perform CeLEry analysis 81 | ### ------------------------------------------------------------------------------------------------------- ### 82 | 83 | def seed_worker(worker_id): 84 | worker_seed = torch.initial_seed() % 2**32 85 | np.random.seed(worker_seed) 86 | random.seed(worker_seed) 87 | 88 | def FitPredModelNE (dataSection1): 89 | # 90 | random.seed(2021) 91 | torch.manual_seed(2021) 92 | np.random.seed(2021) 93 | g = torch.Generator() 94 | g.manual_seed(2021) 95 | # 96 | tdatax = np.expand_dims(dataSection1.X, axis = 0) 97 | tdata_rs = np.swapaxes(tdatax, 1, 2) 98 | DataTra = cel.wrap_gene_location(tdata_rs, dataSection1.obs) 99 | t_loader= torch.utils.data.DataLoader(DataTra, batch_size=4, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g) 100 | # Create Deep Neural Network for Coordinate Regression 101 | DNNmodel = cel.DNN( in_channels = DataTra[1][0].shape[0], hidden_dims = [30, 25, 15] ) # [100,50,25] ) 102 | DNNmodel = DNNmodel.float() 103 | # 104 | CoOrg=cel.SpaCluster() 105 | CoOrg.train(model = DNNmodel, train_loader = t_loader, num_epochs= 500, RCcountMax = 15, learning_rate = 0.0001) 106 | # 107 | filename3 = "../output/CeLEry/Prediction/PreOrg_Mousesc.obj" 108 | filehandler2 = open(filename3, 'wb') 109 | pickle.dump(DNNmodel, filehandler2) 110 | 111 | FitPredModelNE (dataSection1 = Section1Sub) 112 | 113 | 114 | 115 | ### ------------------------------------------------------------------------------------------------------- ### 116 | ### Present Results 117 | ### ------------------------------------------------------------------------------------------------------- ### 118 | 119 | 120 | def report_prop_method_sc (folder, name, dataSection2, Val_loader, outname = ""): 121 | """ 122 | Report the results of the proposed methods in comparison to the other method 123 | :folder: string: specified the folder that keep the proposed DNN method 124 | :name: string: specified the name of the DNN method, also will be used to name the output files 125 | :dataSection2: AnnData: the data of Section 2 126 | :Val_loader: Dataload: the validation data from dataloader 127 | :outname: string: specified the name of the output, default is the same as the name 128 | """ 129 | if outname == "": 130 | outname = name 131 | filename2 = "{folder}/{name}.obj".format(folder = folder, name = name) 132 | filehandler = open(filename2, 'rb') 133 | DNNmodel = pickle.load(filehandler) 134 | # 135 | total_loss_org = [] 136 | coords_predict = np.zeros((dataSection2.obs.shape[0],2)) 137 | for i, img in enumerate(Val_loader): 138 | recon = DNNmodel(img) 139 | coords_predict[i,:] = recon[0].detach().numpy() 140 | np.savetxt("{folder}/{name}_predmatrix.csv".format(folder = folder, name = name), coords_predict, delimiter=",") 141 | 142 | def EvaluateOrg (testdata): 143 | ## Wrap up Validation data in to dataloader 144 | vdatax = np.expand_dims(testdata.X, axis = 0) 145 | vdata_rs = np.swapaxes(vdatax, 1, 2) 146 | DataVal = cel.wrap_gene_location(vdata_rs, testdata.obs[["sex_id","region_id"]]) 147 | Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4) 148 | # 149 | report_prop_method_sc(folder = "../output/CeLEry/Prediction", 150 | name = "PreOrg_Mousesc", dataSection2 = testdata, 151 | Val_loader = Val_loader) 152 | 153 | 154 | EvaluateOrg(testdata = MouseSC) 155 | -------------------------------------------------------------------------------- /code_paper/8_mouse_single_cell_prediction/analysis-results.py: -------------------------------------------------------------------------------- 1 | import os,csv,re 2 | import pandas as pd 3 | import numpy as np 4 | import scanpy as sc 5 | import math 6 | from skimage import io, color 7 | 8 | from scipy.sparse import issparse 9 | import random, torch 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | import matplotlib.colors as clr 13 | import matplotlib.pyplot as plt 14 | import pickle 15 | 16 | #Read original data and save it to h5ad 17 | from scanpy import read_10x_h5 18 | # import SpaGCN as spg 19 | 20 | import CeLEry as cel 21 | from data.MouseBrain.MP1_SVG import d_g 22 | import json 23 | # import cv2 as cv 24 | 25 | 26 | ### ------------------------------------------------------------------------------------------------------- ### 27 | ### Preprocessing for MouseSC Data 28 | ### ------------------------------------------------------------------------------------------------------- ### 29 | 30 | MouseSC = sc.read("../data/Seurat/MouseSC_scRNA_SeuratMouseSC.h5ad") 31 | 32 | dataSection1full = sc.read("../data/MouseBrain/MP1_sudo.h5ad") 33 | genename = dataSection1full.var['genename'] 34 | 35 | 36 | # # Get the gene list from the pre-screening 37 | # genelistlist = [d_g[i] for i in range(len(d_g))] # transform dictionary to a list of lists 38 | # genelist = sum(genelistlist, []) # merge the list of lists 39 | # genelistuni = list( dict.fromkeys(genelist) ) # remove duplicates 40 | 41 | # genelistindex = [genename[genename == i].index[0] for i in genelistuni if len(genename[genename == i])>0] 42 | 43 | #Read in hitology image 44 | ImageSec1=io.imread("../data/MouseBrain/V1_Mouse_Brain_Sagittal_Posterior_image.tif") 45 | ImageSec1sub = ImageSec1[3000:7000,6200:10500,:] 46 | # cel.printimage (ImageSec1sub, "../output/CeLEry/imageselect") 47 | 48 | imgray = cv.cvtColor(ImageSec1sub, cv.COLOR_BGR2GRAY) 49 | imgray2 = imgray.copy() 50 | imgray2[imgray2<160] = 0 51 | imgray2[imgray2>160] = 255 52 | 53 | ## Take the subset of dataSection1 54 | xcords = dataSection1full.obs["x"].to_numpy() 55 | ycords = dataSection1full.obs["y"].to_numpy() 56 | 57 | Section1Sub = dataSection1full[(xcords>=3000) & (xcords<7000) & (ycords>=6200) & (ycords<10500), MouseSC.var_names] 58 | Section1Sub.obs = Section1Sub.obs/50 59 | Section1Sub.obs = Section1Sub.obs.astype(int) 60 | Section1Sub.obs["inner"] = 0 61 | 62 | ## Quality Control 63 | 64 | for i in range(Section1Sub.obs.shape[0]): 65 | xi = Section1Sub.obs["x"][i] 66 | yi = Section1Sub.obs["y"][i] 67 | subarea = np.mean(imgray2[(xi*50-3000):(xi*50+50-3000), (yi*50-6200):(yi*50+50-6200)]) 68 | if subarea<140 or xi*50>6000: 69 | Section1Sub.obs["inner"].iloc[i] = 1 70 | if yi*50>10200 or xi*50<1000: 71 | Section1Sub.obs["inner"].iloc[i] = 0 72 | 73 | Section1Sub = Section1Sub[Section1Sub.obs["inner"] == 1, ] 74 | 75 | ## Calculating z-score 76 | cel.get_zscore(Section1Sub) 77 | cel.get_zscore(MouseSC) 78 | 79 | ### ------------------------------------------------------------------------------------------------------- ### 80 | ### Perform CeLEry analysis 81 | ### ------------------------------------------------------------------------------------------------------- ### 82 | 83 | def seed_worker(worker_id): 84 | worker_seed = torch.initial_seed() % 2**32 85 | np.random.seed(worker_seed) 86 | random.seed(worker_seed) 87 | 88 | def FitPredModelNE (dataSection1): 89 | # 90 | random.seed(2021) 91 | torch.manual_seed(2021) 92 | np.random.seed(2021) 93 | g = torch.Generator() 94 | g.manual_seed(2021) 95 | # 96 | tdatax = np.expand_dims(dataSection1.X, axis = 0) 97 | tdata_rs = np.swapaxes(tdatax, 1, 2) 98 | DataTra = cel.wrap_gene_location(tdata_rs, dataSection1.obs) 99 | t_loader= torch.utils.data.DataLoader(DataTra, batch_size=4, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g) 100 | # Create Deep Neural Network for Coordinate Regression 101 | DNNmodel = cel.DNN( in_channels = DataTra[1][0].shape[0], hidden_dims = [30, 25, 15] ) # [100,50,25] ) 102 | DNNmodel = DNNmodel.float() 103 | # 104 | CoOrg=cel.SpaCluster() 105 | CoOrg.train(model = DNNmodel, train_loader = t_loader, num_epochs= 500, RCcountMax = 15, learning_rate = 0.0001) 106 | # 107 | filename3 = "../output/CeLEry/Mousesc/PreOrg_Mousesc.obj" 108 | filehandler2 = open(filename3, 'wb') 109 | pickle.dump(DNNmodel, filehandler2) 110 | 111 | FitPredModelNE (dataSection1 = Section1Sub) 112 | 113 | 114 | 115 | ### ------------------------------------------------------------------------------------------------------- ### 116 | ### Present Results 117 | ### ------------------------------------------------------------------------------------------------------- ### 118 | 119 | 120 | def report_prop_method_sc (folder, name, dataSection2, Val_loader, outname = ""): 121 | """ 122 | Report the results of the proposed methods in comparison to the other method 123 | :folder: string: specified the folder that keep the proposed DNN method 124 | :name: string: specified the name of the DNN method, also will be used to name the output files 125 | :dataSection2: AnnData: the data of Section 2 126 | :Val_loader: Dataload: the validation data from dataloader 127 | :outname: string: specified the name of the output, default is the same as the name 128 | """ 129 | if outname == "": 130 | outname = name 131 | filename2 = "{folder}/{name}.obj".format(folder = folder, name = name) 132 | filehandler = open(filename2, 'rb') 133 | DNNmodel = pickle.load(filehandler) 134 | # 135 | total_loss_org = [] 136 | coords_predict = np.zeros((dataSection2.obs.shape[0],2)) 137 | # 138 | for i, img in enumerate(Val_loader): 139 | recon = DNNmodel(img) 140 | coords_predict[i,:] = recon[0].detach().numpy() 141 | np.savetxt("{folder}/{name}_predmatrix.csv".format(folder = folder, name = name), coords_predict, delimiter=",") 142 | 143 | def EvaluateOrg (testdata): 144 | ## Wrap up Validation data in to dataloader 145 | vdatax = np.expand_dims(testdata.X, axis = 0) 146 | vdata_rs = np.swapaxes(vdatax, 1, 2) 147 | DataVal = cel.wrap_gene_location(vdata_rs, testdata.obs[["sex_id","region_id"]]) 148 | Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4) 149 | # 150 | report_prop_method_sc(folder = "../output/CeLEry/Mousesc/", 151 | name = "PreOrg_Mousesc", dataSection2 = testdata, 152 | Val_loader = Val_loader) 153 | 154 | 155 | EvaluateOrg(testdata = MouseSC) 156 | 157 | ### ------------------------------------------------------------------------------------------------------- ### 158 | ### Perform Tangram analysis 159 | ### ------------------------------------------------------------------------------------------------------- ### 160 | # import tangram as tg 161 | 162 | # tg.pp_adatas(MouseSC, Section1Sub, genes=None) 163 | # map = tg.map_cells_to_space(MouseSC, Section1Sub, device='cpu') 164 | # map.write_h5ad('../output/CeLEry/Mousesc/tangram.h5ad') 165 | 166 | S1_xmax = Section1Sub.obs['x'].max() + 1 167 | S1_xmin = Section1Sub.obs['x'].min() - 1 168 | S1_ymax = Section1Sub.obs['y'].max() + 1 169 | S1_ymin = Section1Sub.obs['y'].min() - 1 170 | 171 | map = sc.read("../output/CeLEry/Mousesc/tangram.h5ad") 172 | 173 | 174 | ## Normalize the coordinates of both Sections 175 | spx = (Section1Sub.obs.iloc[:,0] - S1_xmin) / (S1_xmax - S1_xmin) 176 | spy = (Section1Sub.obs.iloc[:,1] - S1_ymin) / (S1_ymax - S1_ymin) 177 | 178 | coords_predict_tangram = np.zeros((MouseSC.obs.shape[0],2)) 179 | for i in range(map.X.shape[0]): 180 | bestindex = np.argmax(map.X[i,:]) 181 | pred = torch.FloatTensor([spx[bestindex],spy[bestindex]]) 182 | coords_predict_tangram[i,:] = pred 183 | 184 | 185 | np.savetxt("{folder}/{name}_predmatrix.csv".format(folder = "../output/CeLEry/Mousesc/", name = "Tangram_Mousesc"), coords_predict_tangram, delimiter=",") 186 | 187 | -------------------------------------------------------------------------------- /docs/asserts/images/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/docs/asserts/images/workflow.png -------------------------------------------------------------------------------- /pretrainmodel/Biogen/Pretrained_model_075B.obj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/pretrainmodel/Biogen/Pretrained_model_075B.obj -------------------------------------------------------------------------------- /pretrainmodel/Biogen/Reference_genes_8_075B.obj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/pretrainmodel/Biogen/Reference_genes_8_075B.obj -------------------------------------------------------------------------------- /tutorial/BiogenPretrain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Biogen Pretrained Tutorial - independent version

\n", 8 | "\n", 9 | "\n", 10 | "
Author: Qihuang Zhang*, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li*" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Outline\n", 18 | "1. Preparation\n", 19 | "2. Load Data\n", 20 | "3. Prediction\n", 21 | "4. Visualization (in ``R``)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "In this tutorial, we illustrate the usage of the CeLEry pretrain model trained by Biogene mouse brain data (Li and Zhang, 2022). This model takes the gene expression input of 886 genes and produce a prediction probability vector to eight regions segemented from the spatial transcriptomics data.\n", 29 | "\n", 30 | "This tutorial can be independent of the CeLEry package. It does not require installing the CeLEry package. \n", 31 | "\n", 32 | "## 1. Preparation\n", 33 | "\n", 34 | "To implemente the model without installing CeLEry package, several helper functions are needed. The ``pickle`` package is used to load the pretrained model. Function ``make_annData_query()`` transform the raw input data into AnnData format and conduct data proprocessing, including normalizing the gene expression per cell and performing ``log(1+p)`` transcformation. The ``get_zscore()`` helps to normalized the gene expression so that batch effect can be removed." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import pickle\n", 44 | "from scanpy import read_10x_h5\n", 45 | "import CeLEry as cel\n", 46 | "\n", 47 | "import scanpy as sc\n", 48 | "import numpy as np\n", 49 | "import pandas as pd\n", 50 | "from scipy.sparse import issparse\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## 2. Load Data\n", 58 | " \n", 59 | "Load scRNA-seq/snRNA-seq data. Example data can be download from [Li and Zhang (2022)](https://doi.org/10.5281/zenodo.6640285)." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "\n", 69 | "QueryData_raw = read_10x_h5(\"data/Biogen/7G-1/filtered_feature_bc_matrix.h5\")\n", 70 | "QueryData = cel.make_annData_query (QueryData_raw)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "It is import to make sure the query scRNA-seq/snRNA-seq contains all the gene in the trained model." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "## Load gene list\n", 87 | "filename = \"pretrainmodel/Biogen/Reference_genes_8_075B.obj\"\n", 88 | "filehandler = open(filename, 'rb') \n", 89 | "genenames = pickle.load(filehandler)\n", 90 | "\n", 91 | "## Rearrange the data and filter the selected genes in the trained model.\n", 92 | "Qdata = QueryData[:,list(genenames)]\n", 93 | "cel.get_zscore(Qdata)\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "#### 3. Apply Pre-trained CeLEry model to the snRNA data\n", 101 | "\n", 102 | "The gene expression of the first cell (a 1X886 matrix) in the snRNA-seq data is given by:" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "Qdata[0].X.A" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Load the CeLEry prediction model which is located at the ``\"../output/Biogene/models\"`` named as ``Org_domain_075B``. We use CeLEry function ``Predict_domain()`` to conduct domain prediction for each single cells in the scRNA-seq/snRNA-seq data. The detailed argument are explained as follows:\n", 119 | "\n", 120 | "* data_test: (AnnData object) the input scRNA-seq/snRNA-seq data \n", 121 | "* class_num: (int) the number of class to be predicted. This value should be consistent with the number of domains in the training model.\n", 122 | "* path: (string) the location of the pre-trained model\n", 123 | "* filename: (string) the file name of the saved pre-trained model\n", 124 | "* predtype: (string) if predtype is \"probability\" (default) then a probability prediction matrix will be produced; if predtype is \"deterministic\", then the deterministic assignment based on the maximun probability prediction will be returned; if predtype is \"both\", then both prediction will be outputed. " 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## 3. Prediction \n", 132 | "\n", 133 | "Prediction of the first cell" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "model_location = \"pretrainmodel/Biogen/Pretrained_model_075B.obj\"\n", 143 | "\n", 144 | "pred_cord = cel.Predict_domain(data_test = Qdata[0], class_num = 8, path = \"pretrainmodel/Biogen\", filename = \"Pretrained_model_075B\", predtype = \"probability\")\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Predict region labels of the entire scRNA-seq data and report the proportion of the cells on different domains." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "pred_cord_all = cel.Predict_domain(data_test = Qdata, class_num = 8, path = \"pretrainmodel/Biogen\", filename = \"Pretrained_model_075B\", predtype = \"deterministic\")\n", 161 | "\n", 162 | "prop_count = pd.DataFrame(pred_cord_all).value_counts().sort_index()\n", 163 | "prop_weight = prop_count/sum(prop_count)\n", 164 | "prop_weight\n", 165 | "prop_weight.to_csv(\"output/Biogen/prop_8_075B_7G-1.csv\")\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "## 4. Visualization\n", 173 | "\n", 174 | "For the following part, we use the ``ggplot()`` in ``R`` to visualize the the proportion predicted according to CeLEry. We are going to use the regions segemented from the spatial transcriptomics data to illustrate how the distribution looks like.\n", 175 | "\n", 176 | "### 4.1 R packages" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "vscode": { 184 | "languageId": "r" 185 | } 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "library(ggplot2)\n", 190 | "library(png)\n", 191 | "\n", 192 | "outputdir <- \"output/Biogen/plots/\"" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "### 4.2 Plotting Functions\n", 200 | "\n", 201 | "The Density plot function use two input paths. \n", 202 | "\n", 203 | "* ``obsdata_path`` specifies the path of the observation data from the spatial transcriptomics data that are used to trained the data, which are saved from the \".obs\" of the annotated data object in python. This files contain the spot ID, the locations of the spots and the regions information, and will be used as the background of the visualization.\n", 204 | "\n", 205 | "* ``prediction_path`` specifies where the path of the prediction results locate.\n", 206 | "\n", 207 | "* ``objectname`` specifies the name of the output figure." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "vscode": { 215 | "languageId": "r" 216 | } 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "obsdata_path = \"output/Biogen/obsdata_8_075B.csv\"\n", 221 | "prediction_path = \"output/Biogen/prop_8_075B_7G-1.csv\"\n", 222 | "objectname = \"BiogenExample\"\n", 223 | "\n", 224 | "Density_plot <- function(obsdata_path, prediction_path, objectname){\n", 225 | " obsdata <- read.csv(obsdata_path, header = T)\n", 226 | " maxx <- max(obsdata$x_cord)\n", 227 | " obsdata$minus_xcord <- maxx - obsdata$x_cord\n", 228 | " pred_CeLEry <- read.csv(prediction_path, header = T)\n", 229 | " colnames(pred_CeLEry) = c(\"Domain\", \"Density\")\n", 230 | " dataplot <- merge(obsdata, pred_CeLEry, by.x = \"refined_pred\", by.y = \"Domain\")\n", 231 | " png(file = paste0(outputdir,\"Density_plot_\",objectname,\".png\"), height = 300, width = 450)\n", 232 | " DensityPlot2D <- ggplot(dataplot, aes(x = x_cord, y = y_cord) ) + \n", 233 | " theme_bw() + \n", 234 | " geom_point(aes(color = Density), size = 3) + #shape = 21, color = \"black\",, stroke = 0.3\n", 235 | " # scale_y_reverse() +\n", 236 | " scale_color_gradient(low = \"#7E7F9A\", high = \"#F3DE8A\") +\n", 237 | " theme(text=element_text(size=20, family=\"URWHelvetica\"), axis.text = element_blank(),\n", 238 | " axis.ticks=element_blank(),\n", 239 | " panel.spacing = unit(1, \"lines\")) +\n", 240 | " theme(strip.background =element_rect(fill=\"#3F4536\",color=\"#3F4536\"))+\n", 241 | " theme(strip.text = element_text(colour = 'white')) +\n", 242 | " theme(panel.border = element_rect(colour = \"#3F4536\")) +\n", 243 | " labs(x = NULL, y = NULL, color = \"Proportion\")\n", 244 | " print(DensityPlot2D)\n", 245 | " dev.off()\n", 246 | "}\n", 247 | "\n", 248 | "Density_plot(obsdata_path, prediction_path, objectname)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "The output figures display the proportion of cells in the regions segemented in the training data." 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "Python 3", 262 | "language": "python", 263 | "name": "python3" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.8.8" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 2 280 | } 281 | -------------------------------------------------------------------------------- /tutorial/BiogenPretrain.md: -------------------------------------------------------------------------------- 1 |

Biogen Pre-trained Model Tutorial

2 | 3 | Author: Qihuang Zhang*, Jian Hu, Kejie Li, Baohong Zhang, David Dai, 4 | Edward B. Lee, Rui Xiao, Mingyao Li* 5 | 6 | 7 | 8 | ## Outline 9 | 10 | 1. Preparation 11 | 2. Load Data 12 | 3. Prediction 13 | 4. Visualization (in `R`) 14 | 15 | 16 | In this tutorial, we illustrate the usage of the CeLEry pre-train model 17 | trained by Biogene mouse brain data (Li and Zhang, 2022). This model 18 | takes the gene expression input of 886 genes and produces a prediction 19 | probability vector to eight regions segmented from the spatial 20 | transcriptomics data. 21 | 22 | 23 | The prediction model in this tutorial is pre-trained using the spatial transcripitomics data (ID 075B). The domains were segemented using ``spaGCN``: 24 | 25 | ![domain segementation](figures/segementation_8_075B.png) 26 | 27 | 28 | To implement this tutorial, the CeLEry python package needs to be installed. Please see the instruction for installation. 29 | 30 | ## 1. Preparation 31 | 32 | To implement this tutorial, several 33 | helper functions are needed. 34 | 35 | 36 | ``` {.python} 37 | import pickle 38 | from scanpy import read_10x_h5 39 | import CeLEry as cel 40 | 41 | import scanpy as sc 42 | import numpy as np 43 | import pandas as pd 44 | from scipy.sparse import issparse 45 | ``` 46 | 47 | 48 | ## 2. Load Data 49 | 50 | Load scRNA-seq/snRNA-seq data. Example data can be download from [Li and Zhang (2022)](https://doi.org/10.5281/zenodo.6640285). 51 | 52 | ``` {.python} 53 | QueryData_raw = read_10x_h5("data/Biogen/7G-1/filtered_feature_bc_matrix.h5") 54 | QueryData = cel.make_annData_query (QueryData_raw) 55 | ``` 56 | 57 | 58 | It is important to make sure the query scRNA-seq/snRNA-seq contains all the gene in the trained model. 59 | 60 | ``` {.python} 61 | ## Load gene list 62 | filename = "pretrainmodel/Biogen/Reference_genes_8_075B.obj" 63 | filehandler = open(filename, 'rb') 64 | genenames = pickle.load(filehandler) 65 | 66 | ## Rearrange the data and filter the selected genes in the trained model. 67 | Qdata = QueryData[:,list(genenames)] 68 | cel.get_zscore(Qdata) 69 | ``` 70 | 71 | #### 3. Apply Pre-trained CeLEry model to the snRNA data 72 | 73 | The gene expression of the first cell (a 1X886 matrix) in the snRNA-seq data is given by: 74 | 75 | ``` {.python} 76 | Qdata[0].X.A 77 | ``` 78 | 79 | Load the CeLEry prediction model which is located at the 80 | `"../output/Biogene/models"` named as `Org_domain_075B`. We use CeLEry 81 | function `Predict_domain()` to conduct domain prediction for each single 82 | cells in the scRNA-seq/snRNA-seq data. The detailed arguments are 83 | explained as follows: 84 | 85 | - data_test: (AnnData object) the input scRNA-seq/snRNA-seq data 86 | - class_num: (int) the number of classes to be predicted. This value 87 | should be consistent with the number of domains in the training 88 | model. 89 | - path: (string) the location of the pre-trained model 90 | - filename: (string) the file name of the saved pre-trained model 91 | - predtype: (string) if predtype is \"probability\" (default) then a 92 | probability prediction matrix will be produced; if predtype is 93 | \"deterministic\", then the deterministic assignment based on the 94 | maximum probability prediction will be returned; if predtype is 95 | \"both\", then both predictions will be outputed. 96 | 97 | ## 3. Prediction 98 | 99 | Prediction of the first cell 100 | 101 | ``` {.python} 102 | model_location = "pretrainmodel/Biogen/Pretrained_model_075B.obj" 103 | 104 | pred_cord = cel.Predict_domain(data_test = Qdata[0], class_num = 8, path = "pretrainmodel/Biogen", filename = "Pretrained_model_075B", predtype = "probability") 105 | ``` 106 | 107 | 108 | Predict region labels of the entire scRNA-seq data and report the proportion of the cells on each domain. 109 | 110 | ``` {.python} 111 | pred_cord_all = cel.Predict_domain(data_test = Qdata, class_num = 8, path = "pretrainmodel/Biogen", filename = "Pretrained_model_075B", predtype = "deterministic") 112 | 113 | prop_count = pd.DataFrame(pred_cord_all).value_counts().sort_index() 114 | prop_weight = prop_count/sum(prop_count) 115 | prop_weight 116 | prop_weight.to_csv("output/Biogen/prop_8_075B_7G-1.csv") 117 | ``` 118 | 119 | The output of this example is: 120 | 121 | ``` 122 | 0 0.280068876 123 | 1 0.155832975 124 | 2 0.102539819 125 | 3 0.066465777 126 | 4 0.151183814 127 | 5 0.169436074 128 | 6 0.056048214 129 | 7 0.018424451 130 | ``` 131 | The first column corresponds to the domain in the training spatial transcriptomics data as in the previous figure. The second column reports the proportion of the cells located in different regions. 132 | 133 | 134 | ## 4. Visualization 135 | 136 | For the following part, we use the `ggplot()` in `R` to visualize the 137 | proportion predicted according to CeLEry. We are going to use the 138 | regions segmented from the spatial transcriptomics data to illustrate 139 | what the distribution looks like. 140 | 141 | ### 4.1 R packages 142 | 143 | ``` {.R} 144 | library(ggplot2) 145 | library(png) 146 | 147 | outputdir <- "output/Biogen/plots/" 148 | ``` 149 | 150 | 151 | ### 4.2 Plotting Functions 152 | 153 | The Density plot function use two input paths. 154 | 155 | - `obsdata_path` specifies the path of the observation data from the 156 | spatial transcriptomics data that are used to train the data, 157 | which are saved from the \".obs\" of the annotated data object in 158 | python. These files contain the spot ID, the locations of the spots, 159 | and the region ID, and will be used as the background of 160 | the visualization. 161 | 162 | - `prediction_path` specifies where the path of the prediction results 163 | locate. 164 | 165 | - `objectname` specifies the name of the output figure. 166 | 167 | ``` {.R} 168 | obsdata_path = "output/Biogen/obsdata_8_075B.csv" 169 | prediction_path = "output/Biogen/prop_8_075B_7G-1.csv" 170 | objectname = "BiogenExample" 171 | 172 | Density_plot <- function(obsdata_path, prediction_path, objectname){ 173 | obsdata <- read.csv(obsdata_path, header = T) 174 | maxx <- max(obsdata$x_cord) 175 | obsdata$minus_xcord <- maxx - obsdata$x_cord 176 | pred_CeLEry <- read.csv(prediction_path, header = T) 177 | colnames(pred_CeLEry) = c("Domain", "Density") 178 | dataplot <- merge(obsdata, pred_CeLEry, by.x = "refined_pred", by.y = "Domain") 179 | png(file = paste0(outputdir,"Density_plot_",objectname,".png"), height = 300, width = 450) 180 | DensityPlot2D <- ggplot(dataplot, aes(x = x_cord, y = y_cord) ) + 181 | theme_bw() + 182 | geom_point(aes(color = Density), size = 3) + #shape = 21, color = "black",, stroke = 0.3 183 | # scale_y_reverse() + 184 | scale_color_gradient(low = "#7E7F9A", high = "#F3DE8A") + 185 | theme(text=element_text(size=20, family="URWHelvetica"), axis.text = element_blank(), 186 | axis.ticks=element_blank(), 187 | panel.spacing = unit(1, "lines")) + 188 | theme(strip.background =element_rect(fill="#3F4536",color="#3F4536"))+ 189 | theme(strip.text = element_text(colour = 'white')) + 190 | theme(panel.border = element_rect(colour = "#3F4536")) + 191 | labs(x = NULL, y = NULL, color = "Proportion") 192 | print(DensityPlot2D) 193 | dev.off() 194 | } 195 | 196 | Density_plot(obsdata_path, prediction_path, objectname) 197 | ``` 198 | 199 | The output figures display the proportion of cells in the regions 200 | segmented in the training data. 201 | 202 | ![prediction results](figures/Density_plot_BiogenExample.png) 203 | -------------------------------------------------------------------------------- /tutorial/data/AlzheimerToy.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/AlzheimerToy.h5ad -------------------------------------------------------------------------------- /tutorial/data/DataLayerToy.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/DataLayerToy.h5ad -------------------------------------------------------------------------------- /tutorial/data/Mouse2D/MP1_SVG.py: -------------------------------------------------------------------------------- 1 | d_g={0: [], 1: ['PVALB', 'RPS29', '2900097C17RIK', 'YWHAH', 'CALB1', 'SLC1A2', 'RPL39', 'NSG1', 'CAR8', 'RPS19', 'PCP4', 'SLC1A3', 'ATP1A3', 'SPARCL1', 'GPR37L1', 'SPARC', 'RGS8', 'MT-ND1', 'MT-CO2', 'ITPR1', 'GNG13', 'MT-ND3', 'MT-ND4', 'MT-ND2', 'GPM6B', 'SPTBN2', 'FAM107A', 'TSPAN3', 'RPL23A', 'SLC6A1', 'S100B', 'HBA-A1', 'SBK1', 'NDRG2', 'GAD1', 'RPS28', 'METRN', 'INPP5A', 'LAMP1', 'GABRA1', 'GRIA1', 'CAMK2N1', 'VDAC1', 'HBA-A2', 'HOPX', 'GSTM1', 'TTYH1', 'DNER', 'MPC2', 'NDUFB6', 'MT-ND5', 'THY1', 'AHCYL1', 'HBB-BT', 'NDUFS8', 'HOMER3', 'TOMM7', 'CLEC2L', 'CSDC2', 'RTN4', 'LHX1OS', 'SCG3', 'SELENOF', 'CALR', 'PRKCG', 'GABARAPL1', 'CLSTN3', 'TMBIM6', 'FAM213A', 'ICMT', 'H2-D1', 'SOD1', '1810037I17RIK', 'CYSTM1', 'PLA2G7', 'TRIM9', 'WSB2', 'PPP1R16B', 'ATOX1', 'MT2', 'WASHC2', 'ACSBG1', 'ABAT', 'MTSS1L', 'GRIA4', 'MALAT1', 'PTPRZ1', 'PMM1', 'ALDH1A1', 'B2M', 'UBL3', 'SLC32A1', 'MAP1A', 'PPP1R17', 'SUB1', 'TSC22D4', 'HSPA5', 'CCK', 'GOT2', 'FRRS1L', 'SELENOP', 'ABHD12', 'GARNL3', 'TIMP4', 'PAQR8', 'INA', 'RORA', 'ADGRB1', 'ABHD17A', 'CABP1', 'INPP4A', 'SCP2', 'ANKS1B', 'RHEB', 'CACNA1G', 'GABRG2', 'PLEKHB2', 'RGS7BP', 'ANKRD40', 'TMEM59', 'OST4', 'PTN', 'SLC24A2', 'FXYD1', 'ELMOD1', 'ARHGAP5', 'GRID2', 'KCNA2', 'CCT7', 'CS', 'EMC7', 'BOLA3', 'LAPTM4A', 'MLC1', 'NEFL', 'DAD1', 'HTRA1', 'VIM', 'SFXN5', 'ERP29', 'S100A1', 'NTRK2', 'PRDX6', 'NTM', 'DLGAP4', 'ASRGL1', 'PLPP3', 'SEC62', 'ID2', 'LSM6', 'NOMO1', 'BAIAP2', 'FAM69B', 'MTSS1', 'GPX1', 'RELL2', 'SERPINI1', 'PSD2', 'VEGFB', 'SLC38A1', 'GNG5', 'OSBPL1A', 'LPGAT1', 'ELMO1', 'GLO1', 'RMDN3', 'MRPL52', 'LY6E', 'MT-ND4L', 'ARHGEF33', 'GSTM5', 'SLC25A23', 'CACNG2', 'SOD2', 'SLC20A1', 'HAPLN4', 'SYNDIG1', 'OAZ2', 'PPP1R1B', 'SLC1A6', 'GABBR2', 'RAMP1', 'KCNJ10', 'PABPC1', 'FAM162A', 'TRIM37', 'LUZP2', 'GRM1', 'DLG2', 'ABR', 'SELENOS', 'MRPL33', 'CHN1', 'FABP7', 'NTSR2', 'KIT', 'EFR3A', 'TSPAN13', 'NACC2', 'MT-ATP8', 'SHANK1', 'KIF5B', 'ARPC2', 'ATL2', 'GAD2', 'SHISA6', 'KCNG4', 'SDHC', 'ATP2A3', 'GRIA3', 'GFOD1', 'FAM107B', 'NEFM', 'TOLLIP', 'LXN', 'LRRN2', 'NRSN2', 'UHMK1', 'ERG28', 'GJA1', 'GNAL', 'RIDA', '1500009C09RIK', 'HEPACAM', 'ZFP385A', 'TRPC3', 'ITPKA', 'PPP1R14B', 'PLTP', 'CHPT1', 'KCTD12', 'BTBD1', 'DNAJC15', 'PRMT8', 'CDC42EP4', 'SLC24A3', 'TMEM47', 'SPOCK3', 'S1PR1'], 2: ['HBB-BT', 'AGT', 'NNAT', 'FXYD1', 'GFAP'], 3: ['SNAP25', 'LDHB', 'ATP5L', 'COX8A', 'COX6B1', 'COX5B', 'HSPA8', 'NDUFB8', 'CKB', 'SPARCL1', 'COX6C', 'CALM2', 'PCSK1N', 'CALM1', 'ALDOA', 'STMN3', 'ATP1A3', 'NDRG4', 'NDUFA4', 'SCN1B', '2010107E04RIK', 'ATP5B', 'ATP1B1', 'GNAS', 'ATP5A1', 'VSNL1', 'TUBA1B', 'COX7A2', 'ATP5J2', 'ATP5G3', 'UQCRQ', 'COX7B', 'SNRPN', 'MDH1', 'SLC25A4', 'CHCHD10', 'UQCRH', 'RAB3A', 'PKM', 'CPLX1', 'COX5A', 'ATP5J', 'UCHL1', 'ATPIF1', 'TPI1', 'USMG5', 'PVALB', 'NEFL', 'THY1', '2900097C17RIK', 'ENO2', 'DNM1', 'ZWINT', 'NDUFC1', 'ATP5K', 'CLSTN1', 'RTN1', 'EEF1A2', 'TCF25', 'SNCB', 'VAMP1', 'YWHAH', 'NSF', 'YWHAG', 'MAP1B', 'NCDN', 'PRDX5', 'STXBP1', 'ATP6V1B2', 'NEFM', 'GOT1', 'CEND1', 'TSPYL4', 'NAT8L', 'NSG1', 'ATP2B2', 'SLC12A5', 'CKMT1'], 4: [], 5: ['NRGN', 'CYFIP2', 'SERINC1', 'EEF1A2', 'APP', 'ZWINT', 'SYP', 'GPM6A', 'GNG3', 'CHN1', 'ATP6V1G2', 'CALM2', 'YWHAH', 'SLC17A7', 'VSNL1', 'CLSTN1', 'MEG3', 'SNHG11', 'RTN1', 'BASP1', 'DNM1', 'SYT1', 'SNAP25', 'CTXN1', 'CCK', 'VAMP2', 'STXBP1', '1110008P14RIK', 'PRKAR1B', 'CX3CL1', 'SCG5', 'LINGO1', 'ARPP19'], 6: ['APOD', 'GFAP'], 7: ['DCLK1', 'CALB1', 'FABP3', 'ITPKA', 'GAP43', 'GDA', 'MARCKS', 'CYP46A1', 'SYT5', 'ARPP21', 'HAP1', 'GRIA3', 'LINGO1', 'SYT4', 'NEGR1', 'LYPD1', 'TSPAN13', 'GABRA1', 'PTPN5', 'ATP2B4', 'MAL2', 'HPCAL1', 'RSRP1', 'NOV', 'DBPHT2', 'CAMK2D'], 8: ['RNF112', 'GRM4', 'SMG1', 'BMP1', 'TRIM62', 'PPFIA4', 'KNDC1', 'TESC', 'PLCH2', 'MPP3', 'ADAMTS10', 'CNTN2', 'TYRO3', 'BSN', 'PXN', 'SCN2A', 'TLE2', 'JPH4', 'USP3', 'IL16', 'LENG8', 'DUSP11', 'FAM131B', 'DOCK9', 'SEL1L3', 'JPH3', 'ODF2'], 9: ['RASGRP1', 'RAB6B', 'TRNP1', 'C1QTNF4', 'ATP2A2', 'YWHAZ', 'NCDN', 'ABHD8', 'REEP2', 'SPTBN1', 'CCK', 'CNTN1', 'SYP', 'SPOCK2', 'PCP4', 'PDP1', 'CAMK2N2', 'ADARB1', 'NSMF', 'INA', 'PTPN4', 'GABBR1', 'ATP2B2', 'KCTD17', 'CX3CL1', 'RAP1GDS1', 'TCF7L2', 'ADGRB1', 'RORA', 'SYT1', 'PRKCD', 'STMN4', 'CAMK2B', 'ATP6AP2', 'ATP2B1', 'RAP1GAP', 'NCS1', 'ELMO1', 'RGS7BP', 'GABBR2', 'SLC24A2', 'RIMS3', 'DLGAP3', 'KIF1A', 'CHN1', 'PPP1R9B', 'MYO5A', 'PCP4L1', 'ZIC1', 'SLC17A6', 'NDUFA10', 'NRXN1', 'ANKS1B', 'PRKCG', 'AMOTL1', 'CDK5R1', 'OGFRL1', 'GRIN1', 'EDIL3', '2900011O08RIK', 'SYT7', 'PLEKHG1', 'CIT', 'ADCY1', 'CCDC136', 'RAB3C', 'CDK16', 'NTNG1', 'CAMK2A', 'DLGAP4', 'SPOCK3', 'KCNC1', 'SYN1', 'AATK', 'LRRN2', 'BTBD3', 'TNNT1', 'KCNC2', 'KCNAB2', 'SLC17A7', 'SPOCK1', 'PTPN3', 'FBXL16', 'BOK', 'PCSK2', 'PSD3', 'HLF', 'KCNA2', 'AI593442', 'RAMP3', 'CD47', 'NRIP3', 'RGS16', 'SMIM13', 'KNDC1', 'BSN', 'RNF112', 'PITPNM1', 'GABRA4', 'PLCB4', 'LYNX1', 'HSPH1', 'ATXN7L3', 'SHANK3', 'GRM1', 'ZFP365', 'NRXN3', 'CELF2', 'NELL2', 'GABRD', 'REPS2', 'SYNPO2', 'RGS4', '6430548M08RIK', 'SLC6A17', 'NR2F1', 'SCN2B', 'NECAB2', 'TRIM9', 'CYP46A1', 'LINGO1', 'CRMP1', 'ZDHHC22', 'SYT13', 'FNDC4', 'GNAL', 'LRRTM1', 'SLC24A3', 'ILDR2', 'SETD7', 'TIAM1', 'NR1D1', 'TTC7B', 'CACNA1G', 'NT5DC3', 'PRKCE', 'KITL', 'CORO2B', 'FAM20C', 'EPHA4', 'CDKL5', 'MTURN', 'NMNAT2', 'KCNIP4', 'PITPNC1', 'ZMAT4', 'BHLHE40', 'NELL1', 'B230334C09RIK', 'SEZ6L', 'CLMN', 'ADGRA1', 'FRRS1L', 'CACNB4', 'SHANK1', 'ZNRF1', 'HRH3', 'KCND2', 'SHOX2', 'MEGF9', 'FGF13', 'L1CAM', '1810041L15RIK', 'PPP2R5D', 'KCNQ3', 'TANC1', 'PATJ', 'CHRNA4', 'LHFP', 'OCIAD2', 'FAM126B', 'ADRA1B', 'BRINP1', 'LRTM2', 'PTK2B', 'KCNJ9', 'CPNE9']} 2 | 3 | -------------------------------------------------------------------------------- /tutorial/data/MousePosteriorToy.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/MousePosteriorToy.h5ad -------------------------------------------------------------------------------- /tutorial/data/MouseSCToy.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/MouseSCToy.h5ad -------------------------------------------------------------------------------- /tutorial/figures/Density_plot_BiogenExample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/figures/Density_plot_BiogenExample.png -------------------------------------------------------------------------------- /tutorial/figures/segementation_8_075B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/figures/segementation_8_075B.png --------------------------------------------------------------------------------