├── CeLEry_package
    ├── CeLEry
    │   ├── ClusterVAE.py
    │   ├── DNN.py
    │   ├── TrainerExe.py
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── ClusterVAE.cpython-36.pyc
    │   │   ├── ClusterVAE.cpython-38.pyc
    │   │   ├── DNN.cpython-36.pyc
    │   │   ├── DNN.cpython-38.pyc
    │   │   ├── SpaCluster.cpython-36.pyc
    │   │   ├── TrainerExe.cpython-38.pyc
    │   │   ├── VanillaVAE.cpython-36.pyc
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── autoencoder.cpython-36.pyc
    │   │   ├── data_augmentation.cpython-38.pyc
    │   │   ├── datasetgenemap.cpython-36.pyc
    │   │   ├── datasetgenemap.cpython-38.pyc
    │   │   ├── fit_functions.cpython-38.pyc
    │   │   ├── type_.cpython-36.pyc
    │   │   ├── types_.cpython-36.pyc
    │   │   ├── types_.cpython-38.pyc
    │   │   ├── util.cpython-36.pyc
    │   │   ├── util.cpython-38.pyc
    │   │   ├── util_Mouse.cpython-36.pyc
    │   │   └── util_Mouse.cpython-38.pyc
    │   ├── data_augmentation.py
    │   ├── datasetgenemap.py
    │   ├── fit_functions.py
    │   ├── layers.py
    │   ├── types_.py
    │   ├── util.py
    │   └── util_Mouse.py
    ├── CeLEryPy.egg-info
    │   ├── PKG-INFO
    │   ├── SOURCES.txt
    │   ├── dependency_links.txt
    │   ├── requires.txt
    │   └── top_level.txt
    ├── LICENSE
    ├── README.md
    ├── dist
    │   ├── CeLEryPy-1.2.1-py3-none-any.whl
    │   └── CeLEryPy-1.2.1.tar.gz
    ├── pyproject.toml
    └── setup.py
├── LICENSE
├── README.md
├── code_paper
    ├── 1_LIBD
    │   ├── CeLEry_train_Scenario1and2.py
    │   ├── CeLEry_train_Scenario3and4.py
    │   ├── LIBDVisual.R
    │   ├── LIBDacc.R
    │   ├── README.md
    │   ├── prediction.py
    │   └── preprocess.py
    ├── 2_Alzheimer
    │   ├── CeLEry_train.py
    │   ├── README.md
    │   ├── preprocess.py
    │   └── test.py
    ├── 3_Mouse_10x_Visium
    │   ├── Mouse_CeLEry.ipynb
    │   ├── Mouse_Tangram.ipynb
    │   ├── Mouse_novosparc.ipynb
    │   └── Mouse_spotOTsc.ipynb
    ├── 4_Mouse_brain_MERFISH
    │   ├── CeLEry_brain.ipynb
    │   ├── CeLEry_figure 6_scenario 2.ipynb
    │   ├── CeLEry_figure 6_scenario 3.ipynb
    │   ├── README.md
    │   ├── SpaOTsc_brain.ipynb
    │   ├── Tangram_brain.ipynb
    │   ├── brain_result.ipynb
    │   └── novoSpaRc_brain.ipynb
    ├── 5_liver_MERFISH
    │   ├── CeLEry_liver.ipynb
    │   ├── README.md
    │   ├── SpaOTsc_liver.ipynb
    │   ├── Tangram_liver.ipynb
    │   ├── liver_result.ipynb
    │   └── novoSpaRc_liver.ipynb
    ├── 6_breast_cancer_10x_Xenium
    │   ├── 2D_locationRecovery
    │   │   ├── Xenium_BreastCancer_CELERY_Rep1_Scheme4_2DRecovery.ipynb
    │   │   ├── Xenium_BreastCancer_Tangram_Rep1_Scheme4_2DRecovery.ipynb
    │   │   ├── Xenium_BreastCancer_novosparc_Rep1_Scheme4_2DRecovery.ipynb
    │   │   └── Xenium_BreastCancer_spaOTsc_Rep1_Scheme4_2DRecovery.ipynb
    │   └── Domain_prediction
    │   │   ├── Xenium_BreastCancer_CELEREY_Scheme2_domainPred.ipynb
    │   │   ├── Xenium_BreastCancer_novosparc_Rep1_Scheme2_domainPred.ipynb
    │   │   ├── Xenium_BreastCancer_spaOTsc_Rep1_Scheme2_domainPred.ipynb
    │   │   └── Xenium_BreastCancer_tangram_Scheme2_domainPred.ipynb
    ├── 7_data_augmentation
    │   └── CeLEry-data-agumentation.ipynb
    └── 8_mouse_single_cell_prediction
    │   ├── Mouse_sc_analysis.py
    │   ├── Mouse_sc_analysis_spaOTsc_novosparc.ipynb
    │   ├── analysis-results.py
    │   └── preprocessing.py
├── docs
    └── asserts
    │   └── images
    │       └── workflow.png
├── pretrainmodel
    └── Biogen
    │   ├── Pretrained_model_075B.obj
    │   ├── Pretrained_model_075B_probmat.csv
    │   └── Reference_genes_8_075B.obj
└── tutorial
    ├── BiogenPretrain.ipynb
    ├── BiogenPretrain.md
    ├── data
        ├── AlzheimerToy.h5ad
        ├── DataLayerToy.h5ad
        ├── Mouse2D
        │   └── MP1_SVG.py
        ├── MousePosteriorToy.h5ad
        └── MouseSCToy.h5ad
    ├── figures
        ├── Density_plot_BiogenExample.png
        └── segementation_8_075B.png
    ├── tutorial.ipynb
    └── tutorial.md


/CeLEry_package/CeLEry/ClusterVAE.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | # from models import BaseVAE
  3 | from torch import nn
  4 | from torch.nn import functional as F
  5 | from . types_ import *
  6 | 
  7 | 
  8 | class ClusterVAE(nn.Module):
  9 | 
 10 | 	def __init__(self,  
 11 | 				 # in_channels: int,
 12 | 				 latent_dim: int,
 13 | 				 total_cluster: int,
 14 | 				 hidden: List = None,
 15 | 				 fgx = 2, fgy = 2,
 16 | 				 **kwargs) -> None:
 17 | 		super(ClusterVAE, self).__init__()
 18 | 
 19 | 		self.latent_dim = latent_dim
 20 | 		self.total_cluster = total_cluster
 21 | 		
 22 | 		scanx = fgx % 4 + 3
 23 | 		scany = fgy % 4 + 3
 24 | 		
 25 | 		if hidden is None:
 26 | 			hidden = [16, 8, 4, 8, 8]
 27 | 		
 28 | 		self.hidden = hidden
 29 | 		# encoder
 30 | 		self.encoderl1 = nn.Sequential( # like the Composition layer you built
 31 | 			nn.Conv2d(1, hidden[0], [scanx,scany]),  # 76,  116		   178 208   # 80, 86
 32 | 			nn.ReLU())
 33 | 		# self.encoderl2 = nn.Sequential(nn.MaxPool2d(2, stride=2))  #38, 58		 # 78, 82
 34 | 		self.encoderl3 = nn.Sequential(
 35 | 			nn.Conv2d(hidden[0], hidden[1], 4, stride=2),
 36 | 			nn.ReLU())	# 18, 28         # 38, 40
 37 | 		self.encoderl4 = nn.Sequential(
 38 | 			nn.Conv2d(hidden[1], hidden[2], 4, stride=2),     #18, 19
 39 | 			nn.ReLU())	# 15, 25
 40 | 		# decoder
 41 | 		self.decoderl4 = nn.Sequential(
 42 | 			nn.ConvTranspose2d(hidden[2], hidden[3], 4, stride=2),
 43 | 			nn.ReLU())	# 35, 54
 44 | 		self.decoderl3 = nn.Sequential(
 45 | 			nn.ConvTranspose2d(hidden[3], hidden[4], 4, stride=2),  
 46 | 			nn.ReLU())  # 38,57
 47 | 		# self.decoderl2 = nn.Sequential(
 48 | 		# 	nn.ConvTranspose2d(16, 8, 2, stride=2),  
 49 | 		# 	nn.ReLU())	 #76, 114
 50 | 		self.decoderl1 = nn.Sequential(
 51 | 			nn.ConvTranspose2d(hidden[4], 1, [scanx,scany]) #,
 52 | 			#nn.ReLU()
 53 | 			#nn.Sigmoid()
 54 | 			)
 55 | 			
 56 | 		self.enbedimx = int(((fgx - scanx + 1)/2-1)/2 -1)
 57 | 		self.enbedimy = int(((fgy - scany + 1)/2-1)/2 -1)
 58 | 		node_int = int(self.enbedimx * self.enbedimy * hidden[2])
 59 | 		self.fc_mu = nn.Linear(node_int, latent_dim)
 60 | 		self.fc_var = nn.Linear(node_int, latent_dim)
 61 | 		self.decoder_input = nn.Linear(self.latent_dim + self.total_cluster + 1, node_int)
 62 | 		
 63 | 		
 64 | 		if 'KLDw' in kwargs:
 65 | 			self.kld_weight = kwargs['KLDw']
 66 | 		else:
 67 | 			self.kld_weight = 1
 68 | 		
 69 | 		self.seed = 0
 70 | 
 71 | 	def encode(self, input: Tensor) -> List[Tensor]:
 72 | 		"""
 73 | 		Encodes the input by passing through the encoder network
 74 | 		and returns the latent codes.
 75 | 		:param input: (Tensor) Input tensor to encoder [N x C x H x W]
 76 | 		:return: (Tensor) List of latent codes
 77 | 		"""
 78 | 		result = self.encoderl1(input)
 79 | 		# result = self.encoderl2(result)
 80 | 		result = self.encoderl3(result)
 81 | 		result = self.encoderl4(result)
 82 | 		result = torch.flatten(result, start_dim=1)
 83 | 
 84 | 		# Split the result into mu and var components
 85 | 		# of the latent Gaussian distribution
 86 | 		mu = self.fc_mu(result)
 87 | 		log_var = self.fc_var(result)
 88 | 
 89 | 		return [mu, log_var]
 90 | 
 91 | 	def decode(self, z: Tensor) -> Tensor:
 92 | 		"""
 93 | 		Maps the given latent codes
 94 | 		onto the image space.
 95 | 		:param z: (Tensor) [B x D]
 96 | 		:return: (Tensor) [B x C x H x W]
 97 | 		"""
 98 | 		result = self.decoder_input(z)
 99 | 		result = result.view(-1, self.hidden[2], self.enbedimx, self.enbedimy)
100 | 		result = self.decoderl4(result)
101 | 		result = self.decoderl3(result)
102 | 		# result = self.decoderl2(result)
103 | 		result = self.decoderl1(result)
104 | 		return result
105 | 
106 | 	def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor:
107 | 		"""
108 | 		Reparameterization trick to sample from N(mu, var) from
109 | 		N(0,1).
110 | 		:param mu: (Tensor) Mean of the latent Gaussian [B x D]
111 | 		:param logvar: (Tensor) Standard deviation of the latent Gaussian [B x D]
112 | 		:return: (Tensor) [B x D]
113 | 		"""
114 | 		std = torch.exp(0.5 * logvar)
115 | 		torch.manual_seed(self.seed)
116 | 		eps = torch.randn_like(std)
117 | 		return eps * std + mu
118 | 
119 | 	def forward(self, input: Tensor, **kwargs) -> List[Tensor]:
120 | 		mu, log_var = self.encode(input[0])
121 | 		z = self.reparameterize(mu, log_var)
122 | 		zplus = torch.cat((z, input[1]), dim = 1)
123 | 		return  [self.decode(zplus), input, mu, log_var]
124 | 
125 | 	def loss_function(self,
126 | 					  *args,
127 | 					  **kwargs) -> dict:
128 | 		"""
129 | 		Computes the VAE loss function.
130 | 		KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2}
131 | 		:param args:
132 | 		:param kwargs:
133 | 		:return:
134 | 		"""
135 | 		recons = args[0]
136 | 		input = args[1]
137 | 		mu = args[2]
138 | 		log_var = args[3]
139 | 
140 | 		kld_weight = self.kld_weight  # Account for the minibatch samples from the dataset
141 | 		
142 | 		
143 | 		recons_loss = F.mse_loss(recons, input[0])
144 | 
145 | 
146 | 		kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0)
147 | 
148 | 		loss = recons_loss + kld_weight * kld_loss
149 | 		return {'loss': loss, 'Reconstruction_Loss':recons_loss, 'KLD':-kld_loss}
150 | 
151 | 		
152 | 		
class ClusterVAEmask(ClusterVAE):
	"""ClusterVAE variant whose reconstruction loss ignores zero-valued pixels.

	``forward`` additionally returns a 0/1 mask of the non-zero input pixels,
	and ``loss_function`` multiplies both reconstruction and target by it.
	"""

	def __init__(self,
		latent_dim: int,
		total_cluster: int,
		hidden: List = None,
		fgx = 2, fgy = 2,
		**kwargs) -> None:
		"""Forward every argument unchanged to ``ClusterVAE.__init__``."""
		super().__init__(latent_dim, total_cluster, hidden, fgx, fgy, **kwargs)

	def forward(self, input: Tensor, **kwargs) -> List[Tensor]:
		"""
		Encode ``input[0]``, sample, append ``input[1]``, decode, and build the mask.
		:param input: Indexable pair ``(image, conditioning vector)``
		:return: ``[reconstruction, input, mu, log_var, mask]`` where ``mask``
			is a float tensor that is 1 wherever ``input[0]`` is non-zero
		"""
		mu, log_var = self.encode(input[0])
		z = self.reparameterize(mu, log_var)
		zplus = torch.cat((z, input[1]), dim = 1)
		mask = (input[0] != 0) * 1
		return  [self.decode(zplus), input, mu, log_var, mask.float()]

	def loss_function(self,
					*args,
					**kwargs) -> dict:
		r"""
		Computes the masked VAE loss: MSE over masked images plus weighted KL.
		KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2}
		:param args: The list returned by ``forward``:
			``(recons, input, mu, log_var, mask)``
		:param kwargs: Unused
		:return: (dict) ``loss``, ``Reconstruction_Loss`` and ``KLD``
			(``KLD`` is reported with its sign flipped)
		"""
		recons, input, mu, log_var, mask = args[0], args[1], args[2], args[3], args[4]

		# Masked-out pixels become 0 on both sides and so contribute zero
		# error, but they still count in the mean's denominator.
		recons_loss = F.mse_loss(recons * mask, input[0] * mask)

		# Sum over latent dimensions, then average over the batch.
		kld_per_sample = -0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1)
		kld_loss = torch.mean(kld_per_sample, dim = 0)

		loss = recons_loss + self.kld_weight * kld_loss
		return {'loss': loss, 'Reconstruction_Loss':recons_loss, 'KLD':-kld_loss}
196 |  
197 | 
198 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/TrainerExe.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.optim as optim
  4 | import numpy as np
  5 | from tqdm import tqdm
  6 | 
  7 | class TrainerExe(object):
  8 | 	def __init__(self):
  9 | 		super(TrainerExe, self).__init__()
 10 | 		self.l=None
 11 | 
 12 | 	def set_l(self, l):
 13 | 		self.l=l
 14 |         
 15 | 	def train(self, model, train_loader, 
 16 | 		num_epochs=5, learning_rate=1e-3, annealing = False, KLDwinc = 0.02, n_incr =50, RCcountMax = 40):
 17 | 		self.learning_rate = 1e-2
 18 | 		if (self.learning_rate > learning_rate):
 19 | 			self.learning_rate = learning_rate
 20 | 		self.model = model
 21 | 		
 22 | 		optimizer = optim.Adam(self.model.parameters(),
 23 | 									lr=self.learning_rate, 
 24 | 									weight_decay=1e-5) 
 25 | 		RCcount = 0
 26 | 		loss_min = 99999999
 27 | 		for epoch in range(num_epochs):
 28 | 			total_loss = 0
 29 | 			for i, img in enumerate(tqdm(train_loader)):
 30 | 				recon = self.model(img)
 31 | 				loss = self.model.loss_function(*recon)
 32 | 				loss.get("loss").backward()
 33 | 				optimizer.step()
 34 | 				optimizer.zero_grad()
 35 | 				total_loss += loss.get("loss").data
 36 | 			print('Epoch:{}, Loss:{:.4f}'.format(epoch+1, float(total_loss)))
 37 | 			if (total_loss>loss_min):
 38 | 				RCcount = RCcount + 1
 39 | 				if (RCcount == RCcountMax):
 40 | 					RCcount = 0
 41 | 					self.learning_rate = self.learning_rate/2
 42 | 					optimizer.param_groups[0]['lr'] = self.learning_rate
 43 | 					loss_min = loss_min + 10
 44 | 					print('New learning rate:{}'.format(float(self.learning_rate)))
 45 | 			else:
 46 | 				loss_min = total_loss
 47 | 			if annealing:
 48 | 				self.model.seed = epoch
 49 | 				if epoch % n_incr == (n_incr-1):
 50 | 					self.model.kld_weight = self.model.kld_weight + KLDwinc
 51 | 					print('KLD weight annealing: increase {}. Now is :{:.4f}'.format(KLDwinc, float(self.model.kld_weight)))
 52 | 					loss_min = loss_min + 500
 53 | 			if (self.learning_rate < 1e-7):
 54 | 				break
 55 | 				
 56 | 	def get_predict(self,train_loader):
 57 | 		output = []
 58 | 		for i, img in enumerate(train_loader):
 59 | 			recon = self.model(img)
 60 | 			output.append(recon[0].detach().numpy()[0,0,:,:])
 61 | 		return(np.stack(output))
 62 | 	
 63 | 	def get_hidecode(self,train_loader):
 64 | 		output = []
 65 | 		for i, img in enumerate(tqdm(train_loader)):
 66 | 			embedding1 = self.model.encoderl1(img.float())
 67 | 			embedding2 = self.model.encoderl2(embedding1)
 68 | 			embedding3 = self.model.encoderl3(embedding2)
 69 | 			embedding4 = self.model.encoderl4(embedding3)
 70 | 			output.append(embedding4)
 71 | 		return(output)
 72 | 		
 73 | 	def deep_reshape(self, data, refer):
 74 | 		"""
 75 | 		Given generated data for a sample and a reference coordinates data, reshape the data by (location) X (Gene)
 76 | 		:param data: (Numpy) [nsample X Gene X location_x X location_y]
 77 | 		:return: (Numpy) [nsample X Gene X location(x X y filtered)]
 78 | 		"""
 79 | 		x = refer.iloc[:,0]
 80 | 		y = refer.iloc[:,1]
 81 | 		xmin = x.min()
 82 | 		xmax = x.max()
 83 | 		ymin = y.min()
 84 | 		ymax = y.max()
 85 | 		xlen = xmax - xmin + 1
 86 | 		ylen = ymax - ymin + 1
 87 | 		marker = np.zeros(xlen*ylen, dtype = bool)
 88 | 		for i in range(refer.shape[0]):
 89 | 			marker[(refer.iloc[i,0]-xmin)*ylen + refer.iloc[i,1] - ymin] = True
 90 | 		final = data[:,:,marker]
 91 | 		return(final)
 92 | 	
 93 | 	def fast_generation(self,train_loader, nsample):
 94 | 		"""
 95 | 		Given original gene-image data and the number of samples to be sampled
 96 | 		:param train_loader
 97 | 			   nsample: (Int) the number of samples
 98 | 		:return: (Numpy) [nsample X Gene X location(x X y filtered)]
 99 | 		"""
100 | 		output = []
101 | 		for i, img in enumerate(tqdm(train_loader)):
102 | 			outputinside = []
103 | 			self.model.seed = 0
104 | 			mu, log_var = self.model(img)[2:4]
105 | 			for j in range(nsample):
106 | 				self.model.seed = j
107 | 				z = self.model.reparameterize(mu, log_var)
108 | 				zplus = torch.cat((z, img[1]), dim = 1)
109 | 				outputi = self.model.decode(zplus)
110 | 				outputinside.append(outputi.detach().numpy()[0,0,:,:])
111 | 			output.append(np.stack(outputinside))
112 | 		final = np.stack(output)
113 | 		final2 = np.swapaxes( final,0,1)
114 | 		final3 = final2.reshape((final2.shape[0], final2.shape[1],-1) )
115 | 		return(final3)
116 | 		


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '1.2.1'
2 | from . util import *
3 | from . datasetgenemap import *
4 | from . DNN import *
5 | from . util_Mouse import *
6 | from . fit_functions import *
7 | from . data_augmentation import *
8 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/ClusterVAE.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/DNN.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/DNN.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/DNN.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/DNN.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/SpaCluster.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/SpaCluster.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/TrainerExe.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/TrainerExe.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/VanillaVAE.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/VanillaVAE.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/autoencoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/autoencoder.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/data_augmentation.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/data_augmentation.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/datasetgenemap.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/fit_functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/fit_functions.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/type_.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/type_.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/types_.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/types_.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/types_.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/types_.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/util.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/util.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-36.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/CeLEry/__pycache__/util_Mouse.cpython-38.pyc


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/data_augmentation.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import numpy as np
  4 | import pandas as pd
  5 | import torch
  6 | import pickle
  7 | 
  8 | from .util import *
  9 | 
 10 | from sklearn.cluster import KMeans
 11 | from . datasetgenemap import datagenemapclust
 12 | from . ClusterVAE import ClusterVAEmask
 13 | from . TrainerExe import TrainerExe
 14 | from . datasetgenemap import wrap_gene_domain
 15 | from . DNN import DNN
 16 | from . DNN import DNNordinal
 17 | from . DNN import DNNdomain
 18 | 
 19 | def seed_worker(worker_id):
 20 |     worker_seed = torch.initial_seed() % 2**32
 21 |     np.random.seed(worker_seed)
 22 |     random.seed(worker_seed)
 23 | 
 24 | def DataAugmentation (RefDataOrigin, obs_location = ['x_cord','y_cord'], path = "output/Project", filename = "SpatialTranscript", clusterready = False, n_clusters=100,  beta = 1e-5, nrep = 2, generateplot = True):
 25 |     #Prepare
 26 |     RefDataOriginsort = RefDataOrigin.obs.sort_values (by = obs_location)
 27 |     RefDataOrigin = RefDataOrigin[RefDataOriginsort.index]
 28 |     cdata = RefDataOrigin.copy()
 29 |     getGeneImg(cdata, emptypixel = 0, obsset = obs_location)
 30 |     cdataexpand =  np.expand_dims(cdata.GeneImg, axis=1) 
 31 |     #Clustering
 32 |     try:
 33 |         os.makedirs("{path}/DataAugmentation".format(path = path))
 34 |     except FileExistsError:
 35 |         print("Folder already exists")
 36 |     if clusterready:
 37 |         kmeansresults = np.load("{path}/DataAugmentation/{filename}_cluster.npy".format(path = path, filename = filename))
 38 |     else:
 39 |         kmeansmodel =  KMeans(n_clusters, random_state=0)
 40 |         cdatacentral = centralize(cdataexpand.copy())
 41 |         direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])]
 42 |         direflat = [x.flat for x in direclust]
 43 |         direflatnp = np.stack(direflat)
 44 |         kmeans = kmeansmodel.fit(direflatnp)
 45 |         kmeansresults = kmeans.labels_
 46 |         np.save("{path}/DataAugmentation/{filename}_cluster.npy".format(path = path, filename = filename), kmeansresults)
 47 |     # 
 48 |     full_RefData = datagenemapclust(cdataexpand, kmeansresults)
 49 |     CVAEmodel, clg = FitGenModel(path = path, filename = filename, traindata = full_RefData, cdataexpand = cdataexpand, Kmeans_cluster = kmeansresults, beta = beta)
 50 |     CVAEmodel, clg = FitGenModel_continue(path = path, filename = filename, model = CVAEmodel, clg = clg, traindata = full_RefData, beta = beta)
 51 |     if generateplot:
 52 |         print("Now generating the plots for the augmented data...")
 53 |         GeneratePlot(path, filename, beta = beta, traindata = full_RefData)
 54 |     Data_Generation(path, filename, obs_location = obs_location, beta= beta, dataSection1 = RefDataOrigin, traindata = full_RefData, nrep = nrep)
 55 | 
 56 | 
 57 | def FitGenModel (path, filename, traindata, cdataexpand, Kmeans_cluster, beta, hidden = [8,4,2,4,4], learning_rate = 1e-3,  number_error_try = 30):
 58 |     random.seed(2021)
 59 |     torch.manual_seed(2021)
 60 |     np.random.seed(2021)
 61 |     #
 62 |     trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 1, shuffle = True, worker_init_fn=seed_worker)
 63 |     ## Set up Autoencoder
 64 |     CVAEmodel = ClusterVAEmask(latent_dim = 511-Kmeans_cluster.max(), total_cluster = Kmeans_cluster.max(), fgx = cdataexpand.shape[2], fgy = cdataexpand.shape[3], KLDw = 0, hidden = hidden)
 65 |     CVAEmodel = CVAEmodel.float()
 66 |     file = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta)
 67 |     #
 68 |     ## Run Autoencoder 
 69 |     clg = TrainerExe()
 70 |     clg.train(model = CVAEmodel, train_loader = trainloader, num_epochs= 249, annealing = True, KLDwinc = beta/4, n_incr =50, RCcountMax = number_error_try, learning_rate = learning_rate)
 71 |     # Save the model to a local folder
 72 |     filehandler = open(file, 'wb') 
 73 |     pickle.dump(CVAEmodel, filehandler)
 74 |     print('save model to: {filename}'.format(filename = file))
 75 |     CVAEmodel.filename = file
 76 |     return CVAEmodel, clg
 77 | 
 78 | ## if still converging
 79 | def FitGenModel_continue (path, filename, model, clg, traindata, beta):
 80 |     trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 1, shuffle = True, worker_init_fn=seed_worker)
 81 |     #
 82 |     file = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta)
 83 |     clg.train(model = model, train_loader = trainloader, num_epochs= 200, annealing = False, RCcountMax = 5, learning_rate = clg.learning_rate)
 84 |     # Save the model to a local folder
 85 |     filehandler = open(file, 'wb') 
 86 |     pickle.dump(model, filehandler)
 87 |     print('save model to: {filename}'.format(filename=file))
 88 |     model.filename = file
 89 |     return model, clg
 90 | 
 91 | def GeneratePlot(path, filename, beta, traindata, sigma = 0):
 92 |     trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4)
 93 |     file = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta)
 94 |     # 
 95 |     filehandler = open(file, 'rb') 
 96 |     CVAEmodel = pickle.load(filehandler)
 97 |     #
 98 |     clg=TrainerExe()
 99 |     clg.model = CVAEmodel
100 |     try:
101 |         os.makedirs("{path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta}".format(path = path, file = filename, beta = beta))
102 |     except FileExistsError:
103 |         print("Folder {path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta} already exists".format(path = path, file = filename, beta = beta))
104 |     for j, img in enumerate(trainloader):
105 |         # img = next(dataloader_iterator)
106 |         plotGeneImg(img[0][0,0,:,:], filename = "{path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta}/img{j}".format(path = path, file = filename, beta = beta, j = j))
107 |         omin = img[0].min()
108 |         omax = img[0].max()
109 |         if sigma == 0:
110 |             sigma = (omax-omin)/6
111 |         for i in range(10):
112 |             CVAEmodel.seed = i
113 |             result = CVAEmodel(img) 
114 |             outputraw = result[0][0,0,:,:].detach().numpy()
115 |             outputimg = (outputraw + np.random.normal(0,sigma,outputraw.shape)) * result[4][0,0,:,:].detach().numpy()
116 |             plotGeneImg( outputimg , filename = "{path}/DataAugmentation/{file}_Generation/Glimps/Gen{beta}/img{j}var{i}".format(path = path, file = filename, beta = beta, j = j, i = i), range = (-3, 3))
117 | 
118 | 
def Data_Generation(path, filename, beta, dataSection1, traindata, nrep, obs_location = ['x_cord','y_cord']):
    """Sample ``nrep`` synthetic datasets from the saved model and save them.

    The model is loaded from
    ``<path>/DataAugmentation/<filename>_CVAE_<beta>.obj``; the result is saved to
    ``<path>/DataAugmentation/DataGen/<filename>_data_gen_<beta>_n<nrep>.npy``.

    :param path: Output root
    :param filename: Prefix of the model file and of the output file
    :param beta: Part of both file names
    :param dataSection1: Object whose ``.obs[obs_location]`` gives the
        coordinates used to select grid positions
    :param traindata: Dataset iterated with batch size 1
    :param nrep: Number of samples drawn per image
    :param obs_location: Names of the two coordinate columns in ``.obs``
    """
    trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4)
    random.seed(2021)
    torch.manual_seed(2021)
    np.random.seed(2021)
    #
    fileto = "{path}/DataAugmentation/{filename}_CVAE_{beta}.obj".format(path = path, filename = filename, beta = beta)
    # NOTE: pickle.load executes arbitrary code; only load model files that
    # this pipeline wrote itself.
    with open(fileto, 'rb') as filehandler:
        CVAEmodel = pickle.load(filehandler)
    #
    clg= TrainerExe()
    clg.model = CVAEmodel
    data_gen=clg.fast_generation(trainloader, nrep)
    # Drop the grid positions that carry no observation.
    data_gen_rs = clg.deep_reshape (data = data_gen, refer = dataSection1.obs[obs_location])
    try:
        os.makedirs("{path}/DataAugmentation/DataGen".format(path = path))
    except FileExistsError:
        print("Folder already exists")
    np.save("{path}/DataAugmentation/DataGen/{filename}_data_gen_{beta}_n{nrep}.npy".format(path = path, filename = filename, beta = beta, nrep = nrep), data_gen_rs)
139 | 
140 | 
def AugFit_domain (RefDataOrigin, domain_weights, domain_data = None, domainkey = "layer", hidden_dims =  [50, 10, 5], num_epochs_max = 500, beta = 1e-5, nrep = 2,  path = "../output/Biogene", filename = "SpatialTranscript", batch_size = 4, num_workers = 4, number_error_try = 15, initial_learning_rate = 0.0001, seednum = 2021):
    """Train a domain classifier on generated plus original data and pickle the fitted model.

    Generated data is loaded from
    ``{path}/DataAugmentation/DataGen/{filename}_data_gen_{beta}_n{nrep}.npy``
    and the trained model is written to
    ``{path}/DataAugmentation/PredictionModel/{filename}_domain_{beta}_n{nrep}.obj``.

    Arguments:
        RefDataOrigin: reference data exposing ``.X`` and ``.obs``.
        domain_weights: array whose first dimension gives the number of classes;
            also passed to the model as ``importance_weights``.
        domain_data: table holding the domain labels; defaults to ``RefDataOrigin.obs``.
        domainkey: column of ``domain_data`` holding the domain label.
        hidden_dims: hidden layer sizes forwarded to ``DNNdomain``.
        num_epochs_max: forwarded to the trainer as ``num_epochs``.
        beta, nrep: only used to build the input / output file names.
        path, filename: output root folder and file name prefix.
        batch_size, num_workers: ``DataLoader`` settings.
        number_error_try: forwarded to the trainer as ``RCcountMax``.
        initial_learning_rate: forwarded to the trainer as ``learning_rate``.
        seednum: seed applied to ``random``, ``torch`` and ``numpy``.
    """
    random.seed(seednum)
    torch.manual_seed(seednum)
    np.random.seed(seednum)
    if domain_data is None:
        domain_data = RefDataOrigin.obs
    #
    # Generated replicates produced beforehand by the data generation step
    data_gen_rs = np.load("{path}/DataAugmentation/DataGen/{filename}_data_gen_{beta}_n{nrep}.npy".format(path = path, filename = filename, beta = beta, nrep = nrep))
    # Attach the original data as one additional sample: add a leading axis and
    # swap the last two axes so that it can be stacked onto the generated array.
    tdatax = np.expand_dims(RefDataOrigin.X, axis = 0)
    tdata_rs = np.swapaxes(tdatax, 1, 2)
    datacomp = np.concatenate((data_gen_rs, tdata_rs), axis=0)
    #
    dataDNN = wrap_gene_domain(datacomp, domain_data, domainkey)
    CoReg_loader = torch.utils.data.DataLoader(dataDNN, batch_size=batch_size, num_workers = num_workers, shuffle = True, worker_init_fn=seed_worker)
    # Create the deep neural network; a copy of hidden_dims is passed so that the
    # shared default list can never be modified by the model.
    DNNmodel = DNNdomain( in_channels = data_gen_rs.shape[1], num_classes = domain_weights.shape[0], hidden_dims = list(hidden_dims), importance_weights = domain_weights)
    DNNmodel = DNNmodel.float()
    #
    CoReg = TrainerExe()
    CoReg.train(model = DNNmodel, train_loader = CoReg_loader, num_epochs= num_epochs_max, RCcountMax = number_error_try, learning_rate = initial_learning_rate)
    #
    try:
        os.makedirs("{path}/DataAugmentation/PredictionModel".format(path = path))
    except FileExistsError:
        print("Note: Folder {path}/DataAugmentation/PredictionModel already exists".format(path = path))
    filename2 = "{path}/DataAugmentation/PredictionModel/{filename}_domain_{beta}_n{nrep}.obj".format(filename = filename, path = path, beta = beta, nrep = nrep)
    # Close the handle explicitly so the pickled model is fully flushed to disk.
    with open(filename2, 'wb') as filehandler2:
        pickle.dump(DNNmodel, filehandler2)
171 | 
172 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/datasetgenemap.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | from torch.utils.data import TensorDataset
  4 | 
  5 | 
  6 | 
  7 | class datasetgenemap(TensorDataset):
  8 | 	"""Dataset wrapping unlabeled data tensors.
  9 | 	No longer used.
 10 | 	Each sample will be retrieved by indexing tensors along the first
 11 | 	dimension.
 12 | 
 13 | 	Arguments:
 14 | 		datainput (numpy array): contains sample data.
 15 | 	"""
 16 | 	def __init__(self, datainput):
 17 | 		self.data_tensor = torch.from_numpy(datainput).float()
 18 | 
 19 | 	def __getitem__(self, index):
 20 | 		return self.data_tensor[index].astype(np.float32)
 21 | 
 22 | 	def __len__(self):
 23 | 		return len(self.data_tensor)
 24 | 
 25 | 
 26 | 
 27 | class datagenemapclust(TensorDataset):
 28 | 	"""Dataset wrapping labeled (cluster label) data tensors with cluster information.
 29 | 	Used in data augmentation models
 30 | 	Each sample will be retrieved by indexing tensors along the first
 31 | 	dimension.
 32 | 
 33 | 	Arguments:
 34 | 		datainput (numpy array): contains sample data.
 35 | 	"""
 36 | 	def __init__(self, datainput, label):
 37 | 		self.data_tensor = torch.from_numpy(datainput).float()
 38 | 		self.maxnum = label.max()
 39 | 		self.clustempty = np.zeros(self.maxnum + 1,'float32')
 40 | 		self.label = label
 41 | 
 42 | 	def __getitem__(self, index):
 43 | 		image = self.data_tensor[index]
 44 | 		cluster = self.clustempty.copy()
 45 | 		cluster[self.label[index]] = 1
 46 | 		return image, torch.from_numpy(cluster).float()
 47 | 
 48 | 	def __len__(self):
 49 | 		return len(self.data_tensor)
 50 | 
 51 | 
 52 | 
 53 | class wrap_gene_location(TensorDataset):
 54 | 	"""Dataset wrapping labeled (cluster label) data tensors with cluster information.
 55 | 	Used in data prediction models
 56 | 	Each sample will be retrieved by indexing tensors along the first
 57 | 	dimension.
 58 | 
 59 | 	Arguments:
 60 | 		datainput (numpy array): contains sample data.
 61 | 	"""
 62 | 	def __init__(self, datainput, label):
 63 | 		self.data_tensor = torch.from_numpy(datainput).float()
 64 | 		cord = label.to_numpy().astype('float32')
 65 | 		cordx = cord[:,0]
 66 | 		cordy = cord[:,1]
 67 | 		self.xmin = cordx.min()-1
 68 | 		self.ymin = cordy.min()-1
 69 | 		self.xmax = cordx.max()+1
 70 | 		self.ymax = cordy.max()+1
 71 | 		self.cordx_norm = (cordx - self.xmin)/(self.xmax-self.xmin)
 72 | 		self.cordy_norm = (cordy - self.ymin)/(self.ymax-self.ymin)
 73 | 		self.imagedimension = self.data_tensor.shape
 74 | 	def __getitem__(self, index):
 75 | 		indexsample = index // self.imagedimension[2]
 76 | 		indexspot = index % self.imagedimension[2]
 77 | 		geneseq = self.data_tensor[indexsample,:,indexspot]
 78 | 		cordinates = torch.tensor([self.cordx_norm[indexspot],self.cordy_norm[indexspot]])
 79 | 		return geneseq, cordinates
 80 | 	def __len__(self):
 81 | 		return self.imagedimension[0] * self.imagedimension[2]
 82 | 
 83 | 
 84 | class wrap_gene_layer(TensorDataset):
 85 | 	"""Dataset wrapping labeled (cluster label) data tensors with cluster information.
 86 | 	Used in data prediction models
 87 | 	Each sample will be retrieved by indexing tensors along the first
 88 | 	dimension.
 89 | 
 90 | 	Arguments:
 91 | 		datainput (numpy array): contains sample data.
 92 | 		layer (boolean): T if layer information is contained
 93 | 		layerkey: the keyword for layer. Default is "Layer"
 94 | 	"""
 95 | 	def __init__(self, datainput, label, layerkey = "layer"):
 96 | 		self.data_tensor = torch.from_numpy(datainput).float()
 97 | 		getlayer = label[layerkey].to_numpy()
 98 | 		self.layer = getlayer.astype('float32')
 99 | 		self.layersunq = np.sort(np.unique(self.layer))
100 | 		self.nlayers = len(self.layersunq)
101 | 		self.imagedimension = self.data_tensor.shape
102 | 	def __getitem__(self, index):
103 | 		indexsample = index // self.imagedimension[2]
104 | 		indexspot = index % self.imagedimension[2]
105 | 		geneseq = self.data_tensor[indexsample,:,indexspot]
106 | 		layeri = int(self.layer[indexspot]) - 1
107 | 		layerv = np.zeros(self.nlayers-1)
108 | 		layerv[:layeri] = 1
109 | 		return geneseq, layerv
110 | 	def __len__(self):
111 | 		return self.imagedimension[0] * self.imagedimension[2]
112 | 
113 | 
class wrap_gene_domain(TensorDataset):
	"""Dataset yielding one (gene expression vector, integer domain label) pair per spot.
	Used in data prediction models.
	The flat index runs over all spots of the first sample, then all spots of
	the second sample, and so on.

	Arguments:
		datainput (numpy array): contains sample data; indexed as
			[sample, :, spot] when an item is retrieved.
		label: table holding the domain label of every spot.
		layerkey: the column of ``label`` holding the domain. Default is "layer".
	"""
	def __init__(self, datainput, label, layerkey = "layer"):
		self.data_tensor = torch.from_numpy(datainput).float()
		self.imagedimension = self.data_tensor.shape
		self.layer = label[layerkey].to_numpy().astype('float32')
		self.layersunq = np.sort(np.unique(self.layer))
		self.nlayers = len(self.layersunq)
	def __len__(self):
		n_samples, n_spots = self.imagedimension[0], self.imagedimension[2]
		return n_samples * n_spots
	def __getitem__(self, index):
		n_spots = self.imagedimension[2]
		sample_id = index // n_spots
		spot_id = index % n_spots
		expression = self.data_tensor[sample_id,:,spot_id]
		# Labels are stored as float32; hand them out as int64.
		domain = self.layer[spot_id].astype('int64')
		return expression, domain
140 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/layers.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import scipy
 4 | 
 5 | import math
 6 | import torch
 7 | from torch.nn.parameter import Parameter
 8 | from torch.nn.modules.module import Module
 9 | from torch.utils.data import DataLoader
10 | 
11 | import matplotlib.pyplot as plt
12 | 
13 | 
class ConvolutionNN(Module):
    """
    Simple CNN layer

    NOTE(review): the original constructor signature was left unfinished
    (``def __init__(self, , )``), which is a syntax error and prevented this
    module from being imported. It is replaced by a valid, parameter-free
    placeholder; the actual layer definition still has to be supplied.
    """
    def __init__(self):
        super().__init__()
19 | 
def getGeneImg (adata, geneset = None):
    """Show one spatial expression image per gene of an AnnData-like object.

    adata: the input data; ``adata.obs`` must provide the columns "x2" and "x3"
        (used as pixel row / column) and ``adata.X`` must support
        ``X[:, i]`` followed by ``[spot, 0]`` indexing.
    geneset: the set of genes considered. Only ``None`` is handled; for any
        other value the function returns without doing anything.
    """
    if geneset is not None:
        return
    x = adata.obs[["x2"]]
    y = adata.obs[["x3"]]
    xmin, xmax = x.min().iloc[0], x.max().iloc[0]
    ymin, ymax = y.min().iloc[0], y.max().iloc[0]
    n_spots = x.shape[0]
    for gene in range(adata.X.shape[1]):
        z = adata.X[:,gene]
        zmin, zmax = z.min(), z.max()
        # One pixel per integer coordinate inside the bounding box of the spots
        shape = (xmax-xmin+1,ymax-ymin+1)
        img = np.ma.array(np.zeros(shape))
        for spot in range(n_spots):
            row = x.iloc[spot,0]-xmin
            col = y.iloc[spot,0]-ymin
            img[row,col] = z[spot,0]
        # Pixels still equal to zero are masked out of the coloured layer
        img.mask = (img==0)
        # Gray background first, coloured expression layer on top
        plt.imshow(np.zeros(shape),cmap=plt.get_cmap('gray'),interpolation='none')
        plt.imshow(img,cmap=plt.get_cmap('jet'),interpolation='none',vmin=zmin,vmax=zmax)
        plt.colorbar()
        plt.show()
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEry/types_.py:
--------------------------------------------------------------------------------
from typing import List, Callable, Union, Any, TypeVar, Tuple
from torch import tensor as Tensor

# NOTE(review): this assignment rebinds ``Tensor``; after it the name refers to a
# TypeVar whose declared name is 'torch.tensor', not to the ``torch.tensor``
# function imported on the line above.
Tensor = TypeVar('torch.tensor')
5 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEryPy.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 2.1
 2 | Name: CeLEryPy
 3 | Version: 1.2.1
Summary: Leverage spatial transcriptomics data to recover cell locations in single-cell RNA-seq
 5 | Author-email: Qihuang Zhang <qihuang.zh@gmail.com>
 6 | License: Copyright (c) 2022 The Python Packaging Authority
 7 |         
 8 |         Permission is hereby granted, free of charge, to any person obtaining a copy
 9 |         of this software and associated documentation files (the "Software"), to deal
10 |         in the Software without restriction, including without limitation the rights
11 |         to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 |         copies of the Software, and to permit persons to whom the Software is
13 |         furnished to do so, subject to the following conditions:
14 |         
15 |         The above copyright notice and this permission notice shall be included in all
16 |         copies or substantial portions of the Software.
17 |         
18 |         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 |         IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 |         FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 |         AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 |         LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 |         OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 |         SOFTWARE.
25 | Project-URL: Homepage, https://github.com/QihuangZhang/CeLEry
26 | Keywords: CeLEry,spatial transcriptomics,scRNA-seq
27 | Classifier: License :: OSI Approved :: MIT License
28 | Classifier: Programming Language :: Python
29 | Classifier: Programming Language :: Python :: 3
30 | Requires-Python: >=3.8
31 | Description-Content-Type: text/markdown
32 | License-File: LICENSE
33 | Requires-Dist: torch>=1.8
34 | Requires-Dist: pandas>=1.4
35 | Requires-Dist: numpy>=1.20
36 | Requires-Dist: scipy
37 | Requires-Dist: tqdm
38 | Requires-Dist: scanpy>=1.5
39 | Requires-Dist: scikit-image
40 | Requires-Dist: anndata
41 | Requires-Dist: scikit-learn
42 | 
43 | # CeLEry
## Leveraging spatial transcriptomics data to recover cell locations in single-cell RNA-seq with CeLEry
45 | 
46 | ### Qihuang Zhang, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li*
47 | 
Single-cell RNA sequencing provides rich information for studying cells systematically. However, the spatial location of each cell is usually unavailable. We present CeLEry, a supervised deep learning algorithm that recovers the tissue origin of cells with the assistance of spatial transcriptomics data, integrating a data augmentation procedure via a variational autoencoder to improve the robustness of the method against overfitting and data contamination. CeLEry provides a generic framework and can be applied to multiple tasks depending on the research objectives, including spatial coordinate discovery as well as layer discovery. It can make use of spatial transcriptomics data from multiple tissues. Thorough assessments show that CeLEry achieves leading performance compared to state-of-the-art methods. We illustrate the usage of CeLEry in the discovery of neuron cell layers to study the development of Alzheimer's disease. The identified cell location information is valuable in many downstream analyses and can be indicative of the spatial organization of the tissues.
49 | 
50 | ## System Requirements
Python support packages: torch>=1.8, pandas>=1.4, numpy>=1.20, scipy, tqdm, scanpy>=1.5, scikit-image, anndata, scikit-learn
52 | 
53 | ## To install package
54 | In the command, input
55 | ```
56 | pip install CeLEryPy
57 | ```
58 | 
59 | 
60 | To load the package, input
61 | ```
62 | import CeLEry
63 | ```
64 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEryPy.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LICENSE
 2 | README.md
 3 | pyproject.toml
 4 | setup.py
 5 | CeLEry/ClusterVAE.py
 6 | CeLEry/DNN.py
 7 | CeLEry/TrainerExe.py
 8 | CeLEry/__init__.py
 9 | CeLEry/data_augmentation.py
10 | CeLEry/datasetgenemap.py
11 | CeLEry/fit_functions.py
12 | CeLEry/layers.py
13 | CeLEry/types_.py
14 | CeLEry/util.py
15 | CeLEry/util_Mouse.py
16 | CeLEryPy.egg-info/PKG-INFO
17 | CeLEryPy.egg-info/SOURCES.txt
18 | CeLEryPy.egg-info/dependency_links.txt
19 | CeLEryPy.egg-info/requires.txt
20 | CeLEryPy.egg-info/top_level.txt


--------------------------------------------------------------------------------
/CeLEry_package/CeLEryPy.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEryPy.egg-info/requires.txt:
--------------------------------------------------------------------------------
 1 | torch>=1.8
 2 | pandas>=1.4
 3 | numpy>=1.20
 4 | scipy
 5 | tqdm
 6 | scanpy>=1.5
 7 | scikit-image
 8 | anndata
 9 | scikit-learn
10 | 


--------------------------------------------------------------------------------
/CeLEry_package/CeLEryPy.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | CeLEry
2 | 


--------------------------------------------------------------------------------
/CeLEry_package/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2022 The Python Packaging Authority
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.


--------------------------------------------------------------------------------
/CeLEry_package/README.md:
--------------------------------------------------------------------------------
 1 | # CeLEry
## Leveraging spatial transcriptomics data to recover cell locations in single-cell RNA-seq with CeLEry
 3 | 
 4 | ### Qihuang Zhang, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li*
 5 | 
Single-cell RNA sequencing provides rich information for studying cells systematically. However, the spatial location of each cell is usually unavailable. We present CeLEry, a supervised deep learning algorithm that recovers the tissue origin of cells with the assistance of spatial transcriptomics data, integrating a data augmentation procedure via a variational autoencoder to improve the robustness of the method against overfitting and data contamination. CeLEry provides a generic framework and can be applied to multiple tasks depending on the research objectives, including spatial coordinate discovery as well as layer discovery. It can make use of spatial transcriptomics data from multiple tissues. Thorough assessments show that CeLEry achieves leading performance compared to state-of-the-art methods. We illustrate the usage of CeLEry in the discovery of neuron cell layers to study the development of Alzheimer's disease. The identified cell location information is valuable in many downstream analyses and can be indicative of the spatial organization of the tissues.
 7 | 
 8 | ## System Requirements
Python support packages: torch>=1.8, pandas>=1.4, numpy>=1.20, scipy, tqdm, scanpy>=1.5, scikit-image, anndata, scikit-learn
10 | 
11 | ## To install package
12 | In the command, input
13 | ```
14 | pip install CeLEryPy
15 | ```
16 | 
17 | 
18 | To load the package, input
19 | ```
20 | import CeLEry
21 | ```


--------------------------------------------------------------------------------
/CeLEry_package/dist/CeLEryPy-1.2.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/dist/CeLEryPy-1.2.1-py3-none-any.whl


--------------------------------------------------------------------------------
/CeLEry_package/dist/CeLEryPy-1.2.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/CeLEry_package/dist/CeLEryPy-1.2.1.tar.gz


--------------------------------------------------------------------------------
/CeLEry_package/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0.0", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "CeLEryPy"
 7 | version = "1.2.1"
description = "Leverage spatial transcriptomics data to recover cell locations in single-cell RNA-seq"
 9 | readme = "README.md"
10 | authors = [{ name = "Qihuang Zhang", email = "qihuang.zh@gmail.com"}]
11 | license = { file = "LICENSE" }
12 | classifiers = [
13 |     "License :: OSI Approved :: MIT License",
14 |     "Programming Language :: Python",
15 |     "Programming Language :: Python :: 3",
16 | ]
17 | keywords = ["CeLEry", "spatial transcriptomics", "scRNA-seq"]
18 | dependencies = [
19 |     "torch >= 1.8",
20 |     "pandas >= 1.4",
21 |     "numpy >= 1.20",
22 |     "scipy",
23 |     "tqdm",
24 |     "scanpy >= 1.5",
25 |     "scikit-image",
26 |     "anndata",
27 |     "scikit-learn",
28 | ]
29 | requires-python = ">=3.8"
30 | 
31 | [project.urls]
32 | Homepage = "https://github.com/QihuangZhang/CeLEry"
33 | 
34 | 


--------------------------------------------------------------------------------
/CeLEry_package/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup()


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Qihuang Zhang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CeLEry
## Leveraging spatial transcriptomics data to recover cell locations in single-cell RNA-seq with CeLEry
 3 | 
 4 | ### Qihuang Zhang*, Shunzhou Jiang, Amelia Schroeder, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li*
 5 | 
 6 | Single-cell RNA sequencing (scRNA-seq) has transformed our understanding of cellular heterogeneity in health and disease, but the lack of physical relationships among dissociated cells has limited its applications. Here we present CeLEry, a supervised deep learning algorithm to recover the spatial origins of cells in scRNA-seq by leveraging gene expression and spatial location information learned from spatial transcriptomics (ST) data. CeLEry has a data augmentation procedure via variational autoencoder to improve the robustness of the method and overcome noise in scRNA-seq. CeLEry can infer the spatial origins of cells in scRNA-seq at multiple levels, including 2D location as well as the spatial domain or tissue layer of a cell. CeLEry also provides uncertainty estimates for the recovered location information. Comprehensive evaluations on multiple datasets generated from mouse and human brains show that CeLEry can reliably recover the spatial location information for cells in scRNA-seq.
 7 | 
 8 | ![CeLEry workflow](docs/asserts/images/workflow.png)
 9 | 
*The implementation procedure of CeLEry*:
11 | - CeLEry takes spatial transcriptomic data as input for the training data and the scRNA-seq as testing data set. 
12 | - CeLEry optionally generates replicates of the spatial transcriptomic data via variational autoencoder then includes them as the training data together with original spatial transcriptomic data. 
13 | - A deep neural network is trained to learn the relationship between the spotwise gene expression and location information, minimizing the loss functions that are specified according to the specific problem. 
14 | 
15 | 
16 | 
17 | ## Usage
18 | 
19 | The [**CeLEry**](https://github.com/QihuangZhang/CeLEry) package is an implementation of a deep neural network in discovering location information for single cell RNA data. With CeLEry, you can:
20 | 
21 | - Preprocess spatial transcriptomics data from various formats.
22 | - Build a deep neural network to predict cell locations.
23 | - Generate synthetic spatial transcriptomic data.
24 | 
25 | 
26 | 
27 | ## Tutorial
28 | 
29 | 
30 | A Jupyter Notebook of the tutorial is accessible from : 
31 | <br>
32 | https://github.com/QihuangZhang/CeLEry/blob/main/tutorial/tutorial.md
33 | <br>
34 | 
35 | 
36 | The tutorial of the Biogen pretrain model can be accessible from : 
37 | <br>
38 | https://github.com/QihuangZhang/CeLEry/blob/main/tutorial/BiogenPretrain.md
39 | <br>
40 | 
41 | # System Requirements
42 | 
43 | ## Hardware Requirements
44 | 
45 | The `CeLEry` package requires only a standard computer with enough RAM to support the operations defined by a user. For minimal performance, this will be a computer with about 2 GB of RAM. For optimal performance, we recommend a computer with the following specs:
46 | 
47 | RAM: 16+ GB  
48 | CPU: 4+ cores, 3.3+ GHz/core
49 | 
50 | ## Software Requirements
51 | 
52 | ### OS Requirements
53 | 
54 | The package development version is tested on *Linux* operating systems. The developmental version of the package has been tested on the following systems:
55 | 
56 | Linux: kernel 3.10.0 
57 | Mac OSX:  
58 | Windows:  
59 | 
60 | ## System Requirements
Python (>=3.8) support packages: torch>=1.8, pandas>=1.4, numpy>=1.20, scipy, tqdm, scanpy>=1.5, anndata, scikit-learn, scikit-image
62 | 
63 | 
64 | # Install packages
65 | In the command, input
66 | ```
67 | pip install CeLEryPy
68 | ```
69 | 
The installation of the CeLEry Python package takes approximately 5 minutes.
71 | 


--------------------------------------------------------------------------------
/code_paper/1_LIBD/CeLEry_train_Scenario1and2.py:
--------------------------------------------------------------------------------
  1 | #!-### Note: Need to run "preprocess.py" first to obtain the available datasets.
  2 | 
  3 | 
  4 | ## In this version of Cell Location discovEry (LIBD) we consider region of a tissue   under Scenarios 1 and 2
  5 | 
  6 | # Application to LIBD data
  7 | 
  8 | import os,csv,re
  9 | import pandas as pd
 10 | import numpy as np
 11 | import scanpy as sc
 12 | import math
 13 | 
 14 | from skimage import io, color
 15 | from sklearn.cluster import KMeans
 16 | 
 17 | from scipy.sparse import issparse
 18 | import random, torch
 19 | import warnings
 20 | warnings.filterwarnings("ignore")
 21 | import matplotlib.colors as clr
 22 | import matplotlib.pyplot as plt
 23 | import pickle
 24 | 
 25 | #Read original data and save it to h5ad
 26 | from scanpy import read_10x_h5
 27 | #import SpaGCN as spg
 28 | import CeLEry as cel
 29 | 
 30 | from data.LIBD.LIBD_gene_select import d_g
 31 | 
 32 | # import tangram as tg
 33 | 
##  1. Data Preparation --------------------------------------------------------------------------
### Load LIBD sample 151673: regarded as spatial transcriptomic data
dataSection1 = sc.read("../data/LIBD/data_151673.h5ad")


## Conduct clustering
# Work on a copy so that the loaded data stays untouched.
cdata = dataSection1.copy()
# presumably populates cdata.GeneImg (read on the next line) — TODO confirm
cel.getGeneImg(cdata,emptypixel = 0)
# Insert a new axis at position 1 of the gene image array
cdataexpand =  np.expand_dims(cdata.GeneImg, axis=1) 

cdatacentral = cel.centralize(cdataexpand.copy())
# Flatten every image along the first axis into one row of the clustering input
direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])]
direflat = [x.flat for x in direclust]
direflatnp = np.stack(direflat)

# implementing k-means clustering (100 clusters, fixed random state)
kmeansmodel =  KMeans(n_clusters=100, random_state=0)
kmeans = kmeansmodel.fit(direflatnp)
# The cluster labels are stored on disk and reloaded in the augmentation step
np.save("../output/LIBD/cluster_673.npy", kmeans.labels_)


## Calculating z-score
cel.get_zscore(dataSection1)

# get sorted indices: order the observations by the columns 'x2' and 'x3'

dataSection1sort = dataSection1.obs.sort_values (by = ['x2','x3'])
dataSection1 = dataSection1[dataSection1sort.index]
 62 | 
 63 | def seed_worker(worker_id):
 64 |     worker_seed = torch.initial_seed() % 2**32
 65 |     np.random.seed(worker_seed)
 66 |     random.seed(worker_seed)
 67 | 
##  2. Data Augmentation --------------------------------------------------------------------------
# Rebuild the gene images from dataSection1 as it stands after the preparation step.
cdata = dataSection1.copy()
# presumably populates cdata.GeneImg (read on the next line) — TODO confirm
cel.getGeneImg(cdata,emptypixel = 0)
cdataexpand =  np.expand_dims(cdata.GeneImg, axis=1) 
np.save("../output/LIBD/full_geneimg.npy", cdataexpand)

# Read the gene image array saved just above back from disk
cdataexp_full = np.load("../output/LIBD/full_geneimg.npy")


# Load the k-means cluster labels saved during data preparation
Kmeans_cluster = np.load("../output/LIBD/cluster_673.npy")

# Dataset pairing every gene image with its cluster label; used by the functions below
full = cel.datagenemapclust(cdataexp_full,Kmeans_cluster)
 82 | 
 83 | 
## Step 1: Model Fitting of CVAE------------------------------------------------------------------------------------
 85 | 
 86 | def FitGenModel (cdataexpand, beta, learning_rate = 1e-3):
 87 |     g = torch.Generator()
 88 |     g.manual_seed(2020)
 89 |     trainloader = torch.utils.data.DataLoader(full, batch_size=1, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g)
 90 |     random.seed(2020)
 91 |     torch.manual_seed(2020)
 92 |     np.random.seed(2020)
 93 |     #
 94 |     ## Set up Autoencoder
 95 |     CVAEmodel = cel.ClusterVAEmask(latent_dim = 511-Kmeans_cluster.max(), total_cluster = Kmeans_cluster.max(), fgx = cdataexpand.shape[2], fgy = cdataexpand.shape[3], KLDw = 0, hidden = [8,4,2,4,4])
 96 |     CVAEmodel = CVAEmodel.float()
 97 |     filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta)
 98 |     #
 99 |     ## Run Autoencoder 
100 |     clg=cel.SpaCluster()
101 |     clg.train(model = CVAEmodel, train_loader = trainloader, num_epochs= 249, annealing = True, KLDwinc = beta/4, n_incr =50, RCcountMax = 30, learning_rate = 0.001)
102 |     # Save the model to a local folder
103 |     filehandler = open(filename, 'wb') 
104 |     pickle.dump(CVAEmodel, filehandler)
105 |     print('save model to: {filename}'.format(filename=filename))
106 |     CVAEmodel.filename = filename
107 |     return CVAEmodel, clg
108 | 
109 | CVAEmodel_e5, clg_e5 = FitGenModel(cdataexpand = cdataexp_full, beta = 1e-5)
110 | # CVAEmodel_e2, clg_e2 = FitGenModel(cdataexpand = cdataexp_full, beta = 1e-2)
111 | 
112 | 
113 | # ## if still converging
def FitGenModel_continue (model, clg, cdataexpand, beta):
    """Continue training an already fitted model for up to 150 more epochs and re-save it.

    model: the model to keep training.
    clg: the trainer used before; its ``learning_rate`` attribute is reused.
    cdataexpand: not used by this function (kept for call compatibility).
    beta: used in the output file name.

    Training data comes from the module-level ``full`` dataset. Returns the
    model and the trainer; the model is saved to
    "../output/LIBD/Generation/CVAE_{beta}.obj".
    """
    g = torch.Generator()
    g.manual_seed(2020)
    trainloader= torch.utils.data.DataLoader(full, batch_size=1, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g)
    filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta)
    clg.train(model = model, train_loader = trainloader, num_epochs= 150, annealing = False, RCcountMax = 30, learning_rate = clg.learning_rate)
    # Save the model to a local folder; ensure the folder exists and close the
    # file handle so the pickle is completely written.
    os.makedirs(os.path.dirname(filename), exist_ok = True)
    with open(filename, 'wb') as filehandler:
        pickle.dump(model, filehandler)
    print('save model to: {filename}'.format(filename=filename))
    model.filename = filename
    return model, clg
126 | 
# Resume training of the beta = 1e-5 model; the pickle written by the first fit is overwritten
CVAEmodel_e5, clg_e5 = FitGenModel_continue(model = CVAEmodel_e5, clg = clg_e5, cdataexpand = cdataexp_full, beta = 1e-5)
128 | 
129 | 
130 | ## Step 2: Data Generation  ------------------------------------------------------------------------------------
131 | 
132 | ## Glimpse of generate model
def GeneratePlot(beta, traindata):
    """
        Plot every training image together with 10 model outputs for it.
        Images are written to ../output/LIBD/Generation/Glimps/Gen{beta}/.
        :beta: float: selects the pickled CVAE model (CVAE_{beta}.obj) and names the output folder
        :traindata: dataset wrapped in a DataLoader with batch size 1
    """
    trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4)
    filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta)
    # Load the fitted model; the context manager closes the file even if unpickling fails
    with open(filename, 'rb') as filehandler:
        CVAEmodel = pickle.load(filehandler)
    #
    clg=cel.SpaCluster()
    clg.model = CVAEmodel
    try:
        os.makedirs("../output/LIBD/Generation/Glimps/Gen{beta}".format(beta = beta))
    except FileExistsError:
        print("Folder already exists")
    for j, img in enumerate(trainloader):
        # img = next(dataloader_iterator)
        # Original image
        cel.plotGeneImg(img[0][0,0,:,:], filename = "../output/LIBD/Generation/Glimps/Gen{beta}/img{j}".format(beta = beta, j = j))
        # Reuse the original value range so all plots of this image share one colour scale
        omin = img[0].min()
        omax = img[0].max()
        for i in range(10):
            result = CVAEmodel(img)
            # Element-wise product of model outputs 0 and 4
            # NOTE(review): presumably reconstruction times mask -- confirm against ClusterVAEmask.forward
            outputimg = result[0][0,0,:,:].detach().numpy() * result[4][0,0,:,:].detach().numpy()
            cel.plotGeneImg( outputimg , filename = "../output/LIBD/Generation/Glimps/Gen{beta}/img{j}var{i}".format(beta = beta, j = j, i = i), range = (omin.item(), omax.item()))
155 | 
156 | GeneratePlot(beta = 1e-5, traindata = full)
157 | GeneratePlot(beta = 1e-2, traindata = full)
158 | 
159 | 
def Data_Generation(beta, dataSection1, traindata, nrep):
    """
        Generate augmented data with a fitted CVAE model and save it as a .npy file.
        Output: ../output/LIBD/DataGen/data_gen_{beta}_n{nrep}.npy
        :beta: float: selects the pickled CVAE model (CVAE_{beta}.obj) and names the output
        :dataSection1: AnnData: its `.obs` is used as the reference when reshaping the generated data
        :traindata: dataset wrapped in a DataLoader with batch size 1
        :nrep: int: passed to `fast_generation` and used in the output name
    """
    trainloader= torch.utils.data.DataLoader(traindata, batch_size=1, num_workers = 4)
    random.seed(2021)
    torch.manual_seed(2021)
    np.random.seed(2021)
    #
    filename = "../output/LIBD/Generation/CVAE_{beta}.obj".format(beta = beta)
    # Load the fitted model; the context manager closes the file even if unpickling fails
    with open(filename, 'rb') as filehandler:
        CVAEmodel = pickle.load(filehandler)
    #
    clg=cel.SpaCluster()
    clg.model = CVAEmodel
    data_gen=clg.fast_generation(trainloader, nrep)
    # data_gen=np.load("../output/{folder}/data_gen.npy".format(folder = folder))
    data_gen_rs = clg.deep_reshape (data = data_gen, refer = dataSection1.obs)
    try:
        os.makedirs("../output/LIBD/DataGen")
    except FileExistsError:
        print("Folder already exists")
    np.save("../output/LIBD/DataGen/data_gen_{beta}_n{nrep}.npy".format(beta = beta, nrep = nrep), data_gen_rs)
180 | 
181 | 
# Produce one augmented data set per replication count, all from the beta = 1e-5 model
for nrep in (2, 4, 6, 8, 10):
    Data_Generation(beta = 1e-5, nrep = nrep, dataSection1 = dataSection1, traindata = full)
187 | 
188 | 
189 | 
190 | ## Step 3** (weighted regression model): Prediction Model  ------------------------------------------------------------------------------------
## Count the number of spots on each layer
layer_count =  dataSection1.obs["Layer"].value_counts().sort_index()
# Importance weights: one reference count divided by a slice of the per-layer counts.
# NOTE(review): `layer_count[7]` and `layer_count[0:7]` may resolve differently (label vs
# positional lookup) depending on the index dtype and pandas version -- confirm this yields
# the intended 7 weights.
layer_weight = layer_count[7]/layer_count[0:7]
# Tensor form, passed as `importance_weights` to cel.DNNordinal in the functions below
layer_weights = torch.tensor(layer_weight.to_numpy())
195 | 
196 | 
197 | 
def FitPredModel (beta, nrep, dataSection1):
    """
        Train the ordinal layer-prediction DNN on generated plus original data and pickle it.
        Relies on the module-level objects `layer_weights` and `seed_worker`.
        Output: ../output/LIBD/Prediction/data_gen_layer_{beta}_n{nrep}.obj
        :beta: float: selects the generated data file (data_gen_{beta}_n{nrep}.npy)
        :nrep: int: selects the generated data file
        :dataSection1: AnnData: original data; `.X` is appended to the generated data and
                       `.obs` supplies the "Layer" labels
    """
    #
    random.seed(2020)
    torch.manual_seed(2020)
    np.random.seed(2020)
    g = torch.Generator()
    g.manual_seed(2021)
    # Original Version
    data_gen_rs = np.load("../output/LIBD/DataGen/data_gen_{beta}_n{nrep}.npy".format(beta = beta, nrep = nrep))
    # Attach the original: add a leading axis, swap axes 1 and 2, then stack under the generated data
    tdatax = np.expand_dims(dataSection1.X, axis = 0)
    tdata_rs = np.swapaxes(tdatax, 1, 2)
    datacomp = np.concatenate((data_gen_rs, tdata_rs), axis=0)
    #
    dataDNN = cel.wrap_gene_layer(datacomp, dataSection1.obs, "Layer")
    CoReg_loader = torch.utils.data.DataLoader(dataDNN, batch_size=4, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g)
    # Create Deep Neural Network for Coordinate Regression
    DNNmodel = cel.DNNordinal( in_channels = data_gen_rs.shape[1], num_classes = 7, hidden_dims = [50, 10, 5], importance_weights = layer_weights )
    DNNmodel = DNNmodel.float()
    #
    CoReg=cel.SpaCluster()
    CoReg.train(model = DNNmodel, train_loader = CoReg_loader, num_epochs= 250, RCcountMax = 5, learning_rate = 0.001)
    #
    filename2 = "../output/LIBD/Prediction/data_gen_layer_{beta}_n{nrep}.obj".format(beta = beta, nrep = nrep)
    # Make sure the output folder exists so a long training run is not lost at save time
    os.makedirs(os.path.dirname(filename2), exist_ok = True)
    with open(filename2, 'wb') as filehandler2:
        pickle.dump(DNNmodel, filehandler2)
224 | 
225 | # temp
226 | # beta = 1e-5
227 | # nrep = 10
228 | # dataSection1 = dataSection1
229 | 
# Train one prediction model per augmented data set generated above
for nrep in (2, 4, 6, 8, 10):
    FitPredModel(beta = 1e-5, nrep = nrep, dataSection1 = dataSection1)
235 | 
def FitPredModel_continue (holdoff, beta, nrep, dataSection1, learning_rate):
    """
        Resume training of a pickled prediction model and overwrite the pickle.
        NOTE(review): the file names used here include a `holdoff` component and therefore do
        not match the files written by FitPredModel / Data_Generation in this script --
        presumably they come from a hold-off variant of the pipeline; confirm.
        :holdoff: value used in the model and data file names
        :beta: float: used in the model and data file names
        :nrep: int: used in the model and data file names
        :dataSection1: AnnData: its `.obs` is passed to cel.wrap_gene_layer
        :learning_rate: float: learning rate forwarded to the trainer
    """
    filename2 = "../output/LIBD/Prediction/data_gen_layer_{holdoff}_{beta}_n{nrep}.obj".format(holdoff = holdoff, beta = beta, nrep = nrep)
    # Close the read handle before the same path is reopened for writing below
    with open(filename2, 'rb') as filehandler2:
        DNNmodel = pickle.load(filehandler2)
    #
    data_gen_rs = np.load("../output/LIBD/DataGen/data_gen_layer_{holdoff}_{beta}_n{nrep}.npy".format(holdoff = holdoff, beta = beta, nrep = nrep))
    #
    # NOTE(review): the other call sites pass "Layer" as a third argument -- confirm the default is equivalent
    dataDNN = cel.wrap_gene_layer(data_gen_rs, dataSection1.obs)
    CoReg_loader = torch.utils.data.DataLoader(dataDNN, batch_size=1, num_workers = 4, shuffle = True)
    # Create Deep Neural Network for Coordinate Regression
    #
    random.seed(2021)
    torch.manual_seed(2021)
    np.random.seed(2021)
    #
    CoReg=cel.SpaCluster()
    CoReg.train(model = DNNmodel, train_loader = CoReg_loader, num_epochs= 60, RCcountMax = 1, learning_rate = learning_rate)
    #
    with open(filename2, 'wb') as filehandler2:
        pickle.dump(DNNmodel, filehandler2)
256 | 
257 | 
# NOTE(review): `Section1train50` is not defined in the visible part of this script -- confirm it
# exists before running these two continuation calls
FitPredModel_continue(holdoff = 50 , beta = 1e-5, nrep = 10, dataSection1 = Section1train50, learning_rate = 3.125e-06)

FitPredModel_continue(holdoff = 50 , beta = 1e-9, nrep = 10, dataSection1 = Section1train50, learning_rate = 1.220703125e-08)
261 | 
262 | 
263 | ## Step 3**.2: Prediction Model of the case without data augmentation  ------------------------------------------------------------------------------------
264 | 
def FitPredModelNE (dataSection1):
    """
        Train the ordinal layer-prediction DNN on the original data only (no augmentation)
        and pickle it to ../output/LIBD/Prediction/layer_PreOrg.obj.
        Relies on the module-level object `layer_weights`.
        :dataSection1: AnnData: `.X` is the training input and `.obs` supplies the "Layer" labels
    """
    # Add a leading axis, then swap axes 1 and 2 before wrapping
    tdatax = np.expand_dims(dataSection1.X, axis = 0)
    tdata_rs = np.swapaxes(tdatax, 1, 2)
    DataTra = cel.wrap_gene_layer(tdata_rs, dataSection1.obs, "Layer")
    t_loader= torch.utils.data.DataLoader(DataTra, batch_size=1, num_workers = 4, shuffle = True)
    # Create Deep Neural Network for Coordinate Regression  # 10, 4, 2
    DNNmodel = cel.DNNordinal( in_channels = DataTra[1][0].shape[0], num_classes = 7, hidden_dims = [10, 4, 2], importance_weights = layer_weights ) # [100,50,25] )
    DNNmodel = DNNmodel.float()
    #
    CoOrg=cel.SpaCluster()
    CoOrg.train(model = DNNmodel, train_loader = t_loader, num_epochs= 150, RCcountMax = 15, learning_rate = 0.001)
    #
    filename3 = "../output/LIBD/Prediction/layer_PreOrg.obj"
    # Make sure the output folder exists so the trained model is not lost at save time
    os.makedirs(os.path.dirname(filename3), exist_ok = True)
    with open(filename3, 'wb') as filehandler2:
        pickle.dump(DNNmodel, filehandler2)
280 | 
281 | 
282 | FitPredModelNE (dataSection1 = dataSection1)
283 | 
284 | 


--------------------------------------------------------------------------------
/code_paper/1_LIBD/LIBDVisual.R:
--------------------------------------------------------------------------------
  1 | ### This file trying to visual the hodge results using the histology information
  2 | 
  3 | ## 0. Global Parameters and Packages
  4 | 
  5 | library(dplyr)
  6 | library(ggplot2)
  7 | library(tidyr)
  8 | library(png)
  9 | 
 10 | outputdir <- "output/LIBD/plots/"
 11 | 
 12 | 
 13 | 
 14 | # Functions ---------------------------------------------------------------
 15 | 
 16 | Density_plot_data <- function(pred_result, Study){
 17 |   dataplot <- cbind(background, pred_result) 
 18 |   data_long <- gather(dataplot, column, prob, V3:V9, factor_key = TRUE)
 19 |   data_long_new <- data_long %>%
 20 |     mutate(TargetLayer = factor(column, labels = c("L1","L2","L3","L4","L5","L6","WM"), levels = paste0("V",3:9))) %>%
 21 |     mutate(study = Study)
 22 |   return(data_long_new)
 23 | }
 24 | 
 25 | 
# Layer labels; used as the default `LayerSet` of Density_plot
labelvector <- c("L1","L2","L3","L4","L5","L6","WM")
 27 | 
 28 | 
 29 | 
 30 | Density_plot <- function(dataall, LayerSet = labelvector){
 31 |   png(file = paste0(outputdir,"LIBD_Density_plot.png"), height = 1600, width = 1400)
 32 |   Boxplot  <- ggplot(dataall %>%
 33 |                        filter (TargetLayer %in% LayerSet) %>%
 34 |                        mutate (TargetLayerfull = factor(TargetLayer, 
 35 |                                                        labels =  paste0("predicted to ",LayerSet),
 36 |                                                        levels = LayerSet)), 
 37 |                      aes(x = ycord, y = xcord) )  + 
 38 |     theme_bw()  + 
 39 |     geom_point(aes(fill = prob), shape = 21, color = "black",  size = 1.5, stroke = 0.3) +
 40 |     annotation_raster(Histology, ymin = 0, ymax= 1, xmin = 0, xmax = 1) +
 41 |     # scale_y_reverse() +
 42 |     facet_grid(TargetLayerfull~study, switch = "y") +
 43 |     # scale_fill_gradient(low = "#08121b", high = "#9fd3fa") +
 44 |     scale_fill_gradient2(low = "#08121b", mid = "#56b1f7", high = "#EB9486",midpoint = 0.5) +
 45 |     theme(text=element_text(size=25, family="URWHelvetica"), axis.text = element_text(size = 30, family="URWHelvetica"),
 46 |           panel.spacing = unit(1, "lines")) +
 47 |     theme(strip.background =element_rect(fill="#17202A",color="#17202A"))+ 
 48 |     theme(strip.text = element_text(colour = 'white'),axis.text=element_blank()) + #, strip.position = "left"
 49 |     theme(panel.border = element_rect(colour = "#17202A"), legend.position="none")  + #
 50 |     labs(x = NULL, y = NULL, fill = "Probability")
 51 |   print(Boxplot)
 52 |   dev.off()
 53 |   
 54 |   png(file = paste0(outputdir,"LIBD_Density_plot_legend.png"), height = 1600, width = 1400)
 55 |   Boxplot2 <- Boxplot +
 56 |     theme(legend.position="right")
 57 |   
 58 |   print(Boxplot2)
 59 |   dev.off()
 60 | }
 61 | 
 62 | # 1. Prepare the LIBD backgound data --------------------------------------
 63 | 
 64 | ## Using data of 507 as the background
 65 | 
# Spot positions of section 151507 and the reference image drawn behind the spots
tissue_pos <- read.csv("data/LIBD/visualization_151507.csv")
# Histology <- readPNG("data/LIBD/151507_tissue_lowres_image.png")
Histology <- readPNG("data/LIBD/RegionReference_bw2.png")
# Histology_tsp <- matrix(rgb(Histology[,,1],Histology[,,2],Histology[,,3], 0.7), nrow=dim(Histology)[1])

# Range of the two position columns, used to rescale the spots below
Histology_maxx <- max(tissue_pos["x2"])+1
Histology_minx <- min(tissue_pos["x2"])
Histology_maxy <- max(tissue_pos["x3"])+1
Histology_miny <- min(tissue_pos["x3"])


# Rescale the spot positions into the unit square covered by the raster in Density_plot.
# NOTE(review): the offsets and factors (0.92, 1.2, -5, 0.12, 1.22) look hand-tuned to
# align the spots with the reference image -- confirm before reusing with another image.
background <- tissue_pos %>%
  mutate (xcord = 0.92-x2/(1.2*Histology_maxx+Histology_minx-5)) %>%
  mutate (ycord = 0.12+x3/(1.22*Histology_maxy+Histology_miny))
 80 | 
 81 | # 2. Apply the Hodge results on to the LIBD background --------------------
 82 | 
 83 | ## Load the results information
 84 | 
# Each result table must expose the per-layer probabilities as columns V3..V9,
# because Density_plot_data reshapes exactly those columns.
pred_CeLEry <- read.csv("output/LIBD/Prediction151507/layer_PreOrg_probmat.csv", header = F)
data_CeLEry <- Density_plot_data(pred_CeLEry, "CeLEry")

pred_CeLEryn2 <- read.csv("output/LIBD/Prediction151507/data_gen_layer_1e-05_n2_probmat.csv", header = F)
data_CeLEryn2 <- Density_plot_data(pred_CeLEryn2, "CeLEry (Augmentation)")

# Tangram: prepend two zero columns so the columns can be renamed V1..V9
# (assumes the Tangram file holds exactly 7 probability columns -- TODO confirm)
pred_Tangram <- read.csv("output/LIBD/Prediction151507/Tangram_probmat_151507.csv", header = F)
pred_Tangram_full <- cbind(0,0,pred_Tangram)
names(pred_Tangram_full) <- paste0("V",1:9)
data_Tangram <- Density_plot_data(pred_Tangram_full, "Tangram")

pred_Multiple <- read.csv("output/LIBDmultiple/Prediction151507/layer_PreOrgv2_probmat.csv", header = F)
data_Multiple <- Density_plot_data(pred_Multiple, "CeLEry (Multiple)")



# spaOTsc: rows are rescaled to sum to 1, then the decision matrix is put in front
# (presumably two columns, true and predicted layer -- TODO confirm)
pred_spaOTsc <- read.csv("output/LIBD/Prediction151507/spaOTsc_probmat.csv", header = F)
pred_spaOTsc_prop <- pred_spaOTsc/rowSums(pred_spaOTsc)
pred_spaOTsc2 <- read.csv("output/LIBD/Prediction151507/spaOTsc_decisionmat.csv", header = F)
pred_spaOTsc_full <- cbind(pred_spaOTsc2,pred_spaOTsc_prop)
names(pred_spaOTsc_full) <- paste0("V",1:9)
data_spaOTsc <- Density_plot_data(pred_spaOTsc_full, "spaOTsc")


# novosparc: same treatment as spaOTsc
pred_novosparc <- read.csv("output/LIBD/Prediction151507/novosparc_probmat.csv", header = F)
pred_novosparc_prop <- pred_novosparc/rowSums(pred_novosparc)
pred_novosparc2 <- read.csv("output/LIBD/Prediction151507/novosparc_decisionmat.csv", header = F)
pred_novosparc_full <- cbind(pred_novosparc2,pred_novosparc_prop)
names(pred_novosparc_full) <- paste0("V",1:9)
data_novosparc <- Density_plot_data(pred_novosparc_full, "novosparc")


# Stack all methods and draw the combined figure
data_all <- rbind(data_CeLEry, data_CeLEryn2, data_Tangram, data_Multiple, data_spaOTsc, data_novosparc)

Density_plot(data_all)
120 | 
121 | # 3. PredictionHodge: Other methods (Discarded) --------------------
122 | # ## Tangram
123 | # pred_Tangram <- read.csv("output/Hodge/PredictionHodge/Tangram_probmat.csv", header = F)
124 | # 
125 | # for (i in 1:6){
126 | #   Density_plot(pred_Tangram, "Tangram", i)
127 | # }
128 | # 
129 | # ## ClusterBased
130 | # pred_ClusterBased <- read.csv("output/Hodge/PredictionHodge/ClusterBased_probmat.csv", header = F)
131 | # 
132 | # for (i in 1:6){
133 | #   Density_plot(pred_ClusterBased, "ClusterBased", i)
134 | # }
135 | 
136 | 


--------------------------------------------------------------------------------
/code_paper/1_LIBD/LIBDacc.R:
--------------------------------------------------------------------------------
  1 | ## 0. Global Parameters and Packages
  2 | 
  3 | library(dplyr)
  4 | library(tidyr)
  5 | library(ggplot2)
  6 | ## 1. Data  Processing
  7 | 
  8 | # classresults <- read.csv("output/LIBD/Multiple/data_gene_All_layerv2_1e-05_n2_probmat.csv", header = F)
  9 | # classresults <- read.csv("output/Hodge/layer_PreOrgv2_probmat.csv", header = F)
 10 | # classresults <- read.csv("output/Hodge/data_gene_All_layerv2_1e-05_n10_probmat.csv", header = F)
 11 | 
 12 | OverallAccSummary <- function (path) {
 13 |   classresults <- read.csv(path, header = F)
 14 |   
 15 |   classresults_new <-  classresults %>%
 16 |     mutate(Type = case_when( 
 17 |       V1 == V2 ~ "Same",
 18 |       abs(V1-V2) == 1 ~ "Neighbour",
 19 |       T ~ "Other"))
 20 |   
 21 |   summaries <- table(classresults_new$Type)
 22 |   
 23 |   exact_acc <- summaries["Same"]/sum(summaries)
 24 |   cat(exact_acc)
 25 |   Neighbor_acc <- exact_acc + summaries["Neighbour"]/sum(summaries)
 26 |   cat(Neighbor_acc)
 27 |   
 28 |   # for (i in 1:7) {
 29 |   #   data_curremt <-
 30 |   #   
 31 |   # }
 32 |   
 33 |   return( c(exact_acc, Neighbor_acc) )
 34 | }
 35 | 
 36 | 
# One accuracy pair per method and test section. The variable names matter:
# they become the row names of `accuracy_table` and are later split into a
# method part (letters) and a tissue part (digits).
ordinary507 <- OverallAccSummary("output/LIBD/Prediction151507/layer_PreOrg_probmat.csv")
ordinary676 <- OverallAccSummary("output/LIBD/Prediction151676/layer_PreOrg_probmat.csv")


aug507 <- OverallAccSummary("output/LIBD/Prediction151507/data_gen_layer_1e-05_n2_probmat.csv")
aug676 <- OverallAccSummary("output/LIBD/Prediction151676/data_gen_layer_1e-05_n2_probmat.csv")

# OverallAccSummary("output/LIBD/Prediction151507/data_gen_layer_1e-05_n2_probmat.csv")
# OverallAccSummary("output/LIBD/Prediction151676/data_gen_layer_1e-05_n2_probmat.csv")


multiple507 <- OverallAccSummary("output/LIBDmultiple/Prediction151507/layer_PreOrgv2_probmat.csv")
multiple676 <- OverallAccSummary("output/LIBDmultiple/Prediction151676/layer_PreOrgv2_probmat.csv")

multipleaug507 <- OverallAccSummary("output/LIBDmultiple/Prediction151507/data_gene_All_layerv2_1e-05_n2_probmat.csv")
multipleaug676 <- OverallAccSummary("output/LIBDmultiple/Prediction151676/data_gene_All_layerv2_1e-05_n2_probmat.csv")

Tangram507 <- OverallAccSummary("output/LIBD/Prediction151507/Tangram_decisionmat.csv")
Tangram676 <- OverallAccSummary("output/LIBD/Prediction151676/Tangram_decisionmat.csv")


spaOTsc507 <- OverallAccSummary("output/LIBD/Prediction151507/spaOTsc_decisionmat.csv")
spaOTsc676 <- OverallAccSummary("output/LIBD/Prediction151676/spaOTsc_decisionmat.csv")


novosparc507 <- OverallAccSummary("output/LIBD/Prediction151507/novosparc_decisionmat.csv")
novosparc676 <- OverallAccSummary("output/LIBD/Prediction151676/novosparc_decisionmat.csv")


# Rows: method + tissue; columns: exact accuracy (top1) and neighbour-tolerant accuracy (top2)
accuracy_table <- rbind(ordinary507, ordinary676, aug507, aug676, multiple507, multiple676, multipleaug507, multipleaug676,
                    Tangram507, Tangram676, spaOTsc507, spaOTsc676, novosparc507, novosparc676)
colnames (accuracy_table) <- c("top1", "top2")
 69 | 
 70 | accuracy_table_long <- data.frame(accuracy_table) %>%
 71 |   add_rownames(var = "method") %>% 
 72 |   pivot_longer(cols = top1:top2, names_to = "type", values_to = "accuracy") %>%
 73 |   mutate( tissue = gsub('[A-Za-z]+', '', method))  %>%
 74 |   mutate( method = gsub('[0-9]+', '', method)) %>%
 75 |   mutate( Scenario = case_when(
 76 |     (!method %in% c("multiple", "multipleaug")) & (tissue == 676) ~ 1,
 77 |     (!method %in% c("multiple", "multipleaug")) & (tissue == 507) ~ 2,
 78 |     (method %in% c("multiple", "multipleaug")) & (tissue == 676) ~ 3,
 79 |     (method %in% c("multiple", "multipleaug")) & (tissue == 507) ~ 4
 80 |   )) %>%
 81 |   mutate( method =  factor (method, levels = unique(method), 
 82 |                             labels = c("CeLEry", "CeLEry(aug)", "CeLEry", "CeLEry(aug)", "Tangram", "spaOTsc", "novosparc") )) %>%
 83 |   mutate(type = factor(type, levels = unique(type), labels = c("top-1", "top-2"))) %>%
 84 |   mutate(Scenario = factor (Scenario, levels = c(1, 3, 2, 4), labels = paste("Scenario", c(1, 3, 2, 4)))) %>%
 85 |   data.frame()
 86 | 
 87 | 
# Bar plot of the accuracies, one panel per scenario
pdf(file = "output/LIBD/plots/LIBD_barplot.pdf", width = 9, height = 9)

color_palatte <-c( "#CAE7B9", "#F3DE8A","#EB9486", "#7E7F9A", "#97A7B3")
strip_color <- "#0A1D37"

# NOTE(review): the name `barplot` masks graphics::barplot for the rest of the session
barplot <- ggplot(accuracy_table_long, aes(fill = method, x = type, y = accuracy)) + 
  geom_bar(stat = "identity", position="dodge") + 
  scale_fill_manual(values=color_palatte[c(3,2,1,4,5)]) + 
  facet_wrap(~Scenario)  +
  # scale_y_continuous(trans=scales::pseudo_log_trans(base = 10), breaks = c(0, 1, 10, 100, 500)) +
  theme_bw()  +
  theme(text=element_text(size=25, family="URWHelvetica"), axis.text = element_text(size = 25, family="URWHelvetica"), panel.spacing = unit(1, "lines") ) +
  theme(strip.background =element_rect(fill=strip_color,color=strip_color))+ # #535b44
  theme(strip.text = element_text(colour = 'white')) + # , axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)
  theme(panel.border = element_rect(colour = strip_color), legend.position = "bottom") +
  labs(fill = "Method", x = "Type", y = "Accuracy")

print(barplot)

dev.off()


# The same plot object again, written to a second file with a wider page
pdf(file = "output/LIBD/plots/LIBD_barplot_legend.pdf", width = 12, height = 8)

print(barplot)

dev.off()
115 | 
116 | 
117 | # ### Further exploration
118 | # 
119 | # OverallAccCeLEry <- function (path, probmat, truth) {
120 | #   classresults <- read.csv(paste0(path,probmat), header = F)
121 | #   truthresults <- read.csv(paste0(path,truth), header = F)
122 | #   names(truthresults) <- "truth"
123 | #   
124 | #   classresults_new <-  data.frame(classresults, truth = truthresults) %>%
125 | #     mutate(Type = case_when( 
126 | #       V1 == truth ~ "Same",
127 | #       abs(V1-truth) == 1 ~ "Neighbour",
128 | #       T ~ "Other"))
129 | #   
130 | #   summaries <- table(classresults_new$Type)
131 | #   
132 | #   exact_acc <- summaries["Same"]/sum(summaries)
133 | #   cat(exact_acc)
134 | #   Neighbor_acc <- exact_acc + summaries["Neighbour"]/sum(summaries)
135 | #   cat(Neighbor_acc)
136 | #   
137 | #   return( c(exact_acc, Neighbor_acc) )
138 | # }
139 | # 
140 | # 
141 | # OverallAccCeLEry(path = "output/LIBD/PredictionEmbd/", 
142 | #                  probmat = "Emd_model_151673_151507_probmat.csv",
143 | #                  truth = "Emd_model_151673_151507_truth.csv")
144 | # 
145 | # OverallAccCeLEry(path = "output/LIBD/PredictionEmbd/", 
146 | #                  probmat = "Emd_model_151673_151676_probmat.csv",
147 | #                  truth = "Emd_model_151673_151676_truth.csv")
148 | 


--------------------------------------------------------------------------------
/code_paper/1_LIBD/README.md:
--------------------------------------------------------------------------------
 1 | # Guide for LIBD study
 2 | ## Pipeline
 3 | 
 4 | preprocess.py  ->  
 5 | CeLEry_train_Scenario1and2.py (or  CeLEry_train_Scenario3and4.py) ->
 6 | prediction.py
 7 | 
 8 | 
 9 | ## Datasets
10 | http://spatial.libd.org/spatialLIBD/


--------------------------------------------------------------------------------
/code_paper/1_LIBD/prediction.py:
--------------------------------------------------------------------------------
  1 | ## In this version of Cell Location discovEry (LIBD) we consider region of a tissue and we hold off a partial
  2 | 
  3 | # Application to LIBD data
  4 | 
  5 | import os,csv,re
  6 | import pandas as pd
  7 | import numpy as np
  8 | import scanpy as sc
  9 | import math
 10 | 
 11 | from skimage import io, color
 12 | from sklearn.cluster import KMeans
 13 | 
 14 | from scipy.sparse import issparse
 15 | import random, torch
 16 | import warnings
 17 | warnings.filterwarnings("ignore")
 18 | import matplotlib.colors as clr
 19 | import matplotlib.pyplot as plt
 20 | import pickle
 21 | 
 22 | #Read original data and save it to h5ad
 23 | from scanpy import read_10x_h5
 24 | os.chdir("SpaClusterPython")
 25 | #import SpaGCN as spg
 26 | import CeLEry as cel
 27 | 
 28 | from data.LIBD.LIBD_gene_select import d_g
 29 | 
 30 | # import tangram as tg
 31 | 
##  1. Data Preparation --------------------------------------------------------------------------
### Load the three LIBD sections (151673, 151676, 151507). Below, section 1 is passed as
### `traindata` and sections 2 and 3 are the test data.
dataSection1 = sc.read("../data/LIBD/data_151673.h5ad")
dataSection2 = sc.read("../data/LIBD/data_151676.h5ad")
dataSection3 = sc.read("../data/LIBD/data_151507.h5ad")

# Obtain the number of counts in each layer
# NOTE(review): the second assignment overwrites the first, so only the 151507 counts are kept
layer_count =  dataSection2.obs["Layer"].value_counts().sort_index()
layer_count =  dataSection3.obs["Layer"].value_counts().sort_index()
 41 | 
 42 | 
 43 | ## Conduct clustering
 44 | # cdata = dataSection1.copy()
 45 | # cel.getGeneImg(cdata,emptypixel = 0)
 46 | #cdataexpand =  np.expand_dims(cdata.GeneImg, axis=1) 
 47 | 
 48 | #cdatacentral = cel.centralize(cdataexpand.copy())
 49 | #direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])]
 50 | #direflat = [x.flat for x in direclust]
 51 | #direflatnp = np.stack(direflat)
 52 | 
 53 | ## implementing k-means clustering
 54 | #kmeansmodel =  KMeans(n_clusters=20, random_state=0)
 55 | #kmeans = kmeansmodel.fit(direflatnp)
 56 | #np.save("../output/LIBD/cluster.npy", kmeans.labels_)
 57 | 
 58 | 
## Calculating z-score
cel.get_zscore(dataSection1)
cel.get_zscore(dataSection2)
cel.get_zscore(dataSection3)

# Number of layer classes; read as a global by report_prop_method_LIBD
class_num = 7
 65 | 
 66 | ##  2*. Test (layer ordinal logistic regression) --------------------------------------------------------------------------
 67 | 
 68 | def report_prop_method_LIBD (folder, tissueID, name, dataSection2, traindata, Val_loader, coloruse, outname = ""):
 69 |     """
 70 |         Report the results of the proposed methods in comparison to the other method
 71 |         :folder: string: specified the folder that keep the proposed DNN method
 72 |         :name: string: specified the name of the DNN method, also will be used to name the output files
 73 |         :dataSection2: AnnData: the data of Section 2
 74 |         :traindata: AnnData: the data used in training data. This is only needed for compute SSIM
 75 |         :Val_loader: Dataload: the validation data from dataloader
 76 |         :outname: string: specified the name of the output, default is the same as the name
 77 |         :ImageSec2: Numpy: the image data that are refering to
 78 |     """
 79 |     if outname == "":
 80 |         outname = name
 81 |     filename2 = "{folder}/{name}.obj".format(folder = folder, name = name)
 82 |     filehandler = open(filename2, 'rb') 
 83 |     DNNmodel = pickle.load(filehandler)
 84 |     #
 85 |     coords_predict = np.zeros(dataSection2.obs.shape[0])
 86 |     payer_prob = np.zeros((dataSection2.obs.shape[0],class_num+2))
 87 |     for i, img in enumerate(Val_loader):
 88 |         recon = DNNmodel(img)
 89 |         logitsvalue = np.squeeze(torch.sigmoid(recon[0]).detach().numpy(), axis = 0)
 90 |         if (logitsvalue[class_num-2] == 1):
 91 |             coords_predict[i] = class_num
 92 |             payer_prob[i,(class_num + 1)] = 1
 93 |         else:
 94 |             logitsvalue_min = np.insert(logitsvalue, 0, 1, axis=0)
 95 |             logitsvalue_max = np.insert(logitsvalue_min, class_num, 0, axis=0) 
 96 |             prb = np.diff(logitsvalue_max)
 97 |             # prbfull = np.insert(-prb[0], 0, 1 -logitsvalue[0,0], axis=0)
 98 |             prbfull = -prb.copy() 
 99 |             coords_predict[i] = np.where(prbfull == prbfull.max())[0].max() + 1
100 |             payer_prob[i,2:] = prbfull
101 |     #
102 |     dataSection2.obs["pred_layer"] = coords_predict.astype(int)
103 |     payer_prob[:,0] = dataSection2.obs["Layer"]
104 |     payer_prob[:,1] = dataSection2.obs["pred_layer"]
105 |     dataSection2.obs["pred_layer_str"] = coords_predict.astype(int).astype('str')
106 |     cel.plot_layer(adata = dataSection2, folder = "{folder}{tissueID}".format(folder = folder, tissueID = tissueID), name = name, coloruse = coloruse)
107 |     cel.plot_confusion_matrix ( referadata = dataSection2, filename = "{folder}{tissueID}/{name}conf_mat_fig".format(folder = folder, tissueID = tissueID, name = name))
108 |     np.savetxt("{folder}{tissueID}/{name}_probmat.csv".format(folder = folder, tissueID = tissueID, name = name), payer_prob, delimiter=',')
109 | 
110 | 
111 |     
112 | 
def Evaluate (testdata, tissueID, traindata, beta, nrep, coloruse = None):
    """
        Evaluate the augmented-data prediction model "data_gen_layer_{beta}_n{nrep}" on a test section.
        :testdata: AnnData: the section to predict
        :tissueID: identifier of the test section, used for the output folder
        :traindata: AnnData: forwarded to report_prop_method_LIBD
        :beta: float: part of the model name
        :nrep: int: part of the model name
        :coloruse: forwarded to report_prop_method_LIBD
    """
    # Add a leading axis, swap axes 1 and 2, then wrap into a one-sample-per-batch loader
    expr = np.swapaxes(np.expand_dims(testdata.X, axis = 0), 1, 2)
    wrapped = cel.wrap_gene_layer(expr, testdata.obs, "Layer")
    Val_loader = torch.utils.data.DataLoader(wrapped, batch_size=1, num_workers = 4)
    #
    model_name = "data_gen_layer_{beta}_n{nrep}".format(beta = beta, nrep = nrep)
    report_prop_method_LIBD(
        folder = "../output/LIBD/Prediction",
        tissueID = tissueID,
        name = model_name,
        dataSection2 = testdata,
        traindata = traindata,
        Val_loader = Val_loader,
        coloruse = coloruse,
    )
124 | 
125 | 
# Evaluate every augmented model on both test sections (151676 first, then 151507)
for testdata, tissueID in ((dataSection2, 151676), (dataSection3, 151507)):
    for nrep in (2, 4, 6, 8, 10):
        Evaluate(testdata = testdata, tissueID = tissueID, traindata = dataSection1, beta = 1e-5, nrep = nrep)
138 | 
139 | 
def EvaluateOrg (testdata, tissueID, traindata, coloruse = None):
    """
        Evaluate the prediction model "layer_PreOrg" on a test section.
        :testdata: AnnData: the section to predict
        :tissueID: identifier of the test section, used for the output folder
        :traindata: AnnData: forwarded to report_prop_method_LIBD
        :coloruse: forwarded to report_prop_method_LIBD
    """
    # Add a leading axis, swap axes 1 and 2, then wrap into a one-sample-per-batch loader
    expr = np.swapaxes(np.expand_dims(testdata.X, axis = 0), 1, 2)
    wrapped = cel.wrap_gene_layer(expr, testdata.obs, "Layer")
    Val_loader = torch.utils.data.DataLoader(wrapped, batch_size=1, num_workers = 4)
    #
    report_prop_method_LIBD(
        folder = "../output/LIBD/Prediction",
        tissueID = tissueID,
        name = "layer_PreOrg",
        dataSection2 = testdata,
        traindata = traindata,
        Val_loader = Val_loader,
        coloruse = coloruse,
    )
151 | 
152 | EvaluateOrg (testdata = dataSection2, tissueID = 151676, traindata = dataSection1)
153 | EvaluateOrg (testdata = dataSection3, tissueID = 151507, traindata = dataSection1)
154 | 
155 | 
156 | 
157 | 


--------------------------------------------------------------------------------
/code_paper/1_LIBD/preprocess.py:
--------------------------------------------------------------------------------
  1 | ### Datasets of this study can be download from  http://spatial.libd.org/spatialLIBD/
  2 | 
  3 | 
  4 | import os,csv,re
  5 | import pandas as pd
  6 | import numpy as np
  7 | import scanpy as sc
  8 | import math
  9 | from skimage import io, color
 10 | 
 11 | from scipy.sparse import issparse
 12 | import warnings
 13 | warnings.filterwarnings("ignore")
 14 | import matplotlib.colors as clr
 15 | import matplotlib.pyplot as plt
 16 | 
 17 | 
 18 | #Read original data and save it to h5ad
 19 | from scanpy import read_10x_h5
 20 | import CeLEry as cel
 21 | from data.LIBD.LIBD_gene_select import d_g
 22 | import json
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 | ### ------------------------------------------------------------------------------------------------------- ###
 29 | ###        Process the genelist
 30 | ### ------------------------------------------------------------------------------------------------------- ###
 31 | def get_LIBD_top_DEgenes (studyID):
 32 |     """
 33 |     Preprocess the spatial transcriptomic raw data and obtain the optimal DE genes
 34 |     Parameters
 35 |         -----------
 36 |         studyID : string. the study ID of the LIBD datasets
 37 |     Returns
 38 |     -----------
 39 |         gene_topDE_list: the list of gene set that contains the highest DE genes between layers
 40 |     """
 41 |     adata = read_10x_h5("../data/LIBD/{studyID}/{studyID}_raw_feature_bc_matrix.h5".format(studyID = studyID))
 42 |     spatial = pd.read_csv("../data/LIBD/{studyID}/tissue_positions_list.txt".format(studyID = studyID),sep=",", header = None, na_filter = False, index_col = 0) 
 43 |     adata.obs["x1"] = spatial[1]
 44 |     adata.obs["x2"] = spatial[2]
 45 |     adata.obs["x3"] = spatial[3]
 46 |     # Select captured samples
 47 |     adata = adata[adata.obs["x1"] == 1]
 48 |     adata.var_names = [i.upper() for i in list(adata.var_names)]
 49 |     adata.var["genename"] = adata.var.index.astype("str")
 50 |     #
 51 |     del adata.obs["x1"]
 52 |     #
 53 |     adata.obs["Layer"] = 0
 54 |     LayerName =["L1","L2","L3","L4","L5","L6","WM"] #
 55 |     for i in range(7):
 56 |         Layer = pd.read_csv("../data/LIBD/{studyID}/{studyID}_{Lname}_barcodes.txt".format(studyID = studyID, Lname = LayerName[i]), sep=",", header = None, na_filter = False, index_col = 0)
 57 |         adata.obs.loc[Layer.index, "Layer"] = int(i+1)
 58 |         adata.obs.loc[Layer.index, "Layer_character"] = LayerName[i]
 59 |     data = adata[adata.obs["Layer"]!=0]    # Newly added on May 25 #Remove the spots without any layer label
 60 |     #
 61 |     #  Preprocessing
 62 |     adata.var_names_make_unique()
 63 |     cel.prefilter_genes(adata,min_cells=3) # avoiding all genes are zeros
 64 |     cel.prefilter_specialgenes(adata)
 65 |     #Normalize and take log for UMI-------
 66 |     sc.pp.normalize_per_cell(adata)
 67 |     sc.pp.log1p(adata)
 68 |     #
 69 |     sc.tl.rank_genes_groups(adata, 'Layer_character', method = 'wilcoxon', key_added = "wilcoxon")
 70 |     # sc.pl.rank_genes_groups(adata, n_genes = 200, sharey = False, key="wilcoxon", save = '{studyID}.pdf'.format(studyID = studyID))
 71 |     gene_topDE_list = []
 72 |     for layer_i in ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'WM']:
 73 |         gene_rank = sc.get.rank_genes_groups_df (adata,  group = layer_i, key = 'wilcoxon')
 74 |         top_gene_list = list(gene_rank["names"].iloc[0:200])
 75 |         gene_topDE_list.append( top_gene_list  )
 76 |     return gene_topDE_list
 77 | 
 78 | genelist73 = get_LIBD_top_DEgenes (151673)
 79 | genelist74 = get_LIBD_top_DEgenes (151674)
 80 | genelist75 = get_LIBD_top_DEgenes (151675)
 81 | 
 82 | 
 83 | # Get the gene list from the pre-screening
 84 | genelistlist = genelist73 + genelist74 + genelist75
 85 | genelist = sum(genelistlist, [])  # merge the list of lists
 86 | genelistuni = list( dict.fromkeys(genelist) )   # remove duplicates
 87 | 
 88 | 
 89 | 
 90 | 
 91 | ### ------------------------------------------------------------------------------------------------------- ###
 92 | ###    Preprocessing for spatial transcriptomics data
 93 | ### ------------------------------------------------------------------------------------------------------- ###
 94 | 
 95 | def Preprocess_SpTrans (studyID):
 96 |     """
 97 |     Preprocess the spatial transcriptomic raw data and obtain the optimal DE genes
 98 |     Parameters:
 99 |         studyID (string): the study ID of the LIBD datasets
100 |     Returns:
101 |         gene_topDE_list (list): the list of gene set that contains the highest DE genes between layers
102 |     """
103 |     # Read in the spatial transcriptomic data from a 10x Genomics-formatted HDF5 file
104 |     adata = sc.read_10x_h5(f"../data/LIBD/{studyID}/{studyID}_raw_feature_bc_matrix.h5")
105 |     
106 |     # Read in the spatial coordinates of the tissue samples from a CSV file
107 |     spatial = pd.read_csv(f"../data/LIBD/{studyID}/tissue_positions_list.txt", sep=",", header=None, na_filter=False, index_col=0) 
108 |     
109 |     # Add the spatial coordinates to the data object
110 |     adata.obs["x1"] = spatial[1]
111 |     adata.obs["x2"] = spatial[2]
112 |     adata.obs["x3"] = spatial[3]
113 |     
114 |     # Select only the samples that were captured
115 |     adata = adata[adata.obs["x1"] == 1]
116 |     
117 |     # Ensure that all gene names are uppercase
118 |     adata.var_names = [i.upper() for i in list(adata.var_names)]
119 |     
120 |     # Add the gene names as a column in the `var` attribute of the `adata` object
121 |     adata.var["genename"] = adata.var.index.astype("str")
122 |     
123 |     # Remove the "x1" column from the `obs` attribute of the `adata` object
124 |     del adata.obs["x1"]
125 |     
126 |     # Add a "Layer" column to the `obs` attribute of the `adata` object and initialize it to 0
127 |     adata.obs["Layer"] = 0
128 |     
129 |     # Define a list of layer names
130 |     LayerName =["L1", "L2", "L3", "L4", "L5", "L6", "WM"]
131 |     
132 |     # Loop through each layer and add the corresponding layer number to the `Layer` column of the `adata` object
133 |     # Also add the layer name as a separate column for convenience
134 |     for i in range(7):
135 |         Layer = pd.read_csv(f"../data/LIBD/{studyID}/{studyID}_{LayerName[i]}_barcodes.txt", sep=",", header=None, na_filter=False, index_col=0)
136 |         adata.obs.loc[Layer.index, "Layer"] = int(i+1)
137 |         adata.obs.loc[Layer.index, "Layer_character"] = LayerName[i]
138 |     
139 |     # Remove the spots without any layer label
140 |     data = adata[adata.obs["Layer"] != 0]
141 |     
142 |     # Make the gene names unique
143 |     adata.var_names_make_unique()
144 |     
145 |     # Filter out genes that are expressed in fewer than 3 cells
146 |     cel.prefilter_genes(adata, min_cells=3)
147 |     
148 |     # Filter out special genes (e.g. mitochondrial genes)
149 |     cel.prefilter_specialgenes(adata)
150 |     
151 |     # Normalize the data by cell and take the log of the UMI counts
152 |     sc.pp.normalize_per_cell(adata)
153 |     sc.pp.log1p(adata)
154 |     #
155 |     sc.tl.rank_genes_groups(adata, 'Layer_character', method = 'wilcoxon', key_added = "wilcoxon")
156 |     sc.pl.rank_genes_groups(adata, n_genes = 50, sharey = False, key="wilcoxon", save = '{studyID}.pdf'.format(studyID = studyID))
157 |     #
158 |     #  Filter the Genes that are selected by SpaGCN
159 |     genename = adata.var['genename']
160 |     genelistindex = [genename[genename == i].index[0] for i in genelistuni if  len(genename[genename == i])>0]  # only keep the genes that exists in SpT data
161 |     # Filter the genelist and output the results
162 |     bdata = adata[:,genelistindex]
163 |     cdata = sc.AnnData(X = bdata.X.toarray(), obs = bdata.obs, var = bdata.var, uns =bdata.uns, obsm = bdata.obsm)
164 |     cdata.write_h5ad("../data/LIBD/data_{studyID}.h5ad".format(studyID = studyID))
165 |     return genelistindex
166 | 
## Training Data
# First pass: every call returns the var index labels of the genes it kept, and
# that list is bound to the module-level `genelistuni`, which Preprocess_SpTrans
# reads on the next call. The gene list can therefore only stay the same or
# shrink from one section to the next.
# NOTE(review): presumably this narrows the list to genes found in all five
# sections -- confirm that this is the intent.
genelistuni = Preprocess_SpTrans(151673)
genelistuni = Preprocess_SpTrans(151674)
genelistuni = Preprocess_SpTrans(151675)
genelistuni = Preprocess_SpTrans(151676)
genelistuni = Preprocess_SpTrans(151507)

# Second pass: rerun every section with the final `genelistuni`; each call
# rewrites ../data/LIBD/data_{studyID}.h5ad. Return values are ignored here.
Preprocess_SpTrans(151673)
Preprocess_SpTrans(151674)
Preprocess_SpTrans(151675)


## Testing Data
Preprocess_SpTrans(151676)
Preprocess_SpTrans(151507)
182 | 
183 | 
184 | ### ------------------------------------------------------------------------------------------------------- ###
185 | ###    Create a merged data set for 73, 74 and 75
186 | ### ------------------------------------------------------------------------------------------------------- ###
187 | 
# Load the three preprocessed training sections and print a summary of each
dataSection1 = sc.read("../data/LIBD/data_151673.h5ad")
print(dataSection1)
dataSection2 = sc.read("../data/LIBD/data_151674.h5ad")
print(dataSection2)
dataSection3 = sc.read("../data/LIBD/data_151675.h5ad")
print(dataSection3)

# Stack the three sections into one AnnData object and store it
dataSection = dataSection1.concatenate(dataSection2, dataSection3)
print(dataSection)

dataSection.write_h5ad("../data/LIBD/MergeTrains737475.h5ad")

# Read the merged file back in
datakankan = sc.read("../data/LIBD/MergeTrains737475.h5ad")

# Reload section 151673 and print its summary again
dataSection1 = sc.read("../data/LIBD/data_151673.h5ad")
print(dataSection1)

# Export the spot annotation (.obs) of the two test sections as CSV
dataSection4 = sc.read("../data/LIBD/data_151676.h5ad")
dataSection4.obs.to_csv("../data/LIBD/visualization_151676.csv", sep = ",")

dataSection5 = sc.read("../data/LIBD/data_151507.h5ad")
dataSection5.obs.to_csv("../data/LIBD/visualization_151507.csv", sep = ",")

print(dataSection4)
218 | 


--------------------------------------------------------------------------------
/code_paper/2_Alzheimer/README.md:
--------------------------------------------------------------------------------
 1 | # Guide for Alzheimer's disease study
 2 | ## Pipeline
 3 | 
 4 | preprocess.py     ->  
 5 | CeLEry_train.py   ->
 6 | test.py
 7 | 
 8 | 
 9 | ## Datasets
10 | Data for training: http://spatial.libd.org/spatialLIBD/
11 | Query data: https://upenn.app.box.com/s/e8nf4b384s7oi3o09pj5s8jfdu11swim


--------------------------------------------------------------------------------
/code_paper/2_Alzheimer/test.py:
--------------------------------------------------------------------------------
  1 | ## In this study, we use LIBD data as the training set and evaluate the performance of the results on the Alzheimer data
  2 | 
  3 | import os,csv,re
  4 | import pandas as pd
  5 | import numpy as np
  6 | import scanpy as sc
  7 | import math
  8 | 
  9 | from skimage import io, color
 10 | from sklearn.cluster import KMeans
 11 | 
 12 | from scipy.sparse import issparse
 13 | import random, torch
 14 | import warnings
 15 | warnings.filterwarnings("ignore")
 16 | import matplotlib.colors as clr
 17 | import matplotlib.pyplot as plt
 18 | import pickle
 19 | 
 20 | #Read original data and save it to h5ad
 21 | from scanpy import read_10x_h5
 22 | os.chdir("SpaClusterPython")
 23 | #import SpaGCN as spg
 24 | import CeLEry as cel
 25 | from sklearn.decomposition import PCA
 26 | from sklearn.manifold import TSNE
 27 | 
 28 | # from data.LIBD.LIBD_gene_select import d_g
 29 | 
 30 | # import tangram as tg
 31 | 
 32 | ##  1. Data Preperation --------------------------------------------------------------------------
 33 | ### Load MouseBarin Data Section 1: Regarded as Spatial Transcriptomic Data
 34 | dataSection1 = sc.read("../data/Alzheimer/MergeTrains73747576.h5ad")
 35 | dataSection2 = sc.read("../data/Alzheimer/Alzheimer_spa_DE_snRNA_py.h5ad")
 36 | 
 37 | ## Conduct clustering
 38 | #cdata = dataSection1.copy()
 39 | #cel.getGeneImg(cdata,emptypixel = 0)
 40 | #cdataexpand =  np.expand_dims(cdata.GeneImg, axis=1) 
 41 | 
 42 | #cdatacentral = cel.centralize(cdataexpand.copy())
 43 | #direclust = [cdatacentral[x,0,:,:] for x in range(cdatacentral.shape[0])]
 44 | #direflat = [x.flat for x in direclust]
 45 | #direflatnp = np.stack(direflat)
 46 | 
 47 | ## implementing k-means clustering
 48 | #kmeansmodel =  KMeans(n_clusters=20, random_state=0)
 49 | #kmeans = kmeansmodel.fit(direflatnp)
 50 | #np.save("../output/Alzheimer/cluster.npy", kmeans.labels_)
 51 | 
 52 | 
 53 | ## Calculating z-score
 54 | cel.get_zscore(dataSection1)
 55 | cel.get_zscore(dataSection2)
 56 | 
 57 | ## global parameters
 58 | class_num = 7
 59 | pca = PCA(n_components=50)
 60 | 
 61 | 
 62 | ## Compute PCA of cells
 63 | principalComponents = pca.fit_transform(dataSection2.X)
 64 | PCs = ['PC_{i}'.format(i=i) for i in range(1,51)]
 65 | principalDf = pd.DataFrame(data = principalComponents, columns = PCs)
 66 | principalDf.to_csv("../output/Alzheimer/PCA_selectedGenes.csv")
 67 | 
 68 | 
 69 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
 70 | tsne_results = tsne.fit_transform(principalDf)
 71 | tSNEDf = pd.DataFrame(data = tsne_results)
 72 | tSNEDf.to_csv("../output/Alzheimer/tSNEDf.csv")
 73 | 
 74 | ## PCA of subcategories
 75 | # neuron
 76 | cellneuron = dataSection2[[(i in ["In", "Ex"]) for i in dataSection2.obs["final_celltype"] ] ]
 77 | principalComponents = pca.fit_transform(cellneuron.X)
 78 | PCs = ['PC_{i}'.format(i=i) for i in range(1,51)]
 79 | principalDf = pd.DataFrame(data = principalComponents, columns = PCs)
 80 | principalDf["names"] = cellneuron.obs["cellname"]
 81 | principalDf.to_csv("../output/Alzheimer/PCA_neuron.csv")
 82 | 
 83 | 
 84 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
 85 | tsne_results = tsne.fit_transform(principalDf)
 86 | tSNEDf = pd.DataFrame(data = tsne_results)
 87 | tSNEDf.to_csv("../output/Alzheimer/tSNEDf_neuron.csv")
 88 | 
 89 | # oli
 90 | celloli = dataSection2[dataSection2.obs["final_celltype"] == "Oli" ]
 91 | principalComponents = pca.fit_transform(celloli.X)
 92 | PCs = ['PC_{i}'.format(i=i) for i in range(1,51)]
 93 | principalDf = pd.DataFrame(data = principalComponents, columns = PCs)
 94 | principalDf.to_csv("../output/Alzheimer/PCA_oli.csv")
 95 | 
 96 | 
 97 | tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
 98 | tsne_results = tsne.fit_transform(principalDf)
 99 | tSNEDf = pd.DataFrame(data = tsne_results)
100 | tSNEDf.to_csv("../output/Alzheimer/tSNEDf_oli.csv")
101 | 
102 | 
103 | ##  2**. Test (layer 2-Stage ordinal logistic regression ) --------------------------------------------------------------------------
104 | 
105 | 
def report_prop_method_Alzheimer (folder, name, dataSection2, traindata, Val_loader, outname = ""):
    """
        Predict a layer for every observation with a saved DNN model and report the results.

        The model is unpickled from "{folder}/{name}.obj". The predicted layer is stored
        in dataSection2.obs ("pred_layer", "pred_layer_str"), the per-layer probabilities
        are written to "{folder}/{name}_probmat.csv", and the genes are ranked between
        the predicted layers (the ranking plot is saved by scanpy).

        :folder: string: specified the folder that keep the proposed DNN method
        :name: string: specified the name of the DNN method, also will be used to name the output files
        :dataSection2: AnnData: the data to predict on; its .obs is modified in place
        :traindata: AnnData: the data used in training. Not used by this function
        :Val_loader: Dataload: the validation data from dataloader; assumed to yield one
                     observation per batch, in the row order of dataSection2.obs
        :outname: string: specified the name of the output, default is the same as the name.
                  Not used by this function
        Relies on the module-level `class_num` (number of layers).
    """
    if outname == "":
        outname = name
    filename2 = "{folder}/{name}.obj".format(folder = folder, name = name)
    # NOTE: pickle.load can execute arbitrary code -- only load model files from a trusted source
    with open(filename2, 'rb') as filehandler:
        DNNmodel = pickle.load(filehandler)
    #
    n_obs = dataSection2.obs.shape[0]
    coords_predict = np.zeros(n_obs)
    # Column 0 holds the predicted layer, columns 1..class_num the probability of each layer
    payer_prob = np.zeros((n_obs, class_num+1))
    for i, img in enumerate(Val_loader):
        recon = DNNmodel(img)
        logitsvalue = np.squeeze(torch.sigmoid(recon[0]).detach().numpy(), axis = 0)
        if (logitsvalue[class_num-2] == 1):
            # The last sigmoid output saturated: assign the top layer with probability 1.
            # The top layer is column `class_num`; `class_num + 1` is out of bounds.
            coords_predict[i] = class_num
            payer_prob[i, class_num] = 1
        else:
            # Pad the sigmoid outputs with 1 in front and 0 behind; the negated
            # successive differences are used as the per-layer probabilities.
            logitsvalue_min = np.insert(logitsvalue, 0, 1, axis=0)
            logitsvalue_max = np.insert(logitsvalue_min, class_num, 0, axis=0)
            prbfull = -np.diff(logitsvalue_max)
            # Most probable layer (1-based); ties resolve to the highest layer
            coords_predict[i] = np.where(prbfull == prbfull.max())[0].max() + 1
            payer_prob[i,1:] = prbfull
    #
    dataSection2.obs["pred_layer"] = coords_predict.astype(int)
    dataSection2.obs["pred_layer_str"] = coords_predict.astype(int).astype('str')
    payer_prob[:,0] = dataSection2.obs["pred_layer"]
    np.savetxt("{folder}/{name}_probmat.csv".format(folder = folder, name = name), payer_prob, delimiter=',')
    sc.tl.rank_genes_groups(dataSection2, 'pred_layer_str', method = 'wilcoxon', key_added = "wilcoxon")
    sc.pl.rank_genes_groups(dataSection2, n_genes = 50, sharey = False, key="wilcoxon", save = 'Alzheimer_DE.pdf')
146 | 
147 | 
def Evaluate (testdata, traindata, beta, nrep):
    """
        Run the saved model "data_gene_All_layerv2_{beta}_n{nrep}" on testdata.

        :testdata: AnnData: the data to predict on
        :traindata: AnnData: the training data, handed on to report_prop_method_Alzheimer
        :beta: used to build the model name
        :nrep: used to build the model name
    """
    # Add a leading axis to X and swap axes 1 and 2 before wrapping it as a dataset
    expr = np.swapaxes(np.expand_dims(testdata.X, axis = 0), 1, 2)
    dataset = cel.wrap_gene_layer(expr, testdata.obs)
    # One observation per batch
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, num_workers = 4)
    model_name = f"data_gene_All_layerv2_{beta}_n{nrep}"
    report_prop_method_Alzheimer(
        folder = "../output/Alzheimer/Prediction",
        name = model_name,
        dataSection2 = testdata,
        traindata = traindata,
        Val_loader = loader,
    )
159 | 
160 | 
## Assign dummy layers to the cells since the true layers are not known
dataSection2.obs["layer"] = 0
# Write through .iloc on the frame itself: the chained form obs["layer"][0:7] = ...
# assigns into a temporary object and is not guaranteed to update obs.
dataSection2.obs.iloc[0:7, dataSection2.obs.columns.get_loc("layer")] = [0,1,2,3,4,5,6]
Evaluate(testdata = dataSection2, traindata = dataSection1, beta = 1e-5, nrep = 2)


Evaluate(testdata = dataSection2, traindata = dataSection1, beta = 1e-5, nrep = 10)
172 | 
def EvaluateOrg (testdata, traindata, coloruse = None):
    """
        Run the saved model "layer_PreOrgv2" on testdata.

        :testdata: AnnData: the data to predict on
        :traindata: AnnData: the training data, handed on to report_prop_method_Alzheimer
        :coloruse: kept for interface compatibility; not used, because
                   report_prop_method_Alzheimer has no such parameter
    """
    ## Wrap up Validation data in to dataloader
    vdatax = np.expand_dims(testdata.X, axis = 0)
    vdata_rs = np.swapaxes(vdatax, 1, 2)
    DataVal = cel.wrap_gene_layer(vdata_rs, testdata.obs)
    Val_loader= torch.utils.data.DataLoader(DataVal, batch_size=1, num_workers = 4)
    #
    # `coloruse` is no longer forwarded: passing it raised a TypeError
    # (unexpected keyword argument) in report_prop_method_Alzheimer.
    report_prop_method_Alzheimer(folder = "../output/Alzheimer/Prediction",
                       name = "layer_PreOrgv2",
                       dataSection2 = testdata, traindata = traindata,
                       Val_loader = Val_loader)
184 | 
185 | EvaluateOrg (testdata = dataSection2, traindata = dataSection1)
186 | 
187 | 


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/CeLEry_brain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import CeLEry as cel\n",
 19 |     "\n",
 20 |     "import os,csv,re\n",
 21 |     "import pandas as pd\n",
 22 |     "import numpy as np\n",
 23 |     "import scanpy as sc\n",
 24 |     "import math\n",
 25 |     "from skimage import io, color\n",
 26 |     "\n",
 27 |     "from scipy.sparse import issparse\n",
 28 |     "import random, torch\n",
 29 |     "import warnings\n",
 30 |     "warnings.filterwarnings(\"ignore\")\n",
 31 |     "import pickle\n",
 32 |     "from sklearn.model_selection import train_test_split\n",
 33 |     "from anndata import AnnData, read_h5ad\n",
 34 |     "\n",
 35 |     "import json"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0)   \n",
 45 |     "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n",
 46 |     "\n",
 47 |     "data_merfish = AnnData(data_merfish_raw)\n",
 48 |     "\n",
 49 |     "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n",
 50 |     "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n",
 51 |     "data_merfish_raw = data_merfish.copy()\n",
 52 |     "\n",
 53 |     "sc.pp.filter_cells(data_merfish, min_counts=500)\n",
 54 |     "sc.pp.filter_cells(data_merfish, min_genes=100)\n",
 55 |     "\n",
 56 |     "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n",
 57 |     "sc.tl.louvain(data_merfish, 0.4, random_state=1)"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 3,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
 67 |     "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 4,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "cel.get_zscore(Qdata)\n",
 77 |     "cel.get_zscore(Rdata)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 5,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stderr",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "100%|██████████| 2293/2293 [00:08<00:00, 284.61it/s]\n"
 90 |      ]
 91 |     },
 92 |     {
 93 |      "name": "stdout",
 94 |      "output_type": "stream",
 95 |      "text": [
 96 |       "Epoch:1, Loss:75.2848\n"
 97 |      ]
 98 |     },
 99 |     {
100 |      "name": "stderr",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "100%|██████████| 2293/2293 [00:07<00:00, 311.46it/s]"
104 |      ]
105 |     },
106 |     {
107 |      "name": "stdout",
108 |      "output_type": "stream",
109 |      "text": [
110 |       "Epoch:2, Loss:35.4318\n",
111 |       "Folder already exists\n"
112 |      ]
113 |     },
114 |     {
115 |      "name": "stderr",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "\n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "## right brain as training, left brain as testing\n",
124 |     "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [100, 50, 20], num_epochs_max = 2000, path = \"output/brain\", filename = \"brain_left\")"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 9,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "pred_cord = cel.Predict_cord (data_test = Qdata, path = \"output/brain\", filename = \"brain_left\")\n",
134 |     "pred_cord[:,0] = 1-pred_cord[:,0]\n",
135 |     "\n",
136 |     "data_train = Qdata.copy()\n",
137 |     "traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n",
138 |     "tdatax = np.expand_dims(traindata, axis = 0)\n",
139 |     "tdata_rs = np.swapaxes(tdatax, 1, 2)\n",
140 |     "test_cord = cel.wrap_gene_location(tdata_rs, data_train.obs[['x_cord', 'y_cord']])\n",
141 |     "\n",
142 |     "pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n",
143 |     "pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n",
144 |     "pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n",
145 |     "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n",
146 |     "pred_cord_transform.index = Qdata.obs.index\n",
147 |     "pred_cord_transform.columns = ['x', 'y']\n",
148 |     "\n",
149 |     "pred_cord_transform.to_csv(\"output/brain/celery_brain_left.csv\")"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 51,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "## left brain as training, right brain as testing\n",
159 |     "model_train = cel.Fit_cord (data_train = Qdata, hidden_dims = [100, 50, 20], num_epochs_max = 2000, path = \"output/brain\", filename = \"brain_right\")"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 57,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "pred_cord = cel.Predict_cord (data_test = Qdata, path = \"output/brain\", filename = \"brain_right\")\n",
169 |     "pred_cord[:,0] = 1-pred_cord[:,0]\n",
170 |     "\n",
171 |     "data_train = Rdata.copy()\n",
172 |     "traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n",
173 |     "tdatax = np.expand_dims(traindata, axis = 0)\n",
174 |     "tdata_rs = np.swapaxes(tdatax, 1, 2)\n",
175 |     "test_cord = cel.wrap_gene_location(tdata_rs, data_train.obs[['x_cord', 'y_cord']])\n",
176 |     "\n",
177 |     "pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n",
178 |     "pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n",
179 |     "pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n",
180 |     "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n",
181 |     "pred_cord_transform.index = Rdata.obs.index\n",
182 |     "pred_cord_transform.columns = ['x', 'y']\n",
183 |     "\n",
184 |     "pred_cord_transform.to_csv(\"output/brain/celery_brain_right.csv\")"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": []
193 |   }
194 |  ],
195 |  "metadata": {
196 |   "kernelspec": {
197 |    "display_name": "Python 3.8.15 64-bit",
198 |    "language": "python",
199 |    "name": "python3"
200 |   },
201 |   "language_info": {
202 |    "codemirror_mode": {
203 |     "name": "ipython",
204 |     "version": 3
205 |    },
206 |    "file_extension": ".py",
207 |    "mimetype": "text/x-python",
208 |    "name": "python",
209 |    "nbconvert_exporter": "python",
210 |    "pygments_lexer": "ipython3",
211 |    "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]"
212 |   },
213 |   "orig_nbformat": 4,
214 |   "vscode": {
215 |    "interpreter": {
216 |     "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608"
217 |    }
218 |   }
219 |  },
220 |  "nbformat": 4,
221 |  "nbformat_minor": 2
222 | }
223 | 


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/CeLEry_figure 6_scenario 2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 19,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import CeLEry as cel\n",
 10 |     "\n",
 11 |     "import os\n",
 12 |     "import pandas as pd\n",
 13 |     "import numpy as np\n",
 14 |     "import scanpy as sc\n",
 15 |     "import scipy\n",
 16 |     "\n",
 17 |     "from scipy.sparse import issparse\n",
 18 |     "from anndata import concat\n",
 19 |     "import warnings\n",
 20 |     "warnings.filterwarnings(\"ignore\")\n",
 21 |     "from anndata import AnnData\n",
 22 |     "from tqdm import tqdm"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "metadata": {},
 29 |    "outputs": [
 30 |     {
 31 |      "name": "stderr",
 32 |      "output_type": "stream",
 33 |      "text": [
 34 |       "100%|██████████| 2/2 [00:00<00:00,  4.81it/s]\n"
 35 |      ]
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "d11 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0)   \n",
 40 |     "d11_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n",
 41 |     "d12 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_by_gene_S1R2.csv\", index_col=0)   \n",
 42 |     "d12_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_metadata_S1R2.csv\", index_col=0)\n",
 43 |     "\n",
 44 |     "d11 = AnnData(d11)\n",
 45 |     "d11.obs['x_cord'] = d11_meta['center_x'].tolist()\n",
 46 |     "d11.obs['y_cord'] = d11_meta['center_y'].tolist()\n",
 47 |     "d11.obs['source'] = \"S1R1\"\n",
 48 |     "\n",
 49 |     "d12 = AnnData(d12)\n",
 50 |     "d12.obs['x_cord'] = d12_meta['center_x'].tolist()\n",
 51 |     "d12.obs['y_cord'] = d12_meta['center_y'].tolist()\n",
 52 |     "d12.obs['source'] = \"S1R2\"\n",
 53 |     "\n",
 54 |     "data = [d11, d12]\n",
 55 |     "for d in tqdm(data):\n",
 56 |     "    sc.pp.filter_cells(d, min_counts=500)\n",
 57 |     "    sc.pp.filter_cells(d, min_genes=100)\n",
 58 |     "\n",
 59 |     "d_tot = concat([d11, d12])\n",
 60 |     "sc.pp.neighbors(d_tot, n_neighbors = 15, use_rep=\"X\")\n",
 61 |     "sc.tl.louvain(d_tot, 0.4, random_state=1)\n",
 62 |     "\n",
 63 |     "d11 = d_tot[d_tot.obs['source'] == \"S1R1\"].copy()\n",
 64 |     "d12 = d_tot[d_tot.obs['source'] == \"S1R2\"].copy()"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "In this scenario, we choose the right half of replicate S1R2 as the training set and predict the locations of the S1R1 cells.\n",
 72 |     "\n",
 73 |     "The cutting line separating the right half of S1R2 is x*6/11 + 2436.36 - y = 0. It is manually defined and gives only a rough separation."
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 3,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "d12_right = d12[d12.obs['x_cord']*6/11 + 2436.36 - d12.obs['y_cord'] > 0].copy()\n",
 83 |     "\n",
 84 |     "d11_left = d11[d11.obs['x_cord'] < np.quantile(d11.obs['x_cord'], 0.5)].copy()\n",
 85 |     "d11_right = d11[d11.obs['x_cord'] >= np.quantile(d11.obs['x_cord'], 0.5)].copy()"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "Rdata = d12_right.copy()\n",
 95 |     "\n",
 96 |     "cel.get_zscore(Rdata)\n",
 97 |     "Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]\n",
 98 |     "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [150, 100, 70, 50, 20], num_epochs_max = 500, path = \"output/fig6\", filename = \"fig6_2\")"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 12,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "def pred_transform(pred_cord):\n",
108 |     "    data_train = Rdata.copy()\n",
109 |     "    traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n",
110 |     "    tdatax = np.expand_dims(traindata, axis = 0)\n",
111 |     "    tdata_rs = np.swapaxes(tdatax, 1, 2)\n",
112 |     "    test_cord = cel.wrap_gene_location(tdata_rs, Rdata.obs[['x_cord', 'y_cord']])\n",
113 |     "\n",
114 |     "    pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n",
115 |     "    pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n",
116 |     "    pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n",
117 |     "    return pred_cord_transform"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": [
126 |     "cel.get_zscore(d11_left)\n",
127 |     "cel.get_zscore(d11_right)\n",
128 |     "\n",
129 |     "pred_cord_left = cel.Predict_cord (data_test = d11_left, path = \"output/fig6\", filename = \"fig6_2\")\n",
130 |     "pred_cord_transform_left = pred_transform(pred_cord_left)\n",
131 |     "\n",
132 |     "pred_cord_right = cel.Predict_cord (data_test = d11_right, path = \"output/fig6\", filename = \"fig6_2\")\n",
133 |     "pred_cord_transform_right = pred_transform(pred_cord_right)"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "os.makedirs(\"output/fig6\", exist_ok=True)\n",
143 |     "np.save(\"output/fig6/fig6_2_left_celery.npy\", pred_cord_transform_left)\n",
144 |     "np.save(\"output/fig6/fig6_2_right_celery.npy\", pred_cord_transform_right)"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "At this point, we can already compare the correlation between the true and predicted pairwise distances."
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "d11_left.obs['x_celery'] = pred_cord_transform_left[:,0]\n",
161 |     "d11_left.obs['y_celery'] = pred_cord_transform_left[:,1]\n",
162 |     "\n",
163 |     "d11_right.obs['x_celery'] = pred_cord_transform_right[:,0]\n",
164 |     "d11_right.obs['y_celery'] = pred_cord_transform_right[:,1]\n",
165 |     "\n",
166 |     "def distCompute(data_merfish):\n",
167 |     "    celery_dist = []\n",
168 |     "    true_dist = []\n",
169 |     "    Qdata_loc = np.array(data_merfish.obs[['x_cord', 'y_cord']])\n",
170 |     "    celery_pred = np.array(data_merfish.obs[['x_celery', 'y_celery']])\n",
171 |     "\n",
172 |     "    for i in tqdm(range(Qdata_loc.shape[0])):\n",
173 |     "        celery_i = celery_pred[i, :]\n",
174 |     "        celery_points = celery_pred[i+1:, :]\n",
175 |     "        celery_dist.extend(np.sqrt(np.sum((celery_points - celery_i)**2, axis=1)))\n",
176 |     "\n",
177 |     "\n",
178 |     "        true_i = Qdata_loc[i, :]\n",
179 |     "        true_points = Qdata_loc[i+1:, :]\n",
180 |     "        true_dist.extend(np.sqrt(np.sum((true_points - true_i)**2, axis=1)))\n",
181 |     "    return celery_dist, true_dist\n",
182 |     "\n",
183 |     "celery_dist, true_dist = distCompute(d11_left)\n",
184 |     "celery_dist_r, true_dist_r = distCompute(d11_right)\n",
185 |     "\n",
186 |     "celery_dist.extend(celery_dist_r)\n",
187 |     "true_dist.extend(true_dist_r)\n",
188 |     "\n",
189 |     "print(scipy.stats.pearsonr(true_dist, celery_dist))"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "However, the predicted locations of the testing set lie in the domain of the training set locations. If we would like to compare the Euclidean distance between the true and predicted locations, manual matching is required to first roughly align the domain and rotation of the training set and the testing set. This step is imperfect, but it helps us compare the performance of different methods within a single scenario."
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 21,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "## Reflect the predicted locations of the left brain across our separation line so that they fall on the left side.\n",
206 |     "\n",
207 |     "A = 6/11\n",
208 |     "C = 2436.36\n",
209 |     "B = -1\n",
210 |     "def pointTrans(celery_pred, left, xname, yname):\n",
211 |     "    x = celery_pred[:, 0]\n",
212 |     "    y = celery_pred[:, 1]\n",
213 |     "    x1 = x - 2*A*((A*x + B*y + C)/(A*A + B*B))\n",
214 |     "    y1 = y - 2*B*((A*x + B*y + C)/(A*A + B*B))\n",
215 |     "    left.obs[xname] = x1\n",
216 |     "    left.obs[yname] = y1\n",
217 |     "    # return x1, y1\n",
218 |     "\n",
219 |     "pointTrans(pred_cord_transform_left, d11_left, \"x_celery\", \"y_celery\")\n",
220 |     "Qdata = concat([d11_left, d11_right])"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 22,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "## Manual matching\n",
230 |     "\n",
231 |     "def rotateMatrix(a):\n",
232 |     "    return np.array([[np.cos(a), -np.sin(a)], [np.sin(a), np.cos(a)]])\n",
233 |     "\n",
234 |     "x0=np.quantile(d11.obs['x_cord'], 0.5)\n",
235 |     "y0=5000\n",
236 |     "\n",
237 |     "\n",
238 |     "def anim(xy, i):\n",
239 |     "   newxy=(xy-[x0,y0]) @ rotateMatrix(-2*i*np.pi/180) + [x0,y0]\n",
240 |     "   return newxy\n",
241 |     "\n",
242 |     "\n",
243 |     "newxy = anim(np.array(Qdata.obs[['x_cord', 'y_cord']]), -30)\n",
244 |     "Qdata.obs['x_rotate'] = newxy[:, 0]\n",
245 |     "Qdata.obs['y_rotate'] = newxy[:, 1]\n",
246 |     "Qdata.obs['y_rotate'] = Qdata.obs['y_rotate'] + 500\n",
247 |     "Qdata.obs['x_rotate'] = Qdata.obs['x_rotate'] + 800"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "sq = lambda x, y: (x - y)**2\n",
257 |     "pred_dist_celery = np.sqrt(np.sum(sq(np.array(Qdata.obs[['x_rotate', 'y_rotate']]), np.array(Qdata.obs[['x_celery', 'y_celery']])), axis=1))\n",
258 |     "print(np.median(pred_dist_celery))"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": []
267 |   }
268 |  ],
269 |  "metadata": {
270 |   "kernelspec": {
271 |    "display_name": "Python 3",
272 |    "language": "python",
273 |    "name": "python3"
274 |   },
275 |   "language_info": {
276 |    "codemirror_mode": {
277 |     "name": "ipython",
278 |     "version": 3
279 |    },
280 |    "file_extension": ".py",
281 |    "mimetype": "text/x-python",
282 |    "name": "python",
283 |    "nbconvert_exporter": "python",
284 |    "pygments_lexer": "ipython3",
285 |    "version": "3.8.15"
286 |   },
287 |   "orig_nbformat": 4,
288 |   "vscode": {
289 |    "interpreter": {
290 |     "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608"
291 |    }
292 |   }
293 |  },
294 |  "nbformat": 4,
295 |  "nbformat_minor": 2
296 | }
297 | 


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/CeLEry_figure 6_scenario 3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import CeLEry as cel\n",
 10 |     "\n",
 11 |     "import os\n",
 12 |     "import pandas as pd\n",
 13 |     "import numpy as np\n",
 14 |     "import scanpy as sc\n",
 15 |     "import scipy\n",
 16 |     "\n",
 17 |     "from scipy.sparse import issparse\n",
 18 |     "from anndata import concat\n",
 19 |     "import warnings\n",
 20 |     "warnings.filterwarnings(\"ignore\")\n",
 21 |     "from anndata import AnnData\n",
 22 |     "from tqdm import tqdm"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "metadata": {},
 29 |    "outputs": [
 30 |     {
 31 |      "name": "stderr",
 32 |      "output_type": "stream",
 33 |      "text": [
 34 |       "100%|██████████| 3/3 [00:00<00:00,  4.54it/s]\n"
 35 |      ]
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "d13 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate3_cell_by_gene_S1R3.csv\", index_col=0)   \n",
 40 |     "d13_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate3_cell_metadata_S1R3.csv\", index_col=0)\n",
 41 |     "d12 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_by_gene_S1R2.csv\", index_col=0)   \n",
 42 |     "d12_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_metadata_S1R2.csv\", index_col=0)\n",
 43 |     "d11 = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0)   \n",
 44 |     "d11_meta = pd.read_csv(\"data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n",
 45 |     "\n",
 46 |     "d11 = AnnData(d11)\n",
 47 |     "d11.obs['x_cord'] = d11_meta['center_x'].tolist()\n",
 48 |     "d11.obs['y_cord'] = d11_meta['center_y'].tolist()\n",
 49 |     "d11.obs['source'] = \"S1R1\"\n",
 50 |     "\n",
 51 |     "d12 = AnnData(d12)\n",
 52 |     "d12.obs['x_cord'] = d12_meta['center_x'].tolist()\n",
 53 |     "d12.obs['y_cord'] = d12_meta['center_y'].tolist()\n",
 54 |     "d12.obs['source'] = \"S1R2\"\n",
 55 |     "\n",
 56 |     "d13 = AnnData(d13)\n",
 57 |     "d13.obs['x_cord'] = d13_meta['center_x'].tolist()\n",
 58 |     "d13.obs['y_cord'] = d13_meta['center_y'].tolist()\n",
 59 |     "d13.obs['source'] = \"S1R3\"\n",
 60 |     "\n",
 61 |     "data = [d11, d12, d13]\n",
 62 |     "for d in tqdm(data):\n",
 63 |     "    sc.pp.filter_cells(d, min_counts=500)\n",
 64 |     "    sc.pp.filter_cells(d, min_genes=100)\n",
 65 |     "\n",
 66 |     "d_tot = concat([d11, d12, d13])\n",
 67 |     "sc.pp.neighbors(d_tot, n_neighbors = 15, use_rep=\"X\")\n",
 68 |     "sc.tl.louvain(d_tot, 0.2, random_state=1)\n",
 69 |     "\n",
 70 |     "d11 = d_tot[d_tot.obs['source'] == \"S1R1\"].copy()\n",
 71 |     "d12 = d_tot[d_tot.obs['source'] == \"S1R2\"].copy()\n",
 72 |     "d13 = d_tot[d_tot.obs['source'] == \"S1R3\"].copy()"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "In this scenario, we choose the right halves of replicates S1R2 and S1R3 as the training set, and predict the locations of S1R1 cells. Since the right halves of S1R2 and S1R3 are already matched, no further alignment is needed.\n",
 80 |     "\n",
 81 |     "The cutting line used to separate the right halves of S1R2 and S1R3 is x*6/11 + 2436.36 - y = 0. It is manually defined and gives only a rough separation."
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 3,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "d11_left = d11[d11.obs['x_cord'] < np.quantile(d11.obs['x_cord'], 0.5)]\n",
 91 |     "d11_right = d11[d11.obs['x_cord'] >= np.quantile(d11.obs['x_cord'], 0.5)]\n",
 92 |     "\n",
 93 |     "d12 = d12[d12.obs['x_cord']*6/11 + 2436.36 - d12.obs['y_cord'] > 0]\n",
 94 |     "d13 = d13[d13.obs['x_cord']*6/11 + 2436.36 - d13.obs['y_cord'] > 0]\n",
 95 |     "d_training = concat([d12, d13])"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "Rdata = d_training.copy()\n",
105 |     "\n",
106 |     "cel.get_zscore(Rdata)\n",
107 |     "Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]\n",
108 |     "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [200, 120, 70, 50, 20], num_epochs_max = 500, path = \"output/fig6\", filename = \"fig6_3\")"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 5,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "def pred_transform(pred_cord):\n",
118 |     "    data_train = Rdata.copy()\n",
119 |     "    traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n",
120 |     "    tdatax = np.expand_dims(traindata, axis = 0)\n",
121 |     "    tdata_rs = np.swapaxes(tdatax, 1, 2)\n",
122 |     "    test_cord = cel.wrap_gene_location(tdata_rs, Rdata.obs[['x_cord', 'y_cord']])\n",
123 |     "\n",
124 |     "    pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n",
125 |     "    pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n",
126 |     "    pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n",
127 |     "    return pred_cord_transform"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 6,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "cel.get_zscore(d11_left)\n",
137 |     "cel.get_zscore(d11_right)\n",
138 |     "\n",
139 |     "pred_cord_left = cel.Predict_cord (data_test = d11_left, path = \"output/fig6\", filename = \"fig6_3\")\n",
140 |     "pred_cord_transform_left = pred_transform(pred_cord_left)\n",
141 |     "\n",
142 |     "pred_cord_right = cel.Predict_cord (data_test = d11_right, path = \"output/fig6\", filename = \"fig6_3\")\n",
143 |     "pred_cord_transform_right = pred_transform(pred_cord_right)     \n"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "os.makedirs(\"output/fig6\", exist_ok=True)\n",
153 |     "np.save(\"output/fig6/fig6_3_left_celery.npy\", pred_cord_transform_left)\n",
154 |     "np.save(\"output/fig6/fig6_3_right_celery.npy\", pred_cord_transform_right)"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "markdown",
159 |    "metadata": {},
160 |    "source": [
161 |     "At this point, we can already compare the correlation between the true and predicted pairwise distances."
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "d11_left.obs['x_celery'] = pred_cord_transform_left[:,0]\n",
171 |     "d11_left.obs['y_celery'] = pred_cord_transform_left[:,1]\n",
172 |     "\n",
173 |     "d11_right.obs['x_celery'] = pred_cord_transform_right[:,0]\n",
174 |     "d11_right.obs['y_celery'] = pred_cord_transform_right[:,1]\n",
175 |     "\n",
176 |     "def distCompute(data_merfish):\n",
177 |     "    celery_dist = []\n",
178 |     "    true_dist = []\n",
179 |     "    Qdata_loc = np.array(data_merfish.obs[['x_cord', 'y_cord']])\n",
180 |     "    celery_pred = np.array(data_merfish.obs[['x_celery', 'y_celery']])\n",
181 |     "\n",
182 |     "    for i in tqdm(range(Qdata_loc.shape[0])):\n",
183 |     "        celery_i = celery_pred[i, :]\n",
184 |     "        celery_points = celery_pred[i+1:, :]\n",
185 |     "        celery_dist.extend(np.sqrt(np.sum((celery_points - celery_i)**2, axis=1)))\n",
186 |     "\n",
187 |     "\n",
188 |     "        true_i = Qdata_loc[i, :]\n",
189 |     "        true_points = Qdata_loc[i+1:, :]\n",
190 |     "        true_dist.extend(np.sqrt(np.sum((true_points - true_i)**2, axis=1)))\n",
191 |     "    return celery_dist, true_dist\n",
192 |     "\n",
193 |     "celery_dist, true_dist = distCompute(d11_left)\n",
194 |     "celery_dist_r, true_dist_r = distCompute(d11_right)\n",
195 |     "\n",
196 |     "celery_dist.extend(celery_dist_r)\n",
197 |     "true_dist.extend(true_dist_r)\n",
198 |     "\n",
199 |     "print(scipy.stats.pearsonr(true_dist, celery_dist))"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "However, the predicted locations of the testing set lie in the domain of the training set locations. If we would like to compare the Euclidean distance between the true and predicted locations, manual matching is required to first roughly align the domain and rotation of the training set and the testing set. This step is imperfect, but it helps us compare the performance of different methods within a single scenario."
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 8,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "## Reflect the predicted locations of the left brain across our separation line so that they fall on the left side.\n",
216 |     "\n",
217 |     "A = 6/11\n",
218 |     "C = 2436.36\n",
219 |     "B = -1\n",
220 |     "def pointTrans(celery_pred, left, xname, yname):\n",
221 |     "    x = celery_pred[:, 0]\n",
222 |     "    y = celery_pred[:, 1]\n",
223 |     "    x1 = x - 2*A*((A*x + B*y + C)/(A*A + B*B))\n",
224 |     "    y1 = y - 2*B*((A*x + B*y + C)/(A*A + B*B))\n",
225 |     "    left.obs[xname] = x1\n",
226 |     "    left.obs[yname] = y1\n",
227 |     "    # return x1, y1\n",
228 |     "\n",
229 |     "pointTrans(pred_cord_transform_left, d11_left, \"x_celery\", \"y_celery\")\n",
230 |     "Qdata = concat([d11_left, d11_right])"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 15,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "## Manual matching\n",
240 |     "\n",
241 |     "def rotateMatrix(a):\n",
242 |     "    return np.array([[np.cos(a), -np.sin(a)], [np.sin(a), np.cos(a)]])\n",
243 |     "\n",
244 |     "x0=np.quantile(d11.obs['x_cord'], 0.5)\n",
245 |     "y0=5000\n",
246 |     "\n",
247 |     "\n",
248 |     "def anim(xy, i):\n",
249 |     "   newxy=(xy-[x0,y0]) @ rotateMatrix(-2*i*np.pi/180) + [x0,y0]\n",
250 |     "   return newxy\n",
251 |     "\n",
252 |     "\n",
253 |     "newxy = anim(np.array(Qdata.obs[['x_cord', 'y_cord']]), -30)\n",
254 |     "Qdata.obs['x_rotate'] = newxy[:, 0]\n",
255 |     "Qdata.obs['y_rotate'] = newxy[:, 1]\n",
256 |     "Qdata.obs['y_rotate'] = Qdata.obs['y_rotate'] + 500\n",
257 |     "Qdata.obs['x_rotate'] = Qdata.obs['x_rotate'] + 800"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "sq = lambda x, y: (x - y)**2\n",
267 |     "pred_dist_celery = np.sqrt(np.sum(sq(np.array(Qdata.obs[['x_rotate', 'y_rotate']]), np.array(Qdata.obs[['x_celery', 'y_celery']])), axis=1))\n",
268 |     "print(np.median(pred_dist_celery))"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": null,
274 |    "metadata": {},
275 |    "outputs": [],
276 |    "source": []
277 |   }
278 |  ],
279 |  "metadata": {
280 |   "kernelspec": {
281 |    "display_name": "Python 3",
282 |    "language": "python",
283 |    "name": "python3"
284 |   },
285 |   "language_info": {
286 |    "codemirror_mode": {
287 |     "name": "ipython",
288 |     "version": 3
289 |    },
290 |    "file_extension": ".py",
291 |    "mimetype": "text/x-python",
292 |    "name": "python",
293 |    "nbconvert_exporter": "python",
294 |    "pygments_lexer": "ipython3",
295 |    "version": "3.8.15"
296 |   },
297 |   "orig_nbformat": 4,
298 |   "vscode": {
299 |    "interpreter": {
300 |     "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608"
301 |    }
302 |   }
303 |  },
304 |  "nbformat": 4,
305 |  "nbformat_minor": 2
306 | }
307 | 


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/README.md:
--------------------------------------------------------------------------------
1 | # Note for Brain MERFISH data
2 | 
3 | The datasets are available at https://app.box.com/s/6nz5vlp0hjmuq9xruxog96p2woyt5fpm


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/SpaOTsc_brain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "import numpy as np\n",
 20 |     "import pandas as pd\n",
 21 |     "import scanpy as sc\n",
 22 |     "import matplotlib.pyplot as plt\n",
 23 |     "from scipy.spatial.distance import cdist, squareform, pdist\n",
 24 |     "from scipy.stats import ks_2samp\n",
 25 |     "from scipy.stats import pearsonr\n",
 26 |     "\n",
 27 |     "import os,csv,re\n",
 28 |     "import math\n",
 29 |     "from skimage import io, color\n",
 30 |     "\n",
 31 |     "from scipy.sparse import issparse\n",
 32 |     "import random, torch\n",
 33 |     "import warnings\n",
 34 |     "warnings.filterwarnings(\"ignore\")\n",
 35 |     "import pickle\n",
 36 |     "from sklearn.model_selection import train_test_split\n",
 37 |     "from anndata import AnnData, read_h5ad\n",
 38 |     "import seaborn as sns\n",
 39 |     "\n",
 40 |     "import json\n",
 41 |     "\n",
 42 |     "from spaotsc import SpaOTsc"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0)   \n",
 52 |     "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n",
 53 |     "\n",
 54 |     "data_merfish = AnnData(data_merfish_raw)\n",
 55 |     "\n",
 56 |     "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n",
 57 |     "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n",
 58 |     "data_merfish_raw = data_merfish.copy()\n",
 59 |     "\n",
 60 |     "sc.pp.filter_cells(data_merfish, min_counts=500)\n",
 61 |     "sc.pp.filter_cells(data_merfish, min_genes=100)\n",
 62 |     "\n",
 63 |     "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n",
 64 |     "sc.tl.louvain(data_merfish, 0.4, random_state=1)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 6,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
 74 |     "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 7,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "## right brain as training, left brain as testing\n",
 84 |     "datatest = Qdata.copy()\n",
 85 |     "datatrain = Rdata.copy()\n",
 86 |     "random.seed(2021)\n",
 87 |     "torch.manual_seed(2021)\n",
 88 |     "np.random.seed(2021)\n",
 89 |     "## Running spaOTsc\n",
 90 |     "df_sc = pd.DataFrame(datatest.X)\n",
 91 |     "is_dmat = cdist(np.array(datatrain.obs.iloc[:,0:2]), np.array(datatrain.obs.iloc[:,0:2]), 'euclidean')\n",
 92 |     "sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')\n",
 93 |     "\n",
 94 |     "spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)\n",
 95 |     "cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')\n",
 96 |     "location_pred = spsc.transport_plan(cost_matrix)"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "location_sum = np.sum(location_pred, axis=1)\n",
106 |     "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n",
107 |     "\n",
108 |     "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
109 |     "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n",
110 |     "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n",
111 |     "pred_cord_transform.index = Qdata.obs.index\n",
112 |     "pred_cord_transform.columns = ['x', 'y']\n",
113 |     "\n",
114 |     "pred_cord_transform.to_csv(\"output/brain/spaotsc_brain_left.csv\")"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": []
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "## left brain as training, right brain as testing\n",
131 |     "datatest = Rdata.copy()\n",
132 |     "datatrain = Qdata.copy()\n",
133 |     "random.seed(2021)\n",
134 |     "torch.manual_seed(2021)\n",
135 |     "np.random.seed(2021)\n",
136 |     "## Running spaOTsc\n",
137 |     "df_sc = pd.DataFrame(datatest.X)\n",
138 |     "is_dmat = cdist(np.array(datatrain.obs.iloc[:,0:2]), np.array(datatrain.obs.iloc[:,0:2]), 'euclidean')\n",
139 |     "sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')\n",
140 |     "\n",
141 |     "spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)\n",
142 |     "cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')\n",
143 |     "location_pred = spsc.transport_plan(cost_matrix)"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "location_sum = np.sum(location_pred, axis=1)\n",
153 |     "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n",
154 |     "\n",
155 |     "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
156 |     "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n",
157 |     "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n",
158 |     "pred_cord_transform.index = Qdata.obs.index\n",
159 |     "pred_cord_transform.columns = ['x', 'y']\n",
160 |     "\n",
161 |     "pred_cord_transform.to_csv(\"output/brain/spaotsc_brain_right.csv\")"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": []
170 |   }
171 |  ],
172 |  "metadata": {
173 |   "kernelspec": {
174 |    "display_name": "Python 3.10.8 64-bit",
175 |    "language": "python",
176 |    "name": "python3"
177 |   },
178 |   "language_info": {
179 |    "codemirror_mode": {
180 |     "name": "ipython",
181 |     "version": 3
182 |    },
183 |    "file_extension": ".py",
184 |    "mimetype": "text/x-python",
185 |    "name": "python",
186 |    "nbconvert_exporter": "python",
187 |    "pygments_lexer": "ipython3",
188 |    "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]"
189 |   },
190 |   "orig_nbformat": 4,
191 |   "vscode": {
192 |    "interpreter": {
193 |     "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
194 |    }
195 |   }
196 |  },
197 |  "nbformat": 4,
198 |  "nbformat_minor": 2
199 | }
200 | 


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/Tangram_brain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import os, sys\n",
 19 |     "import numpy as np\n",
 20 |     "import pandas as pd\n",
 21 |     "import matplotlib.pyplot as plt\n",
 22 |     "import seaborn as sns\n",
 23 |     "import scanpy as sc\n",
 24 |     "import torch\n",
 25 |     "import tangram as tg\n",
 26 |     "from sklearn.model_selection import train_test_split\n",
 27 |     "from anndata import AnnData, read_h5ad\n",
 28 |     "import random"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 2,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0)   \n",
 38 |     "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n",
 39 |     "\n",
 40 |     "data_merfish = AnnData(data_merfish_raw)\n",
 41 |     "\n",
 42 |     "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n",
 43 |     "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n",
 44 |     "data_merfish_raw = data_merfish.copy()\n",
 45 |     "\n",
 46 |     "sc.pp.filter_cells(data_merfish, min_counts=500)\n",
 47 |     "sc.pp.filter_cells(data_merfish, min_genes=100)\n",
 48 |     "\n",
 49 |     "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n",
 50 |     "sc.tl.louvain(data_merfish, 0.4, random_state=1)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 6,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "## right brain as training, left brain as testing\n",
 60 |     "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
 61 |     "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 7,
 67 |    "metadata": {},
 68 |    "outputs": [
 69 |     {
 70 |      "name": "stderr",
 71 |      "output_type": "stream",
 72 |      "text": [
 73 |       "INFO:root:649 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.\n",
 74 |       "INFO:root:649 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.\n",
 75 |       "INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.\n",
 76 |       "INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.\n"
 77 |      ]
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "tg.pp_adatas(Qdata, Rdata, genes=Rdata.var.index)\n",
 82 |     "\n",
 83 |     "assert Qdata.uns['training_genes'] == Rdata.uns['training_genes']"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 11,
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "name": "stderr",
 93 |      "output_type": "stream",
 94 |      "text": [
 95 |       "INFO:root:Allocate tensors for mapping.\n",
 96 |       "INFO:root:Begin training with 649 genes and rna_count_based density_prior in cells mode...\n",
 97 |       "INFO:root:Printing scores every 100 epochs.\n"
 98 |      ]
 99 |     },
100 |     {
101 |      "name": "stdout",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "Score: 0.260, KL reg: 0.043\n",
105 |       "Score: 0.822, KL reg: 0.002\n",
106 |       "Score: 0.835, KL reg: 0.002\n",
107 |       "Score: 0.838, KL reg: 0.002\n",
108 |       "Score: 0.840, KL reg: 0.002\n",
109 |       "Score: 0.841, KL reg: 0.002\n",
110 |       "Score: 0.841, KL reg: 0.002\n",
111 |       "Score: 0.842, KL reg: 0.002\n",
112 |       "Score: 0.842, KL reg: 0.002\n",
113 |       "Score: 0.842, KL reg: 0.002\n"
114 |      ]
115 |     },
116 |     {
117 |      "name": "stderr",
118 |      "output_type": "stream",
119 |      "text": [
120 |       "INFO:root:Saving results..\n"
121 |      ]
122 |     }
123 |    ],
124 |    "source": [
125 |     "ad_map = tg.map_cells_to_space(\n",
126 |     "    adata_sc=Qdata,\n",
127 |     "    adata_sp=Rdata,\n",
128 |     "    device='cpu',\n",
129 |     "    # device='cuda:0',\n",
130 |     ")"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 12,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "Rdata_location_pred = ad_map.X.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
140 |     "Rdata_location_pred[:, 0] = -Rdata_location_pred[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n",
141 |     "Rdata_location_pred = pd.DataFrame(Rdata_location_pred)\n",
142 |     "Rdata_location_pred.index = Qdata.obs.index\n",
143 |     "Rdata_location_pred.columns = ['x', 'y']\n",
144 |     "\n",
145 |     "Rdata_location_pred.to_csv(\"output/brain/tangram_brain_left.csv\")"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": []
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 20,
158 |    "metadata": {},
159 |    "outputs": [
160 |     {
161 |      "name": "stderr",
162 |      "output_type": "stream",
163 |      "text": [
164 |       "INFO:root:649 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.\n",
165 |       "INFO:root:649 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.\n",
166 |       "INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.\n",
167 |       "INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.\n"
168 |      ]
169 |     }
170 |    ],
171 |    "source": [
172 |     "## left brain as training, right brain as testing\n",
173 |     "Rdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
174 |     "Qdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
175 |     "\n",
176 |     "tg.pp_adatas(Qdata, Rdata, genes=Rdata.var.index)\n",
177 |     "assert Qdata.uns['training_genes'] == Rdata.uns['training_genes']"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 21,
183 |    "metadata": {},
184 |    "outputs": [
185 |     {
186 |      "name": "stderr",
187 |      "output_type": "stream",
188 |      "text": [
189 |       "INFO:root:Allocate tensors for mapping.\n",
190 |       "INFO:root:Begin training with 649 genes and rna_count_based density_prior in cells mode...\n",
191 |       "INFO:root:Printing scores every 100 epochs.\n"
192 |      ]
193 |     },
194 |     {
195 |      "name": "stdout",
196 |      "output_type": "stream",
197 |      "text": [
198 |       "Score: 0.260, KL reg: 0.046\n",
199 |       "Score: 0.820, KL reg: 0.002\n",
200 |       "Score: 0.833, KL reg: 0.002\n",
201 |       "Score: 0.837, KL reg: 0.002\n",
202 |       "Score: 0.838, KL reg: 0.002\n",
203 |       "Score: 0.839, KL reg: 0.002\n",
204 |       "Score: 0.840, KL reg: 0.002\n",
205 |       "Score: 0.840, KL reg: 0.002\n",
206 |       "Score: 0.841, KL reg: 0.002\n",
207 |       "Score: 0.841, KL reg: 0.002\n"
208 |      ]
209 |     },
210 |     {
211 |      "name": "stderr",
212 |      "output_type": "stream",
213 |      "text": [
214 |       "INFO:root:Saving results..\n"
215 |      ]
216 |     }
217 |    ],
218 |    "source": [
219 |     "ad_map = tg.map_cells_to_space(\n",
220 |     "    adata_sc=Qdata,\n",
221 |     "    adata_sp=Rdata,\n",
222 |     "    device='cpu',\n",
223 |     "    # device='cuda:0',\n",
224 |     ")"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 22,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "Rdata_location_pred = ad_map.X.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
234 |     "Rdata_location_pred[:, 0] = -Rdata_location_pred[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n",
235 |     "Rdata_location_pred = pd.DataFrame(Rdata_location_pred)\n",
236 |     "Rdata_location_pred.index = Qdata.obs.index\n",
237 |     "Rdata_location_pred.columns = ['x', 'y']\n",
238 |     "\n",
239 |     "Rdata_location_pred.to_csv(\"output/brain/tangram_brain_right.csv\")"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": []
248 |   }
249 |  ],
250 |  "metadata": {
251 |   "kernelspec": {
252 |    "display_name": "Python 3.8.15 64-bit",
253 |    "language": "python",
254 |    "name": "python3"
255 |   },
256 |   "language_info": {
257 |    "codemirror_mode": {
258 |     "name": "ipython",
259 |     "version": 3
260 |    },
261 |    "file_extension": ".py",
262 |    "mimetype": "text/x-python",
263 |    "name": "python",
264 |    "nbconvert_exporter": "python",
265 |    "pygments_lexer": "ipython3",
266 |    "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]"
267 |   },
268 |   "orig_nbformat": 4,
269 |   "vscode": {
270 |    "interpreter": {
271 |     "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608"
272 |    }
273 |   }
274 |  },
275 |  "nbformat": 4,
276 |  "nbformat_minor": 2
277 | }
278 | 


--------------------------------------------------------------------------------
/code_paper/4_Mouse_brain_MERFISH/novoSpaRc_brain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import novosparc as ns\n",
 19 |     "\n",
 20 |     "import os\n",
 21 |     "import numpy as np\n",
 22 |     "import pandas as pd\n",
 23 |     "import scanpy as sc\n",
 24 |     "import matplotlib.pyplot as plt\n",
 25 |     "import altair as alt\n",
 26 |     "from scipy.spatial.distance import cdist, squareform, pdist\n",
 27 |     "from scipy.stats import ks_2samp\n",
 28 |     "from scipy.stats import pearsonr\n",
 29 |     "\n",
 30 |     "import random\n",
 31 |     "random.seed(0)\n",
 32 |     "\n",
 33 |     "from skimage import io, color\n",
 34 |     "import torch\n",
 35 |     "from torch.nn import functional as F\n",
 36 |     "import json\n",
 37 |     "\n",
 38 |     "from sklearn.model_selection import train_test_split\n",
 39 |     "from anndata import AnnData, read_h5ad\n",
 40 |     "import matplotlib.pyplot as plt\n",
 41 |     "import seaborn as sns"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stderr",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "/var/folders/mc/kqfjr86j5gz9cdyb9w1kfhn40000gp/T/ipykernel_34616/3711195478.py:5: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
 54 |       "  data_merfish = AnnData(data_merfish_raw)\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "data_merfish_raw = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv\", index_col=0)   \n",
 60 |     "meta_data = pd.read_csv(\"data/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv\", index_col=0)\n",
 61 |     "\n",
 62 |     "data_merfish = AnnData(data_merfish_raw)\n",
 63 |     "\n",
 64 |     "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n",
 65 |     "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n",
 66 |     "data_merfish_raw = data_merfish.copy()\n",
 67 |     "\n",
 68 |     "sc.pp.filter_cells(data_merfish, min_counts=500)\n",
 69 |     "sc.pp.filter_cells(data_merfish, min_genes=100)\n",
 70 |     "\n",
 71 |     "sc.pp.neighbors(data_merfish, n_neighbors=15, use_rep='X', random_state=1)\n",
 72 |     "sc.tl.louvain(data_merfish, 0.4, random_state=1)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "Qdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
 82 |     "Rdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 5,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "## right brain as training, left brain as testing\n",
 92 |     "datatrain = Rdata.copy()\n",
 93 |     "datatest = Qdata.copy()\n",
 94 |     "\n",
 95 |     "random.seed(2021)\n",
 96 |     "torch.manual_seed(2021)\n",
 97 |     "np.random.seed(2021)\n",
 98 |     "## Running novosparc\n",
 99 |     "locations_apriori = datatrain.obs[['x_cord', 'y_cord']].values\n",
100 |     "tissue = ns.cm.Tissue(dataset=datatest, locations=locations_apriori)\n",
101 |     "num_neighbors_s = num_neighbors_t = 5\n",
102 |     "\n",
103 |     "# params for linear cost\n",
104 |     "atlas_genes = datatrain.var\n",
105 |     "markers = list(atlas_genes.index)\n",
106 |     "num_genes = len(markers)\n",
107 |     "atlas_matrix = datatrain.to_df().values\n",
108 |     "markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=markers)\n",
109 |     "markers_to_use = np.concatenate(markers_idx.loc[markers].values)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 6,
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "name": "stdout",
119 |      "output_type": "stream",
120 |      "text": [
121 |       "Setting up for reconstruction ... done ( 384.1 seconds )\n",
122 |       "Reconstructing spatial information with 649 markers: 21578 cells and 21578 locations ... \n",
123 |       "Trying with epsilon: 5.00e-03\n"
124 |      ]
125 |     },
126 |     {
127 |      "name": "stderr",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "/usr/local/lib/python3.10/site-packages/ot/bregman.py:517: UserWarning: Sinkhorn did not converge. You might want to increase the number of iterations `numItermax` or the regularization parameter `reg`.\n",
131 |       "  warnings.warn(\"Sinkhorn did not converge. You might want to \"\n"
132 |      ]
133 |     }
134 |    ],
135 |    "source": [
136 |     "tissue.setup_reconstruction(atlas_matrix=atlas_matrix, \n",
137 |     "                        markers_to_use=markers_to_use, \n",
138 |     "                        num_neighbors_s=num_neighbors_s, \n",
139 |     "                        num_neighbors_t=num_neighbors_t)\n",
140 |     "        \n",
141 |     "tissue.reconstruct(alpha_linear=0.8, epsilon=5e-3)\n",
142 |     "\n",
143 |     "location_pred = tissue.gw"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 7,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "location_sum = np.sum(location_pred, axis=1)\n",
153 |     "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n",
154 |     "\n",
155 |     "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
156 |     "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n",
157 |     "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n",
158 |     "pred_cord_transform.index = Qdata.obs.index\n",
159 |     "pred_cord_transform.columns = ['x', 'y']\n",
160 |     "\n",
161 |     "pred_cord_transform.to_csv(\"output/brain/novosparc_brain_left.csv\")"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": []
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "## left brain as training, right brain as testing (re-split: Rdata/Qdata still hold the right/left halves from the first run)\n",
178 |     "Rdata = data_merfish[data_merfish.obs['x_cord'] <= np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
179 |     "Qdata = data_merfish[data_merfish.obs['x_cord'] > np.quantile(data_merfish.obs['x_cord'], 0.5)]\n",
180 |     "datatrain, datatest = Rdata.copy(), Qdata.copy()\n",
181 |     "random.seed(2021)\n",
182 |     "torch.manual_seed(2021)\n",
183 |     "np.random.seed(2021)\n",
184 |     "## Running novosparc\n",
185 |     "locations_apriori = datatrain.obs[['x_cord', 'y_cord']].values\n",
186 |     "tissue = ns.cm.Tissue(dataset=datatest, locations=locations_apriori)\n",
187 |     "num_neighbors_s = num_neighbors_t = 5\n",
188 |     "\n",
189 |     "# params for linear cost\n",
190 |     "atlas_genes = datatrain.var\n",
191 |     "markers = list(atlas_genes.index)\n",
192 |     "num_genes = len(markers)\n",
193 |     "atlas_matrix = datatrain.to_df().values\n",
194 |     "markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=markers)\n",
195 |     "markers_to_use = np.concatenate(markers_idx.loc[markers].values)"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "tissue.setup_reconstruction(atlas_matrix=atlas_matrix, \n",
205 |     "                        markers_to_use=markers_to_use, \n",
206 |     "                        num_neighbors_s=num_neighbors_s, \n",
207 |     "                        num_neighbors_t=num_neighbors_t)\n",
208 |     "        \n",
209 |     "tissue.reconstruct(alpha_linear=0.8, epsilon=5e-3)\n",
210 |     "\n",
211 |     "location_pred = tissue.gw"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "location_sum = np.sum(location_pred, axis=1)\n",
221 |     "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n",
222 |     "\n",
223 |     "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
224 |     "pred_cord_transform[:, 0] = -pred_cord_transform[:, 0] + 2*np.quantile(data_merfish.obs['x_cord'], 0.5)\n",
225 |     "\n",
226 |     "pred_cord_transform = pd.DataFrame(pred_cord_transform)\n",
227 |     "pred_cord_transform.index = Qdata.obs.index\n",
228 |     "pred_cord_transform.columns = ['x', 'y']\n",
229 |     "\n",
230 |     "pred_cord_transform.to_csv(\"output/brain/novosparc_brain_right.csv\")"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": []
239 |   }
240 |  ],
241 |  "metadata": {
242 |   "kernelspec": {
243 |    "display_name": "Python 3",
244 |    "language": "python",
245 |    "name": "python3"
246 |   },
247 |   "language_info": {
248 |    "codemirror_mode": {
249 |     "name": "ipython",
250 |     "version": 3
251 |    },
252 |    "file_extension": ".py",
253 |    "mimetype": "text/x-python",
254 |    "name": "python",
255 |    "nbconvert_exporter": "python",
256 |    "pygments_lexer": "ipython3",
257 |    "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]"
258 |   },
259 |   "orig_nbformat": 4,
260 |   "vscode": {
261 |    "interpreter": {
262 |     "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
263 |    }
264 |   }
265 |  },
266 |  "nbformat": 4,
267 |  "nbformat_minor": 2
268 | }
269 | 


--------------------------------------------------------------------------------
/code_paper/5_liver_MERFISH/CeLEry_liver.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import CeLEry as cel\n",
 19 |     "\n",
 20 |     "import os,csv,re\n",
 21 |     "import pandas as pd\n",
 22 |     "import numpy as np\n",
 23 |     "import scanpy as sc\n",
 24 |     "import math\n",
 25 |     "from skimage import io, color\n",
 26 |     "\n",
 27 |     "from scipy.sparse import issparse\n",
 28 |     "import random, torch\n",
 29 |     "import warnings\n",
 30 |     "warnings.filterwarnings(\"ignore\")\n",
 31 |     "import pickle\n",
 32 |     "from sklearn.model_selection import train_test_split\n",
 33 |     "from anndata import AnnData, read_h5ad\n",
 34 |     "\n",
 35 |     "import json"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "data_merfish_raw = pd.read_csv(\"data/HumanLiverCancerPatient2_cell_by_gene.csv\", index_col=0)   \n",
 45 |     "meta_data = pd.read_csv(\"data/HumanLiverCancerPatient2_cell_metadata.csv\", index_col=0)\n",
 46 |     "meta_data = meta_data.loc[data_merfish_raw.index]  # align metadata rows to the expression rows by cell id instead of relying on sort order\n",
 47 |     "\n",
 48 |     "data_merfish = AnnData(data_merfish_raw)\n",
 49 |     "data_merfish.obs['x_cord'] = meta_data['center_x'].tolist()\n",
 50 |     "data_merfish.obs['y_cord'] = meta_data['center_y'].tolist()\n",
 51 |     "data_merfish_raw = data_merfish.copy()\n",
 52 |     "\n",
 53 |     "def findBlank(name):\n",
 54 |     "    return \"Blank\" in name\n",
 55 |     "\n",
 56 |     "blank_lst = np.array(list(map(findBlank, data_merfish.var.index)))\n",
 57 |     "data_merfish = data_merfish[:, blank_lst == False]\n",
 58 |     "\n",
 59 |     "sc.pp.filter_cells(data_merfish, min_genes=100)\n",
 60 |     "sc.pp.filter_cells(data_merfish, min_counts=500)\n",
 61 |     "\n",
 62 |     "sc.pp.neighbors(data_merfish, n_neighbors = 15, use_rep=\"X\")\n",
 63 |     "sc.tl.louvain(data_merfish, 0.3, random_state=1)\n",
 64 |     "\n",
 65 |     "data_merfish.write_h5ad(\"data/liver_merfish.h5ad\")"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 2,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n",
 84 |     "\n",
 85 |     "Rdata = data_merfish[np.sort(Rdata_ind), :]\n",
 86 |     "Qdata = data_merfish[np.sort(Qdata_ind), :]\n",
 87 |     "print((Rdata.shape, Qdata.shape))"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "cel.get_zscore(Qdata)\n",
 97 |     "cel.get_zscore(Rdata)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 10,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]\n",
107 |     "model_train = cel.Fit_cord (data_train = Rdata, hidden_dims = [50, 20, 10], num_epochs_max = 3000, number_error_try=50, batch_size = 128, path = \"output/liver\", filename = \"liver_merfish\")"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "pred_cord = cel.Predict_cord (data_test = Qdata, path = \"output/liver\", filename = \"liver_merfish\")\n",
117 |     "## de-normalize with the training (Rdata) coordinate range rather than the query cells' true coordinates\n",
118 |     "data_train = Rdata.copy()\n",
119 |     "traindata = (data_train.X.A if issparse(data_train.X) else data_train.X)\n",
120 |     "tdatax = np.expand_dims(traindata, axis = 0)\n",
121 |     "tdata_rs = np.swapaxes(tdatax, 1, 2)\n",
122 |     "test_cord = cel.wrap_gene_location(tdata_rs, data_train.obs[['x_cord', 'y_cord']])\n",
123 |     "\n",
124 |     "pred_cord_transformx = pred_cord[:,0]*(test_cord.xmax-test_cord.xmin) + test_cord.xmin\n",
125 |     "pred_cord_transformy = pred_cord[:,1]*(test_cord.ymax-test_cord.ymin) + test_cord.ymin\n",
126 |     "pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T\n",
127 |     "\n",
128 |     "np.save(\"output/liver/celery_liver.npy\", pred_cord_transform)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": []
137 |   }
138 |  ],
139 |  "metadata": {
140 |   "kernelspec": {
141 |    "display_name": "Python 3",
142 |    "language": "python",
143 |    "name": "python3"
144 |   },
145 |   "language_info": {
146 |    "codemirror_mode": {
147 |     "name": "ipython",
148 |     "version": 3
149 |    },
150 |    "file_extension": ".py",
151 |    "mimetype": "text/x-python",
152 |    "name": "python",
153 |    "nbconvert_exporter": "python",
154 |    "pygments_lexer": "ipython3",
155 |    "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]"
156 |   },
157 |   "orig_nbformat": 4,
158 |   "vscode": {
159 |    "interpreter": {
160 |     "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608"
161 |    }
162 |   }
163 |  },
164 |  "nbformat": 4,
165 |  "nbformat_minor": 2
166 | }
167 | 


--------------------------------------------------------------------------------
/code_paper/5_liver_MERFISH/README.md:
--------------------------------------------------------------------------------
1 | # Note for Liver MERFISH data
2 | 
3 | The dataset is available at https://app.box.com/s/6nz5vlp0hjmuq9xruxog96p2woyt5fpm


--------------------------------------------------------------------------------
/code_paper/5_liver_MERFISH/SpaOTsc_liver.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "import numpy as np\n",
 20 |     "import pandas as pd\n",
 21 |     "import scanpy as sc\n",
 22 |     "import matplotlib.pyplot as plt\n",
 23 |     "from scipy.spatial.distance import cdist, squareform, pdist\n",
 24 |     "from scipy.stats import ks_2samp\n",
 25 |     "from scipy.stats import pearsonr\n",
 26 |     "\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "import os,csv,re\n",
 30 |     "import math\n",
 31 |     "from skimage import io, color\n",
 32 |     "\n",
 33 |     "from scipy.sparse import issparse\n",
 34 |     "import random, torch\n",
 35 |     "import warnings\n",
 36 |     "warnings.filterwarnings(\"ignore\")\n",
 37 |     "import pickle\n",
 38 |     "from sklearn.model_selection import train_test_split\n",
 39 |     "from anndata import AnnData, read_h5ad\n",
 40 |     "import seaborn as sns\n",
 41 |     "\n",
 42 |     "import json\n",
 43 |     "\n",
 44 |     "from spaotsc import SpaOTsc"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 2,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 3,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "name": "stdout",
 63 |      "output_type": "stream",
 64 |      "text": [
 65 |       "((78222, 500), (78223, 500))\n"
 66 |      ]
 67 |     }
 68 |    ],
 69 |    "source": [
 70 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n",
 71 |     "\n",
 72 |     "Rdata = data_merfish[np.sort(Rdata_ind), :]\n",
 73 |     "Qdata = data_merfish[np.sort(Qdata_ind), :]\n",
 74 |     "\n",
 75 |     "## Cannot run on the entire dataset, downsampling is needed: keep only the stratified 25% 'test' part (Qdata_ind) of each half\n",
 76 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Rdata.shape[0]), Rdata.obs['louvain'], test_size=0.25,random_state=1,stratify=Rdata.obs['louvain'])\n",
 77 |     "Rdata = Rdata[np.sort(Qdata_ind), :]\n",
 78 |     "\n",
 79 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Qdata.shape[0]), Qdata.obs['louvain'], test_size=0.25,random_state=1,stratify=Qdata.obs['louvain'])\n",
 80 |     "Qdata = Qdata[np.sort(Qdata_ind), :]\n",
 81 |     "\n",
 82 |     "print((Rdata.shape, Qdata.shape))"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 5,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "datatest = Qdata.copy()\n",
 92 |     "datatrain = Rdata.copy()\n",
 93 |     "random.seed(2021)\n",
 94 |     "torch.manual_seed(2021)\n",
 95 |     "np.random.seed(2021)\n",
 96 |     "## Running spaOTsc\n",
 97 |     "df_sc = pd.DataFrame(datatest.X)\n",
 98 |     "train_xy = np.array(datatrain.obs[['x_cord', 'y_cord']])  # select the coordinates by name, not by column position\n",
 99 |     "is_dmat = cdist(train_xy, train_xy, 'euclidean')\n",
100 |     "sc_dmat = cdist(datatest.X, datatest.X, 'euclidean')"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 6,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "spsc = SpaOTsc.spatial_sc(sc_data=df_sc, is_dmat=is_dmat, sc_dmat=sc_dmat)\n",
110 |     "cost_matrix = cdist(datatest.X, datatrain.X, 'euclidean')\n",
111 |     "        \n",
112 |     "location_pred = spsc.transport_plan(cost_matrix)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 9,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "location_sum = np.sum(location_pred, axis=1)\n",
122 |     "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n",
123 |     "\n",
124 |     "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
125 |     "np.save(\"output/liver/spaotsc_liver.npy\", pred_cord_transform)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": []
134 |   }
135 |  ],
136 |  "metadata": {
137 |   "kernelspec": {
138 |    "display_name": "Python 3",
139 |    "language": "python",
140 |    "name": "python3"
141 |   },
142 |   "language_info": {
143 |    "codemirror_mode": {
144 |     "name": "ipython",
145 |     "version": 3
146 |    },
147 |    "file_extension": ".py",
148 |    "mimetype": "text/x-python",
149 |    "name": "python",
150 |    "nbconvert_exporter": "python",
151 |    "pygments_lexer": "ipython3",
152 |    "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]"
153 |   },
154 |   "orig_nbformat": 4,
155 |   "vscode": {
156 |    "interpreter": {
157 |     "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
158 |    }
159 |   }
160 |  },
161 |  "nbformat": 4,
162 |  "nbformat_minor": 2
163 | }
164 | 


--------------------------------------------------------------------------------
/code_paper/5_liver_MERFISH/Tangram_liver.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 13 |       "  from .autonotebook import tqdm as notebook_tqdm\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import os, sys\n",
 19 |     "import numpy as np\n",
 20 |     "import pandas as pd\n",
 21 |     "import matplotlib.pyplot as plt\n",
 22 |     "import seaborn as sns\n",
 23 |     "import scanpy as sc\n",
 24 |     "import torch\n",
 25 |     "import tangram as tg\n",
 26 |     "from sklearn.model_selection import train_test_split\n",
 27 |     "from anndata import AnnData, read_h5ad"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 2,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n",
 46 |     "\n",
 47 |     "Rdata = data_merfish[np.sort(Rdata_ind), :]\n",
 48 |     "Qdata = data_merfish[np.sort(Qdata_ind), :]\n",
 49 |     "\n",
 50 |     "## Cannot run on the entire dataset, downsampling is needed: keep only the stratified 50% 'test' part (Qdata_ind) of each half\n",
 51 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Rdata.shape[0]), Rdata.obs['louvain'], test_size=0.5,random_state=1,stratify=Rdata.obs['louvain'])\n",
 52 |     "Rdata = Rdata[np.sort(Qdata_ind), :]\n",
 53 |     "\n",
 54 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Qdata.shape[0]), Qdata.obs['louvain'], test_size=0.5,random_state=1,stratify=Qdata.obs['louvain'])\n",
 55 |     "Qdata = Qdata[np.sort(Qdata_ind), :]\n",
 56 |     "\n",
 57 |     "print((Rdata.shape, Qdata.shape))"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 5,
 63 |    "metadata": {},
 64 |    "outputs": [
 65 |     {
 66 |      "name": "stderr",
 67 |      "output_type": "stream",
 68 |      "text": [
 69 |       "INFO:root:500 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.\n",
 70 |       "INFO:root:500 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.\n",
 71 |       "INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.\n",
 72 |       "INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.\n"
 73 |      ]
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "tg.pp_adatas(Qdata, Rdata, genes=Rdata.var.index)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 6,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stderr",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "INFO:root:Allocate tensors for mapping.\n",
 90 |       "INFO:root:Begin training with 500 genes and rna_count_based density_prior in cells mode...\n",
 91 |       "INFO:root:Printing scores every 100 epochs.\n"
 92 |      ]
 93 |     },
 94 |     {
 95 |      "name": "stdout",
 96 |      "output_type": "stream",
 97 |      "text": [
 98 |       "Score: 0.386, KL reg: 0.056\n",
 99 |       "Score: 0.716, KL reg: 0.003\n",
100 |       "Score: 0.750, KL reg: 0.002\n",
101 |       "Score: 0.756, KL reg: 0.002\n",
102 |       "Score: 0.759, KL reg: 0.001\n",
103 |       "Score: 0.760, KL reg: 0.001\n",
104 |       "Score: 0.761, KL reg: 0.001\n",
105 |       "Score: 0.762, KL reg: 0.001\n",
106 |       "Score: 0.763, KL reg: 0.001\n",
107 |       "Score: 0.763, KL reg: 0.001\n"
108 |      ]
109 |     },
110 |     {
111 |      "name": "stderr",
112 |      "output_type": "stream",
113 |      "text": [
114 |       "INFO:root:Saving results..\n"
115 |      ]
116 |     }
117 |    ],
118 |    "source": [
119 |     "ad_map = tg.map_cells_to_space(\n",
120 |     "    adata_sc=Qdata,\n",
121 |     "    adata_sp=Rdata,\n",
122 |     "    device='cpu',\n",
123 |     "    # device='cuda:0',\n",
124 |     ")"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 8,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "Rdata_location_pred = ad_map.X.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
134 |     "np.save(\"output/liver/tangram_liver.npy\", Rdata_location_pred)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": []
143 |   }
144 |  ],
145 |  "metadata": {
146 |   "kernelspec": {
147 |    "display_name": "Python 3",
148 |    "language": "python",
149 |    "name": "python3"
150 |   },
151 |   "language_info": {
152 |    "codemirror_mode": {
153 |     "name": "ipython",
154 |     "version": 3
155 |    },
156 |    "file_extension": ".py",
157 |    "mimetype": "text/x-python",
158 |    "name": "python",
159 |    "nbconvert_exporter": "python",
160 |    "pygments_lexer": "ipython3",
161 |    "version": "3.8.15 (default, Oct 11 2022, 21:52:37) \n[Clang 14.0.0 (clang-1400.0.29.102)]"
162 |   },
163 |   "orig_nbformat": 4,
164 |   "vscode": {
165 |    "interpreter": {
166 |     "hash": "0adcc2737ebf6a4a119f135174df96668767fca1ef1112612db5ecadf2b6d608"
167 |    }
168 |   }
169 |  },
170 |  "nbformat": 4,
171 |  "nbformat_minor": 2
172 | }
173 | 


--------------------------------------------------------------------------------
/code_paper/5_liver_MERFISH/novoSpaRc_liver.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import novosparc as ns\n",
 10 |     "\n",
 11 |     "import os\n",
 12 |     "import numpy as np\n",
 13 |     "import pandas as pd\n",
 14 |     "import scanpy as sc\n",
 15 |     "import matplotlib.pyplot as plt\n",
 16 |     "import altair as alt\n",
 17 |     "from scipy.spatial.distance import cdist, squareform, pdist\n",
 18 |     "from scipy.stats import ks_2samp\n",
 19 |     "from scipy.stats import pearsonr\n",
 20 |     "\n",
 21 |     "import random\n",
 22 |     "random.seed(0)\n",
 23 |     "\n",
 24 |     "from skimage import io, color\n",
 25 |     "import torch\n",
 26 |     "from torch.nn import functional as F\n",
 27 |     "import json\n",
 28 |     "\n",
 29 |     "from sklearn.model_selection import train_test_split\n",
 30 |     "from anndata import AnnData, read_h5ad"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": null,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "data_merfish = read_h5ad(\"data/liver_merfish.h5ad\")"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(data_merfish.shape[0]), data_merfish.obs['louvain'], test_size=0.5,random_state=1,stratify=data_merfish.obs['louvain'])\n",
 49 |     "\n",
 50 |     "Rdata = data_merfish[np.sort(Rdata_ind), :]\n",
 51 |     "Qdata = data_merfish[np.sort(Qdata_ind), :]\n",
 52 |     "\n",
 53 |     "## Cannot run on the entire dataset, downsampling is needed\n",
 54 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Rdata.shape[0]), Rdata.obs['louvain'], test_size=0.3,random_state=1,stratify=Rdata.obs['louvain'])\n",
 55 |     "Rdata = Rdata[np.sort(Qdata_ind), :]\n",
 56 |     "\n",
 57 |     "Rdata_ind, Qdata_ind, _, _ =train_test_split(range(Qdata.shape[0]), Qdata.obs['louvain'], test_size=0.3,random_state=1,stratify=Qdata.obs['louvain'])\n",
 58 |     "Qdata = Qdata[np.sort(Qdata_ind), :]\n",
 59 |     "\n",
 60 |     "print((Rdata.shape, Qdata.shape))"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "datatrain = Rdata.copy()\n",
 70 |     "datatest = Qdata.copy()\n",
 71 |     "\n",
 72 |     "random.seed(2021)\n",
 73 |     "torch.manual_seed(2021)\n",
 74 |     "np.random.seed(2021)\n",
 75 |     "## Running novosparc\n",
 76 |     "locations_apriori = datatrain.obs[['x_cord', 'y_cord']].values\n",
 77 |     "tissue = ns.cm.Tissue(dataset=datatest, locations=locations_apriori)\n",
 78 |     "num_neighbors_s = num_neighbors_t = 5\n",
 79 |     "\n",
 80 |     "# params for linear cost\n",
 81 |     "atlas_genes = datatrain.var\n",
 82 |     "markers = list(atlas_genes.index)\n",
 83 |     "num_genes = len(markers)\n",
 84 |     "atlas_matrix = datatrain.to_df().values\n",
 85 |     "markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=markers)\n",
 86 |     "markers_to_use = np.concatenate(markers_idx.loc[markers].values)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "tissue.setup_reconstruction(atlas_matrix=atlas_matrix, \n",
 96 |     "                        markers_to_use=markers_to_use, \n",
 97 |     "                        num_neighbors_s=num_neighbors_s, \n",
 98 |     "                        num_neighbors_t=num_neighbors_t)\n",
 99 |     "        \n",
100 |     "tissue.reconstruct(alpha_linear=0.8, epsilon=5e-3)\n",
101 |     "\n",
102 |     "location_pred = tissue.gw"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "location_sum = np.sum(location_pred, axis=1)\n",
112 |     "location_pred_copy = location_pred / location_sum.reshape(len(location_sum), 1)\n",
113 |     "\n",
114 |     "pred_cord_transform = location_pred_copy.dot(np.array(Rdata.obs[['x_cord', 'y_cord']]))\n",
115 |     "np.save(\"output/liver/novosparc_liver.npy\", pred_cord_transform)"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": []
124 |   }
125 |  ],
126 |  "metadata": {
127 |   "kernelspec": {
128 |    "display_name": "Python 3",
129 |    "language": "python",
130 |    "name": "python3"
131 |   },
132 |   "language_info": {
133 |    "name": "python",
134 |    "version": "3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]"
135 |   },
136 |   "orig_nbformat": 4,
137 |   "vscode": {
138 |    "interpreter": {
139 |     "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
140 |    }
141 |   }
142 |  },
143 |  "nbformat": 4,
144 |  "nbformat_minor": 2
145 | }
146 | 


--------------------------------------------------------------------------------
/code_paper/8_mouse_single_cell_prediction/Mouse_sc_analysis.py:
--------------------------------------------------------------------------------
  1 | import os,csv,re
  2 | import pandas as pd
  3 | import numpy as np
  4 | import scanpy as sc
  5 | import math
  6 | from skimage import io, color
  7 | 
  8 | from scipy.sparse import issparse
  9 | import random, torch
 10 | import warnings
 11 | warnings.filterwarnings("ignore")
 12 | import matplotlib.colors as clr
 13 | import matplotlib.pyplot as plt
 14 | import pickle
 15 | 
 16 | #Read original data and save it to h5ad
 17 | from scanpy import read_10x_h5
 18 | # import SpaGCN as spg
 19 | 
 20 | import CeLEry as cel
 21 | from data.MouseBrain.MP1_SVG import d_g
 22 | import json
 23 | import cv2 as cv
 24 | 
 25 | 
 26 | ### ------------------------------------------------------------------------------------------------------- ###
 27 | ###        Preprocessing for MouseSC Data
 28 | ### ------------------------------------------------------------------------------------------------------- ###
 29 | 
 30 | MouseSC = sc.read("../data/MouseBrain/MouseSC_scRNA.h5ad")
 31 | 
 32 | dataSection1full = sc.read("../data/MouseBrain/MP1_sudo.h5ad")
 33 | genename = dataSection1full.var['genename']
 34 | 
 35 | 
 36 | # Get the gene list from the pre-screening
 37 | genelistlist = [d_g[i] for i in  range(len(d_g))]  # transform dictionary to a list of lists
 38 | genelist = sum(genelistlist, [])  # merge the list of lists
 39 | genelistuni = list( dict.fromkeys(genelist) )   # remove duplicates
 40 | 
 41 | genelistindex = [genename[genename == i].index[0] for i in genelistuni if  len(genename[genename == i])>0]
 42 | 
 43 | #Read in hitology image
 44 | ImageSec1=io.imread("../data/MouseBrain/V1_Mouse_Brain_Sagittal_Posterior_image.tif")
 45 | ImageSec1sub = ImageSec1[3000:7000,6200:10500,:]
 46 | # cel.printimage (ImageSec1sub, "../output/CeLEry/imageselect")
 47 | 
 48 | imgray = cv.cvtColor(ImageSec1sub, cv.COLOR_BGR2GRAY)
 49 | imgray2 = imgray.copy()
 50 | imgray2[imgray2<160] = 0
 51 | imgray2[imgray2>160] = 255
 52 | 
 53 | ## Take the subset of dataSection1
 54 | xcords = dataSection1full.obs["x"].to_numpy()
 55 | ycords = dataSection1full.obs["y"].to_numpy()
 56 | 
 57 | Section1Sub = dataSection1full[(xcords>=3000) & (xcords<7000) & (ycords>=6200) & (ycords<10500), genelistindex]
 58 | Section1Sub.obs = Section1Sub.obs/50
 59 | Section1Sub.obs = Section1Sub.obs.astype(int)
 60 | Section1Sub.obs["inner"] = 0
 61 | 
 62 | ## Quality Control
 63 | 
 64 | for i in range(Section1Sub.obs.shape[0]):
 65 |     xi = Section1Sub.obs["x"][i]
 66 |     yi = Section1Sub.obs["y"][i]
 67 |     subarea = np.mean(imgray2[(xi*50-3000):(xi*50+50-3000), (yi*50-6200):(yi*50+50-6200)])
 68 |     if subarea<140 or xi*50>6000:
 69 |          Section1Sub.obs["inner"].iloc[i] = 1
 70 |     if yi*50>10200 or xi*50<1000:
 71 |          Section1Sub.obs["inner"].iloc[i] = 0
 72 | 
 73 | Section1Sub = Section1Sub[Section1Sub.obs["inner"] == 1]
 74 | 
 75 | ## Calculating z-score
 76 | cel.get_zscore(Section1Sub)
 77 | cel.get_zscore(MouseSC)
 78 | 
 79 | ### ------------------------------------------------------------------------------------------------------- ###
 80 | ###        Perform CeLEry analysis
 81 | ### ------------------------------------------------------------------------------------------------------- ###
 82 | 
 83 | def seed_worker(worker_id):
 84 |     worker_seed = torch.initial_seed() % 2**32
 85 |     np.random.seed(worker_seed)
 86 |     random.seed(worker_seed)
 87 | 
 88 | def FitPredModelNE (dataSection1):
 89 |     #
 90 |     random.seed(2021)
 91 |     torch.manual_seed(2021)
 92 |     np.random.seed(2021)
 93 |     g = torch.Generator()
 94 |     g.manual_seed(2021)
 95 |     #
 96 |     tdatax = np.expand_dims(dataSection1.X, axis = 0)
 97 |     tdata_rs = np.swapaxes(tdatax, 1, 2)
 98 |     DataTra = cel.wrap_gene_location(tdata_rs, dataSection1.obs)
 99 |     t_loader= torch.utils.data.DataLoader(DataTra, batch_size=4, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g)
100 |     # Create Deep Neural Network for Coordinate Regression
101 |     DNNmodel = cel.DNN( in_channels = DataTra[1][0].shape[0], hidden_dims = [30, 25, 15] ) # [100,50,25] )
102 |     DNNmodel = DNNmodel.float()
103 |     #
104 |     CoOrg=cel.SpaCluster()
105 |     CoOrg.train(model = DNNmodel, train_loader = t_loader, num_epochs= 500, RCcountMax = 15, learning_rate = 0.0001)
106 |     #
107 |     filename3 = "../output/CeLEry/Prediction/PreOrg_Mousesc.obj"
108 |     filehandler2 = open(filename3, 'wb') 
109 |     pickle.dump(DNNmodel, filehandler2)
110 | 
111 | FitPredModelNE (dataSection1 = Section1Sub)
112 | 
113 | 
114 | 
115 | ### ------------------------------------------------------------------------------------------------------- ###
116 | ###        Present Results
117 | ### ------------------------------------------------------------------------------------------------------- ###
118 | 
119 | 
def report_prop_method_sc (folder, name, dataSection2, Val_loader, outname = ""):
    """
        Load a pickled model, predict a coordinate pair for every batch of the loader and save them as CSV.
        :folder: string: folder that holds the pickled model "{name}.obj"; the CSV is written there as well
        :name: string: file stem of the pickled model
        :dataSection2: object whose ``.obs.shape[0]`` gives the number of rows of the prediction matrix
        :Val_loader: iterable of model inputs; the i-th item fills row i, so it is expected to yield one sample per batch
        :outname: string: stem of the output file "{outname}_predmatrix.csv", default is the same as the name
        Returns None.
    """
    if outname == "":
        outname = name
    filename2 = "{folder}/{name}.obj".format(folder = folder, name = name)
    # NOTE(security): pickle.load can execute arbitrary code; only load model files you trust.
    with open(filename2, 'rb') as filehandler:
        DNNmodel = pickle.load(filehandler)
    #
    coords_predict = np.zeros((dataSection2.obs.shape[0],2))
    for i, img in enumerate(Val_loader):
        recon = DNNmodel(img)
        # Only the first element of the model output is stored for this row.
        coords_predict[i,:] = recon[0].detach().numpy()
    # Honour `outname` for the output file (it was previously computed but ignored).
    np.savetxt("{folder}/{name}_predmatrix.csv".format(folder = folder, name = outname), coords_predict, delimiter=",")
141 | 
def EvaluateOrg (testdata):
    """
        Wrap the test data into a one-sample-per-batch DataLoader and report the saved model's predictions.
        :testdata: object exposing ``.X`` and an ``.obs`` table with "sex_id" and "region_id" columns
    """
    # Add a leading axis to X and swap the last two axes before wrapping.
    expression = np.swapaxes(np.expand_dims(testdata.X, axis = 0), 1, 2)
    labels = testdata.obs[["sex_id","region_id"]]
    wrapped = cel.wrap_gene_location(expression, labels)
    loader = torch.utils.data.DataLoader(wrapped, batch_size=1, num_workers = 4)
    # Predict with the model pickled under "PreOrg_Mousesc" and write the CSV next to it.
    report_prop_method_sc(
        folder = "../output/CeLEry/Prediction",
        name = "PreOrg_Mousesc",
        dataSection2 = testdata,
        Val_loader = loader,
    )
152 | 
153 | 
154 | EvaluateOrg(testdata = MouseSC)
155 | 


--------------------------------------------------------------------------------
/code_paper/8_mouse_single_cell_prediction/analysis-results.py:
--------------------------------------------------------------------------------
import os,csv,re
import pandas as pd
import numpy as np
import scanpy as sc
import math
from skimage import io, color

from scipy.sparse import issparse
import random, torch
import warnings
warnings.filterwarnings("ignore")
import matplotlib.colors as clr
import matplotlib.pyplot as plt
import pickle

#Read original data and save it to h5ad
from scanpy import read_10x_h5
# import SpaGCN as spg

import CeLEry as cel
from data.MouseBrain.MP1_SVG import d_g
import json
import cv2 as cv  # required: cv.cvtColor is called during preprocessing
 24 | 
 25 | 
 26 | ### ------------------------------------------------------------------------------------------------------- ###
 27 | ###        Preprocessing for MouseSC Data
 28 | ### ------------------------------------------------------------------------------------------------------- ###
 29 | 
 30 | MouseSC = sc.read("../data/Seurat/MouseSC_scRNA_SeuratMouseSC.h5ad")
 31 | 
 32 | dataSection1full = sc.read("../data/MouseBrain/MP1_sudo.h5ad")
 33 | genename = dataSection1full.var['genename']
 34 | 
 35 | 
 36 | # # Get the gene list from the pre-screening
 37 | # genelistlist = [d_g[i] for i in  range(len(d_g))]  # transform dictionary to a list of lists
 38 | # genelist = sum(genelistlist, [])  # merge the list of lists
 39 | # genelistuni = list( dict.fromkeys(genelist) )   # remove duplicates
 40 | 
 41 | # genelistindex = [genename[genename == i].index[0] for i in genelistuni if  len(genename[genename == i])>0]
 42 | 
 43 | #Read in hitology image
 44 | ImageSec1=io.imread("../data/MouseBrain/V1_Mouse_Brain_Sagittal_Posterior_image.tif")
 45 | ImageSec1sub = ImageSec1[3000:7000,6200:10500,:]
 46 | # cel.printimage (ImageSec1sub, "../output/CeLEry/imageselect")
 47 | 
 48 | imgray = cv.cvtColor(ImageSec1sub, cv.COLOR_BGR2GRAY)
 49 | imgray2 = imgray.copy()
 50 | imgray2[imgray2<160] = 0
 51 | imgray2[imgray2>160] = 255
 52 | 
 53 | ## Take the subset of dataSection1
 54 | xcords = dataSection1full.obs["x"].to_numpy()
 55 | ycords = dataSection1full.obs["y"].to_numpy()
 56 | 
 57 | Section1Sub = dataSection1full[(xcords>=3000) & (xcords<7000) & (ycords>=6200) & (ycords<10500), MouseSC.var_names]
 58 | Section1Sub.obs = Section1Sub.obs/50
 59 | Section1Sub.obs = Section1Sub.obs.astype(int)
 60 | Section1Sub.obs["inner"] = 0
 61 | 
 62 | ## Quality Control
 63 | 
 64 | for i in range(Section1Sub.obs.shape[0]):
 65 |     xi = Section1Sub.obs["x"][i]
 66 |     yi = Section1Sub.obs["y"][i]
 67 |     subarea = np.mean(imgray2[(xi*50-3000):(xi*50+50-3000), (yi*50-6200):(yi*50+50-6200)])
 68 |     if subarea<140 or xi*50>6000:
 69 |          Section1Sub.obs["inner"].iloc[i] = 1
 70 |     if yi*50>10200 or xi*50<1000:
 71 |          Section1Sub.obs["inner"].iloc[i] = 0
 72 | 
 73 | Section1Sub = Section1Sub[Section1Sub.obs["inner"] == 1, ]
 74 | 
 75 | ## Calculating z-score
 76 | cel.get_zscore(Section1Sub)
 77 | cel.get_zscore(MouseSC)
 78 | 
 79 | ### ------------------------------------------------------------------------------------------------------- ###
 80 | ###        Perform CeLEry analysis
 81 | ### ------------------------------------------------------------------------------------------------------- ###
 82 | 
 83 | def seed_worker(worker_id):
 84 |     worker_seed = torch.initial_seed() % 2**32
 85 |     np.random.seed(worker_seed)
 86 |     random.seed(worker_seed)
 87 | 
 88 | def FitPredModelNE (dataSection1):
 89 |     #
 90 |     random.seed(2021)
 91 |     torch.manual_seed(2021)
 92 |     np.random.seed(2021)
 93 |     g = torch.Generator()
 94 |     g.manual_seed(2021)
 95 |     #
 96 |     tdatax = np.expand_dims(dataSection1.X, axis = 0)
 97 |     tdata_rs = np.swapaxes(tdatax, 1, 2)
 98 |     DataTra = cel.wrap_gene_location(tdata_rs, dataSection1.obs)
 99 |     t_loader= torch.utils.data.DataLoader(DataTra, batch_size=4, num_workers = 4, shuffle = True, worker_init_fn=seed_worker, generator=g)
100 |     # Create Deep Neural Network for Coordinate Regression
101 |     DNNmodel = cel.DNN( in_channels = DataTra[1][0].shape[0], hidden_dims = [30, 25, 15] ) # [100,50,25] )
102 |     DNNmodel = DNNmodel.float()
103 |     #
104 |     CoOrg=cel.SpaCluster()
105 |     CoOrg.train(model = DNNmodel, train_loader = t_loader, num_epochs= 500, RCcountMax = 15, learning_rate = 0.0001)
106 |     #
107 |     filename3 = "../output/CeLEry/Mousesc/PreOrg_Mousesc.obj"
108 |     filehandler2 = open(filename3, 'wb') 
109 |     pickle.dump(DNNmodel, filehandler2)
110 | 
111 | FitPredModelNE (dataSection1 = Section1Sub)
112 | 
113 | 
114 | 
115 | ### ------------------------------------------------------------------------------------------------------- ###
116 | ###        Present Results
117 | ### ------------------------------------------------------------------------------------------------------- ###
118 | 
119 | 
def report_prop_method_sc (folder, name, dataSection2, Val_loader, outname = ""):
    """
        Load a pickled model, predict a coordinate pair for every batch of the loader and save them as CSV.
        :folder: string: folder that holds the pickled model "{name}.obj"; the CSV is written there as well
        :name: string: file stem of the pickled model
        :dataSection2: object whose ``.obs.shape[0]`` gives the number of rows of the prediction matrix
        :Val_loader: iterable of model inputs; the i-th item fills row i, so it is expected to yield one sample per batch
        :outname: string: stem of the output file "{outname}_predmatrix.csv", default is the same as the name
        Returns None.
    """
    if outname == "":
        outname = name
    filename2 = "{folder}/{name}.obj".format(folder = folder, name = name)
    # NOTE(security): pickle.load can execute arbitrary code; only load model files you trust.
    with open(filename2, 'rb') as filehandler:
        DNNmodel = pickle.load(filehandler)
    #
    coords_predict = np.zeros((dataSection2.obs.shape[0],2))
    #
    for i, img in enumerate(Val_loader):
        recon = DNNmodel(img)
        # Only the first element of the model output is stored for this row.
        coords_predict[i,:] = recon[0].detach().numpy()
    # Honour `outname` for the output file (it was previously computed but ignored).
    np.savetxt("{folder}/{name}_predmatrix.csv".format(folder = folder, name = outname), coords_predict, delimiter=",")
142 | 
def EvaluateOrg (testdata):
    """
        Wrap the test data into a one-sample-per-batch DataLoader and report the saved model's predictions.
        :testdata: object exposing ``.X`` and an ``.obs`` table with "sex_id" and "region_id" columns
    """
    # Add a leading axis to X and swap the last two axes before wrapping.
    expression = np.swapaxes(np.expand_dims(testdata.X, axis = 0), 1, 2)
    labels = testdata.obs[["sex_id","region_id"]]
    wrapped = cel.wrap_gene_location(expression, labels)
    loader = torch.utils.data.DataLoader(wrapped, batch_size=1, num_workers = 4)
    # Predict with the model pickled under "PreOrg_Mousesc" and write the CSV next to it.
    report_prop_method_sc(
        folder = "../output/CeLEry/Mousesc/",
        name = "PreOrg_Mousesc",
        dataSection2 = testdata,
        Val_loader = loader,
    )
153 | 
154 | 
155 | EvaluateOrg(testdata = MouseSC)
156 | 
157 | ### ------------------------------------------------------------------------------------------------------- ###
158 | ###        Perform Tangram analysis
159 | ### ------------------------------------------------------------------------------------------------------- ###
160 | # import tangram as tg
161 | 
162 | # tg.pp_adatas(MouseSC, Section1Sub, genes=None)
163 | # map = tg.map_cells_to_space(MouseSC, Section1Sub, device='cpu')
164 | # map.write_h5ad('../output/CeLEry/Mousesc/tangram.h5ad')
165 | 
# Bounding box of the reference spots, padded by one bin on every side.
S1_xmax = Section1Sub.obs['x'].max() + 1
S1_xmin = Section1Sub.obs['x'].min() - 1
S1_ymax = Section1Sub.obs['y'].max() + 1
S1_ymin = Section1Sub.obs['y'].min() - 1

# Mapping matrix written by the (commented-out) Tangram run above.
# Named `tangram_map` so the builtin `map` is not shadowed.
tangram_map = sc.read("../output/CeLEry/Mousesc/tangram.h5ad")


## Normalize the coordinates of both Sections
# NOTE(review): the first two obs columns are taken to be x and y - confirm against the obs layout.
spx = (Section1Sub.obs.iloc[:,0] - S1_xmin) / (S1_xmax - S1_xmin)
spy = (Section1Sub.obs.iloc[:,1] - S1_ymin) / (S1_ymax - S1_ymin)

# `bestindex` below is a column position of the mapping matrix, so the
# normalized coordinates are indexed positionally through NumPy arrays.
# Indexing the label-indexed Series with an integer relied on a deprecated fallback.
spx_values = spx.to_numpy()
spy_values = spy.to_numpy()

coords_predict_tangram = np.zeros((MouseSC.obs.shape[0],2))
for i in range(tangram_map.X.shape[0]):
    # Row i is assigned to the column with the largest mapping value.
    bestindex = np.argmax(tangram_map.X[i,:])
    # Kept as a float32 tensor so the saved values match the previous output precision.
    pred = torch.FloatTensor([spx_values[bestindex],spy_values[bestindex]])
    coords_predict_tangram[i,:] = pred


np.savetxt("{folder}/{name}_predmatrix.csv".format(folder = "../output/CeLEry/Mousesc/", name = "Tangram_Mousesc"), coords_predict_tangram, delimiter=",")
186 | 
187 | 


--------------------------------------------------------------------------------
/docs/asserts/images/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/docs/asserts/images/workflow.png


--------------------------------------------------------------------------------
/pretrainmodel/Biogen/Pretrained_model_075B.obj:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/pretrainmodel/Biogen/Pretrained_model_075B.obj


--------------------------------------------------------------------------------
/pretrainmodel/Biogen/Reference_genes_8_075B.obj:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/pretrainmodel/Biogen/Reference_genes_8_075B.obj


--------------------------------------------------------------------------------
/tutorial/BiogenPretrain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "<h1><center>Biogen Pretrained Tutorial - independent version</center></h1>\n",
  8 |     "\n",
  9 |     "\n",
 10 |     "<center>Author: Qihuang Zhang*, Jian Hu, Kejie Li, Baohong Zhang, David Dai, Edward B. Lee, Rui Xiao, Mingyao Li*"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "## Outline\n",
 18 |     "1. Preparation\n",
 19 |     "2. Load Data\n",
 20 |     "3. Prediction\n",
 21 |     "4. Visualization (in ``R``)"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "In this tutorial, we illustrate the usage of the CeLEry pretrained model trained on Biogen mouse brain data (Li and Zhang, 2022). This model takes the gene expression of 886 genes as input and produces a vector of prediction probabilities over eight regions segmented from the spatial transcriptomics data.\n",
 29 |     "\n",
 30 |     "This tutorial can be independent of the CeLEry package. It does not require installing the CeLEry package. \n",
 31 |     "\n",
 32 |     "## 1. Preparation\n",
 33 |     "\n",
 34 |     "To implement the model without installing the CeLEry package, several helper functions are needed. The ``pickle`` package is used to load the pretrained model. The function ``make_annData_query()`` transforms the raw input data into AnnData format and conducts data preprocessing, including normalizing the gene expression per cell and performing a ``log(1+p)`` transformation. The function ``get_zscore()`` normalizes the gene expression so that batch effects can be removed."
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import pickle\n",
 44 |     "from scanpy import read_10x_h5\n",
 45 |     "import CeLEry as cel\n",
 46 |     "\n",
 47 |     "import scanpy as sc\n",
 48 |     "import numpy as np\n",
 49 |     "import pandas as pd\n",
 50 |     "from scipy.sparse import issparse\n"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "## 2. Load Data\n",
 58 |     " \n",
 59 |     "Load scRNA-seq/snRNA-seq data. Example data can be downloaded from [Li and Zhang (2022)](https://doi.org/10.5281/zenodo.6640285)."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "\n",
 69 |     "QueryData_raw = read_10x_h5(\"data/Biogen/7G-1/filtered_feature_bc_matrix.h5\")\n",
 70 |     "QueryData = cel.make_annData_query (QueryData_raw)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "It is important to make sure the query scRNA-seq/snRNA-seq data contains all the genes in the trained model."
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "## Load gene list\n",
 87 |     "filename = \"pretrainmodel/Biogen/Reference_genes_8_075B.obj\"\n",
 88 |     "filehandler = open(filename, 'rb') \n",
 89 |     "genenames = pickle.load(filehandler)\n",
 90 |     "\n",
 91 |     "## Rearrange the data and filter the selected genes in the trained model.\n",
 92 |     "Qdata = QueryData[:,list(genenames)]\n",
 93 |     "cel.get_zscore(Qdata)\n"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "#### 3. Apply Pre-trained CeLEry model to the snRNA data\n",
101 |     "\n",
102 |     "The gene expression of the first cell (a 1X886 matrix) in the snRNA-seq data is given by:"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "Qdata[0].X.A"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "Load the CeLEry prediction model, which is located in ``\"pretrainmodel/Biogen\"`` and named ``Pretrained_model_075B``. We use the CeLEry function ``Predict_domain()`` to conduct domain prediction for each single cell in the scRNA-seq/snRNA-seq data. The arguments are explained as follows:\n",
119 |     "\n",
120 |     "* data_test: (AnnData object) the input scRNA-seq/snRNA-seq data \n",
121 |     "* class_num: (int) the number of classes to be predicted. This value should be consistent with the number of domains in the training model.\n",
122 |     "* path: (string) the location of the pre-trained model\n",
123 |     "* filename: (string) the file name of the saved pre-trained model\n",
124 |     "* predtype: (string) if predtype is \"probability\" (default) then a probability prediction matrix will be produced; if predtype is \"deterministic\", then the deterministic assignment based on the maximum predicted probability will be returned; if predtype is \"both\", then both predictions will be output. "
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "## 3. Prediction \n",
132 |     "\n",
133 |     "Prediction of the first cell"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "model_location = \"pretrainmodel/Biogen/Pretrained_model_075B.obj\"\n",
143 |     "\n",
144 |     "pred_cord = cel.Predict_domain(data_test = Qdata[0], class_num = 8, path = \"pretrainmodel/Biogen\", filename = \"Pretrained_model_075B\", predtype = \"probability\")\n"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "Predict region labels of the entire scRNA-seq data and report the proportion of the cells on different domains."
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "pred_cord_all = cel.Predict_domain(data_test = Qdata, class_num = 8, path = \"pretrainmodel/Biogen\", filename = \"Pretrained_model_075B\", predtype = \"deterministic\")\n",
161 |     "\n",
162 |     "prop_count = pd.DataFrame(pred_cord_all).value_counts().sort_index()\n",
163 |     "prop_weight = prop_count/sum(prop_count)\n",
164 |     "prop_weight\n",
165 |     "prop_weight.to_csv(\"output/Biogen/prop_8_075B_7G-1.csv\")\n"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "markdown",
170 |    "metadata": {},
171 |    "source": [
172 |     "## 4. Visualization\n",
173 |     "\n",
174 |     "For the following part, we use ``ggplot()`` in ``R`` to visualize the proportions predicted by CeLEry. We are going to use the regions segmented from the spatial transcriptomics data to illustrate what the distribution looks like.\n",
175 |     "\n",
176 |     "### 4.1 R packages"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": null,
182 |    "metadata": {
183 |     "vscode": {
184 |      "languageId": "r"
185 |     }
186 |    },
187 |    "outputs": [],
188 |    "source": [
189 |     "library(ggplot2)\n",
190 |     "library(png)\n",
191 |     "\n",
192 |     "outputdir <- \"output/Biogen/plots/\""
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "### 4.2 Plotting Functions\n",
200 |     "\n",
201 |     "The ``Density_plot()`` function takes two input paths and an output name. \n",
202 |     "\n",
203 |     "* ``obsdata_path`` specifies the path of the observation data from the spatial transcriptomics data that were used to train the model, which are saved from the \".obs\" of the annotated data object in Python.  This file contains the spot ID, the locations of the spots and the region information, and will be used as the background of the visualization.\n",
204 |     "\n",
205 |     "* ``prediction_path`` specifies the path of the file that stores the prediction results.\n",
206 |     "\n",
207 |     "* ``objectname`` specifies the name of the output figure."
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {
214 |     "vscode": {
215 |      "languageId": "r"
216 |     }
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "obsdata_path = \"output/Biogen/obsdata_8_075B.csv\"\n",
221 |     "prediction_path = \"output/Biogen/prop_8_075B_7G-1.csv\"\n",
222 |     "objectname = \"BiogenExample\"\n",
223 |     "\n",
224 |     "Density_plot <- function(obsdata_path, prediction_path, objectname){\n",
225 |     "  obsdata <- read.csv(obsdata_path, header = T)\n",
226 |     "  maxx <- max(obsdata$x_cord)\n",
227 |     "  obsdata$minus_xcord <- maxx - obsdata$x_cord\n",
228 |     "  pred_CeLEry <- read.csv(prediction_path, header = T)\n",
229 |     "  colnames(pred_CeLEry) = c(\"Domain\", \"Density\")\n",
230 |     "  dataplot <- merge(obsdata, pred_CeLEry, by.x = \"refined_pred\", by.y = \"Domain\")\n",
231 |     "  png(file = paste0(outputdir,\"Density_plot_\",objectname,\".png\"), height = 300, width = 450)\n",
232 |     "  DensityPlot2D  <- ggplot(dataplot, aes(x = x_cord, y = y_cord) )  + \n",
233 |     "    theme_bw()  + \n",
234 |     "    geom_point(aes(color = Density), size = 3) + #shape = 21, color = \"black\",, stroke = 0.3\n",
235 |     "    # scale_y_reverse() +\n",
236 |     "    scale_color_gradient(low = \"#7E7F9A\", high = \"#F3DE8A\") +\n",
237 |     "    theme(text=element_text(size=20, family=\"URWHelvetica\"), axis.text = element_blank(),\n",
238 |     "           axis.ticks=element_blank(),\n",
239 |     "           panel.spacing = unit(1, \"lines\")) +\n",
240 |     "    theme(strip.background =element_rect(fill=\"#3F4536\",color=\"#3F4536\"))+\n",
241 |     "    theme(strip.text = element_text(colour = 'white')) +\n",
242 |     "    theme(panel.border = element_rect(colour = \"#3F4536\"))  +\n",
243 |     "    labs(x = NULL, y = NULL, color = \"Proportion\")\n",
244 |     "  print(DensityPlot2D)\n",
245 |     "  dev.off()\n",
246 |     "}\n",
247 |     "\n",
248 |     "Density_plot(obsdata_path, prediction_path, objectname)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "The output figure displays the proportion of cells in the regions segmented in the training data."
256 |    ]
257 |   }
258 |  ],
259 |  "metadata": {
260 |   "kernelspec": {
261 |    "display_name": "Python 3",
262 |    "language": "python",
263 |    "name": "python3"
264 |   },
265 |   "language_info": {
266 |    "codemirror_mode": {
267 |     "name": "ipython",
268 |     "version": 3
269 |    },
270 |    "file_extension": ".py",
271 |    "mimetype": "text/x-python",
272 |    "name": "python",
273 |    "nbconvert_exporter": "python",
274 |    "pygments_lexer": "ipython3",
275 |    "version": "3.8.8"
276 |   }
277 |  },
278 |  "nbformat": 4,
279 |  "nbformat_minor": 2
280 | }
281 | 


--------------------------------------------------------------------------------
/tutorial/BiogenPretrain.md:
--------------------------------------------------------------------------------
  1 | <h1><center>Biogen Pre-trained Model Tutorial </center></h1>
  2 | 
  3 | Author: Qihuang Zhang*, Jian Hu, Kejie Li, Baohong Zhang, David Dai,
  4 | Edward B. Lee, Rui Xiao, Mingyao Li*
  5 | 
  6 | 
  7 | 
  8 | ## Outline
  9 | 
 10 | 1.  Preparation
 11 | 2.  Load Data
 12 | 3.  Prediction
 13 | 4.  Visualization (in `R`)
 14 | 
 15 | 
 16 | In this tutorial, we illustrate the usage of the CeLEry pre-trained model
 17 | trained on the Biogen mouse brain data (Li and Zhang, 2022). This model
 18 | takes the expression of 886 genes as input and produces a vector of
 19 | prediction probabilities over the eight regions segmented from the spatial
 20 | transcriptomics data.
 21 | 
 22 | 
 23 | The prediction model in this tutorial is pre-trained using the spatial transcriptomics data (ID 075B). The domains were segmented using ``spaGCN``:
 24 | 
 25 | ![domain segementation](figures/segementation_8_075B.png)
 26 | 
 27 | 
 28 | To run this tutorial, the CeLEry Python package needs to be installed. Please see the installation instructions.
 29 | 
 30 | ## 1. Preparation 
 31 | 
 32 | To run this tutorial, several
 33 | packages need to be imported.
 34 | 
 35 | 
 36 | ``` {.python}
 37 | import pickle
 38 | from scanpy import read_10x_h5
 39 | import CeLEry as cel
 40 | 
 41 | import scanpy as sc
 42 | import numpy as np
 43 | import pandas as pd
 44 | from scipy.sparse import issparse
 45 | ```
 46 | 
 47 | 
 48 | ## 2. Load Data 
 49 | 
 50 | Load scRNA-seq/snRNA-seq data. Example data can be downloaded from [Li and Zhang (2022)](https://doi.org/10.5281/zenodo.6640285).
 51 | 
 52 | ``` {.python}
 53 | QueryData_raw = read_10x_h5("data/Biogen/7G-1/filtered_feature_bc_matrix.h5")  # query sample: 10x Genomics HDF5 feature-barcode matrix
 54 | QueryData = cel.make_annData_query (QueryData_raw)  # presumably builds the preprocessed query AnnData; confirm against the CeLEry API
 55 | ```
 56 | 
 57 | 
 58 | It is important to make sure the query scRNA-seq/snRNA-seq data contain all the genes used by the trained model.
 59 | 
 60 | ``` {.python}
 61 | ## Load gene list
 62 | filename = "pretrainmodel/Biogen/Reference_genes_8_075B.obj"
 63 | filehandler = open(filename, 'rb') 
 64 | genenames = pickle.load(filehandler)
 65 | 
 66 | ## Rearrange the data and filter the selected genes in the trained model.
 67 | Qdata = QueryData[:,list(genenames)]
 68 | cel.get_zscore(Qdata)
 69 | ```
 70 | 
 71 | #### 3. Apply Pre-trained CeLEry model to the snRNA data
 72 | 
 73 | The gene expression of the first cell (a 1X886 matrix) in the snRNA-seq data is given by:
 74 | 
 75 | ``` {.python}
 76 | Qdata[0].X.A  # expression of the first cell as a dense array; assumes X is a SciPy sparse matrix (.A) — TODO confirm
 77 | ```
 78 | 
 79 | Load the CeLEry prediction model, which is located in
 80 | `"pretrainmodel/Biogen"` and named `Pretrained_model_075B`. We use the CeLEry
 81 | function `Predict_domain()` to conduct domain prediction for each single
 82 | cell in the scRNA-seq/snRNA-seq data. The arguments are
 83 | explained as follows:
 84 | 
 85 | -   data_test: (AnnData object) the input scRNA-seq/snRNA-seq data
 86 | -   class_num: (int) the number of classes to be predicted. This value
 87 |     should be consistent with the number of domains in the training
 88 |     model.
 89 | -   path: (string) the location of the pre-trained model
 90 | -   filename: (string) the file name of the saved pre-trained model
 91 | -   predtype: (string) if predtype is \"probability\" (default) then a
 92 |     probability prediction matrix will be produced; if predtype is
 93 |     \"deterministic\", then the deterministic assignment based on the
 94 |     maximum probability prediction will be returned; if predtype is
 95 |     \"both\", then both predictions will be returned.
 96 | 
 97 | ## 3. Prediction
 98 | 
 99 | Prediction of the first cell
100 | 
101 | ``` {.python}
102 | model_location = "pretrainmodel/Biogen/Pretrained_model_075B.obj"  # full path of the model file, for reference only: not used by the call below
103 | 
104 | pred_cord = cel.Predict_domain(data_test = Qdata[0], class_num = 8, path = "pretrainmodel/Biogen", filename = "Pretrained_model_075B", predtype = "probability")  # probability prediction over the 8 domains for the first cell
105 | ```
106 | 
107 | 
108 | Predict region labels of the entire scRNA-seq data and report the proportion of the cells on each domain.
109 | 
110 | ``` {.python}
111 | pred_cord_all = cel.Predict_domain(data_test = Qdata, class_num = 8, path = "pretrainmodel/Biogen", filename = "Pretrained_model_075B", predtype = "deterministic")  # deterministic domain assignment for every cell
112 | 
113 | prop_count = pd.DataFrame(pred_cord_all).value_counts().sort_index()  # number of cells per predicted domain, ordered by domain label
114 | prop_weight = prop_count/sum(prop_count)  # counts -> proportions
115 | prop_weight  # NOTE(review): a bare expression is only echoed when it is the last statement of a notebook cell
116 | prop_weight.to_csv("output/Biogen/prop_8_075B_7G-1.csv")  # this file is the prediction_path read by the R plotting code in section 4
117 | ```
118 | 
119 | The output of this example is:
120 | 
121 | ```
122 | 0	0.280068876
123 | 1	0.155832975
124 | 2	0.102539819
125 | 3	0.066465777
126 | 4	0.151183814
127 | 5	0.169436074
128 | 6	0.056048214
129 | 7	0.018424451
130 | ```
131 | The first column is the domain label from the training spatial transcriptomics data, as shown in the previous figure. The second column reports the proportion of cells predicted to be located in each domain.
132 | 
133 | 
134 | ## 4. Visualization
135 | 
136 | For the following part, we use the `ggplot()` in `R` to visualize the
137 | proportion predicted according to CeLEry. We are going to use the
138 | regions segmented from the spatial transcriptomics data to illustrate
139 | what the distribution looks like.
140 | 
141 | ### 4.1 R packages
142 | 
143 | ``` {.R}
144 | library(ggplot2)
145 | library(png)
146 | 
147 | outputdir <- "output/Biogen/plots/"
148 | ```
149 | 
150 | 
151 | ### 4.2 Plotting Functions
152 | 
153 | The `Density_plot()` function takes two input paths and an output name.
154 | 
155 | -   `obsdata_path` specifies the path of the observation data from the
156 |     spatial transcriptomics data that were used to train the model,
157 |     which are saved from the \".obs\" of the annotated data object in
158 |     Python. This file contains the spot ID, the locations of the spots,
159 |     and the region ID, and will be used as the background of
160 |     the visualization.
161 | 
162 | -   `prediction_path` specifies the path of the file that stores the
163 |     prediction results.
164 | 
165 | -   `objectname` specifies the name of the output figure.
166 | 
167 | ``` {.R}
168 | obsdata_path = "output/Biogen/obsdata_8_075B.csv"
169 | prediction_path = "output/Biogen/prop_8_075B_7G-1.csv"
170 | objectname = "BiogenExample"
171 | 
172 | # Plot every reference spot coloured by the predicted cell proportion of its domain; writes "<outputdir>Density_plot_<objectname>.png" (outputdir is the global set in section 4.1).
173 | Density_plot <- function(obsdata_path, prediction_path, objectname){
174 |   obsdata <- read.csv(obsdata_path, header = TRUE)  # TRUE rather than T: T is an ordinary variable that can be reassigned
175 |   pred_CeLEry <- read.csv(prediction_path, header = TRUE)
176 |   colnames(pred_CeLEry) <- c("Domain", "Density")
177 |   dataplot <- merge(obsdata, pred_CeLEry, by.x = "refined_pred", by.y = "Domain", all.x = TRUE)  # left join: keep spots of domains absent from the prediction file
178 |   dataplot$Density[is.na(dataplot$Density)] <- 0  # a domain with no predicted cells has proportion 0
179 |   png(file = paste0(outputdir,"Density_plot_",objectname,".png"), height = 300, width = 450)
180 |   on.exit(dev.off(), add = TRUE)  # always close the PNG device, even if plotting fails
181 |   DensityPlot2D <- ggplot(dataplot, aes(x = x_cord, y = y_cord)) +
182 |     theme_bw() +
183 |     geom_point(aes(color = Density), size = 3) +
184 |     # colour gradient from low to high predicted proportion
185 |     scale_color_gradient(low = "#7E7F9A", high = "#F3DE8A") +
186 |     theme(text=element_text(size=20, family="URWHelvetica"), axis.text = element_blank(),
187 |            axis.ticks=element_blank(),
188 |            panel.spacing = unit(1, "lines")) +
189 |     theme(strip.background =element_rect(fill="#3F4536",color="#3F4536"))+
190 |     theme(strip.text = element_text(colour = 'white')) +
191 |     theme(panel.border = element_rect(colour = "#3F4536"))  +
192 |     labs(x = NULL, y = NULL, color = "Proportion")
193 |   print(DensityPlot2D)  # a ggplot object is only rendered to the device when printed
194 | }
195 | 
196 | Density_plot(obsdata_path, prediction_path, objectname)
197 | ```
198 | 
199 | The output figures display the proportion of cells in the regions
200 | segmented in the training data.
201 | 
202 | ![prediction results](figures/Density_plot_BiogenExample.png)
203 | 


--------------------------------------------------------------------------------
/tutorial/data/AlzheimerToy.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/AlzheimerToy.h5ad


--------------------------------------------------------------------------------
/tutorial/data/DataLayerToy.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/DataLayerToy.h5ad


--------------------------------------------------------------------------------
/tutorial/data/Mouse2D/MP1_SVG.py:
--------------------------------------------------------------------------------
1 | d_g={0: [], 1: ['PVALB', 'RPS29', '2900097C17RIK', 'YWHAH', 'CALB1', 'SLC1A2', 'RPL39', 'NSG1', 'CAR8', 'RPS19', 'PCP4', 'SLC1A3', 'ATP1A3', 'SPARCL1', 'GPR37L1', 'SPARC', 'RGS8', 'MT-ND1', 'MT-CO2', 'ITPR1', 'GNG13', 'MT-ND3', 'MT-ND4', 'MT-ND2', 'GPM6B', 'SPTBN2', 'FAM107A', 'TSPAN3', 'RPL23A', 'SLC6A1', 'S100B', 'HBA-A1', 'SBK1', 'NDRG2', 'GAD1', 'RPS28', 'METRN', 'INPP5A', 'LAMP1', 'GABRA1', 'GRIA1', 'CAMK2N1', 'VDAC1', 'HBA-A2', 'HOPX', 'GSTM1', 'TTYH1', 'DNER', 'MPC2', 'NDUFB6', 'MT-ND5', 'THY1', 'AHCYL1', 'HBB-BT', 'NDUFS8', 'HOMER3', 'TOMM7', 'CLEC2L', 'CSDC2', 'RTN4', 'LHX1OS', 'SCG3', 'SELENOF', 'CALR', 'PRKCG', 'GABARAPL1', 'CLSTN3', 'TMBIM6', 'FAM213A', 'ICMT', 'H2-D1', 'SOD1', '1810037I17RIK', 'CYSTM1', 'PLA2G7', 'TRIM9', 'WSB2', 'PPP1R16B', 'ATOX1', 'MT2', 'WASHC2', 'ACSBG1', 'ABAT', 'MTSS1L', 'GRIA4', 'MALAT1', 'PTPRZ1', 'PMM1', 'ALDH1A1', 'B2M', 'UBL3', 'SLC32A1', 'MAP1A', 'PPP1R17', 'SUB1', 'TSC22D4', 'HSPA5', 'CCK', 'GOT2', 'FRRS1L', 'SELENOP', 'ABHD12', 'GARNL3', 'TIMP4', 'PAQR8', 'INA', 'RORA', 'ADGRB1', 'ABHD17A', 'CABP1', 'INPP4A', 'SCP2', 'ANKS1B', 'RHEB', 'CACNA1G', 'GABRG2', 'PLEKHB2', 'RGS7BP', 'ANKRD40', 'TMEM59', 'OST4', 'PTN', 'SLC24A2', 'FXYD1', 'ELMOD1', 'ARHGAP5', 'GRID2', 'KCNA2', 'CCT7', 'CS', 'EMC7', 'BOLA3', 'LAPTM4A', 'MLC1', 'NEFL', 'DAD1', 'HTRA1', 'VIM', 'SFXN5', 'ERP29', 'S100A1', 'NTRK2', 'PRDX6', 'NTM', 'DLGAP4', 'ASRGL1', 'PLPP3', 'SEC62', 'ID2', 'LSM6', 'NOMO1', 'BAIAP2', 'FAM69B', 'MTSS1', 'GPX1', 'RELL2', 'SERPINI1', 'PSD2', 'VEGFB', 'SLC38A1', 'GNG5', 'OSBPL1A', 'LPGAT1', 'ELMO1', 'GLO1', 'RMDN3', 'MRPL52', 'LY6E', 'MT-ND4L', 'ARHGEF33', 'GSTM5', 'SLC25A23', 'CACNG2', 'SOD2', 'SLC20A1', 'HAPLN4', 'SYNDIG1', 'OAZ2', 'PPP1R1B', 'SLC1A6', 'GABBR2', 'RAMP1', 'KCNJ10', 'PABPC1', 'FAM162A', 'TRIM37', 'LUZP2', 'GRM1', 'DLG2', 'ABR', 'SELENOS', 'MRPL33', 'CHN1', 'FABP7', 'NTSR2', 'KIT', 'EFR3A', 'TSPAN13', 'NACC2', 'MT-ATP8', 'SHANK1', 'KIF5B', 'ARPC2', 'ATL2', 'GAD2', 'SHISA6', 'KCNG4', 'SDHC', 'ATP2A3', 'GRIA3', 
'GFOD1', 'FAM107B', 'NEFM', 'TOLLIP', 'LXN', 'LRRN2', 'NRSN2', 'UHMK1', 'ERG28', 'GJA1', 'GNAL', 'RIDA', '1500009C09RIK', 'HEPACAM', 'ZFP385A', 'TRPC3', 'ITPKA', 'PPP1R14B', 'PLTP', 'CHPT1', 'KCTD12', 'BTBD1', 'DNAJC15', 'PRMT8', 'CDC42EP4', 'SLC24A3', 'TMEM47', 'SPOCK3', 'S1PR1'], 2: ['HBB-BT', 'AGT', 'NNAT', 'FXYD1', 'GFAP'], 3: ['SNAP25', 'LDHB', 'ATP5L', 'COX8A', 'COX6B1', 'COX5B', 'HSPA8', 'NDUFB8', 'CKB', 'SPARCL1', 'COX6C', 'CALM2', 'PCSK1N', 'CALM1', 'ALDOA', 'STMN3', 'ATP1A3', 'NDRG4', 'NDUFA4', 'SCN1B', '2010107E04RIK', 'ATP5B', 'ATP1B1', 'GNAS', 'ATP5A1', 'VSNL1', 'TUBA1B', 'COX7A2', 'ATP5J2', 'ATP5G3', 'UQCRQ', 'COX7B', 'SNRPN', 'MDH1', 'SLC25A4', 'CHCHD10', 'UQCRH', 'RAB3A', 'PKM', 'CPLX1', 'COX5A', 'ATP5J', 'UCHL1', 'ATPIF1', 'TPI1', 'USMG5', 'PVALB', 'NEFL', 'THY1', '2900097C17RIK', 'ENO2', 'DNM1', 'ZWINT', 'NDUFC1', 'ATP5K', 'CLSTN1', 'RTN1', 'EEF1A2', 'TCF25', 'SNCB', 'VAMP1', 'YWHAH', 'NSF', 'YWHAG', 'MAP1B', 'NCDN', 'PRDX5', 'STXBP1', 'ATP6V1B2', 'NEFM', 'GOT1', 'CEND1', 'TSPYL4', 'NAT8L', 'NSG1', 'ATP2B2', 'SLC12A5', 'CKMT1'], 4: [], 5: ['NRGN', 'CYFIP2', 'SERINC1', 'EEF1A2', 'APP', 'ZWINT', 'SYP', 'GPM6A', 'GNG3', 'CHN1', 'ATP6V1G2', 'CALM2', 'YWHAH', 'SLC17A7', 'VSNL1', 'CLSTN1', 'MEG3', 'SNHG11', 'RTN1', 'BASP1', 'DNM1', 'SYT1', 'SNAP25', 'CTXN1', 'CCK', 'VAMP2', 'STXBP1', '1110008P14RIK', 'PRKAR1B', 'CX3CL1', 'SCG5', 'LINGO1', 'ARPP19'], 6: ['APOD', 'GFAP'], 7: ['DCLK1', 'CALB1', 'FABP3', 'ITPKA', 'GAP43', 'GDA', 'MARCKS', 'CYP46A1', 'SYT5', 'ARPP21', 'HAP1', 'GRIA3', 'LINGO1', 'SYT4', 'NEGR1', 'LYPD1', 'TSPAN13', 'GABRA1', 'PTPN5', 'ATP2B4', 'MAL2', 'HPCAL1', 'RSRP1', 'NOV', 'DBPHT2', 'CAMK2D'], 8: ['RNF112', 'GRM4', 'SMG1', 'BMP1', 'TRIM62', 'PPFIA4', 'KNDC1', 'TESC', 'PLCH2', 'MPP3', 'ADAMTS10', 'CNTN2', 'TYRO3', 'BSN', 'PXN', 'SCN2A', 'TLE2', 'JPH4', 'USP3', 'IL16', 'LENG8', 'DUSP11', 'FAM131B', 'DOCK9', 'SEL1L3', 'JPH3', 'ODF2'], 9: ['RASGRP1', 'RAB6B', 'TRNP1', 'C1QTNF4', 'ATP2A2', 'YWHAZ', 'NCDN', 'ABHD8', 'REEP2', 'SPTBN1', 'CCK', 
'CNTN1', 'SYP', 'SPOCK2', 'PCP4', 'PDP1', 'CAMK2N2', 'ADARB1', 'NSMF', 'INA', 'PTPN4', 'GABBR1', 'ATP2B2', 'KCTD17', 'CX3CL1', 'RAP1GDS1', 'TCF7L2', 'ADGRB1', 'RORA', 'SYT1', 'PRKCD', 'STMN4', 'CAMK2B', 'ATP6AP2', 'ATP2B1', 'RAP1GAP', 'NCS1', 'ELMO1', 'RGS7BP', 'GABBR2', 'SLC24A2', 'RIMS3', 'DLGAP3', 'KIF1A', 'CHN1', 'PPP1R9B', 'MYO5A', 'PCP4L1', 'ZIC1', 'SLC17A6', 'NDUFA10', 'NRXN1', 'ANKS1B', 'PRKCG', 'AMOTL1', 'CDK5R1', 'OGFRL1', 'GRIN1', 'EDIL3', '2900011O08RIK', 'SYT7', 'PLEKHG1', 'CIT', 'ADCY1', 'CCDC136', 'RAB3C', 'CDK16', 'NTNG1', 'CAMK2A', 'DLGAP4', 'SPOCK3', 'KCNC1', 'SYN1', 'AATK', 'LRRN2', 'BTBD3', 'TNNT1', 'KCNC2', 'KCNAB2', 'SLC17A7', 'SPOCK1', 'PTPN3', 'FBXL16', 'BOK', 'PCSK2', 'PSD3', 'HLF', 'KCNA2', 'AI593442', 'RAMP3', 'CD47', 'NRIP3', 'RGS16', 'SMIM13', 'KNDC1', 'BSN', 'RNF112', 'PITPNM1', 'GABRA4', 'PLCB4', 'LYNX1', 'HSPH1', 'ATXN7L3', 'SHANK3', 'GRM1', 'ZFP365', 'NRXN3', 'CELF2', 'NELL2', 'GABRD', 'REPS2', 'SYNPO2', 'RGS4', '6430548M08RIK', 'SLC6A17', 'NR2F1', 'SCN2B', 'NECAB2', 'TRIM9', 'CYP46A1', 'LINGO1', 'CRMP1', 'ZDHHC22', 'SYT13', 'FNDC4', 'GNAL', 'LRRTM1', 'SLC24A3', 'ILDR2', 'SETD7', 'TIAM1', 'NR1D1', 'TTC7B', 'CACNA1G', 'NT5DC3', 'PRKCE', 'KITL', 'CORO2B', 'FAM20C', 'EPHA4', 'CDKL5', 'MTURN', 'NMNAT2', 'KCNIP4', 'PITPNC1', 'ZMAT4', 'BHLHE40', 'NELL1', 'B230334C09RIK', 'SEZ6L', 'CLMN', 'ADGRA1', 'FRRS1L', 'CACNB4', 'SHANK1', 'ZNRF1', 'HRH3', 'KCND2', 'SHOX2', 'MEGF9', 'FGF13', 'L1CAM', '1810041L15RIK', 'PPP2R5D', 'KCNQ3', 'TANC1', 'PATJ', 'CHRNA4', 'LHFP', 'OCIAD2', 'FAM126B', 'ADRA1B', 'BRINP1', 'LRTM2', 'PTK2B', 'KCNJ9', 'CPNE9']}
2 | 
3 | 


--------------------------------------------------------------------------------
/tutorial/data/MousePosteriorToy.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/MousePosteriorToy.h5ad


--------------------------------------------------------------------------------
/tutorial/data/MouseSCToy.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/data/MouseSCToy.h5ad


--------------------------------------------------------------------------------
/tutorial/figures/Density_plot_BiogenExample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/figures/Density_plot_BiogenExample.png


--------------------------------------------------------------------------------
/tutorial/figures/segementation_8_075B.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QihuangZhang/CeLEry/b0b73764bb517b0b7b360d4b18bed5d6b77a4615/tutorial/figures/segementation_8_075B.png


--------------------------------------------------------------------------------