4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/CoreFunctions/FindSpatialDEGs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.neighbors import KNeighborsClassifier
3 |
def get_neighborhoods_with_null(mat, ct, center, cell, k, niter):
    '''
    Builds the observed local-neighborhood expression for one cell type
    together with a permutation-based null distribution
    ---------------------
    parameters:
        mat: digital gene expression matrix (rows are selected with .iloc)
        ct: vector of all cell types, one entry per row of mat
        center: x, y, z coordinates for each cell
        cell: cell type to find spDEGs for
        k: number of cells in a local neighborhood
        niter: number of permutations
    ---------------------
    returns:
        gene_mat: local neighborhood expression
        nullm: null distribution of local neighborhood expression
        None is returned instead when fewer than 4*k cells of the type exist
    '''
    # Row positions of the cells belonging to the requested cell type
    type_rows = np.where(ct == cell)[0]
    type_expr = mat.iloc[type_rows, :]
    n_cells = type_expr.shape[0]

    # Hard-coded cutoff (can be changed): with fewer than 4 neighborhoods'
    # worth of cells no spDEGs are computed
    if n_cells < 4 * k:
        print('Only ', str(n_cells), ' cells present')
        return None

    # The classifier is fitted only so its kneighbors lookup can be used to
    # find, for every cell, the k nearest cells of the same type
    type_coords = center[type_rows, :]
    type_labels = ct[type_rows]
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(type_coords, type_labels)
    neighbor_ids = knn.kneighbors(type_coords)[1]

    # Null distribution first, then the observed neighborhood expression
    nullm = get_null(type_expr, neighbor_ids, niter)
    gene_mat = get_local_neigh(type_expr, neighbor_ids)

    return gene_mat, nullm
45 |
46 |
def get_null(sm, ids, niter):
    '''
    Builds the null distribution of local neighborhoods by repeatedly
    shuffling the neighbor indices and recomputing the neighborhood means
    ---------------------
    parameters:
        sm: DGE of cells in the current cell type
        ids: indices of nearest neighbors
        niter: number of permutations
    ---------------------
    returns:
        nullmat: null distribution of local neighborhood expression,
                 shaped (niter, cells, genes)
    '''
    n_cells, n_genes = sm.shape[0], sm.shape[1]
    nullmat = np.zeros((niter, n_cells, n_genes))

    # Work on a private copy so the caller's neighbor table is untouched.
    # The flat view shares memory with the copy, so shuffling it in place
    # permutes every entry of the 2d table; shuffles accumulate across rounds.
    shuffled_ids = ids.copy()
    flat_view = shuffled_ids.ravel()

    for round_idx in range(niter):
        np.random.shuffle(flat_view)
        nullmat[round_idx] = get_local_neigh(sm, shuffled_ids)

    return nullmat
67 |
68 |
def get_local_neigh(cm, ids):
    '''
    Gets the local expression of a neighborhood around each cell
    ---------------------
    parameters:
        cm: DGE of cells in current cell type (rows are selected with .iloc)
        ids: indices of nearest neighbors, one row of neighbor positions
             per cell
    ---------------------
    returns:
        neigh_mat: local neighborhood expression around each cell; row i is
                   the per-gene mean over the cells listed in ids[i].
                   Always a floating point array.
    '''
    neigh_mat = np.zeros_like(cm)

    # zeros_like inherits the dtype of cm. For integer count matrices that
    # would silently truncate every mean to an integer, so switch to float64
    # unless the input is already floating point.
    if neigh_mat.dtype.kind != 'f':
        neigh_mat = neigh_mat.astype(np.float64)

    for row, neighbor_rows in enumerate(ids):
        neighborhood = cm.iloc[neighbor_rows, :]
        neigh_mat[row, :] = np.mean(neighborhood, axis=0)

    return neigh_mat
86 |
def get_spatial_pval(cells_mat, celltypes, cell_cent, ct, nneighbors, nperm):
    '''
    Computes a permutation p value per gene for one cell type by comparing
    the variance of the observed neighborhood expression with the variances
    obtained from the permuted neighborhoods
    ---------------------
    parameters:
        cells_mat: DGE matrix (cells x genes)
        celltypes: vector of cell types
        cell_cent: locations of each cell, (cells x euclidean (xyz))
        ct: cell type of interest
        nneighbors: number of neighbors in a local neighborhood
        nperm: number of permutations used to generate the null distribution
    ---------------------
    returns:
        ps_mat_raveled: list of [p value, gene index] pairs
        returns: None if there aren't enough cells
    '''
    neighborhoods = get_neighborhoods_with_null(cells_mat, celltypes,
                                                cell_cent, ct, nneighbors,
                                                nperm)

    # Guard clause: the helper signals "too few cells" with None
    if neighborhoods is None:
        print('Not enough cells in cell type.')
        return None

    observed, null = neighborhoods

    ps_mat_raveled = []
    for gene_idx in range(observed.shape[1]):
        # variance across cells for every permutation of this gene
        null_var = np.var(null[:, :, gene_idx], axis=1)
        observed_var = np.var(observed[:, gene_idx])

        # p is the fraction of permutations whose variance is at least as
        # large as the observed one
        fraction_below = np.sum(observed_var > null_var) / len(null_var)
        ps_mat_raveled.append([1 - fraction_below, gene_idx])

    return ps_mat_raveled
127 |
--------------------------------------------------------------------------------
/CoreFunctions/JSTA.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | import pickle as pkl
6 | from math import exp, sqrt, log
7 | import scipy.spatial
8 | import os
9 | from time import time
10 | from collections import Counter
11 | from sklearn.preprocessing import scale
12 | from skimage.segmentation import watershed
13 | from numpy.ctypeslib import ndpointer
14 | import ctypes
15 | from tensorflow.keras.layers import Input, Dense, BatchNormalization
16 | from tensorflow.keras.models import Model
17 | from tensorflow.keras.regularizers import l2, l1
18 | from tensorflow.keras.optimizers import Adam
19 | from sklearn.model_selection import train_test_split
20 | from sklearn import neighbors
21 |
# Placeholder left in the source: the right-hand side must be replaced with an
# actual path before this module can be imported (as written, the assignment
# has no value and is a syntax error).
# NOTE(review): presumably the location of the compiled C helpers called
# through ctypes further down in this module - confirm.
path_to_file = #REPLACE-WITH-PATH
23 |
def find_empty_pixels(pix):
    '''
    Flags which pixels carry any signal, used later to exclude border pix
    ---------------------
    parameters:
        pix: 4d array of pixel identities
    ---------------------
    return:
        empty_pix: 3d array of pixels 0 if empty 1 if not empty
    '''
    # Collapse the last axis into one total per pixel
    totals = np.sum(pix, axis=3)

    # Any strictly positive total is clamped to 1; other values are left alone
    occupied = totals > 0
    totals[occupied] = 1

    return totals
38 |
39 |
def add_coords_to_cell_matrix(cm, nuc):
    '''
    Prepends the cell center coordinates to the cell matrix
    ---------------------
    parameters:
        cm: cell count matrix
        nuc: dataframe of nuclei
    return:
        cm: cell count matrix with x, y, z columns placed in front
    '''
    # Only the centers are needed here; the id vector is discarded
    centers, _ = get_cell_centers(nuc)

    # Float frame of zeros that shares the row labels of cm, then filled
    # positionally with the centers (row order of cm is assumed to match the
    # order returned by get_cell_centers)
    coords = pd.DataFrame(0.0, index=cm.index, columns=['x', 'y', 'z'])
    coords.loc[:, :] = centers

    return pd.concat([coords, cm], axis=1)
57 |
def get_cell_centers(nuc):
    '''
    Computes the center of each nucleus as the mean of its points
    ---------------------
    parameters:
        nuc: dataframe of nuclei (columns id, x, y, z)
    ---------------------
    return:
        cell_centers: array of the x,y,z center of each nucleus
        nuc_id: array of the nucleus ids in the same order as cell_centers
    '''
    # np.unique returns the ids sorted, which fixes the output order
    unique_ids = np.unique(nuc.id)

    centers = []
    for ind in unique_ids:
        points = nuc[nuc.id == ind]
        centers.append([np.mean(points.x),
                        np.mean(points.y),
                        np.mean(points.z)])

    cell_centers = np.array(centers)
    nuc_id = np.array([int(ind) for ind in unique_ids])
    return cell_centers, nuc_id
79 |
def get_matrix_of_cells(pix, cell_assign, nuclei_clusters):
    '''
    Builds the count matrix from the current cell assignment and the true
    input pixels
    ---------------------
    parameters:
        pix: 4d array of pixels with their gene expression
        cell_assign: current assignment of each pixel to a cell
        nuclei_clusters: dataframe of nuclei points
    ---------------------
    return: cells_mat nxm count matrix where n is number of cells m is number of genes
    '''
    cell_ids = np.unique(nuclei_clusters.id)
    n_gene = pix.shape[3]

    # One row per nucleus id (as int), one column per gene index
    cells_mat = pd.DataFrame(np.zeros((len(cell_ids), n_gene)),
                             columns=np.arange(n_gene),
                             index=cell_ids.astype(int))

    for cell_id in cell_ids:
        # coordinates of every pixel currently assigned to this cell
        where = np.nonzero(cell_assign == cell_id)
        member_pixels = pix[where[0], where[1], where[2], :]
        cells_mat.loc[cell_id, :] = np.sum(member_pixels, axis=0)

    return cells_mat
103 |
104 |
def get_number_similar_surroundings(cell_assign):
    '''
    Prepares int32 buffers from the current pixel assignment, passes them to
    the external C routine get_sur and returns the two count arrays it fills
    ---------------------
    parameters:
        cell_assign: current assignment of each pixel (its first three
                     dimensions are passed to get_sur as height, width, depth)
    returns:
        surr_count: array shaped like cell_assign, filled by get_sur
                    (presumably the number of surrounding pixels that belong
                    to a different cell - TODO confirm against the C source)
        same_count: array shaped like cell_assign, filled by get_sur and then
                    decremented by 1 (presumably the number of surrounding
                    pixels with the same assignment, excluding the pixel
                    itself - TODO confirm against the C source)
    '''
    # output buffers, one entry per pixel
    surr_count = np.zeros_like(cell_assign)
    same_count = np.zeros_like(cell_assign)

    # pad every side with one layer of -2 so border pixels have a full
    # neighborhood; -2 marks "outside the volume"
    surroundings = np.pad(cell_assign,(1),'constant',constant_values=(-2))
    height = cell_assign.shape[0]; width = cell_assign.shape[1]; depth = cell_assign.shape[2]

    surroundings = np.array(surroundings, dtype=int)
    surr_count = np.array(surr_count, dtype=int)
    same_count = np.array(same_count, dtype=int)

    # declare the C signature: three C-contiguous int arrays and three ints
    c_args = [ndpointer(dtype=ctypes.c_int,flags='C'),
              ndpointer(dtype=ctypes.c_int,flags='C'),
              ndpointer(dtype=ctypes.c_int,flags='C'),
              ctypes.c_int, ctypes.c_int, ctypes.c_int]
    # NOTE(review): get_num_surr_func_c and get_sur are not defined in this
    # function; presumably module-level ctypes handles to the same C
    # function - confirm.
    get_num_surr_func_c.get_sur.argtypes = c_args
    get_sur.restype = None

    # flatten and convert to int32 to match the ndpointer(c_int) declarations
    surroundings = surroundings.ravel().astype(np.int32)
    surr_count = surr_count.ravel().astype(np.int32)
    same_count = same_count.ravel().astype(np.int32)


    #c func (fills surr_count and same_count in place)
    get_sur(surroundings,surr_count,same_count, height, width, depth)
    same_count -= 1

    return surr_count.reshape(cell_assign.shape), same_count.reshape(cell_assign.shape)
143 |
def classify_pixels_to_nuclei(locs, nuclei_clust, dist_threshold):
    '''
    Classify each pixel to a nucleus or to nothing (-1)
    ---------------------
    parameters:
        locs: locations of pixels in x, y, z coordinates
              (4d array, last dimension of length 3)
        nuclei_clust: dataframe of nuclei spots (columns x, y, z, id)
        dist_threshold: maximum distance away from nucleus for classification
    ---------------------
    return:
        cell_assignment: integer array with the shape of the first three
                         dimensions of locs; each entry is the id of the
                         nearest nucleus spot, or -1 when that spot is not
                         closer than dist_threshold
    '''
    nearest = neighbors.NearestNeighbors(n_neighbors=1)
    nearest.fit(nuclei_clust.loc[:, ['x', 'y', 'z']].values, nuclei_clust.id)

    grid_shape = locs.shape[0:3]
    n_pixels = grid_shape[0] * grid_shape[1] * grid_shape[2]
    flat_locs = np.reshape(locs, (n_pixels, 3))

    dists, nearest_idx = nearest.kneighbors(flat_locs)
    dists = dists.ravel()

    # translate row positions in nuclei_clust into nucleus ids (fancy
    # indexing returns a copy, so the dataframe is not modified below)
    predicted = nuclei_clust.id.to_numpy()[nearest_idx.ravel()]
    predicted[~(dists < dist_threshold)] = -1

    # flat_locs was produced by a C-order reshape, so a C-order reshape of
    # the flat predictions puts every value back on its own pixel; this
    # replaces an element-by-element Python triple loop
    return predicted.reshape(grid_shape).astype(int)
169 |
def get_real_pixels(spots, approximate_binsize, genes_mer, pix_shape,dtype=np.float32):
    '''
    Returns the array of pixels using count data instead of smoothed values.
    A gene/z-plane combination holding a single spot is ignored, so the
    number of spots may be slightly more than the sum of the true pixels
    ---------------------
    parameters:
        spots: raw merfish data (columns x, y, z, gene)
        approximate_binsize: the approximated binsize of each histogram cell
        genes_mer: genes that are in the merfish data
        pix_shape: shape of pixels tensor
        dtype: dtype of the returned tensor
    ---------------------
    return:
        pix_true: 4d pixel tensor with true count data
    '''
    x_lo, x_hi = np.min(spots.x), np.max(spots.x)
    y_lo, y_hi = np.min(spots.y), np.max(spots.y)
    z_lo, z_hi = np.min(spots.z), np.max(spots.z)

    x_step = get_real_binsize(spots.x, approximate_binsize)
    y_step = get_real_binsize(spots.y, approximate_binsize)
    z_step = get_real_binsize(spots.z, approximate_binsize)

    # number of histogram bins in the x/y plane
    n_x_bins = len(np.arange(x_lo, x_hi + x_step, x_step))
    n_y_bins = len(np.arange(y_lo, y_hi + y_step, y_step))

    # edges of the z slabs; consecutive edges delimit one z plane
    z_edges = np.arange(z_lo, z_hi + z_step + 1, z_step)

    pix_true = np.zeros(pix_shape, dtype=dtype)
    xy_range = [[x_lo, x_hi], [y_lo, y_hi]]

    for gene_idx, gene in enumerate(genes_mer):
        print(gene)
        gene_spots = spots[spots.gene == gene]

        for z_idx, (z_from, z_to) in enumerate(zip(z_edges[:-1], z_edges[1:])):
            in_plane = gene_spots[(gene_spots.z >= z_from) &
                                  (gene_spots.z < z_to)]
            # planes with zero or one spot are left at zero
            if in_plane.shape[0] <= 1:
                continue
            counts = np.histogram2d(in_plane.x,
                                    in_plane.y,
                                    range=xy_range,
                                    bins=(n_x_bins, n_y_bins))[0]
            pix_true[:, :, z_idx, gene_idx] = counts

    return pix_true
221 |
222 |
223 |
def get_locations(spots, approximate_binsize):
    '''
    Gets the x, y, z coordinates of every pixel of the grid
    ---------------------
    parameters:
        spots: merfish raw data (columns x, y, z)
        approximate_binsize: the approximated binsize of each histogram cell
    ---------------------
    return:
        locations: 4d array with x, y, z coordinates for each pixel
    '''
    # per axis: (lower bound, exclusive upper bound, step)
    axis_specs = []
    for values in (spots.x, spots.y, spots.z):
        step = get_real_binsize(values, approximate_binsize)
        axis_specs.append((np.min(values), np.max(values) + step, step))

    # grid size along each axis, plus a trailing axis for the 3 coordinates
    grid_shape = tuple(len(np.arange(lo, hi, step))
                       for lo, hi, step in axis_specs)
    locations = np.zeros(grid_shape + (3,))

    # dense coordinate grids, one per axis, written into the last dimension
    grids = np.mgrid[tuple(slice(lo, hi, step) for lo, hi, step in axis_specs)]
    for axis_idx in range(3):
        locations[:, :, :, axis_idx] = grids[axis_idx]

    return locations
249 |
250 |
def fast_de_all_spots(spots, approximate_binsize,
                      bandwidth):
    '''
    Runs the pseudo-kde for every gene present in spots
    ---------------------
    parameters:
        spots: merfish raw data
        approximate_binsize: the approximated binsize of each histogram cell
        bandwidth: how far away to get information from for kde
    ---------------------
    return:
        kde_data: 4d array of all kde data for every gene (4th dimension),
                  genes ordered as returned by np.unique
    '''
    positions, grid_shape = get_positions_for_kde(spots, approximate_binsize)
    gene_names = np.unique(spots.gene)

    kde_data = np.zeros((grid_shape[0], grid_shape[1], grid_shape[2],
                         len(gene_names)))

    for gene_idx, gene in enumerate(gene_names):
        print(gene, gene_idx)
        gene_spots = spots[spots.gene == gene]
        kde_data[..., gene_idx] = fast_kde_spot(gene_spots, positions,
                                                approximate_binsize,
                                                bandwidth, grid_shape)

    return kde_data
273 |
def fast_kde_spot(spots, positions, approximate_binsize,
                  bandwidth,x_shape):
    '''
    Wrapper that runs fast_kde_with_knn on the spots and reshapes the result
    ---------------------
    parameters:
        spots: merfish raw data
        positions: center points for kde smoothing
        approximate_binsize: the approximated binsize of each histogram cell
                             (not used by this function)
        bandwidth: number of neighbors handed to fast_kde_with_knn
        x_shape: final shape of the 3d array for the smoothed kde
    return:
        spot_dense: 3d array with smoothed kde values
    '''
    xyz = spots.loc[:, ['x', 'y', 'z']].to_numpy()
    flat_density = fast_kde_with_knn(positions, xyz, bandwidth)
    return np.reshape(flat_density, x_shape)
291 |
def kde_nuclei(spots, nuclei,
               approximate_binsize, bandwidth):
    '''
    Get the smoothed density of the nuclei
    ---------------------
    parameters:
        spots: dataframe of the merfish spots (defines the grid)
        nuclei: dataframe of the nuclei points
        approximate_binsize: approximate binsize in microns
        bandwidth: number of neighbors to look for
    ---------------------
    return:
        nuc_dense: 3d array with the smoothed nuclei density, normalised so
                   that its mean is 1
    '''

    positions, x_shape = get_positions_for_kde(spots, approximate_binsize)
    coords = nuclei.loc[:, ['x', 'y', 'z']].to_numpy()

    print('getting density for nuclei')
    # fast_kde_with_knn accepts three positional arguments
    # (positions, coords, nneigh); a stray fourth positional argument here
    # used to make this call raise TypeError.
    nuc_dense = np.reshape(fast_kde_with_knn(positions, coords,
                                             bandwidth), x_shape)
    nuc_dense *= nuclei.shape[0]
    # dividing by the mean rescales the density to mean 1
    nuc_dense /= np.mean(nuc_dense)
    return nuc_dense
316 |
def get_positions_for_kde(spots, approximate_binsize):
    '''
    Creates the grid of positions at which the kde is evaluated
    ---------------------
    parameters:
        spots: raw merfish data (columns x, y, z)
        approximate_binsize: approximated binsize of the histogram cell
    ---------------------
    return:
        positions: (n_positions x 3) coordinates of where to find kde
        x_shape: the shape of the final 3d array
    '''
    # one slice per axis: from the minimum up to (maximum + step), stepping
    # by the adjusted binsize of that axis
    grid_slices = []
    for values in (spots.x, spots.y, spots.z):
        step = get_real_binsize(values, approximate_binsize)
        grid_slices.append(slice(np.min(values), np.max(values) + step, step))

    X, Y, Z = np.mgrid[tuple(grid_slices)]

    # flatten every coordinate grid and place them side by side as columns
    positions = np.array([X.ravel(), Y.ravel(), Z.ravel()]).T
    return positions, X.shape
343 |
def fast_kde_with_knn(positions, coords, nneigh, *, n_jobs=24):
    '''
    Pseudo-KDE by dividing the number of points near the point of interest
    by the volume it takes to reach that number of points
    ---------------------
    parameters:
        positions: vector of coordinates in x,y,z to find kde on
        coords: coordinates of the points to be smoothed by kde
        nneigh: number of neighbors to use, it is a proxy for bandwidth, or
                distance to pull information from (capped at the number of
                points in coords)
        n_jobs: keyword only; number of parallel jobs handed to the nearest
                neighbor search (default: 24, the previously hard-coded value)
    return:
        kde vec: vector of the smoothed kde values for each location in
                 positions; all zeros when coords holds no points
    '''
    n_points = coords.shape[0]

    # No points means no density anywhere; the neighbor search cannot be
    # built with zero samples, so answer directly.
    if n_points == 0:
        return np.zeros(len(positions))

    nneigh = min(nneigh, n_points)
    nbrs = neighbors.NearestNeighbors(n_neighbors=nneigh, algorithm='kd_tree',
                                      n_jobs=n_jobs).fit(coords)
    distances, _ = nbrs.kneighbors(positions)

    # volume of the sphere reaching the furthest of the nneigh neighbors.
    # NOTE: 3.14 approximates pi; kept as is so existing numeric output
    # does not change.
    denom = ((4/3*3.14))*distances[:, nneigh-1]**3
    # floor the volume to avoid exploding densities at tiny distances
    denom = np.maximum(denom, 1e-1)
    return nneigh/denom
364 |
365 |
def get_real_binsize(one_direction, approx_binsize):
    '''
    We approximate the binsize in microns, but because the actual range may not be divisible by the
    approximated binsize we need to change the true binsize
    ---------------------
    parameters:
        one_direction: the vector of coordinates for x, y, or z
        approx_binsize: approximate binsize in microns
    ---------------------
    return:
        actual binsize: the range/number of bins. When all coordinates are
                        identical (range of 0) the approximate binsize is
                        returned unchanged, so the axis ends up as one bin
                        instead of a NaN step.

    '''
    one_range = np.max(one_direction) - np.min(one_direction)
    nbins = np.ceil(one_range/approx_binsize) #so we can have equal sized bins

    # A flat axis (e.g. a single z plane) gives zero bins; 0/0 would be NaN
    # and break every arange/mgrid call built from the returned step.
    if nbins == 0:
        return float(approx_binsize)

    return one_range/nbins
382 |
def get_range(array):
    '''
    Computes the spread of the coordinates for either x, y, or z
    ---------------------
    parameters:
        array: numpy array of x, y, or z values
    ---------------------
    return:
        float: the range of the coordinates max-min
    '''
    highest = np.max(array)
    lowest = np.min(array)
    return highest - lowest
394 |
def shrink_window(df, x_min, x_max, y_min, y_max):
    '''
    Keeps only the rows lying strictly inside the given x/y window
    ---------------------
    parameters:
        df: dataframe to shrink (columns x and y)
        x_min: lower bound x (exclusive)
        x_max: upper bound x (exclusive)
        y_min: lower bound y (exclusive)
        y_max: upper bound y (exclusive)
    ---------------------
    return:
        new_df: shrunk dataframe
    '''
    inside_x = (df.x > x_min) & (df.x < x_max)
    inside_y = (df.y > y_min) & (df.y < y_max)
    return df[inside_x & inside_y]
414 |
def plot_segmentation(assignment, cmap='nipy_spectral'):
    '''
    Plots segmentation map as the max value through the z-stack
    ---------------------
    parameters:
        assignment: 3d array of the cell segmentation map (-1 marks pixels
                    without a cell)
        cmap: matplotlib color map to use
    '''

    # Rearrange the cell ids so the coloring is more spread out: every id
    # other than -1 is swapped for a randomly chosen id from the same set.
    labels, inverse = np.unique(assignment, return_inverse=True)
    is_cell = labels != -1

    lookup = labels.copy()
    lookup[is_cell] = np.random.permutation(labels[is_cell])

    # Relabel through a lookup on the ORIGINAL labels. Relabeling the working
    # copy in place, one id at a time, lets an already recolored cell be
    # recolored again when a later id equals its new color, merging cells.
    shuffled_assign = lookup[inverse].reshape(np.shape(assignment))

    #plot the max value through the z-stack
    plt.imshow(np.max(shuffled_assign, axis=2),
               cmap=cmap)
438 |
439 | def reclassify_squares(pix, pixl_true,
440 | cell_matrix, nuc,
441 | cell_assign, sc_ref,
442 | sc_ref_celltypes, all_genes,
443 | locs, clf_cell,
444 | pct_train=0.1, border_other_threshold=5,
445 | border_same_threshold=2,
446 | outer_max=1, inner_max=5,
447 | most_inner_max=5, dist_threshold=2, dist_scaling=5,
448 | anneal_param=0.05, flip_thresh=0.1,
449 | nlayer=3, first_epochs=25, second_epochs=15,
450 | lrs=[1e-3, 1e-4], l1_reg=1e-3):
451 | '''
452 | Method to flip pixels from one cell to another or to no cell assignment to improve cell segmentation
453 | High level description:
454 | a) Classify cells to a cell type.
455 | b)Train on a subset of the pixels to build a pixel level
456 | classifier for determining the celltype identity
457 | c) Flip border pixels acording to their predictions using the model in b.
458 | Keep switching between a, b, and c flipping pixels and retraining the models
459 | ---------------------
460 | parameters:
461 | pix: 4d tensor. 4th dimension is gene expression (kde) vector of each pixel
462 | pixl_true: 4d tensor. 4th dimension is gene expression (counts) vector of each pixel
463 | cell_matrix: voronoi segmented digital gene expression matrix cells x genes
464 | nuc: data frame with x,y,z coordinates of nuclei pixels with nuclei IDs
465 | cell_assign: 3d tensor. each pixel has it's current nuclei classification or none
466 | sc_ref: scRNAseq reference matrix cells x genes
467 | sc_ref_celltypes: vector of cell types for each scRNAseq cell
468 | all_genes: genes in merfish data
469 | locs: 4d tensor. 4th dimension is an x,y,z vector of the coordinates of the center of that pixel
470 | clf_cell: neural network based cell type predictor
471 | pct_train: percentage of pixels to train on for pixel classifier (default: 0.1)
472 | border_other_threshold: how many pixels need to belong to the other cell to flip to that cell (default: 5)
473 | border_same_threshold: border_other_threshold: how many pixels need to belong to the same cell to flip to that cell (default: 2)
474 | outer_max: numbegr of iterations of the outer loop (cell classification (a)) (default: 1)
475 | inner_max: number of iterations of the inner loop (pixel training (b)) (default: 5)
476 | most_inner_max: number of iterations of the flipping pixels loop (pixel flip (c)) (default: 5)
477 | dist_threshold: distance from edge of nucleus to ensure a cell belongs to that nucleus. (default: 2)
478 | dist_scaling: amount of decay of probabilities for flipping as you move away from a nucleus of
479 | interest. The probability decreases by half every dist_threshold*dist_scaling (default: 5)
480 | annealing_param: Parameter to decrease the probabalistic component of pixel and cell classification.
481 | Every iteration the highest probability is multiplied by 1+annealing_param*n_iteration (default: 0.5)
482 | flip_thresh: Pixel probabilities before this value get set to 0. (default: 0.1)
483 | nlayer: number of intermediate layers in the pixel classifier (default: 3)
484 | first_epochs: number of epochs for training after the pixel classifier was initialized (default: 25)
485 | second_epochs: number of epochs for training on subsequent rounds of training (default: 15)
486 | lrs: list of learning rates for training pixel classifier (default: [1e-3, 1e-4])
487 | l1_reg: l1 regularization parameter for neural network
488 | return:
489 | cell_assign: 3d tensor giving the nuclei assignment of each pixel
490 | '''
491 | # copy of original assignment for later use
492 | map_to_keep_nuclei_correctly_labeled = classify_pixels_to_nuclei(
493 | locs, nuc, dist_threshold)
494 |
495 | # name change. Get rid of later
496 | genes_to_use_prediction = all_genes
497 |
498 | # gets the genes we need for cell type prediction
499 | gene_subset_indices = []
500 | for i in all_genes:
501 | if i in genes_to_use_prediction:
502 | gene_subset_indices.append(True)
503 | else:
504 | gene_subset_indices.append(False)
505 |
506 | # train nuclei knn classifier for later use
507 | nuc_clf = neighbors.NearestNeighbors(n_neighbors=10).fit(nuc.loc[:, ['x', 'y', 'z']],
508 | nuc.id)
509 |
510 | # get count of surroundings that are the same and different
511 | surround_count, same_cts = get_number_similar_surroundings(cell_assign)
512 |
513 | # get number of cell types
514 | n_celltype = len(np.unique(sc_ref_celltypes))
515 |
516 | pix_shape = pix.shape
517 | x_max, y_max, z_max = pix_shape[0]-1, pix_shape[1]-1, pix_shape[2]-1
518 |
519 | # name change from before need to clean up
520 | cp_grid = pix
521 |
522 | num_iterations_outer = 0
523 | np.seterr(invalid='raise')
524 | square_param_diff_vec = []
525 |
526 | prediction_mean = []
527 | p_weight = None
528 | p_mean = None
529 | percent_flipped = []
530 | logi_param_diff = []
531 |
532 | # center and scale sc ref
533 | #sc_ref.loc[:,:] = scale(sc_ref, axis=0)
534 |
535 | overlapping_genes_for_merfish_map = np.isin(all_genes,
536 | sc_ref.columns)
537 |
538 | n_combined_cells = cell_matrix.shape[0]+sc_ref.shape[0]
539 | combined_cells = np.zeros((n_combined_cells,
540 | np.sum(overlapping_genes_for_merfish_map)))
541 |
542 | clf_log = pixel_nn_classifier(sc_ref,
543 | sc_ref_celltypes,
544 | nlayer,
545 | l1_reg)
546 |
547 | n_iterations = 0
548 | n_changed = []
549 | n_changed_overall = []
550 |
551 | while(num_iterations_outer < outer_max):
552 | cells_matrix = get_matrix_of_cells(
553 | pixl_true, cell_assign, nuc).to_numpy()
554 |
555 | non_empty_cell_locs = np.where(np.sum(cells_matrix, axis=1) > 100)[0]
556 | cells_matrix = scale(cells_matrix, axis=1)
557 | cells_matrix = scale(cells_matrix, axis=0)
558 | tic = time()
559 |
560 | # combines the mer and sc_ref matrices and builds a classifier for celltype
561 | # based on sc_ref
562 | print('finding celltypes')
563 | cells_probs = clf_cell.predict(cells_matrix)
564 |
565 | toc = time()
566 | print('time to get celltypes', toc-tic)
567 |
568 | prediction_mean.append(np.mean(np.max(cells_probs, axis=1)))
569 |
570 | # adding multiply the max prob by 1+n_iteration*annealing_param
571 | max_pred = np.argmax(cells_probs, axis=1)
572 |
573 | cells_probs[np.arange(len(max_pred)),
574 | max_pred] *= 1+n_iterations*anneal_param
575 | cells_probs /= np.sum(cells_probs, axis=1, keepdims=True)
576 |
577 | # get the identity of the predicted cell types
578 | groupings = np.argmax(cells_probs, axis=1)
579 |
580 | #groupings = t_cell
581 |
582 | last_param = None
583 |
584 | toc = time()
585 | #print('time to find cell types ',toc-tic)
586 |
587 | num_iterations_inner = 0
588 | past_square_param = None
589 | while(num_iterations_inner < inner_max):
590 | #print('inner iterations:',num_iterations_inner)
591 | flat_assign = np.ravel(cell_assign)
592 | group_labels = groupings[flat_assign]
593 |
594 | # the empty cell type will be index of number of celltypes
595 | group_labels[flat_assign == -1] = n_celltype
596 |
597 | # random selection of pixels to use for training the model
598 | subset_indices = np.random.choice(np.arange(0,
599 | flat_assign.shape[0], dtype=int),
600 | size=int(pct_train*len(group_labels)))
601 | tic = time()
602 | merged_pix_info = cp_grid
603 |
604 | merged_pix_shape = merged_pix_info.shape
605 | merged_pix_reshaped = np.reshape(
606 | merged_pix_info, (merged_pix_shape[0]*merged_pix_shape[1]*merged_pix_shape[2], merged_pix_shape[3]))
607 | group_labels_mat = np.reshape(
608 | group_labels, (merged_pix_shape[0], merged_pix_shape[1], merged_pix_shape[2]))
609 |
610 | sub_merged_pix = merged_pix_reshaped[subset_indices, :]
611 | sub_group_labels = group_labels[subset_indices]
612 |
613 | # remove non cell pixels from training
614 | sub_merged_pix = sub_merged_pix[sub_group_labels != n_celltype, :]
615 | sub_group_labels = sub_group_labels[sub_group_labels != n_celltype]
616 |
617 | # train a model to see what a pixel of a certain kind looks like
618 | if ((num_iterations_inner == 0)):
619 | clf_log = train_nn_classifier(sub_merged_pix, sub_group_labels, clf_log,
620 | first_epochs, lrs)
621 | else:
622 | clf_log = train_nn_classifier(sub_merged_pix, sub_group_labels, clf_log,
623 | second_epochs, [lrs[-1]])
624 |
625 | toc = time()
626 | print('time to train ', len(subset_indices), 'samples ', toc-tic)
627 | num_iterations_most_inner = 0
628 | while(num_iterations_most_inner < most_inner_max):
629 | # find border indices where they are not empty, and have more neighbors than the border
630 | # threshold
631 | border_indices = np.where((surround_count >= border_other_threshold) &
632 | (same_cts >= border_same_threshold))
633 |
634 | border_indices_mat = np.stack(
635 | [border_indices[0], border_indices[1], border_indices[2]], axis=1)
636 |
637 | tic = time()
638 | # predict the probability a border pixel is from each class
639 | predictions = clf_log.predict(
640 | merged_pix_info[border_indices[0], border_indices[1], border_indices[2], :])
641 |
642 | predictions[predictions <= flip_thresh] = 0
643 |
644 | ngene = merged_pix_info.shape[3]
645 | ms = merged_pix_info.shape
646 | npix = ms[0]*ms[1]*ms[2]
647 |
648 | # adding multiply the max prob by 1+n_iteration*annealing_param
649 | max_pred = np.argmax(predictions, axis=1)
650 |
651 | predictions[np.arange(len(max_pred)),
652 | max_pred] *= 1+n_iterations*anneal_param
653 |
654 | if predictions.shape[1] < (n_celltype):
655 | diff = np.setdiff1d(
656 | np.arange(n_celltype+1), np.unique(group_labels[subset_indices]))
657 | diff = np.sort(diff)
658 | for missing in diff:
659 | predictions = np.insert(predictions, missing, np.repeat(
660 | 0, predictions.shape[0]), axis=1)
661 | print('MISSSING ROW INSERTED!')
662 |
663 | toc = time()
664 | print('time to predict ', len(
665 | border_indices[0]), 'samples', toc-tic)
666 | # get the number of each cell type pixel surrounding each border pixel
667 |
668 | # some house keeping for the next step by padding arrays
669 | border_indices_mat += 1
670 | cell_assign = np.pad(
671 | cell_assign, (1), 'constant', constant_values=(-2))
672 |
673 | locs = np.pad(locs, ((1, 1), (1, 1), (1, 1), (0, 0)),
674 | 'constant', constant_values=(-2))
675 |
676 | bord_x, bord_y, bord_z = border_indices_mat[:,
677 | 0], border_indices_mat[:, 1], border_indices_mat[:, 2]
678 | surroundings = np.zeros(
679 | (border_indices_mat.shape[0]), dtype=int)
680 | group_key = np.zeros((border_indices_mat.shape[0]), dtype=int)
681 | predic_num = np.zeros_like(predictions)
682 | predic_probs = np.zeros(
683 | (predictions.shape[0], predictions.shape[1]))
684 | tic = time()
685 |
686 | pixels_to_nuc_dist_vec, pixels_to_nuclei_vec = nuc_clf.kneighbors(locs[bord_x,
687 | bord_y,
688 | bord_z, :])
689 |
690 | pixels_to_nuclei_vec = nuc.id.to_numpy().astype('float64')[pixels_to_nuclei_vec]
691 | toc = time()
692 | print('time to predict nuclei distance:', toc-tic)
693 |
694 | tic = time()
695 |
696 | # this creates a matrix where each row is the surrounding cell ids for that border pixel
697 | surroundings = np.zeros((len(bord_x), 27))
698 | sub_counter = 0
699 | for i in range(-1, 2):
700 | for j in range(-1, 2):
701 | for k in range(-1, 2):
702 | surroundings[:, sub_counter] = cell_assign[bord_x+i,
703 | bord_y+j,
704 | bord_z+k].copy()
705 | sub_counter += 1
706 |
707 | tic = time()
708 |
709 | input_surr = (surroundings.__array_interface__[
710 | 'data'][0]+np.arange(surroundings.shape[0])*surroundings.strides[0]).astype(np.uintp)
711 | input_nuc = (pixels_to_nuclei_vec.__array_interface__['data'][0]+np.arange(
712 | pixels_to_nuclei_vec.shape[0])*pixels_to_nuclei_vec.strides[0]).astype(np.uintp)
713 | input_dist = (pixels_to_nuc_dist_vec.__array_interface__['data'][0]+np.arange(
714 | pixels_to_nuc_dist_vec.shape[0])*pixels_to_nuc_dist_vec.strides[0]).astype(np.uintp)
715 | dist_mat = np.zeros(
716 | (surroundings.shape[0], 27), dtype=np.float64)
717 | input_dist_mat = (dist_mat.__array_interface__[
718 | 'data'][0]+np.arange(dist_mat.shape[0])*dist_mat.strides[0]).astype(np.uintp)
719 |
720 | # C function
721 | get_d(input_surr,
722 | input_nuc,
723 | input_dist,
724 | ctypes.c_int(surroundings.shape[0]),
725 | ctypes.c_int(pixels_to_nuclei_vec.shape[1]),
726 | input_dist_mat)
727 |
728 | toc = time()
729 |
730 | tic = time()
731 | for sub_counter in range(27):
732 | surro = surroundings[:, sub_counter].copy().astype(int)
733 |
734 | num_surroundings = len(surro)
735 |
736 | # get dist vec based on the first occurence of the surroundings
737 | # in the nearest neighbors vec from pix to nuclei
738 | dist_vec = dist_mat[:, sub_counter]
739 |
740 | reduced_dist = dist_vec - dist_threshold
741 | reduced_dist[reduced_dist <= 0] = 1e-3
742 |
743 | s = (dist_scaling*dist_threshold)/2
744 | scale_vec = np.divide(s, reduced_dist)
745 | scale_vec[dist_vec < dist_threshold] = 10
746 | scaled_final = np.minimum(10, scale_vec)
747 |
748 | scaled_final[np.where(surro == -1)] = 1
749 | scaled_final[np.where(surro == -2)] = 1
750 |
751 | group_key = groupings[surro].copy()
752 | group_key[np.where(surro == -1)] = -1
753 | group_key[np.where(surro == -2)] = -2
754 | non_neg = np.where(group_key >= 0)
755 | predic_num[non_neg, group_key[non_neg]] += 1
756 | predic_probs[non_neg,
757 | group_key[non_neg]] += scaled_final[non_neg]
758 |
759 | predic_num[np.where(predic_num == 0)] = 1
760 |
761 | predic_probs = np.divide(predic_probs, predic_num)
762 |
763 | predictions = np.multiply(predictions, predic_probs)
764 |
765 | pred_sum = np.sum(predictions, axis=1, keepdims=1)
766 | pred_sum[pred_sum == 0] = 1
767 | predictions /= pred_sum
768 |
769 | toc = time()
770 |
771 | number_of_border_pixels = border_indices_mat.shape[0]
772 |
773 | locs_to_flip = locs[border_indices_mat[:, 0],
774 | border_indices_mat[:, 1], border_indices_mat[:, 2], :]
775 |
776 | # remove 0 pad from locs
777 | ls = locs.shape
778 | locs = locs[1:ls[0]-1, 1:ls[1]-1, 1:ls[2]-1, :]
779 |
780 | flips_arg_bool = (predictions.cumsum(
781 | 1) > np.random.rand(predictions.shape[0])[:, None])
782 |
783 | flips_arg = flips_arg_bool.argmax(1)
784 |
785 | # set flip args that have prediction prob 0 to n_celltype
786 | flips_arg[np.sum(flips_arg_bool, axis=1) == 0] = n_celltype
787 | flips_arg[np.sum(predictions, axis=1) == 0] = n_celltype
788 |
789 | cells_to_flip = flips_arg
790 |
791 | borders_to_flip = border_indices_mat
792 |
793 | tic = time()
794 |
795 | bord_x, bord_y, bord_z = borders_to_flip[:,
796 | 0], borders_to_flip[:, 1], borders_to_flip[:, 2]
797 | old_id = cell_assign[bord_x,
798 | bord_y,
799 | bord_z].copy()
800 |
801 | surroundings = np.zeros(
802 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=int)
803 | group_key = np.zeros(
804 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=int)
805 | matching_regions = np.zeros(
806 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=np.float32)
807 | changed_surround_count = np.zeros(
808 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=int)
809 | loc_counter = 0
810 | for i in range(-1, 2):
811 | for j in range(-1, 2):
812 | for k in range(-1, 2):
813 | surroundings[:, loc_counter] = cell_assign[bord_x+i,
814 | bord_y+j,
815 | bord_z+k].copy()
816 |
817 | group_key[:, loc_counter] = groupings[surroundings[:, loc_counter]].copy(
818 | )
819 | group_key[:, loc_counter][np.where(
820 | surroundings[:, loc_counter] == -1)] = n_celltype
821 | matching_regions[:, loc_counter] = group_key[:,
822 | loc_counter] == cells_to_flip
823 | loc_counter += 1
824 |
825 | match_dist = matching_regions * dist_mat
826 | match_dist[match_dist == 0] = 1e3
827 | new_cell_loc = np.argmin(match_dist, axis=1)
828 |
829 | new_id = surroundings[np.arange(
830 | 0, surroundings.shape[0]), new_cell_loc].copy()
831 | new_id[np.sum(matching_regions, axis=1) == 0] = -1
832 | new_id[np.sum(flips_arg_bool, axis=1) == 0] = -1
833 | new_id[new_id == -2] = -1
834 |
835 | n_changed_this_round = len(
836 | new_id) - np.sum(np.equal(old_id, new_id))
837 | n_changed.append(n_changed_this_round)
838 | n_changed_overall.append(n_changed_this_round)
839 | if len(n_changed) > 10:
840 | n_changed = n_changed[1:]
841 |
842 | cell_assign[bord_x, bord_y, bord_z] = new_id
843 |
844 | same_and_different = surroundings == new_id.reshape(
845 | surroundings.shape[0], 1)
846 | different_count = np.sum(same_and_different == 0, axis=1)
847 | borders_to_flip -= 1
848 |
849 | # remove 0 pad from cell assignment
850 | cs = cell_assign.shape
851 | cell_assign = cell_assign[1:cs[0]-1, 1:cs[1]-1, 1:cs[2]-1]
852 | cell_assign[map_to_keep_nuclei_correctly_labeled != -1] = map_to_keep_nuclei_correctly_labeled[map_to_keep_nuclei_correctly_labeled != -1].copy()
853 |
854 | bord_x, bord_y, bord_z = borders_to_flip[:,
855 | 0], borders_to_flip[:, 1], borders_to_flip[:, 2]
856 |
857 | surround_count, same_cts = get_number_similar_surroundings(
858 | cell_assign)
859 |
860 | old_cells = groupings[old_id].copy()
861 | old_cells[np.where(old_id == -1)] = n_celltype
862 |
863 | new_cells = groupings[new_id].copy()
864 | new_cells[np.where(new_id == -1)] = n_celltype
865 |
866 | percent_flipped.append(
867 | np.sum(np.equal(new_id, old_id))/number_of_border_pixels)
868 |
869 | toc = time()
870 | print('time to flip pixels ', toc-tic)
871 | n_iterations += 1
872 | num_iterations_most_inner += 1
873 | num_iterations_inner += 1
874 | num_iterations_outer += 1
875 |
876 | # compute counts matrix
877 | cells_matrix = get_matrix_of_cells(pixl_true, cell_assign, nuc)
878 |
879 | return cell_assign, cells_matrix, groupings
880 |
def watershed_nuclei(pix, nuclei, locations):
    '''
    Seeds a watershed segmentation with the nuclei and grows the seeds over
    the log2 pixel density
    ---------------------
    parameters:
        pix: array that is summed over its 4th axis to get a density per pixel
        nuclei: table with columns id, x, y, z (one row per nucleus)
        locations: array with the 3 coordinates of every pixel; its first three
            dimensions give the shape of the pixel grid
    ---------------------
    returns:
        labels: output of watershed minus 1 (markers are nuclei.id + 1, so the
            shift brings seeded regions back to the nuclei ids)
    '''
    # NOTE(review): pix_thresh is not a parameter and is not defined in this
    # function -- presumably a module-level value; confirm before reuse.
    n_x, n_y, n_z = locations.shape[0], locations.shape[1], locations.shape[2]
    flat_locs = locations.reshape((n_x*n_y*n_z, 3))
    pixel_index = np.arange(0, flat_locs.shape[0])

    # 1-nearest-neighbour lookup: which pixel is closest to each nucleus
    nearest_pixel = neighbors.KNeighborsClassifier(1, n_jobs=12)
    nearest_pixel = nearest_pixel.fit(flat_locs, pixel_index)
    seed_pixels = nearest_pixel.predict(nuclei.loc[:, ['x', 'y', 'z']])

    # 0 is left for "no marker", hence the +1 on the ids
    markers = np.zeros(len(pixel_index))
    markers[seed_pixels] = nuclei.id+1
    markers = markers.reshape((n_x, n_y, n_z))

    density = np.log2(np.sum(pix, axis=3)+1)
    foreground = np.zeros_like(density)
    foreground[density > pix_thresh] = 1

    return watershed(density, markers,
                     watershed_line=True,
                     mask=foreground, compactness=10) - 1
900 |
def create_celltype_classifier(sf, sc, nlayers=2, l1_reg=1e-3,
                               epochs=20, lrs=[5e-3, 5e-4],
                               test_size=0.25):
    '''
    Creates a cell type classifier and trains the model based on a neural network
    ---------------------
    parameters:
        sf: single cell reference dataset
        sc: single cell reference cell types. (must be int)
        nlayers: number of intermediate layers in network (default: 2)
        l1_reg: l1 regularization parameter (default: 1e-3)
        epochs: number of epochs to train for, per learning rate cycle (default: 20)
        lrs: learning rates. Must be a list of learning rates (defaults: [5e-3,5e-4]);
            the list is only iterated, never modified
        test_size: percentage of dataset to use for validation (default: 0.25)
    ---------------------
    returns:
        clf_cell: Trained cell type classifier
    '''
    ncelltype = len(np.unique(sc))
    input_dim = sf.shape[1]

    # network: input -> (Dense tanh + BatchNorm) blocks -> softmax over cell types
    input_vec = Input(shape=(input_dim,))
    x = Dense(input_dim*3, activation='tanh',
              activity_regularizer=l1(l1_reg))(input_vec)
    x = BatchNormalization()(x)
    for _ in range(nlayers-1):
        x = Dense(input_dim*3, activation='tanh',
                  activity_regularizer=l1(l1_reg))(x)
        x = BatchNormalization()(x)
    out = Dense(ncelltype, activation='softmax',
                activity_regularizer=l1(l1_reg))(x)

    clf_cell = Model(input_vec, out)
    clf_cell.summary()

    # standardise along axis 1 first, then along axis 0
    scaled_ref = scale(sf, axis=1)
    scaled_ref = scale(scaled_ref, axis=0)

    X_train, X_test, y_train, y_test = train_test_split(scaled_ref, sc,
                                                        test_size=test_size,
                                                        random_state=0)

    # one training cycle per learning rate; re-compiling keeps the weights
    for lr in lrs:
        adam = Adam(learning_rate=lr)
        # pass the configured optimizer object: the string 'Adam' would build
        # a default-rate optimizer and silently ignore lr
        clf_cell.compile(optimizer=adam,
                         loss='sparse_categorical_crossentropy',
                         metrics=['accuracy'])
        clf_cell.fit(X_train, y_train,
                     validation_data=(np.array(X_test), np.array(y_test)),
                     epochs=epochs,
                     batch_size=64,
                     use_multiprocessing=True)

    return clf_cell
956 |
def pixel_nn_classifier(mp,sc,nlayer, l2_reg):
    '''
    Builds (does not train) a dense softmax classifier
    ---------------------
    parameters:
        mp: feature matrix, only mp.shape[1] is used (input width)
        sc: labels, only the number of unique values is used (output width)
        nlayer: number of Dense + BatchNormalization blocks; at least one
            block is always created
        l2_reg: regularization strength. Note that it is handed to l1(),
            despite the parameter name
    ---------------------
    returns:
        clf: the un-compiled model
    '''
    n_classes = len(np.unique(sc))
    n_features = mp.shape[1]

    # first block is 2x the input width, block i (i >= 1) is 2**i times the width
    hidden_widths = [n_features*2]
    hidden_widths += [n_features*((2)**(i)) for i in range(1, nlayer)]

    input_vec = Input(shape=(n_features,))
    x = input_vec
    for width in hidden_widths:
        x = Dense(width, activation='tanh',
                  activity_regularizer=l1(l2_reg))(x)
        x = BatchNormalization()(x)

    out = Dense(n_classes, activation='softmax',
                activity_regularizer=l1(l2_reg))(x)

    clf = Model(input_vec, out)
    clf.summary()
    return clf
975 |
def train_nn_classifier(mp,sc,clf,epo,lrs):
    '''
    Trains a classifier with one training cycle per learning rate
    ---------------------
    parameters:
        mp: feature matrix
        sc: labels for each row of mp
        clf: model to train (compiled here, once per learning rate)
        epo: number of epochs per learning rate cycle
        lrs: iterable of learning rates, used in order
    ---------------------
    returns:
        clf: the trained model
    '''
    # 20% of the data is held out for validation, with a fixed split
    X_train, X_test, y_train, y_test = train_test_split(mp, sc,
                                                        test_size=0.2,
                                                        random_state=0)
    for lr in lrs:
        adam = Adam(learning_rate=lr)
        # pass the configured optimizer object: the string 'Adam' would build
        # a default-rate optimizer and silently ignore lr
        clf.compile(optimizer=adam,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
        clf.fit(X_train, y_train,
                epochs=epo,
                batch_size=64,
                validation_data=(X_test, y_test),
                use_multiprocessing=True)
    return clf
990 |
def get_centers(vec):
    '''
    Gets the center of each pixel
    ---------------------
    parameters:
        vec: the aranged vector of the edge of each pixel
    ---------------------
    returns:
        array of len(vec)-1 values, the mean of every pair of neighbouring
        edges (empty when vec has fewer than two entries)
    '''
    return np.array([np.mean([vec[i], vec[i+1]])
                     for i in range(len(vec)-1)])
1003 |
def transform_to_stage_coordinates(spt, metadata, image_center=1024, z_step=0.4):
    '''
    Transforms the spots from image coordinates to stage coordinates
    ---------------------
    parameters:
        spt: spot table with columns posname, gene, centroid and z
            centroid[0] feeds the y coordinate, centroid[1] the x coordinate
        metadata: table with columns Position, XYbeforeTransform ([x, y] of the
            position) and PixelSize; the first row matching a position is used
        image_center: value subtracted from both centroid entries before
            scaling by the pixel size (default: 1024)
        z_step: factor applied to the z index (default: 0.4)
    ---------------------
    returns:
        df_spt: dataframe with columns gene, x, y, z. Rows are grouped by
            position, in the sorted order returned by np.unique
    '''
    new_spots = np.zeros((spt.shape[0], 4))
    genes = []
    row = 0
    for pos in np.unique(spt.posname):
        print(pos)
        pos_spots = spt[spt.posname == pos]
        genes.extend(pos_spots.gene.to_numpy())

        pos_meta = metadata[metadata.Position == pos]
        stage_xy = pos_meta.XYbeforeTransform.to_numpy()[0]
        pix_size = pos_meta.PixelSize.to_numpy()[0]
        stage_x, stage_y = stage_xy[0], stage_xy[1]

        for centroid, z in zip(pos_spots.centroid, pos_spots.z):
            # column 0 is a placeholder, the gene names are filled in below
            new_spots[row, 1:] = [(centroid[1]-image_center)*pix_size + stage_x,
                                  (centroid[0]-image_center)*pix_size + stage_y,
                                  z*z_step]
            row += 1

    df_spt = pd.DataFrame(new_spots, columns=['gene', 'x', 'y', 'z'])
    df_spt['gene'] = np.array(genes)

    return df_spt
1043 |
# ctypes bindings for the compiled helpers stored under path_to_file.
# get_distances takes three 1-d uintp arrays, two C ints and one more 1-d
# uintp array, and returns nothing.
c_args = ([ndpointer(dtype=np.uintp, ndim=1, flags='C')] * 3
          + [ctypes.c_int, ctypes.c_int]
          + [ndpointer(dtype=np.uintp, ndim=1, flags='C')])

dist_func_c = ctypes.CDLL(path_to_file+"/get_distances.so")
get_d = dist_func_c.get_distances
get_d.argtypes = c_args
get_d.restype = None

# no argtypes/restype are declared for get_sur here
get_num_surr_func_c = ctypes.CDLL(path_to_file+"/get_number_similar_surroundings.so")
get_sur = get_num_surr_func_c.get_sur
1056 |
--------------------------------------------------------------------------------
/CoreFunctions/MerfishSimulator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import pandas as pd
4 | from sklearn.neighbors import KNeighborsClassifier
5 | from random import randint
6 | from collections import Counter
7 | import matplotlib.patches as mpatches
8 | import pickle as pkl
9 | import json
10 | import sys
11 | from sklearn.decomposition import PCA
12 | from sklearn.preprocessing import scale
13 | from sklearn.linear_model import LogisticRegression
14 | from sklearn.model_selection import train_test_split
15 |
16 | class merfish_data_generator:
17 | '''
18 | Class to generate merfish data, all values are in microns
19 | --------------------
20 | parameters:
21 | celltypes: list of the unique cell types (default: None)
22 | genes: list of the genes used for merfish data (default: None)
23 | celltype_props: list of the cell type proportions should be in the same order as
24 | celltypes (default: None)
25 | density_range: range of areas, and how dense to make them (default: [1])
26 | dist_between_cell_centers: range of values of how far apart
27 | to make them (default: [30])
28 | cell_shape_regularity: how round the cell should be (default: 1)
29 | dge: digital gene expression matrix celltypes x genes (default: None)
30 | noise: overlap between cells count data (default: 0)
31 | heterogeneity: how heterogeneous to make the cell type distribution (default: 1)
32 | grid_shape: x, y, z sizze of the grid
33 | nuclei_size_range: variability in the size of the nuclei (default: [0.2])
34 | distance_between_cells: adds to the overall distance between cells
35 | negative values squish cells closer together (default: 0)
36 | subtype: Boolean, indicating whether or not the current cells are
37 | subtypes
38 | --------------------
39 | functions:
40 | generate_grid: Generates the xyz grid for where the cell centers are
41 | generate_cell_centers: Generates a dataframe with the cell centers in xyz coords
42 | assign_pixels_to_cells: Assigns the pixels to the cells based on
43 | modified voronoi from the center of the cell w/ max distance
44 | plot_true: Plots the cell ID grid in a single z_stack value
45 | plot_celltypes: Plots the celltype grid in a single z_stack value
46 | generate_nuclei: Generates the nuclei randomly within a cell
47 | add_dge: Adds the DGE matrices to the object both should be celltypes x genes
48 | Can be used to add a self curated dge instead of computing one
49 | compute_dge: Computes and adds the DGE matrices to the object. It should be in celltypes x genes
50 | Also computes cell type proportions
51 | classify_celltypes: Adds celltypes to the cells based on the cell type proportions
52 | generate_merfish_dge: Generates the merfish dge for the cells, based on their cell type and the dge from
53 | single cell data
54 | place_transcripts: Places the spots within the cell
55 | '''
56 | def __init__(self, celltypes = None,
57 | genes = None,
58 | celltype_props = None,
59 | density_range = [1],
60 | dist_between_cell_centers = [30],
61 | cell_shape_regularity = 1,
62 | dge = None,
63 | noise = 0,
64 | heterogeneity = 1,
65 | grid_shape = (200,200,50),
66 | nuclei_size_range = [0.2],
67 | distance_between_cells = 0,
68 | subtype = False):
69 | self.celltypes = celltypes
70 | self.genes = genes
71 | self.density_range = density_range
72 | self.dist_between_cell_centers = dist_between_cell_centers
73 | self.cell_shape_regularity = cell_shape_regularity
74 | self.celltype_props = celltype_props
75 | self.dge = dge
76 | self.noise = noise
77 | self.heterogeneity = 1
78 | self.grid_shape = grid_shape
79 | #percentage of cell size
80 | self.nuclei_size_range = nuclei_size_range
81 | self.x_max = grid_shape[0]; self.x_min = 0
82 | self.y_max = grid_shape[1]; self.y_min = 0
83 | self.z_max = grid_shape[2]; self.z_min = 0
84 | self.distance_between_cells = distance_between_cells
85 |
86 | self.cell_centers = None
87 | self.true_map = None
88 |
89 | self.subtype = subtype
90 |
91 |
def generate_grid(self, space_between):
    '''
    Generates a regular xyz grid from 0 up to (not including) x_max, y_max
    and z_max
    --------------------
    parameters:
        space_between: step between two neighbouring grid points
    --------------------
    returns:
        x_coord: x coordinate of every grid point
        y_coord: y coordinate of every grid point
        z_coord: z coordinate of every grid point
        (np.meshgrid default indexing, so the arrays have shape
        (len(ys), len(xs), len(zs)))
    '''
    upper_bounds = (self.x_max, self.y_max, self.z_max)
    axes = [np.arange(0, upper, space_between) for upper in upper_bounds]
    x_coord, y_coord, z_coord = np.meshgrid(*axes)
    return x_coord, y_coord, z_coord
110 |
111 |
def generate_cell_centers(self):
    '''
    Places one cell center on every point of a regular grid and jitters it
    with gaussian noise
    --------------------
    adds:
        self.cell_centers: dataframe with xyz coordinates
        self.cell_ids: the identification number of each cell (0..n-1)
    '''
    # grid step: mean center distance plus the extra spacing
    # jitter: standard deviation of the center distances
    spacing = np.mean(self.dist_between_cell_centers)+self.distance_between_cells
    jitter_sd = np.std(self.dist_between_cell_centers)

    x_coord, y_coord, z_coord = self.generate_grid(spacing)
    n0, n1, n2 = x_coord.shape[0], x_coord.shape[1], x_coord.shape[2]

    # noise is drawn in x, y, z order
    for coord in (x_coord, y_coord, z_coord):
        coord += np.random.randn(n0, n1, n2)*jitter_sd

    self.cell_centers = pd.DataFrame({'x': x_coord.ravel(),
                                      'y': y_coord.ravel(),
                                      'z': z_coord.ravel()})
    self.cell_ids = np.arange(self.cell_centers.shape[0])
140 |
def assign_pixels_to_cells(self,
                           pixels_per_micron=.5,
                           max_dist=20,
                           noise_in_dist=0,
                           min_pix_count=30):
    '''
    Assigns the pixels to the cells based on modified voronoi from the
    center of the cell w/ max distance
    --------------------
    parameters:
        pixels_per_micron: measure for the resolution of the grid. Higher values increase
            resolution, but increase run time and memory requirements (default: .5)
        max_dist: currently NOT used: the cut-off is always half of the mean
            of dist_between_cell_centers (see note in the code)
        noise_in_dist: standard deviation of the gaussian noise added to the
            pixel-to-center distances (default: 0)
        min_pix_count: minimum pixels assigned to a cell for a cell to be considered a cell
    --------------------
    adds:
        self.pix_per_micron: the resolution that was used
        self.cell_ids: removes the cell ids with < min_pix_count pixels assigned to it
        self.cell_centers: removes the centers of cells with < min_pix_count
        self.true_map: the 3d map of pixels -1 is no id, otherwise the number is the cell id
    '''
    if self.cell_centers is None:
        self.generate_cell_centers()

    x_coord, y_coord, z_coord = self.generate_grid(1/pixels_per_micron)
    self.pix_per_micron = pixels_per_micron

    # NOTE(review): the max_dist argument is overridden here, as in the
    # original code ("change this later"); kept so results do not change
    max_dist = np.mean(self.dist_between_cell_centers)/2

    clf_cell_center = KNeighborsClassifier(n_neighbors=1,
                                           algorithm='kd_tree').fit(self.cell_centers.to_numpy(),
                                                                    self.cell_ids)

    pixel_coords = np.array([x_coord.ravel(),
                             y_coord.ravel(),
                             z_coord.ravel()]).T
    dist, nearest = clf_cell_center.kneighbors(pixel_coords)
    dist = dist.ravel()
    nearest = nearest.ravel()
    dist += np.random.randn(len(dist)) * noise_in_dist

    # kneighbors returns row positions of the fitted centers, map them to ids
    cell_id = self.cell_ids[nearest]
    cell_id[dist > max_dist] = -1
    pixel_map = np.reshape(cell_id, x_coord.shape)

    # pixel count per label; the background label -1 is never a cell, so it
    # must not end up in the removal list
    labels, n_pix = np.unique(pixel_map, return_counts=True)
    is_cell = labels != -1
    too_small = labels[is_cell & (n_pix < min_pix_count)]
    pixel_map[np.isin(pixel_map, too_small)] = -1

    # drop cells without pixels and cells below the threshold. Selection is
    # done by id (not by position), so it stays correct after earlier removals
    kept_ids = labels[is_cell & (n_pix >= min_pix_count)]
    keep = np.isin(self.cell_ids, kept_ids)
    removed_ids = self.cell_ids[~keep]

    self.cell_ids = self.cell_ids[keep]
    self.cell_centers.drop(index=removed_ids, inplace=True)
    self.true_map = pixel_map
201 |
def plot_true(self, cmap='nipy_spectral', ax=None, alpha=1, z_stack=5):
    '''
    Plots the cell ID grid in a single z_stack value. Cell ids are replaced
    by a random permutation of 0..n-1 so neighbouring cells get distinct colors
    --------------------
    parameters:
        cmap: the color map from matplot lib to use (default: nipy_spectral)
        ax: the matplotlib cell to plot in (default: None)
        alpha: measure of the transparency of the image (default: 1)
        z_stack: number for the z slice to plot in (default: 5)
    '''
    if self.true_map is None:
        self.assign_pixels_to_cells()

    # only the background label is excluded; dropping "the first unique
    # value" would remove a real cell when the map has no background
    cell_ids = np.unique(self.true_map.ravel())
    cell_ids = cell_ids[cell_ids != -1]
    np.random.shuffle(cell_ids)

    # relabel just the plotted slice; comparisons read the untouched slice so
    # new labels can never collide with ids that are still to be replaced
    plane = self.true_map[:, :, z_stack]
    shuffled_plane = plane.copy()
    for new_label, cell_id in enumerate(cell_ids):
        shuffled_plane[plane == cell_id] = new_label

    target = plt if ax is None else ax
    target.imshow(shuffled_plane, cmap=cmap, alpha=alpha)
229 |
230 |
def plot_celltypes(self, cmap='nipy_spectral',
                   ax=None, alpha=1, z_stack=0):
    '''
    Plots the celltype grid in a single z_stack value
    --------------------
    parameters:
        cmap: the color map from matplot lib to use (default: nipy_spectral)
        ax: the matplotlib cell to plot in (default: None)
        alpha: measure of the transparency of the image (default: 1)
        z_stack: number for the z slice to plot in (default: 0)
    --------------------
    adds:
        self.cellt_map: Adds the 3d cell type map -1 is no cell, cell type number
            is the position of the type in the sorted unique classified cell types
    '''
    # leg: sorted unique cell types, type_index: index into leg for every cell
    cellt = np.asarray(self.classified_celltypes)
    leg, type_index = np.unique(cellt, return_inverse=True)

    # The k-th smallest cell id in the map is taken to be the k-th entry of
    # classified_celltypes (same assumption as before). Ranks are computed
    # without touching the map, so the result is correct whether or not a
    # background label (-1) is present.
    flat_map = self.true_map.ravel()
    ids_in_map = np.unique(flat_map)
    ids_in_map = ids_in_map[ids_in_map != -1]
    is_cell = flat_map != -1

    cellt_vec = np.full(flat_map.shape, -1, dtype=type_index.dtype)
    cellt_vec[is_cell] = type_index[np.searchsorted(ids_in_map, flat_map[is_cell])]
    cellt_mat = cellt_vec.reshape(self.true_map.shape)
    self.cellt_map = cellt_mat

    # create legend; color 0 of the discrete map is left for the background
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed
    cmap_converter = plt.get_cmap(cmap, len(leg)+1)
    leg_patch = [mpatches.Patch(color=cmap_converter(i+1)[:3], label=label)
                 for i, label in enumerate(leg)]

    # the requested slice is plotted (it used to be slice 0 in every case)
    target = plt if ax is None else ax
    target.imshow(cellt_mat[:, :, z_stack], cmap=cmap, alpha=alpha)
    target.legend(handles=leg_patch, bbox_to_anchor=(2, 1))
277 |
278 |
def generate_nuclei_centers(self, n_pix_per_nuc=9, dtype='int32'):
    '''
    Generates the nuclei randomly within a cell
    --------------------
    parameters:
        n_pix_per_nuc: number of pixels in a nucleus (default: 9)
        dtype: data type for the nuclei tensor (default: int32)
    --------------------
    adds:
        self.nuclei: 3d tensor of nuclei, -1 is no nucleus, number is according
            to the cell ID
    '''
    if self.cell_centers is None:
        self.generate_cell_centers()
    # everything below reads the pixel map, so make sure it exists
    if self.true_map is None:
        self.assign_pixels_to_cells()

    nuclei = np.zeros(self.true_map.shape, dtype=dtype)
    nuclei -= 1

    # 27-nearest-neighbour lookup over all pixels that belong to some cell.
    # np.where and ravel walk the map in the same order, so coordinates and
    # ids line up.
    occupied = np.where(self.true_map != -1)
    occupied_ids = self.true_map.ravel()
    occupied_ids = occupied_ids[occupied_ids != -1]
    check_nuclei_surroundings = KNeighborsClassifier(n_neighbors=27)
    check_nuclei_surroundings.fit(np.vstack(occupied).T, occupied_ids)

    for i in np.unique(self.cell_ids):
        if i == -1:
            continue

        # random seed pixel inside the cell
        locs_as_mat = np.vstack(np.where(self.true_map == i)).T
        rand_index = randint(0, locs_as_mat.shape[0]-1)
        nuclei_seed = locs_as_mat[rand_index, :]

        # nucleus candidate: the pixels of this cell closest to the seed
        clf_seed = KNeighborsClassifier(n_neighbors=min(n_pix_per_nuc,
                                                        locs_as_mat.shape[0]))
        clf_seed.fit(locs_as_mat, np.arange(locs_as_mat.shape[0]))
        nuc_pix_locs = clf_seed.kneighbors([nuclei_seed])[1][0]
        nuc_pix = locs_as_mat[nuc_pix_locs, :]

        # remove nuc pixels that are on the border: keep a pixel only when
        # all of its 27 nearest occupied pixels belong to the same cell
        neighbor_ids = occupied_ids[check_nuclei_surroundings.kneighbors(nuc_pix)[1]]
        nuc_pix = nuc_pix[np.sum(neighbor_ids == i, axis=1) == 27, :]

        if nuc_pix.shape[0] > 0:
            nuclei[nuc_pix[:, 0],
                   nuc_pix[:, 1],
                   nuc_pix[:, 2]] = i
        else:
            # nothing survived the border filter, fall back to the seed pixel
            nuclei[nuclei_seed[0],
                   nuclei_seed[1],
                   nuclei_seed[2]] = i

    self.nuclei = nuclei
339 |
def compute_covariance(self, counts,
                       celltypes,
                       find_celltype_props=True):
    '''
    Computes and adds the per cell type expression statistics to the object.
    Also computes cell type proportions
    --------------------
    parameters:
        counts: count matrix of single cell data cells x genes (dataframe)
        celltypes: vector of celltypes for each cell in sc matrix
        find_celltype_props: indicating whether or not to use the celltype
            proportions from the single cell data (default: True)
    --------------------
    adds:
        self.genes: vector of the genes
        self.celltypes: vector of the (sorted, unique) celltypes
        self.ct_means: matrix with mean gene expression of each gene for each celltype
        self.ct_stds: matrix with stdev of gene expression of each gene for each celltype
            (zeros are replaced by 1)
        self.ct_covs: covariance matrix for each celltype; all zeros for a
            celltype with fewer than two cells
        self.celltype_props: if indicated, adds the celltype proportions from
            the single cell data
    '''
    celltypes = np.asarray(celltypes)
    uniq_celltypes, n_per_type = np.unique(celltypes, return_counts=True)
    n_genes = counts.shape[1]

    self.genes = counts.columns.to_numpy()
    self.celltypes = uniq_celltypes

    celltype_means = np.zeros((len(uniq_celltypes), n_genes))
    celltype_stds = np.zeros((len(uniq_celltypes), n_genes))
    cov_mats = np.zeros((len(uniq_celltypes), n_genes, n_genes))
    for i, cell in enumerate(uniq_celltypes):
        loc = np.where(celltypes == cell)[0]
        subset_cells = counts.iloc[loc, :]
        celltype_means[i, :] = np.mean(subset_cells, axis=0)
        celltype_stds[i, :] = np.std(subset_cells, axis=0)

        # a covariance needs at least two cells; with one cell np.cov yields
        # NaN and the eigenvalue check below would raise
        if len(loc) < 2:
            continue

        # atleast_2d: np.cov returns a 0-d array when there is a single gene
        x = np.atleast_2d(np.cov(subset_cells.T))
        min_eig = np.min(np.real(np.linalg.eigvals(x)))
        if min_eig < 0:
            # shift the diagonal so the smallest eigenvalue becomes positive
            x -= 100*min_eig * np.eye(*x.shape)
        cov_mats[i, :, :] = x

    self.ct_means = celltype_means
    celltype_stds[celltype_stds == 0] = 1
    self.ct_stds = celltype_stds
    self.ct_covs = cov_mats

    if find_celltype_props:
        self.celltype_props = n_per_type/len(celltypes)
395 |
def classify_celltypes(self, subtype=False, ct_list=None, st_list=None):
    '''
    Adds celltypes to the cells based on the cell type proportions
    --------------------
    parameters:
        subtype: also build the low resolution annotation; it is built as
            well when the object was created with subtype=True (default: False)
        ct_list: list of the celltype annotation for every entry of st_list
        st_list: list of subtypes; the first occurrence of a subtype decides
            which entry of ct_list it maps to
    --------------------
    adds:
        self.celltype_props: if there aren't available cell type proportions
            sets them uniform
        self.classified_celltypes: vector of the cell type for each cell
        self.classified_celltypes_lowres: (subtype only) ct_list annotation
            of every classified cell
    '''
    if self.celltypes is None:
        print('No Celltypes Available')
        return
    if self.celltype_props is None:
        print('No celltype proportions available. Assuming uniform ')
        self.celltype_props = [1/len(self.celltypes) for i in range(len(self.celltypes))]

    # inverse-cdf sampling: the first type whose cumulative proportion exceeds
    # the uniform draw. One draw per cell (a second, unused draw was removed).
    cumulative_props = np.cumsum(self.celltype_props)
    rand_unif = np.random.rand(len(self.cell_ids))
    class_celltype_index = (cumulative_props[None, :] > rand_unif[:, None]).argmax(1)

    # asarray so that a plain list of cell types can be indexed as well
    self.classified_celltypes = np.asarray(self.celltypes)[class_celltype_index]

    if subtype or self.subtype:
        ct_map = {}
        for i, ct in enumerate(st_list):
            if ct not in ct_map:
                ct_map[ct] = ct_list[i]
        self.classified_celltypes_lowres = np.array([ct_map[i] for i in self.classified_celltypes])
425 |
def generate_merfish_dge(self, dge_scaling_factor=1e1):
    '''
    Generates the merfish dge for the cells, based on their cell type and the
    per cell type statistics (ct_means, ct_stds, ct_covs)
    --------------------
    parameters:
        dge_scaling_factor: multiplies the dge matrix by this number
            sometimes the single cell computed matrix values are too small due to sparsity
            this increases the values for later use in generating spots (default: 1e1)
    --------------------
    adds:
        self.merfish_dge: cells x genes dataframe of non-negative, rounded
            counts; index is the cell id, columns are the genes
    '''
    # Row k of ct_means / ct_stds / ct_covs is looked up through
    # self.celltypes, not through the types that happened to be sampled:
    # a type missing from classified_celltypes must not shift the rows.
    celltype_order = np.asarray(self.celltypes)

    merfish_dge = np.zeros((len(self.classified_celltypes),
                            len(self.genes)))
    for i, (_, cellt) in enumerate(zip(self.cell_ids, self.classified_celltypes)):
        cellt_ind = np.where(celltype_order == cellt)[0][0]
        means = self.ct_means[cellt_ind, :]

        counts = np.random.multivariate_normal(means,
                                               self.ct_covs[cellt_ind, :, :])
        # NOTE(review): the sample is already centred on the mean and is then
        # scaled by the std and shifted by the mean again -- kept unchanged,
        # confirm that this is intended
        counts *= self.ct_stds[cellt_ind, :]
        counts += means
        merfish_dge[i, :] = counts

    merfish_dge *= dge_scaling_factor
    merfish_dge[merfish_dge < 0] = 0
    merfish_dge = pd.DataFrame(np.round(merfish_dge))
    merfish_dge.columns = self.genes
    merfish_dge.index = self.cell_ids
    self.merfish_dge = merfish_dge
462 |
#dist_from_nuc_scale 0 is uniform dist, 1 is all right next to nuc
def place_transcripts(self, dist_from_nuc_scale = 0):
    '''
    Places the spots within the cell
    --------------------
    parameters:
        dist_from_nuc_scale: indicates the uniformity of spots within a cell,
            0 is uniform distribution, as it increases, the spots cluster
            around the nucleus. Values <= 0 are replaced by 1e-3.
    --------------------
    adds:
        self.spots: spot calls DataFrame, one row per placed transcript,
            columns (x, y, z, gene); x, y, z are the indices along
            axes 0, 1, 2 of self.true_map
        self.nuc_df: nuclei DataFrame, one row per nucleus pixel,
            columns (id, x, y, z)
    '''
    t_map = self.true_map
    nuc_map = self.nuclei

    # non-positive scales fall back to a tiny value (near-uniform placement)
    if dist_from_nuc_scale <= 0:
        dist_from_nuc_scale = 1e-3

    n_exp_pdf = len(self.cell_ids)*10

    # psuedo exponential distribution: a lookup table of values in (0, 1].
    # A uniformly drawn entry is later scaled to a distance rank, so the
    # steeper the table, the more ranks land close to the nucleus.
    exp_pdf = np.sort(np.exp(dist_from_nuc_scale*np.random.rand(n_exp_pdf)))
    exp_pdf /= np.sum(exp_pdf)
    exp_cdf = np.cumsum(exp_pdf)[::-1]

    n_nuc_pix = np.sum(nuc_map != -1)
    nuc_df = np.zeros((n_nuc_pix, 4))

    dge = self.merfish_dge
    genes = np.asarray(self.genes)

    # one row per transcript in the whole expression matrix
    spots = np.zeros((int(np.sum(dge.to_numpy().ravel())), 3),
                     dtype=np.uint32)

    gene_vec = []
    spots_mat_iter = 0
    nuc_iter = 0
    for i in np.unique(nuc_map.ravel()):
        # -1 marks pixels that belong to no nucleus
        if i == -1:
            continue

        # record every pixel of this nucleus
        nuc_loc = np.array(np.where(nuc_map == i)).T
        n_nuc = nuc_loc.shape[0]
        nuc_df[nuc_iter:nuc_iter + n_nuc, 0] = i
        nuc_df[nuc_iter:nuc_iter + n_nuc, 1:] = nuc_loc
        nuc_iter += n_nuc

        # rank the pixels of the cell by distance to the nucleus centroid
        # (closest first); a direct sort replaces the per-cell KNN fit.
        # NOTE: the order among equidistant pixels may differ from the
        # order the KNN query returned.
        whole_cell_loc = np.array(np.where(t_map == i)).T
        n_locs = whole_cell_loc.shape[0]
        nuc_mid = np.mean(nuc_loc, axis=0)
        sq_dist = np.sum((whole_cell_loc - nuc_mid)**2, axis=1)
        indices = np.argsort(sq_dist, kind='stable')

        # work on a plain array so gene positions are always positional,
        # never label lookups on the gene-named Series
        cell_count = dge.loc[i, :].to_numpy()
        n_counts = int(np.sum(cell_count))

        # draw one distance rank per transcript of this cell
        random_count_locs = exp_cdf[np.array(
            np.round(np.random.rand(n_counts)*(n_exp_pdf-1)), dtype=int)]
        random_count_locs *= n_locs-1
        random_count_locs = np.array(np.round(random_count_locs), dtype=int)
        spot_locs = whole_cell_loc[indices[random_count_locs], :]

        # hand the drawn locations out gene by gene, in gene order
        non_zero_genes = np.where(cell_count != 0)[0]
        per_gene = np.clip(cell_count[non_zero_genes].astype(int), 0, None)
        n_placed = int(per_gene.sum())
        spots[spots_mat_iter:spots_mat_iter + n_placed, :] = \
            spot_locs[:n_placed, :]
        spots_mat_iter += n_placed
        gene_vec.extend(np.repeat(genes[non_zero_genes], per_gene))

    # drop rows that were allocated but never filled
    spots = spots[0:len(gene_vec), :]
    spots = pd.DataFrame(spots)
    spots.columns = ['x', 'y', 'z']
    spots['gene'] = np.array(gene_vec)

    nuc_df = pd.DataFrame(nuc_df)
    nuc_df.columns = ['id', 'x', 'y', 'z']

    self.spots = spots
    self.nuc_df = nuc_df
553 |
def place_transcripts_at_corners(self):
    '''
    Places a spot from a random gene at each of the 8 corners of the
    volume to make sure the pixel tensor size ends up the same as
    the simulated
    --------------------
    reads:
        self.grid_shape: unpacked as (y, x, z) extents and multiplied by
            self.pix_per_micron to get the extents in pixels
        self.genes: gene names to draw from
    --------------------
    adds:
        8 rows (x, y, z, gene) appended to self.spots
    '''
    y_max, x_max, z_max = self.grid_shape
    y_max *= self.pix_per_micron
    x_max *= self.pix_per_micron
    z_max *= self.pix_per_micron

    n_genes = len(self.genes)
    for x in (0, x_max-1):
        for y in (0, y_max-1):
            for z in (0, z_max-1):
                # draw a scalar: int() on a 1-element array is deprecated
                rand_gene = self.genes[int(np.random.rand()*n_genes)]
                # the new row is labelled with the current row count
                # NOTE(review): assumes self.spots has a default 0..n-1 index
                self.spots.loc[self.spots.shape[0], :] = [x, y, z, rand_gene]
570 |
def add_noise(self, avg_spots_per_cell, percent_empty_to_use):
    '''
    Adds noise to empty space
    --------------------
    parameters:
        avg_spots_per_cell: the average number of extra spots in each empty
            slot chosen; each slot gets round(u * 2 * avg_spots_per_cell)
            spots with u drawn uniformly from [0, 1)
        percent_empty_to_use: fraction of the candidate slots to put noise
            in (multiplied directly with the slot count, so 0.5 means half);
            slots are drawn with replacement
    --------------------
    adds:
        self.spots: replaced by a new DataFrame holding the previous spots
            followed by the noise spots, each with a random gene
    '''
    # use this object's own map (the code used to read a global `mdg`)
    # NOTE(review): np.where on the raw map selects every NON-ZERO voxel,
    # not only voxels equal to -1 -- confirm this is the intended
    # definition of "empty"
    empty_slots = np.where(self.true_map)
    nempty = len(empty_slots[0])
    noise_slots = np.random.choice(np.arange(nempty),
                                   size = int(nempty * percent_empty_to_use))
    num_spots = np.round(np.random.rand(len(noise_slots)) * 2 * avg_spots_per_cell)
    num_genes_to_add = int(np.sum(num_spots))
    rand_genes = self.genes[(np.random.rand(num_genes_to_add)*len(self.genes)).astype(int)]

    # slots that drew zero spots contribute nothing
    has_spots = num_spots != 0
    noise_slots = noise_slots[has_spots]
    num_spots = num_spots[has_spots].astype(int)

    n_old = self.spots.shape[0]
    new_spots = np.zeros((n_old + len(rand_genes), 4))
    new_spots[0:n_old, 0:3] = self.spots.loc[:, ['x', 'y', 'z']].to_numpy()

    # repeat each chosen slot's coordinates once per spot it received
    noise_xyz = np.stack([empty_slots[0][noise_slots],
                          empty_slots[1][noise_slots],
                          empty_slots[2][noise_slots]], axis=1)
    new_spots[n_old:, 0:3] = np.repeat(noise_xyz, num_spots, axis=0)

    new_spots = pd.DataFrame(new_spots)
    new_spots.columns = self.spots.columns
    new_spots['gene'] = np.concatenate((self.spots.gene.to_numpy(),
                                        rand_genes))
    self.spots = new_spots
605 |
def merge_cells(self, n_iter = 1):
    '''
    Merges cells to create non-circular shapes
    --------------------
    parameters:
        n_iter: number of times to merge cells. (Default: 1)
    --------------------
    modifies:
        self.true_map: ids of merged cells are overwritten in place
        self.cell_ids: relabelled the same way, then reduced to its
            unique values
    '''
    for _ in range(n_iter):
        # every voxel coordinate and its cell id, in C (row-major) order
        voxel_xyz = np.indices(self.true_map.shape).reshape(3, -1).T
        voxel_ids = self.true_map.flatten()

        # ids of the 27 nearest voxels of every voxel; column 0 is
        # treated as the voxel itself
        knn = KNeighborsClassifier(27).fit(voxel_xyz, voxel_ids)
        neigh_ids = voxel_ids[knn.kneighbors(voxel_xyz)[1]]

        # a cell becomes a merge candidate when one of its voxels has more
        # than 5 neighbours belonging to other cells (-1 never counts)
        combine_map = {}
        for row in neigh_ids:
            own_id = row[0]
            if own_id == -1:
                continue
            others = row[1:]
            foreign = np.where((others != own_id) &
                               (others != -1))[0]
            if len(foreign) > 5:
                # merge target: the first differing neighbour
                combine_map[own_id] = others[foreign[0]]

        # apply the merges, skipping any pair in which either cell
        # already took part in a merge during this pass
        flipped_before = []
        for source, target in combine_map.items():
            if source in flipped_before or target in flipped_before:
                continue
            self.true_map[self.true_map == source] = target
            self.cell_ids[self.cell_ids == source] = target
            flipped_before.append(source)
            flipped_before.append(target)
        self.cell_ids = np.unique(self.cell_ids)
643 |
644 |
645 |
--------------------------------------------------------------------------------
/CoreFunctions/environment.yml:
--------------------------------------------------------------------------------
1 | name: jsta
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - anaconda
6 | - defaults
7 | dependencies:
8 | - _libgcc_mutex=0.1=conda_forge
9 | - _openmp_mutex=4.5=1_gnu
10 | - _tflow_select=2.3.0=mkl
11 | - absl-py=1.0.0=pyhd8ed1ab_0
12 | - aiohttp=3.8.1=py38h497a2fe_0
13 | - aiosignal=1.2.0=pyhd8ed1ab_0
14 | - alsa-lib=1.2.3=h516909a_0
15 | - aom=3.3.0=h27087fc_1
16 | - argon2-cffi=21.3.0=pyhd8ed1ab_0
17 | - argon2-cffi-bindings=21.2.0=py38h497a2fe_1
18 | - arpack=3.7.0=hdefa2d7_2
19 | - astor=0.8.1=pyh9f0ad1d_0
20 | - asttokens=2.0.5=pyhd8ed1ab_0
21 | - astunparse=1.6.3=pyhd8ed1ab_0
22 | - async-timeout=4.0.2=pyhd8ed1ab_0
23 | - attrs=21.4.0=pyhd8ed1ab_0
24 | - backcall=0.2.0=pyh9f0ad1d_0
25 | - backports=1.0=py_2
26 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
27 | - black=22.1.0=pyhd8ed1ab_0
28 | - bleach=4.1.0=pyhd8ed1ab_0
29 | - blinker=1.4=py_1
30 | - blosc=1.21.0=h9c3ff4c_0
31 | - brotli=1.0.9=h7f98852_6
32 | - brotli-bin=1.0.9=h7f98852_6
33 | - brotlipy=0.7.0=py38h497a2fe_1003
34 | - brunsli=0.1=h9c3ff4c_0
35 | - bzip2=1.0.8=h7f98852_4
36 | - c-ares=1.18.1=h7f98852_0
37 | - c-blosc2=2.0.4=h5f21a17_1
38 | - ca-certificates=2021.10.8=ha878542_0
39 | - cachetools=5.0.0=pyhd8ed1ab_0
40 | - certifi=2021.10.8=py38h578d9bd_1
41 | - cffi=1.15.0=py38h3931269_0
42 | - cfitsio=4.0.0=h9a35b8e_0
43 | - charls=2.3.4=h9c3ff4c_0
44 | - charset-normalizer=2.0.12=pyhd8ed1ab_0
45 | - click=8.0.4=py38h578d9bd_0
46 | - cloudpickle=2.0.0=pyhd8ed1ab_0
47 | - cryptography=36.0.1=py38h3e25421_0
48 | - cycler=0.11.0=pyhd8ed1ab_0
49 | - cytoolz=0.11.2=py38h497a2fe_1
50 | - dask-core=2022.2.0=pyhd8ed1ab_0
51 | - dataclasses=0.8=pyhc8e2a94_3
52 | - dbus=1.13.6=h5008d03_3
53 | - debugpy=1.5.1=py38h709712a_0
54 | - decorator=5.1.1=pyhd8ed1ab_0
55 | - defusedxml=0.7.1=pyhd8ed1ab_0
56 | - entrypoints=0.4=pyhd8ed1ab_0
57 | - executing=0.8.2=pyhd8ed1ab_0
58 | - expat=2.4.6=h27087fc_0
59 | - flit-core=3.7.1=pyhd8ed1ab_0
60 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
61 | - font-ttf-inconsolata=3.000=h77eed37_0
62 | - font-ttf-source-code-pro=2.038=h77eed37_0
63 | - font-ttf-ubuntu=0.83=hab24e00_0
64 | - fontconfig=2.13.96=ha180cfb_0
65 | - fonts-conda-ecosystem=1=0
66 | - fonts-conda-forge=1=0
67 | - fonttools=4.29.1=py38h497a2fe_0
68 | - freetype=2.10.4=h0708190_1
69 | - frozenlist=1.3.0=py38h497a2fe_0
70 | - fsspec=2022.2.0=pyhd8ed1ab_0
71 | - gast=0.3.3=py_0
72 | - gettext=0.19.8.1=h73d1719_1008
73 | - giflib=5.2.1=h36c2ea0_2
74 | - glpk=4.65=h9202a9a_1004
75 | - gmp=6.2.1=h58526e2_0
76 | - google-auth=2.6.0=pyh6c4a22f_1
77 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
78 | - google-pasta=0.2.0=pyh8c360ce_0
79 | - grpcio=1.44.0=py38hdd6454d_0
80 | - gst-plugins-base=1.18.5=hf529b03_3
81 | - gstreamer=1.18.5=h9f60fe5_3
82 | - h5py=2.10.0=nompi_py38h9915d05_106
83 | - hdf5=1.10.6=nompi_h6a2412b_1114
84 | - icu=69.1=h9c3ff4c_0
85 | - idna=3.3=pyhd8ed1ab_0
86 | - igraph=0.9.6=ha184e22_0
87 | - imagecodecs=2022.2.22=py38h58c7917_0
88 | - imageio=2.16.0=pyhcf75d05_0
89 | - importlib-metadata=4.11.1=py38h578d9bd_0
90 | - importlib_resources=5.4.0=pyhd8ed1ab_0
91 | - ipykernel=6.9.1=py38he5a9106_0
92 | - ipython=8.0.1=py38h578d9bd_2
93 | - ipython_genutils=0.2.0=py_1
94 | - jbig=2.1=h7f98852_2003
95 | - jedi=0.18.1=py38h578d9bd_0
96 | - jinja2=3.0.3=pyhd8ed1ab_0
97 | - joblib=1.1.0=pyhd8ed1ab_0
98 | - jpeg=9e=h7f98852_0
99 | - jsonschema=4.4.0=pyhd8ed1ab_0
100 | - jupyter_client=7.1.2=pyhd8ed1ab_0
101 | - jupyter_contrib_core=0.3.3=py_2
102 | - jupyter_contrib_nbextensions=0.5.1=pyhd8ed1ab_2
103 | - jupyter_core=4.9.2=py38h578d9bd_0
104 | - jupyter_highlight_selected_word=0.2.0=py38h578d9bd_1005
105 | - jupyter_latex_envs=1.4.6=pyhd8ed1ab_1002
106 | - jupyter_nbextensions_configurator=0.4.1=py38h578d9bd_2
107 | - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0
108 | - jxrlib=1.1=h7f98852_2
109 | - keras-preprocessing=1.1.2=pyhd8ed1ab_0
110 | - kiwisolver=1.3.2=py38h1fd1430_1
111 | - krb5=1.19.2=hcc1bbae_3
112 | - lcms2=2.12=hddcbb42_0
113 | - ld_impl_linux-64=2.36.1=hea4e1c9_2
114 | - leidenalg=0.8.9=py38hfa26641_0
115 | - lerc=3.0=h9c3ff4c_0
116 | - libaec=1.0.6=h9c3ff4c_0
117 | - libavif=0.9.3=h166bdaf_1
118 | - libblas=3.9.0=13_linux64_openblas
119 | - libbrotlicommon=1.0.9=h7f98852_6
120 | - libbrotlidec=1.0.9=h7f98852_6
121 | - libbrotlienc=1.0.9=h7f98852_6
122 | - libcblas=3.9.0=13_linux64_openblas
123 | - libclang=13.0.1=default_hc23dcda_0
124 | - libcurl=7.81.0=h2574ce0_0
125 | - libdeflate=1.10=h7f98852_0
126 | - libedit=3.1.20191231=he28a2e2_2
127 | - libev=4.33=h516909a_1
128 | - libevent=2.1.10=h9b69904_4
129 | - libffi=3.4.2=h7f98852_5
130 | - libgcc-ng=11.2.0=h1d223b6_12
131 | - libgfortran-ng=11.2.0=h69a702a_12
132 | - libgfortran5=11.2.0=h5c6108e_12
133 | - libglib=2.70.2=h174f98d_4
134 | - libgomp=11.2.0=h1d223b6_12
135 | - libiconv=1.16=h516909a_0
136 | - liblapack=3.9.0=13_linux64_openblas
137 | - libllvm11=11.1.0=hf817b99_3
138 | - libllvm13=13.0.1=hf817b99_2
139 | - libnghttp2=1.47.0=h727a467_0
140 | - libnsl=2.0.0=h7f98852_0
141 | - libogg=1.3.4=h7f98852_1
142 | - libopenblas=0.3.18=pthreads_h8fe5266_0
143 | - libopus=1.3.1=h7f98852_1
144 | - libpng=1.6.37=h21135ba_2
145 | - libpq=14.2=hd57d9b9_0
146 | - libprotobuf=3.19.4=h780b84a_0
147 | - libsodium=1.0.18=h36c2ea0_1
148 | - libssh2=1.10.0=ha56f1ee_2
149 | - libstdcxx-ng=11.2.0=he4da1e4_12
150 | - libtiff=4.3.0=h542a066_3
151 | - libuuid=2.32.1=h7f98852_1000
152 | - libvorbis=1.3.7=h9c3ff4c_0
153 | - libwebp=1.2.2=h3452ae3_0
154 | - libwebp-base=1.2.2=h7f98852_1
155 | - libxcb=1.13=h7f98852_1004
156 | - libxkbcommon=1.0.3=he3ba5ed_0
157 | - libxml2=2.9.12=h885dcf4_1
158 | - libxslt=1.1.33=h0ef7038_3
159 | - libzlib=1.2.11=h36c2ea0_1013
160 | - libzopfli=1.0.3=h9c3ff4c_0
161 | - llvmlite=0.38.0=py38h4630a5e_0
162 | - locket=0.2.0=py_2
163 | - lxml=4.8.0=py38hf1fe3a4_0
164 | - lz4-c=1.9.3=h9c3ff4c_1
165 | - lzo=2.10=h516909a_1000
166 | - markdown=3.3.6=pyhd8ed1ab_0
167 | - markupsafe=2.1.0=py38h0a891b7_0
168 | - matplotlib=3.5.1=py38h578d9bd_0
169 | - matplotlib-base=3.5.1=py38hf4fb855_0
170 | - matplotlib-inline=0.1.3=pyhd8ed1ab_0
171 | - metis=5.1.0=h58526e2_1006
172 | - mistune=0.8.4=py38h497a2fe_1005
173 | - mock=4.0.3=py38h578d9bd_2
174 | - mpfr=4.1.0=h9202a9a_1
175 | - multidict=6.0.2=py38h497a2fe_0
176 | - munkres=1.0.7=py_1
177 | - mypy_extensions=0.4.3=py38h578d9bd_4
178 | - mysql-common=8.0.28=ha770c72_0
179 | - mysql-libs=8.0.28=hfa10184_0
180 | - nbclient=0.5.11=pyhd8ed1ab_0
181 | - nbconvert=6.4.2=py38h578d9bd_0
182 | - nbformat=5.1.3=pyhd8ed1ab_0
183 | - ncurses=6.3=h9c3ff4c_0
184 | - nest-asyncio=1.5.4=pyhd8ed1ab_0
185 | - networkx=2.6.3=pyhd8ed1ab_1
186 | - nomkl=1.0=h5ca1d4c_0
187 | - notebook=6.4.8=pyha770c72_0
188 | - nspr=4.32=h9c3ff4c_1
189 | - nss=3.74=hb5efdd6_0
190 | - numba=0.55.1=py38h4bf6c61_0
191 | - numexpr=2.8.0=py38h6045d29_101
192 | - numpy=1.21.5=py38h87f13fb_0
193 | - oauthlib=3.2.0=pyhd8ed1ab_0
194 | - openjpeg=2.4.0=hb52868f_1
195 | - openssl=1.1.1l=h7f98852_0
196 | - opt_einsum=3.3.0=pyhd8ed1ab_1
197 | - packaging=21.3=pyhd8ed1ab_0
198 | - pandas=1.4.1=py38h43a58ef_0
199 | - pandoc=2.17.1.1=ha770c72_0
200 | - pandocfilters=1.5.0=pyhd8ed1ab_0
201 | - parso=0.8.3=pyhd8ed1ab_0
202 | - partd=1.2.0=pyhd8ed1ab_0
203 | - pathspec=0.9.0=pyhd8ed1ab_0
204 | - patsy=0.5.2=pyhd8ed1ab_0
205 | - pcre=8.45=h9c3ff4c_0
206 | - pexpect=4.8.0=pyh9f0ad1d_2
207 | - pickleshare=0.7.5=py_1003
208 | - pillow=9.0.1=py38h0ee0e06_2
209 | - pip=22.0.3=pyhd8ed1ab_0
210 | - platformdirs=2.5.1=pyhd8ed1ab_0
211 | - prometheus_client=0.13.1=pyhd8ed1ab_0
212 | - prompt-toolkit=3.0.27=pyha770c72_0
213 | - protobuf=3.19.4=py38h709712a_0
214 | - pthread-stubs=0.4=h36c2ea0_1001
215 | - ptyprocess=0.7.0=pyhd3deb0d_0
216 | - pure_eval=0.2.2=pyhd8ed1ab_0
217 | - pyasn1=0.4.8=py_0
218 | - pyasn1-modules=0.2.7=py_0
219 | - pycparser=2.21=pyhd8ed1ab_0
220 | - pygments=2.11.2=pyhd8ed1ab_0
221 | - pyjwt=2.3.0=pyhd8ed1ab_1
222 | - pyopenssl=22.0.0=pyhd8ed1ab_0
223 | - pyparsing=3.0.7=pyhd8ed1ab_0
224 | - pyqt=5.12.3=py38h578d9bd_8
225 | - pyqt-impl=5.12.3=py38h0ffb2e6_8
226 | - pyqt5-sip=4.19.18=py38h709712a_8
227 | - pyqtchart=5.12=py38h7400c14_8
228 | - pyqtwebengine=5.12.1=py38h7400c14_8
229 | - pyrsistent=0.18.1=py38h497a2fe_0
230 | - pysocks=1.7.1=py38h578d9bd_4
231 | - pytables=3.6.1=py38hc386592_3
232 | - python=3.8.12=ha38a3c6_3_cpython
233 | - python-dateutil=2.8.2=pyhd8ed1ab_0
234 | - python-igraph=0.9.9=py38h2af5540_0
235 | - python_abi=3.8=2_cp38
236 | - pytz=2021.3=pyhd8ed1ab_0
237 | - pyu2f=0.1.5=pyhd8ed1ab_0
238 | - pywavelets=1.2.0=py38h6c62de6_1
239 | - pyyaml=6.0=py38h497a2fe_3
240 | - pyzmq=22.3.0=py38h2035c66_1
241 | - qt=5.12.9=ha98a1a1_5
242 | - readline=8.1=h46c0cb4_0
243 | - requests=2.27.1=pyhd8ed1ab_0
244 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0
245 | - rsa=4.8=pyhd8ed1ab_0
246 | - scikit-image=0.19.2=py38h43a58ef_0
247 | - scikit-learn=1.0.2=py38h1561384_0
248 | - scipy=1.8.0=py38h56a6a73_1
249 | - seaborn=0.11.2=hd8ed1ab_0
250 | - seaborn-base=0.11.2=pyhd8ed1ab_0
251 | - send2trash=1.8.0=pyhd8ed1ab_0
252 | - setuptools=60.9.3=py38h578d9bd_0
253 | - six=1.16.0=pyh6c4a22f_0
254 | - snappy=1.1.8=he1b5a44_3
255 | - sqlite=3.37.0=h9cd32fc_0
256 | - stack_data=0.2.0=pyhd8ed1ab_0
257 | - statsmodels=0.13.2=py38h6c62de6_0
258 | - suitesparse=5.10.1=h9e50725_1
259 | - tbb=2021.5.0=h4bd325d_0
260 | - tensorboard=2.8.0=pyhd8ed1ab_1
261 | - tensorboard-data-server=0.6.0=py38h3e25421_1
262 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
263 | - tensorflow=2.2.0=mkl_py38h6d3daf0_0
264 | - tensorflow-base=2.2.0=mkl_py38h5059a2d_0
265 | - tensorflow-estimator=2.6.0=py38h709712a_0
266 | - termcolor=1.1.0=py_2
267 | - terminado=0.13.1=py38h578d9bd_0
268 | - testpath=0.6.0=pyhd8ed1ab_0
269 | - texttable=1.6.4=pyhd8ed1ab_0
270 | - threadpoolctl=3.1.0=pyh8a188c0_0
271 | - tifffile=2022.2.9=pyhd8ed1ab_0
272 | - tk=8.6.12=h27826a3_0
273 | - tomli=2.0.1=pyhd8ed1ab_0
274 | - toolz=0.11.2=pyhd8ed1ab_0
275 | - tornado=6.1=py38h497a2fe_2
276 | - traitlets=5.1.1=pyhd8ed1ab_0
277 | - typed-ast=1.5.2=py38h497a2fe_0
278 | - typing-extensions=4.1.1=hd8ed1ab_0
279 | - typing_extensions=4.1.1=pyha770c72_0
280 | - unicodedata2=14.0.0=py38h497a2fe_0
281 | - urllib3=1.26.8=pyhd8ed1ab_1
282 | - wcwidth=0.2.5=pyh9f0ad1d_2
283 | - webencodings=0.5.1=py_1
284 | - werkzeug=2.0.3=pyhd8ed1ab_1
285 | - wheel=0.37.1=pyhd8ed1ab_0
286 | - wrapt=1.13.3=py38h497a2fe_1
287 | - xorg-libxau=1.0.9=h7f98852_0
288 | - xorg-libxdmcp=1.1.3=h7f98852_0
289 | - xz=5.2.5=h516909a_1
290 | - yaml=0.2.5=h7f98852_2
291 | - yarl=1.7.2=py38h497a2fe_1
292 | - zeromq=4.3.4=h9c3ff4c_1
293 | - zfp=0.5.5=h9c3ff4c_8
294 | - zipp=3.7.0=pyhd8ed1ab_1
295 | - zlib=1.2.11=h36c2ea0_1013
296 | - zstd=1.5.2=ha95c52a_0
297 |
--------------------------------------------------------------------------------
/CoreFunctions/get_distances.c:
--------------------------------------------------------------------------------
1 | //get_distances.c
2 | #include
3 |
4 | void get_distances(const double **surroundings, const double **nuc_assign, const double **dists,const int num_pix, const int num_nuc, double **dist_mat){
5 |
6 | size_t i, j,k;
7 | double surr;
8 | double nuc_dist;
9 |
10 | //iterate through all pixels
11 | for(i=0;i
3 |
/*
 * Flat row-major offset of element (i_in, j_in, k_in) in a 3-D volume
 * whose second and third dimensions are `wid` and `dep`.
 * `hei` is part of the signature but does not enter the computation.
 */
int get_raveled_index(int hei, int wid, int dep, int i_in, int j_in, int k_in){
    const int slab_offset = i_in * wid * dep; /* whole (wid x dep) slabs */
    const int row_offset = j_in * dep;        /* whole rows inside the slab */

    (void)hei;
    return slab_offset + row_offset + k_in;
}
7 |
/*
 * For every voxel of a (height x width x depth) volume, count how many
 * voxels of its 3x3x3 neighbourhood carry the same value and how many
 * carry a different one.
 *
 * surroundings: read as a row-major (height+2, width+2, depth+2) array,
 *               i.e. the volume with a one-voxel border on every side.
 * surr_count:   incremented once per neighbour whose value differs from
 *               the centre voxel and is not -2.
 * same_count:   incremented once per neighbour whose value equals the
 *               centre voxel; the centre itself is part of the
 *               neighbourhood and is therefore counted too.
 *
 * Both outputs are row-major (height, width, depth) arrays that are only
 * incremented here, never reset, so the caller has to initialise them.
 * NOTE(review): -2 looks like the value used to fill the border - confirm.
 */
void get_sur(const int *surroundings, int *surr_count, int *same_count, int height, int width, int depth){
    const int pad_h = height + 2;
    const int pad_w = width + 2;
    const int pad_d = depth + 2;
    int i, j, k;
    int di, dj, dk;
    int out_ind, centre, neighbour;

    for (i = 1; i <= height; i++) {
        for (j = 1; j <= width; j++) {
            for (k = 1; k <= depth; k++) {
                /* padded coordinates are shifted by one w.r.t. the output */
                out_ind = get_raveled_index(height, width, depth, i - 1, j - 1, k - 1);
                centre = surroundings[get_raveled_index(pad_h, pad_w, pad_d, i, j, k)];

                for (di = -1; di < 2; di++) {
                    for (dj = -1; dj < 2; dj++) {
                        for (dk = -1; dk < 2; dk++) {
                            neighbour = surroundings[get_raveled_index(pad_h, pad_w, pad_d,
                                                                       i + di, j + dj, k + dk)];
                            if (neighbour == centre) {
                                same_count[out_ind]++;
                            } else if (neighbour != -2) {
                                surr_count[out_ind]++;
                            }
                        }
                    }
                }
            }
        }
    }
}
40 |
--------------------------------------------------------------------------------
/CoreFunctions/requirements.txt:
--------------------------------------------------------------------------------
1 | python=3.6.10
2 | tensorflow
3 | scikit-learn
4 | scikit-image
5 | numpy
6 | notebook
7 | matplotlib
8 | pandas
9 | seaborn
10 | scikit-learn
11 | statsmodels
12 | numba
13 | pytables
14 | python-igraph
15 | leidenalg
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # JSTA: joint cell segmentation and cell type annotation for spatial transcriptomics
2 |
3 |
4 |
5 | Initially, watershed-based segmentation is performed and a cell-level type classifier, parameterized by a deep neural network (DNN), is trained on the NCTT data. The cell-level classifier then assigns cell (sub)types (red and blue in this cartoon example). Based on the current assignment of pixels to cell (sub)types, a new DNN is trained to estimate the probability that each pixel comes from each of the possible (sub)types, given the local RNA density at that pixel. In this example, two pixels that were initially assigned to the “red” cells received a higher probability of being of the “blue” type. Since the neighboring cell is of type “blue”, they were reassigned to that cell during the segmentation update. Using the updated segmentation and the cell type classifier, cell types are then reassigned. The tasks of training, segmentation, and classification are repeated over many iterations until convergence. See the full manuscript here: https://doi.org/10.15252/msb.202010108
6 |
7 | ## Download and Install:
8 | ### In terminal:
9 | ```git clone https://github.com/wollmanlab/JSTA.git```
10 | ### Install python dependencies:
11 | With pip:
12 | ``` pip install -r CoreFunctions/requirements.txt ```
13 | With conda:
14 | ```conda env create -f CoreFunctions/environment.yml ```
15 | or
16 | ```conda install --file CoreFunctions/requirements.txt```
17 | ### Compile the C files and add the current path to the functions:
18 | ```./install.sh```
19 |
20 | ## Tutorials:
21 | ### tutorials/SimulatingData.ipynb
22 | Simulate spatial transcriptomics data from a reference dataset:
23 | Files needed:
24 | - scRNAseq Reference:
25 | - cells x genes matrix
26 | - Reference celltypes:
27 | - cell type vector
28 |
29 |
30 |
31 | Representative synthetic dataset of nuclei (black) and mRNAs, where each color represents a different gene (left). Ground truth boundaries of the cells. Each color represents a different cell (right).
32 |
33 | ### tutorials/RunningJSTA.ipynb
34 | Run our quick implementation of density estimation, and segmentation with JSTA!
35 | Files needed:
36 | - mRNA spots:
37 | - spots x 4 matrix
38 | - Columns: gene name, x, y, z
39 | - Rows: Each mRNA spot
40 | - nuclei:
41 | - pixels x 4 matrix;
42 | - Columns: cell id, x, y, z
43 | - Rows: Each pixel of nucleus
44 | - scRNAseq Reference:
45 | - cells x genes matrix
46 | - Reference celltypes:
47 | - cell type vector
48 |