├── .gitignore ├── .idea ├── .gitignore ├── JSTA.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── CoreFunctions ├── FindSpatialDEGs.py ├── JSTA.py ├── MerfishSimulator.py ├── environment.yml ├── get_distances.c ├── get_number_similar_surroundings.c └── requirements.txt ├── README.md ├── images ├── JSTAOverview.png ├── SegmentedHippocampus.png └── SimulatedData.png ├── install.sh ├── ref_data ├── celltypes.txt.gz └── sc_ref.csv.gz ├── simulated_example ├── celltypes.npy ├── nuclei.npy ├── spots.npy └── true_map.npy └── tutorials ├── FindSpatialDEGs.ipynb ├── RunningJSTA.ipynb ├── SimulatingData.ipynb └── data_for_spatial ├── cell_centers.npy ├── cells_mat.txt.gz └── celltypes.pkl /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .ipynb_checkpoints 3 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/JSTA.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CoreFunctions/FindSpatialDEGs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.neighbors import KNeighborsClassifier 3 | 4 | def get_neighborhoods_with_null(mat, ct, center, cell, k, niter): 5 | ''' 6 | Gets the local neighborhood expression, as well as a null distribution 7 | for cell type "cell" 8 | --------------------- 9 | parameters: 10 | mat: digital gene expression matrix 11 | ct: vector of all cell types 12 | center: x, y, z coordinates for each cell 13 | cell: cell type to find spDEGs for 14 | k: Number of cells in a local neighborhood 15 | niter: number of permutations 16 | --------------------- 17 | returns: 18 | gene_mat: local neighborhood expression 19 | nullm: Null distribution of local neighborhood expression 20 | ''' 21 | # Finds where the cells of a given cell type are 22 | locs = np.where(ct == cell)[0] 23 | sub_mat = mat.iloc[locs, :] 24 | 25 | # hard coded, if number of cells is less than 26 | # 4 times the neighborhood size, don't compute spDEGs 27 | # this can be changed 28 | if sub_mat.shape[0] < 4 * k: 29 | print('Only ', str(sub_mat.shape[0]), ' cells present') 30 | return None 31 | 32 | # Train KNN classifier to find neighborshoods 33 | sub_cent = center[locs, :] 34 | sub_ct = ct[locs] 35 | clf = KNeighborsClassifier(n_neighbors=k).fit(sub_cent, sub_ct) 36 | dist, ids = clf.kneighbors(sub_cent) 37 | 38 | # get the null distribution of neighobrhoods 39 | nullm = get_null(sub_mat, ids, niter) 40 | 41 | # get the real neighborhood expression 42 | gene_mat = get_local_neigh(sub_mat, ids) 43 | 44 | return gene_mat, nullm 45 | 46 | 47 | def get_null(sm, ids, niter): 48 | ''' 49 | Gets the null distribution of local neighborhoods 50 | --------------------- 51 | parameters: 52 | sm: DGE of cells in the current cell type 53 | ids: indices of nearest neighbors 54 | niter: Number of permutations 55 | --------------------- 56 | returns: 57 | nullmat: null distribution of local neighborhood expression 58 | ''' 59 | nullmat = np.zeros((niter, 60 | sm.shape[0], 61 | sm.shape[1])) 62 | ids_rand = ids.copy() 63 | for i in range(niter): 64 | np.random.shuffle(ids_rand.ravel()) 65 | nullmat[i, :, :] = get_local_neigh(sm, ids_rand) 66 | return nullmat 67 | 68 | 69 | def get_local_neigh(cm, ids): 70 | ''' 71 | Gets the local expression of a neighborhood around each cell 72 | --------------------- 73 | parameters: 74 | cm: DGE of cells in current cell type 75 | ids: indices of nearest neighbors 76 | --------------------- 77 | returns: 78 | neigh_mat: local neighborhood expression around each cell 79 | ''' 80 | neigh_mat = np.zeros_like(cm) 81 | for i in range(ids.shape[0]): 82 | temp_mat = cm.iloc[ids[i, :], :] 83 | neigh_mat[i, :] = np.mean(temp_mat, axis=0) 84 | 85 | return neigh_mat 86 | 87 | def get_spatial_pval(cells_mat, celltypes, cell_cent, ct, nneighbors, nperm): 88 | ''' 89 | Get the pvalue of spDEGs for each gene in a given cell type 90 | --------------------- 91 | parameters: 92 | cells_mat: DGE matrix (cells x genes) 93 | celltypes: vector of cell types 94 | cell_cent: locations of each cell, (cells x euclidean (xyz)) 95 | ct: cell type of interest 96 | nneighbors: number of neighbors in a local neighborhood 97 | nperm: number of permutation to generate a null distribution 98 | --------------------- 99 | returns: 100 | ps_mat_raveled: list of p values, and gene indices 101 | returns: None if there aren't enough cells 102 | ''' 103 | neighborhoods_output = get_neighborhoods_with_null(cells_mat, celltypes, 104 | cell_cent, ct,nneighbors, 105 | nperm) 106 | 107 | if neighborhoods_output is not None: 108 | 109 | gm, nm = neighborhoods_output 110 | 111 | ps_mat_raveled = [] 112 | for i in range(gm.shape[1]): 113 | nm_rav = nm[:, :, i] 114 | 115 | var_vec = np.var(nm_rav, axis=1) 116 | real_var = np.var(gm[:, i]) 117 | 118 | p = 1 - (np.sum(real_var > var_vec) / len(var_vec)) 119 | 120 | ps_mat_raveled.append([p, i]) 121 | 122 | return ps_mat_raveled 123 | 124 | else: 125 | print('Not enough cells in cell type.') 126 | return None 127 | -------------------------------------------------------------------------------- /CoreFunctions/JSTA.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import pickle as pkl 6 | from math import exp, sqrt, log 7 | import scipy.spatial 8 | import os 9 | from time import time 10 | from collections import Counter 11 | from sklearn.preprocessing import scale 12 | from skimage.segmentation import watershed 13 | from numpy.ctypeslib import ndpointer 14 | import ctypes 15 | from tensorflow.keras.layers import Input, Dense, BatchNormalization 16 | from tensorflow.keras.models import Model 17 | from tensorflow.keras.regularizers import l2, l1 18 | from tensorflow.keras.optimizers import Adam 19 | from sklearn.model_selection import train_test_split 20 | from sklearn import neighbors 21 | 22 | path_to_file = #REPLACE-WITH-PATH 23 | 24 | def find_empty_pixels(pix): 25 | ''' 26 | Finds which pixels are empty to be used later to exclude border pix 27 | --------------------- 28 | parameters: 29 | pix: 4d array of pixel identities 30 | --------------------- 31 | return: 32 | empty_pix: 3d array of pixels 0 if empty 1 if not empty 33 | ''' 34 | empty_pix = np.sum(pix,axis=3) 35 | empty_pix[empty_pix > 0] = 1 36 | 37 | return empty_pix 38 | 39 | 40 | def add_coords_to_cell_matrix(cm, nuc): 41 | ''' 42 | Adds the cell center coordinates to the cell matrix 43 | --------------------- 44 | parameters: 45 | cm: cell count matrix 46 | nuc: dataframe of nuclei 47 | return: 48 | cm: cell count matrix with coordinates appended to the front 49 | ''' 50 | cell_centers, nuc_id = get_cell_centers(nuc) 51 | nuc_xy = pd.DataFrame(np.zeros((cm.shape[0],3)),columns=['x','y','z']) 52 | nuc_xy.index = cm.index 53 | nuc_xy.loc[:,:] = cell_centers 54 | cm = pd.concat([nuc_xy,cm],axis=1) 55 | 56 | return cm 57 | 58 | def get_cell_centers(nuc): 59 | ''' 60 | Finds the center of each nucleus 61 | --------------------- 62 | parameters: 63 | nuc: dataframe of nuclei 64 | --------------------- 65 | return: 66 | cell_centers: array of the x,y,z center of each nuclus 67 | nuc_id: array of the nucleus id in the same order as cell centers 68 | ''' 69 | cell_centers = [] 70 | nuc_id = [] 71 | for ind in np.unique(nuc.id): 72 | nuc_id.append(int(ind)) 73 | temp = nuc[nuc.id == ind] 74 | x_cent, y_cent, z_cent = np.mean(temp.x), np.mean(temp.y), np.mean(temp.z) 75 | cell_centers.append([x_cent, y_cent, z_cent]) 76 | cell_centers = np.array(cell_centers) 77 | nuc_id = np.array(nuc_id) 78 | return cell_centers, nuc_id 79 | 80 | def get_matrix_of_cells(pix, cell_assign, nuclei_clusters): 81 | ''' 82 | Gets the count matrix based on current cell assignment and true input pixels 83 | --------------------- 84 | parameters: 85 | pix: 4d array of pixels to with their gene expression 86 | cell_assign: current assignments of each pixel to a cell 87 | nuclei_clusters: dataframe of nuclei points 88 | --------------------- 89 | return: cells_mat nxm count matrix where n is number of cells m is number of genes 90 | ''' 91 | n_gene = pix.shape[3] 92 | n_cell = len(np.unique(nuclei_clusters.id)) 93 | cells_mat = pd.DataFrame(np.zeros((n_cell,n_gene)),columns = np.arange(n_gene)) 94 | cells_mat.index = np.unique(nuclei_clusters.id).astype(int) 95 | background = np.zeros(n_gene) 96 | for i in np.unique(nuclei_clusters.id): 97 | id_loc = np.where(cell_assign == i) 98 | cells_mat.loc[i,:] = np.sum(pix[id_loc[0], 99 | id_loc[1], 100 | id_loc[2],:],axis=0) 101 | 102 | return cells_mat 103 | 104 | 105 | def get_number_similar_surroundings(cell_assign): 106 | ''' 107 | counts the cell types of the surrounding pixels for each pixel 108 | --------------------- 109 | parameters: 110 | cell_assign: current assignment of each pixel 111 | num_classes: the number of cell types 112 | celltype_pred: predictions of each cells celltype 113 | returns: 114 | surr_count: 4d array for each pixel the number of surroundings that are of each cell type 115 | ''' 116 | surr_count = np.zeros_like(cell_assign) 117 | same_count = np.zeros_like(cell_assign) 118 | 119 | surroundings = np.pad(cell_assign,(1),'constant',constant_values=(-2)) 120 | height = cell_assign.shape[0]; width = cell_assign.shape[1]; depth = cell_assign.shape[2] 121 | 122 | surroundings = np.array(surroundings, dtype=int) 123 | surr_count = np.array(surr_count, dtype=int) 124 | same_count = np.array(same_count, dtype=int) 125 | 126 | c_args = [ndpointer(dtype=ctypes.c_int,flags='C'), 127 | ndpointer(dtype=ctypes.c_int,flags='C'), 128 | ndpointer(dtype=ctypes.c_int,flags='C'), 129 | ctypes.c_int, ctypes.c_int, ctypes.c_int] 130 | get_num_surr_func_c.get_sur.argtypes = c_args 131 | get_sur.restype = None 132 | 133 | surroundings = surroundings.ravel().astype(np.int32) 134 | surr_count = surr_count.ravel().astype(np.int32) 135 | same_count = same_count.ravel().astype(np.int32) 136 | 137 | 138 | #c func 139 | get_sur(surroundings,surr_count,same_count, height, width, depth) 140 | same_count -= 1 141 | 142 | return surr_count.reshape(cell_assign.shape), same_count.reshape(cell_assign.shape) 143 | 144 | def classify_pixels_to_nuclei(locs, nuclei_clust, dist_threshold): 145 | ''' 146 | Classify each pixel to a nucleus or to nothing (-1) 147 | --------------------- 148 | parameters: 149 | locs: locations of pixels in x, y, z coordinates 150 | nuclei_clust: dataframe of nuclei spots 151 | dist_threshold: maximum distance away from nucleus for classification 152 | ''' 153 | neighbors_classifier = neighbors.NearestNeighbors(n_neighbors=1) 154 | neighbors_classifier.fit( nuclei_clust.loc[:,['x','y','z']].values,nuclei_clust.id) 155 | l = locs.shape 156 | new_locs = np.reshape(locs, (l[0]*l[1]*l[2],3)) 157 | cell_assignment = -np.ones(l[0:3]) 158 | dists, predicted = neighbors_classifier.kneighbors(new_locs) 159 | dists = dists.ravel(); predicted = predicted.ravel() 160 | predicted = nuclei_clust.id.to_numpy()[predicted] 161 | predicted[~(dists < dist_threshold)] = -1 162 | counter = 0 163 | for i in range(len(cell_assignment)): 164 | for j in range(len(cell_assignment[i])): 165 | for k in range(len(cell_assignment[i,j])): 166 | cell_assignment[i,j,k]=predicted[counter] 167 | counter += 1 168 | return cell_assignment.astype(int) 169 | 170 | def get_real_pixels(spots, approximate_binsize, genes_mer, pix_shape,dtype=np.float32): 171 | ''' 172 | Returns the array of pixels using count data instead of smoothed values 173 | If there is only one spot in a given gene and z plane, it is ignored so 174 | the number of spots may be slightly more than the sum of the true pixels 175 | --------------------- 176 | parameters: 177 | spots: raw merfish data 178 | approximate_binsize: the approximated binsize of each histogram cell 179 | genes_mer: genes that are in the merfish data 180 | pix_shape: shape of pixels tensor 181 | --------------------- 182 | return: 183 | pix_true: 4d pixel tensor with true count data 184 | ''' 185 | min_x = np.min(spots.x); max_x = np.max(spots.x); 186 | min_y = np.min(spots.y); max_y = np.max(spots.y); 187 | min_z = np.min(spots.z); max_z = np.max(spots.z); 188 | 189 | x_steps = get_real_binsize(spots.x, approximate_binsize) 190 | y_steps = get_real_binsize(spots.y, approximate_binsize) 191 | z_steps = get_real_binsize(spots.z, approximate_binsize) 192 | 193 | n_x_bins = len(np.arange(min_x,max_x+x_steps,x_steps)) 194 | n_y_bins = len(np.arange(min_y,max_y+y_steps,y_steps)) 195 | 196 | pix_true = np.zeros(pix_shape,dtype=dtype) 197 | z_bins = np.arange(min_z, max_z+z_steps+1, z_steps) 198 | ngene = len(genes_mer) 199 | 200 | tic = time() 201 | for i,gene in enumerate(genes_mer): 202 | print(gene) 203 | toc = time() 204 | 205 | spots_temp = spots[spots.gene == gene] 206 | z_counter = 0 207 | for z in range(1,len(z_bins),1): 208 | spots_temp_z = spots_temp[(spots_temp.z >= z_bins[z-1])& 209 | (spots_temp.z < z_bins[z])] 210 | if spots_temp_z.shape[0] > 1: 211 | hist = np.histogram2d(spots_temp_z.x, 212 | spots_temp_z.y, 213 | range=[[min_x,max_x], 214 | [min_y,max_y]], 215 | bins = (n_x_bins, 216 | n_y_bins))[0] 217 | pix_true[:,:,z_counter,i] = hist 218 | z_counter += 1 219 | 220 | return pix_true 221 | 222 | 223 | 224 | def get_locations(spots, approximate_binsize): 225 | ''' 226 | Gets the coordinates for each cell in the pixels 227 | --------------------- 228 | parameters: 229 | spots: merfish raw data 230 | approximate_binsize: the approximated binsize of each histogram cell 231 | --------------------- 232 | return: 233 | locations: 4d array with x, y, z coordinates for each pixel 234 | ''' 235 | x_steps = get_real_binsize(spots.x, approximate_binsize) 236 | y_steps = get_real_binsize(spots.y, approximate_binsize) 237 | z_steps = get_real_binsize(spots.z, approximate_binsize) 238 | xs = np.arange(np.min(spots.x),np.max(spots.x)+x_steps,x_steps) 239 | ys = np.arange(np.min(spots.y),np.max(spots.y)+y_steps,y_steps) 240 | zs = np.arange(np.min(spots.z),np.max(spots.z)+z_steps,z_steps) 241 | X, Y, Z = np.mgrid[np.min(spots.x):np.max(spots.x)+x_steps:x_steps, 242 | np.min(spots.y):np.max(spots.y)+y_steps:y_steps, 243 | np.min(spots.z):np.max(spots.z)+z_steps:z_steps] 244 | locations = np.zeros((len(xs),len(ys),len(zs),3)) 245 | locations[:,:,:,0] = X 246 | locations[:,:,:,1] = Y 247 | locations[:,:,:,2] = Z 248 | return locations 249 | 250 | 251 | def fast_de_all_spots(spots, approximate_binsize, 252 | bandwidth): 253 | ''' 254 | Runs psuedo-kde for all genes 255 | --------------------- 256 | parameters: 257 | spots: merfish raw data 258 | approximate_binsize: the approximated binsize of each histogram cell 259 | bandwidth: how far away to get information from for kde 260 | --------------------- 261 | return: 262 | kde_data: 4d array of all kde data for every gene (4th dimension) 263 | ''' 264 | positions, x_shape = get_positions_for_kde(spots, approximate_binsize) 265 | kde_data = np.zeros((x_shape[0],x_shape[1],x_shape[2],len(np.unique(spots.gene)))) 266 | for i,gene in enumerate(np.unique(spots.gene)): 267 | print(gene, i) 268 | temp = spots[spots.gene == gene] 269 | kde_data[:,:,:,i] = fast_kde_spot(temp, positions, 270 | approximate_binsize, bandwidth, 271 | x_shape) 272 | return kde_data 273 | 274 | def fast_kde_spot(spots, positions, approximate_binsize, 275 | bandwidth,x_shape): 276 | ''' 277 | Wrapper for running fast_kde_with_knn for the spots 278 | --------------------- 279 | parameters: 280 | spots: merfish raw data 281 | positions: center points for kde smoothing 282 | approximate_binsize: the approximated binsize of each histogram cell 283 | x_shape: final shape of the 3d array for the smoothed kde 284 | return: 285 | spot_dense: 3d array with smoothed kde vlaues 286 | ''' 287 | coords = spots.loc[:,['x','y','z']].to_numpy() 288 | spot_dense = np.reshape(fast_kde_with_knn(positions, coords, 289 | bandwidth),x_shape) 290 | return spot_dense 291 | 292 | def kde_nuclei(spots, nuclei, 293 | approximate_binsize, bandwidth): 294 | ''' 295 | Get the smoothed density of the nuclei 296 | --------------------- 297 | parameters: 298 | spots: dataframe of the merfish spots 299 | nuclei: dataframe of the nuclei points 300 | approximate_binsize: approximate binsize in microns 301 | bandwidth: number of neighbors to look for 302 | --------------------- 303 | return: 304 | nuc_dense: 3d array with the smoothed nuclei density 305 | ''' 306 | 307 | positions, x_shape = get_positions_for_kde(spots, approximate_binsize) 308 | coords = nuclei.loc[:,['x','y','z']].to_numpy() 309 | 310 | print('getting density for nuclei') 311 | nuc_dense = np.reshape(fast_kde_with_knn(positions, coords, 312 | bandwidth,1),x_shape) 313 | nuc_dense *= nuclei.shape[0] 314 | nuc_dense /= np.mean(nuc_dense) 315 | return nuc_dense 316 | 317 | def get_positions_for_kde(spots, approximate_binsize): 318 | ''' 319 | Creates the grid to get the positions where to find kde 320 | --------------------- 321 | parameters: 322 | spots: raw merfish data 323 | approximate_binsize: approximated binsize of the histogram cell 324 | --------------------- 325 | return: 326 | positions: positions with coordinates of where to find kde 327 | x_shape: the shape of the final 3d array 328 | ''' 329 | 330 | x_steps = get_real_binsize(spots.x, approximate_binsize) 331 | y_steps = get_real_binsize(spots.y, approximate_binsize) 332 | z_steps = get_real_binsize(spots.z, approximate_binsize) 333 | xs = np.arange(np.min(spots.x),np.max(spots.x)+x_steps,x_steps) 334 | ys = np.arange(np.min(spots.y),np.max(spots.y)+y_steps,y_steps) 335 | zs = np.arange(np.min(spots.z),np.max(spots.z)+z_steps,z_steps) 336 | 337 | X, Y, Z = np.mgrid[np.min(spots.x):np.max(spots.x)+x_steps:x_steps, 338 | np.min(spots.y):np.max(spots.y)+y_steps:y_steps, 339 | np.min(spots.z):np.max(spots.z)+z_steps:z_steps] 340 | 341 | positions = np.vstack([X.ravel(),Y.ravel(),Z.ravel()]).T 342 | return positions, X.shape 343 | 344 | def fast_kde_with_knn(positions, coords, nneigh): 345 | ''' 346 | Pseudo-KDE by dividing the number of points near the point of interest 347 | by the volumne take to get to that number 348 | --------------------- 349 | parameters: 350 | positions: vector of coordinates in x,y,z to find kde on 351 | coords: coordinates of the points to be smoothed by kde 352 | nneigh: number of neighbors to use, it is a proxy for bandwidth, or 353 | distance to pull information from 354 | return: 355 | kde vec: vector of the smoothed kde values for each location in positions 356 | ''' 357 | nneigh = min(nneigh, coords.shape[0]) 358 | nbrs = neighbors.NearestNeighbors(n_neighbors=nneigh, algorithm='kd_tree', 359 | n_jobs=24).fit(coords) 360 | distances, indices = nbrs.kneighbors(positions) 361 | denom = ((4/3*3.14))*distances[:,nneigh-1]**3 362 | denom = np.maximum(denom,1e-1) 363 | return nneigh/denom 364 | 365 | 366 | def get_real_binsize(one_direction, approx_binsize): 367 | ''' 368 | We approximate the binsize in microns, but because the actual range may not be divisible by the 369 | approximated binsize we need to change the true binsize 370 | --------------------- 371 | parameters: 372 | one_direction: the vector of coordinates for x, y, or z 373 | approx_binsize: approximate binsize in microns 374 | --------------------- 375 | return: 376 | actual binsize: the range/number of bins 377 | 378 | ''' 379 | one_range = get_range(one_direction) 380 | nbins = np.ceil(one_range/approx_binsize) #so we can have equal sized bins 381 | return one_range/nbins 382 | 383 | def get_range(array): 384 | ''' 385 | Gets the range of the coordinates for either x, y, or z 386 | --------------------- 387 | parameters: 388 | array: numpy array of x, y, or z values 389 | --------------------- 390 | return: 391 | float: the range of the coordinates max-min 392 | ''' 393 | return(np.max(array)-np.min(array)) 394 | 395 | def shrink_window(df, x_min, x_max, y_min, y_max): 396 | ''' 397 | Shrinks the window down to a smaller size 398 | --------------------- 399 | parameters: 400 | df: dataframe to shrink 401 | x_min: lower bound x 402 | x_max: upper bound x 403 | y_min: lower_bound y 404 | y_max: upper_bound y 405 | --------------------- 406 | return: 407 | new_df: shrunk dataframe 408 | ''' 409 | new_df = df[(df.x > x_min) & 410 | (df.x < x_max) & 411 | (df.y > y_min) & 412 | (df.y < y_max)] 413 | return new_df 414 | 415 | def plot_segmentation(assignment, cmap='nipy_spectral'): 416 | ''' 417 | Plots segmentation map as the max value through the z-stack 418 | --------------------- 419 | parameters: 420 | assignment: 3d array of the cell segmentation map 421 | cmap: matplotlib color map to use 422 | ''' 423 | 424 | #rearange the cell ids so the coloring is more spread out 425 | colors = np.unique(assignment) 426 | copy_assign = assignment.copy() 427 | np.random.shuffle(colors) 428 | colors[colors == -1] = colors[0] 429 | colors[0] == -1 430 | for i,c in enumerate(np.unique(copy_assign)): 431 | #skip -1 in the segmentation map 432 | if c != -1: 433 | copy_assign[copy_assign == c] = colors[i] 434 | 435 | #plot the max value through the z-stack 436 | plt.imshow(np.max(copy_assign, axis=2), 437 | cmap=cmap) 438 | 439 | def reclassify_squares(pix, pixl_true, 440 | cell_matrix, nuc, 441 | cell_assign, sc_ref, 442 | sc_ref_celltypes, all_genes, 443 | locs, clf_cell, 444 | pct_train=0.1, border_other_threshold=5, 445 | border_same_threshold=2, 446 | outer_max=1, inner_max=5, 447 | most_inner_max=5, dist_threshold=2, dist_scaling=5, 448 | anneal_param=0.05, flip_thresh=0.1, 449 | nlayer=3, first_epochs=25, second_epochs=15, 450 | lrs=[1e-3, 1e-4], l1_reg=1e-3): 451 | ''' 452 | Method to flip pixels from one cell to another or to no cell assignment to improve cell segmentation 453 | High level description: 454 | a) Classify cells to a cell type. 455 | b)Train on a subset of the pixels to build a pixel level 456 | classifier for determining the celltype identity 457 | c) Flip border pixels acording to their predictions using the model in b. 458 | Keep switching between a, b, and c flipping pixels and retraining the models 459 | --------------------- 460 | parameters: 461 | pix: 4d tensor. 4th dimension is gene expression (kde) vector of each pixel 462 | pixl_true: 4d tensor. 4th dimension is gene expression (counts) vector of each pixel 463 | cell_matrix: voronoi segmented digital gene expression matrix cells x genes 464 | nuc: data frame with x,y,z coordinates of nuclei pixels with nuclei IDs 465 | cell_assign: 3d tensor. each pixel has it's current nuclei classification or none 466 | sc_ref: scRNAseq reference matrix cells x genes 467 | sc_ref_celltypes: vector of cell types for each scRNAseq cell 468 | all_genes: genes in merfish data 469 | locs: 4d tensor. 4th dimension is an x,y,z vector of the coordinates of the center of that pixel 470 | clf_cell: neural network based cell type predictor 471 | pct_train: percentage of pixels to train on for pixel classifier (default: 0.1) 472 | border_other_threshold: how many pixels need to belong to the other cell to flip to that cell (default: 5) 473 | border_same_threshold: border_other_threshold: how many pixels need to belong to the same cell to flip to that cell (default: 2) 474 | outer_max: numbegr of iterations of the outer loop (cell classification (a)) (default: 1) 475 | inner_max: number of iterations of the inner loop (pixel training (b)) (default: 5) 476 | most_inner_max: number of iterations of the flipping pixels loop (pixel flip (c)) (default: 5) 477 | dist_threshold: distance from edge of nucleus to ensure a cell belongs to that nucleus. (default: 2) 478 | dist_scaling: amount of decay of probabilities for flipping as you move away from a nucleus of 479 | interest. The probability decreases by half every dist_threshold*dist_scaling (default: 5) 480 | annealing_param: Parameter to decrease the probabalistic component of pixel and cell classification. 481 | Every iteration the highest probability is multiplied by 1+annealing_param*n_iteration (default: 0.5) 482 | flip_thresh: Pixel probabilities before this value get set to 0. (default: 0.1) 483 | nlayer: number of intermediate layers in the pixel classifier (default: 3) 484 | first_epochs: number of epochs for training after the pixel classifier was initialized (default: 25) 485 | second_epochs: number of epochs for training on subsequent rounds of training (default: 15) 486 | lrs: list of learning rates for training pixel classifier (default: [1e-3, 1e-4]) 487 | l1_reg: l1 regularization parameter for neural network 488 | return: 489 | cell_assign: 3d tensor giving the nuclei assignment of each pixel 490 | ''' 491 | # copy of original assignment for later use 492 | map_to_keep_nuclei_correctly_labeled = classify_pixels_to_nuclei( 493 | locs, nuc, dist_threshold) 494 | 495 | # name change. Get rid of later 496 | genes_to_use_prediction = all_genes 497 | 498 | # gets the genes we need for cell type prediction 499 | gene_subset_indices = [] 500 | for i in all_genes: 501 | if i in genes_to_use_prediction: 502 | gene_subset_indices.append(True) 503 | else: 504 | gene_subset_indices.append(False) 505 | 506 | # train nuclei knn classifier for later use 507 | nuc_clf = neighbors.NearestNeighbors(n_neighbors=10).fit(nuc.loc[:, ['x', 'y', 'z']], 508 | nuc.id) 509 | 510 | # get count of surroundings that are the same and different 511 | surround_count, same_cts = get_number_similar_surroundings(cell_assign) 512 | 513 | # get number of cell types 514 | n_celltype = len(np.unique(sc_ref_celltypes)) 515 | 516 | pix_shape = pix.shape 517 | x_max, y_max, z_max = pix_shape[0]-1, pix_shape[1]-1, pix_shape[2]-1 518 | 519 | # name change from before need to clean up 520 | cp_grid = pix 521 | 522 | num_iterations_outer = 0 523 | np.seterr(invalid='raise') 524 | square_param_diff_vec = [] 525 | 526 | prediction_mean = [] 527 | p_weight = None 528 | p_mean = None 529 | percent_flipped = [] 530 | logi_param_diff = [] 531 | 532 | # center and scale sc ref 533 | #sc_ref.loc[:,:] = scale(sc_ref, axis=0) 534 | 535 | overlapping_genes_for_merfish_map = np.isin(all_genes, 536 | sc_ref.columns) 537 | 538 | n_combined_cells = cell_matrix.shape[0]+sc_ref.shape[0] 539 | combined_cells = np.zeros((n_combined_cells, 540 | np.sum(overlapping_genes_for_merfish_map))) 541 | 542 | clf_log = pixel_nn_classifier(sc_ref, 543 | sc_ref_celltypes, 544 | nlayer, 545 | l1_reg) 546 | 547 | n_iterations = 0 548 | n_changed = [] 549 | n_changed_overall = [] 550 | 551 | while(num_iterations_outer < outer_max): 552 | cells_matrix = get_matrix_of_cells( 553 | pixl_true, cell_assign, nuc).to_numpy() 554 | 555 | non_empty_cell_locs = np.where(np.sum(cells_matrix, axis=1) > 100)[0] 556 | cells_matrix = scale(cells_matrix, axis=1) 557 | cells_matrix = scale(cells_matrix, axis=0) 558 | tic = time() 559 | 560 | # combines the mer and sc_ref matrices and builds a classifier for celltype 561 | # based on sc_ref 562 | print('finding celltypes') 563 | cells_probs = clf_cell.predict(cells_matrix) 564 | 565 | toc = time() 566 | print('time to get celltypes', toc-tic) 567 | 568 | prediction_mean.append(np.mean(np.max(cells_probs, axis=1))) 569 | 570 | # adding multiply the max prob by 1+n_iteration*annealing_param 571 | max_pred = np.argmax(cells_probs, axis=1) 572 | 573 | cells_probs[np.arange(len(max_pred)), 574 | max_pred] *= 1+n_iterations*anneal_param 575 | cells_probs /= np.sum(cells_probs, axis=1, keepdims=True) 576 | 577 | # get the identity of the predicted cell types 578 | groupings = np.argmax(cells_probs, axis=1) 579 | 580 | #groupings = t_cell 581 | 582 | last_param = None 583 | 584 | toc = time() 585 | #print('time to find cell types ',toc-tic) 586 | 587 | num_iterations_inner = 0 588 | past_square_param = None 589 | while(num_iterations_inner < inner_max): 590 | #print('inner iterations:',num_iterations_inner) 591 | flat_assign = np.ravel(cell_assign) 592 | group_labels = groupings[flat_assign] 593 | 594 | # the empty cell type will be index of number of celltypes 595 | group_labels[flat_assign == -1] = n_celltype 596 | 597 | # random selection of pixels to use for training the model 598 | subset_indices = np.random.choice(np.arange(0, 599 | flat_assign.shape[0], dtype=int), 600 | size=int(pct_train*len(group_labels))) 601 | tic = time() 602 | merged_pix_info = cp_grid 603 | 604 | merged_pix_shape = merged_pix_info.shape 605 | merged_pix_reshaped = np.reshape( 606 | merged_pix_info, (merged_pix_shape[0]*merged_pix_shape[1]*merged_pix_shape[2], merged_pix_shape[3])) 607 | group_labels_mat = np.reshape( 608 | group_labels, (merged_pix_shape[0], merged_pix_shape[1], merged_pix_shape[2])) 609 | 610 | sub_merged_pix = merged_pix_reshaped[subset_indices, :] 611 | sub_group_labels = group_labels[subset_indices] 612 | 613 | # remove non cell pixels from training 614 | sub_merged_pix = sub_merged_pix[sub_group_labels != n_celltype, :] 615 | sub_group_labels = sub_group_labels[sub_group_labels != n_celltype] 616 | 617 | # train a model to see what a pixel of a certain kind looks like 618 | if ((num_iterations_inner == 0)): 619 | clf_log = train_nn_classifier(sub_merged_pix, sub_group_labels, clf_log, 620 | first_epochs, lrs) 621 | else: 622 | clf_log = train_nn_classifier(sub_merged_pix, sub_group_labels, clf_log, 623 | second_epochs, [lrs[-1]]) 624 | 625 | toc = time() 626 | print('time to train ', len(subset_indices), 'samples ', toc-tic) 627 | num_iterations_most_inner = 0 628 | while(num_iterations_most_inner < most_inner_max): 629 | # find border indices where they are not empty, and have more neighbors than the border 630 | # threshold 631 | border_indices = np.where((surround_count >= border_other_threshold) & 632 | (same_cts >= border_same_threshold)) 633 | 634 | border_indices_mat = np.stack( 635 | [border_indices[0], border_indices[1], border_indices[2]], axis=1) 636 | 637 | tic = time() 638 | # predict the probability a border pixel is from each class 639 | predictions = clf_log.predict( 640 | merged_pix_info[border_indices[0], border_indices[1], border_indices[2], :]) 641 | 642 | predictions[predictions <= flip_thresh] = 0 643 | 644 | ngene = merged_pix_info.shape[3] 645 | ms = merged_pix_info.shape 646 | npix = ms[0]*ms[1]*ms[2] 647 | 648 | # adding multiply the max prob by 1+n_iteration*annealing_param 649 | max_pred = np.argmax(predictions, axis=1) 650 | 651 | predictions[np.arange(len(max_pred)), 652 | max_pred] *= 1+n_iterations*anneal_param 653 | 654 | if predictions.shape[1] < (n_celltype): 655 | diff = np.setdiff1d( 656 | np.arange(n_celltype+1), np.unique(group_labels[subset_indices])) 657 | diff = np.sort(diff) 658 | for missing in diff: 659 | predictions = np.insert(predictions, missing, np.repeat( 660 | 0, predictions.shape[0]), axis=1) 661 | print('MISSSING ROW INSERTED!') 662 | 663 | toc = time() 664 | print('time to predict ', len( 665 | border_indices[0]), 'samples', toc-tic) 666 | # get the number of each cell type pixel surrounding each border pixel 667 | 668 | # some house keeping for the next step by padding arrays 669 | border_indices_mat += 1 670 | cell_assign = np.pad( 671 | cell_assign, (1), 'constant', constant_values=(-2)) 672 | 673 | locs = np.pad(locs, ((1, 1), (1, 1), (1, 1), (0, 0)), 674 | 'constant', constant_values=(-2)) 675 | 676 | bord_x, bord_y, bord_z = border_indices_mat[:, 677 | 0], border_indices_mat[:, 1], border_indices_mat[:, 2] 678 | surroundings = np.zeros( 679 | (border_indices_mat.shape[0]), dtype=int) 680 | group_key = np.zeros((border_indices_mat.shape[0]), dtype=int) 681 | predic_num = np.zeros_like(predictions) 682 | predic_probs = np.zeros( 683 | (predictions.shape[0], predictions.shape[1])) 684 | tic = time() 685 | 686 | pixels_to_nuc_dist_vec, pixels_to_nuclei_vec = nuc_clf.kneighbors(locs[bord_x, 687 | bord_y, 688 | bord_z, :]) 689 | 690 | pixels_to_nuclei_vec = nuc.id.to_numpy().astype('float64')[pixels_to_nuclei_vec] 691 | toc = time() 692 | print('time to predict nuclei distance:', toc-tic) 693 | 694 | tic = time() 695 | 696 | # this creates a matrix where each row is the surrounding cell ids for that border pixel 697 | surroundings = np.zeros((len(bord_x), 27)) 698 | sub_counter = 0 699 | for i in range(-1, 2): 700 | for j in range(-1, 2): 701 | for k in range(-1, 2): 702 | surroundings[:, sub_counter] = cell_assign[bord_x+i, 703 | bord_y+j, 704 | bord_z+k].copy() 705 | sub_counter += 1 706 | 707 | tic = time() 708 | 709 | input_surr = (surroundings.__array_interface__[ 710 | 'data'][0]+np.arange(surroundings.shape[0])*surroundings.strides[0]).astype(np.uintp) 711 | input_nuc = (pixels_to_nuclei_vec.__array_interface__['data'][0]+np.arange( 712 | pixels_to_nuclei_vec.shape[0])*pixels_to_nuclei_vec.strides[0]).astype(np.uintp) 713 | input_dist = (pixels_to_nuc_dist_vec.__array_interface__['data'][0]+np.arange( 714 | pixels_to_nuc_dist_vec.shape[0])*pixels_to_nuc_dist_vec.strides[0]).astype(np.uintp) 715 | dist_mat = np.zeros( 716 | (surroundings.shape[0], 27), dtype=np.float64) 717 | input_dist_mat = (dist_mat.__array_interface__[ 718 | 'data'][0]+np.arange(dist_mat.shape[0])*dist_mat.strides[0]).astype(np.uintp) 719 | 720 | # C function 721 | get_d(input_surr, 722 | input_nuc, 723 | input_dist, 724 | ctypes.c_int(surroundings.shape[0]), 725 | ctypes.c_int(pixels_to_nuclei_vec.shape[1]), 726 | input_dist_mat) 727 | 728 | toc = time() 729 | 730 | tic = time() 731 | for sub_counter in range(27): 732 | surro = surroundings[:, sub_counter].copy().astype(int) 733 | 734 | num_surroundings = len(surro) 735 | 736 | # get dist vec based on the first occurence of the surroundings 737 | # in the nearest neighbors vec from pix to nuclei 738 | dist_vec = dist_mat[:, sub_counter] 739 | 740 | reduced_dist = dist_vec - dist_threshold 741 | reduced_dist[reduced_dist <= 0] = 1e-3 742 | 743 | s = (dist_scaling*dist_threshold)/2 744 | scale_vec = np.divide(s, reduced_dist) 745 | scale_vec[dist_vec < dist_threshold] = 10 746 | scaled_final = np.minimum(10, scale_vec) 747 | 748 | scaled_final[np.where(surro == -1)] = 1 749 | scaled_final[np.where(surro == -2)] = 1 750 | 751 | group_key = groupings[surro].copy() 752 | group_key[np.where(surro == -1)] = -1 753 | group_key[np.where(surro == -2)] = -2 754 | non_neg = np.where(group_key >= 0) 755 | predic_num[non_neg, group_key[non_neg]] += 1 756 | predic_probs[non_neg, 757 | group_key[non_neg]] += scaled_final[non_neg] 758 | 759 | predic_num[np.where(predic_num == 0)] = 1 760 | 761 | predic_probs = np.divide(predic_probs, predic_num) 762 | 763 | predictions = np.multiply(predictions, predic_probs) 764 | 765 | pred_sum = np.sum(predictions, axis=1, keepdims=1) 766 | pred_sum[pred_sum == 0] = 1 767 | predictions /= pred_sum 768 | 769 | toc = time() 770 | 771 | number_of_border_pixels = border_indices_mat.shape[0] 772 | 773 | locs_to_flip = locs[border_indices_mat[:, 0], 774 | border_indices_mat[:, 1], border_indices_mat[:, 2], :] 775 | 776 | # remove 0 pad from locs 777 | ls = locs.shape 778 | locs = locs[1:ls[0]-1, 1:ls[1]-1, 1:ls[2]-1, :] 779 | 780 | flips_arg_bool = (predictions.cumsum( 781 | 1) > np.random.rand(predictions.shape[0])[:, None]) 782 | 783 | flips_arg = flips_arg_bool.argmax(1) 784 | 785 | # set flip args that have prediction prob 0 to n_celltype 786 | flips_arg[np.sum(flips_arg_bool, axis=1) == 0] = n_celltype 787 | flips_arg[np.sum(predictions, axis=1) == 0] = n_celltype 788 | 789 | cells_to_flip = flips_arg 790 | 791 | borders_to_flip = border_indices_mat 792 | 793 | tic = time() 794 | 795 | bord_x, bord_y, bord_z = borders_to_flip[:, 796 | 0], borders_to_flip[:, 1], borders_to_flip[:, 2] 797 | old_id = cell_assign[bord_x, 798 | bord_y, 799 | bord_z].copy() 800 | 801 | surroundings = np.zeros( 802 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=int) 803 | group_key = np.zeros( 804 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=int) 805 | matching_regions = np.zeros( 806 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=np.float32) 807 | changed_surround_count = np.zeros( 808 | (border_indices_mat.shape[0], 3**border_indices_mat.shape[1]), dtype=int) 809 | loc_counter = 0 810 | for i in range(-1, 2): 811 | for j in range(-1, 2): 812 | for k in range(-1, 2): 813 | surroundings[:, loc_counter] = cell_assign[bord_x+i, 814 | bord_y+j, 815 | bord_z+k].copy() 816 | 817 | group_key[:, loc_counter] = groupings[surroundings[:, loc_counter]].copy( 818 | ) 819 | group_key[:, loc_counter][np.where( 820 | surroundings[:, loc_counter] == -1)] = n_celltype 821 | matching_regions[:, loc_counter] = group_key[:, 822 | loc_counter] == cells_to_flip 823 | loc_counter += 1 824 | 825 | match_dist = matching_regions * dist_mat 826 | match_dist[match_dist == 0] = 1e3 827 | new_cell_loc = np.argmin(match_dist, axis=1) 828 | 829 | new_id = surroundings[np.arange( 830 | 0, surroundings.shape[0]), new_cell_loc].copy() 831 | new_id[np.sum(matching_regions, axis=1) == 0] = -1 832 | new_id[np.sum(flips_arg_bool, axis=1) == 0] = -1 833 | new_id[new_id == -2] = -1 834 | 835 | n_changed_this_round = len( 836 | new_id) - np.sum(np.equal(old_id, new_id)) 837 | n_changed.append(n_changed_this_round) 838 | n_changed_overall.append(n_changed_this_round) 839 | if len(n_changed) > 10: 840 | n_changed = n_changed[1:] 841 | 842 | cell_assign[bord_x, bord_y, bord_z] = new_id 843 | 844 | same_and_different = surroundings == new_id.reshape( 845 | surroundings.shape[0], 1) 846 | different_count = np.sum(same_and_different == 0, axis=1) 847 | borders_to_flip -= 1 848 | 849 | # remove 0 pad from cell assignment 850 | cs = cell_assign.shape 851 | cell_assign = cell_assign[1:cs[0]-1, 1:cs[1]-1, 1:cs[2]-1] 852 | cell_assign[map_to_keep_nuclei_correctly_labeled != -1] = map_to_keep_nuclei_correctly_labeled[map_to_keep_nuclei_correctly_labeled != -1].copy() 853 | 854 | bord_x, bord_y, bord_z = borders_to_flip[:, 855 | 0], borders_to_flip[:, 1], borders_to_flip[:, 2] 856 | 857 | surround_count, same_cts = get_number_similar_surroundings( 858 | cell_assign) 859 | 860 | old_cells = groupings[old_id].copy() 861 | old_cells[np.where(old_id == -1)] = n_celltype 862 | 863 | new_cells = groupings[new_id].copy() 864 | new_cells[np.where(new_id == -1)] = n_celltype 865 | 866 | percent_flipped.append( 867 | np.sum(np.equal(new_id, old_id))/number_of_border_pixels) 868 | 869 | toc = time() 870 | print('time to flip pixels ', toc-tic) 871 | n_iterations += 1 872 | num_iterations_most_inner += 1 873 | num_iterations_inner += 1 874 | num_iterations_outer += 1 875 | 876 | # compute counts matrix 877 | cells_matrix = get_matrix_of_cells(pixl_true, cell_assign, nuc) 878 | 879 | return cell_assign, cells_matrix, groupings 880 | 881 | def watershed_nuclei(pix, nuclei, locations): 882 | ls = locations.shape 883 | raveled_locs = locations.reshape((ls[0]*ls[1]*ls[2],3)) 884 | square_ids = np.arange(0,raveled_locs.shape[0]) 885 | locs_clf = neighbors.KNeighborsClassifier(1, 886 | n_jobs=12).fit(raveled_locs, square_ids) 887 | 888 | predicted_locs = locs_clf.predict(nuclei.loc[:,['x','y','z']]) 889 | nuclei_ids = np.zeros(len(square_ids)) 890 | nuclei_ids[predicted_locs] = nuclei.id+1 891 | nuclei_ids = nuclei_ids.reshape((ls[0],ls[1],ls[2])) 892 | 893 | pix_dens = np.log2(np.sum(pix,axis=3)+1) 894 | image = np.zeros_like(pix_dens) 895 | image[pix_dens > pix_thresh] = 1 896 | labels = watershed(pix_dens, nuclei_ids, 897 | watershed_line = True, 898 | mask = image, compactness=10) - 1 899 | return labels 900 | 901 | def create_celltype_classifier(sf, sc, nlayers=2, l1_reg=1e-3, 902 | epochs=20, lrs=[5e-3, 5e-4], 903 | test_size=0.25): 904 | ''' 905 | Creates a cell type classifier and trains the model based on a neural network 906 | --------------------- 907 | parameters: 908 | sf: single cell reference dataset 909 | sc: single cell reference cell types. (must be int) 910 | nlayers: number of intermediate layers in network (default: 2) 911 | l1_reg: l1 regularization parameter (default: 1e-3) 912 | epochs: number of epochs to train for, per learning rate cycle (default: 20) 913 | lrs: learning rates. Must be a list of learning rates (defaults: [5e-3,5e-4]) 914 | test_size: percentage of dataset to use for validation (default: 0.25) 915 | returns: 916 | clf_cell: Trained cell type classifier 917 | 918 | ''' 919 | 920 | ncelltype = len(np.unique(sc)) 921 | 922 | input_dim = sf.shape[1] 923 | input_vec = Input(shape=(input_dim,)) 924 | x = Dense(input_dim*3, activation='tanh', 925 | activity_regularizer=l1(l1_reg))(input_vec) 926 | x = BatchNormalization()(x) 927 | for i in range(nlayers-1): 928 | x = Dense(input_dim*3, activation='tanh', 929 | activity_regularizer=l1(l1_reg))(x) 930 | x = BatchNormalization()(x) 931 | out = Dense(ncelltype, activation='softmax', 932 | activity_regularizer=l1(l1_reg))(x) 933 | 934 | clf_cell = Model(input_vec, out) 935 | clf_cell.summary() 936 | 937 | scaled_ref = scale(sf, axis=1) 938 | scaled_ref = scale(scaled_ref, axis=0) 939 | 940 | X_train, X_test, y_train, y_test = train_test_split(scaled_ref, sc, 941 | test_size=test_size, 942 | random_state=0) 943 | 944 | for lr in lrs: 945 | adam = Adam(learning_rate=lr) 946 | clf_cell.compile(optimizer='Adam', 947 | loss='sparse_categorical_crossentropy', 948 | metrics=['accuracy']) 949 | clf_cell.fit(X_train, y_train, 950 | validation_data=(np.array(X_test), np.array(y_test)), 951 | epochs=epochs, 952 | batch_size=64, 953 | use_multiprocessing=True) 954 | 955 | return clf_cell 956 | 957 | def pixel_nn_classifier(mp,sc,nlayer, l2_reg): 958 | ncelltype = len(np.unique(sc)) 959 | input_dim = mp.shape[1] 960 | input_vec = Input(shape=(input_dim,)) 961 | x = Dense(input_dim*2,activation='tanh', 962 | activity_regularizer=l1(l2_reg))(input_vec) 963 | x = BatchNormalization()(x) 964 | for i in range(1,nlayer): 965 | x = Dense(input_dim*((2)**(i)),activation='tanh', 966 | activity_regularizer=l1(l2_reg))(x) 967 | x = BatchNormalization()(x) 968 | 969 | out = Dense(ncelltype, activation='softmax', 970 | activity_regularizer=l1(l2_reg))(x) 971 | 972 | clf= Model(input_vec, out) 973 | clf.summary() 974 | return clf 975 | 976 | def train_nn_classifier(mp,sc,clf,epo,lrs): 977 | X_train, X_test, y_train, y_test = train_test_split(mp, sc, 978 | test_size = 0.2,random_state = 0) 979 | for lr in lrs: 980 | adam = Adam(learning_rate = lr) 981 | clf.compile(optimizer = 'Adam', 982 | loss = 'sparse_categorical_crossentropy', 983 | metrics=['accuracy']) 984 | clf.fit(X_train,y_train, 985 | epochs = epo, 986 | batch_size = 64, 987 | validation_data=(X_test,y_test), 988 | use_multiprocessing=True) 989 | return clf 990 | 991 | def get_centers(vec): 992 | ''' 993 | # gets the center of each pixel 994 | # input: 995 | # vec: the aranged vector of the edge of each pixel 996 | # output: 997 | # ctr: vector of len(vec)-1 that has the center of each pixel's coordinates 998 | ''' 999 | ctr = [] 1000 | for i in range(len(vec)-1): 1001 | ctr.append(np.mean([vec[i],vec[i+1]])) 1002 | return np.array(ctr) 1003 | 1004 | def transform_to_stage_coordinates(spt, metadata): 1005 | ''' 1006 | # transforms the nuclei to stage coordinates and creates a dataframe 1007 | # that shows the location of each nuclei assignment 1008 | # input: 1009 | # nuc_assign: 3d tensor of the nuclei assignments 1010 | # XY: a vector of the center of each pos [x,y] 1011 | # zstack: vector of the z indices 1012 | # pix_size: size of each pixel from metadata 1013 | # img_shape: 3d shape of the image with nuclei assignments 1014 | # output: 1015 | # df_nuc: dataframe with the nuclei locations 1016 | ''' 1017 | new_spots = np.zeros((spt.shape[0],4)) 1018 | spt_counter = 0 1019 | genes = [] 1020 | for pos in np.unique(spt.posname): 1021 | print(pos) 1022 | temp_spt = spt[spt.posname == pos] 1023 | genes += list(temp_spt.gene.to_numpy()) 1024 | subset_metadata = metadata[metadata.Position == pos] 1025 | XY = subset_metadata.XYbeforeTransform.to_numpy()[0] 1026 | pix_size = subset_metadata.PixelSize.to_numpy()[0] 1027 | 1028 | x = XY[0] 1029 | y = XY[1] 1030 | 1031 | for s in zip(temp_spt.centroid,temp_spt.z): 1032 | ctr_x = (s[0][1]-1024) * pix_size 1033 | ctr_y = (s[0][0]-1024) * pix_size 1034 | ctr_z = s[1] * 0.4 1035 | new_spots[spt_counter,1:] = [ctr_x+x,ctr_y+y,ctr_z] 1036 | spt_counter += 1 1037 | 1038 | df_spt = pd.DataFrame(new_spots) 1039 | df_spt.columns = ['gene','x','y','z'] 1040 | df_spt.gene = np.array(genes) 1041 | 1042 | return df_spt 1043 | 1044 | c_args = [ndpointer(dtype=np.uintp,ndim=1,flags='C'), 1045 | ndpointer(dtype=np.uintp,ndim=1,flags='C'), 1046 | ndpointer(dtype=np.uintp,ndim=1,flags='C'), 1047 | ctypes.c_int, ctypes.c_int, 1048 | ndpointer(dtype=np.uintp,ndim=1,flags='C')] 1049 | 1050 | dist_func_c = ctypes.CDLL(path_to_file+"/get_distances.so") 1051 | get_d = dist_func_c.get_distances 1052 | dist_func_c.get_distances.argtypes = c_args 1053 | get_d.restype = None 1054 | get_num_surr_func_c = ctypes.CDLL(path_to_file+"/get_number_similar_surroundings.so") 1055 | get_sur = get_num_surr_func_c.get_sur 1056 | -------------------------------------------------------------------------------- /CoreFunctions/MerfishSimulator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from random import randint 6 | from collections import Counter 7 | import matplotlib.patches as mpatches 8 | import pickle as pkl 9 | import json 10 | import sys 11 | from sklearn.decomposition import PCA 12 | from sklearn.preprocessing import scale 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import train_test_split 15 | 16 | class merfish_data_generator: 17 | ''' 18 | Class to generate merfish data, all values are in microns 19 | -------------------- 20 | parameters: 21 | celltypes: list of the unique cell types (default: None) 22 | genes: list of the genes used for merfish data (default: None) 23 | celltype_props: list of the cell type proportions should be in the same order as 24 | celltypes (default: None) 25 | density_range: range of areas, and how dense to make them (default: [1]) 26 | dist_between_cell_centers: range of values of how far apart 27 | to make them (default: [30]) 28 | cell_shape_regularity: how round the cell should be (default: 1) 29 | dge: digital gene expression matrix celltypes x genes (default: None) 30 | noise: overlap between cells count data (default: 0) 31 | heterogeneity: how heterogeneous to make the cell type distribution (default: 1) 32 | grid_shape: x, y, z sizze of the grid 33 | nuclei_size_range: variability in the size of the nuclei (default: [0.2]) 34 | distance_between_cells: adds to the overall distance between cells 35 | negative values squish cells closer together (default: 0) 36 | subtype: Boolean, indicating whether or not the current cells are 37 | subtypes 38 | -------------------- 39 | functions: 40 | generate_grid: Generates the xyz grid for where the cell centers are 41 | generate_cell_centers: Generates a dataframe with the cell centers in xyz coords 42 | assign_pixels_to_cells: Assigns the pixels to the cells based on 43 | modified voronoi from the center of the cell w/ max distance 44 | plot_true: Plots the cell ID grid in a single z_stack value 45 | plot_celltypes: Plots the celltype grid in a single z_stack value 46 | generate_nuclei: Generates the nuclei randomly within a cell 47 | add_dge: Adds the DGE matrices to the object both should be celltypes x genes 48 | Can be used to add a self curated dge instead of computing one 49 | compute_dge: Computes and adds the DGE matrices to the object. It should be in celltypes x genes 50 | Also computes cell type proportions 51 | classify_celltypes: Adds celltypes to the cells based on the cell type proportions 52 | generate_merfish_dge: Generates the merfish dge for the cells, based on their cell type and the dge from 53 | single cell data 54 | place_transcripts: Places the spots within the cell 55 | ''' 56 | def __init__(self, celltypes = None, 57 | genes = None, 58 | celltype_props = None, 59 | density_range = [1], 60 | dist_between_cell_centers = [30], 61 | cell_shape_regularity = 1, 62 | dge = None, 63 | noise = 0, 64 | heterogeneity = 1, 65 | grid_shape = (200,200,50), 66 | nuclei_size_range = [0.2], 67 | distance_between_cells = 0, 68 | subtype = False): 69 | self.celltypes = celltypes 70 | self.genes = genes 71 | self.density_range = density_range 72 | self.dist_between_cell_centers = dist_between_cell_centers 73 | self.cell_shape_regularity = cell_shape_regularity 74 | self.celltype_props = celltype_props 75 | self.dge = dge 76 | self.noise = noise 77 | self.heterogeneity = 1 78 | self.grid_shape = grid_shape 79 | #percentage of cell size 80 | self.nuclei_size_range = nuclei_size_range 81 | self.x_max = grid_shape[0]; self.x_min = 0 82 | self.y_max = grid_shape[1]; self.y_min = 0 83 | self.z_max = grid_shape[2]; self.z_min = 0 84 | self.distance_between_cells = distance_between_cells 85 | 86 | self.cell_centers = None 87 | self.true_map = None 88 | 89 | self.subtype = subtype 90 | 91 | 92 | def generate_grid(self, space_between): 93 | ''' 94 | Generates the xyz grid for where the cell centers are 95 | -------------------- 96 | parameters: 97 | space_between: space in microns between nuclei 98 | -------------------- 99 | returns: 100 | x_coord: x coordinate vector of nuclei 101 | y_coord: y coordinate vector of nuclei 102 | z_coord: z coordinate vector of nuclei 103 | ''' 104 | xs = np.arange(0,self.x_max, space_between) 105 | ys = np.arange(0,self.y_max, space_between) 106 | zs = np.arange(0,self.z_max, space_between) 107 | x_coord, y_coord, z_coord = np.meshgrid(xs,ys,zs) 108 | 109 | return x_coord, y_coord, z_coord 110 | 111 | 112 | def generate_cell_centers(self): 113 | ''' 114 | Generates a dataframe with the cell centers in xyz coords 115 | -------------------- 116 | adds: 117 | self.cell_centers: dataframe with xyz coordinates 118 | self.cell_ids: the identification number of each cell 119 | ''' 120 | distance_between_nuclei = np.mean(self.dist_between_cell_centers)+self.distance_between_cells 121 | std_distance_between_nuclei = np.std(self.dist_between_cell_centers) 122 | 123 | x_coord, y_coord, z_coord = self.generate_grid(distance_between_nuclei) 124 | 125 | gs = x_coord.shape 126 | 127 | x_rand = np.random.randn(gs[0],gs[1],gs[2])*std_distance_between_nuclei 128 | y_rand = np.random.randn(gs[0],gs[1],gs[2])*std_distance_between_nuclei 129 | z_rand = np.random.randn(gs[0],gs[1],gs[2])*std_distance_between_nuclei 130 | 131 | x_coord += x_rand; y_coord += y_rand; z_coord += z_rand 132 | 133 | cell_centers = pd.DataFrame({'x':x_coord.ravel(), 134 | 'y':y_coord.ravel(), 135 | 'z':z_coord.ravel()}) 136 | 137 | self.cell_centers = cell_centers 138 | cell_ids = np.arange(cell_centers.shape[0]) 139 | self.cell_ids = cell_ids 140 | 141 | def assign_pixels_to_cells(self, 142 | pixels_per_micron = .5, 143 | max_dist = 20, 144 | noise_in_dist = 0, 145 | min_pix_count = 30): 146 | ''' 147 | Assigns the pixels to the cells based on modified voronoi from the 148 | center of the cell w/ max distance 149 | -------------------- 150 | parameters: 151 | pixels_per_micron: measure for the resolution of the grid. Higher values increase 152 | resolution, but increase run time and memory requirements (default: .5) 153 | max_dist: maximum distance away from the center for voronoi segmentation (default: 20 microns) 154 | noise_in_dist: adds uneven edge effects the the nuclei (default: 0) 155 | min_pix_count: minimum pixels assigned to a cell for a cell to be considered a cell 156 | -------------------- 157 | adds: 158 | self.cell_ids: removes the cell ids with < min_pix_count pixels assigned to it 159 | self.cell_centers: removes the centers of cells with < min_pix_count 160 | self.true_map: the 3d map of pixels -1 is no id, otherwise the number is the cell id 161 | ''' 162 | if self.cell_centers is None: 163 | self.generate_cell_centers() 164 | 165 | x_coord, y_coord, z_coord = self.generate_grid(1/pixels_per_micron) 166 | self.pix_per_micron = pixels_per_micron 167 | 168 | #change this later 169 | max_dist = np.mean(self.dist_between_cell_centers)/2 170 | 171 | clf_cell_center = KNeighborsClassifier(n_neighbors=1, 172 | algorithm='kd_tree').fit(self.cell_centers.to_numpy(), 173 | self.cell_ids) 174 | 175 | dist, cell_id = clf_cell_center.kneighbors(np.array([x_coord.ravel(), 176 | y_coord.ravel(), 177 | z_coord.ravel()]).T) 178 | dist = dist.ravel(); cell_id = cell_id.ravel() 179 | dist += np.random.randn(len(dist)) * noise_in_dist 180 | cell_id = self.cell_ids[cell_id] 181 | cell_id[dist > max_dist] = -1 182 | pixel_map = np.reshape(cell_id, x_coord.shape) 183 | 184 | #remove cells without pixels 185 | uniq_cells = np.unique(pixel_map.ravel()) 186 | cells_to_remove = [] 187 | for i in self.cell_ids: 188 | if i not in uniq_cells: 189 | cells_to_remove.append(i) 190 | 191 | #remove cells with less than a certain pixel number 192 | counted = Counter(pixel_map.ravel()) 193 | for c in counted: 194 | if counted[c] < min_pix_count: 195 | pixel_map[pixel_map == c] = -1 196 | cells_to_remove.append(c) 197 | 198 | self.cell_ids = np.delete(self.cell_ids, cells_to_remove) 199 | self.cell_centers.drop(index=cells_to_remove, inplace=True) 200 | self.true_map = pixel_map 201 | 202 | def plot_true(self, cmap = 'nipy_spectral',ax=None, alpha = 1, z_stack = 5): 203 | ''' 204 | Plots the cell ID grid in a single z_stack value 205 | -------------------- 206 | parameters: 207 | cmap: the color map from matplot lib to use (default: nipy_spectral) 208 | ax: the matplotlib cell to plot in (default: None) 209 | alpha: measure of the transparency of the image (default: 1) 210 | z_stack: number for the z slice to plot in (default: 0) 211 | ''' 212 | if self.true_map is None: 213 | self.assign_pixels_to_cells() 214 | 215 | true_map = self.true_map.copy() 216 | new_ids = np.unique(self.true_map.ravel()) 217 | new_ids = np.delete(new_ids, 0) 218 | np.random.shuffle(new_ids) 219 | 220 | counter = 0 221 | for i in new_ids: 222 | true_map[self.true_map == i] = counter 223 | counter += 1 224 | 225 | if ax is None: 226 | plt.imshow(true_map[:,:,z_stack], cmap=cmap, alpha = alpha) 227 | else: 228 | ax.imshow(true_map[:,:,z_stack], cmap=cmap, alpha = alpha) 229 | 230 | 231 | def plot_celltypes(self, cmap = 'nipy_spectral', 232 | ax = None, alpha = 1, z_stack = 0): 233 | ''' 234 | Plots the celltype grid in a single z_stack value 235 | -------------------- 236 | parameters: 237 | cmap: the color map from matplot lib to use (default: nipy_spectral) 238 | ax: the matplotlib cell to plot in (default: None) 239 | alpha: measure of the transparency of the image (default: 1) 240 | z_stack: number for the z slice to plot in (default: 0) 241 | -------------------- 242 | adds: 243 | self.cellt_map: Adds the 3d cell type map -1 is no cell, cell type number 244 | is based on the order of self.celltypes 245 | ''' 246 | cellt = self.classified_celltypes 247 | colors = [] 248 | celltype_colors = {} 249 | for i, cell in enumerate(np.unique(cellt)): 250 | celltype_colors[cell] = i 251 | for cell in cellt: 252 | colors.append(celltype_colors[cell]) 253 | colors = np.array(colors) 254 | t_map = self.true_map.ravel().copy() 255 | for i, cell_id in enumerate(np.unique(t_map)): 256 | if cell_id != -1: 257 | t_map[t_map == cell_id] = i - 1 258 | cellt_vec = colors[t_map] 259 | cellt_vec[t_map == -1] = -1 260 | cellt_mat = cellt_vec.reshape(self.true_map.shape) 261 | self.cellt_map = cellt_mat 262 | 263 | 264 | # create legend 265 | leg = np.unique(cellt) 266 | cmap_converter = plt.cm.get_cmap(cmap, len(leg)+1) 267 | leg_patch = [] 268 | for i in range(len(leg)): 269 | leg_patch.append(mpatches.Patch(color=cmap_converter(i+1)[:3], 270 | label=leg[i])) 271 | if ax is None: 272 | plt.imshow(cellt_mat[:,:,0], cmap = cmap, alpha = alpha) 273 | plt.legend(handles = leg_patch,bbox_to_anchor=(2,1)) 274 | else: 275 | ax.imshow(cellt_mat[:,:,0], cmap = cmap, alpha = alpha) 276 | ax.legend(handles = leg_patch,bbox_to_anchor=(2,1)) 277 | 278 | 279 | def generate_nuclei_centers(self, n_pix_per_nuc = 9, dtype='int32'): 280 | ''' 281 | Generates the nuclei randomly within a cell 282 | -------------------- 283 | parameters: 284 | n_pix_per_nuc: number of pixels in a nucleus (default: 9) 285 | dtype: data type for the nuclei tensor (default: int16) 286 | -------------------- 287 | adds: 288 | self.nuclei: 3d tensor of nuclei, -1 is no nucleus, number is according 289 | to the cell ID 290 | ''' 291 | if self.cell_centers is None: 292 | self.generate_cell_centers() 293 | 294 | nuclei = np.zeros(self.true_map.shape, 295 | dtype=dtype) 296 | nuclei -= 1 297 | 298 | check_nuclei_surroundings = KNeighborsClassifier(n_neighbors = 27) 299 | 300 | non_zero_cell_locs = np.where(self.true_map != -1) 301 | cell_ids = self.true_map.ravel() 302 | xs = non_zero_cell_locs[0] 303 | ys = non_zero_cell_locs[1] 304 | zs = non_zero_cell_locs[2] 305 | cell_ids = cell_ids[cell_ids != -1] 306 | check_nuclei_surroundings.fit(np.vstack((xs,ys,zs)).T, cell_ids) 307 | 308 | for i in np.unique(self.cell_ids): 309 | if i != -1: 310 | cell_coords = np.where(self.true_map == i) 311 | rand_index = randint(0,len(cell_coords[0])-1) 312 | xs = cell_coords[0] 313 | ys = cell_coords[1] 314 | zs = cell_coords[2] 315 | 316 | locs_as_mat = np.vstack((xs, ys, zs)).T 317 | clf_seed = KNeighborsClassifier(n_neighbors=min(n_pix_per_nuc, 318 | locs_as_mat.shape[0])) 319 | nuclei_seed = locs_as_mat[rand_index,:] 320 | 321 | clf_seed.fit(locs_as_mat, np.arange(locs_as_mat.shape[0])) 322 | nuc_pix_locs = clf_seed.kneighbors([nuclei_seed])[1][0] 323 | nuc_pix = locs_as_mat[nuc_pix_locs,:] 324 | 325 | non_same_celltype = cell_ids[check_nuclei_surroundings.kneighbors(nuc_pix)[1]] 326 | #remove nuc pixels that are on the border 327 | nuc_pix = nuc_pix[np.sum(non_same_celltype == i,axis=1) == 27,:] 328 | if nuc_pix.shape[0] > 0: 329 | nuclei[nuc_pix[:,0], 330 | nuc_pix[:,1], 331 | nuc_pix[:,2]] = i 332 | else: 333 | nuclei[nuclei_seed[0], 334 | nuclei_seed[1], 335 | nuclei_seed[2]] = i 336 | #print(np.unique(nuclei)) 337 | 338 | self.nuclei = nuclei 339 | 340 | def compute_covariance(self, counts, 341 | celltypes, 342 | find_celltype_props = True): 343 | ''' 344 | Computes and adds the DGE matrices to the object. It should be in celltypes x genes 345 | Also computes cell type proportions 346 | -------------------- 347 | parameters: 348 | counts: count matrix of single cell data cells x genes 349 | celltypes: vector of celltypes for each cell in sc matrix 350 | find_celltype_props: indicating whether or not to use the celltype 351 | proportions from the single cell data (default: True) 352 | 353 | -------------------- 354 | adds: 355 | self.genes: vector of the genes 356 | self.celltypes: vector of the celltypes 357 | self.ct_means: matrix with mean gene expression of each gene for each celltype 358 | self.ct_stds: matrix with stdev of gene expression of each gene for each celltype 359 | self.ct_covs: covariance matrix for each celltype 360 | self.celltype_props: if indicated, adds the celltype proportions from 361 | the single cell data 362 | ''' 363 | self.genes = counts.columns.to_numpy() 364 | self.celltypes = np.unique(celltypes) 365 | 366 | celltype_means = np.zeros((len(np.unique(celltypes)),counts.shape[1])) 367 | celltype_stds = np.zeros((len(np.unique(celltypes)),counts.shape[1])) 368 | cov_mats = np.zeros((len(np.unique(celltypes)),counts.shape[1],counts.shape[1])) 369 | for i,cell in enumerate(np.unique(celltypes)): 370 | loc = np.where(cell == celltypes)[0] 371 | subset_cells = counts.iloc[loc,:] 372 | celltype_means[i,:] = np.mean(subset_cells,axis=0) 373 | celltype_stds[i,:] = np.std(subset_cells,axis=0) 374 | 375 | x = np.cov(subset_cells.T) 376 | min_eig = np.min(np.real(np.linalg.eigvals(x))) 377 | if min_eig < 0: 378 | x -= 100*min_eig * np.eye(*x.shape) 379 | cov_mats[i,:,:] = x 380 | 381 | 382 | 383 | self.ct_means = celltype_means 384 | celltype_stds[celltype_stds == 0] = 1 385 | self.ct_stds = celltype_stds 386 | self.ct_covs = cov_mats 387 | 388 | 389 | if find_celltype_props: 390 | celltype_props = [] 391 | for i, cell in enumerate(self.celltypes): 392 | celltype_locs = np.where(celltypes == cell)[0] 393 | celltype_props.append(len(celltype_locs)/len(celltypes)) 394 | self.celltype_props = np.array(celltype_props) 395 | 396 | def classify_celltypes(self,subtype = False, ct_list = None, st_list = None): 397 | ''' 398 | Adds celltypes to the cells based on the cell type proportions 399 | -------------------- 400 | parameters: 401 | ct_list: list of the celltype annotation for the subt 402 | -------------------- 403 | adds: 404 | self.celltype_props: if there aren't available cell type proportions 405 | sets them uniform 406 | self.classified_celltypes: vector of the cell type for each cell 407 | ''' 408 | if self.celltypes is None: 409 | print('No Celltypes Available') 410 | return 411 | if self.celltype_props is None: 412 | print('No celltype proportions available. Assuming uniform ') 413 | self.celltype_props = [1/len(self.celltypes) for i in range(len(self.celltypes))] 414 | cell_probs = np.tile(self.celltype_props,(len(self.cell_ids),1)) 415 | rand_unif = np.random.rand(len(self.cell_ids)) 416 | class_celltype_index = (cell_probs.cumsum(1) > np.random.rand(len(self.cell_ids))[:,None]).argmax(1) 417 | 418 | self.classified_celltypes = self.celltypes[class_celltype_index] 419 | if self.subtype: 420 | ct_map = {} 421 | for i,ct in enumerate(st_list): 422 | if ct not in ct_map: 423 | ct_map[ct] = ct_list[i] 424 | self.classified_celltypes_lowres = np.array([ct_map[i] for i in self.classified_celltypes]) 425 | 426 | def generate_merfish_dge(self,dge_scaling_factor = 1e1): 427 | ''' 428 | Generates the merfish dge for the cells, based on their cell type and the dge from 429 | single cell data 430 | -------------------- 431 | parameters: 432 | dge_scaling_factor: multiplies the dge matrix by this number 433 | sometimes the single cell computed matrix values are too small due to sparsity 434 | this increases the values for later use in generating spots (default: 1e1) 435 | -------------------- 436 | adds: 437 | merfish_dge: cells x genes dge matrix for the merifsh cells 438 | ''' 439 | merfish_dge = np.zeros((len(self.classified_celltypes), 440 | len(self.genes))) 441 | uniq_celltypes = np.unique(self.classified_celltypes) 442 | for i, cell in enumerate(self.cell_ids): 443 | sum_of_count = 0 444 | cellt = self.classified_celltypes[i] 445 | cellt_ind = np.where(uniq_celltypes == cellt)[0][0] 446 | 447 | counts = np.random.multivariate_normal(self.ct_means[cellt_ind,:], 448 | self.ct_covs[cellt_ind,:,:]) 449 | counts *= self.ct_stds[cellt_ind,:] 450 | counts += self.ct_means[cellt_ind,:] 451 | merfish_dge[i,:] = counts 452 | 453 | merfish_dge *= dge_scaling_factor 454 | merfish_dge[merfish_dge < 0] = 0 455 | merfish_dge = np.round(merfish_dge) 456 | merfish_dge = pd.DataFrame(merfish_dge) 457 | merfish_dge.columns = self.genes 458 | merfish_dge.index = self.cell_ids 459 | self.merfish_dge = merfish_dge 460 | #self.merfish_dge = np.round(self.merfish_dge) 461 | #self.merfish_dge[self.merfish_dge <= 0] = 0 462 | 463 | #dist_from_nuc_scale 0 is uniform dist, 1 is all right next to nuc 464 | def place_transcripts(self, dist_from_nuc_scale = 0): 465 | ''' 466 | Places the spots within the cell 467 | -------------------- 468 | parameters: 469 | dist_from_nuc_scale: indicates the uniformity of spots within a cell, 470 | 0 is uniform distribution, as it increases, the spots cluster around the nucleus 471 | -------------------- 472 | adds: 473 | self.spots: spot calls matrix cells x 4 (gene, x, y, z) 474 | self.nuc_df: nuclei df matrix cells x 4 (id, x, y, z) 475 | ''' 476 | t_map = self.true_map 477 | nuc_map = self.nuclei 478 | 479 | if dist_from_nuc_scale <= 0: 480 | dist_from_nuc_scale = 1e-3 481 | 482 | n_exp_pdf = len(self.cell_ids)*10 483 | 484 | #psuedo exponential distribution 485 | exp_pdf = np.exp(dist_from_nuc_scale*np.random.rand(n_exp_pdf)) 486 | 487 | exp_pdf = np.sort(exp_pdf) 488 | exp_pdf /= np.sum(exp_pdf) 489 | 490 | exp_cdf = np.array(np.cumsum(exp_pdf))[::-1] 491 | 492 | n_nuc_pix = np.sum(nuc_map != -1) 493 | nuc_df = np.zeros((n_nuc_pix, 4)) 494 | 495 | #spots = pd.DataFrame(columns = ['gene','x','y','z']) 496 | 497 | spots = np.zeros((int(np.sum(self.merfish_dge.to_numpy().ravel())),3), 498 | dtype=np.uint32) 499 | 500 | gene_vec = [] 501 | spots_mat_iter = 0 502 | nuc_iter = 0 503 | tot_spot = 0 504 | for i in np.unique(nuc_map.ravel()): 505 | if i != -1: 506 | nuc_loc = np.array(np.where(nuc_map == i)).T 507 | nuc_df[nuc_iter:nuc_loc.shape[0]+nuc_iter,0] = i 508 | nuc_df[nuc_iter:nuc_loc.shape[0]+nuc_iter,1:] = nuc_loc 509 | nuc_iter += nuc_loc.shape[0] 510 | 511 | whole_cell_loc = np.array(np.where(t_map == i)).T 512 | 513 | nuc_mid = [np.mean(nuc_loc,axis=0)] 514 | clf_cell = KNeighborsClassifier(n_neighbors=whole_cell_loc.shape[0]).fit(whole_cell_loc, 515 | np.arange(whole_cell_loc.shape[0])) 516 | cell_count = self.merfish_dge.loc[i,:] 517 | indices = clf_cell.kneighbors(nuc_mid)[1][0] 518 | n_counts = int(np.sum(cell_count)) 519 | n_locs = whole_cell_loc.shape[0] 520 | 521 | random_count_locs = exp_cdf[np.array(np.round(np.random.rand(n_counts)*(n_exp_pdf-1)),dtype=int)] 522 | random_count_locs *= n_locs-1 523 | random_count_locs = np.array(np.round(random_count_locs),dtype=int) 524 | 525 | spot_ind = indices[random_count_locs] 526 | spot_locs = whole_cell_loc[spot_ind,:] 527 | 528 | non_zero_genes = np.where(cell_count != 0)[0] 529 | spot_iter = 0 530 | for j in non_zero_genes: 531 | gene = self.genes[j] 532 | n_spots = int(cell_count[j]) 533 | tot_spot += n_spots 534 | for k in range(n_spots): 535 | pix_loc = spot_locs[spot_iter,:] 536 | row = [pix_loc[0], pix_loc[1], pix_loc[2]] 537 | gene_vec.append(gene) 538 | spots[spots_mat_iter,:] = row 539 | spot_iter += 1 540 | spots_mat_iter += 1 541 | 542 | spots = spots[0:len(gene_vec),:] 543 | spots = pd.DataFrame(spots) 544 | spots.columns = ['x','y','z'] 545 | gene_vec = np.array(gene_vec) 546 | spots['gene'] = gene_vec 547 | 548 | nuc_df = pd.DataFrame(nuc_df) 549 | nuc_df.columns = ['id','x','y','z'] 550 | 551 | self.spots = spots 552 | self.nuc_df = nuc_df 553 | 554 | def place_transcripts_at_corners(self): 555 | ''' 556 | Places a spot from a random gene at each corner to make sure 557 | The pixel tensor size ends up the same as the simulated 558 | ''' 559 | y_max, x_max, z_max = self.grid_shape 560 | y_max *= self.pix_per_micron 561 | x_max *= self.pix_per_micron 562 | z_max *= self.pix_per_micron 563 | 564 | y_min = 0; x_min = 0; z_min = 0; 565 | for x in [x_min, x_max-1]: 566 | for y in [y_min, y_max-1]: 567 | for z in [z_min, z_max-1]: 568 | rand_gene = self.genes[int(np.random.rand(1)*len(self.genes))] 569 | self.spots.loc[self.spots.shape[0],:] = [x, y, z, rand_gene] 570 | 571 | def add_noise(self, avg_spots_per_cell, percent_empty_to_use): 572 | ''' 573 | Adds noise to empty space 574 | -------------------- 575 | parameters: 576 | avg_spots_per_cell: the average number of extra spots in each empty cell chosen 577 | percent_empty_to_use: percentage of the empty squares to put noise in 578 | ''' 579 | empty_slots = np.where(mdg.true_map) 580 | nempty = len(empty_slots[0]) 581 | noise_slots = np.random.choice(np.arange(nempty), 582 | size = int(nempty * percent_empty_to_use)) 583 | num_spots = np.round(np.random.rand(len(noise_slots)) * 2 * avg_spots_per_cell) 584 | num_genes_to_add = int(np.sum(num_spots)) 585 | rand_genes = self.genes[(np.random.rand(num_genes_to_add)*len(self.genes)).astype(int)] 586 | noise_slots = noise_slots[num_spots != 0] 587 | num_spots = num_spots[num_spots != 0] 588 | new_spots = np.zeros(((self.spots.shape[0]+len(rand_genes)),4)) 589 | new_spots[0:self.spots.shape[0],0:3] = self.spots.loc[:,['x','y','z']] 590 | new_genes_counter = self.spots.shape[0] 591 | 592 | for i in range(len(num_spots)): 593 | x = empty_slots[0][noise_slots[i]] 594 | y = empty_slots[1][noise_slots[i]] 595 | z = empty_slots[2][noise_slots[i]] 596 | for j in np.arange(num_spots[i]): 597 | new_spots[new_genes_counter,0:3] = [x,y,z] 598 | new_genes_counter += 1 599 | 600 | new_spots = pd.DataFrame(new_spots) 601 | new_spots.columns = self.spots.columns 602 | new_spots.gene = np.concatenate((self.spots.gene.to_numpy(), 603 | rand_genes)) 604 | self.spots = new_spots 605 | 606 | def merge_cells(self, n_iter = 1): 607 | ''' 608 | Merges cells to create non-circular shapes 609 | -------------------- 610 | parameters: 611 | n_iter: number of times to merge cells. (Default: 1) 612 | ''' 613 | for n in range(n_iter): 614 | coords = [] 615 | ids = [] 616 | for i in range(self.true_map.shape[0]): 617 | for j in range(self.true_map.shape[1]): 618 | for k in range(self.true_map.shape[2]): 619 | ids.append(self.true_map[i,j,k]) 620 | coords.append([i,j,k]) 621 | coords = np.array(coords) 622 | ids = np.array(ids) 623 | clf_nei_cells = KNeighborsClassifier(27).fit(coords,ids) 624 | pred = ids[clf_nei_cells.kneighbors(coords)[1]] 625 | combine_map = {} 626 | for i in range(pred.shape[0]): 627 | if pred[i,0] != -1: 628 | same_locs = np.where((pred[i,1:] != pred[i,0])& 629 | (pred[i,1:] != -1))[0] 630 | 631 | if len(same_locs) > 5: 632 | #plus one because the where was run on pred[i,1:] 633 | combine_map[pred[i,0]] = pred[i,(same_locs[0]+1)] 634 | flipped_before = [] 635 | for flip in combine_map: 636 | if ((flip not in flipped_before) & 637 | (combine_map[flip] not in flipped_before)): 638 | self.true_map[self.true_map == flip] = combine_map[flip] 639 | self.cell_ids[self.cell_ids == flip] = combine_map[flip] 640 | flipped_before.append(flip) 641 | flipped_before.append(combine_map[flip]) 642 | self.cell_ids = np.unique(self.cell_ids) 643 | 644 | 645 | -------------------------------------------------------------------------------- /CoreFunctions/environment.yml: -------------------------------------------------------------------------------- 1 | name: jsta 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - anaconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - _tflow_select=2.3.0=mkl 11 | - absl-py=1.0.0=pyhd8ed1ab_0 12 | - aiohttp=3.8.1=py38h497a2fe_0 13 | - aiosignal=1.2.0=pyhd8ed1ab_0 14 | - alsa-lib=1.2.3=h516909a_0 15 | - aom=3.3.0=h27087fc_1 16 | - argon2-cffi=21.3.0=pyhd8ed1ab_0 17 | - argon2-cffi-bindings=21.2.0=py38h497a2fe_1 18 | - arpack=3.7.0=hdefa2d7_2 19 | - astor=0.8.1=pyh9f0ad1d_0 20 | - asttokens=2.0.5=pyhd8ed1ab_0 21 | - astunparse=1.6.3=pyhd8ed1ab_0 22 | - async-timeout=4.0.2=pyhd8ed1ab_0 23 | - attrs=21.4.0=pyhd8ed1ab_0 24 | - backcall=0.2.0=pyh9f0ad1d_0 25 | - backports=1.0=py_2 26 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 27 | - black=22.1.0=pyhd8ed1ab_0 28 | - bleach=4.1.0=pyhd8ed1ab_0 29 | - blinker=1.4=py_1 30 | - blosc=1.21.0=h9c3ff4c_0 31 | - brotli=1.0.9=h7f98852_6 32 | - brotli-bin=1.0.9=h7f98852_6 33 | - brotlipy=0.7.0=py38h497a2fe_1003 34 | - brunsli=0.1=h9c3ff4c_0 35 | - bzip2=1.0.8=h7f98852_4 36 | - c-ares=1.18.1=h7f98852_0 37 | - c-blosc2=2.0.4=h5f21a17_1 38 | - ca-certificates=2021.10.8=ha878542_0 39 | - cachetools=5.0.0=pyhd8ed1ab_0 40 | - certifi=2021.10.8=py38h578d9bd_1 41 | - cffi=1.15.0=py38h3931269_0 42 | - cfitsio=4.0.0=h9a35b8e_0 43 | - charls=2.3.4=h9c3ff4c_0 44 | - charset-normalizer=2.0.12=pyhd8ed1ab_0 45 | - click=8.0.4=py38h578d9bd_0 46 | - cloudpickle=2.0.0=pyhd8ed1ab_0 47 | - cryptography=36.0.1=py38h3e25421_0 48 | - cycler=0.11.0=pyhd8ed1ab_0 49 | - cytoolz=0.11.2=py38h497a2fe_1 50 | - dask-core=2022.2.0=pyhd8ed1ab_0 51 | - dataclasses=0.8=pyhc8e2a94_3 52 | - dbus=1.13.6=h5008d03_3 53 | - debugpy=1.5.1=py38h709712a_0 54 | - decorator=5.1.1=pyhd8ed1ab_0 55 | - defusedxml=0.7.1=pyhd8ed1ab_0 56 | - entrypoints=0.4=pyhd8ed1ab_0 57 | - executing=0.8.2=pyhd8ed1ab_0 58 | - expat=2.4.6=h27087fc_0 59 | - flit-core=3.7.1=pyhd8ed1ab_0 60 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 61 | - font-ttf-inconsolata=3.000=h77eed37_0 62 | - font-ttf-source-code-pro=2.038=h77eed37_0 63 | - font-ttf-ubuntu=0.83=hab24e00_0 64 | - fontconfig=2.13.96=ha180cfb_0 65 | - fonts-conda-ecosystem=1=0 66 | - fonts-conda-forge=1=0 67 | - fonttools=4.29.1=py38h497a2fe_0 68 | - freetype=2.10.4=h0708190_1 69 | - frozenlist=1.3.0=py38h497a2fe_0 70 | - fsspec=2022.2.0=pyhd8ed1ab_0 71 | - gast=0.3.3=py_0 72 | - gettext=0.19.8.1=h73d1719_1008 73 | - giflib=5.2.1=h36c2ea0_2 74 | - glpk=4.65=h9202a9a_1004 75 | - gmp=6.2.1=h58526e2_0 76 | - google-auth=2.6.0=pyh6c4a22f_1 77 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 78 | - google-pasta=0.2.0=pyh8c360ce_0 79 | - grpcio=1.44.0=py38hdd6454d_0 80 | - gst-plugins-base=1.18.5=hf529b03_3 81 | - gstreamer=1.18.5=h9f60fe5_3 82 | - h5py=2.10.0=nompi_py38h9915d05_106 83 | - hdf5=1.10.6=nompi_h6a2412b_1114 84 | - icu=69.1=h9c3ff4c_0 85 | - idna=3.3=pyhd8ed1ab_0 86 | - igraph=0.9.6=ha184e22_0 87 | - imagecodecs=2022.2.22=py38h58c7917_0 88 | - imageio=2.16.0=pyhcf75d05_0 89 | - importlib-metadata=4.11.1=py38h578d9bd_0 90 | - importlib_resources=5.4.0=pyhd8ed1ab_0 91 | - ipykernel=6.9.1=py38he5a9106_0 92 | - ipython=8.0.1=py38h578d9bd_2 93 | - ipython_genutils=0.2.0=py_1 94 | - jbig=2.1=h7f98852_2003 95 | - jedi=0.18.1=py38h578d9bd_0 96 | - jinja2=3.0.3=pyhd8ed1ab_0 97 | - joblib=1.1.0=pyhd8ed1ab_0 98 | - jpeg=9e=h7f98852_0 99 | - jsonschema=4.4.0=pyhd8ed1ab_0 100 | - jupyter_client=7.1.2=pyhd8ed1ab_0 101 | - jupyter_contrib_core=0.3.3=py_2 102 | - jupyter_contrib_nbextensions=0.5.1=pyhd8ed1ab_2 103 | - jupyter_core=4.9.2=py38h578d9bd_0 104 | - jupyter_highlight_selected_word=0.2.0=py38h578d9bd_1005 105 | - jupyter_latex_envs=1.4.6=pyhd8ed1ab_1002 106 | - jupyter_nbextensions_configurator=0.4.1=py38h578d9bd_2 107 | - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 108 | - jxrlib=1.1=h7f98852_2 109 | - keras-preprocessing=1.1.2=pyhd8ed1ab_0 110 | - kiwisolver=1.3.2=py38h1fd1430_1 111 | - krb5=1.19.2=hcc1bbae_3 112 | - lcms2=2.12=hddcbb42_0 113 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 114 | - leidenalg=0.8.9=py38hfa26641_0 115 | - lerc=3.0=h9c3ff4c_0 116 | - libaec=1.0.6=h9c3ff4c_0 117 | - libavif=0.9.3=h166bdaf_1 118 | - libblas=3.9.0=13_linux64_openblas 119 | - libbrotlicommon=1.0.9=h7f98852_6 120 | - libbrotlidec=1.0.9=h7f98852_6 121 | - libbrotlienc=1.0.9=h7f98852_6 122 | - libcblas=3.9.0=13_linux64_openblas 123 | - libclang=13.0.1=default_hc23dcda_0 124 | - libcurl=7.81.0=h2574ce0_0 125 | - libdeflate=1.10=h7f98852_0 126 | - libedit=3.1.20191231=he28a2e2_2 127 | - libev=4.33=h516909a_1 128 | - libevent=2.1.10=h9b69904_4 129 | - libffi=3.4.2=h7f98852_5 130 | - libgcc-ng=11.2.0=h1d223b6_12 131 | - libgfortran-ng=11.2.0=h69a702a_12 132 | - libgfortran5=11.2.0=h5c6108e_12 133 | - libglib=2.70.2=h174f98d_4 134 | - libgomp=11.2.0=h1d223b6_12 135 | - libiconv=1.16=h516909a_0 136 | - liblapack=3.9.0=13_linux64_openblas 137 | - libllvm11=11.1.0=hf817b99_3 138 | - libllvm13=13.0.1=hf817b99_2 139 | - libnghttp2=1.47.0=h727a467_0 140 | - libnsl=2.0.0=h7f98852_0 141 | - libogg=1.3.4=h7f98852_1 142 | - libopenblas=0.3.18=pthreads_h8fe5266_0 143 | - libopus=1.3.1=h7f98852_1 144 | - libpng=1.6.37=h21135ba_2 145 | - libpq=14.2=hd57d9b9_0 146 | - libprotobuf=3.19.4=h780b84a_0 147 | - libsodium=1.0.18=h36c2ea0_1 148 | - libssh2=1.10.0=ha56f1ee_2 149 | - libstdcxx-ng=11.2.0=he4da1e4_12 150 | - libtiff=4.3.0=h542a066_3 151 | - libuuid=2.32.1=h7f98852_1000 152 | - libvorbis=1.3.7=h9c3ff4c_0 153 | - libwebp=1.2.2=h3452ae3_0 154 | - libwebp-base=1.2.2=h7f98852_1 155 | - libxcb=1.13=h7f98852_1004 156 | - libxkbcommon=1.0.3=he3ba5ed_0 157 | - libxml2=2.9.12=h885dcf4_1 158 | - libxslt=1.1.33=h0ef7038_3 159 | - libzlib=1.2.11=h36c2ea0_1013 160 | - libzopfli=1.0.3=h9c3ff4c_0 161 | - llvmlite=0.38.0=py38h4630a5e_0 162 | - locket=0.2.0=py_2 163 | - lxml=4.8.0=py38hf1fe3a4_0 164 | - lz4-c=1.9.3=h9c3ff4c_1 165 | - lzo=2.10=h516909a_1000 166 | - markdown=3.3.6=pyhd8ed1ab_0 167 | - markupsafe=2.1.0=py38h0a891b7_0 168 | - matplotlib=3.5.1=py38h578d9bd_0 169 | - matplotlib-base=3.5.1=py38hf4fb855_0 170 | - matplotlib-inline=0.1.3=pyhd8ed1ab_0 171 | - metis=5.1.0=h58526e2_1006 172 | - mistune=0.8.4=py38h497a2fe_1005 173 | - mock=4.0.3=py38h578d9bd_2 174 | - mpfr=4.1.0=h9202a9a_1 175 | - multidict=6.0.2=py38h497a2fe_0 176 | - munkres=1.0.7=py_1 177 | - mypy_extensions=0.4.3=py38h578d9bd_4 178 | - mysql-common=8.0.28=ha770c72_0 179 | - mysql-libs=8.0.28=hfa10184_0 180 | - nbclient=0.5.11=pyhd8ed1ab_0 181 | - nbconvert=6.4.2=py38h578d9bd_0 182 | - nbformat=5.1.3=pyhd8ed1ab_0 183 | - ncurses=6.3=h9c3ff4c_0 184 | - nest-asyncio=1.5.4=pyhd8ed1ab_0 185 | - networkx=2.6.3=pyhd8ed1ab_1 186 | - nomkl=1.0=h5ca1d4c_0 187 | - notebook=6.4.8=pyha770c72_0 188 | - nspr=4.32=h9c3ff4c_1 189 | - nss=3.74=hb5efdd6_0 190 | - numba=0.55.1=py38h4bf6c61_0 191 | - numexpr=2.8.0=py38h6045d29_101 192 | - numpy=1.21.5=py38h87f13fb_0 193 | - oauthlib=3.2.0=pyhd8ed1ab_0 194 | - openjpeg=2.4.0=hb52868f_1 195 | - openssl=1.1.1l=h7f98852_0 196 | - opt_einsum=3.3.0=pyhd8ed1ab_1 197 | - packaging=21.3=pyhd8ed1ab_0 198 | - pandas=1.4.1=py38h43a58ef_0 199 | - pandoc=2.17.1.1=ha770c72_0 200 | - pandocfilters=1.5.0=pyhd8ed1ab_0 201 | - parso=0.8.3=pyhd8ed1ab_0 202 | - partd=1.2.0=pyhd8ed1ab_0 203 | - pathspec=0.9.0=pyhd8ed1ab_0 204 | - patsy=0.5.2=pyhd8ed1ab_0 205 | - pcre=8.45=h9c3ff4c_0 206 | - pexpect=4.8.0=pyh9f0ad1d_2 207 | - pickleshare=0.7.5=py_1003 208 | - pillow=9.0.1=py38h0ee0e06_2 209 | - pip=22.0.3=pyhd8ed1ab_0 210 | - platformdirs=2.5.1=pyhd8ed1ab_0 211 | - prometheus_client=0.13.1=pyhd8ed1ab_0 212 | - prompt-toolkit=3.0.27=pyha770c72_0 213 | - protobuf=3.19.4=py38h709712a_0 214 | - pthread-stubs=0.4=h36c2ea0_1001 215 | - ptyprocess=0.7.0=pyhd3deb0d_0 216 | - pure_eval=0.2.2=pyhd8ed1ab_0 217 | - pyasn1=0.4.8=py_0 218 | - pyasn1-modules=0.2.7=py_0 219 | - pycparser=2.21=pyhd8ed1ab_0 220 | - pygments=2.11.2=pyhd8ed1ab_0 221 | - pyjwt=2.3.0=pyhd8ed1ab_1 222 | - pyopenssl=22.0.0=pyhd8ed1ab_0 223 | - pyparsing=3.0.7=pyhd8ed1ab_0 224 | - pyqt=5.12.3=py38h578d9bd_8 225 | - pyqt-impl=5.12.3=py38h0ffb2e6_8 226 | - pyqt5-sip=4.19.18=py38h709712a_8 227 | - pyqtchart=5.12=py38h7400c14_8 228 | - pyqtwebengine=5.12.1=py38h7400c14_8 229 | - pyrsistent=0.18.1=py38h497a2fe_0 230 | - pysocks=1.7.1=py38h578d9bd_4 231 | - pytables=3.6.1=py38hc386592_3 232 | - python=3.8.12=ha38a3c6_3_cpython 233 | - python-dateutil=2.8.2=pyhd8ed1ab_0 234 | - python-igraph=0.9.9=py38h2af5540_0 235 | - python_abi=3.8=2_cp38 236 | - pytz=2021.3=pyhd8ed1ab_0 237 | - pyu2f=0.1.5=pyhd8ed1ab_0 238 | - pywavelets=1.2.0=py38h6c62de6_1 239 | - pyyaml=6.0=py38h497a2fe_3 240 | - pyzmq=22.3.0=py38h2035c66_1 241 | - qt=5.12.9=ha98a1a1_5 242 | - readline=8.1=h46c0cb4_0 243 | - requests=2.27.1=pyhd8ed1ab_0 244 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 245 | - rsa=4.8=pyhd8ed1ab_0 246 | - scikit-image=0.19.2=py38h43a58ef_0 247 | - scikit-learn=1.0.2=py38h1561384_0 248 | - scipy=1.8.0=py38h56a6a73_1 249 | - seaborn=0.11.2=hd8ed1ab_0 250 | - seaborn-base=0.11.2=pyhd8ed1ab_0 251 | - send2trash=1.8.0=pyhd8ed1ab_0 252 | - setuptools=60.9.3=py38h578d9bd_0 253 | - six=1.16.0=pyh6c4a22f_0 254 | - snappy=1.1.8=he1b5a44_3 255 | - sqlite=3.37.0=h9cd32fc_0 256 | - stack_data=0.2.0=pyhd8ed1ab_0 257 | - statsmodels=0.13.2=py38h6c62de6_0 258 | - suitesparse=5.10.1=h9e50725_1 259 | - tbb=2021.5.0=h4bd325d_0 260 | - tensorboard=2.8.0=pyhd8ed1ab_1 261 | - tensorboard-data-server=0.6.0=py38h3e25421_1 262 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 263 | - tensorflow=2.2.0=mkl_py38h6d3daf0_0 264 | - tensorflow-base=2.2.0=mkl_py38h5059a2d_0 265 | - tensorflow-estimator=2.6.0=py38h709712a_0 266 | - termcolor=1.1.0=py_2 267 | - terminado=0.13.1=py38h578d9bd_0 268 | - testpath=0.6.0=pyhd8ed1ab_0 269 | - texttable=1.6.4=pyhd8ed1ab_0 270 | - threadpoolctl=3.1.0=pyh8a188c0_0 271 | - tifffile=2022.2.9=pyhd8ed1ab_0 272 | - tk=8.6.12=h27826a3_0 273 | - tomli=2.0.1=pyhd8ed1ab_0 274 | - toolz=0.11.2=pyhd8ed1ab_0 275 | - tornado=6.1=py38h497a2fe_2 276 | - traitlets=5.1.1=pyhd8ed1ab_0 277 | - typed-ast=1.5.2=py38h497a2fe_0 278 | - typing-extensions=4.1.1=hd8ed1ab_0 279 | - typing_extensions=4.1.1=pyha770c72_0 280 | - unicodedata2=14.0.0=py38h497a2fe_0 281 | - urllib3=1.26.8=pyhd8ed1ab_1 282 | - wcwidth=0.2.5=pyh9f0ad1d_2 283 | - webencodings=0.5.1=py_1 284 | - werkzeug=2.0.3=pyhd8ed1ab_1 285 | - wheel=0.37.1=pyhd8ed1ab_0 286 | - wrapt=1.13.3=py38h497a2fe_1 287 | - xorg-libxau=1.0.9=h7f98852_0 288 | - xorg-libxdmcp=1.1.3=h7f98852_0 289 | - xz=5.2.5=h516909a_1 290 | - yaml=0.2.5=h7f98852_2 291 | - yarl=1.7.2=py38h497a2fe_1 292 | - zeromq=4.3.4=h9c3ff4c_1 293 | - zfp=0.5.5=h9c3ff4c_8 294 | - zipp=3.7.0=pyhd8ed1ab_1 295 | - zlib=1.2.11=h36c2ea0_1013 296 | - zstd=1.5.2=ha95c52a_0 297 | -------------------------------------------------------------------------------- /CoreFunctions/get_distances.c: -------------------------------------------------------------------------------- 1 | //get_distances.c 2 | #include 3 | 4 | void get_distances(const double **surroundings, const double **nuc_assign, const double **dists,const int num_pix, const int num_nuc, double **dist_mat){ 5 | 6 | size_t i, j,k; 7 | double surr; 8 | double nuc_dist; 9 | 10 | //iterate through all pixels 11 | for(i=0;i 3 | 4 | int get_raveled_index(int hei, int wid, int dep, int i_in, int j_in, int k_in){ 5 | return i_in*wid*dep+j_in*dep+k_in; 6 | } 7 | 8 | void get_sur(const int *surroundings, int *surr_count, int *same_count, int height, int width, int depth){ 9 | int i, j, k; 10 | int l, m, n; 11 | int current_pix; 12 | int surr_pix; 13 | int sur_ind, ind, real_ind; 14 | 15 | for(i=1;i<=height;i++){ 16 | for(j=1;j<=width;j++){ 17 | for(k=1;k<=depth;k++){ 18 | ind = get_raveled_index(height+2, width+2, depth+2, i, j, k); 19 | real_ind = get_raveled_index(height, width, depth, i-1, j-1, k-1); 20 | current_pix = surroundings[ind]; 21 | for(l=-1;l<2;l++){ 22 | for(m=-1;m<2;m++){ 23 | for(n=-1;n<2;n++){ 24 | sur_ind = get_raveled_index(height+2, width+2, depth+2, i+l, j+m, k+n); 25 | surr_pix = surroundings[sur_ind]; 26 | if (current_pix == surr_pix){ 27 | same_count[real_ind]++; 28 | }else{ 29 | if (surr_pix != -2) 30 | surr_count[real_ind]++; 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /CoreFunctions/requirements.txt: -------------------------------------------------------------------------------- 1 | python=3.6.10 2 | tensorflow 3 | scikit-learn 4 | scikit-image 5 | numpy 6 | notebook 7 | matplotlib 8 | pandas 9 | seaborn 10 | scikit-learn 11 | statsmodels 12 | numba 13 | pytables 14 | python-igraph 15 | leidenalg 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JSTA: joint cell segmentation and cell type annotation for spatial transcriptomics 2 |

3 | 4 |

5 | Initially, watershed based segmentation is performed and a cell level type classifier, parameterized by a deep neural network (DNN), is trained based on the NCTT data. The cell level classifier then assigns cell (sub)types (red and blue in this cartoon example). Based on the current assignment of pixels to cell (sub)types, a new DNN is trained to estimate the probabilities that each pixel comes from each of the possible (sub)types given the local RNA density at each pixel. In this example, two pixels that were initially assigned to the “red” cells got higher probability to be of a blue type. Since the neighbor cell is of type “blue” they were reassigned to that cell during segmentation update. Using the updated segmentation and the cell type classifier cell types are reassigned. The tasks of training, segmentation, and classification are repeated over many iterations until convergence. See the full manuscript here: https://doi.org/10.15252/msb.202010108 6 | 7 | ## Download and Install: 8 | ### In terminal: 9 | ```git clone https://github.com/wollmanlab/JSTA.git``` 10 | ### Install python dependencies: 11 | With pip: 12 | ``` pip install -r CoreFunctions/requirements.txt ``` 13 | With conda: 14 | ```conda env create -f CoreFunctions/environment.yml ``` 15 | or 16 | ```conda install --file CoreFunctions/requirements.txt``` 17 | ### Compile c files, and add current path to functions: 18 | ```./install.sh``` 19 | 20 | ## Tutorials: 21 | ### tutorials/SimulatingData.ipynb 22 | Simulate spatial transcriptomics data from a reference dataset: 23 | Files needed: 24 | - scRNAseq Reference: 25 | - cells x genes matrix 26 | - Reference celltypes: 27 | - cell type vector 28 |

29 | 30 |

31 | Representative synthetic dataset of nuclei (black) and mRNAs, where each color represents a different gene (left). Ground truth boundaries of the cells. Each color represents a different cell (right). 32 | 33 | ### tutorials/RunningJSTA.ipynb 34 | Run our quick implementation of density estimation, and segmentation with JSTA! 35 | Files needed: 36 | - mRNA spots: 37 | - spots x 4 matrix 38 | - Columns: gene name, x, y, z 39 | - Rows: Each mRNA spot 40 | - nuclei: 41 | - pixels x 4 matrix; 42 | - Columns: cell id, x, y, z 43 | - Rows: Each pixel of nucleus 44 | - scRNAseq Reference: 45 | - cells x genes matrix 46 | - Reference celltypes: 47 | - cell type vector 48 |

49 | 50 |

51 | High resolution cell type map of 133 cell (sub)types. Colors match those defined by Neocortical Cell Type Taxonomy. Scale bar is 500 microns. 52 | 53 | ### tutorials/FindSpatialDEGs.ipynb 54 | Run our approach for finding spDEGs in your spatial data. 55 | -------------------------------------------------------------------------------- /images/JSTAOverview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/images/JSTAOverview.png -------------------------------------------------------------------------------- /images/SegmentedHippocampus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/images/SegmentedHippocampus.png -------------------------------------------------------------------------------- /images/SimulatedData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/images/SimulatedData.png -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | cd CoreFunctions 2 | 3 | #compile c functions 4 | echo "gcc -g -fPIC -shared -o get_distances.so get_distances.c" 5 | gcc -g -fPIC -shared -o get_distances.so get_distances.c 6 | echo "gcc -g -fPIC -shared -o get_number_similar_surroundings.so get_number_similar_surroundings.c" 7 | gcc -g -fPIC -shared -o get_number_similar_surroundings.so get_number_similar_surroundings.c 8 | 9 | #add c functions to python path 10 | sed -i -e 's?#REPLACE-WITH-PATH?'\"`pwd`\"'?' JSTA.py 11 | -------------------------------------------------------------------------------- /ref_data/celltypes.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/ref_data/celltypes.txt.gz -------------------------------------------------------------------------------- /ref_data/sc_ref.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/ref_data/sc_ref.csv.gz -------------------------------------------------------------------------------- /simulated_example/celltypes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/simulated_example/celltypes.npy -------------------------------------------------------------------------------- /simulated_example/nuclei.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/simulated_example/nuclei.npy -------------------------------------------------------------------------------- /simulated_example/spots.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/simulated_example/spots.npy -------------------------------------------------------------------------------- /simulated_example/true_map.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/simulated_example/true_map.npy -------------------------------------------------------------------------------- /tutorials/FindSpatialDEGs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "4bd18016", 7 | "metadata": { 8 | "ExecuteTime": { 9 | "end_time": "2021-10-08T16:22:16.071031Z", 10 | "start_time": "2021-10-08T16:22:16.068969Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "# Set number of threads to use\n", 16 | "import os\n", 17 | "nthreads = 8\n", 18 | "os.environ[\"MKL_NUM_THREADS\"] = str(nthreads)\n", 19 | "os.environ[\"NUMEXPR_NUM_THREADS\"] = str(nthreads)\n", 20 | "os.environ[\"OMP_NUM_THREADS\"] = str(nthreads)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "5645fb72", 27 | "metadata": { 28 | "ExecuteTime": { 29 | "end_time": "2021-10-08T16:22:16.074257Z", 30 | "start_time": "2021-10-08T16:22:16.071854Z" 31 | } 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import sys\n", 36 | "import pickle as pkl" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "570d907e", 43 | "metadata": { 44 | "ExecuteTime": { 45 | "end_time": "2021-10-08T16:22:16.077185Z", 46 | "start_time": "2021-10-08T16:22:16.075003Z" 47 | } 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "sys.path.insert(1, '../CoreFunctions/')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "9baf0ea8", 58 | "metadata": { 59 | "ExecuteTime": { 60 | "end_time": "2021-10-08T16:22:16.490575Z", 61 | "start_time": "2021-10-08T16:22:16.077814Z" 62 | } 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "from FindSpatialDEGs import *" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "3d3e03f7", 73 | "metadata": { 74 | "ExecuteTime": { 75 | "end_time": "2021-10-08T16:22:16.576448Z", 76 | "start_time": "2021-10-08T16:22:16.491461Z" 77 | } 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# DGE matrix for cells (cells x genes)\n", 82 | "cells_mat = pd.read_csv('data_for_spatial/cells_mat.txt.gz',\n", 83 | " index_col=0)\n", 84 | "# location of cell centers [x, y, z]\n", 85 | "with open('data_for_spatial/cell_centers.npy','rb') as f:\n", 86 | " cell_cent = np.load(f)\n", 87 | "# vector of cell types\n", 88 | "with open('data_for_spatial/celltypes.pkl','rb') as f:\n", 89 | " celltypes = pkl.load(f)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "b5b168dc", 96 | "metadata": { 97 | "ExecuteTime": { 98 | "end_time": "2021-10-08T16:22:16.578911Z", 99 | "start_time": "2021-10-08T16:22:16.577359Z" 100 | } 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "genes = cells_mat.columns" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "fc3df2d8", 110 | "metadata": {}, 111 | "source": [ 112 | "## Example for one cell type" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "71b7af2c", 119 | "metadata": { 120 | "ExecuteTime": { 121 | "end_time": "2021-10-08T16:22:18.649758Z", 122 | "start_time": "2021-10-08T16:22:16.580152Z" 123 | } 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "celltype_of_interest = 'CA1sp_2'\n", 128 | "nneighbors = 10\n", 129 | "npermutations = 1000\n", 130 | "ps_vec = get_spatial_pval(cells_mat, celltypes, cell_cent,\n", 131 | " celltype_of_interest, nneighbors, npermutations)\n", 132 | "ps_vec_with_gene = [[i[0], celltype_of_interest, genes[i[1]]] for i in ps_vec]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "01942fbf", 138 | "metadata": { 139 | "ExecuteTime": { 140 | "end_time": "2021-10-08T15:59:02.156393Z", 141 | "start_time": "2021-10-08T15:59:02.152447Z" 142 | } 143 | }, 144 | "source": [ 145 | "## Example for all cell types" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "73e22340", 152 | "metadata": { 153 | "ExecuteTime": { 154 | "end_time": "2021-10-08T16:25:15.857211Z", 155 | "start_time": "2021-10-08T16:22:18.650894Z" 156 | } 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "ps_vec_with_gene_all_celltypes = []\n", 161 | "for celltype_of_interest in np.unique(celltypes):\n", 162 | " print(celltype_of_interest)\n", 163 | " ps_vec = get_spatial_pval(cells_mat, celltypes, cell_cent,\n", 164 | " celltype_of_interest, nneighbors, npermutations)\n", 165 | " if ps_vec is not None:\n", 166 | " for i in ps_vec:\n", 167 | " ps_vec_with_gene_all_celltypes.append([i[0], celltype_of_interest, genes[i[1]]])\n", 168 | " " 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "hide_input": false, 174 | "kernelspec": { 175 | "display_name": "Python 3", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.6.13" 190 | }, 191 | "toc": { 192 | "base_numbering": 1, 193 | "nav_menu": {}, 194 | "number_sections": true, 195 | "sideBar": true, 196 | "skip_h1_title": false, 197 | "title_cell": "Table of Contents", 198 | "title_sidebar": "Contents", 199 | "toc_cell": false, 200 | "toc_position": {}, 201 | "toc_section_display": true, 202 | "toc_window_display": false 203 | }, 204 | "varInspector": { 205 | "cols": { 206 | "lenName": 16, 207 | "lenType": 16, 208 | "lenVar": 40 209 | }, 210 | "kernels_config": { 211 | "python": { 212 | "delete_cmd_postfix": "", 213 | "delete_cmd_prefix": "del ", 214 | "library": "var_list.py", 215 | "varRefreshCmd": "print(var_dic_list())" 216 | }, 217 | "r": { 218 | "delete_cmd_postfix": ") ", 219 | "delete_cmd_prefix": "rm(", 220 | "library": "var_list.r", 221 | "varRefreshCmd": "cat(var_dic_list()) " 222 | } 223 | }, 224 | "types_to_exclude": [ 225 | "module", 226 | "function", 227 | "builtin_function_or_method", 228 | "instance", 229 | "_Feature" 230 | ], 231 | "window_display": false 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 5 236 | } 237 | -------------------------------------------------------------------------------- /tutorials/RunningJSTA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Note: This is a toy example on a small simulated data set. These parameters were optimized for real and larger simulated datasets. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2021-02-23T21:02:34.742278Z", 16 | "start_time": "2021-02-23T21:02:34.739083Z" 17 | } 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "# Set number of threads to use\n", 22 | "import os\n", 23 | "nthreads = 8\n", 24 | "os.environ[\"MKL_NUM_THREADS\"] = str(nthreads)\n", 25 | "os.environ[\"NUMEXPR_NUM_THREADS\"] = str(nthreads)\n", 26 | "os.environ[\"OMP_NUM_THREADS\"] = str(nthreads)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "ExecuteTime": { 34 | "end_time": "2021-02-23T21:02:35.247114Z", 35 | "start_time": "2021-02-23T21:02:35.245019Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import sys" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "ExecuteTime": { 48 | "end_time": "2021-02-23T21:02:35.556701Z", 49 | "start_time": "2021-02-23T21:02:35.554358Z" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "sys.path.insert(1, '../CoreFunctions/')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": { 61 | "ExecuteTime": { 62 | "end_time": "2021-02-23T21:02:38.474499Z", 63 | "start_time": "2021-02-23T21:02:35.649821Z" 64 | } 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from JSTA import *" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "# Load the simulated data\n", 76 | "spots: spatial transcriptomics data\n", 77 | "nuclei: coordinates of the nuclei pixels, with id numbers" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2021-02-23T21:02:38.558877Z", 86 | "start_time": "2021-02-23T21:02:38.484092Z" 87 | } 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "pref = '../simulated_example/'\n", 92 | "# load spots\n", 93 | "with open(pref+'spots.npy', 'rb') as f:\n", 94 | " spots = np.load(f, allow_pickle=True)\n", 95 | " spots = pd.DataFrame(spots)\n", 96 | " spots.columns = ['x', 'y', 'z', 'gene']\n", 97 | " spots = spots.loc[:, ['gene', 'x', 'y', 'z']]\n", 98 | "# load nuclei\n", 99 | "with open(pref+'nuclei.npy', 'rb') as f:\n", 100 | " nuclei = np.load(f, allow_pickle=True)\n", 101 | " nuclei = pd.DataFrame(nuclei)\n", 102 | " nuclei.columns = ['id', 'x', 'y', 'z']\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Ensure the nuclei start at 0 and end at the number of nuclei - 1 \n", 110 | "Some nuclei get filtered out during simulation, and the ids are not updated. \n", 111 | "This may also be the case during preprocessing and nuclei segmentation in real data " 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2021-02-23T21:02:42.121785Z", 120 | "start_time": "2021-02-23T21:02:42.003312Z" 121 | } 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "for i, nuc_id in enumerate(np.unique(nuclei.id)):\n", 126 | " nuclei.loc[nuclei.id == nuc_id, 'id'] = i" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "# Read in reference datasets" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": { 140 | "ExecuteTime": { 141 | "end_time": "2021-02-23T21:02:46.423926Z", 142 | "start_time": "2021-02-23T21:02:46.303663Z" 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "pref = '../ref_data/'\n", 148 | "sc_ref = pd.read_csv(pref+'sc_ref.csv.gz', index_col=0)\n", 149 | "sc_celltypes = pd.read_csv(pref+'celltypes.txt.gz',\n", 150 | " header=None).to_numpy().ravel()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "only keep genes in both reference and spatial transcriptomics dataset" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": { 164 | "ExecuteTime": { 165 | "end_time": "2021-02-23T21:02:46.879879Z", 166 | "start_time": "2021-02-23T21:02:46.815780Z" 167 | } 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "all_genes = np.intersect1d(sc_ref.columns, np.unique(spots.gene))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 9, 177 | "metadata": { 178 | "ExecuteTime": { 179 | "end_time": "2021-02-23T21:02:47.268350Z", 180 | "start_time": "2021-02-23T21:02:47.122547Z" 181 | } 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "#remove extra genes from spots and from reference\n", 186 | "sc_ref = sc_ref.loc[:,all_genes]\n", 187 | "spots = spots.loc[np.isin(spots.gene,all_genes),:]" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 10, 193 | "metadata": { 194 | "ExecuteTime": { 195 | "end_time": "2021-02-23T21:02:50.052490Z", 196 | "start_time": "2021-02-23T21:02:49.990239Z" 197 | } 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# cell type map can be used later to go back to\n", 202 | "# cell type names\n", 203 | "cell_type_map = {}\n", 204 | "for i, c in enumerate(np.unique(sc_celltypes)):\n", 205 | " cell_type_map[i] = c\n", 206 | " \n", 207 | " # change cell types to int for training\n", 208 | " sc_celltypes[sc_celltypes == c] = i\n", 209 | "\n", 210 | "# required for tensorflow formatting\n", 211 | "sc_celltypes = np.array(sc_celltypes, dtype=int)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "# Subset the dataset to marker genes (Optional)\n", 219 | "We train a random forest classifier on the whole reference to identify marker genes" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 11, 225 | "metadata": { 226 | "ExecuteTime": { 227 | "end_time": "2021-02-23T21:03:51.946204Z", 228 | "start_time": "2021-02-23T21:03:51.943665Z" 229 | } 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.ensemble import RandomForestClassifier" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 12, 239 | "metadata": { 240 | "ExecuteTime": { 241 | "end_time": "2021-02-23T21:05:07.994584Z", 242 | "start_time": "2021-02-23T21:04:56.868809Z" 243 | } 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "clf_rf = RandomForestClassifier(n_estimators = 1000, n_jobs=nthreads).fit(sc_ref.values, sc_celltypes)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 32, 253 | "metadata": { 254 | "ExecuteTime": { 255 | "end_time": "2021-02-23T21:09:55.480808Z", 256 | "start_time": "2021-02-23T21:09:55.374572Z" 257 | } 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "number_of_genes_to_keep = 82 # For now I selected all genes\n", 262 | "subset_marker_genes = np.sort([i[1] for i in sorted(zip(clf_rf.feature_importances_, sc_ref.columns))][::-1][:number_of_genes_to_keep])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Subset the reference and spots to those genes" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 33, 275 | "metadata": { 276 | "ExecuteTime": { 277 | "end_time": "2021-02-23T21:09:55.920626Z", 278 | "start_time": "2021-02-23T21:09:55.886607Z" 279 | } 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "sc_ref = sc_ref.loc[:,subset_marker_genes]\n", 284 | "spots = spots[np.isin(spots.gene, subset_marker_genes)]" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "# Run density estimation:\n", 292 | "We use a KNN based density estimation to get the local density at each pixel: \n", 293 | " \n", 294 | "$$\\frac{\\textit{num_spots_around}}{\\frac{4}{3}{\\pi}r^{3}}$$ \n", 295 | " \n", 296 | "pixel_length: edge length of each pixel in microns. (Lower is higher resolution) \n", 297 | "num_spots_around: Number of transcripts on which to find the volume" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "ExecuteTime": { 305 | "end_time": "2020-09-17T04:28:28.107770Z", 306 | "start_time": "2020-09-17T04:27:48.895276Z" 307 | } 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "num_spots_around = 5\n", 312 | "pixel_length = 1\n", 313 | "pixels = fast_de_all_spots(spots, pixel_length, num_spots_around)\n", 314 | "locations = get_locations(spots, pixel_length)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Bin the count data into pixels" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "ExecuteTime": { 329 | "end_time": "2020-09-17T04:28:32.054270Z", 330 | "start_time": "2020-09-17T04:28:28.555758Z" 331 | } 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "pix_true = get_real_pixels(spots, pixel_length, all_genes, pixels.shape)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "ExecuteTime": { 343 | "end_time": "2020-09-17T04:28:32.732206Z", 344 | "start_time": "2020-09-17T04:28:32.674048Z" 345 | } 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "# Plot the expression intensity\n", 350 | "plt.imshow(np.log2(np.sum(pixels, axis=(2, 3))+1),\n", 351 | " cmap='inferno')\n", 352 | "plt.axis('off')" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "# Initialize segmentation with watershed on distance transform\n", 360 | "max_dist_to_nuclei: maximum distance from the edge of the nucleus ($\\mu$) for a pixel to be assigned to a specific nucleus. We start conservative with a maximum radius of 3" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "ExecuteTime": { 368 | "end_time": "2020-09-17T04:28:33.808990Z", 369 | "start_time": "2020-09-17T04:28:33.293383Z" 370 | } 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "max_dist_to_nuclei = 3\n", 375 | "cell_assignment = classify_pixels_to_nuclei(\n", 376 | " locations, nuclei, max_dist_to_nuclei)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": { 383 | "ExecuteTime": { 384 | "end_time": "2020-09-17T04:28:34.440878Z", 385 | "start_time": "2020-09-17T04:28:34.349237Z" 386 | } 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "# note colors are randomized so they will change everytime\n", 391 | "plot_segmentation(cell_assignment,\n", 392 | " 'nipy_spectral')" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "# Get the initialized counts matrix" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "ExecuteTime": { 407 | "end_time": "2020-09-17T04:28:35.180060Z", 408 | "start_time": "2020-09-17T04:28:34.989752Z" 409 | } 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "cells_mat = get_matrix_of_cells(pix_true, cell_assignment, nuclei)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# Train cell type classifier" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": { 427 | "ExecuteTime": { 428 | "end_time": "2020-09-17T04:29:18.290112Z", 429 | "start_time": "2020-09-17T04:28:35.910943Z" 430 | } 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "clf_cell = create_celltype_classifier(sc_ref, sc_celltypes,\n", 435 | " nlayers=2, l1_reg=5e-3,\n", 436 | " epochs=20, lrs=[5e-3, 5e-4],\n", 437 | " test_size=0.25)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "ExecuteTime": { 445 | "end_time": "2020-09-17T04:29:32.215800Z", 446 | "start_time": "2020-09-17T04:29:19.109495Z" 447 | } 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "tic = time()\n", 452 | "cell_assign, counts_mat, cell_types = reclassify_squares(pixels, pix_true,\n", 453 | " cells_mat, nuclei,\n", 454 | " cell_assignment,\n", 455 | " sc_ref, sc_celltypes,\n", 456 | " all_genes, locations,\n", 457 | " clf_cell,\n", 458 | " pct_train=0.1, border_other_threshold=5,\n", 459 | " border_same_threshold=2,\n", 460 | " outer_max=3, inner_max=5,\n", 461 | " most_inner_max=5, dist_threshold=2, dist_scaling=5,\n", 462 | " anneal_param=0.05, flip_thresh=0.2,\n", 463 | " nlayer=3, first_epochs=25, second_epochs=15,\n", 464 | " lrs=[1e-3, 1e-4], l1_reg=1e-3)\n", 465 | "\n", 466 | "toc = time()\n", 467 | "print('time for segmentation:', toc-tic)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "ExecuteTime": { 475 | "end_time": "2020-09-17T04:29:33.196337Z", 476 | "start_time": "2020-09-17T04:29:33.102830Z" 477 | } 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "plot_segmentation(cell_assign)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "# Map cell types back to original names" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "ExecuteTime": { 496 | "end_time": "2020-09-17T04:29:33.971458Z", 497 | "start_time": "2020-09-17T04:29:33.969627Z" 498 | } 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "real_celltypes = []\n", 503 | "for i in cell_types:\n", 504 | " real_celltypes.append(cell_type_map[i])\n", 505 | "real_celltypes = np.array(real_celltypes)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | " " 515 | ] 516 | } 517 | ], 518 | "metadata": { 519 | "hide_input": false, 520 | "kernelspec": { 521 | "display_name": "Python 3", 522 | "language": "python", 523 | "name": "python3" 524 | }, 525 | "language_info": { 526 | "codemirror_mode": { 527 | "name": "ipython", 528 | "version": 3 529 | }, 530 | "file_extension": ".py", 531 | "mimetype": "text/x-python", 532 | "name": "python", 533 | "nbconvert_exporter": "python", 534 | "pygments_lexer": "ipython3", 535 | "version": "3.6.13" 536 | }, 537 | "toc": { 538 | "base_numbering": 1, 539 | "nav_menu": {}, 540 | "number_sections": true, 541 | "sideBar": true, 542 | "skip_h1_title": false, 543 | "title_cell": "Table of Contents", 544 | "title_sidebar": "Contents", 545 | "toc_cell": false, 546 | "toc_position": {}, 547 | "toc_section_display": true, 548 | "toc_window_display": false 549 | }, 550 | "varInspector": { 551 | "cols": { 552 | "lenName": 16, 553 | "lenType": 16, 554 | "lenVar": 40 555 | }, 556 | "kernels_config": { 557 | "python": { 558 | "delete_cmd_postfix": "", 559 | "delete_cmd_prefix": "del ", 560 | "library": "var_list.py", 561 | "varRefreshCmd": "print(var_dic_list())" 562 | }, 563 | "r": { 564 | "delete_cmd_postfix": ") ", 565 | "delete_cmd_prefix": "rm(", 566 | "library": "var_list.r", 567 | "varRefreshCmd": "cat(var_dic_list()) " 568 | } 569 | }, 570 | "types_to_exclude": [ 571 | "module", 572 | "function", 573 | "builtin_function_or_method", 574 | "instance", 575 | "_Feature" 576 | ], 577 | "window_display": false 578 | } 579 | }, 580 | "nbformat": 4, 581 | "nbformat_minor": 4 582 | } 583 | -------------------------------------------------------------------------------- /tutorials/SimulatingData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-09-17T05:19:59.688662Z", 9 | "start_time": "2020-09-17T05:19:59.685579Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "#Set number of threads to use\n", 15 | "import os\n", 16 | "nthreads = 4\n", 17 | "os.environ[\"MKL_NUM_THREADS\"] = str(nthreads)\n", 18 | "os.environ[\"NUMEXPR_NUM_THREADS\"] = str(nthreads)\n", 19 | "os.environ[\"OMP_NUM_THREADS\"] = str(nthreads)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": { 26 | "ExecuteTime": { 27 | "end_time": "2020-09-17T16:16:42.832830Z", 28 | "start_time": "2020-09-17T16:16:42.830678Z" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "import sys" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "ExecuteTime": { 41 | "end_time": "2020-09-17T16:16:43.036953Z", 42 | "start_time": "2020-09-17T16:16:43.034448Z" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "sys.path.insert(1,'../CoreFunctions/')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "ExecuteTime": { 55 | "end_time": "2020-09-17T16:16:44.291415Z", 56 | "start_time": "2020-09-17T16:16:43.444433Z" 57 | } 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from MerfishSimulator import *\n", 62 | "from MerfishSimulator import merfish_data_generator as mdg" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Load Reference\n", 70 | "Load in the hippocampus cells (29224) from the Allen Mouse Brain Atlas. We subset this matrix to the marker genes (83) we had for our MERFISH dataset, but this can be performed with any number of genes. We also shrank the values for simulation purposes by taking the square root of the counts and rounding the values." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "metadata": { 77 | "ExecuteTime": { 78 | "end_time": "2020-09-17T16:16:47.251483Z", 79 | "start_time": "2020-09-17T16:16:47.130825Z" 80 | } 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "sc_ref = pd.read_csv('../ref_data/sc_ref.csv.gz',\n", 85 | " index_col=0)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": { 92 | "ExecuteTime": { 93 | "end_time": "2020-09-17T16:16:48.568742Z", 94 | "start_time": "2020-09-17T16:16:48.564572Z" 95 | } 96 | }, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "(29224, 83)" 102 | ] 103 | }, 104 | "execution_count": 7, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "sc_ref.shape" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# Read in Celltypes" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": { 124 | "ExecuteTime": { 125 | "end_time": "2020-09-17T16:16:50.626319Z", 126 | "start_time": "2020-09-17T16:16:50.612554Z" 127 | } 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "celltypes = pd.read_csv('../ref_data/celltypes.txt.gz',\n", 132 | " header = None).to_numpy().ravel()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": { 139 | "ExecuteTime": { 140 | "end_time": "2020-09-17T16:16:51.742896Z", 141 | "start_time": "2020-09-17T16:16:51.739868Z" 142 | } 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "array(['Pvalb_8', 'Vip_9', 'Lamp5_5', ..., 'Vip_10', 'Sst_27',\n", 149 | " 'Lamp5 Lhx6_1'], dtype=object)" 150 | ] 151 | }, 152 | "execution_count": 9, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "celltypes" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Set Parameters: (assume all distance values are in microns)\n", 166 | "low_cell: smallest radius of a cell \n", 167 | " \n", 168 | "high_cell: largest radius of a cell \n", 169 | " \n", 170 | "dst_btw_cell: How much to increase/decrease the distance between cells. Negative numbers increases cell density. Positive numbers decreases cell density. \n", 171 | " \n", 172 | "unif_const: Changes the cell type distribution from that of the single cell data to uniform. Smaller numbers close to 0 give the true distribution. Larger numbers > 10 are uniform.\n", 173 | " \n", 174 | "grid_size: size of the x,y,z map to simulate cells in.\n", 175 | " \n", 176 | "noise_in_dist: adds uneveness to the radius of a cell. Creates rough edges.\n", 177 | " \n", 178 | "pixels_per_micron: Resolution of the sample. Larger numbers increases resolution, smaller decreases resolution. \n", 179 | " \n", 180 | "n_iter_merg: Number of iterations to merge neighboring cells \n", 181 | " \n", 182 | "pix_per_nuc: Number of pixels in each nucleus. Numbers too large throw an error \n", 183 | " \n", 184 | "dge_scaling_factor: Increase or shrink the values in the DGE \n", 185 | " \n", 186 | "dist_from_nuc_scale: Uniformity of the mRNA spot placement. Small numbers increases uniformity, larger numbers localizes mRNA around the nucleus of a cell" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "ExecuteTime": { 194 | "end_time": "2020-09-17T05:20:43.208062Z", 195 | "start_time": "2020-09-17T05:20:43.204515Z" 196 | } 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "low_cell = 7\n", 201 | "high_cell = 9\n", 202 | "dst_btw_cell = -2\n", 203 | "unif_const = 10\n", 204 | "grid_size = (50,50,20)\n", 205 | "noise_in_dist = 1\n", 206 | "pixels_per_micron = 2\n", 207 | "n_iter_merge = 1\n", 208 | "pix_per_nuc = 20\n", 209 | "dge_scaling_factor = 2.5e-1\n", 210 | "dist_from_nuc_scale = 0.1" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "# Initialize the object and the cell map\n", 218 | "Cell map saved in self.true_map" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "ExecuteTime": { 226 | "end_time": "2020-09-17T05:20:44.246109Z", 227 | "start_time": "2020-09-17T05:20:44.229098Z" 228 | } 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "sim = mdg(dist_between_cell_centers = [low_cell,high_cell],\n", 233 | " distance_between_cells=dst_btw_cell,\n", 234 | " celltypes=np.unique(celltypes),\n", 235 | " grid_shape=grid_size)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "ExecuteTime": { 243 | "end_time": "2020-09-17T05:20:49.785600Z", 244 | "start_time": "2020-09-17T05:20:44.740552Z" 245 | } 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "#adds pixels to cells based on voronoi in the distance transform\n", 250 | "sim.assign_pixels_to_cells(noise_in_dist = noise_in_dist, pixels_per_micron = pixels_per_micron)\n", 251 | "#merges cells \n", 252 | "sim.merge_cells(n_iter = n_iter_merge)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "# Plot the cell map" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "ExecuteTime": { 267 | "end_time": "2020-09-17T05:20:49.927160Z", 268 | "start_time": "2020-09-17T05:20:49.797589Z" 269 | } 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "sim.plot_true()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "# Compute the covariance structure of the scRNAseq reference\n" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "ExecuteTime": { 288 | "end_time": "2020-09-17T05:20:50.348632Z", 289 | "start_time": "2020-09-17T05:20:49.938585Z" 290 | } 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "sim.compute_covariance(sc_ref, celltypes)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "# Compute celltype proportions" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "ExecuteTime": { 309 | "end_time": "2020-09-17T05:20:50.361607Z", 310 | "start_time": "2020-09-17T05:20:50.359755Z" 311 | } 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "sim.celltype_props = sim.celltype_props + np.ones_like(sim.celltype_props) * unif_const\n", 316 | "sim.celltype_props /= np.sum(sim.celltype_props)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "# Assign celltypes\n", 324 | "Celltypes stored in self.classified+celltypes" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "ExecuteTime": { 332 | "end_time": "2020-09-17T05:20:50.379092Z", 333 | "start_time": "2020-09-17T05:20:50.377362Z" 334 | } 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "sim.classify_celltypes(ct_list=celltypes)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "# Generate nuclei in cells\n", 346 | "Map of nuclei stored in self.nuclei" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "ExecuteTime": { 354 | "end_time": "2020-09-17T05:20:51.555343Z", 355 | "start_time": "2020-09-17T05:20:50.390195Z" 356 | } 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "sim.generate_nuclei_centers(n_pix_per_nuc=pix_per_nuc)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "# Generate sample digital gene expression matrix\n", 368 | "DGE stored in self.merfish_dge" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "ExecuteTime": { 376 | "end_time": "2020-09-17T05:20:51.769538Z", 377 | "start_time": "2020-09-17T05:20:51.566853Z" 378 | } 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "sim.generate_merfish_dge(dge_scaling_factor = dge_scaling_factor)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "# Place mRNA spots in the cells" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "ExecuteTime": { 397 | "end_time": "2020-09-17T05:20:52.597623Z", 398 | "start_time": "2020-09-17T05:20:51.781360Z" 399 | } 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "sim.place_transcripts(dist_from_nuc_scale=dist_from_nuc_scale)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "# Place a random transcript at each corner.\n", 411 | "This is not neccesary, however later on if you would like to evaluate accuracy of the segmentation map against the true map this is very useful to keep the two aligned." 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": { 418 | "ExecuteTime": { 419 | "end_time": "2020-09-17T05:20:52.677649Z", 420 | "start_time": "2020-09-17T05:20:52.609322Z" 421 | } 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "sim.place_transcripts_at_corners()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "# Save example" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "with open('../simulated_example/spots.npy','wb') as f:\n", 442 | " np.save(f,sim.spots,allow_pickle=True)\n", 443 | "with open('../simulated_example/nuclei.npy','wb') as f:\n", 444 | " np.save(f,sim.nuc_df,allow_pickle=True)\n", 445 | "with open('../simulated_example/true_map.npy','wb') as f:\n", 446 | " np.save(f,sim.true_map,allow_pickle=True)\n", 447 | "with open('../simulated_example/celltypes.npy','wb') as f:\n", 448 | " np.save(f,sim.classified_celltypes,allow_pickle=True)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "ExecuteTime": { 456 | "end_time": "2020-09-17T05:22:34.988799Z", 457 | "start_time": "2020-09-17T05:22:34.922177Z" 458 | } 459 | }, 460 | "outputs": [], 461 | "source": [] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [] 469 | } 470 | ], 471 | "metadata": { 472 | "hide_input": false, 473 | "kernelspec": { 474 | "display_name": "Python 3", 475 | "language": "python", 476 | "name": "python3" 477 | }, 478 | "language_info": { 479 | "codemirror_mode": { 480 | "name": "ipython", 481 | "version": 3 482 | }, 483 | "file_extension": ".py", 484 | "mimetype": "text/x-python", 485 | "name": "python", 486 | "nbconvert_exporter": "python", 487 | "pygments_lexer": "ipython3", 488 | "version": "3.8.3" 489 | }, 490 | "toc": { 491 | "base_numbering": 1, 492 | "nav_menu": {}, 493 | "number_sections": true, 494 | "sideBar": true, 495 | "skip_h1_title": false, 496 | "title_cell": "Table of Contents", 497 | "title_sidebar": "Contents", 498 | "toc_cell": false, 499 | "toc_position": {}, 500 | "toc_section_display": true, 501 | "toc_window_display": false 502 | }, 503 | "varInspector": { 504 | "cols": { 505 | "lenName": 16, 506 | "lenType": 16, 507 | "lenVar": 40 508 | }, 509 | "kernels_config": { 510 | "python": { 511 | "delete_cmd_postfix": "", 512 | "delete_cmd_prefix": "del ", 513 | "library": "var_list.py", 514 | "varRefreshCmd": "print(var_dic_list())" 515 | }, 516 | "r": { 517 | "delete_cmd_postfix": ") ", 518 | "delete_cmd_prefix": "rm(", 519 | "library": "var_list.r", 520 | "varRefreshCmd": "cat(var_dic_list()) " 521 | } 522 | }, 523 | "types_to_exclude": [ 524 | "module", 525 | "function", 526 | "builtin_function_or_method", 527 | "instance", 528 | "_Feature" 529 | ], 530 | "window_display": false 531 | } 532 | }, 533 | "nbformat": 4, 534 | "nbformat_minor": 4 535 | } 536 | -------------------------------------------------------------------------------- /tutorials/data_for_spatial/cell_centers.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/tutorials/data_for_spatial/cell_centers.npy -------------------------------------------------------------------------------- /tutorials/data_for_spatial/cells_mat.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/tutorials/data_for_spatial/cells_mat.txt.gz -------------------------------------------------------------------------------- /tutorials/data_for_spatial/celltypes.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wollmanlab/JSTA/ccce064c3627c46fa2384fd0e0114b4cf627a8d9/tutorials/data_for_spatial/celltypes.pkl --------------------------------------------------------------------------------