├── README.md
├── notebooks
    ├── aging_10x_analysis.ipynb
    ├── aging_MERFISH_gene_selection.ipynb
    ├── aging_analyze_integrated10x.ipynb
    ├── aging_call_final_celltypes_10x.ipynb
    ├── analyze_merfish_celltyping.ipynb
    ├── int_analysis.ipynb
    ├── merfish_LPS_analysis.ipynb
    ├── merfish_integrate_LPS_data.ipynb
    └── merfish_spatial_celltype_org.ipynb
└── python
    ├── de.py
    ├── find_merfish_markers.py
    ├── integration.py
    ├── plotting.py
    ├── spatial_analysis.py
    └── utils.py


/README.md:
--------------------------------------------------------------------------------
1 | # SpatialBrainAgingCell22
2 | Analysis software for brain aging spatial and single-cell transcriptomics.
3 | 


--------------------------------------------------------------------------------
/notebooks/aging_10x_analysis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%load_ext autoreload\n",
 10 |     "%autoreload 2\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "%matplotlib inline\n",
 13 |     "%config InlineBackend.figure_format='retina'\n"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": null,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import numpy as np\n",
 23 |     "import scanpy as sc\n",
 24 |     "import pandas as pd\n",
 25 |     "import anndata as ad\n",
 26 |     "import seaborn as sns"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)\n",
 36 |     "sc.settings.set_figure_params(dpi=80, facecolor='white')\n"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "mouse_colors = plt.cm.colors.ListedColormap(['red', 'darkred', 'blue','darkblue', 'orange', 'darkorange', 'violet', 'darkviolet',])\n",
 46 |     "\n",
 47 |     "samples = [\n",
 48 |     "    \"Hyp_4wk_1_matrix.h5\",\n",
 49 |     "    \"Hyp_4wk_2_matrix.h5\",\n",
 50 |     "    \"Hyp_4wk_3_matrix.h5\",\n",
 51 |     "    \"Hyp_4wk_4_matrix.h5\",\n",
 52 |     "    \"Hyp_90wk_1_matrix.h5\",\n",
 53 |     "    \"Hyp_90wk_2_matrix.h5\",\n",
 54 |     "    \"Hyp_90wk_3_matrix.h5\",\n",
 55 |     "    \"Hyp_90wk_4_matrix.h5\",\n",
 56 |     "    \"PFC_4wk_1_matrix.h5\",\n",
 57 |     "    \"PFC_4wk_2_matrix.h5\",\n",
 58 |     "    \"PFC_4wk_3_matrix.h5\",\n",
 59 |     "    \"PFC_4wk_4_matrix.h5\",\n",
 60 |     "    \"PFC_90wk_1_matrix.h5\",\n",
 61 |     "    \"PFC_90wk_2_matrix.h5\",\n",
 62 |     "    \"PFC_90wk_3_matrix.h5\",\n",
 63 |     "    \"PFC_90wk_4_matrix.h5\"\n",
 64 |     "]\n",
 65 |     "\n",
 66 |     "mouse_id = {\n",
 67 |     "    0 : 1,\n",
 68 |     "    1 : 1,\n",
 69 |     "    2 : 2,\n",
 70 |     "    3 : 2,\n",
 71 |     "    4 : 3,\n",
 72 |     "    5 : 3,\n",
 73 |     "    6 : 4,\n",
 74 |     "    7 : 4,\n",
 75 |     "    8 : 5,\n",
 76 |     "    9 : 5,\n",
 77 |     "    10 : 6,\n",
 78 |     "    11 : 6,\n",
 79 |     "    12 : 7,\n",
 80 |     "    13 : 7,\n",
 81 |     "    14 : 8,\n",
 82 |     "    15 : 8\n",
 83 |     "}"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "all_adata = []\n",
 93 |     "i = 0\n",
 94 |     "for s in samples:\n",
 95 |     "    area, age, idx, _ = s.split(\"_\")\n",
 96 |     "    print(area, age, idx)\n",
 97 |     "    curr_adata = sc.read_10x_h5(f\"/faststorage/brain_aging/aging10x/{s}\")\n",
 98 |     "    curr_adata.var_names_make_unique()\n",
 99 |     "    curr_adata.obs['area'] = area\n",
100 |     "    curr_adata.obs['age'] = age\n",
101 |     "    curr_adata.obs['idx'] = i\n",
102 |     "    i += 1\n",
103 |     "    curr_adata.var['mt'] = curr_adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'\n",
104 |     "    sc.pp.calculate_qc_metrics(curr_adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\n",
105 |     "\n",
106 |     "    all_adata.append(curr_adata)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "total_cells = np.sum([a.n_obs for a in all_adata])\n",
116 |     "print('total cells:', total_cells)"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "adata = ad.concat(all_adata)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "adata[adata.obs.area=='PFC']"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "adata.obs_names_make_unique()"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "sc.pp.filter_cells(adata, min_genes=1000)\n",
153 |     "sc.pp.filter_cells(adata, max_counts=100000)\n",
154 |     "sc.pp.filter_genes(adata, min_cells=3)\n",
155 |     "sc.pp.filter_cells(adata, min_counts=2500)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "adata"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "adata.obs['mouse_id'] = [mouse_id[i] for i in adata.obs.idx]"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "# run scrublet on adata to identify doublets\n",
183 |     "import scrublet as scr\n",
184 |     "scrub = scr.Scrublet(adata.X, expected_doublet_rate=0.09)\n",
185 |     "doublet_scores, predicted_doublets = scrub.scrub_doublets(min_gene_variability_pctl=85, \n",
186 |     "                                                          n_prin_comps=30)\n",
187 |     "\n"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "scrub.plot_histogram();\n"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "np.sum(predicted_doublets)/len(doublet_scores)"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "adata = adata[~predicted_doublets,:]"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "adata"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "sc.pl.highest_expr_genes(adata, n_top=20, )"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "adata.write(\"adata_combined_nodoublet.h5ad\")"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "\n",
251 |     "sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],\n",
252 |     "             jitter=0.4, multi_panel=True,size=0.25)\n"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "print(np.median(adata.obs.n_genes_by_counts))"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": null,
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": [
270 |     "print(np.median(adata.obs.total_counts))"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "adata"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "metadata": {},
286 |    "outputs": [],
287 |    "source": [
288 |     "#adata = adata[adata.obs.n_genes_by_counts < 3000, :]\n",
289 |     "#adata = adata[adata.obs.pct_counts_mt < 5, :]\n"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": null,
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": [
298 |     "sc.pp.normalize_total(adata, target_sum=1e4)\n",
299 |     "\n",
300 |     "sc.pp.log1p(adata)\n",
301 |     "\n",
302 |     "sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\n"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": null,
308 |    "metadata": {},
309 |    "outputs": [],
310 |    "source": [
311 |     "sc.pl.highly_variable_genes(adata)\n"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "adata.raw = adata\n"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": null,
326 |    "metadata": {},
327 |    "outputs": [],
328 |    "source": [
329 |     "adata = adata[:, adata.var.highly_variable]\n"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": null,
335 |    "metadata": {},
336 |    "outputs": [],
337 |    "source": [
338 |     "sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])\n"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": null,
344 |    "metadata": {},
345 |    "outputs": [],
346 |    "source": [
347 |     "sc.pp.scale(adata, max_value=10)\n"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {},
354 |    "outputs": [],
355 |    "source": [
356 |     "sc.tl.pca(adata, svd_solver='arpack')\n"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": null,
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "sc.pl.pca(adata)\n"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "metadata": {},
372 |    "outputs": [],
373 |    "source": [
374 |     "sc.pl.pca_variance_ratio(adata, log=True,n_pcs=50)\n"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": null,
380 |    "metadata": {},
381 |    "outputs": [],
382 |    "source": [
383 |     "sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)\n"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": null,
389 |    "metadata": {},
390 |    "outputs": [],
391 |    "source": [
392 |     "sc.tl.umap(adata)\n"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": null,
398 |    "metadata": {},
399 |    "outputs": [],
400 |    "source": [
401 |     "sc.tl.leiden(adata,resolution=0.2)\n"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": null,
407 |    "metadata": {},
408 |    "outputs": [],
409 |    "source": [
410 |     "sc.pl.pca(adata,color=['leiden','age','idx'],color_map=plt.cm.rainbow)\n"
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "code",
415 |    "execution_count": null,
416 |    "metadata": {},
417 |    "outputs": [],
418 |    "source": [
419 |     "sc.pl.umap(adata, color=['leiden','n_genes','total_counts'],color_map=plt.cm.viridis)"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": null,
425 |    "metadata": {},
426 |    "outputs": [],
427 |    "source": [
428 |     "adata.write(\"adata_combined_nodoublet_normalized.h5ad\")"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "markdown",
433 |    "metadata": {},
434 |    "source": [
435 |     "# 1. Merge clusters into neurons and non-neurons"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": null,
441 |    "metadata": {},
442 |    "outputs": [],
443 |    "source": [
444 |     "gene_ids = adata.raw.var_names\n",
445 |     "ens_idx = np.in1d(gene_ids, 'Snap25')"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "code",
450 |    "execution_count": null,
451 |    "metadata": {},
452 |    "outputs": [],
453 |    "source": [
454 |     "adata.raw.var"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": null,
460 |    "metadata": {},
461 |    "outputs": [],
462 |    "source": [
463 |     "adata.obs['Snap25'] = adata.raw.X[:,ens_idx].mean(1)"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": null,
469 |    "metadata": {},
470 |    "outputs": [],
471 |    "source": [
472 |     "plt.hist(adata.obs.groupby('leiden')['Snap25'].apply(np.mean).to_numpy(),100)\n",
473 |     "plt.axvline(1.25)"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": null,
479 |    "metadata": {},
480 |    "outputs": [],
481 |    "source": [
482 |     "is_cluster_neuronal = (adata.obs.groupby('leiden')['Snap25'].apply(np.mean).to_numpy()>1.25)\n",
483 |     "neuronal_map = dict(zip([str(i) for i in range(len(is_cluster_neuronal))],is_cluster_neuronal))"
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": null,
489 |    "metadata": {},
490 |    "outputs": [],
491 |    "source": [
492 |     "# label each cell as neuronal or non-neuronal based on its leiden cluster\n",
493 |     "adata.obs['neuronal'] = [neuronal_map[i] for i in adata.obs.leiden]\n"
494 |    ]
495 |   },
496 |   {
497 |    "cell_type": "code",
498 |    "execution_count": null,
499 |    "metadata": {},
500 |    "outputs": [],
501 |    "source": [
502 |     "sc.pl.umap(adata,color=['neuronal','Snap25'])"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": null,
508 |    "metadata": {},
509 |    "outputs": [],
510 |    "source": [
511 |     "sc.pl.dotplot(adata, ['Cx3cr1', 'Aldh1l1','Olig1','Cspg4', 'Snap25', 'Gad1', 'Slc17a6', 'Slc17a7'],groupby='leiden')"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": null,
517 |    "metadata": {},
518 |    "outputs": [],
519 |    "source": [
520 |     "# subset to neuronal cells (PFC and Hyp together) and restore the full gene set from .raw\n",
521 |     "adata_neuronal = adata[adata.obs.neuronal].copy()\n",
522 |     "adata_neuronal = adata_neuronal.raw.to_adata()\n"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "metadata": {},
529 |    "outputs": [],
530 |    "source": [
531 |     "adata_neuronal_pfc = adata_neuronal[adata_neuronal.obs.area == 'PFC'].copy()\n",
532 |     "adata_neuronal_hyp = adata_neuronal[adata_neuronal.obs.area == 'Hyp'].copy()\n"
533 |    ]
534 |   },
535 |   {
536 |    "cell_type": "markdown",
537 |    "metadata": {},
538 |    "source": [
539 |     "# 2. Cluster neurons"
540 |    ]
541 |   },
542 |   {
543 |    "cell_type": "markdown",
544 |    "metadata": {},
545 |    "source": [
546 |     "## 2.1 Cluster PFC neurons"
547 |    ]
548 |   },
549 |   {
550 |    "cell_type": "code",
551 |    "execution_count": null,
552 |    "metadata": {},
553 |    "outputs": [],
554 |    "source": [
555 |     "def reprocess_subset(A,res=0.7):\n",
556 |     "    # assumes data have already been normalized/log transformed\n",
557 |     "    print('finding highly variable genes')\n",
558 |     "    sc.pp.highly_variable_genes(A, min_mean=0.0125, max_mean=3, min_disp=0.5)\n",
559 |     "    A.raw = A\n",
560 |     "    A = A[:, A.var.highly_variable]\n",
561 |     "    print('regressing out')\n",
562 |     "    sc.pp.regress_out(A, ['total_counts', 'pct_counts_mt'])\n",
563 |     "    print('scaling')\n",
564 |     "    sc.pp.scale(A, max_value=10)\n",
565 |     "    print('pca')\n",
566 |     "    sc.tl.pca(A, svd_solver='arpack')\n",
567 |     "    print('neighbors')\n",
568 |     "    sc.pp.neighbors(A, n_neighbors=10, n_pcs=50)\n",
569 |     "    print('umap')\n",
570 |     "    sc.tl.umap(A)\n",
571 |     "    print('leiden')\n",
572 |     "    sc.tl.leiden(A,resolution=res)\n",
573 |     "    return A"
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": null,
579 |    "metadata": {},
580 |    "outputs": [],
581 |    "source": [
582 |     "adata_neuronal_pfc = reprocess_subset(adata_neuronal_pfc)"
583 |    ]
584 |   },
585 |   {
586 |    "cell_type": "code",
587 |    "execution_count": null,
588 |    "metadata": {},
589 |    "outputs": [],
590 |    "source": [
591 |     "sc.pl.umap(adata_neuronal_pfc, color=['age'])"
592 |    ]
593 |   },
594 |   {
595 |    "cell_type": "code",
596 |    "execution_count": null,
597 |    "metadata": {},
598 |    "outputs": [],
599 |    "source": [
600 |     "sc.external.pp.bbknn(adata_neuronal_pfc,batch_key='age')\n",
601 |     "sc.tl.leiden(adata_neuronal_pfc,resolution=0.6)\n",
602 |     "sc.tl.umap(adata_neuronal_pfc)"
603 |    ]
604 |   },
605 |   {
606 |    "cell_type": "code",
607 |    "execution_count": null,
608 |    "metadata": {},
609 |    "outputs": [],
610 |    "source": [
611 |     "sc.pl.umap(adata_neuronal_pfc, color=['leiden','age','mouse_id'],color_map=mouse_colors)"
612 |    ]
613 |   },
614 |   {
615 |    "cell_type": "code",
616 |    "execution_count": null,
617 |    "metadata": {},
618 |    "outputs": [],
619 |    "source": [
620 |     "sc.pl.umap(adata_neuronal_pfc, color='age')"
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": null,
626 |    "metadata": {},
627 |    "outputs": [],
628 |    "source": [
629 |     "sc.pl.umap(adata_neuronal_pfc, \n",
630 |     "           color=['Slc17a7','Gad1','Drd1','Drd2','Sst','Vip','Pvalb',\n",
631 |     "                  'Cux1','Tshz2','Cd44','Vegfd','Pld5','Otof','Npr3'],\n",
632 |     "          use_raw=True)"
633 |    ]
634 |   },
635 |   {
636 |    "cell_type": "code",
637 |    "execution_count": null,
638 |    "metadata": {},
639 |    "outputs": [],
640 |    "source": [
641 |     "sc.tl.rank_genes_groups(adata_neuronal_pfc, 'leiden', method='wilcoxon')\n",
642 |     "#sc.pl.rank_genes_groups(adata_neuronal_pfc, n_genes=25, sharey=False)\n"
643 |    ]
644 |   },
645 |   {
646 |    "cell_type": "code",
647 |    "execution_count": null,
648 |    "metadata": {},
649 |    "outputs": [],
650 |    "source": [
651 |     "sc.tl.filter_rank_genes_groups(adata_neuronal_pfc, min_fold_change=1.5)\n",
652 |     "sc.pl.rank_genes_groups_dotplot(adata_neuronal_pfc, key='rank_genes_groups_filtered')"
653 |    ]
654 |   },
655 |   {
656 |    "cell_type": "code",
657 |    "execution_count": null,
658 |    "metadata": {},
659 |    "outputs": [],
660 |    "source": [
661 |     "sc.pl.rank_genes_groups_heatmap(adata_neuronal_pfc,n_genes=5,groupby='leiden',show_gene_labels=True)"
662 |    ]
663 |   },
664 |   {
665 |    "cell_type": "markdown",
666 |    "metadata": {},
667 |    "source": [
668 |     "## 2.2 Cluster hypothalamus neurons\n"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": null,
674 |    "metadata": {},
675 |    "outputs": [],
676 |    "source": [
677 |     "adata_neuronal_hyp = reprocess_subset(adata_neuronal_hyp)"
678 |    ]
679 |   },
680 |   {
681 |    "cell_type": "code",
682 |    "execution_count": null,
683 |    "metadata": {},
684 |    "outputs": [],
685 |    "source": [
686 |     "#sc.external.pp.bbknn(adata_neuronal_hyp,batch_key='mouse_id')\n",
687 |     "#sc.tl.leiden(adata_neuronal_hyp,resolution=0.2)\n",
688 |     "#sc.tl.umap(adata_neuronal_hyp)"
689 |    ]
690 |   },
691 |   {
692 |    "cell_type": "code",
693 |    "execution_count": null,
694 |    "metadata": {},
695 |    "outputs": [],
696 |    "source": [
697 |     "sc.pl.umap(adata_neuronal_hyp, color=['leiden','age','mouse_id'],color_map=mouse_colors)"
698 |    ]
699 |   },
700 |   {
701 |    "cell_type": "code",
702 |    "execution_count": null,
703 |    "metadata": {},
704 |    "outputs": [],
705 |    "source": [
706 |     "sc.pl.umap(adata_neuronal_hyp, color=['Gad1','Slc17a6','Slc17a7','Gal','Agtr1a','Esr1','Pomc','Agrp','Nxph4','Adcyap1','Oxt'],use_raw=True)\n"
707 |    ]
708 |   },
709 |   {
710 |    "cell_type": "code",
711 |    "execution_count": null,
712 |    "metadata": {},
713 |    "outputs": [],
714 |    "source": [
715 |     "sc.tl.rank_genes_groups(adata_neuronal_hyp, 'leiden', method='t-test')\n"
716 |    ]
717 |   },
718 |   {
719 |    "cell_type": "code",
720 |    "execution_count": null,
721 |    "metadata": {},
722 |    "outputs": [],
723 |    "source": [
724 |     "sc.tl.filter_rank_genes_groups(adata_neuronal_hyp, min_fold_change=1.5)"
725 |    ]
726 |   },
727 |   {
728 |    "cell_type": "code",
729 |    "execution_count": null,
730 |    "metadata": {},
731 |    "outputs": [],
732 |    "source": [
733 |     "#sc.pl.rank_genes_groups(adata_neuronal_pfc, n_genes=25, sharey=False)\n",
734 |     "\n",
735 |     "sc.pl.rank_genes_groups_heatmap(adata_neuronal_hyp,n_genes=3,key='rank_genes_groups_filtered',groupby='leiden',show_gene_labels=True)\n"
736 |    ]
737 |   },
738 |   {
739 |    "cell_type": "code",
740 |    "execution_count": null,
741 |    "metadata": {},
742 |    "outputs": [],
743 |    "source": [
744 |     "sc.pl.rank_genes_groups_dotplot(adata_neuronal_hyp, key='rank_genes_groups_filtered')"
745 |    ]
746 |   },
747 |   {
748 |    "cell_type": "markdown",
749 |    "metadata": {},
750 |    "source": [
751 |     "# 3. Cluster non-neurons"
752 |    ]
753 |   },
754 |   {
755 |    "cell_type": "code",
756 |    "execution_count": null,
757 |    "metadata": {},
758 |    "outputs": [],
759 |    "source": [
760 |     "adata_nonneuronal = adata[~adata.obs.neuronal].copy()\n",
761 |     "adata_nonneuronal = adata_nonneuronal.raw.to_adata()\n"
762 |    ]
763 |   },
764 |   {
765 |    "cell_type": "code",
766 |    "execution_count": null,
767 |    "metadata": {},
768 |    "outputs": [],
769 |    "source": [
770 |     "adata_nonneuronal = reprocess_subset(adata_nonneuronal)"
771 |    ]
772 |   },
773 |   {
774 |    "cell_type": "code",
775 |    "execution_count": null,
776 |    "metadata": {},
777 |    "outputs": [],
778 |    "source": [
779 |     "#sc.external.pp.bbknn(adata_nonneuronal,batch_key='mouse_id')\n",
780 |     "#sc.tl.leiden(adata_nonneuronal,resolution=1.2)\n",
781 |     "#sc.tl.umap(adata_nonneuronal)"
782 |    ]
783 |   },
784 |   {
785 |    "cell_type": "code",
786 |    "execution_count": null,
787 |    "metadata": {},
788 |    "outputs": [],
789 |    "source": [
790 |     "sc.tl.leiden(adata_nonneuronal,resolution=0.7)\n"
791 |    ]
792 |   },
793 |   {
794 |    "cell_type": "code",
795 |    "execution_count": null,
796 |    "metadata": {},
797 |    "outputs": [],
798 |    "source": [
799 |     "sc.pl.umap(adata_nonneuronal, color=['leiden'])"
800 |    ]
801 |   },
802 |   {
803 |    "cell_type": "code",
804 |    "execution_count": null,
805 |    "metadata": {},
806 |    "outputs": [],
807 |    "source": [
808 |     "sc.pl.umap(adata_nonneuronal, color=['leiden','area','age','mouse_id'],color_map=mouse_colors)"
809 |    ]
810 |   },
811 |   {
812 |    "cell_type": "code",
813 |    "execution_count": null,
814 |    "metadata": {},
815 |    "outputs": [],
816 |    "source": [
817 |     "sc.pl.umap(adata_nonneuronal, color=['Cdkn2a','Aldh1l1','Cx3cr1','Plp1','Cspg4',\n",
818 |     "                                     'Gfap','Aqp4','Cldn5','Adgrf5'])\n",
819 |     "\n"
820 |    ]
821 |   },
822 |   {
823 |    "cell_type": "code",
824 |    "execution_count": null,
825 |    "metadata": {},
826 |    "outputs": [],
827 |    "source": [
828 |     "old_to_new = dict(\n",
829 |     "    old_cluster1='new_cluster1',\n",
830 |     "    old_cluster2='new_cluster1',\n",
831 |     "    old_cluster3='new_cluster2',\n",
832 |     ")\n",
833 |     "adata.obs['new_clusters'] = (\n",
834 |     "    adata.obs['old_clusters']\n",
835 |     "    .map(old_to_new)\n",
836 |     "    .astype('category')\n",
837 |     ")\n"
838 |    ]
839 |   },
840 |   {
841 |    "cell_type": "markdown",
842 |    "metadata": {},
843 |    "source": [
844 |     "# Cluster whole dataset"
845 |    ]
846 |   },
847 |   {
848 |    "cell_type": "code",
849 |    "execution_count": null,
850 |    "metadata": {},
851 |    "outputs": [],
852 |    "source": [
853 |     "sc.external.pp.bbknn(adata,batch_key='mouse_id')\n",
854 |     "sc.tl.leiden(adata,resolution=0.2)\n",
855 |     "sc.tl.umap(adata)"
856 |    ]
857 |   },
858 |   {
859 |    "cell_type": "code",
860 |    "execution_count": null,
861 |    "metadata": {},
862 |    "outputs": [],
863 |    "source": [
864 |     "fig = sc.pl.umap(adata, color=['age','area','mouse_id'],color_map=mouse_colors,return_fig=True)\n",
865 |     "fig.savefig(\"/Users/wea/src/tithonus/analysis/aging10x/umap.png\",dpi=300,bbox_inches='tight')"
866 |    ]
867 |   },
868 |   {
869 |    "cell_type": "code",
870 |    "execution_count": null,
871 |    "metadata": {},
872 |    "outputs": [],
873 |    "source": [
874 |     "sc.pl.umap(adata, color=['Cx3cr1', 'Aldh1l1','Olig1','Cspg4', 'Snap25', 'Gad1', 'Slc17a6', 'Slc17a7'],color_map=plt.cm.Reds)"
875 |    ]
876 |   },
877 |   {
878 |    "cell_type": "code",
879 |    "execution_count": null,
880 |    "metadata": {},
881 |    "outputs": [],
882 |    "source": [
883 |     "sc.pl.umap(adata, color=['Vip','Gal','Sst','Cck','Npy','Oxt','Nxph4','Agtr1a','Agrp','Esr1'],cmap=plt.cm.coolwarm,vmin=-5,vmax=5)"
884 |    ]
885 |   },
886 |   {
887 |    "cell_type": "code",
888 |    "execution_count": null,
889 |    "metadata": {},
890 |    "outputs": [],
891 |    "source": []
892 |   },
893 |   {
894 |    "cell_type": "code",
895 |    "execution_count": null,
896 |    "metadata": {},
897 |    "outputs": [],
898 |    "source": [
899 |     "sc.pl.umap(adata, color=['C1qa','C3','Itgam','Trem2'],cmap=plt.cm.coolwarm,use_raw=True,vmin=-3,vmax=3)"
900 |    ]
901 |   },
902 |   {
903 |    "cell_type": "code",
904 |    "execution_count": null,
905 |    "metadata": {},
906 |    "outputs": [],
907 |    "source": [
908 |     "sc.pl.umap(adata, color=['Cdkn2a','C2','C4b','Tspan2','Il33','Aldh1l1','Cd4','Cd74','Agtr1a'],color_map=plt.cm.Reds,use_raw=True)"
909 |    ]
910 |   },
911 |   {
912 |    "cell_type": "code",
913 |    "execution_count": null,
914 |    "metadata": {},
915 |    "outputs": [],
916 |    "source": [
917 |     "\n",
918 |     "sc.pl.umap(adata, color=[i for i in list(adata.raw.var_names) if 'Il' in i],color_map=plt.cm.Reds,use_raw=True)"
919 |    ]
920 |   },
921 |   {
922 |    "cell_type": "code",
923 |    "execution_count": null,
924 |    "metadata": {},
925 |    "outputs": [],
926 |    "source": [
927 |     "sc.pl.umap(adata, color=[i for i in list(adata.raw.var_names) if 'H2-' in i],color_map=plt.cm.coolwarm,use_raw=True,vmin=-3,vmax=3)"
928 |    ]
929 |   },
930 |   {
931 |    "cell_type": "code",
932 |    "execution_count": null,
933 |    "metadata": {},
934 |    "outputs": [],
935 |    "source": [
936 |     "sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')\n",
937 |     "sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)\n"
938 |    ]
939 |   },
940 |   {
941 |    "cell_type": "code",
942 |    "execution_count": null,
943 |    "metadata": {},
944 |    "outputs": [],
945 |    "source": [
946 |     "pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)\n"
947 |    ]
948 |   },
949 |   {
950 |    "cell_type": "code",
951 |    "execution_count": null,
952 |    "metadata": {},
953 |    "outputs": [],
954 |    "source": []
955 |   }
956 |  ],
957 |  "metadata": {
958 |   "kernelspec": {
959 |    "display_name": "Python 3 (ipykernel)",
960 |    "language": "python",
961 |    "name": "python3"
962 |   },
963 |   "language_info": {
964 |    "codemirror_mode": {
965 |     "name": "ipython",
966 |     "version": 3
967 |    },
968 |    "file_extension": ".py",
969 |    "mimetype": "text/x-python",
970 |    "name": "python",
971 |    "nbconvert_exporter": "python",
972 |    "pygments_lexer": "ipython3",
973 |    "version": "3.7.4"
974 |   }
975 |  },
976 |  "nbformat": 4,
977 |  "nbformat_minor": 4
978 | }
979 | 


--------------------------------------------------------------------------------
/notebooks/aging_MERFISH_gene_selection.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%load_ext autoreload\n",
 10 |     "%autoreload 2\n",
 11 |     "%matplotlib inline"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import matplotlib.pyplot as plt\n",
 21 |     "import seaborn as sns\n",
 22 |     "import pandas as pd\n",
 23 |     "import numpy as np\n",
 24 |     "import scanpy as sc\n",
 25 |     "import sys"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)\n",
 35 |     "sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False, figsize=(5,5))"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "from find_merfish_markers import *"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "# Script to select cell type marker genes for MERFISH"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "adata = sc.read(\"/faststorage/brain_aging/rna_analysis/adata_finalclusts_annot.h5ad\")"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "# Known markers"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "\n",
 77 |     "cortex_major_markers = ['Slc17a7', 'Slc32a1', 'Slc30a3', 'Cux2', 'Rorb', 'Sulf2', \n",
 78 |     "                  'Ptpru', 'Car3', 'Fam84b', 'Syt6', 'Nxph4', 'Tshz2', 'Pvalb', 'Sst', 'Vip',\n",
 79 |     "                 'Sncg', 'Lamp5', 'Sox10', 'Pdgfra', 'Aqp4', 'Igf2', 'Ctss', 'Cldn5', 'Flt1', 'Bgn', 'Vtn', 'Gfap',\n",
 80 |     "                        'Gad1', 'Gad2', 'Fn1', 'Myh11', 'Cd24a', 'Selplg', 'Pdgfra', 'Aqp4', 'Mbp', 'Ttyh2',\n",
 81 |     "                        \"Crhbp\", \"Cnr1\", \"Cpne5\", \"Crh\", \"Kcnip2\", \"Tbr1\", \"Lamp5\", \"Rorb\", \"Syt6\", \"Aldoc\", \"Gfap\",\n",
 82 |     "                        \"Serpinf1\", \"Mfge8\", \"Sox10\", \"Plp1\", \"Pdgfra\", \"Tmem8\", \"Itpr2\", \"Ctps\", \"Bmp4\", \"Anln\",\n",
 83 |     "                        \"Hexb\", \"Mrc1\", \"Vtn\", \"Flt1\", \"Apln\", \"Acta2\", \"Ttr\", \"Foxj1\"\n",
 84 |     "                       ]\n",
 85 |     "\n",
 86 |     "cortex_major_markers.extend([\n",
 87 |     "\"Acta2\",\"Aqp4\",\"Bgn\", \"Calb2\",\"Car3\", \"Cd14\", \"Chat\", \"Chodl\", \"Chrna2\", \"Cldn5\", \"Crhr2\",\n",
 88 |     "\"Crispld2\",\"Cspg4\",\"Ctss\",\"Cux2\",\"Egfr\",\"Enpp6\",\"Fam84b\",\"Fezf2\",\"Flt1\",\"Foxp2\",\"Gfap\",\"Hpse\",\"Igf2\",\"Kcnj8\",\n",
 89 |     "\"Lhx6\",\"Lmo1\",\"Lsp1\",\"Mrc1\",\"Nxph2\",\"Nxph4\",\"Opalin\",\"Osr1\",\"Otof\",\"Pdgfra\",\"Prox1\",\"Rorb\",\"Rspo1\",\"Rxfp1\",\n",
 90 |     "\"Satb2\",\"Serpinf1\",\"Slc17a6\",\"Slc17a8\",\"Slc30a3\",\"Slc32a1\",\"Sncg\",\"Sox10\",\"Sox6\",\"Sulf1\",\"Syt6\",\"Tcap\",\"Th\",\"Tshz2\",\n",
 91 |     "\"Vipr2\",\"Vtn\",\"Vip\",\"Sst\",\"Calb1\",\"Gad2\",\"Slc17a7\",\"Lamp5\",\"Gad1\",\"Pvalb\",\"Fezf2\", \"Bcl11b\", \"Npr3\", \"Otof\"\n",
 92 |     "    ])\n",
 93 |     "\n",
 94 |     "cortex_major_markers = list(set(cortex_major_markers))\n",
 95 |     "hypo_major_markers = [\n",
 96 |     "    'Agtr1a', 'Pomc', 'Oxt', 'Npy', 'Agrp', 'Esr1', 'Slc17a6',\n",
 97 |     "    'Meis2', 'Th', 'Gpr101', 'Hcrt', 'Nrgn', 'Sst', 'Map1b', 'Nts', 'Pmch', 'Cartpt',\n",
 98 |     "    'Gpr83', 'Bdnf', 'Otp', 'Calb2', 'Tac1', 'Tac2', 'Calb1', 'Trh', 'Gal', 'Col25a1', 'Synpr'\n",
 99 |     "]\n",
100 |     "\n",
101 |     "moffitt_genes = [\n",
102 |     "    \"Ace2\", \"Adora2a\", \"Aldh1l1\", \"Amigo2\", \"Ano3\", \"Aqp4\", \"Ar\", \"Arhgap36\", \"Avpr1a\", \"Avpr2\", \"Baiap2\", \"Bdnf\", \"Bmp7\", \"Brs3\",\"Calcr\",\"Cbln1\",\"Cbln2\",\"Cckar\",\"Cckbr\",\"Ccnd2\",\"Cd24a\",\"Cdkn1a\",\"Cenpe\",\"Chat\",\"Coch\",\"Col25a1\",\"Cplx3\",\"Cpne5\",\"Creb3l1\",\"Crhbp\",\"Crhr1\",\"Crhr2\",\"Cspg5\",\"Cxcl14\",\"Cyp19a1\",\"Cyp26a1\",\"Cyr61\",\"Dgkk\",\"Ebf3\",\"Egr2\",\"Ermn\",\"Esr1\",\"Etv1\",\"Fbxw13\",\"Fezf1\",\"Fn1\",\"Fst\",\"Gabra1\",\"Gabrg1\",\"Gad1\",\"Galr1\",\"Galr2\",\"Gbx2\",\"Gda\",\"Gem\",\"Gjc3\",\"Glra3\",\"Gpr165\",\"Greb1\",\"Grpr\",\"Htr2c\",\"Igf1r\",\"Igf2r\",\"Irs4\",\"Isl1\",\"Kiss1r\",\"Klf4\",\"Lepr\",\"Lmod1\",\"Lpar1\",\"Man1a\",\"Mc4r\",\"Mki67\",\"Mlc1\",\"Myh11\",\"Ndnf\",\"Ndrg1\",\"Necab1\",\"Nos1\",\"Npas1\",\"Npy1r\",\"Npy2r\",\"Ntng1\",\"Ntsr1\",\"Nup62cl\",\"Omp\",\"Onecut2\",\"Opalin\",\"Oprd1\",\"Oprk1\",\"Oprl1\",\"Oxtr\",\"Pak3\",\"Pcdh11x\",\"Pdgfra\",\"Pgr\",\"Plin3\",\"Pnoc\",\"Pou3f2\",\"Prlr\",\"Ramp3\",\"Rgs2\",\"Rgs5\",\"Rnd3\",\"Rxfp1\",\"Scgn\",\"Selplg\",\"Sema3c\",\"Sema4d\",\"Serpinb1b\",\"Serpine1\",\"Sgk1\",\"Slc15a3\",\"Slc17a6\",\"Slc17a7\",\"Slc17a8\",\"Slc18a2\",\"Slco1a4\",\"Sox4\",\"Sox6\",\"Sox8\",\"Sp9\",\"Synpr\",\"Syt2\",\"Syt4\",\"Sytl4\",\"Tacr1\",\"Tacr3\",\"Tiparp\",\"Tmem108\",\"Traf4\",\"Trhr\",\"Ttn\",\"Ttyh2\",\"Oxt\",\"Penk\",\"Sst\",\"Tac1\",\"Gal\",\"Cartpt\",\"Vgf\",\"Trh\",\"Nts\",\"Scg2\",\"Gnrh1\",\"Tac2\",\"Cck\",\"Crh\",\"Ucn3\",\"Adcyap1\",\"Nnat\",\"Sln\",\"Mbp\",\n",
103 |     "\"Th\"\n",
104 |     "]\n",
105 |     "t_cell_genes = [\n",
106 |     "    \"Ptprc\", \"Rorc\", \"Gata3\", \"Foxp3\", \"Tbx21\", \"Il2ra\", \"Il7r\", \"Il2rb\", \"Il2rg\", \"Il15ra\", \"Pdcd1\", \"Ctla4\", \"Cd3e\"    \n",
107 |     "]\n",
108 |     "macrophage_genes = [\n",
109 |     "    \"Spi1\", \"Cx3cr1\", \"Ccr2\", \"Adgr1\", \"Aif1\", \"Csf1r\", \"Trem2\", \"H2-Ab1\", \"Itgae\", \"Clec10a\", \"Itgam\", \"Itgax\"\n",
110 |     "]\n",
111 |     "bcell_genes = [\"Ms4a1\", \"Cd19\", \"Prdm1\"]\n",
112 |     "nkcell_genes = [\"Klrk1\", \"Klrb1\", \"Eomes\", \"Klrg1\"]\n",
113 |     "misc_immune = [\"Cxcl9\", \"Cxcl10\", \"Ccl2\", \"Cd1d1\", \"Fcer1a\", \"Fcgr1\", \"Cr2\", \"Cd47\"]\n",
114 |     "innate_bacterial = ['Il1b', 'Tnf', 'Il6', 'Ptges2']\n",
115 |     "innate_viral = ['Ifna12', 'Ifna16', 'Ifna2']\n",
116 |     "th1 = ['Ifng', 'Il12a']\n",
117 |     "th2 = ['Il4', 'Il5', 'Il13']\n",
118 |     "th17 = ['Il17a', 'Il17f', 'Il22', 'Il23a']\n",
119 |     "treg = ['Il10', 'Tgfb2', 'Tgfb1', 'Tgfb3']\n",
120 |     "other_immune = [\"Nfkb1\", \"Nfkbia\", \"Irf3\", \"Nlrp3\", \"Irf7\", \"Gsdmd\", \"Il18\"]\n",
121 |     "\n",
122 |     "minimal_aging = [\"C3\", \"C4b\", \"Il33\",\"Tnf\",\"Cdkn2a\", \"Cdkn2b\", 'B2m', 'C1qa', 'C1qc', 'C4b', 'Ctss', 'Gbp6', 'Gbp10', 'Ifi44', 'Ifit3', 'Ifitm3', 'Itgb2', 'Parp14', 'Serpina3n', 'Tap1', 'Trim30a']"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "known_markers_pfc = np.unique(np.concatenate([\n",
132 |     "    cortex_major_markers,\n",
133 |     "    t_cell_genes, macrophage_genes, bcell_genes, nkcell_genes, \n",
134 |     "    misc_immune, innate_bacterial, innate_viral, \n",
135 |     "    th1, th2, th17, treg, other_immune,\n",
136 |     "    minimal_aging\n",
137 |     "]))"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "# Select MERFISH genes for cell type markers in PFC "
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "adata_pfc = adata[adata.obs.area == \"PFC\"]"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "# Meng/Stephen approach: for pairs of clusters, compute differential expression\n",
163 |     "adata_raw = adata_pfc.raw.to_adata()\n",
164 |     "adata_raw = adata_raw[:, adata_raw.var.highly_variable]"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {
171 |     "scrolled": true
172 |    },
173 |    "outputs": [],
174 |    "source": [
175 |     "# pairwise\n",
176 |     "de_subclusts = compute_pairwise_de_for_clusts(adata_raw, \"clust_label\",n_de=10)\n",
177 |     "de_majorclusts = compute_pairwise_de_for_clusts(adata_raw, \"cell_type\",n_de=10)\n",
178 |     "\n",
179 |     "# one vs all\n",
180 |     "minorclusts_onevsall = compute_onevsall_de_for_clusts(adata_raw, 'clust_label',n_de=15)\n",
181 |     "majorclusts_onevsall = compute_onevsall_de_for_clusts(adata_raw, 'cell_type',n_de=15)\n"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "#de_minorclust_pairwise = greedily_select_markers(de_subclusts, 1, pairwise=True,de_marker_genes=known_markers_pfc)\n",
191 |     "#de_majorclust_pairwise = greedily_select_markers(de_majorclusts, 5, pairwise=True,de_marker_genes=known_markers_pfc)\n",
192 |     "de_minorclusts_onevsall = greedily_select_markers(minorclusts_onevsall, 2, pairwise=False, de_marker_genes=known_markers_pfc)\n",
193 |     "de_majorclusts_onevsall = greedily_select_markers(majorclusts_onevsall, 2, pairwise=False, de_marker_genes=known_markers_pfc)"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "markdown",
198 |    "metadata": {},
199 |    "source": [
200 |     "# List of known markers"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "de_combined = list(np.unique( list(de_minorclusts_onevsall) + list(de_majorclusts_onevsall)))"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "de_combined"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "metadata": {},
225 |    "outputs": [],
226 |    "source": [
227 |     "adata_raw = adata_pfc.raw.to_adata()\n",
228 |     "\n",
229 |     "de_combined = [i for i in de_combined if i in adata_raw.var_names]"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": null,
235 |    "metadata": {},
236 |    "outputs": [],
237 |    "source": [
238 |     "len(de_combined)"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "# load per cluster markers and take top N\n",
248 |     "#seurat_clust_markers = pd.read_csv(\"gene_lists/all_clust_markers.csv\")\n",
249 |     "#min_marker_genes = 4\n",
250 |     "#seurat_de_marker_genes = set()\n",
251 |     "#for n,i in enumerate(clust_labels_uniq):\n",
252 |     "#    curr_contrast = seurat_clust_markers[seurat_clust_markers.cluster==i].sort_values('avg_log2FC', ascending=False)\n",
253 |     "#    curr_genes = list(curr_contrast.gene)[:3]\n",
254 |     "#    for k in curr_genes:\n",
255 |     "#        seurat_de_marker_genes.add(k)\n",
256 |     "#seurat_de_marker_genes = list(seurat_de_marker_genes)"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "marker_clust_avgs = []\n",
266 |     "clust_avgs = []\n",
267 |     "for i in adata_raw.obs.clust_label.unique():\n",
268 |     "    clust_avgs.append(compute_mean_expression(adata_raw[adata_raw.obs.clust_label==i,:]))\n",
269 |     "    marker_clust_avgs.append(compute_mean_expression(adata_raw[adata_raw.obs.clust_label==i,:][:,de_combined]))"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": null,
275 |    "metadata": {},
276 |    "outputs": [],
277 |    "source": []
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": null,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "plot_clustered_celltypes_by_genes(adata_raw, de_combined,normalize=False)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "adata_raw[:,de_combined].X.sum(1).shape"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": []
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": null,
307 |    "metadata": {},
308 |    "outputs": [],
309 |    "source": [
310 |     "len(de_combined)\n"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "# Select MERFISH genes for aging"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "markdown",
322 |    "metadata": {},
323 |    "source": [
324 |     "1. Cell types:\n",
325 |     "\t1. Pairwise DE of major clusts\n",
326 |     "\t2. Pairwise DE of minor clusts\n",
327 |     "\t3. One-vs-all DE of major clusts\n",
328 |     "\t4. One-vs-all DE of minor clusts\n",
329 |     "\t5. Random forest features of major clusts\n",
330 |     "\t6. Random forest features of minor clusts\n",
331 |     "2. Aging markers:\n",
332 |     "\t1. Random forest features of major clusts\n",
333 |     "\t2. Random forest features of minor clusts\n",
334 |     "\t3. NB differential expression of major clusts\n",
335 |     "\t4. NB differential expression of minor clusts\n",
336 |     "\t5. TF random forests of major clusts\n",
337 |     "\t6. TF random forests of minor clusts"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": null,
343 |    "metadata": {},
344 |    "outputs": [],
345 |    "source": [
346 |     "# load aging differentially expressed genes\n",
347 |     "age_tf_feats = list(pd.read_csv(\"gene_lists/age_tf_feats.csv\").gene)\n",
348 |     "age_de_minor = pd.read_csv(\"gene_lists/nb_glm_age_de_minor.csv\")\n",
349 |     "age_de_major = pd.read_csv(\"gene_lists/nb_glm_age_de_major.csv\")"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": null,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "age_de_major = age_de_major[age_de_major.qval < 1e-6]\n",
359 |     "age_de_minor = age_de_minor[age_de_minor.qval < 1e-6]"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "age_de_major['log10fc'] = np.abs(age_de_major.coef)\n",
369 |     "#age_de_minor['log10fc'] = np.abs(age_de_minor.coef)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": [
378 |     "age_de_major_markers = list(select_age_markers({k:age_de_major[age_de_major.cell_type==k] for k in age_de_major.cell_type.unique()}, 5))\n",
379 |     "age_de_minor_markers = list(select_age_markers({k:age_de_minor[age_de_minor.cell_type==k] for k in age_de_minor.cell_type.unique()}, 2))\n"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "combined_age_markers = list(set(age_de_minor_markers + age_de_major_markers + age_tf_feats))"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "metadata": {},
395 |    "outputs": [],
396 |    "source": [
397 |     "len(combined_age_markers)"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {},
404 |    "outputs": [],
405 |    "source": [
406 |     "combined_age_markers = sorted(combined_age_markers)"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": []
415 |   },
416 |   {
417 |    "cell_type": "markdown",
418 |    "metadata": {},
419 |    "source": [
420 |     "# Add in markers from literature"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "code",
425 |    "execution_count": null,
426 |    "metadata": {},
427 |    "outputs": [],
428 |    "source": [
429 |     "# Microglia reactivity signature\n",
430 |     "# IL-6, TGFbeta1, IL10, IL-12/p40, IL-1beta, TNFalpha\n",
431 |     "aging_microglia = ['Ccl4', 'Lgals3', 'Ms4a7', 'Ifitm3'] + ['Il10', 'Il6', 'Il21a', 'Il12b', 'Il1b', 'Tnf']\n",
432 |     "aging_microglia += [\"Tmem119\", \"Apoe\", \"Cst7\", \"Clec7a\", \"Lpl\", \"Hif1a\", \"Igf1\", \"Cd74\", \"Ifit2\", \"Ifit3\", \"Irf7\", \"Oasl2\", \"Top2a\", \"Mcm2\"]\n",
433 |     "aging_microglia += [\"Tyrobp\", \"Ctsb\", \"Ctsd\", \"Fth1\", \"Lyz2\", \"Axl\", \"Cst7\", \"Trem2\", \"Cst7\", \"Lpl\", \"Cd9\", \"Csf1\", \"Ccl6\", \"Itgax\", \"Clec7a\", \"Lilrb4\", \"Timp2\", \"Marcks\", \"Serinc3\", \"P2ry12\", \"Cd9\", \"Cd63\"]"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": null,
439 |    "metadata": {},
440 |    "outputs": [],
441 |    "source": [
442 |     "# Aging astrocytes -- The Aging Astrocyte Transcriptome from Multiple Regions of the Mouse Brain, Boisvert et al\n",
443 |     "aging_astro_allregions = ['Sprina3m', 'Serpina3n', 'C4b', 'Pcdh6', 'Pcdhb1', 'Gfap', 'Prss50', # upregulated\n",
444 |     "                          'Gpx8', 'Hspa1b', 'Hspa1a', 'Rsrp1']\n",
445 |     "aging_astro_regionspecific = list(np.unique(['Serpina3f', 'Rpk4', 'Timp1', 'Fbln5', 'Plin4', 'Rab20', 'Capg', 'Zc3hav1', 'Gbp2', 'Ifi35', 'Hs3st3a1', 'Mboat1', 'Psmb8', 'Cyp27a1',\n",
446 |     "                              'Serpina3f', 'Cdr1', 'Zbtb20', 'Grin2b', 'Hipk2', 'Tcp11l1', 'Ago3', 'Oasl2', 'Lnpep', 'Gan', 'Aqp2', 'Bst2', 'Hmbox1', 'Zc3hav1',\n",
447 |     "                             'Serpina3f', 'Cdr1', 'Lars2', 'Zbtb20', 'Grin2b', 'Rpk4', 'Nr5a1', 'Slc22a18', 'Timp1', 'Fcgr2b', 'Hipk2', 'C3', 'Osmr', 'Oasl2', 'Nupr1', # up\n",
448 |     "                                            'Bmp4', 'Kiss1', 'Fst', 'Cyr61', 'Tead2', 'Dnajb1', 'Banp', 'Cdx8', 'Rbm12b1', 'Ece2', \n",
449 |     "                                            'Bmp4', 'Cd38', 'Sptbn2', 'Sptb', 'Pcdh20', 'Eif5b', 'Gm7120', 'Sptan1', 'Hmgcr', 'Trio',\n",
450 |     "                                            'Sspo', 'Wfdc2', 'Ttr', 'Ctgf', 'Thbs4', 'Bmp4', 'Prom1', 'Sptbn2', 'Bgn', 'Tnc', 'Sparc']))\n",
451 |     "combined_astro_marker = list(np.unique(aging_astro_allregions+aging_astro_regionspecific))"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "code",
456 |    "execution_count": null,
457 |    "metadata": {},
458 |    "outputs": [],
459 |    "source": [
460 |     "# senescence genes\n",
461 |     "senescence_high = ['Retnla', 'Tnf', 'Cdkn2a', 'Itgax', 'Il12b', 'Il18', 'Cd68', 'Fcgr1',\n",
462 |     "       'Parp14', 'Fcna', 'Cd36', 'Cd38', 'Bst1', 'Itgam', 'Emr1', 'Irg1',\n",
463 |     "       'Il1b', 'Lmnb1', 'Il10', 'Fabp4', 'Lyve1', 'Mrc1', 'Nampt', 'Nadk',\n",
464 |     "       'Bmi1', 'Sirt7']\n",
465 |     "\n",
466 |     "senescence_low = ['Sirt1', 'Nfkbiz', 'Cdkn1a', 'Tiparp', 'Trp53',\n",
467 |     "       'Sirt5', 'Csf1', 'Nfkb1', 'Parp6', 'Sirt2', 'Nnmt', 'Hmgb1', 'Bcl2l2',\n",
468 |     "       'Nt5e', 'Sirt3', 'Serpine1', 'Arg1', 'Parp10', 'Ccl2', 'Il6', 'Nmnat3',\n",
469 |     "       'Cdkn2b', 'Il12a', 'Parp12', 'Parp9', 'Parp11', 'Parp8', 'Sirt6',\n",
470 |     "       'Sirt4', 'Mgl2', 'Parp3', 'Zc3hav1', 'Tnks', 'Parp4', 'Parp2', 'Sarm1',\n",
471 |     "       'Parp16', 'Nmnat2', 'Parp1', 'Nmnat1']\n",
472 |     "\n",
473 |     "reactive_astro1 = ['C3',  'Ggta1', 'Ligp1', 'Gpp2', 'Fbln5', 'Ekbp5', 'Psmb8'] # A1 astrocytes are produced following LPS injection\n",
474 |     "reactive_astro2 = ['Clcf1', 'Tgm1', 'Ptx3', 'S100a10', 'Sphk1', 'Cd109', 'Ptgs2', 'Emp1', 'Slc10a6', 'Tms4sf1', 'B3gnt5', 'Stat3']\n",
475 |     "reactive_astro_pan = ['Lcn2', 'Steap4', 'S1pr3', 'Timp1', 'Hsbp1', 'Cxcl10', 'Cd44', 'Cp', 'Serpina3n', 'Aspg', 'Vim', 'Gfap']\n",
476 |     "\n",
477 |     "# brunet aging genes\n",
478 |     "brunet_genes = ['B2m', 'C1qa', 'C1qc', 'C4b', 'Ctss', 'Gbp6', 'Gbp10', 'Ifi44', 'Ifit3', 'Ifitm3', 'Itgb2', 'Parp14', 'Serpina3n', 'Tap1', 'Trim30a']\n"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": null,
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "combined_senescence = list(np.unique(senescence_high + senescence_low + brunet_genes + reactive_astro1 + reactive_astro2 + reactive_astro_pan))"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "code",
492 |    "execution_count": null,
493 |    "metadata": {},
494 |    "outputs": [],
495 |    "source": [
496 |     "orig_all_age_markers = list(list(combined_age_markers + aging_microglia + combined_astro_marker + combined_senescence))"
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": null,
502 |    "metadata": {},
503 |    "outputs": [],
504 |    "source": [
505 |     "# remove genes in cell type markers\n",
506 |     "orig_all_age_markers = list(set([i for i in orig_all_age_markers if i not in de_combined]))\n",
507 |     "good_genes = adata.raw.to_adata().var_names\n",
508 |     "orig_all_age_markers = [i for i in orig_all_age_markers if i in good_genes]"
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": null,
514 |    "metadata": {},
515 |    "outputs": [],
516 |    "source": [
517 |     "print(len(orig_all_age_markers))"
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "code",
522 |    "execution_count": null,
523 |    "metadata": {},
524 |    "outputs": [],
525 |    "source": [
526 |     "age_diffexp = compute_average_age_expr_change(adata_raw, orig_all_age_markers)"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "code",
531 |    "execution_count": null,
532 |    "metadata": {},
533 |    "outputs": [],
534 |    "source": [
535 |     "# filter based on average change in expression\n",
536 |     "age_threshold = 0.35 #np.log(1.5)\n",
537 |     "all_age_markers = np.array(orig_all_age_markers)[(np.abs(age_diffexp) > age_threshold).any(0)]"
538 |    ]
539 |   },
540 |   {
541 |    "cell_type": "code",
542 |    "execution_count": null,
543 |    "metadata": {},
544 |    "outputs": [],
545 |    "source": [
546 |     "all_age_markers"
547 |    ]
548 |   },
549 |   {
550 |    "cell_type": "code",
551 |    "execution_count": null,
552 |    "metadata": {},
553 |    "outputs": [],
554 |    "source": [
555 |     "print(np.sum([1 if i in combined_senescence else 0 for i in orig_all_age_markers ]))"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": null,
561 |    "metadata": {},
562 |    "outputs": [],
563 |    "source": [
564 |     "mean_age_expr = compute_mean_expression(adata_raw[:,all_age_markers])\n",
565 |     "plt.plot(np.cumsum(np.sort(mean_age_expr))/np.sum(mean_age_expr),'ko-')"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": null,
571 |    "metadata": {},
572 |    "outputs": [],
573 |    "source": [
574 |     "# remove the top 10% highly expressed age markers\n",
575 |     "#sorted_age_markers = np.array(orig_all_age_markers)[np.argsort(mean_age_expr)]\n",
576 |     "#all_age_markers = list(np.array(orig_all_age_markers[:int(0.8*len(sorted_age_markers))]))"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "code",
581 |    "execution_count": null,
582 |    "metadata": {},
583 |    "outputs": [],
584 |    "source": [
585 |     "# how many senescence genes remain in all_age_markers\n",
586 |     "#print(np.sum([1 if i in combined_senescence else 0 for i in all_age_markers ]))"
587 |    ]
588 |   },
589 |   {
590 |    "cell_type": "code",
591 |    "execution_count": null,
592 |    "metadata": {},
593 |    "outputs": [],
594 |    "source": [
595 |     "print(\"Senescence genes excluded based on expression\")\n",
596 |     "for i in combined_senescence:\n",
597 |     "    if i not in all_age_markers and i in orig_all_age_markers:\n",
598 |     "        if i in brunet_genes:\n",
599 |     "            print(i, 'brunet')\n",
600 |     "        elif i in senescence_high:\n",
601 |     "            print(i, 'senesce_high')\n",
602 |     "        elif i in senescence_low:\n",
603 |     "            print(i, 'senesce_low')\n",
604 |     "        elif i in combined_astro_marker:\n",
605 |     "            print(i,'combined_astro')\n",
606 |     "        elif i in aging_microglia:\n",
607 |     "            print(i,'microglia')\n",
608 |     "        else:\n",
609 |     "            print(i)"
610 |    ]
611 |   },
612 |   {
613 |    "cell_type": "code",
614 |    "execution_count": null,
615 |    "metadata": {},
616 |    "outputs": [],
617 |    "source": [
618 |     "plot_clustered_ages_by_genes(adata_raw, de_combined)\n",
619 |     "plot_clustered_ages_by_genes(adata_raw, all_age_markers)"
620 |    ]
621 |   },
622 |   {
623 |    "cell_type": "code",
624 |    "execution_count": null,
625 |    "metadata": {},
626 |    "outputs": [],
627 |    "source": [
628 |     "plot_clustered_celltypes_by_genes(adata_raw, all_age_markers,normalize=False)"
629 |    ]
630 |   },
631 |   {
632 |    "cell_type": "code",
633 |    "execution_count": null,
634 |    "metadata": {},
635 |    "outputs": [],
636 |    "source": [
637 |     "plot_per_celltype_sparsity(adata_raw, de_combined)"
638 |    ]
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": null,
643 |    "metadata": {},
644 |    "outputs": [],
645 |    "source": [
646 |     "plot_per_celltype_sparsity(adata_raw, all_age_markers)"
647 |    ]
648 |   },
649 |   {
650 |    "cell_type": "code",
651 |    "execution_count": null,
652 |    "metadata": {},
653 |    "outputs": [],
654 |    "source": [
655 |     "plot_per_celltype_totalexpr(adata_raw, de_combined)"
656 |    ]
657 |   },
658 |   {
659 |    "cell_type": "code",
660 |    "execution_count": null,
661 |    "metadata": {},
662 |    "outputs": [],
663 |    "source": [
664 |     "plot_per_celltype_totalexpr(adata_raw, all_age_markers)"
665 |    ]
666 |   },
667 |   {
668 |    "cell_type": "code",
669 |    "execution_count": null,
670 |    "metadata": {},
671 |    "outputs": [],
672 |    "source": [
673 |     "plot_per_gene_sparsity(adata_raw, de_combined)"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "code",
678 |    "execution_count": null,
679 |    "metadata": {},
680 |    "outputs": [],
681 |    "source": [
682 |     "plot_per_gene_sparsity(adata_raw, all_age_markers)"
683 |    ]
684 |   },
685 |   {
686 |    "cell_type": "code",
687 |    "execution_count": null,
688 |    "metadata": {},
689 |    "outputs": [],
690 |    "source": [
691 |     "pd.DataFrame({'gene':de_combined}).to_csv(\"gene_lists/all_markers_pfc.csv\")\n",
692 |     "pd.DataFrame({'gene':all_age_markers}).to_csv(\"gene_lists/all_markers_pfc_aging.csv\")"
693 |    ]
694 |   },
695 |   {
696 |    "cell_type": "markdown",
697 |    "metadata": {},
698 |    "source": [
699 |     "# Save out per cluster expression for these genes for bit assignment"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "code",
704 |    "execution_count": null,
705 |    "metadata": {},
706 |    "outputs": [],
707 |    "source": [
708 |     "# find cluster names\n",
709 |     "adata_raw = adata.raw.to_adata()\n",
710 |     "clust_labels_uniq = adata_raw.obs.clust_label.unique()\n",
711 |     "# find markers actually in adata\n",
712 |     "all_markers = [i for i in all_markers_to_keep if i in adata_raw.var_names]\n",
713 |     "# compute cluster averages\n",
714 |     "clust_avgs = np.vstack([adata_raw[adata_raw.obs.clust_label==i,:][:, list(all_markers_to_keep)].X.mean(0) for i in clust_labels_uniq])"
715 |    ]
716 |   },
717 |   {
718 |    "cell_type": "code",
719 |    "execution_count": null,
720 |    "metadata": {},
721 |    "outputs": [],
722 |    "source": [
723 |     "clust_expr = pd.DataFrame(clust_avgs, index=clust_labels_uniq, columns=all_markers_to_keep).to_csv(\"merfish_cluster_expr.csv\")"
724 |    ]
725 |   },
726 |   {
727 |    "cell_type": "code",
728 |    "execution_count": null,
729 |    "metadata": {},
730 |    "outputs": [],
731 |    "source": [
732 |     "# save this data for bit assignment\n",
733 |     "clust_proportions = np.array([np.sum(adata_raw.obs.clust_label==i) for i in clust_labels_uniq])\n",
734 |     "clust_proportions = clust_proportions/clust_proportions.sum()\n",
735 |     "pd.DataFrame({'clust':clust_labels_uniq,'proportion':clust_proportions}).to_csv(\"merfish_cluster_proportions.csv\")"
736 |    ]
737 |   },
738 |   {
739 |    "cell_type": "code",
740 |    "execution_count": null,
741 |    "metadata": {},
742 |    "outputs": [],
743 |    "source": []
744 |   }
745 |  ],
746 |  "metadata": {
747 |   "kernelspec": {
748 |    "display_name": "Python 3 (ipykernel)",
749 |    "language": "python",
750 |    "name": "python3"
751 |   },
752 |   "language_info": {
753 |    "codemirror_mode": {
754 |     "name": "ipython",
755 |     "version": 3
756 |    },
757 |    "file_extension": ".py",
758 |    "mimetype": "text/x-python",
759 |    "name": "python",
760 |    "nbconvert_exporter": "python",
761 |    "pygments_lexer": "ipython3",
762 |    "version": "3.7.4"
763 |   }
764 |  },
765 |  "nbformat": 4,
766 |  "nbformat_minor": 4
767 | }
768 | 


--------------------------------------------------------------------------------
/notebooks/aging_call_final_celltypes_10x.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%load_ext autoreload\n",
 10 |     "%autoreload 2\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "%matplotlib inline\n",
 13 |     "%config InlineBackend.figure_format='retina'\n"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 2,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import sys\n"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 3,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import numpy as np\n",
 32 |     "import scanpy as sc\n",
 33 |     "import pandas as pd\n",
 34 |     "import anndata as ad\n",
 35 |     "import seaborn as sns\n",
 36 |     "sns.set_style('white')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)\n",
 46 |     "sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False, figsize=(5,5))"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 5,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "adata = ad.read_h5ad(\"/faststorage/brain_aging/rna_analysis/adata_finalclusts.h5ad\")"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 6,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "adata = adata[adata.obs.final_clusts != 'NA']\n",
 65 |     "adata = adata[adata.obs.total_counts < 50000]"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 7,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "# keep only clusters with more than 200 cells\n",
 75 |     "good_clusts = [i for i in adata.obs.final_clusts.unique() if np.sum(adata.obs.final_clusts==i)>200]"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 8,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "adata = adata[adata.obs.final_clusts.isin(good_clusts)]"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 9,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "A = adata[~adata.obs.neuronal].copy()"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "n_pcs = 20\n",
103 |     "sc.pp.highly_variable_genes(A, n_top_genes=2000)\n",
104 |     "A.raw = A\n",
105 |     "A = A[:, A.var.highly_variable]\n",
106 |     "print('regressing out')\n",
107 |     "sc.pp.regress_out(A, ['total_counts'])\n",
108 |     "print('scaling')\n",
109 |     "sc.pp.scale(A, max_value=10)\n",
110 |     "print('pca')\n",
111 |     "sc.tl.pca(A, svd_solver='arpack', n_comps=n_pcs)\n",
112 |     "print('neighbors')\n",
113 |     "sc.pp.neighbors(A, n_neighbors=25, n_pcs=n_pcs)\n",
114 |     "#sc.external.pp.bbknn(A,batch_key='age',n_pcs=n_pcs)\n",
115 |     "print('umap')\n",
116 |     "sc.tl.umap(A)\n",
117 |     "print('leiden')\n",
118 |     "sc.tl.leiden(A,resolution=0.6)\n"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "sc.pl.umap(A,color=['age', 'area','final_clusts','mouse_id'],use_raw=True,palette=sns.color_palette('gist_ncar',20))"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "sc.pl.dotplot(A, [\n",
137 |     "                  'Csf1r', 'C1qa', 'Hexb', 'Cx3cr1', 'P2ry12', 'Tmem119', 'Tnf', 'Ccl4', # microglia\n",
138 |     "                                 'Sox10','Cldn11', 'Mog', 'Plp1', # oligo\n",
139 |     "                                  'Aqp4', 'Aldh1l1','Gfap', 'Aldoc', # astrocyte\n",
140 |     "                                  'Vtn', 'Flt1', 'Pecam1','Cldn5', 'Adgrf5', # pericyte\n",
141 |     "                                 'Mgp' ,'Slc47a1', 'Dapl1', 'Igf2', 'Sema3g', 'Acta2',  # vascular\n",
142 |     "                                 'Pdgfra', 'Vcan', 'Cspg4', 'Olig1', # OPC\n",
143 |     "                                 'Ccdc153', 'Tmem212', 'Hdc', 'Kcnj8',# ependymal\n",
144 |     "                                 'Pf4', 'Cd74', 'Cxcl2', 'Lyz2', 'Ms4a7',\n",
145 |     "], groupby='final_clusts',use_raw=True)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": [
154 |     "# gene sets\n",
155 |     "# microglia -- Hexb, Csf1r, C1qa, P2ry12\n",
156 |     "# OPCs -- Pdgfra, Vcan, Cspg4, Olig1\n",
157 |     "# Endo -- Vtn, Flt1, Cldn5\n",
158 |     "# Oligo -- Plp1, Mbp, Cldn11, Mog"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "nonneuronal_mapping = {'N.0' : 'Oligodendrocyte', \n",
168 |     "                       'N.1' : 'Oligodendrocyte', \n",
169 |     "                       'N.2':'Oligodendrocyte', \n",
170 |     "                       'N.3':'Astrocyte',\n",
171 |     "                       'N.4': 'Astrocyte',\n",
172 |     "                       'N.5': 'Astrocyte', \n",
173 |     "                       'N.6' : 'OPC',\n",
174 |     "                       'N.7':'Microglia', \n",
175 |     "                       'N.8' : 'Microglia',\n",
176 |     "                        'N.9' : 'NA',\n",
177 |     "                       'N.10' : 'NA',\n",
178 |     "                       'N.11':'NA', \n",
179 |     "                        'N.12' : 'NA',\n",
180 |     "                       'N.13' : 'Vascular',\n",
181 |     "                      'N.14' : 'Oligodendrocyte',\n",
182 |     "                      'N.15' : 'Vascular', # vascular endothelial cells\n",
183 |     "                      'N.16' : 'Oligodendrocyte',\n",
184 |     "                      'N.17' : 'Astrocyte',\n",
185 |     "                      'N.18' : 'Immune', # perivascular macrophage\n",
186 |     "                      'N.19' : 'Vascular', # pericyte\n",
187 |     "                      'N.20' : 'Vascular' # vascular leptomeningeal cells\n",
188 |     "                      }\n",
189 |     "\n",
190 |     "finer_nonneuronal_mapping = {'N.0' : 'Olig1', \n",
191 |     "                       'N.1' : 'Olig2', \n",
192 |     "                       'N.2':'Olig3', \n",
193 |     "                       'N.3':'Astro1',\n",
194 |     "                       'N.4': 'Astro2',\n",
195 |     "                       'N.5': 'Astro3', \n",
196 |     "                       'N.6' : 'OPC',\n",
197 |     "                       'N.7':'Micro1', \n",
198 |     "                       'N.8' : 'Micro2',\n",
199 |     "                        'N.9' : 'NA',\n",
200 |     "                       'N.10' : 'NA',\n",
201 |     "                       'N.11':'NA', \n",
202 |     "                        'N.12' : 'NA',\n",
203 |     "                       'N.13' : 'Vlmc1',\n",
204 |     "                      'N.14' : 'Olig4',\n",
205 |     "                      'N.15' : 'Peri1', # \n",
206 |     "                      'N.16' : 'Olig5',\n",
207 |     "                      'N.17' : 'Astro4',\n",
208 |     "                      'N.18' : 'Macro', # perivascular macrophage\n",
209 |     "                      'N.19' : 'Peri2', # pericyte\n",
210 |     "                      'N.20' : 'Vlmc2' # vascular leptomeningeal cells\n",
211 |     "                      }\n",
212 |     "\n"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {},
219 |    "outputs": [],
220 |    "source": [
221 |     "sc.pl.umap(adata, color=['final_clusts','Tac1', 'Tshz1', 'Cxcl14', 'Pdyn','Penk', 'Drd1', 'Drd2', 'Adora2a', 'Calb1','Pthlh'])"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "# identify striatal neurons\n",
231 |     "sc.pl.dotplot(adata, ['Otof', 'Cacng5', 'Th','Ppp1r1b', 'Drd1','Tac1', 'Tshz1', 'Pdyn', 'Drd2','Penk','Adora2a', 'Calb1','Pthlh','Cxcl14','Chat'], groupby='final_clusts',use_raw=True)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "striatal_celltypes = {\n",
241 |     "    'H.I.7' : 'StD1M1',\n",
242 |     "    'H.I.8' : 'StD1M2',\n",
243 |     "    'H.I.20': 'StD2M1',\n",
244 |     "    'H.I.27': 'StD1M3',\n",
245 |     "    'P.I.0' : 'StD1M4',\n",
246 |     "    'P.I.1' : 'StD1M5',\n",
247 |     "    'P.I.2' : 'StD2M2',\n",
248 |     "    'P.I.3' : 'StD2M3',\n",
249 |     "    'P.I.4' : 'StD1M6',\n",
250 |     "    'P.I.5' : 'StD2M4',\n",
251 |     "    'P.I.10': 'StD1M7',\n",
252 |     "    'P.I.18': 'StD2M5',\n",
253 |     "    'P.I.19': 'StD1M8'\n",
254 |     "}"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "cell_types = list(adata.obs.final_clusts.copy())\n",
264 |     "for i,k in enumerate(cell_types):\n",
265 |     "    if k in nonneuronal_mapping:\n",
266 |     "        cell_types[i] = nonneuronal_mapping[k]\n",
267 |     "    else:\n",
268 |     "        #if 'N' in k:\n",
269 |     "        #    pass\n",
270 |     "        #else:\n",
271 |     "        cell_types[i] = 'Neuron'\n",
272 |     "adata.obs['cell_type'] = cell_types\n",
273 |     "\n",
274 |     "cell_types_fine = list(adata.obs.final_clusts.copy())\n",
275 |     "for i,k in enumerate(cell_types_fine):\n",
276 |     "    if k in finer_nonneuronal_mapping:\n",
277 |     "        cell_types_fine[i] = finer_nonneuronal_mapping[k]\n",
278 |     "    elif k in striatal_celltypes:\n",
279 |     "        cell_types_fine[i] = striatal_celltypes[k]\n",
280 |     "    else:\n",
281 |     "        curr_cell_type = k.split(\".\")\n",
282 |     "        if curr_cell_type[0] == \"H\":\n",
283 |     "            curr_area = \"Hy\"\n",
284 |     "        else:\n",
285 |     "            curr_area = \"Fr\"\n",
286 |     "        if curr_cell_type[1] == \"I\":\n",
287 |     "            curr_type = \"In\"\n",
288 |     "        else:\n",
289 |     "            curr_type = \"Ex\"\n",
290 |     "        cell_types_fine[i] = curr_area + curr_type + str(int(curr_cell_type[2])+1)\n",
291 |     "adata.obs['clust_label'] = cell_types_fine#pd.Series(cell_types_fine,dtype='category')\n",
292 |     "adata.obs.clust_label = adata.obs.clust_label.astype('category')"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": null,
298 |    "metadata": {},
299 |    "outputs": [],
300 |    "source": [
301 |     "# remove bad non neuronal clusters\n",
302 |     "adata = adata[~adata.obs.cell_type.isin(['NA'])]"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": null,
308 |    "metadata": {},
309 |    "outputs": [],
310 |    "source": [
311 |     "# reprocess\n",
312 |     "adata = adata.raw.to_adata()\n",
313 |     "sc.pp.normalize_total(adata, target_sum=1e4)\n",
314 |     "sc.pp.log1p(adata)\n",
315 |     "sc.pp.highly_variable_genes(adata, n_top_genes=3000)\n",
316 |     "adata.raw = adata\n",
317 |     "adata = adata[:, adata.var.highly_variable]\n"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "metadata": {},
324 |    "outputs": [],
325 |    "source": [
326 |     "sc.pp.regress_out(adata, ['total_counts'])"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "sc.pp.scale(adata, max_value=10)\n",
336 |     "sc.tl.pca(adata, n_comps=50,svd_solver='arpack')\n",
337 |     "\n",
338 |     "sc.pl.pca_variance_ratio(adata, log=True,n_pcs=50)\n",
339 |     "\n",
340 |     "sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)\n",
341 |     "sc.tl.umap(adata)"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": null,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "# final final clust information\n",
351 |     "#adata.obs.to_csv(\"final_clusts.csv\")"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": null,
357 |    "metadata": {},
358 |    "outputs": [],
359 |    "source": [
360 |     "#fig,ax = plt.subplots(figsize=(10,10))\n",
361 |     "sc.pl.umap(adata, color=['doublet_score','total_counts'],size=1,add_outline=False)"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "metadata": {},
368 |    "outputs": [],
369 |    "source": [
370 |     "fig,ax = plt.subplots(figsize=(10,10))\n",
371 |     "sc.pl.umap(adata, color='cell_type',palette=sns.color_palette('Pastel1'), ax=ax,size=10,add_outline=True)"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": [
380 |     "def gen_light_palette(prefix, color_name, uniq_clusts):\n",
381 |     "    n = np.sum([1 if prefix in i else 0 for i in uniq_clusts])\n",
382 |     "    return sns.light_palette(color_name, n_colors=n+2)[2:]\n",
383 |     "\n",
384 |     "def gen_dark_palette(prefix, color_name, uniq_clusts):\n",
385 |     "    n = np.sum([1 if prefix in i else 0 for i in uniq_clusts])\n",
386 |     "    return sns.dark_palette(color_name, n_colors=n+2)[2:]\n",
387 |     "\n",
388 |     "uniq_clusts = np.sort(adata.obs.clust_label.unique())\n",
389 |     "\n",
390 |     "\n",
391 |     "print(\"Prefrontal excite\")\n",
392 |     "fr_ex_pal = gen_light_palette(\"FrEx\", \"darkgreen\", uniq_clusts) #sns.cubehelix_palette(start=0, rot=0.2, dark=0.25, light=.9, n_colors=n_pe)\n",
393 |     "\n",
394 |     "print(\"Prefrontal inhib\")\n",
395 |     "fr_in_pal = gen_light_palette(\"FrIn\", \"navy\", uniq_clusts)#sns.cubehelix_palette(start=0, rot=0.5, dark=0.25, light=.95, n_colors=n_pi)\n",
396 |     "\n",
397 |     "print(\"Striatal\")\n",
398 |     "st_pal = gen_light_palette(\"St\", \"indigo\", uniq_clusts) #sns.cubehelix_palette(start=0, rot=0.5, dark=0.5, light=.95, n_colors=n_st)\n",
399 |     "\n",
400 |     "\n",
401 |     "print(\"Microglial\")\n",
402 |     "micro_pal = gen_light_palette('Micro', 'dodgerblue', uniq_clusts)\n",
403 |     "\n",
404 |     "print(\"Macro\")\n",
405 |     "macro_pal = gen_light_palette('Macro', 'blue', uniq_clusts)\n",
406 |     "\n",
407 |     "print(\"Astrocyte\")\n",
408 |     "astro_pal = gen_light_palette('Astro', 'darkorange', uniq_clusts)\n",
409 |     "\n",
410 |     "print(\"Peri\")\n",
411 |     "peri_pal = gen_light_palette('Peri', 'lime', uniq_clusts)\n",
412 |     "\n",
413 |     "print(\"VLMC\")\n",
414 |     "vlmc_pal = gen_light_palette('Vlmc', 'aqua', uniq_clusts)\n",
415 |     "\n",
416 |     "print(\"OPC\")\n",
417 |     "opc_pal = gen_dark_palette('OPC', 'black', uniq_clusts)\n",
418 |     "\n",
419 |     "print(\"Oligo\")\n",
420 |     "oligo_pal = gen_light_palette('Olig', 'darkgray', uniq_clusts)\n",
421 |     "\n",
422 |     "pals = [astro_pal, fr_ex_pal, fr_in_pal, hy_ex_pal, hy_in_pal,  macro_pal,  micro_pal, opc_pal, oligo_pal, peri_pal, st_pal, vlmc_pal]\n",
423 |     "for i in pals:\n",
424 |     "    sns.palplot(i)"
425 |    ]
426 |   },
427 |   {
428 |    "cell_type": "code",
429 |    "execution_count": null,
430 |    "metadata": {},
431 |    "outputs": [],
432 |    "source": [
433 |     "from cycler import cycler\n",
434 |     "#pal = cycler(color=)\n",
435 |     "\n",
436 |     "pal = cycler(color=np.vstack(pals))\n",
437 |     "\n",
438 |     "label_colors = {}\n",
439 |     "for i, c in enumerate(iter(pal)):\n",
440 |     "    label_colors[uniq_clusts[i]] = c['color']"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": null,
446 |    "metadata": {},
447 |    "outputs": [],
448 |    "source": [
449 |     "fig,ax = plt.subplots(figsize=(10,10))\n",
450 |     "sc.pl.umap(adata, color='clust_label',palette=pal,ax=ax,size=10,add_outline=True)"
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": null,
456 |    "metadata": {},
457 |    "outputs": [],
458 |    "source": [
459 |     "fig,ax = plt.subplots(figsize=(10,10))\n",
460 |     "sc.pl.umap(adata, color='age',ax=ax,size=10,add_outline=True,palette=sns.color_palette('Set2',2))"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "code",
465 |    "execution_count": null,
466 |    "metadata": {},
467 |    "outputs": [],
468 |    "source": [
469 |     "fig,ax = plt.subplots(figsize=(10,10))\n",
470 |     "sc.pl.umap(adata, color='area',ax=ax,size=10,add_outline=True)"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": null,
476 |    "metadata": {},
477 |    "outputs": [],
478 |    "source": [
479 |     "#adata.write(\"adata_finalclusts_annot.h5ad\")"
480 |    ]
481 |   }
482 |  ],
483 |  "metadata": {
484 |   "kernelspec": {
485 |    "display_name": "Python 3 (ipykernel)",
486 |    "language": "python",
487 |    "name": "python3"
488 |   },
489 |   "language_info": {
490 |    "codemirror_mode": {
491 |     "name": "ipython",
492 |     "version": 3
493 |    },
494 |    "file_extension": ".py",
495 |    "mimetype": "text/x-python",
496 |    "name": "python",
497 |    "nbconvert_exporter": "python",
498 |    "pygments_lexer": "ipython3",
499 |    "version": "3.7.4"
500 |   }
501 |  },
502 |  "nbformat": 4,
503 |  "nbformat_minor": 4
504 | }
505 | 


--------------------------------------------------------------------------------
/notebooks/merfish_spatial_celltype_org.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": null,
   6 |    "metadata": {},
   7 |    "outputs": [],
   8 |    "source": [
   9 |     "%load_ext autoreload\n",
  10 |     "%autoreload 2\n",
  11 |     "import scanpy as sc\n",
  12 |     "import matplotlib.pyplot as plt\n",
  13 |     "import numpy as np\n",
  14 |     "import matplotlib as mpl\n",
  15 |     "import os\n",
  16 |     "import anndata as ad\n",
  17 |     "mpl.rcParams['figure.dpi'] = 150\n",
  18 |     "plt.rcParams['pdf.fonttype'] = 42\n",
  19 |     "import sys\n",
  20 |     "from spatial_analysis import *\n",
  21 |     "from plotting import *\n",
  22 |     "from utils import *"
  23 |    ]
  24 |   },
  25 |   {
  26 |    "cell_type": "code",
  27 |    "execution_count": null,
  28 |    "metadata": {},
  29 |    "outputs": [],
  30 |    "source": [
  31 |     "import seaborn as sns\n",
  32 |     "sns.set_style('white')"
  33 |    ]
  34 |   },
  35 |   {
  36 |    "cell_type": "code",
  37 |    "execution_count": null,
  38 |    "metadata": {},
  39 |    "outputs": [],
  40 |    "source": [
  41 |     "def unbinarize_strings(A):\n",
  42 |     "    try:\n",
  43 |     "        A.var_names = [i.decode('ascii') for i in A.var_names]\n",
  44 |     "        A.obs.index = [i.decode('ascii') for i in A.obs.index]\n",
  45 |     "        for i in A.obs.columns:\n",
  46 |     "            if A.obs[i].dtype != np.dtype('bool') and \\\n",
  47 |     "                A.obs[i].dtype != np.dtype('int64') and \\\n",
  48 |     "                A.obs[i].dtype != np.dtype('int32') and \\\n",
  49 |     "                A.obs[i].dtype != np.dtype('object_') and \\\n",
  50 |     "                A.obs[i].dtype != np.dtype('float64') and A.obs[i].dtype != np.dtype('float32'):\n",
  51 |     "                if A.obs[i].dtype.is_dtype('category'):\n",
  52 |     "                    try:\n",
  53 |     "                        A.obs[i] = [i.decode('ascii') for i in A.obs[i]]\n",
  54 |     "                    except Exception as e:\n",
  55 |     "                        pass\n",
  56 |     "    except Exception as e:\n",
  57 |     "        print(e)\n",
  58 |     "    return A\n"
  59 |    ]
  60 |   },
  61 |   {
  62 |    "cell_type": "code",
  63 |    "execution_count": null,
  64 |    "metadata": {},
  65 |    "outputs": [],
  66 |    "source": [
  67 |     "# load annotated data (missing aging-related genes) and full data"
  68 |    ]
  69 |   },
  70 |   {
  71 |    "cell_type": "code",
  72 |    "execution_count": null,
  73 |    "metadata": {},
  74 |    "outputs": [],
  75 |    "source": [
  76 |     "adata_annot = ad.read_h5ad(\"/faststorage/brain_aging/merfish/exported/011722_adata_combined_harmony.h5ad\")\n",
  77 |     "adata_annot = unbinarize_strings(adata_annot)\n",
  78 |     "adata_annot = adata_annot[adata_annot.obs.dtype=='merfish']"
  79 |    ]
  80 |   },
  81 |   {
  82 |    "cell_type": "code",
  83 |    "execution_count": null,
  84 |    "metadata": {},
  85 |    "outputs": [],
  86 |    "source": [
  87 |     "adata_annot.raw = unbinarize_strings(adata_annot.raw.to_adata())"
  88 |    ]
  89 |   },
  90 |   {
  91 |    "cell_type": "code",
  92 |    "execution_count": null,
  93 |    "metadata": {
  94 |     "scrolled": true
  95 |    },
  96 |    "outputs": [],
  97 |    "source": [
  98 |     "celltype_colors, celltype_pals, label_colors, clust_pals = generate_palettes(adata_annot)"
  99 |    ]
 100 |   },
 101 |   {
 102 |    "cell_type": "markdown",
 103 |    "metadata": {},
 104 |    "source": [
 105 |     "# Compute neighborhoods"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "code",
 110 |    "execution_count": null,
 111 |    "metadata": {},
 112 |    "outputs": [],
 113 |    "source": [
 114 |     "adata_annot_young = adata_annot[adata_annot.obs.age=='4wk']\n",
 115 |     "adata_annot_med = adata_annot[adata_annot.obs.age=='24wk']\n",
 116 |     "adata_annot_old = adata_annot[adata_annot.obs.age=='90wk']\n"
 117 |    ]
 118 |   },
 119 |   {
 120 |    "cell_type": "code",
 121 |    "execution_count": null,
 122 |    "metadata": {},
 123 |    "outputs": [],
 124 |    "source": [
 125 |     "clust_order = [\n",
 126 |     " 'ExN-L2/3-1',\n",
 127 |     " 'ExN-L2/3-2',\n",
 128 |     " 'ExN-L5-1',\n",
 129 |     " 'ExN-L5-2',\n",
 130 |     " 'ExN-L5-3',\n",
 131 |     " 'ExN-L6-1',\n",
 132 |     " 'ExN-L6-2',\n",
 133 |     " 'ExN-L6-3',\n",
 134 |     " 'ExN-L6-4',\n",
 135 |     " 'ExN-LatSept',\n",
 136 |     "\n",
 137 |     " 'InN-Calb2',\n",
 138 |     " 'InN-Chat',\n",
 139 |     " 'InN-Lamp5',\n",
 140 |     " 'InN-LatSept',\n",
 141 |     " 'InN-Pvalb-1',\n",
 142 |     " 'InN-Pvalb-2',\n",
 143 |     " 'InN-Pvalb-3',\n",
 144 |     " 'InN-Sst',\n",
 145 |     " 'InN-Vip',\n",
 146 |     " 'MSN-D1-1',\n",
 147 |     " 'MSN-D1-2',\n",
 148 |     " 'MSN-D2',\n",
 149 |     " 'OPC',\n",
 150 |     " 'Olig-1',\n",
 151 |     " 'Olig-2',\n",
 152 |     " 'Olig-3',\n",
 153 |     "\n",
 154 |     "'Astro-1',\n",
 155 |     " 'Astro-2',\n",
 156 |     " 'Vlmc',\n",
 157 |     " 'Peri-1',\n",
 158 |     " 'Peri-2',\n",
 159 |     " 'Endo-1',\n",
 160 |     " 'Endo-2',\n",
 161 |     " 'Endo-3',\n",
 162 |     " 'Epen',\n",
 163 |     "\n",
 164 |     " 'Micro-1',\n",
 165 |     " 'Micro-2',\n",
 166 |     " 'Micro-3',\n",
 167 |     " 'Macro',\n",
 168 |     " 'T cell',\n",
 169 |     "]"
 170 |    ]
 171 |   },
 172 |   {
 173 |    "cell_type": "code",
 174 |    "execution_count": null,
 175 |    "metadata": {},
 176 |    "outputs": [],
 177 |    "source": [
 178 |     "import multiprocessing\n",
 179 |     "from joblib import Parallel, delayed"
 180 |    ]
 181 |   },
 182 |   {
 183 |    "cell_type": "code",
 184 |    "execution_count": null,
 185 |    "metadata": {
 186 |     "scrolled": true
 187 |    },
 188 |    "outputs": [],
 189 |    "source": [
 190 |     "young_neighbors, young_zscore, young_nbor_pvals = compute_celltype_neighborhood(adata_annot_young, 'clust_annot',celltypes=clust_annots, niter=500, radius=150)"
 191 |    ]
 192 |   },
 193 |   {
 194 |    "cell_type": "code",
 195 |    "execution_count": null,
 196 |    "metadata": {},
 197 |    "outputs": [],
 198 |    "source": [
 199 |     "med_neighbors, med_zscore, med_nbor_pvals = compute_celltype_neighborhood(adata_annot_med, 'clust_annot',celltypes=clust_annots, niter=500, radius=150)"
 200 |    ]
 201 |   },
 202 |   {
 203 |    "cell_type": "code",
 204 |    "execution_count": null,
 205 |    "metadata": {
 206 |     "scrolled": true
 207 |    },
 208 |    "outputs": [],
 209 |    "source": [
 210 |     "old_neighbors, old_zscore, old_nbor_pvals = compute_celltype_neighborhood(adata_annot_old, 'clust_annot', niter=500, celltypes=clust_annots, radius=150)"
 211 |    ]
 212 |   },
 213 |   {
 214 |    "cell_type": "code",
 215 |    "execution_count": null,
 216 |    "metadata": {},
 217 |    "outputs": [],
 218 |    "source": [
 219 |     "# hierarchically cluster zscore\n",
 220 |     "from scipy.cluster.hierarchy import dendrogram, linkage\n",
 221 |     "from scipy.spatial.distance import pdist\n",
 222 |     "\n",
 223 |     "def hierarchical_cluster_order(mat, method='ward'):\n",
 224 |     "    D = pdist(mat,'cosine')\n",
 225 |     "    D[np.isnan(D)] = 0\n",
 226 |     "    Z = linkage(D,method,optimal_ordering=True)\n",
 227 |     "    den = dendrogram(Z, no_plot=True)\n",
 228 |     "    return np.array(den['leaves'])\n",
 229 |     "\n",
 230 |     "def clust_avg(A, clust_key, clust_names):\n",
 231 |     "    return np.array([A[A.obs[clust_key]==i].X.mean(0) for i in clust_names])"
 232 |    ]
 233 |   },
 234 |   {
 235 |    "cell_type": "code",
 236 |    "execution_count": null,
 237 |    "metadata": {},
 238 |    "outputs": [],
 239 |    "source": [
 240 |     "\n",
 241 |     "young_clust_avg = clust_avg(adata_annot_young, 'clust_annot', clust_annots)\n",
 242 |     "clust_order = hierarchical_cluster_order(young_zscore, method='complete')\n",
 243 |     "ex_clusts = np.argwhere([True if \"Ex\" in i else False for i in clust_annots]).flatten()\n",
 244 |     "in_clusts = np.argwhere([True if (\"In\" in i or \"MSN\" in i) else False for i in clust_annots]).flatten()\n",
 245 |     "nn_clusts = np.argwhere([True if (\"Ex\" not in i and \"In\" not in i and \"MSN\" not in i) else False for i in clust_annots]).flatten()\n",
 246 |     "\n",
 247 |     "exn_order = hierarchical_cluster_order(old_zscore[ex_clusts,:][:, ex_clusts], 'complete')\n",
 248 |     "in_order = hierarchical_cluster_order(old_zscore[in_clusts,:][:, in_clusts], 'complete')+exn_order.max()+1\n",
 249 |     "nn_order = hierarchical_cluster_order(old_zscore[nn_clusts,:][:, nn_clusts], 'complete')+in_order.max()+1\n",
 250 |     "\n",
 251 |     "clust_order = np.arange(len(clust_order))#np.hstack((exn_order, in_order, nn_order))"
 252 |    ]
 253 |   },
 254 |   {
 255 |    "cell_type": "code",
 256 |    "execution_count": null,
 257 |    "metadata": {},
 258 |    "outputs": [],
 259 |    "source": [
 260 |     "seg_points = [9, 20, 26]\n",
 261 |     "def plot_nborhood(zs, clust_order, hide_labels=False, seg_points=None,vmin=-200,vmax=200):\n",
 262 |     "    f, ax = plt.subplots(figsize=(10,10))\n",
 263 |     "    gs = plt.GridSpec(nrows=2,ncols=2,width_ratios=[0.5, 20], height_ratios=[20,0.5], wspace=0.01, hspace=0.01)\n",
 264 |     "\n",
 265 |     "    ax = plt.subplot(gs[0,0])\n",
 266 |     "    curr_cmap = mpl.colors.ListedColormap([label_colors[i] for i in np.array(clust_annots)[clust_order][::-1]])\n",
 267 |     "    ax.imshow(np.expand_dims(np.arange(zs.shape[0]),1),aspect='auto',interpolation='none', cmap=curr_cmap,rasterized=True)\n",
 268 |     "    sns.despine(ax=ax,bottom=True,left=True)\n",
 269 |     "    if hide_labels:\n",
 270 |     "        ax.set_yticks([]);\n",
 271 |     "        ax.set_yticklabels([]);\n",
 272 |     "        ax.set_xticks([])\n",
 273 |     "    else:\n",
 274 |     "        ax.set_yticks(np.arange(len(clust_annots)));\n",
 275 |     "        ax.set_yticklabels(np.array(clust_annots)[clust_order][::-1]);\n",
 276 |     "        ax.set_xticks([])\n",
 277 |     "\n",
 278 |     "    ax = plt.subplot(gs[0,1])\n",
 279 |     "    ax.imshow(np.flipud(zs[clust_order,:][:,clust_order]),aspect='auto',interpolation='none',vmin=vmin,vmax=vmax,cmap=plt.cm.seismic, rasterized=True)\n",
 280 |     "    ax.axis('off')\n",
 281 |     "    if seg_points is not None:\n",
 282 |     "        for i in seg_points:\n",
 283 |     "            ax.axvline(i-0.5,color='k',linestyle='--')\n",
 284 |     "            ax.axhline(len(clust_annots)-i-0.5,color='k',linestyle='--')\n",
 285 |     "\n",
 286 |     "    ax = plt.subplot(gs[1,1])\n",
 287 |     "    ax.imshow(np.expand_dims(np.arange(zs.shape[1])[::-1],1).T,aspect='auto',interpolation='none',cmap=curr_cmap,rasterized=True)\n",
 288 |     "    sns.despine(ax=ax,bottom=True,left=True)\n",
 289 |     "    if hide_labels:\n",
 290 |     "        ax.set_xticks([])\n",
 291 |     "        ax.set_yticks([])\n",
 292 |     "    else:\n",
 293 |     "        ax.set_xticks(np.arange(len(clust_annots)));\n",
 294 |     "        ax.set_yticks([])\n",
 295 |     "        ax.set_xticklabels(np.array(clust_annots)[clust_order],rotation=90);\n",
 296 |     "    return f\n"
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "markdown",
 301 |    "metadata": {},
 302 |    "source": [
 303 |     "# Fig. 3: Neighborhood and interaction analysis"
 304 |    ]
 305 |   },
 306 |   {
 307 |    "cell_type": "code",
 308 |    "execution_count": null,
 309 |    "metadata": {},
 310 |    "outputs": [],
 311 |    "source": [
 312 |     "def nearest_neighbor_dists(A, cell_type_source, cell_type_targets):\n",
 313 |     "    \"\"\" Compute nearest neighbor distances from source to targets \"\"\"\n",
 314 |     "    pass"
 315 |    ]
 316 |   },
 317 |   {
 318 |    "cell_type": "code",
 319 |    "execution_count": null,
 320 |    "metadata": {},
 321 |    "outputs": [],
 322 |    "source": [
 323 |     "old_pos = adata_annot[adata_annot.obs.age=='90wk'].obsm['spatial']\n",
 324 |     "old_nn, _ = KDTree(old_pos).query(old_pos, k=2)\n"
 325 |    ]
 326 |   },
 327 |   {
 328 |    "cell_type": "code",
 329 |    "execution_count": null,
 330 |    "metadata": {},
 331 |    "outputs": [],
 332 |    "source": [
 333 |     "# nearest-neighbor distance of every cell, per age group (k=2 because the closest hit is normally the query cell itself)\n",
 334 |     "from sklearn.neighbors import KDTree\n",
 335 |     "old_pos = adata_annot[adata_annot.obs.age=='90wk'].obsm['spatial']\n",
 336 |     "old_nn, _ = KDTree(old_pos).query(old_pos, k=2)\n",
 337 |     "old_nn = old_nn[:,1]\n",
 338 |     "\n",
 339 |     "med_pos = adata_annot[adata_annot.obs.age=='24wk'].obsm['spatial']\n",
 340 |     "med_nn, _ = KDTree(med_pos).query(med_pos, k=2)\n",
 341 |     "med_nn = med_nn[:,1]\n",
 342 |     "\n",
 343 |     "young_pos = adata_annot[adata_annot.obs.age=='4wk'].obsm['spatial']\n",
 344 |     "young_nn, _ = KDTree(young_pos).query(young_pos, k=2)\n",
 345 |     "young_nn = young_nn[:,1]\n"
 346 |    ]
 347 |   },
 348 |   {
 349 |    "cell_type": "code",
 350 |    "execution_count": null,
 351 |    "metadata": {},
 352 |    "outputs": [],
 353 |    "source": [
 354 |     "#celltypes = adata_annot.obs.remapped_cell_type.unique()\n",
 355 |     "celltypes = [\n",
 356 |     "    'InN',\n",
 357 |     " 'ExN',\n",
 358 |     " 'MSN',\n",
 359 |     " 'Astro',\n",
 360 |     " 'OPC',\n",
 361 |     " 'Olig',\n",
 362 |     " 'Endo',\n",
 363 |     " 'Vlmc',\n",
 364 |     " 'Peri',\n",
 365 |     " 'Macro',\n",
 366 |     " 'Micro',\n",
 367 |     "]\n"
 368 |    ]
 369 |   },
 370 |   {
 371 |    "cell_type": "code",
 372 |    "execution_count": null,
 373 |    "metadata": {},
 374 |    "outputs": [],
 375 |    "source": [
 376 |     "adata_annot.obs['clust_reduced'] = [\"-\".join(i.split('-')[:-1]) if len(i.split('-'))>1 else i for i in adata_annot.obs.clust_annot]"
 377 |    ]
 378 |   },
 379 |   {
 380 |    "cell_type": "code",
 381 |    "execution_count": null,
 382 |    "metadata": {},
 383 |    "outputs": [],
 384 |    "source": [
 385 |     "clust_reduced_labels = list(adata_annot.obs.clust_reduced.unique())"
 386 |    ]
 387 |   },
 388 |   {
 389 |    "cell_type": "code",
 390 |    "execution_count": null,
 391 |    "metadata": {},
 392 |    "outputs": [],
 393 |    "source": [
 394 |     "niter = 500\n",
 395 |     "perturb_max = 100\n",
 396 |     "dist_thresh = 20"
 397 |    ]
 398 |   },
 399 |   {
 400 |    "cell_type": "code",
 401 |    "execution_count": null,
 402 |    "metadata": {
 403 |     "scrolled": true
 404 |    },
 405 |    "outputs": [],
 406 |    "source": [
 407 |     "#celltypes = sorted(adata_annot.obs.cell_type.unique())\n",
 408 |     "young_interactions_clust, young_pvals_clust, young_qvals_clust = compute_celltype_interactions(adata_annot[adata_annot.obs.age=='4wk'], \n",
 409 |     "                                                                'cell_type', celltypes,niter=niter,dist_thresh=dist_thresh,perturb_max=perturb_max)\n"
 410 |    ]
 411 |   },
 412 |   {
 413 |    "cell_type": "code",
 414 |    "execution_count": null,
 415 |    "metadata": {
 416 |     "scrolled": true
 417 |    },
 418 |    "outputs": [],
 419 |    "source": [
 420 |     "med_interactions_clust, med_pvals_clust, med_qvals_clust = compute_celltype_interactions(adata_annot[adata_annot.obs.age=='24wk'], \n",
 421 |     "                                                                'cell_type', celltypes,niter=niter,dist_thresh=dist_thresh,perturb_max=perturb_max)\n"
 422 |    ]
 423 |   },
 424 |   {
 425 |    "cell_type": "code",
 426 |    "execution_count": null,
 427 |    "metadata": {},
 428 |    "outputs": [],
 429 |    "source": [
 430 |     "old_interactions_clust, old_pvals_clust, old_qvals_clust = compute_celltype_interactions(adata_annot[adata_annot.obs.age=='90wk'], \n",
 431 |     "                                                                'cell_type', celltypes,niter=niter,dist_thresh=dist_thresh,perturb_max=perturb_max)\n"
 432 |    ]
 433 |   },
 434 |   {
 435 |    "cell_type": "code",
 436 |    "execution_count": null,
 437 |    "metadata": {},
 438 |    "outputs": [],
 439 |    "source": [
 440 |     "from statsmodels.stats.multitest import multipletests\n",
 441 |     "def fdr_correct(X):\n",
 442 |     "    \"\"\"Return Benjamini-Hochberg q-values computed independently for each row of the square p-value matrix X.\"\"\"\n",
 443 |     "    new_X = np.zeros_like(X)\n",
 444 |     "    for i in range(X.shape[0]):\n",
 445 |     "        new_X[i,:] = multipletests(X[i,:],method='fdr_bh')[1]\n",
 446 |     "        # mirror row i into column i; later rows overwrite the mirrored entries of earlier rows\n",
 447 |     "        new_X[:,i] = new_X[i,:]\n",
 448 |     "    return new_X"
 449 |    ]
 450 |   },
 451 |   {
 452 |    "cell_type": "code",
 453 |    "execution_count": null,
 454 |    "metadata": {},
 455 |    "outputs": [],
 456 |    "source": [
 457 |     "young_qvals_clust = fdr_correct(young_pvals_clust.copy())\n",
 458 |     "med_qvals_clust = fdr_correct(med_pvals_clust.copy())\n",
 459 |     "old_qvals_clust = fdr_correct(old_pvals_clust.copy())\n"
 460 |    ]
 461 |   },
 462 |   {
 463 |    "cell_type": "code",
 464 |    "execution_count": null,
 465 |    "metadata": {},
 466 |    "outputs": [],
 467 |    "source": [
 468 |     "young_interactions_clust[np.isinf(young_interactions_clust)] = 5\n",
 469 |     "med_interactions_clust[np.isinf(med_interactions_clust)] = 5\n",
 470 |     "old_interactions_clust[np.isinf(old_interactions_clust)] = 5\n"
 471 |    ]
 472 |   },
 473 |   {
 474 |    "cell_type": "code",
 475 |    "execution_count": null,
 476 |    "metadata": {},
 477 |    "outputs": [],
 478 |    "source": [
 479 |     "young_qvals_clust[np.isnan(young_qvals_clust)] = 1\n",
 480 |     "med_qvals_clust[np.isnan(med_qvals_clust)] = 1\n",
 481 |     "old_qvals_clust[np.isnan(old_qvals_clust)] = 1"
 482 |    ]
 483 |   },
 484 |   {
 485 |    "cell_type": "code",
 486 |    "execution_count": null,
 487 |    "metadata": {},
 488 |    "outputs": [],
 489 |    "source": [
 490 |     "from plotting import plot_interactions"
 491 |    ]
 492 |   },
 493 |   {
 494 |    "cell_type": "code",
 495 |    "execution_count": null,
 496 |    "metadata": {},
 497 |    "outputs": [],
 498 |    "source": [
 499 |     "f = plot_interactions(young_qvals_clust, young_interactions_clust, celltypes,celltype_colors,cmap=plt.cm.seismic,vmax=5, vmin=-5)\n",
 500 |     "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_young.pdf\",bbox_inches='tight', dpi=200)"
 501 |    ]
 502 |   },
 503 |   {
 504 |    "cell_type": "code",
 505 |    "execution_count": null,
 506 |    "metadata": {},
 507 |    "outputs": [],
 508 |    "source": [
 509 |     "f = plot_interactions(med_qvals_clust, med_interactions_clust, celltypes,celltype_colors,cmap=plt.cm.seismic,vmax=3, vmin=-3)\n",
 510 |     "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_med.pdf\",bbox_inches='tight', dpi=200)"
 511 |    ]
 512 |   },
 513 |   {
 514 |    "cell_type": "code",
 515 |    "execution_count": null,
 516 |    "metadata": {},
 517 |    "outputs": [],
 518 |    "source": [
 519 |     "f = plot_interactions(old_qvals_clust, old_interactions_clust, celltypes,celltype_colors,cmap=plt.cm.seismic,vmax=3, vmin=-3)\n",
 520 |     "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_old.pdf\",bbox_inches='tight', dpi=200)"
 521 |    ]
 522 |   },
 523 |   {
 524 |    "cell_type": "code",
 525 |    "execution_count": null,
 526 |    "metadata": {},
 527 |    "outputs": [],
 528 |    "source": [
 529 |     "# change in interaction score between the old and the young group\n",
 530 |     "diff = old_interactions_clust-young_interactions_clust\n",
 531 |     "diff[np.isnan(diff)] = 0\n",
 532 |     "# a pair counts as significant when its FDR-corrected q-value is below 0.05 in\n",
 533 |     "# either group; both groups are tested on q-values (the old group used to be\n",
 534 |     "# tested on its uncorrected p-values at this point)\n",
 535 |     "sig_either = (young_qvals_clust < 0.05) | (old_qvals_clust < 0.05)\n",
 536 |     "diff[~sig_either] = 0\n",
 537 |     "# pseudo q-values: 0 for significant pairs whose interaction score is positive\n",
 538 |     "# in either group, 1 for everything else\n",
 539 |     "pos_either = (old_interactions_clust > 0) | (young_interactions_clust > 0)\n",
 540 |     "diff_qvals = np.where(sig_either & pos_either, 0, 1).astype(old_qvals_clust.dtype)"
 547 |    ]
 548 |   },
 549 |   {
 550 |    "cell_type": "code",
 551 |    "execution_count": null,
 552 |    "metadata": {},
 553 |    "outputs": [],
 554 |    "source": [
 555 |     "f = plot_interactions(diff_qvals, diff, celltypes,celltype_colors,cmap=plt.cm.Reds,vmax=1.2, vmin=0)\n",
 556 |     "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_diff.pdf\",bbox_inches='tight', dpi=200)"
 557 |    ]
 558 |   },
 559 |   {
 560 |    "cell_type": "code",
 561 |    "execution_count": null,
 562 |    "metadata": {},
 563 |    "outputs": [],
 564 |    "source": [
 565 |     "#f,ax = plt.subplots(figsize=(5,5))\n",
 566 |     "#ax.imshow(np.flipud(diff),cmap=plt.cm.seismic,vmin=-2.5, vmax=2.5)\n",
 567 |     "#ax.set_xticks(np.arange(diff.shape[0]));\n",
 568 |     "#ax.set_xticklabels(celltypes,rotation=90)\n",
 569 |     "#ax.set_yticks(np.arange(diff.shape[1]));\n",
 570 |     "#ax.set_yticklabels(celltypes[::-1]);\n",
 571 |     "#sns.despine(ax=ax,left=True, bottom=True)\n",
 572 |     "#f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_diff.pdf\",bbox_inches='tight', dpi=200)"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "markdown",
 577 |    "metadata": {},
 578 |    "source": [
 579 |     "# Redo this at higher resolution"
 580 |    ]
 581 |   },
 582 |   {
 583 |    "cell_type": "markdown",
 584 |    "metadata": {},
 585 |    "source": [
 586 |     "# test effect of cell-cell interaction on activated state"
 587 |    ]
 588 |   },
 589 |   {
 590 |    "cell_type": "code",
 591 |    "execution_count": null,
 592 |    "metadata": {},
 593 |    "outputs": [],
 594 |    "source": [
 595 |     "sc.tl.score_genes(adata_annot, gene_list=['B2m','Trem2', 'Ccl2', 'Apoe',  'Axl', 'Itgax', 'Cd9','C1qa','C1qc','Lyz2','Ctss'], score_name='activate_micro', use_raw=False)\n",
 596 |     "sc.tl.score_genes(adata_annot, gene_list=['C4b', 'C3', 'Serpina3n', 'Cxcl10', 'Gfap', 'Vim', 'Il18','Hif3a'], score_name='activate_astro', use_raw=False)\n",
 597 |     "\n",
 598 |     "sc.tl.score_genes(adata_annot, gene_list=activate_endo, score_name='activate_endo',use_raw=False)"
 599 |    ]
 600 |   },
 601 |   {
 602 |    "cell_type": "code",
 603 |    "execution_count": null,
 604 |    "metadata": {},
 605 |    "outputs": [],
 606 |    "source": [
 607 |     "adata_micro = adata_annot[adata_annot.obs.cell_type==\"Micro\"]\n",
 608 |     "adata_annot.obs.activate_micro = adata_annot.obs.activate_micro - np.mean(adata_micro[adata_micro.obs.age=='4wk'].obs.activate_micro)"
 609 |    ]
 610 |   },
 611 |   {
 612 |    "cell_type": "code",
 613 |    "execution_count": null,
 614 |    "metadata": {},
 615 |    "outputs": [],
 616 |    "source": [
 617 |     "adata_astro = adata_annot[adata_annot.obs.cell_type==\"Astro\"]\n",
 618 |     "adata_annot.obs.activate_astro = adata_annot.obs.activate_astro - np.mean(adata_astro[adata_astro.obs.age=='4wk'].obs.activate_astro)"
 619 |    ]
 620 |   },
 621 |   {
 622 |    "cell_type": "code",
 623 |    "execution_count": null,
 624 |    "metadata": {},
 625 |    "outputs": [],
 626 |    "source": [
 627 |     "from spatial_analysis import *\n",
 628 |     "from plotting import *"
 629 |    ]
 630 |   },
 631 |   {
 632 |    "cell_type": "code",
 633 |    "execution_count": null,
 634 |    "metadata": {},
 635 |    "outputs": [],
 636 |    "source": [
 637 |     "sc.pl.umap(adata_annot, color=['age','activate_astro'])"
 638 |    ]
 639 |   },
 640 |   {
 641 |    "cell_type": "code",
 642 |    "execution_count": null,
 643 |    "metadata": {},
 644 |    "outputs": [],
 645 |    "source": [
 646 |     "def identify_nearest_neighbors_with_idx(X,Y,dist_thresh, min_dist_thresh=15):\n",
 647 |     "    # For every point of X, collect the points of Y that lie within dist_thresh\n",
 648 |     "    # but farther away than min_dist_thresh.\n",
 649 |     "    # Returns two aligned integer arrays with one entry per retained pair:\n",
 650 |     "    # (row index into Y, row index into X). Both are empty when there is no pair.\n",
 651 |     "    if X.shape[0] == 0 or Y.shape[0] == 0:\n",
 652 |     "        # callers unpack two values, so return an empty pair rather than one array\n",
 653 |     "        return np.array([], dtype=int), np.array([], dtype=int)\n",
 654 |     "    kdtree = KDTree(Y)\n",
 655 |     "    ind, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False,return_distance=True)\n",
 656 |     "    # repeat each X index once per neighbour; unlike hstack over a filtered list,\n",
 657 |     "    # np.repeat also works when no point of X has any neighbour\n",
 658 |     "    ind_X = np.repeat(np.arange(len(ind)), [len(nbrs) for nbrs in ind])\n",
 659 |     "    ind = np.hstack(ind)\n",
 660 |     "    dists = np.hstack(dists)\n",
 661 |     "    keep = dists>min_dist_thresh\n",
 662 |     "    # builtin int replaces the np.int alias, which NumPy 1.24 removed\n",
 663 |     "    return ind[keep].astype(int), ind_X[keep].astype(int)\n",
 660 |     "\n",
 661 |     "def count_neighbors_with_idx(X,Y,dist_thresh, ):\n",
 662 |     "    # Count, for every point of X, the points of Y that lie within dist_thresh.\n",
 663 |     "    # Returns (ind_X, counts_Y): the row indices 0..len(X)-1 and the matching counts.\n",
 664 |     "    n_query = X.shape[0]\n",
 665 |     "    if n_query == 0 or Y.shape[0] == 0:\n",
 666 |     "        # nothing to search: every point of X has zero neighbours. A pair is\n",
 667 |     "        # returned because callers unpack two values.\n",
 668 |     "        return np.arange(n_query), np.zeros(n_query, dtype=int)\n",
 669 |     "    kdtree = KDTree(Y)\n",
 670 |     "    ind, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False,return_distance=True)\n",
 671 |     "    counts_Y = np.array([len(nbrs) for nbrs in ind])\n",
 672 |     "    ind_X = np.arange(len(ind))\n",
 673 |     "    # builtin int replaces the np.int alias, which NumPy 1.24 removed\n",
 674 |     "    return ind_X.astype(int), counts_Y.astype(int)\n",
 670 |     "\n",
 671 |     "def identify_nearest_neighbors_with_dist(X,Y, min_dist=0):\n",
 672 |     "    # For every point of X, return the distance to and the row index of its nearest\n",
 673 |     "    # point in Y. A nearest hit at distance <= min_dist (with the default 0: a point\n",
 674 |     "    # at the very same position) is skipped in favour of the second-nearest point.\n",
 675 |     "    # Returns (distances, integer indices into Y); both are empty if X or Y is empty.\n",
 676 |     "    if X.shape[0] == 0 or Y.shape[0] == 0:\n",
 677 |     "        # callers unpack two values, so return an empty pair rather than one array\n",
 678 |     "        return np.array([]), np.array([], dtype=int)\n",
 679 |     "    kdtree = KDTree(Y)\n",
 680 |     "    # k may not exceed the number of points in Y, so use k=1 for a single point\n",
 681 |     "    dists, ind = kdtree.query(X, k=min(2, Y.shape[0]), return_distance=True)\n",
 682 |     "    # the last column is the second neighbour (or the only one when k=1)\n",
 683 |     "    use_first = dists[:,0] > min_dist\n",
 684 |     "    good_dists = np.where(use_first, dists[:,0], dists[:,-1])\n",
 685 |     "    good_ind = np.where(use_first, ind[:,0], ind[:,-1])\n",
 686 |     "    return good_dists, good_ind\n",
 689 |     "    \n",
 690 |     "def compute_celltype_obs_count_correlation(A,cell_type_X, cell_type_Y, obs_key_X, celltype_key='cell_type',radius=40, min_dist_thresh=15):\n",
 691 |     "    # Pair the obs_key_X value of every cell of type cell_type_X with the number of\n",
 692 |     "    # cell_type_Y cells found within radius of it (coordinates from obsm['spatial']).\n",
 693 |     "    # Returns (obs values, indices of the cell_type_X cells, neighbour counts).\n",
 694 |     "    # min_dist_thresh is accepted but not used by this function.\n",
 695 |     "    query_cells = A[A.obs[celltype_key] == cell_type_X]\n",
 696 |     "    target_cells = A[A.obs[celltype_key] == cell_type_Y]\n",
 697 |     "    query_obs = query_cells.obs[obs_key_X]\n",
 698 |     "    query_xy = query_cells.obsm['spatial']\n",
 699 |     "    target_xy = target_cells.obsm['spatial']\n",
 700 |     "    ind_X, counts_Y = count_neighbors_with_idx(query_xy, target_xy, dist_thresh=radius)\n",
 701 |     "    return query_obs.values[ind_X], ind_X, counts_Y\n",
 698 |     "\n",
 699 |     "\n",
 700 |     "def compute_celltype_obs_distance_correlation(A,cell_type_X, cell_type_Y, obs_key_X, celltype_key1='cell_type', celltype_key2='cell_type'):\n",
 701 |     "    # Pair the obs_key_X value of every cell of type cell_type_X with the distance to\n",
 702 |     "    # its nearest cell of type cell_type_Y (coordinates from obsm['spatial']).\n",
 703 |     "    # The two cell types may be looked up in different obs columns\n",
 704 |     "    # (celltype_key1 for X, celltype_key2 for Y). Returns (obs values, distances).\n",
 705 |     "    query_cells = A[A.obs[celltype_key1] == cell_type_X]\n",
 706 |     "    target_cells = A[A.obs[celltype_key2] == cell_type_Y]\n",
 707 |     "    query_obs = query_cells.obs[obs_key_X]\n",
 708 |     "    query_xy = query_cells.obsm['spatial']\n",
 709 |     "    target_xy = target_cells.obsm['spatial']\n",
 710 |     "    # only the distances are used; the neighbour indices are discarded\n",
 711 |     "    dists_Y, _nearest_idx = identify_nearest_neighbors_with_dist(query_xy, target_xy)\n",
 712 |     "    return query_obs.values, dists_Y\n",
 708 |     "\n",
 709 |     "def compute_celltype_obs_correlation(A,cell_type_X, cell_type_Y, obs_key_X, obs_key_Y, celltype_key='cell_type', radius=40, min_dist_thresh=15):\n",
 710 |     "    # For every (X cell, Y cell) pair whose distance is within radius but above\n",
 711 |     "    # min_dist_thresh, pair the obs_key_X value of the X cell with the obs_key_Y\n",
 712 |     "    # value of the Y cell. Returns two aligned arrays (X values, Y values).\n",
 713 |     "    X = A[A.obs[celltype_key] == cell_type_X]\n",
 714 |     "    Y = A[A.obs[celltype_key] == cell_type_Y]\n",
 715 |     "    obs_X = X.obs[obs_key_X]\n",
 716 |     "    obs_Y = Y.obs[obs_key_Y]\n",
 717 |     "    curr_X = X.obsm['spatial']\n",
 718 |     "    curr_Y = Y.obsm['spatial']\n",
 719 |     "    neighbors_X, ind_X = identify_nearest_neighbors_with_idx(curr_X, curr_Y, dist_thresh=radius, min_dist_thresh=min_dist_thresh)\n",
 720 |     "    # neighbors_X holds row positions, so select by position explicitly; plain\n",
 721 |     "    # obs_Y[...] would go through label lookup / deprecated positional fallback\n",
 722 |     "    curr_expr = obs_Y.iloc[neighbors_X]\n",
 723 |     "    return obs_X.values[ind_X], curr_expr.values"
 719 |    ]
 720 |   },
 721 |   {
 722 |    "cell_type": "code",
 723 |    "execution_count": null,
 724 |    "metadata": {},
 725 |    "outputs": [],
 726 |    "source": [
 727 |     "def compute_binned_values(dists, scores, min_d=0, max_d=100, bin_size=30):\n",
 728 |     "    binned_mean = np.zeros(max_d-min_d-bin_size)\n",
 729 |     "    binned_std = np.zeros(max_d-min_d-bin_size)\n",
 730 |     "    for i in np.arange(min_d, max_d-bin_size):\n",
 731 |     "        # find distances in this bin range\n",
 732 |     "        idx = np.argwhere(np.logical_and(dists>i, dists<=(i+bin_size)))\n",
 733 |     "        curr_scores = scores[idx]\n",
 734 |     "        binned_mean[i] = np.mean(curr_scores)#/len(idx)\n",
 735 |     "        binned_std[i] = np.std(curr_scores)/np.sqrt(len(curr_scores))#/len(idx)\n",
 736 |     "    binned_mean -= binned_mean.mean()\n",
 737 |     "    binned_std -= binned_mean.mean()\n",
 738 |     "    return binned_mean, binned_std"
 739 |    ]
 740 |   },
 741 |   {
 742 |    "cell_type": "code",
 743 |    "execution_count": null,
 744 |    "metadata": {},
 745 |    "outputs": [],
 746 |    "source": [
 747 |     "# astro to peri-1/peri-2\n",
 748 |     "plt.figure(figsize=(3,3))\n",
 749 |     "celltypes = [\"Peri-1\",\"Peri-2\"]\n",
 750 |     "for i in celltypes:\n",
 751 |     "    scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='4wk'], \"Astro\", i, \"activate_astro\", celltype_key2='clust_annot')\n",
 752 |     "    binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=80)\n",
 753 |     "    x = np.arange(len(binned_mean))+30\n",
 754 |     "    plt.plot(x,binned_mean,color=label_colors[i])\n",
 755 |     "    plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=label_colors[i])\n",
 756 |     "#plt.legend( celltypes)\n",
 757 |     "plt.ylim([-0.2, 0.3])\n",
 758 |     "sns.despine()\n",
 759 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_peri_score_4wk.pdf\",bbox_inches='tight',dpi=300)"
 760 |    ]
 761 |   },
 762 |   {
 763 |    "cell_type": "code",
 764 |    "execution_count": null,
 765 |    "metadata": {},
 766 |    "outputs": [],
 767 |    "source": [
 768 |     "# astro to peri-1/peri-2\n",
 769 |     "plt.figure(figsize=(3,3))\n",
 770 |     "celltypes = [\"Peri-1\",\"Peri-2\"]\n",
 771 |     "for i in celltypes:\n",
 772 |     "    scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='90wk'], \"Astro\", i, \"activate_astro\", celltype_key2='clust_annot')\n",
 773 |     "    binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=80)\n",
 774 |     "    x = np.arange(len(binned_mean))+30\n",
 775 |     "    plt.plot(x,binned_mean,color=label_colors[i])\n",
 776 |     "    plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=label_colors[i])\n",
 777 |     "#plt.legend( celltypes)\n",
 778 |     "plt.ylim([-0.2, 0.4])\n",
 779 |     "sns.despine()\n",
 780 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_peri_score_90wk.pdf\",bbox_inches='tight',dpi=300)"
 781 |    ]
 782 |   },
 783 |   {
 784 |    "cell_type": "code",
 785 |    "execution_count": null,
 786 |    "metadata": {},
 787 |    "outputs": [],
 788 |    "source": [
 789 |     "plt.figure(figsize=(3,3))\n",
 790 |     "celltypes = [\"Peri\",\"Endo\",\"Vlmc\", \"Olig\"]\n",
 791 |     "for i in celltypes:\n",
 792 |     "    scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='4wk'], \"Micro\", i, \"activate_micro\")\n",
 793 |     "    binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30)\n",
 794 |     "    x = np.arange(len(binned_mean))+30\n",
 795 |     "    plt.plot(x,binned_mean,color=celltype_colors[i])\n",
 796 |     "    plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n",
 797 |     "plt.legend( celltypes)\n",
 798 |     "plt.ylim([-0.05, 0.12])\n",
 799 |     "sns.despine()\n",
 800 |     "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_micro_score_4wk.pdf\",bbox_inches='tight',dpi=300)"
 801 |    ]
 802 |   },
 803 |   {
 804 |    "cell_type": "code",
 805 |    "execution_count": null,
 806 |    "metadata": {},
 807 |    "outputs": [],
 808 |    "source": [
 809 |     "plt.figure(figsize=(3,3))\n",
 810 |     "for i in celltypes:\n",
 811 |     "    scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='90wk'], \"Micro\", i, \"activate_micro\")\n",
 812 |     "    binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30)\n",
 813 |     "    x = np.arange(len(binned_mean))+30\n",
 814 |     "    plt.plot(x,binned_mean,color=celltype_colors[i])\n",
 815 |     "    plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n",
 816 |     "\n",
 817 |     "plt.legend( celltypes)\n",
 818 |     "plt.ylim([-0.05, 0.12])\n",
 819 |     "sns.despine()\n",
 820 |     "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_micro_score_90wk.pdf\",bbox_inches='tight',dpi=300)"
 821 |    ]
 822 |   },
 823 |   {
 824 |    "cell_type": "code",
 825 |    "execution_count": null,
 826 |    "metadata": {},
 827 |    "outputs": [],
 828 |    "source": [
 829 |     "plt.figure(figsize=(3,3))\n",
 830 |     "#celltypes = [\"Endo\",\"Vlmc\", \"Olig\", \"Micro\"]\n",
 831 |     "for i in celltypes:\n",
 832 |     "    scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='4wk'], \"Astro\", i, \"activate_astro\")\n",
 833 |     "    binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=100)\n",
 834 |     "    x = np.arange(len(binned_mean))+30\n",
 835 |     "    plt.plot(x,binned_mean,color=celltype_colors[i])\n",
 836 |     "    plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n",
 837 |     "plt.legend( celltypes )\n",
 838 |     "sns.despine()\n",
 839 |     "plt.ylim([-0.2, 0.3])\n",
 840 |     "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_astro_score_4wk.pdf\",bbox_inches='tight',dpi=300)"
 841 |    ]
 842 |   },
 843 |   {
 844 |    "cell_type": "code",
 845 |    "execution_count": null,
 846 |    "metadata": {},
 847 |    "outputs": [],
 848 |    "source": [
 849 |     "plt.figure(figsize=(3,3))\n",
 850 |     "\n",
 851 |     "for i in celltypes:\n",
 852 |     "    scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='90wk'], \"Astro\", i, \"activate_astro\")\n",
 853 |     "    binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=150)\n",
 854 |     "    x = np.arange(len(binned_mean))+30\n",
 855 |     "    plt.plot(x,binned_mean,color=celltype_colors[i])\n",
 856 |     "    plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n",
 857 |     "#plt.legend(celltypes)\n",
 858 |     "sns.despine()\n",
 859 |     "#plt.ylim([-0.2, 0.3])\n",
 860 |     "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_astro_score_90wk.pdf\",bbox_inches='tight',dpi=300)"
 861 |    ]
 862 |   },
 863 |   {
 864 |    "cell_type": "code",
 865 |    "execution_count": null,
 866 |    "metadata": {},
 867 |    "outputs": [],
 868 |    "source": [
 869 |     "sc.tl.score_genes(adata_annot, gene_list=[ \"C4b\", \"Il18\", \"Il33\"], score_name=\"activate_olig\",use_raw=False)"
 870 |    ]
 871 |   },
 872 |   {
 873 |    "cell_type": "code",
 874 |    "execution_count": null,
 875 |    "metadata": {},
 876 |    "outputs": [],
 877 |    "source": [
 878 |     "adata_olig = adata_annot[adata_annot.obs.cell_type==\"Olig\"]\n",
 879 |     "adata_annot.obs.activate_olig = adata_annot.obs.activate_olig - np.mean(adata_olig[adata_olig.obs.age=='4wk'].obs.activate_olig)"
 880 |    ]
 881 |   },
 882 |   {
 883 |    "cell_type": "code",
 884 |    "execution_count": null,
 885 |    "metadata": {},
 886 |    "outputs": [],
 887 |    "source": [
 888 |     "x,y = compute_celltype_obs_correlation(adata_annot,  \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\",   radius=40)\n"
 889 |    ]
 890 |   },
 891 |   {
 892 |    "cell_type": "code",
 893 |    "execution_count": null,
 894 |    "metadata": {},
 895 |    "outputs": [],
 896 |    "source": [
 897 |     "spatial_regions = [\"Pia\", \"L2/3\", \"L5\", \"L6\", \"CC\", \"Striatum\", \"Ventricle\"]\n",
 898 |     "ct_combos = [[\"Olig\", \"Astro\"],[\"Olig\",\"Micro\"],[\"Micro\", \"Astro\"]]\n",
 899 |     "cc = np.zeros((len(spatial_regions), 3))\n",
 900 |     "for i,r in enumerate(spatial_regions):\n",
 901 |     "    print(r)\n",
 902 |     "    for j,t in enumerate(ct_combos):\n",
 903 |     "        t1 = t[0]\n",
 904 |     "        t2 = t[1]\n",
 905 |     "        #curr_annot = adata_annot[adata_annot.obs.age=='90wk']\n",
 906 |     "        x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==r],  t1,t2, f\"activate_{t1.lower()}\",f\"activate_{t2.lower()}\",   radius=40)\n",
 907 |     "        cc[i,j] = np.corrcoef(x,y)[0,1]\n",
 908 |     "    "
 909 |    ]
 910 |   },
 911 |   {
 912 |    "cell_type": "code",
 913 |    "execution_count": null,
 914 |    "metadata": {},
 915 |    "outputs": [],
 916 |    "source": [
 917 |     "# look at correlation between Il33 and Activated Micro/Astro\n",
 918 |     "x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==\"CC\"],  \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\",   radius=30)\n",
 919 |     "plt.figure(figsize=(5,5))\n",
 920 |     "#plt.scatter(x,y,s=1)\n",
 921 |     "plt.title(f\"Olig -> Micro (R={np.corrcoef(x,y)[0,1]})\")\n",
 922 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
 923 |     "#plt.xlim([0,5])\n",
 924 |     "#plt.axis('off')\n",
 925 |     "sns.despine()\n",
 926 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_micro.pdf\",bbox_inches='tight',dpi=300)"
 927 |    ]
 928 |   },
 929 |   {
 930 |    "cell_type": "code",
 931 |    "execution_count": null,
 932 |    "metadata": {},
 933 |    "outputs": [],
 934 |    "source": [
 935 |     "# look at correlation between Il33 and Activated Micro/Astro\n",
 936 |     "x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==\"CC\"],  \"Olig\",\"Astro\", f\"activate_olig\",f\"activate_astro\",   radius=30)\n",
 937 |     "plt.figure(figsize=(5,5))\n",
 938 |     "#plt.scatter(x,y,s=1)\n",
 939 |     "plt.title(f\"Olig -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n",
 940 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
 941 |     "#plt.xlim([0,5])\n",
 942 |     "#plt.axis('off')\n",
 943 |     "sns.despine()\n",
 944 |     "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_astro.pdf\",bbox_inches='tight',dpi=300)"
 945 |    ]
 946 |   },
 947 |   {
 948 |    "cell_type": "code",
 949 |    "execution_count": null,
 950 |    "metadata": {},
 951 |    "outputs": [],
 952 |    "source": [
 953 |     "adata_annot_yng = adata_annot[adata_annot.obs.age=='4wk']\n",
 954 |     "adata_annot_old = adata_annot[adata_annot.obs.age=='90wk']\n"
 955 |    ]
 956 |   },
 957 |   {
 958 |    "cell_type": "code",
 959 |    "execution_count": null,
 960 |    "metadata": {},
 961 |    "outputs": [],
 962 |    "source": [
 963 |     "# look at correlation between Il33 and Activated Micro/Astro\n",
 964 |     "x,y = compute_celltype_obs_correlation(adata_annot_yng[adata_annot_yng.obs.spatial_clust_annots==\"CC\"],  \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\",   radius=30)\n",
 965 |     "plt.figure(figsize=(5,5))\n",
 966 |     "#plt.scatter(x,y,s=1)\n",
 967 |     "plt.title(f\"Olig -> Micro (R={np.corrcoef(x,y)[0,1]})\")\n",
 968 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
 969 |     "#plt.xlim([0,5])\n",
 970 |     "#plt.axis('off')\n",
 971 |     "sns.despine()\n",
 972 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_micro_yng.pdf\",bbox_inches='tight',dpi=300)"
 973 |    ]
 974 |   },
 975 |   {
 976 |    "cell_type": "code",
 977 |    "execution_count": null,
 978 |    "metadata": {},
 979 |    "outputs": [],
 980 |    "source": [
 981 |     "# look at correlation between Il33 and Activated Micro/Astro\n",
 982 |     "x,y = compute_celltype_obs_correlation(adata_annot_old[adata_annot_old.obs.spatial_clust_annots==\"CC\"],  \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\",   radius=30)\n",
 983 |     "plt.figure(figsize=(5,5))\n",
 984 |     "#plt.scatter(x,y,s=1)\n",
 985 |     "plt.title(f\"Olig -> Micro (R={np.corrcoef(x,y)[0,1]})\")\n",
 986 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
 987 |     "#plt.xlim([0,5])\n",
 988 |     "#plt.axis('off')\n",
 989 |     "sns.despine()\n",
 990 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_micro_old.pdf\",bbox_inches='tight',dpi=300)"
 991 |    ]
 992 |   },
 993 |   {
 994 |    "cell_type": "code",
 995 |    "execution_count": null,
 996 |    "metadata": {},
 997 |    "outputs": [],
 998 |    "source": [
 999 |     "# look at correlation between Il33 and Activated Micro/Astro\n",
1000 |     "x,y = compute_celltype_obs_correlation(adata_annot_old[adata_annot_old.obs.spatial_clust_annots==\"CC\"],  \"Olig\",\"Astro\", f\"activate_olig\",f\"activate_astro\",   radius=30)\n",
1001 |     "plt.figure(figsize=(5,5))\n",
1002 |     "#plt.scatter(x,y,s=1)\n",
1003 |     "plt.title(f\"Olig -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n",
1004 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
1005 |     "#plt.xlim([0,5])\n",
1006 |     "#plt.axis('off')\n",
1007 |     "sns.despine()\n",
1008 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_astro_old.pdf\",bbox_inches='tight',dpi=300)"
1009 |    ]
1010 |   },
1011 |   {
1012 |    "cell_type": "code",
1013 |    "execution_count": null,
1014 |    "metadata": {},
1015 |    "outputs": [],
1016 |    "source": [
1017 |     "# look at correlation between Il33 and Activated Micro/Astro\n",
1018 |     "x,y = compute_celltype_obs_correlation(adata_annot_yng[adata_annot_yng.obs.spatial_clust_annots==\"CC\"],  \"Olig\",\"Astro\", f\"activate_olig\",f\"activate_astro\",   radius=30)\n",
1019 |     "plt.figure(figsize=(5,5))\n",
1020 |     "#plt.scatter(x,y,s=1)\n",
1021 |     "plt.title(f\"Olig -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n",
1022 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
1023 |     "#plt.xlim([0,5])\n",
1024 |     "#plt.axis('off')\n",
1025 |     "sns.despine()\n",
1026 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_astro_yng.pdf\",bbox_inches='tight',dpi=300)"
1027 |    ]
1028 |   },
1029 |   {
1030 |    "cell_type": "code",
1031 |    "execution_count": null,
1032 |    "metadata": {},
1033 |    "outputs": [],
1034 |    "source": [
1035 |     "x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==\"CC\"],  \"Micro\",\"Astro\", f\"activate_micro\",f\"activate_astro\",   radius=30)\n",
1036 |     "plt.figure(figsize=(5,5))\n",
1037 |     "plt.title(f\"Micro -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n",
1038 |     "#plt.hist2d(x,y,cmap=plt.cm.viridis,bins=20,rasterized=True);\n",
1039 |     "#plt.scatter(x,y,s=1)\n",
1040 |     "sns.kdeplot(x=x,y=y,fill=True)\n",
1041 |     "#plt.xlim([0,5])\n",
1042 |     "#plt.axis('off')\n",
1043 |     "sns.despine()\n",
1044 |     "\n",
1045 |     "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_micro_astro.pdf\",bbox_inches='tight',dpi=300)"
1046 |    ]
1047 |   }
1048 |  ],
1049 |  "metadata": {
1050 |   "kernelspec": {
1051 |    "display_name": "Python 3.8.1 64-bit ('scrnaseq': conda)",
1052 |    "language": "python",
1053 |    "name": "python38164bitscrnaseqcondaced2695c94d346d998c0cef2164233d9"
1054 |   },
1055 |   "language_info": {
1056 |    "codemirror_mode": {
1057 |     "name": "ipython",
1058 |     "version": 3
1059 |    },
1060 |    "file_extension": ".py",
1061 |    "mimetype": "text/x-python",
1062 |    "name": "python",
1063 |    "nbconvert_exporter": "python",
1064 |    "pygments_lexer": "ipython3",
1065 |    "version": "3.8.1"
1066 |   }
1067 |  },
1068 |  "nbformat": 4,
1069 |  "nbformat_minor": 4
1070 | }
1071 | 


--------------------------------------------------------------------------------
/python/de.py:
--------------------------------------------------------------------------------
  1 | import statsmodels.api as sm
  2 | import statsmodels.formula.api as smf
  3 | from statsmodels.stats.multitest import multipletests
  4 | import pandas as pd
  5 | import numpy as np
  6 | from tqdm import tqdm
  7 | import diffxpy as de
  8 | 
  9 | def lrtest(llmin,llmax):
 10 |     lr = likelihood_ratio(llmin, llmax)
 11 |     p = chi2.sf(lr,1)
 12 |     return p
 13 | from scipy.stats.distributions import chi2
 14 | def likelihood_ratio(llmin, llmax):
 15 |     llmin = -llmin
 16 |     llmax = -llmax
 17 |     return(2*(llmax-llmin))
 18 | 
 19 | def run_glm_de_age_lps_merfish(adata, family='poisson', grouping='cell_type_annot', logfc_thresh=np.log(1)):
 20 |     # do LR test
 21 |     import warnings
 22 |     from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning
 23 |     #from statsmodels.regression.linear_model.OLSResults import compare_lr_test
 24 |     warnings.simplefilter('ignore', ConvergenceWarning)
 25 |     warnings.simplefilter('ignore', PrecisionWarning)
 26 |     warnings.simplefilter('ignore', IterationLimitWarning)
 27 |     warnings.simplefilter('ignore', EstimationWarning)
 28 |     warnings.simplefilter('ignore', SingularMatrixWarning)
 29 |     warnings.simplefilter('ignore', FutureWarning)
 30 | 
 31 |     if family == 'nb':
 32 |         family = sm.families.NegativeBinomial()
 33 |     elif family == 'poisson':
 34 |         family = sm.families.Poisson()
 35 |     
 36 |     all_model_fits = {}
 37 |     all_results = {}
 38 |     
 39 |     for clust in adata.obs[grouping].unique()[::-1]:
 40 |         print(clust)
 41 |         curr_adata = adata[adata.obs[grouping]==clust].copy()
 42 |         print(curr_adata.shape)
 43 |         curr_coefs_age = []
 44 |         curr_coefs_lps = []
 45 |         curr_pvals = []
 46 |         curr_genes = list(curr_adata.var_names)
 47 |         for i in tqdm(range(len(curr_genes))):
 48 |             try:
 49 |                 
 50 |                 curr_adata.obs["Y"] = curr_adata[:,curr_genes[i]].X.toarray()
 51 |                 formula = "Y ~ C(age) + C(cond) + log_umi" 
 52 |                 #mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0)
 53 |                 mdf = smf.ols(formula, data=curr_adata.obs).fit(maxiter=50,disp=0)
 54 |                 #mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0)
 55 |                 formula_reduced = "Y ~ log_umi"
 56 |                 #mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0)
 57 |                 mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0)
 58 |                 curr_coefs_age.append(mdf.params['C(age)[T.90wk]'])
 59 |                 curr_coefs_lps.append(mdf.params["C(cond)[T.lps]"])
 60 |                 curr_pvals.append(lrtest(mdf.llf, mdf_reduced.llf))
 61 |                 #curr_pvals.append(mdf.compare_lr_test(mdf_reduced)[])
 62 |             except Exception as e:
 63 |                 print(e)
 64 |                 curr_coefs_age.append(None)
 65 |                 curr_coefs_lps.append(None)
 66 |                 curr_pvals.append(None)
 67 |         #curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs_age[i] is not None or curr_coefs_lps is not None]
 68 |         #coef_age = [c for c in curr_coefs_age if c is not None]
 69 |         #coef_lps = [c for c in curr_coefs_lps if c is not None]
 70 |         pvals = [p for p in curr_pvals if p is not None]
 71 |         results = pd.DataFrame({'cell_type':clust, 'coef_age':curr_coefs_age, 'coef_lps':curr_coefs_lps, 'pval':pvals, 'gene':curr_genes})
 72 |         results['qval'] = multipletests(results.pval, method='fdr_bh')[1]
 73 |         all_results[clust] = results
 74 |     return all_results
 75 | 
 76 | def run_glm_de_age_merfish(adata, family='poisson', grouping='cell_type_annot', obs_name="age", comp_name="T.90wk", logfc_thresh=np.log(1)):
 77 |     # do LR test
 78 |     import warnings
 79 |     from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning 
 80 |     #from statsmodels.regression.linear_model.OLSResults import compare_lr_test
 81 |     warnings.simplefilter('ignore', ConvergenceWarning)
 82 |     warnings.simplefilter('ignore', PrecisionWarning)
 83 |     warnings.simplefilter('ignore', IterationLimitWarning)
 84 |     warnings.simplefilter('ignore', EstimationWarning)
 85 |     warnings.simplefilter('ignore', SingularMatrixWarning)
 86 |     warnings.simplefilter('ignore', FutureWarning)
 87 |     warnings.simplefilter('ignore',RuntimeWarning)
 88 |     if family == 'nb':
 89 |         family = sm.families.NegativeBinomial()
 90 |     elif family == 'poisson':
 91 |         family = sm.families.Poisson()
 92 |     
 93 |     all_model_fits = {}
 94 |     all_results = {}
 95 |     
 96 |     for clust in adata.obs[grouping].unique()[::-1]:
 97 |         print(clust)
 98 |         curr_adata = adata[adata.obs[grouping]==clust].copy()
 99 |         print(curr_adata.shape)
100 |         curr_coefs = []
101 |         curr_pvals = []
102 |         curr_stderr = []
103 |         curr_genes = list(curr_adata.var_names)
104 |         for i in tqdm(range(len(curr_genes))):
105 |             try:
106 |                 
107 |                 curr_adata.obs["Y"] = curr_adata[:,curr_genes[i]].X.toarray()
108 |                 formula = f"Y ~ C({obs_name}) + 1"# + log_umi"
109 |                 if family != "ols":
110 |                     mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0)
111 |                 else:
112 |                     mdf = smf.ols(formula, data=curr_adata.obs).fit(maxiter=50,disp=0)
113 |                 #mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0)
114 |                 formula_reduced = "Y ~ 1" #log_umi"
115 |                 if family != "ols":
116 |                     mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0)
117 |                 else:
118 |                     mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0)
119 |                 curr_coefs.append(mdf.params[f'C({obs_name})[{comp_name}]'])
120 |                 curr_pvals.append(lrtest(mdf.llf, mdf_reduced.llf))
121 |                 curr_stderr.append(mdf.bse[f'C({obs_name})[{comp_name}]'])
122 |                 #curr_pvals.append(mdf.compare_lr_test(mdf_reduced)[])
123 |             except Exception as e:
124 |                 print(e)
125 |                 curr_coefs.append(None)
126 |                 curr_pvals.append(None)
127 |                 curr_stderr.append(None)
128 |         #curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs[i] is not None]
129 |         #coef = [c for c in curr_coefs if c is not None]
130 |         #pvals = [p for p in curr_pvals if p is not None]
131 |         stderrs = [s for s in curr_stderr if s is not None]
132 |         results = pd.DataFrame({'cell_type':clust, 'coef':curr_coefs, 'pval':curr_pvals, 'gene':curr_genes, 'stderr': curr_stderr})
133 |         results['qval'] = multipletests(results.pval, method='fdr_bh')[1]
134 |         all_results[clust] = results
135 |     return all_results
136 | 
137 | 
def geomean(X,axis=1,epsilon=1):
    """
    Geometric mean of X along `axis`, computed on X + epsilon and shifted back
    by epsilon afterwards (the pseudocount keeps zeros out of the log).
    """
    shifted_logs = np.log(X + epsilon)
    mean_log = np.mean(shifted_logs, axis)
    return np.exp(mean_log) - epsilon
140 | 
def avg_umi_per_gene(X):
    """
    Sum X along axis 1.
    NOTE: despite the name, this is a sum and not an average.
    """
    totals = np.sum(X, axis=1)
    return totals
143 | 
def compute_frac_expressed(A):
    """
    Fraction of rows of A.X with a value > 0, per column, as a flat array.
    """
    n_positive = (A.X > 0).sum(0)
    fraction = n_positive / A.shape[0]
    return np.array(fraction).flatten()
146 | 
def compute_mean_expression(A):
    """
    Mean of A.X over axis 0 (one value per column), as a flat array.
    """
    column_means = A.X.mean(0)
    return np.array(column_means).flatten()
149 | 
def filter_2group_1way(A, obs_name, ident, min_pct=None, logfc_thresh=None, min_diff_pct=None, max_cells_per_ident=None, log=True):
    """
    Filter genes before differential expression testing. UNIDIRECTIONAL:
    only genes higher in the ident group can pass.

    A: AnnData-like object (cells x genes)
    obs_name: obs column holding the grouping
    ident: value to be compared (ident vs ~ident)
    min_pct: minimum fraction of ident cells expressing the gene
    logfc_thresh: minimum natural-log fold change of ident over the rest
    min_diff_pct: minimum difference in expressing fraction (ident minus rest)
    max_cells_per_ident: if set, randomly keep at most this many cells per group
    log: is the data log transformed (usually this is the case); the group
         means are then exponentiated before the ratio is taken

    A threshold left at None (or 0) is not applied.
    Returns (filtered A, log fold changes of the genes that were kept).
    """
    n_cells, n_genes = A.shape
    X = A[A.obs[obs_name] == ident]
    Y = A[A.obs[obs_name] != ident]

    # builtin bool: the np.bool alias no longer exists in recent NumPy
    min_pct_mask = np.ones((n_genes,),dtype=bool)
    log_fc_mask = np.ones((n_genes,),dtype=bool)
    min_diff_pct_mask = np.ones((n_genes,),dtype=bool)

    pct_X = compute_frac_expressed(X)
    pct_Y = compute_frac_expressed(Y)

    if min_pct:
        min_pct_mask = (pct_X>min_pct).flatten()

    mean_X = compute_mean_expression(X)
    mean_Y = compute_mean_expression(Y)
    if log:
        logfc_XY = np.log(np.exp(mean_X)/np.exp(mean_Y))
    else:
        logfc_XY = np.log(mean_X/mean_Y)

    if logfc_thresh:
        log_fc_mask = (logfc_XY > logfc_thresh).flatten()

    if min_diff_pct:
        diff_pct_XY = pct_X-pct_Y
        min_diff_pct_mask = (diff_pct_XY > min_diff_pct).flatten()
    final_mask = np.logical_and(np.logical_and(min_pct_mask, log_fc_mask), min_diff_pct_mask).flatten()
    A = A[:, final_mask]
    kept_logfc = logfc_XY[np.array(final_mask).flatten()]

    if max_cells_per_ident:
        # row positions of each group, then a random subset of each
        idx_X = np.nonzero((A.obs[obs_name]==ident).values)[0]
        idx_Y = np.nonzero((A.obs[obs_name]!=ident).values)[0]
        ids_X = idx_X[np.random.permutation(len(idx_X))[:max_cells_per_ident]]
        ids_Y = idx_Y[np.random.permutation(len(idx_Y))[:max_cells_per_ident]]
        combined_ids = np.hstack((ids_X, ids_Y)).flatten()
        return A[combined_ids,:], kept_logfc
    return A, kept_logfc
198 | 
def filter_2group(A, obs_name, ident, min_pct=None, logfc_thresh=None, min_diff_pct=None, max_cells_per_ident=None, log=True):
    """
    Filter genes before differential expression testing. NOTE this is
    bidirectional: a gene passes a criterion if either group satisfies it.

    A: AnnData-like object (cells x genes)
    obs_name: obs column holding the grouping
    ident: value to be compared (ident vs ~ident)
    min_pct: minimum expressing fraction in at least one of the two groups
    logfc_thresh: minimum natural-log fold change in either direction
    min_diff_pct: minimum difference in expressing fraction, either direction
    max_cells_per_ident: if set, randomly keep at most this many cells per group
    log: is the data log transformed (usually this is the case); the group
         means are then exponentiated before the ratio is taken

    A threshold left at None (or 0) is not applied.
    Returns (filtered A, log fold changes ident-over-rest of the kept genes).
    """
    n_cells, n_genes = A.shape
    X = A[A.obs[obs_name] == ident]
    Y = A[A.obs[obs_name] != ident]

    # builtin bool: the np.bool alias no longer exists in recent NumPy
    min_pct_mask = np.ones((n_genes,),dtype=bool)
    log_fc_mask = np.ones((n_genes,),dtype=bool)
    min_diff_pct_mask = np.ones((n_genes,),dtype=bool)

    pct_X = compute_frac_expressed(X)
    pct_Y = compute_frac_expressed(Y)

    if min_pct:
        min_pct_mask = np.logical_or(pct_X>min_pct, pct_Y>min_pct).flatten()

    mean_X = compute_mean_expression(X)
    mean_Y = compute_mean_expression(Y)
    if log:
        logfc_XY = np.log(np.exp(mean_X)/np.exp(mean_Y))
        logfc_YX = np.log(np.exp(mean_Y)/np.exp(mean_X))
    else:
        logfc_XY = np.log(mean_X/mean_Y)
        logfc_YX = np.log(mean_Y/mean_X)

    if logfc_thresh:
        log_fc_mask = np.logical_or(logfc_XY > logfc_thresh, logfc_YX > logfc_thresh).flatten()

    if min_diff_pct:
        diff_pct_XY = pct_X-pct_Y
        diff_pct_YX = pct_Y-pct_X
        min_diff_pct_mask = np.logical_or(diff_pct_XY > min_diff_pct, diff_pct_YX > min_diff_pct).flatten()
    final_mask = np.logical_and(np.logical_and(min_pct_mask, log_fc_mask), min_diff_pct_mask).flatten()
    A = A[:, final_mask]
    kept_logfc = logfc_XY[np.array(final_mask).flatten()]

    if max_cells_per_ident:
        # row positions of each group, then a random subset of each
        idx_X = np.nonzero((A.obs[obs_name]==ident).values)[0]
        idx_Y = np.nonzero((A.obs[obs_name]!=ident).values)[0]
        ids_X = idx_X[np.random.permutation(len(idx_X))[:max_cells_per_ident]]
        ids_Y = idx_Y[np.random.permutation(len(idx_Y))[:max_cells_per_ident]]
        combined_ids = np.hstack((ids_X, ids_Y)).flatten()
        return A[combined_ids,:], kept_logfc
    return A, kept_logfc
251 |     
252 | from scipy.stats.distributions import chi2
def likelihood_ratio(llmin, llmax):
    """
    Likelihood-ratio statistic from two log-likelihoods.

    Both arguments are negated before the difference is taken, so the result
    equals 2 * (llmin - llmax): it is positive when the first argument is the
    larger log-likelihood.
    """
    return 2 * (llmin - llmax)
257 | 
def lrtest(llmin,llmax):
    """
    P-value of a likelihood-ratio test with 1 degree of freedom.

    The statistic comes from likelihood_ratio(llmin, llmax); the p-value is the
    chi-square survival function at that statistic. Callers in this file pass
    the full model's log-likelihood first and the reduced model's second.
    """
    statistic = likelihood_ratio(llmin, llmax)
    return chi2.sf(statistic, 1)
262 | 
def run_glm_de_pairwise(curr_adata, contrast, lognorm=False):
    """
    Negative-binomial GLM likelihood-ratio test on a boolean contrast.

    For every gene, fits "Y ~ C(contrast) + log_umi + avg_UMI" against the
    reduced model "Y ~ log_umi + avg_UMI", where Y is taken from
    curr_adata.layers['counts'] (log1p-transformed when lognorm is True).

    curr_adata: AnnData-like object; its obs gets a "Y" column written to it
    contrast: obs column holding True/False; the T.True coefficient is reported
    lognorm: apply np.log1p to the counts before fitting

    Returns a DataFrame with columns coef, pval, gene, stderr, qval. Genes
    whose fit raised are printed and left out.
    """
    family = sm.families.NegativeBinomial()
    formula = f"Y ~  C({contrast}) + log_umi + avg_UMI"
    formula_reduced = "Y ~ log_umi + avg_UMI"
    term = f'C({contrast})[T.True]'

    genes = []
    coefs = []
    pvals = []
    stderrs = []
    for gene in list(curr_adata.var_names):
        try:
            counts = curr_adata[:,gene].layers['counts'].toarray()
            curr_adata.obs["Y"] = np.log1p(counts) if lognorm else counts
            mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0)
            mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0)
            # gather all values before appending so the lists stay aligned
            coef = mdf.params[term]
            pval = lrtest(mdf.llf, mdf_reduced.llf)
            stderr = mdf.bse[term]
        except Exception as e:
            print(e)
            continue
        genes.append(gene)
        coefs.append(coef)
        pvals.append(pval)
        stderrs.append(stderr)

    # every column has one entry per successfully fitted gene
    results = pd.DataFrame({'coef':coefs, 'pval':pvals, 'gene':genes, 'stderr': stderrs})
    # correct only defined p-values so one NaN cannot blank out every q-value
    numeric_pvals = pd.to_numeric(results['pval'], errors='coerce')
    tested = numeric_pvals.notna()
    results['qval'] = np.nan
    if tested.any():
        results.loc[tested, 'qval'] = multipletests(numeric_pvals[tested], method='fdr_bh')[1]
    return results
299 | 
300 | 
def run_ttest_de_age(adata, family='nb', grouping='cell_type',  lognorm=False, min_pct=0.1, logfc_thresh=np.log(1)):
    """
    Per-group two-sample t-test between 90wk and 4wk cells.

    For every unique value of adata.obs[grouping], genes are pre-filtered with
    filter_2group on the "age" column and each remaining gene is tested with
    scipy's ttest_ind (90wk vs 4wk).

    adata: AnnData-like object; gene slices of .X must support .toarray()
    family, lognorm: unused, kept so existing callers keep working
    grouping: obs column whose values are tested separately
    min_pct, logfc_thresh: forwarded to filter_2group

    Returns a dict mapping group -> DataFrame with columns coef (natural log of
    the 90wk/4wk mean ratio), pval, gene, qval. Genes whose test raised are
    left out. Expression is flattened to 1-D first so pval holds plain floats.
    """
    import warnings
    from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning
    from scipy.stats import ttest_ind
    for category in (ConvergenceWarning, PrecisionWarning, IterationLimitWarning,
                     EstimationWarning, SingularMatrixWarning, FutureWarning):
        warnings.simplefilter('ignore', category)

    all_results = {}

    for clust in adata.obs[grouping].unique()[::-1]:
        print(clust)
        curr_adata = adata[adata.obs[grouping]==clust].copy()
        curr_adata, _ = filter_2group(curr_adata, "age", "4wk", min_pct=min_pct, logfc_thresh=logfc_thresh)
        print(curr_adata.shape)
        is_young = (curr_adata.obs['age'] == '4wk').values
        is_old = (curr_adata.obs['age'] == '90wk').values
        genes = []
        coefs = []
        pvals = []
        for gene in tqdm(list(curr_adata.var_names)):
            try:
                X = curr_adata[:,gene].X.toarray().ravel()
                young_X = X[is_young]
                old_X = X[is_old]
                # compute both values before appending so the lists stay aligned
                coef = np.log(old_X.mean()/young_X.mean())
                pval = ttest_ind(old_X, young_X)[1]
            except Exception:
                # best effort: a gene that cannot be tested is skipped
                continue
            genes.append(gene)
            coefs.append(coef)
            pvals.append(pval)

        results = pd.DataFrame({'coef':coefs, 'pval':pvals, 'gene':genes})
        # correct only defined p-values so one NaN cannot blank out every q-value
        numeric_pvals = pd.to_numeric(results['pval'], errors='coerce')
        tested = numeric_pvals.notna()
        results['qval'] = np.nan
        if tested.any():
            results.loc[tested, 'qval'] = multipletests(numeric_pvals[tested], method='fdr_bh')[1]
        all_results[clust] = results
    return all_results
349 | 
def run_glm_de_age(adata, family='nb', grouping='cell_type',  lognorm=False, min_pct=0.1, logfc_thresh=np.log(1)):
    """
    Per-group likelihood-ratio differential expression between ages.

    For every unique value of adata.obs[grouping], genes are pre-filtered with
    filter_2group on the "age" column. Each remaining gene is fitted with
    C(age) + log_umi and with the reduced model "Y ~ log_umi", and the two
    log-likelihoods are compared with lrtest.

    adata: AnnData-like object; gene slices of .X must support .toarray()
    family: 'nb', 'poisson' or 'ols'; anything else is handed to smf.glm as-is
    grouping: obs column whose values are tested separately
    lognorm: unused, kept so existing callers keep working
    min_pct, logfc_thresh: forwarded to filter_2group (the default
        np.log(1) == 0 leaves the fold-change filter off)

    Returns a dict mapping group -> DataFrame with columns coef, pval, gene,
    stderr, qval, where coef/stderr belong to the C(age)[T.90wk] term. Genes
    whose fit raised are left out.
    """
    import warnings
    from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning
    for category in (ConvergenceWarning, PrecisionWarning, IterationLimitWarning,
                     EstimationWarning, SingularMatrixWarning, FutureWarning):
        warnings.simplefilter('ignore', category)

    if family == 'nb':
        family = sm.families.NegativeBinomial()
    elif family == 'poisson':
        family = sm.families.Poisson()
    use_ols = isinstance(family, str) and family == "ols"

    formula = "Y ~  C(age) + + log_umi"
    formula_reduced = "Y ~ log_umi"
    term = 'C(age)[T.90wk]'

    def fit(model_formula, data):
        # one place to switch between OLS and the GLM family
        if use_ols:
            return smf.ols(model_formula, data=data).fit(maxiter=50,disp=0)
        return smf.glm(model_formula, data=data, family=family).fit(maxiter=50,disp=0)

    all_results = {}

    for clust in adata.obs[grouping].unique()[::-1]:
        print(clust)
        curr_adata = adata[adata.obs[grouping]==clust].copy()
        print(clust, curr_adata.shape)
        curr_adata, _ = filter_2group(curr_adata, "age", "4wk", min_pct=min_pct, logfc_thresh=logfc_thresh)
        print(curr_adata.shape)
        print("Using new formula")
        genes = []
        coefs = []
        pvals = []
        stderrs = []
        umi_coef = []
        for gene in tqdm(list(curr_adata.var_names)):
            try:
                curr_adata.obs["Y"] = curr_adata[:,gene].X.toarray()
                mdf = fit(formula, curr_adata.obs)
                mdf_reduced = fit(formula_reduced, curr_adata.obs)
                # gather all values before appending so the lists stay aligned
                umi = mdf.params['log_umi']
                coef = mdf.params[term]
                pval = lrtest(mdf.llf, mdf_reduced.llf)
                stderr = mdf.bse[term]
            except Exception:
                # best effort: a gene that cannot be fitted is skipped
                continue
            genes.append(gene)
            coefs.append(coef)
            pvals.append(pval)
            stderrs.append(stderr)
            umi_coef.append(umi)
        print('Mean UMI coef', np.mean(umi_coef))

        # every column has one entry per successfully fitted gene
        results = pd.DataFrame({'coef':coefs, 'pval':pvals, 'gene':genes, 'stderr': stderrs})
        # correct only defined p-values so one NaN cannot blank out every q-value
        numeric_pvals = pd.to_numeric(results['pval'], errors='coerce')
        tested = numeric_pvals.notna()
        results['qval'] = np.nan
        if tested.any():
            results.loc[tested, 'qval'] = multipletests(numeric_pvals[tested], method='fdr_bh')[1]
        all_results[clust] = results
    return all_results
418 | 
419 | 
def save_de_results(df_map, out_fname):
    """
    Stack per-cell-type differential expression tables and write them as CSV.

    df_map: map of cell_type -> dataframe of differential gene expression results;
            each dataframe gets a 'cell_type' column written to it in place
    out_fname: destination passed to DataFrame.to_csv

    Returns the concatenated dataframe.
    """
    tagged = []
    for cell_type, table in df_map.items():
        table['cell_type'] = cell_type
        tagged.append(table)
    combined = pd.concat(tagged)
    combined.to_csv(out_fname)
    return combined


--------------------------------------------------------------------------------
/python/find_merfish_markers.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | import seaborn as sns
  3 | import pandas as pd
  4 | import numpy as np
  5 | import scanpy as sc
  6 | from sklearn.ensemble import RandomForestClassifier
  7 | from sklearn.model_selection import train_test_split
  8 | from sklearn.metrics import roc_auc_score
  9 | from tqdm import tqdm
 10 | 
 11 | import diffxpy.api as de
 12 | 
 13 | from sklearn.inspection import permutation_importance
 14 | def select_age_features(A: sc.AnnData, grouping, Nfeats=500, test_size=0.2):
 15 |     """
 16 |     Use RFClassifier to find important aging features
 17 |     """
 18 |     scores = {}
 19 |     feats = {}
 20 |     for i in np.unique(A.obs[grouping]):
 21 |         print(i)
 22 |         curr_adata = A[A.obs[grouping]==i]
 23 |         X = curr_adata.X.copy()
 24 |         y = curr_adata.obs.age
 25 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
 26 |         clf = RandomForestClassifier(verbose=False, n_jobs=-1).fit(X_train, y_train)
 27 |         y_preds = clf.predict_proba(X_test)
 28 |         scores[i] = roc_auc_score(y_test, y_preds[:,1])
 29 |         feats[i] = pd.DataFrame({'clust': [i]*Nfeats,
 30 |                                           'importance': np.sort(clf.feature_importances_)[::-1][:Nfeats],
 31 |                                           'feats': curr_adata.var_names[np.argsort(clf.feature_importances_)[::-1][:Nfeats_age]]})
 32 |     return scores, feats
 33 | 
 34 | def select_celltype_features(A: sc.AnnData, grouping,Nfeats=1000, test_size=0.2):
 35 |     """
 36 |     Use RFClassifier to find important cell type distinguishing features.
 37 |     """
 38 |     scores = {}
 39 |     feats = {}
 40 |     X = A.X.copy()
 41 |     for i in np.unique(A.obs[grouping]):
 42 |         print(i)
 43 |         # train on one vs rest
 44 |         y = A.obs[grouping]==i
 45 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
 46 |         clf = RandomForestClassifier(verbose=False, n_jobs=-1).fit(X_train, y_train)
 47 |         y_preds = clf.predict_proba(X_test)
 48 |         scores[i] = roc_auc_score(y_test, y_preds[:,1])
 49 |         feats[i] = pd.DataFrame({'clust': [i]*Nfeats,
 50 |                                           'importance': np.sort(clf.feature_importances_)[::-1][:Nfeats],
 51 |                                           'feats': A.var_names[np.argsort(clf.feature_importances_)[::-1][:Nfeats]]})
 52 |     return scores, feats
 53 | 
 54 | def select_celltype_features_perm(A: sc.AnnData, grouping,Nfeats=1000, test_size=0.2, n_repeats=10):
 55 |     scores = {}
 56 |     feats = {}
 57 |     X = A.X.toarray().copy()
 58 |     for i in np.unique(A.obs[grouping]):
 59 |         print(i)
 60 |         # train on one vs rest
 61 |         y = A.obs[grouping]==i
 62 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
 63 |         clf = RandomForestClassifier(verbose=False, n_jobs=-1).fit(X_train, y_train)
 64 |         y_preds = clf.predict_proba(X_test)
 65 |         result = permutation_importance(clf, X_test, y_test, n_repeats=n_repeats, random_state=42, n_jobs=-1)
 66 |         scores[i] = roc_auc_score(y_test, y_preds[:,1])
 67 |         feats[i] = pd.DataFrame({'clust': [i]*Nfeats,
 68 |                                           'importance': np.sort(result.importances_mean)[::-1][:Nfeats],
 69 |                                           'feats': A.var_names[np.argsort(result.importances_mean)[::-1][:Nfeats]]})
 70 |     return scores, feats
 71 | 
 72 | from scipy.spatial.distance import pdist
 73 | import scipy.cluster.hierarchy as hc
 74 | import matplotlib
 75 | 
 76 | def plot_clustered_celltypes_by_genes(A: sc.AnnData, genes, normalize=True, figsize=(20,30)):
 77 | 
 78 |     marker_clust_avgs = []
 79 |     clust_avgs = []
 80 |     for i in A.obs.clust_label.unique():
 81 |         clust_avgs.append(compute_mean_expression(A[A.obs.clust_label==i,:]))
 82 |         marker_clust_avgs.append(compute_mean_expression(A[A.obs.clust_label==i,:][:,genes]))
 83 | 
 84 |     D = pdist(np.vstack(marker_clust_avgs).T, 'euclidean')
 85 |     Z = hc.linkage(D, 'ward', optimal_ordering=True)
 86 |     gene_ordering = hc.leaves_list(Z)
 87 | 
 88 |     D = pdist(clust_avgs, 'euclidean')
 89 |     Z = hc.linkage(D, 'ward', optimal_ordering=True)
 90 |     clust_ordering = hc.leaves_list(Z)
 91 | #plt.imshow(np.corrcoef(clust_avgs)[clust_ordering],vmin=0,vmax=1,cmap=plt.cm.viridis)
 92 |     matplotlib.rcParams.update({'font.size': 8})
 93 |     if normalize:
 94 |         sc.pl.heatmap(A, np.array(list(genes))[gene_ordering], 'clust_label', show_gene_labels=True, dendrogram=True,standard_scale='obs',figsize=figsize)
 95 |     else:
 96 |         sc.pl.heatmap(A, np.array(list(genes))[gene_ordering], 'clust_label', show_gene_labels=True, dendrogram=True,figsize=figsize)
 97 | 
 98 | def plot_clustered_ages_by_genes(A: sc.AnnData, genes, normalize=True, figsize=(20,30)):
 99 | 
100 |     marker_clust_avgs = []
101 |     clust_avgs = []
102 |     clust_names = A.obs.clust_label.unique()
103 |     for i in A.obs.clust_label.unique():
104 |         clust_avgs.append(compute_mean_expression(A[A.obs.clust_label==i,:]))
105 |         marker_clust_avgs.append(compute_mean_expression(A[np.logical_and(A.obs.clust_label==i, A.obs.age=='4wk'),:][:,genes])-
106 |                          compute_mean_expression(A[np.logical_and(A.obs.clust_label==i, A.obs.age=='90wk'),:][:,genes]))
107 | 
108 |     marker_clust_avgs = np.vstack(marker_clust_avgs)
109 |     D = pdist(marker_clust_avgs.T, 'euclidean')
110 |     Z = hc.linkage(D, 'ward', optimal_ordering=True)
111 |     gene_ordering = hc.leaves_list(Z)
112 | 
113 |     D = pdist(clust_avgs, 'euclidean')
114 |     Z = hc.linkage(D, 'ward', optimal_ordering=True)
115 |     clust_ordering = hc.leaves_list(Z)
116 |     
117 |     plt.figure(figsize=(20,10))
118 |     plt.imshow(marker_clust_avgs[:, gene_ordering][clust_ordering,:],vmin=-2,vmax=2,cmap=plt.cm.bwr,aspect='auto',interpolation='none')
119 |     plt.yticks(np.arange(marker_clust_avgs.shape[0]))
120 |     plt.xticks(np.arange(marker_clust_avgs.shape[1]))
121 |     plt.axes().set_xticklabels(np.array(genes)[gene_ordering],rotation=90);
122 |     plt.axes().set_yticklabels(np.array(clust_names)[clust_ordering])
123 |     plt.grid(False)
124 | 
def compute_cluster_proportions(A: sc.AnnData, obs_type='clust_label'):
    """
    Compute the fraction of cells in each cluster.

    Returns (proportions, clusts): proportions is a (n_clusters, 1) array
    aligned with clusts, the unique values of A.obs[obs_type].
    """
    labels = A.obs[obs_type]
    clusts = labels.unique()
    n_cells = A.shape[0]
    clust_proportions = np.zeros((len(clusts),1))
    for row, name in enumerate(clusts):
        clust_proportions[row] = np.sum(labels==name)/n_cells
    return clust_proportions, clusts
132 | 
def compute_average_celltype_expr(A: sc.AnnData, genes, obs_type="clust_label"):
    """
    Mean expression of `genes` within each group of A.obs[obs_type].

    Returns (averages, groups): averages has one row per group, in the order
    of groups (the unique values of the obs column).
    """
    groups = A.obs[obs_type].unique()
    rows = [
        compute_mean_expression(A[A.obs[obs_type]==name,:][:,genes])
        for name in groups
    ]
    return np.vstack(rows), groups
138 | 
def compute_average_age_expr_change(A: sc.AnnData, genes):
    """
    Per-cluster difference in mean expression of `genes`: 4wk minus 90wk.

    Returns an array with one row per unique clust_label.
    """
    def mean_at_age(in_clust, age):
        cells = np.logical_and(in_clust, A.obs.age==age)
        return compute_mean_expression(A[cells,:][:,genes])

    rows = []
    for name in A.obs.clust_label.unique():
        in_clust = A.obs.clust_label==name
        young = mean_at_age(in_clust, '4wk')
        old = mean_at_age(in_clust, '90wk')
        rows.append(young - old)
    return np.vstack(rows)
145 | 
146 | # compute per cluster average sparsity
def plot_per_celltype_sparsity(A: sc.AnnData, genes):
    """
    Swarm plot of the fraction of expressing cells for each gene in `genes`,
    one swarm per value of A.obs.cell_type.
    """
    per_type = [
        (name, compute_frac_expressed(A[A.obs.cell_type==name][:, genes]))
        for name in A.obs.cell_type.unique()
    ]
    # long format: one row per (cell type, gene)
    celltypes = [name for name, fracs in per_type for _ in fracs]
    sparsity = [frac for _, fracs in per_type for frac in fracs]
    table = pd.DataFrame({'clust': celltypes, 'sparsity':sparsity})
    sns.swarmplot(data=table, x='clust', y='sparsity')
158 |     
def plot_per_celltype_totalexpr(A: sc.AnnData, genes, exp=False):
    """
    Violin plot of each cell's summed expression over `genes`, one violin per
    value of A.obs.cell_type. `exp` is accepted but not used.
    """
    per_type = []
    for name in A.obs.cell_type.unique():
        subset = A[A.obs.cell_type==name][:, genes]
        per_type.append((name, np.array(subset.X.sum(1)).flatten()))
    # long format: one row per cell
    celltypes = [name for name, totals in per_type for _ in totals]
    expr = [total for _, totals in per_type for total in totals]
    table = pd.DataFrame({'clust': celltypes, 'expr':expr})
    sns.violinplot(data=table, x='clust', y='expr')
171 | 
172 |     
def plot_per_gene_sparsity(A: sc.AnnData, genes):
    """
    Score each gene by its mean expressing fraction across cell types divided
    by its maximum expressing fraction, and scatter the scores in increasing
    order with the gene names on the x axis.

    A: AnnData with a 'cell_type' obs column
    genes: genes to score
    """
    # one row per cell type, one column per gene
    sparsity = [
        compute_frac_expressed(A[A.obs.cell_type==i][:, genes])
        for i in A.obs.cell_type.unique()
    ]
    temp = np.vstack(sparsity)
    sparsity_score = temp.mean(0)/temp.max(0)
    sort_idx = np.argsort(sparsity_score)
    plt.figure(figsize=(15,5))
    plt.scatter(np.arange(temp.shape[1]), sparsity_score[sort_idx])
    plt.xticks(np.arange(temp.shape[1]))
    # style the axes that hold the scatter; plt.axes() would add a new one
    ax = plt.gca()
    ax.grid(False)
    ax.set_xticklabels(np.array(genes)[sort_idx],rotation=90,fontsize=6)
192 | 
def compute_mean_expression(A: sc.AnnData):
    """
    Average expression for each gene, as a flat array
    """
    per_gene_mean = A.X.mean(0)
    return np.array(per_gene_mean).flatten()
198 | 
def compute_frac_expressed(A: sc.AnnData):
    """
    Fraction of cells expressing each gene (value > 0), as a flat array
    """
    n_expressing = (A.X>0).sum(0)
    fraction = n_expressing/A.shape[0]
    return np.array(fraction).flatten()
204 | 
def filter_2group_1way(A: sc.AnnData, obs_name: str, ident: str, min_pct=None, logfc_thresh=None, min_diff_pct=None, max_cells_per_ident=None, log=True):
    """
    Filter genes before differential expression testing. UNIDIRECTIONAL:
    only genes higher in the ident group can pass.

    A: AnnData (cells x genes)
    obs_name: obs column holding the grouping
    ident: value to be compared (ident vs ~ident)
    min_pct: minimum fraction of ident cells expressing the gene
    logfc_thresh: minimum natural-log fold change of ident over the rest
    min_diff_pct: minimum difference in expressing fraction (ident minus rest)
    max_cells_per_ident: if set, randomly keep at most this many cells per group
    log: is the data log transformed (usually this is the case); the group
         means are then exponentiated before the ratio is taken

    A threshold left at None (or 0) is not applied.
    Returns (filtered A, log fold changes of the genes that were kept).
    """
    n_cells, n_genes = A.shape
    X = A[A.obs[obs_name] == ident]
    Y = A[A.obs[obs_name] != ident]

    # builtin bool: the np.bool alias no longer exists in recent NumPy
    min_pct_mask = np.ones((n_genes,),dtype=bool)
    log_fc_mask = np.ones((n_genes,),dtype=bool)
    min_diff_pct_mask = np.ones((n_genes,),dtype=bool)

    pct_X = compute_frac_expressed(X)
    pct_Y = compute_frac_expressed(Y)

    if min_pct:
        min_pct_mask = (pct_X>min_pct).flatten()

    mean_X = compute_mean_expression(X)
    mean_Y = compute_mean_expression(Y)
    if log:
        logfc_XY = np.log(np.exp(mean_X)/np.exp(mean_Y))
    else:
        logfc_XY = np.log(mean_X/mean_Y)

    if logfc_thresh:
        log_fc_mask = (logfc_XY > logfc_thresh).flatten()

    if min_diff_pct:
        diff_pct_XY = pct_X-pct_Y
        min_diff_pct_mask = (diff_pct_XY > min_diff_pct).flatten()
    final_mask = np.logical_and(np.logical_and(min_pct_mask, log_fc_mask), min_diff_pct_mask).flatten()
    A = A[:, final_mask]
    kept_logfc = logfc_XY[np.array(final_mask).flatten()]

    if max_cells_per_ident:
        # row positions of each group, then a random subset of each
        idx_X = np.nonzero((A.obs[obs_name]==ident).values)[0]
        idx_Y = np.nonzero((A.obs[obs_name]!=ident).values)[0]
        ids_X = idx_X[np.random.permutation(len(idx_X))[:max_cells_per_ident]]
        ids_Y = idx_Y[np.random.permutation(len(idx_Y))[:max_cells_per_ident]]
        combined_ids = np.hstack((ids_X, ids_Y)).flatten()
        return A[combined_ids,:], kept_logfc
    return A, kept_logfc
253 |     
254 | 
def compute_onevsall_de_for_clusts(A: sc.AnnData, clust_obs, n_de=5):
    """
    One-vs-all marker detection for every cluster in A.obs[clust_obs].

    For each cluster, genes are pre-filtered with filter_2group_1way
    (min_pct=0.2, log fold change > log(1.5)) and tested with diffxpy's
    t-test on the cluster-vs-rest contrast. A gene is kept when it has
    log10 fold change >= log10(2), q-value < 0.05, is expressed in more than
    40% of the cluster's cells and in more than 3x the background fraction.

    A: AnnData
    clust_obs: obs column holding the cluster labels
    n_de: maximum number of genes reported per cluster (smallest q-values first)

    Returns a dict mapping cluster -> DataFrame with columns gene, log10fc,
    frac_fg, frac_bg, qval.
    """
    clust_labels_uniq = list(np.unique(A.obs[clust_obs]))

    de_by_type = {}
    for n,i in enumerate(clust_labels_uniq):
        print(n+1,'/',len(clust_labels_uniq),':',i)
        # one-vs-all uses every cell, so work on a plain copy
        curr_A = A.copy()
        curr_A.obs['contrast'] = curr_A.obs[clust_obs]==i
        curr_A, _ = filter_2group_1way(curr_A, 'contrast', True, min_pct=0.2, logfc_thresh=np.log(1.5))
        res = de.test.t_test(data=curr_A, grouping='contrast')

        frac_foreground = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]==i])
        frac_background = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]!=i])
        # filter genes
        log10fc_all = res.log10_fold_change()
        good_expr = np.logical_and(log10fc_all>=np.log10(2), res.qval<0.05)
        good_frac = np.logical_and(frac_foreground>0.4, frac_foreground>3*frac_background)
        # require both the expression and the fraction criteria
        good_genes = np.logical_and(good_expr, good_frac)
        sort_idx = np.argsort(res.qval[good_genes])
        de_by_type[i] = pd.DataFrame({'gene':res.gene_ids[good_genes][sort_idx][:n_de],
                                      'log10fc':log10fc_all[good_genes][sort_idx][:n_de],
                                      'frac_fg':frac_foreground[good_genes][sort_idx][:n_de],
                                      'frac_bg':frac_background[good_genes][sort_idx][:n_de],
                                      'qval':res.qval[good_genes][sort_idx][:n_de]})
    return de_by_type
281 | 
def select_age_markers(de_map, n_marker_genes=10):
    """
    Collect the union of the top n_marker_genes genes of every contrast in de_map.

    Each value of de_map is a DataFrame with 'gene' and 'log10fc' columns;
    genes are ranked by descending log10fc (callers are expected to store the
    absolute value of the coefficient there). Returns a set of gene names.
    """
    de_marker_genes = set()
    for contrast in de_map.values():
        ranked = contrast.sort_values('log10fc', ascending=False)
        de_marker_genes.update(ranked.head(n_marker_genes).gene)
    return de_marker_genes
293 |         
def greedily_select_markers(de_map, min_marker_genes=4, pairwise=True, n_pass=10, de_marker_genes=None):
    """
    Greedily grow a marker gene set until every contrast is covered.

    de_map maps a contrast key to a DataFrame with 'gene' and 'log10fc'
    columns. With pairwise=True the keys are (cluster_i, cluster_j) tuples and
    every ordered pair of distinct first-elements is visited; otherwise every
    key of de_map is visited once per pass. For each contrast the genes are
    ranked by descending log10fc and, if fewer than min_marker_genes of them
    are already selected, the best-ranked genes that are NOT yet selected are
    added until the contrast reaches min_marker_genes (or runs out of genes).

    de_marker_genes optionally seeds the selection; it is copied, not mutated.
    Returns the resulting set of gene names.

    Raises KeyError if pairwise=True and an expected (i, j) pair is missing
    from de_map.
    """
    if pairwise:
        # de-duplicate in first-seen order so the greedy result does not depend
        # on set iteration order (which varies with string hashing)
        clust_labels_uniq = list(dict.fromkeys(key[0] for key in de_map))
        contrast_keys = [(i, j) for i in clust_labels_uniq for j in clust_labels_uniq if i != j]
    else:
        contrast_keys = list(de_map)
    de_marker_genes = set() if de_marker_genes is None else set(de_marker_genes)

    # do up to n_pass passes through the contrasts; stop early once a pass
    # finds every contrast already covered
    all_clusts_good = True
    for pass_idx in range(n_pass):
        if pass_idx > 0 and all_clusts_good:
            break
        all_clusts_good = True
        for key in contrast_keys:
            curr_genes = list(de_map[key].sort_values('log10fc', ascending=False).gene)
            if not curr_genes:
                continue
            # how many of this contrast's genes are already in the working set
            n_curr_marker = sum(g in de_marker_genes for g in curr_genes)
            if n_curr_marker >= min_marker_genes:
                continue
            n_to_add = min(len(curr_genes), int(min_marker_genes-n_curr_marker))
            if pairwise:
                all_clusts_good = False
                print("Adding", n_to_add)
            # only genes not yet selected count towards the deficit; adding an
            # already-selected gene would leave the contrast under-covered
            curr_to_add = [g for g in curr_genes if g not in de_marker_genes]
            de_marker_genes.update(curr_to_add[:n_to_add])

    return de_marker_genes
344 | 
def compute_pairwise_de_for_clusts(A: sc.AnnData, clust_obs, n_de=5, min_pct=0.4):
    """
    Run a t-test for every ordered pair of distinct cluster labels in A.obs[clust_obs].

    Returns a dict keyed by (foreground_label, background_label). Each value is
    a DataFrame with columns gene, log10fc, frac_fg, frac_bg and qval holding
    at most n_de genes, ordered by ascending q-value, that pass the
    fold-change, q-value and detection-fraction filters below. min_pct is
    forwarded to filter_2group_1way.
    """
    labels = list(np.unique(A.obs[clust_obs]))
    n_labels = len(labels)

    pairwise_de = {}
    for position, fg_label in enumerate(labels, start=1):
        print(position,'/',n_labels,':',fg_label)
        for bg_label in tqdm(labels):
            if fg_label == bg_label:
                continue
            in_pair = np.logical_or(A.obs[clust_obs]==fg_label, A.obs[clust_obs]==bg_label)
            curr_A = A[in_pair].copy()
            curr_A.obs['contrast'] = curr_A.obs[clust_obs]==fg_label
            curr_A, _ = filter_2group_1way(curr_A, 'contrast', True, min_pct=min_pct, logfc_thresh=np.log(1.5))
            res = de.test.t_test(data=curr_A, grouping='contrast')

            frac_foreground = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]==fg_label])
            frac_background = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]==bg_label])

            log10fc_all = res.log10_fold_change()
            qvals = res.qval
            # keep genes that are at least 2-fold up with q < 0.05 ...
            good_expr = np.logical_and(log10fc_all>=np.log10(2), qvals<0.05)
            # ... and detected in >40% of foreground cells, at >3x the background fraction
            good_frac = np.logical_and(frac_foreground>0.4, frac_foreground>3*frac_background)
            good_genes = np.logical_and(good_expr, good_frac)
            sort_idx = np.argsort(qvals[good_genes])

            columns = {
                'gene': res.gene_ids,
                'log10fc': log10fc_all,
                'frac_fg': frac_foreground,
                'frac_bg': frac_background,
                'qval': qvals,
            }
            pairwise_de[(fg_label, bg_label)] = pd.DataFrame(
                {name: values[good_genes][sort_idx][:n_de] for name, values in columns.items()}
            )

    return pairwise_de


--------------------------------------------------------------------------------
/python/integration.py:
--------------------------------------------------------------------------------
 1 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
 2 | from sklearn.neural_network import MLPRegressor, MLPClassifier
 3 | from sklearn.ensemble import RandomForestClassifier
 4 | import numpy as np
 5 | #from xgboost import XGBClassifier
 6 | def train_umap_classifier(adata, label_column, umap_key='X_umap', n_dims=30, method='knn'):
 7 |     '''Train a KNN classifier using the UMAP coordinates.'''
 8 |     if method == 'knn':
 9 |         knnc = KNeighborsClassifier(n_jobs=-1, n_neighbors=25) ##KNeighborsClassifier(n_jobs=-1)#
10 |     elif method == "mlp":
11 |         knnc = MLPClassifier()
12 |     knnc.fit(adata.obsm[umap_key][:, :n_dims], np.array(adata.obs[label_column]))
13 |     return knnc
14 | 
15 | 
def impute_classification(adata, classifier, prediction_column, probability_column, umap_key='X_umap', n_dims=30):
    '''Label every cell of adata with the most probable class of a fitted classifier.

    The first n_dims columns of adata.obsm[umap_key] are passed to
    classifier.predict_proba. The winning class is written to
    adata.obs[prediction_column] and its probability to
    adata.obs[probability_column]. adata is modified in place; nothing is
    returned.
    '''
    class_labels = classifier.classes_
    features = adata.obsm[umap_key][:, :n_dims]
    class_probs = classifier.predict_proba(features)
    best_class = np.argmax(class_probs, axis=1)

    adata.obs[prediction_column] = [class_labels[k] for k in best_class]
    adata.obs[probability_column] = np.max(class_probs, axis=1)
26 | 
27 |     
def impute_gene_expression():
    '''Placeholder for gene-expression imputation; not implemented, returns None.'''
    return None


--------------------------------------------------------------------------------
/python/plotting.py:
--------------------------------------------------------------------------------
  1 | import seaborn as sns
  2 | from cycler import cycler
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | from scipy.spatial.transform import Rotation as R
  6 | from statsmodels.stats.multitest import multipletests
  7 | import matplotlib as mpl
  8 | def gen_light_palette(prefix, color_name, uniq_clusts):
  9 |     n = np.sum([1 if prefix in i else 0 for i in uniq_clusts])
 10 |     return sns.light_palette(color_name, n_colors=n+2)[2:]
 11 | 
 12 | def gen_dark_palette(prefix, color_name, uniq_clusts):
 13 |     n = np.sum([1 if prefix in i else 0 for i in uniq_clusts])
 14 |     return sns.dark_palette(color_name, n_colors=n+2)[2:]
 15 | 
 16 | major_cell_types = {
 17 |     # Astro -- green
 18 |     "Astro" : "seagreen",
 19 |     # Excitatory -- red/orange
 20 |     "ExN" : "lightcoral",
 21 |     # inhibitory -- blue/purple
 22 |     "InN" : "cornflowerblue",
 23 |     "MSN" : "mediumpurple",
 24 |     
 25 |     # immune cells + microglia -- pink
 26 |     "Micro" : "pink",
 27 |     "T cell" : "deeppink",
 28 |     "Macro" : "violet",
 29 |     
 30 |     # Endothelial/vasculure -- gold/tan
 31 |     "Vlmc" : "gold",
 32 |     "Endo" : "khaki",
 33 |     "Peri" : "goldenrod",
 34 |     "Epen" : "darkkhaki",
 35 |     
 36 |     # Oligodendrocytes
 37 |     "Olig" : "slategrey",
 38 |     "OPC" : "black"
 39 | }
 40 | 
 41 | clust_cell_types = {
 42 |     # Astro -- green
 43 |     "Astro" : "seagreen",
 44 |     # Excitatory -- red/orange
 45 |     "ExN-L2/3" : "darkorange",
 46 |     "ExN-L5" : "lightsalmon",
 47 |     "ExN-L6" : "maroon",
 48 |     "ExN-Olf" : "firebrick",
 49 |     # inhibitory -- blue/purple
 50 |     'InN-Olf' : "cornflowerblue",
 51 |     #'InN-Adarb2' : "lightsteelblue",
 52 |     'InN-Chat' : "lavender",
 53 |     #'InN-Egfr' : "turquoise",
 54 |     #'InN-Calb' : "teal",
 55 |      'InN-Lhx6':'lightsteelblue',
 56 | 
 57 |     'InN-Calb2' : "navy",
 58 |     'InN-Lamp5' : "royalblue",
 59 |     'InN-Pvalb' : "steelblue",
 60 |     'InN-Sst' : "dodgerblue",
 61 |     'InN-Vip' : "deepskyblue",
 62 |     "MSN-D1" : "mediumslateblue",
 63 |     "MSN-D2" : "rebeccapurple",
 64 |     # immune cells + microglia -- pink
 65 |     "Micro" : "deeppink",
 66 |     "T cell" : "crimson",
 67 |     "Macro" : "hotpink",
 68 |     
 69 |     # Endothelial/vasculure -- gold/tan
 70 |     "Vlmc" : "olive",
 71 |     "Endo" : "khaki",
 72 |     "Peri" : "goldenrod",
 73 |     "Epen" : "burlywood",
 74 |     
 75 |     # Oligodendrocytes
 76 |     "Olig" : "slategrey",
 77 |     "OPC" : "black"
 78 | }
 79 | 
 80 | 
 81 | 
 82 | def generate_palettes(A,clust_key="clust_annot", cell_type_key="cell_type"):
 83 |     print("Updated")
 84 |     uniq_celltypes = np.sort(np.unique(A.obs[cell_type_key]))
 85 |     uniq_clusts = np.sort(A.obs[clust_key].unique())
 86 | 
 87 |     celltype_pals = []
 88 |     for i in uniq_celltypes:
 89 |         pal = gen_dark_palette(i, major_cell_types[i], uniq_celltypes)
 90 |         celltype_pals.append(pal)
 91 |     celltype_pals = cycler(color=np.vstack(celltype_pals))
 92 | 
 93 |     celltype_colors = {}
 94 |     for i,c in enumerate(iter(celltype_pals)):
 95 |         celltype_colors[uniq_celltypes[i]] = c['color']
 96 | 
 97 |     clust_pals = []
 98 |     label_colors = {}
 99 |     for i in sorted(clust_cell_types.keys()):
100 |         n = np.sum([1 if i in j else 0 for j in uniq_clusts])
101 |         if n > 0:
102 |             pal = gen_dark_palette(i, clust_cell_types[i], uniq_clusts)
103 |             print(i,pal)
104 |             clust_pals.append(pal)
105 |             # find palettes for cell types
106 |             curr_clusts = sorted([k for k in uniq_clusts if i in k])
107 |             for n,p in enumerate(pal):
108 |                 label_colors[curr_clusts[n]] = p
109 |         else:
110 |             print("Couldn't find clust", i)
111 |     clust_pals = cycler(color=np.vstack(clust_pals))
112 |     #label_colors = {}
113 |     #for i, c in enumerate(iter(clust_pals)):
114 |     #    label_colors[valid_clusts[i]] = c['color']
115 | 
116 |     return celltype_colors, celltype_pals, label_colors, clust_pals
117 | 
def calculate_aspect_ratio(A, rot=0,fov_size=221):
    """
    Estimate the width/height ratio of the tissue in units of fields of view.

    The bounding box of A.obsm['spatial'] (optionally rotated by rot degrees
    via rotate) is divided by fov_size and rounded to whole tiles.

    Returns (aspect_ratio, n_tiles_x, n_tiles_y).
    """
    all_pts = A.obsm['spatial']
    if rot != 0:
        # rotate() returns a new array; the result must be kept, and negative
        # angles are rotations too
        all_pts = rotate(all_pts, degrees=rot)
    x_extent = all_pts[:,0].max() - all_pts[:,0].min()
    y_extent = all_pts[:,1].max() - all_pts[:,1].min()
    n_tiles_x = np.round(x_extent/fov_size)
    n_tiles_y = np.round(y_extent/fov_size)
    aspect_ratio = n_tiles_x/n_tiles_y
    return aspect_ratio, n_tiles_x, n_tiles_y
130 | 
def plot_clust_subset(A, cell_types, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,alpha=0.1,clust_key="clust_annot"):
    """
    Scatter the cells whose A.obs[clust_key] is in cell_types, coloured by cluster.

    Coordinates come from A.obsm['spatial'], are rotated by rot degrees and
    shifted so the minimum x and y are zero. Selected cells are coloured by
    A.obs.clust_encoding through cmap; all other cells are drawn in light gray
    with transparency alpha. Draws on ax (a new figure is created when ax is
    None); xlim/ylim are applied when given. Returns nothing.
    """
    if ax is None:
        f,ax = plt.subplots()
    # rotate first, then shift so the lower-left corner is the origin
    all_pts = rotate(A.obsm['spatial'].copy(), degrees=rot)
    all_pts[:,0] -= all_pts[:,0].min()
    all_pts[:,1] -= all_pts[:,1].min()

    # a boolean mask yields both index sets in linear time (the previous
    # per-element "not in" scan over an array was quadratic in the cell count)
    selected = np.array([i in cell_types for i in A.obs[clust_key]], dtype=bool)
    curr_idx = np.flatnonzero(selected)
    other_idx = np.flatnonzero(~selected)
    curr_pts = all_pts[curr_idx,:]
    if len(other_idx) > 0:
        ax.scatter(all_pts[other_idx,0],all_pts[other_idx,1],s=s,vmin=0,vmax=1, c='lightgray', alpha=alpha,rasterized=True)
    print(all_pts[:,0].min(), all_pts[:,0].max(),all_pts[:,1].min(), all_pts[:,1].max())
    # curr_idx holds row positions, so index the underlying array rather than
    # relying on label/positional fallback of Series[...]
    clust_encoding = A.obs.clust_encoding
    ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,vmin=0,vmax=clust_encoding.max(),c=np.asarray(clust_encoding)[curr_idx],rasterized=True,cmap=cmap)
    ax.axis('off')
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
152 | 
def plot_seg(A, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,key='spatial_clust_annots_value',vmax=7):
    """
    Scatter every cell of A coloured by A.obs[key] through cmap (range 0..vmax).

    Coordinates come from A.obsm['spatial'], are rotated by rot degrees and
    shifted so the minimum x and y are zero. Draws on ax (a new figure is
    created when ax is None); xlim/ylim are applied when given.
    """
    if ax is None:
        _, ax = plt.subplots()
    pts = rotate(A.obsm['spatial'].copy(), degrees=rot)
    # shift so the lower-left corner of the rotated tissue is the origin
    xs = pts[:,0] - pts[:,0].min()
    ys = pts[:,1] - pts[:,1].min()
    ax.scatter(xs, ys, s=s, c=A.obs[key], cmap=cmap, vmin=0, vmax=vmax)
    ax.axis('off')
    for limits, apply_limits in ((xlim, ax.set_xlim), (ylim, ax.set_ylim)):
        if limits is not None:
            apply_limits(limits)
167 | from scipy.stats import ttest_ind, ranksums
def calc_pvals_for_grouping(x,y,data,hue,order=None):
    """
    Rank-sum test of column y between the first two hue levels, per level of x.

    order lists the x levels to test (sorted unique values of data[x] when
    None). Only the first two unique values of data[hue] are compared.
    Returns a list of p-values aligned with order.
    """
    if order is None:
        order = sorted(data[x].unique())
    hue_conds = list(data[hue].unique()) # only the first two levels are used

    def values_for(level, cond):
        return data.loc[(data[x]==level) & (data[hue]==cond), y]

    return [
        ranksums(values_for(level, hue_conds[0]), values_for(level, hue_conds[1])).pvalue
        for level in order
    ]
179 | 
def plot_pvals(ax, pvals):
    """Mark with '*' at the top of ax every x tick whose p-value is below 0.01."""
    _, y_top = ax.get_ylim()
    tick_positions = ax.get_xticks()
    significant = [pos for pos, pval in enumerate(pvals) if pval < 0.01]
    for pos in significant:
        ax.text(tick_positions[pos], y_top, '*')
186 |  
def plot_cond_obs_comparison(data, x, y, cell_type, figsize=(5,3), order=None, clust_key='cell_type', cond_pal=sns.color_palette(['g','m']), ylim=None):
    """
    Box plot plus strip plot of obs column y against x, split by 'cond', for one cell type.

    Only rows of data.obs with data.obs[clust_key] == cell_type are plotted.
    order fixes the x categories (sorted unique values when None), cond_pal
    colours the boxes and ylim optionally sets the y range. The legend is
    suppressed. Returns the created figure.
    """
    f, ax = plt.subplots(figsize=figsize)
    curr_df = data[data.obs[clust_key]==cell_type].obs
    if order is None:
        order = sorted(curr_df[x].unique())
    shared = dict(data=curr_df, x=x, y=y, hue='cond', order=order, ax=ax)
    # boxes without outlier markers; the individual points are overlaid below
    sns.boxplot(fliersize=0, linewidth=1, palette=cond_pal, **shared)
    sns.stripplot(jitter=0.15, size=0.5, dodge=True, color='k', rasterized=True, **shared)
    sns.despine()
    if ylim is not None:
        ax.set_ylim(ylim)
    # an empty legend hides the hue entries added by both plots
    plt.legend([],[], frameon=False)
    return f
206 |            
def plot_age_obs_comparison(data, x, y, cell_type, figsize=(5,3), show_pvals=False, order=None, clust_key='cell_type', age_pal=sns.color_palette(['cornflowerblue','thistle','lightcoral'])):
    """
    Box plot plus strip plot of obs column y against x, split by 'age', for one cell type.

    Only rows of data.obs with data.obs[clust_key] == cell_type are plotted.
    figsize sets the figure size, order fixes the x categories (sorted unique
    values when None) and age_pal colours the boxes. When show_pvals is True,
    rank-sum p-values between the first two age levels are computed per x
    category and significant ones are starred. The legend is suppressed.
    Returns the created figure.
    """
    # honour the figsize argument (it used to be ignored in favour of a hard-coded (5,3))
    f, ax = plt.subplots(figsize=figsize)
    curr_df = data[data.obs[clust_key]==cell_type].obs
    if order is None:
        order = sorted(curr_df[x].unique())
    # boxes without outlier markers; the individual points are overlaid below
    sns.boxplot(x=x, y=y, data=curr_df,hue='age',fliersize=0,linewidth=1,palette=age_pal, ax=ax,order=order)
    sns.stripplot(data=curr_df, x=x, y=y, hue="age", ax=ax,jitter=0.15,size=0.5,dodge=True,color='k',order=order, rasterized=True)

    sns.despine()
    # an empty legend hides the hue entries added by both plots
    plt.legend([],[], frameon=False)
    if show_pvals:
        pvals = calc_pvals_for_grouping(x,y,curr_df, "age",order=order)
        plot_pvals(ax, pvals)
    return f
223 | 
def plot_obs(A, cell_types, obs_name, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,vmin=0,vmax=10,alpha=0.1,key="clust_annot"):
    """
    Scatter the cells whose A.obs[key] is in cell_types, coloured by A.obs[obs_name].

    Coordinates come from A.obsm['spatial'], are rotated by rot degrees and
    shifted so the minimum x and y are zero. Selected cells are coloured
    through cmap over [vmin, vmax]; all other cells are drawn in light gray
    with transparency alpha. Draws on ax (a new figure is created when ax is
    None); xlim/ylim are applied when given. Returns nothing.
    """
    if ax is None:
        f,ax = plt.subplots()
    all_pts = A.obsm['spatial'].copy()
    print("Shape", all_pts.shape)
    # rotate first, then shift so the lower-left corner is the origin
    all_pts = rotate(all_pts, degrees=rot)
    all_pts[:,0] -= all_pts[:,0].min()
    all_pts[:,1] -= all_pts[:,1].min()

    # a boolean mask yields both index sets in linear time (the previous
    # per-element "not in" scan over an array was quadratic in the cell count)
    selected = np.array([i in cell_types for i in A.obs[key]], dtype=bool)
    curr_idx = np.flatnonzero(selected)
    other_idx = np.flatnonzero(~selected)
    curr_pts = all_pts[curr_idx,:]
    if len(other_idx) > 0:
        ax.scatter(all_pts[other_idx,0],all_pts[other_idx,1],s=s,vmin=0,vmax=1, c='lightgray', alpha=alpha,rasterized=True, edgecolors='face')
    ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,vmin=vmin,vmax=vmax,c=np.array(A[curr_idx,:].obs[obs_name]),rasterized=True,cmap=cmap, edgecolors='face')
    ax.axis('off')
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
246 | 
def plot_gene_expr(A, cell_types, gene_name, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,vmin=0,vmax=10,use_raw=True,key='clust_annot',alpha=1):
    """
    Scatter the cells whose A.obs[key] is in cell_types, coloured by expression of gene_name.

    Expression is read from A.raw when use_raw is True, otherwise from A.X
    (the matrix must support .toarray()). Selected cells are coloured through
    cmap over [vmin, vmax]; the remaining cells are drawn in light gray
    underneath. Coordinates come from obsm['spatial'], rotated by rot degrees.
    Draws on ax (a new figure is created when ax is None); xlim/ylim are
    applied when given. Returns nothing.

    Raises IndexError if gene_name is not in A.var_names.
    """
    if ax is None:
        f,ax = plt.subplots()
    # a boolean mask yields both integer index sets in linear time; this also
    # removes the cast through np.int, an alias dropped from recent NumPy
    selected = np.array([i in cell_types for i in A.obs[key]], dtype=bool)
    curr_idx = np.flatnonzero(selected)
    other_idx = np.flatnonzero(~selected)
    curr_adata = A[curr_idx, :]
    other_adata = A[other_idx, :]
    if use_raw:
        curr_adata = curr_adata.raw.to_adata()
    curr_pts = curr_adata.obsm['spatial']
    other_pts = other_adata.obsm['spatial']
    # NOTE(review): foreground and background are zero-centred independently,
    # so they only line up when both span the same extent - confirm intended
    curr_pts = rotate(curr_pts, degrees=rot)
    curr_pts[:,0] -= curr_pts[:,0].min()
    curr_pts[:,1] -= curr_pts[:,1].min()

    # the index itself is unused; the lookup is kept so a gene missing from
    # A.var_names fails here, before anything is drawn
    gene_idx = np.argwhere([i==gene_name for i in A.var_names]).flatten()[0]
    if len(other_idx) > 0 and other_pts.shape[0] != curr_pts.shape[0]:
        print("plotting background")
        other_pts = rotate(other_pts, degrees=rot)
        other_pts[:,0] -= other_pts[:,0].min()
        other_pts[:,1] -= other_pts[:,1].min()

        ax.scatter(other_pts[:,0],other_pts[:,1],s=s,vmin=0,vmax=1, c='lightgray', rasterized=True, zorder=0,alpha=alpha)
    expr = np.array(curr_adata[:,gene_name].X.toarray()).flatten()
    ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,vmin=vmin,vmax=vmax,c=expr,rasterized=True,cmap=cmap, zorder=1,alpha=alpha)
    ax.axis('off')
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
280 | 
def rotate(p, origin=(0, 0), degrees=0):
    """
    Rotate 2-D point(s) p counter-clockwise by degrees around origin.

    p may be a single (x, y) pair or an array with one point per row. A new
    array is returned with singleton dimensions squeezed out, so a single
    point comes back with shape (2,).
    """
    theta = np.deg2rad(degrees)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rot_mat = np.array([[cos_t, -sin_t],
                        [sin_t,  cos_t]])
    # work with points as columns so the matrix product applies to all at once
    pivot = np.atleast_2d(origin).T
    pts = np.atleast_2d(p).T
    rotated = rot_mat @ (pts - pivot) + pivot
    return np.squeeze(rotated.T)
288 | 
def plot_obs_by_cells(A, obs_name, s=0.1, cmap=plt.cm.gist_rainbow, show_legend=False, vmax=None, rot=0):
    """
    Scatter every cell of A on the current axes, coloured by A.obs[obs_name].

    Coordinates come from A.obsm['spatial'], rotated by rot degrees when rot
    is non-zero. Colours are mapped through cmap over [0, vmax]; vmax defaults
    to the number of unique values of the observation. show_legend adds a
    legend built from the unique observation values. Returns nothing.
    """
    # pandas is not imported at module level in this file, so import it here
    import pandas as pd

    pts = A.obsm['spatial']
    if rot != 0:
        pts = rotate(pts, degrees=rot)
    pts = pd.DataFrame({'x': pts[:,0], 'y': pts[:,1], 'obs':A.obs[obs_name]})
    if vmax is None:
        vmax = len(pts.obs.unique())
    plt.scatter(pts.x,pts.y,s=s,vmin=0,vmax=vmax,c=pts.obs,cmap=cmap)
    if show_legend:
        plt.legend(pts.obs.unique())
302 |         
303 | #def plot_gene_by_cells(A, gene_name, s=0.1, cmap=plt.cm.Reds):
304 | #    gene_idx = np.argwhere(A.var_names==gene_name)[0][0]
305 | #    pts = pd.DataFrame({'x': A.obs.center_x, 'y':  A.obs.center_y, 'obs':A.X[:,gene_idx]})
306 | #    plt.scatter(pts.x,pts.y,c=pts.obs, cmap=cmap, s=s)
307 | 
308 | 
def plot_expr_matrix_single(tstats, pvals, celltypes, vmin=-25, vmax=25,cmap=plt.cm.seismic, ax=None):
    """
    Draw one row of dots, one per entry of tstats, sized by significance.

    pvals are FDR-corrected (Benjamini-Hochberg) after replacing NaN with 1;
    corrected values are floored at 1e-10. Dot area is -10*log10(q), dot
    colour is the t-statistic mapped through cmap over [vmin, vmax], and the
    outline is black when q < 0.05 and white otherwise. celltypes labels the
    x ticks. Draws on ax (a new 5x1 figure is created when ax is None).
    The pvals argument is not modified. Returns nothing.
    """
    # work on a copy so the caller's p-value array keeps its NaNs
    pvals = np.where(np.isnan(pvals), 1, pvals)
    pvals_correct = multipletests(pvals.flatten(), method='fdr_bh')[1]
    pvals_correct = pvals_correct.reshape(tstats.shape)
    # floor tiny q-values so the dot size stays bounded
    pvals_correct[pvals_correct<1e-10] = 1e-10
    if ax is None:
        f, ax = plt.subplots(figsize=(5,1))
    for i in range(tstats.shape[0]):
        edge = 'k' if pvals_correct[i] < 0.05 else 'w'
        ax.scatter(i, 1, s=-np.log10(pvals_correct[i])*10, c=tstats[i],vmin=vmin,vmax=vmax,cmap=cmap, lw=1, edgecolor=edge)
    ax.set_xticks(np.arange(len(celltypes)))
    ax.set_yticks([])
    ax.set_xticklabels(celltypes,rotation=90)
325 | 
def plot_expr_matrix_by_name(tstats, pvals, gene_name, var_names,celltypes, vmin=-25, vmax=25,cmap=plt.cm.seismic):
    """
    Dot matrix of source-by-neighbor t-statistics for a single gene.

    tstats and pvals are indexed as [source, neighbor, gene]; gene_name is
    located in var_names. pvals are FDR-corrected (Benjamini-Hochberg) over
    the whole array after replacing NaN with 1, then floored at 1e-10. Only
    entries with q < 0.05 are drawn, with area -10*log10(q) and colour given
    by the t-statistic over [vmin, vmax]. celltypes labels both axes.
    The pvals argument is not modified. Returns nothing.

    Raises IndexError if gene_name is not found in var_names.
    """
    # scalar position of the gene, so the title and the per-dot values are
    # plain scalars rather than length-1 arrays
    idx = int(np.argwhere(var_names==gene_name)[0][0])
    # work on a copy so the caller's p-value array keeps its NaNs
    pvals = np.where(np.isnan(pvals), 1, pvals)
    pvals_correct = multipletests(pvals.flatten(), method='fdr_bh')[1]
    pvals_correct = pvals_correct.reshape(tstats.shape)
    pvals_correct[pvals_correct<1e-10] = 1e-10
    f, ax = plt.subplots(figsize=(5,5))
    ax.set_title(var_names[idx])
    for i in range(tstats.shape[0]):
        for j in range(tstats.shape[1]):
            if pvals_correct[i,j,idx] < 0.05:
                # NOTE(review): the colormap is fixed to bwr and the cmap
                # argument is not used here - confirm which is intended
                ax.scatter(i, j, s=-np.log10(pvals_correct[i,j,idx])*10, c=tstats[i,j,idx],vmin=vmin,vmax=vmax,cmap=plt.cm.bwr, lw=1, edgecolor='k')

    # axis labelling does not depend on the loop, so do it once
    ax.set_xticks(np.arange(len(celltypes)))
    ax.set_yticks(np.arange(len(celltypes)))
    ax.set_xticklabels(celltypes,rotation=90)
    ax.set_yticklabels(celltypes)
    ax.set_xlabel('Source')
    ax.set_ylabel('Neighbor')
349 | 
def plot_interactions(pvals, interactions, celltypes, celltype_colors,figsize=(5,5),seg_points=None,vmin=0,vmax=5,cmap=plt.cm.Reds, qval_thresh=0.1):
    """
    Dot matrix of cell-type interactions with colour bars naming the cell types.

    A dot is drawn at column i, row (n-1-j) for every pair (i, j) whose
    pvals[i, j] is below qval_thresh, coloured by interactions[i, j] through
    cmap over [vmin, vmax]. celltype_colors maps each entry of celltypes to
    the colour used in the side and bottom bars. seg_points optionally lists
    indices at which dashed separator lines are drawn on both axes.
    The pvals argument is not modified. Returns the figure.
    """
    # floor the p-values on a copy so the caller's array is left untouched
    pvals = np.maximum(pvals, 1e-10)
    f, ax = plt.subplots(figsize=figsize)
    gs = plt.GridSpec(nrows=2,ncols=2, width_ratios=[0.5,20], height_ratios=[20,0.5], wspace=0.1, hspace=0.1)

    # left colour bar: cell types from bottom to top
    ax = plt.subplot(gs[0,0])
    curr_cmap = mpl.colors.ListedColormap([celltype_colors[i] for i in celltypes])
    ax.imshow(np.expand_dims(np.arange(interactions.shape[0])[::-1],1),aspect='auto',interpolation='none',cmap=curr_cmap)
    sns.despine(ax=ax,bottom=True,left=True)
    ax.set_xticks([])
    ax.set_yticks(np.arange(len(celltypes)))
    ax.set_yticklabels(celltypes[::-1])

    # main panel: blank background with one dot per significant pair
    ax = plt.subplot(gs[0,1])
    ax.imshow(np.zeros_like(interactions), cmap=plt.cm.seismic, rasterized=True, aspect='auto',interpolation='none', vmin=-1,vmax=1)
    for i in range(interactions.shape[0]):
        for j in range(interactions.shape[0]):
            if pvals[i,j] < qval_thresh:
                ax.scatter(i, interactions.shape[0]-j-1, s=100, c=interactions[i,j],cmap=cmap,vmin=vmin,vmax=vmax, lw=1, edgecolor='k')
    if seg_points is not None:
        for i in seg_points:
            ax.axvline(i-0.5,color='k',linestyle='--')
            # rows are flipped, so mirror the separator; uses the number of
            # cell types (the previous code referenced an undefined name here)
            ax.axhline(len(celltypes)-i-0.5,color='k',linestyle='--')

    ax.axis('off')

    # bottom colour bar: cell types from left to right
    ax = plt.subplot(gs[1,1])
    curr_cmap = mpl.colors.ListedColormap([celltype_colors[i] for i in celltypes])
    ax.imshow(np.expand_dims(np.arange(interactions.shape[0]),1).T,aspect='auto',interpolation='none',cmap=curr_cmap)
    sns.despine(ax=ax,bottom=True,left=True)
    ax.set_xticks(np.arange(len(celltypes)))
    ax.set_xticklabels(celltypes,rotation=90)
    ax.set_yticks([])
    return f
def plot_clust_spatial_enrichment(A,vmin=0,vmax=1,uniq_clusts=None,clust_key='clust_annot',label_colors=None, spatial_domains=['Pia','L2/3', 'L5','L6', 'LatSept', 'CC', 'Striatum','Ventricle'],
    seg_cmap=plt.cm.viridis):
    """
    Heatmap of how each cluster's cells are distributed over spatial domains.

    Cells are counted per (A.obs.spatial_clust_annots_value, A.obs[clust_key])
    pair and each cluster column is normalised to sum to one. The figure shows
    a domain colour bar (seg_cmap, labelled with spatial_domains), the heatmap
    over [vmin, vmax], and a cluster colour bar (label_colors when given,
    viridis otherwise).

    Returns (clust_avgs, clust_counts), both of shape (n_domains, n_clusters).
    """
    if uniq_clusts is None:
        uniq_clusts = sorted(A.obs[clust_key].unique())
    domain_ids = A.obs.spatial_clust_annots_value
    n_spatial_domains = int(domain_ids.max() + 1)
    n_clusts = len(uniq_clusts)

    clust_counts = np.zeros((n_spatial_domains, n_clusts))
    print(clust_counts.shape)
    for domain in range(n_spatial_domains):
        domain_clusts = A[domain_ids==domain,:].obs[clust_key]
        clust_counts[domain,:] = [np.sum(domain_clusts==c) for c in uniq_clusts]
    # fraction of each cluster's cells that falls in every domain
    clust_avgs = clust_counts / clust_counts.sum(axis=0)

    f, ax = plt.subplots(figsize=(5.5,1))
    gs = plt.GridSpec(nrows=2,ncols=2,width_ratios=[0.36, 20], height_ratios=[20,2], wspace=0.01, hspace=0.05)

    # left strip: one colour per spatial domain
    domain_ax = plt.subplot(gs[0,0])
    domain_strip = np.expand_dims(np.arange(n_spatial_domains),1)
    domain_ax.imshow(domain_strip,aspect='auto',interpolation='none', cmap=seg_cmap,rasterized=True)
    sns.despine(ax=domain_ax,bottom=True,left=True)
    domain_ax.set_yticks(np.arange(clust_avgs.shape[0]))
    domain_ax.set_yticklabels(spatial_domains,fontsize=6)
    domain_ax.set_xticks([])

    # main panel: normalised enrichment heatmap
    heat_ax = plt.subplot(gs[0,1])
    heat_ax.imshow(clust_avgs,aspect='auto',vmin=vmin,vmax=vmax, cmap=plt.cm.viridis)
    heat_ax.set_xticks([])
    heat_ax.set_yticks([])
    heat_ax.axis('off')

    # bottom strip: one colour per cluster
    clust_ax = plt.subplot(gs[1,1])
    if label_colors is None:
        curr_cmap = plt.cm.viridis
    else:
        curr_cmap = mpl.colors.ListedColormap([label_colors[i] for i in uniq_clusts])
    clust_strip = np.expand_dims(np.arange(n_clusts),1).T
    clust_ax.imshow(clust_strip,aspect='auto',interpolation='none', cmap=curr_cmap,rasterized=True)
    clust_ax.set_xticks(np.arange(clust_avgs.shape[1]))
    clust_ax.set_yticks([])
    clust_ax.set_xticklabels(uniq_clusts,rotation=90,fontsize=6)
    sns.despine(ax=clust_ax, left=True, bottom=True)
    return clust_avgs, clust_counts
434 | 


--------------------------------------------------------------------------------
/python/spatial_analysis.py:
--------------------------------------------------------------------------------
  1 | import scanpy as sc
  2 | import matplotlib.pyplot as plt
  3 | import numpy as np
  4 | import matplotlib as mpl
  5 | import os
  6 | import anndata as ad
  7 | from tqdm import tqdm
  8 | import scipy.stats 
  9 | from statsmodels.stats.multitest import multipletests
 10 | from sklearn.neighbors import KDTree
 11 | import multiprocessing
 12 | from joblib import Parallel, delayed
 13 | 
 14 | # for each cell compute statistics of neighbors within radius
 15 | from sklearn.neighbors import KDTree
 16 | from sklearn.preprocessing import LabelEncoder
 17 | from scipy.stats import zscore
 18 | #curr_adata = adata_annot[adata_annot.obs.batch==9]
 19 | from tqdm import tqdm
 20 | def compute_neighborhood_stats(pos, labels, radius=100):
 21 |     # record labels as numbers
 22 |     labels_quant = LabelEncoder().fit_transform(labels)
 23 |     # for each cell, look up neighbors
 24 |     kdtree = KDTree(pos)
 25 |     nbors_idx, nbors_dist = kdtree.query_radius(pos, r=radius, return_distance=True)
 26 |     nbor_stats = np.zeros((pos.shape[0], len(np.unique(labels_quant))))
 27 | 
 28 |     for i in tqdm(range(pos.shape[0])):
 29 |         curr_nbors_idx = np.sort(nbors_idx[i][nbors_dist[i]>0])#[1:]
 30 |         #curr_nbors_dists = nbors_dist[i][np.argsort(nbors_idx[i])]
 31 |         curr_nbors_labels = labels_quant[curr_nbors_idx]
 32 |         for j in curr_nbors_labels:
 33 |             nbor_stats[i,j] += 1
 34 |     # zscore across each cluster
 35 |     for i in range(nbor_stats.shape[0]):
 36 |         nbor_stats[i,:] = zscore(nbor_stats[i,:])
 37 |     nbor_stats[np.isinf(nbor_stats)] = 0
 38 |     return nbor_stats
 39 | 
 40 | def calc_pval(obs, rand, empirical=False):
 41 |     if empirical:
 42 |         return np.sum(obs <= np.array(rand))/len(rand)
 43 |     else:
 44 |         z = (obs - np.mean(rand))/np.std(rand)
 45 |         return scipy.stats.norm.sf(abs(z))*2
 46 | 
 47 | def calc_pval_onesided(obs, rand):
 48 |     z = (obs-np.mean(rand))/np.std(rand)
 49 |     if z > 0:
 50 |         return scipy.stats.norm.sf(abs(z))
 51 |     else:
 52 |         return 1
 53 | 
 54 | def count_nearest_neighbors(X,Y,dist_thresh):
 55 |     if X.shape[0] > 0 and Y.shape[0] > 0:
 56 |         kdtree = KDTree(Y)
 57 |         idx, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False, return_distance=True)
 58 |         dists = np.hstack(dists)
 59 |         return len(dists[dists>0])
 60 |     else:
 61 |         return 0
 62 | 
 63 | 
 64 | def count_interactions(X,Y, dist_thresh=15):
 65 |     X_pos = X.obsm['spatial']#obs[["center_x","center_y"]].values
 66 |     Y_pos = Y.obsm['spatial']#obs[["center_x", "center_y"]].values
 67 |     return count_nearest_neighbors(X_pos, Y_pos, dist_thresh)
 68 | 
 69 | def _jitter_interaction_parallel(X_pos, Y_pos, dist_thresh, perturb_max):
 70 |     curr_X = X_pos + np.random.uniform(-perturb_max, perturb_max, (X_pos.shape[0],2))
 71 |     curr_Y = Y_pos + np.random.uniform(-perturb_max, perturb_max, (Y_pos.shape[0],2))
 72 |     return count_nearest_neighbors(curr_X, curr_Y, dist_thresh) + count_nearest_neighbors(curr_Y, curr_X, dist_thresh)
 73 | 
 74 | def score_neighborhood(X,Y, dist_thresh=150, niter=500):
 75 |     X_pos = X.obsm['spatial']#.obs[["center_x","center_y"]].values
 76 |     Y_pos = Y.obsm['spatial']#.obs[["center_x", "center_y"]].values
 77 |     obs_freq = count_nearest_neighbors(X_pos, Y_pos, dist_thresh) + count_nearest_neighbors(Y_pos, X_pos, dist_thresh)
 78 |     pvals = np.zeros((niter,))
 79 |     num_cores = multiprocessing.cpu_count()
 80 |     iterations = tqdm(range(niter))
 81 |     random_freq = Parallel(n_jobs=num_cores)(delayed(_jitter_interaction_parallel)(X_pos, Y_pos, dist_thresh, perturb_max) for i in iterations)
 82 |     return obs_freq, random_freq, calc_pval(obs_freq, random_freq)
 83 | 
def compare_celltype_interactions(A, B, celltype_key, celltypes=None, niter=1000):
    """
    Compute distributions of celltype interactions between two conditions.

    For every ordered pair of cell types the interaction score is the ratio
    mean(B counts) / mean(A counts), and the p-value is the fraction of
    label-shuffled ratios whose absolute value exceeds the observed one.

    Parameters
    ----------
    A, B : objects indexable by a boolean mask, with `obs[celltype_key]`.
    celltype_key : column of `obs` holding the cell-type label.
    celltypes : cell types to compare; defaults to the sorted unique labels of A.
    niter : number of label shuffles per pair.

    Returns
    -------
    (celltype_interactions, celltype_pvals), both (n_types, n_types) arrays.

    NOTE(review): `count_interactions` returns a single integer count, so the
    `len(obs_freq_A)` / `len(obs_freq_B)` calls below would raise TypeError as
    written. This function appears to expect per-cell counts -- confirm the
    intended input before relying on it.
    """
    if celltypes is None:
        celltypes = sorted(A.obs[celltype_key].unique())
    celltype_interactions = np.zeros((len(celltypes), len(celltypes)))
    celltype_pvals = np.zeros((len(celltypes), len(celltypes)))
    for i, c1 in enumerate(celltypes):
        print(c1)
        for j,c2 in enumerate(celltypes):
            # the distance threshold is fixed at 15 here, not configurable
            obs_freq_A = count_interactions(A[A.obs[celltype_key]==c1], A[A.obs[celltype_key]==c2], dist_thresh=15)
            obs_freq_B = count_interactions(B[B.obs[celltype_key]==c1], B[B.obs[celltype_key]==c2], dist_thresh=15)
            
            # pool both conditions; label 0 = from A, label 1 = from B
            combined_obs = np.hstack((obs_freq_A, obs_freq_B))
            obs_labels = np.hstack((np.zeros(len(obs_freq_A)), np.ones(len(obs_freq_B))))
            shuffled_obs = []
            for n in tqdm(range(niter)):
                # shuffle labels
                curr_obs_labels = obs_labels[np.random.permutation(len(obs_labels))] 
                # compute score
                shuffled_obs.append(np.mean(combined_obs[curr_obs_labels==1])/np.mean(combined_obs[curr_obs_labels==0]))
            obs_freq = np.mean(obs_freq_B)/np.mean(obs_freq_A)
            celltype_interactions[i,j] = obs_freq #(obs_freq - np.mean(random_freq))/np.std(random_freq)
            # fraction of shuffles strictly more extreme than the observed ratio
            celltype_pvals[i,j] = np.sum(np.abs(obs_freq) < np.abs(shuffled_obs))/niter
    # flatten/unflatten brackets the (currently disabled) multiple-testing step
    celltype_pvals = celltype_pvals.reshape((len(celltypes)**2,))
    #if len(celltype_pvals)>0:
    #    celltype_pvals = multipletests(celltype_pvals, method='fdr')
    celltype_pvals = celltype_pvals.reshape((len(celltypes), len(celltypes)))
    return celltype_interactions, celltype_pvals
114 | 
def score_interactions(X,Y, dist_thresh=15, niter=100, perturb_max=50, thresh=10, one_sided=False):
    """
    Test whether two cell sets are in contact more often than a jittered null.

    The observed statistic is the number of X->Y plus Y->X pairs within
    `dist_thresh`. If it is below `thresh` the test is skipped and
    `(obs_freq, 0, 1.0)` is returned. Otherwise `niter` jittered counts are
    drawn in parallel and a normal-approximation p-value is computed
    (upper-tail when `one_sided`, two-sided otherwise).

    Returns (obs_freq, random_freq, pval).
    """
    X_pos = X.obsm['spatial']
    Y_pos = Y.obsm['spatial']
    forward = count_nearest_neighbors(X_pos, Y_pos, dist_thresh)
    backward = count_nearest_neighbors(Y_pos, X_pos, dist_thresh)
    obs_freq = forward + backward
    # too few observed contacts to test meaningfully
    if obs_freq < thresh:
        return obs_freq, 0, 1.0

    n_workers = multiprocessing.cpu_count()
    random_freq = Parallel(n_jobs=n_workers)(
        delayed(_jitter_interaction_parallel)(X_pos, Y_pos, dist_thresh, perturb_max)
        for _ in range(niter)
    )
    pval_fn = calc_pval_onesided if one_sided else calc_pval
    return obs_freq, random_freq, pval_fn(obs_freq, random_freq)
130 | 
def compute_celltype_interactions(A, celltype_key, celltypes=None, niter=100, perturb_max=50, dist_thresh=30, min_cells=10, onesided=False):
    """
    Score pairwise cell-type contacts in `A` with `score_interactions`.

    Parameters
    ----------
    A : object indexable by a boolean mask, with `obs[celltype_key]`.
    celltype_key : column of `obs` holding the cell-type label.
    celltypes : cell types to score; defaults to the sorted unique labels.
    niter, perturb_max, dist_thresh : forwarded to `score_interactions`.
    min_cells : a pair is scored only if both types have more than this many cells.
    onesided : forwarded to `score_interactions` as `one_sided`.

    Returns
    -------
    celltype_interactions : (n_types, n_types) symmetric array of
        log2(observed / mean null) contact counts; 0 for unscored pairs.
    celltype_pvals : (n_types, n_types) symmetric array; 1 for unscored pairs.
    celltype_qvals : flat array of length n_types**2 that is all zeros --
        the FDR correction is commented out, so this is a placeholder.

    Side effects: silences all warnings process-wide via
    `warnings.filterwarnings("ignore")` and prints progress to stdout.
    """
    import warnings
    warnings.filterwarnings("ignore")
    print("updated!")
    if celltypes is None:
        celltypes = sorted(A.obs[celltype_key].unique())
    celltype_interactions = np.zeros((len(celltypes), len(celltypes)))
    celltype_pvals = np.zeros((len(celltypes), len(celltypes)))
    for i, c1 in enumerate(celltypes):
        print(c1)
        for j,c2 in enumerate(celltypes):
            # only the upper triangle is computed; results are mirrored
            if i <= j:
                # don't do this for pairs where either has < min_cells
                if np.sum(A.obs[celltype_key]==c1) > min_cells and np.sum(A.obs[celltype_key]==c2) > min_cells:
                    obs_freq, random_freq, pval = score_interactions(A[A.obs[celltype_key]==c1],
                            A[A.obs[celltype_key]==c2], perturb_max=perturb_max, dist_thresh=dist_thresh, niter=niter, one_sided=onesided)
                    # score_interactions returns random_freq == 0 when obs_freq is
                    # below its threshold, so the ratio printed here can be inf/nan
                    print(c1, c2, obs_freq, np.mean(random_freq), obs_freq/np.mean(random_freq), pval)
                    # 1e-10 guards the log-ratio against a zero null mean
                    celltype_interactions[i,j] = np.log2(obs_freq/(1e-10+np.mean(random_freq)))#np.log2(obs_freq/np.mean(random_freq))#(obs_freq - np.mean(random_freq))/np.std(random_freq)
                    celltype_interactions[j,i] = celltype_interactions[i,j]#(obs_freq - np.mean(random_freq))/np.std(random_freq)#np.log2(obs_freq/np.mean(random_freq))#

                    celltype_pvals[i,j] = pval
                    celltype_pvals[j,i] = pval
                else:
                    celltype_pvals[i,j] = 1.
                    celltype_pvals[j,i] = 1.
    celltype_pvals = celltype_pvals.reshape((len(celltypes)**2,))
    # q-values are allocated on the flattened p-values but never filled in
    celltype_qvals = np.zeros_like(celltype_pvals)
    if len(celltype_pvals)>0:
        for i in range(celltype_pvals.shape[0]):
            pass
            #celltype_qvals[i,:] = multipletests(celltype_pvals[i,:], method='fdr_bh')[1]
    celltype_pvals = celltype_pvals.reshape((len(celltypes), len(celltypes)))
    return celltype_interactions, celltype_pvals, celltype_qvals
164 | 
def _compute_neighborhood(pos, labels, celltypes, radius):
    """
    Symmetric matrix of close-pair counts between every pair of cell types.

    Entry [i, j] is the number of pairs (cell of type i, cell of type j)
    within `radius`, as counted by `count_nearest_neighbors`. Only the upper
    triangle is computed; the lower triangle is mirrored from it.
    """
    n_types = len(celltypes)
    neighbors = np.zeros((n_types, n_types))

    for i, c1 in enumerate(celltypes):
        pts_c1 = pos[labels==c1]
        for j, c2 in enumerate(celltypes):
            if j < i:
                continue
            pts_c2 = pos[labels==c2]
            pair_count = np.sum(count_nearest_neighbors(pts_c1, pts_c2, dist_thresh=radius))
            neighbors[i,j] = pair_count
            neighbors[j,i] = neighbors[i,j]
    return neighbors
177 | 
def _compute_neighbor_shuffled(pos, labels, celltypes, radius):
    """
    One null draw for `compute_celltype_neighborhood`: randomly permute the
    labels across positions, then recompute the neighborhood count matrix.
    """
    shuffle_order = np.random.permutation(len(labels))
    return _compute_neighborhood(pos, labels[shuffle_order], celltypes, radius)
181 | 
def compute_celltype_neighborhood(A, celltype_key, celltypes=None, radius=150, niter=10):
    """
    Cell-type neighborhood enrichment against a label-shuffling null.

    Parameters
    ----------
    A : object with `obs[celltype_key]` labels and `obsm['spatial']` positions.
    celltypes : cell types to include; defaults to the sorted unique labels.
    radius : neighbor distance threshold.
    niter : number of label shuffles, run in parallel.

    Returns
    -------
    (neighbors, zs, pval): the observed pair-count matrix, its z-score
    relative to the shuffled matrices, and the two-sided normal p-value from
    `calc_pval`, each of shape (n_types, n_types).
    """
    if celltypes is None:
        celltypes = list(sorted(A.obs[celltype_key].unique()))
    pos = A.obsm['spatial']
    labels = A.obs[celltype_key]
    neighbors = _compute_neighborhood(pos, labels, celltypes, radius)

    # for each iteration, shuffle celltype labels
    progress = tqdm(range(niter))
    n_workers = multiprocessing.cpu_count()
    random_freq = Parallel(n_jobs=n_workers)(
        delayed(_compute_neighbor_shuffled)(pos, labels, celltypes, radius)
        for _ in progress
    )

    # stack null matrices along a third axis: (n_types, n_types, niter)
    null_stack = np.dstack(random_freq)
    zs = (neighbors - null_stack.mean(2)) / np.std(null_stack, 2)
    pval = np.zeros_like(neighbors)
    for i, j in np.ndindex(*neighbors.shape):
        pval[i, j] = calc_pval(neighbors[i, j], null_stack[i, j, :])
    return neighbors, zs, pval
204 | 
def _compare_neighborhoods(pos_A, pos_B, labels_A, labels_B, celltypes, radius):
    """
    Difference (B minus A) of the cell-type neighborhood count matrices of
    two conditions.
    """
    counts_A = _compute_neighborhood(pos_A, labels_A, celltypes, radius)
    counts_B = _compute_neighborhood(pos_B, labels_B, celltypes, radius)
    return counts_B - counts_A
209 | 
def _compare_neighbor_shuffled(pos_A, pos_B, labels_A, labels_B, celltypes, radius):
    """
    One null draw for `compare_celltype_neighborbood`: reassign neighbor
    counts between conditions A and B at random and return the resulting
    (B minus A) difference matrix of shape (n_types, n_types).

    NOTE(review): `count_nearest_neighbors` returns one integer per call, so
    `combined_neighbors` has length 2, while `label_idents` has one entry per
    cell of type c1 in A and B. The boolean masks below therefore only match
    in length when those cell counts sum to 2. The code reads as if per-cell
    neighbor counts were expected -- confirm before use.
    """
    neighbors_A = np.zeros((len(celltypes), len(celltypes)))
    neighbors_B = np.zeros((len(celltypes), len(celltypes)))

    for i, c1 in enumerate(celltypes):
        curr_X_A = pos_A[labels_A==c1]
        curr_X_B = pos_B[labels_B==c1]
        for j, c2 in enumerate(celltypes):
            curr_Y_A = pos_A[labels_A==c2]
            curr_Y_B = pos_B[labels_B==c2]
            # upper triangle only; the lower triangle is mirrored below
            if i <= j:
                # make vector of label identities
                label_idents = np.hstack((np.zeros(curr_X_A.shape[0]), np.ones(curr_X_B.shape[0])))
                label_idents = label_idents[np.random.permutation(len(label_idents))]
                nn_A = count_nearest_neighbors(curr_X_A, curr_Y_A, dist_thresh=radius)
                nn_B = count_nearest_neighbors(curr_X_B, curr_Y_B, dist_thresh=radius)
                # shuffle which cells came from which identity
                combined_neighbors = np.hstack((nn_A, nn_B))
                neighbors_A[i,j] = np.sum(combined_neighbors[label_idents==0])
                neighbors_A[j,i] = neighbors_A[i,j]
                neighbors_B[i,j] = np.sum(combined_neighbors[label_idents==1])
                neighbors_B[j,i] = neighbors_B[i,j]
    return neighbors_B - neighbors_A
233 |  
def compare_celltype_neighborbood(A, B, celltype_key, celltypes=None, radius=150, niter=10):
    """
    Compare cell-type neighborhood counts between two conditions.

    Parameters
    ----------
    A, B : objects with `obs[celltype_key]` labels and `obsm['spatial']` positions.
    celltypes : cell types to include; defaults to the sorted unique labels of A.
    radius : neighbor distance threshold.
    niter : number of null draws from `_compare_neighbor_shuffled`, run in parallel.

    Returns
    -------
    (neighbor_diff, zs, pval, random_freq): the observed B-minus-A count
    matrix, its z-score against the null draws, the fraction of null draws
    at least as extreme in absolute value, and the list of null matrices.
    """
    if celltypes is None:
        celltypes = list(sorted(A.obs[celltype_key].unique()))

    pos_A, labels_A = A.obsm['spatial'], A.obs[celltype_key]
    pos_B, labels_B = B.obsm['spatial'], B.obs[celltype_key]

    counts_A = _compute_neighborhood(pos_A, labels_A, celltypes, radius)
    counts_B = _compute_neighborhood(pos_B, labels_B, celltypes, radius)
    neighbor_diff = counts_B - counts_A

    progress = tqdm(range(niter))
    n_workers = multiprocessing.cpu_count()
    random_freq = Parallel(n_jobs=n_workers)(
        delayed(_compare_neighbor_shuffled)(pos_A, pos_B, labels_A, labels_B, celltypes, radius)
        for _ in progress
    )
    print(len(random_freq))

    # stack null matrices along a third axis: (n_types, n_types, niter)
    null_stack = np.dstack(random_freq)
    zs = (neighbor_diff - null_stack.mean(2)) / np.std(null_stack, 2)
    at_least_as_extreme = np.abs(neighbor_diff)[:, :, np.newaxis] <= np.abs(null_stack)
    pval = at_least_as_extreme.sum(axis=2) / niter
    return neighbor_diff, zs, pval, random_freq
262 | 
263 | # algorithm:
264 | # - for each cell - cell pair
265 | #   - select all neighbors of a cell 
266 | #   - compute average expression of all genes for neighbors
267 | #   - compute average expression for all cells that aren't neighbors
268 | #   - find difference
269 | # - shuffle neighbor/not neighbor identities
def identify_nearest_neighbors(X,Y,dist_thresh, min_dist_thresh=0):
    """
    Return the sorted, unique indices of points in Y that lie within
    `dist_thresh` of at least one point in X.

    Pairs at distance <= `min_dist_thresh` are ignored, to avoid
    contamination by stray counts from directly adjacent cells. An empty
    array is returned when either X or Y has no rows.
    """
    if X.shape[0] == 0 or Y.shape[0] == 0:
        return np.array([])

    hits, hit_dists = KDTree(Y).query_radius(X, r=dist_thresh, count_only=False, return_distance=True)
    flat_idx = np.hstack(hits)
    flat_dists = np.hstack(hit_dists)
    if len(flat_idx) > 0:
        flat_idx = flat_idx[flat_dists > min_dist_thresh]
    return np.unique(flat_idx)
285 | 
def get_nearest_neighbor_dists(X,Y):
    """
    For each point in X, query a KD-tree built on Y for its single nearest
    neighbor. Returns (distances, indices into Y) as given by `KDTree.query`.
    """
    nearest_dist, nearest_idx = KDTree(Y).query(X, k=1)
    return nearest_dist, nearest_idx
290 | 
291 | 
def _compute_neighborhood_expr(pos, expr, labels, celltypes, radius):
    """
    Ratio of mean expression between "neighbor" and "non-neighbor" rows for
    every ordered pair of distinct cell types.

    Parameters
    ----------
    pos : cell coordinates, one row per cell.
    expr : expression matrix; its second dimension sizes the last output axis.
    labels : cell-type label per cell.
    celltypes : cell types to evaluate.
    radius : neighbor distance threshold.

    Returns
    -------
    Array of shape (n_types, n_types, expr.shape[1]); the diagonal stays 0.

    NOTE(review): `identify_nearest_neighbors(curr_X, curr_Y, ...)` returns
    indices into curr_Y, but they are complemented against
    range(curr_X.shape[0]) and then used to index the full `expr` matrix.
    Confirm which cell set these indices are meant to address.
    """
    expr_diff = np.zeros((len(celltypes), len(celltypes), expr.shape[1]))
    for i, c1 in enumerate(celltypes):
        curr_X = pos[labels==c1]
        print(c1, curr_X.shape[0])
        for j, c2 in enumerate(celltypes):
            if i != j:
                curr_Y = pos[labels==c2]
                # builtin int: the `np.int` alias no longer exists in current NumPy
                neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius).astype(int)
                # sorted indices in [0, n_X) that are not neighbors
                not_neighbors_X = np.setdiff1d(np.arange(curr_X.shape[0]), neighbors_X)
                expr_diff[i,j,:] = expr[neighbors_X,:].mean(0)/expr[not_neighbors_X,:].mean(0)
    return expr_diff
310 | 
def _compute_neighbor_shuffled_expr(pos, expr, labels, celltypes, radius):
    """
    One null draw for the neighborhood expression analysis: for every ordered
    pair of distinct cell types, randomly reassign which indices count as
    "neighbor" vs "non-neighbor" (keeping the group sizes) and return the
    difference of mean expression between the two groups.

    Returns an array of shape (n_types, n_types, expr.shape[1]).

    NOTE(review): this null statistic is a difference of means, whereas
    `_compute_neighborhood_expr` computes a ratio of means -- confirm that
    the two are meant to be compared.
    """
    # shuffle label
    expr_diff = np.zeros((len(celltypes), len(celltypes), expr.shape[1]))
    for i, c1 in enumerate(celltypes):
        curr_X = pos[labels==c1]
        for j, c2 in enumerate(celltypes):
            if i != j:
                curr_Y = pos[labels==c2]
                neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius)
                n_neighbors = len(neighbors_X)
                #print
                #print(curr_X.shape)
                #print(neighbors_X.shape)
                # indices in [0, n_X) not reported as neighbors
                not_neighbors_X = np.array([i for i in np.arange(curr_X.shape[0]) if i not in neighbors_X])
                n_not_neighbors = len(not_neighbors_X)

                #print(neighbors_X)
                #print(not_neighbors_X)
                # pool both index sets, permute, then split back at the original
                # neighbor-group size so only group membership is randomized
                combined_neighbors = np.hstack((neighbors_X, not_neighbors_X))
                combined_neighbors = combined_neighbors[np.random.permutation(len(combined_neighbors))]
                neighbors_X = combined_neighbors[:n_neighbors]
                not_neighbors_X = combined_neighbors[n_neighbors:]
                expr_diff[i,j,:] = expr[neighbors_X,:].mean(0) - expr[not_neighbors_X,:].mean(0) #/curr_X.shape[0]
    return expr_diff
335 | 
def bootstrap_expr_diff(X,Y,n=1000):
    """
    One-sided permutation test on the difference of means of X and Y.

    Group membership is shuffled `n` times over the pooled data (group sizes
    are kept). Returns `(obs_diff, pval)` where `obs_diff` is
    mean(X) - mean(Y) and `pval` is the fraction of shuffled differences that
    are greater than or equal to `obs_diff`.
    """
    pooled = np.concatenate((X, Y))
    group = np.concatenate((np.zeros(len(X)), np.ones(len(Y))))
    obs_diff = np.mean(X) - np.mean(Y)

    def _shuffled_diff():
        shuffled_group = group[np.random.permutation(len(group))]
        return np.mean(pooled[shuffled_group == 0]) - np.mean(pooled[shuffled_group == 1])

    null_diffs = np.array([_shuffled_diff() for _ in range(n)])
    return obs_diff, np.sum(obs_diff <= null_diffs) / n
347 | 
def compute_celltype_neighborhood_regression(A, celltype_key, source, celltypes=None,min_radiu=0, obs_keys=None):
    """
    Pair every cell with its nearest `source`-type cell and that cell's values.

    Parameters
    ----------
    A : object with `X`, `obs` and `obsm['spatial']`.
    celltype_key : column of `obs` holding the cell-type label.
    source : cell type whose values are looked up.
    celltypes : cell types to iterate over; defaults to the sorted unique labels.
    min_radiu : unused; kept (with its original spelling) for compatibility.
    obs_keys : if given, values are taken from these `obs` columns instead of `A.X`.

    Returns
    -------
    dict mapping each cell type to `(dists, values)`: for every cell of that
    type, the distance to its nearest `source` cell and the values of that
    `source` cell, indexed by the array returned from the k=1 tree query.
    """
    if obs_keys is None:
        expr = A.X
    else:
        expr = np.array(A.obs.loc[:,obs_keys].values)
    if celltypes is None:
        celltypes = list(sorted(A.obs[celltype_key].unique()))
    pos = A.obsm['spatial']
    labels = A.obs[celltype_key]
    # get all the cells of the source type
    curr_X = pos[labels==source]
    curr_expr = expr[labels==source]
    interactions = {}
    for c1 in celltypes:
        # find all the cells of the neighboring type
        curr_Y = pos[labels==c1]
        # nearest source cell (tree built on curr_X) for each cell in curr_Y
        dists, idx = get_nearest_neighbor_dists(curr_Y, curr_X)
        interactions[c1] = (dists, curr_expr[idx])
    return interactions
370 | 
371 | import scipy
def compute_celltype_neighborhood_ttest_single(A, celltype_key, source, celltypes=None, min_radius=15, radius=150, far_radius=250, niter=500, obs_keys=None,use_ttest=False,spatial_jitter=False):
    """
    Test whether `source` cells near a given cell type differ from `source`
    cells that are farther away.

    For each cell type c, `source` cells are split into "near" (within
    (min_radius, radius] of some c cell) and "far" (within
    (radius, far_radius] of some c cell and not near). The two groups are
    compared with `scipy.stats.ttest_ind` when `use_ttest`, otherwise with
    `bootstrap_expr_diff`.

    Parameters
    ----------
    A : object with `X`, `obs` and `obsm['spatial']`.
    celltype_key : column of `obs` holding the cell-type label.
    source : cell type whose values are tested.
    celltypes : neighboring cell types; defaults to the sorted unique labels.
    min_radius, radius, far_radius : distance bands defining near / far.
    niter : unused.
    obs_keys : if given, values come from these `obs` columns instead of `A.X`.
    use_ttest : choose the t-test instead of the permutation test.
    spatial_jitter : not implemented; when True no test is run and both
        outputs stay all-zero.

    Returns
    -------
    (tstats, pvals), each of shape (n_types, n_features). Rows for cell types
    without both a near and a far group get statistic 0 and p-value 1.
    """
    if obs_keys is None:
        expr = A.X
    else:
        expr = np.array(A.obs.loc[:,obs_keys].values)
    if celltypes is None:
        celltypes = list(sorted(A.obs[celltype_key].unique()))
    pos = A.obsm['spatial']
    labels = A.obs[celltype_key]
    tstats = np.zeros((len(celltypes), expr.shape[1]))
    pvals = np.zeros((len(celltypes), expr.shape[1]))
    # get all the cells of the source type
    curr_X = pos[labels==source]
    curr_expr = expr[labels==source]
    for i, c1 in enumerate(celltypes):
        # find all the cells of the neighboring type
        curr_Y = pos[labels==c1]
        # indices into curr_X of source cells near / far from cells of type c1
        # (builtin int: the `np.int` alias no longer exists in current NumPy)
        neighbors_X = identify_nearest_neighbors(curr_Y, curr_X, dist_thresh=radius, min_dist_thresh=min_radius).astype(int)
        far_neighbors_X = identify_nearest_neighbors(curr_Y, curr_X, dist_thresh=far_radius, min_dist_thresh=radius).astype(int)
        # far-band cells that are not also in the near band
        not_neighbors_X = np.setdiff1d(far_neighbors_X, neighbors_X)

        if not spatial_jitter:
            if len(neighbors_X) > 0 and len(not_neighbors_X) > 0:
                mean_nbor = np.mean(curr_expr[neighbors_X])
                mean_not_nbor = np.mean(curr_expr[not_neighbors_X])
                print("X=%s, Y=%s, curr_X=%d, curr_Y=%d, nbor_X=%d, not_nbor_X=%d, mean_nbor_X=%0.04f, mean_not_nbor_X=%0.04f" % (c1, source, curr_X.shape[0], curr_Y.shape[0], len(neighbors_X), len(not_neighbors_X), mean_nbor, mean_not_nbor))
                if use_ttest:
                    ttest = scipy.stats.ttest_ind(curr_expr[neighbors_X], curr_expr[not_neighbors_X])
                else:
                    ttest = bootstrap_expr_diff(curr_expr[neighbors_X], curr_expr[not_neighbors_X])
                tstats[i] = ttest[0]
                pvals[i] = ttest[1]
            else:
                pvals[i] = 1
                tstats[i] = 0
        else:
            # placeholder: jitter cells in space, then compute the gene
            # expression distribution (not implemented)
            pass
    return tstats, pvals
415 | 
416 | 
def compute_celltype_neighborhood_ttest(A, celltype_key, celltypes=None, min_radius=15, radius=150, far_radius=150, niter=500, obs_keys=None):
    """
    Per-feature t-tests of near vs far cells for every pair of cell types.

    For cell types (c1, c2), cells of type c2 are split into "near" (within
    (min_radius, radius] of some c1 cell) and "far" (within
    (radius, far_radius] of some c1 cell and not near); each feature is then
    compared between the groups with `scipy.stats.ttest_ind`.

    Parameters
    ----------
    A : object with `X`, `obs` and `obsm['spatial']`.
    celltype_key : column of `obs` holding the cell-type label.
    celltypes : cell types to evaluate; defaults to the sorted unique labels.
    min_radius, radius, far_radius : distance bands defining near / far.
    niter : unused.
    obs_keys : if given, values come from these `obs` columns instead of `A.X`.

    Returns
    -------
    (tstats, pvals), each of shape (n_types, n_types, n_features). Pairs
    without both groups, and all same-type pairs, get statistic 0 and
    p-value 1.

    NOTE(review): with the defaults, far_radius == radius, so the far band
    (radius, far_radius] is empty and every pair falls into the
    "statistic 0, p-value 1" branch. Callers presumably need to pass a
    larger `far_radius` -- confirm the intended default.
    """
    if obs_keys is None:
        expr = A.X
    else:
        expr = np.array(A.obs.loc[:,obs_keys].values)
    if celltypes is None:
        celltypes = list(sorted(A.obs[celltype_key].unique()))
    pos = A.obsm['spatial']
    labels = A.obs[celltype_key]
    tstats = np.zeros((len(celltypes), len(celltypes), expr.shape[1]))
    pvals = np.zeros((len(celltypes), len(celltypes), expr.shape[1]))
    for i, c1 in enumerate(celltypes):
        curr_X = pos[labels==c1]
        print(c1, curr_X.shape[0])
        for j, c2 in enumerate(celltypes):
            curr_Y = pos[labels==c2]
            curr_expr = expr[labels==c2,:]
            # neighbors_X indexes into Y
            # (builtin int: the `np.int` alias no longer exists in current NumPy)
            neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius, min_dist_thresh=min_radius).astype(int)
            far_neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=far_radius, min_dist_thresh=radius).astype(int)
            # far-band cells that are not also in the near band
            not_neighbors_X = np.setdiff1d(far_neighbors_X, neighbors_X)
            if len(neighbors_X) > 0 and len(not_neighbors_X) > 0:
                print("X=%s, Y=%s, curr_X=%d, curr_Y=%d, nbor_X=%d, not_nbor_X=%d, max_nbor_X=%d, max_not_nbor_X=%d" % (c1, c2, curr_X.shape[0], curr_Y.shape[0], len(neighbors_X), len(not_neighbors_X), neighbors_X.max(), not_neighbors_X.max()))
                # slice the two groups once instead of once per feature
                near_expr = curr_expr[neighbors_X,:]
                far_expr = curr_expr[not_neighbors_X,:]
                for k in range(expr.shape[1]):
                    ttest = scipy.stats.ttest_ind(near_expr[:,k], far_expr[:,k])
                    tstats[i,j,k] = ttest[0]
                    pvals[i,j,k] = ttest[1]
            else:
                pvals[i,j,:] = 1
                tstats[i,j,:] = 0
            # a cell type is never tested against itself
            if i == j:
                pvals[i,j,:] = 1
                tstats[i,j,:] = 0
    return tstats, pvals
460 | 
def compute_celltype_neighborhood_expr(A, celltype_key, celltypes=None, radius=150, niter=500):
    """
    Neighbor vs non-neighbor expression statistic for all cell-type pairs.

    Parameters
    ----------
    A : object with `X`, `obs[celltype_key]` and `obsm['spatial']`.
    celltypes : cell types to evaluate; defaults to the sorted unique labels.
    radius : neighbor distance threshold.
    niter : currently unused (see below).

    Returns
    -------
    (expr_diff, zs, pval), each of shape (n_types, n_types, n_genes).
    `expr_diff` comes from `_compute_neighborhood_expr`. The permutation null
    that used to fill `zs` and `pval` is disabled, so both are returned as
    all-zero placeholders.
    """
    expr = A.X
    if celltypes is None:
        celltypes = list(sorted(A.obs[celltype_key].unique()))
    pos = A.obsm['spatial']
    labels = A.obs[celltype_key]
    expr_diff = _compute_neighborhood_expr(pos, expr, labels, celltypes, radius)
    # Placeholders: no shuffled null is computed, so there is nothing to
    # z-score against and no p-value to report.
    zs = np.zeros_like(expr_diff)
    pval = np.zeros_like(expr_diff)
    return expr_diff, zs, pval
485 | 
def quantify_clust_spatial_enrichment(A,uniq_clusts=None,clust_key='clust_annot', normalize=True):
    """
    Tabulate how many cells of each cluster fall in each spatial domain.

    Spatial domains are the integer values 0..max of
    `A.obs.spatial_clust_annots_value`.

    Parameters
    ----------
    A : object with `obs` and boolean-mask row indexing.
    uniq_clusts : clusters to count; defaults to the sorted unique values of
        `obs[clust_key]`.
    clust_key : column of `obs` holding the cluster label.
    normalize : unused.

    Returns
    -------
    (clust_counts, clust_avgs): arrays of shape (n_domains, n_clusters) with
    raw counts, and counts divided by the number of cells in the domain.
    """
    if uniq_clusts is None:
        uniq_clusts = sorted(A.obs[clust_key].unique())
    n_spatial_domains = A.obs.spatial_clust_annots_value.max() + 1
    clust_counts = np.zeros((n_spatial_domains, len(uniq_clusts)))
    print(clust_counts.shape)
    for domain in range(n_spatial_domains):
        domain_clusts = A[A.obs.spatial_clust_annots_value==domain,:].obs[clust_key]
        for col, clust in enumerate(uniq_clusts):
            clust_counts[domain,col] = np.sum(domain_clusts==clust)

    clust_avgs = clust_counts.copy()
    for domain in range(clust_avgs.shape[0]):
        domain_size = np.sum(A.obs.spatial_clust_annots_value==domain)
        clust_avgs[domain,:] = clust_avgs[domain,:] / domain_size
    return clust_counts, clust_avgs
500 | 


--------------------------------------------------------------------------------
/python/utils.py:
--------------------------------------------------------------------------------
 1 | from statsmodels.stats.multitest import multipletests
 2 | import numpy as np
 3 | 
 4 | def fdr_correct(X):
 5 |     new_X = np.zeros_like(X)
 6 |     for i in range(X.shape[-1]):
 7 |         pvals = multipletests(X[i,:],method='fdr_bh')[0]
 8 |         new_X[i,:] = multipletests(X[i,:],method='fdr_bh')[0]
 9 |         new_X[:,i] = new_X[i,:]
10 |     #X = multipletests(X.flatten(), method='fdr_bh')[0]
11 |     return new_X#X.reshape(X_shape)
12 | 
13 | from scipy.spatial.distance import pdist
14 | import scipy.cluster.hierarchy as hc
15 | 
16 | 
def order_values(X, metric='correlation', return_linkage=False):
    """
    Order the rows of X by complete-linkage hierarchical clustering.

    Pairwise distances are computed with `pdist(X, metric)`; infinite or NaN
    distances are replaced by 0 before clustering with optimal leaf ordering.

    Returns the leaf order as an integer array, or `(leaf_order, linkage)`
    when `return_linkage` is True.
    """
    dists = pdist(X, metric)
    dists[~np.isfinite(dists)] = 0
    linkage = hc.linkage(dists, 'complete', optimal_ordering=True)
    leaf_order = np.array(hc.dendrogram(linkage, no_plot=True)['leaves'])
    if return_linkage:
        return leaf_order, linkage
    return leaf_order
27 | 
def relabel_clust(A, orig_clust, new_clust,key="clust_annot"):
    """
    Rename one cluster label in `A.obs[key]`, in place.

    Every entry equal to `orig_clust` is replaced by `new_clust`; all other
    entries are kept. The column is written back as a plain list and `A` is
    returned for convenience.

    The replacement is done on Python objects rather than a fixed-width NumPy
    string array, so a `new_clust` longer than the existing labels is stored
    in full instead of being truncated.
    """
    A.obs[key] = [new_clust if label == orig_clust else label for label in A.obs[key]]
    return A
33 | 
def relabel_anatomy(A, annot_old, annot_new):
    """
    Rename a spatial annotation and rebuild its integer code column.

    `annot_old` is replaced by `annot_new` in `A.obs['spatial_clust_annots']`
    via `relabel_clust`; `A.obs['spatial_clust_annots_value']` is then
    regenerated from a fixed name-to-code table, with None for any
    annotation not in the table. Returns `A`.
    """
    region_codes = {
        'Pia' : 0,
        'Cortex':1,
        'LatSept':2,
        'CC':3,
        'Striatum':4,
        'Ventricle':5
        }
    A = relabel_clust(A, annot_old, annot_new, key='spatial_clust_annots')
    A.obs['spatial_clust_annots_value'] = [region_codes.get(name) for name in A.obs.spatial_clust_annots]
    return A
46 | 
def relabel_all_clusts(A, clust_mapping,key='clust_annot'):
    """
    Apply a {old_label: new_label} mapping to `A.obs[key]`, in place.

    All replacements are matched against the original labels, so mappings
    are not chained (e.g. {'a': 'b', 'b': 'a'} swaps the two labels).
    Returns `A`.
    """
    before = np.array(A.obs[key].copy())
    # compute every mask against the untouched labels first
    replacements = [(before == old_name, new_name) for old_name, new_name in clust_mapping.items()]
    after = np.array(before.copy())
    for mask, new_name in replacements:
        after[mask] = new_name
    A.obs[key] = list(after.copy())
    return A
54 | 
def cleanup_section(A_section,n_neighbors=25):
    """
    Smooth spatial-domain codes with a k-nearest-neighbor vote.

    A uniform-weight KNN classifier is fit on the section's own
    `obsm['spatial']` coordinates and `obs.spatial_clust_annots_value`
    codes, then used to re-predict every cell. Predictions are stored in
    `obs['smoothed_spatial_clust_annot_values']` and the section is returned.

    Side effect: reseeds NumPy's global random generator with 31415.
    """
    np.random.seed(31415)
    from sklearn.neighbors import KNeighborsClassifier
    coords = A_section.obsm['spatial']
    domain_codes = A_section.obs.spatial_clust_annots_value
    knn = KNeighborsClassifier(n_jobs=-1,n_neighbors=n_neighbors,weights='uniform')
    knn = knn.fit(coords, domain_codes)
    A_section.obs['smoothed_spatial_clust_annot_values'] = list(knn.predict(coords))
    return A_section


--------------------------------------------------------------------------------