├── README.md ├── notebooks ├── aging_10x_analysis.ipynb ├── aging_MERFISH_gene_selection.ipynb ├── aging_analyze_integrated10x.ipynb ├── aging_call_final_celltypes_10x.ipynb ├── analyze_merfish_celltyping.ipynb ├── int_analysis.ipynb ├── merfish_LPS_analysis.ipynb ├── merfish_integrate_LPS_data.ipynb └── merfish_spatial_celltype_org.ipynb └── python ├── de.py ├── find_merfish_markers.py ├── integration.py ├── plotting.py ├── spatial_analysis.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # SpatialBrainAgingCell22 2 | Analysis software for brain aging spatial and single-cell transcriptomics. 3 | -------------------------------------------------------------------------------- /notebooks/aging_10x_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "%config InlineBackend.figure_format='retina'\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import scanpy as sc\n", 24 | "import pandas as pd\n", 25 | "import anndata as ad\n", 26 | "import seaborn as sns" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\n", 36 | "sc.settings.set_figure_params(dpi=80, facecolor='white')\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "mouse_colors = plt.cm.colors.ListedColormap(['red', 'darkred', 'blue','darkblue', 'orange', 'darkorange', 'violet', 'darkviolet',])\n", 46 | "\n", 47 | "samples = [\n", 48 | " \"Hyp_4wk_1_matrix.h5\",\n", 49 | " \"Hyp_4wk_2_matrix.h5\",\n", 50 | " \"Hyp_4wk_3_matrix.h5\",\n", 51 | " \"Hyp_4wk_4_matrix.h5\",\n", 52 | " \"Hyp_90wk_1_matrix.h5\",\n", 53 | " \"Hyp_90wk_2_matrix.h5\",\n", 54 | " \"Hyp_90wk_3_matrix.h5\",\n", 55 | " \"Hyp_90wk_4_matrix.h5\",\n", 56 | " \"PFC_4wk_1_matrix.h5\",\n", 57 | " \"PFC_4wk_2_matrix.h5\",\n", 58 | " \"PFC_4wk_3_matrix.h5\",\n", 59 | " \"PFC_4wk_4_matrix.h5\",\n", 60 | " \"PFC_90wk_1_matrix.h5\",\n", 61 | " \"PFC_90wk_2_matrix.h5\",\n", 62 | " \"PFC_90wk_3_matrix.h5\",\n", 63 | " \"PFC_90wk_4_matrix.h5\"\n", 64 | "]\n", 65 | "\n", 66 | "mouse_id = {\n", 67 | " 0 : 1,\n", 68 | " 1 : 1,\n", 69 | " 2 : 2,\n", 70 | " 3 : 2,\n", 71 | " 4 : 3,\n", 72 | " 5 : 3,\n", 73 | " 6 : 4,\n", 74 | " 7 : 4,\n", 75 | " 8 : 5,\n", 76 | " 9 : 5,\n", 77 | " 10 : 6,\n", 78 | " 11 : 6,\n", 79 | " 12 : 7,\n", 80 | " 13 : 7,\n", 81 | " 14 : 8,\n", 82 | " 15 : 8\n", 83 | "}" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "all_adata = []\n", 93 | "i = 0\n", 94 | "for s in samples:\n", 95 | " area, age, idx, _ = s.split(\"_\")\n", 96 | " print(area, age, idx)\n", 97 | " curr_adata = sc.read_10x_h5(f\"/faststorage/brain_aging/aging10x/{s}\")\n", 98 | " curr_adata.var_names_make_unique()\n", 99 | " curr_adata.obs['area'] = area\n", 100 | " curr_adata.obs['age'] = age\n", 101 | " curr_adata.obs['idx'] = i\n", 102 | " i += 1\n", 103 | " curr_adata.var['mt'] = curr_adata.var_names.str.startswith('mt-') # annotate the group of mitochondrial genes as 'mt'\n", 104 | " sc.pp.calculate_qc_metrics(curr_adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\n", 105 | "\n", 106 | " all_adata.append(curr_adata)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "total_cells = np.sum([a.n_obs for a in all_adata])\n", 116 | "print('total cells:', total_cells)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "adata = ad.concat(all_adata)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "adata[adata.obs.area=='PFC']" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "adata.obs_names_make_unique()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "sc.pp.filter_cells(adata, min_genes=1000)\n", 153 | "sc.pp.filter_cells(adata, max_counts=100000)\n", 154 | "sc.pp.filter_genes(adata, min_cells=3)\n", 155 | "sc.pp.filter_cells(adata, min_counts=2500)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "adata" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "adata.obs['mouse_id'] = [mouse_id[i] for i in adata.obs.idx]" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "# run scrublet on adata to identify doublets\n", 183 | "import scrublet as scr\n", 184 | "scrub = scr.Scrublet(adata.X, expected_doublet_rate=0.09)\n", 185 | "doublet_scores, predicted_doublets = scrub.scrub_doublets(min_gene_variability_pctl=85, \n", 186 | " n_prin_comps=30)\n", 187 | "\n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "scrub.plot_histogram();\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "np.sum(predicted_doublets)/len(doublet_scores)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "adata = adata[~predicted_doublets,:]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "adata" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "sc.pl.highest_expr_genes(adata, n_top=20, )" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "adata.write(\"adata_combined_nodoublet.h5ad\")" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "\n", 251 | "sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],\n", 252 | " jitter=0.4, multi_panel=True,size=0.25)\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "print(np.median(adata.obs.n_genes_by_counts))" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "print(np.median(adata.obs.total_counts))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "adata" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "#adata = adata[adata.obs.n_genes_by_counts < 3000, :]\n", 289 | "#adata = adata[adata.obs.pct_counts_mt < 5, :]\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "sc.pp.normalize_total(adata, target_sum=1e4)\n", 299 | "\n", 300 | "sc.pp.log1p(adata)\n", 301 | "\n", 302 | "sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "sc.pl.highly_variable_genes(adata)\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "adata.raw = adata\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "adata = adata[:, adata.var.highly_variable]\n" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])\n" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "sc.pp.scale(adata, max_value=10)\n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "sc.tl.pca(adata, svd_solver='arpack')\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "sc.pl.pca(adata)\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "sc.pl.pca_variance_ratio(adata, log=True,n_pcs=50)\n" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)\n" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "sc.tl.umap(adata)\n" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "sc.tl.leiden(adata,resolution=0.2)\n" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "sc.pl.pca(adata,color=['leiden','age','idx'],color_map=plt.cm.rainbow)\n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "sc.pl.umap(adata, color=['leiden','n_genes','total_counts'],color_map=plt.cm.viridis)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "adata.write(\"adata_combined_nodoublet_normalized.h5ad\")" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "# 1. Merge clusters into neurons and not neurons" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "gene_ids = adata.raw.var_names\n", 445 | "ens_idx = np.in1d(gene_ids, 'Snap25')" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "adata.raw.var" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "adata.obs['Snap25'] = adata.raw.X[:,ens_idx].mean(1)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "plt.hist(adata.obs.groupby('leiden')['Snap25'].apply(np.mean).to_numpy(),100)\n", 473 | "plt.axvline(1.25)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "is_cluster_neuronal = (adata.obs.groupby('leiden')['Snap25'].apply(np.mean).to_numpy()>1.25)\n", 483 | "neuronal_map = dict(zip([str(i) for i in range(len(is_cluster_neuronal))],is_cluster_neuronal))" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "# computer cluster mean expression for each gene\n", 493 | "adata.obs['neuronal'] = [neuronal_map[i] for i in adata.obs.leiden]\n" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "sc.pl.umap(adata,color=['neuronal','Snap25'])" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "sc.pl.dotplot(adata, ['Cx3cr1', 'Aldh1l1','Olig1','Cspg4', 'Snap25', 'Gad1', 'Slc17a6', 'Slc17a7'],groupby='leiden')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "# subset by neurons in PFC and Hyp\n", 521 | "adata_neuronal = adata[adata.obs.neuronal].copy()\n", 522 | "adata_neuronal = adata_neuronal.raw.to_adata()\n" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "adata_neuronal_pfc = adata_neuronal[adata_neuronal.obs.area == 'PFC'].copy()\n", 532 | "adata_neuronal_hyp = adata_neuronal[adata_neuronal.obs.area == 'Hyp'].copy()\n" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "# 2. Cluster neurons" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## 2.1 Cluster PFC neurons" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "def reprocess_subset(A,res=0.7):\n", 556 | " # assumes data have already been normalized/log transformed\n", 557 | " print('finding highly variable genes')\n", 558 | " sc.pp.highly_variable_genes(A, min_mean=0.0125, max_mean=3, min_disp=0.5)\n", 559 | " A.raw = A\n", 560 | " A = A[:, A.var.highly_variable]\n", 561 | " print('regressing out')\n", 562 | " sc.pp.regress_out(A, ['total_counts', 'pct_counts_mt'])\n", 563 | " print('scaling')\n", 564 | " sc.pp.scale(A, max_value=10)\n", 565 | " print('pca')\n", 566 | " sc.tl.pca(A, svd_solver='arpack')\n", 567 | " print('neighbors')\n", 568 | " sc.pp.neighbors(A, n_neighbors=10, n_pcs=50)\n", 569 | " print('umap')\n", 570 | " sc.tl.umap(A)\n", 571 | " print('leiden')\n", 572 | " sc.tl.leiden(A,resolution=res)\n", 573 | " return A" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "adata_neuronal_pfc = reprocess_subset(adata_neuronal_pfc)" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "sc.pl.umap(adata_neuronal_pfc, color=['age'])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "sc.external.pp.bbknn(adata_neuronal_pfc,batch_key='age')\n", 601 | "sc.tl.leiden(adata_neuronal_pfc,resolution=0.6)\n", 602 | "sc.tl.umap(adata_neuronal_pfc)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "sc.pl.umap(adata_neuronal_pfc, color=['leiden','age','mouse_id'],color_map=mouse_colors)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "sc.pl.umap(adata_neuronal_pfc, color='age')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "sc.pl.umap(adata_neuronal_pfc, \n", 630 | " color=['Slc17a7','Gad1','Drd1','Drd2','Sst','Vip','Pvalb',\n", 631 | " 'Cux1','Tshz2','Cd44','Vegfd','Pld5','Otof','Npr3'],\n", 632 | " use_raw=True)" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "sc.tl.rank_genes_groups(adata_neuronal_pfc, 'leiden', method='wilcoxon')\n", 642 | "#sc.pl.rank_genes_groups(adata_neuronal_pfc, n_genes=25, sharey=False)\n" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "sc.tl.filter_rank_genes_groups(adata_neuronal_pfc, min_fold_change=1.5)\n", 652 | "sc.pl.rank_genes_groups_dotplot(adata_neuronal_pfc, key='rank_genes_groups_filtered')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "sc.pl.rank_genes_groups_heatmap(adata_neuronal_pfc,n_genes=5,groupby='leiden',show_gene_labels=True)" 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "## 2.2 Cluster hypothalamus neurons\n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "adata_neuronal_hyp = reprocess_subset(adata_neuronal_hyp)" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": {}, 684 | "outputs": [], 685 | "source": [ 686 | "#sc.external.pp.bbknn(adata_neuronal_hyp,batch_key='mouse_id')\n", 687 | "#sc.tl.leiden(adata_neuronal_hyp,resolution=0.2)\n", 688 | "#sc.tl.umap(adata_neuronal_hyp)" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "sc.pl.umap(adata_neuronal_hyp, color=['leiden','age','mouse_id'],color_map=mouse_colors)" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "sc.pl.umap(adata_neuronal_hyp, color=['Gad1','Slc17a6','Slc17a7','Gal','Agtr1a','Esr1','Pomc','Agrp','Nxph4','Adcyap1','Oxt'],use_raw=True)\n" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "metadata": {}, 713 | "outputs": [], 714 | "source": [ 715 | "sc.tl.rank_genes_groups(adata_neuronal_hyp, 'leiden', method='t-test')\n" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "sc.tl.filter_rank_genes_groups(adata_neuronal_hyp, min_fold_change=1.5)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "#sc.pl.rank_genes_groups(adata_neuronal_pfc, n_genes=25, sharey=False)\n", 734 | "\n", 735 | "sc.pl.rank_genes_groups_heatmap(adata_neuronal_hyp,n_genes=3,key='rank_genes_groups_filtered',groupby='leiden',show_gene_labels=True)\n" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "sc.pl.rank_genes_groups_dotplot(adata_neuronal_hyp, key='rank_genes_groups_filtered')" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "# 3. Cluster non neurons" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [ 760 | "adata_nonneuronal = adata[~adata.obs.neuronal].copy()\n", 761 | "adata_nonneuronal = adata_nonneuronal.raw.to_adata()\n" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "metadata": {}, 768 | "outputs": [], 769 | "source": [ 770 | "adata_nonneuronal = reprocess_subset(adata_nonneuronal)" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "#sc.external.pp.bbknn(adata_nonneuronal,batch_key='mouse_id')\n", 780 | "#sc.tl.leiden(adata_nonneuronal,resolution=1.2)\n", 781 | "#sc.tl.umap(adata_nonneuronal)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [ 790 | "sc.tl.leiden(adata_nonneuronal,resolution=0.7)\n" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": null, 796 | "metadata": {}, 797 | "outputs": [], 798 | "source": [ 799 | "sc.pl.umap(adata_nonneuronal, color=['leiden'])" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "sc.pl.umap(adata_nonneuronal, color=['leiden','area','age','mouse_id'],color_map=mouse_colors)" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": null, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [ 817 | "sc.pl.umap(adata_nonneuronal, color=['Cdkn2a','Aldh1l1','Cx3cr1','Plp1','Cspg4',\n", 818 | " 'Gfap','Aqp4','Cldn5','Adgrf5'])\n", 819 | "\n" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": {}, 826 | "outputs": [], 827 | "source": [ 828 | "old_to_new = dict(\n", 829 | " old_cluster1='new_cluster1',\n", 830 | " old_cluster2='new_cluster1',\n", 831 | " old_cluster3='new_cluster2',\n", 832 | ")\n", 833 | "adata.obs['new_clusters'] = (\n", 834 | " adata.obs['old_clusters']\n", 835 | " .map(old_to_new)\n", 836 | " .astype('category')\n", 837 | ")\n" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "# Cluster whole dataset" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [ 853 | "sc.external.pp.bbknn(adata,batch_key='mouse_id')\n", 854 | "sc.tl.leiden(adata,resolution=0.2)\n", 855 | "sc.tl.umap(adata)" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "fig = sc.pl.umap(adata, color=['age','area','mouse_id'],color_map=mouse_colors,return_fig=True)\n", 865 | "fig.savefig(\"/Users/wea/src/tithonus/analysis/aging10x/umap.png\",dpi=300,bbox_inches='tight')" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "sc.pl.umap(adata, color=['Cx3cr1', 'Aldh1l1','Olig1','Cspg4', 'Snap25', 'Gad1', 'Slc17a6', 'Slc17a7'],color_map=plt.cm.Reds)" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "metadata": {}, 881 | "outputs": [], 882 | "source": [ 883 | "sc.pl.umap(adata, color=['Vip','Gal','Sst','Cck','Npy','Oxt','Nxph4','Agtr1a','Agrp','Esr1'],cmap=plt.cm.coolwarm,vmin=-5,vmax=5)" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": {}, 897 | "outputs": [], 898 | "source": [ 899 | "sc.pl.umap(adata, color=['C1qa','C3','Itgam','Trem2'],cmap=plt.cm.coolwarm,use_raw=True,vmin=-3,vmax=3)" 900 | ] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": null, 905 | "metadata": {}, 906 | "outputs": [], 907 | "source": [ 908 | "sc.pl.umap(adata, color=['Cdkn2a','C2','C4b','Tspan2','Il33','Aldh1l1','Cd4','Cd74','Agtr1a'],color_map=plt.cm.Reds,use_raw=True)" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "\n", 918 | "sc.pl.umap(adata, color=[i for i in list(adata.raw.var_names) if 'Il' in i],color_map=plt.cm.Reds,use_raw=True)" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": null, 924 | "metadata": {}, 925 | "outputs": [], 926 | "source": [ 927 | "sc.pl.umap(adata, color=[i for i in list(adata.raw.var_names) if 'H2-' in i],color_map=plt.cm.coolwarm,use_raw=True,vmin=-3,vmax=3)" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [ 936 | "sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')\n", 937 | "sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)\n" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": null, 943 | "metadata": {}, 944 | "outputs": [], 945 | "source": [ 946 | "pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)\n" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "metadata": {}, 953 | "outputs": [], 954 | "source": [] 955 | } 956 | ], 957 | "metadata": { 958 | "kernelspec": { 959 | "display_name": "Python 3 (ipykernel)", 960 | "language": "python", 961 | "name": "python3" 962 | }, 963 | "language_info": { 964 | "codemirror_mode": { 965 | "name": "ipython", 966 | "version": 3 967 | }, 968 | "file_extension": ".py", 969 | "mimetype": "text/x-python", 970 | "name": "python", 971 | "nbconvert_exporter": "python", 972 | "pygments_lexer": "ipython3", 973 | "version": "3.7.4" 974 | } 975 | }, 976 | "nbformat": 4, 977 | "nbformat_minor": 4 978 | } 979 | -------------------------------------------------------------------------------- /notebooks/aging_MERFISH_gene_selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import matplotlib.pyplot as plt\n", 21 | "import seaborn as sns\n", 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import scanpy as sc\n", 25 | "import sys" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\n", 35 | "sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False, figsize=(5,5))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from find_merfish_markers import *" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "# Script to select cell type marker genes for MERFISH" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "adata = sc.read(\"/faststorage/brain_aging/rna_analysis/adata_finalclusts_annot.h5ad\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Known markers" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "\n", 77 | "cortex_major_markers = ['Slc17a7', 'Slc32a1', 'Slc30a3', 'Cux2', 'Rorb', 'Sulf2', \n", 78 | " 'Ptpru', 'Car3', 'Fam84b', 'Syt6', 'Nxph4', 'Tshz2', 'Pvalb', 'Sst', 'Vip',\n", 79 | " 'Sncg', 'Lamp5', 'Sox10', 'Pdgfra', 'Aqp4', 'Igf2', 'Ctss', 'Cldn5', 'Flt1', 'Bgn', 'Vtn', 'Gfap',\n", 80 | " 'Gad1', 'Gad2', 'Fn1', 'Myh11', 'Cd24a', 'Selplg', 'Pdgfra', 'Aqp4', 'Mbp', 'Ttyh2',\n", 81 | " \"Crhbp\", \"Cnr1\", \"Cpne5\", \"Crh\", \"Kcnip2\", \"Tbr1\", \"Lamp5\", \"Rorb\", \"Syt6\", \"Aldoc\", \"Gfap\",\n", 82 | " \"Serpinf1\", \"Mfge8\", \"Sox10\", \"Plp1\", \"Pdgfra\", \"Tmem8\", \"Itpr2\", \"Ctps\", \"Bmp4\", \"Anln\",\n", 83 | " \"Hexb\", \"Mrc1\", \"Vtn\", \"Flt1\", \"Apln\", \"Acta2\", \"Ttr\", \"Foxj1\"\n", 84 | " ]\n", 85 | "\n", 86 | "cortex_major_markers.extend([\n", 87 | "\"Acta2\",\"Aqp4\",\"Bgn\", \"Calb2\",\"Car3\", \"Cd14\", \"Chat\", \"Chodl\", \"Chrna2\", \"Cldn5\", \"Crhr2\",\n", 88 | "\"Crispld2\",\"Cspg4\",\"Ctss\",\"Cux2\",\"Egfr\",\"Enpp6\",\"Fam84b\",\"Fezf2\",\"Flt1\",\"Foxp2\",\"Gfap\",\"Hpse\",\"Igf2\",\"Kcnj8\",\n", 89 | "\"Lhx6\",\"Lmo1\",\"Lsp1\",\"Mrc1\",\"Nxph2\",\"Nxph4\",\"Opalin\",\"Osr1\",\"Otof\",\"Pdgfra\",\"Prox1\",\"Rorb\",\"Rspo1\",\"Rxfp1\",\n", 90 | "\"Satb2\",\"Serpinf1\",\"Slc17a6\",\"Slc17a8\",\"Slc30a3\",\"Slc32a1\",\"Sncg\",\"Sox10\",\"Sox6\",\"Sulf1\",\"Syt6\",\"Tcap\",\"Th\",\"Tshz2\",\n", 91 | "\"Vipr2\",\"Vtn\",\"Vip\",\"Sst\",\"Calb1\",\"Gad2\",\"Slc17a7\",\"Lamp5\",\"Gad1\",\"Pvalb\",\"Fezf2\", \"Bcl11b\", \"Npr3\", \"Otof\"\n", 92 | " ])\n", 93 | "\n", 94 | "cortex_major_markers = list(set(cortex_major_markers))\n", 95 | "hypo_major_markers = [\n", 96 | " 'Agtr1a', 'Pomc', 'Oxt', 'Npy', 'Agrp', 'Esr1', 'Slc17a6',\n", 97 | " 'Meis2', 'Th', 'Gpr101', 'Hcrt', 'Nrgn', 'Sst', 'Map1b', 'Nts', 'Pmch', 'Cartpt',\n", 98 | " 'Gpr83', 'Bdnf', 'Otp', 'Calb2', 'Tac1', 'Tac2', 'Calb1', 'Trh', 'Gal', 'Col25a1', 'Synpr'\n", 99 | "]\n", 100 | "\n", 101 | "moffitt_genes = [\n", 102 | " \"Ace2\", \"Adora2a\", \"Aldh1l1\", \"Amigo2\", \"Ano3\", \"Aqp4\", \"Ar\", \"Arhgap36\", \"Avpr1a\", \"Avpr2\", \"Baiap2\", \"Bdnf\", \"Bmp7\", \"Brs3\",\"Calcr\",\"Cbln1\",\"Cbln2\",\"Cckar\",\"Cckbr\",\"Ccnd2\",\"Cd24a\",\"Cdkn1a\",\"Cenpe\",\"Chat\",\"Coch\",\"Col25a1\",\"Cplx3\",\"Cpne5\",\"Creb3l1\",\"Crhbp\",\"Crhr1\",\"Crhr2\",\"Cspg5\",\"Cxcl14\",\"Cyp19a1\",\"Cyp26a1\",\"Cyr61\",\"Dgkk\",\"Ebf3\",\"Egr2\",\"Ermn\",\"Esr1\",\"Etv1\",\"Fbxw13\",\"Fezf1\",\"Fn1\",\"Fst\",\"Gabra1\",\"Gabrg1\"\"Gad1\",\"Galr1\",\"Galr2\",\"Gbx2\",\"Gda\",\"Gem\",\"Gjc3\",\"Glra3\",\"Gpr165\",\"Greb1\",\"Grpr\",\"Htr2c\",\"Igf1r\",\"Igf2r\",\"Irs4\",\"Isl1\",\"Kiss1r\",\"Klf4\",\"Lepr\",\"Lmod1\",\"Lpar1\",\"Man1a\",\"Mc4r\",\"Mki67\",\"Mlc1\",\"Myh11\",\"Ndnf\",\"Ndrg1\",\"Necab1\",\"Nos1\",\"Npas1\",\"Npy1r\",\"Npy2r\",\"Ntng1\",\"Ntsr1\",\"Nup62cl\",\"Omp\",\"Onecut2\",\"Opalin\",\"Oprd1\",\"Oprk1\",\"Oprl1\",\"Oxtr\",\"Pak3\",\"Pcdh11x\",\"Pdgfra\",\"Pgr\",\"Plin3\",\"Pnoc\",\"Pou3f2\",\"Prlr\",\"Ramp3\",\"Rgs2\",\"Rgs5\",\"Rnd3\",\"Rxfp1\",\"Scgn\",\"Selplg\",\"Sema3c\",\"Sema4d\",\"Serpinb1b\",\"Serpine1\",\"Sgk1\",\"Slc15a3\",\"Slc17a6\",\"Slc17a7\",\"Slc17a8\",\"Slc18a2\",\"Slco1a4\",\"Sox4\",\"Sox6\",\"Sox8\",\"Sp9\",\"Synpr\",\"Syt2\",\"Syt4\",\"Sytl4\",\"Tacr1\",\"Tacr3\",\"Tiparp\",\"Tmem108\",\"Traf4\",\"Trhr\",\"Ttn\",\"Ttyh2\",\"Oxt\",\"Penk\",\"Sst\",\"Tac1\",\"Gal\",\"Cartpt\",\"Vgf\",\"Trh\",\"Nts\",\"Scg2\",\"Gnrh1\",\"Tac2\",\"Cck\",\"Crh\",\"Ucn3\",\"Adcyap1\",\"Nnat\",\"Sln\",\"Mbp\",\n", 103 | "\"Th\"\n", 104 | "]\n", 105 | "t_cell_genes = [\n", 106 | " \"Ptprc\", \"Rorc\", \"Gata3\", \"Foxp3\", \"Tbx21\", \"Il2ra\", \"Il7r\", \"Il2rb\", \"Il2rg\", \"Il15ra\", \"Pdcd1\", \"Ctla4\", \"Cd3e\" \n", 107 | "]\n", 108 | "macrophage_genes = [\n", 109 | " \"Spi1\", \"Cx3cr1\", \"Ccr2\", \"Adgr1\", \"Aif1\", \"Csf1r\", \"Trem2\", \"H2-Ab1\", \"Itgae\", \"Clec10a\", \"Itgam\", \"Itgax\"\n", 110 | "]\n", 111 | "bcell_genes = [\"Ms4a1\", \"Cd19\", \"Prdm1\"]\n", 112 | "nkcell_genes = [\"Klrk1\", \"Klrb1\", \"Eomes\", \"Klrg1\"]\n", 113 | "misc_immune = [\"Cxcl9\", \"Cxcl10\", \"Ccl2\", \"Cd1d1\", \"Fcer1a\", \"Fcgr1\", \"Cr2\", \"Cd47\"]\n", 114 | "innate_bacterial = ['Il1b', 'Tnf', 'Il6', 'Ptges2']\n", 115 | "innate_viral = ['Ifna12', 'Ifna16', 'Ifna2']\n", 116 | "th1 = ['Ifng', 'Il12a']\n", 117 | "th2 = ['Il4', 'Il5', 'Il13']\n", 118 | "th17 = ['Il17a', 'Il17f', 'Il22', 'Il23a']\n", 119 | "treg = ['Il10', 'Tgfb2', 'Tgfb1', 'Tgfb3']\n", 120 | "other_immune = [\"Nfkb1\", \"Nfkbia\", \"Irf3\", \"Nlrp3\", \"Irf7\", \"Gsdmd\", \"Il18\"]\n", 121 | "\n", 122 | "minimal_aging = [\"C3\", \"C4b\", \"Il33\",\"Tnf\",\"Cdkn2a\", \"Cdkn2b\", 'B2m', 'C1qa', 'C1qc', 'C4b', 'Ctss', 'Gbp6', 'Gbp10', 'Ifi44', 'Ifit3', 'Ifitm3', 'Itgb2', 'Parp14', 'Serpina3n', 'Tap1', 'Trim30a']" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "known_markers_pfc = np.unique(np.concatenate([\n", 132 | " cortex_major_markers,\n", 133 | " t_cell_genes, macrophage_genes, bcell_genes, nkcell_genes, \n", 134 | " misc_immune, innate_bacterial, innate_viral, \n", 135 | " th1, th2, th17, treg, other_immune,\n", 136 | " minimal_aging\n", 137 | "]))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "# Select MERFISH genes for cell type markers in PFC " 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "adata_pfc = adata[adata.obs.area == \"PFC\"]" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Meng/Stephen approach: for pairs of clusters, compute differential expression\n", 163 | "adata_raw = adata_pfc.raw.to_adata()\n", 164 | "adata_raw = adata_raw[:, adata_raw.var.highly_variable]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "scrolled": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "# pairwise\n", 176 | "de_subclusts = compute_pairwise_de_for_clusts(adata_raw, \"clust_label\",n_de=10)\n", 177 | "de_majorclusts = compute_pairwise_de_for_clusts(adata_raw, \"cell_type\",n_de=10)\n", 178 | "\n", 179 | "# one vs all\n", 180 | "minorclusts_onevsall = compute_onevsall_de_for_clusts(adata_raw, 'clust_label',n_de=15)\n", 181 | "majorclusts_onevsall = compute_onevsall_de_for_clusts(adata_raw, 'cell_type',n_de=15)\n" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "#de_minorclust_pairwise = greedily_select_markers(de_subclusts, 1, pairwise=True,de_marker_genes=known_markers_pfc)\n", 191 | "#de_majorclust_pairwise = greedily_select_markers(de_majorclusts, 5, pairwise=True,de_marker_genes=known_markers_pfc)\n", 192 | "de_minorclusts_onevsall = greedily_select_markers(minorclusts_onevsall, 2, pairwise=False, de_marker_genes=known_markers_pfc)\n", 193 | "de_majorclusts_onevsall = greedily_select_markers(majorclusts_onevsall, 2, pairwise=False, de_marker_genes=known_markers_pfc)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "# List of known markers" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "de_combined = list(np.unique( list(de_minorclusts_onevsall) + list(de_majorclusts_onevsall)))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "de_combined" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "adata_raw = adata_pfc.raw.to_adata()\n", 228 | "\n", 229 | "de_combined = [i for i in de_combined if i in adata_raw.var_names]" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "len(de_combined)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "# load per cluster markers and take top N\n", 248 | "#seurat_clust_markers = pd.read_csv(\"gene_lists/all_clust_markers.csv\")\n", 249 | "#min_marker_genes = 4\n", 250 | "#seurat_de_marker_genes = set()\n", 251 | "#for n,i in enumerate(clust_labels_uniq):\n", 252 | "# curr_contrast = seurat_clust_markers[seurat_clust_markers.cluster==i].sort_values('avg_log2FC', ascending=False)\n", 253 | "# curr_genes = list(curr_contrast.gene)[:3]\n", 254 | "# for k in curr_genes:\n", 255 | "# seurat_de_marker_genes.add(k)\n", 256 | "#seurat_de_marker_genes = list(seurat_de_marker_genes)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "marker_clust_avgs = []\n", 266 | "clust_avgs = []\n", 267 | "for i in adata_raw.obs.clust_label.unique():\n", 268 | " clust_avgs.append(compute_mean_expression(adata_raw[adata_raw.obs.clust_label==i,:]))\n", 269 | " marker_clust_avgs.append(compute_mean_expression(adata_raw[adata_raw.obs.clust_label==i,:][:,de_combined]))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "plot_clustered_celltypes_by_genes(adata_raw, de_combined,normalize=False)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "adata_raw[:,de_combined].X.sum(1).shape" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "len(de_combined)\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "# Select MERFISH genes for aging" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "1. Cell types:\n", 325 | "\t1. Pairwise DE of major clusts\n", 326 | "\t2. Pairwise DE of minor clusts\n", 327 | "\t3. One-vs-all DE of major clusts\n", 328 | "\t4. One-vs-all DE of minor clusts\n", 329 | "\t5. Random forest features of major clusts\n", 330 | "\t6. Random forest features of minor clusts\n", 331 | "2. Aging markers:\n", 332 | "\t1. Random forest features of major clusts\n", 333 | "\t2. Random forest features of minor clusts\n", 334 | "\t3. NB differential expression of major clusts\n", 335 | "\t4. NB differential expression of minor clusts\n", 336 | "\t5. TF random forests of major clusts\n", 337 | "\t6. TF random forests of minor clusts" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# load aging differentially expressed genes\n", 347 | "age_tf_feats = list(pd.read_csv(\"gene_lists/age_tf_feats.csv\").gene)\n", 348 | "age_de_minor = pd.read_csv(\"gene_lists/nb_glm_age_de_minor.csv\")\n", 349 | "age_de_major = pd.read_csv(\"gene_lists/nb_glm_age_de_major.csv\")" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "age_de_major = age_de_major[age_de_major.qval < 1e-6]\n", 359 | "age_de_minor = age_de_minor[age_de_minor.qval < 1e-6]" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "age_de_major['log10fc'] = np.abs(age_de_major.coef)\n", 369 | "#age_de_minor['log10fc'] = np.abs(age_de_minor.coef)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "age_de_major_markers = list(select_age_markers({k:age_de_major[age_de_major.cell_type==k] for k in age_de_major.cell_type.unique()}, 5))\n", 379 | "age_de_minor_markers = list(select_age_markers({k:age_de_minor[age_de_minor.cell_type==k] for k in age_de_minor.cell_type.unique()}, 2))\n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "combined_age_markers = list(set(age_de_minor_markers + age_de_major_markers + age_tf_feats))" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "len(combined_age_markers)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "combined_age_markers = sorted(combined_age_markers)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# Add in markers from literature" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# Microglia reactivity signature\n", 430 | "# IL-6, TGFbeta1, IL10, IL-12/p40, IL-1beta, TNFalpha\n", 431 | "aging_microglia = ['Ccl4', 'Lgals3', 'Ms4a7', 'Ifitm3'] + ['Il10', 'Il6', 'Il21a', 'Il12b', 'Il1b', 'Tnf']\n", 432 | "aging_microglia += [\"Tmem119\", \"Apoe\", \"Cst7\", \"Clec7a\", \"Lpl\", \"Hif1a\", \"Igf1\", \"Cd74\", \"Ifit2\", \"Ifit3\", \"Irf7\", \"Oasl2\", \"Top2a\", \"Mcm2\"]\n", 433 | "aging_microglia += [\"Tyrobp\", \"Ctsb\", \"Ctsd\", \"Fth1\", \"Lyz2\", \"Axl\", \"Cst7\", \"Trem2\", \"Cst7\", \"Lpl\", \"Cd9\", \"Csf1\", \"Ccl6\", \"Itgax\", \"Clec7a\", \"Lilrb4\", \"Timp2\", \"Marcks\", \"Serinc3\", \"P2ry12\", \"Cd9\", \"Cd63\"]" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# Aging astrocytes -- The Aging Astrocyte Transcriptome from Multiple Regions of the Mouse Brain, Boisvert et al\n", 443 | "aging_astro_allregions = ['Sprina3m', 'Serpina3n', 'C4b', 'Pcdh6', 'Pcdhb1', 'Gfap', 'Prss50', # upregulated\n", 444 | " 'Gpx8', 'Hspa1b', 'Hspa1a', 'Rsrp1']\n", 445 | "aging_astro_regionspecific = list(np.unique(['Serpina3f', 'Rpk4', 'Timp1', 'Fbln5', 'Plin4', 'Rab20', 'Capg', 'Zc3hav1', 'Gbp2', 'Ifi35', 'Hs3st3a1', 'Mboat1', 'Psmb8', 'Cyp27a1',\n", 446 | " 'Serpina3f', 'Cdr1', 'Zbtb20', 'Grin2b', 'Hipk2', 'Tcp11l1', 'Ago3', 'Oasl2', 'Lnpep', 'Gan', 'Aqp2', 'Bst2', 'Hmbox1', 'Zc3hav1',\n", 447 | " 'Serpina3f', 'Cdr1', 'Lars2', 'Zbtb20', 'Grin2b', 'Rpk4', 'Nr5a1', 'Slc22a18', 'Timp1', 'Fcgr2b', 'Hipk2', 'C3', 'Osmr', 'Oasl2', 'Nupr1', # up\n", 448 | " 'Bmp4', 'Kiss1', 'Fst', 'Cyr61', 'Tead2', 'Dnajb1', 'Banp', 'Cdx8', 'Rbm12b1', 'Ece2', \n", 449 | " 'Bmp4', 'Cd38', 'Sptbn2', 'Sptb', 'Pcdh20', 'Eif5b', 'Gm7120', 'Sptan1', 'Hmgcr', 'Trio',\n", 450 | " 'Sspo', 'Wfdc2', 'Ttr', 'Ctgf', 'Thbs4', 'Bmp4', 'Prom1', 'Sptbn2', 'Bgn', 'Tnc', 'Sparc']))\n", 451 | "combined_astro_marker = list(np.unique(aging_astro_allregions+aging_astro_regionspecific))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "# senescence genes\n", 461 | "senescence_high = ['Retnla', 'Tnf', 'Cdkn2a', 'Itgax', 'Il12b', 'Il18', 'Cd68', 'Fcgr1',\n", 462 | " 'Parp14', 'Fcna', 'Cd36', 'Cd38', 'Bst1', 'Itgam', 'Emr1', 'Irg1',\n", 463 | " 'Il1b', 'Lmnb1', 'Il10', 'Fabp4', 'Lyve1', 'Mrc1', 'Nampt', 'Nadk',\n", 464 | " 'Bmi1', 'Sirt7']\n", 465 | "\n", 466 | "senescence_low = ['Sirt1', 'Nfkbiz', 'Cdkn1a', 'Tiparp', 'Trp53',\n", 467 | " 'Sirt5', 'Csf1', 'Nfkb1', 'Parp6', 'Sirt2', 'Nnmt', 'Hmgb1', 'Bcl2l2',\n", 468 | " 'Nt5e', 'Sirt3', 'Serpine1', 'Arg1', 'Parp10', 'Ccl2', 'Il6', 'Nmnat3',\n", 469 | " 'Cdkn2b', 'Il12a', 'Parp12', 'Parp9', 'Parp11', 'Parp8', 'Sirt6',\n", 470 | " 'Sirt4', 'Mgl2', 'Parp3', 'Zc3hav1', 'Tnks', 'Parp4', 'Parp2', 'Sarm1',\n", 471 | " 'Parp16', 'Nmnat2', 'Parp1', 'Nmnat1']\n", 472 | "\n", 473 | "reactive_astro1 = ['C3', 'Ggta1', 'Ligp1', 'Gpp2', 'Fbln5', 'Ekbp5', 'Psmb8'] # A1 astrocytes are produced following LPS injection\n", 474 | "reactive_astro2 = ['Clcf1', 'Tgm1', 'Ptx3', 'S100a10', 'Sphk1', 'Cd109', 'Ptgs2', 'Emp1', 'Slc10a6', 'Tms4sf1', 'B3gnt5', 'Stat3']\n", 475 | "reactive_astro_pan = ['Lcn2', 'Steap4', 'S1pr3', 'Timp1', 'Hsbp1', 'Cxcl10', 'Cd44', 'Cp', 'Serpina3n', 'Aspg', 'Vim', 'Gfap']\n", 476 | "\n", 477 | "# brunet aging genes\n", 478 | "brunet_genes = ['B2m', 'C1qa', 'C1qc', 'C4b', 'Ctss', 'Gbp6', 'Gbp10', 'Ifi44', 'Ifit3', 'Ifitm3', 'Itgb2', 'Parp14', 'Serpina3n', 'Tap1', 'Trim30a']\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "combined_senescence = list(np.unique(senescence_high + senescence_low + brunet_genes + reactive_astro1 + reactive_astro2 + reactive_astro_pan))" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "orig_all_age_markers = list(list(combined_age_markers + aging_microglia + combined_astro_marker + combined_senescence))" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# remove genes in cell type markers\n", 506 | "orig_all_age_markers = list(set([i for i in orig_all_age_markers if i not in de_combined]))\n", 507 | "good_genes = adata.raw.to_adata().var_names\n", 508 | "orig_all_age_markers = [i for i in orig_all_age_markers if i in good_genes]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "print(len(orig_all_age_markers))" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "age_diffexp = compute_average_age_expr_change(adata_raw, orig_all_age_markers)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "# filter based on average change in expression\n", 536 | "age_threshold = 0.35 #np.log(1.5)\n", 537 | "all_age_markers = np.array(orig_all_age_markers)[(np.abs(age_diffexp) > age_threshold).any(0)]" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "all_age_markers" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "print(np.sum([1 if i in combined_senescence else 0 for i in orig_all_age_markers ]))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "mean_age_expr = compute_mean_expression(adata_raw[:,all_age_markers])\n", 565 | "plt.plot(np.cumsum(np.sort(mean_age_expr))/np.sum(mean_age_expr),'ko-')" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "# remove the top 10% highly expressed age markers\n", 575 | "#sorted_age_markers = np.array(orig_all_age_markers)[np.argsort(mean_age_expr)]\n", 576 | "#all_age_markers = list(np.array(orig_all_age_markers[:int(0.8*len(sorted_age_markers))]))" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "# how \n", 586 | "#print(np.sum([1 if i in combined_senescence else 0 for i in all_age_markers ]))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "print(\"Senescence genes excluded based on expression\")\n", 596 | "for i in combined_senescence:\n", 597 | " if i not in all_age_markers and i in orig_all_age_markers:\n", 598 | " if i in brunet_genes:\n", 599 | " print(i, 'brunet')\n", 600 | " elif i in senescence_high:\n", 601 | " print(i, 'senesce_high')\n", 602 | " elif i in senescence_low:\n", 603 | " print(i, 'senesce_low')\n", 604 | " elif i in combined_astro_marker:\n", 605 | " print(i,'combined_astro')\n", 606 | " elif i in aging_microglia:\n", 607 | " print(i,'microglia')\n", 608 | " else:\n", 609 | " print(i)" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "plot_clustered_ages_by_genes(adata_raw, de_combined)\n", 619 | "plot_clustered_ages_by_genes(adata_raw, all_age_markers)" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "plot_clustered_celltypes_by_genes(adata_raw, all_age_markers,normalize=False)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "plot_per_celltype_sparsity(adata_raw, de_combined)" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "plot_per_celltype_sparsity(adata_raw, all_age_markers)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "plot_per_celltype_totalexpr(adata_raw, de_combined)" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "plot_per_celltype_totalexpr(adata_raw, all_age_markers)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [ 673 | "plot_per_gene_sparsity(adata_raw, de_combined)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "plot_per_gene_sparsity(adata_raw, all_age_markers)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "pd.DataFrame({'gene':de_combined}).to_csv(\"gene_lists/all_markers_pfc.csv\")\n", 692 | "pd.DataFrame({'gene':all_age_markers}).to_csv(\"gene_lists/all_markers_pfc_aging.csv\")" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "# Save out per cluster expression for these genes for bit assignment" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [ 708 | "# find cluster names\n", 709 | "adata_raw = adata.raw.to_adata()\n", 710 | "clust_labels_uniq = adata_raw.obs.clust_label.unique()\n", 711 | "# find markers actually in adata\n", 712 | "all_markers = [i for i in all_markers_to_keep if i in adata_raw.var_names]\n", 713 | "# compute cluster averages\n", 714 | "clust_avgs = np.vstack([adata_raw[adata_raw.obs.clust_label==i,:][:, list(all_markers_to_keep)].X.mean(0) for i in clust_labels_uniq])" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [ 723 | "clust_expr = pd.DataFrame(clust_avgs, index=clust_labels_uniq, columns=all_markers_to_keep).to_csv(\"merfish_cluster_expr.csv\")" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "# save this data for bit assignment\n", 733 | "clust_proportions = np.array([np.sum(adata_raw.obs.clust_label==i) for i in clust_labels_uniq])\n", 734 | "clust_proportions = clust_proportions/clust_proportions.sum()\n", 735 | "pd.DataFrame({'clust':clust_labels_uniq,'proportion':clust_proportions}).to_csv(\"merfish_cluster_proportions.csv\")" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [] 744 | } 745 | ], 746 | "metadata": { 747 | "kernelspec": { 748 | "display_name": "Python 3 (ipykernel)", 749 | "language": "python", 750 | "name": "python3" 751 | }, 752 | "language_info": { 753 | "codemirror_mode": { 754 | "name": "ipython", 755 | "version": 3 756 | }, 757 | "file_extension": ".py", 758 | "mimetype": "text/x-python", 759 | "name": "python", 760 | "nbconvert_exporter": "python", 761 | "pygments_lexer": "ipython3", 762 | "version": "3.7.4" 763 | } 764 | }, 765 | "nbformat": 4, 766 | "nbformat_minor": 4 767 | } 768 | -------------------------------------------------------------------------------- /notebooks/aging_call_final_celltypes_10x.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "%config InlineBackend.figure_format='retina'\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import sys\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import scanpy as sc\n", 33 | "import pandas as pd\n", 34 | "import anndata as ad\n", 35 | "import seaborn as sns\n", 36 | "sns.set_style('white')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\n", 46 | "sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False, figsize=(5,5))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "adata = ad.read_h5ad(\"/faststorage/brain_aging/rna_analysis/adata_finalclusts.h5ad\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "adata = adata[adata.obs.final_clusts != 'NA']\n", 65 | "adata = adata[adata.obs.total_counts < 50000]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 7, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# remove clusters with < 100 cells\n", 75 | "good_clusts = [i for i in adata.obs.final_clusts.unique() if np.sum(adata.obs.final_clusts==i)>200]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 8, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "adata = adata[adata.obs.final_clusts.isin(good_clusts)]" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "A = adata[~adata.obs.neuronal].copy()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "n_pcs = 20\n", 103 | "sc.pp.highly_variable_genes(A, n_top_genes=2000)\n", 104 | "A.raw = A\n", 105 | "A = A[:, A.var.highly_variable]\n", 106 | "print('regressing out')\n", 107 | "sc.pp.regress_out(A, ['total_counts'])\n", 108 | "print('scaling')\n", 109 | "sc.pp.scale(A, max_value=10)\n", 110 | "print('pca')\n", 111 | "sc.tl.pca(A, svd_solver='arpack', n_comps=n_pcs)\n", 112 | "print('neighbors')\n", 113 | "sc.pp.neighbors(A, n_neighbors=25, n_pcs=n_pcs)\n", 114 | "#sc.external.pp.bbknn(A,batch_key='age',n_pcs=n_pcs)\n", 115 | "print('umap')\n", 116 | "sc.tl.umap(A)\n", 117 | "print('leiden')\n", 118 | "sc.tl.leiden(A,resolution=0.6)\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "sc.pl.umap(A,color=['age', 'area','final_clusts','mouse_id'],use_raw=True,palette=sns.color_palette('gist_ncar',20))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "sc.pl.dotplot(A, [\n", 137 | " 'Csf1r', 'C1qa', 'Hexb', 'Cx3cr1', 'P2ry12', 'Tmem119', 'Tnf', 'Ccl4', # microglia\n", 138 | " 'Sox10','Cldn11', 'Mog', 'Plp1', # oligo\n", 139 | " 'Aqp4', 'Aldh1l1','Gfap', 'Aldoc', # astrocyte\n", 140 | " 'Vtn', 'Flt1', 'Pecam1','Cldn5', 'Adgrf5', # pericyte\n", 141 | " 'Mgp' ,'Slc47a1', 'Dapl1', 'Igf2', 'Sema3g', 'Acta2', # vascular\n", 142 | " 'Pdgfra', 'Vcan', 'Cspg4', 'Olig1', # OPC\n", 143 | " 'Ccdc153', 'Tmem212', 'Hdc', 'Kcnj8',# ependymal\n", 144 | " 'Pf4', 'Cd74', 'Cxcl2', 'Lyz2', 'Ms4a7',\n", 145 | "], groupby='final_clusts',use_raw=True)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# gene sets\n", 155 | "# microglia -- Hexb, Csf1r, C1qa, P2ry12\n", 156 | "# OPCs -- Pdgfra, Vcan, Cspg4, Olig1\n", 157 | "# Endo -- Vtn, Flt1, Cldn5\n", 158 | "# Oligo -- Plp1, Mpb, Cldn11, Mog" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "nonneuronal_mapping = {'N.0' : 'Oligodendrocyte', \n", 168 | " 'N.1' : 'Oligodendrocyte', \n", 169 | " 'N.2':'Oligodendrocyte', \n", 170 | " 'N.3':'Astrocyte',\n", 171 | " 'N.4': 'Astrocyte',\n", 172 | " 'N.5': 'Astrocyte', \n", 173 | " 'N.6' : 'OPC',\n", 174 | " 'N.7':'Microglia', \n", 175 | " 'N.8' : 'Microglia',\n", 176 | " 'N.9' : 'NA',\n", 177 | " 'N.10' : 'NA',\n", 178 | " 'N.11':'NA', \n", 179 | " 'N.12' : 'NA',\n", 180 | " 'N.13' : 'Vascular',\n", 181 | " 'N.14' : 'Oligodendrocyte',\n", 182 | " 'N.15' : 'Vascular', # vascular endoethelial cells\n", 183 | " 'N.16' : 'Oligodendrocyte',\n", 184 | " 'N.17' : 'Astrocyte',\n", 185 | " 'N.18' : 'Immune', # perivascular macrophage\n", 186 | " 'N.19' : 'Vascular', # pericyte\n", 187 | " 'N.20' : 'Vascular' # vascular leptomeningeal cells\n", 188 | " }\n", 189 | "\n", 190 | "finer_nonneuronal_mapping = {'N.0' : 'Olig1', \n", 191 | " 'N.1' : 'Olig2', \n", 192 | " 'N.2':'Olig3', \n", 193 | " 'N.3':'Astro1',\n", 194 | " 'N.4': 'Astro2',\n", 195 | " 'N.5': 'Astro3', \n", 196 | " 'N.6' : 'OPC',\n", 197 | " 'N.7':'Micro1', \n", 198 | " 'N.8' : 'Micro2',\n", 199 | " 'N.9' : 'NA',\n", 200 | " 'N.10' : 'NA',\n", 201 | " 'N.11':'NA', \n", 202 | " 'N.12' : 'NA',\n", 203 | " 'N.13' : 'Vlmc1',\n", 204 | " 'N.14' : 'Olig4',\n", 205 | " 'N.15' : 'Peri1', # \n", 206 | " 'N.16' : 'Olig5',\n", 207 | " 'N.17' : 'Astro4',\n", 208 | " 'N.18' : 'Macro', # perivascular macrophage\n", 209 | " 'N.19' : 'Peri2', # pericyte\n", 210 | " 'N.20' : 'Vlmc2' # vascular leptomeningeal cells\n", 211 | " }\n", 212 | "\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "sc.pl.umap(adata, color=['final_clusts','Tac1', 'Tshz1', 'Cxcl14', 'Pdyn','Penk', 'Drd1', 'Drd2', 'Adora2a', 'Calb1','Pthlh'])" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "# identify striatal neurons\n", 231 | "sc.pl.dotplot(adata, ['Otof', 'Cacng5', 'Th','Ppp1r1b', 'Drd1','Tac1', 'Tshz1', 'Pdyn', 'Drd2','Penk','Adora2a', 'Calb1','Pthlh','Cxcl14','Chat'], groupby='final_clusts',use_raw=True)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "striatal_celltypes = {\n", 241 | " 'H.I.7' : 'StD1M1',\n", 242 | " 'H.I.8' : 'StD1M2',\n", 243 | " 'H.I.20': 'StD2M1',\n", 244 | " 'H.I.27': 'StD1M3',\n", 245 | " 'P.I.0' : 'StD1M4',\n", 246 | " 'P.I.1' : 'StD1M5',\n", 247 | " 'P.I.2' : 'StD2M2',\n", 248 | " 'P.I.3' : 'StD2M3',\n", 249 | " 'P.I.4' : 'StD1M6',\n", 250 | " 'P.I.5' : 'StD2M4',\n", 251 | " 'P.I.10': 'StD1M7',\n", 252 | " 'P.I.18': 'StD2M5',\n", 253 | " 'P.I.19': 'StD1M8'\n", 254 | "}" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "cell_types = list(adata.obs.final_clusts.copy())\n", 264 | "for i,k in enumerate(cell_types):\n", 265 | " if k in nonneuronal_mapping:\n", 266 | " cell_types[i] = nonneuronal_mapping[k]\n", 267 | " else:\n", 268 | " #if 'N' in k:\n", 269 | " # pass\n", 270 | " #else:\n", 271 | " cell_types[i] = 'Neuron'\n", 272 | "adata.obs['cell_type'] = cell_types\n", 273 | "\n", 274 | "cell_types_fine = list(adata.obs.final_clusts.copy())\n", 275 | "for i,k in enumerate(cell_types_fine):\n", 276 | " if k in finer_nonneuronal_mapping:\n", 277 | " cell_types_fine[i] = finer_nonneuronal_mapping[k]\n", 278 | " elif k in striatal_celltypes:\n", 279 | " cell_types_fine[i] = striatal_celltypes[k]\n", 280 | " else:\n", 281 | " curr_cell_type = k.split(\".\")\n", 282 | " if curr_cell_type[0] == \"H\":\n", 283 | " curr_area = \"Hy\"\n", 284 | " else:\n", 285 | " curr_area = \"Fr\"\n", 286 | " if curr_cell_type[1] == \"I\":\n", 287 | " curr_type = \"In\"\n", 288 | " else:\n", 289 | " curr_type = \"Ex\"\n", 290 | " cell_types_fine[i] = curr_area + curr_type + str(int(curr_cell_type[2])+1)\n", 291 | "adata.obs['clust_label'] = cell_types_fine#pd.Series(cell_types_fine,dtype='category')\n", 292 | "adata.obs.clust_label = adata.obs.clust_label.astype('category')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "# remove bad non neuronal clusters\n", 302 | "adata = adata[~adata.obs.cell_type.isin(['NA'])]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# reprocess\n", 312 | "adata = adata.raw.to_adata()\n", 313 | "sc.pp.normalize_total(adata, target_sum=1e4)\n", 314 | "sc.pp.log1p(adata)\n", 315 | "sc.pp.highly_variable_genes(adata, n_top_genes=3000)\n", 316 | "adata.raw = adata\n", 317 | "adata = adata[:, adata.var.highly_variable]\n" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "sc.pp.regress_out(adata, ['total_counts'])" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "sc.pp.scale(adata, max_value=10)\n", 336 | "sc.tl.pca(adata, n_comps=50,svd_solver='arpack')\n", 337 | "\n", 338 | "sc.pl.pca_variance_ratio(adata, log=True,n_pcs=50)\n", 339 | "\n", 340 | "sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)\n", 341 | "sc.tl.umap(adata)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "# final final clust information\n", 351 | "#adata.obs.to_csv(\"final_clusts.csv\")" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "#fig,ax = plt.subplots(figsize=(10,10))\n", 361 | "sc.pl.umap(adata, color=['doublet_score','total_counts'],size=1,add_outline=False)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "fig,ax = plt.subplots(figsize=(10,10))\n", 371 | "sc.pl.umap(adata, color='cell_type',palette=sns.color_palette('Pastel1'), ax=ax,size=10,add_outline=True)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "def gen_light_palette(prefix, color_name, uniq_clusts):\n", 381 | " n = np.sum([1 if prefix in i else 0 for i in uniq_clusts])\n", 382 | " return sns.light_palette(color_name, n_colors=n+2)[2:]\n", 383 | "\n", 384 | "def gen_dark_palette(prefix, color_name, uniq_clusts):\n", 385 | " n = np.sum([1 if prefix in i else 0 for i in uniq_clusts])\n", 386 | " return sns.dark_palette(color_name, n_colors=n+2)[2:]\n", 387 | "\n", 388 | "uniq_clusts = np.sort(adata.obs.clust_label.unique())\n", 389 | "\n", 390 | "\n", 391 | "print(\"Prefrontal excite\")\n", 392 | "fr_ex_pal = gen_light_palette(\"FrEx\", \"darkgreen\", uniq_clusts) #sns.cubehelix_palette(start=0, rot=0.2, dark=0.25, light=.9, n_colors=n_pe)\n", 393 | "\n", 394 | "print(\"Prefrontal inhib\")\n", 395 | "fr_in_pal = gen_light_palette(\"FrIn\", \"navy\", uniq_clusts)#sns.cubehelix_palette(start=0, rot=0.5, dark=0.25, light=.95, n_colors=n_pi)\n", 396 | "\n", 397 | "print(\"Striatal\")\n", 398 | "st_pal = gen_light_palette(\"St\", \"indigo\", uniq_clusts) #sns.cubehelix_palette(start=0, rot=0.5, dark=0.5, light=.95, n_colors=n_st)\n", 399 | "\n", 400 | "\n", 401 | "print(\"Microglial\")\n", 402 | "micro_pal = gen_light_palette('Micro', 'dodgerblue', uniq_clusts)\n", 403 | "\n", 404 | "print(\"Macro\")\n", 405 | "macro_pal = gen_light_palette('Macro', 'blue', uniq_clusts)\n", 406 | "\n", 407 | "print(\"Astrocyte\")\n", 408 | "astro_pal = gen_light_palette('Astro', 'darkorange', uniq_clusts)\n", 409 | "\n", 410 | "print(\"Peri\")\n", 411 | "peri_pal = gen_light_palette('Peri', 'lime', uniq_clusts)\n", 412 | "\n", 413 | "print(\"VLMC\")\n", 414 | "vlmc_pal = gen_light_palette('Vlmc', 'aqua', uniq_clusts)\n", 415 | "\n", 416 | "print(\"OPC\")\n", 417 | "opc_pal = gen_dark_palette('OPC', 'black', uniq_clusts)\n", 418 | "\n", 419 | "print(\"Oligo\")\n", 420 | "oligo_pal = gen_light_palette('Olig', 'darkgray', uniq_clusts)\n", 421 | "\n", 422 | "pals = [astro_pal, fr_ex_pal, fr_in_pal, hy_ex_pal, hy_in_pal, macro_pal, micro_pal, opc_pal, oligo_pal, peri_pal, st_pal, vlmc_pal]\n", 423 | "for i in pals:\n", 424 | " sns.palplot(i)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "from cycler import cycler\n", 434 | "#pal = cycler(color=)\n", 435 | "\n", 436 | "pal = cycler(color=np.vstack(pals))\n", 437 | "\n", 438 | "label_colors = {}\n", 439 | "for i, c in enumerate(iter(pal)):\n", 440 | " label_colors[uniq_clusts[i]] = c['color']" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "fig,ax = plt.subplots(figsize=(10,10))\n", 450 | "sc.pl.umap(adata, color='clust_label',palette=pal,ax=ax,size=10,add_outline=True)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "fig,ax = plt.subplots(figsize=(10,10))\n", 460 | "sc.pl.umap(adata, color='age',ax=ax,size=10,add_outline=True,palette=sns.color_palette('Set2',2))" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "fig,ax = plt.subplots(figsize=(10,10))\n", 470 | "sc.pl.umap(adata, color='area',ax=ax,size=10,add_outline=True)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "#adata.write(\"adata_finalclusts_annot.h5ad\")" 480 | ] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 3 (ipykernel)", 486 | "language": "python", 487 | "name": "python3" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 3 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython3", 499 | "version": "3.7.4" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 4 504 | } 505 | -------------------------------------------------------------------------------- /notebooks/merfish_spatial_celltype_org.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "import scanpy as sc\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "import matplotlib as mpl\n", 15 | "import os\n", 16 | "import anndata as ad\n", 17 | "mpl.rcParams['figure.dpi'] = 150\n", 18 | "plt.rcParams['pdf.fonttype'] = 42\n", 19 | "import sys\n", 20 | "from spatial_analysis import *\n", 21 | "from plotting import *\n", 22 | "from utils import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import seaborn as sns\n", 32 | "sns.set_style('white')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "def unbinarize_strings(A):\n", 42 | " try:\n", 43 | " A.var_names = [i.decode('ascii') for i in A.var_names]\n", 44 | " A.obs.index = [i.decode('ascii') for i in A.obs.index]\n", 45 | " for i in A.obs.columns:\n", 46 | " if A.obs[i].dtype != np.dtype('bool') and \\\n", 47 | " A.obs[i].dtype != np.dtype('int64') and \\\n", 48 | " A.obs[i].dtype != np.dtype('int32') and \\\n", 49 | " A.obs[i].dtype != np.dtype('object_') and \\\n", 50 | " A.obs[i].dtype != np.dtype('float64') and A.obs[i].dtype != np.dtype('float32'):\n", 51 | " if A.obs[i].dtype.is_dtype('category'):\n", 52 | " try:\n", 53 | " A.obs[i] = [i.decode('ascii') for i in A.obs[i]]\n", 54 | " except Exception as e:\n", 55 | " pass\n", 56 | " except Exception as e:\n", 57 | " print(e)\n", 58 | " return A\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# load annotated data (missing aging-related genes) and full data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "adata_annot = ad.read_h5ad(\"/faststorage/brain_aging/merfish/exported/011722_adata_combined_harmony.h5ad\")\n", 77 | "adata_annot = unbinarize_strings(adata_annot)\n", 78 | "adata_annot = adata_annot[adata_annot.obs.dtype=='merfish']" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "adata_annot.raw = unbinarize_strings(adata_annot.raw.to_adata())" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "scrolled": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "celltype_colors, celltype_pals, label_colors, clust_pals = generate_palettes(adata_annot)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "# Compute neighborhoods" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "adata_annot_young = adata_annot[adata_annot.obs.age=='4wk']\n", 115 | "adata_annot_med = adata_annot[adata_annot.obs.age=='24wk']\n", 116 | "adata_annot_old = adata_annot[adata_annot.obs.age=='90wk']\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "clust_order = [\n", 126 | " 'ExN-L2/3-1',\n", 127 | " 'ExN-L2/3-2',\n", 128 | " 'ExN-L5-1',\n", 129 | " 'ExN-L5-2',\n", 130 | " 'ExN-L5-3',\n", 131 | " 'ExN-L6-1',\n", 132 | " 'ExN-L6-2',\n", 133 | " 'ExN-L6-3',\n", 134 | " 'ExN-L6-4',\n", 135 | " 'ExN-LatSept',\n", 136 | "\n", 137 | " 'InN-Calb2',\n", 138 | " 'InN-Chat',\n", 139 | " 'InN-Lamp5',\n", 140 | " 'InN-LatSept',\n", 141 | " 'InN-Pvalb-1',\n", 142 | " 'InN-Pvalb-2',\n", 143 | " 'InN-Pvalb-3',\n", 144 | " 'InN-Sst',\n", 145 | " 'InN-Vip',\n", 146 | " 'MSN-D1-1',\n", 147 | " 'MSN-D1-2',\n", 148 | " 'MSN-D2',\n", 149 | " 'OPC',\n", 150 | " 'Olig-1',\n", 151 | " 'Olig-2',\n", 152 | " 'Olig-3',\n", 153 | "\n", 154 | "'Astro-1',\n", 155 | " 'Astro-2',\n", 156 | " 'Vlmc',\n", 157 | " 'Peri-1',\n", 158 | " 'Peri-2',\n", 159 | " 'Endo-1',\n", 160 | " 'Endo-2',\n", 161 | " 'Endo-3',\n", 162 | " 'Epen',\n", 163 | "\n", 164 | " 'Micro-1',\n", 165 | " 'Micro-2',\n", 166 | " 'Micro-3',\n", 167 | " 'Macro',\n", 168 | " 'T cell',\n", 169 | "]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "import multiprocessing\n", 179 | "from joblib import Parallel, delayed" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "scrolled": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "young_neighbors, young_zscore, young_nbor_pvals = compute_celltype_neighborhood(adata_annot_young, 'cell_type',celltypes=clust_annots, niter=500, radius=150)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "med_neighbors, med_zscore, med_nbor_pvals = compute_celltype_neighborhood(adata_annot_med, 'clust_annot',celltypes=clust_annots, niter=500, radius=150)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "old_neighbors, old_zscore, old_nbor_pvals = compute_celltype_neighborhood(adata_annot_old, 'clust_annot', niter=500, celltypes=clust_annots, radius=150)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# hierarchically cluster zscore\n", 220 | "from scipy.cluster.hierarchy import dendrogram, linkage\n", 221 | "from scipy.spatial.distance import pdist\n", 222 | "\n", 223 | "def hierarchical_cluster_order(mat, method='ward'):\n", 224 | " D = pdist(mat,'cosine')\n", 225 | " D[np.isnan(D)] = 0\n", 226 | " Z = linkage(D,method,optimal_ordering=True)\n", 227 | " den = dendrogram(Z, no_plot=True)\n", 228 | " return np.array(den['leaves'])\n", 229 | "\n", 230 | "def clust_avg(A, clust_key, clust_names):\n", 231 | " return np.array([A[A.obs[clust_key]==i].X.mean(0) for i in clust_names])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "\n", 241 | "young_clust_avg = clust_avg(adata_annot_young, 'clust_annot', clust_annots)\n", 242 | "clust_order = hierarchical_cluster_order(young_zscore, method='complete')\n", 243 | "ex_clusts = np.argwhere([True if \"Ex\" in i else False for i in clust_annots]).flatten()\n", 244 | "in_clusts = np.argwhere([True if (\"In\" in i or \"MSN\" in i) else False for i in clust_annots]).flatten()\n", 245 | "nn_clusts = np.argwhere([True if (\"Ex\" not in i and \"In\" not in i and \"MSN\" not in i) else False for i in clust_annots]).flatten()\n", 246 | "\n", 247 | "exn_order = hierarchical_cluster_order(old_zscore[ex_clusts,:][:, ex_clusts], 'complete')\n", 248 | "in_order = hierarchical_cluster_order(old_zscore[in_clusts,:][:, in_clusts], 'complete')+exn_order.max()+1\n", 249 | "nn_order = hierarchical_cluster_order(old_zscore[nn_clusts,:][:, nn_clusts], 'complete')+in_order.max()+1\n", 250 | "\n", 251 | "clust_order = np.arange(len(clust_order))#np.hstack((exn_order, in_order, nn_order))" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "seg_points = [9, 20, 26]\n", 261 | "def plot_nborhood(zs, clust_order, hide_labels=False, seg_points=None,vmin=-200,vmax=200):\n", 262 | " f, ax = plt.subplots(figsize=(10,10))\n", 263 | " gs = plt.GridSpec(nrows=2,ncols=2,width_ratios=[0.5, 20], height_ratios=[20,0.5], wspace=0.01, hspace=0.01)\n", 264 | "\n", 265 | " ax = plt.subplot(gs[0,0])\n", 266 | " curr_cmap = mpl.colors.ListedColormap([label_colors[i] for i in np.array(clust_annots)[clust_order][::-1]])\n", 267 | " ax.imshow(np.expand_dims(np.arange(zs.shape[0]),1),aspect='auto',interpolation='none', cmap=curr_cmap,rasterized=True)\n", 268 | " sns.despine(ax=ax,bottom=True,left=True)\n", 269 | " if hide_labels:\n", 270 | " ax.set_yticks([]);\n", 271 | " ax.set_yticklabels([]);\n", 272 | " ax.set_xticks([])\n", 273 | " else:\n", 274 | " ax.set_yticks(np.arange(len(clust_annots)));\n", 275 | " ax.set_yticklabels(np.array(clust_annots)[clust_order][::-1]);\n", 276 | " ax.set_xticks([])\n", 277 | "\n", 278 | " ax = plt.subplot(gs[0,1])\n", 279 | " ax.imshow(np.flipud(zs[clust_order,:][:,clust_order]),aspect='auto',interpolation='none',vmin=vmin,vmax=vmax,cmap=plt.cm.seismic, rasterized=True)\n", 280 | " ax.axis('off')\n", 281 | " if seg_points is not None:\n", 282 | " for i in seg_points:\n", 283 | " ax.axvline(i-0.5,color='k',linestyle='--')\n", 284 | " ax.axhline(len(clust_annots)-i-0.5,color='k',linestyle='--')\n", 285 | "\n", 286 | " ax = plt.subplot(gs[1,1])\n", 287 | " ax.imshow(np.expand_dims(np.arange(zs.shape[1])[::-1],1).T,aspect='auto',interpolation='none',cmap=curr_cmap,rasterized=True)\n", 288 | " sns.despine(ax=ax,bottom=True,left=True)\n", 289 | " if hide_labels:\n", 290 | " ax.set_xticks([])\n", 291 | " ax.set_yticks([])\n", 292 | " else:\n", 293 | " ax.set_xticks(np.arange(len(clust_annots)));\n", 294 | " ax.set_yticks([])\n", 295 | " ax.set_xticklabels(np.array(clust_annots)[clust_order],rotation=90);\n", 296 | " return f\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "# Fig. 3: Neighborhood and interaction analysis" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "def nearest_neighbor_dists(A, cell_type_source, cell_type_targets):\n", 313 | " \"\"\" Compute nearest neighbor distances from source to targets \"\"\"\n", 314 | " pass" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "old_pos = adata_annot[adata_annot.obs.age=='90wk'].obsm['spatial']\n", 324 | "old_nn, _ = KDTree(old_pos).query(old_pos, k=2)\n" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# look at gross distances to nearest neighbors\n", 334 | "from sklearn.neighbors import KDTree\n", 335 | "old_pos = adata_annot[adata_annot.obs.age=='90wk'].obsm['spatial']\n", 336 | "old_nn, _ = KDTree(old_pos).query(old_pos, k=2)\n", 337 | "old_nn = old_nn[:,1]\n", 338 | "\n", 339 | "med_pos = adata_annot[adata_annot.obs.age=='24wk'].obsm['spatial']\n", 340 | "med_nn, _ = KDTree(old_pos).query(old_pos, k=2)\n", 341 | "med_nn = med_nn[:,1]\n", 342 | "\n", 343 | "young_pos = adata_annot[adata_annot.obs.age=='4wk'].obsm['spatial']\n", 344 | "young_nn, _ = KDTree(young_pos).query(young_pos, k=2)\n", 345 | "young_nn = young_nn[:,1]\n" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "#celltypes = adata_annot.obs.remapped_cell_type.unique()\n", 355 | "celltypes = [\n", 356 | " 'InN',\n", 357 | " 'ExN',\n", 358 | " 'MSN',\n", 359 | " 'Astro',\n", 360 | " 'OPC',\n", 361 | " 'Olig',\n", 362 | " 'Endo',\n", 363 | " 'Vlmc',\n", 364 | " 'Peri',\n", 365 | " 'Macro',\n", 366 | " 'Micro',\n", 367 | "]\n" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "adata_annot.obs['clust_reduced'] = [\"-\".join(i.split('-')[:-1]) if len(i.split('-'))>1 else i for i in adata_annot.obs.clust_annot]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "clust_reduced_labels = list(adata_annot.obs.clust_reduced.unique())" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "niter = 500\n", 395 | "perturb_max = 100\n", 396 | "dist_thresh = 20" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "scrolled": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "#celltypes = sorted(adata_annot.obs.cell_type.unique())\n", 408 | "young_interactions_clust, young_pvals_clust, young_qvals_clust = compute_celltype_interactions(adata_annot[adata_annot.obs.age=='4wk'], \n", 409 | " 'cell_type', celltypes,niter=niter,dist_thresh=dist_thresh,perturb_max=perturb_max)\n" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "scrolled": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "med_interactions_clust, med_pvals_clust, med_qvals_clust = compute_celltype_interactions(adata_annot[adata_annot.obs.age=='24wk'], \n", 421 | " 'cell_type', celltypes,niter=niter,dist_thresh=dist_thresh,perturb_max=perturb_max)\n" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "old_interactions_clust, old_pvals_clust, old_qvals_clust = compute_celltype_interactions(adata_annot[adata_annot.obs.age=='90wk'], \n", 431 | " 'cell_type', celltypes,niter=niter,dist_thresh=dist_thresh,perturb_max=perturb_max)\n" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "from statsmodels.stats.multitest import multipletests\n", 441 | "def fdr_correct(X):\n", 442 | " new_X = np.zeros_like(X)\n", 443 | " for i in range(X.shape[0]):\n", 444 | " pvals = multipletests(X[i,:],method='fdr_bh')[1]\n", 445 | " new_X[i,:] = multipletests(X[i,:],method='fdr_bh')[1]\n", 446 | " new_X[:,i] = new_X[i,:]\n", 447 | " #X = multipletests(X.flatten(), method='fdr_bh')[1]\n", 448 | " return new_X#X.reshape(X_shape)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "young_qvals_clust = fdr_correct(young_pvals_clust.copy())\n", 458 | "med_qvals_clust = fdr_correct(med_pvals_clust.copy())\n", 459 | "old_qvals_clust = fdr_correct(old_pvals_clust.copy())\n" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "young_interactions_clust[np.isinf(young_interactions_clust)] = 5\n", 469 | "med_interactions_clust[np.isinf(med_interactions_clust)] = 5\n", 470 | "old_interactions_clust[np.isinf(old_interactions_clust)] = 5\n" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "young_qvals_clust[np.isnan(young_qvals_clust)] = 1\n", 480 | "med_qvals_clust[np.isnan(med_qvals_clust)] = 1\n", 481 | "old_qvals_clust[np.isnan(old_qvals_clust)] = 1" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "from plotting import plot_interactions" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "f = plot_interactions(young_qvals_clust, young_interactions_clust, celltypes,celltype_colors,cmap=plt.cm.seismic,vmax=5, vmin=-5)\n", 500 | "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_young.pdf\",bbox_inches='tight', dpi=200)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "f = plot_interactions(med_qvals_clust, med_interactions_clust, celltypes,celltype_colors,cmap=plt.cm.seismic,vmax=3, vmin=-3)\n", 510 | "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_med.pdf\",bbox_inches='tight', dpi=200)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "f = plot_interactions(old_qvals_clust, old_interactions_clust, celltypes,celltype_colors,cmap=plt.cm.seismic,vmax=3, vmin=-3)\n", 520 | "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_old.pdf\",bbox_inches='tight', dpi=200)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "diff = old_interactions_clust-young_interactions_clust\n", 530 | "diff[np.isnan(diff)] = 0\n", 531 | "for i in range(diff.shape[0]):\n", 532 | " for j in range(diff.shape[1]):\n", 533 | " if young_qvals_clust[i,j] < 0.05 or old_pvals_clust[i,j] < 0.05:\n", 534 | " pass\n", 535 | " else:\n", 536 | " diff[i,j] = 0\n", 537 | "diff_qvals = np.zeros_like(old_qvals_clust)\n", 538 | "for i in range(old_qvals_clust.shape[0]):\n", 539 | " for j in range(old_qvals_clust.shape[0]):\n", 540 | " if old_qvals_clust[i,j] < 0.05 or young_qvals_clust[i,j] < 0.05:\n", 541 | " if (old_interactions_clust[i,j]>0 or young_interactions_clust[i,j]>0):\n", 542 | " diff_qvals[i,j] = 0\n", 543 | " else:\n", 544 | " diff_qvals[i,j] = 1\n", 545 | " else:\n", 546 | " diff_qvals[i,j] = 1" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "f = plot_interactions(diff_qvals, diff, celltypes,celltype_colors,cmap=plt.cm.Reds,vmax=1.2, vmin=0)\n", 556 | "f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_diff.pdf\",bbox_inches='tight', dpi=200)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "#f,ax = plt.subplots(figsize=(5,5))\n", 566 | "#ax.imshow(np.flipud(diff),cmap=plt.cm.seismic,vmin=-2.5, vmax=2.5)\n", 567 | "#ax.set_xticks(np.arange(diff.shape[0]));\n", 568 | "#ax.set_xticklabels(celltypes,rotation=90)\n", 569 | "#ax.set_yticks(np.arange(diff.shape[1]));\n", 570 | "#ax.set_yticklabels(celltypes[::-1]);\n", 571 | "#sns.despine(ax=ax,left=True, bottom=True)\n", 572 | "#f.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_cell_contact_diff.pdf\",bbox_inches='tight', dpi=200)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "# Redo this at higher resolution" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "# test effect of cell-cell interaction on activated state" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "sc.tl.score_genes(adata_annot, gene_list=['B2m','Trem2', 'Ccl2', 'Apoe', 'Axl', 'Itgax', 'Cd9','C1qa','C1qc','Lyz2','Ctss'], score_name='activate_micro', use_raw=False)\n", 596 | "sc.tl.score_genes(adata_annot, gene_list=['C4b', 'C3', 'Serpina3n', 'Cxcl10', 'Gfap', 'Vim', 'Il18','Hif3a'], score_name='activate_astro', use_raw=False)\n", 597 | "\n", 598 | "sc.tl.score_genes(adata_annot, gene_list=activate_endo, score_name='activate_endo',use_raw=False)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "adata_micro = adata_annot[adata_annot.obs.cell_type==\"Micro\"]\n", 608 | "adata_annot.obs.activate_micro = adata_annot.obs.activate_micro - np.mean(adata_micro[adata_micro.obs.age=='4wk'].obs.activate_micro)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": {}, 615 | "outputs": [], 616 | "source": [ 617 | "adata_astro = adata_annot[adata_annot.obs.cell_type==\"Astro\"]\n", 618 | "adata_annot.obs.activate_astro = adata_annot.obs.activate_astro - np.mean(adata_astro[adata_astro.obs.age=='4wk'].obs.activate_astro)" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "from spatial_analysis import *\n", 628 | "from plotting import *" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "sc.pl.umap(adata_annot, color=['age','activate_astro'])" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "def identify_nearest_neighbors_with_idx(X,Y,dist_thresh, min_dist_thresh=15):\n", 647 | " if X.shape[0] > 0 and Y.shape[0] > 0:\n", 648 | " kdtree = KDTree(Y)\n", 649 | " ind, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False,return_distance=True)\n", 650 | " ind_X = np.hstack([[i]*len(ind[i]) for i in np.arange(len(ind)) if len(ind[i])>0])\n", 651 | " \n", 652 | " ind = np.hstack(ind)\n", 653 | " dists = np.hstack(dists)\n", 654 | " if len(ind) > 0:\n", 655 | " ind = ind[dists>min_dist_thresh] \n", 656 | " ind_X = ind_X[dists>min_dist_thresh]\n", 657 | " return ind.astype(np.int), ind_X.astype(np.int)\n", 658 | " else:\n", 659 | " return np.array([])\n", 660 | "\n", 661 | "def count_neighbors_with_idx(X,Y,dist_thresh, ):\n", 662 | " if X.shape[0] > 0 and Y.shape[0] > 0:\n", 663 | " kdtree = KDTree(Y)\n", 664 | " ind, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False,return_distance=True)\n", 665 | " counts_Y = np.array([len(i) for i in ind])\n", 666 | " ind_X = np.arange(len(ind))#np.array([i for i in np.arange(len(ind)) if len(ind[i])>0])\n", 667 | " return ind_X.astype(np.int), counts_Y.astype(np.int)\n", 668 | " else:\n", 669 | " return np.array([])\n", 670 | "\n", 671 | "def identify_nearest_neighbors_with_dist(X,Y, min_dist=0):\n", 672 | " if X.shape[0] > 0 and Y.shape[0] > 0:\n", 673 | " kdtree = KDTree(Y)\n", 674 | " dists, ind = kdtree.query(X, k=2,return_distance=True)\n", 675 | " print(dists.shape, ind.shape)\n", 676 | " good_dists = np.zeros(len(dists))\n", 677 | " good_ind = np.zeros(len(ind))\n", 678 | " for i in range(dists.shape[0]):\n", 679 | " if dists[i,0] > 0: # remove duplicates\n", 680 | " good_dists[i] = dists[i,0]\n", 681 | " good_ind[i] = ind[i,0]\n", 682 | " else:\n", 683 | " good_dists[i] = dists[i,1]\n", 684 | " good_ind[i] = ind[i,1]\n", 685 | " #ind_X = np.hstack([[i]*len(ind[i]) for i in np.arange(len(ind)) if len(ind[i])>0])\n", 686 | " return good_dists, good_ind\n", 687 | " else:\n", 688 | " return np.array([])\n", 689 | " \n", 690 | "def compute_celltype_obs_count_correlation(A,cell_type_X, cell_type_Y, obs_key_X, celltype_key='cell_type',radius=40, min_dist_thresh=15):\n", 691 | " X = A[A.obs[celltype_key] == cell_type_X]\n", 692 | " Y = A[A.obs[celltype_key] == cell_type_Y]\n", 693 | " obs_X = X.obs[obs_key_X]\n", 694 | " curr_X = X.obsm['spatial']\n", 695 | " curr_Y = Y.obsm['spatial']\n", 696 | " ind_X, counts_Y = count_neighbors_with_idx(curr_X, curr_Y, dist_thresh=radius)\n", 697 | " return obs_X.values[ind_X], ind_X, counts_Y\n", 698 | "\n", 699 | "\n", 700 | "def compute_celltype_obs_distance_correlation(A,cell_type_X, cell_type_Y, obs_key_X, celltype_key1='cell_type', celltype_key2='cell_type'):\n", 701 | " X = A[A.obs[celltype_key1] == cell_type_X]\n", 702 | " Y = A[A.obs[celltype_key2] == cell_type_Y]\n", 703 | " obs_X = X.obs[obs_key_X]\n", 704 | " curr_X = X.obsm['spatial']\n", 705 | " curr_Y = Y.obsm['spatial']\n", 706 | " dists_Y, ind_Y = identify_nearest_neighbors_with_dist(curr_X, curr_Y)\n", 707 | " return obs_X.values, dists_Y\n", 708 | "\n", 709 | "def compute_celltype_obs_correlation(A,cell_type_X, cell_type_Y, obs_key_X, obs_key_Y, celltype_key='cell_type', radius=40, min_dist_thresh=15):\n", 710 | " X = A[A.obs[celltype_key] == cell_type_X]\n", 711 | " Y = A[A.obs[celltype_key] == cell_type_Y]\n", 712 | " obs_X = X.obs[obs_key_X]\n", 713 | " obs_Y = Y.obs[obs_key_Y]\n", 714 | " curr_X = X.obsm['spatial']\n", 715 | " curr_Y = Y.obsm['spatial']\n", 716 | " neighbors_X, ind_X = identify_nearest_neighbors_with_idx(curr_X, curr_Y, dist_thresh=radius, min_dist_thresh=min_dist_thresh)\n", 717 | " curr_expr = obs_Y[neighbors_X]\n", 718 | " return obs_X.values[ind_X], curr_expr.values" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "def compute_binned_values(dists, scores, min_d=0, max_d=100, bin_size=30):\n", 728 | " binned_mean = np.zeros(max_d-min_d-bin_size)\n", 729 | " binned_std = np.zeros(max_d-min_d-bin_size)\n", 730 | " for i in np.arange(min_d, max_d-bin_size):\n", 731 | " # find distances in this bin range\n", 732 | " idx = np.argwhere(np.logical_and(dists>i, dists<=(i+bin_size)))\n", 733 | " curr_scores = scores[idx]\n", 734 | " binned_mean[i] = np.mean(curr_scores)#/len(idx)\n", 735 | " binned_std[i] = np.std(curr_scores)/np.sqrt(len(curr_scores))#/len(idx)\n", 736 | " binned_mean -= binned_mean.mean()\n", 737 | " binned_std -= binned_mean.mean()\n", 738 | " return binned_mean, binned_std" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "# astro to peri-1/peri-2\n", 748 | "plt.figure(figsize=(3,3))\n", 749 | "celltypes = [\"Peri-1\",\"Peri-2\"]\n", 750 | "for i in celltypes:\n", 751 | " scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='4wk'], \"Astro\", i, \"activate_astro\", celltype_key2='clust_annot')\n", 752 | " binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=80)\n", 753 | " x = np.arange(len(binned_mean))+30\n", 754 | " plt.plot(x,binned_mean,color=label_colors[i])\n", 755 | " plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=label_colors[i])\n", 756 | "#plt.legend( celltypes)\n", 757 | "plt.ylim([-0.2, 0.3])\n", 758 | "sns.despine()\n", 759 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_peri_score_4wk.pdf\",bbox_inches='tight',dpi=300)" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [ 768 | "# astro to peri-1/peri-2\n", 769 | "plt.figure(figsize=(3,3))\n", 770 | "celltypes = [\"Peri-1\",\"Peri-2\"]\n", 771 | "for i in celltypes:\n", 772 | " scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='90wk'], \"Astro\", i, \"activate_astro\", celltype_key2='clust_annot')\n", 773 | " binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=80)\n", 774 | " x = np.arange(len(binned_mean))+30\n", 775 | " plt.plot(x,binned_mean,color=label_colors[i])\n", 776 | " plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=label_colors[i])\n", 777 | "#plt.legend( celltypes)\n", 778 | "plt.ylim([-0.2, 0.4])\n", 779 | "sns.despine()\n", 780 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_peri_score_90wk.pdf\",bbox_inches='tight',dpi=300)" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": {}, 787 | "outputs": [], 788 | "source": [ 789 | "plt.figure(figsize=(3,3))\n", 790 | "celltypes = [\"Peri\",\"Endo\",\"Vlmc\", \"Olig\"]\n", 791 | "for i in celltypes:\n", 792 | " scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='4wk'], \"Micro\", i, \"activate_micro\")\n", 793 | " binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30)\n", 794 | " x = np.arange(len(binned_mean))+30\n", 795 | " plt.plot(x,binned_mean,color=celltype_colors[i])\n", 796 | " plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n", 797 | "plt.legend( celltypes)\n", 798 | "plt.ylim([-0.05, 0.12])\n", 799 | "sns.despine()\n", 800 | "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_micro_score_4wk.pdf\",bbox_inches='tight',dpi=300)" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [ 809 | "plt.figure(figsize=(3,3))\n", 810 | "for i in celltypes:\n", 811 | " scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='90wk'], \"Micro\", i, \"activate_micro\")\n", 812 | " binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30)\n", 813 | " x = np.arange(len(binned_mean))+30\n", 814 | " plt.plot(x,binned_mean,color=celltype_colors[i])\n", 815 | " plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n", 816 | "\n", 817 | "plt.legend( celltypes)\n", 818 | "plt.ylim([-0.05, 0.12])\n", 819 | "sns.despine()\n", 820 | "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_micro_score_90wk.pdf\",bbox_inches='tight',dpi=300)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": null, 826 | "metadata": {}, 827 | "outputs": [], 828 | "source": [ 829 | "plt.figure(figsize=(3,3))\n", 830 | "#celltypes = [\"Endo\",\"Vlmc\", \"Olig\", \"Micro\"]\n", 831 | "for i in celltypes:\n", 832 | " scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='4wk'], \"Astro\", i, \"activate_astro\")\n", 833 | " binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=100)\n", 834 | " x = np.arange(len(binned_mean))+30\n", 835 | " plt.plot(x,binned_mean,color=celltype_colors[i])\n", 836 | " plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n", 837 | "plt.legend( celltypes )\n", 838 | "sns.despine()\n", 839 | "plt.ylim([-0.2, 0.3])\n", 840 | "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_astro_score_4wk.pdf\",bbox_inches='tight',dpi=300)" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": null, 846 | "metadata": {}, 847 | "outputs": [], 848 | "source": [ 849 | "plt.figure(figsize=(3,3))\n", 850 | "\n", 851 | "for i in celltypes:\n", 852 | " scores, dists = compute_celltype_obs_distance_correlation(adata_annot[adata_annot.obs.age=='90wk'], \"Astro\", i, \"activate_astro\")\n", 853 | " binned_mean, binned_std = compute_binned_values(dists, scores,bin_size=30,max_d=150)\n", 854 | " x = np.arange(len(binned_mean))+30\n", 855 | " plt.plot(x,binned_mean,color=celltype_colors[i])\n", 856 | " plt.fill_between(x,binned_mean-binned_std, binned_mean+binned_std,alpha=0.1,color=celltype_colors[i])\n", 857 | "#plt.legend(celltypes)\n", 858 | "sns.despine()\n", 859 | "#plt.ylim([-0.2, 0.3])\n", 860 | "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_distance_astro_score_90wk.pdf\",bbox_inches='tight',dpi=300)" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "metadata": {}, 867 | "outputs": [], 868 | "source": [ 869 | "sc.tl.score_genes(adata_annot, gene_list=[ \"C4b\", \"Il18\", \"Il33\"], score_name=\"activate_olig\",use_raw=False)" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [ 878 | "adata_olig = adata_annot[adata_annot.obs.cell_type==\"Olig\"]\n", 879 | "adata_annot.obs.activate_olig = adata_annot.obs.activate_olig - np.mean(adata_olig[adata_olig.obs.age=='4wk'].obs.activate_olig)" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [ 888 | "x,y = compute_celltype_obs_correlation(adata_annot, \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\", radius=40)\n" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [ 897 | "spatial_regions = [\"Pia\", \"L2/3\", \"L5\", \"L6\", \"CC\", \"Striatum\", \"Ventricle\"]\n", 898 | "ct_combos = [[\"Olig\", \"Astro\"],[\"Olig\",\"Micro\"],[\"Micro\", \"Astro\"]]\n", 899 | "cc = np.zeros((len(spatial_regions), 3))\n", 900 | "for i,r in enumerate(spatial_regions):\n", 901 | " print(r)\n", 902 | " for j,t in enumerate(ct_combos):\n", 903 | " t1 = t[0]\n", 904 | " t2 = t[1]\n", 905 | " #curr_annot = adata_annot[adata_annot.obs.age=='90wk']\n", 906 | " x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==r], t1,t2, f\"activate_{t1.lower()}\",f\"activate_{t2.lower()}\", radius=40)\n", 907 | " cc[i,j] = np.corrcoef(x,y)[0,1]\n", 908 | " " 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "# look at correlation between Il33 and Activated Micro/Astro\n", 918 | "x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==\"CC\"], \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\", radius=30)\n", 919 | "plt.figure(figsize=(5,5))\n", 920 | "#plt.scatter(x,y,s=1)\n", 921 | "plt.title(f\"Olig -> Micro (R={np.corrcoef(x,y)[0,1]})\")\n", 922 | "sns.kdeplot(x=x,y=y,fill=True)\n", 923 | "#plt.xlim([0,5])\n", 924 | "#plt.axis('off')\n", 925 | "sns.despine()\n", 926 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_micro.pdf\",bbox_inches='tight',dpi=300)" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": null, 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [ 935 | "# look at correlation between Il33 and Activated Micro/Astro\n", 936 | "x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==\"CC\"], \"Olig\",\"Astro\", f\"activate_olig\",f\"activate_astro\", radius=30)\n", 937 | "plt.figure(figsize=(5,5))\n", 938 | "#plt.scatter(x,y,s=1)\n", 939 | "plt.title(f\"Olig -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n", 940 | "sns.kdeplot(x=x,y=y,fill=True)\n", 941 | "#plt.xlim([0,5])\n", 942 | "#plt.axis('off')\n", 943 | "sns.despine()\n", 944 | "#plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_astro.pdf\",bbox_inches='tight',dpi=300)" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": null, 950 | "metadata": {}, 951 | "outputs": [], 952 | "source": [ 953 | "adata_annot_yng = adata_annot[adata_annot.obs.age=='4wk']\n", 954 | "adata_annot_old = adata_annot[adata_annot.obs.age=='90wk']\n" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": null, 960 | "metadata": {}, 961 | "outputs": [], 962 | "source": [ 963 | "# look at correlation between Il33 and Activated Micro/Astro\n", 964 | "x,y = compute_celltype_obs_correlation(adata_annot_yng[adata_annot_yng.obs.spatial_clust_annots==\"CC\"], \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\", radius=30)\n", 965 | "plt.figure(figsize=(5,5))\n", 966 | "#plt.scatter(x,y,s=1)\n", 967 | "plt.title(f\"Olig -> Micro (R={np.corrcoef(x,y)[0,1]})\")\n", 968 | "sns.kdeplot(x=x,y=y,fill=True)\n", 969 | "#plt.xlim([0,5])\n", 970 | "#plt.axis('off')\n", 971 | "sns.despine()\n", 972 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_micro_yng.pdf\",bbox_inches='tight',dpi=300)" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": null, 978 | "metadata": {}, 979 | "outputs": [], 980 | "source": [ 981 | "# look at correlation between Il33 and Activated Micro/Astro\n", 982 | "x,y = compute_celltype_obs_correlation(adata_annot_old[adata_annot_old.obs.spatial_clust_annots==\"CC\"], \"Olig\",\"Micro\", f\"activate_olig\",f\"activate_micro\", radius=30)\n", 983 | "plt.figure(figsize=(5,5))\n", 984 | "#plt.scatter(x,y,s=1)\n", 985 | "plt.title(f\"Olig -> Micro (R={np.corrcoef(x,y)[0,1]})\")\n", 986 | "sns.kdeplot(x=x,y=y,fill=True)\n", 987 | "#plt.xlim([0,5])\n", 988 | "#plt.axis('off')\n", 989 | "sns.despine()\n", 990 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_micro_old.pdf\",bbox_inches='tight',dpi=300)" 991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": null, 996 | "metadata": {}, 997 | "outputs": [], 998 | "source": [ 999 | "# look at correlation between Il33 and Activated Micro/Astro\n", 1000 | "x,y = compute_celltype_obs_correlation(adata_annot_old[adata_annot_old.obs.spatial_clust_annots==\"CC\"], \"Olig\",\"Astro\", f\"activate_olig\",f\"activate_astro\", radius=30)\n", 1001 | "plt.figure(figsize=(5,5))\n", 1002 | "#plt.scatter(x,y,s=1)\n", 1003 | "plt.title(f\"Olig -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n", 1004 | "sns.kdeplot(x=x,y=y,fill=True)\n", 1005 | "#plt.xlim([0,5])\n", 1006 | "#plt.axis('off')\n", 1007 | "sns.despine()\n", 1008 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_astro_old.pdf\",bbox_inches='tight',dpi=300)" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": null, 1014 | "metadata": {}, 1015 | "outputs": [], 1016 | "source": [ 1017 | "# look at correlation between Il33 and Activated Micro/Astro\n", 1018 | "x,y = compute_celltype_obs_correlation(adata_annot_yng[adata_annot_yng.obs.spatial_clust_annots==\"CC\"], \"Olig\",\"Astro\", f\"activate_olig\",f\"activate_astro\", radius=30)\n", 1019 | "plt.figure(figsize=(5,5))\n", 1020 | "#plt.scatter(x,y,s=1)\n", 1021 | "plt.title(f\"Olig -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n", 1022 | "sns.kdeplot(x=x,y=y,fill=True)\n", 1023 | "#plt.xlim([0,5])\n", 1024 | "#plt.axis('off')\n", 1025 | "sns.despine()\n", 1026 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_olig_astro_yng.pdf\",bbox_inches='tight',dpi=300)" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": null, 1032 | "metadata": {}, 1033 | "outputs": [], 1034 | "source": [ 1035 | "x,y = compute_celltype_obs_correlation(adata_annot[adata_annot.obs.spatial_clust_annots==\"CC\"], \"Micro\",\"Astro\", f\"activate_micro\",f\"activate_astro\", radius=30)\n", 1036 | "plt.figure(figsize=(5,5))\n", 1037 | "plt.title(f\"Micro -> Astro (R={np.corrcoef(x,y)[0,1]})\")\n", 1038 | "#plt.hist2d(x,y,cmap=plt.cm.viridis,bins=20,rasterized=True);\n", 1039 | "#plt.scatter(x,y,s=1)\n", 1040 | "sns.kdeplot(x=x,y=y,fill=True)\n", 1041 | "#plt.xlim([0,5])\n", 1042 | "#plt.axis('off')\n", 1043 | "sns.despine()\n", 1044 | "\n", 1045 | "plt.savefig(\"/home/user/Dropbox/zhuang_lab/aging/aging_atlas_paper/figures_int/fig3_activation_corr_micro_astro.pdf\",bbox_inches='tight',dpi=300)" 1046 | ] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "kernelspec": { 1051 | "display_name": "Python 3.8.1 64-bit ('scrnaseq': conda)", 1052 | "language": "python", 1053 | "name": "python38164bitscrnaseqcondaced2695c94d346d998c0cef2164233d9" 1054 | }, 1055 | "language_info": { 1056 | "codemirror_mode": { 1057 | "name": "ipython", 1058 | "version": 3 1059 | }, 1060 | "file_extension": ".py", 1061 | "mimetype": "text/x-python", 1062 | "name": "python", 1063 | "nbconvert_exporter": "python", 1064 | "pygments_lexer": "ipython3", 1065 | "version": "3.8.1" 1066 | } 1067 | }, 1068 | "nbformat": 4, 1069 | "nbformat_minor": 4 1070 | } 1071 | -------------------------------------------------------------------------------- /python/de.py: -------------------------------------------------------------------------------- 1 | import statsmodels.api as sm 2 | import statsmodels.formula.api as smf 3 | from statsmodels.stats.multitest import multipletests 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | import diffxpy as de 8 | 9 | def lrtest(llmin,llmax): 10 | lr = likelihood_ratio(llmin, llmax) 11 | p = chi2.sf(lr,1) 12 | return p 13 | from scipy.stats.distributions import chi2 14 | def likelihood_ratio(llmin, llmax): 15 | llmin = -llmin 16 | llmax = -llmax 17 | return(2*(llmax-llmin)) 18 | 19 | def run_glm_de_age_lps_merfish(adata, family='poisson', grouping='cell_type_annot', logfc_thresh=np.log(1)): 20 | # do LR test 21 | import warnings 22 | from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning 23 | #from statsmodels.regression.linear_model.OLSResults import compare_lr_test 24 | warnings.simplefilter('ignore', ConvergenceWarning) 25 | warnings.simplefilter('ignore', PrecisionWarning) 26 | warnings.simplefilter('ignore', IterationLimitWarning) 27 | warnings.simplefilter('ignore', EstimationWarning) 28 | warnings.simplefilter('ignore', SingularMatrixWarning) 29 | warnings.simplefilter('ignore', FutureWarning) 30 | 31 | if family == 'nb': 32 | family = sm.families.NegativeBinomial() 33 | elif family == 'poisson': 34 | family = sm.families.Poisson() 35 | 36 | all_model_fits = {} 37 | all_results = {} 38 | 39 | for clust in adata.obs[grouping].unique()[::-1]: 40 | print(clust) 41 | curr_adata = adata[adata.obs[grouping]==clust].copy() 42 | print(curr_adata.shape) 43 | curr_coefs_age = [] 44 | curr_coefs_lps = [] 45 | curr_pvals = [] 46 | curr_genes = list(curr_adata.var_names) 47 | for i in tqdm(range(len(curr_genes))): 48 | try: 49 | 50 | curr_adata.obs["Y"] = curr_adata[:,curr_genes[i]].X.toarray() 51 | formula = "Y ~ C(age) + C(cond) + log_umi" 52 | #mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 53 | mdf = smf.ols(formula, data=curr_adata.obs).fit(maxiter=50,disp=0) 54 | #mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0) 55 | formula_reduced = "Y ~ log_umi" 56 | #mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 57 | mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0) 58 | curr_coefs_age.append(mdf.params['C(age)[T.90wk]']) 59 | curr_coefs_lps.append(mdf.params["C(cond)[T.lps]"]) 60 | curr_pvals.append(lrtest(mdf.llf, mdf_reduced.llf)) 61 | #curr_pvals.append(mdf.compare_lr_test(mdf_reduced)[]) 62 | except Exception as e: 63 | print(e) 64 | curr_coefs_age.append(None) 65 | curr_coefs_lps.append(None) 66 | curr_pvals.append(None) 67 | #curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs_age[i] is not None or curr_coefs_lps is not None] 68 | #coef_age = [c for c in curr_coefs_age if c is not None] 69 | #coef_lps = [c for c in curr_coefs_lps if c is not None] 70 | pvals = [p for p in curr_pvals if p is not None] 71 | results = pd.DataFrame({'cell_type':clust, 'coef_age':curr_coefs_age, 'coef_lps':curr_coefs_lps, 'pval':pvals, 'gene':curr_genes}) 72 | results['qval'] = multipletests(results.pval, method='fdr_bh')[1] 73 | all_results[clust] = results 74 | return all_results 75 | 76 | def run_glm_de_age_merfish(adata, family='poisson', grouping='cell_type_annot', obs_name="age", comp_name="T.90wk", logfc_thresh=np.log(1)): 77 | # do LR test 78 | import warnings 79 | from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning 80 | #from statsmodels.regression.linear_model.OLSResults import compare_lr_test 81 | warnings.simplefilter('ignore', ConvergenceWarning) 82 | warnings.simplefilter('ignore', PrecisionWarning) 83 | warnings.simplefilter('ignore', IterationLimitWarning) 84 | warnings.simplefilter('ignore', EstimationWarning) 85 | warnings.simplefilter('ignore', SingularMatrixWarning) 86 | warnings.simplefilter('ignore', FutureWarning) 87 | warnings.simplefilter('ignore',RuntimeWarning) 88 | if family == 'nb': 89 | family = sm.families.NegativeBinomial() 90 | elif family == 'poisson': 91 | family = sm.families.Poisson() 92 | 93 | all_model_fits = {} 94 | all_results = {} 95 | 96 | for clust in adata.obs[grouping].unique()[::-1]: 97 | print(clust) 98 | curr_adata = adata[adata.obs[grouping]==clust].copy() 99 | print(curr_adata.shape) 100 | curr_coefs = [] 101 | curr_pvals = [] 102 | curr_stderr = [] 103 | curr_genes = list(curr_adata.var_names) 104 | for i in tqdm(range(len(curr_genes))): 105 | try: 106 | 107 | curr_adata.obs["Y"] = curr_adata[:,curr_genes[i]].X.toarray() 108 | formula = f"Y ~ C({obs_name}) + 1"# + log_umi" 109 | if family != "ols": 110 | mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 111 | else: 112 | mdf = smf.ols(formula, data=curr_adata.obs).fit(maxiter=50,disp=0) 113 | #mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0) 114 | formula_reduced = "Y ~ 1" #log_umi" 115 | if family != "ols": 116 | mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 117 | else: 118 | mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0) 119 | curr_coefs.append(mdf.params[f'C({obs_name})[{comp_name}]']) 120 | curr_pvals.append(lrtest(mdf.llf, mdf_reduced.llf)) 121 | curr_stderr.append(mdf.bse[f'C({obs_name})[{comp_name}]']) 122 | #curr_pvals.append(mdf.compare_lr_test(mdf_reduced)[]) 123 | except Exception as e: 124 | print(e) 125 | curr_coefs.append(None) 126 | curr_pvals.append(None) 127 | curr_stderr.append(None) 128 | #curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs[i] is not None] 129 | #coef = [c for c in curr_coefs if c is not None] 130 | #pvals = [p for p in curr_pvals if p is not None] 131 | stderrs = [s for s in curr_stderr if s is not None] 132 | results = pd.DataFrame({'cell_type':clust, 'coef':curr_coefs, 'pval':curr_pvals, 'gene':curr_genes, 'stderr': curr_stderr}) 133 | results['qval'] = multipletests(results.pval, method='fdr_bh')[1] 134 | all_results[clust] = results 135 | return all_results 136 | 137 | 138 | def geomean(X,axis=1,epsilon=1): 139 | return np.exp(np.mean(np.log(X+epsilon), axis))-epsilon 140 | 141 | def avg_umi_per_gene(X): 142 | return np.sum(X,1) 143 | 144 | def compute_frac_expressed(A): 145 | return np.array((A.X>0).sum(0)/A.shape[0]).flatten() 146 | 147 | def compute_mean_expression(A): 148 | return np.array(A.X.mean(0)).flatten() 149 | 150 | def filter_2group_1way(A, obs_name, ident, min_pct=None, logfc_thresh=None, min_diff_pct=None, max_cells_per_ident=None, log=True): 151 | """ 152 | Filter genes before differential expression testing. UNIDIRECTIONAL 153 | obs is grouping 154 | ident is what to be compared (ident vs ~ident) 155 | min_diff_pcr: minimum difference in percentage between genes 156 | log: is the data log transformed (usually this is the case) 157 | """ 158 | n_cells, n_genes = A.shape 159 | X = A[A.obs[obs_name] == ident] 160 | Y = A[A.obs[obs_name] != ident] 161 | 162 | min_pct_mask = np.ones((n_genes,),dtype=np.bool) 163 | log_fc_mask = np.ones((n_genes,),dtype=np.bool) 164 | min_diff_pct_mask = np.ones((n_genes,),dtype=np.bool) 165 | 166 | pct_X = compute_frac_expressed(X) 167 | pct_Y = compute_frac_expressed(Y) 168 | 169 | if min_pct: 170 | min_pct_mask = (pct_X>min_pct).flatten() 171 | 172 | mean_X = compute_mean_expression(X) 173 | mean_Y = compute_mean_expression(Y) 174 | if log: 175 | logfc_XY = np.log(np.exp(mean_X)/np.exp(mean_Y)) 176 | else: 177 | logfc_XY = np.log(mean_X/mean_Y) 178 | 179 | if logfc_thresh: 180 | log_fc_mask = (logfc_XY > logfc_thresh).flatten() 181 | 182 | if min_diff_pct: 183 | diff_pct_XY = pct_X-pct_Y 184 | min_diff_pct_mask = (diff_pct_XY > min_diff_pct).flatten() 185 | final_mask = np.logical_and(np.logical_and(min_pct_mask, log_fc_mask), min_diff_pct_mask).flatten() 186 | A = A[:, final_mask] 187 | 188 | if max_cells_per_ident: 189 | idx_X = np.nonzero((adata.obs[obs_name]==ident).values)[0] 190 | idx_Y = np.nonzero((adata.obs[obs_name]!=ident).values)[0] 191 | ids_X = idx_A[np.random.permutation(len(idx_X))[:max_cells_per_ident]] 192 | ids_Y = idx_B[np.random.permutation(len(idx_Y))[:max_cells_per_ident]] 193 | combined_ids = np.hstack((ids_X, ids_Y)).flatten() 194 | return A[combined_ids,:], logfc_XY[np.array(final_mask).flatten()] 195 | else: 196 | 197 | return A, logfc_XY[np.array(final_mask).flatten()] 198 | 199 | def filter_2group(A, obs_name, ident, min_pct=None, logfc_thresh=None, min_diff_pct=None, max_cells_per_ident=None, log=True): 200 | """ 201 | Filter genes before differential expression testing. NOTE this is bidirectional 202 | obs is grouping 203 | ident is what to be compared (ident vs ~ident) 204 | min_diff_pcr: minimum difference in percentage between genes 205 | log: is the data log transformed (usually this is the case) 206 | """ 207 | n_cells, n_genes = A.shape 208 | X = A[A.obs[obs_name] == ident] 209 | Y = A[A.obs[obs_name] != ident] 210 | 211 | min_pct_mask = np.ones((n_genes,),dtype=np.bool) 212 | log_fc_mask = np.ones((n_genes,),dtype=np.bool) 213 | min_diff_pct_mask = np.ones((n_genes,),dtype=np.bool) 214 | 215 | pct_X = compute_frac_expressed(X) 216 | pct_Y = compute_frac_expressed(Y) 217 | 218 | if min_pct: 219 | min_pct_mask = np.logical_or(pct_X>min_pct, pct_Y>min_pct).flatten() 220 | 221 | mean_X = compute_mean_expression(X) 222 | mean_Y = compute_mean_expression(Y) 223 | if log: 224 | logfc_XY = np.log(np.exp(mean_X)/np.exp(mean_Y)) 225 | 226 | logfc_YX = np.log(np.exp(mean_Y)/np.exp(mean_X)) 227 | else: 228 | logfc_XY = np.log(mean_X/mean_Y) 229 | logfc_YX = np.log(mean_Y/mean_X) 230 | 231 | if logfc_thresh: 232 | log_fc_mask = np.logical_or(logfc_XY > logfc_thresh, logfc_YX > logfc_thresh).flatten() 233 | 234 | if min_diff_pct: 235 | diff_pct_XY = pct_X-pct_Y 236 | diff_pct_YX = pct_Y-pct_X 237 | min_diff_pct_mask = np.logical_or(diff_pct_XY > min_diff_pct, diff_pct_YX > min_diff_pct).flatten() 238 | final_mask = np.logical_and(np.logical_and(min_pct_mask, log_fc_mask), min_diff_pct_mask).flatten() 239 | A = A[:, final_mask] 240 | 241 | if max_cells_per_ident: 242 | idx_X = np.nonzero((adata.obs[obs_name]==ident).values)[0] 243 | idx_Y = np.nonzero((adata.obs[obs_name]!=ident).values)[0] 244 | ids_X = idx_A[np.random.permutation(len(idx_X))[:max_cells_per_ident]] 245 | ids_Y = idx_B[np.random.permutation(len(idx_Y))[:max_cells_per_ident]] 246 | combined_ids = np.hstack((ids_X, ids_Y)).flatten() 247 | return A[combined_ids,:], logfc_XY[np.array(final_mask).flatten()] 248 | else: 249 | 250 | return A, logfc_XY[np.array(final_mask).flatten()] 251 | 252 | from scipy.stats.distributions import chi2 253 | def likelihood_ratio(llmin, llmax): 254 | llmin = -llmin 255 | llmax = -llmax 256 | return(2*(llmax-llmin)) 257 | 258 | def lrtest(llmin,llmax): 259 | lr = likelihood_ratio(llmin, llmax) 260 | p = chi2.sf(lr,1) 261 | return p 262 | 263 | def run_glm_de_pairwise(curr_adata, contrast, lognorm=False): 264 | # run glm on pair of clusters, using contrast as True/False 265 | curr_coefs = [] 266 | curr_pvals = [] 267 | curr_stderr = [] 268 | curr_genes = list(curr_adata.var_names) 269 | family = sm.families.NegativeBinomial() 270 | for i in range(len(curr_genes)): 271 | try: 272 | if lognorm: 273 | curr_adata.obs["Y"] = np.log1p(curr_adata[:,curr_genes[i]].layers['counts'].toarray()) 274 | else: 275 | curr_adata.obs["Y"] = curr_adata[:,curr_genes[i]].layers['counts'].toarray() 276 | formula = f"Y ~ C({contrast}) + log_umi + avg_UMI" 277 | mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 278 | 279 | #mdf = smf.ols(formula, data=curr_adata.obs).fit(maxiter=30,disp=0) 280 | formula_reduced = "Y ~ log_umi + avg_UMI" 281 | #mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=30,disp=0) 282 | 283 | mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 284 | curr_coefs.append(mdf.params[f'C({contrast})[T.True]']) 285 | curr_pvals.append(lrtest(mdf.llf, mdf_reduced.llf)) 286 | curr_stderr.append(mdf.bse[f'C({contrast})[T.True]']) 287 | except Exception as e: 288 | print(e) 289 | curr_coefs.append(None) 290 | curr_pvals.append(None) 291 | curr_stderr.append(None) 292 | curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs[i] is not None] 293 | coef = [c for c in curr_coefs if c is not None] 294 | pvals = [p for p in curr_pvals if p is not None] 295 | stderrs = [s for s in curr_stderr if s is not None] 296 | results = pd.DataFrame({'coef':coef, 'pval':pvals, 'gene':curr_genes, 'stderr': curr_stderr}) 297 | results['qval'] = multipletests(results.pval, method='fdr_bh')[1] 298 | return results 299 | 300 | 301 | def run_ttest_de_age(adata, family='nb', grouping='cell_type', lognorm=False, min_pct=0.1, logfc_thresh=np.log(1)): 302 | # do LR test 303 | import warnings 304 | from scipy.stats import mannwhitneyu 305 | from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning 306 | #from statsmodels.regression.linear_model.OLSResults import compare_lr_test 307 | warnings.simplefilter('ignore', ConvergenceWarning) 308 | warnings.simplefilter('ignore', PrecisionWarning) 309 | warnings.simplefilter('ignore', IterationLimitWarning) 310 | warnings.simplefilter('ignore', EstimationWarning) 311 | warnings.simplefilter('ignore', SingularMatrixWarning) 312 | warnings.simplefilter('ignore', FutureWarning) 313 | from scipy.stats import ttest_ind 314 | all_model_fits = {} 315 | all_results = {} 316 | 317 | for clust in adata.obs[grouping].unique()[::-1]: 318 | print(clust) 319 | curr_adata = adata[adata.obs[grouping]==clust].copy() 320 | curr_adata, _ = filter_2group(curr_adata, "age", "4wk", min_pct=min_pct, logfc_thresh=logfc_thresh) 321 | print(curr_adata.shape) 322 | curr_coefs = [] 323 | curr_pvals = [] 324 | curr_stderr = [] 325 | curr_genes = list(curr_adata.var_names) 326 | for i in tqdm(range(len(curr_genes))): 327 | try: 328 | #if lognorm: 329 | # curr_adata.obs["Y"] = np.log1p(curr_adata[:,curr_genes[i]].X.toarray()) 330 | #else: 331 | X = curr_adata[:,curr_genes[i]].X.toarray() 332 | young_X = X[curr_adata.obs['age'] == '4wk'] 333 | old_X = X[curr_adata.obs['age'] == '90wk'] 334 | curr_coefs.append(np.log(old_X.mean()/young_X.mean())) 335 | curr_pvals.append(ttest_ind(old_X, young_X)[1]) 336 | #curr_pvals.append(mannwhitneyu(old_X, young_X)[1]) 337 | #curr_pvals.append(mdf.compare_lr_test(mdf_reduced)[]) 338 | except Exception as e: 339 | #print(e) 340 | curr_coefs.append(None) 341 | curr_pvals.append(None) 342 | curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs[i] is not None] 343 | coef = [c for c in curr_coefs if c is not None] 344 | pvals = [p for p in curr_pvals if p is not None] 345 | results = pd.DataFrame({'coef':coef, 'pval':pvals, 'gene':curr_genes}) 346 | results['qval'] = multipletests(results.pval, method='fdr_bh')[1] 347 | all_results[clust] = results 348 | return all_results 349 | 350 | def run_glm_de_age(adata, family='nb', grouping='cell_type', lognorm=False, min_pct=0.1, logfc_thresh=np.log(1)): 351 | # do LR test 352 | import warnings 353 | from statsmodels.tools.sm_exceptions import ConvergenceWarning, PrecisionWarning, IterationLimitWarning, EstimationWarning, SingularMatrixWarning 354 | #from statsmodels.regression.linear_model.OLSResults import compare_lr_test 355 | warnings.simplefilter('ignore', ConvergenceWarning) 356 | warnings.simplefilter('ignore', PrecisionWarning) 357 | warnings.simplefilter('ignore', IterationLimitWarning) 358 | warnings.simplefilter('ignore', EstimationWarning) 359 | warnings.simplefilter('ignore', SingularMatrixWarning) 360 | warnings.simplefilter('ignore', FutureWarning) 361 | if family == 'nb': 362 | family = sm.families.NegativeBinomial() 363 | elif family == 'poisson': 364 | family = sm.families.Poisson() 365 | 366 | all_model_fits = {} 367 | all_results = {} 368 | 369 | for clust in adata.obs[grouping].unique()[::-1]: 370 | print(clust) 371 | curr_adata = adata[adata.obs[grouping]==clust].copy() 372 | print(clust, curr_adata.shape) 373 | curr_adata, _ = filter_2group(curr_adata, "age", "4wk", min_pct=min_pct, logfc_thresh=logfc_thresh) 374 | print(curr_adata.shape) 375 | curr_coefs = [] 376 | curr_pvals = [] 377 | curr_stderr = [] 378 | curr_genes = list(curr_adata.var_names) 379 | print("Using new formula") 380 | umi_coef = [] 381 | for i in tqdm(range(len(curr_genes))): 382 | try: 383 | #if lognorm: 384 | # curr_adata.obs["Y"] = np.log1p(curr_adata[:,curr_genes[i]].X.toarray()) 385 | #else: 386 | curr_adata.obs["Y"] = curr_adata[:,curr_genes[i]].X.toarray() 387 | formula = "Y ~ C(age) + + log_umi" 388 | 389 | if family == "ols": 390 | mdf = smf.ols(formula, data=curr_adata.obs).fit(maxiter=50,disp=0) 391 | else: 392 | mdf = smf.glm(formula, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 393 | 394 | formula_reduced = "Y ~ log_umi" # low_umi 395 | if family == "ols": 396 | mdf_reduced = smf.ols(formula_reduced, data=curr_adata.obs).fit(maxiter=50,disp=0) 397 | else: 398 | mdf_reduced = smf.glm(formula_reduced, data=curr_adata.obs, family=family).fit(maxiter=50,disp=0) 399 | umi_coef.append(mdf.params['log_umi']) 400 | curr_coefs.append(mdf.params['C(age)[T.90wk]']) 401 | curr_pvals.append(lrtest(mdf.llf, mdf_reduced.llf)) 402 | curr_stderr.append(mdf.bse['C(age)[T.90wk]']) 403 | #curr_pvals.append(mdf.compare_lr_test(mdf_reduced)[]) 404 | except Exception as e: 405 | #print(e) 406 | curr_coefs.append(None) 407 | curr_pvals.append(None) 408 | curr_stderr.append(None) 409 | print('Mean UMI coef', np.mean(umi_coef)) 410 | curr_genes = [curr_genes[i] for i in range(len(curr_genes)) if curr_coefs[i] is not None] 411 | coef = [c for c in curr_coefs if c is not None] 412 | pvals = [p for p in curr_pvals if p is not None] 413 | stderrs = [s for s in curr_stderr if s is not None] 414 | results = pd.DataFrame({'coef':coef, 'pval':pvals, 'gene':curr_genes, 'stderr': curr_stderr}) 415 | results['qval'] = multipletests(results.pval, method='fdr_bh')[1] 416 | all_results[clust] = results 417 | return all_results 418 | 419 | 420 | def save_de_results(df_map, out_fname): 421 | """ 422 | df_map is map of cell_type -> dataframe of differential gene expression results 423 | """ 424 | de = [] 425 | for k,v in df_map.items(): 426 | v['cell_type'] = k 427 | #v['qval'] = multipletests(np.array(v['pval']), method='fdr_bh')[1] 428 | de.append(v) 429 | out_df = pd.concat(de) 430 | out_df.to_csv(out_fname) 431 | return out_df -------------------------------------------------------------------------------- /python/find_merfish_markers.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | import scanpy as sc 6 | from sklearn.ensemble import RandomForestClassifier 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import roc_auc_score 9 | from tqdm import tqdm 10 | 11 | import diffxpy.api as de 12 | 13 | from sklearn.inspection import permutation_importance 14 | def select_age_features(A: sc.AnnData, grouping, Nfeats=500, test_size=0.2): 15 | """ 16 | Use RFClassifier to find important aging features 17 | """ 18 | scores = {} 19 | feats = {} 20 | for i in np.unique(A.obs[grouping]): 21 | print(i) 22 | curr_adata = A[A.obs[grouping]==i] 23 | X = curr_adata.X.copy() 24 | y = curr_adata.obs.age 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) 26 | clf = RandomForestClassifier(verbose=False, n_jobs=-1).fit(X_train, y_train) 27 | y_preds = clf.predict_proba(X_test) 28 | scores[i] = roc_auc_score(y_test, y_preds[:,1]) 29 | feats[i] = pd.DataFrame({'clust': [i]*Nfeats, 30 | 'importance': np.sort(clf.feature_importances_)[::-1][:Nfeats], 31 | 'feats': curr_adata.var_names[np.argsort(clf.feature_importances_)[::-1][:Nfeats_age]]}) 32 | return scores, feats 33 | 34 | def select_celltype_features(A: sc.AnnData, grouping,Nfeats=1000, test_size=0.2): 35 | """ 36 | Use RFClassifier to find important cell type distinguishing features. 37 | """ 38 | scores = {} 39 | feats = {} 40 | X = A.X.copy() 41 | for i in np.unique(A.obs[grouping]): 42 | print(i) 43 | # train on one vs rest 44 | y = A.obs[grouping]==i 45 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) 46 | clf = RandomForestClassifier(verbose=False, n_jobs=-1).fit(X_train, y_train) 47 | y_preds = clf.predict_proba(X_test) 48 | scores[i] = roc_auc_score(y_test, y_preds[:,1]) 49 | feats[i] = pd.DataFrame({'clust': [i]*Nfeats, 50 | 'importance': np.sort(clf.feature_importances_)[::-1][:Nfeats], 51 | 'feats': A.var_names[np.argsort(clf.feature_importances_)[::-1][:Nfeats]]}) 52 | return scores, feats 53 | 54 | def select_celltype_features_perm(A: sc.AnnData, grouping,Nfeats=1000, test_size=0.2, n_repeats=10): 55 | scores = {} 56 | feats = {} 57 | X = A.X.toarray().copy() 58 | for i in np.unique(A.obs[grouping]): 59 | print(i) 60 | # train on one vs rest 61 | y = A.obs[grouping]==i 62 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) 63 | clf = RandomForestClassifier(verbose=False, n_jobs=-1).fit(X_train, y_train) 64 | y_preds = clf.predict_proba(X_test) 65 | result = permutation_importance(clf, X_test, y_test, n_repeats=n_repeats, random_state=42, n_jobs=-1) 66 | scores[i] = roc_auc_score(y_test, y_preds[:,1]) 67 | feats[i] = pd.DataFrame({'clust': [i]*Nfeats, 68 | 'importance': np.sort(result.importances_mean)[::-1][:Nfeats], 69 | 'feats': A.var_names[np.argsort(result.importances_mean)[::-1][:Nfeats]]}) 70 | return scores, feats 71 | 72 | from scipy.spatial.distance import pdist 73 | import scipy.cluster.hierarchy as hc 74 | import matplotlib 75 | 76 | def plot_clustered_celltypes_by_genes(A: sc.AnnData, genes, normalize=True, figsize=(20,30)): 77 | 78 | marker_clust_avgs = [] 79 | clust_avgs = [] 80 | for i in A.obs.clust_label.unique(): 81 | clust_avgs.append(compute_mean_expression(A[A.obs.clust_label==i,:])) 82 | marker_clust_avgs.append(compute_mean_expression(A[A.obs.clust_label==i,:][:,genes])) 83 | 84 | D = pdist(np.vstack(marker_clust_avgs).T, 'euclidean') 85 | Z = hc.linkage(D, 'ward', optimal_ordering=True) 86 | gene_ordering = hc.leaves_list(Z) 87 | 88 | D = pdist(clust_avgs, 'euclidean') 89 | Z = hc.linkage(D, 'ward', optimal_ordering=True) 90 | clust_ordering = hc.leaves_list(Z) 91 | #plt.imshow(np.corrcoef(clust_avgs)[clust_ordering],vmin=0,vmax=1,cmap=plt.cm.viridis) 92 | matplotlib.rcParams.update({'font.size': 8}) 93 | if normalize: 94 | sc.pl.heatmap(A, np.array(list(genes))[gene_ordering], 'clust_label', show_gene_labels=True, dendrogram=True,standard_scale='obs',figsize=figsize) 95 | else: 96 | sc.pl.heatmap(A, np.array(list(genes))[gene_ordering], 'clust_label', show_gene_labels=True, dendrogram=True,figsize=figsize) 97 | 98 | def plot_clustered_ages_by_genes(A: sc.AnnData, genes, normalize=True, figsize=(20,30)): 99 | 100 | marker_clust_avgs = [] 101 | clust_avgs = [] 102 | clust_names = A.obs.clust_label.unique() 103 | for i in A.obs.clust_label.unique(): 104 | clust_avgs.append(compute_mean_expression(A[A.obs.clust_label==i,:])) 105 | marker_clust_avgs.append(compute_mean_expression(A[np.logical_and(A.obs.clust_label==i, A.obs.age=='4wk'),:][:,genes])- 106 | compute_mean_expression(A[np.logical_and(A.obs.clust_label==i, A.obs.age=='90wk'),:][:,genes])) 107 | 108 | marker_clust_avgs = np.vstack(marker_clust_avgs) 109 | D = pdist(marker_clust_avgs.T, 'euclidean') 110 | Z = hc.linkage(D, 'ward', optimal_ordering=True) 111 | gene_ordering = hc.leaves_list(Z) 112 | 113 | D = pdist(clust_avgs, 'euclidean') 114 | Z = hc.linkage(D, 'ward', optimal_ordering=True) 115 | clust_ordering = hc.leaves_list(Z) 116 | 117 | plt.figure(figsize=(20,10)) 118 | plt.imshow(marker_clust_avgs[:, gene_ordering][clust_ordering,:],vmin=-2,vmax=2,cmap=plt.cm.bwr,aspect='auto',interpolation='none') 119 | plt.yticks(np.arange(marker_clust_avgs.shape[0])) 120 | plt.xticks(np.arange(marker_clust_avgs.shape[1])) 121 | plt.axes().set_xticklabels(np.array(genes)[gene_ordering],rotation=90); 122 | plt.axes().set_yticklabels(np.array(clust_names)[clust_ordering]) 123 | plt.grid(False) 124 | 125 | def compute_cluster_proportions(A: sc.AnnData, obs_type='clust_label'): 126 | """ Compute the fraction of cells in each cluster """ 127 | clusts = A.obs[obs_type].unique() 128 | clust_proportions = np.zeros((len(clusts),1)) 129 | for k,i in enumerate(clusts): 130 | clust_proportions[k] = np.sum(A.obs[obs_type]==i)/A.shape[0] 131 | return clust_proportions, clusts 132 | 133 | def compute_average_celltype_expr(A: sc.AnnData, genes, obs_type="clust_label"): 134 | marker_clust_avgs = [] 135 | for i in A.obs[obs_type].unique(): 136 | marker_clust_avgs.append(compute_mean_expression(A[A.obs[obs_type]==i,:][:,genes])) 137 | return np.vstack(marker_clust_avgs), A.obs[obs_type].unique() 138 | 139 | def compute_average_age_expr_change(A: sc.AnnData, genes): 140 | marker_clust_avgs = [] 141 | for i in A.obs.clust_label.unique(): 142 | marker_clust_avgs.append(compute_mean_expression(A[np.logical_and(A.obs.clust_label==i, A.obs.age=='4wk'),:][:,genes])- 143 | compute_mean_expression(A[np.logical_and(A.obs.clust_label==i, A.obs.age=='90wk'),:][:,genes])) 144 | return np.vstack(marker_clust_avgs) 145 | 146 | # compute per cluster average sparsity 147 | def plot_per_celltype_sparsity(A: sc.AnnData, genes): 148 | sparsity = [] 149 | celltypes = [] 150 | for i in A.obs.cell_type.unique(): 151 | curr_adata = A[A.obs.cell_type==i][:, genes] 152 | frac_expr = compute_frac_expressed(curr_adata) 153 | sparsity.extend(frac_expr) 154 | celltypes.extend([i]*len(frac_expr)) 155 | sns.swarmplot(data=pd.DataFrame({'clust': celltypes, 'sparsity':sparsity}), 156 | x='clust', 157 | y='sparsity') 158 | 159 | def plot_per_celltype_totalexpr(A: sc.AnnData, genes, exp=False): 160 | expr = [] 161 | celltypes = [] 162 | for i in A.obs.cell_type.unique(): 163 | curr_adata = A[A.obs.cell_type==i][:, genes] 164 | total_expr = np.array(curr_adata.X.sum(1)).flatten() 165 | expr.extend(total_expr) 166 | celltypes.extend([i]*len(total_expr)) 167 | sns.violinplot(data=pd.DataFrame({'clust': celltypes, 'expr':expr}), 168 | x='clust', 169 | y='expr') 170 | #return pd.DataFrame({'clust': celltypes, 'expr':expr}) 171 | 172 | 173 | def plot_per_gene_sparsity(A: sc.AnnData, genes): 174 | """ 175 | Score each gene by the max fraction expression divided by the average across all clusters. 176 | """ 177 | sparsity = [] 178 | celltypes = [] 179 | for i in A.obs.cell_type.unique(): 180 | curr_adata = A[A.obs.cell_type==i][:, genes] 181 | frac_expr = compute_frac_expressed(curr_adata) 182 | sparsity.append(frac_expr) 183 | #celltypes.extend([i]*len(frac_expr)) 184 | temp = np.vstack(sparsity) 185 | sparsity_score = temp.mean(0)/temp.max(0) 186 | sort_idx = np.argsort(sparsity_score) 187 | plt.figure(figsize=(15,5)) 188 | plt.scatter(np.arange(temp.shape[1]), sparsity_score[sort_idx]) 189 | plt.xticks(np.arange(temp.shape[1])); 190 | plt.axes().grid(False) 191 | plt.axes().set_xticklabels(np.array(genes)[sort_idx],rotation=90,fontsize=6); 192 | 193 | def compute_mean_expression(A: sc.AnnData): 194 | """ 195 | Average expression for each gene 196 | """ 197 | return np.array(A.X.mean(0)).flatten() 198 | 199 | def compute_frac_expressed(A: sc.AnnData): 200 | """ 201 | Fraction of cells expressing each gene 202 | """ 203 | return np.array((A.X>0).sum(0)/A.shape[0]).flatten() 204 | 205 | def filter_2group_1way(A: sc.AnnData, obs_name: str, ident: str, min_pct=None, logfc_thresh=None, min_diff_pct=None, max_cells_per_ident=None, log=True): 206 | """ 207 | Filter genes before differential expression testing. UNIDIRECTIONAL 208 | obs is grouping 209 | ident is what to be compared (ident vs ~ident) 210 | min_diff_pcr: minimum difference in percentage between genes 211 | log: is the data log transformed (usually this is the case) 212 | """ 213 | n_cells, n_genes = A.shape 214 | X = A[A.obs[obs_name] == ident] 215 | Y = A[A.obs[obs_name] != ident] 216 | 217 | min_pct_mask = np.ones((n_genes,),dtype=np.bool) 218 | log_fc_mask = np.ones((n_genes,),dtype=np.bool) 219 | min_diff_pct_mask = np.ones((n_genes,),dtype=np.bool) 220 | 221 | pct_X = compute_frac_expressed(X) 222 | pct_Y = compute_frac_expressed(Y) 223 | 224 | if min_pct: 225 | min_pct_mask = (pct_X>min_pct).flatten() 226 | 227 | mean_X = compute_mean_expression(X) 228 | mean_Y = compute_mean_expression(Y) 229 | if log: 230 | logfc_XY = np.log(np.exp(mean_X)/np.exp(mean_Y)) 231 | else: 232 | logfc_XY = np.log(mean_X/mean_Y) 233 | 234 | if logfc_thresh: 235 | log_fc_mask = (logfc_XY > logfc_thresh).flatten() 236 | 237 | if min_diff_pct: 238 | diff_pct_XY = pct_X-pct_Y 239 | min_diff_pct_mask = (diff_pct_XY > min_diff_pct).flatten() 240 | final_mask = np.logical_and(np.logical_and(min_pct_mask, log_fc_mask), min_diff_pct_mask).flatten() 241 | A = A[:, final_mask] 242 | 243 | if max_cells_per_ident: 244 | idx_X = np.nonzero((A.obs[obs_name]==ident).values)[0] 245 | idx_Y = np.nonzero((A.obs[obs_name]!=ident).values)[0] 246 | ids_X = idx_X[np.random.permutation(len(idx_X))[:max_cells_per_ident]] 247 | ids_Y = idx_Y[np.random.permutation(len(idx_Y))[:max_cells_per_ident]] 248 | combined_ids = np.hstack((ids_X, ids_Y)).flatten() 249 | return A[combined_ids,:], logfc_XY[np.array(final_mask).flatten()] 250 | else: 251 | 252 | return A, logfc_XY[np.array(final_mask).flatten()] 253 | 254 | 255 | def compute_onevsall_de_for_clusts(A: sc.AnnData, clust_obs, n_de=5): 256 | clust_labels_uniq = list(np.unique(A.obs[clust_obs])) 257 | 258 | de_by_type = {} 259 | for n,i in enumerate(clust_labels_uniq): 260 | print(n+1,'/',len(clust_labels_uniq),':',i) 261 | curr_A = A[np.logical_or(A.obs[clust_obs]==i, A.obs[clust_obs]!=i)].copy() 262 | curr_A.obs['contrast'] = curr_A.obs[clust_obs]==i 263 | curr_A, _ = filter_2group_1way(curr_A, 'contrast', True, min_pct=0.2, logfc_thresh=np.log(1.5)) 264 | res = de.test.t_test(data=curr_A, grouping='contrast') 265 | 266 | frac_foreground = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]==i]) 267 | frac_background = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]!=i]) 268 | # filter genes 269 | good_expr = np.logical_and(res.log10_fold_change()>=np.log10(2), res.qval<0.05) 270 | good_frac = np.logical_and(frac_foreground>0.4, frac_foreground>3*frac_background) 271 | # require 272 | good_genes = np.logical_and(good_expr, good_frac) 273 | sort_idx = np.argsort(res.qval[good_genes]) 274 | log10fc = res.log10_fold_change()[good_genes][sort_idx] 275 | de_by_type[i] = pd.DataFrame({'gene':res.gene_ids[good_genes][sort_idx][:n_de], 276 | 'log10fc':log10fc[:n_de], 277 | 'frac_fg':frac_foreground[good_genes][sort_idx][:n_de], 278 | 'frac_bg':frac_background[good_genes][sort_idx][:n_de], 279 | 'qval':res.qval[good_genes][sort_idx][:n_de]}) 280 | return de_by_type 281 | 282 | def select_age_markers(de_map, n_marker_genes=10): 283 | """ 284 | Select top n_marker_genes for each cluster. log10fc is absolute value of coefficient. 285 | """ 286 | clust_labels_uniq = list(de_map.keys()) 287 | de_marker_genes = set() 288 | for n,i in enumerate(clust_labels_uniq): 289 | curr_contrast = de_map[i].sort_values('log10fc', ascending=False) 290 | for g in list(curr_contrast.head(n_marker_genes).gene): 291 | de_marker_genes.add(g) 292 | return de_marker_genes 293 | 294 | def greedily_select_markers(de_map, min_marker_genes=4, pairwise=True, n_pass=10, de_marker_genes=None): 295 | if pairwise: 296 | clust_labels_uniq = list(set([i[0] for i in de_map.keys()])) 297 | else: 298 | clust_labels_uniq = list(de_map.keys()) 299 | if de_marker_genes is None: 300 | de_marker_genes = set() 301 | else: 302 | de_marker_genes = set(de_marker_genes) 303 | # do n passes through the list, to ensure that all clusters have at least min_marker_genes included 304 | # in the set 305 | 306 | all_clusts_good = True 307 | for n in range(n_pass): 308 | if n > 0 and all_clusts_good: 309 | break 310 | else: 311 | all_clusts_good = True 312 | for n,i in enumerate(clust_labels_uniq): 313 | #print(n+1,'/',len(clust_labels_uniq),':',i) 314 | if pairwise: 315 | for j in clust_labels_uniq: 316 | if i != j: 317 | curr_contrast = de_map[(i,j)].sort_values('log10fc', ascending=False) 318 | curr_genes = list(curr_contrast.gene) 319 | # check if has enough genes in this pair 320 | if len(curr_genes) > 0: 321 | # check how many of these genes are included in the working set of marker genes 322 | n_curr_marker = np.sum([k in de_marker_genes for k in curr_genes]) 323 | # if this cluster has no markers in the marker gene set, add the remaining number 324 | if n_curr_marker < min_marker_genes: 325 | all_clusts_good = False 326 | n_to_add = min(len(curr_genes), int(min_marker_genes-n_curr_marker)) 327 | print("Adding", n_to_add) 328 | curr_to_add = [i for i in curr_genes if i not in de_marker_genes] 329 | for k in range(min(n_to_add, len(curr_to_add))): 330 | de_marker_genes.add(curr_to_add[k]) 331 | else: 332 | curr_contrast = de_map[i].sort_values('log10fc', ascending=False) 333 | curr_genes = list(curr_contrast.gene) 334 | if len(curr_genes) > 0: 335 | # check how many of these genes are included in the working set of marker genes 336 | n_curr_marker = np.sum([k in de_marker_genes for k in curr_genes]) 337 | # if this cluster has no markers in the marker gene set, add the remaining number 338 | if n_curr_marker < min_marker_genes: 339 | n_to_add = min(len(curr_genes), int(min_marker_genes-n_curr_marker)) 340 | for k in range(n_to_add): 341 | de_marker_genes.add(curr_genes[k]) 342 | 343 | return de_marker_genes 344 | 345 | def compute_pairwise_de_for_clusts(A: sc.AnnData, clust_obs, n_de=5, min_pct=0.4): 346 | clust_labels_uniq = list(np.unique(A.obs[clust_obs])) 347 | 348 | pairwise_de = {} 349 | for n,i in enumerate(clust_labels_uniq): 350 | print(n+1,'/',len(clust_labels_uniq),':',i) 351 | for j in tqdm(clust_labels_uniq): 352 | if i != j: 353 | curr_A = A[np.logical_or(A.obs[clust_obs]==i, A.obs[clust_obs]==j)].copy() 354 | curr_A.obs['contrast'] = curr_A.obs[clust_obs]==i 355 | curr_A, _ = filter_2group_1way(curr_A, 'contrast', True, min_pct=min_pct, logfc_thresh=np.log(1.5)) 356 | res = de.test.t_test(data=curr_A, grouping='contrast') 357 | 358 | frac_foreground = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]==i]) 359 | frac_background = compute_frac_expressed(curr_A[curr_A.obs[clust_obs]==j]) 360 | # filter genes 361 | good_expr = np.logical_and(res.log10_fold_change()>=np.log10(2), res.qval<0.05) 362 | good_frac = np.logical_and(frac_foreground>0.4, frac_foreground>3*frac_background) 363 | good_genes = np.logical_and(good_expr, good_frac) 364 | sort_idx = np.argsort(res.qval[good_genes]) 365 | log10fc = res.log10_fold_change()[good_genes][sort_idx] 366 | pairwise_de[(i,j)] = pd.DataFrame({'gene':res.gene_ids[good_genes][sort_idx][:n_de], 367 | 'log10fc':log10fc[:n_de], 368 | 'frac_fg':frac_foreground[good_genes][sort_idx][:n_de], 369 | 'frac_bg':frac_background[good_genes][sort_idx][:n_de], 370 | 'qval':res.qval[good_genes][sort_idx][:n_de]}) 371 | 372 | # greedily select marker genes based on differential expression ranking 373 | return pairwise_de -------------------------------------------------------------------------------- /python/integration.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 2 | from sklearn.neural_network import MLPRegressor, MLPClassifier 3 | from sklearn.ensemble import RandomForestClassifier 4 | import numpy as np 5 | #from xgboost import XGBClassifier 6 | def train_umap_classifier(adata, label_column, umap_key='X_umap', n_dims=30, method='knn'): 7 | '''Train a KNN classifier using the UMAP coordinates.''' 8 | if method == 'knn': 9 | knnc = KNeighborsClassifier(n_jobs=-1, n_neighbors=25) ##KNeighborsClassifier(n_jobs=-1)# 10 | elif method == "mlp": 11 | knnc = MLPClassifier() 12 | knnc.fit(adata.obsm[umap_key][:, :n_dims], np.array(adata.obs[label_column])) 13 | return knnc 14 | 15 | 16 | def impute_classification(adata, classifier, prediction_column, probability_column, umap_key='X_umap', n_dims=30): 17 | '''Impute using a trained classifier.''' 18 | classes = classifier.classes_ 19 | probas = classifier.predict_proba(adata.obsm[umap_key][:, :n_dims]) 20 | max_ids = np.argmax(probas, axis=1) 21 | max_probas = np.max(probas, axis=1) 22 | predicted_classes = [classes[i] for i in max_ids] 23 | 24 | adata.obs[prediction_column] = predicted_classes 25 | adata.obs[probability_column] = max_probas 26 | 27 | 28 | def impute_gene_expression(): 29 | pass -------------------------------------------------------------------------------- /python/plotting.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | from cycler import cycler 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from scipy.spatial.transform import Rotation as R 6 | from statsmodels.stats.multitest import multipletests 7 | import matplotlib as mpl 8 | def gen_light_palette(prefix, color_name, uniq_clusts): 9 | n = np.sum([1 if prefix in i else 0 for i in uniq_clusts]) 10 | return sns.light_palette(color_name, n_colors=n+2)[2:] 11 | 12 | def gen_dark_palette(prefix, color_name, uniq_clusts): 13 | n = np.sum([1 if prefix in i else 0 for i in uniq_clusts]) 14 | return sns.dark_palette(color_name, n_colors=n+2)[2:] 15 | 16 | major_cell_types = { 17 | # Astro -- green 18 | "Astro" : "seagreen", 19 | # Excitatory -- red/orange 20 | "ExN" : "lightcoral", 21 | # inhibitory -- blue/purple 22 | "InN" : "cornflowerblue", 23 | "MSN" : "mediumpurple", 24 | 25 | # immune cells + microglia -- pink 26 | "Micro" : "pink", 27 | "T cell" : "deeppink", 28 | "Macro" : "violet", 29 | 30 | # Endothelial/vasculure -- gold/tan 31 | "Vlmc" : "gold", 32 | "Endo" : "khaki", 33 | "Peri" : "goldenrod", 34 | "Epen" : "darkkhaki", 35 | 36 | # Oligodendrocytes 37 | "Olig" : "slategrey", 38 | "OPC" : "black" 39 | } 40 | 41 | clust_cell_types = { 42 | # Astro -- green 43 | "Astro" : "seagreen", 44 | # Excitatory -- red/orange 45 | "ExN-L2/3" : "darkorange", 46 | "ExN-L5" : "lightsalmon", 47 | "ExN-L6" : "maroon", 48 | "ExN-Olf" : "firebrick", 49 | # inhibitory -- blue/purple 50 | 'InN-Olf' : "cornflowerblue", 51 | #'InN-Adarb2' : "lightsteelblue", 52 | 'InN-Chat' : "lavender", 53 | #'InN-Egfr' : "turquoise", 54 | #'InN-Calb' : "teal", 55 | 'InN-Lhx6':'lightsteelblue', 56 | 57 | 'InN-Calb2' : "navy", 58 | 'InN-Lamp5' : "royalblue", 59 | 'InN-Pvalb' : "steelblue", 60 | 'InN-Sst' : "dodgerblue", 61 | 'InN-Vip' : "deepskyblue", 62 | "MSN-D1" : "mediumslateblue", 63 | "MSN-D2" : "rebeccapurple", 64 | # immune cells + microglia -- pink 65 | "Micro" : "deeppink", 66 | "T cell" : "crimson", 67 | "Macro" : "hotpink", 68 | 69 | # Endothelial/vasculure -- gold/tan 70 | "Vlmc" : "olive", 71 | "Endo" : "khaki", 72 | "Peri" : "goldenrod", 73 | "Epen" : "burlywood", 74 | 75 | # Oligodendrocytes 76 | "Olig" : "slategrey", 77 | "OPC" : "black" 78 | } 79 | 80 | 81 | 82 | def generate_palettes(A,clust_key="clust_annot", cell_type_key="cell_type"): 83 | print("Updated") 84 | uniq_celltypes = np.sort(np.unique(A.obs[cell_type_key])) 85 | uniq_clusts = np.sort(A.obs[clust_key].unique()) 86 | 87 | celltype_pals = [] 88 | for i in uniq_celltypes: 89 | pal = gen_dark_palette(i, major_cell_types[i], uniq_celltypes) 90 | celltype_pals.append(pal) 91 | celltype_pals = cycler(color=np.vstack(celltype_pals)) 92 | 93 | celltype_colors = {} 94 | for i,c in enumerate(iter(celltype_pals)): 95 | celltype_colors[uniq_celltypes[i]] = c['color'] 96 | 97 | clust_pals = [] 98 | label_colors = {} 99 | for i in sorted(clust_cell_types.keys()): 100 | n = np.sum([1 if i in j else 0 for j in uniq_clusts]) 101 | if n > 0: 102 | pal = gen_dark_palette(i, clust_cell_types[i], uniq_clusts) 103 | print(i,pal) 104 | clust_pals.append(pal) 105 | # find palettes for cell types 106 | curr_clusts = sorted([k for k in uniq_clusts if i in k]) 107 | for n,p in enumerate(pal): 108 | label_colors[curr_clusts[n]] = p 109 | else: 110 | print("Couldn't find clust", i) 111 | clust_pals = cycler(color=np.vstack(clust_pals)) 112 | #label_colors = {} 113 | #for i, c in enumerate(iter(clust_pals)): 114 | # label_colors[valid_clusts[i]] = c['color'] 115 | 116 | return celltype_colors, celltype_pals, label_colors, clust_pals 117 | 118 | def calculate_aspect_ratio(A, rot=0,fov_size=221): 119 | all_pts = A.obsm['spatial'] 120 | if rot>0: 121 | rotate(all_pts, degrees=rot) 122 | max_x = all_pts[:,0].max() 123 | min_x = all_pts[:,0].min() 124 | max_y = all_pts[:,1].max() 125 | min_y = all_pts[:,1].min() 126 | n_tiles_x = np.round((max_x-min_x)/fov_size) 127 | n_tiles_y = np.round((max_y-min_y)/fov_size) 128 | aspect_ratio = n_tiles_x/n_tiles_y 129 | return aspect_ratio, n_tiles_x, n_tiles_y 130 | 131 | def plot_clust_subset(A, cell_types, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,alpha=0.1,clust_key="clust_annot"): 132 | if ax is None: 133 | f,ax = plt.subplots() 134 | all_pts = A.obsm['spatial'].copy()#np.array([A.obs.center_x, A.obs.center_y]).T 135 | # zero center all_pts 136 | all_pts = rotate(all_pts, degrees=rot) 137 | all_pts[:,0] -= all_pts[:,0].min() 138 | all_pts[:,1] -= all_pts[:,1].min() 139 | 140 | curr_idx = np.argwhere([i in cell_types for i in A.obs[clust_key]]).flatten() 141 | curr_pts = all_pts[curr_idx,:] 142 | other_idx = np.array([i for i in np.arange(all_pts.shape[0]) if i not in curr_idx]) 143 | if len(other_idx) > 0: 144 | ax.scatter(all_pts[other_idx,:][:,0],all_pts[other_idx,:][:,1],s=s,vmin=0,vmax=1, c='lightgray', alpha=alpha,rasterized=True) 145 | print(all_pts[:,0].min(), all_pts[:,0].max(),all_pts[:,1].min(), all_pts[:,1].max()) 146 | ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,vmin=0,vmax=A.obs.clust_encoding.max(),c=A.obs.clust_encoding[curr_idx],rasterized=True,cmap=cmap) 147 | ax.axis('off') 148 | if xlim is not None: 149 | ax.set_xlim(xlim) 150 | if ylim is not None: 151 | ax.set_ylim(ylim) 152 | 153 | def plot_seg(A, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,key='spatial_clust_annots_value',vmax=7): 154 | if ax is None: 155 | f,ax = plt.subplots() 156 | all_pts = A.obsm['spatial'].copy()#np.array([A.obs.center_x, A.obs.center_y]).T 157 | # zero center all_pts 158 | all_pts = rotate(all_pts, degrees=rot) 159 | all_pts[:,0] -= all_pts[:,0].min() 160 | all_pts[:,1] -= all_pts[:,1].min() 161 | ax.scatter(all_pts[:,0], all_pts[:,1],s=s, c=A.obs[key],cmap=cmap,vmin=0,vmax=vmax) 162 | ax.axis('off') 163 | if xlim is not None: 164 | ax.set_xlim(xlim) 165 | if ylim is not None: 166 | ax.set_ylim(ylim) 167 | from scipy.stats import ttest_ind, ranksums 168 | def calc_pvals_for_grouping(x,y,data,hue,order=None): 169 | if order is None: 170 | order = sorted(list(data[x].unique())) 171 | hue_conds = list(data[hue].unique()) # assumes there are only two for this 172 | pvals = [] 173 | for i in order: 174 | A = data[np.logical_and(data[x]==i, data[hue]==hue_conds[0])][y] 175 | B = data[np.logical_and(data[x]==i, data[hue]==hue_conds[1])][y] 176 | pval = ranksums(A,B) 177 | pvals.append(pval[1]) 178 | return pvals 179 | 180 | def plot_pvals(ax, pvals): 181 | ymin, ymax = ax.get_ylim() 182 | xticks = ax.get_xticks() 183 | for i,p in enumerate(pvals): 184 | if p < 0.01: 185 | ax.text(xticks[i], ymax, '*') 186 | 187 | def plot_cond_obs_comparison(data, x, y, cell_type, figsize=(5,3), order=None, clust_key='cell_type', cond_pal=sns.color_palette(['g','m']), ylim=None): 188 | f, ax = plt.subplots(figsize=figsize) 189 | curr_df = data[data.obs[clust_key]==cell_type].obs 190 | if order is None: 191 | order = sorted(curr_df[x].unique()) 192 | #sns.violinplot(x=x, y=y, data=curr_df,hue='age',fliersize=1,linewidth=1,palette=age_pal, ax=ax,inner=None,order=order,rasterized=True) 193 | sns.boxplot(x=x, y=y, data=curr_df,hue='cond',fliersize=0,linewidth=1,ax=ax, palette=cond_pal,order=order) 194 | sns.stripplot(data=curr_df, x=x, y=y, hue="cond",jitter=0.15,size=0.5,dodge=True,color='k', rasterized=True,ax=ax, order=order) 195 | sns.despine() 196 | if ylim is not None: 197 | ax.set_ylim(ylim) 198 | plt.legend([],[], frameon=False) 199 | 200 | # sns.despine() 201 | # plt.legend([],[], frameon=False) 202 | #if show_pvals: 203 | # pvals = calc_pvals_for_grouping(x,y,curr_df, "cond",order=order) 204 | # plot_pvals(ax, pvals) 205 | return f 206 | 207 | def plot_age_obs_comparison(data, x, y, cell_type, figsize=(5,3), show_pvals=False, order=None, clust_key='cell_type', age_pal=sns.color_palette(['cornflowerblue','thistle','lightcoral'])): 208 | f, ax = plt.subplots(figsize=(5,3)) 209 | curr_df = data[data.obs[clust_key]==cell_type].obs 210 | if order is None: 211 | order = sorted(curr_df[x].unique()) 212 | #sns.violinplot(x=x, y=y, data=curr_df,hue='age',fliersize=1,linewidth=1,palette=age_pal, ax=ax,inner=None,order=order,rasterized=True) 213 | sns.boxplot(x=x, y=y, data=curr_df,hue='age',fliersize=0,linewidth=1,palette=age_pal, ax=ax,order=order) 214 | 215 | sns.stripplot(data=curr_df, x=x, y=y, hue="age", ax=ax,jitter=0.15,size=0.5,dodge=True,color='k',order=order, rasterized=True) 216 | 217 | sns.despine() 218 | plt.legend([],[], frameon=False) 219 | if show_pvals: 220 | pvals = calc_pvals_for_grouping(x,y,curr_df, "age",order=order) 221 | plot_pvals(ax, pvals) 222 | return f 223 | 224 | def plot_obs(A, cell_types, obs_name, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,vmin=0,vmax=10,alpha=0.1,key="clust_annot"): 225 | if ax is None: 226 | f,ax = plt.subplots() 227 | all_pts = A.obsm['spatial'].copy()#np.array([A.obs.center_x, A.obs.center_y]).T 228 | print("Shape", all_pts.shape) 229 | # zero center all_pts 230 | all_pts = rotate(all_pts, degrees=rot) 231 | all_pts[:,0] -= all_pts[:,0].min() 232 | all_pts[:,1] -= all_pts[:,1].min() 233 | 234 | curr_idx = np.argwhere([i in cell_types for i in A.obs[key]]).flatten() 235 | curr_pts = all_pts[curr_idx,:] 236 | other_idx = np.array([i for i in np.arange(all_pts.shape[0]) if i not in curr_idx]) 237 | if len(other_idx) > 0: 238 | ax.scatter(all_pts[other_idx,:][:,0],all_pts[other_idx,:][:,1],s=s,vmin=0,vmax=1, c='lightgray', alpha=alpha,rasterized=True, edgecolors='face') 239 | #print(all_pts[:,0].min(), all_pts[:,0].max(),all_pts[:,1].min(), all_pts[:,1].max()) 240 | ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,vmin=vmin,vmax=vmax,c=np.array(A[curr_idx,:].obs[obs_name]),rasterized=True,cmap=cmap, edgecolors='face') 241 | ax.axis('off') 242 | if xlim is not None: 243 | ax.set_xlim(xlim) 244 | if ylim is not None: 245 | ax.set_ylim(ylim) 246 | 247 | def plot_gene_expr(A, cell_types, gene_name, cmap, ax=None, rot=0, s=0.1, xlim=None, ylim=None,vmin=0,vmax=10,use_raw=True,key='clust_annot',alpha=1): 248 | if ax is None: 249 | f,ax = plt.subplots() 250 | curr_idx = np.argwhere([i in cell_types for i in A.obs[key]]).flatten() 251 | other_idx = np.array([i for i in np.arange(A.shape[0]) if i not in curr_idx]).astype(np.int) 252 | curr_adata = A[curr_idx, :] 253 | other_adata = A[other_idx, :] 254 | if use_raw: 255 | curr_adata = curr_adata.raw.to_adata() 256 | curr_pts = curr_adata.obsm['spatial']#[curr_idx] 257 | other_pts = other_adata.obsm['spatial']#[other_idx] 258 | # zero center all_pts 259 | curr_pts = rotate(curr_pts, degrees=rot) 260 | curr_pts[:,0] -= curr_pts[:,0].min() 261 | curr_pts[:,1] -= curr_pts[:,1].min() 262 | 263 | gene_idx = np.argwhere([i==gene_name for i in A.var_names]).flatten()[0] 264 | if len(other_idx) > 0 and other_pts.shape[0] != curr_pts.shape[0]: 265 | print("plotting background") 266 | other_pts = rotate(other_pts, degrees=rot) 267 | other_pts[:,0] -= other_pts[:,0].min() 268 | other_pts[:,1] -= other_pts[:,1].min() 269 | 270 | ax.scatter(other_pts[:,0],other_pts[:,1],s=s,vmin=0,vmax=1, c='lightgray', rasterized=True, zorder=0,alpha=alpha) 271 | expr = np.array(curr_adata[:,gene_name].X.toarray()).flatten() 272 | ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,vmin=vmin,vmax=vmax,c=expr,rasterized=True,cmap=cmap, zorder=1,alpha=alpha) 273 | #print(curr_pts.shape, len(np.array(curr_adata[:,gene_name].X.flatten()))) 274 | #ax.scatter(curr_pts[:,0],curr_pts[:,1],s=s,c=np.array(curr_adata[:,gene_name].X.toarray())) 275 | ax.axis('off') 276 | if xlim is not None: 277 | ax.set_xlim(xlim) 278 | if ylim is not None: 279 | ax.set_ylim(ylim) 280 | 281 | def rotate(p, origin=(0, 0), degrees=0): 282 | angle = np.deg2rad(degrees) 283 | R = np.array([[np.cos(angle), -np.sin(angle)], 284 | [np.sin(angle), np.cos(angle)]]) 285 | o = np.atleast_2d(origin) 286 | p = np.atleast_2d(p) 287 | return np.squeeze((R @ (p.T-o.T) + o.T).T) 288 | 289 | def plot_obs_by_cells(A, obs_name, s=0.1, cmap=plt.cm.gist_rainbow, show_legend=False, vmax=None, rot=0): 290 | pts = A.obsm['spatial']#np.array([A.obs.center_x, A.obs.center_y]).T 291 | if rot != 0: 292 | pts = rotate(pts, degrees=rot) 293 | pts = pd.DataFrame({'x': pts[:,0], 'y': pts[:,1], 'obs':A.obs[obs_name]}) 294 | if vmax is None: 295 | vmax = len(pts.obs.unique()) 296 | cols = cmap(np.linspace(0,1,vmax+1)) 297 | #for n,i in enumerate(pts.obs.unique()): 298 | #curr_pts = pts[pts.obs==i] 299 | plt.scatter(pts.x,pts.y,s=s,vmin=0,vmax=vmax,c=pts.obs,cmap=cmap) 300 | if show_legend: 301 | plt.legend(pts.obs.unique()) 302 | 303 | #def plot_gene_by_cells(A, gene_name, s=0.1, cmap=plt.cm.Reds): 304 | # gene_idx = np.argwhere(A.var_names==gene_name)[0][0] 305 | # pts = pd.DataFrame({'x': A.obs.center_x, 'y': A.obs.center_y, 'obs':A.X[:,gene_idx]}) 306 | # plt.scatter(pts.x,pts.y,c=pts.obs, cmap=cmap, s=s) 307 | 308 | 309 | def plot_expr_matrix_single(tstats, pvals, celltypes, vmin=-25, vmax=25,cmap=plt.cm.seismic, ax=None): 310 | pvals[np.isnan(pvals)] = 1 311 | pvals_correct = multipletests(pvals.flatten(), method='fdr_bh')[1] 312 | pvals_correct = pvals_correct.reshape(tstats.shape) 313 | pvals_correct[pvals_correct<1e-10] = 1e-10 314 | #for idx in range(200): 315 | if ax is None: 316 | f, ax = plt.subplots(figsize=(5,1)) 317 | for i in range(tstats.shape[0]): 318 | if pvals_correct[i] < 0.05: 319 | ax.scatter(i, 1, s=-np.log10(pvals_correct[i])*10, c=tstats[i],vmin=vmin,vmax=vmax,cmap=cmap, lw=1, edgecolor='k') 320 | else: 321 | ax.scatter(i, 1, s=-np.log10(pvals_correct[i])*10, c=tstats[i],vmin=vmin,vmax=vmax,cmap=cmap, lw=1, edgecolor='w') 322 | ax.set_xticks(np.arange(len(celltypes))); 323 | ax.set_yticks([]) 324 | ax.set_xticklabels(celltypes,rotation=90); 325 | 326 | def plot_expr_matrix_by_name(tstats, pvals, gene_name, var_names,celltypes, vmin=-25, vmax=25,cmap=plt.cm.seismic): 327 | idx = np.argwhere(var_names==gene_name)[0] 328 | pvals[np.isnan(pvals)] = 1 329 | pvals_correct = multipletests(pvals.flatten(), method='fdr_bh')[1] 330 | pvals_correct = pvals_correct.reshape(tstats.shape) 331 | pvals_correct[pvals_correct<1e-10] = 1e-10 332 | #for idx in range(200): 333 | f, ax = plt.subplots(figsize=(5,5)) 334 | ax.set_title(var_names[idx]) 335 | for i in range(tstats.shape[0]): 336 | for j in range(tstats.shape[1]): 337 | if pvals_correct[i,j,idx] < 0.05: 338 | ax.scatter(i, j, s=-np.log10(pvals_correct[i,j,idx])*10, c=tstats[i,j,idx],vmin=vmin,vmax=vmax,cmap=plt.cm.bwr, lw=1, edgecolor='k') 339 | else: 340 | pass 341 | # ax.scatter(i, j, s=-np.log10(pvals_correct[i,j,idx])*10, c=tstats[i,j,idx],vmin=vmin,vmax=vmax,cmap=plt.cm.bwr, lw=1, edgecolor='w') 342 | 343 | ax.set_xticks(np.arange(len(celltypes))); 344 | ax.set_yticks(np.arange(len(celltypes))); 345 | ax.set_xticklabels(celltypes,rotation=90); 346 | ax.set_yticklabels(celltypes); 347 | ax.set_xlabel('Source') 348 | ax.set_ylabel('Neighbor') 349 | 350 | def plot_interactions(pvals, interactions, celltypes, celltype_colors,figsize=(5,5),seg_points=None,vmin=0,vmax=5,cmap=plt.cm.Reds, qval_thresh=0.1): 351 | pvals[pvals<1e-10] = 1e-10 352 | f, ax = plt.subplots(figsize=figsize) 353 | gs = plt.GridSpec(nrows=2,ncols=2, width_ratios=[0.5,20], height_ratios=[20,0.5], wspace=0.1, hspace=0.1) 354 | ax = plt.subplot(gs[0,0]) 355 | curr_cmap = mpl.colors.ListedColormap([celltype_colors[i] for i in celltypes]) 356 | ax.imshow(np.expand_dims(np.arange(interactions.shape[0])[::-1],1),aspect='auto',interpolation='none',cmap=curr_cmap) 357 | sns.despine(ax=ax,bottom=True,left=True) 358 | ax.set_xticks([]) 359 | ax.set_yticks(np.arange(len(celltypes))); 360 | ax.set_yticklabels(celltypes[::-1]); 361 | 362 | ax = plt.subplot(gs[0,1]) 363 | ax.imshow(np.zeros_like(interactions), cmap=plt.cm.seismic, rasterized=True, aspect='auto',interpolation='none', vmin=-1,vmax=1) 364 | for i in range(interactions.shape[0]): 365 | for j in range(interactions.shape[0]): 366 | if pvals[i,j] < qval_thresh: 367 | #ax.scatter(i,j, s=-np.log10(pvals[i,j])*10, c=interactions[i,j],cmap=cmap,vmin=vmin,vmax=vmax, lw=1, edgecolor='k') 368 | ax.scatter(i, interactions.shape[0]-j-1, s=100, c=interactions[i,j],cmap=cmap,vmin=vmin,vmax=vmax, lw=1, edgecolor='k') 369 | 370 | else: 371 | pass 372 | #ax.scatter(i, interactions.shape[0]-j-1, s=100, c=interactions[i,j],cmap=cmap,vmin=vmin,vmax=vmax, lw=1, edgecolor='w') 373 | #ax.set_xlim([-1, 1+len(celltypes)]) 374 | #ax.set_ylim([-1, 1+len(celltypes)]) 375 | #ax.set_xticks(np.arange(len(celltypes))) 376 | #ax.set_yticks(np.arange(len(celltypes))) 377 | if seg_points is not None: 378 | for i in seg_points: 379 | ax.axvline(i-0.5,color='k',linestyle='--') 380 | ax.axhline(len(clust_annots)-i-0.5,color='k',linestyle='--') 381 | 382 | ax.axis('off') 383 | ax = plt.subplot(gs[1,1]) 384 | curr_cmap = mpl.colors.ListedColormap([celltype_colors[i] for i in celltypes]) 385 | ax.imshow(np.expand_dims(np.arange(interactions.shape[0]),1).T,aspect='auto',interpolation='none',cmap=curr_cmap) 386 | sns.despine(ax=ax,bottom=True,left=True) 387 | ax.set_xticks(np.arange(len(celltypes))); 388 | ax.set_xticklabels(celltypes,rotation=90); 389 | ax.set_yticks([]) 390 | return f 391 | def plot_clust_spatial_enrichment(A,vmin=0,vmax=1,uniq_clusts=None,clust_key='clust_annot',label_colors=None, spatial_domains=['Pia','L2/3', 'L5','L6', 'LatSept', 'CC', 'Striatum','Ventricle'], 392 | seg_cmap=plt.cm.viridis): 393 | if uniq_clusts is None: 394 | uniq_clusts = sorted(A.obs[clust_key].unique()) 395 | n_spatial_domains = int(A.obs.spatial_clust_annots_value.max() + 1) 396 | clust_counts = np.zeros((n_spatial_domains, len(uniq_clusts))) 397 | print(clust_counts.shape) 398 | for i in range(n_spatial_domains): 399 | curr_clusts = A[A.obs.spatial_clust_annots_value==i,:].obs[clust_key] 400 | for j,c in enumerate(uniq_clusts): 401 | clust_counts[i,j] = np.sum(curr_clusts==c) 402 | clust_avgs = clust_counts.copy() 403 | for i in range(clust_avgs.shape[1]): 404 | clust_avgs[:,i] /= clust_avgs[:,i].sum() 405 | 406 | f, ax = plt.subplots(figsize=(5.5,1)) 407 | gs = plt.GridSpec(nrows=2,ncols=2,width_ratios=[0.36, 20], height_ratios=[20,2], wspace=0.01, hspace=0.05) 408 | 409 | ax = plt.subplot(gs[0,0]) 410 | ax.imshow(np.expand_dims(np.arange(n_spatial_domains),1),aspect='auto',interpolation='none', cmap=seg_cmap,rasterized=True) 411 | sns.despine(ax=ax,bottom=True,left=True) 412 | ax.set_yticks(np.arange(clust_avgs.shape[0])); 413 | ax.set_yticklabels(spatial_domains,fontsize=6) 414 | ax.set_xticks([]) 415 | ax = plt.subplot(gs[0,1]) 416 | ax.imshow(clust_avgs,aspect='auto',vmin=vmin,vmax=vmax, cmap=plt.cm.viridis) 417 | ax.set_xticks([]) 418 | ax.set_yticks([]) 419 | ax.axis('off') 420 | #for i in range(clust_counts.shape[0]): 421 | #ax.scatter(np.arange(clust_counts.shape[1]), i*np.ones(clust_counts.shape[1]), s=0.005*clust_counts[i,:],c='k') 422 | ax = plt.subplot(gs[1,1]) 423 | if label_colors is None: 424 | curr_cmap = plt.cm.viridis 425 | else: 426 | curr_cmap = mpl.colors.ListedColormap([label_colors[i] for i in uniq_clusts]) 427 | ax.imshow(np.expand_dims(np.arange(len(uniq_clusts)),1).T,aspect='auto',interpolation='none', cmap=curr_cmap,rasterized=True) 428 | 429 | ax.set_xticks(np.arange(clust_avgs.shape[1])); 430 | ax.set_yticks([]) 431 | ax.set_xticklabels(uniq_clusts,rotation=90,fontsize=6); 432 | sns.despine(ax=ax, left=True, bottom=True) 433 | return clust_avgs, clust_counts 434 | -------------------------------------------------------------------------------- /python/spatial_analysis.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import matplotlib as mpl 5 | import os 6 | import anndata as ad 7 | from tqdm import tqdm 8 | import scipy.stats 9 | from statsmodels.stats.multitest import multipletests 10 | from sklearn.neighbors import KDTree 11 | import multiprocessing 12 | from joblib import Parallel, delayed 13 | 14 | # for each cell compute statistics of neighbors within radius 15 | from sklearn.neighbors import KDTree 16 | from sklearn.preprocessing import LabelEncoder 17 | from scipy.stats import zscore 18 | #curr_adata = adata_annot[adata_annot.obs.batch==9] 19 | from tqdm import tqdm 20 | def compute_neighborhood_stats(pos, labels, radius=100): 21 | # record labels as numbers 22 | labels_quant = LabelEncoder().fit_transform(labels) 23 | # for each cell, look up neighbors 24 | kdtree = KDTree(pos) 25 | nbors_idx, nbors_dist = kdtree.query_radius(pos, r=radius, return_distance=True) 26 | nbor_stats = np.zeros((pos.shape[0], len(np.unique(labels_quant)))) 27 | 28 | for i in tqdm(range(pos.shape[0])): 29 | curr_nbors_idx = np.sort(nbors_idx[i][nbors_dist[i]>0])#[1:] 30 | #curr_nbors_dists = nbors_dist[i][np.argsort(nbors_idx[i])] 31 | curr_nbors_labels = labels_quant[curr_nbors_idx] 32 | for j in curr_nbors_labels: 33 | nbor_stats[i,j] += 1 34 | # zscore across each cluster 35 | for i in range(nbor_stats.shape[0]): 36 | nbor_stats[i,:] = zscore(nbor_stats[i,:]) 37 | nbor_stats[np.isinf(nbor_stats)] = 0 38 | return nbor_stats 39 | 40 | def calc_pval(obs, rand, empirical=False): 41 | if empirical: 42 | return np.sum(obs <= np.array(rand))/len(rand) 43 | else: 44 | z = (obs - np.mean(rand))/np.std(rand) 45 | return scipy.stats.norm.sf(abs(z))*2 46 | 47 | def calc_pval_onesided(obs, rand): 48 | z = (obs-np.mean(rand))/np.std(rand) 49 | if z > 0: 50 | return scipy.stats.norm.sf(abs(z)) 51 | else: 52 | return 1 53 | 54 | def count_nearest_neighbors(X,Y,dist_thresh): 55 | if X.shape[0] > 0 and Y.shape[0] > 0: 56 | kdtree = KDTree(Y) 57 | idx, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False, return_distance=True) 58 | dists = np.hstack(dists) 59 | return len(dists[dists>0]) 60 | else: 61 | return 0 62 | 63 | 64 | def count_interactions(X,Y, dist_thresh=15): 65 | X_pos = X.obsm['spatial']#obs[["center_x","center_y"]].values 66 | Y_pos = Y.obsm['spatial']#obs[["center_x", "center_y"]].values 67 | return count_nearest_neighbors(X_pos, Y_pos, dist_thresh) 68 | 69 | def _jitter_interaction_parallel(X_pos, Y_pos, dist_thresh, perturb_max): 70 | curr_X = X_pos + np.random.uniform(-perturb_max, perturb_max, (X_pos.shape[0],2)) 71 | curr_Y = Y_pos + np.random.uniform(-perturb_max, perturb_max, (Y_pos.shape[0],2)) 72 | return count_nearest_neighbors(curr_X, curr_Y, dist_thresh) + count_nearest_neighbors(curr_Y, curr_X, dist_thresh) 73 | 74 | def score_neighborhood(X,Y, dist_thresh=150, niter=500): 75 | X_pos = X.obsm['spatial']#.obs[["center_x","center_y"]].values 76 | Y_pos = Y.obsm['spatial']#.obs[["center_x", "center_y"]].values 77 | obs_freq = count_nearest_neighbors(X_pos, Y_pos, dist_thresh) + count_nearest_neighbors(Y_pos, X_pos, dist_thresh) 78 | pvals = np.zeros((niter,)) 79 | num_cores = multiprocessing.cpu_count() 80 | iterations = tqdm(range(niter)) 81 | random_freq = Parallel(n_jobs=num_cores)(delayed(_jitter_interaction_parallel)(X_pos, Y_pos, dist_thresh, perturb_max) for i in iterations) 82 | return obs_freq, random_freq, calc_pval(obs_freq, random_freq) 83 | 84 | def compare_celltype_interactions(A, B, celltype_key, celltypes=None, niter=1000): 85 | """ 86 | Compute distributions of celltype interactions between two conditions. 87 | """ 88 | if celltypes is None: 89 | celltypes = sorted(A.obs[celltype_key].unique()) 90 | celltype_interactions = np.zeros((len(celltypes), len(celltypes))) 91 | celltype_pvals = np.zeros((len(celltypes), len(celltypes))) 92 | for i, c1 in enumerate(celltypes): 93 | print(c1) 94 | for j,c2 in enumerate(celltypes): 95 | obs_freq_A = count_interactions(A[A.obs[celltype_key]==c1], A[A.obs[celltype_key]==c2], dist_thresh=15) 96 | obs_freq_B = count_interactions(B[B.obs[celltype_key]==c1], B[B.obs[celltype_key]==c2], dist_thresh=15) 97 | 98 | combined_obs = np.hstack((obs_freq_A, obs_freq_B)) 99 | obs_labels = np.hstack((np.zeros(len(obs_freq_A)), np.ones(len(obs_freq_B)))) 100 | shuffled_obs = [] 101 | for n in tqdm(range(niter)): 102 | # shuffle labels 103 | curr_obs_labels = obs_labels[np.random.permutation(len(obs_labels))] 104 | # compute score 105 | shuffled_obs.append(np.mean(combined_obs[curr_obs_labels==1])/np.mean(combined_obs[curr_obs_labels==0])) 106 | obs_freq = np.mean(obs_freq_B)/np.mean(obs_freq_A) 107 | celltype_interactions[i,j] = obs_freq #(obs_freq - np.mean(random_freq))/np.std(random_freq) 108 | celltype_pvals[i,j] = np.sum(np.abs(obs_freq) < np.abs(shuffled_obs))/niter 109 | celltype_pvals = celltype_pvals.reshape((len(celltypes)**2,)) 110 | #if len(celltype_pvals)>0: 111 | # celltype_pvals = multipletests(celltype_pvals, method='fdr') 112 | celltype_pvals = celltype_pvals.reshape((len(celltypes), len(celltypes))) 113 | return celltype_interactions, celltype_pvals 114 | 115 | def score_interactions(X,Y, dist_thresh=15, niter=100, perturb_max=50, thresh=10, one_sided=False): 116 | # compute pairwise distances 117 | X_pos = X.obsm['spatial']#.obs[["center_x","center_y"]].values 118 | Y_pos = Y.obsm['spatial']#.obs[["center_x", "center_y"]].values 119 | obs_freq = count_nearest_neighbors(X_pos, Y_pos, dist_thresh) + count_nearest_neighbors(Y_pos, X_pos, dist_thresh) 120 | if obs_freq < thresh: 121 | return obs_freq, 0, 1.0 122 | pvals = np.zeros((niter,)) 123 | num_cores = multiprocessing.cpu_count() 124 | iterations = range(niter) 125 | random_freq = Parallel(n_jobs=num_cores)(delayed(_jitter_interaction_parallel)(X_pos, Y_pos, dist_thresh, perturb_max) for i in iterations) 126 | if one_sided: 127 | return obs_freq, random_freq, calc_pval_onesided(obs_freq, random_freq) 128 | else: 129 | return obs_freq, random_freq, calc_pval(obs_freq, random_freq) 130 | 131 | def compute_celltype_interactions(A, celltype_key, celltypes=None, niter=100, perturb_max=50, dist_thresh=30, min_cells=10, onesided=False): 132 | import warnings 133 | warnings.filterwarnings("ignore") 134 | print("updated!") 135 | if celltypes is None: 136 | celltypes = sorted(A.obs[celltype_key].unique()) 137 | celltype_interactions = np.zeros((len(celltypes), len(celltypes))) 138 | celltype_pvals = np.zeros((len(celltypes), len(celltypes))) 139 | for i, c1 in enumerate(celltypes): 140 | print(c1) 141 | for j,c2 in enumerate(celltypes): 142 | if i <= j: 143 | # don't do this for pairs where either has < min_cells 144 | if np.sum(A.obs[celltype_key]==c1) > min_cells and np.sum(A.obs[celltype_key]==c2) > min_cells: 145 | obs_freq, random_freq, pval = score_interactions(A[A.obs[celltype_key]==c1], 146 | A[A.obs[celltype_key]==c2], perturb_max=perturb_max, dist_thresh=dist_thresh, niter=niter, one_sided=onesided) 147 | print(c1, c2, obs_freq, np.mean(random_freq), obs_freq/np.mean(random_freq), pval) 148 | celltype_interactions[i,j] = np.log2(obs_freq/(1e-10+np.mean(random_freq)))#np.log2(obs_freq/np.mean(random_freq))#(obs_freq - np.mean(random_freq))/np.std(random_freq) 149 | celltype_interactions[j,i] = celltype_interactions[i,j]#(obs_freq - np.mean(random_freq))/np.std(random_freq)#np.log2(obs_freq/np.mean(random_freq))# 150 | 151 | celltype_pvals[i,j] = pval 152 | celltype_pvals[j,i] = pval 153 | else: 154 | celltype_pvals[i,j] = 1. 155 | celltype_pvals[j,i] = 1. 156 | celltype_pvals = celltype_pvals.reshape((len(celltypes)**2,)) 157 | celltype_qvals = np.zeros_like(celltype_pvals) 158 | if len(celltype_pvals)>0: 159 | for i in range(celltype_pvals.shape[0]): 160 | pass 161 | #celltype_qvals[i,:] = multipletests(celltype_pvals[i,:], method='fdr_bh')[1] 162 | celltype_pvals = celltype_pvals.reshape((len(celltypes), len(celltypes))) 163 | return celltype_interactions, celltype_pvals, celltype_qvals 164 | 165 | def _compute_neighborhood(pos, labels, celltypes, radius): 166 | neighbors = np.zeros((len(celltypes), len(celltypes))) 167 | 168 | for i, c1 in enumerate(celltypes): 169 | curr_X = pos[labels==c1] 170 | #print(c1, curr_X.shape[0]) 171 | for j, c2 in enumerate(celltypes): 172 | curr_Y = pos[labels==c2] 173 | if i <= j: 174 | neighbors[i,j] = np.sum(count_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius))#/curr_X.shape[0] 175 | neighbors[j,i] = neighbors[i,j] 176 | return neighbors 177 | 178 | def _compute_neighbor_shuffled(pos, labels, celltypes, radius): 179 | labels = labels[np.random.permutation(len(labels))]#[labels[i] for i in np.random.choice(len(labels),len(labels))] 180 | return _compute_neighborhood(pos, labels, celltypes, radius) 181 | 182 | def compute_celltype_neighborhood(A, celltype_key, celltypes=None, radius=150, niter=10): 183 | if celltypes is None: 184 | celltypes = list(sorted(A.obs[celltype_key].unique())) 185 | pos = A.obsm['spatial'] 186 | labels = A.obs[celltype_key] 187 | neighbors = _compute_neighborhood(pos, labels, celltypes, radius) 188 | iterations = tqdm(range(niter)) 189 | # for each iteration, shuffle celltype labels 190 | num_cores = multiprocessing.cpu_count() 191 | random_freq = Parallel(n_jobs=num_cores)(delayed(_compute_neighbor_shuffled)(pos, labels, celltypes, radius) for i in iterations) 192 | #print(len(random_freq)) 193 | # z score 194 | zs = np.zeros_like(neighbors) 195 | pval = np.zeros_like(neighbors) 196 | 197 | shuffled_mean = np.dstack(random_freq).mean(2) 198 | shuffled_std = np.std(np.dstack(random_freq),2) 199 | for i in range(neighbors.shape[0]): 200 | for j in range(neighbors.shape[1]): 201 | zs[i,j] = (neighbors[i,j] - shuffled_mean[i,j])/shuffled_std[i,j] 202 | pval[i,j] = calc_pval(neighbors[i,j], np.dstack(random_freq)[i,j,:])#np.sum(neighbors[i,j] <= np.dstack(random_freq)[i,j,:])/niter#np.sum(neighbors[i,j] <= np.dstack(random_freq)[i,j,:])/niter #calc_pval(neighbors[i,j], np.dstack(random_freq)[i,j,:])#np.sum(neighbors[i,j] <= np.dstack(random_freq)[i,j,:])/niter 203 | return neighbors, zs, pval 204 | 205 | def _compare_neighborhoods(pos_A, pos_B, labels_A, labels_B, celltypes, radius): 206 | neighbors_A = _compute_neighborhood(pos_A, labels_A, celltypes, radius) 207 | neighbors_B = _compute_neighborhood(pos_B, labels_B, celltypes, radius) 208 | return neighbors_B - neighbors_A 209 | 210 | def _compare_neighbor_shuffled(pos_A, pos_B, labels_A, labels_B, celltypes, radius): 211 | neighbors_A = np.zeros((len(celltypes), len(celltypes))) 212 | neighbors_B = np.zeros((len(celltypes), len(celltypes))) 213 | 214 | for i, c1 in enumerate(celltypes): 215 | curr_X_A = pos_A[labels_A==c1] 216 | curr_X_B = pos_B[labels_B==c1] 217 | for j, c2 in enumerate(celltypes): 218 | curr_Y_A = pos_A[labels_A==c2] 219 | curr_Y_B = pos_B[labels_B==c2] 220 | if i <= j: 221 | # make vector of label identities 222 | label_idents = np.hstack((np.zeros(curr_X_A.shape[0]), np.ones(curr_X_B.shape[0]))) 223 | label_idents = label_idents[np.random.permutation(len(label_idents))] 224 | nn_A = count_nearest_neighbors(curr_X_A, curr_Y_A, dist_thresh=radius) 225 | nn_B = count_nearest_neighbors(curr_X_B, curr_Y_B, dist_thresh=radius) 226 | # shuffle which cells came from which identity 227 | combined_neighbors = np.hstack((nn_A, nn_B)) 228 | neighbors_A[i,j] = np.sum(combined_neighbors[label_idents==0]) 229 | neighbors_A[j,i] = neighbors_A[i,j] 230 | neighbors_B[i,j] = np.sum(combined_neighbors[label_idents==1]) 231 | neighbors_B[j,i] = neighbors_B[i,j] 232 | return neighbors_B - neighbors_A 233 | 234 | def compare_celltype_neighborbood(A, B, celltype_key, celltypes=None, radius=150, niter=10): 235 | if celltypes is None: 236 | celltypes = list(sorted(A.obs[celltype_key].unique())) 237 | 238 | pos_A = A.obsm['spatial'] 239 | labels_A = A.obs[celltype_key] 240 | pos_B = B.obsm['spatial'] 241 | labels_B = B.obs[celltype_key] 242 | #_compare_neighbor_shuffled(pos_A, pos_B, labels_A, labels_B, celltypes, radius) 243 | neighbors_A = _compute_neighborhood(pos_A, labels_A, celltypes, radius) 244 | neighbors_B = _compute_neighborhood(pos_B, labels_B, celltypes, radius) 245 | neighbor_diff = neighbors_B - neighbors_A 246 | iterations = tqdm(range(niter)) 247 | # for each iteration, shuffle celltype labels 248 | num_cores = multiprocessing.cpu_count() 249 | random_freq = Parallel(n_jobs=num_cores)(delayed(_compare_neighbor_shuffled)(pos_A, pos_B, labels_A, labels_B, celltypes, radius) for i in iterations) 250 | print(len(random_freq)) 251 | # z score 252 | zs = np.zeros_like(neighbor_diff) 253 | pval = np.zeros_like(neighbor_diff) 254 | 255 | shuffled_mean = np.dstack(random_freq).mean(2) 256 | shuffled_std = np.std(np.dstack(random_freq),2) 257 | for i in range(neighbor_diff.shape[0]): 258 | for j in range(neighbor_diff.shape[1]): 259 | zs[i,j] = (neighbor_diff[i,j] - shuffled_mean[i,j])/shuffled_std[i,j] 260 | pval[i,j] = np.sum(np.abs(neighbor_diff[i,j]) <= np.abs(np.dstack(random_freq)[i,j,:]))/niter 261 | return neighbor_diff, zs, pval, random_freq 262 | 263 | # algorithm: 264 | # - for each cell - cell pair 265 | # - select all neighbors of a cell 266 | # - compute average expression of all genes for neighbors 267 | # - compute average expression for all cells that aren't neighbors 268 | # - find difference 269 | # - shuffle neighbor/not neighbor identities 270 | def identify_nearest_neighbors(X,Y,dist_thresh, min_dist_thresh=0): 271 | """ 272 | Find all the elements in Y that are neighbors of X. 273 | min_dist_thresh is to avoid contamination of stray counts from exactly neighboring cells 274 | """ 275 | if X.shape[0] > 0 and Y.shape[0] > 0: 276 | kdtree = KDTree(Y) 277 | ind, dists = kdtree.query_radius(X, r=dist_thresh, count_only=False,return_distance=True) 278 | ind = np.hstack(ind) 279 | dists = np.hstack(dists) 280 | if len(ind) > 0: 281 | ind = ind[dists>min_dist_thresh] 282 | return np.unique(ind) 283 | else: 284 | return np.array([]) 285 | 286 | def get_nearest_neighbor_dists(X,Y): 287 | kdtree = KDTree(Y) 288 | dist, idx = kdtree.query(X, k=1) 289 | return dist, idx 290 | 291 | 292 | def _compute_neighborhood_expr(pos, expr, labels, celltypes, radius): 293 | expr_diff = np.zeros((len(celltypes), len(celltypes), expr.shape[1])) 294 | for i, c1 in enumerate(celltypes): 295 | curr_X = pos[labels==c1] 296 | print(c1, curr_X.shape[0]) 297 | for j, c2 in enumerate(celltypes): 298 | if i != j: 299 | curr_Y = pos[labels==c2] 300 | neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius).astype(np.int) 301 | #print 302 | #print(curr_X.shape) 303 | #print(neighbors_X.shape) 304 | not_neighbors_X = np.array([i for i in np.arange(curr_X.shape[0]).astype(np.int) if i not in neighbors_X]) 305 | #print(neighbors_X) 306 | #print(not_neighbors_X) 307 | #print(c1, c2, ) 308 | expr_diff[i,j,:] = expr[neighbors_X,:].mean(0)/expr[not_neighbors_X,:].mean(0) #/curr_X.shape[0] 309 | return expr_diff 310 | 311 | def _compute_neighbor_shuffled_expr(pos, expr, labels, celltypes, radius): 312 | # shuffle label 313 | expr_diff = np.zeros((len(celltypes), len(celltypes), expr.shape[1])) 314 | for i, c1 in enumerate(celltypes): 315 | curr_X = pos[labels==c1] 316 | for j, c2 in enumerate(celltypes): 317 | if i != j: 318 | curr_Y = pos[labels==c2] 319 | neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius) 320 | n_neighbors = len(neighbors_X) 321 | #print 322 | #print(curr_X.shape) 323 | #print(neighbors_X.shape) 324 | not_neighbors_X = np.array([i for i in np.arange(curr_X.shape[0]) if i not in neighbors_X]) 325 | n_not_neighbors = len(not_neighbors_X) 326 | 327 | #print(neighbors_X) 328 | #print(not_neighbors_X) 329 | combined_neighbors = np.hstack((neighbors_X, not_neighbors_X)) 330 | combined_neighbors = combined_neighbors[np.random.permutation(len(combined_neighbors))] 331 | neighbors_X = combined_neighbors[:n_neighbors] 332 | not_neighbors_X = combined_neighbors[n_neighbors:] 333 | expr_diff[i,j,:] = expr[neighbors_X,:].mean(0) - expr[not_neighbors_X,:].mean(0) #/curr_X.shape[0] 334 | return expr_diff 335 | 336 | def bootstrap_expr_diff(X,Y,n=1000): 337 | combined_data = np.concatenate((X,Y)) 338 | idx = np.concatenate((np.zeros(len(X)), np.ones(len(Y)))) 339 | obs_diff = np.mean(X) - np.mean(Y) 340 | shuffle_diffs = [] 341 | for i in range(n): 342 | shuffled_idx = idx[np.random.permutation(len(idx))] 343 | curr_X = combined_data[shuffled_idx==0] 344 | curr_Y = combined_data[shuffled_idx==1] 345 | shuffle_diffs.append(np.mean(curr_X)-np.mean(curr_Y)) 346 | return obs_diff, np.sum(obs_diff <= np.array(shuffle_diffs))/n #calc_pval(obs_diff, np.array(shuffle_diffs)) 347 | 348 | def compute_celltype_neighborhood_regression(A, celltype_key, source, celltypes=None,min_radiu=0, obs_keys=None): 349 | if obs_keys is None: 350 | expr = A.X 351 | else: 352 | expr = np.array(A.obs.loc[:,obs_keys].values) 353 | if celltypes is None: 354 | celltypes = list(sorted(A.obs[celltype_key].unique())) 355 | pos = A.obsm['spatial'] 356 | labels = A.obs[celltype_key] 357 | tstats = np.zeros((len(celltypes), expr.shape[1])) 358 | pvals = np.zeros((len(celltypes), expr.shape[1])) 359 | # get all the cells of a certain type 360 | curr_X = pos[labels==source] 361 | curr_expr = expr[labels==source] 362 | interactions = {} 363 | for i, c1 in enumerate(celltypes): 364 | # find all the cells of the neighboring type 365 | curr_Y = pos[labels==c1] 366 | # identify neighbors of target cell type X to cells in cell type Y 367 | dists, idx = get_nearest_neighbor_dists(curr_Y, curr_X) 368 | interactions[c1] = (dists, curr_expr[idx]) 369 | return interactions 370 | 371 | import scipy 372 | def compute_celltype_neighborhood_ttest_single(A, celltype_key, source, celltypes=None, min_radius=15, radius=150, far_radius=250, niter=500, obs_keys=None,use_ttest=False,spatial_jitter=False): 373 | if obs_keys is None: 374 | expr = A.X 375 | else: 376 | expr = np.array(A.obs.loc[:,obs_keys].values) 377 | if celltypes is None: 378 | celltypes = list(sorted(A.obs[celltype_key].unique())) 379 | pos = A.obsm['spatial'] 380 | labels = A.obs[celltype_key] 381 | tstats = np.zeros((len(celltypes), expr.shape[1])) 382 | pvals = np.zeros((len(celltypes), expr.shape[1])) 383 | # get all the cells of a certain type 384 | curr_X = pos[labels==source] 385 | curr_expr = expr[labels==source] 386 | for i, c1 in enumerate(celltypes): 387 | # find all the cells of the neighboring type 388 | curr_Y = pos[labels==c1] 389 | # identify neighbors of target cell type X to cells in cell type Y 390 | neighbors_X = identify_nearest_neighbors(curr_Y, curr_X, dist_thresh=radius, min_dist_thresh=min_radius).astype(np.int) 391 | far_neighbors_X = identify_nearest_neighbors(curr_Y, curr_X, dist_thresh=far_radius, min_dist_thresh=radius).astype(np.int) 392 | not_neighbors_X = np.array([i for i in far_neighbors_X if i not in neighbors_X]) 393 | 394 | #neighbors_X = identify_nearest_neighbors(curr_Y, curr_X, dist_thresh=radius, min_dist_thresh=min_radius).astype(np.int) 395 | #not_neighbors_X = np.array([i for i in np.arange(curr_X.shape[0]).astype(np.int) if i not in neighbors_X]) 396 | # shuffle what is a neighbor vs what isn't a neighbor 397 | if not spatial_jitter: 398 | if len(neighbors_X) > 0 and len(not_neighbors_X) > 0: 399 | mean_nbor = np.mean(curr_expr[neighbors_X]) 400 | mean_not_nbor = np.mean(curr_expr[not_neighbors_X]) 401 | print("X=%s, Y=%s, curr_X=%d, curr_Y=%d, nbor_X=%d, not_nbor_X=%d, mean_nbor_X=%0.04f, mean_not_nbor_X=%0.04f" % (c1, source, curr_X.shape[0], curr_Y.shape[0], len(neighbors_X), len(not_neighbors_X), mean_nbor, mean_not_nbor)) 402 | if use_ttest: 403 | ttest = scipy.stats.ttest_ind(curr_expr[neighbors_X], curr_expr[not_neighbors_X]) 404 | else: 405 | ttest = bootstrap_expr_diff(curr_expr[neighbors_X], curr_expr[not_neighbors_X])# 406 | tstats[i] = ttest[0]#np.log2(np.mean(curr_expr[neighbors_X])/np.mean(curr_expr[not_neighbors_X]))#ttest[0] 407 | pvals[i] = ttest[1]#/curr_X.shape[0] 408 | else: 409 | pvals[i] = 1 410 | tstats[i] = 0 411 | else: 412 | pass 413 | # jitter cells in space and, then compute gene expression distribution 414 | return tstats, pvals 415 | 416 | 417 | def compute_celltype_neighborhood_ttest(A, celltype_key, celltypes=None, min_radius=15, radius=150, far_radius=150, niter=500, obs_keys=None): 418 | if obs_keys is None: 419 | expr = A.X 420 | else: 421 | expr = np.array(A.obs.loc[:,obs_keys].values) 422 | if celltypes is None: 423 | celltypes = list(sorted(A.obs[celltype_key].unique())) 424 | pos = A.obsm['spatial'] 425 | labels = A.obs[celltype_key] 426 | tstats = np.zeros((len(celltypes), len(celltypes), expr.shape[1])) 427 | pvals = np.zeros((len(celltypes), len(celltypes), expr.shape[1])) 428 | for i, c1 in enumerate(celltypes): 429 | curr_X = pos[labels==c1] 430 | print(c1, curr_X.shape[0]) 431 | for j, c2 in enumerate(celltypes): 432 | # if i != j: 433 | curr_Y = pos[labels==c2] 434 | curr_expr = expr[labels==c2,:] 435 | # neighbors_X indexes into Y 436 | neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=radius, min_dist_thresh=min_radius).astype(np.int) 437 | far_neighbors_X = identify_nearest_neighbors(curr_X, curr_Y, dist_thresh=far_radius, min_dist_thresh=radius).astype(np.int) 438 | #print(curr_X.shape[0], curr_Y.shape[0], neighbors_X.max()) 439 | #print 440 | #print(curr_X.shape) 441 | #print(neighbors_X.shape) 442 | not_neighbors_X = np.array([i for i in far_neighbors_X if i not in neighbors_X]) 443 | #print(curr_expr.shape[0]) 444 | if len(neighbors_X) > 0 and len(not_neighbors_X) > 0: 445 | print("X=%s, Y=%s, curr_X=%d, curr_Y=%d, nbor_X=%d, not_nbor_X=%d, max_nbor_X=%d, max_not_nbor_X=%d" % (c1, c2, curr_X.shape[0], curr_Y.shape[0], len(neighbors_X), len(not_neighbors_X), neighbors_X.max(), not_neighbors_X.max())) 446 | #print(neighbors_X) 447 | #print(not_neighbors_X) 448 | #print(c1, c2, ) 449 | for k in range(expr.shape[1]): 450 | ttest = scipy.stats.ttest_ind(curr_expr[neighbors_X,:][:,k], curr_expr[not_neighbors_X,:][:,k]) 451 | tstats[i,j,k] = ttest[0] 452 | pvals[i,j,k] = ttest[1]#/curr_X.shape[0] 453 | else: 454 | pvals[i,j,:] = 1 455 | tstats[i,j,:] = 0 456 | if i == j: 457 | pvals[i,j,:] = 1 458 | tstats[i,j,:] = 0 459 | return tstats, pvals 460 | 461 | def compute_celltype_neighborhood_expr(A, celltype_key, celltypes=None, radius=150, niter=500): 462 | expr = A.X 463 | if celltypes is None: 464 | celltypes = list(sorted(A.obs[celltype_key].unique())) 465 | pos = A.obsm['spatial'] 466 | labels = A.obs[celltype_key] 467 | expr_diff = _compute_neighborhood_expr(pos, expr, labels, celltypes, radius) 468 | iterations = tqdm(range(niter)) 469 | # for each iteration, shuffle celltype labels 470 | num_cores = multiprocessing.cpu_count() 471 | # random_freq is niter x n_celltype x n_celltype x n_gene matrix 472 | #random_freq = np.stack(Parallel(n_jobs=num_cores)(delayed(_compute_neighbor_shuffled_expr)(pos, expr, labels, celltypes, radius) for i in iterations)) 473 | # z score 474 | zs = np.zeros_like(expr_diff) 475 | pval = np.zeros_like(expr_diff) 476 | 477 | #shuffled_mean = random_freq.mean(0) 478 | #shuffled_std = np.std(random_freq,0) 479 | #for i in range(expr_diff.shape[0]): 480 | # for j in range(expr_diff.shape[1]): 481 | # for k in range(expr_diff.shape[2]): 482 | # zs[i,j,k] = (expr_diff[i,j,k] - shuffled_mean[i,j,k])/shuffled_std[i,j,k] 483 | # pval[i,j,k] = scipy.stats.norm.sf(abs(zs[i,j,k]))*2 #calc_pval(expr_diff[i,j,k], random_freq[:,i,j,k])#np.sum(neighbors[i,j] <= np.dstack(random_freq)[i,j,:])/niter 484 | return expr_diff, zs, pval 485 | 486 | def quantify_clust_spatial_enrichment(A,uniq_clusts=None,clust_key='clust_annot', normalize=True): 487 | if uniq_clusts is None: 488 | uniq_clusts = sorted(A.obs[clust_key].unique()) 489 | n_spatial_domains = A.obs.spatial_clust_annots_value.max() + 1 490 | clust_counts = np.zeros((n_spatial_domains, len(uniq_clusts))) 491 | print(clust_counts.shape) 492 | for i in range(n_spatial_domains): 493 | curr_clusts = A[A.obs.spatial_clust_annots_value==i,:].obs[clust_key] 494 | for j,c in enumerate(uniq_clusts): 495 | clust_counts[i,j] = np.sum(curr_clusts==c) 496 | clust_avgs = clust_counts.copy() 497 | for i in range(clust_avgs.shape[0]): 498 | clust_avgs[i,:] /= np.sum(A.obs.spatial_clust_annots_value==i)#clust_avgs[i,:].sum() 499 | return clust_counts, clust_avgs 500 | -------------------------------------------------------------------------------- /python/utils.py: -------------------------------------------------------------------------------- 1 | from statsmodels.stats.multitest import multipletests 2 | import numpy as np 3 | 4 | def fdr_correct(X): 5 | new_X = np.zeros_like(X) 6 | for i in range(X.shape[-1]): 7 | pvals = multipletests(X[i,:],method='fdr_bh')[0] 8 | new_X[i,:] = multipletests(X[i,:],method='fdr_bh')[0] 9 | new_X[:,i] = new_X[i,:] 10 | #X = multipletests(X.flatten(), method='fdr_bh')[0] 11 | return new_X#X.reshape(X_shape) 12 | 13 | from scipy.spatial.distance import pdist 14 | import scipy.cluster.hierarchy as hc 15 | 16 | 17 | def order_values(X, metric='correlation', return_linkage=False): 18 | D = pdist(X,metric) 19 | D[np.isinf(D)] = 0 20 | D[np.isnan(D)] = 0 21 | Z = hc.linkage(D,'complete',optimal_ordering=True) 22 | dn = hc.dendrogram(Z,no_plot=True) 23 | if not return_linkage: 24 | return np.array(dn['leaves']) 25 | else: 26 | return np.array(dn['leaves']), Z 27 | 28 | def relabel_clust(A, orig_clust, new_clust,key="clust_annot"): 29 | clusts = np.array(list(A.obs[key])) 30 | clusts[clusts==orig_clust] = new_clust 31 | A.obs[key] = list(clusts) 32 | return A 33 | 34 | def relabel_anatomy(A, annot_old, annot_new): 35 | A = relabel_clust(A, annot_old, annot_new, key='spatial_clust_annots') 36 | spatial_clust_annots_values = { 37 | 'Pia' : 0, 38 | 'Cortex':1, 39 | 'LatSept':2, 40 | 'CC':3, 41 | 'Striatum':4, 42 | 'Ventricle':5 43 | } 44 | A.obs['spatial_clust_annots_value'] = [spatial_clust_annots_values[i] if i in spatial_clust_annots_values else None for i in A.obs.spatial_clust_annots] 45 | return A 46 | 47 | def relabel_all_clusts(A, clust_mapping,key='clust_annot'): 48 | old_clust_annots = np.array(A.obs[key].copy()) 49 | new_clust_annots = np.array(old_clust_annots.copy()) 50 | for k,v in clust_mapping.items(): 51 | new_clust_annots[old_clust_annots==k] = v 52 | A.obs[key] = list(new_clust_annots.copy()) 53 | return A 54 | 55 | def cleanup_section(A_section,n_neighbors=25): 56 | np.random.seed(31415) 57 | from sklearn.neighbors import KNeighborsClassifier 58 | clf = KNeighborsClassifier(n_jobs=-1,n_neighbors=n_neighbors,weights='uniform').fit(A_section.obsm['spatial'],A_section.obs.spatial_clust_annots_value) 59 | A_section.obs['smoothed_spatial_clust_annot_values'] = list(clf.predict(A_section.obsm['spatial'])) 60 | return A_section --------------------------------------------------------------------------------