[build-system]
# The PEP 621 [project] table is only understood by setuptools >= 61.
requires = ["setuptools>=61.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "my_project"
version = "1.0.0"
description = "My project description"
# PEP 621: authors is a list of tables ({name, email}), not bare strings.
authors = [{ name = "Andrew Moorman" }]
# Table form is accepted by every setuptools release that supports [project].
license = { text = "MIT" }

keywords = []
classifiers = []

requires-python = ">=3.8"
# NOTE: `colorsys` is part of the Python standard library and must not be
# listed as an installable dependency.
dependencies = [
    "scanpy",
    "anndata",
    "matplotlib",
    "pandas",
    "numpy",
    "seaborn",
    "openpyxl",
    "scipy",
    "scikit-learn",
    "scikit-misc",
    "PhenoGraph",
    "magic-impute",
    "python-ternary",
    "tqdm",
    "boto3",
]
"Tuft", 21 | "Enteroendocrine": "Enteroendocrine" 22 | } -------------------------------------------------------------------------------- /src/utils/pl/assets/default.mplstyle: -------------------------------------------------------------------------------- 1 | figure.dpi : 300 2 | font.family : "Arial" 3 | mathtext.default : "default" 4 | xtick.labelsize : 7 5 | ytick.labelsize : 7 6 | axes.labelsize : 9 7 | axes.titlesize : 9 8 | axes.facecolor : "white" 9 | xtick.direction : "out" 10 | ytick.direction : "out" 11 | image.cmap : "viridis" 12 | lines.linewidth : 0.75 13 | axes.spines.right : False 14 | axes.spines.top : False 15 | legend.borderaxespad : 0.25 16 | legend.borderpad : 0.25 17 | legend.fontsize : 7 18 | legend.title_fontsize : 7 19 | legend.handleheight : 0.5 20 | legend.handlelength : 1.0 21 | legend.markerscale : 0.5 22 | patch.linewidth : 0.75 23 | axes.prop_cycle : cycler("color", ["#F04437", "#E81F64", "#903E97", "#65499E", "#4356A5", "#478FCC", "#34A4DD", "#00BCD4", "#009889", "#4BB04F", "#8BC34C", "#CCDA3A", "#FCED3A", "#FFC10E", "#F8991D", "#F1592C", "#7A5649", "#9F9E9E", "#607F8C",]) -------------------------------------------------------------------------------- /notebooks/requirements.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file contains standardized imports and variables used throughout the 3 | analysis notebooks for "Progressive Plasticity in Colorectal Cancer Metastasis" 4 | by Moorman et al (DOI: ) 5 | ''' 6 | # Packages used throughout 7 | import scanpy as sc 8 | import anndata 9 | import numpy as np 10 | import scipy as sp 11 | import pandas as pd 12 | from matplotlib import pyplot as plt 13 | import seaborn as sns 14 | import os 15 | import sys 16 | import json 17 | import tqdm 18 | import pickle 19 | from pathlib import Path 20 | 21 | # Local modules used throughout 22 | module_path = os.path.abspath('../src') 23 | if module_path not in sys.path: 24 | 
def get_binary_association(
    enrichments: pd.DataFrame,
    annotations: pd.DataFrame,
    module: str,
    column: str,
    negative_vals: List,
):
    """Compare a module's enrichment between two annotation groups.

    Only samples with a non-missing enrichment for ``module`` and a
    non-missing annotation in ``column`` are used. Samples whose annotation
    is in ``negative_vals`` form the "negative" group; all remaining samples
    form the "positive" group.

    Returns
    -------
    (statistic, pvalue) of ``scipy.stats.ranksums(positive, negative)``.
    """
    idx = enrichments[module].dropna().index
    idx = idx.intersection(annotations[column].dropna().index)
    mask = annotations.loc[idx, column].isin(negative_vals).to_numpy()
    r, p = sp.stats.ranksums(
        enrichments.loc[idx[~mask], module],
        enrichments.loc[idx[mask], module],
    )
    return r, p


def get_continuous_association(
    enrichments: pd.DataFrame,
    annotations: pd.DataFrame,
    module: str,
    column: str,
):
    """Pearson correlation between a module's enrichment and an annotation.

    Only samples with a non-missing enrichment for ``module`` and a
    non-missing annotation in ``column`` are used.

    Returns
    -------
    (r, pvalue) of ``scipy.stats.pearsonr``.
    """
    idx = enrichments[module].dropna().index
    idx = idx.intersection(annotations[column].dropna().index)
    # pearsonr needs both variables, restricted to the shared samples; the
    # annotation is cast to float so object-typed numeric columns work too.
    r, p = sp.stats.pearsonr(
        enrichments.loc[idx, module].astype(float),
        annotations.loc[idx, column].astype(float),
    )
    return r, p


def get_associations(
    enrichments: pd.DataFrame,
    annotations: pd.DataFrame,
    modules: List,
    binary_columns: dict,
    continuous_columns: List[str],
):
    """Tabulate association statistics for every module/annotation pair.

    Parameters
    ----------
    enrichments : per-sample module enrichments (one column per module).
    annotations : per-sample annotations, indexed like ``enrichments``.
    modules : module columns of ``enrichments`` to test.
    binary_columns : maps an annotation column to the value (or list of
        values) that defines its "negative" group.
    continuous_columns : annotation columns tested by Pearson correlation.

    Returns
    -------
    (associations, pvals) : two DataFrames indexed by module, with one
    column per annotation (binary columns first, then continuous ones).
    """
    associations = pd.DataFrame(
        index=modules,
        columns=list(binary_columns.keys()) + continuous_columns,
        dtype=float,
    )
    pvals = associations.copy()

    for module in modules:
        for column in continuous_columns:
            r, p = get_continuous_association(
                enrichments, annotations, module, column,
            )
            associations.loc[module, column] = r
            pvals.loc[module, column] = p

        for column, negative in binary_columns.items():
            # A single negative value is shorthand for a one-element list.
            if not isinstance(negative, list):
                negative = [negative]
            r, p = get_binary_association(
                enrichments, annotations, module, column, negative,
            )
            associations.loc[module, column] = r
            pvals.loc[module, column] = p

    return associations, pvals
def get_feature_overlaps(
    adata: "sc.AnnData",
    feature_columns: List[str],
    qtl_threshold: float = 0.75,
) -> pd.DataFrame:
    """Pairwise co-occurrence frequencies of "high" features across cells.

    A cell is "high" for a feature when its value in ``adata.obs`` is at or
    above that feature's ``qtl_threshold`` quantile.

    Returns
    -------
    Square DataFrame indexed and labelled by ``feature_columns``. Entry
    ``(f1, f2)`` is the number of cells high for both features divided by
    the largest count in row ``f1``.
    """
    # Binarize each feature at its own quantile threshold
    thresholds = adata.obs[feature_columns].quantile(qtl_threshold)
    counts = adata.obs[feature_columns].ge(thresholds)
    # Count co-occurrence of features in cells with a single matrix product
    # (indicator^T . indicator) instead of growing a frame cell by cell.
    indicator = counts.astype(float)
    overlaps = indicator.T.dot(indicator)
    # Frequencies of co-occurrence of features in cells
    overlaps = overlaps.divide(overlaps.max(0), 0)
    return overlaps
def test_ratios(
    df: pd.DataFrame,
    groupby: List[str],
    n_iters: int = 1000,
):
    """Permutation-style test of Metastasis/Primary ratios of ``High``.

    The mean of ``df['High']`` is computed per ``groupby`` group, and the
    ratio of the 'Metastasis' to the 'Primary' entries (second ``groupby``
    level) is compared against ratios obtained after shuffling ``High``
    within each ``'Patient'``.

    Parameters
    ----------
    df : must contain the columns 'High', 'Patient' and those in ``groupby``.
    groupby : grouping columns; level 1 must hold 'Metastasis' / 'Primary'.
    n_iters : number of shuffles used to build the null distribution.

    Returns
    -------
    (statistic, pvalue) of a one-sided (``alternative='greater'``) rank-sum
    test between log10 observed ratios and log10 null ratios.
    """
    def site_ratios(frame):
        # Select 'High' before averaging so that non-numeric columns in
        # ``frame`` cannot make the group-wise mean fail.
        fracs = frame.groupby(groupby)['High'].mean()
        return fracs.xs('Metastasis', level=1) / fracs.xs('Primary', level=1)

    # Reference fractions
    ratios = site_ratios(df)
    mask = ~ratios.isin([np.inf, 0])
    # Cap infinite / zero ratios at the finite extremes so log10 is defined
    ratios = ratios.replace(np.inf, ratios[mask].max())
    ratios = ratios.replace(0, ratios[mask].min())

    # Null fractions from random shuffling
    null_ratios = pd.DataFrame(
        index=ratios.index,
        columns=range(n_iters),
        dtype=float,
    )

    def shuffle(group):
        # Permute the values while keeping the original index labels
        idx = group.index
        group = group.sample(frac=1)
        group.index = idx
        return group

    for i in tqdm.tqdm(null_ratios.columns):
        null_df = df.copy()
        gb = null_df.groupby('Patient', group_keys=False)
        null_df['High'] = gb['High'].apply(shuffle)
        null_ratios[i] = site_ratios(null_df)

    # Remove infinities for stats calculation
    mask = ~null_ratios.isin([np.inf, 0])
    null_ratios = null_ratios.replace(np.inf, null_ratios[mask].max())
    null_ratios = null_ratios.replace(0, null_ratios[mask].min())
    r, p = sp.stats.ranksums(
        np.log10(ratios.values.flatten()),
        np.log10(null_ratios.values.flatten()),
        alternative='greater',
    )

    return r, p


# The name starts with "test_"; tell pytest this is library code, not a test.
test_ratios.__test__ = False
def quartile_to_level(data, quantile):
    """Return data levels corresponding to quantile cuts of mass."""
    isoprop = np.asarray(quantile)
    values = np.ravel(data)
    # Accumulate mass from the largest value downwards
    sorted_values = np.sort(values)[::-1]
    normalized_values = np.cumsum(sorted_values) / values.sum()
    idx = np.searchsorted(normalized_values, 1 - isoprop)
    # mode="clip" guards against idx == len(values) from float round-off
    levels = np.take(sorted_values, idx, mode="clip")
    return levels


def get_kde(
    data,
    grid_size=500,
    min_q=0.0,
    **kwargs
):
    """Evaluate a 1-D Gaussian KDE of ``data`` and trim its low-density tail.

    The density is evaluated on ``grid_size`` evenly spaced positions between
    ``data.min()`` and ``data.max()``. Positions whose density is below the
    level returned by ``quartile_to_level(estimate, min_q)`` are dropped.
    Extra keyword arguments are passed to ``scipy.stats.gaussian_kde``.

    Returns
    -------
    (positions, densities) for the retained grid points, where the
    densities have the cut-off level subtracted (so the minimum is 0).
    """
    kernel = gaussian_kde(data, **kwargs)
    positions = np.linspace(data.min(), data.max(), grid_size)
    estimate = kernel(positions)
    level = quartile_to_level(estimate, min_q)
    mask = estimate >= level
    return positions[mask], estimate[mask] - level


def plot_violin(
    data, x, y,
    palette,
    ax,
    bw_adjust=1,
    grid_size=100,
    h=2,
    norm=True,
    min_q=0.01,
    lw=0.5,
):
    """Draw one symmetric KDE violin per group of ``data[x]`` on ``ax``.

    Parameters
    ----------
    data : DataFrame with a grouping column ``x`` and a value column ``y``.
    palette : mapping from group name to face color.
    ax : matplotlib axes to draw on.
    bw_adjust : multiplier applied to the ``n ** (-1/5)`` bandwidth factor.
    grid_size : number of grid points at which each KDE is evaluated.
    h : horizontal spacing between neighbouring violins.
    norm : if True, min-max scale each violin's density to [0, 1].
    min_q : mass quantile below which the density tail is trimmed.
    lw : line width of the white violin outline.
    """
    n_groups = data[x].nunique()
    # Violins are laid out right-to-left, starting at 0, every ``h`` units
    x_ticks = np.linspace(0, -h * (n_groups - 1), n_groups)

    for (name, group), xpos in zip(data.sort_values(x).groupby(x), x_ticks):

        # Violin Plot
        n = group.shape[0]
        bw_method = n ** (-1 / 5) * bw_adjust
        # Forward ``grid_size`` so the parameter actually takes effect
        pos, est = get_kde(
            group[y], grid_size=grid_size, min_q=min_q, bw_method=bw_method,
        )
        if norm:
            est = MinMaxScaler().fit_transform(est.reshape(-1, 1)).flatten()
        ax.fill_betweenx(
            pos, xpos - est, xpos + est,
            facecolor=palette[name], alpha=0.75, lw=0,
            zorder=1,
        )
        ax.plot(xpos + est, pos, lw=lw, color="w", zorder=1)
        ax.plot(xpos - est, pos, lw=lw, color="w", zorder=1)
        # IQR Plot
        ymin, ymid, ymax = group[y].quantile([0.25, 0.5, 0.75])
        ax.plot([xpos, xpos], [ymin, ymax], lw=0.5, color="black", alpha=0.75)
        ax.scatter([xpos], [ymid], s=2, color="black", alpha=0.9)

    ax.set_xticks(x_ticks)


def plot_heatmap(
    data: pd.DataFrame,
    k: int = 35,
):
    """Placeholder: not implemented yet; currently does nothing."""
    pass
-------------------------------------------------------------------------------- /notebooks/download_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 298, 6 | "id": "46d41c2d-35de-4e93-ad78-d2e372c35880", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2024-10-29T17:20:47.654917Z", 10 | "iopub.status.busy": "2024-10-29T17:20:47.654596Z", 11 | "iopub.status.idle": "2024-10-29T17:20:47.681942Z", 12 | "shell.execute_reply": "2024-10-29T17:20:47.681451Z", 13 | "shell.execute_reply.started": "2024-10-29T17:20:47.654895Z" 14 | }, 15 | "tags": [] 16 | }, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "The autoreload extension is already loaded. To reload it, use:\n", 23 | " %reload_ext autoreload\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext autoreload\n", 29 | "%autoreload 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 299, 35 | "id": "4e588df2-1dba-41b5-b9af-38fd7359cfa5", 36 | "metadata": { 37 | "execution": { 38 | "iopub.execute_input": "2024-10-29T17:20:47.833464Z", 39 | "iopub.status.busy": "2024-10-29T17:20:47.833074Z", 40 | "iopub.status.idle": "2024-10-29T17:20:47.857477Z", 41 | "shell.execute_reply": "2024-10-29T17:20:47.857016Z", 42 | "shell.execute_reply.started": "2024-10-29T17:20:47.833445Z" 43 | }, 44 | "tags": [] 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "from requirements import *\n", 49 | "import boto3" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "9497daca-f0eb-4f70-ab62-04299e457023", 55 | "metadata": {}, 56 | "source": [ 57 | "## Download Source Data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 297, 63 | "id": "554a293d-e26f-49dd-88f2-ffe73bd1081e", 64 | "metadata": { 65 | "execution": { 66 | "iopub.execute_input": "2024-10-29T17:20:17.716064Z", 67 | "iopub.status.busy": 
"2024-10-29T17:20:17.715748Z", 68 | "iopub.status.idle": "2024-10-29T17:20:18.646661Z", 69 | "shell.execute_reply": "2024-10-29T17:20:18.646169Z", 70 | "shell.execute_reply.started": "2024-10-29T17:20:17.716043Z" 71 | } 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stderr", 76 | "output_type": "stream", 77 | "text": [ 78 | "28it [00:00, 31.47it/s]\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "s3 = boto3.client('s3')\n", 84 | "bucket_name = 'dp-lab-data-public'\n", 85 | "prefix = 'progressive-plasticity-crc-metastasis'\n", 86 | "data_bucket = boto3.resource('s3').Bucket(bucket_name)\n", 87 | "\n", 88 | "for obj in tqdm.tqdm(data_bucket.objects.filter(Prefix=prefix)):\n", 89 | " src_path = obj.key\n", 90 | " dst_path = obj.key.replace(prefix, str(data_dir))\n", 91 | " s3.download_file(bucket_name, src_path, dst_path)" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3 (ipykernel)", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.8.1" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 5 116 | } 117 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these 
files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | data* 165 | media* 166 | notebooks/_*.ipynb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Progressive Plasticity in Colorectal Cancer Metastasis 2 | 3 | This repository accompanies the study *"Progressive Plasticity During Colorectal Cancer Metastasis"* published in *Nature* (available [here](https://www.nature.com/articles/s41586-024-08150-0)), aiming to reproduce key figures and analyses from the paper. 4 | 5 | ### Project Overview 6 | The study investigates the progressive plasticity of cellular states during the metastasis of colorectal cancer. Using single-cell RNA sequencing data from primary tumors and metastases, we uncover dynamic cellular state transitions, highlighting specific lineage and state shifts associated with metastatic progression. 7 | 8 | ### Repository Structure 9 | 10 | - **data/**: Contains required data files: 11 | - **h5ads/**: AnnData files, such as `Tumor.h5ad`, `Epithelial.h5ad`, etc. 12 | - **tables/**: Supplementary tables in `.xlsx` format. 13 | - **other/**: Additional annotation and enrichment files. 14 | 15 | - **notebooks/**: Jupyter notebooks for data download, preprocessing, and reproducing figures: 16 | - `download_data.ipynb`: Guide for downloading data directly from AWS S3. 17 | - `Figure_X.ipynb`: Notebooks for reproducing figures in the paper. 18 | 19 | - **src/**: Source code modules organized by functionality, including utilities for data preprocessing (`pp`), plotting (`pl`), and label transfer or state analysis (`tl`). 
20 | 21 | ### Data Access 22 | The processed H5AD data for reproducing this analysis is hosted on AWS S3 at: 23 | ``` 24 | s3://dp-lab-data-public/progressive-plasticity-crc-metastasis 25 | ``` 26 | You can download H5ADs directly using the following links: 27 | ``` 28 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/All.h5ad 29 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/Epithelial.h5ad 30 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG146_Organoids.h5ad 31 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG146_Tumor.h5ad 32 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG146_Tumor_Mapping_Reference.h5ad 33 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG146_shPROX1_Knockdown.h5ad 34 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG150_Tumor.h5ad 35 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG182_Tumor.h5ad 36 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/KG183_Tumor.h5ad 37 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/Non-Tumor_Epithelial.h5ad 38 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/Tumor.h5ad 39 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/Untreated_Epithelial.h5ad 40 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/Wang_etal_Tumor.h5ad 41 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/h5ads/Wang_etal_s1231_Tumor.h5ad 
42 | ``` 43 | Additionally, all data is available via a single download link in .tar.gz format: 44 | ``` 45 | https://dp-lab-data-public.s3.us-east-1.amazonaws.com/progressive-plasticity-crc-metastasis/data.tar.gz 46 | ``` 47 | 48 | ### Installation 49 | 50 | To install the required dependencies, ensure you have Python 3.8 or higher and use the `pyproject.toml`: 51 | ```bash 52 | pip install . 53 | ``` 54 | 55 | For specific package versions, review the `pyproject.toml`. 56 | 57 | ### Quickstart 58 | 59 | 1. **Download Data**: Start with `download_data.ipynb` to load data files from AWS. 60 | 2. **Run Notebooks**: Open notebooks in the `notebooks` directory to generate individual figures. 61 | 62 | --- 63 | 64 | This README provides a basic overview of the repository's contents and is designed to support the reproducibility of key analyses and findings from the paper. 65 | -------------------------------------------------------------------------------- /src/utils/pl/plot_ternary.py: -------------------------------------------------------------------------------- 1 | import scipy as sp 2 | import numpy as np 3 | import pandas as pd 4 | import ternary 5 | from ternary.helpers import ( 6 | project_point, 7 | planar_to_coordinates, 8 | simplex_iterator 9 | ) 10 | from matplotlib.colors import cnames, to_rgb 11 | from matplotlib import collections, lines, pyplot as plt 12 | import seaborn as sns 13 | import colorsys 14 | from typing import List 15 | 16 | 17 | def ternary_kde( 18 | points, 19 | tax, 20 | n_levels=10, 21 | cmap="viridis", 22 | outline=False, 23 | bw_method='scott', 24 | ): 25 | # Project to 2D simplex 26 | simplex_points = np.apply_along_axis(project_point, 0, points) 27 | 28 | # Fit density model to 2D projection 29 | kde = sp.stats.gaussian_kde(simplex_points, bw_method=bw_method) 30 | 31 | # Evaluate density on triangular grid 32 | n = 100 33 | tri_grid = np.array(list(simplex_iterator(n))).T 34 | simplex_grid = np.apply_along_axis(project_point, 0,
def ternary_kde(
    points,
    tax,
    n_levels=10,
    cmap="viridis",
    outline=False,
    bw_method='scott',
):
    """Draw filled KDE contours of ternary ``points`` on the axes of ``tax``.

    ``points`` is projected column by column (``project_point`` is applied
    along axis 0), so each column is one ternary point. Contour levels span
    the 5th to 95th percentile of the density evaluated on a triangular grid.
    If ``outline`` is True, thin translucent contour lines are added on top.
    """
    # Project to 2D simplex
    simplex_points = np.apply_along_axis(project_point, 0, points)

    # Fit density model to 2D projection
    kde = sp.stats.gaussian_kde(simplex_points, bw_method=bw_method)

    # Evaluate density on triangular grid
    n = 100
    tri_grid = np.array(list(simplex_iterator(n))).T
    simplex_grid = np.apply_along_axis(project_point, 0, tri_grid)/n
    densities = kde(simplex_grid)
    levels = np.linspace(
        np.percentile(densities, 5),
        np.percentile(densities, 95),
        n_levels,
    )
    tax.ax.tricontourf(
        simplex_grid[0], simplex_grid[1],
        densities, levels=levels, cmap=cmap,
        extend="both"
    )
    if outline:
        tax.ax.tricontour(
            simplex_grid[0], simplex_grid[1],
            densities, levels=levels,
            colors=[[0, 0, 0, 0.25]] * 5,
            linewidths=[0.25] * 5,
        )


def format_tax(
    tax,
    labels,
    fontsize,
    tick_width,
    boundary_width,
    pad,
):
    """Apply grid lines, ticks and boundary styling to a ternary axes.

    NOTE(review): ``labels`` and ``pad`` are accepted but not used by the
    current implementation.
    """
    tax.gridlines(
        color="gray", lw=tick_width, linestyle='--', alpha=0.5, multiple=0.5
    )
    tax.ticks(
        axis='lbr', lw=tick_width, fontsize=fontsize, tick_formats='%.1f',
        offset=0.05, multiple=1.0,
    )
    tax.boundary(linewidth=boundary_width, zorder=4)
    # Hide the underlying rectangular matplotlib axes
    tax.clear_matplotlib_ticks()
    tax.get_axes().axis('off')


def plot_ternary(
    points,
    cmap,
    titles,
    ax,
    n_pts: int = 1000,
):
    """Plot a ternary KDE of ``points`` on ``ax`` with a scatter overlay.

    ``points`` has one row per point (it is transposed before the KDE).
    ``n_pts`` rows are drawn with ``np.random.choice`` (i.e. with
    replacement) and overlaid as small gray dots.
    """
    _, tax = ternary.figure(ax=ax)
    ternary_kde(
        points.T,
        tax,
        n_levels=9,
        cmap=cmap,
        outline=True,
        bw_method=0.3,
    )
    idx = np.random.choice(points.shape[0], n_pts)
    tax.scatter(points[idx], s=0.5, lw=0, color="gray", alpha=0.33)
    format_tax(
        tax=tax,
        labels=titles,
        fontsize=8,
        tick_width=0.5,
        boundary_width=1,
        pad=2,
    )


def lighten_color(color, amount: float):
    """Return ``color`` as an RGB tuple with its lightness rescaled.

    The new HLS lightness is ``1 - amount * (1 - l)``: ``amount=1`` leaves
    the color unchanged, smaller positive values move it toward white.
    ``color`` may be a matplotlib color name or anything ``to_rgb`` accepts.
    """
    # Lookup color in matplotlib named colors; anything that is not a known
    # name (hex strings, RGB(A) tuples/arrays) is passed through unchanged.
    if isinstance(color, str):
        color = cnames.get(color, color)
    h, l, s = colorsys.rgb_to_hls(*to_rgb(color))
    color = colorsys.hls_to_rgb(h, 1 - amount * (1 - l), s)
    return color


def patch_violinplot(ax):
    """Restyle the violins drawn on ``ax`` in place.

    The axes' children are scanned in consecutive groups of four; a group
    counts as a violin when it is one ``PolyCollection`` followed by three
    ``Line2D`` artists (body plus three quartile lines). Every second violin
    is lightened, the median line is drawn solid and the outer quartile
    lines are made invisible.
    """
    children = ax.get_children()
    i = 0
    for n in range(0, len(children), 4):
        art = children[n: n+4]
        is_violin = len(art) == 4
        is_violin &= isinstance(art[0], collections.PolyCollection)
        is_violin &= all([isinstance(a, lines.Line2D) for a in art[1:]])
        if is_violin:
            violin, q1, q2, q3 = art
            c = violin.get_facecolor()
            # Lighten every other violin (the second half of each split pair)
            if i % 2 == 1:
                c = lighten_color(c, 0.5)
            violin.set_facecolor(c)
            violin.set_edgecolor(c)
            violin.set_linewidth(0.1)
            q2.set_linestyle('solid')
            q2.set_linewidth(0.5)
            q2.set_solid_capstyle('butt')
            for q in [q1, q3]:
                q.set_alpha(0)
            i += 1


def plot_kde(
    data: pd.DataFrame,
    row: str,
    row_order: List,
    **kwargs,
):
    """Draw one row of split violins per value of ``data[row]``.

    ``kwargs`` are forwarded to ``seaborn.violinplot`` and must include
    ``x`` (it is also used as the x-axis label). The fixed styles below
    take precedence over any same-named keys in ``kwargs``.
    """
    # Update with default styles
    styles = dict(
        cut=0,
        common_norm=False,
        density_norm='width',
        width=0.75,
        gap=0,
        split=True,
        fill=True,
        linewidth=0.5,
        legend=False,
        inner='quart',
    )
    kwargs.update(styles)

    # Plot KDEs
    grid = sns.FacetGrid(
        data,
        row=row,
        row_order=row_order,
        height=1.5,
        aspect=1.5,
        sharex=False,
        gridspec_kws=dict(hspace=0.5),
        despine=False,
    )
    fig = grid.map_dataframe(sns.violinplot, **kwargs)

    # Formatting
    for ax in fig.axes.flat:
        patch_violinplot(ax)
        ax.set_title('')
        ax.set_xlim(0, 1)
        ax.yaxis.set_visible(False)
        ax.spines[['top', 'left', 'right']].set_visible(False)
        ax.set_xlabel(kwargs['x'])
def plot_overlaps(
    overlaps: pd.DataFrame,
    feature_colors: dict,
    labels: Optional[List[str]] = None,
):
    """Plot a square overlap matrix as a heatmap with color annotations.

    Parameters
    ----------
    overlaps : square DataFrame of values in [0, 1].
    feature_colors : maps each column of ``overlaps`` to a color.
    labels : tick labels; defaults to the column names of ``overlaps``.

    Returns
    -------
    The created matplotlib figure.
    """

    # Plot overlaps
    fig, ax = plt.subplots(1, 1, figsize=(2, 2))
    sns.heatmap(
        overlaps,
        cmap='Purples',
        cbar_kws={'shrink': 0.2, 'aspect': 6},
        vmin=0,
        vmax=1,
        ax=ax,
    )
    n = overlaps.shape[0]
    # White separators between rows
    ax.hlines(range(n+1), -1, n+0.1, color='w', lw=3, clip_on=False)

    # Adjust tick labels
    ax.tick_params(which='major', length=0, labelsize=6.5)
    ax.tick_params(axis='x', pad=7)
    ax.tick_params(axis='y', pad=8)
    if labels is None:
        labels = overlaps.columns.tolist()
    ax.set_yticklabels(labels, rotation=0)
    ax.set_xticks(np.arange(len(labels))+0.75)  # offset to align with center
    ax.set_xticklabels(labels, rotation=45, ha='right', va='top')

    # Add row and column color annotations
    for i, f in enumerate(overlaps):
        color = feature_colors[f]
        kwargs = dict(
            fill=True, facecolor=color, lw=1.5, edgecolor='w',
            clip_on=False, zorder=0
        )
        p = 0.075
        row_color = plt.Rectangle(
            (-p, i), p*0.75, 1, transform=ax.get_yaxis_transform(), **kwargs
        )
        ax.add_patch(row_color)
        p = 0.055
        col_color = plt.Rectangle(
            (i, -p), 1, p, transform=ax.get_xaxis_transform(), **kwargs
        )
        ax.add_patch(col_color)

    return fig


def plot_ratios(
    ratios: pd.Series,
    cmap,
    ax,
    row_colors: Optional[pd.Series] = None,
):
    """Plot ratios as horizontal bars anchored at 1 on a log-scaled x axis.

    Parameters
    ----------
    ratios : non-negative ratios; 0 and inf are drawn slightly beyond the
        most extreme finite log-ratio.
    cmap : callable mapping a value in [0, 1] to a color.
    ax : matplotlib axes to draw on.
    row_colors : optional colors indexed like ``ratios``; when given, a
        colored box is drawn to the left of each bar.
    """
    # Cap infinity and negative infinity values
    is_zero = ratios.eq(0)
    is_inf = ratios.eq(np.inf)
    mask = ~(is_zero | is_inf)
    log_ratios = ratios.copy()
    log_ratios.loc[mask] = np.log(ratios.loc[mask])
    offset = log_ratios[mask].abs().max() * 0.05
    vmin = log_ratios[mask].min()
    vmax = log_ratios[mask].max()
    # Select by the ORIGINAL ratios: a value-based replace of 0 would also
    # hit legitimate log-ratios of 0 (i.e. ratio == 1).
    log_ratios.loc[is_zero] = vmin - offset
    log_ratios.loc[is_inf] = vmax + offset

    # Setup colors
    colors = log_ratios.copy()
    colors[colors > 0] += vmax / 10
    colors[colors < 0] -= vmin / 10
    absmax = colors.abs().max()
    # Rescale symmetrically around 0 into [0, 1] for the colormap
    colors = (colors + absmax) / (absmax * 2)
    colors = colors.apply(cmap)

    # Plot log-ratios
    x = np.power(np.e, log_ratios) - 1
    n = log_ratios.shape[0]
    y = np.arange(n)
    left = np.repeat(1, n)
    ax.barh(y, x, height=0.75, lw=0, left=left, color=colors)

    # Formatting
    ax.spines[['top', 'left', 'right']].set_visible(False)
    ax.set_ylim(-1.25, n+0.25)
    ax.set_yticks(y, log_ratios.index)
    ax.tick_params(axis='y', length=0, labelsize=6, pad=6)
    ax.tick_params(axis='x', direction='in', labelsize=6)
    ax.set_xscale('log')

    # Add row color annotations (skipped when no colors were supplied)
    if row_colors is None:
        return
    for i, color in enumerate(row_colors.loc[log_ratios.index]):
        kwargs = dict(
            fill=True, facecolor=color, lw=1.5, edgecolor='w',
            clip_on=False, zorder=0
        )
        row_color = plt.Rectangle(
            (-0.08, i-0.5),
            0.06, 1, transform=ax.get_yaxis_transform(), **kwargs
        )
        ax.add_patch(row_color)


def plot_fractions(
    fractions: pd.DataFrame,
    cmap: dict,
    ax,
    row_colors: pd.Series,
):
    """Plot the rows of ``fractions`` as stacked horizontal bars.

    Parameters
    ----------
    fractions : one row per bar, one column per stacked segment.
    cmap : maps each column of ``fractions`` to a color.
    ax : matplotlib axes to draw on.
    row_colors : colors indexed like ``fractions``, drawn as a box to the
        left of each bar.
    """
    # Plot cumulative fractions
    lefts = np.zeros(fractions.shape[0])
    yvals = np.arange(fractions.shape[0])
    for col in fractions.columns:
        ax.barh(
            y=yvals,
            width=fractions[col],
            height=0.75,
            left=lefts,
            color=cmap[col],
            lw=0.,
            edgecolor='w',
        )
        lefts += fractions[col]
        # Mark the right edge of any 'Osteoblast' segment with a black line
        if 'Osteoblast' in col:
            styles = dict(lw=0.5, color='k', clip_on=False)
            for x, y in zip(lefts, yvals):
                ax.plot([x]*2, [y-0.5, y+0.5], **styles)

    # Formatting
    ax.spines[['top', 'left', 'right']].set_visible(False)
    ax.set_ylim(-1.25, fractions.shape[0] + 0.25)
    ax.set_yticks(yvals, fractions.index)
    ax.tick_params(axis='y', length=0, labelsize=6, pad=6)
    ax.tick_params(axis='x', direction='in', labelsize=6)

    # Add row color annotations
    for i, color in enumerate(row_colors.loc[fractions.index]):
        kwargs = dict(
            fill=True, facecolor=color, lw=1.5, edgecolor='w',
            clip_on=False, zorder=0
        )
        row_color = plt.Rectangle(
            (-0.08, i-0.5),
            0.06, 1, transform=ax.get_yaxis_transform(), **kwargs
        )
        ax.add_patch(row_color)
GM27029 GM27162 GM2800 GM28041 GM28053 GM28068 GM28187 GM28285 GM28347 GM28557 GM28707 GM28873 GM28874 GM28875 GM28935 GM29243 GM29336 GM29394 GM29562 GM29666 GM2A GM340 GM34086 GM3468 GM35315 GM3550 GM3604 GM3636 GM36445 GM37170 GM37233 GM37387 GM37494 GM3839 GM4070 GM42031 GM4208 GM42372 GM42418 GM42595 GM42715 GM42743 GM42903 GM42921 GM42997 GM43042 GM43291 GM43464 GM4349 GM43597 GM43672 GM43698 GM43703 GM43796 GM43848 GM44066 GM44067 GM44148 GM44174 GM44175 GM44238 GM44728 GM44751 GM45028 GM45035 GM45036 GM45069 GM45123 GM45184 GM45351 GM45353 GM45509 GM45620 GM4631 GM4707 GM4724 GM4799 GM4924 GM4950 GM4951 GM5093 GM5111 GM5113 GM5127 GM5141 GM5145 GM5148 GM5160 GM5218 GM5239 GM5424 GM5426 GM5449 GM5547 GM5580 GM561 GM5617 GM5786 GM5914 GM6133 GM614 GM6169 GM6225 GM6297 GM6525 GM6563 GM6576 GM6710 GM6712 GM6768 GM6793 GM6904 GM7102 GM7160 GM7334 GM7535 GM7879 GM7967 GM8013 GM8186 GM8225 GM8369 GM8444 GM8773 GM8797 GM8817 GM8953 GM8973 GM8994 GM9242 GM9493 GM9774 GM9776 GM9797 GM9803 GM9843 GM9844 GM9949 GM9958 MRPL1 MRPL10 MRPL11 MRPL12 MRPL13 MRPL14 MRPL15 MRPL16 MRPL17 MRPL18 MRPL19 MRPL2 MRPL20 MRPL21 MRPL22 MRPL23 MRPL24 MRPL27 MRPL28 MRPL3 MRPL30 MRPL32 MRPL33 MRPL34 MRPL35 MRPL36 MRPL37 MRPL38 MRPL39 MRPL4 MRPL40 MRPL41 MRPL42 MRPL43 MRPL44 MRPL45 MRPL46 MRPL47 MRPL48 MRPL49 MRPL50 MRPL51 MRPL52 MRPL53 MRPL54 MRPL55 MRPL57 MRPL58 MRPL9 MRPS10 MRPS11 MRPS12 MRPS14 MRPS15 MRPS16 MRPS17 MRPS18A MRPS18B MRPS18C MRPS2 MRPS21 MRPS22 MRPS23 MRPS24 MRPS25 MRPS26 MRPS27 MRPS28 MRPS30 MRPS31 MRPS33 MRPS34 MRPS35 MRPS36 MRPS5 MRPS6 MRPS7 MRPS9 PRPS1 PRPS1L3 PRPS2 PRPSAP1 PRPSAP2 RPL10 RPL10-PS3 RPL10A RPL11 RPL12 RPL13 RPL13-PS3 RPL13A RPL13A-PS1 RPL14 RPL15 RPL17 RPL18 RPL18A RPL19 RPL21 RPL21-PS4 RPL22 RPL22L1 RPL23 RPL23A RPL23A-PS3 RPL24 RPL26 RPL27 RPL27-PS3 RPL27A RPL28 RPL29 RPL3 RPL30 RPL31 RPL32 RPL34 RPL35 RPL35A RPL36 RPL36-PS3 RPL36A RPL36AL RPL37 RPL37A RPL38 RPL39 RPL39L RPL4 RPL41 RPL5 RPL6 RPL6L RPL7 RPL7A RPL7A-PS3 RPL7A-PS5 RPL7L1 RPL8 RPL9 
RPL9-PS1 RPL9-PS6 RPLP0 RPLP1 RPLP2 RPS10 RPS11 RPS12 RPS12-PS3 RPS13 RPS14 RPS15 RPS15A RPS16 RPS17 RPS18 RPS19 RPS19BP1 RPS2 RPS2-PS6 RPS20 RPS21 RPS23 RPS24 RPS25 RPS26 RPS27 RPS27A RPS27L RPS27RT RPS28 RPS29 RPS3 RPS3A1 RPS4X RPS5 RPS6 FAU RPL10L RPL26L1 RPL3L RPS3A RPS4Y1 RPS7 RPS8 RPS9 RPSA RSL24D1 RSL24D1P11 UBA52 RPS4Y2 RPS10P5 RPL39P5 RPLP0P6 GM10062 GM10306 GM1043 GM10521 GM10638 GM10642 GM10655 GM10840 GM11261 GM11454 GM11464 GM11674 GM12057 GM12088 GM12107 GM12184 GM12474 GM13166 GM13218 GM14085 GM14327 GM15340 GM15675 GM15706 GM15853 GM16084 GM16196 GM17305 GM17324 GM17349 GM17455 GM1818 GM19331 GM20219 GM20498 GM20506 GM20721 GM20878 GM2093 GM21781 GM26511 GM26521 GM26566 GM26583 GM26601 GM26620 GM26732 GM26764 GM26766 GM26798 GM26808 GM26835 GM26860 GM26870 GM26885 GM26887 GM26964 GM27019 GM27042 GM28048 GM28556 GM29609 GM29642 GM29650 GM31108 GM31363 GM33994 GM35000 GM3512 GM35339 GM3696 GM3716 GM37276 GM37294 GM3739 GM37933 GM3854 GM42428 GM42555 GM43062 GM43254 GM43332 GM43661 GM43740 GM4419 GM45055 GM45155 GM45250 GM45423 GM45599 GM4787 GM5134 GM527 GM5432 GM5533 GM5608 GM568 GM6213 GM7008 GM8251 GM9008 GM9484 GM9833 GM9889 GM9903 GM9923 RP23-128C4.4 RP23-162P10.8 RP23-181A8.2 RP23-181A8.4 RP23-181A8.7 RP23-186O3.13 RP23-220F20.2 RP23-240G3.3 RP23-242K3.5 RP23-265B15.1 RP23-292G1.2 RP23-312B17.2 RP23-353K11.3 RP23-381H23.1 RP23-395M5.6 RP23-396C4.2 RP23-440L7.5 RP23-72D18.1 RP24-122D14.3 RP24-175C20.18 RP24-282C4.3 RP24-286J21.7 RP24-318O6.3 RP24-496O17.7 2 | RIBOSOMAL_GENES_HUMAN MRPL1 MRPL10 MRPL11 MRPL12 MRPL13 MRPL14 MRPL15 MRPL16 MRPL17 MRPL18 MRPL19 MRPL2 MRPL20 MRPL21 MRPL22 MRPL23 MRPL24 MRPL27 MRPL28 MRPL3 MRPL30 MRPL32 MRPL33 MRPL34 MRPL35 MRPL36 MRPL37 MRPL38 MRPL39 MRPL4 MRPL40 MRPL41 MRPL42 MRPL43 MRPL44 MRPL45 MRPL46 MRPL47 MRPL48 MRPL49 MRPL50 MRPL51 MRPL52 MRPL53 MRPL54 MRPL55 MRPL57 MRPL58 MRPL9 MRPS10 MRPS11 MRPS12 MRPS14 MRPS15 MRPS16 MRPS17 MRPS18A MRPS18B MRPS18C MRPS2 MRPS21 MRPS22 MRPS23 MRPS24 MRPS25 MRPS26 MRPS27 MRPS28 
MRPS30 MRPS31 MRPS33 MRPS34 MRPS35 MRPS36 MRPS5 MRPS6 MRPS7 MRPS9 PRPS1 PRPS1L3 PRPS2 PRPSAP1 PRPSAP2 RPL10 RPL10-PS3 RPL10A RPL11 RPL12 RPL13 RPL13-PS3 RPL13A RPL13A-PS1 RPL14 RPL15 RPL17 RPL18 RPL18A RPL19 RPL21 RPL21-PS4 RPL22 RPL22L1 RPL23 RPL23A RPL23A-PS3 RPL24 RPL26 RPL27 RPL27-PS3 RPL27A RPL28 RPL29 RPL3 RPL30 RPL31 RPL32 RPL34 RPL35 RPL35A RPL36 RPL36-PS3 RPL36A RPL36AL RPL37 RPL37A RPL38 RPL39 RPL39L RPL4 RPL41 RPL5 RPL6 RPL6L RPL7 RPL7A RPL7A-PS3 RPL7A-PS5 RPL7L1 RPL8 RPL9 RPL9-PS1 RPL9-PS6 RPLP0 RPLP1 RPLP2 RPS10 RPS11 RPS12 RPS12-PS3 RPS13 RPS14 RPS15 RPS15A RPS16 RPS17 RPS18 RPS19 RPS19BP1 RPS2 RPS2-PS6 RPS20 RPS21 RPS23 RPS24 RPS25 RPS26 RPS27 RPS27A RPS27L RPS27RT RPS28 RPS29 RPS3 RPS3A1 RPS4X RPS5 RPS6 FAU RPL10L RPL26L1 RPL3L RPS3A RPS4Y1 RPS7 RPS8 RPS9 RPSA RSL24D1 RSL24D1P11 UBA52 RPS4Y2 RPS10P5 RPL39P5 RPLP0P6 RP23-128C4.4 RP23-162P10.8 RP23-181A8.2 RP23-181A8.4 RP23-181A8.7 RP23-186O3.13 RP23-220F20.2 RP23-240G3.3 RP23-242K3.5 RP23-265B15.1 RP23-292G1.2 RP23-312B17.2 RP23-353K11.3 RP23-381H23.1 RP23-395M5.6 RP23-396C4.2 RP23-440L7.5 RP23-72D18.1 RP24-122D14.3 RP24-175C20.18 RP24-282C4.3 RP24-286J21.7 RP24-318O6.3 RP24-496O17.7 -------------------------------------------------------------------------------- /src/utils/pp/preprocess.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import numpy as np 3 | from typing import List 4 | import sys 5 | import os 6 | import pandas as pd 7 | import warnings 8 | from tqdm.autonotebook import tqdm 9 | import logging 10 | 11 | 12 | # Warnings to ignore throughout 13 | sc.settings.verbosity = 0 14 | warnings.simplefilter("ignore", UserWarning) 15 | logging.raiseExceptions = False 16 | 17 | 18 | # Class to suppress output from inside functions (e.g., PhenoGraph) 19 | class HiddenPrints: 20 | def __enter__(self): 21 | self._original_stdout = sys.stdout 22 | sys.stdout = open(os.devnull, "w") 23 | 24 | def __exit__(self, exc_type, exc_val, exc_tb): 
def preprocess(
    adata: sc.AnnData,
    show_progress: bool = True,
    **kwargs,
):
    """Run the preprocessing pipeline on ``adata`` in place and return it.

    Steps: median library-size normalization, log1p, HVG selection, PCA
    (truncated by explained variance), and optionally PhenoGraph
    clustering, nearest neighbors, UMAP and MAGIC imputation.

    Parameters
    ----------
    adata
        Annotated data; counts are read from ``adata.layers['raw']``.
    show_progress
        Show a tqdm progress bar while running.
    **kwargs
        Overrides for any key of ``defaults`` below (e.g. ``cluster=False``,
        ``hvgs__n=3000``, ``umap__random_state=0``).

    Returns
    -------
    The same ``adata`` object, with layers ``median``, ``log`` and (when
    imputation succeeds) ``imputed`` added, ``var['highly_variable']``
    set, and ``obsm['X_pca']`` truncated.
    """
    # Set all arguments below with defaults
    defaults = {
        'hvgs': None,
        'hvgs__n': 2000,
        'hvgs__whitelist': None,
        'pca__var_explained': 0.67,
        'pca__max_comps': 1000,
        'cluster': True,
        'cluster__k': 30,
        'neighbors': True,
        'neighbors__k': 30,
        'umap': True,
        'umap__random_state': 1,
        'impute': True,
        'impute__k': 5,
        'impute__t': 3,
    }
    pp_args = {**defaults, **kwargs}

    # Four mandatory steps plus one per enabled optional step
    optional_steps = ('cluster', 'neighbors', 'umap', 'impute')
    n_steps = 4 + sum(bool(pp_args[key]) for key in optional_steps)

    # A disabled bar makes every progress call a no-op, and the context
    # manager guarantees the bar is closed even if a step raises.
    with tqdm(total=n_steps, disable=not show_progress) as pbar:
        # Median library-size normalization
        pbar.set_description('Normalizing')
        adata.layers['median'] = adata.layers['raw'].copy()
        sc.pp.normalize_total(adata, layer='median')
        pbar.update(1)

        # Log-transformation (natural log, pseudocount of 1)
        pbar.set_description('Log-transforming')
        adata.layers['log'] = adata.layers['median'].copy()
        sc.pp.log1p(adata, layer='log')
        pbar.update(1)

        # Set HVGs
        pbar.set_description('Setting HVGs')
        hvgs = pp_args['hvgs']
        if hvgs is None:
            hvgs = get_hvgs(
                adata, pp_args['hvgs__n'], pp_args['hvgs__whitelist']
            )
        adata.var['highly_variable'] = adata.var.index.isin(hvgs)
        n_hvgs = adata.var['highly_variable'].sum()
        pbar.update(1)

        # PCA
        pbar.set_description('Running PCA')
        adata.X = adata.layers['log']
        n_comps = min(pp_args['pca__max_comps'], *adata.shape, n_hvgs) - 2
        sc.tl.pca(adata, n_comps=n_comps, use_highly_variable=True)
        X_pca_full = adata.obsm['X_pca'].copy()
        cum_vars = adata.uns['pca']['variance_ratio'].cumsum()
        # NOTE(review): argmin is a 0-based position, so slicing with it
        # keeps one component fewer than the count whose cumulative
        # variance is closest to the target. Kept as-is to leave existing
        # results unchanged; only the degenerate zero-component case is
        # guarded against.
        closest = np.argmin(np.abs(cum_vars - pp_args['pca__var_explained']))
        n_comps = max(int(closest), 1)
        adata.obsm['X_pca'] = X_pca_full[:, :n_comps]
        pbar.update(1)

        # Cluster with PhenoGraph
        if pp_args['cluster']:
            pbar.set_description('Clustering with PhenoGraph')
            with HiddenPrints():
                communities, _, _ = sc.external.tl.phenograph(
                    pd.DataFrame(adata.obsm['X_pca']),
                    k=pp_args['cluster__k'],
                    nn_method='brute',
                    njobs=-1,
                )
            adata.obs['PhenoGraph_clusters'] = pd.Categorical(communities)
            pbar.update(1)

        # Nearest neighbors in PC space
        if pp_args['neighbors']:
            pbar.set_description('Finding nearest neighbors')
            sc.pp.neighbors(
                adata,
                use_rep='X_pca',
                n_neighbors=pp_args['neighbors__k']
            )
            pbar.update(1)

        # UMAP
        if pp_args['umap']:
            pbar.set_description('Calculating UMAP')
            # Default to PAGA with clusters if clustering, else spectral
            init_pos = 'spectral'
            if pp_args['cluster']:
                sc.tl.paga(adata, groups='PhenoGraph_clusters')
                sc.pl.paga(adata, plot=False)
                init_pos = 'paga'
            sc.tl.umap(
                adata,
                random_state=pp_args['umap__random_state'],
                init_pos=init_pos,
            )
            pbar.update(1)

        # Impute expression with MAGIC
        if pp_args['impute']:
            pbar.set_description('Imputing expression with MAGIC')
            adata.X = adata.layers["log"]
            adata_magic = None
            magic_error = None
            with HiddenPrints():
                try:
                    adata_magic = sc.external.pp.magic(
                        adata,
                        copy=True,
                        n_pca=n_comps,
                        knn=pp_args['impute__k'],
                        t=pp_args['impute__t'],
                        verbose=False,
                    )
                except ValueError as err:
                    magic_error = err
            if adata_magic is None:
                # Imputation stays best-effort, but the failure is reported
                # instead of surfacing as an unrelated UnboundLocalError.
                # RuntimeWarning is used because UserWarning is silenced
                # at module level.
                warnings.warn(
                    f"MAGIC imputation failed ({magic_error}); "
                    "layer 'imputed' was not written.",
                    RuntimeWarning,
                )
            else:
                adata.layers['imputed'] = adata_magic.X
            pbar.update(1)

    return adata


def get_hvgs(
    adata: sc.AnnData,
    n_hvgs: int,
    whitelist: List[str] = None,
):
    """Return highly variable gene names, minus blacklisted genes.

    Genes are ranked with scanpy's ``seurat_v3`` flavor on the ``raw``
    layer. Ribosomal genes (from ``assets/ribosomal_genes.gmt``) and genes
    whose name starts with ``MT-`` are removed from the ranking; the
    remaining top genes are then united with ``whitelist``.

    Parameters
    ----------
    adata
        Annotated data with a ``raw`` layer.
    n_hvgs
        Number of top-ranked genes requested from scanpy.
    whitelist
        Gene names to always include; ``None`` means no extra genes.

    Returns
    -------
    list of str
        Sorted union of the selected genes and the whitelist.
    """
    # Index.union rejects None, so normalize the "no whitelist" case
    if whitelist is None:
        whitelist = []

    hvgs = sc.pp.highly_variable_genes(
        adata,
        layer='raw',
        n_top_genes=n_hvgs,
        n_bins=1000,
        flavor='seurat_v3',
        inplace=False,
    )
    hvgs['rank'] = hvgs['highly_variable_rank']

    # Remove genes in blacklist from HVGs
    cwd = os.path.dirname(os.path.realpath(__file__))
    ribosomal = pd.read_csv(
        f'{cwd}/assets/ribosomal_genes.gmt',
        sep='\t',
        index_col=0,
        header=None
    ).loc['RIBOSOMAL_GENES_HUMAN'].iloc[1:].dropna().tolist()
    mitochondrial = hvgs.index[hvgs.index.str.startswith('MT-')].tolist()
    blacklist = set(ribosomal).union(mitochondrial)
    blacklist = hvgs.index.intersection(blacklist)
    for gene in blacklist:
        rank = hvgs.loc[gene, 'rank']
        if not np.isnan(rank):
            hvgs.loc[gene, 'rank'] = np.nan  # remove this gene from rank
            below_rank = hvgs['rank'].gt(rank) & ~hvgs['rank'].isna()
            hvgs.loc[below_rank, 'rank'] -= 1  # move everything else up 1

    # Final list is union of HVGs with whitelist
    hvgs = hvgs.dropna().sort_values('rank').iloc[:n_hvgs]
    hvgs = hvgs.index.union(whitelist).tolist()

    return hvgs
def change_aspect(ax):
    """Make the axes square in data units around the current view center.

    Both axis limits are widened to the larger of the two current spans,
    keeping each axis centered where it was, and the aspect ratio is then
    fixed to 'equal'.
    """
    x_lo, x_hi = ax.get_xlim()
    y_lo, y_hi = ax.get_ylim()

    x_span = x_hi - x_lo
    y_span = y_hi - y_lo
    x_mid = (x_span / 2) + x_lo
    y_mid = (y_span / 2) + y_lo

    # Half of the larger span becomes the half-width of both axes
    half = max(x_span, y_span) / 2

    ax.set_xlim(x_mid - half, x_mid + half)
    ax.set_ylim(y_mid - half, y_mid + half)
    ax.set_aspect('equal', adjustable='box')
def plot_embedding(
    adata: sc.AnnData,
    features: Union[str, List[str]],
    basis: str = 'X_umap',
    palette: str = "tab20",
    cmap: str = "plasma",
    titles: Union[str, List[str]] = None,
    ncols: int = 5,
    dim: int = 5,
    layer: str = "imputed",
    dim_label = "UMAP",
    ax = None,
    fs: int = 12,
    lw: float = 1.5,
    arrow_len: float = 0.2,
    draw_arrows=False,
    rasterized=False,
    **kwargs,
):
    """Plot one embedding panel per feature and return the figure.

    Parameters
    ----------
    adata
        Annotated data passed to ``sc.pl.embedding``.
    features
        A single feature or a list of features to color by.
    basis, palette, cmap, layer, **kwargs
        Forwarded to ``sc.pl.embedding``.
    titles
        Panel titles, matched to ``features`` by position; a missing or
        empty title falls back to the feature name.
    ncols
        Number of panel columns when a new figure is created.
    dim
        Side length of each panel (figure size is ``dim*ncols`` by
        ``dim*nrows``).
    ax
        Existing axes to draw on; only valid with exactly one feature and
        one title. When ``None`` a new figure is created.
    dim_label, fs, lw, arrow_len, draw_arrows
        Forwarded to ``format_ax``.
    rasterized
        Rasterize the first child artist of each panel.

    Returns
    -------
    The figure containing the panels.
    """
    def iterify(x):
        # Wrap scalars (including strings) in a list; pass iterables through
        return x if isinstance(x, Iterable) and not isinstance(x, str) else [x]

    features = list(iterify(features))
    titles = list(iterify(titles))
    if ax is None:
        # Rows needed to fit all features into `ncols` columns
        nrows = max(1, math.ceil(len(features) / ncols))
        fig, axes = plt.subplots(
            nrows, ncols,
            figsize=(dim*ncols, dim*nrows),
            squeeze=False,  # always a 2D array, even for a 1x1 grid
        )
        fig.tight_layout(pad=dim*0.75)
        axes = axes.flat
    else:
        assert (len(features) == 1) and (len(titles) == 1), (
            "ax can only be given with exactly one feature and one title"
        )
        fig = ax.get_figure()
        axes = [ax]
    # zip_longest pads with None: surplus axes are hidden below
    for panel, feature, title in itertools.zip_longest(axes, features, titles):
        if panel is None:
            break  # more titles than panels; nothing left to draw on
        if not title:
            title = feature
        if feature:
            sc.pl.embedding(
                adata,
                basis=basis,
                color=feature,
                ax=panel,
                show=False,
                palette=palette, cmap=cmap,
                layer=layer,
                **kwargs,
            )
            if rasterized:
                panel.get_children()[0].set_rasterized(True)
            format_ax(
                fig, panel, style="umap",
                title=title, dim_label=dim_label, fs=fs,
                arrow_len=arrow_len, lw=lw, draw_arrows=draw_arrows,
            )
        else:
            panel.set_visible(False)
    return fig
# From Harmony
def mnn_ka_distances(mnn, n_neighbors):
    """Return each row's distance to its ka-th mutual nearest neighbor.

    ``ka`` is ``n_neighbors // 3`` (at least 1). Only the non-zero entries
    of ``mnn`` count as neighbors.

    Parameters
    ----------
    mnn
        Sparse (rows x columns) matrix of mutual-nearest-neighbor distances.
    n_neighbors
        Number of neighbors used to build ``mnn``.

    Returns
    -------
    numpy.ndarray
        Object array of length ``mnn.shape[0]``; entries are ``None`` for
        rows with fewer than ``ka`` neighbors.
    """
    # With n_neighbors < 3 the original ka of 0 indexed position -1, i.e.
    # silently returned the *largest* distance; use the nearest neighbor.
    ka = max(int(n_neighbors / 3), 1)
    ka_dists = np.repeat(None, mnn.shape[0])

    rows, _, dists = sp.sparse.find(mnn)
    if rows.size == 0:
        return ka_dists

    # Sort once by (row, distance) rather than rescanning all entries for
    # every row.
    order = np.lexsort((dists, rows))
    rows_sorted = rows[order]
    dists_sorted = dists[order]
    uniq, starts, counts = np.unique(
        rows_sorted, return_index=True, return_counts=True
    )
    enough = counts >= ka
    ka_dists[uniq[enough]] = dists_sorted[starts[enough] + ka - 1]
    return ka_dists
# Calculate augmented affinity matrix between labeled and unlabeled datasets
def get_augmented_affinity_matrix(
    adata: sc.AnnData,
    label_column: str,
    knn: int = 30,
    mnn: int = 60,
):
    """Build one symmetric affinity matrix over unlabeled + labeled cells.

    Cells with NaN in ``label_column`` are treated as unlabeled. The result
    is ordered with all unlabeled cells first, followed by the labeled
    cells. It is the sum of the two within-group kNN affinities (Euclidean,
    ``knn`` neighbors) and the between-group MNN affinity (cosine, ``mnn``
    neighbors) together with its transpose.
    """
    # Assume unlabeled samples have NaN in label column
    labeled_mask = ~adata.obs[label_column].isna()

    # Within-group affinities: unlabeled first, then labeled
    unl_aff, unl_sf = get_affinity_matrix(
        adata[~labeled_mask], n_neighbors=knn, metric='euclidean',
    )
    lbl_aff, lbl_sf = get_affinity_matrix(
        adata[labeled_mask], n_neighbors=knn, metric='euclidean',
    )

    # Between-group affinity on the AnnData reordered to [unlabeled, labeled]
    all_sf = pd.concat([unl_sf, lbl_sf])
    adata = adata[all_sf.index].copy()
    cross_aff = get_mnn_affinity_function(
        unl_sf.index,
        lbl_sf.index,
        all_sf,
        adata,
        n_neighbors=mnn,
        metric='cosine',
    )

    # Embed each within-group matrix into the full (cells x cells) shape
    n_cells = adata.obs.shape[0]
    full_shape = [n_cells, n_cells]

    rows, cols, vals = sp.sparse.find(unl_aff)
    unl_full = sp.sparse.csr_matrix((vals, (rows, cols)), shape=full_shape)

    # Labeled cells sit after the unlabeled block
    shift = unl_aff.shape[0]
    rows, cols, vals = sp.sparse.find(lbl_aff)
    lbl_full = sp.sparse.csr_matrix(
        (vals, (rows + shift, cols + shift)), shape=full_shape,
    )

    # Combine all affinity matrices into symmetric matrix
    return unl_full + lbl_full + cross_aff + cross_aff.T
def get_gam_trend(
    x: np.ndarray,
    y: np.ndarray,
    n_splines: int = 8, #8
    spline_order: int = 4, #3
    x_res: int = 500,
    weights: Optional[np.ndarray] = None,
):
    """Fit a one-term spline GAM of ``y`` on ``x`` and evaluate it on a grid.

    The grid has ``x_res`` evenly spaced points between ``x.min()`` and
    ``x.max()``.

    Returns
    -------
    (trend, std)
        Two Series indexed by the grid: the predicted values and a
        pointwise spread derived from the residuals of the fit.
    """
    grid = np.linspace(x.min(), x.max(), x_res)

    term = spline_term(0, n_splines=n_splines, spline_order=spline_order)
    model = LinearGAM(term).fit(x, y, weights)
    grid_pred = model.predict(grid)
    fitted = model.predict(x)

    # Residual scale with n - 2 degrees of freedom
    n_obs = len(x)
    x_mean = np.mean(x)
    resid_ss = ((y - fitted) ** 2).sum()
    sigma = np.sqrt(resid_ss / (n_obs - 2))

    # Spread widens with distance of the grid point from the mean of x
    leverage = (grid - x_mean) ** 2 / ((x - x_mean) ** 2).sum()
    std = np.sqrt(1 + 1 / n_obs + leverage) * sigma / 2

    trend = pd.Series(grid_pred, index=grid)
    spread = pd.Series(std, index=grid)
    return trend, spread
def plot_pseudotime_trends(
    adata: sc.AnnData,
    ps_column: str,
    feature_columns: Union[str, List[str]],
    ax,
    feature_colors: Optional[dict] = None,
    clip: Tuple[float] = (0.2, 1.0),
    labels: Optional[List[str]] = None,
    **kwargs,
):
    """Draw feature trends along pseudotime as rows of a heatmap.

    Each feature in ``adata.obs`` is smoothed against ``ps_column`` with
    ``get_gam_trend`` and rendered as one row, shaded from white to the
    feature's color.

    Parameters
    ----------
    adata
        Annotated data; pseudotime and features are read from ``obs``.
    ps_column
        Name of the pseudotime column.
    feature_columns
        One column name or a list of column names, one heatmap row each.
    ax
        Axes to draw on.
    feature_colors
        Mapping from feature to color. Defaults to the matplotlib color
        cycle, repeated if there are more features than colors.
    clip
        Lower and upper quantiles of each trend used to normalize shading.
    labels
        Currently unused; kept for interface compatibility.
    **kwargs
        Forwarded to ``get_gam_trend`` (e.g. ``x_res``, ``n_splines``).
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    else:
        feature_columns = list(feature_columns)

    # Colors for each trend/row
    if feature_colors is None:
        cycle_colors = [
            props['color'] for props in plt.rcParams['axes.prop_cycle']
        ]
        feature_colors = {
            f: cycle_colors[i % len(cycle_colors)]
            for i, f in enumerate(feature_columns)
        }

    # Calculate feature trends and reformat them as rows in a heatmap
    heatmap_trends = []
    x = adata.obs[ps_column]
    for f in feature_columns:
        y = adata.obs[f]
        trend, _ = get_gam_trend(x, y, **kwargs)
        norm = plt.Normalize(*trend.quantile(clip))
        trend_cmap = lsc.from_list("", [[1,1,1], feature_colors[f]])
        row = trend.apply(norm).apply(trend_cmap).values.tolist()
        heatmap_trends.append(row)

    # Number of heatmap columns: get_gam_trend evaluates x_res points
    # (500 unless overridden through kwargs)
    n_points = kwargs.get('x_res', 500)

    # Plotting
    ax.imshow(heatmap_trends, aspect="auto", interpolation='none',)
    borders = np.arange(-0.5, len(feature_columns)+0.5, 1)

    # Formatting
    ax.hlines(
        borders, -1, n_points + 1, color="w", lw=1, zorder=2, clip_on=False
    )
    ax.set_xticks([0, n_points], [0., 1.0])
    ax.set_xlim(-5, n_points + 5)
    ax.set_ylim(len(feature_columns)-0.5+0.2, -0.5-0.2)
    # Hide y-axis
    ax.yaxis.set_visible(False)
    ax.spines['left'].set_visible(False)

    # Row colors
    patch_style = dict(
        fill=True, lw=1.5, edgecolor='w',
        clip_on=False, zorder=0
    )
    for i, f in enumerate(feature_columns):
        row_color = plt.Rectangle(
            (-0.05, i-0.5), 0.04, 1,
            transform=ax.get_yaxis_transform(),
            facecolor=feature_colors[f], **patch_style
        )
        ax.add_patch(row_color)
lw=0, alpha=0.25, rasterized=True, clip_on=False) 209 | ax.scatter(x, y, c=c, **styles) 210 | 211 | # Formatting 212 | offset = (x.max() - x.min()) / 100 213 | ax.set_xlim(x.min() - offset, x.max() + offset) 214 | ax.set_xticks([x.min(), x.max()], [0., 1.]) 215 | ax.set_ylim(-0.67, 1.67) 216 | ax.yaxis.set_visible(False) 217 | ax.spines['left'].set_visible(False) 218 | 219 | return fig 220 | -------------------------------------------------------------------------------- /notebooks/Figure_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 107, 6 | "id": "75109198-3a92-414d-ad34-9fbb62693963", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2024-10-29T00:37:21.735490Z", 10 | "iopub.status.busy": "2024-10-29T00:37:21.735145Z", 11 | "iopub.status.idle": "2024-10-29T00:37:21.764685Z", 12 | "shell.execute_reply": "2024-10-29T00:37:21.764180Z", 13 | "shell.execute_reply.started": "2024-10-29T00:37:21.735468Z" 14 | }, 15 | "tags": [] 16 | }, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "The autoreload extension is already loaded. 
To reload it, use:\n", 23 | " %reload_ext autoreload\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext autoreload\n", 29 | "%autoreload 2\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 166, 36 | "id": "6714aa7e-74b6-433b-a37d-fc44d3fb577e", 37 | "metadata": { 38 | "execution": { 39 | "iopub.execute_input": "2024-10-29T01:17:56.471464Z", 40 | "iopub.status.busy": "2024-10-29T01:17:56.471133Z", 41 | "iopub.status.idle": "2024-10-29T01:17:56.502029Z", 42 | "shell.execute_reply": "2024-10-29T01:17:56.501547Z", 43 | "shell.execute_reply.started": "2024-10-29T01:17:56.471444Z" 44 | }, 45 | "scrolled": true, 46 | "tags": [] 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "from utils.pp.preprocess import preprocess\n", 51 | "from utils.pl.plot_embedding import plot_embedding\n", 52 | "from utils.tl.transfer_labels import transfer_labels\n", 53 | "from utils.pl.plot_ternary import plot_kde\n", 54 | "from requirements import *" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "5478eb79-6299-430e-bd33-908e5c60a512", 60 | "metadata": {}, 61 | "source": [ 62 | "## Import AnnDatas" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 95, 68 | "id": "a2aed545-d549-411a-9642-0b79be235082", 69 | "metadata": { 70 | "execution": { 71 | "iopub.execute_input": "2024-10-29T00:32:54.789647Z", 72 | "iopub.status.busy": "2024-10-29T00:32:54.789348Z", 73 | "iopub.status.idle": "2024-10-29T00:32:56.852337Z", 74 | "shell.execute_reply": "2024-10-29T00:32:56.851764Z", 75 | "shell.execute_reply.started": "2024-10-29T00:32:54.789627Z" 76 | }, 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# Read in KG146 tumor data for mapping\n", 82 | "filepath = f'{data_dir}/h5ads/KG146_Tumor_Mapping_Reference.h5ad'\n", 83 | "ad_146 = sc.read_h5ad(filepath, backed=False)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 96, 89 | "id": 
"ebd3cead-acac-4de4-a58c-7517face7c56", 90 | "metadata": { 91 | "execution": { 92 | "iopub.execute_input": "2024-10-29T00:32:56.853614Z", 93 | "iopub.status.busy": "2024-10-29T00:32:56.853340Z", 94 | "iopub.status.idle": "2024-10-29T00:32:58.799914Z", 95 | "shell.execute_reply": "2024-10-29T00:32:58.799351Z", 96 | "shell.execute_reply.started": "2024-10-29T00:32:56.853593Z" 97 | }, 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "# Read in KG146 organoid data\n", 103 | "filepath = f'{data_dir}/h5ads/KG146_shPROX1_Knockdown.h5ad'\n", 104 | "ad_146_kd = sc.read_h5ad(filepath, backed=False)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "5b87a459-dc5b-4810-9949-e79f192150ca", 110 | "metadata": { 111 | "execution": { 112 | "iopub.execute_input": "2024-10-28T23:50:13.027675Z", 113 | "iopub.status.busy": "2024-10-28T23:50:13.027119Z", 114 | "iopub.status.idle": "2024-10-28T23:50:13.054085Z", 115 | "shell.execute_reply": "2024-10-28T23:50:13.053599Z", 116 | "shell.execute_reply.started": "2024-10-28T23:50:13.027654Z" 117 | } 118 | }, 119 | "source": [ 120 | "## Figure 4a. 
Label mapping from patient data to shPROX1 knockdown organoids" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "5ff70ddc-3aa5-4c29-aaaf-0f17868dc57a", 126 | "metadata": {}, 127 | "source": [ 128 | "### Get genes to use as feature space for mapping" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 98, 134 | "id": "231386eb-d3f9-4e29-830c-d608f575fb8d", 135 | "metadata": { 136 | "execution": { 137 | "iopub.execute_input": "2024-10-29T00:33:08.976110Z", 138 | "iopub.status.busy": "2024-10-29T00:33:08.975794Z", 139 | "iopub.status.idle": "2024-10-29T00:33:12.624017Z", 140 | "shell.execute_reply": "2024-10-29T00:33:12.623294Z", 141 | "shell.execute_reply.started": "2024-10-29T00:33:08.976091Z" 142 | } 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "# Get DEGs per cell state\n", 147 | "sc.tl.rank_genes_groups(\n", 148 | " ad_146,\n", 149 | " \"cell_state\",\n", 150 | " layer='log',\n", 151 | " use_raw=False,\n", 152 | " method='wilcoxon',\n", 153 | ")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 99, 159 | "id": "7c7aba72-a060-4236-822b-848635747b1c", 160 | "metadata": { 161 | "execution": { 162 | "iopub.execute_input": "2024-10-29T00:33:12.625646Z", 163 | "iopub.status.busy": "2024-10-29T00:33:12.625332Z", 164 | "iopub.status.idle": "2024-10-29T00:33:12.689923Z", 165 | "shell.execute_reply": "2024-10-29T00:33:12.689434Z", 166 | "shell.execute_reply.started": "2024-10-29T00:33:12.625626Z" 167 | } 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Get features to use for cell state mapping\n", 172 | "n_genes = 200\n", 173 | "degs = ad_146.uns['rank_genes_groups']\n", 174 | "cell_states = dict()\n", 175 | "keys = list(degs['names'].dtype.fields.keys())\n", 176 | "for key in keys:\n", 177 | " genes = degs['names'][key]\n", 178 | " mask = degs['logfoldchanges'][key] > 1\n", 179 | " mask &= degs['pvals_adj'][key] < 0.001\n", 180 | " mask &= 
pd.Index(genes).isin(ad_146_kd.var.index)\n", 181 | " cell_states[key] = genes[mask][:n_genes].tolist()\n", 182 | "label_features = [v for vals in cell_states.values() for v in vals]" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "860ac356-df9c-415d-b804-272ba560cbfd", 188 | "metadata": {}, 189 | "source": [ 190 | "### Perform label transfer on each organoid and condition" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 100, 196 | "id": "e4d97bbd-4a7b-4776-a4e4-2a13e0660343", 197 | "metadata": { 198 | "execution": { 199 | "iopub.execute_input": "2024-10-29T00:33:12.690865Z", 200 | "iopub.status.busy": "2024-10-29T00:33:12.690575Z", 201 | "iopub.status.idle": "2024-10-29T00:33:12.716547Z", 202 | "shell.execute_reply": "2024-10-29T00:33:12.716077Z", 203 | "shell.execute_reply.started": "2024-10-29T00:33:12.690846Z" 204 | } 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "# Columns to retain after co-embedding\n", 209 | "keep_cols = [\n", 210 | " 'sample_id',\n", 211 | " 'label_group',\n", 212 | " 'cell_state',\n", 213 | " 'original_line',\n", 214 | " 'genotype',\n", 215 | "]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 101, 221 | "id": "e1425a56-4a52-4599-9e69-accde114bfcc", 222 | "metadata": { 223 | "execution": { 224 | "iopub.execute_input": "2024-10-29T00:33:13.674743Z", 225 | "iopub.status.busy": "2024-10-29T00:33:13.674394Z", 226 | "iopub.status.idle": "2024-10-29T00:35:18.230619Z", 227 | "shell.execute_reply": "2024-10-29T00:35:18.230018Z", 228 | "shell.execute_reply.started": "2024-10-29T00:33:13.674725Z" 229 | } 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "100%|██████████| 4/4 [02:04<00:00, 31.13s/it]\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "from tqdm import tqdm\n", 242 | "\n", 243 | "# Reference and unlabeled group\n", 244 | "ad_146.obs['label_group'] = 'reference'\n", 245 | 
"ad_146_kd.obs['label_group'] = 'unlabeled'\n", 246 | "labeled_ad = dict()\n", 247 | "\n", 248 | "# Label transfer is performed separately for each sample\n", 249 | "for name, group in tqdm(ad_146_kd.obs.groupby(['original_line', 'genotype'])):\n", 250 | "\n", 251 | " # Coembed each sample with tumor AnnData (labeled reference)\n", 252 | " ad_cmb = anndata.concat(\n", 253 | " adatas=[ad_146, ad_146_kd[group.index]],\n", 254 | " join=\"outer\",\n", 255 | " label=\"label_group\",\n", 256 | " keys=[\"reference\", \"unlabeled\"],\n", 257 | " index_unique=None,\n", 258 | " )\n", 259 | " ad_cmb.obsm.clear()\n", 260 | " ad_cmb.obs = ad_cmb.obs[keep_cols]\n", 261 | "\n", 262 | " # Re-process data after co-embedding\n", 263 | " kwargs = dict(\n", 264 | " hvgs=label_features, show_progress=False,\n", 265 | " cluster=False, neighbors=False, umap=False, impute=False,\n", 266 | " )\n", 267 | " ad_cmb = preprocess(ad_cmb, **kwargs)\n", 268 | "\n", 269 | " # Transfer labels from tumor to organoid\n", 270 | " transfer_labels(ad_cmb, label_column='cell_state', knn=30, mnn=60)\n", 271 | " labeled_ad[name] = ad_cmb" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 102, 277 | "id": "4b4ddd9a-b279-43ef-ba36-efde27d99455", 278 | "metadata": { 279 | "execution": { 280 | "iopub.execute_input": "2024-10-29T00:35:18.232073Z", 281 | "iopub.status.busy": "2024-10-29T00:35:18.231792Z", 282 | "iopub.status.idle": "2024-10-29T00:35:18.295231Z", 283 | "shell.execute_reply": "2024-10-29T00:35:18.294741Z", 284 | "shell.execute_reply.started": "2024-10-29T00:35:18.232054Z" 285 | } 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "# Transfer organoid labels back to original AnnData\n", 290 | "columns = [f'P({state})' for state in cell_states.keys()] # probabilities\n", 291 | "columns += ['cell_state'] # label\n", 292 | "\n", 293 | "for ad in labeled_ad.values():\n", 294 | " ixn = ad_146_kd.obs.index.intersection(ad.obs.index)\n", 295 | " ad_146_kd.obs.loc[ixn, 
columns] = ad.obs[columns]" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 103, 301 | "id": "e260fd56-4956-4cdb-b402-8f29c2ab9556", 302 | "metadata": { 303 | "execution": { 304 | "iopub.execute_input": "2024-10-29T00:35:18.296167Z", 305 | "iopub.status.busy": "2024-10-29T00:35:18.295989Z", 306 | "iopub.status.idle": "2024-10-29T00:35:18.328673Z", 307 | "shell.execute_reply": "2024-10-29T00:35:18.328203Z", 308 | "shell.execute_reply.started": "2024-10-29T00:35:18.296149Z" 309 | } 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "# Aggregate probabilities over 3 axes/groups of cell states\n", 314 | "label_map = {\n", 315 | " 'ISC/TA-like': [\"Proliferative\", \"ISC\"],\n", 316 | " 'Diff. Intestine-like': [\"Absorptive\", \"Secretory\"],\n", 317 | " 'Non-Canonical': [\"Injury Repair\", \"Neuroendocrine\", \"Fetal\", \"Squamous\"],\n", 318 | "}\n", 319 | "for key, vals in label_map.items():\n", 320 | " prob = ad_146_kd.obs[[f\"P({v})\" for v in vals]].sum(axis=1)\n", 321 | " ad_146_kd.obs[f\"P({key})\"] = prob" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "id": "01ad518a-ad09-4042-bb3c-99f3578da12b", 327 | "metadata": {}, 328 | "source": [ 329 | "### Visualize results as ternary plots" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 170, 335 | "id": "43e8649c-06e7-4b57-a7b5-569fc5ea2702", 336 | "metadata": { 337 | "execution": { 338 | "iopub.execute_input": "2024-10-29T01:19:56.401370Z", 339 | "iopub.status.busy": "2024-10-29T01:19:56.401043Z", 340 | "iopub.status.idle": "2024-10-29T01:19:56.787875Z", 341 | "shell.execute_reply": "2024-10-29T01:19:56.787388Z", 342 | "shell.execute_reply.started": "2024-10-29T01:19:56.401347Z" 343 | } 344 | }, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "image/png": "", 349 | "text/plain": [ 350 | "
" 351 | ] 352 | }, 353 | "metadata": {}, 354 | "output_type": "display_data" 355 | } 356 | ], 357 | "source": [ 358 | "plot_kde(\n", 359 | " ad_146_kd.obs,\n", 360 | " row='original_line',\n", 361 | " row_order=['OKG146P', 'OKG146Li'],\n", 362 | " x='P(Non-Canonical)',\n", 363 | " y='genotype',\n", 364 | " hue='sample_type',\n", 365 | " palette=named_colors,\n", 366 | ")" 367 | ] 368 | } 369 | ], 370 | "metadata": { 371 | "kernelspec": { 372 | "display_name": "Python 3 (ipykernel)", 373 | "language": "python", 374 | "name": "python3" 375 | }, 376 | "language_info": { 377 | "codemirror_mode": { 378 | "name": "ipython", 379 | "version": 3 380 | }, 381 | "file_extension": ".py", 382 | "mimetype": "text/x-python", 383 | "name": "python", 384 | "nbconvert_exporter": "python", 385 | "pygments_lexer": "ipython3", 386 | "version": "3.8.1" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 5 391 | } 392 | --------------------------------------------------------------------------------