├── .gitignore ├── LICENSE ├── README.md ├── dorothea ├── __init__.py ├── data │ ├── c_dorothea_hs.pkl │ ├── c_dorothea_mm.pkl │ ├── dorothea_hs.pkl │ └── dorothea_mm.pkl └── dorothea.py ├── example └── dorothea_introduction.ipynb ├── pyproject.toml └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | example/data 2 | example/sciraR.ipynb 3 | example/viperR.ipynb 4 | example/cache/ 5 | dorothea/__pycache__/ 6 | example/.ipynb_checkpoints/* 7 | example/methods_exploration.ipynb 8 | example/sinfo-requirements.txt 9 | dist/ 10 | dorothea/dorothea_py.egg-info/ 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Saez Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dorothea-py 2 | 3 | This package has been deprecated. For transcription factor inference, please visit [decoupler](https://github.com/saezlab/decoupler-py), specifically [this](https://decoupler-py.readthedocs.io/en/latest/notebooks/dorothea.html) tutorial. 4 | -------------------------------------------------------------------------------- /dorothea/__init__.py: -------------------------------------------------------------------------------- 1 | from .dorothea import * -------------------------------------------------------------------------------- /dorothea/data/c_dorothea_hs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/c_dorothea_hs.pkl -------------------------------------------------------------------------------- /dorothea/data/c_dorothea_mm.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/c_dorothea_mm.pkl -------------------------------------------------------------------------------- /dorothea/data/dorothea_hs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/dorothea_hs.pkl -------------------------------------------------------------------------------- /dorothea/data/dorothea_mm.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/dorothea_mm.pkl -------------------------------------------------------------------------------- /dorothea/dorothea.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.sparse import csr_matrix 4 | from scipy.sparse import issparse 5 | import scanpy as sc 6 | from anndata import AnnData 7 | import pickle 8 | import pkg_resources 9 | import os 10 | from numpy.random import default_rng 11 | from tqdm import tqdm 12 | import seaborn as sns 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | """TF activity prediction in Python""" 17 | 18 | 19 | def load_regulons(levels=['A', 'B', 'C', 'D', 'E'], organism='Human', commercial=False): 20 | """ 21 | Loads DoRothEA's regulons. 22 | 23 | Parameters 24 | ---------- 25 | levels 26 | List of confidence levels to use. A regulons are the most confident, E the least. 27 | organism 28 | String determining which organism to use. Only `Human` and `Mouse` are supported. 29 | commercial 30 | Whether to use the academic or commercial version. 31 | 32 | Returns 33 | ------- 34 | DataFrame containing the relationships between gene targets (rows) and their TFs (columns). 35 | 36 | Examples 37 | -------- 38 | >>> import dorothea 39 | >>> regulons = dorothea.load_regulons(levels=['A'], organism='Human', commercial=False) 40 | """ 41 | # Get package path 42 | path = 'data' 43 | fname = 'dorothea_' 44 | 45 | if commercial: 46 | fname = 'c_' + fname 47 | if organism == "Human": 48 | fname = fname + 'hs' 49 | elif organism == "Mouse": 50 | fname = fname + 'mm' 51 | else: 52 | raise("Wrong organism name. Please specify 'Human' or 'Mouse'.") 53 | fname = fname + '.pkl' 54 | path = pkg_resources.resource_filename(__name__, os.path.join(path, fname)) 55 | 56 | # Open pickle object 57 | df = pickle.load(open(path, "rb" )) 58 | 59 | #Filter by levels of confidence 60 | df = df[df['confidence'].isin(levels)] 61 | 62 | # Transform to binary dataframe 63 | dorothea_df = df.pivot(index='target', columns='tf', values='mor') 64 | 65 | # Set nans to 0 66 | dorothea_df[np.isnan(dorothea_df)] = 0 67 | 68 | return dorothea_df 69 | 70 | def extract(adata, obsm_key='dorothea'): 71 | """ 72 | Generates a new AnnData object with TF activities stored in `.obsm` instead of gene expression. 73 | 74 | Parameters 75 | ---------- 76 | adata 77 | Annotated data matrix. 78 | obsm_key 79 | `.osbm` key where TF activities are stored. 80 | 81 | Returns 82 | ------- 83 | AnnData object with TF activities 84 | """ 85 | obsm = adata.obsm 86 | obs = adata.obs 87 | df = adata.obsm[obsm_key] 88 | var = pd.DataFrame(index=df.columns) 89 | tf_adata = AnnData(np.array(df), obs=obs, var=var, obsm=obsm) 90 | return tf_adata 91 | 92 | 93 | def process_input(data, use_raw=False, use_hvg=False): 94 | """ 95 | Processes different input types so that they can be used downstream. 96 | 97 | Parameters 98 | ---------- 99 | data 100 | Annotated data matrix or DataFrame 101 | use_raw 102 | If data is an AnnData object, whether to use values stored in `.raw`. 103 | use_hvg 104 | If data is an AnnData object, whether to only use high variable genes. 105 | 106 | Returns 107 | ------- 108 | genes : list of genes names 109 | samples : list of sample names 110 | X : gene expression matrix 111 | """ 112 | if isinstance(data, AnnData): 113 | if not use_raw: 114 | genes = np.array(data.var.index) 115 | idx = np.argsort(genes) 116 | genes = genes[idx] 117 | samples = data.obs.index 118 | X = data.X[:,idx] 119 | if use_hvg: 120 | hvg_msk = data.var.loc[genes].highly_variable 121 | X = X[:,hvg_msk] 122 | genes = genes[hvg_msk] 123 | else: 124 | genes = np.array(data.raw.var.index) 125 | idx = np.argsort(genes) 126 | genes = genes[idx] 127 | samples= data.raw.obs_names 128 | X = data.raw.X[:,idx] 129 | if use_hvg: 130 | hvg_msk = data.raw.var.loc[genes].highly_variable 131 | X = X[:,hvg_msk] 132 | genes = genes[hvg_msk] 133 | elif isinstance(data, pd.DataFrame): 134 | genes = np.array(data.columns) 135 | idx = np.argsort(genes) 136 | genes = genes[idx] 137 | samples = data.index 138 | X = np.array(data)[:,idx] 139 | else: 140 | raise ValueError('Input must be AnnData or pandas DataFrame.') 141 | if not issparse(X): 142 | X = csr_matrix(X) 143 | return genes, samples, X 144 | 145 | def dot_mult(X, R): 146 | # Run matrix mult 147 | tf_act = np.asarray(X.dot(R)) 148 | return tf_act 149 | 150 | def scale_arr(X, scale_axis): 151 | std = np.std(X, ddof=1, axis=scale_axis) 152 | std[std == 0] = 1 153 | mean = np.mean(X, axis=scale_axis) 154 | if scale_axis == 0: 155 | X = (X - mean) / std 156 | elif scale_axis == 1: 157 | X = (X - mean.reshape(-1,1)) / std.reshape(-1,1) 158 | return X 159 | 160 | 161 | def center_arr(X): 162 | X = X.copy() 163 | sums = np.squeeze(X.sum(1).A) 164 | counts = np.diff(X.tocsr().indptr) 165 | means = sums/counts 166 | X.data -= np.repeat(means, counts) 167 | return X 168 | 169 | 170 | def run(data, regnet, center=True, num_perm=0, norm=True, scale=True, scale_axis=0, inplace=True, 171 | use_raw=False, use_hvg=False, obsm_key='dorothea', min_size=5): 172 | """ 173 | Runs TF activity prediction from gene expression using DoRothEA's regulons. 174 | 175 | Parameters 176 | ---------- 177 | data 178 | Annotated data matrix or DataFrame. 179 | regnet 180 | Regulon network in DataFrame format. 181 | center 182 | Whether to center gene expression by cell/sample. 183 | num_perm 184 | Number of permutations to calculate p-vals of random activities. 185 | norm 186 | Whether to normalize activities per regulon size to correct for large regulons. 187 | scale 188 | Whether to scale the final activities. 189 | scale_axis 190 | 0 to scale per feature, 1 to scale per cell/sample. 191 | inplace 192 | If `data` is an AnnData object, whether to update `data` or return a DataFrame. 193 | use_raw 194 | If data is an AnnData object, whether to use values stored in `.raw`. 195 | use_hvg 196 | If data is an AnnData object, whether to only use high variable genes. 197 | obsm_key 198 | `.osbm` key where TF activities will be stored. 199 | min_size 200 | TFs with regulons with less targets than `min_size` will be ignored. 201 | 202 | Returns 203 | ------- 204 | Returns a DataFrame with TF activities or adds it to the `.obsm` key 'dorothea' 205 | of the input AnnData object, depending on `inplace` and input data type. 206 | """ 207 | # Get genes, samples/tfs and matrices from data and regnet 208 | x_genes, x_samples, X = process_input(data, use_raw=use_raw, use_hvg=use_hvg) 209 | 210 | assert len(x_genes) == len(set(x_genes)), 'Gene names are not unique' 211 | 212 | # Center gene expresison by cell 213 | if center: 214 | X = center_arr(X) 215 | 216 | # Back to normal arr 217 | X = X.A 218 | 219 | # Sort targets (rows) alphabetically 220 | regnet = regnet.sort_index() 221 | r_targets, r_tfs = regnet.index, regnet.columns 222 | 223 | assert len(r_targets) == len(set(r_targets)), 'regnet target names are not unique' 224 | assert len(r_tfs) == len(set(r_tfs)), 'regnet tf names are not unique' 225 | 226 | # Subset by common genes 227 | common_genes = np.sort(list(set(r_targets) & set(x_genes))) 228 | 229 | target_fraction = len(common_genes) / len(r_targets) 230 | assert target_fraction > .05, f'Too few ({len(common_genes)}) target genes found. \ 231 | Make sure you are using the correct organism.' 232 | 233 | print(f'{len(common_genes)} targets found') 234 | 235 | idx_x = np.searchsorted(x_genes, common_genes) 236 | X = X[:,idx_x] 237 | R = regnet.loc[common_genes].values 238 | 239 | # Check min size and filter 240 | msk_size = np.sum(R != 0, axis=0) < min_size 241 | num_small_reg = np.sum(msk_size) 242 | if num_small_reg > 0: 243 | print(f'{num_small_reg} TFs with < {min_size} targets') 244 | R[:, msk_size] = 0 245 | 246 | # Run matrix mult 247 | estimate = dot_mult(X, R) 248 | 249 | # Permutations 250 | if num_perm > 0: 251 | pvals = np.zeros(estimate.shape) 252 | for i in tqdm(range(num_perm)): 253 | perm = dot_mult(X, default_rng(seed=i).permutation(R)) 254 | pvals += np.abs(perm) > np.abs(estimate) 255 | pvals = pvals / num_perm 256 | pvals[pvals == 0] = 1/num_perm 257 | else: 258 | pvals = np.full(estimate.shape, 0.1) 259 | 260 | # Normalize by num edges 261 | if norm: 262 | norm = np.sum(np.abs(R), axis=0) 263 | norm[norm == 0] = 1 264 | estimate = estimate / norm 265 | 266 | # Weight estimate by pvals 267 | tf_act = estimate * -np.log10(pvals) 268 | 269 | # Scale output 270 | if scale: 271 | tf_act = scale_arr(tf_act, scale_axis) 272 | 273 | # Store in df 274 | result = pd.DataFrame(tf_act, columns=r_tfs, index=x_samples) 275 | 276 | if isinstance(data, AnnData) and inplace: 277 | # Update AnnData object 278 | data.obsm[obsm_key] = result 279 | else: 280 | # Return dataframe object 281 | data = result 282 | inplace = False 283 | 284 | return data if not inplace else None 285 | 286 | def rank_tfs_groups(adata, groupby, group, reference='all', obsm_key='dorothea'): 287 | """ 288 | Runs Wilcoxon rank-sum test between one group and a reference group. 289 | 290 | Parameters 291 | ---------- 292 | adata 293 | Annotated data matrix. 294 | groupby 295 | The key of the observations grouping to consider. 296 | group 297 | Group or list of groups to compare. 298 | reference 299 | Reference group or list of reference groups to use as reference. 300 | obsm_key 301 | `.osbm` key to use to extract TF activities. 302 | 303 | Returns 304 | ------- 305 | DataFrame with changes in TF activity between groups. 306 | """ 307 | from scipy.stats import ranksums 308 | from statsmodels.stats.multitest import multipletests 309 | 310 | # Get TF activites 311 | adata = extract(adata, obsm_key=obsm_key) 312 | 313 | # Get tf names 314 | features = adata.var.index.values 315 | 316 | # Generate mask for group samples 317 | if isinstance(group, str): 318 | g_msk = (adata.obs[groupby] == group).values 319 | else: 320 | cond_lst = [(adata.obs[groupby] == grp).values for grp in group] 321 | g_msk = np.sum(cond_lst, axis=0).astype(bool) 322 | group = ', '.join(group) 323 | 324 | # Generate mask for reference samples 325 | if reference == 'all': 326 | ref_msk = ~g_msk 327 | elif isinstance(reference, str): 328 | ref_msk = (adata.obs[groupby] == reference).values 329 | else: 330 | cond_lst = [(adata.obs[groupby] == ref).values for ref in reference] 331 | ref_msk = np.sum(cond_lst, axis=0).astype(bool) 332 | reference = ', '.join(reference) 333 | 334 | assert np.sum(g_msk) > 0, 'No group samples found' 335 | assert np.sum(ref_msk) > 0, 'No reference samples found' 336 | 337 | # Wilcoxon rank-sum test 338 | results = [] 339 | for i in np.arange(len(features)): 340 | stat, pval = ranksums(adata.X[g_msk,i], adata.X[ref_msk,i]) 341 | mc = np.mean(adata.X[g_msk,i]) - np.mean(adata.X[ref_msk,i]) 342 | results.append([features[i], group, reference, stat, mc, pval]) 343 | 344 | # Tranform to df 345 | results = pd.DataFrame( 346 | results, 347 | columns=['name', 'group', 'reference', 'statistic', 'meanchange', 'pval'] 348 | ).set_index('name') 349 | 350 | # Correct pvalues by FDR 351 | results[np.isnan(results['pval'])] = 1 352 | _, pvals_adj, _, _ = multipletests( 353 | results['pval'].values, alpha=0.05, method='fdr_bh' 354 | ) 355 | results['pval_adj'] = pvals_adj 356 | 357 | # Sort by statistic 358 | results = results.sort_values('meanchange', ascending=False) 359 | return results 360 | 361 | 362 | def check_regulon(adata, regnet, tf, groupby, use_raw=False, use_hvg=False, figsize=(12,6), 363 | cmap='rocket', show=None, return_fig=None): 364 | """ 365 | Plots a heatmap with the expression of target genes for a given TF. 366 | 367 | Parameters 368 | ---------- 369 | adata 370 | Annotated data matrix. 371 | regnet 372 | Regulon network in DataFrame format. 373 | tf 374 | Name of TF. 375 | groupby 376 | The key of the observations grouping to consider. 377 | use_raw 378 | If data is an AnnData object, whether to use values stored in `.raw`. 379 | use_hvg 380 | If data is an AnnData object, whether to only use high variable genes. 381 | figsize 382 | Size of the figure. 383 | cmap 384 | Color map to use. 385 | show 386 | Show the plot, do not return axis. 387 | return_fig 388 | Return the matplotlib figure. 389 | Returns 390 | ------- 391 | Heatmap figure. 392 | """ 393 | # Get genes, samples/tfs and matrices from data and regnet 394 | x_genes, x_samples, X = process_input(adata, use_raw=use_raw, use_hvg=use_hvg) 395 | 396 | # Sort targets (rows) alphabetically 397 | regnet = regnet.sort_index() 398 | r_targets, r_tfs = regnet.index, regnet.columns 399 | 400 | assert len(r_targets) == len(set(r_targets)), 'regnet target names are not unique' 401 | assert len(r_tfs) == len(set(r_tfs)), 'regnet tf names are not unique' 402 | 403 | # Subset by common genes 404 | common_genes = np.sort(list(set(r_targets) & set(x_genes))) 405 | 406 | target_fraction = len(common_genes) / len(r_targets) 407 | assert target_fraction > .05, f'Too few ({len(common_genes)}) target genes found. \ 408 | Make sure you are using the correct organism.' 409 | 410 | idx_x = np.searchsorted(x_genes, common_genes) 411 | X = X[:,idx_x] 412 | R = regnet.loc[common_genes].values 413 | R = R[:,list(r_tfs).index(tf)] 414 | 415 | X = X[:,R!=0] 416 | common_genes = common_genes[R!=0] 417 | 418 | sort_genes = np.argsort(np.mean(X*-1,axis=0)).flat 419 | X = X[:,sort_genes] 420 | common_genes = common_genes[sort_genes] 421 | 422 | groups = np.unique(adata.obs[groupby]) 423 | fig, axes = plt.subplots(len(groups), 1, 424 | gridspec_kw={'hspace': 0.05}, 425 | sharex=True, 426 | figsize=figsize 427 | ) 428 | fig.suptitle(tf, fontsize=16) 429 | axes = axes.flatten() 430 | max_n = np.max(X) 431 | min_n = np.min(X) 432 | i = 1 433 | X = pd.DataFrame(X.A, columns=common_genes) 434 | for group,ax in zip(groups, axes): 435 | msk = (adata.obs[groupby] == group).values 436 | if i == len(groups): 437 | sns.heatmap(X.loc[msk], cbar=True, 438 | yticklabels='', ax=ax, vmin=min_n, vmax=max_n, 439 | cbar_kws = {"shrink": .70}, cmap=cmap 440 | ) 441 | else: 442 | sns.heatmap(X.loc[msk], cbar=True, 443 | yticklabels='', ax=ax, vmin=min_n, vmax=max_n, 444 | cbar_kws = {"shrink": .70}, cmap=cmap 445 | ) 446 | ax.axes.xaxis.set_visible(False) 447 | ax.set_ylabel(group, rotation='horizontal', ha='right') 448 | i += 1 449 | if return_fig is True: 450 | return fig 451 | if show is False: 452 | return axes 453 | plt.show() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | 'flit_core >=3.1,<4', 4 | 'setuptools_scm', 5 | 'pytoml', 6 | 'importlib_metadata>=0.7; python_version < "3.8"', 7 | 'packaging', 8 | ] 9 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | with open("README.md", "r", encoding="utf-8") as fh: 6 | long_description = fh.read() 7 | 8 | setup( 9 | name='dorothea-py', 10 | version='1.0.5', 11 | author='Pau Badia i Mompel', 12 | author_email="pau.badia@uni-heidelberg.de", 13 | description='dorothea-py is a python package to compute TF activity \ 14 | from RNA-seq data using DoRothEA as regulon resource', 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url='https://github.com/saezlab/dorothea-py', 18 | project_urls={ 19 | "Bug Tracker": "https://github.com/saezlab/dorothea-py/issues", 20 | }, 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "Operating System :: OS Independent", 24 | ], 25 | packages=['dorothea'], 26 | license='LICENSE.txt', 27 | package_data={'dorothea': ['data/dorothea_hs.pkl', 28 | 'data/dorothea_mm.pkl', 29 | 'data/c_dorothea_hs.pkl', 30 | 'data/c_dorothea_mm.pkl'] 31 | }, 32 | install_requires=[ 33 | 'anndata', 34 | 'scanpy', 35 | 'numpy', 36 | 'pandas', 37 | 'tqdm', 38 | 'seaborn' 39 | ] 40 | ) 41 | --------------------------------------------------------------------------------