├── .gitignore
├── LICENSE
├── README.md
├── dorothea
    ├── __init__.py
    ├── data
    │   ├── c_dorothea_hs.pkl
    │   ├── c_dorothea_mm.pkl
    │   ├── dorothea_hs.pkl
    │   └── dorothea_mm.pkl
    └── dorothea.py
├── example
    └── dorothea_introduction.ipynb
├── pyproject.toml
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | example/data
 2 | example/sciraR.ipynb
 3 | example/viperR.ipynb
 4 | example/cache/
 5 | dorothea/__pycache__/
 6 | example/.ipynb_checkpoints/*
 7 | example/methods_exploration.ipynb
 8 | example/sinfo-requirements.txt
 9 | dist/
10 | dorothea/dorothea_py.egg-info/
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Saez Lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dorothea-py
2 | 
3 | This package has been deprecated. For transcription factor inference, please visit [decoupler](https://github.com/saezlab/decoupler-py), specifically [this](https://decoupler-py.readthedocs.io/en/latest/notebooks/dorothea.html) tutorial.
4 | 


--------------------------------------------------------------------------------
/dorothea/__init__.py:
--------------------------------------------------------------------------------
1 | from .dorothea import *


--------------------------------------------------------------------------------
/dorothea/data/c_dorothea_hs.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/c_dorothea_hs.pkl


--------------------------------------------------------------------------------
/dorothea/data/c_dorothea_mm.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/c_dorothea_mm.pkl


--------------------------------------------------------------------------------
/dorothea/data/dorothea_hs.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/dorothea_hs.pkl


--------------------------------------------------------------------------------
/dorothea/data/dorothea_mm.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saezlab/dorothea-py/833165d3c790ced3a3e3852899e93412c63f0f44/dorothea/data/dorothea_mm.pkl


--------------------------------------------------------------------------------
/dorothea/dorothea.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from scipy.sparse import csr_matrix
  4 | from scipy.sparse import issparse
  5 | import scanpy as sc
  6 | from anndata import AnnData
  7 | import pickle
  8 | import pkg_resources
  9 | import os
 10 | from numpy.random import default_rng
 11 | from tqdm import tqdm
 12 | import seaborn as sns
 13 | import matplotlib.pyplot as plt
 14 | 
 15 | 
 16 | """TF activity prediction in Python"""
 17 | 
 18 | 
 19 | def load_regulons(levels=['A', 'B', 'C', 'D', 'E'], organism='Human', commercial=False):
 20 |     """
 21 |     Loads DoRothEA's regulons.
 22 |     
 23 |     Parameters
 24 |     ----------
 25 |     levels
 26 |         List of confidence levels to use. A regulons are the most confident, E the least.
 27 |     organism
 28 |         String determining which organism to use. Only `Human` and `Mouse` are supported.
 29 |     commercial
 30 |         Whether to use the academic or commercial version. 
 31 |     
 32 |     Returns
 33 |     -------
 34 |     DataFrame containing the relationships between gene targets (rows) and their TFs (columns). 
 35 | 
 36 |     Examples
 37 |     --------
 38 |     >>> import dorothea
 39 |     >>> regulons = dorothea.load_regulons(levels=['A'], organism='Human', commercial=False)
 40 |     """
 41 |     # Get package path
 42 |     path = 'data'
 43 |     fname = 'dorothea_'
 44 |     
 45 |     if commercial:
 46 |         fname = 'c_' + fname
 47 |     if organism == "Human":
 48 |         fname = fname + 'hs'
 49 |     elif organism == "Mouse":
 50 |         fname = fname + 'mm'
 51 |     else:
 52 |         raise("Wrong organism name. Please specify 'Human' or 'Mouse'.")
 53 |     fname = fname + '.pkl'
 54 |     path = pkg_resources.resource_filename(__name__, os.path.join(path, fname))
 55 |     
 56 |     # Open pickle object
 57 |     df = pickle.load(open(path, "rb" ))
 58 |     
 59 |     #Filter by levels of confidence
 60 |     df = df[df['confidence'].isin(levels)]
 61 |     
 62 |     # Transform to binary dataframe
 63 |     dorothea_df = df.pivot(index='target', columns='tf', values='mor')
 64 |     
 65 |     # Set nans to 0
 66 |     dorothea_df[np.isnan(dorothea_df)] = 0
 67 |     
 68 |     return dorothea_df
 69 | 
 70 | def extract(adata, obsm_key='dorothea'):
 71 |     """
 72 |     Generates a new AnnData object with TF activities stored in `.obsm` instead of gene expression. 
 73 |     
 74 |     Parameters
 75 |     ----------
 76 |     adata
 77 |         Annotated data matrix.
 78 |     obsm_key
 79 |         `.osbm` key where TF activities are stored.
 80 |     
 81 |     Returns
 82 |     -------
 83 |     AnnData object with TF activities
 84 |     """
 85 |     obsm = adata.obsm
 86 |     obs = adata.obs
 87 |     df = adata.obsm[obsm_key]
 88 |     var = pd.DataFrame(index=df.columns)
 89 |     tf_adata = AnnData(np.array(df), obs=obs, var=var, obsm=obsm)
 90 |     return tf_adata
 91 |     
 92 | 
 93 | def process_input(data, use_raw=False, use_hvg=False):
 94 |     """
 95 |     Processes different input types so that they can be used downstream. 
 96 |     
 97 |     Parameters
 98 |     ----------
 99 |     data
100 |         Annotated data matrix or DataFrame
101 |     use_raw
102 |         If data is an AnnData object, whether to use values stored in `.raw`.
103 |     use_hvg
104 |         If data is an AnnData object, whether to only use high variable genes.
105 |     
106 |     Returns
107 |     -------
108 |     genes : list of genes names
109 |     samples : list of sample names
110 |     X : gene expression matrix
111 |     """
112 |     if isinstance(data, AnnData):
113 |         if not use_raw:
114 |             genes = np.array(data.var.index)
115 |             idx = np.argsort(genes)
116 |             genes = genes[idx]
117 |             samples = data.obs.index
118 |             X = data.X[:,idx]
119 |             if use_hvg:
120 |                 hvg_msk = data.var.loc[genes].highly_variable
121 |                 X = X[:,hvg_msk]
122 |                 genes = genes[hvg_msk]
123 |         else:
124 |             genes = np.array(data.raw.var.index)
125 |             idx = np.argsort(genes)
126 |             genes = genes[idx]
127 |             samples= data.raw.obs_names
128 |             X = data.raw.X[:,idx]
129 |             if use_hvg:
130 |                 hvg_msk = data.raw.var.loc[genes].highly_variable
131 |                 X = X[:,hvg_msk]
132 |                 genes = genes[hvg_msk]
133 |     elif isinstance(data, pd.DataFrame):
134 |         genes = np.array(data.columns)
135 |         idx = np.argsort(genes)
136 |         genes = genes[idx]
137 |         samples = data.index
138 |         X = np.array(data)[:,idx]
139 |     else:
140 |         raise ValueError('Input must be AnnData or pandas DataFrame.')
141 |     if not issparse(X):
142 |         X = csr_matrix(X)
143 |     return genes, samples, X
144 | 
def dot_mult(X, R):
    """Returns the matrix product of `X` and `R` as a plain NumPy array."""
    product = X.dot(R)
    return np.asarray(product)
149 | 
def scale_arr(X, scale_axis):
    """
    Standardizes `X` along `scale_axis` (subtract the mean, divide by the sample
    standard deviation, ddof=1). Standard deviations equal to 0 are replaced by 1
    so constant slices end up as zeros. Axes other than 0 and 1 return `X` as is.
    """
    center = np.mean(X, axis=scale_axis)
    spread = np.std(X, ddof=1, axis=scale_axis)
    spread[spread == 0] = 1
    if scale_axis == 1:
        # Per-row statistics need to be column vectors to broadcast across columns
        return (X - center.reshape(-1, 1)) / spread.reshape(-1, 1)
    if scale_axis == 0:
        return (X - center) / spread
    return X
159 | 
160 | 
def center_arr(X):
    """
    Centers every row of a sparse matrix using only its stored entries.

    For each row, the mean of the stored values is subtracted from those stored
    values; positions that are not stored stay untouched. Rows without stored
    entries are left as they are.

    Parameters
    ----------
    X
        SciPy sparse matrix (samples as rows).

    Returns
    -------
    New CSR matrix with centered values. The input is not modified.
    """
    # The flat `.data` buffer is ordered row by row only in CSR layout, so convert
    # first; `copy=True` also protects the caller's matrix.
    X = X.tocsr(copy=True)

    # In-place subtraction of float means is not possible on integer/bool data
    if X.dtype.kind in 'biu':
        X = X.astype(np.float64)

    counts = np.diff(X.indptr)
    sums = np.asarray(X.sum(axis=1)).ravel()

    # Rows with no stored entries get a mean of 0 instead of a 0/0 division
    means = np.zeros(counts.shape, dtype=np.result_type(sums.dtype, np.float64))
    filled = counts > 0
    means[filled] = sums[filled] / counts[filled]

    # Expand each row mean to one value per stored entry of that row
    X.data -= np.repeat(means, counts)
    return X
168 | 
169 | 
def run(data, regnet, center=True, num_perm=0, norm=True, scale=True, scale_axis=0, inplace=True, 
        use_raw=False, use_hvg=False, obsm_key='dorothea', min_size=5):
    """
    Runs TF activity prediction from gene expression using DoRothEA's regulons.
    
    Parameters
    ----------
    data
        Annotated data matrix or DataFrame.
    regnet
        Regulon network in DataFrame format (targets as rows, TFs as columns).
    center
        Whether to center gene expression by cell/sample.
    num_perm
        Number of permutations to calculate p-vals of random activities.
        With 0, activities are not weighted.
    norm
        Whether to normalize activities per regulon size to correct for large regulons.
    scale
        Whether to scale the final activities.
    scale_axis
        0 to scale per feature, 1 to scale per cell/sample.
    inplace
        If `data` is an AnnData object, whether to update `data` or return a DataFrame.
    use_raw
        If data is an AnnData object, whether to use values stored in `.raw`.
    use_hvg
        If data is an AnnData object, whether to only use high variable genes.
    obsm_key
        `.obsm` key where TF activities will be stored.
    min_size
        TFs whose regulon has fewer than `min_size` targets in the data get all their
        weights set to 0 (they stay in the output).
    
    Returns
    -------
    None when `data` is an AnnData object and `inplace` is True (activities are stored
    in `data.obsm[obsm_key]`); otherwise a DataFrame with TF activities
    (samples as rows, TFs as columns).
    """
    # Get genes, samples/tfs and matrices from data and regnet
    x_genes, x_samples, X = process_input(data, use_raw=use_raw, use_hvg=use_hvg)

    assert len(x_genes) == len(set(x_genes)), 'Gene names are not unique'
    
    # Center gene expression by cell
    if center:
        X = center_arr(X)
    
    # Back to a dense array. `.toarray()` is used because the `.A` shortcut is not
    # available on sparse containers in recent SciPy releases.
    X = X.toarray()

    # Sort targets (rows) alphabetically
    regnet = regnet.sort_index()
    r_targets, r_tfs = regnet.index, regnet.columns

    assert len(r_targets) == len(set(r_targets)), 'regnet target names are not unique'
    assert len(r_tfs) == len(set(r_tfs)), 'regnet tf names are not unique'

    # Subset by common genes
    common_genes = np.sort(list(set(r_targets) & set(x_genes)))

    # An empty regnet counts as "no targets found" instead of dividing by zero
    target_fraction = len(common_genes) / len(r_targets) if len(r_targets) else 0
    assert target_fraction > .05, (
        f'Too few ({len(common_genes)}) target genes found. '
        'Make sure you are using the correct organism.'
    )

    print(f'{len(common_genes)} targets found')

    # x_genes is sorted, so searchsorted gives the column of every common gene
    idx_x = np.searchsorted(x_genes, common_genes)
    X = X[:,idx_x]
    # Explicit copy: R is modified below and `.values` may be a read-only view
    R = regnet.loc[common_genes].to_numpy(copy=True)
    
    # Check min size and filter
    msk_size = np.sum(R != 0, axis=0) < min_size
    num_small_reg = np.sum(msk_size)
    if num_small_reg > 0:
        print(f'{num_small_reg} TFs with < {min_size} targets')
        R[:, msk_size] = 0

    # Run matrix mult
    estimate = dot_mult(X, R)
    
    # Permutations: p-value = fraction of target-shuffled networks whose absolute
    # activity exceeds the observed one
    if num_perm > 0:
        abs_estimate = np.abs(estimate)
        pvals = np.zeros(estimate.shape)
        for i in tqdm(range(num_perm)):
            # Seeded per iteration so results are reproducible
            perm = dot_mult(X, default_rng(seed=i).permutation(R))
            pvals += np.abs(perm) > abs_estimate
        pvals = pvals / num_perm
        # Avoid log10(0): the smallest resolvable p-value is 1/num_perm
        pvals[pvals == 0] = 1/num_perm
    else:
        # -log10(0.1) == 1, so the weighting below leaves the estimate unchanged
        pvals = np.full(estimate.shape, 0.1)
    
    # Normalize by the total absolute weight of each regulon
    if norm:
        reg_weight = np.sum(np.abs(R), axis=0)
        reg_weight[reg_weight == 0] = 1
        estimate = estimate / reg_weight

    # Weight estimate by pvals
    tf_act = estimate * -np.log10(pvals)
    
    # Scale output
    if scale:
        tf_act = scale_arr(tf_act, scale_axis)

    # Store in df
    result = pd.DataFrame(tf_act, columns=r_tfs, index=x_samples)

    if isinstance(data, AnnData) and inplace:
        # Update AnnData object
        data.obsm[obsm_key] = result
        return None

    # Return dataframe object
    return result
285 | 
def rank_tfs_groups(adata, groupby, group, reference='all', obsm_key='dorothea'):
    """
    Runs Wilcoxon rank-sum test between one group and a reference group.
    
    Parameters
    ----------
    adata
        Annotated data matrix.
    groupby
        The key of the observations grouping to consider.
    group
        Group or list of groups to compare.
    reference
        Reference group or list of reference groups to use as reference.
        The string `all` means every sample that is not in `group`.
    obsm_key
         `.obsm` key to use to extract TF activities.
    
    Returns
    -------
    DataFrame indexed by TF name with columns `group`, `reference`, `statistic`,
    `meanchange`, `pval` and `pval_adj` (Benjamini-Hochberg), sorted by decreasing
    `meanchange`.
    """
    from scipy.stats import ranksums
    from statsmodels.stats.multitest import multipletests

    # Get TF activities
    adata = extract(adata, obsm_key=obsm_key)
    
    # Get tf names
    features = adata.var.index.values
    labels = adata.obs[groupby]

    # Generate mask for group samples
    g_msk, group = _obs_mask(labels, group)

    # Generate mask for reference samples
    if isinstance(reference, str) and reference == 'all':
        ref_msk = ~g_msk
    else:
        ref_msk, reference = _obs_mask(labels, reference)
        
    assert np.sum(g_msk) > 0, 'No group samples found'
    assert np.sum(ref_msk) > 0, 'No reference samples found'

    # Wilcoxon rank-sum test, one TF (column) at a time
    g_vals = adata.X[g_msk]
    ref_vals = adata.X[ref_msk]
    results = []
    for i, name in enumerate(features):
        stat, pval = ranksums(g_vals[:, i], ref_vals[:, i])
        mc = np.mean(g_vals[:, i]) - np.mean(ref_vals[:, i])
        results.append([name, group, reference, stat, mc, pval])

    # Transform to df
    results = pd.DataFrame(
        results, 
        columns=['name', 'group', 'reference', 'statistic', 'meanchange', 'pval']
    ).set_index('name')
    
    # Undefined p-values count as non-significant. Only the `pval` column is
    # touched; masking the whole frame would overwrite every column of the row.
    results['pval'] = results['pval'].fillna(1)

    # Correct pvalues by FDR
    _, pvals_adj, _, _ = multipletests(
        results['pval'].values, alpha=0.05, method='fdr_bh'
    )
    results['pval_adj'] = pvals_adj
    
    # Sort by mean change, largest increase first
    results = results.sort_values('meanchange', ascending=False)
    return results


def _obs_mask(labels, selection):
    """Returns (boolean sample mask, display name) for one label or an iterable of labels."""
    if np.isscalar(selection):
        return (labels == selection).values, selection
    selection = list(selection)
    mask = np.zeros(len(labels), dtype=bool)
    for item in selection:
        mask |= (labels == item).values
    # str() lets non-string labels (e.g. integer cluster ids) be joined as well
    return mask, ', '.join(map(str, selection))
360 | 
361 | 
def check_regulon(adata, regnet, tf, groupby, use_raw=False, use_hvg=False, figsize=(12,6), 
                  cmap='rocket', show=None, return_fig=None):
    """
    Plots a heatmap with the expression of target genes for a given TF.

    One heatmap row-block is drawn per group in `groupby`; target genes are ordered
    by decreasing mean expression and all blocks share the same color scale.
    
    Parameters
    ----------
    adata
        Annotated data matrix.
    regnet
        Regulon network in DataFrame format.
    tf
        Name of TF.
    groupby
        The key of the observations grouping to consider.
    use_raw
        If data is an AnnData object, whether to use values stored in `.raw`.
    use_hvg
        If data is an AnnData object, whether to only use high variable genes.
    figsize
        Size of the figure.
    cmap
        Color map to use.
    show
        If False, return the array of axes instead of showing the plot.
    return_fig
        If True, return the matplotlib figure (takes precedence over `show`).

    Returns
    -------
    The figure if `return_fig` is True, the axes if `show` is False, otherwise
    nothing (the plot is shown).

    Raises
    ------
    ValueError
        If `tf` is not a column of `regnet` or none of its targets is in the data.
    """
    # Get genes and expression matrix from data
    x_genes, _, X = process_input(adata, use_raw=use_raw, use_hvg=use_hvg)

    # Sort targets (rows) alphabetically
    regnet = regnet.sort_index()
    r_targets, r_tfs = regnet.index, regnet.columns

    assert len(r_targets) == len(set(r_targets)), 'regnet target names are not unique'
    assert len(r_tfs) == len(set(r_tfs)), 'regnet tf names are not unique'

    if tf not in r_tfs:
        raise ValueError(f'TF {tf!r} not found in the columns of regnet.')

    # Subset by common genes
    common_genes = np.sort(list(set(r_targets) & set(x_genes)))

    # An empty regnet counts as "no targets found" instead of dividing by zero
    target_fraction = len(common_genes) / len(r_targets) if len(r_targets) else 0
    assert target_fraction > .05, (
        f'Too few ({len(common_genes)}) target genes found. '
        'Make sure you are using the correct organism.'
    )

    # x_genes is sorted, so searchsorted gives the column of every common gene
    idx_x = np.searchsorted(x_genes, common_genes)

    # Keep only the genes with a non-zero weight for this TF
    is_target = regnet.loc[common_genes, tf].values != 0
    if not is_target.any():
        raise ValueError(f'No target genes of {tf!r} found in the data.')

    # Dense from here on: the subset is small and plotting needs a dense table
    X = X[:, idx_x][:, is_target].toarray()
    common_genes = common_genes[is_target]

    # Order genes by decreasing mean expression
    order = np.argsort(-X.mean(axis=0))
    X = X[:, order]
    common_genes = common_genes[order]

    groups = np.unique(adata.obs[groupby])
    # squeeze=False keeps `axes` an array even when there is a single group
    fig, axes = plt.subplots(len(groups), 1,
                             gridspec_kw={'hspace': 0.05},
                             sharex=True,
                             figsize=figsize,
                             squeeze=False
                            )
    fig.suptitle(tf, fontsize=16)
    axes = axes.flatten()

    # Shared color limits so the groups are comparable
    min_n, max_n = X.min(), X.max()
    X = pd.DataFrame(X, columns=common_genes)
    last = len(groups) - 1
    for i, (group, ax) in enumerate(zip(groups, axes)):
        msk = (adata.obs[groupby] == group).values
        sns.heatmap(X.loc[msk], cbar=True,
                    yticklabels='', ax=ax, vmin=min_n, vmax=max_n,
                    cbar_kws={"shrink": .70}, cmap=cmap
                   )
        # Gene labels are only drawn under the bottom panel
        if i != last:
            ax.xaxis.set_visible(False)
        ax.set_ylabel(group, rotation='horizontal', ha='right')

    if return_fig is True:
        return fig
    if show is False:
        return axes
    plt.show()


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
# The build backend is setuptools, so it is listed explicitly instead of
# relying on it being pulled in as a dependency of setuptools_scm.
requires = [
    'setuptools',
    'flit_core >=3.1,<4',
    'setuptools_scm',
    'pytoml',
    'importlib_metadata>=0.7; python_version < "3.8"',
    'packaging',
]
build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from distutils.core import setup
 4 | 
 5 | with open("README.md", "r", encoding="utf-8") as fh:
 6 |     long_description = fh.read()
 7 | 
 8 | setup(
 9 |     name='dorothea-py',
10 |     version='1.0.5',
11 |     author='Pau Badia i Mompel',
12 |     author_email="pau.badia@uni-heidelberg.de",
13 |     description='dorothea-py is a python package to compute TF activity \
14 |     from RNA-seq data using DoRothEA as regulon resource',
15 |     long_description=long_description,
16 |     long_description_content_type="text/markdown",
17 |     url='https://github.com/saezlab/dorothea-py',
18 |     project_urls={
19 |         "Bug Tracker": "https://github.com/saezlab/dorothea-py/issues",
20 |     },
21 |     classifiers=[
22 |         "Programming Language :: Python :: 3",
23 |         "Operating System :: OS Independent",
24 |     ],
25 |     packages=['dorothea'], 
26 |     license='LICENSE.txt',
27 |     package_data={'dorothea': ['data/dorothea_hs.pkl', 
28 |                               'data/dorothea_mm.pkl', 
29 |                               'data/c_dorothea_hs.pkl', 
30 |                               'data/c_dorothea_mm.pkl']
31 |                 },
32 |     install_requires=[
33 |         'anndata',
34 |         'scanpy',
35 |         'numpy',
36 |         'pandas',
37 |         'tqdm',
38 |         'seaborn'
39 |     ]
40 | )
41 | 


--------------------------------------------------------------------------------