├── .github └── workflows │ └── deploy-doc-to-ghpages.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── .gitignore ├── Makefile ├── make.bat └── source │ ├── .gitignore │ ├── conf.py │ └── index.rst ├── examples └── biojupies.py ├── maayanlab_bioinformatics ├── __init__.py ├── api │ ├── __init__.py │ ├── enrichr.py │ └── speedrichr.py ├── clustering │ ├── __init__.py │ └── silhouette_analysis.py ├── dge │ ├── __init__.py │ ├── characteristic_direction.py │ ├── deseq2.py │ ├── limma_voom.py │ ├── logfc.py │ └── ttest.py ├── enrichment │ ├── __init__.py │ ├── crisp.py │ ├── gsea2003.py │ └── gsea2005.py ├── harmonization │ ├── __init__.py │ ├── homologs.py │ ├── id_mapper.py │ ├── ncbi_genes.py │ └── transcripts.py ├── normalization │ ├── __init__.py │ ├── cpm.py │ ├── filter.py │ ├── log.py │ ├── quantile.py │ ├── quantile_legacy.py │ └── zscore.py ├── parse │ ├── __init__.py │ ├── gmt.py │ └── suerat.py ├── plotting │ ├── __init__.py │ ├── bridge.py │ ├── clustergrammer.py │ └── upset.py ├── setup │ ├── R.py │ └── __init__.py └── utils │ ├── __init__.py │ ├── chunked.py │ ├── describe.py │ ├── fetch_save_read.py │ ├── maybe_tqdm.py │ ├── merge.py │ └── sparse.py ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── dge ├── __init__.py ├── test_deseq2.py ├── test_limma.py ├── test_logfc.py └── test_ttest.py ├── enrichment ├── __init__.py └── test_crisp.py ├── normalization ├── __init__.py ├── test_cpm.py ├── test_quantile.py └── test_quantile_legacy.py ├── parse ├── __init__.py └── test_gmt.py ├── test_enrichr_results.txt ├── test_example_matrix.txt ├── test_example_matrix_dge_results.txt ├── test_example_metadata.txt ├── test_geneset.txt └── test_gmt.gmt /.github/workflows/deploy-doc-to-ghpages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs to gh-pages 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Set up Python 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: '3.9.x' 15 | - name: Install python dependencies 16 | run: | 17 | python -m pip install --upgrade pip 18 | pip install llvmlite poetry 19 | poetry install -E docs 20 | - name: Building docs 21 | run: eval $(poetry env activate) && cd docs && make build-html 22 | - name: Deploy 23 | uses: peaceiris/actions-gh-pages@v3 24 | with: 25 | github_token: ${{ secrets.GITHUB_TOKEN }} 26 | publish_dir: ./docs/build 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode 3 | *.egg-info 4 | *.pyc 5 | /build -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ma'ayanlab Bioinformatics 2 | 3 | A collection of useful functions for bioinformatics data analysis. 4 | 5 | This library contains many functions and methods I use again and again in different analyses, including: 6 | - quantile normalization 7 | - other common normalizations 8 | - logcpm, zscore, filter variance 9 | - gmt parser 10 | - single-cell sparse matrix parsing 11 | - transcript to gene conversions 12 | - ... 13 | - various DGE methods including chdir, limma_voom, deseq2, etc. 14 | 15 | ## Installation 16 | ``` 17 | # minimal 18 | pip install "maayanlab-bioinformatics@git+https://github.com/Maayanlab/maayanlab-bioinformatics.git" 19 | 20 | # complete 21 | pip install "maayanlab-bioinformatics[all]@git+https://github.com/Maayanlab/maayanlab-bioinformatics.git" 22 | # [OPTIONAL] for some R functionality like limma_voom & filter_by_expr 23 | python -m maayanlab_bioinformatics.setup.R 24 | ``` 25 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | source/*.rst 3 | !index.rst -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | MODULE = maayanlab_bioinformatics 11 | 12 | # Put it first so that "make" without argument is like "make help". 13 | help: 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | .PHONY: help Makefile 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | 23 | prepare-apidoc: 24 | sphinx-apidoc -o $(SOURCEDIR) ../$(MODULE) 25 | 26 | build-html: prepare-apidoc 27 | $(SPHINXBUILD) -b html $(SOURCEDIR) $(BUILDDIR) 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found.
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/.gitignore: -------------------------------------------------------------------------------- 1 | maayanlab_bioinformatics.*.rst -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../..')) 16 | 17 | import sphinx 18 | import commonmark 19 | from m2r2 import MdInclude 20 | from recommonmark.transform import AutoStructify 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = "Ma'ayanlab Bioinformatics" 26 | copyright = "2020, Ma'ayanlab" 27 | author = "Ma'ayanlab" 28 | 29 | # The full version, including alpha/beta/rc tags 30 | release = '0.0.1' 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | import sys, os; sys.path.insert(0, os.path.abspath('../..')) 39 | extensions = [ 40 | 'recommonmark', 41 | 'sphinx.ext.autodoc', 42 | 'sphinx.ext.viewcode', 43 | 'sphinx.ext.autosectionlabel', 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | # This pattern also affects html_static_path and html_extra_path. 52 | exclude_patterns = [] 53 | 54 | 55 | # -- Options for HTML output ------------------------------------------------- 56 | 57 | # The theme to use for HTML and HTML Help pages. See the documentation for 58 | # a list of builtin themes. 59 | # 60 | html_theme = 'nature' 61 | 62 | # Add any paths that contain custom static files (such as style sheets) here, 63 | # relative to this directory. They are copied after the builtin static files, 64 | # so a file named "default.css" will overwrite the builtin "default.css". 
65 | html_static_path = ['_static'] 66 | 67 | 68 | 69 | autosectionlabel_prefix_document = True 70 | 71 | def docstring(app, what, name, obj, options, lines): 72 | md = '\n'.join(lines) 73 | ast = commonmark.Parser().parse(md) 74 | rst = commonmark.ReStructuredTextRenderer().render(ast) 75 | lines.clear() 76 | lines += rst.splitlines() 77 | 78 | def setup(app): 79 | config = { 80 | # 'url_resolver': lambda url: github_doc_root + url, 81 | 'auto_toc_tree_section': 'Contents', 82 | 'enable_eval_rst': True, 83 | } 84 | app.add_config_value('recommonmark_config', config, True) 85 | app.add_transform(AutoStructify) 86 | app.connect('autodoc-process-docstring', docstring) 87 | app.add_config_value('no_underscore_emphasis', False, 'env') 88 | app.add_config_value('m2r_parse_relative_links', False, 'env') 89 | app.add_config_value('m2r_anonymous_references', False, 'env') 90 | app.add_config_value('m2r_disable_inline_math', False, 'env') 91 | app.add_config_value('m2r_use_mermaid', False, 'env') 92 | app.add_directive('mdinclude', MdInclude) 93 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Ma'ayanlab Bioinformatics documentation master file, created by 2 | sphinx-quickstart on Thu May 21 12:32:42 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Ma'ayanlab Bioinformatics's documentation! 7 | ===================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | 20 | .. mdinclude:: ../../README.md 21 | -------------------------------------------------------------------------------- /examples/biojupies.py: -------------------------------------------------------------------------------- 1 | #%%[markdown] 2 | 3 | # Here we walk through building a biojupies notebook using methods in this library for the purpose of integration testing.
4 | 5 | #%% 6 | import sys; sys.path.insert(0, '..') 7 | import os 8 | import numpy as np 9 | import pandas as pd 10 | from IPython.display import display 11 | from sklearn.decomposition import PCA 12 | from matplotlib import pyplot as plt 13 | import plotly.express as px 14 | from maayanlab_bioinformatics.normalization import filter_by_var, cpm_normalize, zscore_normalize, log10_normalize 15 | from maayanlab_bioinformatics.dge import limma_voom_differential_expression 16 | from maayanlab_bioinformatics.utils import merge 17 | os.environ['R_LIBS_USER'] = '/home/u8sand/.r_libs' 18 | 19 | #%% 20 | # biojupies settings 21 | pca_top_genes = 2500 22 | normalization = 'logCPM' 23 | zscore = True 24 | pval_thresh = 0.05 25 | logFC_thresh = 1.5 26 | enrichr_geneset_size = 500 27 | sort_genes_by = 't' 28 | 29 | #%% 30 | # ## Load Dataset 31 | df_data = pd.read_csv('../tests/test_example_matrix.txt', sep='\t', index_col=0) 32 | display(df_data.head()) 33 | 34 | df_metadata = pd.read_csv('../tests/test_example_metadata.txt', sep='\t', index_col=0) 35 | display(df_metadata.head()) 36 | 37 | #%% 38 | # Biojupies selection 39 | normal = df_data.loc[:, df_metadata['cell type'] == 'normal melanocytes'] 40 | perturbation = df_data.loc[:, df_metadata['cell type'] == 'melanoma cell line'] 41 | 42 | #%% 43 | # ## PCA 44 | # Principal Component Analysis was performed using the PCA function from the sklearn Python module. 45 | # Prior to performing PCA, the raw gene counts were normalized using the logCPM method, 46 | df_data_norm_for_pca = log10_normalize(cpm_normalize(df_data)) 47 | # filtered by selecting the 2500 genes with most variable expression 48 | df_data_norm_for_pca = filter_by_var(df_data_norm_for_pca, top_n=pca_top_genes) 49 | # and finally transformed using the Z-score method. 50 | df_data_norm_for_pca = zscore_normalize(df_data_norm_for_pca) 51 | 52 | pca = PCA() 53 | pca.fit(df_data_norm_for_pca.T.values) 54 | df_pca = pd.DataFrame( 55 | pca.transform(df_data_norm_for_pca.T.values), 56 | index=df_data_norm_for_pca.columns, 57 | columns=[ 58 | f"PC{n}({var*100:2.1f}% var. explained)" 59 | for n, var in enumerate(pca.explained_variance_ratio_) 60 | ] 61 | ) 62 | px.scatter_3d( 63 | merge(df_pca, df_metadata), 64 | x=df_pca.columns[0], 65 | y=df_pca.columns[1], 66 | z=df_pca.columns[2], 67 | color='cell type', 68 | hover_data=[df_pca.index], 69 | ) 70 | 71 | #%% 72 | # ## clustergrammer 73 | 74 | #%% 75 | # ## library size analysis 76 | 77 | #%% 78 | # ## Differential Expression Table 79 | dge_table = limma_voom_differential_expression( 80 | normal, 81 | perturbation, 82 | filter_genes=True, 83 | voom_design=False, 84 | ) 85 | dge_table.sort_values('P.Value') 86 | 87 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/__init__.py: -------------------------------------------------------------------------------- 1 | '''A collection of useful functions for bioinformatics data analysis. 2 | 3 | This library contains many functions and methods I use again and again in different analyses including: 4 | - api: enrichr, speedrichr, ... 5 | - clustering: silhouette analysis, ... 6 | - dge: various differential gene expression techniques including chdir, limma_voom, ... 7 | - harmonization: transcript to gene conversions, ... 8 | - normalization: log, cpm, zscore, filterByExpr, quantile, ... 9 | - parse: gmt, suerat ready files, ... 10 | - utils: fetch_save_read, merge, ...
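For example, a typical workflow sketch (the file path and column subsets are placeholders, not part of the library):
```python
import pandas as pd
from maayanlab_bioinformatics.normalization import cpm_normalize, log2_normalize, filter_by_var
from maayanlab_bioinformatics.dge import limma_voom_differential_expression

# raw counts: genes as rows, samples as columns (placeholder path)
df = pd.read_csv('expression.tsv', sep='\t', index_col=0)
# normalize for visualization/clustering
df_norm = filter_by_var(log2_normalize(cpm_normalize(df)), top_n=2500)
# differential expression works on the raw counts, given control/case column subsets
# dge = limma_voom_differential_expression(df[control_cols], df[case_cols])
```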
11 | ''' 12 | 13 | from maayanlab_bioinformatics import api, clustering, dge, harmonization, normalization, parse, utils 14 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/api/__init__.py: -------------------------------------------------------------------------------- 1 | '''This module contains API wrappers for some commonly used tool APIs 2 | ''' 3 | 4 | from maayanlab_bioinformatics.api.enrichr import enrichr_link_from_genes, enrichr_get_top_results, enrichr_term_genes, EnrichrUserList 5 | from maayanlab_bioinformatics.api.speedrichr import speedenrich 6 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/api/enrichr.py: -------------------------------------------------------------------------------- 1 | import time 2 | import requests 3 | import pandas as pd 4 | 5 | def enrichr_link_from_genes(genes, description='', enrichr_link='https://maayanlab.cloud/Enrichr', sleep=1): 6 | ''' Functional access to Enrichr API 7 | ''' 8 | if sleep: 9 | time.sleep(sleep) 10 | resp = requests.post(enrichr_link + '/addList', files={ 11 | 'list': (None, '\n'.join(genes)), 12 | 'description': (None, description), 13 | }) 14 | if resp.status_code != 200: 15 | raise Exception('Enrichr failed with status {}: {}'.format( 16 | resp.status_code, 17 | resp.text, 18 | )) 19 | result = resp.json() 20 | return dict(result, link=enrichr_link + '/enrich?dataset=' + resp.json()['shortId']) 21 | 22 | def enrichr_get_top_results(userListId, bg, enrichr_link='https://maayanlab.cloud/Enrichr', sleep=1): 23 | if sleep: 24 | time.sleep(sleep) 25 | resp = requests.get(enrichr_link + '/enrich?userListId={}&backgroundType={}'.format(userListId, bg)) 26 | if resp.status_code != 200: 27 | raise Exception('Enrichr failed with status {}: {}'.format( 28 | resp.status_code, 29 | resp.text, 30 | )) 31 | return pd.DataFrame(resp.json()[bg], columns=[ 32 | 'rank', 'term', 'pvalue', 'zscore', 'combinedscore', 'overlapping_genes', 'adjusted_pvalue', '', '' 33 | ]) 34 | 35 | def enrichr_term_genes(bg, terms, enrichr_link='https://maayanlab.cloud/Enrichr', sleep=1): 36 | if sleep: 37 | time.sleep(sleep) 38 | resp = requests.get(enrichr_link + '/geneSetLibrary?mode=json&libraryName={}&term={}'.format( 39 | bg, 40 | ';'.join(terms), 41 | )) 42 | if resp.status_code != 200: 43 | raise Exception('Enrichr failed with status {}: {}'.format( 44 | resp.status_code, 45 | resp.text, 46 | )) 47 | return resp.json() 48 | 49 | class EnrichrUserList: 50 | ''' Object oriented access to Enrichr results. 51 | 52 | Example: 53 | ```python 54 | from maayanlab_bioinformatics.api import EnrichrUserList 55 | mylist = EnrichrUserList([ 56 | 'STAT1', 'ACE2', #... 57 | ], 'mylist') 58 | print(mylist.link) 59 | mylist['GO_Biological_Process_2021'] # returns dataframe with enrichment results 60 | ``` 61 | ''' 62 | def __init__(self, genes, description='', shortId=None, userListId=None, enrichr_link='https://maayanlab.cloud/Enrichr'): 63 | self._enrichr_link = enrichr_link 64 | self._genes = genes 65 | self._description = description 66 | self._shortId = shortId 67 | self._userListId = userListId 68 | self._results = {} 69 | 70 | def __repr__(self): 71 | return f"EnrichrUserList(..., description={repr(self._description)}, userListId={repr(self._userListId)}, shortId={repr(self._shortId)})" 72 | 73 | @staticmethod 74 | def from_url(enrichrUrl): 75 | ''' Build object from an existing enrichr share url, 76 | e.g. 
77 | userlist = EnrichrUserList.from_url('https://maayanlab.cloud/Enrichr/enrich?dataset=285c88882ac50767f2a452c1e93632fd') 78 | ''' 79 | import requests 80 | import urllib.parse 81 | from bs4 import BeautifulSoup 82 | enrichrUrl_parsed = urllib.parse.urlparse(enrichrUrl) 83 | enrichr_link = f"{enrichrUrl_parsed.scheme}://{enrichrUrl_parsed.netloc}{'/'.join(enrichrUrl_parsed.path.split('/')[:-1])}" 84 | shortId = dict(urllib.parse.parse_qsl(enrichrUrl_parsed.query))['dataset'] 85 | time.sleep(0.5) 86 | root = BeautifulSoup(requests.get(enrichrUrl).content, features='lxml') 87 | userListId = root.select_one('#userListId').get('value') 88 | time.sleep(0.5) 89 | res = requests.get(enrichr_link + '/view', params=dict(userListId=userListId)).json() 90 | genes = res['genes'] 91 | description = res['description'] 92 | return EnrichrUserList(genes, description=description, shortId=shortId, userListId=userListId, enrichr_link=enrichr_link) 93 | 94 | @property 95 | def genes(self): 96 | return self._genes 97 | 98 | @property 99 | def description(self): 100 | return self._description 101 | 102 | @property 103 | def link(self): 104 | return self._enrichr_link + '/enrich?dataset=' + self.shortId 105 | 106 | @property 107 | def shortId(self): 108 | if self._shortId is None: self._addList() 109 | return self._shortId 110 | 111 | @property 112 | def userListId(self): 113 | if self._userListId is None: self._addList() 114 | return self._userListId 115 | 116 | def __getitem__(self, library): 117 | if library not in self._results: self._enrich(library) 118 | return self._results[library].copy() 119 | 120 | def _addList(self): 121 | resp = requests.post(self._enrichr_link + '/addList', files={ 122 | 'list': (None, '\n'.join(self.genes)), 123 | 'description': (None, self.description), 124 | }) 125 | if resp.status_code != 200: 126 | raise Exception('Enrichr failed with status {}: {}'.format( 127 | resp.status_code, 128 | resp.text, 129 | )) 130 | results = resp.json() 131 | # wait a tinybit before returning link (backoff) 132 | time.sleep(0.5) 133 | # 134 | self._userListId = results['userListId'] 135 | self._shortId = results['shortId'] 136 | 137 | def _enrich(self, library): 138 | resp = requests.get(self._enrichr_link + '/enrich', params={ 139 | 'userListId': self.userListId, 140 | 'backgroundType': library, 141 | }) 142 | if resp.status_code != 200: 143 | raise Exception('Enrichr failed with status {}: {}'.format( 144 | resp.status_code, 145 | resp.text, 146 | )) 147 | results = resp.json() 148 | # wait a tinybit before returning link (backoff) 149 | time.sleep(0.5) 150 | self._results[library] = pd.DataFrame([ 151 | dict( 152 | rank=rank, 153 | term=term, 154 | pvalue=pvalue, 155 | zscore=zscore, 156 | combinedscore=combinedscore, 157 | overlapping_genes=overlapping_genes, 158 | adjusted_pvalue=adjusted_pvalue, 159 | ) 160 | for rank, term, pvalue, zscore, combinedscore, overlapping_genes, adjusted_pvalue, _, _ in results[library] 161 | ]) 162 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/api/speedrichr.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import typing as t 3 | import pandas as pd 4 | 5 | def speedenrich(userlist: t.List[str], libraries: t.List[str]=None, background: t.List[str]=None, description='Example gene list', base_url='https://maayanlab.cloud/speedrichr'): 6 | ''' Perform enrichment analysis using speedrichr. 7 | 8 | :param userlist: A list of genes (e.g. 
['PHF14', 'RBM3', 'MSL1', ...]) 9 | :param libraries: One or more Enrichr Libraries (https://maayanlab.cloud/Enrichr/#libraries) (e.g. ["TRANSFAC_and_JASPAR_PWMs"]) 10 | :param background: An optional background geneset for background correction (e.g. ['PHF14', 'RBM3', 'MSL1', ...]) 11 | :param base_url: If using a different enrichr instance than the public one, specify the base prefix 12 | :return: A pandas dataframe with enrichment results 13 | ''' 14 | userlist_response = requests.post( 15 | base_url+'/api/addList', 16 | files=dict( 17 | list=(None, '\n'.join(userlist)), 18 | description=(None, description), 19 | ) 20 | ).json() 21 | if background: 22 | background_response = requests.post( 23 | base_url+'/api/addbackground', 24 | data=dict(background='\n'.join(background)), 25 | ).json() 26 | results = {} 27 | for library in set(libraries): 28 | results.update( 29 | requests.post( 30 | base_url+'/api/backgroundenrich', 31 | data=dict( 32 | userListId=userlist_response['userListId'], 33 | backgroundid=background_response['backgroundid'], 34 | backgroundType=library, 35 | ) 36 | ).json() 37 | ) 38 | else: 39 | results = {} 40 | for library in set(libraries): 41 | results.update( 42 | requests.get( 43 | base_url+'/api/enrich', 44 | params=dict( 45 | userListId=userlist_response['userListId'], 46 | backgroundType=library, 47 | ), 48 | ).json() 49 | ) 50 | return pd.DataFrame([ 51 | [l, *r] 52 | for l, result in results.items() 53 | for r in result 54 | ], columns=['library', 'rank', 'term', 'pvalue', 'oddsratio', 'combined score', 'overlap', 'adj pvalue', 'legacy_0', 'legacy_1']).sort_values('rank', ascending=True) 55 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | '''This module contains functions relating to high level cluster analysis 2 | ''' 3 | 4 | from maayanlab_bioinformatics.clustering.silhouette_analysis import silhouette_analysis 5 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/clustering/silhouette_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.cluster import KMeans 3 | from sklearn.metrics import silhouette_score 4 | 5 | def silhouette_analysis(mat: pd.DataFrame, min_clusters=2, max_clusters=25, metric='cosine', random_state=None, **kwargs): 6 | ''' Compute KMeans repeatedly on the matrix with different cluster 7 | values between min_clusters and max_clusters, compute the silhouette_score, 8 | and return the best kmeans model/predictions. 
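Example (a minimal sketch; `df` stands in for any observations-by-features pd.DataFrame):
```python
from maayanlab_bioinformatics.clustering import silhouette_analysis
results = silhouette_analysis(df, min_clusters=2, max_clusters=10, random_state=42)
results.silhouette_scores  # DataFrame: one silhouette score per cluster count tried
results.best_score         # the highest silhouette score found
results.best_preds         # DataFrame assigning each row of df to a 'Cluster N' label
```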
9 | ''' 10 | silhouette_scores = {} 11 | best = None 12 | for n in range(min_clusters, max_clusters+1): 13 | km = KMeans(n_clusters=n, random_state=random_state, **kwargs) 14 | y_pred = km.fit_predict(mat.values) 15 | score = silhouette_score(mat.values, y_pred, metric=metric) 16 | silhouette_scores[n] = score 17 | if best is None or score > best[0]: 18 | best = (score, km, y_pred) 19 | # 20 | silhouette_scores = pd.DataFrame([ 21 | {'N Clusters': k, 'Silhouette Score': v} 22 | for k, v in silhouette_scores.items() 23 | ]) 24 | # reuse the stored best predictions rather than re-fitting, which could produce a different clustering 25 | score, km, y_pred = best 26 | y_pred = pd.DataFrame({ 27 | 'Cluster': [ 28 | 'Cluster {c}'.format(c=c) 29 | for c in y_pred 30 | ] 31 | }, index=mat.index) 32 | return type('SilhouetteAnalysis', tuple(), dict( 33 | silhouette_scores=silhouette_scores, 34 | best_score=score, 35 | best_km=km, 36 | best_preds=y_pred, 37 | )) 38 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/dge/__init__.py: -------------------------------------------------------------------------------- 1 | ''' This module contains functions for differential expression analysis 2 | ''' 3 | 4 | from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction, up_down_from_characteristic_direction 5 | from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression, limma_voom_differential_expression_design, up_down_from_limma_voom 6 | from maayanlab_bioinformatics.dge.deseq2 import deseq2_differential_expression 7 | from maayanlab_bioinformatics.dge.ttest import ttest_differential_expression 8 | from maayanlab_bioinformatics.dge.logfc import logfc_differential_expression 9 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/dge/characteristic_direction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import chi2 4 | from scipy.stats.mstats import zscore 5 | from sklearn.decomposition import PCA 6 | 7 | # TODO: revamp _chdir 8 | def _chdir(data, sampleclass, genes, gamma=1., sort=True, calculate_sig=False, nnull=10, sig_only=False, norm_vector=True): 9 | """ repurposed from https://github.com/MaayanLab/geode/blob/master/geode/geode.py#L10-L115 10 | 11 | Calculate the characteristic direction for a gene expression dataset 12 | 13 | Input: 14 | data: numpy.array, is the data matrix of gene expression where rows correspond to genes and columns correspond to samples 15 | sampleclass: list or numpy.array, labels of the samples, it has to consist of 0, 1 and 2, with 0 being columns to be excluded, 1 being control and 2 being perturbation 16 | example: sampleclass = [1,1,1,2,2,2] 17 | genes: list or numpy.array, row labels for genes 18 | gamma: float, regularization term.
A parameter that smooths the covariance matrix and reduces potential noise in the dataset 19 | sort: bool, whether to sort the output by the absolute value of chdir 20 | calculate_sig: bool, whether to calculate the significance of characteristic directions 21 | nnull: int, number of null characteristic directions to calculate for significance 22 | sig_only: bool, whether to return only significant genes; active only when calculate_sig is True 23 | norm_vector: bool, whether to return a characteristic direction vector normalized to unit vector 24 | Output: 25 | A list of tuples of the characteristic directions of genes, sorted by absolute value in descending order. 26 | If calculate_sig is set to True, each tuple contains a third element which is the ratio of characteristic directions to null ChDir 27 | """ 28 | 29 | ## check input 30 | data = data.astype(float) 31 | sampleclass = np.array(list(map(int, sampleclass))) 32 | # masks 33 | m_non0 = sampleclass != 0 34 | m1 = sampleclass[m_non0] == 1 35 | m2 = sampleclass[m_non0] == 2 36 | 37 | if type(gamma) not in [float, int]: 38 | raise ValueError("gamma has to be numeric") 39 | if set(sampleclass) != set([1,2]) and set(sampleclass) != set([0,1,2]): 40 | raise ValueError("sampleclass has to be a list whose elements are only 0, 1 or 2") 41 | # if m1.sum()<2 or m2.sum()<2: 42 | # raise ValueError("Too few samples to calculate characteristic directions") 43 | if len(genes) != data.shape[0]: 44 | raise ValueError("Number of genes does not match the dimension of the expression matrix") 45 | 46 | ## normalize data 47 | data = data[:, m_non0] 48 | data = zscore(data) # standardize each gene across samples 49 | 50 | ## start to compute 51 | n1 = m1.sum() # number of controls 52 | n2 = m2.sum() # number of experiments 53 | 54 | ## the difference between experiment mean vector and control mean vector. 55 | meanvec = data[:,m2].mean(axis=1) - data[:,m1].mean(axis=1) 56 | 57 | ## initialize the pca object 58 | pca = PCA(n_components=None) 59 | pca.fit(data.T) 60 | 61 | ## compute the number of PCs to keep 62 | cumsum = pca.explained_variance_ratio_ # explained variance of each PC 63 | keepPC = len(cumsum[cumsum > 0.001]) # number of PCs to keep 64 | 65 | v = pca.components_[0:keepPC].T # rotated data 66 | r = pca.transform(data.T)[:,0:keepPC] # transformed data 67 | 68 | dd = ( np.dot(r[m1].T,r[m1]) + np.dot(r[m2].T,r[m2]) ) / float(n1+n2-2) # covariance 69 | sigma = np.mean(np.diag(dd)) # the scalar covariance 70 | 71 | shrunkMats = np.linalg.inv(gamma*dd + sigma*(1-gamma)*np.eye(keepPC)) 72 | 73 | b = np.dot(v, np.dot(np.dot(v.T, meanvec), shrunkMats)) 74 | 75 | if norm_vector: 76 | b /= np.linalg.norm(b) # normalize b to unit vector 77 | 78 | grouped = zip([abs(item) for item in b],b,genes) 79 | if sort: 80 | grouped = sorted(grouped,key=lambda x: x[0], reverse=True) 81 | 82 | 83 | if not calculate_sig: # return sorted b and genes.
84 | res = [(item[1],item[2]) for item in grouped] 85 | return res 86 | else: # generate a null distribution of chdirs 87 | nu = n1 + n2 - 2 88 | y1 = np.random.multivariate_normal(np.zeros(keepPC), dd, nnull).T * np.sqrt(nu / chi2.rvs(nu,size=nnull)) 89 | y2 = np.random.multivariate_normal(np.zeros(keepPC), dd, nnull).T * np.sqrt(nu / chi2.rvs(nu,size=nnull)) 90 | y = y2 - y1 ## y is the null of v 91 | 92 | nullchdirs = [] 93 | for col in y.T: 94 | bn = np.dot(np.dot(np.dot(v,shrunkMats), v.T), np.dot(col,v.T)) 95 | bn /= np.linalg.norm(bn) 96 | bn = bn ** 2 97 | bn.sort() 98 | bn = bn[::-1] ## sort in decending order 99 | nullchdirs.append(bn) 100 | 101 | nullchdirs = np.array(nullchdirs).T 102 | nullchdirs = nullchdirs.mean(axis=1) 103 | b_s = b ** 2 104 | b_s.sort() 105 | b_s = b_s[::-1] # sorted b in decending order 106 | relerr = b_s / nullchdirs ## relative error 107 | # ratio_to_null 108 | ratios = np.cumsum(relerr)/np.sum(relerr)- np.linspace(1./len(meanvec),1,len(meanvec)) 109 | res = [(item[1],item[2], ratio) for item, ratio in zip(grouped, ratios)] 110 | # print('Number of significant genes: %s'%(np.argmax(ratios)+1)) 111 | if sig_only: 112 | return res[0:np.argmax(ratios)+1] 113 | else: 114 | return res 115 | 116 | def characteristic_direction(controls_mat: pd.DataFrame, cases_mat: pd.DataFrame, gamma=1., nnull=10, norm_vector=True, sort=True, calculate_sig=False): 117 | ''' Given two separate dataframes (controls, cases) with a shared index (genes), we compute the characteristic direction coefficients for all genes. 118 | e.g. 119 | 120 | control_mat: 121 | |control_replicate_1|control_replicate_2|... 122 | gene_1| .. | .. |... 123 | gene_2| .. | .. |... 124 | 125 | cases_mat: 126 | |case_replicate_1|case_replicate_2|... 127 | gene_1| .. | .. |... 128 | gene_2| .. | .. |... 129 | ''' 130 | assert (controls_mat.index == cases_mat.index).all(), 'Index between controls and cases must be the same' 131 | n_genes = controls_mat.shape[0] 132 | # Compute characteristic direction 133 | results = pd.DataFrame( 134 | data=_chdir( 135 | np.concatenate([controls_mat, cases_mat], axis=1), 136 | np.array( 137 | [1]*controls_mat.shape[1] + [2]*cases_mat.shape[1] 138 | ), 139 | # genes 140 | np.array(list(controls_mat.index)), 141 | gamma=gamma, 142 | nnull=nnull, 143 | norm_vector=norm_vector, 144 | sort=False, 145 | calculate_sig=calculate_sig, 146 | ), 147 | columns=['CD-coefficient', 'Index', 'Significance'] if calculate_sig else ['CD-coefficient', 'Index'], 148 | ) 149 | results.index = results['Index'] 150 | results.index.name = controls_mat.index.name 151 | if sort: 152 | results = results.sort_values('CD-coefficient') 153 | return results.drop('Index', axis=1) 154 | 155 | def up_down_from_characteristic_direction(expr: pd.DataFrame, top_n=600): 156 | ''' Using the output of `characteristic_direction`, we can extract the top n genes 157 | with the highest absolute characteristic direction coefficients and split them into `up` and `down`. 
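Example (a sketch; `controls` and `cases` are raw expression DataFrames sharing a gene index):
```python
from maayanlab_bioinformatics.dge import characteristic_direction, up_down_from_characteristic_direction
cd = characteristic_direction(controls, cases)  # DataFrame with a 'CD-coefficient' column, indexed by gene
geneset = up_down_from_characteristic_direction(cd, top_n=600)
geneset.up    # genes with positive coefficients among the top 600 by absolute coefficient
geneset.down  # genes with negative coefficients
```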
158 | ''' 159 | highest_abs_expr = expr.loc[expr.abs().sort_values('CD-coefficient', ascending=False)[:top_n].index] 160 | return type('UpDownGeneset', tuple(), dict( 161 | up=list(highest_abs_expr[highest_abs_expr > 0].dropna().index), 162 | down=list(highest_abs_expr[highest_abs_expr < 0].dropna().index), 163 | )) 164 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/dge/deseq2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import contextlib 3 | import pandas as pd 4 | from pydeseq2.dds import DeseqDataSet 5 | from pydeseq2.default_inference import DefaultInference 6 | from pydeseq2.ds import DeseqStats 7 | 8 | class _DevNull: 9 | def write(self, *args, **kwargs): pass 10 | def flush(self, *args, **kwargs): pass 11 | 12 | def deseq2_differential_expression( 13 | controls_mat: pd.DataFrame, 14 | cases_mat: pd.DataFrame, 15 | n_cpus=os.cpu_count() or 1, 16 | remove_na=True, 17 | sorted=True, 18 | stdout=_DevNull(), 19 | ): 20 | ''' Use pydeseq2 for differential expression. 21 | 22 | Note that this function expects the original raw gene counts. 23 | 24 | :param controls_mat: (pd.DataFrame) the control samples (samples as columns and genes as rows) 25 | :param cases_mat: (pd.DataFrame) the case samples (samples as columns and genes as rows) 26 | :param n_cpus: (int) number of CPUs used (default: number of cpus) 27 | :param remove_na: (bool) remove genes with NAN values (default: True) 28 | :param sorted: (bool) sort genes from most significant to least significant (default: True) 29 | :param stdout: (writeable stream) direct deseq's output, e.g. sys.stdout (default: suppress) 30 | :return: A data frame with the results 31 | ''' 32 | # Check if controls_mat and cases_mat have the same number of rows 33 | if controls_mat.shape[0] != cases_mat.shape[0]: 34 | raise ValueError("controls_mat and cases_mat must have the same number of rows.") 35 | if (controls_mat.shape[1] < 2) | (cases_mat.shape[1] < 2): 36 | raise ValueError("controls_mat and cases_mat must have at least two samples.") 37 | with contextlib.redirect_stdout(stdout): 38 | exp = pd.concat([controls_mat, cases_mat], axis=1) 39 | condition_labels = ['C'] * controls_mat.shape[1] + ['RS'] * cases_mat.shape[1] 40 | sample_names = controls_mat.columns.tolist() + cases_mat.columns.tolist() 41 | metadata = pd.DataFrame({'Sample': sample_names, 'Condition': condition_labels}).set_index("Sample") 42 | dds = DeseqDataSet(counts=exp.T.astype(int), metadata=metadata, design_factors="Condition") 43 | dds.deseq2() 44 | stat_res = DeseqStats(dds, contrast = ('Condition', 'RS', 'C'), inference=DefaultInference(n_cpus=n_cpus)) 45 | stat_res.summary() 46 | if sorted: 47 | stat_res.results_df = stat_res.results_df.sort_values("pvalue") 48 | if remove_na: 49 | return stat_res.results_df.dropna() 50 | else: 51 | return stat_res.results_df 52 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/dge/limma_voom.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from typing import Optional, Tuple 4 | 5 | R = None 6 | 7 | def _lazy_load(): 8 | global R 9 | if R is not None: 10 | return R 11 | # 12 | from rpy2.robjects import r 13 | # 14 | r(''' 15 | suppressMessages(library("R.utils")) 16 | suppressMessages(library("RCurl")) 17 | suppressMessages(library("DESeq2")) 18 | suppressMessages(library("limma")) 19 | 
suppressMessages(library("statmod")) 20 | suppressMessages(library("edgeR")) 21 | diffExpression <- function(expression, design_dataframe, filter_genes=FALSE, voom_design=FALSE, adjust="BH") { 22 | design <- as.matrix(design_dataframe) 23 | # turn count matrix into a expression object compatible with edgeR 24 | dge <- DGEList(counts=expression) 25 | # filter genes 26 | if (isTRUE(filter_genes)) { 27 | keep <- filterByExpr(dge, design) 28 | dge <- dge[keep,] 29 | } 30 | # calculate normalization factors, here the different library sizes per sample are accounted for 31 | # the normalization factors are appended to the dge object and used in later steps 32 | dge <- calcNormFactors(dge) 33 | # to be honest I am not sure what exactly happens here. Limma was developed for affymetrix chips that have 34 | # values that follow different distributions than read counts. It will apply some sort of log transformation 35 | # and make it compatible with lmFit 36 | if (isTRUE(voom_design)) { 37 | v <- voom(dge, design, plot=FALSE) 38 | } else { 39 | v <- voom(dge, plot=FALSE) 40 | } 41 | # this is basically just applying a linear fit. The test will calculate how much the differentiation between controls and samples 42 | # improves the fit over the simplest possible model (average) 43 | fit <- lmFit(v, design) 44 | # this is what makes it differential expression from B - A, B and A are set in the design matrix 45 | cont.matrix <- makeContrasts(de=B-A, levels=design) 46 | fit <- contrasts.fit(fit, cont.matrix) 47 | # this will calculate moderated t-statistics using empirical bayes moderation of the standard errors towards a common value 48 | fit <- eBayes(fit) 49 | # Get results 50 | limma_dataframe <- topTable(fit, adjust=adjust, number=nrow(expression)) 51 | limma_dataframe$gene_symbol <- rownames(limma_dataframe) 52 | # 53 | return (limma_dataframe) 54 | } 55 | ''') 56 | R = r 57 | return R 58 | 59 | def limma_voom_differential_expression_design( 60 | expression: pd.DataFrame, 61 | design: pd.DataFrame, 62 | de: Tuple[str, str], 63 | filter_genes: bool = False, 64 | voom_design: bool = False, 65 | ): 66 | ''' Use R's voom and limma for differential expression. 67 | 68 | Note that this function expects the original raw gene counts. 
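For reference, a design matrix sketch (a hypothetical three-vs-three comparison; the sample names are placeholders): 
```python
import pandas as pd
# index = sample names matching expression.columns; one indicator column per group
design = pd.DataFrame({
    'A': [1, 1, 1, 0, 0, 0],  # control samples
    'B': [0, 0, 0, 1, 1, 1],  # case samples
}, index=['c1', 'c2', 'c3', 's1', 's2', 's3'])
# de=('A', 'B') marks which design columns are control vs case (contrast B - A in the R code above)
# limma_voom_differential_expression_design(expression, design, de=('A', 'B'))
```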
69 | 70 | alex version: voom_design=True, filter_genes=False 71 | biojupies version: voom_design=False, filter_genes=True 72 | 73 | :param expression: (pd.DataFrame) the samples 74 | :param design: (pd.DataFrame) the design dataframe 75 | :param de: (Tuple[str, str]) ('control_column_name', 'case_column_name') 76 | :param filter_genes: (bool) Whether to perform R's `filterByExpr` during normalization 77 | :param voom_design: (bool) Whether to give R's voom function the design matrix (supervised) 78 | :return: A data frame with the results 79 | ''' 80 | expression = expression.copy() 81 | design = design.copy().loc[expression.columns] 82 | expression.columns = design.index = ['s'+str(i) for i, _ in enumerate(expression.columns)] 83 | design.columns = [ 84 | {de[0]: 'A', de[1]: 'B'}.get(col, 'c'+str(i)) 85 | for i, col in enumerate(design.columns) 86 | ] 87 | assert 'A' in design.columns and 'B' in design.columns 88 | import rpy2.robjects as ro 89 | from rpy2.robjects import pandas2ri 90 | from rpy2.robjects.conversion import localconverter 91 | r = _lazy_load() 92 | with localconverter(ro.default_converter + pandas2ri.converter): 93 | return r.diffExpression( 94 | expression, 95 | design, 96 | filter_genes=filter_genes, 97 | voom_design=voom_design, 98 | ).sort_values('t', ascending=False).set_index('gene_symbol') 99 | 100 | def make_design_matrix(expression_df, controls, cases): 101 | expression_df = expression_df.copy() 102 | expression_df.index.name = 'index' 103 | expression_df = expression_df.reset_index().groupby('index').sum() 104 | design_df = pd.DataFrame([{'index': 's'+str(i), 'A': int(x in controls), 'B': int(x in cases)} for i, x in enumerate(expression_df.columns)]).set_index('index') 105 | expression_df.columns = ['s'+str(i) for i, _ in enumerate(expression_df.columns)] 106 | return expression_df, design_df 107 | 108 | def limma_voom_differential_expression( 109 | controls_mat: pd.DataFrame, 110 | cases_mat: pd.DataFrame, 111 | all_data_mat: Optional[pd.DataFrame] = None, 112 | filter_genes: bool = False, 113 | voom_design: bool = False, 114 | ): 115 | ''' Use R's voom and limma for differential expression. 116 | 117 | Note that this function expects the original raw gene counts. 
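For example (a sketch; `controls` and `cases` are raw-count DataFrames with genes as rows):
```python
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
dge = limma_voom_differential_expression(controls, cases, filter_genes=True, voom_design=False)
dge.head()  # columns include logFC, t, P.Value, adj.P.Val; index is gene_symbol
```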
118 | 119 | alex version: voom_design=True, filter_genes=False 120 | biojupies version: voom_design=False, filter_genes=True 121 | 122 | :param controls_mat: (pd.DataFrame) the control samples 123 | :param cases_mat: (pd.DataFrame) the case samples 124 | :param all_data_mat: (pd.DataFrame) *all* samples (for full experiment normalization) 125 | :param filter_genes: (bool) Whether to perform R's `filterByExpr` during normalization 126 | :param voom_design: (bool) Whether to give R's voom function the design matrix (supervised) 127 | :return: A data frame with the results 128 | ''' 129 | if all_data_mat is None: 130 | all_data_mat = pd.concat([controls_mat, cases_mat], axis=1) 131 | all_data_mat.columns = all_data_mat.columns.to_flat_index() 132 | # transform input into expression/design ready for R functions 133 | expression, design = make_design_matrix(all_data_mat, set(controls_mat.columns.to_flat_index()), set(cases_mat.columns.to_flat_index())) 134 | import rpy2.robjects as ro 135 | from rpy2.robjects import pandas2ri 136 | from rpy2.robjects.conversion import localconverter 137 | r = _lazy_load() 138 | with localconverter(ro.default_converter + pandas2ri.converter): 139 | return r.diffExpression( 140 | expression, 141 | design, 142 | filter_genes=filter_genes, 143 | voom_design=voom_design, 144 | ).sort_values('t', ascending=False).set_index('gene_symbol') 145 | 146 | def up_down_from_limma_voom(expr: pd.DataFrame, top_n: int = 600): 147 | ''' Given limma_voom_differential_expression output, produce a discrete up/down geneset 148 | 149 | :param top_n: (int) the number of genes in total to produce 150 | :return: UpDownGeneset, a type with `.up` and `.down` attributes corresponding to the lists of genes. 151 | ''' 152 | most_significant_expr = expr.sort_values('P.Value').iloc[:top_n] 153 | return type('UpDownGeneset', tuple(), dict( 154 | up=list(most_significant_expr[most_significant_expr['logFC'] > 0].dropna().index), 155 | down=list(most_significant_expr[most_significant_expr['logFC'] < 0].dropna().index), 156 | )) 157 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/dge/logfc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from maayanlab_bioinformatics.normalization import log2_normalize 3 | 4 | def logfc_differential_expression(controls_mat: pd.DataFrame, cases_mat: pd.DataFrame): 5 | ''' NOT RECOMMENDED. Given two separate dataframes (controls, cases) with a shared index (genes), 6 | we compute the logFC differential expression for all genes.
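Example (a sketch with toy counts):
```python
import pandas as pd
from maayanlab_bioinformatics.dge import logfc_differential_expression
controls = pd.DataFrame({'c1': [10, 5], 'c2': [12, 4]}, index=['g1', 'g2'])
cases = pd.DataFrame({'s1': [40, 5], 's2': [44, 6]}, index=['g1', 'g2'])
logfc_differential_expression(controls, cases)  # one 'LogFC' column, sorted by absolute LogFC
```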
7 | 8 | :param controls_mat: (pd.DataFrame) the control samples (samples as columns and genes as rows) 9 | :param cases_mat: (pd.DataFrame) the case samples (samples as columns and genes as rows) 10 | :return: A data frame with the results 11 | ''' 12 | assert (controls_mat.index == cases_mat.index).all(), 'Index between controls and cases must be the same' 13 | df_results = pd.DataFrame({ 14 | 'LogFC': log2_normalize(cases_mat.mean(axis=1)) - log2_normalize(controls_mat.mean(axis=1)), 15 | }, index=controls_mat.index) 16 | df_results.sort_values('LogFC', key=lambda logfc: logfc.abs(), ascending=False, inplace=True) 17 | return df_results 18 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/dge/ttest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import scipy.stats 3 | from maayanlab_bioinformatics.normalization import log2_normalize 4 | 5 | def ttest_differential_expression(controls_mat: pd.DataFrame, cases_mat: pd.DataFrame, equal_var=False, alternative='two-sided', log2norm=True): 6 | ''' Given two separate dataframes (controls, cases) with a shared index (genes), 7 | we compute the ttest differential expression for all genes. Benjamini-Hochberg Adjusted p-value. 8 | 9 | :param controls_mat: (pd.DataFrame) the control samples (samples as columns and genes as rows) 10 | :param cases_mat: (pd.DataFrame) the case samples (samples as columns and genes as rows) 11 | :param equal_var: (bool) Should t-test assume equal variance (default: False) 12 | :param alternative: (str) Alternative hypothesis (see scipy.stats.ttest_ind) (default: two-sided) 13 | :param log2norm: (bool) Apply log2norm, typically keep with raw counts but disable if you have normalized data (default: True) 14 | :return: A data frame with the results 15 | ''' 16 | assert (controls_mat.index == cases_mat.index).all(), 'Index between controls and cases must be the same' 17 | if log2norm: 18 | cases_mat = log2_normalize(cases_mat) 19 | controls_mat = log2_normalize(controls_mat) 20 | results = scipy.stats.ttest_ind(cases_mat.T, controls_mat.T, equal_var=equal_var, alternative=alternative) 21 | df_results = pd.DataFrame({ 22 | 'Statistic': results.statistic, 23 | 'Pval': results.pvalue, 24 | }, index=controls_mat.index) 25 | df_results['AdjPval'] = scipy.stats.false_discovery_control(df_results['Pval'].fillna(1.), method='bh') 26 | df_results.sort_values('AdjPval', inplace=True) 27 | return df_results 28 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/enrichment/__init__.py: -------------------------------------------------------------------------------- 1 | ''' This module contains functions that perform enrichment analysis. 
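For example (a sketch; the gene sets are placeholders and 20000 approximates the number of human genes):
```python
from maayanlab_bioinformatics.enrichment import enrich_crisp
input_signature = {'STAT1', 'ACE2', 'MX1'}
libraries = {'interferon signaling': {'STAT1', 'MX1', 'IRF1'}, 'unrelated set': {'GAPDH'}}
for term, result in enrich_crisp(input_signature, libraries, 20000, preserve_overlap=True):
    print(term, result.pvalue, result.odds_ratio, result.n_overlap, result.overlap)
```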
2 | ''' 3 | 4 | from maayanlab_bioinformatics.enrichment.crisp import fisher_overlap, enrich_crisp, safe_odds_ratio 5 | from maayanlab_bioinformatics.enrichment.gsea2003 import GSEA2003 6 | from maayanlab_bioinformatics.enrichment.gsea2005 import GSEA2005 -------------------------------------------------------------------------------- /maayanlab_bioinformatics/enrichment/crisp.py: -------------------------------------------------------------------------------- 1 | # import fisher 2 | import scipy.stats 3 | from typing import Union, Dict, Set, Iterable, Tuple, Hashable, Any, TypeVar, Optional 4 | from dataclasses import dataclass 5 | import math 6 | @dataclass(frozen=True) 7 | class FisherOverlap: 8 | pvalue: float 9 | odds_ratio: float 10 | n_overlap: int 11 | overlap: Optional[Set[Hashable]] 12 | 13 | T = TypeVar('T') 14 | DictOrIterableTuple = Union[Dict[Hashable, T], Iterable[Tuple[Hashable, T]]] 15 | CompatibleSignature = Union[DictOrIterableTuple[Any], Set[Hashable]] 16 | CompatibleSignatures = DictOrIterableTuple[CompatibleSignature] 17 | EnrichmentResult = Iterable[Tuple[Hashable, FisherOverlap]] 18 | 19 | def _dict_or_iterable_tuple(it: DictOrIterableTuple[T]) -> Iterable[Tuple[Hashable, T]]: 20 | if callable(getattr(it, 'items', None)): 21 | return it.items() 22 | else: 23 | return it 24 | 25 | def safe_odds_ratio(a, b, c, d): 26 | ''' Compute the odds ratio, returning helpful answers in the case of division-by-zero issues. 27 | ''' 28 | # numerator 29 | if a == 0 and c == 0: 30 | ac = float('nan') 31 | elif c == 0: # a != 0 32 | ac = float('inf') 33 | else: 34 | ac = float(a / c) 35 | # denominator 36 | if b == 0 and d == 0: 37 | bd = float('nan') 38 | elif d == 0: # b != 0 39 | bd = float('inf') 40 | else: 41 | bd = float(b / d) 42 | # odds ratio (numerator / denominator) 43 | if math.isnan(ac) or math.isnan(bd): # note: `x == float('nan')` is always False, so we must use math.isnan here 44 | # this would only happen if you had empty signatures 45 | return float('nan') 46 | elif ac == float('inf') and bd == float('inf'): 47 | # this would mean *everything* is in the input set. 48 | # inf probably makes sense given that the occurrence 49 | # of the event would be *guaranteed* in this case 50 | return float('inf') 51 | elif ac == float('inf'): # bd != float('inf') 52 | # inf / number = inf 53 | return float('inf') 54 | elif bd == float('inf'): # ac != float('inf') 55 | # number / inf = 0 56 | return 0.0 57 | elif bd == 0: 58 | return float('inf') 59 | else: 60 | return ac / bd 61 | 62 | def fisher_overlap( 63 | input_signature: Set[Hashable], 64 | background_signature: Set[Hashable], 65 | n_background_entities: int, 66 | preserve_overlap: bool = False, 67 | ) -> Optional[FisherOverlap]: 68 | ''' Given input and background set, compute the overlap, fisher significance, and odds ratio. 69 | In the case of no overlap, will return None.
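To make the 2x2 contingency table concrete, an illustrative (made-up) worked example: with a 10-gene input set, a 20-gene background set, 5 shared genes, and 21000 background entities, the table entries built below are a=5, b=5, c=15, d=20975.
```python
# illustrative numbers only, not from real data
a, b, c, d = 5, 10 - 5, 20 - 5, 21000 - 20 - 10 + 5  # = 5, 5, 15, 20975
import scipy.stats
print(scipy.stats.fisher_exact([[a, b], [c, d]], 'greater')[1])  # right-tailed p-value
```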
70 | ''' 71 | overlap = input_signature & background_signature 72 | n_overlap = len(overlap) 73 | n_input_signature = len(input_signature) 74 | n_background_signature = len(background_signature) 75 | if n_overlap == 0: 76 | return None 77 | # 78 | a = n_overlap 79 | b = n_input_signature - n_overlap 80 | c = n_background_signature - n_overlap 81 | d = n_background_entities - n_background_signature - n_input_signature + n_overlap 82 | if d < 0: 83 | raise Exception('The total population cannot be smaller than the current overlap.') 84 | # 85 | # pvalue = fisher.pvalue(a, b, c, d).right_tail 86 | pvalue = scipy.stats.fisher_exact([[a, b], [c, d]], 'greater')[1] 87 | odds_ratio = safe_odds_ratio(a, b, c, d) 88 | # 89 | return FisherOverlap( 90 | pvalue=pvalue, 91 | odds_ratio=odds_ratio, 92 | n_overlap=n_overlap, 93 | overlap=overlap if preserve_overlap else None, 94 | ) 95 | 96 | def enrich_crisp( 97 | input_signature: CompatibleSignature, 98 | background_signatures: CompatibleSignatures, 99 | n_background_entities: int, 100 | preserve_overlap: bool = False, 101 | ) -> Iterable[Tuple[Hashable, FisherOverlap]]: 102 | ''' Perform crisp set enrichment analysis using fisher overlap. 103 | Enriches the signature in input_signature against signatures in background_signatures. 104 | 105 | :param n_background_entities: should correspond to the approximate number of entities that exist; in the case of human genes, for instance, this might be 21000. 106 | ''' 107 | input_signature = set(input_signature) 108 | for background_signature_term, background_signature in _dict_or_iterable_tuple(background_signatures): 109 | background_signature = set(background_signature) 110 | result = fisher_overlap( 111 | input_signature, 112 | background_signature, 113 | n_background_entities=n_background_entities, 114 | preserve_overlap=preserve_overlap, 115 | ) 116 | if result is not None: 117 | yield background_signature_term, result 118 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/enrichment/gsea2003.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def GSEA2003(geneset_membership: pd.Series, gene_difference_metric: pd.Series): 5 | ''' 6 | Implementation of algorithm described here: 7 | https://pubmed.ncbi.nlm.nih.gov/12808457/ 8 | 9 | :param geneset_membership: (pd.Series) True if in set, False if not, index: all genes 10 | :param gene_difference_metric: (pd.Series) Difference metric between two classes, e.g. SNR difference 11 | :return (Tuple[np.array, np.array]) x and y arrays ready to be plotted. ES = y.max() 12 | ''' 13 | R_i = gene_difference_metric.sort_values(ascending=False) # R_1, ... 
R_N ordered by difference metric 14 | S = geneset_membership[R_i.index] # S containing gene_membership members 15 | G = geneset_membership.sum() 16 | N = geneset_membership.count() 17 | X = ( 18 | S * np.sqrt((N - G) / G) # X_i when member of S 19 | - (~S) * np.sqrt(G / (N - G)) # X_i when not member of S 20 | ) 21 | # 0 added to beginning for plotting, doesn't affect sum 22 | x = np.arange(N + 1) 23 | y = np.concatenate([[0],np.cumsum(X)]) 24 | return x, y 25 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/enrichment/gsea2005.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def GSEA2005(geneset_membership: pd.Series, correlations: pd.Series): 5 | ''' 6 | Implementation of algorithm described here: 7 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1239896/ 8 | 9 | :param geneset_membership: (pd.Series) True if in set, False if not, index: all genes 10 | :param correlations: (pd.Series) Correlation of each gene (index: all genes) 11 | :return (Tuple[np.array, np.array]) x and y arrays ready to be plotted. ES = y.max() 12 | ''' 13 | r_j = correlations.abs().sort_values(ascending=False) # r_j: correlation of gene_j in ranked order 14 | S = geneset_membership[r_j.index] # S: geneset mask aligned with the ranked order of r_j 15 | N = S.count() # N: number of genes 16 | N_H = S.sum() # N_H: number of hits 17 | N_R = r_j[S].sum() # N_R: sum of r_j for g_j \in S 18 | P_hit = S * r_j/N_R # P_hit: fraction of hits weighted by r_j 19 | P_miss = (~S) * 1/(N-N_H) # P_miss: fraction of misses up to position i 20 | # 0 added to beginning for plotting, doesn't affect sum 21 | x = np.arange(N + 1) 22 | y = np.concatenate([[0],np.cumsum(P_hit - P_miss)]) 23 | return x, y 24 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/harmonization/__init__.py: -------------------------------------------------------------------------------- 1 | '''This module contains functions relating to data harmonization. 2 | ''' 3 | 4 | from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_fetch, ncbi_genes_lookup 5 | from maayanlab_bioinformatics.harmonization.transcripts import transcripts_to_genes 6 | from maayanlab_bioinformatics.harmonization.id_mapper import IDMapper 7 | from maayanlab_bioinformatics.harmonization.homologs import mouse_human_homologs, human_expression_to_mouse, mouse_expression_to_human 8 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/harmonization/homologs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from maayanlab_bioinformatics.utils.fetch_save_read import fetch_save_read 4 | 5 | def mouse_human_homologs(uppercase=False): 6 | ''' Returns a dataframe with mouse/human gene mappings based on MGI. 7 | See: http://www.informatics.jax.org/homology.shtml 8 | 9 | @param uppercase: bool should mappings be uppercase (i.e. 
for case insensitive mapping) 10 | @returns pd.DataFrame 11 | ``` 12 | |mouse|human| 13 | |-----|-----| 14 | |sp140|SP140| 15 | ``` 16 | ''' 17 | mouse_human_sequence = fetch_save_read( 18 | 'http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt', 19 | 'HOM_MouseHumanSequence.rpt', 20 | sep='\t', 21 | ) 22 | mouse_human_sequence_simplified = pd.DataFrame([ 23 | dict( 24 | mouse=d.loc[d['Common Organism Name'].str.contains('mouse'), 'Symbol'].values, 25 | human=d.loc[d['Common Organism Name'].str.contains('human'), 'Symbol'].values, 26 | ) 27 | for _, d in mouse_human_sequence.groupby('DB Class Key') 28 | ]).explode('mouse').explode('human').dropna() 29 | if uppercase: 30 | mouse_human_sequence_simplified['mouse'] = mouse_human_sequence_simplified['mouse'].str.upper() 31 | mouse_human_sequence_simplified['human'] = mouse_human_sequence_simplified['human'].str.upper() 32 | return mouse_human_sequence_simplified 33 | 34 | def human_expression_to_mouse(human_expression, strategy='sum', uppercase=False): 35 | ''' Given a human expression matrix, produce a mouse-compatible expression matrix by mapping 36 | homologs. 37 | 38 | @param human_expression: pd.DataFrame(columns=samples, index=human_genes, values=counts) 39 | @param strategy: 'sum' -- the strategy to use when aggregating duplicates 40 | @returns pd.DataFrame(columns=samples, index=mouse_genes, values=counts) 41 | ''' 42 | if strategy == 'sum': 43 | mouse_expression = pd.merge( 44 | left=human_expression.set_index(human_expression.index.str.upper()), left_index=True, 45 | right=mouse_human_homologs(uppercase=uppercase), right_on='human' 46 | ).groupby('mouse').sum() 47 | else: 48 | raise NotImplementedError 49 | return mouse_expression 50 | 51 | def mouse_expression_to_human(mouse_expression, strategy='sum', uppercase=False): 52 | ''' Given a mouse expression matrix, produce a human-compatible expression matrix by mapping 53 | homologs. 54 | 55 | @param mouse_expression: pd.DataFrame(columns=samples, index=mouse_genes, values=counts) 56 | @param strategy: 'sum' -- the strategy to use when aggregating duplicates 57 | @returns pd.DataFrame(columns=samples, index=human_genes, values=counts) 58 | ''' 59 | if strategy == 'sum': 60 | human_expression = pd.merge( 61 | left=mouse_expression.set_index(mouse_expression.index.str.upper()), left_index=True, 62 | right=mouse_human_homologs(uppercase=uppercase), right_on='mouse' 63 | ).groupby('human').sum() 64 | else: 65 | raise NotImplementedError 66 | return human_expression 67 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/harmonization/id_mapper.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from collections import Counter 3 | 4 | class IDMapper: 5 | ''' Stores id mappings and makes it easy to use many of them in tandem. 6 | 7 | ```python 8 | mapper = IDMapper() 9 | 10 | mapper.update({ 'a': {'A', 'C'} }, namespace='source_1') 11 | mapper.update({ 'b': {'A', 'B'} }, namespace='source_2') 12 | mapper.get('C', namespace='source_2') == 'b' 13 | 14 | Because of the overlap in synonyms it is inferred that source_1's 'a' and source_2's 'b' correspond to the same 15 | id, so we can use any of the synonyms to retrieve the id in a given namespace. 16 | Since this can be problematic when synonyms are malformed, mapper.conflicts_summary() and mapper.top_conflicts() 17 | provide ways of debugging excess synonym applications. 
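# a hypothetical debugging session continuing the example above (outputs are illustrative, not guaranteed):
mapper.summary()            # Counter({frozenset({'source_1', 'source_2'}): 1}) -- one merged id spans both namespaces
mapper.conflicts_summary()  # Counter() -- empty here, since no synonym mapped to two different keys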
18 | ``` 19 | ''' 20 | def __init__(self): 21 | # { uuid1: {id1: 1, id2: 1, ...} } 22 | self._forward = {} 23 | # { id1: uuid1, id2, uuid1, ... } 24 | self._reverse = {} 25 | # { uuid1: { ns1: id1 }, ... } 26 | self._namespaces = {} 27 | # { ns1: { shared_synonym: { conflictid1: origid1 }, ... } } } 28 | self._conflicts = {} 29 | 30 | def summary(self): 31 | ''' Return counts of overlapping namespaces (like a venn diagram) 32 | ''' 33 | return Counter( 34 | frozenset(ns_ids.keys()) 35 | for ns_ids in self._namespaces.values() 36 | ) 37 | 38 | def conflicts_summary(self): 39 | ''' Return counts of conflicts in each namespace 40 | ''' 41 | return Counter({ 42 | ns: len(conflicts) 43 | for ns, conflicts in self._conflicts.items() 44 | }) 45 | 46 | def top_conflicts(self): 47 | ''' Return conflicting synonym counts 48 | ''' 49 | return Counter({ 50 | (ns, conflict): len(cases) 51 | for ns, cc in self._conflicts.items() 52 | for conflict, cases in cc.items() 53 | }) 54 | 55 | def get_id(self, id, namespace=None): 56 | if id is None: return None 57 | if namespace is None: 58 | return dict( 59 | id=id, 60 | refs=self._namespaces[id], 61 | synonyms=self._forward[id], 62 | ) 63 | else: 64 | return self._namespaces[id].get(namespace) 65 | 66 | def get(self, term, namespace=None): 67 | id = self._reverse.get(term) 68 | return self.get_id(id, namespace=namespace) 69 | 70 | def find(self, term): 71 | potential_ids = { 72 | id 73 | for k, id in self._reverse.items() 74 | if str(term).lower().strip() in str(k).lower().strip() or str(k).lower().strip() in str(term).lower().strip() 75 | } 76 | return { 77 | id: self.get_id(id) 78 | for id in potential_ids 79 | } 80 | 81 | def update(self, mappings, namespace=None): 82 | ''' Add mappings of the form: 83 | { identifier: { synonyms } } 84 | ''' 85 | for key, synonyms in (mappings.items() if type(mappings) == dict else mappings): 86 | id = uuid.uuid4() 87 | self._forward[id] = Counter() 88 | self._namespaces[id] = {namespace: key} 89 | for synonym in {key, *synonyms}: 90 | if synonym not in self._reverse: 91 | self._forward[id].update([synonym]) 92 | self._reverse[synonym] = id 93 | else: 94 | orig_id = self._reverse[synonym] 95 | if orig_id == id: 96 | self._forward[id].update([synonym]) 97 | else: 98 | for ns, k in self._namespaces.pop(id, {}).items(): 99 | if orig_id not in self._namespaces: 100 | self._namespaces[orig_id] = {} 101 | orig_k = self._namespaces[orig_id].get(ns) 102 | if orig_k is not None: 103 | if orig_k != k: 104 | if ns not in self._conflicts: 105 | self._conflicts[ns] = {} 106 | if synonym not in self._conflicts[ns]: 107 | self._conflicts[ns][synonym] = {} 108 | self._conflicts[ns][synonym][k] = orig_k 109 | else: 110 | self._namespaces[orig_id][ns] = k 111 | new_cnt = self._forward.pop(id) 112 | self._forward[orig_id] += new_cnt 113 | self._reverse.update({s: orig_id for s in new_cnt.keys()}) 114 | id = orig_id 115 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/harmonization/ncbi_genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from functools import lru_cache 3 | from maayanlab_bioinformatics.utils import fetch_save_read 4 | 5 | @lru_cache() 6 | def ncbi_genes_fetch(organism='Mammalia/Homo_sapiens', filters=lambda ncbi: ncbi['type_of_gene']=='protein-coding'): 7 | ''' Fetch the current NCBI Human Gene Info database. 
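A hedged usage sketch (the first call downloads and caches the table; the columns shown are the ones used in the code below):
```python
from maayanlab_bioinformatics.harmonization import ncbi_genes_fetch
ncbi = ncbi_genes_fetch()  # defaults to protein-coding Homo sapiens genes
ncbi[['Symbol', 'All_synonyms']].head()
```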
8 | See ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/ for the directory/file of the organism of interest. 9 | ''' 10 | def maybe_split(record): 11 | ''' NCBI Stores Nulls as '-' and lists '|' delimited 12 | ''' 13 | if record in {'', '-'}: 14 | return set() 15 | return set(record.split('|')) 16 | # 17 | def supplement_dbXref_prefix_omitted(ids): 18 | ''' NCBI Stores external IDS with Foreign:ID while most datasets just use the ID 19 | ''' 20 | for id in ids: 21 | # add original id 22 | yield id 23 | # also add id *without* prefix 24 | if ':' in id: 25 | yield id.split(':', maxsplit=1)[1] 26 | # 27 | ncbi = fetch_save_read( 28 | 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{}.gene_info.gz'.format(organism), 29 | '{}.gene_info.tsv'.format(organism), 30 | sep='\t', 31 | ) 32 | if filters and callable(filters): 33 | ncbi = ncbi[filters(ncbi)] 34 | # 35 | ncbi['All_synonyms'] = [ 36 | set.union( 37 | maybe_split(gene_info['Symbol']), 38 | maybe_split(gene_info['Symbol_from_nomenclature_authority']), 39 | maybe_split(str(gene_info['GeneID'])), 40 | maybe_split(gene_info['Synonyms']), 41 | maybe_split(gene_info['Other_designations']), 42 | maybe_split(gene_info['LocusTag']), 43 | set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))), 44 | ) 45 | for _, gene_info in ncbi.iterrows() 46 | ] 47 | return ncbi 48 | 49 | @lru_cache() 50 | def ncbi_genes_lookup(organism='Mammalia/Homo_sapiens', filters=lambda ncbi: ncbi['type_of_gene']=='protein-coding'): 51 | ''' Return a lookup dictionary with synonyms as the keys, and official symbols as the values 52 | Usage: 53 | ```python 54 | ncbi_lookup = ncbi_genes_lookup('Mammalia/Homo_sapiens') 55 | print(ncbi_lookup('STAT3')) # any alias will get converted into the official symbol 56 | ``` 57 | ''' 58 | ncbi_genes = ncbi_genes_fetch(organism=organism, filters=filters) 59 | synonyms, symbols = zip(*{ 60 | (synonym, gene_info['Symbol']) 61 | for _, gene_info in ncbi_genes.iterrows() 62 | for synonym in gene_info['All_synonyms'] 63 | }) 64 | ncbi_lookup = pd.Series(symbols, index=synonyms) 65 | index_values = ncbi_lookup.index.value_counts() 66 | ambiguous = index_values[index_values > 1].index 67 | ncbi_lookup_disambiguated = ncbi_lookup[( 68 | (ncbi_lookup.index == ncbi_lookup) | (~ncbi_lookup.index.isin(ambiguous)) 69 | )] 70 | return ncbi_lookup_disambiguated.to_dict().get 71 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/harmonization/transcripts.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import Dict, Optional 3 | 4 | from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup 5 | from maayanlab_bioinformatics.utils.merge import merge 6 | 7 | def transcripts_to_genes( 8 | df_expression: pd.DataFrame, 9 | df_features: pd.DataFrame=None, 10 | strategy='var', 11 | uppercasegenes=False, 12 | lookup_dict: Optional[Dict[str, str]]=None, 13 | organism='Mammalia/Homo_sapiens', 14 | ): 15 | ''' Map gene alternative ids/transcripts to gene symbols using `ncbi_genes_lookup` 16 | We take a matrix with genes/transcripts on the rows and samples on the columns. 17 | In the case of multiple gene/transcript to symbol mappings, we adopt the collision strategy specified. 18 | If df_features is provided, we will use 'symbol' column as the transcript names, 19 | otherwise we will use the df_expression index column. 
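A minimal usage sketch (the file name and its contents are hypothetical):
```python
import pandas as pd
from maayanlab_bioinformatics.harmonization import transcripts_to_genes
df_expression = pd.read_csv('transcript_counts.tsv', sep='\t', index_col=0)  # transcripts x samples
df_gene_expression = transcripts_to_genes(df_expression, strategy='sum')
```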
20 | The resulting matrix will naturally have fewer rows, corresponding to gene symbols in the 21 | `lookup_dict` which defaults to official ncbi_gene symbols for homo sapiens. 22 | 23 | :param strategy: ('var'|'sum') collision strategy (select one with highest variance, or sum counts) 24 | ''' 25 | # resolve lookup_dict if necessary 26 | if lookup_dict is None: 27 | lookup_dict = ncbi_genes_lookup(organism=organism) 28 | elif callable(lookup_dict): 29 | lookup_dict = lookup_dict() 30 | # construct df_features if not provided 31 | if df_features is None: 32 | df_features = pd.Series(df_expression.index).to_frame('symbol') 33 | df_features.index = df_expression.index 34 | # uppercase genes if necessary 35 | if uppercasegenes: 36 | df_features['symbol'] = df_features['symbol'].apply(str.upper) 37 | # get df_expression but only the highest variance transcript that 38 | # corresponds to the same set of genes 39 | if strategy == 'var': 40 | df_transcript_genes = merge( 41 | df_expression.var(axis=1).to_frame('var'), 42 | df_features[['symbol']].applymap(lambda s: lookup_dict(s)) 43 | ).groupby('symbol')['var'].idxmax().reset_index() 44 | df_transcript_genes.index = df_transcript_genes['var'] 45 | df_transcript_genes = df_transcript_genes.drop('var', axis=1) 46 | # perform the actual mapping 47 | df_gene_expression = df_expression.loc[df_transcript_genes.index] 48 | df_gene_expression.index = df_transcript_genes['symbol'] 49 | elif strategy == 'sum': 50 | df_gene_expression = merge( 51 | df_expression, 52 | df_features[['symbol']].applymap(lambda s: lookup_dict(s)) 53 | ).groupby('symbol').sum() 54 | else: 55 | raise NotImplementedError 56 | return df_gene_expression 57 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/__init__.py: -------------------------------------------------------------------------------- 1 | '''This module contains functions relating to data normalization.
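As a hedged sketch of a typical chain over a raw counts matrix `df` (genes as rows, samples as columns; the exact chain depends on the assay, and quantile_normalize is another common step):
```python
from maayanlab_bioinformatics.normalization import filter_by_expr, cpm_normalize, log2_normalize, zscore_normalize
df_norm = zscore_normalize(log2_normalize(cpm_normalize(filter_by_expr(df))))
```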
2 | ''' 3 | 4 | from maayanlab_bioinformatics.normalization.cpm import cpm_normalize 5 | from maayanlab_bioinformatics.normalization.filter import filter_by_var, filter_by_expr 6 | from maayanlab_bioinformatics.normalization.log import log2_normalize, log10_normalize 7 | from maayanlab_bioinformatics.normalization.quantile import quantile_normalize 8 | from maayanlab_bioinformatics.normalization.zscore import zscore_normalize 9 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/cpm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from functools import singledispatch 5 | 6 | 7 | @singledispatch 8 | def cpm_normalize(mat): 9 | ''' Compute counts-per-million value of counts 10 | Simple division of each column by the total sum of its counts and multiplying it by 10^6 11 | ''' 12 | logging.warn('Unrecognized type: ' + type(mat).__name__) 13 | return cpm_normalize_np(mat) 14 | 15 | @cpm_normalize.register 16 | def cpm_normalize_np(mat: np.ndarray): 17 | return (mat / mat.sum(axis=0)) * 1e6 18 | 19 | @cpm_normalize.register 20 | def cpm_normalize_pd(mat: pd.DataFrame): 21 | return (mat / mat.sum(axis=0)) * 1e6 22 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from functools import singledispatch 5 | from maayanlab_bioinformatics.normalization.cpm import cpm_normalize 6 | 7 | 8 | def filter_by_var(mat: pd.DataFrame, top_n=2500, axis=1): 9 | ''' Select rows with the most variable expression across all samples. 10 | Takes a dataframe and returns a filtered dataframe in the same orientation. 11 | e.g. 12 | |condition_1|condition_2| 13 | gene_1| 1 | 1 | 14 | gene_2| 0 | 10 | 15 | 16 | gene_1 here is *not* variable at all, 17 | gene_2 here is *very* variable. 18 | 19 | gene_1 will be dropped, while gene_2 is kept. 20 | ''' 21 | return mat.loc[mat.var(axis=1).sort_values(ascending=False).index[:top_n], :] 22 | 23 | 24 | def filter_by_expr(mat, design: pd.Series=None, group: pd.DataFrame=None, min_count=10, min_total_count=15, large_n=10, min_prop=0.7, tol=1e-14): 25 | ''' Ported from R https://rdrr.io/bioc/edgeR/src/R/filterByExpr.R 26 | ''' 27 | lib_size = mat.sum(axis=0) 28 | # Minimum effective sample size for any of the coefficients 29 | if group is None: 30 | if design is None: 31 | logging.warn('No group or design set. 
Assuming all samples belong to one group.') 32 | min_sample_size = mat.shape[1] 33 | else: 34 | min_sample_size = 1 / design.max() 35 | else: 36 | min_sample_size = group[group > 0].min() 37 | # 38 | if min_sample_size > large_n: 39 | min_sample_size = large_n + (min_sample_size - large_n) * min_prop 40 | # CPM cutoff 41 | median_lib_size = lib_size.median() 42 | cpm_cutoff = (min_count / median_lib_size) * 1e6 43 | cpm = cpm_normalize(mat) 44 | keep_cpm = (cpm >= cpm_cutoff).sum(axis=1) >= (min_sample_size - tol) 45 | # Total count cutoff 46 | keep_total_count = mat.sum(axis=1) >= min_total_count - tol 47 | # 48 | return mat.loc[keep_cpm & keep_total_count, :] 49 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from functools import singledispatch 5 | 6 | 7 | @singledispatch 8 | def log2_normalize(mat, offset=1.): 9 | ''' Compute log normalization of matrix 10 | Simple `log2(x + offset)`, offset usually set to 1. because log(0) is undefined. 11 | ''' 12 | logging.warn('Unrecognized type: ' + type(mat).__name__) 13 | return log2_normalize_np(mat, offset=offset) 14 | 15 | @log2_normalize.register 16 | def log2_normalize_np(mat: np.ndarray, offset=1.): 17 | return np.log2(mat + offset) 18 | 19 | @log2_normalize.register 20 | def log2_normalize_pd(mat: pd.DataFrame, offset=1.): 21 | return np.log2(mat + offset) 22 | 23 | @log2_normalize.register 24 | def log2_normalize_pds(mat: pd.Series, offset=1.): 25 | return np.log2(mat + offset) 26 | 27 | 28 | @singledispatch 29 | def log10_normalize(mat, offset=1.): 30 | ''' Compute log normalization of matrix 31 | Simple `log10(x + offset)`, offset usually set to 1. because log(0) is undefined. 32 | ''' 33 | logging.warn('Unrecognized type: ' + type(mat).__name__) 34 | return log10_normalize_np(mat, offset=offset) 35 | 36 | @log10_normalize.register 37 | def log10_normalize_np(mat: np.ndarray, offset=1.): 38 | return np.log10(mat + offset) 39 | 40 | @log10_normalize.register 41 | def log10_normalize_pd(mat: pd.DataFrame, offset=1.): 42 | return np.log10(mat + offset) 43 | 44 | @log10_normalize.register 45 | def log10_normalize_pds(mat: pd.Series, offset=1.): 46 | return np.log10(mat + offset) 47 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/quantile.py: -------------------------------------------------------------------------------- 1 | from qnorm import quantile_normalize -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/quantile_legacy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from functools import singledispatch 5 | 6 | 7 | @singledispatch 8 | def quantile_normalize(mat): 9 | ''' Perform quantile normalization on the values of a matrix 10 | In the case of a pd.DataFrame, preserve the index on the output frame. 
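For intuition, a tiny worked example (values invented): given two columns [5, 2, 3] and [4, 1, 6], each column sorts to [2, 3, 5] and [1, 4, 6]; the per-rank means are [1.5, 3.5, 5.5]; substituting each value by the mean at its rank yields [5.5, 1.5, 3.5] and [3.5, 1.5, 5.5].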
11 | See: https://en.wikipedia.org/wiki/Quantile_normalization 12 | ''' 13 | logging.warn('Unrecognized type: ' + type(mat).__name__) 14 | return quantile_normalize_np(mat) 15 | 16 | @quantile_normalize.register 17 | def quantile_normalize_np(mat: np.ndarray): 18 | # sort vector in np (reuse in np) 19 | sorted_vec = np.sort(mat, axis=0) 20 | # rank vector in np (no dict necessary) 21 | rank = sorted_vec.mean(axis=1) 22 | # construct quantile normalized matrix 23 | return np.array([ 24 | [ 25 | rank[i] 26 | for i in np.searchsorted(sorted_vec[:, c], mat[:, c]) 27 | ] for c in range(mat.shape[1]) 28 | ]).T 29 | 30 | @quantile_normalize.register 31 | def quantile_normalize_pd(mat: pd.DataFrame): 32 | return pd.DataFrame( 33 | quantile_normalize_np(mat.values), 34 | index=mat.index, 35 | columns=mat.columns, 36 | ) 37 | 38 | def quantile_normalize_h5(in_mat, out_mat, tmp=None): 39 | import os, tempfile, h5py 40 | ''' 41 | Maximum memory required (3 * in_mat.shape[1] * sizeof(dtype)) 42 | Storage required 4 * in_mat.size 43 | - input matrix 44 | - transposed copy 45 | - sorted & transposed copy 46 | - output matrix 47 | ''' 48 | assert isinstance(in_mat, h5py.Dataset) 49 | assert isinstance(out_mat, h5py.Dataset) 50 | assert in_mat.shape == out_mat.shape 51 | # transpose + sort 52 | tmp_f = tempfile.mktemp() if tmp is None else tmp 53 | tmp_h5 = h5py.File(tmp_f, 'w') 54 | tmp_T_mat = tmp_h5.create_dataset('tmp_T', shape=(in_mat.shape[1], in_mat.shape[0]), dtype=in_mat.dtype) 55 | tmp_T_sorted_mat = tmp_h5.create_dataset('tmp_T_sorted', shape=(in_mat.shape[1], in_mat.shape[0]), dtype=in_mat.dtype) 56 | sorted_col_vec_agg_rank = np.zeros(in_mat.shape[0]) 57 | for col in range(in_mat.shape[1]): 58 | # this single read is potentially expensive but the two writes are cheap 59 | col_vec = in_mat[:, col] 60 | tmp_T_mat[col, :] = col_vec 61 | sorted_col_vec = np.sort(col_vec) 62 | tmp_T_sorted_mat[col, :] = sorted_col_vec 63 | sorted_col_vec_agg_rank += sorted_col_vec 64 | # setup rank matrix 65 | sorted_col_vec_agg_rank /= in_mat.shape[1] 66 | # construct output matrix 67 | for c in range(in_mat.shape[1]): 68 | # this write is potentially expensive but the reads are cheap 69 | out_mat[:, c] = [ 70 | sorted_col_vec_agg_rank[i] 71 | for i in np.searchsorted(tmp_T_sorted_mat[c, :], tmp_T_mat[c, :]) 72 | ] 73 | # close and remove tmp file 74 | tmp_h5.close() 75 | os.remove(tmp_f) 76 | return out_mat 77 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/normalization/zscore.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from functools import singledispatch 5 | from scipy.stats import zscore 6 | 7 | 8 | @singledispatch 9 | def zscore_normalize(mat, ddof=0): 10 | ''' Compute the z score of each value in the sample, relative to the sample mean and standard deviation. 11 | In the case of a pd.DataFrame, preserve the index on the output frame. 
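Concretely, each column j is transformed as z_ij = (x_ij - mu_j) / sigma_j, with the mean and standard deviation taken down the column (axis=0).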
12 | ''' 13 | logging.warn('Unrecognized type: ' + type(mat).__name__) 14 | return zscore_normalize_np(mat, ddof=ddof) 15 | 16 | @zscore_normalize.register 17 | def zscore_normalize_np(mat: np.ndarray, ddof=0): 18 | return zscore(mat, axis=0, ddof=ddof) 19 | 20 | @zscore_normalize.register 21 | def zscore_normalize_pd(mat: pd.DataFrame, ddof=0): 22 | return pd.DataFrame(zscore_normalize_np(mat, ddof=ddof), index=mat.index, columns=mat.columns) 23 | 24 | @zscore_normalize.register 25 | def zscore_normalize_pds(mat: pd.Series, ddof=0): 26 | return pd.DataFrame(zscore_normalize_np(mat, ddof=ddof), index=mat.index) 27 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/parse/__init__.py: -------------------------------------------------------------------------------- 1 | '''This module contains functions for parsing files into ready-to-go formats. 2 | ''' 3 | 4 | from maayanlab_bioinformatics.parse.gmt import gmt_read_iter, gmt_read_dict, gmt_read_pd, gmt_write_dict, gmt_write_pd 5 | from maayanlab_bioinformatics.parse.suerat import suerat_load, suerat_load_multiple 6 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/parse/gmt.py: -------------------------------------------------------------------------------- 1 | import re 2 | import io 3 | import math as m 4 | import pandas as pd 5 | import contextlib 6 | import logging 7 | import pathlib 8 | import typing 9 | from dataclasses import dataclass 10 | from maayanlab_bioinformatics.utils.maybe_tqdm import maybe_tqdm 11 | 12 | def _try_load_number(s): 13 | try: 14 | return int(s) 15 | except ValueError: 16 | pass 17 | try: 18 | return float(s) 19 | except ValueError: 20 | pass 21 | return s 22 | 23 | @contextlib.contextmanager 24 | def _ensure_fp(fp, mode): 25 | if type(fp) == str or isinstance(fp, pathlib.Path): 26 | with open(fp, mode) as fh: 27 | yield fh 28 | else: 29 | yield fp 30 | 31 | def parse_gene_weight(gene): 32 | ''' A helper to parse the gmt potentially with numeric weights 33 | ''' 34 | gene, *_rest = re.split(r'([,:;])', gene.strip(), maxsplit=1) 35 | if _rest: 36 | _delim, _weight = _rest 37 | _weight = _try_load_number(_weight) 38 | if type(_weight) == str: 39 | gene += _delim + _weight # the suffix was not a number: restore the original gene string, delimiter included 40 | weight = 1 41 | else: 42 | weight = _weight 43 | else: 44 | weight = 1 45 | return gene.strip(), weight 46 | 47 | def parse_gene_unweighted(gene): 48 | ''' A helper to parse the gmt unweighted 49 | ''' 50 | return gene.strip(), 1 51 | 52 | def gmt_read_iter(fh, parse_gene=parse_gene_weight): 53 | with _ensure_fp(fh, 'r') as fh: 54 | for n, line in enumerate(fh): 55 | try: 56 | term1, term2, genes_str = line.strip().split('\t', maxsplit=2) 57 | except ValueError: 58 | logging.warn('Ignoring line {}:{} because it seems empty'.format(n, line)) 59 | continue 60 | term = '\t'.join(filter(None, map(str.strip, (term1, term2)))) 61 | geneset = { 62 | k: v 63 | for k, v in map(parse_gene, genes_str.split('\t')) 64 | if k 65 | } 66 | yield term, geneset 67 | 68 | def gmt_read_dict(fh, parse_gene=parse_gene_weight): 69 | ''' Read .gmt files into a dictionary of the form: 70 | { 71 | 'term_1\tterm_2': { 72 | gene_1: weight or 1, 73 | ... 74 | }, 75 | ... 76 | } 77 | 78 | If your genes are encoded in a weird way you can also provide your own `parse_gene` function, 79 | the current one supports just gene names or gene names with weights separated by non-word/numeric characters.
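For example, a hypothetical two-term .gmt (columns are tab-separated):
```
term_1	a description	STAT3,2.0	TP53
term_2	another description	EGFR
```
would be read as `{'term_1\ta description': {'STAT3': 2.0, 'TP53': 1}, 'term_2\tanother description': {'EGFR': 1}}`.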
80 | ''' 81 | gmt = {} 82 | for n, (term, geneset) in enumerate(gmt_read_iter(fh, parse_gene=parse_gene)): 83 | if term in gmt: 84 | logging.warn('Duplicate term: {}:{}, merging'.format(n, term)) 85 | else: 86 | gmt[term] = {} 87 | gmt[term].update(**geneset) 88 | return gmt 89 | 90 | def gmt_read_pd(fh, parse_gene=parse_gene_weight): 91 | ''' Read .gmt files directly into a data frame. 92 | ''' 93 | return pd.DataFrame(gmt_read_dict(fh, parse_gene=parse_gene)) 94 | 95 | 96 | def _serialize_gene_weight_pair(gene, weight): 97 | if weight == 1 or m.isclose(weight, 1.): return gene 98 | elif m.isclose(weight, 0.) or m.isnan(weight): return None 99 | else: return '{},{}'.format(gene, weight) 100 | 101 | def _ensure_weight(gs): 102 | if isinstance(gs, dict): 103 | return gs.items() 104 | else: 105 | return ((g, 1) for g in gs) 106 | 107 | def gmt_write_dict(gmt, fh, serialize_gene_weight_pair=_serialize_gene_weight_pair): 108 | ''' Opposite of gmt_read_dict, write a dictionary to a file pointer 109 | serialize_gene_weight_pair can be used to customize serialization when dealing with weights. 110 | - it should return the serialized gene,weight pair or None if it should be removed 111 | By default, 0/nans are dropped, 1s result in a gene (crisp), and everything else uses gene,weight. 112 | ''' 113 | with _ensure_fp(fh, 'w') as fh: 114 | for term, geneset in gmt.items(): 115 | if '\t' not in term: serialized_term = term + '\t' 116 | else: serialized_term = term 117 | serialized_geneset = '\t'.join(filter(None, ( 118 | serialize_gene_weight_pair(gene, weight) 119 | for gene, weight in _ensure_weight(geneset) 120 | ))) 121 | if not serialized_geneset: 122 | logging.warn('Ignoring term {} because its geneset seems empty'.format(term)) 123 | continue 124 | print(serialized_term, serialized_geneset, sep='\t', file=fh) 125 | 126 | def gmt_write_pd(df, fh, serialize_gene_weight_pair=_serialize_gene_weight_pair): 127 | ''' Write a pandas dataframe as a gmt, where rows are genes and columns are terms. 128 | See gmt_write_dict for more information. 129 | ''' 130 | gmt_write_dict(df.to_dict(), fh, serialize_gene_weight_pair=serialize_gene_weight_pair) 131 | 132 | @dataclass 133 | class GMT: 134 | ''' A data structure for GMTs in memory 135 | ''' 136 | # the unique set of genes across all gene lists 137 | background: list[str] 138 | # first two columns of the GMT 139 | terms: list[tuple[str, str]] 140 | # variable gene lists of the GMT 141 | gene_lists: list[list[str]] 142 | 143 | @staticmethod 144 | def reader(gmtfile: io.TextIOBase | str | pathlib.Path) -> typing.Iterator[tuple[tuple[str, str], list[str]]]: 145 | ''' read the .gmt format, a tab separated file with variable columns 146 | ''' 147 | gene_expr = re.compile(r'^([^:;,]+?)([:;,].+)?$') 148 | with _ensure_fp(gmtfile, 'r') as fr: 149 | for line in fr: 150 | line_split = [cell.strip() for cell in line.strip().split('\t')] 151 | if len(line_split) < 3: continue 152 | term, desc, *genes = line_split 153 | genes = [ 154 | match.group(1) 155 | for gene in genes 156 | if gene 157 | for match in (gene_expr.match(gene),) 158 | if match 159 | ] 160 | yield (term, desc), genes 161 | 162 | @staticmethod 163 | def from_iter(it: typing.Iterator[tuple[tuple[str, str], list[str]]]): 164 | ''' initialize a GMT from Iterator[(term, desc), gene_list] (i.e. 
read_gmt) 165 | ''' 166 | background = set() 167 | terms = [] 168 | gene_lists = [] 169 | for (term, desc), genes in maybe_tqdm(it, desc='Reading gmt...'): 170 | background.update(genes) 171 | terms.append((term, desc)) 172 | gene_lists.append(genes) 173 | return GMT(list(background), terms, gene_lists) 174 | 175 | @staticmethod 176 | def concat(*gmts): 177 | background = set() 178 | terms = [] 179 | gene_lists = [] 180 | for gmt in gmts: 181 | background.update(gmt.background) 182 | terms += gmt.terms 183 | gene_lists += gmt.gene_lists 184 | return GMT(list(background), terms, gene_lists) 185 | 186 | @staticmethod 187 | def from_file(gmtfile: io.TextIOBase | str | pathlib.Path): 188 | ''' initialize a GMT from a file 189 | ''' 190 | return GMT.from_iter(GMT.reader(gmtfile)) 191 | 192 | def to_spmatrix(self): 193 | ''' create a sparse matrix from this GMT 194 | ''' 195 | import scipy.sparse 196 | import numpy as np 197 | spmatrix = scipy.sparse.dok_matrix((len(self.gene_lists), len(self.background)), dtype=np.int8) 198 | gene_index = { gene: index for index, gene in enumerate(self.background) } 199 | for i, gene_list in enumerate(maybe_tqdm(self.gene_lists, desc='Building spmatrix...')): 200 | spmatrix[i, [gene_index[g] for g in gene_list]] = 1 201 | return spmatrix 202 | 203 | def to_df(self): 204 | ''' create a sparse pandas dataframe from this GMT 205 | ''' 206 | import pandas as pd 207 | return pd.DataFrame.sparse.from_spmatrix( 208 | self.to_spmatrix(), 209 | columns=self.background, 210 | index=self.terms, 211 | ) 212 | 213 | def dedupe(self): 214 | ''' de-duplicate gene sets in a GMT 215 | ''' 216 | deduped_terms = [] 217 | deduped_gene_lists = [] 218 | gene_list_hashes = {} 219 | for term, gene_list in maybe_tqdm(zip(self.terms, self.gene_lists), desc='De-duping...'): 220 | gene_list_hash = hash(frozenset(gene_list)) 221 | if gene_list_hash not in gene_list_hashes: 222 | gene_list_hashes[gene_list_hash] = len(deduped_terms) # the index where this term is about to be appended 223 | deduped_terms.append(term) 224 | deduped_gene_lists.append(gene_list) 225 | else: 226 | gene_list_index = gene_list_hashes[gene_list_hash] 227 | deduped_terms[gene_list_index] = (*deduped_terms[gene_list_index], *term) 228 | return GMT(self.background, deduped_terms, deduped_gene_lists) 229 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/parse/suerat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import scipy.sparse as sp_sparse 4 | from maayanlab_bioinformatics.utils import merge 5 | 6 | def suerat_load(base_dir): 7 | ''' Files prepared for Seurat are quite common, this function will load them 8 | given the directory that contains `barcodes.tsv.gz`, `features.tsv.gz`, and `matrix.mtx.gz`.
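A minimal usage sketch (the directory name is a placeholder):
```python
from maayanlab_bioinformatics.parse import suerat_load
df_features, df_barcodes, df_expression = suerat_load('path/to/filtered_feature_bc_matrix/')
```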
9 | ''' 10 | df_barcodes = pd.read_csv( 11 | os.path.join(base_dir, 'barcodes.tsv.gz'), 12 | index_col=0, 13 | header=None, 14 | sep='\t', 15 | ) 16 | df_features = pd.read_csv( 17 | os.path.join(base_dir, 'features.tsv.gz'), 18 | header=None, 19 | names=['symbol', 'type'], 20 | index_col=0, 21 | sep='\t', 22 | ) 23 | matrix = pd.read_csv( 24 | os.path.join(base_dir, 'matrix.mtx.gz'), 25 | header=None, 26 | names=['indices', 'indptr', 'data'], 27 | skiprows=2, 28 | sep=' ', 29 | ) 30 | csc_matrix = sp_sparse.csc_matrix( 31 | ( 32 | matrix['data'].values, 33 | ( 34 | matrix['indices'].values - 1, # 0 based indexing 35 | matrix['indptr'].values - 1, # 0 based indexing 36 | ) 37 | ), 38 | ) 39 | df_expression = pd.DataFrame(csc_matrix.todense()) 40 | df_expression.index = df_features.index 41 | df_expression.columns = df_barcodes.index 42 | return df_features, df_barcodes, df_expression 43 | 44 | def suerat_load_multiple(base_dirs): 45 | ''' Sets of Seurat directories that are meant to be analyzed together are quite common, 46 | providing all those directories to this function (much like `suerat_load`) will load 47 | each individually and return a merged version that captures the filename in the barcodes. 48 | ''' 49 | all_df_features = [] 50 | all_df_barcodes = [] 51 | all_df_expression = [] 52 | # 53 | for ind, base_dir in enumerate(base_dirs): 54 | df_features, df_barcodes, df_expression = suerat_load(base_dir) 55 | df_barcodes['barcode'] = df_barcodes.index 56 | df_barcodes['file'] = f'File {ind}' 57 | df_barcodes.index = df_barcodes.index.map(lambda s, ind=ind: f'{ind}:{s}') 58 | df_expression.columns = df_barcodes.index 59 | all_df_features.append(df_features) 60 | all_df_barcodes.append(df_barcodes) 61 | all_df_expression.append(df_expression) 62 | # 63 | df_features = merge(*all_df_features, how='left', suffixes=('', '_')).drop(['symbol_', 'type_'], axis=1) 64 | df_barcodes = pd.concat(all_df_barcodes) 65 | df_expression = merge(*all_df_expression) 66 | # 67 | return df_features, df_barcodes, df_expression 68 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/plotting/__init__.py: -------------------------------------------------------------------------------- 1 | ''' This module contains various helpers for plotting things 2 | ''' 3 | 4 | from maayanlab_bioinformatics.plotting.bridge import bridge_plot 5 | from maayanlab_bioinformatics.plotting.upset import upset_from_dict_of_sets 6 | from maayanlab_bioinformatics.plotting.clustergrammer import display_clustergrammer 7 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/plotting/bridge.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def bridge_plot(select: pd.Series, weights: pd.Series = None): 5 | ''' Use the filter to construct a bridge plot. 6 | 7 | ```python 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | from maayanlab_bioinformatics.plotting import bridge_plot 11 | 12 | x, y = bridge_plot(select) 13 | plt.plot(x, y) 14 | plt.vlines(np.argwhere(select.values)[:, 0], ymin=-1, ymax=0) 15 | plt.show() 16 | ``` 17 | 18 | :param select: (pd.Series) selection of hits (i.e. `df['gene'] == 'my_target'`) in ranked order 19 | :param weights: (pd.Series) optional weights for each hit in the same order 20 | :return: (Tuple[np.array, np.array]) x and y arrays ready to be plotted.
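The resulting curve steps up at each hit and drifts down between hits; its maximum is analogous to an enrichment score (a reading of the plot, not an additional return value).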
21 | ''' 22 | if weights is None: 23 | weights = pd.Series(np.ones(select.shape[0]), index=select.index) 24 | max_es = weights[select].abs().sum() # maximum enrichment score if we were to hit everything (positively) 25 | up = select * weights / max_es # go up/dn by normalized weight on each hit 26 | dn = - (1 - select) * up.sum() / (~select).sum() 27 | x = np.arange(select.shape[0]+1) 28 | y = np.concatenate([ 29 | np.zeros(1), 30 | np.cumsum(up + dn), 31 | ]) 32 | return x, y 33 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/plotting/clustergrammer.py: -------------------------------------------------------------------------------- 1 | def display_clustergrammer(net): 2 | ''' This function displays clustergrammer in a jupyter notebook without dependencies 3 | on ipywidgets or any locally installed jupyter extensions. This is convenient for 4 | static exports, colab, and appyters. 5 | 6 | Example: 7 | ```python 8 | from maayanlab_bioinformatics.plotting import display_clustergrammer 9 | from clustergrammer import Network 10 | net = Network() 11 | net.load_df(df) 12 | net.cluster() 13 | display_clustergrammer(net) 14 | ``` 15 | ''' 16 | from IPython.display import HTML 17 | import uuid, json 18 | id = '_' + str(uuid.uuid4()) 19 | return HTML(f""" 20 |
21 | 22 | 43 | """) 44 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/plotting/upset.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pandas as pd 3 | from typing import Dict, Set, Hashable 4 | 5 | def upset_from_dict_of_sets(inputs: Dict[Hashable, Set[Hashable]]): 6 | ''' Given a dictionary of sets, produce input ready for `upsetplot` python package 7 | 8 | We produce this input by computing set intersections of all relevant combinations 9 | of sets interacting with one another. 10 | 11 | Example: 12 | ```python 13 | import upsetplot 14 | from maayanlab_bioinformatics.plotting import upset_from_dict_of_sets 15 | upsetplot.plot(upset_from_dict_of_sets({ 16 | 'A': {'a', 'b', 'c'}, 17 | 'B': {'b', 'c', 'd'}, 18 | 'C': {'d', 'e', 'f'}, 19 | })) 20 | ``` 21 | :param inputs: (Dict[Hashable, Set[Hashable]]) Several named sets 22 | :return: (pd.DataFrame) in a form ready for `upsetplot.plot` 23 | ''' 24 | sets = [] 25 | for n in range(1, len(inputs)+1): 26 | if n == 1: 27 | it = [[k] for k in inputs.keys()] 28 | else: 29 | it = map(list, itertools.combinations(inputs.keys(), n)) 30 | for V in it: 31 | size = len(inputs[V[0]] if n == 1 else set.intersection(*[inputs[v] for v in V])) 32 | if size > 0: 33 | sets.append(dict({vv: vv in V for vv in inputs.keys()}, size=size)) 34 | return pd.DataFrame(sets).groupby(list(inputs.keys()))['size'].sum() 35 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/setup/R.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Install R dependencies with 3 | ``` 4 | python -m maayanlab_bioinformatics.setup.R 5 | ``` 6 | ''' 7 | if __name__ == '__main__': 8 | import rpy2.robjects as ro 9 | ro.r(''' 10 | install.packages("R.utils", repos="https://cloud.r-project.org/") 11 | install.packages("RCurl", repos="https://cloud.r-project.org/") 12 | 13 | if (!requireNamespace("BiocManager", quietly = TRUE)) 14 | install.packages("BiocManager", repos="https://cloud.r-project.org/") 15 | 16 | BiocManager::install("DESeq2") 17 | BiocManager::install("limma") 18 | BiocManager::install("statmod") 19 | BiocManager::install("edgeR") 20 | ''') 21 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/setup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaayanLab/maayanlab-bioinformatics/5b38cf2ce8f67928777852b69f2e2659c6eb9043/maayanlab_bioinformatics/setup/__init__.py -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | '''This module contains general utility functions for convenient analysis 2 | ''' 3 | 4 | from maayanlab_bioinformatics.utils.describe import np_describe 5 | from maayanlab_bioinformatics.utils.chunked import chunk_slices, chunk_applymap 6 | from maayanlab_bioinformatics.utils.fetch_save_read import fetch_save_read 7 | from maayanlab_bioinformatics.utils.merge import merge 8 | from maayanlab_bioinformatics.utils.sparse import sp_hdf_dump, sp_hdf_load 9 | from maayanlab_bioinformatics.utils.maybe_tqdm import maybe_tqdm 10 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/chunked.py: 
-------------------------------------------------------------------------------- 1 | ''' Chunked module has useful helper functions for manipulating ndarrays in chunks, this is 2 | especially useful when working with h5py matrices since operations which respect chunk boundaries 3 | avoid excessive disk random access. 4 | ''' 5 | import numpy as np 6 | import itertools as it 7 | import logging 8 | from maayanlab_bioinformatics.utils.maybe_tqdm import maybe_tqdm 9 | logger = logging.getLogger(__name__) 10 | 11 | def chunk_slices(shape, chunks, progress=False): 12 | ''' 13 | Return slices to chunk through an ndarray. 14 | 15 | :param shape: The shape of the ndarray or size in 1d. 16 | :param chunks: The shape of the chunks or size in all dimensions. 17 | :param progress: Show tqdm progress bar or not 18 | 19 | :returns: Iterator[slice(start, stop) for each dimension in shape] 20 | 21 | Usage: 22 | N = np.arange(10) 23 | [N[s] for s in chunk_slices(len(N), 3)] 24 | 25 | I = np.eye(10) 26 | [I[i, j] for i, j in chunk_slices(I.shape, 3)] 27 | ''' 28 | tqdm = maybe_tqdm if progress else lambda it, **kwargs: it 29 | # ensure shape is a tuple (we'll flatten it when we're done if not) 30 | if type(shape) == int: 31 | shape = (shape,) 32 | flatten = True 33 | else: 34 | flatten = False 35 | # spread chunks to length of shape 36 | if type(chunks) == int: 37 | chunks = (chunks,)*len(shape) 38 | # ensure shape and chunks are the same dimensions 39 | assert len(shape) == len(chunks) 40 | for indices in tqdm( 41 | # we take the iterable cartesian product, equivalent to a nested for loop on each dimension 42 | it.product(*[ 43 | # for each dimension, shape / chunks + 1 when chunks are not divisible by shape 44 | range((s//cs)+int(s%cs!=0)) 45 | for s, cs in zip(shape, chunks) 46 | ]), 47 | # the total (for progress bar) is just the actual product of steps in each dimension 48 | total=np.product([(s//cs)+int(s%cs!=0) for s, cs in zip(shape, chunks)]), 49 | ): 50 | # indices have the chunk start index, we can then get the slice (start, end) for each dimension 51 | slices = tuple([slice(i*cs, min(s, (i+1)*cs)) for i, s, cs in zip(indices, shape, chunks)]) 52 | # if not multiple dimensions, we'll flatten the slice 53 | if flatten: 54 | slices, = slices 55 | yield slices 56 | 57 | def chunk_infer(x, chunks=None): 58 | ''' Helper function for interpreting the chunks param with respect to a matrix x 59 | 60 | :param x: The matrix (ndarray) 61 | :param chunks: The chunks parameter, 62 | if None (default): Try to infer from chunks attribute (h5py) 63 | if int: Use a multiple of the inferred chunks attribute, or alternatively that size in each dimension 64 | if tuple: Use the explicit chunks provided for slicing 65 | 66 | :returns: tuple chunks parameter 67 | ''' 68 | if type(chunks) in (tuple, list): 69 | assert len(chunks) == len(x.shape), f"Matrix has {len(x.shape)} dimensions but chunks has {len(chunks)}" 70 | return chunks 71 | inferred_chunks = getattr(x, 'chunks', None) 72 | if inferred_chunks is not None: 73 | if chunks is None: 74 | chunks = inferred_chunks 75 | elif type(chunks) == int: 76 | chunks = np.array(inferred_chunks)*chunks 77 | else: 78 | raise NotImplementedError('chunks should be int or tuple') 79 | elif type(chunks) == int: 80 | chunks = (chunks,)*len(x.shape) 81 | else: 82 | raise NotImplementedError('chunks should be int or tuple') 83 | return tuple(chunks) 84 | 85 | def chunk_applymap(func, x, *, out=None, chunks=None, progress=False): 86 | ''' Apply function to all elements in a matrix in 
chunks 87 | 88 | :param func: The function to apply to each chunk 89 | :param x: The matrix to apply it to 90 | :param out: The matrix to write to (pass variable to out for inplace) 91 | :param chunks: The shape of the chunks in each dimension, 92 | can be inferred for h5py arrays based on actual chunks on disk, 93 | can be a multiple of an integer value of chunks. 94 | :param progress: Show tqdm progress bar or not 95 | 96 | :returns: The augmented matrix (or the original matrix, augmented) 97 | ''' 98 | if out is None: out = np.zeros(x.shape) 99 | if x.shape != out.shape: logger.warning('x and out shape do not match, is this what you wanted?') 100 | if x.dtype != out.dtype: logger.warning('x and out dtype do not match, is this what you wanted?') 101 | for s in chunk_slices(x.shape, chunks=chunk_infer(x, chunks), progress=progress): 102 | out[s] = func(x[s]) 103 | return out 104 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/describe.py: -------------------------------------------------------------------------------- 1 | ''' Descriptive statistics on things that aren't pandas data frames. 2 | This can often be a lot more efficient. 3 | ''' 4 | import numpy as np 5 | import typing as t 6 | 7 | def np_describe(x, axis=0, *, percentiles=[25, 50, 75]) -> t.Dict[str, np.array]: 8 | ''' Like pandas Series.describe() but operating on numpy arrays / matrices. 9 | This can be a lot faster especially when working with h5py or sparse data frames. 10 | 11 | :params x: The numpy array to describe 12 | :params axis: The axis for which to perform describe against 13 | :returns: A dictionary mapping metric name to results 14 | ''' 15 | results = { 16 | 'count': (~np.isnan(x)).sum(axis=axis), 17 | 'mean': x.mean(axis=axis), 18 | 'std': x.std(axis=axis), 19 | 'min': x.min(axis=axis), 20 | 'max': x.max(axis=axis), 21 | } 22 | if percentiles: 23 | percentile = np.percentile(x, percentiles, axis=axis) 24 | results.update({ 25 | f"{p}%": percentile[i] 26 | for i, p in enumerate(percentiles) 27 | }) 28 | return results 29 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/fetch_save_read.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def fetch_save_read(url, file, reader=pd.read_csv, sep=',', **kwargs): 5 | ''' Download file from {url}, save it to {file}, and subsequently read it with {reader} using pandas options on {**kwargs}. 
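A hedged sketch (the URL and file name are placeholders):
```python
from maayanlab_bioinformatics.utils import fetch_save_read
df = fetch_save_read('https://example.com/table.tsv', 'table.tsv', sep='\t', index_col=0)
```
On the first call the file is downloaded and cached locally; subsequent calls read the local copy.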
6 | ''' 7 | if not os.path.exists(file): 8 | if os.path.dirname(file): 9 | os.makedirs(os.path.dirname(file), exist_ok=True) 10 | df = reader(url, sep=sep, index_col=None) 11 | df.to_csv(file, sep=sep, index=False) 12 | return pd.read_csv(file, sep=sep, **kwargs) 13 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/maybe_tqdm.py: -------------------------------------------------------------------------------- 1 | def maybe_tqdm(iterable, **kwargs): 2 | ''' Optional tqdm (omitted if tqdm is not installed) 3 | ''' 4 | try: 5 | from tqdm.auto import tqdm 6 | return tqdm(iterable, **kwargs) 7 | except ImportError: 8 | return iterable 9 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def merge(*dfs, **kwargs): 4 | ''' Helper function for many trivial (index based) joins 5 | Deprecated: Use `pd.concat([dfs], axis=1)` instead 6 | ''' 7 | if not dfs: 8 | return pd.DataFrame() 9 | # 10 | left, *rights = dfs 11 | # 12 | merged = left 13 | for right in rights: 14 | merged = pd.merge(left=merged, left_index=True, right=right, right_index=True, **kwargs) 15 | # 16 | return merged 17 | -------------------------------------------------------------------------------- /maayanlab_bioinformatics/utils/sparse.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp_sparse 3 | 4 | def sp_hdf_dump(hdf, sdf, **kwargs): 5 | ''' Dump Sparse Pandas DataFrame to h5py object. 6 | 7 | Usage: 8 | ```python 9 | import h5py 10 | import pandas as pd 11 | import scipy.sparse as sp_sparse 12 | 13 | # write 14 | f = h5py.File('sparse.h5', 'w') 15 | sdf = pd.DataFrame.sparse.from_spmatrix(sp_sparse.eye(3)) 16 | sp_hdf_dump(f, sdf) 17 | f.close() 18 | ``` 19 | ''' 20 | s = sdf.sparse.to_coo() 21 | hdf.create_dataset('data', data=s.data, **kwargs) 22 | hdf.create_dataset('row', data=s.row, **kwargs) 23 | hdf.create_dataset('col', data=s.col, **kwargs) 24 | hdf.create_dataset('index', data=sdf.index.values, **kwargs) 25 | hdf.create_dataset('columns', data=sdf.columns.values, **kwargs) 26 | hdf.attrs['shape'] = s.shape 27 | return hdf 28 | 29 | def sp_hdf_load(hdf): 30 | ''' Load Sparse Pandas DataFrame from h5py object. 31 | 32 | Usage: 33 | ```python 34 | import h5py 35 | import pandas as pd 36 | import scipy.sparse as sp_sparse 37 | 38 | f = h5py.File('sparse.h5', 'r') 39 | sdf = sp_hdf_load(f) 40 | f.close() 41 | ``` 42 | ''' 43 | import pandas as pd 44 | return pd.DataFrame.sparse.from_spmatrix( 45 | sp_sparse.coo_array((hdf['data'], (hdf['row'], hdf['col'])), shape=hdf.attrs['shape']), 46 | index=pd.Series(hdf['index']).str.decode('utf8'), 47 | columns=pd.Series(hdf['columns']).str.decode('utf8'), 48 | ) 49 | 50 | def sp_std(X_ij, ddof=1): 51 | ''' Standard deviation for a matrix compatible with sparse matrices. 52 | i is the row index, j is the column index. 
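As a dense-array sanity check (an assumption for illustration, using ddof=1): `sp_std(np.eye(3))` should agree elementwise with `np.eye(3).std(axis=0, ddof=1)`.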
53 | 54 | \sigma_j = \sqrt{\frac{\sum_i (x_{ij} - \mu_j)^2}{N - \mathrm{ddof}}}, \quad \mu_j = \frac{1}{N} \sum_i x_{ij} 55 | ''' 56 | N_j = X_ij.shape[0]  # N: the number of observations per column, i.e. the number of rows 57 | mu_j = X_ij.sum(axis=0) / N_j 58 | num_j = ((X_ij - mu_j)**2).sum(axis=0) 59 | denom_j = N_j - ddof 60 | if sp_sparse.isspmatrix(X_ij): 61 | return (num_j / denom_j).A.squeeze()**(1/2) 62 | else: 63 | return (num_j / denom_j)**(1/2) 64 | 65 | def sp_nanpercentile(sp, q, axis=None, method='linear'): 66 | ''' nanpercentile for a sparse matrix: the percentile is computed with np.percentile over the explicitly stored values only, so implicit entries are ignored just as np.nanpercentile ignores NaNs. 67 | ''' 68 | coo = sp_sparse.coo_array(sp) 69 | if axis is None: 70 | return np.percentile(coo.data, q, method=method) 71 | elif axis == 0: 72 | return np.array([ 73 | np.percentile(coo.data[coo.col == c], q, method=method) 74 | for c in range(coo.shape[1]) 75 | ]) 76 | elif axis == 1: 77 | return np.array([ 78 | np.percentile(coo.data[coo.row == r], q, method=method) 79 | for r in range(coo.shape[0]) 80 | ]) 81 | else: 82 | raise NotImplementedError 83 |
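Since only explicitly stored entries enter the computation, ``sp_nanpercentile`` treats the implicit entries of a sparse matrix as missing values rather than zeros; a small sketch of the intended equivalence (values are arbitrary):
```python
import numpy as np
import scipy.sparse as sp_sparse
from maayanlab_bioinformatics.utils.sparse import sp_nanpercentile

# only the four stored values (1, 3, 2, 4) contribute to the median
sp = sp_sparse.coo_array(np.array([[1., 0., 3.], [0., 2., 4.]]))
dense = np.array([[1., np.nan, 3.], [np.nan, 2., 4.]])
assert sp_nanpercentile(sp, 50) == np.nanpercentile(dense, 50)  # both 2.5
```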
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "maayanlab-bioinformatics" 3 | version = "0.9.0" 4 | description = "A collection of useful functions for bioinformatics data analysis." 5 | authors = [ 6 | {name = "Daniel J. B. Clarke",email = "danieljbclarkemssm@gmail.com"} 7 | ] 8 | license = {text = "Apache-2.0"} 9 | readme = "README.md" 10 | requires-python = ">=3.9" 11 | dependencies = [ 12 | "numpy (<2)", 13 | "pandas (<2)", 14 | "qnorm", 15 | "requests", 16 | "scikit-learn", 17 | "scipy" 18 | ] 19 | 20 | [project.optional-dependencies] 21 | h5py = ["h5py"] 22 | limma_voom = ["rpy2"] 23 | progress = ["tqdm"] 24 | enrichr_user_list = ["bs4", "lxml"] 25 | deseq2 = ["pydeseq2"] 26 | all = ["h5py", "rpy2", "tqdm", "bs4", "lxml", "pydeseq2"] 27 | docs = ["recommonmark","sphinx","m2r2"] 28 | 29 | [build-system] 30 | requires = ["poetry-core>=2.0.0,<3.0.0"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaayanLab/maayanlab-bioinformatics/5b38cf2ce8f67928777852b69f2e2659c6eb9043/tests/__init__.py -------------------------------------------------------------------------------- /tests/dge/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaayanLab/maayanlab-bioinformatics/5b38cf2ce8f67928777852b69f2e2659c6eb9043/tests/dge/__init__.py -------------------------------------------------------------------------------- /tests/dge/test_deseq2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pathlib 3 | import pandas as pd 4 | from maayanlab_bioinformatics.dge.deseq2 import deseq2_differential_expression 5 | 6 | def test_deseq2(): 7 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 8 | df_results = deseq2_differential_expression( 9 | df.iloc[:, :3], 10 | df.iloc[:, 3:], 11 | stdout=sys.stdout, 12 | ) 13 | print(df_results) 14 | assert (df_results['pvalue'] < 0.05).any() 15 | -------------------------------------------------------------------------------- /tests/dge/test_limma.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import numpy as np 3 | import pandas as pd 4 | from maayanlab_bioinformatics.dge import limma_voom_differential_expression, limma_voom_differential_expression_design 5 | 6 | def test_limma(): 7 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 8 | df_expected = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix_dge_results.txt', sep='\t', index_col=0) 9 | df_results = limma_voom_differential_expression( 10 | df.iloc[:, :3], 11 | df.iloc[:, 3:], 12 | filter_genes=True, 13 | ) 14 | print(df_expected) 15 | print(df_results) 16 | isclose = pd.DataFrame(np.isclose(df_expected, df_results), index=df_results.index, columns=df_results.columns) 17 | print(isclose.value_counts()) 18 | assert isclose.all().all() 19 | 20 | def test_limma_design(): 21 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 22 | df_meta = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_metadata.txt', sep='\t', index_col=0) 23 | df_expected = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix_dge_results.txt', sep='\t', index_col=0) 24 | design = pd.get_dummies(df_meta['Stage']).astype(int) 25 | print(design) 26 | df_results = limma_voom_differential_expression_design( 27 | df, 28 | design, 29 | ('primary melanocytes', 'metastatic'), 30 | filter_genes=True, 31 | ) 32 | print(df_expected) 33 | print(df_results) 34 | isclose = pd.DataFrame(np.isclose(df_expected, df_results), index=df_results.index, columns=df_results.columns) 35 | print(isclose.value_counts()) 36 | assert isclose.all().all() 37 | 38 | def test_limma_shuffled(): 39 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 40 | df_expected = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix_dge_results.txt', sep='\t', index_col=0) 41 | index = df.index.values.copy() 42 | np.random.shuffle(index) 43 | controls, cases = df.iloc[:, :3], df.iloc[:, 3:] 44 | controls_columns = controls.columns.values.copy() 45 | np.random.shuffle(controls_columns) 46 | cases_columns = cases.columns.values.copy() 47 | np.random.shuffle(cases_columns) 48 | df_results = limma_voom_differential_expression( 49 | controls.loc[index, controls_columns], cases.loc[index, cases_columns], 50 | filter_genes=True, 51 | ) 52 | print(df_expected) 53 | print(df_results) 54 | isclose = pd.DataFrame(np.isclose(df_expected, df_results), index=df_results.index, columns=df_results.columns) 55 | print(isclose.value_counts()) 56 | assert isclose.all().all() 57 | 58 | def test_limma_determinism(): 59 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 60 | index = df.index.values.copy() 61 | np.random.shuffle(index) 62 | controls, cases = df.iloc[:, :3], df.iloc[:, 3:] 63 | controls_columns = controls.columns.values.copy() 64 | np.random.shuffle(controls_columns) 65 | cases_columns = cases.columns.values.copy() 66 | np.random.shuffle(cases_columns) 67 | df_expected = limma_voom_differential_expression( 68 | controls, cases, 69 | voom_design=True, 70 | ) 71 | df_results = limma_voom_differential_expression( 72 | controls.loc[index, controls_columns], cases.loc[index, cases_columns], 73 | voom_design=True, 74 | ) 75 | print(df_expected) 76 | print(df_results) 77 | isclose = pd.DataFrame(np.isclose(df_expected, df_results), index=df_results.index, columns=df_results.columns) 78 | print(isclose.value_counts()) 79 | assert
isclose.all().all() 80 | -------------------------------------------------------------------------------- /tests/dge/test_logfc.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pandas as pd 3 | import numpy as np 4 | from maayanlab_bioinformatics.dge.logfc import logfc_differential_expression 5 | 6 | def test_logfc(): 7 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 8 | df_expected = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix_dge_results.txt', sep='\t', index_col=0) 9 | df_results = logfc_differential_expression( 10 | df.iloc[:, :3], 11 | df.iloc[:, 3:], 12 | ) 13 | df_cmp = pd.concat({'expected': df_expected['logFC'], 'computed': df_results.loc[df_expected.index, 'LogFC']}, axis=1) 14 | df_cmp_close = np.isclose(df_cmp['expected'], df_cmp['computed'], atol=1.) 15 | print(df_cmp) 16 | print(df_cmp[~df_cmp_close]) 17 | # most logfc values are pretty close to those reported by limma 18 | # they won't be the same since limma applies a normalization but they should be similar orders of magnitude 19 | # so we ensure 95% are relatively close 20 | assert df_cmp_close.sum() > (0.95*df_expected.shape[0]) 21 | -------------------------------------------------------------------------------- /tests/dge/test_ttest.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pandas as pd 3 | import numpy as np 4 | from maayanlab_bioinformatics.dge.ttest import ttest_differential_expression 5 | 6 | def test_ttest(): 7 | df = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix.txt', sep='\t', index_col=0) 8 | df_expected = pd.read_csv(pathlib.Path(__file__).parent.parent/'test_example_matrix_dge_results.txt', sep='\t', index_col=0) 9 | df_results = ttest_differential_expression( 10 | df.iloc[:, :3], 11 | df.iloc[:, 3:], 12 | ) 13 | df_cmp = pd.concat({'expected': df_expected['t'], 'computed': df_results.loc[df_expected.index, 'Statistic']}, axis=1) 14 | df_cmp_close = np.isclose(df_cmp['expected'], df_cmp['computed'], atol=10.) 
15 | print(df_cmp) 16 | print(df_cmp[~df_cmp_close]) 17 | # most t values are pretty close to those reported by limma 18 | # they won't be the same since limma applies a normalization but they should be similar orders of magnitude 19 | # so we ensure 95% are relatively close 20 | assert df_cmp_close.sum() > (0.95*df_expected.shape[0]) 21 | -------------------------------------------------------------------------------- /tests/enrichment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaayanLab/maayanlab-bioinformatics/5b38cf2ce8f67928777852b69f2e2659c6eb9043/tests/enrichment/__init__.py -------------------------------------------------------------------------------- /tests/enrichment/test_crisp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from maayanlab_bioinformatics.enrichment import enrich_crisp 4 | from maayanlab_bioinformatics.parse import gmt_read_iter 5 | 6 | def test_enrich_crisp(): 7 | # TODO: compare pvalues & oddsratio 8 | geneset = [ 9 | line.strip().upper() 10 | for line in open(os.path.join(os.path.dirname(__file__), '..', 'test_geneset.txt'), 'r') 11 | ] 12 | library_iter = gmt_read_iter(os.path.join(os.path.dirname(__file__), '..', 'test_gmt.gmt')) 13 | expectation = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'test_enrichr_results.txt'), sep='\t').sort_values('P-value') 14 | expectation_terms = expectation['Term'].tolist() 15 | results = sorted(enrich_crisp(geneset, library_iter, 20000, True), key=lambda r: r[1].pvalue) 16 | result_terms = [term for term, _ in results] 17 | assert expectation_terms == result_terms, f"{expectation_terms} != {result_terms}" 18 | -------------------------------------------------------------------------------- /tests/normalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaayanLab/maayanlab-bioinformatics/5b38cf2ce8f67928777852b69f2e2659c6eb9043/tests/normalization/__init__.py -------------------------------------------------------------------------------- /tests/normalization/test_cpm.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from maayanlab_bioinformatics.normalization.cpm import cpm_normalize 4 | 5 | def test_cpm_normalization(): 6 | given = np.array([ 7 | [5, 4, 3], 8 | [2, 1, 4], 9 | [3, 4, 6], 10 | [4, 2, 8], 11 | ]) 12 | from rpy2.robjects import numpy2ri 13 | from rpy2.robjects.packages import importr 14 | edgeR = importr('edgeR') 15 | expectation = numpy2ri.rpy2py(edgeR.cpm(numpy2ri.py2rpy(given))) 16 | assert np.allclose(cpm_normalize(given), expectation, atol=1e-2) 17 | -------------------------------------------------------------------------------- /tests/normalization/test_quantile.py: -------------------------------------------------------------------------------- 1 | # Test based on information here: https://en.wikipedia.org/wiki/Quantile_normalization 2 | 3 | import numpy as np 4 | from maayanlab_bioinformatics.normalization.quantile import quantile_normalize 5 | 6 | 7 | def test_quantile_normalization(): 8 | given = np.array([ 9 | [5, 4, 3], 10 | [2, 1, 4], 11 | [3, 4, 6], 12 | [4, 2, 8], 13 | ]) 14 | expectation = np.array([ 15 | [5.67, 5.17, 2.00], 16 | [2.00, 2.00, 3.00], 17 | [3.00, 5.17, 4.67], 18 | [4.67, 3.00, 5.67], 19 | ]) 20 | assert np.allclose(quantile_normalize(given), expectation, 
atol=1e-2) 21 | -------------------------------------------------------------------------------- /tests/normalization/test_quantile_legacy.py: -------------------------------------------------------------------------------- 1 | # Test based on information here: https://en.wikipedia.org/wiki/Quantile_normalization 2 | 3 | import numpy as np 4 | from maayanlab_bioinformatics.normalization.quantile_legacy import quantile_normalize, quantile_normalize_h5 5 | 6 | 7 | def test_quantile_normalization(): 8 | given = np.array([ 9 | [5, 4, 3], 10 | [2, 1, 4], 11 | [3, 4, 6], 12 | [4, 2, 8], 13 | ]) 14 | expectation = np.array([ 15 | [5.67, 4.67, 2.00], 16 | [2.00, 2.00, 3.00], 17 | [3.00, 4.67, 4.67], 18 | [4.67, 3.00, 5.67], 19 | ]) 20 | assert np.allclose(quantile_normalize(given), expectation, atol=1e-2) 21 | 22 | def test_quantile_normalization_h5(): 23 | import os 24 | import h5py 25 | import tempfile 26 | fname = tempfile.mktemp('test_quantile.h5') 27 | f = h5py.File(fname, 'w') 28 | given = f.create_dataset('given', data=np.array([ 29 | [5, 4, 3], 30 | [2, 1, 4], 31 | [3, 4, 6], 32 | [4, 2, 8], 33 | ])) 34 | norm = f.create_dataset('norm', shape=given.shape, dtype='float64') 35 | quantile_normalize_h5(given, norm) 36 | expectation = np.array([ 37 | [5.67, 4.67, 2.00], 38 | [2.00, 2.00, 3.00], 39 | [3.00, 4.67, 4.67], 40 | [4.67, 3.00, 5.67], 41 | ]) 42 | assert np.allclose(norm[:], expectation, atol=1e-2) 43 | os.unlink(fname) 44 | -------------------------------------------------------------------------------- /tests/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaayanLab/maayanlab-bioinformatics/5b38cf2ce8f67928777852b69f2e2659c6eb9043/tests/parse/__init__.py -------------------------------------------------------------------------------- /tests/parse/test_gmt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def test_gmt_read_dict(): 4 | from maayanlab_bioinformatics.parse.gmt import gmt_read_dict 5 | gmt = gmt_read_dict(open(os.path.join(os.path.dirname(__file__), '..', 'test_gmt.gmt'), 'r')) 6 | assert 'ChIP-seq\ttest desc' in gmt 7 | assert gmt['ChIP-seq\ttest desc']['LRRC37A3'] == 2.0 8 | assert 'LRRC37A3' in gmt['ChIP-seq\ttest desc'] 9 | assert 'TLE3' in gmt['ChIP-seq\ttest desc'] 10 | assert gmt['ChIP-seq\ttest desc']['TLE3'] == 5.0 11 | assert 'Data aggregation' in gmt 12 | assert 'CD24L4' in gmt['Data aggregation'] 13 | assert gmt['Data aggregation']['CD24L4'] == 1 14 | 15 | def test_gmt_read_pd(): 16 | from maayanlab_bioinformatics.parse.gmt import gmt_read_pd 17 | gmt = gmt_read_pd(os.path.join(os.path.dirname(__file__), '..', 'test_gmt.gmt')) 18 | assert 'ChIP-seq\ttest desc' in gmt.columns 19 | assert 'LRRC37A3' in gmt.index 20 | assert 'TLE3' in gmt.index 21 | assert gmt.loc['LRRC37A3', 'ChIP-seq\ttest desc'] == 2.0 22 | assert gmt.loc['TLE3', 'ChIP-seq\ttest desc'] == 5.0 23 | assert 'Data aggregation' in gmt.columns 24 | assert 'CD24L4' in gmt.index 25 | assert gmt.loc['CD24L4', 'Data aggregation'] == 1 26 | -------------------------------------------------------------------------------- /tests/test_enrichr_results.txt: -------------------------------------------------------------------------------- 1 | Term Overlap P-value Adjusted P-value Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes 2 | Relative Cell Proliferation 3/105 0.31473973611433437 1.0 0 0 1.5238095238095237 1.7615378533090662 MED14;TCN2;ATP6V1B2 3 | 
L1000 Expression Profiling 2/100 0.5621081938009421 1.0 0 0 1.0666666666666667 0.6144649940876461 EI24;ADH5 4 | Literature 2/100 0.5621081938009421 1.0 0 0 1.0666666666666667 0.6144649940876461 PLSCR2;CHPT1 5 | Microarray 2/100 0.5621081938009421 1.0 0 0 1.0666666666666667 0.6144649940876461 BRI3;GBE1 6 | "ChIP-seq test desc" 1/100 0.8500626642593614 1.0 0 0 0.5333333333333333 0.08663744509711163 PRPF18 7 | Mass Spectrometry 1/100 0.8500626642593614 1.0 0 0 0.5333333333333333 0.08663744509711163 RBM39 8 | Sequence Analysis 1/100 0.8500626642593614 1.0 0 0 0.5333333333333333 0.08663744509711163 KLF12 9 | -------------------------------------------------------------------------------- /tests/test_example_metadata.txt: -------------------------------------------------------------------------------- 1 | Sample Stage cell type melanocyte_1 primary melanocytes normal melanocytes melanocyte_2 primary melanocytes normal melanocytes melanocyte_3 primary melanocytes normal melanocytes melanoma_1 metastatic melanoma cell line melanoma_2 metastatic melanoma cell line melanoma_3 metastatic melanoma cell line -------------------------------------------------------------------------------- /tests/test_geneset.txt: -------------------------------------------------------------------------------- 1 | Nsun3 2 | Polrmt 3 | Nlrx1 4 | Sfxn5 5 | Zc3h12c 6 | Slc25a39 7 | Arsg 8 | Defb29 9 | Ndufb6 10 | Zfand1 11 | Tmem77 12 | 5730403B10Rik 13 | RP23-195K8.6 14 | Tlcd1 15 | Psmc6 16 | Slc30a6 17 | LOC100047292 18 | Lrrc40 19 | Orc5l 20 | Mpp7 21 | Unc119b 22 | Prkaca 23 | Tcn2 24 | Psmc3ip 25 | Pcmtd2 26 | Acaa1a 27 | Lrrc1 28 | 2810432D09Rik 29 | Sephs2 30 | Sac3d1 31 | Tmlhe 32 | LOC623451 33 | Tsr2 34 | Plekha7 35 | Gys2 36 | Arhgef12 37 | Hibch 38 | Lyrm2 39 | Zbtb44 40 | Entpd5 41 | Rab11fip2 42 | Lipt1 43 | Intu 44 | Anxa13 45 | Klf12 46 | Sat2 47 | Gal3st2 48 | Vamp8 49 | Fkbpl 50 | Aqp11 51 | Trap1 52 | Pmpcb 53 | Tm7sf3 54 | Rbm39 55 | Bri3 56 | Kdr 57 | Zfp748 58 | Nap1l1 59 | Dhrs1 60 | Lrrc56 61 | Wdr20a 62 | Stxbp2 63 | Klf1 64 | Ufc1 65 | Ccdc16 66 | 9230114K14Rik 67 | Rwdd3 68 | 2610528K11Rik 69 | Aco1 70 | Cables1 71 | LOC100047214 72 | Yars2 73 | Lypla1 74 | Kalrn 75 | Gyk 76 | Zfp787 77 | Zfp655 78 | Rabepk 79 | Zfp650 80 | 4732466D17Rik 81 | Exosc4 82 | Wdr42a 83 | Gphn 84 | 2610528J11Rik 85 | 1110003E01Rik 86 | Mdh1 87 | 1200014M14Rik 88 | AW209491 89 | Mut 90 | 1700123L14Rik 91 | 2610036D13Rik 92 | Cox15 93 | Tmem30a 94 | Nsmce4a 95 | Tm2d2 96 | Rhbdd3 97 | Atxn2 98 | Nfs1 99 | 3110001I20Rik 100 | BC038156 101 | LOC100047782 102 | 2410012H22Rik 103 | Rilp 104 | A230062G08Rik 105 | Pttg1ip 106 | Rab1 107 | Afap1l1 108 | Lyrm5 109 | 2310026E23Rik 110 | C330002I19Rik 111 | Zfyve20 112 | Poli 113 | Tomm70a 114 | Slc7a6os 115 | Mat2b 116 | 4932438A13Rik 117 | Lrrc8a 118 | Smo 119 | Nupl2 120 | Trpc2 121 | Arsk 122 | D630023B12Rik 123 | Mtfr1 124 | 5730414N17Rik 125 | Scp2 126 | Zrsr1 127 | Nol7 128 | C330018D20Rik 129 | Ift122 130 | LOC100046168 131 | D730039F16Rik 132 | Scyl1 133 | 1700023B02Rik 134 | 1700034H14Rik 135 | Fbxo8 136 | Paip1 137 | Tmem186 138 | Atpaf1 139 | LOC100046254 140 | LOC100047604 141 | Coq10a 142 | Fn3k 143 | Sipa1l1 144 | Slc25a16 145 | Slc25a40 146 | Rps6ka5 147 | Trim37 148 | Lrrc61 149 | Abhd3 150 | Gbe1 151 | Parp16 152 | Hsd3b2 153 | Esm1 154 | Dnajc18 155 | Dolpp1 156 | Lass2 157 | Wdr34 158 | Rfesd 159 | Cacnb4 160 | 2310042D19Rik 161 | Srr 162 | Bpnt1 163 | 6530415H11Rik 164 | Clcc1 165 | Tfb1m 166 | 4632404H12Rik 167 | D4Bwg0951e 168 | Med14 169 | Adhfe1 170 | 
Thtpa 171 | Cat 172 | Ell3 173 | Akr7a5 174 | Mtmr14 175 | Timm44 176 | Sf1 177 | Ipp 178 | Iah1 179 | Trim23 180 | Wdr89 181 | Gstz1 182 | Cradd 183 | 2510006D16Rik 184 | Fbxl6 185 | LOC100044400 186 | Zfp106 187 | Cd55 188 | 0610013E23Rik 189 | Afmid 190 | Tmem86a 191 | Aldh6a1 192 | Dalrd3 193 | Smyd4 194 | Nme7 195 | Fars2 196 | Tasp1 197 | Cldn10 198 | A930005H10Rik 199 | Slc9a6 200 | Adk 201 | Rbks 202 | 2210016F16Rik 203 | Vwce 204 | 4732435N03Rik 205 | Zfp11 206 | Vldlr 207 | 9630013D21Rik 208 | 4933407N01Rik 209 | Fahd1 210 | Mipol1 211 | 1810019D21Rik 212 | 1810049H13Rik 213 | Tfam 214 | Paics 215 | 1110032A03Rik 216 | LOC100044139 217 | Dnajc19 218 | BC016495 219 | A930041I02Rik 220 | Rqcd1 221 | Usp34 222 | Zcchc3 223 | H2afj 224 | Phf7 225 | 4921508D12Rik 226 | Kmo 227 | Prpf18 228 | Mcat 229 | Txndc4 230 | 4921530L18Rik 231 | Vps13b 232 | Scrn3 233 | Tor1a 234 | AI316807 235 | Acbd4 236 | Fah 237 | Apool 238 | Col4a4 239 | Lrrc19 240 | Gnmt 241 | Nr3c1 242 | Sip1 243 | Ascc1 244 | Fech 245 | Abhd14a 246 | Arhgap18 247 | 2700046G09Rik 248 | Yme1l1 249 | Gk5 250 | Glo1 251 | Sbk1 252 | Cisd1 253 | 2210011C24Rik 254 | Nxt2 255 | Notum 256 | Ankrd42 257 | Ube2e1 258 | Ndufv1 259 | Slc33a1 260 | Cep68 261 | Rps6kb1 262 | Hyi 263 | Aldh1a3 264 | Mynn 265 | 3110048L19Rik 266 | Rdh14 267 | Proz 268 | Gorasp1 269 | LOC674449 270 | Zfp775 271 | 5430437P03Rik 272 | Npy 273 | Adh5 274 | Sybl1 275 | 4930432O21Rik 276 | Nat9 277 | LOC100048387 278 | Mettl8 279 | Eny2 280 | 2410018G20Rik 281 | Pgm2 282 | Fgfr4 283 | Mobkl2b 284 | Atad3a 285 | 4932432K03Rik 286 | Dhtkd1 287 | Ubox5 288 | A530050D06Rik 289 | Zdhhc5 290 | Mgat1 291 | Nudt6 292 | Tpmt 293 | Wbscr18 294 | LOC100041586 295 | Cdk5rap1 296 | 4833426J09Rik 297 | Myo6 298 | Cpt1a 299 | Gadd45gip1 300 | Tmbim4 301 | 2010309E21Rik 302 | Asb9 303 | 2610019F03Rik 304 | 7530414M10Rik 305 | Atp6v1b2 306 | 2310068J16Rik 307 | Ddt 308 | Klhdc4 309 | Hpn 310 | Lifr 311 | Ovol1 312 | Nudt12 313 | Cdan1 314 | Fbxo9 315 | Fbxl3 316 | Hoxa7 317 | Aldh8a1 318 | 3110057O12Rik 319 | Abhd11 320 | Psmb1 321 | ENSMUSG00000074286 322 | Chpt1 323 | Oxsm 324 | 2310009A05Rik 325 | 1700001L05Rik 326 | Zfp148 327 | 39509 328 | Mrpl9 329 | Tmem80 330 | 9030420J04Rik 331 | Naglu 332 | Plscr2 333 | Agbl3 334 | Pex1 335 | Cno 336 | Neo1 337 | Asf1a 338 | Tnfsf5ip1 339 | Pkig 340 | AI931714 341 | D130020L05Rik 342 | Cntd1 343 | Clec2h 344 | Zkscan1 345 | 1810044D09Rik 346 | Mettl7a 347 | Siae 348 | Fbxo3 349 | Fzd5 350 | Tmem166 351 | Tmed4 352 | Gpr155 353 | Rnf167 354 | Sptlc1 355 | Riok2 356 | Tgds 357 | Pms1 358 | Pitpnc1 359 | Pcsk7 360 | 4933403G14Rik 361 | Ei24 362 | Crebl2 363 | Tln1 364 | Mrpl35 365 | 2700038C09Rik 366 | Ubie 367 | Osgepl1 368 | 2410166I05Rik 369 | Wdr24 370 | Ap4s1 371 | Lrrc44 372 | B3bp 373 | Itfg1 374 | Dmxl1 375 | C1d -------------------------------------------------------------------------------- /tests/test_gmt.gmt: -------------------------------------------------------------------------------- 1 | ChIP-seq test desc LRRC37A3,2.0 TLE3:5.0 MED4 CRBN TRIP4 DHDDS ANKHD1 FLJ40852 TBL1XR1 DEPDC1B FKBP9 HCG25 DNA2 CDKN2AIPNL ZNF830 ASF1B PPP1R15B DCLRE1B ZFP36L1 HIRA SLC9A1 TOMM22 TWISTNB MYC C10ORF2 MACROD1 CEP95 HES1 PLEKHJ1 ZC3H10 AQR GADD45A ALG6 MIRLET7I SPAG5-AS1 SNHG7 ERAL1 EXT1 MED23 NOXA1 NDUFAF1 ADNP HPD RPS20 NOP10 ENO1 TRIAP1 RPS15 C7ORF55 ALKBH2 ZCWPW1 PIM1 DUSP6 RBM15 JUN SSSCA1 C1ORF27 SAMD4B PRPF18 PTMA MEIS1 NDUFS7 ID3 ID2 CITED2 CHD9 MRPL39 MTIF2 TROVE2 MED19 MRPL40 PPP6R1 JARID2 ACP2 SF3A2 SF3A3 SPRYD4 ZNF460 STAT3 
FOS GATC SLC39A13 TPD52L2 KCTD5 SULF2 LOC284385 KLF9 HNRNPUL1 KLF6 DDIT4 SSBP1 TRIB1 NRP2 POLDIP3 TTK MIR663A SMG7 SPRED2 SNORD54 REV3L 2 | Data aggregation CD24L4 PDGFRA UBE2C CDC20 CXCL10 COL1A2 MELK RAB31 BUB1B SLPI VIM IL6ST MAD2L1 PCNA ATF3 IFITM1 SERPINE1 AURKA CLU TYMS EGFR CTGF CCND2 MYB CCND1 CDH1 PTTG1 MYC CYP1B1 KPNA2 EGR1 XBP1 PLK1 IGFBP5 IGFBP3 ESR1 CCNA2 BIRC5 SPARC GATA3 FOXM1 CCNB2 CCNB1 LGALS1 STMN1 NEK2 JUNB MYBL2 DUSP6 RFC4 JUN CAV1 FN1 CDC6 VEGFA CENPF NFKBIA PTPRC IL6 IL8 ID2 MCM4 BCL2 CDK1 MCM2 CDKN3 MCM3 BTG2 TOP2A CKS1B TNFAIP3 HMMR CXCL2 CXCL1 SOCS2 ERBB2 TK1 TIMP1 RRM2 MME FOS PRC1 PLAUR KLF4 SOD2 ASPM KIT CCNE1 CKS2 CD44 CDKN1A FHL1 THBS1 MKI67 TTK CENPA CXCR4 CCL2 NUSAP1 BUB1 3 | GWAS RBFOX1 KIRREL3 PCDH9 LRP1B ELMO1 CNTN4 DCHS2 CNTN5 ASTN2 PARK2 SEMA5A HFE CTNND2 NPAS3 ZMIZ1 MACROD2 CPNE4 GPC6 GPC5 ANKS1B ABCA1 GCKR ANK1 NAALADL2 DLG2 KCNMA1 CDH13 CCSER1 DPP10 FTO ZNF492 FAM13A NTM CACNA1C PCDH15 LPP ABO RORA GLIS3 NKAIN2 ZNF385D SOX5 KCNIP4 NRG1 NRG3 MARCH1 PTPRD DAB1 GALNTL6 MDGA2 RBMS3 ALK CNTNAP2 PTPRT TENM4 TRA NRXN1 NRXN3 GRIK2 FHIT PTPRG DPP6 ERBB4 MECOM DLGAP1 ATXN1 OPCML PRKG1 ASIC2 MAGI2 WWOX MYO16 DCC NEGR1 PDE4D SGCZ NELL1 FAM155A PALLD AGBL1 ZPR1 CNTNAP5 SLC24A3 RYR2 TOMM40 SYNE1 FMN2 HDAC9 ROBO2 EGFLAM GRM7 HECTD4 CHN2 CTNNA2 THSD7B DMD CTNNA3 CSMD1 CASC15 FADS1 4 | KO Mice Phenotyping TGFB1 PDGFRA DICER1 MITF TRP53 PRKAB1 FOXP3 COL1A1 APC FAS LEP IL6ST TP53 ATF2 RB1 IDUA CSF1 AHR EGFR ARNTL RELB LMNA RXRA E2F1 EGR1 CREBBP HPRT ESR1 ESR2 KITL FGFR1 GNAS FBN1 FGFR3 FGFR2 PSEN1 HIF1A IGF1R STK11 GJA1 LEPR APOE CCR2 VDR IL10 CAV1 PTPN11 PAX6 MMP9 SIRT1 TNFRSF1A PAX3 VEGFA TLR2 BMP4 IL6 IFNG TLR4 BMPR1A FOXC1 TERC BRCA1 BRCA2 MEOX2 SHH SGPL1 PSAP SMN1 DRD2 THUMPD3-AS1 CDKN2A NOS3 CHUK STAT3 SOD2 ERCC1 ADAM17 KIT PPARG KRAS CEBPB ITGB1 CDKN1B PKD1 CDKN1C CDKN1A THRA CTNNB1 SIX1 PTEN LYST PTGS2 PHEX GLI3 EDNRB NOS1 CCL2 DMD NOS2 ATP7A 5 | L1000 Expression Profiling CA12 BAD HUS1 DICER1 CDC25A ASRGL1 ARHGAP28 ZNF79 ARHGAP25 JMJD6 EML3 TM4SF1 CDC20 MELK CAPN10 BRE CNOT4 PKP3 KIAA0528 TRIM13 C11ORF71 EI24 HFE AURKA CLU PRCP ADCY2 EGFR AQP1 HSP90B1 APEH CCND2 CDH3 SH3BP5 ATXN7L1 ZNF821 STAT5B HEMK1 DTNA XBP1 DAXX ARID5B ERLIN1 NR0B2 NR0B1 ESR1 CCNA2 FOSL1 CDHR1 NOTCH1 ATP6V1D CFH CDC14B CCDC144A HSPH1 ADH5 C5 AMDHD2 HMOX1 APOE PLS1 CEP76 AASS CCL25 JUN IL16 SMC1A C6ORF62 PTMA CPD HLA-DRA CALU CDK1 DSG2 CD24 TOP2A HNF4G SYNGR3 PHGDH HAVCR1 ZNF586 HN1L EED RRM2 AKR1C2 AKR1C1 ATP2B1 DHRS2 EPOR TBC1D1 CBLL1 IGHM CARKD HSD17B11 OXCT1 CCL5 UBQLN2 CCDC53 PTK2B SMNDC1 6 | Literature TGFB1 PLCL1 PLCL2 ATP11C ATP11B PISD ATP11A PLCD1 PLCD4 TP53 PLCD3 DGKG RELA ATP10D ATP10B TNF ATP10A EGFR MYC AKT1 PLCE1 ATP9A PRKCH CREBBP PRKCG ATP8B4 PRKCI PRKCD ATP8B3 ATP8B2 ATP8B1 PRKCE PLA2G4A PLA2G4B IGF1 PRKCA ESR1 NFKB1 PLSCR2 PLSCR1 PLSCR4 PIK3R1 PLSCR3 PLSCR5 NOTCH1 ATP8A2 ATP8A1 PLA2G3 PLA2G5 PLA2G6 UBC UBB PLCG1 APOE PLCG2 PLA2G2E PRKCB1 JUN PLA2G2F PLA2G2D PLA2G2A CAV1 DGAT2L7 VEGFA DGAT2L6 IL1B IL6 CHPT1 PIK3CA PLCH2 GSK3B DAGLA DAGLB MOGAT3 MOGAT2 MOGAT1 BRCA1 PLD2 PLD1 PLD4 PLD3 PLTP RPS27A PLA2G12B STAT3 PLCB4 PLA2G10 PLCB2 UBA52 PLCB3 PLCB1 SRC PLA2G1B CTNNB1 PTEN EP300 MAPK3 CEPT1 MAPK1 HRAS 7 | Mass Spectrometry RPS8 HSPA1L RP11-631M21.2 EEF1A2 HNRNPM EEF1A1 HNRNPK CNOT1 VIM TARDBP HSPA1B GTF2I RPL11 HNRNPU NOLC1 NDRG1 LMNA RUVBL1 ACTB C4B TUBA3E TUBB6 KIAA1967 RACGAP1 TUBB4 TCOF1 RUVBL2 FLNA RPS3 XRCC6 ACTL6A KIF23 HCFC1 NCL SNRNP200 MAD1L1 DDX5 RIF1 NUMA1 ACTG1 PCBP1 EPRS RPL7 ACTG2 RPL8 PCBP2 PRDX1 RFC4 DDX39 HSP90AA1 RFC2 HSPA6 HSPA5 RFC3 HSPA8 TUBB HSPA2 
TUBB2A TUBB2C SFPQ EEF1G MYH9 DSG1 TOP2B DDX3Y TOP2A IQGAP1 HSPB1 PTBP1 GCN1L1 RBM39 NPM1 DSP SMARCC2 CCT2 DDX17 DCD PARP1 DDB1 TRIP13 MATR3 GAPDH EIF4A3 EIF4A2 HDAC1 RPLP0 POLDIP3 HSP90AB1 FHL3 DDX21 MKI67 U2AF1 HSPD1 ACTC1 SEPT9 TUBA1C TUBA1B PUF60 TRIM28 CCT8 8 | Microarray CD74 RPS8 RPS6 RARRES2 RPL23 SDHB SEPP1 HSPE1 RPSA SERT1 GSTA2 NPTXR RPL10 ZFP36L1 ATP5A1 MRFAP1 BSG ADORA3 SMAD5 HPRT RGD1563438 TMEM8 RPS20 PSME1 RT1-DA TXN1 RPL6 SPARC RGD1564596 RPL32 BRI3 PECR RGD1309529 ATP5C1 RGD1561926 UQCRH EEF1B2 RPS15 RPS17 RPS19 PRDX1 IBSP FTH1 GLUL APOE RPL39 RPS10 ACAA2 RPS11 HSPA5 TSC22D1 HSPA8 HMGA2 TCP1 GNB2L1 AKR1A1 RGD1564906 TACR1 RPS3A ATP5F1 MRPS5 NME2 TUBB2C CYCS PRPH1 HRG RGD1562690 NLGN2 COX7B BRP44L NDUFB8 NREP KHK ATP5O SEP15 LDHA PGK1 NDUFA5 PGAM1 LOC681389 G0S2 SOD1 LOC499782 RGD1561181 VDAC2 TPT1 NP POPDC2 KCNA2 GBE1 RPL10A ATP5G3 ATP5G2 ATP5G1 PTS PSMB7 PSMB3 RGD1310316 HADHSC CDTW1 9 | Multiple Assays D630039A03RIK SFMBT2 CDX2 PCDH8 EBF1 DKK1 SOX11 CXCL12 MYCN DLX1AS TCL1 LRRN2 LEF1 SLC27A2 HOXB1 RAX FZD10 SEZ6 ASCL2 NTN1 DLL1 SFRP1 HS6ST2 DLL4 GBX2 JAM2 MSX1 MSX2 PITX2 TCFCP2L1 PHC1 ESRRB SLC15A1 SEMA6A ST8SIA1 IGFBP5 BCL11B BCL11A GADD45G EVX1 INHBB NR0B1 GJB3 FGF15 NANOG HOXB8 TDGF1 FGFR2 DIDO1 PDGFC GATA6 SOX21 HOXB13 FGF5 SOX2 SALL3 SALL4 MYBL2 HS3ST3B1 FOXD4 FOXD3 PTCH1 PAX6 BMP4 MRAS CD24A ARL4C DNAJC6 ID3 DPPA3 ID2 SP8 ONECUT1 ZC3HAV1 TRH HOXC12 CYR61 LRRC2 OTX2 IRX2 IRX3 LEFTY2 LEFTY1 PRMT8 GAD1 DMRT1 COBL POU5F1 HCK TBX4 TCFAP2C TBX3 KLF2 KIT INSM1 FOXA2 NRP2 GNA14 ZIC2 NEFL 10 | Relative Cell Proliferation MED1 CHRNB2 CASP8AP2 CRYBB2 ZNF79 GTPBP3 SMARCA4 NUP93 HNRNPK RAB31 PSMC4 CIRH1A PSMC5 PSMC2 SRSF3 SNRPF PDCD1 SRSF7 AHCYL1 NUP205 GPS1 MYL12B STRN4 EFTUD2 CWC22 CSE1L MAP3K8 PRPF38B EIF2B3 IGFBP5 GPN3 EIF2S2 LSM3 COPZ1 RNF168 MYL12A PARP10 SMU1 TCN2 BCL2L1 MDM4 RGL4 PSME1 TCF3 RPS25 ATP6V1B2 SRP19 TFRC CLSTN2 CTR9 PCDH15 LPAR4 PPP4C UBC MYO18A ZNF645 SLC37A1 EIF5A NCBP2 PTPN11 NUTF2 SMC1A NUP54 PAFAH1B1 CYB561 COPE ARF4 PSMD12 URI1 ABCB7 PSIP1 HNF4G RPAP1 MRPL38 SULT1C4 MED14 SCAP TEAD1 SUZ12 HSDL2 ADSL ARNT SSRP1 HAUS1 ARCN1 PSMA7 PSMA4 GPR143 NAA15 HNRNPH3 INSM2 SLC25A12 EIF3E TAF2 RYR2 TOMM40 CTNNB1 KCNA6 POLE PSMB5 RNF138 FIP1L1 PUF60 ITGAV NPTN 11 | RNA-seq RPS6 APCDD1L RUNDC3B RPSA SRRM2 GRIK1-AS1 EEF1A1 PPIA SEPT7P3 ASPHD1 SNORA29 HOXA10-AS SCG3 PFN1 MT-ND5 MT-CO1 MT-CO2 CLDN3 MT-ND4 HSP90B1 MT-ND1 MT-ND2 ACTB BCL2L14 EEF1A1P30 SPOCK3 FLNA RPS2 RPS3 MLLT10P1 CELA2A EEF2 SCRT1 TDRD1 EIF4G1 TAGLN3 MT-CO3 EIF4G2 MTND5P11 RPL3 RPL4 DDX5 CCL4L2 ACTG1 ENO1 RPL7 FCRLA UBC IGF2-AS UBB NETO1 FTH1 FFAR4 RPL21P11 ATCAY PDZD9 SH3GL3 TRIM67 IL11 HSP90AA1 HSPA5 HSPA8 RPL13A ARHGAP23P1 RPS11P6 PTMA KCNS1 PABPC1 CALR ALDOA PHBP12 TAT ELAVL3 ATP5B CSRP3 LDHA PSAP SNORD109A ERICH5 KIAA0319 SPRR2E FAM83B PKM HNRNPA2B1 AGBL4 RPL21P44 SNRPGP10 GAPDH TPT1 RPLP0 CLEC12B HSP90AB1 SNRPEP2 PAICSP4 CLCN1 TMSB4X MT-ATP6 KRT8P39 TUBA1B CTNNA2 UPP2 RACK1 12 | Sequence Analysis PRLR HIPK2 SOX11 DCLK1 DOK6 BRWD1 ADCY1 FAXC SLC1A2 SYNPO2 TMEM245 NACC1 LAMP2 CCND1 SH3TC2 PTAR1 NSD2 UBN2 ZNF704 PPARGC1B CALN1 HMGN2 SAMD12 KLF12 PHC3 PRKCA GIGYF1 KCNMA1 PGR STRN ZNF652 INO80D RAB3B KLHL15 DCUN1D5 CACNA1E SESTD1 LPP IGF1R GRIN2A ZXDC G3BP1 ABL2 KCNN3 PDK3 PLXNA4 RIMKLA POU2F1 KSR2 CBX5 MICAL3 C1ORF21 ACVR2B ENAH QKI SLC7A5 CDK6 AGO2 LCOR AAK1 TNRC6B TENM1 CSRNP3 GPR26 BTG2 PTPRT NUFIP2 BNC2 ONECUT2 FOXK1 TOR1AIP2 AFF2 MECP2 SOCS4 FUT9 DLGAP2 KIF1B DISC1 ATXN1 NTRK2 MBNL3 PPARA NANOS1 ZNF460 NFIA GFRA1 SOD2 MKLN1 PPM1A PKM NFIC XIAP TAOK1 SLC24A2 CDKN1A ZDHHC3 SGCD PSD3 CUX1 DMD 13 | 
--------------------------------------------------------------------------------