├── .codecov.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── requirements-docs.txt └── source │ ├── CONTRIBUTING.md │ ├── Links.md │ ├── conf.py │ ├── data-formats.md │ ├── edgePy.data_import.mongodb.rst │ ├── edgePy.data_import.rst │ ├── edgePy.rst │ ├── functionality.md │ ├── index.rst │ ├── modules.rst │ ├── planned_statistical_tests.rst │ └── scripts.rst ├── edgePy ├── DGEList.py ├── __init__.py ├── benchmarking │ ├── 00.GSE49712.Rscript.txt │ ├── 00.GSE49712_gene_FPKM.txt │ ├── 01.diagnostic_fig1.png │ ├── 02.diagnostic_fig2.png │ ├── 03.mds.png │ ├── 04.diagnostic_fig3.png │ ├── 05.diagnostic_fig4.png │ ├── 06.heatmap_fig5.png │ ├── 07.DEGs.tsv │ ├── 08.topDEGs.tsv │ ├── 09.analysis.RNAseq.gse49712.Rdata │ └── README_benchmark.md ├── data │ ├── GSE49712_HTSeq.txt.gz │ ├── GSE49712_HTSeq.txt.npz │ ├── example_gene_list.txt │ ├── groups.json │ ├── symbols_homo_sapiens_core_75_37.tsv │ └── transcripts_homo_sapiens_core_75_37.tsv ├── data_import │ ├── __init__.py │ ├── data_import.py │ ├── ensembl │ │ ├── __init__.py │ │ ├── canonical_transcripts.py │ │ ├── ensembl_flat_file_reader.py │ │ └── mysql_wrapper.py │ └── mongodb │ │ ├── __init__.py │ │ ├── gene_functions.py │ │ ├── mongo_import.py │ │ └── mongo_wrapper.py └── util.py ├── pyproject.toml ├── pytest.ini ├── requirements-test.txt ├── scripts ├── __init__.py └── edgepy.py ├── setup.cfg ├── setup.py ├── tests ├── ensembl │ ├── test_canonical_transcripts.py │ └── test_ensembl_flat_file_reader.py ├── mongodb │ ├── fixtures │ │ ├── RNASeq.json │ │ ├── ensg_by_symbol.json │ │ ├── samples.json │ │ └── symbol_by_ensg.json │ ├── test_gene_functions.py │ ├── test_mongo_import.py │ └── test_mongo_wrapper.py ├── test_DGEList.py └── test_edgePy.py └── tox.ini /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | precision: 1 3 | round: down 4 | status: 5 | project: 6 | default: 7 | enabled: yes 8 | target: 90% 9 | threshold: 0.25% 10 | patch: 11 | default: 12 | target: auto 13 | 14 | comment: 15 | layout: "diff" 16 | behavior: default 17 | require_changes: no 18 | require_base: no 19 | require_head: yes 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Uncompressed project data files 2 | edgePy/data/*.csv 3 | edgePy/data/*.tsv 4 | edgePy/data/*.txt 5 | !edgePy/data/groups.txt 6 | !edgePy/data/example_gene_list.txt 7 | !edgePy/data/transcripts_homo_sapiens_core_75_37.tsv 8 | !edgePy/data/symbols_homo_sapiens_core_75_37.tsv 9 | 10 | # Intellij project files 11 | .idea 12 | 13 | # VS Code project files 14 | .vscode/ 15 | 16 | # Concatenated from the following sources on 2018-05-08: 17 | # 18 | # Lang. 
URI 19 | # Python https://github.com/github/gitignore/blob/18e28746b0862059dbee8694fd366a679cb812fb/Python.gitignore 20 | # R https://github.com/github/gitignore/blob/18e28746b0862059dbee8694fd366a679cb812fb/R.gitignore 21 | # 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/oh 43 | var/ 44 | wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/` 62 | .tox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # pyenv 98 | .python-version 99 | 100 | # celery beat schedule file 101 | celerybeat-schedule 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | 128 | # History files 129 | .Rhistory 130 | .Rapp.history 131 | 132 | # Session Data files 133 | .RData 134 | 135 | # Example code in package build process 136 | *-Ex.R 137 | 138 | # Output files from R CMD build 139 | /*.tar.gz 140 | 141 | # Output files from R CMD check 142 | /*.Rcheck/ 143 | 144 | # RStudio files 145 | .Rproj.user/ 146 | 147 | # produced vignettes 148 | vignettes/*.html 149 | vignettes/*.pdf 150 | 151 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 152 | .httr-oauth 153 | 154 | # knitr and R markdown default cache directories 155 | /*_cache/ 156 | /cache/ 157 | 158 | # Temporary files created by R markdown 159 | *.utf8.md 160 | *.knit.md 161 | 162 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 163 | rsconnect/ 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 6 | setup_py_install: true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 
python: 3 | - 3.6 4 | # Corrupt `boto.cfg` on TravisCI images 5 | # https://github.com/travis-ci/travis-ci/issues/7940 6 | before_install: 7 | - sudo rm -f /etc/boto.cfg 8 | install: 9 | - pip install codecov tox-travis 10 | script: 11 | - tox 12 | after_success: 13 | - codecov 14 | notifications: 15 | email: false 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright © 2018 [R-Bioinformatics Group] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include edgePy/data/* 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # edgePy 2 | 3 | ### Notice: 4 | 5 | This project is still in development. While we are a small band of bioinformaticians with big goals and aspirations, this code base is still too new for use on any real world projects. 6 | While there's no official timeline for the project, functionality is being developed rapidly, so please feel free to check back on our progress frequently. If you'd like to do more 7 | than just check on our progress, we're always happy to welcome new members of the community, both to slack group where we're organizing this project, as well as on the git hub repository 8 | hosting the project. To join the slack, send your email to @apfejes (on github) or /u/apfejes on reddit - we're looking forward to working with you. 
9 | 10 | [//]: # (TODO: Remove sample-sheet dummy library until we release on PyPi) 11 | [![PyPI Version](https://badge.fury.io/py/edgePy.svg)](https://pypi.org/project/edgePy) 12 | [![Build Status](https://travis-ci.org/r-bioinformatics/edgePy.svg?branch=master)](https://travis-ci.org/r-bioinformatics/edgePy) 13 | [![Documentation Status](https://readthedocs.org/projects/edgepy/badge/?version=latest)](http://edgepy.readthedocs.io/en/latest/?badge=latest) 14 | [![codecov](https://codecov.io/gh/r-bioinformatics/edgePy/branch/master/graph/badge.svg)](https://codecov.io/gh/r-bioinformatics/edgePy) 15 | [![Checked with MyPy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 16 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 17 | [![GitHub License](https://img.shields.io/pypi/l/sample-sheet.svg)](https://github.com/r-bioinformatics/edgePy/blob/master/LICENSE) 18 | 19 | 20 | The `edgePy` library will become an implementation of [`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html) for differential expression analysis in the Python language. 21 | This library will have advantages over [`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html) in that it will be well-tested and will run faster by utilizing Cythonized routines. 22 | `edgePy` will maintain the functionality of [`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html) in that it's primary goals are differential expression analysis of RNA-Seq expression profiles with biological replication. 23 | The statistical methods for negative binomial distributions will include empirical Bayes estimations, exact tests, generalized linear models, and quasi-likelihood tests. 24 | 25 | ## Project Aims 26 | 27 | The `edgePy` library will be used for data import, normalization with respect to conditions, application of generalized linear models, and visualization. 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = --color 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = EdgePy 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | recommonmark 2 | Sphinx >= 1.4.4 3 | sphinx_rtd_theme 4 | tox >= 3.1.2 5 | -------------------------------------------------------------------------------- /docs/source/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository discuss the change you wish to make _via_ this project's [GitHub issues](https://github.com/r-bioinformatics/edgePy/issues) first. 
4 | 5 | ## PR Process for Project Contributers 6 | 7 | Always ensure that you have fetched (_via_ `git pull`) the most recent material into your local clone. 8 | 9 | 1. Checkout a branch (`git checkout -b `) prefixed with your initials and suffixed with the issue you are addressing or a brief few words describing the feature/bug fix joined by underscores (`_`). Here are valid formats: 10 | - `cv_issue_45` 11 | - `af_issue_2123` 12 | - `cv_fix_requests_regression` 13 | - `af_transpose_docs` 14 | 2. Commit changes. 15 | 3. Execute and create tests regularly. Use `py.test`. 16 | 4. Request informal review from peers by pointing them to your branch. 17 | 5. Create a Pull Request against `master` when a formal review is needed. 18 | 6. Optionally, squash commits and reword messages as needed for easier review. 19 | 7. Ensure all continuous integration (CI) tests and code reviews pass before rebasing (or squashing and then rebasing) onto `master`. 20 | 21 | - Avoid directly merging a PR onto `master` without first rebasing. 22 | 23 | ## Documentation and Code Style 24 | 25 | 1. Strictly adhere to PEP8. 26 | 2. Use [Google Style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) docstrings. 27 | 3. Implement doctests. 28 | 4. Provide accurate type annotations. 29 | 5. Limit line lengths to 120 characters. 30 | 31 | An example function showcasing the above requirements: 32 | 33 | ```python 34 | def get_dataset_path( 35 | filename: Union[str, Path], 36 | dead_arg: Optional[Any] = None 37 | ) -> Path: 38 | """Example function with PEP 484 type annotations. 39 | 40 | Args: 41 | filename: The first parameter. 42 | dead_arg: The second parameter. 43 | 44 | Returns: 45 | The path to the dataset, may not really exist. 46 | 47 | Examples: 48 | >>> from module.io import get_dataset_path 49 | >>> str(get_dataset_path("GSE49712_HTSeq.txt.gz")) # doctest:+ELLIPSIS 50 | '.../data/GSE49712_HTSeq.txt.gz' 51 | 52 | Notes: 53 | 1. See ``module.rationalize`` for an equivalent method. 54 | 55 | """ 56 | import module 57 | directory = Path(module.__file__).expanduser().resolve().parent 58 | return directory / 'data' / filename 59 | ``` 60 | 61 | ### Code Style 62 | This repository uses [Black](https://github.com/ambv/black) as a code formatter. 63 | 64 | It can be ran a few different ways: 65 | 66 | 1. Manually by running `$ black .` in the repository root 67 | 2. Though [pre-commit](https://pre-commit.com/) a git hook that runs it whenever a commit is made. 68 | 3. It can also be integrated in the of your choice by following the instructions in the [documention](https://github.com/ambv/black#editor-integration). 69 | 70 | ## Updating the documentation 71 | 72 | New documentation files must be of the following format: 73 | - reStructuredText (**.rst**) -- _preferred_ 74 | - Markdown (**.md**) 75 | 76 | A new file can be added to the appropriate gloassary tree in `edgePy/docs/source/index.rst`. 77 | 78 | The service `readthedocs.org` will automatically source the *conf.py* file in `edgePy/docs/sources/conf.py` and update the docs accordingly on each commit pushed to GitHub, on any branch. 79 | 80 | Local HTML renders of the documentation can be built with the following: 81 | 82 | ```bash 83 | ❯ cd edgePy/docs 84 | ❯ pip install -r requirements-docs.txt 85 | ❯ make html 86 | ``` 87 | 88 | This will create or update the HTML documents in the `\docs\_build\html` directory. 
89 | 90 | ## Developing in a Virtual Environment 91 | 92 | The development environment is listed as an additional `Tox` environment: 93 | 94 | ```bash 95 | ❯ tox -lv 96 | 97 | using tox.ini: .../edgePy/tox.ini 98 | using tox-3.1.2 from .../python3.6/dist-packages/tox/__init__.py 99 | default environments: 100 | py36 -> run the test suite with (basepython) 101 | py36-lint -> check the code style 102 | py36-type -> type check the library 103 | py36-docs -> test building of HTML docs 104 | 105 | additional environments: 106 | dev -> the official edgePy development environment 107 | ``` 108 | 109 | To create and activate that environment issue the following: 110 | 111 | ```bash 112 | ❯ cd edgePy 113 | # Create the development environment (force recreation) 114 | ❯ tox --recreate -e dev 115 | # Activate the development environment 116 | ❯ source venv/bin/activate 117 | 118 | ``` 119 | 120 | ## Running the Test Suite 121 | 122 | All tests are coordinated by `Tox`. Running the unit tests, code coverage, code style (linting) checks, static analysis of typing, and successful compilation of the docs is as simple as the following commands! 123 | 124 | > **Note**: This command takes a long time the first time it is invoked since all virtual environments need to be created! 125 | 126 | ```bash 127 | ❯ cd edgePy 128 | ❯ tox 129 | ``` 130 | 131 | ## Running Parts of the Test Suite 132 | 133 | You can select only a part of the test suite by looking at which `Tox` groups are available: 134 | 135 | ```bash 136 | ❯ cd edgePy 137 | ❯ tox -lv 138 | 139 | using tox.ini: ../edgePy/tox.ini 140 | using tox-3.1.2 from ../python3.6/dist-packages/tox/__init__.py 141 | default environments: 142 | py36 -> run the test suite with (basepython) 143 | py36-lint -> check the code style 144 | py36-type -> type check the library 145 | py36-docs -> test building of HTML docs 146 | ``` 147 | 148 | Choose a specific group to run with the following syntax: 149 | 150 | ```bash 151 | ❯ cd edgePy 152 | ❯ tox -e py36-type 153 | ``` 154 | 155 | Almost all dynamic and static analysis tools are configured in `setup.cfg` so check there for the configuration of the test suite first. 156 | -------------------------------------------------------------------------------- /docs/source/Links.md: -------------------------------------------------------------------------------- 1 | # Links 2 | 3 | Project Notes: [Google doc]( 4 | https://docs.google.com/document/d/1s-GMQld8DYtfxupELuYw2VtpXL6gSAUMCqITSmi7Udg/edit#heading=h.uk4y7e3dhmh6) -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Configuration file for the Sphinx documentation builder. 5 | # 6 | # This file does only contain a selection of the most common options. For a 7 | # full list see the documentation: 8 | # http://www.sphinx-doc.org/en/master/config 9 | 10 | # -- Path setup -------------------------------------------------------------- 11 | 12 | # If extensions (or modules to document with autodoc) are in another directory, 13 | # add these directories to sys.path here. If the directory is relative to the 14 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
15 | # 16 | import os 17 | import sys 18 | 19 | sys.path.insert(0, os.path.abspath('../../..')) 20 | sys.path.insert(0, os.path.abspath('../..')) 21 | sys.path.insert(0, os.path.abspath('../')) 22 | 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = "EdgePy" 27 | copyright = "2018, R-Bioinformatics" 28 | author = "R-Bioinformatics" 29 | 30 | # The short X.Y version 31 | version = "" 32 | # The full version, including alpha/beta/rc tags 33 | release = "0.0.1" 34 | 35 | 36 | # -- General configuration --------------------------------------------------- 37 | 38 | # If your documentation needs a minimal Sphinx version, state it here. 39 | # 40 | # needs_sphinx = '1.0' 41 | 42 | # Add any Sphinx extension module names here, as strings. They can be 43 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 44 | # ones. 45 | extensions = [ 46 | 'sphinx.ext.viewcode', 47 | 'sphinx.ext.mathjax', 48 | 'sphinx.ext.intersphinx', 49 | 'sphinx.ext.autodoc', 50 | 'sphinx.ext.napoleon', 51 | # 'sphinx_autodoc_annotation', 52 | # 'sphinx_autodoc_napoleon_typehints', 53 | ] 54 | 55 | napoleon_include_private_with_doc = True 56 | napoleon_google_docstring = True 57 | napoleon_numpy_docstring = False 58 | napoleon_use_param = False 59 | napoleon_use_ivar = False 60 | napoleon_use_rtype = True 61 | 62 | intersphinx_mapping = { 63 | 'python': ('https://docs.python.org/3', None), 64 | 'requests': ('http://docs.python-requests.org/en/latest/', None), 65 | } 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ['_templates'] 69 | 70 | # The suffix(es) of source filenames. 71 | # You can specify multiple suffix as a list of string: 72 | # 73 | 74 | # MARKDOWN PARSER 75 | source_suffix = ['.rst', '.md'] 76 | source_parsers = {'.md': 'recommonmark.parser.CommonMarkParser'} 77 | 78 | # The master toctree document. 79 | master_doc = 'index' 80 | 81 | # The language for content autogenerated by Sphinx. Refer to documentation 82 | # for a list of supported languages. 83 | # 84 | # This is also used if you do content translation via gettext catalogs. 85 | # Usually you set "language" from the command line for these cases. 86 | language = 'en' 87 | 88 | # List of patterns, relative to source directory, that match files and 89 | # directories to ignore when looking for source files. 90 | # This pattern also affects html_static_path and html_extra_path . 91 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = 'default' 95 | 96 | # -- Options for HTML output ------------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. 99 | # See the documentation for a list of builtin themes. 100 | # 101 | html_theme = "sphinx_rtd_theme" 102 | 103 | # Theme options are theme-specific and customize the look and feel of a theme 104 | # further. For a list of options available for each theme, see the 105 | # documentation. 
106 | # 107 | html_theme_options = { 108 | 'canonical_url': '', 109 | 'analytics_id': '', 110 | 'logo_only': False, 111 | 'display_version': True, 112 | 'prev_next_buttons_location': 'bottom', 113 | 'style_external_links': False, 114 | # 'vcs_pageview_mode': '', 115 | # Toc options 116 | 'collapse_navigation': True, 117 | 'sticky_navigation': True, 118 | 'navigation_depth': 4, 119 | 'includehidden': True, 120 | 'titles_only': False, 121 | } 122 | 123 | autodoc_mock_imports = ['pymongo'] 124 | 125 | # html_theme_path = ["_themes/sphinx_rtd_theme", ] 126 | 127 | # Add any paths that contain custom static files (such as style sheets) here, 128 | # relative to this directory. They are copied after the builtin static files, 129 | # so a file named "default.css" will overwrite the builtin "default.css". 130 | # html_static_path = ['_static'] 131 | 132 | # Custom sidebar templates, must be a dictionary that maps document names 133 | # to template names. 134 | # 135 | # The default sidebars (for documents that don't match any pattern) are 136 | # defined by theme itself. Builtin themes are using these templates by 137 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 138 | # 'searchbox.html']``. 139 | # 140 | # html_sidebars = {} 141 | 142 | 143 | # -- Options for HTMLHelp output --------------------------------------------- 144 | 145 | # Output file base name for HTML help builder. 146 | htmlhelp_basename = "EdgePydoc" 147 | 148 | 149 | # -- Options for LaTeX output ------------------------------------------------ 150 | 151 | latex_elements = { 152 | # The paper size ('letterpaper' or 'a4paper'). 153 | # 154 | # 'papersize': 'letterpaper', 155 | # The font size ('10pt', '11pt' or '12pt'). 156 | # 157 | # 'pointsize': '10pt', 158 | # Additional stuff for the LaTeX preamble. 159 | # 160 | # 'preamble': '', 161 | # Latex figure (float) alignment 162 | # 163 | # 'figure_align': 'htbp', 164 | } 165 | 166 | # Grouping the document tree into LaTeX files. List of tuples 167 | # (source start file, target name, title, 168 | # author, documentclass [howto, manual, or own class]). 169 | latex_documents = [ 170 | (master_doc, 'EdgePy.tex', 'EdgePy Documentation', 'R-Bioinformatics', 'manual') 171 | ] 172 | 173 | 174 | # -- Options for manual page output ------------------------------------------ 175 | 176 | # One entry per manual page. List of tuples 177 | # (source start file, name, description, authors, manual section). 178 | man_pages = [(master_doc, 'edgepy', 'EdgePy Documentation', [author], 1)] 179 | 180 | 181 | # -- Options for Texinfo output ---------------------------------------------- 182 | 183 | # Grouping the document tree into Texinfo files. 
List of tuples 184 | # (source start file, target name, title, author, 185 | # dir menu entry, description, category) 186 | texinfo_documents = [ 187 | ( 188 | master_doc, 189 | 'EdgePy', 190 | 'EdgePy Documentation', 191 | author, 192 | 'EdgePy', 193 | 'One line description of project.', 194 | 'Miscellaneous', 195 | ) 196 | ] 197 | -------------------------------------------------------------------------------- /docs/source/data-formats.md: -------------------------------------------------------------------------------- 1 | ## Input 2 | 3 | Input should be given in tab-delimited format with the following header column names: 4 | 5 | - `FeatureID` (Can be gene, transcript, splice variant or protein) 6 | - `SampleID` (Generic label to keep track of an individual sample) 7 | 8 | Input main content: 9 | 10 | - `Data Matrix` A numeric matrix (Previously `counts`). 11 | A numeric matrix. Each row represents a single feature and each column represent a single sample. 12 | 13 | Sample data can be found in the [`data/`](../edgePy/data/) folder, which was derived from data on the [NCBI Gene Expression Ombnibus](https://www.ncbi.nlm.nih.gov/geo/). 14 | -------------------------------------------------------------------------------- /docs/source/edgePy.data_import.mongodb.rst: -------------------------------------------------------------------------------- 1 | edgePy.data\_import.mongodb package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | edgePy.data\_import.mongodb.gene\_functions module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: edgePy.data_import.mongodb.gene_functions 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | edgePy.data\_import.mongodb.mongo\_import module 16 | ------------------------------------------------ 17 | 18 | .. automodule:: edgePy.data_import.mongodb.mongo_import 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | edgePy.data\_import.mongodb.mongo\_wrapper module 24 | ------------------------------------------------- 25 | 26 | .. automodule:: edgePy.data_import.mongodb.mongo_wrapper 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: edgePy.data_import.mongodb 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/edgePy.data_import.rst: -------------------------------------------------------------------------------- 1 | edgePy.data\_import package 2 | =========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | edgePy.data_import.mongodb 10 | 11 | Submodules 12 | ---------- 13 | 14 | edgePy.data\_import.data\_import module 15 | --------------------------------------- 16 | 17 | .. automodule:: edgePy.data_import.data_import 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: edgePy.data_import 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/edgePy.rst: -------------------------------------------------------------------------------- 1 | edgePy package 2 | ============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. 
toctree:: 8 | 9 | edgePy.data_import 10 | 11 | Submodules 12 | ---------- 13 | 14 | edgePy.DGEList module 15 | --------------------- 16 | 17 | .. automodule:: edgePy.DGEList 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: edgePy 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/functionality.md: -------------------------------------------------------------------------------- 1 | # Functionality 2 | 3 | OMICs analysis is made easy with R tools such as “edgeR” and “limma” packages. R has serious limitations when applied to large datasets. 4 | 5 | The First objective of edgePY is to offer an alternative free tool for such analysis. 6 | 7 | # Components 8 | 9 | Input(1) -> Normalization(2) -> Analysis(3) -> Visualization/Results(4). 10 | 11 | ## Input 12 | 13 | Read correctly the file 14 | 15 | A data matrix separated by tab. Of genes/proteins in lines and samples/observations in columns. Groups for the main analysis usually are defined there, or assigned to the samples. 16 | 17 | ## Normalization 18 | 19 | Quality -> Library -> (TMM or RLE or upperquartile or none) -> commonDispersion -> TagwiseDispersion -> Norm. Matrix 20 | 21 | ## Analysis 22 | 23 | Norm. Matrix -> Set the sample groups to be compared -> Statistical analysis of choice (ebayes/treat/QLF) -> DE genes list and statistics 24 | 25 | ## Visualization/Results 26 | 27 | DE genes list / Statistics -> Visualization ( Venn / The mean-variance relationship of log-CPM / Heatmaps / Volcano plots / Dispersion plots 28 | 29 | 30 | More details should be added as we progress in the coding. 31 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. EdgePy documentation master file, created by 2 | sphinx-quickstart on Thu Jul 12 11:20:59 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to EdgePy's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Main Documentation 12 | 13 | functionality 14 | data-formats 15 | planned_statistical_tests 16 | CONTRIBUTING 17 | Links 18 | modules 19 | 20 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | edgePy 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | edgePy 8 | scripts 9 | -------------------------------------------------------------------------------- /docs/source/planned_statistical_tests.rst: -------------------------------------------------------------------------------- 1 | Planned statistical tests 2 | ============================= 3 | 4 | This table includes statistical tests that will be implemented within this package. 
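As a rough illustration only, the first entry below (multiple exact binomial tests) might map onto ``scipy.stats`` along the following lines. The helper name and signature are hypothetical and not part of edgePy:

.. code-block:: python

    from typing import List

    from scipy import stats


    def exact_binom_test(
        counts_a: List[int], counts_b: List[int], lib_size_a: int, lib_size_b: int
    ) -> List[float]:
        """Per-gene exact binomial test between two libraries (sketch only).

        Follows the idea behind edgeR's binomTest: conditional on the total count
        observed for a gene, the count in library A is binomial with success
        probability equal to library A's share of the combined library size.
        """
        p_a = lib_size_a / (lib_size_a + lib_size_b)
        p_values = []
        for x_a, x_b in zip(counts_a, counts_b):
            total = x_a + x_b
            # A gene with no reads in either library carries no information.
            p_values.append(1.0 if total == 0 else stats.binom_test(x_a, n=total, p=p_a))
        return p_values

A production implementation would also need vectorisation and multiple-testing correction, which is why the table tracks the corresponding R sources.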
5 | 6 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 7 | | Statistical test | R src location | Python test link | Python library | Notes | 8 | +=================================================+===============================+=========================================================================================+======================+==========================================================+ 9 | | Multiple exact binomial tests | edgeR/R/binomTest.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom_test.html | scipy | | 10 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 11 | | F test | edgeR/R/decidetestsDGE.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f.html | scipy | | 12 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 13 | | Linear Modelling | limma/R/lmfit.R | https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.linregress.html | scipy | lmFit and contrasts. fit | 14 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 15 | | Negative Binomial | edgeR/R/glmfit.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html | scipy | | 16 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 17 | | Z score of Binomial | edgerR/R/zscoreNBinom.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.zscore.html | scipy | Tentative / Not sure if this is the same as scipy zscore | 18 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 19 | | Quasi-Likelihood GLM tests | edgeR/R/glmQLFTest.R | https://docs.pymc.io/notebooks/GLM-negative-binomial-regression.html | MC3 | Tentative | 20 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 21 | | Exact conditional likelihood | edgeR/R/estimateCommonDisp.R | couldnt find it on scipy | n/a | Very important for DE | 22 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 23 | | Weighted conditional likelihood empirical Bayes | 
edgeR/R/estimateTagwiseDisp.R | couldnt find it on scipy | n/a | Very important for DE | 24 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ -------------------------------------------------------------------------------- /docs/source/scripts.rst: -------------------------------------------------------------------------------- 1 | scripts package 2 | =============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scripts.edgepy module 8 | --------------------- 9 | 10 | .. automodule:: scripts.edgepy 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: scripts 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /edgePy/DGEList.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from io import StringIO 4 | from pathlib import Path 5 | from typing import Generator, Iterable, Mapping, Optional, Union, Dict, List, Hashable, Any 6 | 7 | # TODO: Implement `mypy` stubs for NumPy imports 8 | import numpy as np # type: ignore 9 | from smart_open import smart_open # type: ignore 10 | 11 | from edgePy.util import getLogger 12 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 13 | 14 | __all__ = ["DGEList"] 15 | 16 | PRIOR_COUNT: float = 0.25 17 | 18 | log = getLogger(name=__name__) 19 | 20 | 21 | class DGEList(object): 22 | """Class containing read counts over genes for multiple samples and their 23 | corresponding metadata. 24 | 25 | Args: 26 | counts: Columns correspond to samples and row to genes. 27 | samples: Array of sample names, same length as ncol(counts). 28 | genes: Array of gene names, same length as nrow(counts). 29 | norm_factors: Weighting factors for each sample. 30 | groups_in_list: a list of groups to which each sample belongs, in the same order as samples *or* 31 | groups_in_dict: a dictionary of groups, containing sample names. 32 | to_remove_zeroes: To remove genes with zero counts for all samples. 33 | filename: a shortcut to import NPZ (zipped numpy format) files. 34 | current_type: None means raw counts, otherwise, if transformed, store a string (eg. 'cpm', 'rpkm', etc) 35 | current_log: Optional[bool] = False, If counts has already been log transformed, store True. 36 | Examples: 37 | 38 | >>> from edgePy.data_import import get_dataset_path 39 | >>> dataset = 'GSE49712_HTSeq.txt.gz' 40 | >>> group_file = 'groups.json' 41 | >>> DGEList.create_DGEList_data_file(get_dataset_path(dataset), get_dataset_path(group_file)) 42 | DGEList(num_samples=10, num_genes=21,711) 43 | 44 | """ 45 | 46 | # Pattern to delete from field names anytime they are assigned. 47 | _field_strip_re = re.compile(r'[\s"]+') 48 | 49 | # Metatags used in older HTSeq datasets without underscore prefixes. 
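    # Newer HTSeq releases emit the same counters with a double-underscore prefix
    # (e.g. '__no_feature'); those variants are handled separately by the
    # startswith('__') check in the ``genes`` setter below.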
50 | _old_metatags = np.array( 51 | ['no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned', 'alignment_not_unique'] 52 | ) 53 | 54 | def __init__( 55 | self, 56 | counts: Optional[np.ndarray] = None, 57 | samples: Optional[np.array] = None, 58 | genes: Optional[np.array] = None, 59 | norm_factors: Optional[np.array] = None, 60 | groups_in_list: Optional[np.array] = None, 61 | groups_in_dict: Optional[Dict] = None, 62 | to_remove_zeroes: Optional[bool] = False, 63 | filename: Optional[str] = None, 64 | current_transform_type: Optional[str] = None, 65 | current_log_status: Optional[bool] = False, 66 | ) -> None: 67 | 68 | self.to_remove_zeroes = to_remove_zeroes 69 | self.current_data_format = current_transform_type 70 | self.current_log_status = current_log_status 71 | 72 | if filename: 73 | if counts or samples or genes or norm_factors or groups_in_list or groups_in_dict: 74 | raise Exception("if filename is provided, you can't also provide other parameters") 75 | self._counts = None 76 | self.read_npz_file(filename) 77 | 78 | else: 79 | if counts is None: 80 | raise Exception("counts must be provided at init") 81 | 82 | if norm_factors is None: 83 | try: 84 | norm_factors = np.ones(np.size(counts, 1)) 85 | except IndexError: 86 | raise ValueError( 87 | "counts must have more than one sample " "- eg, have two dimensions" 88 | ) 89 | 90 | self.counts = counts 91 | self.samples = samples 92 | self.genes = genes 93 | self.norm_factors = norm_factors 94 | 95 | if groups_in_dict is not None and groups_in_list is not None: 96 | self.groups_dict = groups_in_dict 97 | self.groups_list = groups_in_list 98 | elif groups_in_dict is not None and self.samples is not None: 99 | self.groups_dict = groups_in_dict 100 | self.groups_list = self._sample_group_list(groups_in_dict, self.samples) 101 | elif groups_in_list is not None and self.samples is not None: 102 | self.groups_list = groups_in_list 103 | self.groups_dict = self._sample_group_dict(groups_in_list, self.samples) 104 | else: 105 | raise ValueError( 106 | "You must provide either group by sample or sample by group, " 107 | "and samples must be present" 108 | ) 109 | 110 | def copy( 111 | self, 112 | counts: Optional[np.ndarray] = None, 113 | samples: Optional[np.array] = None, 114 | genes: Optional[np.array] = None, 115 | norm_factors: Optional[np.array] = None, 116 | groups_in_list: Optional[np.array] = None, 117 | groups_in_dict: Optional[Dict] = None, 118 | to_remove_zeroes: Optional[bool] = False, 119 | current_type: Optional[str] = None, 120 | current_log: Optional[bool] = False, 121 | ) -> "DGEList": 122 | 123 | return DGEList( 124 | counts=self.counts if counts is None else counts, 125 | samples=self.samples if samples is None else samples, 126 | genes=self.genes if genes is None else genes, 127 | norm_factors=self.norm_factors if norm_factors is None else norm_factors, 128 | groups_in_list=self.groups_dict if groups_in_dict is None else groups_in_dict, 129 | groups_in_dict=self.groups_list if groups_in_list is None else groups_in_list, 130 | to_remove_zeroes=self.to_remove_zeroes 131 | if to_remove_zeroes is None 132 | else to_remove_zeroes, 133 | current_transform_type=self.current_data_format 134 | if current_type is None 135 | else current_type, 136 | current_log_status=self.current_log_status if current_log is None else current_log, 137 | ) 138 | 139 | @staticmethod 140 | def _sample_group_dict(groups_list: List[str], samples: np.array): 141 | """ 142 | Converts data in the form ['group1', 'group1', 'group2', 'group2'] 143 | to 
the form {'group1': ['sample1', 'sample2'], 'group2': ['sample3', 'sample4'} 144 | 145 | Args: 146 | groups_list: group names in a list, in the same order as samples. 147 | 148 | Returns: 149 | dictionary containing the sample types, each with a list of samples. 150 | 151 | """ 152 | d: Dict[Hashable, Any] = {} 153 | log.info(samples) 154 | for idx, group in enumerate(groups_list): 155 | if group not in d: 156 | d[group] = [] 157 | d[group].append(samples[idx]) 158 | return d 159 | 160 | @staticmethod 161 | def _sample_group_list(groups_dict, samples): 162 | """ 163 | Converts data in the form {'group1': ['sample1', 'sample2'], 'group2': ['sample3', 'sample4'} 164 | to the form ['group1', 'group1', 'group2', 'group2'] 165 | 166 | Args: 167 | groups_dict: dictionary containing the sample types, each with a list of samples. 168 | samples: order of samples in the DGEList 169 | 170 | Returns: 171 | data in a list, in the same order as samples. 172 | 173 | """ 174 | d = [] 175 | temp_d = {} 176 | for group in groups_dict: 177 | for sample in groups_dict[group]: 178 | temp_d[sample] = group 179 | 180 | for sample in samples: 181 | d.append(temp_d[sample]) 182 | 183 | return np.array(d) 184 | 185 | @staticmethod 186 | def _format_fields(fields: Iterable[Union[str, bytes]]) -> Generator[str, None, None]: 187 | """Clean fields in the header of any read data. 188 | 189 | Yields: 190 | The next field that has been cleaned. 191 | 192 | """ 193 | for field in fields: 194 | if isinstance(field, bytes): 195 | field = field.decode() 196 | yield DGEList._field_strip_re.sub("", field) 197 | 198 | @property 199 | def counts(self) -> np.matrix: 200 | """The read counts for the genes in all samples. 201 | 202 | Returns: 203 | counts: Columns correspond to samples and row to genes. 204 | 205 | """ 206 | return self._counts 207 | 208 | @counts.setter 209 | def counts(self, counts: np.ndarray) -> None: 210 | """Validate setting ``DGEList.counts`` for the illegal conditions: 211 | 212 | * Must be of type ``np.ndarray`` 213 | * Negative values 214 | * Values that are not numbers 215 | * No values can be N/A 216 | 217 | Args: 218 | counts: Columns correspond to samples and row to genes. 219 | 220 | """ 221 | if counts is None: 222 | self._counts = None 223 | return 224 | 225 | if not isinstance(counts, np.ndarray): 226 | raise TypeError("Counts matrix must be of type ``np.ndarray``.") 227 | 228 | if hasattr(self, "_counts"): 229 | # do checks for things here. You shouldn't modify counts 230 | # if it has already been set. Create a new obj. 231 | if hasattr(self, "_samples") and self._samples is not None: 232 | gene_count, sample_count = counts.shape 233 | log.info(f"sample count: {sample_count}, gene count: {gene_count}") 234 | log.info( 235 | f"samples shape {self.samples.shape[0]}, gene shape {self.genes.shape[0]}" 236 | ) 237 | log.info(self.genes) 238 | 239 | if sample_count != self.samples.shape[0] or gene_count != self.genes.shape[0]: 240 | raise ValueError( 241 | "Attempting to substitute counts data " 242 | "into DGEList object with different " 243 | "dimensions fails." 244 | ) 245 | 246 | if np.isnan(counts).any(): 247 | raise ValueError("Counts matrix must have only real values.") 248 | if not self.current_log_status and (counts < 0).any(): 249 | raise ValueError("Counts matrix cannot contain negative values.") 250 | 251 | if self.to_remove_zeroes: 252 | # this is not working. Does not remove rows with only zeros. 
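            # Note: ``np.all(counts != 0, axis=1)`` keeps only rows with no zero entries
            # at all; dropping just the all-zero rows would instead require something
            # like ``counts[~np.all(counts == 0, axis=1)]`` or
            # ``counts[counts.sum(axis=1) > 0]``.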
253 | counts = counts[np.all(counts != 0, axis=1)] 254 | 255 | self._counts = counts 256 | 257 | @property 258 | def samples(self) -> np.array: 259 | """Array of sample names.""" 260 | return self._samples 261 | 262 | @samples.setter 263 | def samples(self, samples: Optional[np.ndarray]) -> None: 264 | """Validate setting ``DGEList.samples`` for the illegal conditions: 265 | 266 | * Must be the same length as the columns in counts` 267 | 268 | Args: 269 | samples: 1D string array representing identifiers of count columns 270 | 271 | """ 272 | if samples is not None: 273 | if self.counts is not None and len(samples) != self.counts.shape[1]: 274 | raise ValueError( 275 | f"Shape of counts does not match samples: " 276 | f"len(samples) = {len(samples)}," 277 | f" self.counts.shape = {self.counts.shape}" 278 | ) 279 | 280 | samples = np.array(list(self._format_fields(samples))) 281 | self._samples = samples 282 | 283 | @property 284 | def genes(self) -> np.array: 285 | """Array of gene names.""" 286 | return self._genes 287 | 288 | @genes.setter 289 | def genes(self, genes: Optional[np.ndarray]) -> None: 290 | # TODO: Validate genes here 291 | # - Genes same length as nrow(self.counts) if defined 292 | if genes is not None: 293 | genes = np.array(list(self._format_fields(genes))) 294 | # Creates boolean mask and filters out metatag rows from samples and counts 295 | metatag_mask = ~(np.isin(genes, self._old_metatags) | np.char.startswith(genes, '__')) 296 | genes = genes[metatag_mask].copy() 297 | self._counts = self.counts[metatag_mask].copy() 298 | self._genes = genes 299 | 300 | @property 301 | def library_size(self) -> np.array: 302 | """The total read counts per sample. 303 | 304 | Returns: 305 | library_size: The size of the library. 306 | 307 | """ 308 | return np.sum(self.counts, 0) 309 | 310 | def log_transform(self, counts, prior_count): 311 | """Compute the log of the counts""" 312 | counts[counts == 0] = prior_count 313 | return np.log(counts) 314 | 315 | def cpm(self, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT) -> "DGEList": 316 | """Normalize the DGEList to read counts per million.""" 317 | counts = 1e6 * self.counts / np.sum(self.counts, axis=0) 318 | current_log = self.current_log_status 319 | if transform_to_log: 320 | counts = self.log_transform(counts, prior_count) 321 | current_log = True 322 | 323 | return self.copy(counts=counts, current_log=current_log) 324 | 325 | def rpkm( 326 | self, 327 | gene_data: CanonicalDataStore, 328 | transform_to_log: bool = False, 329 | prior_count: float = PRIOR_COUNT, 330 | ) -> "DGEList": 331 | """Return the DGEList normalized to reads per kilobase of gene length 332 | per million reads. (RPKM = numReads / ( geneLength/1000 * totalNumReads/1,000,000 ) 333 | 334 | Args: 335 | gene_data: An object that works to import Ensembl based data, for use in calculations 336 | transform_to_log: true, if you wish to convert to log after converting to RPKM 337 | prior_count: a minimum value for genes, if you do log transforms. 
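        Note:
            If the counts are already log transformed (``current_log_status`` is True),
            they are exponentiated back to the natural scale before the RPKM
            calculation is applied.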
338 | """ 339 | current_log = self.current_log_status 340 | 341 | if self.current_log_status: 342 | self.counts = np.exp(self.counts) 343 | current_log = False 344 | col_sum = np.sum(self.counts, axis=0) 345 | 346 | gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) 347 | 348 | genes = self.genes[gene_mask].copy() 349 | counts = self.counts[gene_mask].copy() 350 | 351 | counts = (counts.T / gene_len_ordered).T 352 | counts = counts / (col_sum / 1e6) 353 | 354 | if transform_to_log: 355 | counts = self.log_transform(counts, prior_count) 356 | current_log = True 357 | 358 | return self.copy(counts=counts, current_log=current_log, genes=genes) 359 | 360 | def get_gene_mask_and_lengths(self, gene_data): 361 | 362 | """ 363 | use gene_data to get the gene lenths and a gene mask for the tranformation. 364 | Args: 365 | gene_data: the object that holds gene data from ensembl 366 | 367 | """ 368 | gene_len_ordered = [] 369 | gene_mask = [] 370 | gene_ensg = [] 371 | for gene in self.genes: 372 | if gene.startswith("ENSG"): 373 | gene_name = gene 374 | gene_ensg.append(gene_name) 375 | if gene_data.has_gene(gene_name): 376 | gene_mask.append(True) 377 | gene_len_ordered.append( 378 | gene_data.get_length_of_canonical_transcript(gene_name) / 1e3 379 | ) 380 | else: 381 | gene_mask.append(False) 382 | else: 383 | t_gene = gene_data.get_genes_from_symbol(gene) 384 | if t_gene: 385 | if len(t_gene) > 1: 386 | gene_name = gene_data.pick_gene_id(t_gene) 387 | else: 388 | gene_name = t_gene[0] 389 | gene_ensg.append(gene_name) 390 | if gene_data.has_gene(gene_name): 391 | gene_mask.append(True) 392 | gene_len_ordered.append( 393 | gene_data.get_length_of_canonical_transcript(gene_name) / 1e3 394 | ) 395 | else: 396 | gene_mask.append(False) 397 | else: 398 | gene_mask.append(False) 399 | return gene_len_ordered, gene_mask 400 | 401 | def tpm( 402 | self, 403 | gene_lengths: np.ndarray, 404 | transform_to_log: bool = False, 405 | prior_count: float = PRIOR_COUNT, 406 | mean_fragment_lengths: np.ndarray = None, 407 | ) -> "DGEList": 408 | """Normalize the DGEList to transcripts per million. 409 | 410 | Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data: 411 | RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3 412 | 413 | Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` ) 414 | are normalized as follows: 415 | 416 | .. math:: 417 | 418 | TPM_i = \\frac{X_i}{\\widetilde{l_i}}\cdot \\ 419 | \\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6 420 | 421 | Args: 422 | gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`. 
423 | transform_to_log: store log outputs 424 | prior_count: 425 | mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts` 426 | (optional) 427 | 428 | """ 429 | 430 | # compute effective length not allowing negative lengths 431 | if mean_fragment_lengths: 432 | effective_lengths = ( 433 | gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] 434 | ).clip(min=1) 435 | else: 436 | effective_lengths = gene_lengths[:, np.newaxis] 437 | 438 | # how many counts per base 439 | base_counts = self.counts / effective_lengths 440 | 441 | counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :] 442 | current_log = self.current_log_status 443 | if transform_to_log: 444 | counts = self.log_transform(counts, prior_count) 445 | current_log = True 446 | 447 | return self.copy(counts=counts, current_log=current_log) 448 | 449 | def __repr__(self) -> str: 450 | """Give a pretty non-executeable representation of this object.""" 451 | num_samples = len(self._samples) if self._samples is not None else 0 452 | num_genes = len(self._genes) if self._genes is not None else 0 453 | 454 | return ( 455 | f"{self.__class__.__name__}(" 456 | f"num_samples={num_samples:,}, " 457 | f"num_genes={num_genes:,})" 458 | ) 459 | 460 | def write_npz_file(self, filename: str) -> None: 461 | """Convert the object to a byte representation, which can be stored or imported.""" 462 | 463 | # TODO: validate file name 464 | 465 | log.info(f"Exporting data to compressed .dge file ({filename}.npz)...") 466 | 467 | np.savez_compressed( 468 | filename, 469 | samples=self.samples, 470 | genes=self.genes, 471 | norm_factors=self.norm_factors, 472 | counts=self.counts, 473 | groups_list=self.groups_list, 474 | ) 475 | 476 | def read_npz_file(self, filename: str) -> None: 477 | """Import a file name stored in the dge export format. 478 | 479 | Args: 480 | filename: the name of the file to read from. 
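
        Example:
            Assuming a file previously written with ``write_npz_file`` (the path below
            is hypothetical)::

                dge = DGEList(filename="my_counts.npz")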
481 | 482 | """ 483 | 484 | log.info(f"Importing data from .dge file ({filename})...") 485 | 486 | npzfile = np.load(filename) 487 | self.counts = npzfile["counts"] 488 | self.genes = npzfile["genes"] 489 | self.samples = npzfile["samples"] 490 | self.norm_factors = npzfile["norm_factors"] 491 | self.groups_list = npzfile["groups_list"].tolist() 492 | 493 | self.groups_dict = self._sample_group_dict(self.groups_list, self.samples) 494 | 495 | @classmethod 496 | def create_DGEList( 497 | cls, 498 | sample_list: List[str], 499 | data_set: Dict[Hashable, Any], # {sample: {gene1: x, gene2: y}}, 500 | gene_list: List[str], 501 | sample_to_category: Optional[List[str]] = None, 502 | category_to_samples: Optional[Dict[Hashable, List[str]]] = None, 503 | ) -> "DGEList": 504 | """ sample list and gene list must be pre-sorted 505 | Use this to create the DGE object for future work.""" 506 | 507 | log.info("Creating DGE list object...") 508 | temp_data_store = np.zeros(shape=(len(gene_list), len(sample_list))) 509 | 510 | for idx_s, sample in enumerate(sample_list): 511 | for idx_g, gene in enumerate(gene_list): 512 | if sample in data_set and gene in data_set[sample]: 513 | if data_set[sample][gene]: 514 | temp_data_store[idx_g, idx_s] = data_set[sample][gene] 515 | 516 | return cls( 517 | counts=temp_data_store, 518 | genes=np.array(gene_list), 519 | samples=np.array(sample_list), 520 | groups_in_list=sample_to_category if sample_to_category else None, 521 | groups_in_dict=category_to_samples if category_to_samples else None, 522 | to_remove_zeroes=False, 523 | ) 524 | 525 | @classmethod 526 | def create_DGEList_data_file( 527 | cls, data_file: Path, group_file: Path, **kwargs: Mapping 528 | ) -> "DGEList": 529 | """Wrapper for creating DGEList objects from file locations. Performs open and passes 530 | the file handles to the method for creating a DGEList object. 531 | 532 | This function uses smart_open, which provides a broad list of data sources that can be 533 | opened. For a full list of data sources, see smart_open's documentation at 534 | https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst 535 | 536 | Args: 537 | data_file: Text file defining the data set. 538 | group_file: The JSON file defining the groups. 539 | kwargs: Additional arguments supported by ``np.genfromtxt``. 540 | 541 | Returns: 542 | DGEList: Container for storing read counts for samples. 543 | 544 | """ 545 | with smart_open(data_file, 'r') as data_handle, smart_open( 546 | group_file, 'r' 547 | ) as group_handle: 548 | return cls.create_DGEList_handle(data_handle, group_handle, **kwargs) 549 | 550 | @classmethod 551 | def create_DGEList_handle( 552 | cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping 553 | ) -> "DGEList": 554 | """Read in a file-like object of delimited data for instantiation. 555 | 556 | Args:get_canonical 557 | data_handle: Text file defining the data set. 558 | group_handle: The JSON file defining the groups. 559 | kwargs: Additional arguments supported by ``np.genfromtxt``. 560 | 561 | Returns: 562 | DGEList: Container for storing read counts for samples. 
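
        Note:
            The first line of ``data_handle`` is expected to be a whitespace-delimited
            header whose first field labels the gene column and whose remaining fields
            are sample names; each subsequent row starts with a gene identifier
            followed by one integer count per sample.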
563 | 564 | """ 565 | _, *samples = next(data_handle).strip().split() 566 | 567 | genes = [] 568 | frame = np.genfromtxt( 569 | fname=data_handle, 570 | dtype=np.int, 571 | converters={0: lambda _: genes.append(_.decode("utf-8")) or 0}, # type: ignore 572 | autostrip=kwargs.pop("autostrip", True), 573 | replace_space=kwargs.pop("replace_space", "_"), 574 | case_sensitive=kwargs.pop("case_sensitive", True), 575 | invalid_raise=kwargs.pop("invalid_raise", True), 576 | # skip_header=kwargs.pop("skip_headers", 1), 577 | **kwargs, 578 | ) 579 | 580 | # Delete the first column as it is copied on assignment to `genes`. 581 | counts = np.delete(frame, 0, axis=1) 582 | # Delete the first element in the genes list: (should be 'genes' but was a 583 | # duplicate gene name, due to a putative bug in genfromtxt 584 | genes = genes[1:] 585 | 586 | group = json.load(group_handle) 587 | 588 | return cls( 589 | counts=counts, 590 | genes=genes, 591 | samples=samples, 592 | groups_in_dict=group, 593 | to_remove_zeroes=False, 594 | ) 595 | -------------------------------------------------------------------------------- /edgePy/__init__.py: -------------------------------------------------------------------------------- 1 | from edgePy import data_import 2 | 3 | from edgePy.DGEList import DGEList 4 | 5 | from edgePy.util import getLogger 6 | -------------------------------------------------------------------------------- /edgePy/benchmarking/00.GSE49712.Rscript.txt: -------------------------------------------------------------------------------- 1 | ############################## 2 | x<-c("pheatmap","limma","gplots","edgeR","RColorBrewer") 3 | require(x) 4 | lapply(x, require, character.only = TRUE) 5 | rm(x) 6 | ############################## 7 | dados<-read.table("GSE49712_gene_FPKM.txt",sep="\t",header=TRUE,row.names=NULL) 8 | ############################## 9 | group <- as.factor(c(rep("A",5),rep("B",5))) 10 | ############################## 11 | x<-dados[,2:11] 12 | rownames(x)<-dados$Geneid 13 | cpm <- cpm(x) 14 | lcpm <- cpm(x, log=TRUE) 15 | table(keep.exprs) 16 | keep.exprs <- rowSums(cpm>1)>=3 17 | x <- x[keep.exprs,] 18 | geneSymbols<-dados[,1][keep.exprs] 19 | ############################## 20 | png("diagnostic_fig1.png") 21 | nsamples <- ncol(x) 22 | col <- brewer.pal(nsamples, "Paired") 23 | par(mfrow=c(1,2)) 24 | plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, 25 | main="", xlab="") 26 | title(main="A. Raw data", xlab="Log-cpm") 27 | abline(v=0, lty=3) 28 | for (i in 2:nsamples){ 29 | den <- density(lcpm[,i]) 30 | lines(den$x, den$y, col=col[i], lwd=2) 31 | } 32 | legend("topright", legend=group, text.col=col, bty="n") 33 | ### 34 | lcpm <- cpm(x, log=TRUE) 35 | plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, 36 | main="", xlab="") 37 | title(main="B. 
Filtered data", xlab="Log-cpm") 38 | abline(v=0, lty=3) 39 | for (i in 2:nsamples){ 40 | den <- density(lcpm[,i]) 41 | lines(den$x, den$y, col=col[i], lwd=2) 42 | } 43 | legend("topright", legend=group, text.col=col, bty="n") 44 | dev.off() 45 | ############################## 46 | x<-as.matrix(x) 47 | rownames(x)<-geneSymbols 48 | d.cpm.x <- DGEList(counts=x,group=group) 49 | d.cpm.x <- calcNormFactors(d.cpm.x, method = "TMM") 50 | ############################## 51 | d.cpm.x2 <- d.cpm.x 52 | d.cpm.x2$samples$norm.factors <- 1 53 | d.cpm.x2$counts[,1] <- ceiling(d.cpm.x2$counts[,1]*0.05) 54 | d.cpm.x2$counts[,2] <- d.cpm.x2$counts[,2]*5 55 | ############################## 56 | png("diagnostic_fig2.png") 57 | par(mfrow=c(1,2)) 58 | lcpm <- cpm(d.cpm.x2, log=TRUE) 59 | boxplot(lcpm, las=2, col=col, main="") 60 | title(main="A. Example: Unnormalised data",ylab="Log-cpm") 61 | d.cpm.x2 <- calcNormFactors(d.cpm.x2,method = "TMM") 62 | d.cpm.x2$samples$norm.factors 63 | lcpm <- cpm(d.cpm.x2, log=TRUE) 64 | boxplot(lcpm, las=2, col=col, main="") 65 | title(main="B. Example: Normalised data",ylab="Log-cpm") 66 | dev.off() 67 | ############################## 68 | lcpm <- cpm(d.cpm.x, log=TRUE) 69 | png("mds.png") 70 | plotMDS(lcpm, labels=group, col=as.numeric(group)) 71 | title(main="MDS - Sample groups") 72 | dev.off() 73 | ############################## 74 | png("diagnostic_fig3.png") 75 | design = model.matrix( ~ 0 + group, data=d.cpm.x$samples) 76 | colnames(design) <- levels(group) 77 | d.cpm.x = estimateCommonDisp(d.cpm.x, verbose=TRUE) 78 | d.cpm.x = estimateTagwiseDisp(d.cpm.x) 79 | par(mfrow=c(1,2)) 80 | v <- voom(d.cpm.x, design, plot=TRUE) 81 | ############################## 82 | contr.matrix <- makeContrasts( 83 | AvsB = A - B, #1 84 | levels = colnames(design)) 85 | ############################## 86 | vfit <- lmFit(v, design) 87 | vfit <- contrasts.fit(vfit, contrasts=contr.matrix) 88 | efit <- eBayes(vfit) 89 | et <- decideTests(vfit) 90 | plotSA(efit, main="Final model: Mean Variance Trend") 91 | summary(decideTests(efit)) 92 | tfit <- treat(vfit, lfc=1) 93 | dt <- decideTests(tfit) 94 | summary(dt) 95 | dev.off() 96 | ############################## 97 | 98 | ############################## 99 | png("diagnostic_fig4.png") 100 | vennDiagram(dt[,1], circle.col=c("turquoise", "red","green")) 101 | dev.off() 102 | #de.common<-which(dt[,1]!=0 & dt[,2]!=1 & dt[,3]!=1 & dt[,2]!=-1 & dt[,3]!=-1) 103 | #length(de.common) 104 | topGenes<-topTreat(tfit, coef=1, n=Inf,adjust.method = "fdr",lfc=2,p.value=0.01) 105 | ############################## 106 | png("heatmap_fig5.png") 107 | pheatmap(as.matrix(v$E[which(rownames(v$E) %in% as.character(topGenes$ID)),]), color = colorRampPalette(c("navy", "white","firebrick4"))(255), 108 | cluster_cols = F, cluster_rows=T, show_colnames = TRUE, 109 | show_rownames = FALSE,clustering_distance_rows ="euclidean", scale="row") 110 | dev.off() 111 | ############################## 112 | write.table(file="genes+DEGs.tsv",topTreat(tfit, coef=1, n=7001),sep="\t") 113 | write.table(file="topGenesEdgar.tsv",topGenes,sep="\t") 114 | ############################## 115 | save.image("analysis.RNAseq.gse49712.Rdata") 116 | ############################## 117 | -------------------------------------------------------------------------------- /edgePy/benchmarking/01.diagnostic_fig1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/01.diagnostic_fig1.png -------------------------------------------------------------------------------- /edgePy/benchmarking/02.diagnostic_fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/02.diagnostic_fig2.png -------------------------------------------------------------------------------- /edgePy/benchmarking/03.mds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/03.mds.png -------------------------------------------------------------------------------- /edgePy/benchmarking/04.diagnostic_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/04.diagnostic_fig3.png -------------------------------------------------------------------------------- /edgePy/benchmarking/05.diagnostic_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/05.diagnostic_fig4.png -------------------------------------------------------------------------------- /edgePy/benchmarking/06.heatmap_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/06.heatmap_fig5.png -------------------------------------------------------------------------------- /edgePy/benchmarking/09.analysis.RNAseq.gse49712.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/09.analysis.RNAseq.gse49712.Rdata -------------------------------------------------------------------------------- /edgePy/benchmarking/README_benchmark.md: -------------------------------------------------------------------------------- 1 | Analysis workflow for Bulk-RNAseq to compare the differentially expressed genes (DEGs) produced by edgeR and limma. 2 | ===== 3 | ### 1. Script with all R packages and command lines 4 | GSE49712.Rscript.txt 5 | 6 | ### 2. Data matrix extract from GEO 7 | GSE49712_gene_FPKM.txt 8 | 9 | ### 3. Figure produced by filtering the dataset based on Zero count rows/genes 10 | diagnostic_fig1.png 11 | ![01 diagnostic_fig1](https://user-images.githubusercontent.com/13422225/44564680-eeff3300-a729-11e8-9446-0e7643dbb651.png) 12 | 13 | ### 4. Normalization of data distribution with TMM 14 | diagnostic_fig2.png 15 | ![02 diagnostic_fig2](https://user-images.githubusercontent.com/13422225/44564681-f58daa80-a729-11e8-9fb0-5fb64de760cd.png) 16 | 17 | ### 5. Multidimensional scaling for general sample comparison. 18 | mds.png 19 | ![03 mds](https://user-images.githubusercontent.com/13422225/44564682-f9213180-a729-11e8-9afc-0271fe9cea34.png) 20 | 21 | ### 6. 
Variance dispersion comparison for unnormalized and normalized data based on dispersion and a linear model 22 | diagnostic_fig3.png 23 | ![04 diagnostic_fig3](https://user-images.githubusercontent.com/13422225/44564686-fe7e7c00-a729-11e8-95e7-f2db1ba53d69.png) 24 | 25 | ### 7. Venn with quantity of DEGs 26 | diagnostic_fig4.png 27 | ![05 diagnostic_fig4](https://user-images.githubusercontent.com/13422225/44564694-076f4d80-a72a-11e8-8bba-424dd4421883.png) 28 | 29 | ### 8. Sample heatmap for topDEGs 30 | heatmap_fig5.png 31 | ![06 heatmap_fig5](https://user-images.githubusercontent.com/13422225/44564698-0b9b6b00-a72a-11e8-8e5d-8fe72477a942.png) 32 | 33 | 34 | ### 9. DEG list for 7001 genes 35 | * Filtered using p-value < 0.05 36 | 37 | **TODO: show file 07.DEGs.tsv** 38 | 39 | ### 10. Top DEG list for 300 genes 40 | * Filtered using p value < 0.01, FDR < 0.05 and fold change of 2. 41 | 42 | **TODO: show file 08.topDEGs.tsv** 43 | 44 | ### Versions 45 | *R version 3.2.3 (2015-12-10) 46 | *Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 16.04.2 LTS 47 | 48 | ### Packages 49 | 50 | ``` 51 | [1] ggplot2_2.2.1 pheatmap_1.0.10 edgeR_3.12.1 limma_3.26.9 52 | loaded via a namespace (and not attached): 53 | [1] colorspace_1.3-2 scales_0.5.0 lazyeval_0.2.1 plyr_1.8.4 54 | [5] pillar_1.2.1 gtable_0.2.0 RColorBrewer_1.1-2 tibble_1.4.2 55 | [9] Rcpp_0.12.15 grid_3.2.3 rlang_0.2.0 munsell_0.4.3 56 | ``` 57 | 58 | 59 | 60 | ### Compute resources 61 | 62 | ``` 63 | 64 | Cores 4; 8GB RAM; 500GB HD 65 | Processor: AMD Phenom(tm) II X4 B97 Processor × 4 66 | Graphics: Gallium 0.4 on AMD RS880 (DRM 2.50.0 / 4.15.0-32-generic, LLVM **3.8.0) 67 | 68 | ``` 69 | ### Computing 70 | 71 | ``` 72 | Running the script from cero, including packages loading time: 73 | user system elapsed 74 | 1.213 0.136 167.807 75 | user system elapsed 76 | 12.951 0.366 180.962 77 | RAM used: 850mB 78 | Cores used: 1 79 | ``` 80 | 81 | 82 | -------------------------------------------------------------------------------- /edgePy/data/GSE49712_HTSeq.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data/GSE49712_HTSeq.txt.gz -------------------------------------------------------------------------------- /edgePy/data/GSE49712_HTSeq.txt.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data/GSE49712_HTSeq.txt.npz -------------------------------------------------------------------------------- /edgePy/data/example_gene_list.txt: -------------------------------------------------------------------------------- 1 | TP53 2 | BRCA1 3 | BRCA2 4 | -------------------------------------------------------------------------------- /edgePy/data/groups.json: -------------------------------------------------------------------------------- 1 | { 2 | "Group 1": [ 3 | "A_1", 4 | "A_2", 5 | "A_3", 6 | "A_4", 7 | "A_5" 8 | ], 9 | "Group 2": [ 10 | "B_1", 11 | "B_2", 12 | "B_3", 13 | "B_4", 14 | "B_5" 15 | ] 16 | } -------------------------------------------------------------------------------- /edgePy/data_import/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_import import * 2 | -------------------------------------------------------------------------------- /edgePy/data_import/data_import.py: 
-------------------------------------------------------------------------------- 1 | """ Skeleton class for importing files """ 2 | 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | 7 | __all__ = ["get_dataset_path"] 8 | 9 | 10 | def get_dataset_path(filename: Union[str, Path]) -> Path: 11 | """Return the filesystem path to the packaged data file. 12 | 13 | Args: 14 | filename (str, pathlib.Path) : The full name of the packaged data file. 15 | 16 | Returns: 17 | path (pathlib.Path) : The filesystem path to the packaged data file. 18 | 19 | Examples 20 | >>> from edgePy.data_import.data_import import get_dataset_path 21 | >>> str(get_dataset_path("GSE49712_HTSeq.txt.gz")) # doctest:+ELLIPSIS 22 | '.../edgePy/data/GSE49712_HTSeq.txt.gz' 23 | 24 | """ 25 | import edgePy 26 | 27 | directory = Path(edgePy.__file__).expanduser().resolve().parent 28 | return directory / "data" / filename 29 | -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data_import/ensembl/__init__.py -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/canonical_transcripts.py: -------------------------------------------------------------------------------- 1 | """Some macro-level functions for dealing with the mysql library""" 2 | 3 | import argparse 4 | from smart_open import smart_open # type: ignore 5 | 6 | from edgePy.data_import.ensembl.mysql_wrapper import MySQLWrapper 7 | 8 | CANONICAL_TRANSCRIPT_SQL = """select gene.stable_id as gene, transcript.stable_id as transcript, 9 | t_len.exon_len as length, IF(gene.canonical_transcript_id = transcript.transcript_id, "True", "False") as canonical 10 | from transcript, 11 | (select et.transcript_id, sum(exon.seq_region_end - exon.seq_region_start) as exon_len 12 | from exon, exon_transcript as et where et.exon_id = exon.exon_id group by et.transcript_id ) as t_len, gene 13 | where t_len.transcript_id = transcript.transcript_id and transcript.gene_id = gene.gene_id;""" 14 | 15 | GENE_SYMBOL_SQL = """select xref.display_label as symbol, gene.stable_id as gene 16 | from xref, gene 17 | where xref.xref_id = gene.display_xref_id 18 | and xref.external_db_id = 1100; 19 | """ 20 | 21 | GENE_SYNONYM_SQL = """select g.stable_id as gene_id, es.synonym as synonym 22 | from gene g 23 | join xref x on (g.display_xref_id = x.xref_id) 24 | left join external_synonym es using (xref_id) 25 | join external_db ed using (external_db_id) 26 | where synonym is not NULL and ed.db_name='HGNC';""" 27 | 28 | 29 | def parse_arguments(parser=None): 30 | if not parser: 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument("--host", help="name of the mysql host", default="ensembldb.ensembl.org") 34 | parser.add_argument("--port", help="name of the mysql port", default=3337) 35 | parser.add_argument("--username", help="user name for the mysql service", default="anonymous") 36 | parser.add_argument("--password", help="password for the mysql service", default=None) 37 | parser.add_argument( 38 | "--database", 39 | help="database to use for the query, for example homo_sapiens_core_75_37 or " 40 | "homo_sapiens_core_93_38 or mus_musculus_core_93_38 ", 41 | default="homo_sapiens_core_75_37", 42 | ) 43 | 44 | parser.add_argument( 45 | "--output_transcripts", 46 | 
help="where to put the file with the transcript data", 47 | default="blank", 48 | ) 49 | parser.add_argument( 50 | "--output_symbols", help="where to put the file with the gene symbols", default="blank" 51 | ) 52 | args = parser.parse_args() 53 | 54 | if args.output_transcripts == "blank": 55 | args.output_transcripts = f"../../data/transcripts_{args.database}.tsv" 56 | 57 | if args.output_symbols == "blank": 58 | args.output_symbols = f"../../data/symbols_{args.database}.tsv" 59 | 60 | return args 61 | 62 | 63 | class CanonicalTranscript(object): 64 | """A simple class for storing Ensembl transcript data, as well as 65 | supplemental data for gene id/symbols/synnonyms""" 66 | 67 | def __init__(self, host, port, user, password, database): 68 | # needs to go into a config file, but for now: 69 | self.exon_store = {} 70 | 71 | self.mysql_wrapper = MySQLWrapper( 72 | host=host, port=port, username=user, password=password, database=database 73 | ) 74 | 75 | print("retrieving canonical transcript data.") 76 | self.canonical_transcripts = self.mysql_wrapper.run_sql_query(CANONICAL_TRANSCRIPT_SQL) 77 | 78 | print("retrieving gene symbol data.") 79 | self.gene_symbols = self.mysql_wrapper.run_sql_query(GENE_SYMBOL_SQL) 80 | 81 | print("retrieving gene synonym data.") 82 | self.gene_synonyms = self.mysql_wrapper.run_sql_query(GENE_SYNONYM_SQL) 83 | 84 | print("completed") 85 | self.mysql_wrapper.close() 86 | 87 | 88 | def main(): 89 | args = parse_arguments() 90 | default_class = CanonicalTranscript( 91 | args.host, args.port, args.username, args.password, args.database 92 | ) 93 | canonical = default_class.canonical_transcripts 94 | 95 | with smart_open(args.output_transcripts, 'w') as output: 96 | for transcript in canonical: 97 | output.write( 98 | f"{transcript['gene']}\t{transcript['transcript']}\t" 99 | f"{transcript['length']}\t{transcript['canonical']}\n" 100 | ) 101 | 102 | symbols = default_class.gene_symbols 103 | synonyms = default_class.gene_synonyms 104 | 105 | with smart_open(args.output_symbols, 'w') as output: 106 | """ The order here is important - symbols contain duplicates, so make sure the symbols 107 | are procesesed before synonyms. The matching script (ensembl_flat_file_reader.py) will ignore new symbols 108 | for translating to gene, if there's already one accepted.""" 109 | 110 | for symbol in symbols: 111 | output.write(f"{symbol['symbol']}\t{symbol['gene']}\n") 112 | for synonym in synonyms: 113 | output.write(f"{synonym['synonym']}\t{synonym['gene_id']}\n") 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/ensembl_flat_file_reader.py: -------------------------------------------------------------------------------- 1 | from smart_open import smart_open # type: ignore 2 | from typing import Optional, Union, Dict, Hashable, Any, List 3 | from pathlib import Path 4 | 5 | 6 | class CanonicalDataStore(object): 7 | """ 8 | A simple tool for reading canonical data, generated from the canonical_transcripts.py script provided with edgePy. 
9 | 10 | Args: 11 | transcript_filename: the name of the transcript file, generated by canonical_transcripts.py 12 | symbols_filename: the name of the gene symbol file, generated by canonical_transcripts.py 13 | 14 | """ 15 | 16 | def __init__( 17 | self, transcript_filename: Union[str, Path], symbols_filename: Union[str, Path] 18 | ) -> None: 19 | 20 | self.by_transcript: Dict[Hashable, Dict[Hashable, Any]] = {} 21 | self.canonical_transcript: Dict[Hashable, str] = {} 22 | 23 | self.gene_to_symbol: Dict[Hashable, str] = {} 24 | self.symbol_to_genes: Dict[Hashable, List] = {} 25 | 26 | with smart_open(transcript_filename, 'r') as data: 27 | for line in data: 28 | gene_info = line.strip().split("\t") 29 | gene = gene_info[0] 30 | transcript = gene_info[1] 31 | length = int(gene_info[2]) 32 | canonical = True if gene_info[3] == "True" else False 33 | 34 | self.by_transcript[transcript] = {'len': length, 'can': canonical} 35 | 36 | if canonical: 37 | self.canonical_transcript[gene] = transcript 38 | 39 | with smart_open(symbols_filename, 'r') as data: 40 | for line in data: 41 | symbol_info = line.strip().split("\t") 42 | symbol = symbol_info[0] 43 | gene = symbol_info[1] 44 | 45 | if gene not in self.gene_to_symbol: 46 | self.gene_to_symbol[gene] = symbol 47 | 48 | if symbol not in self.symbol_to_genes: 49 | self.symbol_to_genes[symbol] = [] 50 | if gene not in self.symbol_to_genes[symbol]: 51 | self.symbol_to_genes[symbol].append(gene) 52 | 53 | def has_gene(self, gene: Optional[str]) -> bool: 54 | """ 55 | Check if a gene is present in the dataset. 56 | Args: 57 | gene: the ensembl gene id. 58 | """ 59 | if gene and gene in self.canonical_transcript: 60 | return True 61 | else: 62 | return False 63 | 64 | def get_symbol_from_gene(self, gene: Optional[str]) -> Optional[str]: 65 | """ 66 | Given a gene name, get the symbol - should give you the default ENSEMBL name, and not a synonym. 67 | Args: 68 | gene: the ensembl gene id. 69 | """ 70 | if not gene: 71 | return None 72 | try: 73 | return self.gene_to_symbol[gene] 74 | except KeyError: 75 | print(f"gene {gene} not found in gene to symbol.") 76 | raise KeyError 77 | 78 | def get_genes_from_symbol(self, symbol: str) -> List: 79 | """ 80 | Given the gene symbol (or a recognized synonym), get the ensembl id. 81 | Args: 82 | symbol: HUGO or HGNC symbol 83 | """ 84 | 85 | try: 86 | return self.symbol_to_genes[symbol] 87 | except KeyError: 88 | return [] 89 | 90 | @staticmethod 91 | def pick_gene_id(gene_ids: List) -> Optional[str]: 92 | """ 93 | Where there are more than one gene ID for a symbol, pick the one with the largest ensembl ID integer. 94 | Args: 95 | gene_ids: list of gene IDs. 96 | """ 97 | if not gene_ids: 98 | return None 99 | length = len(gene_ids) 100 | if length == 1: 101 | return gene_ids[0] 102 | else: 103 | gene_ids.sort(reverse=True) 104 | return gene_ids[0] 105 | 106 | def is_known_symbol(self, symbol: str) -> bool: 107 | """ 108 | Check to see if we recognize a given symbol - there always will be things we don't recognize. 109 | Args: 110 | symbol: what you think is a gene symbol. 111 | """ 112 | if symbol in self.symbol_to_genes: 113 | return True 114 | return False 115 | 116 | def is_known_gene(self, gene: str) -> bool: 117 | """ 118 | Check to see if we can recognize a given gene ID from ENSEMBL. If you have one that isn't recognized, it might 119 | belong to a different version. 120 | Args: 121 | gene: what you think is a gene id. 
122 | :return: 123 | """ 124 | if gene in self.gene_to_symbol: 125 | return True 126 | return False 127 | 128 | def is_canonical_by_transcript(self, transcript_id: str) -> bool: 129 | """ 130 | Return a boolean indicating whether the supplied transcript is canonical or not. 131 | 132 | Args: 133 | transcript_id: an Ensembl transcript ID, starting with ENST 134 | """ 135 | 136 | if transcript_id not in self.by_transcript: 137 | return False 138 | else: 139 | return self.by_transcript[transcript_id]['can'] 140 | 141 | def get_canonical_transcript(self, gene_id: str) -> Optional[str]: 142 | """ 143 | Return the Ensembl canonical transcript ID, given an ensembl transcript ID. 144 | 145 | Args: 146 | gene_id: An Ensembl gene ID, starting with ENSG 147 | """ 148 | 149 | if gene_id in self.canonical_transcript: 150 | return self.canonical_transcript[gene_id] 151 | else: 152 | return None 153 | 154 | def get_length_of_transcript(self, transcript_id: str) -> int: 155 | """ 156 | Return the length of a transcript, given an ensembl transcript ID. 157 | 158 | Args: 159 | transcript_id: an Ensembl transcript ID, starting with ENST 160 | """ 161 | if transcript_id not in self.by_transcript: 162 | return False 163 | else: 164 | return self.by_transcript[transcript_id]['len'] 165 | 166 | def get_length_of_canonical_transcript(self, gene_id: Optional[str]) -> int: 167 | """ 168 | Return the length of a transcript, given an ensembl gene ID. 169 | 170 | Args: 171 | gene_id: An Ensembl gene ID, starting with ENSG 172 | """ 173 | if not gene_id: 174 | return 0 175 | 176 | transcript_id = self.get_canonical_transcript(gene_id) 177 | 178 | if not transcript_id or transcript_id not in self.by_transcript: 179 | return False 180 | else: 181 | return self.by_transcript[transcript_id]['len'] 182 | -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/mysql_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | This will be a wrapper file for using mysql, to make queries easier. I may eventually just replace this with 3 | SQLAlchemy, but for the moment, it's probalby easiest just to make a simple library to handle this type of transaction. 
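A usage sketch (the connection details below are the defaults assumed by canonical_transcripts.py;
a live MySQL server is required, so the example is not executed):

>>> wrapper = MySQLWrapper(  # doctest: +SKIP
...     host="ensembldb.ensembl.org", port=3337, username="anonymous",
...     password=None, database="homo_sapiens_core_75_37",
... )
>>> rows = wrapper.run_sql_query("SELECT COUNT(*) AS n FROM gene")  # doctest: +SKIP
>>> wrapper.close()  # doctest: +SKIP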
4 | 5 | """ 6 | 7 | import pymysql 8 | from typing import List 9 | from pymysql.cursors import DictCursor 10 | 11 | 12 | class MySQLWrapper(object): 13 | def __init__( 14 | self, 15 | host: str = None, 16 | port: int = None, 17 | username: str = None, 18 | password: str = None, 19 | database: str = None, 20 | ) -> None: 21 | self.host = host 22 | self.port = port 23 | self.username = username 24 | self.password = password 25 | self.database = database 26 | self.connection = pymysql.connect( 27 | host=self.host, 28 | user=self.username, 29 | password=self.password, 30 | db=self.database, 31 | charset="utf8mb4", 32 | cursorclass=DictCursor, 33 | ) 34 | 35 | def find_one(self, sql: str) -> object: 36 | with self.connection.cursor() as cursor: 37 | # Read a single record 38 | cursor.execute(sql) 39 | result = cursor.fetchone() 40 | return result 41 | 42 | def insert(self, sql: str) -> None: 43 | with self.connection.cursor() as cursor: 44 | # Create a new record 45 | cursor.execute(sql) 46 | self.connection.commit() 47 | 48 | def update(self) -> None: 49 | raise NotImplementedError 50 | 51 | def run_sql_query(self, sql: str) -> List: 52 | with self.connection.cursor() as cursor: 53 | cursor.execute(sql) 54 | result = cursor.fetchall() 55 | return result 56 | 57 | def close(self) -> None: 58 | self.connection.close() 59 | -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data_import/mongodb/__init__.py -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/gene_functions.py: -------------------------------------------------------------------------------- 1 | """The core Python code for generating data.""" 2 | from typing import Dict, Optional, List, Tuple, Any, Hashable 3 | 4 | 5 | def get_genelist_from_file(filename: str) -> Optional[List]: 6 | """ 7 | Converts a genelist file into a list of genes. Simple function, but can be expanded if needed. 8 | Args: 9 | filename: gene list file name. 10 | """ 11 | 12 | # TODO: should be expanded to handle gzip genelists too. 13 | 14 | if not filename: 15 | return None 16 | gene_list = [] 17 | with open(filename, "r") as file_handle: 18 | for line in file_handle: 19 | gene_list.append(line.strip()) 20 | return gene_list 21 | 22 | 23 | def translate_genes( 24 | genes: Optional[List[str]], mongo_reader: Any, database: str = "ensembl_90_37" 25 | ) -> Tuple[List[str], Dict[str, str]]: 26 | """ 27 | Functions to translate a list of genes in to ENGS symbols and vice versa. 28 | 29 | Args: 30 | genes: list of genes to filter on. 31 | mongo_reader: the mongo connector 32 | database: the name of the database to use. 
"pytest" for unit testimg (mocking) 33 | 34 | Returns: 35 | a list of ensg symbols, a list of gene symbols 36 | """ 37 | 38 | ensg_genes = [] 39 | non_ensg_genes = [] 40 | gene_symbols = {} 41 | query: Dict[Hashable, Any] = {} 42 | 43 | if genes: 44 | for gene in genes: 45 | if gene.startswith("ENSG"): 46 | ensg_genes.append(gene) 47 | else: 48 | non_ensg_genes.append(gene) 49 | if ensg_genes or not genes: 50 | if genes: 51 | query["_id"] = {"$in": ensg_genes} 52 | symbol_gene_list = mongo_reader.find_as_cursor(database, "symbol_by_ensg", query=query) 53 | for symbol_gene in symbol_gene_list: 54 | for symbol in symbol_gene["symbols"]: 55 | gene_symbols[symbol_gene["_id"]] = symbol 56 | for ensg in ensg_genes: 57 | if ensg not in gene_symbols: 58 | gene_symbols[ensg] = ensg 59 | if non_ensg_genes or not genes: 60 | query = {"_id": {"$in": non_ensg_genes}} if genes else {} 61 | translated_gene_list = mongo_reader.find_as_cursor(database, "ensg_by_symbol", query=query) 62 | for trans_gene in translated_gene_list: 63 | symbol = trans_gene["_id"] 64 | ensgs = trans_gene["ensgs"] 65 | for ensg in ensgs: 66 | gene_symbols[ensg] = symbol 67 | ensg_genes.append(ensg) 68 | return ensg_genes, gene_symbols 69 | 70 | 71 | def get_gene_list(mongo_reader: Any, database: str = "ensembl_90_37") -> Dict[str, str]: 72 | """ 73 | get the list of genes from the mongo database, to translated ensg ids to symbols. 74 | 75 | Args: 76 | mongo_reader: the mongo wrapper 77 | database: database name to use. 78 | 79 | """ 80 | 81 | genes = mongo_reader.find_as_cursor(database, "symbol_by_ensg", query={}) 82 | gene_symbols = {} 83 | for symbol_gene in genes: 84 | for symbol in symbol_gene["symbols"]: 85 | gene_symbols[symbol_gene["_id"]] = symbol 86 | return gene_symbols 87 | 88 | 89 | def get_sample_details( 90 | group_by: str, mongo_reader: Any, database: str 91 | ) -> Dict[Any, Dict[str, Any]]: 92 | """ 93 | Get details from the samples collection. Use this to decide which samples to query data for. 94 | 95 | Args: 96 | group_by: the name of the key to group samples by (Category-based key) 97 | mongo_reader: the mongo wrapper 98 | database: the database to use 99 | 100 | Returns: 101 | details required for each sample available. 102 | 103 | """ 104 | 105 | sample_details = {} 106 | search = {group_by: {"$exists": True}} 107 | sample_grouping = mongo_reader.find_as_cursor( 108 | database, 109 | "samples", 110 | query=search, 111 | projection={"_id": 0, group_by: 1, "sample_name": 1, "Description": 1}, 112 | ) 113 | 114 | for sample in sample_grouping: 115 | sample_details[sample["sample_name"]] = { 116 | "description": sample["Description"] 117 | if "Description" in sample 118 | else sample["sample_name"], 119 | "category": sample[group_by], 120 | } 121 | 122 | return sample_details 123 | 124 | 125 | def get_canonical_rpkm(result: Dict[str, Any]) -> Optional[int]: 126 | """ 127 | Get the rpkm from the database for a given entry in the data collection. 128 | 129 | Args: 130 | result: the entry in the data collection 131 | 132 | Returns: 133 | the rpkm value 134 | 135 | """ 136 | transcript_list = result["transcripts"] 137 | for trans in transcript_list.values(): 138 | if int(trans["canonical"]) == 1: 139 | return trans["rpkm"] 140 | return None 141 | 142 | 143 | def get_canonical_raw(result: Dict[str, Any]) -> Optional[int]: 144 | """ 145 | An approximation of the raw count of reads. 
146 | 147 | Args: 148 | result: the entry from the data collection 149 | 150 | Returns: 151 | the raw count (as an integer) 152 | 153 | """ 154 | 155 | transcript_list = result["transcripts"] 156 | for trans in transcript_list.values(): 157 | if int(trans["canonical"]) == 1: 158 | raw = 0 159 | for exon in trans["exons"]: 160 | raw += int(trans["exons"][exon]["raw"]) 161 | return raw 162 | return None 163 | -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/mongo_import.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, Hashable, Any, Tuple, List, Optional, Union 3 | 4 | from edgePy.data_import.mongodb.mongo_wrapper import MongoWrapper 5 | from edgePy.data_import.mongodb.gene_functions import get_canonical_rpkm 6 | from edgePy.data_import.mongodb.gene_functions import get_canonical_raw 7 | from edgePy.data_import.mongodb.gene_functions import get_genelist_from_file 8 | from edgePy.data_import.mongodb.gene_functions import translate_genes 9 | from edgePy.util import getLogger 10 | 11 | log = getLogger(name=__name__) 12 | 13 | 14 | def parse_arguments(parser: Any = None, ci_values: List[str] = None) -> Any: 15 | 16 | """ 17 | Standard argparse wrapper for interpreting command line arguments. 18 | 19 | Args: 20 | parser: if there's an existing parser, provide it; otherwise, this will 21 | create a new one. 22 | ci_values: used for testing purposes only. 23 | """ 24 | if not parser: 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument("--config", help="location of the config file", required=True) 28 | parser.add_argument("--key_name", default="Project") 29 | parser.add_argument("--key_value", default="RNA-Seq1") 30 | parser.add_argument("--gene_list", default=None) 31 | 32 | if ci_values: 33 | args = parser.parse_args(ci_values) 34 | else: 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | class ImportFromMongodb(object): 40 | """ 41 | A utility for importing data from a proprietary mongodb database - hopefully we'll 42 | open this database up in the future. If not, we can re-engineer it from the examples given. 43 | 44 | Args: 45 | host: the name of the machine hosting the database 46 | port: the port number (usually 27017) 47 | mongo_key: a key in the samples collection to filter on 48 | mongo_value: the accepted value(s) of that key, used to select samples 49 | gene_list_file: a file with a list of genes to filter the results on. 50 | 51 | """ 52 | 53 | def __init__( 54 | self, 55 | host: str, 56 | port: int, 57 | mongo_key: Optional[str], 58 | mongo_value: Union[str, List, None], 59 | gene_list_file: Optional[str], 60 | ) -> None: 61 | 62 | self.mongo_host = host 63 | self.mongo_port = port 64 | 65 | self.mongo_reader = MongoWrapper(host=self.mongo_host, port=self.mongo_port, connect=False) 66 | 67 | self.search_key = mongo_key 68 | self.search_value = mongo_value 69 | 70 | self.input_gene_file = gene_list_file 71 | self.gene_list: Optional[List[str]] = None 72 | 73 | def translate_gene_list(self, database: str) -> None: 74 | """ 75 | If there was a list of genes provided, convert them to ENSG symbols. 
76 | 77 | Args: 78 | database: name of the database 79 | 80 | """ 81 | 82 | if self.input_gene_file: 83 | input_genes = get_genelist_from_file(self.input_gene_file) 84 | ensg_genes, gene_symbols = translate_genes( 85 | input_genes, self.mongo_reader, database=database 86 | ) 87 | self.gene_list = ensg_genes 88 | 89 | def get_data_from_mongo( 90 | self, database: str, rpkm_flag: bool = False 91 | ) -> Tuple[List[str], Dict[Hashable, Any], List[str], Dict[Hashable, Any]]: 92 | """ 93 | Run the queries to get the samples, from mongo, and then use that data to retrieve 94 | the counts. 95 | 96 | Args: 97 | database: name of the database to retrieve data from. 98 | rpkm_flag: takes the rpkm values from the mongodb, instead of the raw counts 99 | 100 | Returns: 101 | the list of samples, the data itself, 102 | the gene list and the categories of the samples. 103 | 104 | """ 105 | 106 | if self.input_gene_file and not self.gene_list: 107 | self.translate_gene_list(database) 108 | 109 | query: Dict[Hashable, Any] = {} 110 | if self.search_key and self.search_value: 111 | 112 | if self.search_value == 'regex': 113 | query = {self.search_key: {'$regex': 'myocyte|fibroblast'}} 114 | else: 115 | if isinstance(self.search_value, list): 116 | query[self.search_key] = {'$in': self.search_value} 117 | else: 118 | query[self.search_key] = self.search_value 119 | 120 | elif self.search_key and not self.search_value: 121 | query[self.search_key] = {"$exists": True} 122 | elif not self.search_key and not self.search_value: 123 | pass 124 | else: 125 | raise Exception( 126 | "Invalid input - you can't specify a " "key_value without specifying a key_name" 127 | ) 128 | 129 | projection: Dict[Hashable, Any] = {"sample_name": 1, "_id": 0} 130 | if self.search_key and not self.search_key == "sample_name": 131 | projection[self.search_key] = 1 132 | 133 | cursor = self.mongo_reader.find_as_cursor( 134 | database=database, collection="samples", query=query, projection=projection 135 | ) 136 | sample_names = set() 137 | sample_category = {} 138 | for result in cursor: 139 | log.info(result) 140 | sample_names.add(result["sample_name"]) 141 | sample_category[result["sample_name"]] = ( 142 | result[self.search_key] if self.search_key else result["sample_name"] 143 | ) 144 | log.info(f"Get data for sample_names {list(sample_names)}") 145 | 146 | query = {"sample_name": {"$in": list(sample_names)}} 147 | if self.gene_list: 148 | log.info(self.gene_list) 149 | query["gene"] = {"$in": list(self.gene_list)} 150 | cursor = self.mongo_reader.find_as_cursor( 151 | database=database, collection="RNASeq", query=query, projection={"_id": 0} 152 | ) 153 | 154 | # make it a list of lists 155 | log.info(f"Importing data from mongo ({self.mongo_host})...") 156 | dataset: Dict[Hashable, Dict[Hashable, Optional[int]]] = {} 157 | gene_list = set() 158 | sample_list = set() 159 | for count, result in enumerate(cursor): 160 | if count % 100_000 == 0: 161 | log.info(f"{count} rows processed.") 162 | sample = result["sample_name"] 163 | rpkm = get_canonical_rpkm(result) if rpkm_flag else get_canonical_raw(result) 164 | gene = result["gene"] 165 | if sample not in dataset: 166 | dataset[sample] = {} 167 | dataset[sample][gene] = rpkm 168 | sample_list.add(sample) 169 | gene_list.add(gene) 170 | 171 | return sorted(sample_list), dataset, sorted(gene_list), sample_category 172 | -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/mongo_wrapper.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | A simple library for wrapping around mongo collections and access issues. 3 | """ 4 | from typing import Dict, Hashable, Any, Iterable, List, Union 5 | 6 | import pymongo # type: ignore 7 | from pymongo.errors import BulkWriteError # type: ignore 8 | from pymongo import InsertOne, UpdateOne 9 | 10 | from edgePy.util import getLogger 11 | 12 | log = getLogger(name=__name__) 13 | 14 | 15 | class MongoWrapper(object): 16 | """This class is for use as a thin layer for interactinvg with the Mongo Database 17 | using pymongo. Pymongo is an entirely reasonable way of working with Mongodb, but 18 | fails to provide some very common functions that are frequently used. 19 | 20 | This class should be used for efficient retrieval of information from the database. 21 | 22 | Args: 23 | host: the name of the machine hosting the database 24 | port: the port number (usually 27017 25 | connect: whether to create the new session, or to attach to an existing session, 26 | set to false, if this is being instantiated by a subprocesses. 27 | verbose: suppresses output, when set to false. 28 | 29 | """ 30 | 31 | def __init__( 32 | self, host: str, port: Union[str, int] = 27017, connect: bool = True, verbose: bool = False 33 | ) -> None: 34 | self.host = host 35 | self.port = int(port) 36 | self.session = pymongo.MongoClient(host=self.host, port=self.port, connect=connect) 37 | self.verbose = verbose 38 | 39 | def get_db(self, database: str, collection: str) -> Any: 40 | """ 41 | This function simply hides the db name when using pytest-mongodb, when the database name 42 | should always be 'pytest' 43 | 44 | Args: 45 | database: database name 46 | collection: collection name 47 | 48 | Returns: 49 | the collection object ready for use with .find() or similar. 50 | 51 | """ 52 | 53 | if database == "pytest": 54 | return self.session[collection] 55 | else: 56 | return self.session[database][collection] 57 | 58 | def find_as_cursor( 59 | self, 60 | database: str, 61 | collection: str, 62 | query: Dict[Hashable, Any] = None, 63 | projection: Dict[Hashable, Any] = None, 64 | ) -> Iterable: 65 | """ 66 | Do a find operation on a mongo collection and return the data as a cursor, 67 | the (native MongoClient find return type.) 68 | 69 | Args: 70 | database: db name 71 | collection: collection name 72 | query: a dictionary providing the criteria for the find command 73 | projection: a dictionary that gives the projection - the fields to return. 74 | 75 | Returns: 76 | a cursor object, to be used as an iterator. 77 | 78 | """ 79 | 80 | try: 81 | cursor = self.get_db(database, collection).find(query, projection) 82 | except Exception as exception: 83 | log.exception(exception) 84 | raise Exception("Mongo find failed") 85 | 86 | return cursor 87 | 88 | def find_as_list( 89 | self, 90 | database: str, 91 | collection: str, 92 | query: Dict[Hashable, Any] = None, 93 | projection: Dict[Hashable, Any] = None, 94 | ) -> Iterable: 95 | """ 96 | Do a find operation on a mongo collection, but return the data as a list 97 | 98 | Args: 99 | database: db name 100 | collection: collection name 101 | query: a dictionary providing the criteria for the find command 102 | projection: a dictionary that gives the projection - the fields to return. 103 | 104 | Returns: 105 | a list representation of the returned data. 
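Example (a sketch only; it assumes a reachable mongod and an existing ``samples`` collection,
so it is not executed):

>>> mongo_reader = MongoWrapper(host="localhost", port=27017)  # doctest: +SKIP
>>> names = mongo_reader.find_as_list(  # doctest: +SKIP
...     "ensembl_90_37", "samples", query={}, projection={"_id": 0, "sample_name": 1}
... )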
106 | 107 | """ 108 | cursor = self.find_as_cursor( 109 | database=database, collection=collection, query=query, projection=projection 110 | ) 111 | return [c for c in cursor] 112 | 113 | def find_as_dict( 114 | self, 115 | database: str, 116 | collection: str, 117 | query: Dict[Hashable, Any] = None, 118 | field: str = "_id", 119 | projection: Dict[Hashable, Any] = None, 120 | ) -> Iterable: 121 | """ 122 | Do a find operation on a mongo collection, but return the data as a dictionary 123 | 124 | Args: 125 | database: db name 126 | collection: collection name 127 | query: a dictionary providing the criteria for the find command 128 | projection: a dictionary that gives the projection - the fields to return. 129 | field: the field in the projection for which the value will be used as the Hashable key of the dict. 130 | 131 | Returns: 132 | a dictionary representation of the returned data. 133 | 134 | """ 135 | cursor = self.find_as_cursor( 136 | database=database, collection=collection, query=query, projection=projection 137 | ) 138 | return {c[field]: c for c in cursor} 139 | 140 | def insert(self, database: str, collection: str, data_list: List[Any]) -> None: 141 | """ 142 | bulk insert of items into a mongodb collection. 143 | 144 | Args: 145 | database: db name 146 | collection: collection name 147 | data_list: a list of documents to insert into mongodb. 148 | 149 | """ 150 | 151 | try: 152 | self.get_db(database, collection).test.insert_many(data_list, ordered=False) 153 | except BulkWriteError as bwe: 154 | log.exception(bwe.details) 155 | 156 | def create_index(self, database: str, collection: str, key: str) -> None: 157 | 158 | """ 159 | A tool for creating indexes on a given collection. 160 | 161 | Args: 162 | database: db name 163 | collection: collection name 164 | key: the field name to create the index on. 165 | 166 | """ 167 | self.get_db(database, collection).create_index(key) 168 | 169 | 170 | class MongoInserter(MongoWrapper): 171 | """ 172 | 173 | This class is a thin layer on the MongoWrapper class, which is a thin layer on the pymongo library. 174 | It is used for instances where you want to insert data into a mongodb collection. It creates 175 | a buffer which is periodically flushed to Mongo. 176 | 177 | Args: 178 | host: the name of the machine hosting the database 179 | port: the port number (usually 27017) 180 | database: db name 181 | collection: collection name 182 | connect: whether to create the new session, or to attach to an existing session, set to false, 183 | if this is being instantiated by a subprocesses. 184 | 185 | """ 186 | 187 | def __init__( 188 | self, host: str, port: int, database: str, collection: str, connect: bool = True 189 | ) -> None: 190 | MongoWrapper.__init__(self, host, port, connect=connect) 191 | self.database = database 192 | self.collection = collection 193 | self.to_insert: List = [] 194 | self.mongo_col = self.get_db(database, collection) 195 | 196 | def flush(self) -> None: 197 | """ 198 | Flush out the buffer and write to mongo db. 
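A buffering sketch (the database name, collection, and record contents below are illustrative):
records added with ``add()`` are queued as ``InsertOne`` operations and written in bulk once the
buffer exceeds 1000 entries, or when ``flush()``/``close()`` is called.

>>> inserter = MongoInserter("localhost", 27017, "my_database", "RNASeq")  # doctest: +SKIP
>>> inserter.add({"sample_name": "A_1", "gene": "ENSG00000104047"})  # doctest: +SKIP
>>> inserter.close()  # doctest: +SKIP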
199 | 200 | """ 201 | if self.to_insert: 202 | try: 203 | result = self.mongo_col.bulk_write(self.to_insert, ordered=False) 204 | if result and self.verbose: 205 | log.info(result.bulk_api_result) 206 | except BulkWriteError as bwe: 207 | log.exception(bwe.details) 208 | raise Exception("Mongo bulk write failed.") 209 | del self.to_insert[:] 210 | 211 | def add(self, record: Union[List[Any], Dict[Hashable, Any]]) -> None: 212 | """ 213 | Add a record to the buffer 214 | 215 | Args: 216 | record: the record to add to the mongo inserter buffer 217 | 218 | """ 219 | self.to_insert.append(InsertOne(record)) 220 | if len(self.to_insert) > 1000: 221 | self.flush() 222 | 223 | def close(self) -> None: 224 | """ 225 | Close the MongoInserter - flush the buffer. 226 | 227 | """ 228 | 229 | self.flush() 230 | 231 | def create_index_key(self, key: str) -> None: 232 | """ 233 | A tool for creating indexes on the collection. 234 | """ 235 | self.create_index(self.database, self.collection, key) 236 | 237 | 238 | class MongoUpdater(MongoWrapper): 239 | """ 240 | 241 | This class is a thin layer on the MongoWrapper class, which is a thin layer on the pymongo library. 242 | It is used for instances where you want to Update data in a mongodb collection. It creates 243 | a buffer which is periodically flushed and written to mongo. 244 | 245 | Args: 246 | host: the name of the machine hosting the database 247 | port: the port number (usually 27017 248 | database: db name 249 | collection: collection name 250 | connect: whether to create the new session, or to attach to an existing session, 251 | set to false, if this is being instantiated by a subprocesses. 252 | 253 | """ 254 | 255 | def __init__( 256 | self, host: str, port: int, database: str, collection: str, connect: bool = True 257 | ) -> None: 258 | MongoWrapper.__init__(self, host, port, connect=connect) 259 | self.database = database 260 | self.to_update: List[Any] = [] 261 | self.mongo_col = self.get_db(database, collection) 262 | 263 | def flush(self) -> None: 264 | """ 265 | Flush out the buffer and write to mongo db. 266 | 267 | """ 268 | if self.to_update: 269 | try: 270 | result = self.mongo_col.bulk_write(self.to_update, ordered=False) 271 | if result and self.verbose: 272 | log.info(result.bulk_api_result) 273 | except BulkWriteError as bwe: 274 | log.exception(bwe.details) 275 | raise Exception("Mongo bulk write failed.") 276 | del self.to_update[:] 277 | 278 | def add(self, updatedict: Dict[Hashable, Any], setdict: Dict[Hashable, Any]) -> None: 279 | """ 280 | Add a record to the buffer 281 | 282 | Args: 283 | updatedict: the criteria for the update query 284 | setdict: the dictionary describing the new record - OR use {$set: {}} to update a 285 | particular key without replacing the existing record. 286 | 287 | """ 288 | 289 | self.to_update.append(UpdateOne(updatedict, setdict)) 290 | if len(self.to_update) > 1000: 291 | self.flush() 292 | 293 | def close(self) -> None: 294 | """ 295 | Close the MongoInserter - flush the buffer. 
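The updater follows the same buffered pattern as the inserter (the names below are illustrative);
``add()`` takes a filter document and an update document, for example a ``$set`` payload:

>>> updater = MongoUpdater("localhost", 27017, "my_database", "samples")  # doctest: +SKIP
>>> updater.add({"sample_name": "A_1"}, {"$set": {"Project": "RNA-Seq1"}})  # doctest: +SKIP
>>> updater.close()  # doctest: +SKIP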
296 | 297 | """ 298 | self.flush() 299 | -------------------------------------------------------------------------------- /edgePy/util.py: -------------------------------------------------------------------------------- 1 | """ Utilities to support functions and classes """ 2 | from typing import Optional 3 | 4 | import logzero # type: ignore 5 | 6 | __all__ = ["getLogger"] 7 | 8 | LOG_FORMAT = ( 9 | "%(color)s[%(levelname)s | %(asctime)s | " 10 | "%(name)s | %(module)s | line %(lineno)d]:%(end_color)s %(message)s" 11 | ) 12 | 13 | 14 | def getLogger( 15 | name: str, 16 | level: int = logzero.logging.DEBUG, 17 | formatter: Optional[logzero.LogFormatter] = logzero.LogFormatter(fmt=LOG_FORMAT), 18 | ) -> logzero.logger: 19 | """Formats and sets up the logger instance. 20 | 21 | Args: 22 | name (str): The name of the Logger. 23 | level (int): The default level (logzero.logging.INFO = 20) of the logger. 24 | formatter (:obj:, optional): The format of the log message. Defaults to the default logzero format. 25 | 26 | Returns: 27 | An instance of a logger. 28 | 29 | Examples: 30 | >>> from edgePy.util import getLogger 31 | >>> log = getLogger(name="script") 32 | >>> log.info('This is your DGElist.') 33 | ... 34 | 35 | Notes: 36 | 1. See https://docs.python.org/3/library/logging.html#levels for more information about logging levels. 37 | 38 | """ 39 | log_formatter = ( 40 | logzero.LogFormatter(fmt=logzero.LogFormatter.DEFAULT_FORMAT) 41 | if formatter is None 42 | else formatter 43 | ) 44 | logger = logzero.setup_logger(name=name, level=level, formatter=log_formatter) 45 | 46 | return logger 47 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 99 3 | py36 = true 4 | skip-string-normalization = true 5 | include = '\.pyi?$' 6 | exclude = ''' 7 | /( 8 | \.git 9 | | \.mypy_cache 10 | | \.tox 11 | | venv 12 | | _build 13 | | buck-out 14 | | build 15 | | dist 16 | )/ 17 | ''' 18 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | mongodb_fixture_dir = 3 | tests/mongodb/fixtures -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | coverage==4.5 2 | flake8==3.5.0 3 | mypy==0.620 4 | pylint==2.0.0 5 | pytest==3.6.3 6 | pytest-cov==2.5.1 7 | pytest-parallel==0.0.2 8 | pymongo==3.7.1 9 | pytest-mongodb==2.1.2 10 | black==18.6b3 11 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/edgepy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Dict, Hashable, Any 3 | import configparser 4 | 5 | import numpy as np 6 | from scipy.stats import ks_2samp 7 | from smart_open import smart_open # type: ignore 8 | 9 | 10 | from edgePy.DGEList import DGEList 11 | from edgePy.data_import.mongodb.mongo_import import ImportFromMongodb 12 | from edgePy.util import 
getLogger 13 | 14 | log = getLogger(name="script") 15 | 16 | 17 | def parse_arguments(parser=None): 18 | if not parser: 19 | parser = argparse.ArgumentParser() 20 | 21 | parser.add_argument("--count_file", help="name of the count file") 22 | parser.add_argument("--groups_file", help="name of the groups file") 23 | parser.add_argument("--dge_file", help="import from .dge file;") 24 | parser.add_argument("--gene_list", default=None, help="a list of genes to filter the data set") 25 | 26 | # mongo parameters 27 | parser.add_argument( 28 | "--mongo_config", help="a way to import data from a supported mongo database" 29 | ) 30 | parser.add_argument("--mongo_key_name", default="Project") 31 | parser.add_argument("--mongo_key_value", default="RNA-Seq1") 32 | parser.add_argument("--database_name") 33 | parser.add_argument( 34 | "--group1_sample_names", nargs='+', help="List of samples names for first group" 35 | ) 36 | parser.add_argument( 37 | "--group2_sample_names", nargs='+', help="List of samples names for second group" 38 | ) 39 | parser.add_argument( 40 | "--groups_json", help="A JSON file with the group names, and list of samples. see example." 41 | ) 42 | 43 | parser.add_argument("--output", help="optional output file for results") 44 | parser.add_argument("--cutoff", help="p-value cutoff to accept.", default=0.05) 45 | parser.add_argument( 46 | "--minimum_cpm", help="discard results for which no group has this many counts", default=1 47 | ) 48 | 49 | args = parser.parse_args() 50 | 51 | return args 52 | 53 | 54 | class EdgePy(object): 55 | def __init__(self, args): 56 | 57 | self.dge_list = None 58 | 59 | if args.dge_file: 60 | self.dge_list = DGEList(filename=args.dge_file) 61 | log.info(f"The DGE list is {self.dge_list}") 62 | 63 | elif args.mongo_config: 64 | # This section is only useful for MongoDB based analyses. Talk to @apfejes about this section if you have 65 | # any questions. 66 | 67 | config = configparser.ConfigParser() 68 | config.read(args.mongo_config) 69 | 70 | if args.group1_sample_names and args.group2_sample_names: 71 | key = 'sample_name' 72 | value = args.group1_sample_names + args.group2_sample_names 73 | 74 | elif args.key_name and args.mongo_key_value: 75 | key = args.mongo_key_name 76 | value = args.mongo_key_value 77 | else: 78 | raise ValueError("Insufficient parameters for use of Mongodb") 79 | 80 | mongo_importer = ImportFromMongodb( 81 | host=config.get("Mongo", "host"), 82 | port=config.get("Mongo", "port"), 83 | mongo_key=key, 84 | mongo_value=value, 85 | gene_list_file=args.gene_list, 86 | ) 87 | 88 | sample_list, data_set, gene_list, sample_category = mongo_importer.get_data_from_mongo( 89 | database=args.database_name 90 | ) 91 | 92 | if key == 'sample_name': 93 | # Override sample categories if sample name is the source of the categories. 
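# (Sketch of the intent: every sample passed via --group1_sample_names is labelled "group1";
# the remaining requested samples are labelled "group2".)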
94 | sample_category_list = [ 95 | "group1" if sample_name in args.group1_sample_names else "group2" 96 | for sample_name in sample_list 97 | ] 98 | sample_category_dict = None 99 | else: 100 | # TODO: read from file 101 | sample_category_dict = args.groups_json 102 | sample_category_list = None 103 | 104 | self.dge_list = DGEList.create_DGEList( 105 | sample_list, 106 | data_set, 107 | gene_list, 108 | sample_to_category=sample_category_list, 109 | category_to_samples=sample_category_dict, 110 | ) 111 | 112 | self.ensg_to_symbol = mongo_importer.mongo_reader.find_as_dict( 113 | 'ensembl_90_37', "symbol_by_ensg", query={} 114 | ) 115 | 116 | else: 117 | self.dge_list = DGEList.create_DGEList_data_file( 118 | data_file=args.counts_file, group_file=args.groups_file 119 | ) 120 | 121 | self.output = args.output if args.output else None 122 | self.p_value_cutoff = args.cutoff 123 | self.minimum_cpm = args.minimum_cpm 124 | 125 | def run_ks(self): 126 | """ 127 | First pass implementation of a Kolmogorov-Smirnov test for different groups, using the Scipy KS test two-tailed 128 | implementation. 129 | 130 | Args: 131 | None. 132 | 133 | """ 134 | 135 | log.info(self.dge_list.groups_list) 136 | 137 | gene_details, gene_likelyhood1, group_types = self.ks_2_samples() 138 | 139 | results = self.generate_results( 140 | gene_details, gene_likelyhood1, group_types[0], group_types[1] 141 | ) 142 | 143 | if self.output: 144 | with smart_open(self.output, 'w') as out: 145 | out.writelines(results) 146 | log.info(f"Wrote to {self.output}") 147 | else: 148 | for line in results: 149 | log.info(line) 150 | 151 | def ks_2_samples(self): 152 | """Run a 2-tailed Kolmogorov-Smirnov test on the DGEList object. 153 | 154 | Args: 155 | None. 156 | 157 | Returns: 158 | gene_details: a dictionary of dictionary (key, gene), holding mean1 and mean2 for the two groups 159 | gene_likelihood: a dictionary (key, gene), holding the p-value of the separation of the two groups 160 | group_types: list of the groups in order. 161 | 162 | """ 163 | gene_likelihood1: Dict[Hashable, float] = {} 164 | group_types = set(self.dge_list.groups_list) 165 | group_types = list(group_types) 166 | group_filters: Dict[Hashable, Any] = {} 167 | gene_details: Dict[Hashable, Dict[Hashable, Any]] = {} 168 | for group in group_types: 169 | group_filters[group] = [g == group for g in self.dge_list.groups_list] 170 | for gene_idx, gene in enumerate(self.dge_list.genes): 171 | gene_row = self.dge_list.counts[gene_idx] 172 | if len(group_types) == 2: 173 | group_data1 = gene_row.compress(group_filters[group_types[0]]) 174 | mean1 = np.mean(group_data1) 175 | 176 | group_data2 = gene_row.compress(group_filters[group_types[1]]) 177 | mean2 = np.mean(group_data2) 178 | 179 | gene_likelihood1[gene] = ks_2samp(group_data1, group_data2)[1] 180 | 181 | gene_details[gene] = {'mean1': mean1, 'mean2': mean2} 182 | return gene_details, gene_likelihood1, group_types 183 | 184 | def generate_results( 185 | self, 186 | gene_details: Dict[Hashable, Dict[Hashable, Any]], 187 | gene_likelihood1: Dict[Hashable, float], 188 | group_type1: str, 189 | group_type2: str, 190 | ) -> List[str]: 191 | 192 | """ 193 | This function simply prepares a summary of the results of the analysis for dumping to file or to screen 194 | 195 | Args: 196 | gene_details: information about the genes - should contain fields 'mean1' and 'mean2' for display 197 | gene_likelihood1: dictionary of gene names and the p-value associated. 
used to sort the data 198 | group_type1: the name of the first grouping 199 | group_type2: the name of the second grouping 200 | 201 | """ 202 | 203 | results: List[str] = [] 204 | sorted_likely = [ 205 | (gene, gene_likelihood1[gene]) 206 | for gene in sorted(gene_likelihood1, key=gene_likelihood1.get) 207 | ] 208 | results.append(f"gene_name\tp-value\t{group_type1}\t{group_type2}\n") 209 | for gene, p in sorted_likely: 210 | m1 = gene_details[gene]['mean1'] 211 | m2 = gene_details[gene]['mean2'] 212 | symbol = ( 213 | self.ensg_to_symbol[gene]['symbols'][0] if gene in self.ensg_to_symbol else gene 214 | ) 215 | 216 | if ( 217 | p < self.p_value_cutoff 218 | and not (m1 < self.minimum_cpm and m2 < self.minimum_cpm) 219 | and m1 < m2 220 | ): 221 | results.append( 222 | f"{gene}\t" 223 | f"{symbol}\t" 224 | f"{gene_likelihood1[gene]}\t" 225 | f"{gene_details[gene]['mean1']:.2f}\t" 226 | f"{gene_details[gene]['mean2']:.2f}\n" 227 | ) 228 | 229 | return results 230 | 231 | 232 | def main(): 233 | 234 | args = parse_arguments() 235 | default_class = EdgePy(args) 236 | default_class.run_ks() 237 | 238 | 239 | if __name__ == "__main__": 240 | main() 241 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = edgePy 3 | version = 0.0.1 4 | author = r-bioinformatics 5 | url = https://github.com/r-bioinformatics/edgePy 6 | description = A Python port of edgeR for differential expression analysis. 7 | long_description = file: README.md, LICENSE 8 | long_description_content_type = text/markdown 9 | keywords = bioinformatics, gene, differential, expression, edgeR 10 | requires-dist = setuptools>=30.3.0 11 | license = MIT 12 | classifiers = 13 | Development Status :: 2 - Pre-Alpha 14 | Intended Audience :: Science/Research 15 | License :: OSI Approved :: MIT License 16 | Topic :: Scientific/Engineering :: Bio-Informatics 17 | Programming Language :: Python :: 3.6 18 | project-urls = 19 | Slack-Group = https://r-bioinformatics.slack.com/ 20 | Subreddit = https://reddit.com/r/bioinformatics/ 21 | 22 | [options] 23 | zip_safe = True 24 | include_package_data = True 25 | packages = find: 26 | install_requires = 27 | numpy>=1.14.5 28 | smart_open>=1.6.0 29 | tox>=3.1.2 30 | scipy>=1.1.0 31 | logzero>=1.0.0 32 | sphinx>=1.7 33 | pymysql>=0.9.2 34 | 35 | [mypy] 36 | warn_redundant_casts = True 37 | warn_unused_ignores = True 38 | mypy_path = docs/stubs 39 | python_version = 3.6 40 | 41 | [mypy-*] 42 | disallow_untyped_calls = True 43 | disallow_untyped_defs = True 44 | warn_return_any = True 45 | no_implicit_optional = True 46 | strict_optional = True 47 | ignore_missing_imports = False 48 | 49 | [flake8] 50 | max-line-length = 120 51 | doctests = True 52 | show-source = True 53 | ignore = 54 | exclude = 55 | .git 56 | .mypy_cache 57 | .pytest_cache 58 | libs 59 | docs 60 | tests 61 | __init__.py 62 | 63 | [tool:pytest] 64 | addopts = --verbose --doctest-modules --cov 65 | 66 | [coverage:run] 67 | branch = true 68 | source = edgePy 69 | parallel = true 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # Name and version specified here for `sdist` only. 
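# (The remaining packaging metadata and install_requires are declared in setup.cfg.)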
4 | setup(name="edgePy", version="0.0.1") 5 | -------------------------------------------------------------------------------- /tests/ensembl/test_canonical_transcripts.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | 4 | from edgePy.data_import.data_import import get_dataset_path 5 | 6 | TEST_DATASET = "transcripts_homo_sapiens_core_75_37.tsv" 7 | TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv" 8 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 9 | 10 | 11 | class TestCanonicalTranscripts(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.canonicaldata = CanonicalDataStore( 15 | get_dataset_path(TEST_DATASET), get_dataset_path(TEST_GENE_SYMBOLS) 16 | ) 17 | 18 | def test_is_canonical_by_transcript(self): 19 | # ENSG00000224451 ENST00000433775 567 True 20 | # ENSG00000175063 ENST00000405520 1441 False 21 | assert self.canonicaldata.is_canonical_by_transcript("ENST00000433775") is True 22 | assert self.canonicaldata.is_canonical_by_transcript("ENST00000405520") is False 23 | 24 | def test_get_canonical_transcript(self): 25 | # ENSG00000104047 ENST00000403028 1579 False 26 | # ENSG00000104047 ENST00000557968 932 False 27 | # ENSG00000104047 ENST00000559223 789 False 28 | # ENSG00000104047 ENST00000558653 1126 False 29 | # ENSG00000104047 ENST00000561188 588 False 30 | # ENSG00000104047 ENST00000557988 1195 False 31 | # ENSG00000104047 ENST00000560735 596 False 32 | # ENSG00000104047 ENST00000559164 673 False 33 | # ENSG00000104047 ENST00000560632 548 False 34 | # ENSG00000104047 ENST00000559405 580 False 35 | # ENSG00000104047 ENST00000251250 2674 True 36 | # ENSG00000104047 ENST00000329873 476 False 37 | # ENSG00000104047 ENST00000415425 2208 False 38 | assert self.canonicaldata.get_canonical_transcript("ENSG00000104047") == "ENST00000251250" 39 | 40 | def test_get_length_of_transcript(self): 41 | 42 | # ENSG00000224451 ENST00000433775 567 True 43 | # ENSG00000175063 ENST00000405520 1441 False 44 | assert self.canonicaldata.get_length_of_transcript("ENST00000433775") == 567 45 | assert self.canonicaldata.get_length_of_transcript("ENST00000405520") == 1441 46 | 47 | def test_get_length_of_canonical_transcript(self): 48 | # ENSG00000224451 ENST00000433775 567 True 49 | # ENSG00000104047 ENST00000251250 2674 True 50 | assert self.canonicaldata.get_length_of_canonical_transcript("ENSG00000224451") == 567 51 | assert self.canonicaldata.get_length_of_canonical_transcript("ENSG00000104047") == 2674 52 | -------------------------------------------------------------------------------- /tests/ensembl/test_ensembl_flat_file_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | from edgePy.data_import.data_import import get_dataset_path 4 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 5 | 6 | 7 | TEST_GENE_SET_DATA = "transcripts_homo_sapiens_core_75_37.tsv" 8 | TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv" 9 | 10 | 11 | class TestEnsembleFlatFileReader(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.icd = CanonicalDataStore( 15 | get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) 16 | ) 17 | 18 | def test_pick_gene_id_1(self): 19 | gene_list = ["ENG000000123", "ENG000000125", "ENG000000130"] 20 | 21 | best_gene = self.icd.pick_gene_id(gene_list) 22 | 23 | assert best_gene == 
"ENG000000130" 24 | 25 | def test_has_gene(self): 26 | # SLC25A26 ENSG00000261657 27 | assert self.icd.has_gene("ENSG00000261657") 28 | # HMGA1P6 ENSG00000233440 29 | assert self.icd.has_gene("ENSG00000233440") 30 | # fake genes that don't exist. 31 | assert not self.icd.has_gene("ENSG00000000001") 32 | assert not self.icd.has_gene("ENSG00000000010") 33 | 34 | def test_get_symbol_from_gene(self): 35 | # FABP3P2 ENSG00000233259 36 | # DHFRP1 ENSG00000188985 37 | # LINC01050 ENSG00000271216 38 | 39 | assert self.icd.get_symbol_from_gene("ENSG00000233259") == "FABP3P2" 40 | assert self.icd.get_symbol_from_gene("ENSG00000188985") == "DHFRP1" 41 | assert self.icd.get_symbol_from_gene("ENSG00000271216") == "LINC01050" 42 | assert self.icd.get_symbol_from_gene("ENSG00000271216") == "LINC01050" 43 | with self.assertRaises(KeyError): 44 | self.icd.get_symbol_from_gene("NOTAREALGENE") 45 | 46 | def test_get_genes_from_symbol(self): 47 | 48 | assert self.icd.get_genes_from_symbol("FABP3P2") == ['ENSG00000233259'] 49 | assert self.icd.get_genes_from_symbol("FAKEGENE1") == [] 50 | print(self.icd.get_genes_from_symbol("PAN1")) 51 | self.assertListEqual( 52 | self.icd.get_genes_from_symbol("PAN1"), 53 | [ 54 | 'ENSG00000022556', 55 | 'ENSG00000270370', 56 | 'ENSG00000262615', 57 | 'ENSG00000262886', 58 | 'ENSG00000262329', 59 | 'ENSG00000262811', 60 | 'ENSG00000262175', 61 | 'ENSG00000262929', 62 | 'ENSG00000262260', 63 | ], 64 | ) 65 | 66 | def test_is_known_symbol(self): 67 | # SLC25A26 ENSG00000261657 68 | assert self.icd.is_known_symbol("FABP3P2") 69 | # HMGA1P6 ENSG00000233440 70 | assert self.icd.is_known_symbol("DHFRP1") 71 | # fake genes that don't exist. 72 | assert not self.icd.is_known_symbol("FAKEGENE1") 73 | 74 | def test_is_known_gene(self): 75 | # SLC25A26 ENSG00000261657 76 | assert self.icd.is_known_gene("ENSG00000261657") 77 | # HMGA1P6 ENSG00000233440 78 | assert self.icd.is_known_gene("ENSG00000233440") 79 | # fake genes that don't exist. 
80 | assert not self.icd.is_known_gene("ENSG00000000001") 81 | assert not self.icd.is_known_gene("ENSG00000000010") 82 | 83 | def test_is_canonical_by_transcript(self): 84 | """ 85 | ENSG00000171448 ENST00000373656 4441 True 86 | ENSG00000171448 ENST00000373654 2045 False 87 | 88 | ENSG00000140157 ENST00000337451 3225 True 89 | ENSG00000140157 ENST00000398013 2274 False 90 | """ 91 | assert self.icd.is_canonical_by_transcript("ENST00000373656") is True 92 | assert self.icd.is_canonical_by_transcript("ENST00000373654") is False 93 | assert self.icd.is_canonical_by_transcript("ENST00000337451") is True 94 | assert self.icd.is_canonical_by_transcript("ENST00000398013") is False 95 | 96 | def test_get_canonical_transcript(self): 97 | """ 98 | ENSG00000171448 ENST00000373656 4441 True 99 | ENSG00000171448 ENST00000373654 2045 False 100 | 101 | ENSG00000140157 ENST00000337451 3225 True 102 | ENSG00000140157 ENST00000398013 2274 False 103 | """ 104 | assert self.icd.get_canonical_transcript("ENSG00000171448") == "ENST00000373656" 105 | assert self.icd.get_canonical_transcript("ENSG00000140157") == "ENST00000337451" 106 | 107 | def test_get_length_of_transcript(self): 108 | """ 109 | ENSG00000171448 ENST00000373656 4441 True 110 | ENSG00000171448 ENST00000373654 2045 False 111 | 112 | ENSG00000140157 ENST00000337451 3225 True 113 | ENSG00000140157 ENST00000398013 2274 False 114 | """ 115 | assert self.icd.get_length_of_transcript("ENST00000373656") == 4441 116 | assert self.icd.get_length_of_transcript("ENST00000373654") == 2045 117 | assert self.icd.get_length_of_transcript("ENST00000337451") == 3225 118 | assert self.icd.get_length_of_transcript("ENST00000398013") == 2274 119 | 120 | def test_get_length_of_canonical_transcript(self): 121 | """ 122 | ENSG00000171448 ENST00000373656 4441 True 123 | ENSG00000171448 ENST00000373654 2045 False 124 | 125 | ENSG00000140157 ENST00000337451 3225 True 126 | ENSG00000140157 ENST00000398013 2274 False 127 | """ 128 | assert self.icd.get_length_of_canonical_transcript("ENSG00000171448") == 4441 129 | assert self.icd.get_length_of_canonical_transcript("ENSG00000140157") == 3225 130 | -------------------------------------------------------------------------------- /tests/mongodb/fixtures/ensg_by_symbol.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : "BRCA1", "ensgs" : [ "ENSG00000012048" ] }, 2 | { "_id" : "BRCA2", "ensgs" : [ "ENSG00000139618" ] }, 3 | { "_id" : "TP53", "ensgs" : [ "ENSG00000141510" ] }] -------------------------------------------------------------------------------- /tests/mongodb/fixtures/samples.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : "5aa97335f8848a3fd2ccd3e4", "assay_type" : "RNA-Seq", "avgspotlen" : "50", "bioproject" : "PRJNA362579", "biosample" : "SAMN06242166", "center_name" : "GEO", "consent" : "public", "experiment" : "SRX2505175", "insertsize" : "0", "instrument" : "Illumina HiSeq 2500", "librarylayout" : "SINGLE", "libraryselection" : "cDNA", "librarysource" : "TRANSCRIPTOMIC", "loaddate" : "2017-01-19", "mbases" : "1232", "mbytes" : "611", "organism" : "Homo sapiens", "platform" : "ILLUMINA", "releasedate" : "2017-08-30", "run" : "SRR5189264", "sra_sample" : "SRS1929803", "sra_study" : "SRP097153", "sample_name" : "SRR5189264", "cell_type" : "ES-cell derived human cardiac organoids", "source_name" : "human cardiac organoids", "sample_sra_name" : "GSM2463762", "Tenaya_set_type" : "Cardiomyocytes", "Project" : 
"Public Data" }, 2 | { "_id" : "5aa97335f8848a3fd2ccd3e5", "assay_type" : "RNA-Seq", "avgspotlen" : "50", "bioproject" : "PRJNA362579", "biosample" : "SAMN06242165", "center_name" : "GEO", "consent" : "public", "experiment" : "SRX2505176", "insertsize" : "0", "instrument" : "Illumina HiSeq 2500", "librarylayout" : "SINGLE", "libraryselection" : "cDNA", "librarysource" : "TRANSCRIPTOMIC", "loaddate" : "2017-01-19", "mbases" : "1134", "mbytes" : "566", "organism" : "Homo sapiens", "platform" : "ILLUMINA", "releasedate" : "2017-08-30", "run" : "SRR5189265", "sra_sample" : "SRS1929804", "sra_study" : "SRP097153", "sample_name" : "SRR5189265", "cell_type" : "ES-cell derived human cardiac organoids", "source_name" : "human cardiac organoids", "sample_sra_name" : "GSM2463763", "Tenaya_set_type" : "Cardiomyocytes", "Project" : "Public Data" }, 3 | { "_id" : "5aa97335f8848a3fd2ccd3e6", "assay_type" : "RNA-Seq", "avgspotlen" : "50", "bioproject" : "PRJNA362579", "biosample" : "SAMN06242164", "center_name" : "GEO", "consent" : "public", "experiment" : "SRX2505177", "insertsize" : "0", "instrument" : "Illumina HiSeq 2500", "librarylayout" : "SINGLE", "libraryselection" : "cDNA", "librarysource" : "TRANSCRIPTOMIC", "loaddate" : "2017-01-19", "mbases" : "1094", "mbytes" : "543", "organism" : "Homo sapiens", "platform" : "ILLUMINA", "releasedate" : "2017-08-30", "run" : "SRR5189266", "sra_sample" : "SRS1929805", "sra_study" : "SRP097153", "sample_name" : "SRR5189266", "cell_type" : "ES-cell derived human cardiac organoids", "source_name" : "human cardiac organoids", "sample_sra_name" : "GSM2463764", "Tenaya_set_type" : "Cardiomyocytes", "Project" : "Public Data" }] -------------------------------------------------------------------------------- /tests/mongodb/fixtures/symbol_by_ensg.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : "ENSG00000012048", "symbols" : [ "BRCA1" ] }, 2 | { "_id" : "ENSG00000139618", "symbols" : [ "BRCA2" ] }, 3 | { "_id" : "ENSG00000141510", "symbols" : [ "TP53" ] }] -------------------------------------------------------------------------------- /tests/mongodb/test_gene_functions.py: -------------------------------------------------------------------------------- 1 | """The core Python code for generating data.""" 2 | import pytest 3 | from edgePy.data_import.data_import import get_dataset_path 4 | 5 | from edgePy.data_import.mongodb.gene_functions import * 6 | from edgePy.data_import.mongodb.mongo_wrapper import MongoWrapper 7 | 8 | GENE_LIST_DATASET = "example_gene_list.txt" 9 | 10 | RNASeq_RECORD = { 11 | "_id": "5a7519801fd85c0e41c94c51", 12 | "gene": "ENSG00000232977", 13 | "sample_name": "SRR4011901", 14 | "transcripts": { 15 | "ENST00000575689": { 16 | "size": 720, 17 | "canonical": "0", 18 | "exons": { 19 | "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, 20 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 21 | }, 22 | "rpkm": 0.39507510837872767, 23 | }, 24 | "ENST00000576696": { 25 | "size": 1306, 26 | "canonical": "0", 27 | "exons": { 28 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 29 | "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, 30 | }, 31 | "rpkm": 0.195204453741728, 32 | }, 33 | "ENST00000443778": { 34 | "size": 2084, 35 | "canonical": "1", 36 | "exons": { 37 | "ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511}, 38 | "ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 
0.22669418591124607}, 39 | }, 40 | "rpkm": 0.05165702955135873, 41 | }, 42 | }, 43 | "star_rpkm": None, 44 | } 45 | 46 | 47 | RNASeq_RECORD_NO_CANONICAL = { 48 | "_id": "5a7519801fd85c0e41c94c51", 49 | "gene": "ENSG00000232977", 50 | "sample_name": "SRR4011901", 51 | "transcripts": { 52 | "ENST00000575689": { 53 | "size": 720, 54 | "canonical": "0", 55 | "exons": { 56 | "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, 57 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 58 | }, 59 | "rpkm": 0.39507510837872767, 60 | }, 61 | "ENST00000576696": { 62 | "size": 1306, 63 | "canonical": "0", 64 | "exons": { 65 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 66 | "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, 67 | }, 68 | "rpkm": 0.195204453741728, 69 | }, 70 | }, 71 | "star_rpkm": None, 72 | } 73 | 74 | 75 | @pytest.fixture 76 | def gene_list_file(): 77 | return get_dataset_path(GENE_LIST_DATASET) 78 | 79 | 80 | def test_get_genelist_from_file(): 81 | gene_list = get_genelist_from_file(gene_list_file()) 82 | assert gene_list == ["TP53", "BRCA1", "BRCA2"] 83 | 84 | 85 | def test_get_genelist_from_file_no_file(): 86 | gene_list = get_genelist_from_file(None) 87 | assert gene_list is None 88 | 89 | 90 | def test_translate_genes_symbol(mongodb): 91 | mw = MongoWrapper("localhost", "27017") 92 | mw.session = mongodb 93 | gene_list = get_genelist_from_file(gene_list_file()) 94 | ensg_genes, gene_symbols = translate_genes(gene_list, mw, "pytest") 95 | assert ensg_genes == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 96 | assert gene_symbols == { 97 | "ENSG00000012048": "BRCA1", 98 | "ENSG00000139618": "BRCA2", 99 | "ENSG00000141510": "TP53", 100 | } 101 | 102 | 103 | def test_translate_genes_ensg(mongodb): 104 | mw = MongoWrapper("localhost", "27017") 105 | mw.session = mongodb 106 | gene_list = ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 107 | ensg_genes, gene_symbols = translate_genes(gene_list, mw, "pytest") 108 | assert ensg_genes == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 109 | assert gene_symbols == { 110 | "ENSG00000012048": "BRCA1", 111 | "ENSG00000139618": "BRCA2", 112 | "ENSG00000141510": "TP53", 113 | } 114 | 115 | 116 | def test_get_gene_list(mongodb): 117 | mw = MongoWrapper("localhost", "27017") 118 | mw.session = mongodb 119 | gene_list = get_gene_list(mw, database="pytest") 120 | assert gene_list == { 121 | "ENSG00000012048": "BRCA1", 122 | "ENSG00000139618": "BRCA2", 123 | "ENSG00000141510": "TP53", 124 | } 125 | 126 | 127 | def test_get_sample_details(mongodb): 128 | mw = MongoWrapper("localhost", "27017") 129 | mw.session = mongodb 130 | details = get_sample_details("Project", mw, "pytest") 131 | assert details == { 132 | "SRR5189264": {"category": "Public Data", "description": "SRR5189264"}, 133 | "SRR5189265": {"category": "Public Data", "description": "SRR5189265"}, 134 | "SRR5189266": {"category": "Public Data", "description": "SRR5189266"}, 135 | } 136 | 137 | 138 | def test_get_canonical_rpkm(): 139 | rpkm = get_canonical_rpkm(RNASeq_RECORD) 140 | assert rpkm == 0.05165702955135873 141 | 142 | 143 | def test_get_canonical_rpkm_no_canonical(): 144 | rpkm = get_canonical_rpkm(RNASeq_RECORD_NO_CANONICAL) 145 | assert rpkm is None 146 | 147 | 148 | def test_get_canonical_raw_no_canonical(): 149 | raw = get_canonical_raw(RNASeq_RECORD_NO_CANONICAL) 150 | assert raw is None 151 | 
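# Illustrative sketch of the lookup exercised by the canonical-RPKM tests above. The
# RNASeq_RECORD fixtures hold one entry per transcript, each flagged with "canonical" as the
# string "1" or "0", and get_canonical_rpkm() returns the rpkm of the flagged transcript.
# The helper below is only a minimal illustration written against the fixtures in this file;
# it is an assumed sketch, not edgePy's own implementation (see gene_functions.py).
def _canonical_rpkm_sketch(record):
    """Return the rpkm of the canonical transcript in `record`, or None if none is flagged."""
    for transcript in record["transcripts"].values():
        if transcript.get("canonical") == "1":
            return transcript["rpkm"]
    return None


# For RNASeq_RECORD this yields 0.05165702955135873 (ENST00000443778, the '"canonical": "1"'
# entry); for RNASeq_RECORD_NO_CANONICAL it yields None, matching the assertions above.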
-------------------------------------------------------------------------------- /tests/mongodb/test_mongo_import.py: -------------------------------------------------------------------------------- 1 | from edgePy.data_import.mongodb.mongo_import import ImportFromMongodb 2 | from edgePy.data_import.mongodb.mongo_import import parse_arguments 3 | from edgePy.data_import.data_import import get_dataset_path 4 | 5 | 6 | def test_parse_arguments(): 7 | config = "file.txt" 8 | gene_list = "groups.txt" 9 | key_name = "Project" 10 | key_value = "Publie Data" 11 | 12 | ci_values = [ 13 | "--config", 14 | config, 15 | "--gene_list", 16 | gene_list, 17 | "--key_name", 18 | "Project", 19 | "--key_value", 20 | "Publie Data", 21 | ] 22 | 23 | args = parse_arguments(None, ci_values=ci_values) 24 | 25 | assert config == args.config 26 | assert gene_list == args.gene_list 27 | assert key_name == args.key_name 28 | assert key_value == args.key_value 29 | 30 | 31 | def test_get_data_from_mongo_nofilters(mongodb): 32 | im = ImportFromMongodb( 33 | host="localhost", port=27017, mongo_key=None, mongo_value=None, gene_list_file=None 34 | ) 35 | im.mongo_reader.session = mongodb 36 | sample_list, dataset, gene_list, sample_category = im.get_data_from_mongo(database="pytest") 37 | assert sample_list == ["SRR5189264", "SRR5189265", "SRR5189266"] 38 | assert dataset == { 39 | "SRR5189264": {"ENSG00000012048": 70, "ENSG00000139618": 105, "ENSG00000141510": 270}, 40 | "SRR5189265": {"ENSG00000012048": 76, "ENSG00000139618": 168, "ENSG00000141510": 347}, 41 | "SRR5189266": {"ENSG00000012048": 62, "ENSG00000139618": 104, "ENSG00000141510": 191}, 42 | } 43 | assert gene_list == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 44 | assert sample_category == { 45 | "SRR5189264": "SRR5189264", 46 | "SRR5189265": "SRR5189265", 47 | "SRR5189266": "SRR5189266", 48 | } 49 | 50 | 51 | def test_get_data_from_mongo_filters(mongodb): 52 | im = ImportFromMongodb( 53 | host="localhost", 54 | port=27017, 55 | mongo_key="Project", 56 | mongo_value="Public Data", 57 | gene_list_file=None, 58 | ) 59 | im.mongo_reader.session = mongodb 60 | sample_list, dataset, gene_list, sample_category = im.get_data_from_mongo(database="pytest") 61 | assert sample_list == ["SRR5189264", "SRR5189265", "SRR5189266"] 62 | assert dataset == { 63 | "SRR5189264": {"ENSG00000012048": 70, "ENSG00000139618": 105, "ENSG00000141510": 270}, 64 | "SRR5189265": {"ENSG00000012048": 76, "ENSG00000139618": 168, "ENSG00000141510": 347}, 65 | "SRR5189266": {"ENSG00000012048": 62, "ENSG00000139618": 104, "ENSG00000141510": 191}, 66 | } 67 | assert gene_list == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 68 | assert sample_category == { 69 | "SRR5189264": "Public Data", 70 | "SRR5189265": "Public Data", 71 | "SRR5189266": "Public Data", 72 | } 73 | 74 | 75 | def test_get_data_from_mongo_gene_list(mongodb): 76 | filename = str(get_dataset_path("example_gene_list.txt")) 77 | im = ImportFromMongodb( 78 | host="localhost", 79 | port=27017, 80 | mongo_key="Project", 81 | mongo_value="Public Data", 82 | gene_list_file=filename, 83 | ) 84 | im.mongo_reader.session = mongodb 85 | sample_list, dataset, gene_list, sample_category = im.get_data_from_mongo(database="pytest") 86 | assert sample_list == ["SRR5189264", "SRR5189265", "SRR5189266"] 87 | assert dataset == { 88 | "SRR5189264": {"ENSG00000012048": 70, "ENSG00000139618": 105, "ENSG00000141510": 270}, 89 | "SRR5189265": {"ENSG00000012048": 76, "ENSG00000139618": 168, "ENSG00000141510": 347}, 90 | 
"SRR5189266": {"ENSG00000012048": 62, "ENSG00000139618": 104, "ENSG00000141510": 191}, 91 | } 92 | assert gene_list == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 93 | assert sample_category == { 94 | "SRR5189264": "Public Data", 95 | "SRR5189265": "Public Data", 96 | "SRR5189266": "Public Data", 97 | } 98 | -------------------------------------------------------------------------------- /tests/mongodb/test_mongo_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from edgePy.data_import.mongodb.mongo_wrapper import MongoWrapper 3 | from edgePy.data_import.mongodb.mongo_wrapper import MongoInserter 4 | from edgePy.data_import.mongodb.mongo_wrapper import MongoUpdater 5 | 6 | 7 | def test_mongo_wrapper_find_as_cursor(mongodb): 8 | mw = MongoWrapper("localhost", "27017") 9 | mw.session = mongodb 10 | assert "ensg_by_symbol" in mongodb.collection_names() 11 | cursor = mw.find_as_cursor("pytest", "ensg_by_symbol", {}, {}) 12 | 13 | count = 0 14 | for i in cursor: 15 | count += 1 16 | 17 | assert 3 == count 18 | 19 | 20 | def test_mongo_wrapper_find_as_cursor_fail(): 21 | mw = MongoWrapper("localhost", "27017") 22 | # do not set session to mongodb to cause this error. 23 | with pytest.raises(Exception): 24 | mw.find_as_cursor("pytest", "fake_table", {"_id_": "something"}, {}) 25 | 26 | 27 | def test_mongo_wrapper_find_as_list(mongodb): 28 | mw = MongoWrapper("localhost", "27017") 29 | mw.session = mongodb 30 | assert "ensg_by_symbol" in mongodb.collection_names() 31 | value = mw.find_as_list("pytest", "ensg_by_symbol", {}, {}) 32 | assert value == [{"_id": "BRCA1"}, {"_id": "BRCA2"}, {"_id": "TP53"}] 33 | 34 | 35 | def test_mongo_wrapper_find_as_dict(mongodb): 36 | mw = MongoWrapper("localhost", "27017") 37 | mw.session = mongodb 38 | assert "ensg_by_symbol" in mongodb.collection_names() 39 | value = mw.find_as_dict("pytest", "ensg_by_symbol", {}) 40 | assert value == { 41 | "BRCA1": {"_id": "BRCA1", "ensgs": ["ENSG00000012048"]}, 42 | "TP53": {"_id": "TP53", "ensgs": ["ENSG00000141510"]}, 43 | "BRCA2": {"_id": "BRCA2", "ensgs": ["ENSG00000139618"]}, 44 | } 45 | 46 | 47 | def test_mongo_wrapper_insert(mongodb): 48 | mw = MongoWrapper("localhost", "27017") 49 | mw.session = mongodb 50 | mw.insert("pytest", "test", [{"rec1": "val1"}, {"rec2": "val2"}]) 51 | 52 | 53 | def test_mongo_wrapper_create_index(mongodb): 54 | mw = MongoWrapper("localhost", "27017") 55 | mw.session = mongodb 56 | mw.create_index("pytest", "test", "_id") 57 | 58 | 59 | def test_mongo_inserter_flush(mongodb): 60 | """This is not testable - the mongodb pytest module does not support bulk writes. """ 61 | 62 | mi = MongoInserter("localhost", 27017, "pytest", "test") 63 | mi.session = mongodb 64 | # mi.add(['A', 'B', 'C']) 65 | mi.flush() 66 | 67 | pass 68 | 69 | 70 | def test_mongo_inserter_add(mongodb): 71 | mi = MongoInserter("localhost", 27017, "pytest", "test") 72 | mi.session = mongodb 73 | mi.add(["A", "B", "C"]) 74 | 75 | 76 | def test_mongo_inserter_close(mongodb): 77 | mi = MongoInserter("localhost", 27017, "pytest", "test") 78 | mi.session = mongodb 79 | # mi.add(['A', 'B', 'C']) 80 | mi.close() 81 | 82 | 83 | def test_mongo_inserter_create_index_key(mongodb): 84 | mi = MongoInserter("localhost", 27017, "pytest", "test") 85 | mi.session = mongodb 86 | mi.create_index("pytest", "test", "_id") 87 | 88 | 89 | def test_mongo_updater_flush(mongodb): 90 | """This is not testable - the mongodb pytest module does not support bulk writes. 
""" 91 | mu = MongoUpdater("localhost", 27017, "pytest", "test") 92 | mu.session = mongodb 93 | mu.flush() 94 | 95 | 96 | def test_mongo_updater_add(mongodb): 97 | mu = MongoUpdater("localhost", 27017, "pytest", "test") 98 | mu.session = mongodb 99 | mu.add({}, {"a": "b"}) 100 | 101 | 102 | def test_mongo_updater_close(mongodb): 103 | mu = MongoUpdater("localhost", 27017, "pytest", "test") 104 | mu.session = mongodb 105 | mu.close() 106 | -------------------------------------------------------------------------------- /tests/test_DGEList.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pkgutil 3 | import numpy as np 4 | from smart_open import smart_open # type: ignore 5 | 6 | from edgePy.DGEList import DGEList 7 | from edgePy.data_import.data_import import get_dataset_path 8 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 9 | 10 | TEST_GENE_SET_DATA = "transcripts_homo_sapiens_core_75_37.tsv" 11 | TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv" 12 | 13 | TEST_DATASET = "GSE49712_HTSeq.txt.gz" 14 | TEST_DATASET_NPZ = "GSE49712_HTSeq.txt.npz" 15 | TEST_GROUPS = "groups.json" 16 | 17 | 18 | @pytest.fixture 19 | def dge_list(): 20 | with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open( 21 | get_dataset_path(TEST_GROUPS), 'r' 22 | ) as group_handle: 23 | return DGEList.create_DGEList_handle(data_handle, group_handle) 24 | 25 | 26 | def test_sample_by_group(): 27 | samples = ["A", "B", "C", "D", "E"] 28 | expected_output = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]} 29 | group_by_sample = ['group1', 'group1', 'group2', 'group2', 'group2'] 30 | output = DGEList._sample_group_dict(group_by_sample, samples) 31 | assert output == expected_output 32 | 33 | 34 | def test_sample_group_list(): 35 | samples = ["A", "B", "C", "D", "E"] 36 | sample_by_group = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]} 37 | expected_output = np.array(['group1', 'group1', 'group2', 'group2', 'group2']) 38 | output = DGEList._sample_group_list(sample_by_group, samples) 39 | assert np.array_equal(output, expected_output) 40 | 41 | 42 | def test_minimal_init(): 43 | 44 | dge_list = DGEList( 45 | to_remove_zeroes=False, 46 | counts=np.ones(shape=(5, 5)), 47 | samples=["A", "B", "C", "D", "E"], 48 | genes=["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"], 49 | groups_in_dict={'group1': ["A", "B"], 'group2': ["C", "D", "E"]}, 50 | ) 51 | assert dge_list.__repr__() == "DGEList(num_samples=5, num_genes=5)" 52 | 53 | 54 | def test_too_much(): 55 | # TODO: Refactor into smaller units. 
56 | # - Test blank non-parameterized `DGEList()` 57 | # - Test opening handles, both gzipped or not 58 | # - Test samples and genes are set, validated, typed right 59 | assert len(dge_list().samples) == 10 60 | assert len(dge_list().genes) == 21711 61 | 62 | 63 | def test_too_many_options(): 64 | with pytest.raises(Exception): 65 | DGEList(counts=np.zeros(shape=(5, 10)), filename=str(get_dataset_path(TEST_DATASET_NPZ))) 66 | 67 | 68 | def test_too_many_options2(): 69 | with pytest.raises(Exception): 70 | DGEList(counts=np.ones(shape=(5, 10)), filename=str(get_dataset_path(TEST_DATASET_NPZ))) 71 | 72 | 73 | def test_library_size(): 74 | dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) 75 | assert np.array_equal( 76 | dge_list.library_size, 77 | np.array( 78 | [ 79 | 63_579_607, 80 | 58_531_933, 81 | 39_138_521, 82 | 78_565_885, 83 | 48_667_119, 84 | 62_799_917, 85 | 66_032_107, 86 | 66_194_776, 87 | 55_085_875, 88 | 37_760_315, 89 | ] 90 | ), 91 | ) 92 | 93 | 94 | def test_setting_DGElist_counts(): 95 | 96 | dge_list = DGEList( 97 | counts=np.zeros(shape=(5, 10)), 98 | groups_in_list=['A', 'A', 'B', 'B', 'B'], 99 | samples=['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9'], 100 | ) 101 | assert 5 == dge_list.counts.shape[0] 102 | assert 10 == dge_list.counts.shape[1] 103 | 104 | with pytest.raises(ValueError): 105 | c = np.array([[1, 1, 1], [-1, 1, 1]]) 106 | DGEList(counts=c, groups_in_list=["a", "b"]) 107 | with pytest.raises(ValueError): 108 | c = np.array([[1, 1, 1], [np.nan, 1, 1]]) 109 | DGEList(counts=c, groups_in_list=["a", "b"]) 110 | with pytest.raises(ValueError): 111 | c = np.array([1, 1, 1]) 112 | DGEList(counts=c, groups_in_list=["a", "b"]) 113 | with pytest.raises(TypeError): 114 | c = [1, 1, 1] 115 | dge_list.counts = c 116 | 117 | 118 | def test_cycle_dge_npz(): 119 | 120 | import tempfile 121 | import os 122 | 123 | tempdir = tempfile.mkdtemp(prefix="edgePy_tmp") 124 | file_name = tempdir + os.sep + next(tempfile._get_candidate_names()) 125 | dge_list_first = dge_list() 126 | dge_list_first.write_npz_file(filename=file_name) 127 | 128 | dge_list_second = DGEList(filename=file_name + ".npz") 129 | assert np.array_equal(dge_list_first.counts, dge_list_second.counts) 130 | assert np.array_equal(dge_list_first.genes, dge_list_second.genes) 131 | assert np.array_equal(dge_list_first.samples, dge_list_second.samples) 132 | assert np.array_equal(dge_list_first.norm_factors, dge_list_second.norm_factors) 133 | assert np.array_equal(dge_list_first.groups_list, dge_list_second.groups_list) 134 | os.remove(file_name + ".npz") 135 | os.rmdir(tempdir) 136 | 137 | 138 | def testing_setting_samples_and_counts(): 139 | # Empty list should fail 140 | with pytest.raises(Exception): 141 | DGEList( 142 | to_remove_zeroes=False, 143 | groups_in_list=['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'], 144 | ) 145 | 146 | # Lists with just counts should fail 147 | with pytest.raises(ValueError): 148 | DGEList(counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), groups_in_list=['A', 'A', 'B']) 149 | 150 | # lists sith samples and counts and groups should pass: 151 | DGEList( 152 | counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), 153 | groups_in_list=['A', 'A', 'B'], 154 | samples=["S1", 'S2', 'S3'], 155 | ) 156 | 157 | # Lists with just samples should fail 158 | with pytest.raises(Exception): 159 | DGEList( 160 | samples=np.array(["1", "2", "3"]), 161 | to_remove_zeroes=False, 162 | groups_in_list=['A', 'A', 'B'], 163 | ) 164 | 165 | # Properly formed samples and 
counts should pass 166 | DGEList( 167 | samples=np.array(["1", "2", "3"]), 168 | counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), 169 | groups_in_list=['A', 'A', 'B'], 170 | ) 171 | 172 | # Lists with ill-matched samples and counts should fail 173 | pytest.raises( 174 | ValueError, 175 | "DGEList(samples = np.array(['2', '3'])," 176 | " counts = np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]))", 177 | ) 178 | 179 | 180 | def test_repr(): 181 | assert dge_list().__repr__() == "DGEList(num_samples=10, num_genes=21,711)" 182 | 183 | 184 | def test_broken_dge_caGENE_SYMBOL_SQLll(): 185 | with pytest.raises(Exception): 186 | DGEList(filename="fake_filename", counts=np.array([[1, 1, 1], [1, 1, 1]])) 187 | with pytest.raises(Exception): 188 | DGEList(counts=None) 189 | 190 | 191 | def test_cpm(): 192 | dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) 193 | first_pos = dge_list.counts[0][0] 194 | col_sum = np.sum(dge_list.counts, axis=0) 195 | assert isinstance(first_pos, np.integer) 196 | new_dge_list = dge_list.cpm() 197 | assert new_dge_list.counts[0][0] == first_pos * 1e6 / col_sum[0] 198 | 199 | 200 | def test_rpkm(): 201 | dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) 202 | icd = CanonicalDataStore( 203 | get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) 204 | ) 205 | first_pos = dge_list.counts[0][0] 206 | first_gene = dge_list.genes[0] 207 | 208 | col_sum = np.sum(dge_list.counts, axis=0) 209 | assert isinstance(first_pos, np.integer) 210 | rpm_dge = dge_list.rpkm(icd) 211 | ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) 212 | gene_len = icd.get_length_of_canonical_transcript(ensg_gene) 213 | # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000) 214 | assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6))) 215 | 216 | 217 | def test_tpm(): 218 | # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w 219 | counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]]) 220 | gene_lengths = np.array([2000, 4000, 1000, 10000]) 221 | 222 | expected = np.array( 223 | [ 224 | [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26], 225 | [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26], 226 | [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26], 227 | [0.0, 0.0, 2217.294_900_22], 228 | ] 229 | ) 230 | 231 | dge_list = DGEList( 232 | counts=counts, 233 | samples=np.array(['a', 'b', 'c']), 234 | genes=np.array(['a', 'b', 'c', 'd']), 235 | groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']}, 236 | ) 237 | assert isinstance(dge_list.counts[0][0], np.integer) 238 | new_dge_list = dge_list.tpm(gene_lengths) 239 | 240 | assert np.allclose(new_dge_list.counts, expected, atol=1e-1) 241 | 242 | # make sure that the sums of all genes across are the same the each sample (an important property of TPM) 243 | gene_sums = new_dge_list.counts.sum(axis=0) 244 | assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums)) 245 | 246 | 247 | # Unit tests for ``edgePy.data_import.Importer``.\ 248 | def test_init(): 249 | dge_list = DGEList.create_DGEList_data_file( 250 | data_file=get_dataset_path(TEST_DATASET), group_file=get_dataset_path(TEST_GROUPS) 251 | ) 252 | 253 | assert dge_list.__repr__() == "DGEList(num_samples=10, num_genes=21,711)" 254 | 255 | dge_list = DGEList.create_DGEList_handle( 256 | data_handle=smart_open(get_dataset_path(TEST_DATASET)), 257 | 
group_handle=smart_open(get_dataset_path(TEST_GROUPS)), 258 | ) 259 | 260 | assert dge_list.__repr__() == "DGEList(num_samples=10, num_genes=21,711)" 261 | 262 | 263 | # TestGroupImporter. 264 | def test_create_DGEList_handle_init(): 265 | dge_list = DGEList.create_DGEList_handle( 266 | data_handle=smart_open(get_dataset_path(TEST_DATASET)), 267 | group_handle=smart_open(get_dataset_path(TEST_GROUPS)), 268 | ) 269 | assert 2 == len(dge_list.groups_dict) 270 | assert 5 == len(dge_list.groups_dict["Group 1"]) 271 | assert 5 == len(dge_list.groups_dict["Group 2"]) 272 | 273 | assert dge_list.samples.shape == dge_list.groups_list.shape 274 | 275 | 276 | # Unit tests for packaged (optionally zipped during install) data. 277 | def test_get_data_stream(): 278 | """Tests finding packaged data with ``pkgutil.get_data()``""" 279 | pkgutil.get_data("edgePy", "data/GSE49712_HTSeq.txt.gz") 280 | 281 | 282 | def test_create_DGEList(): 283 | """Tests the function that converts data into a DGE_List object""" 284 | samples = ["AAA", "BBB", "CCC"] 285 | genes = ["ENSG001", "ENSG002"] 286 | 287 | data_set = { 288 | "AAA": {"ENSG001": 10, "ENSG002": 20}, 289 | "BBB": {"ENSG001": 15, "ENSG002": 40}, 290 | "CCC": {"ENSG001": 20, "ENSG002": 80}, 291 | } 292 | categories = {"One": ["AAA", "BBB"], "Two": ["CCC"]} 293 | 294 | dge_list = DGEList.create_DGEList( 295 | sample_list=samples, data_set=data_set, gene_list=genes, category_to_samples=categories 296 | ) 297 | 298 | assert np.array_equal(dge_list.samples, np.array(samples)) 299 | # 2 rows (genes), 3 columns(samples) 300 | assert np.array_equal(dge_list.counts, np.array([[10, 15, 20], [20, 40, 80]])) 301 | 302 | assert np.array_equal(dge_list.groups_list, np.array(["One", "One", "Two"])) 303 | assert dge_list.groups_dict, {"One:"} 304 | assert np.array_equal(dge_list.genes, np.array(genes)) 305 | -------------------------------------------------------------------------------- /tests/test_edgePy.py: -------------------------------------------------------------------------------- 1 | # from edgePy.edgePy import parse_arguments 2 | 3 | 4 | def test_parse_argumants(): 5 | text_file = "file.txt" 6 | groups_file = "groups.txt" 7 | # args = parse_arguments(['--count_file', text_file, "--groups_file", groups_file]) 8 | # eq_(text_file, args.count_file) 9 | # eq_(groups_file, args.groups_file) 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.1.2 3 | skip_missing_interpreters = true 4 | envlist = 5 | py36 6 | py36-lint 7 | py36-type 8 | py36-docs 9 | 10 | [testenv] 11 | description = run the test suite with (basepython) 12 | deps = -rrequirements-test.txt 13 | commands = pytest {posargs} 14 | 15 | [testenv:py36-lint] 16 | description = check the code style 17 | basepython = python3.6 18 | commands = 19 | black -v --check {toxinidir} 20 | flake8 {toxinidir}/edgePy 21 | pylint {toxinidir}/edgePy --errors-only --output-format=colorized 22 | 23 | [testenv:py36-type] 24 | description = type check the library 25 | basepython = python3.6 26 | commands = mypy {toxinidir}/edgePy 27 | 28 | [testenv:py36-docs] 29 | description = test building of HTML docs 30 | basepython = python3.6 31 | deps: -rdocs/requirements-docs.txt 32 | commands = sphinx-build docs/source {toxworkdir}/docs/_build -a --color -W -bhtml {posargs} 33 | 34 | [testenv:dev] 35 | description = the official edgePy development environment 36 | envdir = venv 37 | 
basepython = python3.6 38 | usedevelop = True 39 | commands = 40 | python -m pip list --format=columns 41 | python -c 'print("\n\nTo activate type: `source venv/bin/activate`\n\n")' 42 | --------------------------------------------------------------------------------
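# A short usage sketch for the [testenv:dev] environment above, assuming only the settings
# shown (`envdir = venv`, `usedevelop = True`, and tox >= 3.1.2 per the [tox] minversion):
#
#   tox -e dev                  # build the development environment into ./venv as an editable install
#   source venv/bin/activate    # activate it, as the final command above prints
#
# Once activated, `python -m pip list --format=columns` should show edgePy installed in
# development (editable) mode.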