├── .codecov.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── requirements-docs.txt └── source │ ├── CONTRIBUTING.md │ ├── Links.md │ ├── conf.py │ ├── data-formats.md │ ├── edgePy.data_import.mongodb.rst │ ├── edgePy.data_import.rst │ ├── edgePy.rst │ ├── functionality.md │ ├── index.rst │ ├── modules.rst │ ├── planned_statistical_tests.rst │ └── scripts.rst ├── edgePy ├── DGEList.py ├── __init__.py ├── benchmarking │ ├── 00.GSE49712.Rscript.txt │ ├── 00.GSE49712_gene_FPKM.txt │ ├── 01.diagnostic_fig1.png │ ├── 02.diagnostic_fig2.png │ ├── 03.mds.png │ ├── 04.diagnostic_fig3.png │ ├── 05.diagnostic_fig4.png │ ├── 06.heatmap_fig5.png │ ├── 07.DEGs.tsv │ ├── 08.topDEGs.tsv │ ├── 09.analysis.RNAseq.gse49712.Rdata │ └── README_benchmark.md ├── data │ ├── GSE49712_HTSeq.txt.gz │ ├── GSE49712_HTSeq.txt.npz │ ├── example_gene_list.txt │ ├── groups.json │ ├── symbols_homo_sapiens_core_75_37.tsv │ └── transcripts_homo_sapiens_core_75_37.tsv ├── data_import │ ├── __init__.py │ ├── data_import.py │ ├── ensembl │ │ ├── __init__.py │ │ ├── canonical_transcripts.py │ │ ├── ensembl_flat_file_reader.py │ │ └── mysql_wrapper.py │ └── mongodb │ │ ├── __init__.py │ │ ├── gene_functions.py │ │ ├── mongo_import.py │ │ └── mongo_wrapper.py └── util.py ├── pyproject.toml ├── pytest.ini ├── requirements-test.txt ├── scripts ├── __init__.py └── edgepy.py ├── setup.cfg ├── setup.py ├── tests ├── ensembl │ ├── test_canonical_transcripts.py │ └── test_ensembl_flat_file_reader.py ├── mongodb │ ├── fixtures │ │ ├── RNASeq.json │ │ ├── ensg_by_symbol.json │ │ ├── samples.json │ │ └── symbol_by_ensg.json │ ├── test_gene_functions.py │ ├── test_mongo_import.py │ └── test_mongo_wrapper.py ├── test_DGEList.py └── test_edgePy.py └── tox.ini /.codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | precision: 1 3 | round: down 4 | status: 5 | project: 6 | default: 7 | enabled: yes 8 | target: 90% 9 | threshold: 0.25% 10 | patch: 11 | default: 12 | target: auto 13 | 14 | comment: 15 | layout: "diff" 16 | behavior: default 17 | require_changes: no 18 | require_base: no 19 | require_head: yes 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Uncompressed project data files 2 | edgePy/data/*.csv 3 | edgePy/data/*.tsv 4 | edgePy/data/*.txt 5 | !edgePy/data/groups.txt 6 | !edgePy/data/example_gene_list.txt 7 | !edgePy/data/transcripts_homo_sapiens_core_75_37.tsv 8 | !edgePy/data/symbols_homo_sapiens_core_75_37.tsv 9 | 10 | # Intellij project files 11 | .idea 12 | 13 | # VS Code project files 14 | .vscode/ 15 | 16 | # Concatenated from the following sources on 2018-05-08: 17 | # 18 | # Lang. 
URI 19 | # Python https://github.com/github/gitignore/blob/18e28746b0862059dbee8694fd366a679cb812fb/Python.gitignore 20 | # R https://github.com/github/gitignore/blob/18e28746b0862059dbee8694fd366a679cb812fb/R.gitignore 21 | # 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/oh 43 | var/ 44 | wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/` 62 | .tox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # pyenv 98 | .python-version 99 | 100 | # celery beat schedule file 101 | celerybeat-schedule 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | 128 | # History files 129 | .Rhistory 130 | .Rapp.history 131 | 132 | # Session Data files 133 | .RData 134 | 135 | # Example code in package build process 136 | *-Ex.R 137 | 138 | # Output files from R CMD build 139 | /*.tar.gz 140 | 141 | # Output files from R CMD check 142 | /*.Rcheck/ 143 | 144 | # RStudio files 145 | .Rproj.user/ 146 | 147 | # produced vignettes 148 | vignettes/*.html 149 | vignettes/*.pdf 150 | 151 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 152 | .httr-oauth 153 | 154 | # knitr and R markdown default cache directories 155 | /*_cache/ 156 | /cache/ 157 | 158 | # Temporary files created by R markdown 159 | *.utf8.md 160 | *.knit.md 161 | 162 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 163 | rsconnect/ 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 6 | setup_py_install: true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 
python: 3 | - 3.6 4 | # Corrupt `boto.cfg` on TravisCI images 5 | # https://github.com/travis-ci/travis-ci/issues/7940 6 | before_install: 7 | - sudo rm -f /etc/boto.cfg 8 | install: 9 | - pip install codecov tox-travis 10 | script: 11 | - tox 12 | after_success: 13 | - codecov 14 | notifications: 15 | email: false 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright © 2018 [R-Bioinformatics Group] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include edgePy/data/* 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # edgePy 2 | 3 | ### Notice: 4 | 5 | This project is still in development. While we are a small band of bioinformaticians with big goals and aspirations, this code base is still too new for use on any real world projects. 6 | While there's no official timeline for the project, functionality is being developed rapidly, so please feel free to check back on our progress frequently. If you'd like to do more 7 | than just check on our progress, we're always happy to welcome new members of the community, both to slack group where we're organizing this project, as well as on the git hub repository 8 | hosting the project. To join the slack, send your email to @apfejes (on github) or /u/apfejes on reddit - we're looking forward to working with you. 
9 | 10 | [//]: # (TODO: Remove sample-sheet dummy library until we release on PyPi) 11 | [![PyPI Version](https://badge.fury.io/py/edgePy.svg)](https://pypi.org/project/edgePy) 12 | [![Build Status](https://travis-ci.org/r-bioinformatics/edgePy.svg?branch=master)](https://travis-ci.org/r-bioinformatics/edgePy) 13 | [![Documentation Status](https://readthedocs.org/projects/edgepy/badge/?version=latest)](http://edgepy.readthedocs.io/en/latest/?badge=latest) 14 | [![codecov](https://codecov.io/gh/r-bioinformatics/edgePy/branch/master/graph/badge.svg)](https://codecov.io/gh/r-bioinformatics/edgePy) 15 | [![Checked with MyPy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 16 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 17 | [![GitHub License](https://img.shields.io/pypi/l/sample-sheet.svg)](https://github.com/r-bioinformatics/edgePy/blob/master/LICENSE) 18 | 19 | 20 | The `edgePy` library will become an implementation of [`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html) for differential expression analysis in the Python language. 21 | This library will have advantages over [`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html) in that it will be well-tested and will run faster by utilizing Cythonized routines. 22 | `edgePy` will maintain the functionality of [`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html) in that it's primary goals are differential expression analysis of RNA-Seq expression profiles with biological replication. 23 | The statistical methods for negative binomial distributions will include empirical Bayes estimations, exact tests, generalized linear models, and quasi-likelihood tests. 24 | 25 | ## Project Aims 26 | 27 | The `edgePy` library will be used for data import, normalization with respect to conditions, application of generalized linear models, and visualization. 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = --color 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = EdgePy 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | recommonmark 2 | Sphinx >= 1.4.4 3 | sphinx_rtd_theme 4 | tox >= 3.1.2 5 | -------------------------------------------------------------------------------- /docs/source/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository discuss the change you wish to make _via_ this project's [GitHub issues](https://github.com/r-bioinformatics/edgePy/issues) first. 
4 | 5 | ## PR Process for Project Contributers 6 | 7 | Always ensure that you have fetched (_via_ `git pull`) the most recent material into your local clone. 8 | 9 | 1. Checkout a branch (`git checkout -b `) prefixed with your initials and suffixed with the issue you are addressing or a brief few words describing the feature/bug fix joined by underscores (`_`). Here are valid formats: 10 | - `cv_issue_45` 11 | - `af_issue_2123` 12 | - `cv_fix_requests_regression` 13 | - `af_transpose_docs` 14 | 2. Commit changes. 15 | 3. Execute and create tests regularly. Use `py.test`. 16 | 4. Request informal review from peers by pointing them to your branch. 17 | 5. Create a Pull Request against `master` when a formal review is needed. 18 | 6. Optionally, squash commits and reword messages as needed for easier review. 19 | 7. Ensure all continuous integration (CI) tests and code reviews pass before rebasing (or squashing and then rebasing) onto `master`. 20 | 21 | - Avoid directly merging a PR onto `master` without first rebasing. 22 | 23 | ## Documentation and Code Style 24 | 25 | 1. Strictly adhere to PEP8. 26 | 2. Use [Google Style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) docstrings. 27 | 3. Implement doctests. 28 | 4. Provide accurate type annotations. 29 | 5. Limit line lengths to 120 characters. 30 | 31 | An example function showcasing the above requirements: 32 | 33 | ```python 34 | def get_dataset_path( 35 | filename: Union[str, Path], 36 | dead_arg: Optional[Any] = None 37 | ) -> Path: 38 | """Example function with PEP 484 type annotations. 39 | 40 | Args: 41 | filename: The first parameter. 42 | dead_arg: The second parameter. 43 | 44 | Returns: 45 | The path to the dataset, may not really exist. 46 | 47 | Examples: 48 | >>> from module.io import get_dataset_path 49 | >>> str(get_dataset_path("GSE49712_HTSeq.txt.gz")) # doctest:+ELLIPSIS 50 | '.../data/GSE49712_HTSeq.txt.gz' 51 | 52 | Notes: 53 | 1. See ``module.rationalize`` for an equivalent method. 54 | 55 | """ 56 | import module 57 | directory = Path(module.__file__).expanduser().resolve().parent 58 | return directory / 'data' / filename 59 | ``` 60 | 61 | ### Code Style 62 | This repository uses [Black](https://github.com/ambv/black) as a code formatter. 63 | 64 | It can be ran a few different ways: 65 | 66 | 1. Manually by running `$ black .` in the repository root 67 | 2. Though [pre-commit](https://pre-commit.com/) a git hook that runs it whenever a commit is made. 68 | 3. It can also be integrated in the of your choice by following the instructions in the [documention](https://github.com/ambv/black#editor-integration). 69 | 70 | ## Updating the documentation 71 | 72 | New documentation files must be of the following format: 73 | - reStructuredText (**.rst**) -- _preferred_ 74 | - Markdown (**.md**) 75 | 76 | A new file can be added to the appropriate gloassary tree in `edgePy/docs/source/index.rst`. 77 | 78 | The service `readthedocs.org` will automatically source the *conf.py* file in `edgePy/docs/sources/conf.py` and update the docs accordingly on each commit pushed to GitHub, on any branch. 79 | 80 | Local HTML renders of the documentation can be built with the following: 81 | 82 | ```bash 83 | ❯ cd edgePy/docs 84 | ❯ pip install -r requirements-docs.txt 85 | ❯ make html 86 | ``` 87 | 88 | This will create or update the HTML documents in the `\docs\_build\html` directory. 
89 | 90 | ## Developing in a Virtual Environment 91 | 92 | The development environment is listed as an additional `Tox` environment: 93 | 94 | ```bash 95 | ❯ tox -lv 96 | 97 | using tox.ini: .../edgePy/tox.ini 98 | using tox-3.1.2 from .../python3.6/dist-packages/tox/__init__.py 99 | default environments: 100 | py36 -> run the test suite with (basepython) 101 | py36-lint -> check the code style 102 | py36-type -> type check the library 103 | py36-docs -> test building of HTML docs 104 | 105 | additional environments: 106 | dev -> the official edgePy development environment 107 | ``` 108 | 109 | To create and activate that environment issue the following: 110 | 111 | ```bash 112 | ❯ cd edgePy 113 | # Create the development environment (force recreation) 114 | ❯ tox --recreate -e dev 115 | # Activate the development environment 116 | ❯ source venv/bin/activate 117 | 118 | ``` 119 | 120 | ## Running the Test Suite 121 | 122 | All tests are coordinated by `Tox`. Running the unit tests, code coverage, code style (linting) checks, static analysis of typing, and successful compilation of the docs is as simple as the following commands! 123 | 124 | > **Note**: This command takes a long time the first time it is invoked since all virtual environments need to be created! 125 | 126 | ```bash 127 | ❯ cd edgePy 128 | ❯ tox 129 | ``` 130 | 131 | ## Running Parts of the Test Suite 132 | 133 | You can select only a part of the test suite by looking at which `Tox` groups are available: 134 | 135 | ```bash 136 | ❯ cd edgePy 137 | ❯ tox -lv 138 | 139 | using tox.ini: ../edgePy/tox.ini 140 | using tox-3.1.2 from ../python3.6/dist-packages/tox/__init__.py 141 | default environments: 142 | py36 -> run the test suite with (basepython) 143 | py36-lint -> check the code style 144 | py36-type -> type check the library 145 | py36-docs -> test building of HTML docs 146 | ``` 147 | 148 | Choose a specific group to run with the following syntax: 149 | 150 | ```bash 151 | ❯ cd edgePy 152 | ❯ tox -e py36-type 153 | ``` 154 | 155 | Almost all dynamic and static analysis tools are configured in `setup.cfg` so check there for the configuration of the test suite first. 156 | -------------------------------------------------------------------------------- /docs/source/Links.md: -------------------------------------------------------------------------------- 1 | # Links 2 | 3 | Project Notes: [Google doc]( 4 | https://docs.google.com/document/d/1s-GMQld8DYtfxupELuYw2VtpXL6gSAUMCqITSmi7Udg/edit#heading=h.uk4y7e3dhmh6) -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Configuration file for the Sphinx documentation builder. 5 | # 6 | # This file does only contain a selection of the most common options. For a 7 | # full list see the documentation: 8 | # http://www.sphinx-doc.org/en/master/config 9 | 10 | # -- Path setup -------------------------------------------------------------- 11 | 12 | # If extensions (or modules to document with autodoc) are in another directory, 13 | # add these directories to sys.path here. If the directory is relative to the 14 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
15 | # 16 | import os 17 | import sys 18 | 19 | sys.path.insert(0, os.path.abspath('../../..')) 20 | sys.path.insert(0, os.path.abspath('../..')) 21 | sys.path.insert(0, os.path.abspath('../')) 22 | 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = "EdgePy" 27 | copyright = "2018, R-Bioinformatics" 28 | author = "R-Bioinformatics" 29 | 30 | # The short X.Y version 31 | version = "" 32 | # The full version, including alpha/beta/rc tags 33 | release = "0.0.1" 34 | 35 | 36 | # -- General configuration --------------------------------------------------- 37 | 38 | # If your documentation needs a minimal Sphinx version, state it here. 39 | # 40 | # needs_sphinx = '1.0' 41 | 42 | # Add any Sphinx extension module names here, as strings. They can be 43 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 44 | # ones. 45 | extensions = [ 46 | 'sphinx.ext.viewcode', 47 | 'sphinx.ext.mathjax', 48 | 'sphinx.ext.intersphinx', 49 | 'sphinx.ext.autodoc', 50 | 'sphinx.ext.napoleon', 51 | # 'sphinx_autodoc_annotation', 52 | # 'sphinx_autodoc_napoleon_typehints', 53 | ] 54 | 55 | napoleon_include_private_with_doc = True 56 | napoleon_google_docstring = True 57 | napoleon_numpy_docstring = False 58 | napoleon_use_param = False 59 | napoleon_use_ivar = False 60 | napoleon_use_rtype = True 61 | 62 | intersphinx_mapping = { 63 | 'python': ('https://docs.python.org/3', None), 64 | 'requests': ('http://docs.python-requests.org/en/latest/', None), 65 | } 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ['_templates'] 69 | 70 | # The suffix(es) of source filenames. 71 | # You can specify multiple suffix as a list of string: 72 | # 73 | 74 | # MARKDOWN PARSER 75 | source_suffix = ['.rst', '.md'] 76 | source_parsers = {'.md': 'recommonmark.parser.CommonMarkParser'} 77 | 78 | # The master toctree document. 79 | master_doc = 'index' 80 | 81 | # The language for content autogenerated by Sphinx. Refer to documentation 82 | # for a list of supported languages. 83 | # 84 | # This is also used if you do content translation via gettext catalogs. 85 | # Usually you set "language" from the command line for these cases. 86 | language = 'en' 87 | 88 | # List of patterns, relative to source directory, that match files and 89 | # directories to ignore when looking for source files. 90 | # This pattern also affects html_static_path and html_extra_path . 91 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = 'default' 95 | 96 | # -- Options for HTML output ------------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. 99 | # See the documentation for a list of builtin themes. 100 | # 101 | html_theme = "sphinx_rtd_theme" 102 | 103 | # Theme options are theme-specific and customize the look and feel of a theme 104 | # further. For a list of options available for each theme, see the 105 | # documentation. 
106 | # 107 | html_theme_options = { 108 | 'canonical_url': '', 109 | 'analytics_id': '', 110 | 'logo_only': False, 111 | 'display_version': True, 112 | 'prev_next_buttons_location': 'bottom', 113 | 'style_external_links': False, 114 | # 'vcs_pageview_mode': '', 115 | # Toc options 116 | 'collapse_navigation': True, 117 | 'sticky_navigation': True, 118 | 'navigation_depth': 4, 119 | 'includehidden': True, 120 | 'titles_only': False, 121 | } 122 | 123 | autodoc_mock_imports = ['pymongo'] 124 | 125 | # html_theme_path = ["_themes/sphinx_rtd_theme", ] 126 | 127 | # Add any paths that contain custom static files (such as style sheets) here, 128 | # relative to this directory. They are copied after the builtin static files, 129 | # so a file named "default.css" will overwrite the builtin "default.css". 130 | # html_static_path = ['_static'] 131 | 132 | # Custom sidebar templates, must be a dictionary that maps document names 133 | # to template names. 134 | # 135 | # The default sidebars (for documents that don't match any pattern) are 136 | # defined by theme itself. Builtin themes are using these templates by 137 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 138 | # 'searchbox.html']``. 139 | # 140 | # html_sidebars = {} 141 | 142 | 143 | # -- Options for HTMLHelp output --------------------------------------------- 144 | 145 | # Output file base name for HTML help builder. 146 | htmlhelp_basename = "EdgePydoc" 147 | 148 | 149 | # -- Options for LaTeX output ------------------------------------------------ 150 | 151 | latex_elements = { 152 | # The paper size ('letterpaper' or 'a4paper'). 153 | # 154 | # 'papersize': 'letterpaper', 155 | # The font size ('10pt', '11pt' or '12pt'). 156 | # 157 | # 'pointsize': '10pt', 158 | # Additional stuff for the LaTeX preamble. 159 | # 160 | # 'preamble': '', 161 | # Latex figure (float) alignment 162 | # 163 | # 'figure_align': 'htbp', 164 | } 165 | 166 | # Grouping the document tree into LaTeX files. List of tuples 167 | # (source start file, target name, title, 168 | # author, documentclass [howto, manual, or own class]). 169 | latex_documents = [ 170 | (master_doc, 'EdgePy.tex', 'EdgePy Documentation', 'R-Bioinformatics', 'manual') 171 | ] 172 | 173 | 174 | # -- Options for manual page output ------------------------------------------ 175 | 176 | # One entry per manual page. List of tuples 177 | # (source start file, name, description, authors, manual section). 178 | man_pages = [(master_doc, 'edgepy', 'EdgePy Documentation', [author], 1)] 179 | 180 | 181 | # -- Options for Texinfo output ---------------------------------------------- 182 | 183 | # Grouping the document tree into Texinfo files. 
List of tuples 184 | # (source start file, target name, title, author, 185 | # dir menu entry, description, category) 186 | texinfo_documents = [ 187 | ( 188 | master_doc, 189 | 'EdgePy', 190 | 'EdgePy Documentation', 191 | author, 192 | 'EdgePy', 193 | 'One line description of project.', 194 | 'Miscellaneous', 195 | ) 196 | ] 197 | -------------------------------------------------------------------------------- /docs/source/data-formats.md: -------------------------------------------------------------------------------- 1 | ## Input 2 | 3 | Input should be given in tab-delimited format with the following header column names: 4 | 5 | - `FeatureID` (Can be gene, transcript, splice variant or protein) 6 | - `SampleID` (Generic label to keep track of an individual sample) 7 | 8 | Input main content: 9 | 10 | - `Data Matrix` A numeric matrix (Previously `counts`). 11 | A numeric matrix. Each row represents a single feature and each column represent a single sample. 12 | 13 | Sample data can be found in the [`data/`](../edgePy/data/) folder, which was derived from data on the [NCBI Gene Expression Ombnibus](https://www.ncbi.nlm.nih.gov/geo/). 14 | -------------------------------------------------------------------------------- /docs/source/edgePy.data_import.mongodb.rst: -------------------------------------------------------------------------------- 1 | edgePy.data\_import.mongodb package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | edgePy.data\_import.mongodb.gene\_functions module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: edgePy.data_import.mongodb.gene_functions 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | edgePy.data\_import.mongodb.mongo\_import module 16 | ------------------------------------------------ 17 | 18 | .. automodule:: edgePy.data_import.mongodb.mongo_import 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | edgePy.data\_import.mongodb.mongo\_wrapper module 24 | ------------------------------------------------- 25 | 26 | .. automodule:: edgePy.data_import.mongodb.mongo_wrapper 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: edgePy.data_import.mongodb 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/edgePy.data_import.rst: -------------------------------------------------------------------------------- 1 | edgePy.data\_import package 2 | =========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | edgePy.data_import.mongodb 10 | 11 | Submodules 12 | ---------- 13 | 14 | edgePy.data\_import.data\_import module 15 | --------------------------------------- 16 | 17 | .. automodule:: edgePy.data_import.data_import 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: edgePy.data_import 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/edgePy.rst: -------------------------------------------------------------------------------- 1 | edgePy package 2 | ============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. 
toctree:: 8 | 9 | edgePy.data_import 10 | 11 | Submodules 12 | ---------- 13 | 14 | edgePy.DGEList module 15 | --------------------- 16 | 17 | .. automodule:: edgePy.DGEList 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: edgePy 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/functionality.md: -------------------------------------------------------------------------------- 1 | # Functionality 2 | 3 | OMICs analysis is made easy with R tools such as “edgeR” and “limma” packages. R has serious limitations when applied to large datasets. 4 | 5 | The First objective of edgePY is to offer an alternative free tool for such analysis. 6 | 7 | # Components 8 | 9 | Input(1) -> Normalization(2) -> Analysis(3) -> Visualization/Results(4). 10 | 11 | ## Input 12 | 13 | Read correctly the file 14 | 15 | A data matrix separated by tab. Of genes/proteins in lines and samples/observations in columns. Groups for the main analysis usually are defined there, or assigned to the samples. 16 | 17 | ## Normalization 18 | 19 | Quality -> Library -> (TMM or RLE or upperquartile or none) -> commonDispersion -> TagwiseDispersion -> Norm. Matrix 20 | 21 | ## Analysis 22 | 23 | Norm. Matrix -> Set the sample groups to be compared -> Statistical analysis of choice (ebayes/treat/QLF) -> DE genes list and statistics 24 | 25 | ## Visualization/Results 26 | 27 | DE genes list / Statistics -> Visualization ( Venn / The mean-variance relationship of log-CPM / Heatmaps / Volcano plots / Dispersion plots 28 | 29 | 30 | More details should be added as we progress in the coding. 31 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. EdgePy documentation master file, created by 2 | sphinx-quickstart on Thu Jul 12 11:20:59 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to EdgePy's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Main Documentation 12 | 13 | functionality 14 | data-formats 15 | planned_statistical_tests 16 | CONTRIBUTING 17 | Links 18 | modules 19 | 20 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | edgePy 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | edgePy 8 | scripts 9 | -------------------------------------------------------------------------------- /docs/source/planned_statistical_tests.rst: -------------------------------------------------------------------------------- 1 | Planned statistical tests 2 | ============================= 3 | 4 | This table includes statistical tests that will be implemented within this package. 
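As a rough illustration only, the first entry below (multiple exact binomial tests) might map onto ``scipy.stats`` along the following lines. The helper name and signature are hypothetical and not part of edgePy:

.. code-block:: python

    from typing import List

    from scipy import stats


    def exact_binom_test(
        counts_a: List[int], counts_b: List[int], lib_size_a: int, lib_size_b: int
    ) -> List[float]:
        """Per-gene exact binomial test between two libraries (sketch only).

        Follows the idea behind edgeR's binomTest: conditional on the total count
        observed for a gene, the count in library A is binomial with success
        probability equal to library A's share of the combined library size.
        """
        p_a = lib_size_a / (lib_size_a + lib_size_b)
        p_values = []
        for x_a, x_b in zip(counts_a, counts_b):
            total = x_a + x_b
            # A gene with no reads in either library carries no information.
            p_values.append(1.0 if total == 0 else stats.binom_test(x_a, n=total, p=p_a))
        return p_values

A production implementation would also need vectorisation and multiple-testing correction, which is why the table tracks the corresponding R sources.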
5 | 6 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 7 | | Statistical test | R src location | Python test link | Python library | Notes | 8 | +=================================================+===============================+=========================================================================================+======================+==========================================================+ 9 | | Multiple exact binomial tests | edgeR/R/binomTest.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom_test.html | scipy | | 10 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 11 | | F test | edgeR/R/decidetestsDGE.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f.html | scipy | | 12 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 13 | | Linear Modelling | limma/R/lmfit.R | https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.linregress.html | scipy | lmFit and contrasts. fit | 14 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 15 | | Negative Binomial | edgeR/R/glmfit.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html | scipy | | 16 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 17 | | Z score of Binomial | edgerR/R/zscoreNBinom.R | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.zscore.html | scipy | Tentative / Not sure if this is the same as scipy zscore | 18 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 19 | | Quasi-Likelihood GLM tests | edgeR/R/glmQLFTest.R | https://docs.pymc.io/notebooks/GLM-negative-binomial-regression.html | MC3 | Tentative | 20 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 21 | | Exact conditional likelihood | edgeR/R/estimateCommonDisp.R | couldnt find it on scipy | n/a | Very important for DE | 22 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ 23 | | Weighted conditional likelihood empirical Bayes | 
edgeR/R/estimateTagwiseDisp.R | couldnt find it on scipy | n/a | Very important for DE | 24 | +-------------------------------------------------+-------------------------------+-----------------------------------------------------------------------------------------+----------------------+----------------------------------------------------------+ -------------------------------------------------------------------------------- /docs/source/scripts.rst: -------------------------------------------------------------------------------- 1 | scripts package 2 | =============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scripts.edgepy module 8 | --------------------- 9 | 10 | .. automodule:: scripts.edgepy 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: scripts 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /edgePy/DGEList.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from io import StringIO 4 | from pathlib import Path 5 | from typing import Generator, Iterable, Mapping, Optional, Union, Dict, List, Hashable, Any 6 | 7 | # TODO: Implement `mypy` stubs for NumPy imports 8 | import numpy as np # type: ignore 9 | from smart_open import smart_open # type: ignore 10 | 11 | from edgePy.util import getLogger 12 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 13 | 14 | __all__ = ["DGEList"] 15 | 16 | PRIOR_COUNT: float = 0.25 17 | 18 | log = getLogger(name=__name__) 19 | 20 | 21 | class DGEList(object): 22 | """Class containing read counts over genes for multiple samples and their 23 | corresponding metadata. 24 | 25 | Args: 26 | counts: Columns correspond to samples and row to genes. 27 | samples: Array of sample names, same length as ncol(counts). 28 | genes: Array of gene names, same length as nrow(counts). 29 | norm_factors: Weighting factors for each sample. 30 | groups_in_list: a list of groups to which each sample belongs, in the same order as samples *or* 31 | groups_in_dict: a dictionary of groups, containing sample names. 32 | to_remove_zeroes: To remove genes with zero counts for all samples. 33 | filename: a shortcut to import NPZ (zipped numpy format) files. 34 | current_type: None means raw counts, otherwise, if transformed, store a string (eg. 'cpm', 'rpkm', etc) 35 | current_log: Optional[bool] = False, If counts has already been log transformed, store True. 36 | Examples: 37 | 38 | >>> from edgePy.data_import import get_dataset_path 39 | >>> dataset = 'GSE49712_HTSeq.txt.gz' 40 | >>> group_file = 'groups.json' 41 | >>> DGEList.create_DGEList_data_file(get_dataset_path(dataset), get_dataset_path(group_file)) 42 | DGEList(num_samples=10, num_genes=21,711) 43 | 44 | """ 45 | 46 | # Pattern to delete from field names anytime they are assigned. 47 | _field_strip_re = re.compile(r'[\s"]+') 48 | 49 | # Metatags used in older HTSeq datasets without underscore prefixes. 
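    # Newer HTSeq releases emit the same counters with a double-underscore prefix
    # (e.g. '__no_feature'); those variants are handled separately by the
    # startswith('__') check in the ``genes`` setter below.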
50 | _old_metatags = np.array( 51 | ['no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned', 'alignment_not_unique'] 52 | ) 53 | 54 | def __init__( 55 | self, 56 | counts: Optional[np.ndarray] = None, 57 | samples: Optional[np.array] = None, 58 | genes: Optional[np.array] = None, 59 | norm_factors: Optional[np.array] = None, 60 | groups_in_list: Optional[np.array] = None, 61 | groups_in_dict: Optional[Dict] = None, 62 | to_remove_zeroes: Optional[bool] = False, 63 | filename: Optional[str] = None, 64 | current_transform_type: Optional[str] = None, 65 | current_log_status: Optional[bool] = False, 66 | ) -> None: 67 | 68 | self.to_remove_zeroes = to_remove_zeroes 69 | self.current_data_format = current_transform_type 70 | self.current_log_status = current_log_status 71 | 72 | if filename: 73 | if counts or samples or genes or norm_factors or groups_in_list or groups_in_dict: 74 | raise Exception("if filename is provided, you can't also provide other parameters") 75 | self._counts = None 76 | self.read_npz_file(filename) 77 | 78 | else: 79 | if counts is None: 80 | raise Exception("counts must be provided at init") 81 | 82 | if norm_factors is None: 83 | try: 84 | norm_factors = np.ones(np.size(counts, 1)) 85 | except IndexError: 86 | raise ValueError( 87 | "counts must have more than one sample " "- eg, have two dimensions" 88 | ) 89 | 90 | self.counts = counts 91 | self.samples = samples 92 | self.genes = genes 93 | self.norm_factors = norm_factors 94 | 95 | if groups_in_dict is not None and groups_in_list is not None: 96 | self.groups_dict = groups_in_dict 97 | self.groups_list = groups_in_list 98 | elif groups_in_dict is not None and self.samples is not None: 99 | self.groups_dict = groups_in_dict 100 | self.groups_list = self._sample_group_list(groups_in_dict, self.samples) 101 | elif groups_in_list is not None and self.samples is not None: 102 | self.groups_list = groups_in_list 103 | self.groups_dict = self._sample_group_dict(groups_in_list, self.samples) 104 | else: 105 | raise ValueError( 106 | "You must provide either group by sample or sample by group, " 107 | "and samples must be present" 108 | ) 109 | 110 | def copy( 111 | self, 112 | counts: Optional[np.ndarray] = None, 113 | samples: Optional[np.array] = None, 114 | genes: Optional[np.array] = None, 115 | norm_factors: Optional[np.array] = None, 116 | groups_in_list: Optional[np.array] = None, 117 | groups_in_dict: Optional[Dict] = None, 118 | to_remove_zeroes: Optional[bool] = False, 119 | current_type: Optional[str] = None, 120 | current_log: Optional[bool] = False, 121 | ) -> "DGEList": 122 | 123 | return DGEList( 124 | counts=self.counts if counts is None else counts, 125 | samples=self.samples if samples is None else samples, 126 | genes=self.genes if genes is None else genes, 127 | norm_factors=self.norm_factors if norm_factors is None else norm_factors, 128 | groups_in_list=self.groups_dict if groups_in_dict is None else groups_in_dict, 129 | groups_in_dict=self.groups_list if groups_in_list is None else groups_in_list, 130 | to_remove_zeroes=self.to_remove_zeroes 131 | if to_remove_zeroes is None 132 | else to_remove_zeroes, 133 | current_transform_type=self.current_data_format 134 | if current_type is None 135 | else current_type, 136 | current_log_status=self.current_log_status if current_log is None else current_log, 137 | ) 138 | 139 | @staticmethod 140 | def _sample_group_dict(groups_list: List[str], samples: np.array): 141 | """ 142 | Converts data in the form ['group1', 'group1', 'group2', 'group2'] 143 | to 
the form {'group1': ['sample1', 'sample2'], 'group2': ['sample3', 'sample4'} 144 | 145 | Args: 146 | groups_list: group names in a list, in the same order as samples. 147 | 148 | Returns: 149 | dictionary containing the sample types, each with a list of samples. 150 | 151 | """ 152 | d: Dict[Hashable, Any] = {} 153 | log.info(samples) 154 | for idx, group in enumerate(groups_list): 155 | if group not in d: 156 | d[group] = [] 157 | d[group].append(samples[idx]) 158 | return d 159 | 160 | @staticmethod 161 | def _sample_group_list(groups_dict, samples): 162 | """ 163 | Converts data in the form {'group1': ['sample1', 'sample2'], 'group2': ['sample3', 'sample4'} 164 | to the form ['group1', 'group1', 'group2', 'group2'] 165 | 166 | Args: 167 | groups_dict: dictionary containing the sample types, each with a list of samples. 168 | samples: order of samples in the DGEList 169 | 170 | Returns: 171 | data in a list, in the same order as samples. 172 | 173 | """ 174 | d = [] 175 | temp_d = {} 176 | for group in groups_dict: 177 | for sample in groups_dict[group]: 178 | temp_d[sample] = group 179 | 180 | for sample in samples: 181 | d.append(temp_d[sample]) 182 | 183 | return np.array(d) 184 | 185 | @staticmethod 186 | def _format_fields(fields: Iterable[Union[str, bytes]]) -> Generator[str, None, None]: 187 | """Clean fields in the header of any read data. 188 | 189 | Yields: 190 | The next field that has been cleaned. 191 | 192 | """ 193 | for field in fields: 194 | if isinstance(field, bytes): 195 | field = field.decode() 196 | yield DGEList._field_strip_re.sub("", field) 197 | 198 | @property 199 | def counts(self) -> np.matrix: 200 | """The read counts for the genes in all samples. 201 | 202 | Returns: 203 | counts: Columns correspond to samples and row to genes. 204 | 205 | """ 206 | return self._counts 207 | 208 | @counts.setter 209 | def counts(self, counts: np.ndarray) -> None: 210 | """Validate setting ``DGEList.counts`` for the illegal conditions: 211 | 212 | * Must be of type ``np.ndarray`` 213 | * Negative values 214 | * Values that are not numbers 215 | * No values can be N/A 216 | 217 | Args: 218 | counts: Columns correspond to samples and row to genes. 219 | 220 | """ 221 | if counts is None: 222 | self._counts = None 223 | return 224 | 225 | if not isinstance(counts, np.ndarray): 226 | raise TypeError("Counts matrix must be of type ``np.ndarray``.") 227 | 228 | if hasattr(self, "_counts"): 229 | # do checks for things here. You shouldn't modify counts 230 | # if it has already been set. Create a new obj. 231 | if hasattr(self, "_samples") and self._samples is not None: 232 | gene_count, sample_count = counts.shape 233 | log.info(f"sample count: {sample_count}, gene count: {gene_count}") 234 | log.info( 235 | f"samples shape {self.samples.shape[0]}, gene shape {self.genes.shape[0]}" 236 | ) 237 | log.info(self.genes) 238 | 239 | if sample_count != self.samples.shape[0] or gene_count != self.genes.shape[0]: 240 | raise ValueError( 241 | "Attempting to substitute counts data " 242 | "into DGEList object with different " 243 | "dimensions fails." 244 | ) 245 | 246 | if np.isnan(counts).any(): 247 | raise ValueError("Counts matrix must have only real values.") 248 | if not self.current_log_status and (counts < 0).any(): 249 | raise ValueError("Counts matrix cannot contain negative values.") 250 | 251 | if self.to_remove_zeroes: 252 | # this is not working. Does not remove rows with only zeros. 
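            # Note: ``np.all(counts != 0, axis=1)`` keeps only rows with no zero entries
            # at all; dropping just the all-zero rows would instead require something
            # like ``counts[~np.all(counts == 0, axis=1)]`` or
            # ``counts[counts.sum(axis=1) > 0]``.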
253 | counts = counts[np.all(counts != 0, axis=1)] 254 | 255 | self._counts = counts 256 | 257 | @property 258 | def samples(self) -> np.array: 259 | """Array of sample names.""" 260 | return self._samples 261 | 262 | @samples.setter 263 | def samples(self, samples: Optional[np.ndarray]) -> None: 264 | """Validate setting ``DGEList.samples`` for the illegal conditions: 265 | 266 | * Must be the same length as the columns in counts` 267 | 268 | Args: 269 | samples: 1D string array representing identifiers of count columns 270 | 271 | """ 272 | if samples is not None: 273 | if self.counts is not None and len(samples) != self.counts.shape[1]: 274 | raise ValueError( 275 | f"Shape of counts does not match samples: " 276 | f"len(samples) = {len(samples)}," 277 | f" self.counts.shape = {self.counts.shape}" 278 | ) 279 | 280 | samples = np.array(list(self._format_fields(samples))) 281 | self._samples = samples 282 | 283 | @property 284 | def genes(self) -> np.array: 285 | """Array of gene names.""" 286 | return self._genes 287 | 288 | @genes.setter 289 | def genes(self, genes: Optional[np.ndarray]) -> None: 290 | # TODO: Validate genes here 291 | # - Genes same length as nrow(self.counts) if defined 292 | if genes is not None: 293 | genes = np.array(list(self._format_fields(genes))) 294 | # Creates boolean mask and filters out metatag rows from samples and counts 295 | metatag_mask = ~(np.isin(genes, self._old_metatags) | np.char.startswith(genes, '__')) 296 | genes = genes[metatag_mask].copy() 297 | self._counts = self.counts[metatag_mask].copy() 298 | self._genes = genes 299 | 300 | @property 301 | def library_size(self) -> np.array: 302 | """The total read counts per sample. 303 | 304 | Returns: 305 | library_size: The size of the library. 306 | 307 | """ 308 | return np.sum(self.counts, 0) 309 | 310 | def log_transform(self, counts, prior_count): 311 | """Compute the log of the counts""" 312 | counts[counts == 0] = prior_count 313 | return np.log(counts) 314 | 315 | def cpm(self, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT) -> "DGEList": 316 | """Normalize the DGEList to read counts per million.""" 317 | counts = 1e6 * self.counts / np.sum(self.counts, axis=0) 318 | current_log = self.current_log_status 319 | if transform_to_log: 320 | counts = self.log_transform(counts, prior_count) 321 | current_log = True 322 | 323 | return self.copy(counts=counts, current_log=current_log) 324 | 325 | def rpkm( 326 | self, 327 | gene_data: CanonicalDataStore, 328 | transform_to_log: bool = False, 329 | prior_count: float = PRIOR_COUNT, 330 | ) -> "DGEList": 331 | """Return the DGEList normalized to reads per kilobase of gene length 332 | per million reads. (RPKM = numReads / ( geneLength/1000 * totalNumReads/1,000,000 ) 333 | 334 | Args: 335 | gene_data: An object that works to import Ensembl based data, for use in calculations 336 | transform_to_log: true, if you wish to convert to log after converting to RPKM 337 | prior_count: a minimum value for genes, if you do log transforms. 
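        Note:
            If the counts are already log transformed (``current_log_status`` is True),
            they are exponentiated back to the natural scale before the RPKM
            calculation is applied.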
338 | """ 339 | current_log = self.current_log_status 340 | 341 | if self.current_log_status: 342 | self.counts = np.exp(self.counts) 343 | current_log = False 344 | col_sum = np.sum(self.counts, axis=0) 345 | 346 | gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) 347 | 348 | genes = self.genes[gene_mask].copy() 349 | counts = self.counts[gene_mask].copy() 350 | 351 | counts = (counts.T / gene_len_ordered).T 352 | counts = counts / (col_sum / 1e6) 353 | 354 | if transform_to_log: 355 | counts = self.log_transform(counts, prior_count) 356 | current_log = True 357 | 358 | return self.copy(counts=counts, current_log=current_log, genes=genes) 359 | 360 | def get_gene_mask_and_lengths(self, gene_data): 361 | 362 | """ 363 | use gene_data to get the gene lenths and a gene mask for the tranformation. 364 | Args: 365 | gene_data: the object that holds gene data from ensembl 366 | 367 | """ 368 | gene_len_ordered = [] 369 | gene_mask = [] 370 | gene_ensg = [] 371 | for gene in self.genes: 372 | if gene.startswith("ENSG"): 373 | gene_name = gene 374 | gene_ensg.append(gene_name) 375 | if gene_data.has_gene(gene_name): 376 | gene_mask.append(True) 377 | gene_len_ordered.append( 378 | gene_data.get_length_of_canonical_transcript(gene_name) / 1e3 379 | ) 380 | else: 381 | gene_mask.append(False) 382 | else: 383 | t_gene = gene_data.get_genes_from_symbol(gene) 384 | if t_gene: 385 | if len(t_gene) > 1: 386 | gene_name = gene_data.pick_gene_id(t_gene) 387 | else: 388 | gene_name = t_gene[0] 389 | gene_ensg.append(gene_name) 390 | if gene_data.has_gene(gene_name): 391 | gene_mask.append(True) 392 | gene_len_ordered.append( 393 | gene_data.get_length_of_canonical_transcript(gene_name) / 1e3 394 | ) 395 | else: 396 | gene_mask.append(False) 397 | else: 398 | gene_mask.append(False) 399 | return gene_len_ordered, gene_mask 400 | 401 | def tpm( 402 | self, 403 | gene_lengths: np.ndarray, 404 | transform_to_log: bool = False, 405 | prior_count: float = PRIOR_COUNT, 406 | mean_fragment_lengths: np.ndarray = None, 407 | ) -> "DGEList": 408 | """Normalize the DGEList to transcripts per million. 409 | 410 | Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data: 411 | RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3 412 | 413 | Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` ) 414 | are normalized as follows: 415 | 416 | .. math:: 417 | 418 | TPM_i = \\frac{X_i}{\\widetilde{l_i}}\cdot \\ 419 | \\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6 420 | 421 | Args: 422 | gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`. 
423 | transform_to_log: store log outputs 424 | prior_count: 425 | mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts` 426 | (optional) 427 | 428 | """ 429 | 430 | # compute effective length not allowing negative lengths 431 | if mean_fragment_lengths: 432 | effective_lengths = ( 433 | gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] 434 | ).clip(min=1) 435 | else: 436 | effective_lengths = gene_lengths[:, np.newaxis] 437 | 438 | # how many counts per base 439 | base_counts = self.counts / effective_lengths 440 | 441 | counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :] 442 | current_log = self.current_log_status 443 | if transform_to_log: 444 | counts = self.log_transform(counts, prior_count) 445 | current_log = True 446 | 447 | return self.copy(counts=counts, current_log=current_log) 448 | 449 | def __repr__(self) -> str: 450 | """Give a pretty non-executeable representation of this object.""" 451 | num_samples = len(self._samples) if self._samples is not None else 0 452 | num_genes = len(self._genes) if self._genes is not None else 0 453 | 454 | return ( 455 | f"{self.__class__.__name__}(" 456 | f"num_samples={num_samples:,}, " 457 | f"num_genes={num_genes:,})" 458 | ) 459 | 460 | def write_npz_file(self, filename: str) -> None: 461 | """Convert the object to a byte representation, which can be stored or imported.""" 462 | 463 | # TODO: validate file name 464 | 465 | log.info(f"Exporting data to compressed .dge file ({filename}.npz)...") 466 | 467 | np.savez_compressed( 468 | filename, 469 | samples=self.samples, 470 | genes=self.genes, 471 | norm_factors=self.norm_factors, 472 | counts=self.counts, 473 | groups_list=self.groups_list, 474 | ) 475 | 476 | def read_npz_file(self, filename: str) -> None: 477 | """Import a file name stored in the dge export format. 478 | 479 | Args: 480 | filename: the name of the file to read from. 
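
        Example:
            Assuming a file previously written with ``write_npz_file`` (the path below
            is hypothetical)::

                dge = DGEList(filename="my_counts.npz")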
481 | 482 | """ 483 | 484 | log.info(f"Importing data from .dge file ({filename})...") 485 | 486 | npzfile = np.load(filename) 487 | self.counts = npzfile["counts"] 488 | self.genes = npzfile["genes"] 489 | self.samples = npzfile["samples"] 490 | self.norm_factors = npzfile["norm_factors"] 491 | self.groups_list = npzfile["groups_list"].tolist() 492 | 493 | self.groups_dict = self._sample_group_dict(self.groups_list, self.samples) 494 | 495 | @classmethod 496 | def create_DGEList( 497 | cls, 498 | sample_list: List[str], 499 | data_set: Dict[Hashable, Any], # {sample: {gene1: x, gene2: y}}, 500 | gene_list: List[str], 501 | sample_to_category: Optional[List[str]] = None, 502 | category_to_samples: Optional[Dict[Hashable, List[str]]] = None, 503 | ) -> "DGEList": 504 | """ sample list and gene list must be pre-sorted 505 | Use this to create the DGE object for future work.""" 506 | 507 | log.info("Creating DGE list object...") 508 | temp_data_store = np.zeros(shape=(len(gene_list), len(sample_list))) 509 | 510 | for idx_s, sample in enumerate(sample_list): 511 | for idx_g, gene in enumerate(gene_list): 512 | if sample in data_set and gene in data_set[sample]: 513 | if data_set[sample][gene]: 514 | temp_data_store[idx_g, idx_s] = data_set[sample][gene] 515 | 516 | return cls( 517 | counts=temp_data_store, 518 | genes=np.array(gene_list), 519 | samples=np.array(sample_list), 520 | groups_in_list=sample_to_category if sample_to_category else None, 521 | groups_in_dict=category_to_samples if category_to_samples else None, 522 | to_remove_zeroes=False, 523 | ) 524 | 525 | @classmethod 526 | def create_DGEList_data_file( 527 | cls, data_file: Path, group_file: Path, **kwargs: Mapping 528 | ) -> "DGEList": 529 | """Wrapper for creating DGEList objects from file locations. Performs open and passes 530 | the file handles to the method for creating a DGEList object. 531 | 532 | This function uses smart_open, which provides a broad list of data sources that can be 533 | opened. For a full list of data sources, see smart_open's documentation at 534 | https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst 535 | 536 | Args: 537 | data_file: Text file defining the data set. 538 | group_file: The JSON file defining the groups. 539 | kwargs: Additional arguments supported by ``np.genfromtxt``. 540 | 541 | Returns: 542 | DGEList: Container for storing read counts for samples. 543 | 544 | """ 545 | with smart_open(data_file, 'r') as data_handle, smart_open( 546 | group_file, 'r' 547 | ) as group_handle: 548 | return cls.create_DGEList_handle(data_handle, group_handle, **kwargs) 549 | 550 | @classmethod 551 | def create_DGEList_handle( 552 | cls, data_handle: StringIO, group_handle: StringIO, **kwargs: Mapping 553 | ) -> "DGEList": 554 | """Read in a file-like object of delimited data for instantiation. 555 | 556 | Args:get_canonical 557 | data_handle: Text file defining the data set. 558 | group_handle: The JSON file defining the groups. 559 | kwargs: Additional arguments supported by ``np.genfromtxt``. 560 | 561 | Returns: 562 | DGEList: Container for storing read counts for samples. 
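
        Note:
            The first line of ``data_handle`` is expected to be a whitespace-delimited
            header whose first field labels the gene column and whose remaining fields
            are sample names; each subsequent row starts with a gene identifier
            followed by one integer count per sample.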
563 | 564 | """ 565 | _, *samples = next(data_handle).strip().split() 566 | 567 | genes = [] 568 | frame = np.genfromtxt( 569 | fname=data_handle, 570 | dtype=np.int, 571 | converters={0: lambda _: genes.append(_.decode("utf-8")) or 0}, # type: ignore 572 | autostrip=kwargs.pop("autostrip", True), 573 | replace_space=kwargs.pop("replace_space", "_"), 574 | case_sensitive=kwargs.pop("case_sensitive", True), 575 | invalid_raise=kwargs.pop("invalid_raise", True), 576 | # skip_header=kwargs.pop("skip_headers", 1), 577 | **kwargs, 578 | ) 579 | 580 | # Delete the first column as it is copied on assignment to `genes`. 581 | counts = np.delete(frame, 0, axis=1) 582 | # Delete the first element in the genes list: (should be 'genes' but was a 583 | # duplicate gene name, due to a putative bug in genfromtxt 584 | genes = genes[1:] 585 | 586 | group = json.load(group_handle) 587 | 588 | return cls( 589 | counts=counts, 590 | genes=genes, 591 | samples=samples, 592 | groups_in_dict=group, 593 | to_remove_zeroes=False, 594 | ) 595 | -------------------------------------------------------------------------------- /edgePy/__init__.py: -------------------------------------------------------------------------------- 1 | from edgePy import data_import 2 | 3 | from edgePy.DGEList import DGEList 4 | 5 | from edgePy.util import getLogger 6 | -------------------------------------------------------------------------------- /edgePy/benchmarking/00.GSE49712.Rscript.txt: -------------------------------------------------------------------------------- 1 | ############################## 2 | x<-c("pheatmap","limma","gplots","edgeR","RColorBrewer") 3 | require(x) 4 | lapply(x, require, character.only = TRUE) 5 | rm(x) 6 | ############################## 7 | dados<-read.table("GSE49712_gene_FPKM.txt",sep="\t",header=TRUE,row.names=NULL) 8 | ############################## 9 | group <- as.factor(c(rep("A",5),rep("B",5))) 10 | ############################## 11 | x<-dados[,2:11] 12 | rownames(x)<-dados$Geneid 13 | cpm <- cpm(x) 14 | lcpm <- cpm(x, log=TRUE) 15 | table(keep.exprs) 16 | keep.exprs <- rowSums(cpm>1)>=3 17 | x <- x[keep.exprs,] 18 | geneSymbols<-dados[,1][keep.exprs] 19 | ############################## 20 | png("diagnostic_fig1.png") 21 | nsamples <- ncol(x) 22 | col <- brewer.pal(nsamples, "Paired") 23 | par(mfrow=c(1,2)) 24 | plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, 25 | main="", xlab="") 26 | title(main="A. Raw data", xlab="Log-cpm") 27 | abline(v=0, lty=3) 28 | for (i in 2:nsamples){ 29 | den <- density(lcpm[,i]) 30 | lines(den$x, den$y, col=col[i], lwd=2) 31 | } 32 | legend("topright", legend=group, text.col=col, bty="n") 33 | ### 34 | lcpm <- cpm(x, log=TRUE) 35 | plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, 36 | main="", xlab="") 37 | title(main="B. 
Filtered data", xlab="Log-cpm") 38 | abline(v=0, lty=3) 39 | for (i in 2:nsamples){ 40 | den <- density(lcpm[,i]) 41 | lines(den$x, den$y, col=col[i], lwd=2) 42 | } 43 | legend("topright", legend=group, text.col=col, bty="n") 44 | dev.off() 45 | ############################## 46 | x<-as.matrix(x) 47 | rownames(x)<-geneSymbols 48 | d.cpm.x <- DGEList(counts=x,group=group) 49 | d.cpm.x <- calcNormFactors(d.cpm.x, method = "TMM") 50 | ############################## 51 | d.cpm.x2 <- d.cpm.x 52 | d.cpm.x2$samples$norm.factors <- 1 53 | d.cpm.x2$counts[,1] <- ceiling(d.cpm.x2$counts[,1]*0.05) 54 | d.cpm.x2$counts[,2] <- d.cpm.x2$counts[,2]*5 55 | ############################## 56 | png("diagnostic_fig2.png") 57 | par(mfrow=c(1,2)) 58 | lcpm <- cpm(d.cpm.x2, log=TRUE) 59 | boxplot(lcpm, las=2, col=col, main="") 60 | title(main="A. Example: Unnormalised data",ylab="Log-cpm") 61 | d.cpm.x2 <- calcNormFactors(d.cpm.x2,method = "TMM") 62 | d.cpm.x2$samples$norm.factors 63 | lcpm <- cpm(d.cpm.x2, log=TRUE) 64 | boxplot(lcpm, las=2, col=col, main="") 65 | title(main="B. Example: Normalised data",ylab="Log-cpm") 66 | dev.off() 67 | ############################## 68 | lcpm <- cpm(d.cpm.x, log=TRUE) 69 | png("mds.png") 70 | plotMDS(lcpm, labels=group, col=as.numeric(group)) 71 | title(main="MDS - Sample groups") 72 | dev.off() 73 | ############################## 74 | png("diagnostic_fig3.png") 75 | design = model.matrix( ~ 0 + group, data=d.cpm.x$samples) 76 | colnames(design) <- levels(group) 77 | d.cpm.x = estimateCommonDisp(d.cpm.x, verbose=TRUE) 78 | d.cpm.x = estimateTagwiseDisp(d.cpm.x) 79 | par(mfrow=c(1,2)) 80 | v <- voom(d.cpm.x, design, plot=TRUE) 81 | ############################## 82 | contr.matrix <- makeContrasts( 83 | AvsB = A - B, #1 84 | levels = colnames(design)) 85 | ############################## 86 | vfit <- lmFit(v, design) 87 | vfit <- contrasts.fit(vfit, contrasts=contr.matrix) 88 | efit <- eBayes(vfit) 89 | et <- decideTests(vfit) 90 | plotSA(efit, main="Final model: Mean Variance Trend") 91 | summary(decideTests(efit)) 92 | tfit <- treat(vfit, lfc=1) 93 | dt <- decideTests(tfit) 94 | summary(dt) 95 | dev.off() 96 | ############################## 97 | 98 | ############################## 99 | png("diagnostic_fig4.png") 100 | vennDiagram(dt[,1], circle.col=c("turquoise", "red","green")) 101 | dev.off() 102 | #de.common<-which(dt[,1]!=0 & dt[,2]!=1 & dt[,3]!=1 & dt[,2]!=-1 & dt[,3]!=-1) 103 | #length(de.common) 104 | topGenes<-topTreat(tfit, coef=1, n=Inf,adjust.method = "fdr",lfc=2,p.value=0.01) 105 | ############################## 106 | png("heatmap_fig5.png") 107 | pheatmap(as.matrix(v$E[which(rownames(v$E) %in% as.character(topGenes$ID)),]), color = colorRampPalette(c("navy", "white","firebrick4"))(255), 108 | cluster_cols = F, cluster_rows=T, show_colnames = TRUE, 109 | show_rownames = FALSE,clustering_distance_rows ="euclidean", scale="row") 110 | dev.off() 111 | ############################## 112 | write.table(file="genes+DEGs.tsv",topTreat(tfit, coef=1, n=7001),sep="\t") 113 | write.table(file="topGenesEdgar.tsv",topGenes,sep="\t") 114 | ############################## 115 | save.image("analysis.RNAseq.gse49712.Rdata") 116 | ############################## 117 | -------------------------------------------------------------------------------- /edgePy/benchmarking/01.diagnostic_fig1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/01.diagnostic_fig1.png -------------------------------------------------------------------------------- /edgePy/benchmarking/02.diagnostic_fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/02.diagnostic_fig2.png -------------------------------------------------------------------------------- /edgePy/benchmarking/03.mds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/03.mds.png -------------------------------------------------------------------------------- /edgePy/benchmarking/04.diagnostic_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/04.diagnostic_fig3.png -------------------------------------------------------------------------------- /edgePy/benchmarking/05.diagnostic_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/05.diagnostic_fig4.png -------------------------------------------------------------------------------- /edgePy/benchmarking/06.heatmap_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/06.heatmap_fig5.png -------------------------------------------------------------------------------- /edgePy/benchmarking/09.analysis.RNAseq.gse49712.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/benchmarking/09.analysis.RNAseq.gse49712.Rdata -------------------------------------------------------------------------------- /edgePy/benchmarking/README_benchmark.md: -------------------------------------------------------------------------------- 1 | Analysis workflow for Bulk-RNAseq to compare the differentially expressed genes (DEGs) produced by edgeR and limma. 2 | ===== 3 | ### 1. Script with all R packages and command lines 4 | GSE49712.Rscript.txt 5 | 6 | ### 2. Data matrix extract from GEO 7 | GSE49712_gene_FPKM.txt 8 | 9 | ### 3. Figure produced by filtering the dataset based on Zero count rows/genes 10 | diagnostic_fig1.png 11 | ![01 diagnostic_fig1](https://user-images.githubusercontent.com/13422225/44564680-eeff3300-a729-11e8-9446-0e7643dbb651.png) 12 | 13 | ### 4. Normalization of data distribution with TMM 14 | diagnostic_fig2.png 15 | ![02 diagnostic_fig2](https://user-images.githubusercontent.com/13422225/44564681-f58daa80-a729-11e8-9fb0-5fb64de760cd.png) 16 | 17 | ### 5. Multidimensional scaling for general sample comparison. 18 | mds.png 19 | ![03 mds](https://user-images.githubusercontent.com/13422225/44564682-f9213180-a729-11e8-9afc-0271fe9cea34.png) 20 | 21 | ### 6. 
Variance dispersion comparison for unnormalized and normalized data based on dispersion and a linear model 22 | diagnostic_fig3.png 23 | ![04 diagnostic_fig3](https://user-images.githubusercontent.com/13422225/44564686-fe7e7c00-a729-11e8-95e7-f2db1ba53d69.png) 24 | 25 | ### 7. Venn with quantity of DEGs 26 | diagnostic_fig4.png 27 | ![05 diagnostic_fig4](https://user-images.githubusercontent.com/13422225/44564694-076f4d80-a72a-11e8-8bba-424dd4421883.png) 28 | 29 | ### 8. Sample heatmap for topDEGs 30 | heatmap_fig5.png 31 | ![06 heatmap_fig5](https://user-images.githubusercontent.com/13422225/44564698-0b9b6b00-a72a-11e8-8e5d-8fe72477a942.png) 32 | 33 | 34 | ### 9. DEG list for 7001 genes 35 | * Filtered using p-value < 0.05 36 | 37 | **TODO: show file 07.DEGs.tsv** 38 | 39 | ### 10. Top DEG list for 300 genes 40 | * Filtered using p value < 0.01, FDR < 0.05 and fold change of 2. 41 | 42 | **TODO: show file 08.topDEGs.tsv** 43 | 44 | ### Versions 45 | *R version 3.2.3 (2015-12-10) 46 | *Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 16.04.2 LTS 47 | 48 | ### Packages 49 | 50 | ``` 51 | [1] ggplot2_2.2.1 pheatmap_1.0.10 edgeR_3.12.1 limma_3.26.9 52 | loaded via a namespace (and not attached): 53 | [1] colorspace_1.3-2 scales_0.5.0 lazyeval_0.2.1 plyr_1.8.4 54 | [5] pillar_1.2.1 gtable_0.2.0 RColorBrewer_1.1-2 tibble_1.4.2 55 | [9] Rcpp_0.12.15 grid_3.2.3 rlang_0.2.0 munsell_0.4.3 56 | ``` 57 | 58 | 59 | 60 | ### Compute resources 61 | 62 | ``` 63 | 64 | Cores 4; 8GB RAM; 500GB HD 65 | Processor: AMD Phenom(tm) II X4 B97 Processor × 4 66 | Graphics: Gallium 0.4 on AMD RS880 (DRM 2.50.0 / 4.15.0-32-generic, LLVM **3.8.0) 67 | 68 | ``` 69 | ### Computing 70 | 71 | ``` 72 | Running the script from cero, including packages loading time: 73 | user system elapsed 74 | 1.213 0.136 167.807 75 | user system elapsed 76 | 12.951 0.366 180.962 77 | RAM used: 850mB 78 | Cores used: 1 79 | ``` 80 | 81 | 82 | -------------------------------------------------------------------------------- /edgePy/data/GSE49712_HTSeq.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data/GSE49712_HTSeq.txt.gz -------------------------------------------------------------------------------- /edgePy/data/GSE49712_HTSeq.txt.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data/GSE49712_HTSeq.txt.npz -------------------------------------------------------------------------------- /edgePy/data/example_gene_list.txt: -------------------------------------------------------------------------------- 1 | TP53 2 | BRCA1 3 | BRCA2 4 | -------------------------------------------------------------------------------- /edgePy/data/groups.json: -------------------------------------------------------------------------------- 1 | { 2 | "Group 1": [ 3 | "A_1", 4 | "A_2", 5 | "A_3", 6 | "A_4", 7 | "A_5" 8 | ], 9 | "Group 2": [ 10 | "B_1", 11 | "B_2", 12 | "B_3", 13 | "B_4", 14 | "B_5" 15 | ] 16 | } -------------------------------------------------------------------------------- /edgePy/data_import/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_import import * 2 | -------------------------------------------------------------------------------- /edgePy/data_import/data_import.py: 
-------------------------------------------------------------------------------- 1 | """ Skeleton class for importing files """ 2 | 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | 7 | __all__ = ["get_dataset_path"] 8 | 9 | 10 | def get_dataset_path(filename: Union[str, Path]) -> Path: 11 | """Return the filesystem path to the packaged data file. 12 | 13 | Args: 14 | filename (str, pathlib.Path) : The full name of the packaged data file. 15 | 16 | Returns: 17 | path (pathlib.Path) : The filesystem path to the packaged data file. 18 | 19 | Examples 20 | >>> from edgePy.data_import.data_import import get_dataset_path 21 | >>> str(get_dataset_path("GSE49712_HTSeq.txt.gz")) # doctest:+ELLIPSIS 22 | '.../edgePy/data/GSE49712_HTSeq.txt.gz' 23 | 24 | """ 25 | import edgePy 26 | 27 | directory = Path(edgePy.__file__).expanduser().resolve().parent 28 | return directory / "data" / filename 29 | -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data_import/ensembl/__init__.py -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/canonical_transcripts.py: -------------------------------------------------------------------------------- 1 | """Some macro-level functions for dealing with the mysql library""" 2 | 3 | import argparse 4 | from smart_open import smart_open # type: ignore 5 | 6 | from edgePy.data_import.ensembl.mysql_wrapper import MySQLWrapper 7 | 8 | CANONICAL_TRANSCRIPT_SQL = """select gene.stable_id as gene, transcript.stable_id as transcript, 9 | t_len.exon_len as length, IF(gene.canonical_transcript_id = transcript.transcript_id, "True", "False") as canonical 10 | from transcript, 11 | (select et.transcript_id, sum(exon.seq_region_end - exon.seq_region_start) as exon_len 12 | from exon, exon_transcript as et where et.exon_id = exon.exon_id group by et.transcript_id ) as t_len, gene 13 | where t_len.transcript_id = transcript.transcript_id and transcript.gene_id = gene.gene_id;""" 14 | 15 | GENE_SYMBOL_SQL = """select xref.display_label as symbol, gene.stable_id as gene 16 | from xref, gene 17 | where xref.xref_id = gene.display_xref_id 18 | and xref.external_db_id = 1100; 19 | """ 20 | 21 | GENE_SYNONYM_SQL = """select g.stable_id as gene_id, es.synonym as synonym 22 | from gene g 23 | join xref x on (g.display_xref_id = x.xref_id) 24 | left join external_synonym es using (xref_id) 25 | join external_db ed using (external_db_id) 26 | where synonym is not NULL and ed.db_name='HGNC';""" 27 | 28 | 29 | def parse_arguments(parser=None): 30 | if not parser: 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument("--host", help="name of the mysql host", default="ensembldb.ensembl.org") 34 | parser.add_argument("--port", help="name of the mysql port", default=3337) 35 | parser.add_argument("--username", help="user name for the mysql service", default="anonymous") 36 | parser.add_argument("--password", help="password for the mysql service", default=None) 37 | parser.add_argument( 38 | "--database", 39 | help="database to use for the query, for example homo_sapiens_core_75_37 or " 40 | "homo_sapiens_core_93_38 or mus_musculus_core_93_38 ", 41 | default="homo_sapiens_core_75_37", 42 | ) 43 | 44 | parser.add_argument( 45 | "--output_transcripts", 46 | 
help="where to put the file with the transcript data", 47 | default="blank", 48 | ) 49 | parser.add_argument( 50 | "--output_symbols", help="where to put the file with the gene symbols", default="blank" 51 | ) 52 | args = parser.parse_args() 53 | 54 | if args.output_transcripts == "blank": 55 | args.output_transcripts = f"../../data/transcripts_{args.database}.tsv" 56 | 57 | if args.output_symbols == "blank": 58 | args.output_symbols = f"../../data/symbols_{args.database}.tsv" 59 | 60 | return args 61 | 62 | 63 | class CanonicalTranscript(object): 64 | """A simple class for storing Ensembl transcript data, as well as 65 | supplemental data for gene id/symbols/synnonyms""" 66 | 67 | def __init__(self, host, port, user, password, database): 68 | # needs to go into a config file, but for now: 69 | self.exon_store = {} 70 | 71 | self.mysql_wrapper = MySQLWrapper( 72 | host=host, port=port, username=user, password=password, database=database 73 | ) 74 | 75 | print("retrieving canonical transcript data.") 76 | self.canonical_transcripts = self.mysql_wrapper.run_sql_query(CANONICAL_TRANSCRIPT_SQL) 77 | 78 | print("retrieving gene symbol data.") 79 | self.gene_symbols = self.mysql_wrapper.run_sql_query(GENE_SYMBOL_SQL) 80 | 81 | print("retrieving gene synonym data.") 82 | self.gene_synonyms = self.mysql_wrapper.run_sql_query(GENE_SYNONYM_SQL) 83 | 84 | print("completed") 85 | self.mysql_wrapper.close() 86 | 87 | 88 | def main(): 89 | args = parse_arguments() 90 | default_class = CanonicalTranscript( 91 | args.host, args.port, args.username, args.password, args.database 92 | ) 93 | canonical = default_class.canonical_transcripts 94 | 95 | with smart_open(args.output_transcripts, 'w') as output: 96 | for transcript in canonical: 97 | output.write( 98 | f"{transcript['gene']}\t{transcript['transcript']}\t" 99 | f"{transcript['length']}\t{transcript['canonical']}\n" 100 | ) 101 | 102 | symbols = default_class.gene_symbols 103 | synonyms = default_class.gene_synonyms 104 | 105 | with smart_open(args.output_symbols, 'w') as output: 106 | """ The order here is important - symbols contain duplicates, so make sure the symbols 107 | are procesesed before synonyms. The matching script (ensembl_flat_file_reader.py) will ignore new symbols 108 | for translating to gene, if there's already one accepted.""" 109 | 110 | for symbol in symbols: 111 | output.write(f"{symbol['symbol']}\t{symbol['gene']}\n") 112 | for synonym in synonyms: 113 | output.write(f"{synonym['synonym']}\t{synonym['gene_id']}\n") 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/ensembl_flat_file_reader.py: -------------------------------------------------------------------------------- 1 | from smart_open import smart_open # type: ignore 2 | from typing import Optional, Union, Dict, Hashable, Any, List 3 | from pathlib import Path 4 | 5 | 6 | class CanonicalDataStore(object): 7 | """ 8 | A simple tool for reading canonical data, generated from the canonical_transcripts.py script provided with edgePy. 
9 | 10 | Args: 11 | transcript_filename: the name of the transcript file, generated by canonical_transcripts.py 12 | symbols_filename: the name of the gene symbol file, generated by canonical_transcripts.py 13 | 14 | """ 15 | 16 | def __init__( 17 | self, transcript_filename: Union[str, Path], symbols_filename: Union[str, Path] 18 | ) -> None: 19 | 20 | self.by_transcript: Dict[Hashable, Dict[Hashable, Any]] = {} 21 | self.canonical_transcript: Dict[Hashable, str] = {} 22 | 23 | self.gene_to_symbol: Dict[Hashable, str] = {} 24 | self.symbol_to_genes: Dict[Hashable, List] = {} 25 | 26 | with smart_open(transcript_filename, 'r') as data: 27 | for line in data: 28 | gene_info = line.strip().split("\t") 29 | gene = gene_info[0] 30 | transcript = gene_info[1] 31 | length = int(gene_info[2]) 32 | canonical = True if gene_info[3] == "True" else False 33 | 34 | self.by_transcript[transcript] = {'len': length, 'can': canonical} 35 | 36 | if canonical: 37 | self.canonical_transcript[gene] = transcript 38 | 39 | with smart_open(symbols_filename, 'r') as data: 40 | for line in data: 41 | symbol_info = line.strip().split("\t") 42 | symbol = symbol_info[0] 43 | gene = symbol_info[1] 44 | 45 | if gene not in self.gene_to_symbol: 46 | self.gene_to_symbol[gene] = symbol 47 | 48 | if symbol not in self.symbol_to_genes: 49 | self.symbol_to_genes[symbol] = [] 50 | if gene not in self.symbol_to_genes[symbol]: 51 | self.symbol_to_genes[symbol].append(gene) 52 | 53 | def has_gene(self, gene: Optional[str]) -> bool: 54 | """ 55 | Check if a gene is present in the dataset. 56 | Args: 57 | gene: the ensembl gene id. 58 | """ 59 | if gene and gene in self.canonical_transcript: 60 | return True 61 | else: 62 | return False 63 | 64 | def get_symbol_from_gene(self, gene: Optional[str]) -> Optional[str]: 65 | """ 66 | Given a gene name, get the symbol - should give you the default ENSEMBL name, and not a synonym. 67 | Args: 68 | gene: the ensembl gene id. 69 | """ 70 | if not gene: 71 | return None 72 | try: 73 | return self.gene_to_symbol[gene] 74 | except KeyError: 75 | print(f"gene {gene} not found in gene to symbol.") 76 | raise KeyError 77 | 78 | def get_genes_from_symbol(self, symbol: str) -> List: 79 | """ 80 | Given the gene symbol (or a recognized synonym), get the ensembl id. 81 | Args: 82 | symbol: HUGO or HGNC symbol 83 | """ 84 | 85 | try: 86 | return self.symbol_to_genes[symbol] 87 | except KeyError: 88 | return [] 89 | 90 | @staticmethod 91 | def pick_gene_id(gene_ids: List) -> Optional[str]: 92 | """ 93 | Where there are more than one gene ID for a symbol, pick the one with the largest ensembl ID integer. 94 | Args: 95 | gene_ids: list of gene IDs. 96 | """ 97 | if not gene_ids: 98 | return None 99 | length = len(gene_ids) 100 | if length == 1: 101 | return gene_ids[0] 102 | else: 103 | gene_ids.sort(reverse=True) 104 | return gene_ids[0] 105 | 106 | def is_known_symbol(self, symbol: str) -> bool: 107 | """ 108 | Check to see if we recognize a given symbol - there always will be things we don't recognize. 109 | Args: 110 | symbol: what you think is a gene symbol. 111 | """ 112 | if symbol in self.symbol_to_genes: 113 | return True 114 | return False 115 | 116 | def is_known_gene(self, gene: str) -> bool: 117 | """ 118 | Check to see if we can recognize a given gene ID from ENSEMBL. If you have one that isn't recognized, it might 119 | belong to a different version. 120 | Args: 121 | gene: what you think is a gene id. 
122 | :return: 123 | """ 124 | if gene in self.gene_to_symbol: 125 | return True 126 | return False 127 | 128 | def is_canonical_by_transcript(self, transcript_id: str) -> bool: 129 | """ 130 | Return a boolean indicating whether the supplied transcript is canonical or not. 131 | 132 | Args: 133 | transcript_id: an Ensembl transcript ID, starting with ENST 134 | """ 135 | 136 | if transcript_id not in self.by_transcript: 137 | return False 138 | else: 139 | return self.by_transcript[transcript_id]['can'] 140 | 141 | def get_canonical_transcript(self, gene_id: str) -> Optional[str]: 142 | """ 143 | Return the Ensembl canonical transcript ID, given an ensembl transcript ID. 144 | 145 | Args: 146 | gene_id: An Ensembl gene ID, starting with ENSG 147 | """ 148 | 149 | if gene_id in self.canonical_transcript: 150 | return self.canonical_transcript[gene_id] 151 | else: 152 | return None 153 | 154 | def get_length_of_transcript(self, transcript_id: str) -> int: 155 | """ 156 | Return the length of a transcript, given an ensembl transcript ID. 157 | 158 | Args: 159 | transcript_id: an Ensembl transcript ID, starting with ENST 160 | """ 161 | if transcript_id not in self.by_transcript: 162 | return False 163 | else: 164 | return self.by_transcript[transcript_id]['len'] 165 | 166 | def get_length_of_canonical_transcript(self, gene_id: Optional[str]) -> int: 167 | """ 168 | Return the length of a transcript, given an ensembl gene ID. 169 | 170 | Args: 171 | gene_id: An Ensembl gene ID, starting with ENSG 172 | """ 173 | if not gene_id: 174 | return 0 175 | 176 | transcript_id = self.get_canonical_transcript(gene_id) 177 | 178 | if not transcript_id or transcript_id not in self.by_transcript: 179 | return False 180 | else: 181 | return self.by_transcript[transcript_id]['len'] 182 | -------------------------------------------------------------------------------- /edgePy/data_import/ensembl/mysql_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | This will be a wrapper file for using mysql, to make queries easier. I may eventually just replace this with 3 | SQLAlchemy, but for the moment, it's probalby easiest just to make a simple library to handle this type of transaction. 
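A usage sketch (the connection details below are the defaults assumed by canonical_transcripts.py;
a live MySQL server is required, so the example is not executed):

>>> wrapper = MySQLWrapper(  # doctest: +SKIP
...     host="ensembldb.ensembl.org", port=3337, username="anonymous",
...     password=None, database="homo_sapiens_core_75_37",
... )
>>> rows = wrapper.run_sql_query("SELECT COUNT(*) AS n FROM gene")  # doctest: +SKIP
>>> wrapper.close()  # doctest: +SKIP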
4 | 5 | """ 6 | 7 | import pymysql 8 | from typing import List 9 | from pymysql.cursors import DictCursor 10 | 11 | 12 | class MySQLWrapper(object): 13 | def __init__( 14 | self, 15 | host: str = None, 16 | port: int = None, 17 | username: str = None, 18 | password: str = None, 19 | database: str = None, 20 | ) -> None: 21 | self.host = host 22 | self.port = port 23 | self.username = username 24 | self.password = password 25 | self.database = database 26 | self.connection = pymysql.connect( 27 | host=self.host, 28 | user=self.username, 29 | password=self.password, 30 | db=self.database, 31 | charset="utf8mb4", 32 | cursorclass=DictCursor, 33 | ) 34 | 35 | def find_one(self, sql: str) -> object: 36 | with self.connection.cursor() as cursor: 37 | # Read a single record 38 | cursor.execute(sql) 39 | result = cursor.fetchone() 40 | return result 41 | 42 | def insert(self, sql: str) -> None: 43 | with self.connection.cursor() as cursor: 44 | # Create a new record 45 | cursor.execute(sql) 46 | self.connection.commit() 47 | 48 | def update(self) -> None: 49 | raise NotImplementedError 50 | 51 | def run_sql_query(self, sql: str) -> List: 52 | with self.connection.cursor() as cursor: 53 | cursor.execute(sql) 54 | result = cursor.fetchall() 55 | return result 56 | 57 | def close(self) -> None: 58 | self.connection.close() 59 | -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/edgePy/data_import/mongodb/__init__.py -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/gene_functions.py: -------------------------------------------------------------------------------- 1 | """The core Python code for generating data.""" 2 | from typing import Dict, Optional, List, Tuple, Any, Hashable 3 | 4 | 5 | def get_genelist_from_file(filename: str) -> Optional[List]: 6 | """ 7 | Converts a genelist file into a list of genes. Simple function, but can be expanded if needed. 8 | Args: 9 | filename: gene list file name. 10 | """ 11 | 12 | # TODO: should be expanded to handle gzip genelists too. 13 | 14 | if not filename: 15 | return None 16 | gene_list = [] 17 | with open(filename, "r") as file_handle: 18 | for line in file_handle: 19 | gene_list.append(line.strip()) 20 | return gene_list 21 | 22 | 23 | def translate_genes( 24 | genes: Optional[List[str]], mongo_reader: Any, database: str = "ensembl_90_37" 25 | ) -> Tuple[List[str], Dict[str, str]]: 26 | """ 27 | Functions to translate a list of genes in to ENGS symbols and vice versa. 28 | 29 | Args: 30 | genes: list of genes to filter on. 31 | mongo_reader: the mongo connector 32 | database: the name of the database to use. 
"pytest" for unit testimg (mocking) 33 | 34 | Returns: 35 | a list of ensg symbols, a list of gene symbols 36 | """ 37 | 38 | ensg_genes = [] 39 | non_ensg_genes = [] 40 | gene_symbols = {} 41 | query: Dict[Hashable, Any] = {} 42 | 43 | if genes: 44 | for gene in genes: 45 | if gene.startswith("ENSG"): 46 | ensg_genes.append(gene) 47 | else: 48 | non_ensg_genes.append(gene) 49 | if ensg_genes or not genes: 50 | if genes: 51 | query["_id"] = {"$in": ensg_genes} 52 | symbol_gene_list = mongo_reader.find_as_cursor(database, "symbol_by_ensg", query=query) 53 | for symbol_gene in symbol_gene_list: 54 | for symbol in symbol_gene["symbols"]: 55 | gene_symbols[symbol_gene["_id"]] = symbol 56 | for ensg in ensg_genes: 57 | if ensg not in gene_symbols: 58 | gene_symbols[ensg] = ensg 59 | if non_ensg_genes or not genes: 60 | query = {"_id": {"$in": non_ensg_genes}} if genes else {} 61 | translated_gene_list = mongo_reader.find_as_cursor(database, "ensg_by_symbol", query=query) 62 | for trans_gene in translated_gene_list: 63 | symbol = trans_gene["_id"] 64 | ensgs = trans_gene["ensgs"] 65 | for ensg in ensgs: 66 | gene_symbols[ensg] = symbol 67 | ensg_genes.append(ensg) 68 | return ensg_genes, gene_symbols 69 | 70 | 71 | def get_gene_list(mongo_reader: Any, database: str = "ensembl_90_37") -> Dict[str, str]: 72 | """ 73 | get the list of genes from the mongo database, to translated ensg ids to symbols. 74 | 75 | Args: 76 | mongo_reader: the mongo wrapper 77 | database: database name to use. 78 | 79 | """ 80 | 81 | genes = mongo_reader.find_as_cursor(database, "symbol_by_ensg", query={}) 82 | gene_symbols = {} 83 | for symbol_gene in genes: 84 | for symbol in symbol_gene["symbols"]: 85 | gene_symbols[symbol_gene["_id"]] = symbol 86 | return gene_symbols 87 | 88 | 89 | def get_sample_details( 90 | group_by: str, mongo_reader: Any, database: str 91 | ) -> Dict[Any, Dict[str, Any]]: 92 | """ 93 | Get details from the samples collection. Use this to decide which samples to query data for. 94 | 95 | Args: 96 | group_by: the name of the key to group samples by (Category-based key) 97 | mongo_reader: the mongo wrapper 98 | database: the database to use 99 | 100 | Returns: 101 | details required for each sample available. 102 | 103 | """ 104 | 105 | sample_details = {} 106 | search = {group_by: {"$exists": True}} 107 | sample_grouping = mongo_reader.find_as_cursor( 108 | database, 109 | "samples", 110 | query=search, 111 | projection={"_id": 0, group_by: 1, "sample_name": 1, "Description": 1}, 112 | ) 113 | 114 | for sample in sample_grouping: 115 | sample_details[sample["sample_name"]] = { 116 | "description": sample["Description"] 117 | if "Description" in sample 118 | else sample["sample_name"], 119 | "category": sample[group_by], 120 | } 121 | 122 | return sample_details 123 | 124 | 125 | def get_canonical_rpkm(result: Dict[str, Any]) -> Optional[int]: 126 | """ 127 | Get the rpkm from the database for a given entry in the data collection. 128 | 129 | Args: 130 | result: the entry in the data collection 131 | 132 | Returns: 133 | the rpkm value 134 | 135 | """ 136 | transcript_list = result["transcripts"] 137 | for trans in transcript_list.values(): 138 | if int(trans["canonical"]) == 1: 139 | return trans["rpkm"] 140 | return None 141 | 142 | 143 | def get_canonical_raw(result: Dict[str, Any]) -> Optional[int]: 144 | """ 145 | An approximation of the raw count of reads. 
146 | 147 | Args: 148 | result: the entry from the data collection 149 | 150 | Returns: 151 | the raw count (as an integer) 152 | 153 | """ 154 | 155 | transcript_list = result["transcripts"] 156 | for trans in transcript_list.values(): 157 | if int(trans["canonical"]) == 1: 158 | raw = 0 159 | for exon in trans["exons"]: 160 | raw += int(trans["exons"][exon]["raw"]) 161 | return raw 162 | return None 163 | -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/mongo_import.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, Hashable, Any, Tuple, List, Optional, Union 3 | 4 | from edgePy.data_import.mongodb.mongo_wrapper import MongoWrapper 5 | from edgePy.data_import.mongodb.gene_functions import get_canonical_rpkm 6 | from edgePy.data_import.mongodb.gene_functions import get_canonical_raw 7 | from edgePy.data_import.mongodb.gene_functions import get_genelist_from_file 8 | from edgePy.data_import.mongodb.gene_functions import translate_genes 9 | from edgePy.util import getLogger 10 | 11 | log = getLogger(name=__name__) 12 | 13 | 14 | def parse_arguments(parser: Any = None, ci_values: List[str] = None) -> Any: 15 | 16 | """ 17 | Standard argparse wrapper for interpreting command line arguments. 18 | 19 | Args: 20 | parser: if there's an existing parser, provide it; otherwise, this will 21 | create a new one. 22 | ci_values: used for testing purposes only. 23 | """ 24 | if not parser: 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument("--config", help="location of the config file", required=True) 28 | parser.add_argument("--key_name", default="Project") 29 | parser.add_argument("--key_value", default="RNA-Seq1") 30 | parser.add_argument("--gene_list", default=None) 31 | 32 | if ci_values: 33 | args = parser.parse_args(ci_values) 34 | else: 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | class ImportFromMongodb(object): 40 | """ 41 | A utility for importing data from a proprietary mongodb database - hopefully we'll 42 | open this database up in the future. If not, we can re-engineer it from the examples given. 43 | 44 | Args: 45 | host: the name of the machine hosting the database 46 | port: the port number (usually 27017) 47 | mongo_key: a key in the samples collection to filter on 48 | mongo_value: the accepted value(s) of that key, used to select samples 49 | gene_list_file: a file with a list of genes to filter the results on. 50 | 51 | """ 52 | 53 | def __init__( 54 | self, 55 | host: str, 56 | port: int, 57 | mongo_key: Optional[str], 58 | mongo_value: Union[str, List, None], 59 | gene_list_file: Optional[str], 60 | ) -> None: 61 | 62 | self.mongo_host = host 63 | self.mongo_port = port 64 | 65 | self.mongo_reader = MongoWrapper(host=self.mongo_host, port=self.mongo_port, connect=False) 66 | 67 | self.search_key = mongo_key 68 | self.search_value = mongo_value 69 | 70 | self.input_gene_file = gene_list_file 71 | self.gene_list: Optional[List[str]] = None 72 | 73 | def translate_gene_list(self, database: str) -> None: 74 | """ 75 | If there was a list of genes provided, convert them to ENSG symbols. 
76 | 77 | Args: 78 | database: name of the database 79 | 80 | """ 81 | 82 | if self.input_gene_file: 83 | input_genes = get_genelist_from_file(self.input_gene_file) 84 | ensg_genes, gene_symbols = translate_genes( 85 | input_genes, self.mongo_reader, database=database 86 | ) 87 | self.gene_list = ensg_genes 88 | 89 | def get_data_from_mongo( 90 | self, database: str, rpkm_flag: bool = False 91 | ) -> Tuple[List[str], Dict[Hashable, Any], List[str], Dict[Hashable, Any]]: 92 | """ 93 | Run the queries to get the samples, from mongo, and then use that data to retrieve 94 | the counts. 95 | 96 | Args: 97 | database: name of the database to retrieve data from. 98 | rpkm_flag: takes the rpkm values from the mongodb, instead of the raw counts 99 | 100 | Returns: 101 | the list of samples, the data itself, 102 | the gene list and the categories of the samples. 103 | 104 | """ 105 | 106 | if self.input_gene_file and not self.gene_list: 107 | self.translate_gene_list(database) 108 | 109 | query: Dict[Hashable, Any] = {} 110 | if self.search_key and self.search_value: 111 | 112 | if self.search_value == 'regex': 113 | query = {self.search_key: {'$regex': 'myocyte|fibroblast'}} 114 | else: 115 | if isinstance(self.search_value, list): 116 | query[self.search_key] = {'$in': self.search_value} 117 | else: 118 | query[self.search_key] = self.search_value 119 | 120 | elif self.search_key and not self.search_value: 121 | query[self.search_key] = {"$exists": True} 122 | elif not self.search_key and not self.search_value: 123 | pass 124 | else: 125 | raise Exception( 126 | "Invalid input - you can't specify a " "key_value without specifying a key_name" 127 | ) 128 | 129 | projection: Dict[Hashable, Any] = {"sample_name": 1, "_id": 0} 130 | if self.search_key and not self.search_key == "sample_name": 131 | projection[self.search_key] = 1 132 | 133 | cursor = self.mongo_reader.find_as_cursor( 134 | database=database, collection="samples", query=query, projection=projection 135 | ) 136 | sample_names = set() 137 | sample_category = {} 138 | for result in cursor: 139 | log.info(result) 140 | sample_names.add(result["sample_name"]) 141 | sample_category[result["sample_name"]] = ( 142 | result[self.search_key] if self.search_key else result["sample_name"] 143 | ) 144 | log.info(f"Get data for sample_names {list(sample_names)}") 145 | 146 | query = {"sample_name": {"$in": list(sample_names)}} 147 | if self.gene_list: 148 | log.info(self.gene_list) 149 | query["gene"] = {"$in": list(self.gene_list)} 150 | cursor = self.mongo_reader.find_as_cursor( 151 | database=database, collection="RNASeq", query=query, projection={"_id": 0} 152 | ) 153 | 154 | # make it a list of lists 155 | log.info(f"Importing data from mongo ({self.mongo_host})...") 156 | dataset: Dict[Hashable, Dict[Hashable, Optional[int]]] = {} 157 | gene_list = set() 158 | sample_list = set() 159 | for count, result in enumerate(cursor): 160 | if count % 100_000 == 0: 161 | log.info(f"{count} rows processed.") 162 | sample = result["sample_name"] 163 | rpkm = get_canonical_rpkm(result) if rpkm_flag else get_canonical_raw(result) 164 | gene = result["gene"] 165 | if sample not in dataset: 166 | dataset[sample] = {} 167 | dataset[sample][gene] = rpkm 168 | sample_list.add(sample) 169 | gene_list.add(gene) 170 | 171 | return sorted(sample_list), dataset, sorted(gene_list), sample_category 172 | -------------------------------------------------------------------------------- /edgePy/data_import/mongodb/mongo_wrapper.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | A simple library for wrapping around mongo collections and access issues. 3 | """ 4 | from typing import Dict, Hashable, Any, Iterable, List, Union 5 | 6 | import pymongo # type: ignore 7 | from pymongo.errors import BulkWriteError # type: ignore 8 | from pymongo import InsertOne, UpdateOne 9 | 10 | from edgePy.util import getLogger 11 | 12 | log = getLogger(name=__name__) 13 | 14 | 15 | class MongoWrapper(object): 16 | """This class is for use as a thin layer for interactinvg with the Mongo Database 17 | using pymongo. Pymongo is an entirely reasonable way of working with Mongodb, but 18 | fails to provide some very common functions that are frequently used. 19 | 20 | This class should be used for efficient retrieval of information from the database. 21 | 22 | Args: 23 | host: the name of the machine hosting the database 24 | port: the port number (usually 27017 25 | connect: whether to create the new session, or to attach to an existing session, 26 | set to false, if this is being instantiated by a subprocesses. 27 | verbose: suppresses output, when set to false. 28 | 29 | """ 30 | 31 | def __init__( 32 | self, host: str, port: Union[str, int] = 27017, connect: bool = True, verbose: bool = False 33 | ) -> None: 34 | self.host = host 35 | self.port = int(port) 36 | self.session = pymongo.MongoClient(host=self.host, port=self.port, connect=connect) 37 | self.verbose = verbose 38 | 39 | def get_db(self, database: str, collection: str) -> Any: 40 | """ 41 | This function simply hides the db name when using pytest-mongodb, when the database name 42 | should always be 'pytest' 43 | 44 | Args: 45 | database: database name 46 | collection: collection name 47 | 48 | Returns: 49 | the collection object ready for use with .find() or similar. 50 | 51 | """ 52 | 53 | if database == "pytest": 54 | return self.session[collection] 55 | else: 56 | return self.session[database][collection] 57 | 58 | def find_as_cursor( 59 | self, 60 | database: str, 61 | collection: str, 62 | query: Dict[Hashable, Any] = None, 63 | projection: Dict[Hashable, Any] = None, 64 | ) -> Iterable: 65 | """ 66 | Do a find operation on a mongo collection and return the data as a cursor, 67 | the (native MongoClient find return type.) 68 | 69 | Args: 70 | database: db name 71 | collection: collection name 72 | query: a dictionary providing the criteria for the find command 73 | projection: a dictionary that gives the projection - the fields to return. 74 | 75 | Returns: 76 | a cursor object, to be used as an iterator. 77 | 78 | """ 79 | 80 | try: 81 | cursor = self.get_db(database, collection).find(query, projection) 82 | except Exception as exception: 83 | log.exception(exception) 84 | raise Exception("Mongo find failed") 85 | 86 | return cursor 87 | 88 | def find_as_list( 89 | self, 90 | database: str, 91 | collection: str, 92 | query: Dict[Hashable, Any] = None, 93 | projection: Dict[Hashable, Any] = None, 94 | ) -> Iterable: 95 | """ 96 | Do a find operation on a mongo collection, but return the data as a list 97 | 98 | Args: 99 | database: db name 100 | collection: collection name 101 | query: a dictionary providing the criteria for the find command 102 | projection: a dictionary that gives the projection - the fields to return. 103 | 104 | Returns: 105 | a list representation of the returned data. 
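Example (a sketch only; it assumes a reachable mongod and an existing ``samples`` collection,
so it is not executed):

>>> mongo_reader = MongoWrapper(host="localhost", port=27017)  # doctest: +SKIP
>>> names = mongo_reader.find_as_list(  # doctest: +SKIP
...     "ensembl_90_37", "samples", query={}, projection={"_id": 0, "sample_name": 1}
... )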
106 | 107 | """ 108 | cursor = self.find_as_cursor( 109 | database=database, collection=collection, query=query, projection=projection 110 | ) 111 | return [c for c in cursor] 112 | 113 | def find_as_dict( 114 | self, 115 | database: str, 116 | collection: str, 117 | query: Dict[Hashable, Any] = None, 118 | field: str = "_id", 119 | projection: Dict[Hashable, Any] = None, 120 | ) -> Iterable: 121 | """ 122 | Do a find operation on a mongo collection, but return the data as a dictionary 123 | 124 | Args: 125 | database: db name 126 | collection: collection name 127 | query: a dictionary providing the criteria for the find command 128 | projection: a dictionary that gives the projection - the fields to return. 129 | field: the field in the projection for which the value will be used as the Hashable key of the dict. 130 | 131 | Returns: 132 | a dictionary representation of the returned data. 133 | 134 | """ 135 | cursor = self.find_as_cursor( 136 | database=database, collection=collection, query=query, projection=projection 137 | ) 138 | return {c[field]: c for c in cursor} 139 | 140 | def insert(self, database: str, collection: str, data_list: List[Any]) -> None: 141 | """ 142 | bulk insert of items into a mongodb collection. 143 | 144 | Args: 145 | database: db name 146 | collection: collection name 147 | data_list: a list of documents to insert into mongodb. 148 | 149 | """ 150 | 151 | try: 152 | self.get_db(database, collection).test.insert_many(data_list, ordered=False) 153 | except BulkWriteError as bwe: 154 | log.exception(bwe.details) 155 | 156 | def create_index(self, database: str, collection: str, key: str) -> None: 157 | 158 | """ 159 | A tool for creating indexes on a given collection. 160 | 161 | Args: 162 | database: db name 163 | collection: collection name 164 | key: the field name to create the index on. 165 | 166 | """ 167 | self.get_db(database, collection).create_index(key) 168 | 169 | 170 | class MongoInserter(MongoWrapper): 171 | """ 172 | 173 | This class is a thin layer on the MongoWrapper class, which is a thin layer on the pymongo library. 174 | It is used for instances where you want to insert data into a mongodb collection. It creates 175 | a buffer which is periodically flushed to Mongo. 176 | 177 | Args: 178 | host: the name of the machine hosting the database 179 | port: the port number (usually 27017) 180 | database: db name 181 | collection: collection name 182 | connect: whether to create the new session, or to attach to an existing session, set to false, 183 | if this is being instantiated by a subprocesses. 184 | 185 | """ 186 | 187 | def __init__( 188 | self, host: str, port: int, database: str, collection: str, connect: bool = True 189 | ) -> None: 190 | MongoWrapper.__init__(self, host, port, connect=connect) 191 | self.database = database 192 | self.collection = collection 193 | self.to_insert: List = [] 194 | self.mongo_col = self.get_db(database, collection) 195 | 196 | def flush(self) -> None: 197 | """ 198 | Flush out the buffer and write to mongo db. 
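A buffering sketch (the database name, collection, and record contents below are illustrative):
records added with ``add()`` are queued as ``InsertOne`` operations and written in bulk once the
buffer exceeds 1000 entries, or when ``flush()``/``close()`` is called.

>>> inserter = MongoInserter("localhost", 27017, "my_database", "RNASeq")  # doctest: +SKIP
>>> inserter.add({"sample_name": "A_1", "gene": "ENSG00000104047"})  # doctest: +SKIP
>>> inserter.close()  # doctest: +SKIP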
199 | 200 | """ 201 | if self.to_insert: 202 | try: 203 | result = self.mongo_col.bulk_write(self.to_insert, ordered=False) 204 | if result and self.verbose: 205 | log.info(result.bulk_api_result) 206 | except BulkWriteError as bwe: 207 | log.exception(bwe.details) 208 | raise Exception("Mongo bulk write failed.") 209 | del self.to_insert[:] 210 | 211 | def add(self, record: Union[List[Any], Dict[Hashable, Any]]) -> None: 212 | """ 213 | Add a record to the buffer 214 | 215 | Args: 216 | record: the record to add to the mongo inserter buffer 217 | 218 | """ 219 | self.to_insert.append(InsertOne(record)) 220 | if len(self.to_insert) > 1000: 221 | self.flush() 222 | 223 | def close(self) -> None: 224 | """ 225 | Close the MongoInserter - flush the buffer. 226 | 227 | """ 228 | 229 | self.flush() 230 | 231 | def create_index_key(self, key: str) -> None: 232 | """ 233 | A tool for creating indexes on the collection. 234 | """ 235 | self.create_index(self.database, self.collection, key) 236 | 237 | 238 | class MongoUpdater(MongoWrapper): 239 | """ 240 | 241 | This class is a thin layer on the MongoWrapper class, which is a thin layer on the pymongo library. 242 | It is used for instances where you want to Update data in a mongodb collection. It creates 243 | a buffer which is periodically flushed and written to mongo. 244 | 245 | Args: 246 | host: the name of the machine hosting the database 247 | port: the port number (usually 27017 248 | database: db name 249 | collection: collection name 250 | connect: whether to create the new session, or to attach to an existing session, 251 | set to false, if this is being instantiated by a subprocesses. 252 | 253 | """ 254 | 255 | def __init__( 256 | self, host: str, port: int, database: str, collection: str, connect: bool = True 257 | ) -> None: 258 | MongoWrapper.__init__(self, host, port, connect=connect) 259 | self.database = database 260 | self.to_update: List[Any] = [] 261 | self.mongo_col = self.get_db(database, collection) 262 | 263 | def flush(self) -> None: 264 | """ 265 | Flush out the buffer and write to mongo db. 266 | 267 | """ 268 | if self.to_update: 269 | try: 270 | result = self.mongo_col.bulk_write(self.to_update, ordered=False) 271 | if result and self.verbose: 272 | log.info(result.bulk_api_result) 273 | except BulkWriteError as bwe: 274 | log.exception(bwe.details) 275 | raise Exception("Mongo bulk write failed.") 276 | del self.to_update[:] 277 | 278 | def add(self, updatedict: Dict[Hashable, Any], setdict: Dict[Hashable, Any]) -> None: 279 | """ 280 | Add a record to the buffer 281 | 282 | Args: 283 | updatedict: the criteria for the update query 284 | setdict: the dictionary describing the new record - OR use {$set: {}} to update a 285 | particular key without replacing the existing record. 286 | 287 | """ 288 | 289 | self.to_update.append(UpdateOne(updatedict, setdict)) 290 | if len(self.to_update) > 1000: 291 | self.flush() 292 | 293 | def close(self) -> None: 294 | """ 295 | Close the MongoInserter - flush the buffer. 
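The updater follows the same buffered pattern as the inserter (the names below are illustrative);
``add()`` takes a filter document and an update document, for example a ``$set`` payload:

>>> updater = MongoUpdater("localhost", 27017, "my_database", "samples")  # doctest: +SKIP
>>> updater.add({"sample_name": "A_1"}, {"$set": {"Project": "RNA-Seq1"}})  # doctest: +SKIP
>>> updater.close()  # doctest: +SKIP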
296 | 297 | """ 298 | self.flush() 299 | -------------------------------------------------------------------------------- /edgePy/util.py: -------------------------------------------------------------------------------- 1 | """ Utilities to support functions and classes """ 2 | from typing import Optional 3 | 4 | import logzero # type: ignore 5 | 6 | __all__ = ["getLogger"] 7 | 8 | LOG_FORMAT = ( 9 | "%(color)s[%(levelname)s | %(asctime)s | " 10 | "%(name)s | %(module)s | line %(lineno)d]:%(end_color)s %(message)s" 11 | ) 12 | 13 | 14 | def getLogger( 15 | name: str, 16 | level: int = logzero.logging.DEBUG, 17 | formatter: Optional[logzero.LogFormatter] = logzero.LogFormatter(fmt=LOG_FORMAT), 18 | ) -> logzero.logger: 19 | """Formats and sets up the logger instance. 20 | 21 | Args: 22 | name (str): The name of the Logger. 23 | level (int): The default level (logzero.logging.INFO = 20) of the logger. 24 | formatter (:obj:, optional): The format of the log message. Defaults to the default logzero format. 25 | 26 | Returns: 27 | An instance of a logger. 28 | 29 | Examples: 30 | >>> from edgePy.util import getLogger 31 | >>> log = getLogger(name="script") 32 | >>> log.info('This is your DGElist.') 33 | ... 34 | 35 | Notes: 36 | 1. See https://docs.python.org/3/library/logging.html#levels for more information about logging levels. 37 | 38 | """ 39 | log_formatter = ( 40 | logzero.LogFormatter(fmt=logzero.LogFormatter.DEFAULT_FORMAT) 41 | if formatter is None 42 | else formatter 43 | ) 44 | logger = logzero.setup_logger(name=name, level=level, formatter=log_formatter) 45 | 46 | return logger 47 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 99 3 | py36 = true 4 | skip-string-normalization = true 5 | include = '\.pyi?$' 6 | exclude = ''' 7 | /( 8 | \.git 9 | | \.mypy_cache 10 | | \.tox 11 | | venv 12 | | _build 13 | | buck-out 14 | | build 15 | | dist 16 | )/ 17 | ''' 18 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | mongodb_fixture_dir = 3 | tests/mongodb/fixtures -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | coverage==4.5 2 | flake8==3.5.0 3 | mypy==0.620 4 | pylint==2.0.0 5 | pytest==3.6.3 6 | pytest-cov==2.5.1 7 | pytest-parallel==0.0.2 8 | pymongo==3.7.1 9 | pytest-mongodb==2.1.2 10 | black==18.6b3 11 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-bioinformatics/edgePy/298834f38565c9b0d0b476f4cd47d93522b3cfcd/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/edgepy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Dict, Hashable, Any 3 | import configparser 4 | 5 | import numpy as np 6 | from scipy.stats import ks_2samp 7 | from smart_open import smart_open # type: ignore 8 | 9 | 10 | from edgePy.DGEList import DGEList 11 | from edgePy.data_import.mongodb.mongo_import import ImportFromMongodb 12 | from edgePy.util import 
getLogger 13 | 14 | log = getLogger(name="script") 15 | 16 | 17 | def parse_arguments(parser=None): 18 | if not parser: 19 | parser = argparse.ArgumentParser() 20 | 21 | parser.add_argument("--count_file", help="name of the count file") 22 | parser.add_argument("--groups_file", help="name of the groups file") 23 | parser.add_argument("--dge_file", help="import from .dge file;") 24 | parser.add_argument("--gene_list", default=None, help="a list of genes to filter the data set") 25 | 26 | # mongo parameters 27 | parser.add_argument( 28 | "--mongo_config", help="a way to import data from a supported mongo database" 29 | ) 30 | parser.add_argument("--mongo_key_name", default="Project") 31 | parser.add_argument("--mongo_key_value", default="RNA-Seq1") 32 | parser.add_argument("--database_name") 33 | parser.add_argument( 34 | "--group1_sample_names", nargs='+', help="List of samples names for first group" 35 | ) 36 | parser.add_argument( 37 | "--group2_sample_names", nargs='+', help="List of samples names for second group" 38 | ) 39 | parser.add_argument( 40 | "--groups_json", help="A JSON file with the group names, and list of samples. see example." 41 | ) 42 | 43 | parser.add_argument("--output", help="optional output file for results") 44 | parser.add_argument("--cutoff", help="p-value cutoff to accept.", default=0.05) 45 | parser.add_argument( 46 | "--minimum_cpm", help="discard results for which no group has this many counts", default=1 47 | ) 48 | 49 | args = parser.parse_args() 50 | 51 | return args 52 | 53 | 54 | class EdgePy(object): 55 | def __init__(self, args): 56 | 57 | self.dge_list = None 58 | 59 | if args.dge_file: 60 | self.dge_list = DGEList(filename=args.dge_file) 61 | log.info(f"The DGE list is {self.dge_list}") 62 | 63 | elif args.mongo_config: 64 | # This section is only useful for MongoDB based analyses. Talk to @apfejes about this section if you have 65 | # any questions. 66 | 67 | config = configparser.ConfigParser() 68 | config.read(args.mongo_config) 69 | 70 | if args.group1_sample_names and args.group2_sample_names: 71 | key = 'sample_name' 72 | value = args.group1_sample_names + args.group2_sample_names 73 | 74 | elif args.key_name and args.mongo_key_value: 75 | key = args.mongo_key_name 76 | value = args.mongo_key_value 77 | else: 78 | raise ValueError("Insufficient parameters for use of Mongodb") 79 | 80 | mongo_importer = ImportFromMongodb( 81 | host=config.get("Mongo", "host"), 82 | port=config.get("Mongo", "port"), 83 | mongo_key=key, 84 | mongo_value=value, 85 | gene_list_file=args.gene_list, 86 | ) 87 | 88 | sample_list, data_set, gene_list, sample_category = mongo_importer.get_data_from_mongo( 89 | database=args.database_name 90 | ) 91 | 92 | if key == 'sample_name': 93 | # Override sample categories if sample name is the source of the categories. 
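# (Sketch of the intent: every sample passed via --group1_sample_names is labelled "group1";
# the remaining requested samples are labelled "group2".)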
94 | sample_category_list = [ 95 | "group1" if sample_name in args.group1_sample_names else "group2" 96 | for sample_name in sample_list 97 | ] 98 | sample_category_dict = None 99 | else: 100 | # TODO: read from file 101 | sample_category_dict = args.groups_json 102 | sample_category_list = None 103 | 104 | self.dge_list = DGEList.create_DGEList( 105 | sample_list, 106 | data_set, 107 | gene_list, 108 | sample_to_category=sample_category_list, 109 | category_to_samples=sample_category_dict, 110 | ) 111 | 112 | self.ensg_to_symbol = mongo_importer.mongo_reader.find_as_dict( 113 | 'ensembl_90_37', "symbol_by_ensg", query={} 114 | ) 115 | 116 | else: 117 | self.dge_list = DGEList.create_DGEList_data_file( 118 | data_file=args.counts_file, group_file=args.groups_file 119 | ) 120 | 121 | self.output = args.output if args.output else None 122 | self.p_value_cutoff = args.cutoff 123 | self.minimum_cpm = args.minimum_cpm 124 | 125 | def run_ks(self): 126 | """ 127 | First pass implementation of a Kolmogorov-Smirnov test for different groups, using the Scipy KS test two-tailed 128 | implementation. 129 | 130 | Args: 131 | None. 132 | 133 | """ 134 | 135 | log.info(self.dge_list.groups_list) 136 | 137 | gene_details, gene_likelyhood1, group_types = self.ks_2_samples() 138 | 139 | results = self.generate_results( 140 | gene_details, gene_likelyhood1, group_types[0], group_types[1] 141 | ) 142 | 143 | if self.output: 144 | with smart_open(self.output, 'w') as out: 145 | out.writelines(results) 146 | log.info(f"Wrote to {self.output}") 147 | else: 148 | for line in results: 149 | log.info(line) 150 | 151 | def ks_2_samples(self): 152 | """Run a 2-tailed Kolmogorov-Smirnov test on the DGEList object. 153 | 154 | Args: 155 | None. 156 | 157 | Returns: 158 | gene_details: a dictionary of dictionary (key, gene), holding mean1 and mean2 for the two groups 159 | gene_likelihood: a dictionary (key, gene), holding the p-value of the separation of the two groups 160 | group_types: list of the groups in order. 161 | 162 | """ 163 | gene_likelihood1: Dict[Hashable, float] = {} 164 | group_types = set(self.dge_list.groups_list) 165 | group_types = list(group_types) 166 | group_filters: Dict[Hashable, Any] = {} 167 | gene_details: Dict[Hashable, Dict[Hashable, Any]] = {} 168 | for group in group_types: 169 | group_filters[group] = [g == group for g in self.dge_list.groups_list] 170 | for gene_idx, gene in enumerate(self.dge_list.genes): 171 | gene_row = self.dge_list.counts[gene_idx] 172 | if len(group_types) == 2: 173 | group_data1 = gene_row.compress(group_filters[group_types[0]]) 174 | mean1 = np.mean(group_data1) 175 | 176 | group_data2 = gene_row.compress(group_filters[group_types[1]]) 177 | mean2 = np.mean(group_data2) 178 | 179 | gene_likelihood1[gene] = ks_2samp(group_data1, group_data2)[1] 180 | 181 | gene_details[gene] = {'mean1': mean1, 'mean2': mean2} 182 | return gene_details, gene_likelihood1, group_types 183 | 184 | def generate_results( 185 | self, 186 | gene_details: Dict[Hashable, Dict[Hashable, Any]], 187 | gene_likelihood1: Dict[Hashable, float], 188 | group_type1: str, 189 | group_type2: str, 190 | ) -> List[str]: 191 | 192 | """ 193 | This function simply prepares a summary of the results of the analysis for dumping to file or to screen 194 | 195 | Args: 196 | gene_details: information about the genes - should contain fields 'mean1' and 'mean2' for display 197 | gene_likelihood1: dictionary of gene names and the p-value associated. 
used to sort the data 198 | group_type1: the name of the first grouping 199 | group_type2: the name of the second grouping 200 | 201 | """ 202 | 203 | results: List[str] = [] 204 | sorted_likely = [ 205 | (gene, gene_likelihood1[gene]) 206 | for gene in sorted(gene_likelihood1, key=gene_likelihood1.get) 207 | ] 208 | results.append(f"gene_name\tp-value\t{group_type1}\t{group_type2}\n") 209 | for gene, p in sorted_likely: 210 | m1 = gene_details[gene]['mean1'] 211 | m2 = gene_details[gene]['mean2'] 212 | symbol = ( 213 | self.ensg_to_symbol[gene]['symbols'][0] if gene in self.ensg_to_symbol else gene 214 | ) 215 | 216 | if ( 217 | p < self.p_value_cutoff 218 | and not (m1 < self.minimum_cpm and m2 < self.minimum_cpm) 219 | and m1 < m2 220 | ): 221 | results.append( 222 | f"{gene}\t" 223 | f"{symbol}\t" 224 | f"{gene_likelihood1[gene]}\t" 225 | f"{gene_details[gene]['mean1']:.2f}\t" 226 | f"{gene_details[gene]['mean2']:.2f}\n" 227 | ) 228 | 229 | return results 230 | 231 | 232 | def main(): 233 | 234 | args = parse_arguments() 235 | default_class = EdgePy(args) 236 | default_class.run_ks() 237 | 238 | 239 | if __name__ == "__main__": 240 | main() 241 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = edgePy 3 | version = 0.0.1 4 | author = r-bioinformatics 5 | url = https://github.com/r-bioinformatics/edgePy 6 | description = A Python port of edgeR for differential expression analysis. 7 | long_description = file: README.md, LICENSE 8 | long_description_content_type = text/markdown 9 | keywords = bioinformatics, gene, differential, expression, edgeR 10 | requires-dist = setuptools>=30.3.0 11 | license = MIT 12 | classifiers = 13 | Development Status :: 2 - Pre-Alpha 14 | Intended Audience :: Science/Research 15 | License :: OSI Approved :: MIT License 16 | Topic :: Scientific/Engineering :: Bio-Informatics 17 | Programming Language :: Python :: 3.6 18 | project-urls = 19 | Slack-Group = https://r-bioinformatics.slack.com/ 20 | Subreddit = https://reddit.com/r/bioinformatics/ 21 | 22 | [options] 23 | zip_safe = True 24 | include_package_data = True 25 | packages = find: 26 | install_requires = 27 | numpy>=1.14.5 28 | smart_open>=1.6.0 29 | tox>=3.1.2 30 | scipy>=1.1.0 31 | logzero>=1.0.0 32 | sphinx>=1.7 33 | pymysql>=0.9.2 34 | 35 | [mypy] 36 | warn_redundant_casts = True 37 | warn_unused_ignores = True 38 | mypy_path = docs/stubs 39 | python_version = 3.6 40 | 41 | [mypy-*] 42 | disallow_untyped_calls = True 43 | disallow_untyped_defs = True 44 | warn_return_any = True 45 | no_implicit_optional = True 46 | strict_optional = True 47 | ignore_missing_imports = False 48 | 49 | [flake8] 50 | max-line-length = 120 51 | doctests = True 52 | show-source = True 53 | ignore = 54 | exclude = 55 | .git 56 | .mypy_cache 57 | .pytest_cache 58 | libs 59 | docs 60 | tests 61 | __init__.py 62 | 63 | [tool:pytest] 64 | addopts = --verbose --doctest-modules --cov 65 | 66 | [coverage:run] 67 | branch = true 68 | source = edgePy 69 | parallel = true 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # Name and version specified here for `sdist` only. 
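# (The remaining packaging metadata and install_requires are declared in setup.cfg.)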
4 | setup(name="edgePy", version="0.0.1") 5 | -------------------------------------------------------------------------------- /tests/ensembl/test_canonical_transcripts.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | 4 | from edgePy.data_import.data_import import get_dataset_path 5 | 6 | TEST_DATASET = "transcripts_homo_sapiens_core_75_37.tsv" 7 | TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv" 8 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 9 | 10 | 11 | class TestCanonicalTranscripts(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.canonicaldata = CanonicalDataStore( 15 | get_dataset_path(TEST_DATASET), get_dataset_path(TEST_GENE_SYMBOLS) 16 | ) 17 | 18 | def test_is_canonical_by_transcript(self): 19 | # ENSG00000224451 ENST00000433775 567 True 20 | # ENSG00000175063 ENST00000405520 1441 False 21 | assert self.canonicaldata.is_canonical_by_transcript("ENST00000433775") is True 22 | assert self.canonicaldata.is_canonical_by_transcript("ENST00000405520") is False 23 | 24 | def test_get_canonical_transcript(self): 25 | # ENSG00000104047 ENST00000403028 1579 False 26 | # ENSG00000104047 ENST00000557968 932 False 27 | # ENSG00000104047 ENST00000559223 789 False 28 | # ENSG00000104047 ENST00000558653 1126 False 29 | # ENSG00000104047 ENST00000561188 588 False 30 | # ENSG00000104047 ENST00000557988 1195 False 31 | # ENSG00000104047 ENST00000560735 596 False 32 | # ENSG00000104047 ENST00000559164 673 False 33 | # ENSG00000104047 ENST00000560632 548 False 34 | # ENSG00000104047 ENST00000559405 580 False 35 | # ENSG00000104047 ENST00000251250 2674 True 36 | # ENSG00000104047 ENST00000329873 476 False 37 | # ENSG00000104047 ENST00000415425 2208 False 38 | assert self.canonicaldata.get_canonical_transcript("ENSG00000104047") == "ENST00000251250" 39 | 40 | def test_get_length_of_transcript(self): 41 | 42 | # ENSG00000224451 ENST00000433775 567 True 43 | # ENSG00000175063 ENST00000405520 1441 False 44 | assert self.canonicaldata.get_length_of_transcript("ENST00000433775") == 567 45 | assert self.canonicaldata.get_length_of_transcript("ENST00000405520") == 1441 46 | 47 | def test_get_length_of_canonical_transcript(self): 48 | # ENSG00000224451 ENST00000433775 567 True 49 | # ENSG00000104047 ENST00000251250 2674 True 50 | assert self.canonicaldata.get_length_of_canonical_transcript("ENSG00000224451") == 567 51 | assert self.canonicaldata.get_length_of_canonical_transcript("ENSG00000104047") == 2674 52 | -------------------------------------------------------------------------------- /tests/ensembl/test_ensembl_flat_file_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | from edgePy.data_import.data_import import get_dataset_path 4 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 5 | 6 | 7 | TEST_GENE_SET_DATA = "transcripts_homo_sapiens_core_75_37.tsv" 8 | TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv" 9 | 10 | 11 | class TestEnsembleFlatFileReader(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.icd = CanonicalDataStore( 15 | get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) 16 | ) 17 | 18 | def test_pick_gene_id_1(self): 19 | gene_list = ["ENG000000123", "ENG000000125", "ENG000000130"] 20 | 21 | best_gene = self.icd.pick_gene_id(gene_list) 22 | 23 | assert best_gene == 
"ENG000000130" 24 | 25 | def test_has_gene(self): 26 | # SLC25A26 ENSG00000261657 27 | assert self.icd.has_gene("ENSG00000261657") 28 | # HMGA1P6 ENSG00000233440 29 | assert self.icd.has_gene("ENSG00000233440") 30 | # fake genes that don't exist. 31 | assert not self.icd.has_gene("ENSG00000000001") 32 | assert not self.icd.has_gene("ENSG00000000010") 33 | 34 | def test_get_symbol_from_gene(self): 35 | # FABP3P2 ENSG00000233259 36 | # DHFRP1 ENSG00000188985 37 | # LINC01050 ENSG00000271216 38 | 39 | assert self.icd.get_symbol_from_gene("ENSG00000233259") == "FABP3P2" 40 | assert self.icd.get_symbol_from_gene("ENSG00000188985") == "DHFRP1" 41 | assert self.icd.get_symbol_from_gene("ENSG00000271216") == "LINC01050" 42 | assert self.icd.get_symbol_from_gene("ENSG00000271216") == "LINC01050" 43 | with self.assertRaises(KeyError): 44 | self.icd.get_symbol_from_gene("NOTAREALGENE") 45 | 46 | def test_get_genes_from_symbol(self): 47 | 48 | assert self.icd.get_genes_from_symbol("FABP3P2") == ['ENSG00000233259'] 49 | assert self.icd.get_genes_from_symbol("FAKEGENE1") == [] 50 | print(self.icd.get_genes_from_symbol("PAN1")) 51 | self.assertListEqual( 52 | self.icd.get_genes_from_symbol("PAN1"), 53 | [ 54 | 'ENSG00000022556', 55 | 'ENSG00000270370', 56 | 'ENSG00000262615', 57 | 'ENSG00000262886', 58 | 'ENSG00000262329', 59 | 'ENSG00000262811', 60 | 'ENSG00000262175', 61 | 'ENSG00000262929', 62 | 'ENSG00000262260', 63 | ], 64 | ) 65 | 66 | def test_is_known_symbol(self): 67 | # SLC25A26 ENSG00000261657 68 | assert self.icd.is_known_symbol("FABP3P2") 69 | # HMGA1P6 ENSG00000233440 70 | assert self.icd.is_known_symbol("DHFRP1") 71 | # fake genes that don't exist. 72 | assert not self.icd.is_known_symbol("FAKEGENE1") 73 | 74 | def test_is_known_gene(self): 75 | # SLC25A26 ENSG00000261657 76 | assert self.icd.is_known_gene("ENSG00000261657") 77 | # HMGA1P6 ENSG00000233440 78 | assert self.icd.is_known_gene("ENSG00000233440") 79 | # fake genes that don't exist. 
80 | assert not self.icd.is_known_gene("ENSG00000000001") 81 | assert not self.icd.is_known_gene("ENSG00000000010") 82 | 83 | def test_is_canonical_by_transcript(self): 84 | """ 85 | ENSG00000171448 ENST00000373656 4441 True 86 | ENSG00000171448 ENST00000373654 2045 False 87 | 88 | ENSG00000140157 ENST00000337451 3225 True 89 | ENSG00000140157 ENST00000398013 2274 False 90 | """ 91 | assert self.icd.is_canonical_by_transcript("ENST00000373656") is True 92 | assert self.icd.is_canonical_by_transcript("ENST00000373654") is False 93 | assert self.icd.is_canonical_by_transcript("ENST00000337451") is True 94 | assert self.icd.is_canonical_by_transcript("ENST00000398013") is False 95 | 96 | def test_get_canonical_transcript(self): 97 | """ 98 | ENSG00000171448 ENST00000373656 4441 True 99 | ENSG00000171448 ENST00000373654 2045 False 100 | 101 | ENSG00000140157 ENST00000337451 3225 True 102 | ENSG00000140157 ENST00000398013 2274 False 103 | """ 104 | assert self.icd.get_canonical_transcript("ENSG00000171448") == "ENST00000373656" 105 | assert self.icd.get_canonical_transcript("ENSG00000140157") == "ENST00000337451" 106 | 107 | def test_get_length_of_transcript(self): 108 | """ 109 | ENSG00000171448 ENST00000373656 4441 True 110 | ENSG00000171448 ENST00000373654 2045 False 111 | 112 | ENSG00000140157 ENST00000337451 3225 True 113 | ENSG00000140157 ENST00000398013 2274 False 114 | """ 115 | assert self.icd.get_length_of_transcript("ENST00000373656") == 4441 116 | assert self.icd.get_length_of_transcript("ENST00000373654") == 2045 117 | assert self.icd.get_length_of_transcript("ENST00000337451") == 3225 118 | assert self.icd.get_length_of_transcript("ENST00000398013") == 2274 119 | 120 | def test_get_length_of_canonical_transcript(self): 121 | """ 122 | ENSG00000171448 ENST00000373656 4441 True 123 | ENSG00000171448 ENST00000373654 2045 False 124 | 125 | ENSG00000140157 ENST00000337451 3225 True 126 | ENSG00000140157 ENST00000398013 2274 False 127 | """ 128 | assert self.icd.get_length_of_canonical_transcript("ENSG00000171448") == 4441 129 | assert self.icd.get_length_of_canonical_transcript("ENSG00000140157") == 3225 130 | -------------------------------------------------------------------------------- /tests/mongodb/fixtures/ensg_by_symbol.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : "BRCA1", "ensgs" : [ "ENSG00000012048" ] }, 2 | { "_id" : "BRCA2", "ensgs" : [ "ENSG00000139618" ] }, 3 | { "_id" : "TP53", "ensgs" : [ "ENSG00000141510" ] }] -------------------------------------------------------------------------------- /tests/mongodb/fixtures/samples.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : "5aa97335f8848a3fd2ccd3e4", "assay_type" : "RNA-Seq", "avgspotlen" : "50", "bioproject" : "PRJNA362579", "biosample" : "SAMN06242166", "center_name" : "GEO", "consent" : "public", "experiment" : "SRX2505175", "insertsize" : "0", "instrument" : "Illumina HiSeq 2500", "librarylayout" : "SINGLE", "libraryselection" : "cDNA", "librarysource" : "TRANSCRIPTOMIC", "loaddate" : "2017-01-19", "mbases" : "1232", "mbytes" : "611", "organism" : "Homo sapiens", "platform" : "ILLUMINA", "releasedate" : "2017-08-30", "run" : "SRR5189264", "sra_sample" : "SRS1929803", "sra_study" : "SRP097153", "sample_name" : "SRR5189264", "cell_type" : "ES-cell derived human cardiac organoids", "source_name" : "human cardiac organoids", "sample_sra_name" : "GSM2463762", "Tenaya_set_type" : "Cardiomyocytes", "Project" : 
"Public Data" }, 2 | { "_id" : "5aa97335f8848a3fd2ccd3e5", "assay_type" : "RNA-Seq", "avgspotlen" : "50", "bioproject" : "PRJNA362579", "biosample" : "SAMN06242165", "center_name" : "GEO", "consent" : "public", "experiment" : "SRX2505176", "insertsize" : "0", "instrument" : "Illumina HiSeq 2500", "librarylayout" : "SINGLE", "libraryselection" : "cDNA", "librarysource" : "TRANSCRIPTOMIC", "loaddate" : "2017-01-19", "mbases" : "1134", "mbytes" : "566", "organism" : "Homo sapiens", "platform" : "ILLUMINA", "releasedate" : "2017-08-30", "run" : "SRR5189265", "sra_sample" : "SRS1929804", "sra_study" : "SRP097153", "sample_name" : "SRR5189265", "cell_type" : "ES-cell derived human cardiac organoids", "source_name" : "human cardiac organoids", "sample_sra_name" : "GSM2463763", "Tenaya_set_type" : "Cardiomyocytes", "Project" : "Public Data" }, 3 | { "_id" : "5aa97335f8848a3fd2ccd3e6", "assay_type" : "RNA-Seq", "avgspotlen" : "50", "bioproject" : "PRJNA362579", "biosample" : "SAMN06242164", "center_name" : "GEO", "consent" : "public", "experiment" : "SRX2505177", "insertsize" : "0", "instrument" : "Illumina HiSeq 2500", "librarylayout" : "SINGLE", "libraryselection" : "cDNA", "librarysource" : "TRANSCRIPTOMIC", "loaddate" : "2017-01-19", "mbases" : "1094", "mbytes" : "543", "organism" : "Homo sapiens", "platform" : "ILLUMINA", "releasedate" : "2017-08-30", "run" : "SRR5189266", "sra_sample" : "SRS1929805", "sra_study" : "SRP097153", "sample_name" : "SRR5189266", "cell_type" : "ES-cell derived human cardiac organoids", "source_name" : "human cardiac organoids", "sample_sra_name" : "GSM2463764", "Tenaya_set_type" : "Cardiomyocytes", "Project" : "Public Data" }] -------------------------------------------------------------------------------- /tests/mongodb/fixtures/symbol_by_ensg.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : "ENSG00000012048", "symbols" : [ "BRCA1" ] }, 2 | { "_id" : "ENSG00000139618", "symbols" : [ "BRCA2" ] }, 3 | { "_id" : "ENSG00000141510", "symbols" : [ "TP53" ] }] -------------------------------------------------------------------------------- /tests/mongodb/test_gene_functions.py: -------------------------------------------------------------------------------- 1 | """The core Python code for generating data.""" 2 | import pytest 3 | from edgePy.data_import.data_import import get_dataset_path 4 | 5 | from edgePy.data_import.mongodb.gene_functions import * 6 | from edgePy.data_import.mongodb.mongo_wrapper import MongoWrapper 7 | 8 | GENE_LIST_DATASET = "example_gene_list.txt" 9 | 10 | RNASeq_RECORD = { 11 | "_id": "5a7519801fd85c0e41c94c51", 12 | "gene": "ENSG00000232977", 13 | "sample_name": "SRR4011901", 14 | "transcripts": { 15 | "ENST00000575689": { 16 | "size": 720, 17 | "canonical": "0", 18 | "exons": { 19 | "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, 20 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 21 | }, 22 | "rpkm": 0.39507510837872767, 23 | }, 24 | "ENST00000576696": { 25 | "size": 1306, 26 | "canonical": "0", 27 | "exons": { 28 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 29 | "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, 30 | }, 31 | "rpkm": 0.195204453741728, 32 | }, 33 | "ENST00000443778": { 34 | "size": 2084, 35 | "canonical": "1", 36 | "exons": { 37 | "ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511}, 38 | "ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 
0.22669418591124607}, 39 | }, 40 | "rpkm": 0.05165702955135873, 41 | }, 42 | }, 43 | "star_rpkm": None, 44 | } 45 | 46 | 47 | RNASeq_RECORD_NO_CANONICAL = { 48 | "_id": "5a7519801fd85c0e41c94c51", 49 | "gene": "ENSG00000232977", 50 | "sample_name": "SRR4011901", 51 | "transcripts": { 52 | "ENST00000575689": { 53 | "size": 720, 54 | "canonical": "0", 55 | "exons": { 56 | "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, 57 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 58 | }, 59 | "rpkm": 0.39507510837872767, 60 | }, 61 | "ENST00000576696": { 62 | "size": 1306, 63 | "canonical": "0", 64 | "exons": { 65 | "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, 66 | "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, 67 | }, 68 | "rpkm": 0.195204453741728, 69 | }, 70 | }, 71 | "star_rpkm": None, 72 | } 73 | 74 | 75 | @pytest.fixture 76 | def gene_list_file(): 77 | return get_dataset_path(GENE_LIST_DATASET) 78 | 79 | 80 | def test_get_genelist_from_file(): 81 | gene_list = get_genelist_from_file(gene_list_file()) 82 | assert gene_list == ["TP53", "BRCA1", "BRCA2"] 83 | 84 | 85 | def test_get_genelist_from_file_no_file(): 86 | gene_list = get_genelist_from_file(None) 87 | assert gene_list is None 88 | 89 | 90 | def test_translate_genes_symbol(mongodb): 91 | mw = MongoWrapper("localhost", "27017") 92 | mw.session = mongodb 93 | gene_list = get_genelist_from_file(gene_list_file()) 94 | ensg_genes, gene_symbols = translate_genes(gene_list, mw, "pytest") 95 | assert ensg_genes == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 96 | assert gene_symbols == { 97 | "ENSG00000012048": "BRCA1", 98 | "ENSG00000139618": "BRCA2", 99 | "ENSG00000141510": "TP53", 100 | } 101 | 102 | 103 | def test_translate_genes_ensg(mongodb): 104 | mw = MongoWrapper("localhost", "27017") 105 | mw.session = mongodb 106 | gene_list = ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 107 | ensg_genes, gene_symbols = translate_genes(gene_list, mw, "pytest") 108 | assert ensg_genes == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 109 | assert gene_symbols == { 110 | "ENSG00000012048": "BRCA1", 111 | "ENSG00000139618": "BRCA2", 112 | "ENSG00000141510": "TP53", 113 | } 114 | 115 | 116 | def test_get_gene_list(mongodb): 117 | mw = MongoWrapper("localhost", "27017") 118 | mw.session = mongodb 119 | gene_list = get_gene_list(mw, database="pytest") 120 | assert gene_list == { 121 | "ENSG00000012048": "BRCA1", 122 | "ENSG00000139618": "BRCA2", 123 | "ENSG00000141510": "TP53", 124 | } 125 | 126 | 127 | def test_get_sample_details(mongodb): 128 | mw = MongoWrapper("localhost", "27017") 129 | mw.session = mongodb 130 | details = get_sample_details("Project", mw, "pytest") 131 | assert details == { 132 | "SRR5189264": {"category": "Public Data", "description": "SRR5189264"}, 133 | "SRR5189265": {"category": "Public Data", "description": "SRR5189265"}, 134 | "SRR5189266": {"category": "Public Data", "description": "SRR5189266"}, 135 | } 136 | 137 | 138 | def test_get_canonical_rpkm(): 139 | rpkm = get_canonical_rpkm(RNASeq_RECORD) 140 | assert rpkm == 0.05165702955135873 141 | 142 | 143 | def test_get_canonical_rpkm_no_canonical(): 144 | rpkm = get_canonical_rpkm(RNASeq_RECORD_NO_CANONICAL) 145 | assert rpkm is None 146 | 147 | 148 | def test_get_canonical_raw_no_canonical(): 149 | raw = get_canonical_raw(RNASeq_RECORD_NO_CANONICAL) 150 | assert raw is None 151 | 
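# Illustrative sketch of the lookup exercised by the canonical-RPKM tests above. The
# RNASeq_RECORD fixtures hold one entry per transcript, each flagged with "canonical" as the
# string "1" or "0", and get_canonical_rpkm() returns the rpkm of the flagged transcript.
# The helper below is only a minimal illustration written against the fixtures in this file;
# it is an assumed sketch, not edgePy's own implementation (see gene_functions.py).
def _canonical_rpkm_sketch(record):
    """Return the rpkm of the canonical transcript in `record`, or None if none is flagged."""
    for transcript in record["transcripts"].values():
        if transcript.get("canonical") == "1":
            return transcript["rpkm"]
    return None


# For RNASeq_RECORD this yields 0.05165702955135873 (ENST00000443778, the '"canonical": "1"'
# entry); for RNASeq_RECORD_NO_CANONICAL it yields None, matching the assertions above.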
-------------------------------------------------------------------------------- /tests/mongodb/test_mongo_import.py: -------------------------------------------------------------------------------- 1 | from edgePy.data_import.mongodb.mongo_import import ImportFromMongodb 2 | from edgePy.data_import.mongodb.mongo_import import parse_arguments 3 | from edgePy.data_import.data_import import get_dataset_path 4 | 5 | 6 | def test_parse_arguments(): 7 | config = "file.txt" 8 | gene_list = "groups.txt" 9 | key_name = "Project" 10 | key_value = "Publie Data" 11 | 12 | ci_values = [ 13 | "--config", 14 | config, 15 | "--gene_list", 16 | gene_list, 17 | "--key_name", 18 | "Project", 19 | "--key_value", 20 | "Publie Data", 21 | ] 22 | 23 | args = parse_arguments(None, ci_values=ci_values) 24 | 25 | assert config == args.config 26 | assert gene_list == args.gene_list 27 | assert key_name == args.key_name 28 | assert key_value == args.key_value 29 | 30 | 31 | def test_get_data_from_mongo_nofilters(mongodb): 32 | im = ImportFromMongodb( 33 | host="localhost", port=27017, mongo_key=None, mongo_value=None, gene_list_file=None 34 | ) 35 | im.mongo_reader.session = mongodb 36 | sample_list, dataset, gene_list, sample_category = im.get_data_from_mongo(database="pytest") 37 | assert sample_list == ["SRR5189264", "SRR5189265", "SRR5189266"] 38 | assert dataset == { 39 | "SRR5189264": {"ENSG00000012048": 70, "ENSG00000139618": 105, "ENSG00000141510": 270}, 40 | "SRR5189265": {"ENSG00000012048": 76, "ENSG00000139618": 168, "ENSG00000141510": 347}, 41 | "SRR5189266": {"ENSG00000012048": 62, "ENSG00000139618": 104, "ENSG00000141510": 191}, 42 | } 43 | assert gene_list == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 44 | assert sample_category == { 45 | "SRR5189264": "SRR5189264", 46 | "SRR5189265": "SRR5189265", 47 | "SRR5189266": "SRR5189266", 48 | } 49 | 50 | 51 | def test_get_data_from_mongo_filters(mongodb): 52 | im = ImportFromMongodb( 53 | host="localhost", 54 | port=27017, 55 | mongo_key="Project", 56 | mongo_value="Public Data", 57 | gene_list_file=None, 58 | ) 59 | im.mongo_reader.session = mongodb 60 | sample_list, dataset, gene_list, sample_category = im.get_data_from_mongo(database="pytest") 61 | assert sample_list == ["SRR5189264", "SRR5189265", "SRR5189266"] 62 | assert dataset == { 63 | "SRR5189264": {"ENSG00000012048": 70, "ENSG00000139618": 105, "ENSG00000141510": 270}, 64 | "SRR5189265": {"ENSG00000012048": 76, "ENSG00000139618": 168, "ENSG00000141510": 347}, 65 | "SRR5189266": {"ENSG00000012048": 62, "ENSG00000139618": 104, "ENSG00000141510": 191}, 66 | } 67 | assert gene_list == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 68 | assert sample_category == { 69 | "SRR5189264": "Public Data", 70 | "SRR5189265": "Public Data", 71 | "SRR5189266": "Public Data", 72 | } 73 | 74 | 75 | def test_get_data_from_mongo_gene_list(mongodb): 76 | filename = str(get_dataset_path("example_gene_list.txt")) 77 | im = ImportFromMongodb( 78 | host="localhost", 79 | port=27017, 80 | mongo_key="Project", 81 | mongo_value="Public Data", 82 | gene_list_file=filename, 83 | ) 84 | im.mongo_reader.session = mongodb 85 | sample_list, dataset, gene_list, sample_category = im.get_data_from_mongo(database="pytest") 86 | assert sample_list == ["SRR5189264", "SRR5189265", "SRR5189266"] 87 | assert dataset == { 88 | "SRR5189264": {"ENSG00000012048": 70, "ENSG00000139618": 105, "ENSG00000141510": 270}, 89 | "SRR5189265": {"ENSG00000012048": 76, "ENSG00000139618": 168, "ENSG00000141510": 347}, 90 | 
"SRR5189266": {"ENSG00000012048": 62, "ENSG00000139618": 104, "ENSG00000141510": 191}, 91 | } 92 | assert gene_list == ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] 93 | assert sample_category == { 94 | "SRR5189264": "Public Data", 95 | "SRR5189265": "Public Data", 96 | "SRR5189266": "Public Data", 97 | } 98 | -------------------------------------------------------------------------------- /tests/mongodb/test_mongo_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from edgePy.data_import.mongodb.mongo_wrapper import MongoWrapper 3 | from edgePy.data_import.mongodb.mongo_wrapper import MongoInserter 4 | from edgePy.data_import.mongodb.mongo_wrapper import MongoUpdater 5 | 6 | 7 | def test_mongo_wrapper_find_as_cursor(mongodb): 8 | mw = MongoWrapper("localhost", "27017") 9 | mw.session = mongodb 10 | assert "ensg_by_symbol" in mongodb.collection_names() 11 | cursor = mw.find_as_cursor("pytest", "ensg_by_symbol", {}, {}) 12 | 13 | count = 0 14 | for i in cursor: 15 | count += 1 16 | 17 | assert 3 == count 18 | 19 | 20 | def test_mongo_wrapper_find_as_cursor_fail(): 21 | mw = MongoWrapper("localhost", "27017") 22 | # do not set session to mongodb to cause this error. 23 | with pytest.raises(Exception): 24 | mw.find_as_cursor("pytest", "fake_table", {"_id_": "something"}, {}) 25 | 26 | 27 | def test_mongo_wrapper_find_as_list(mongodb): 28 | mw = MongoWrapper("localhost", "27017") 29 | mw.session = mongodb 30 | assert "ensg_by_symbol" in mongodb.collection_names() 31 | value = mw.find_as_list("pytest", "ensg_by_symbol", {}, {}) 32 | assert value == [{"_id": "BRCA1"}, {"_id": "BRCA2"}, {"_id": "TP53"}] 33 | 34 | 35 | def test_mongo_wrapper_find_as_dict(mongodb): 36 | mw = MongoWrapper("localhost", "27017") 37 | mw.session = mongodb 38 | assert "ensg_by_symbol" in mongodb.collection_names() 39 | value = mw.find_as_dict("pytest", "ensg_by_symbol", {}) 40 | assert value == { 41 | "BRCA1": {"_id": "BRCA1", "ensgs": ["ENSG00000012048"]}, 42 | "TP53": {"_id": "TP53", "ensgs": ["ENSG00000141510"]}, 43 | "BRCA2": {"_id": "BRCA2", "ensgs": ["ENSG00000139618"]}, 44 | } 45 | 46 | 47 | def test_mongo_wrapper_insert(mongodb): 48 | mw = MongoWrapper("localhost", "27017") 49 | mw.session = mongodb 50 | mw.insert("pytest", "test", [{"rec1": "val1"}, {"rec2": "val2"}]) 51 | 52 | 53 | def test_mongo_wrapper_create_index(mongodb): 54 | mw = MongoWrapper("localhost", "27017") 55 | mw.session = mongodb 56 | mw.create_index("pytest", "test", "_id") 57 | 58 | 59 | def test_mongo_inserter_flush(mongodb): 60 | """This is not testable - the mongodb pytest module does not support bulk writes. """ 61 | 62 | mi = MongoInserter("localhost", 27017, "pytest", "test") 63 | mi.session = mongodb 64 | # mi.add(['A', 'B', 'C']) 65 | mi.flush() 66 | 67 | pass 68 | 69 | 70 | def test_mongo_inserter_add(mongodb): 71 | mi = MongoInserter("localhost", 27017, "pytest", "test") 72 | mi.session = mongodb 73 | mi.add(["A", "B", "C"]) 74 | 75 | 76 | def test_mongo_inserter_close(mongodb): 77 | mi = MongoInserter("localhost", 27017, "pytest", "test") 78 | mi.session = mongodb 79 | # mi.add(['A', 'B', 'C']) 80 | mi.close() 81 | 82 | 83 | def test_mongo_inserter_create_index_key(mongodb): 84 | mi = MongoInserter("localhost", 27017, "pytest", "test") 85 | mi.session = mongodb 86 | mi.create_index("pytest", "test", "_id") 87 | 88 | 89 | def test_mongo_updater_flush(mongodb): 90 | """This is not testable - the mongodb pytest module does not support bulk writes. 
""" 91 | mu = MongoUpdater("localhost", 27017, "pytest", "test") 92 | mu.session = mongodb 93 | mu.flush() 94 | 95 | 96 | def test_mongo_updater_add(mongodb): 97 | mu = MongoUpdater("localhost", 27017, "pytest", "test") 98 | mu.session = mongodb 99 | mu.add({}, {"a": "b"}) 100 | 101 | 102 | def test_mongo_updater_close(mongodb): 103 | mu = MongoUpdater("localhost", 27017, "pytest", "test") 104 | mu.session = mongodb 105 | mu.close() 106 | -------------------------------------------------------------------------------- /tests/test_DGEList.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pkgutil 3 | import numpy as np 4 | from smart_open import smart_open # type: ignore 5 | 6 | from edgePy.DGEList import DGEList 7 | from edgePy.data_import.data_import import get_dataset_path 8 | from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore 9 | 10 | TEST_GENE_SET_DATA = "transcripts_homo_sapiens_core_75_37.tsv" 11 | TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv" 12 | 13 | TEST_DATASET = "GSE49712_HTSeq.txt.gz" 14 | TEST_DATASET_NPZ = "GSE49712_HTSeq.txt.npz" 15 | TEST_GROUPS = "groups.json" 16 | 17 | 18 | @pytest.fixture 19 | def dge_list(): 20 | with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open( 21 | get_dataset_path(TEST_GROUPS), 'r' 22 | ) as group_handle: 23 | return DGEList.create_DGEList_handle(data_handle, group_handle) 24 | 25 | 26 | def test_sample_by_group(): 27 | samples = ["A", "B", "C", "D", "E"] 28 | expected_output = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]} 29 | group_by_sample = ['group1', 'group1', 'group2', 'group2', 'group2'] 30 | output = DGEList._sample_group_dict(group_by_sample, samples) 31 | assert output == expected_output 32 | 33 | 34 | def test_sample_group_list(): 35 | samples = ["A", "B", "C", "D", "E"] 36 | sample_by_group = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]} 37 | expected_output = np.array(['group1', 'group1', 'group2', 'group2', 'group2']) 38 | output = DGEList._sample_group_list(sample_by_group, samples) 39 | assert np.array_equal(output, expected_output) 40 | 41 | 42 | def test_minimal_init(): 43 | 44 | dge_list = DGEList( 45 | to_remove_zeroes=False, 46 | counts=np.ones(shape=(5, 5)), 47 | samples=["A", "B", "C", "D", "E"], 48 | genes=["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"], 49 | groups_in_dict={'group1': ["A", "B"], 'group2': ["C", "D", "E"]}, 50 | ) 51 | assert dge_list.__repr__() == "DGEList(num_samples=5, num_genes=5)" 52 | 53 | 54 | def test_too_much(): 55 | # TODO: Refactor into smaller units. 
56 | # - Test blank non-parameterized `DGEList()` 57 | # - Test opening handles, both gzipped or not 58 | # - Test samples and genes are set, validated, typed right 59 | assert len(dge_list().samples) == 10 60 | assert len(dge_list().genes) == 21711 61 | 62 | 63 | def test_too_many_options(): 64 | with pytest.raises(Exception): 65 | DGEList(counts=np.zeros(shape=(5, 10)), filename=str(get_dataset_path(TEST_DATASET_NPZ))) 66 | 67 | 68 | def test_too_many_options2(): 69 | with pytest.raises(Exception): 70 | DGEList(counts=np.ones(shape=(5, 10)), filename=str(get_dataset_path(TEST_DATASET_NPZ))) 71 | 72 | 73 | def test_library_size(): 74 | dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) 75 | assert np.array_equal( 76 | dge_list.library_size, 77 | np.array( 78 | [ 79 | 63_579_607, 80 | 58_531_933, 81 | 39_138_521, 82 | 78_565_885, 83 | 48_667_119, 84 | 62_799_917, 85 | 66_032_107, 86 | 66_194_776, 87 | 55_085_875, 88 | 37_760_315, 89 | ] 90 | ), 91 | ) 92 | 93 | 94 | def test_setting_DGElist_counts(): 95 | 96 | dge_list = DGEList( 97 | counts=np.zeros(shape=(5, 10)), 98 | groups_in_list=['A', 'A', 'B', 'B', 'B'], 99 | samples=['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9'], 100 | ) 101 | assert 5 == dge_list.counts.shape[0] 102 | assert 10 == dge_list.counts.shape[1] 103 | 104 | with pytest.raises(ValueError): 105 | c = np.array([[1, 1, 1], [-1, 1, 1]]) 106 | DGEList(counts=c, groups_in_list=["a", "b"]) 107 | with pytest.raises(ValueError): 108 | c = np.array([[1, 1, 1], [np.nan, 1, 1]]) 109 | DGEList(counts=c, groups_in_list=["a", "b"]) 110 | with pytest.raises(ValueError): 111 | c = np.array([1, 1, 1]) 112 | DGEList(counts=c, groups_in_list=["a", "b"]) 113 | with pytest.raises(TypeError): 114 | c = [1, 1, 1] 115 | dge_list.counts = c 116 | 117 | 118 | def test_cycle_dge_npz(): 119 | 120 | import tempfile 121 | import os 122 | 123 | tempdir = tempfile.mkdtemp(prefix="edgePy_tmp") 124 | file_name = tempdir + os.sep + next(tempfile._get_candidate_names()) 125 | dge_list_first = dge_list() 126 | dge_list_first.write_npz_file(filename=file_name) 127 | 128 | dge_list_second = DGEList(filename=file_name + ".npz") 129 | assert np.array_equal(dge_list_first.counts, dge_list_second.counts) 130 | assert np.array_equal(dge_list_first.genes, dge_list_second.genes) 131 | assert np.array_equal(dge_list_first.samples, dge_list_second.samples) 132 | assert np.array_equal(dge_list_first.norm_factors, dge_list_second.norm_factors) 133 | assert np.array_equal(dge_list_first.groups_list, dge_list_second.groups_list) 134 | os.remove(file_name + ".npz") 135 | os.rmdir(tempdir) 136 | 137 | 138 | def testing_setting_samples_and_counts(): 139 | # Empty list should fail 140 | with pytest.raises(Exception): 141 | DGEList( 142 | to_remove_zeroes=False, 143 | groups_in_list=['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'], 144 | ) 145 | 146 | # Lists with just counts should fail 147 | with pytest.raises(ValueError): 148 | DGEList(counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), groups_in_list=['A', 'A', 'B']) 149 | 150 | # lists sith samples and counts and groups should pass: 151 | DGEList( 152 | counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), 153 | groups_in_list=['A', 'A', 'B'], 154 | samples=["S1", 'S2', 'S3'], 155 | ) 156 | 157 | # Lists with just samples should fail 158 | with pytest.raises(Exception): 159 | DGEList( 160 | samples=np.array(["1", "2", "3"]), 161 | to_remove_zeroes=False, 162 | groups_in_list=['A', 'A', 'B'], 163 | ) 164 | 165 | # Properly formed samples and 
counts should pass 166 | DGEList( 167 | samples=np.array(["1", "2", "3"]), 168 | counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), 169 | groups_in_list=['A', 'A', 'B'], 170 | ) 171 | 172 | # Lists with ill-matched samples and counts should fail 173 | pytest.raises( 174 | ValueError, 175 | "DGEList(samples = np.array(['2', '3'])," 176 | " counts = np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]))", 177 | ) 178 | 179 | 180 | def test_repr(): 181 | assert dge_list().__repr__() == "DGEList(num_samples=10, num_genes=21,711)" 182 | 183 | 184 | def test_broken_dge_caGENE_SYMBOL_SQLll(): 185 | with pytest.raises(Exception): 186 | DGEList(filename="fake_filename", counts=np.array([[1, 1, 1], [1, 1, 1]])) 187 | with pytest.raises(Exception): 188 | DGEList(counts=None) 189 | 190 | 191 | def test_cpm(): 192 | dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) 193 | first_pos = dge_list.counts[0][0] 194 | col_sum = np.sum(dge_list.counts, axis=0) 195 | assert isinstance(first_pos, np.integer) 196 | new_dge_list = dge_list.cpm() 197 | assert new_dge_list.counts[0][0] == first_pos * 1e6 / col_sum[0] 198 | 199 | 200 | def test_rpkm(): 201 | dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) 202 | icd = CanonicalDataStore( 203 | get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) 204 | ) 205 | first_pos = dge_list.counts[0][0] 206 | first_gene = dge_list.genes[0] 207 | 208 | col_sum = np.sum(dge_list.counts, axis=0) 209 | assert isinstance(first_pos, np.integer) 210 | rpm_dge = dge_list.rpkm(icd) 211 | ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) 212 | gene_len = icd.get_length_of_canonical_transcript(ensg_gene) 213 | # RPKM=numReads / (geneLength / 1000 * totalNumReads / 1, 000, 000) 214 | assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6))) 215 | 216 | 217 | def test_tpm(): 218 | # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w 219 | counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]]) 220 | gene_lengths = np.array([2000, 4000, 1000, 10000]) 221 | 222 | expected = np.array( 223 | [ 224 | [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26], 225 | [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26], 226 | [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26], 227 | [0.0, 0.0, 2217.294_900_22], 228 | ] 229 | ) 230 | 231 | dge_list = DGEList( 232 | counts=counts, 233 | samples=np.array(['a', 'b', 'c']), 234 | genes=np.array(['a', 'b', 'c', 'd']), 235 | groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']}, 236 | ) 237 | assert isinstance(dge_list.counts[0][0], np.integer) 238 | new_dge_list = dge_list.tpm(gene_lengths) 239 | 240 | assert np.allclose(new_dge_list.counts, expected, atol=1e-1) 241 | 242 | # make sure that the sums of all genes across are the same the each sample (an important property of TPM) 243 | gene_sums = new_dge_list.counts.sum(axis=0) 244 | assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums)) 245 | 246 | 247 | # Unit tests for ``edgePy.data_import.Importer``.\ 248 | def test_init(): 249 | dge_list = DGEList.create_DGEList_data_file( 250 | data_file=get_dataset_path(TEST_DATASET), group_file=get_dataset_path(TEST_GROUPS) 251 | ) 252 | 253 | assert dge_list.__repr__() == "DGEList(num_samples=10, num_genes=21,711)" 254 | 255 | dge_list = DGEList.create_DGEList_handle( 256 | data_handle=smart_open(get_dataset_path(TEST_DATASET)), 257 | 
group_handle=smart_open(get_dataset_path(TEST_GROUPS)), 258 | ) 259 | 260 | assert dge_list.__repr__() == "DGEList(num_samples=10, num_genes=21,711)" 261 | 262 | 263 | # TestGroupImporter. 264 | def test_create_DGEList_handle_init(): 265 | dge_list = DGEList.create_DGEList_handle( 266 | data_handle=smart_open(get_dataset_path(TEST_DATASET)), 267 | group_handle=smart_open(get_dataset_path(TEST_GROUPS)), 268 | ) 269 | assert 2 == len(dge_list.groups_dict) 270 | assert 5 == len(dge_list.groups_dict["Group 1"]) 271 | assert 5 == len(dge_list.groups_dict["Group 2"]) 272 | 273 | assert dge_list.samples.shape == dge_list.groups_list.shape 274 | 275 | 276 | # Unit tests for packaged (optionally zipped during install) data. 277 | def test_get_data_stream(): 278 | """Tests finding packaged data with ``pkgutil.get_data()``""" 279 | pkgutil.get_data("edgePy", "data/GSE49712_HTSeq.txt.gz") 280 | 281 | 282 | def test_create_DGEList(): 283 | """Tests the function that converts data into a DGE_List object""" 284 | samples = ["AAA", "BBB", "CCC"] 285 | genes = ["ENSG001", "ENSG002"] 286 | 287 | data_set = { 288 | "AAA": {"ENSG001": 10, "ENSG002": 20}, 289 | "BBB": {"ENSG001": 15, "ENSG002": 40}, 290 | "CCC": {"ENSG001": 20, "ENSG002": 80}, 291 | } 292 | categories = {"One": ["AAA", "BBB"], "Two": ["CCC"]} 293 | 294 | dge_list = DGEList.create_DGEList( 295 | sample_list=samples, data_set=data_set, gene_list=genes, category_to_samples=categories 296 | ) 297 | 298 | assert np.array_equal(dge_list.samples, np.array(samples)) 299 | # 2 rows (genes), 3 columns(samples) 300 | assert np.array_equal(dge_list.counts, np.array([[10, 15, 20], [20, 40, 80]])) 301 | 302 | assert np.array_equal(dge_list.groups_list, np.array(["One", "One", "Two"])) 303 | assert dge_list.groups_dict, {"One:"} 304 | assert np.array_equal(dge_list.genes, np.array(genes)) 305 | -------------------------------------------------------------------------------- /tests/test_edgePy.py: -------------------------------------------------------------------------------- 1 | # from edgePy.edgePy import parse_arguments 2 | 3 | 4 | def test_parse_argumants(): 5 | text_file = "file.txt" 6 | groups_file = "groups.txt" 7 | # args = parse_arguments(['--count_file', text_file, "--groups_file", groups_file]) 8 | # eq_(text_file, args.count_file) 9 | # eq_(groups_file, args.groups_file) 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.1.2 3 | skip_missing_interpreters = true 4 | envlist = 5 | py36 6 | py36-lint 7 | py36-type 8 | py36-docs 9 | 10 | [testenv] 11 | description = run the test suite with (basepython) 12 | deps = -rrequirements-test.txt 13 | commands = pytest {posargs} 14 | 15 | [testenv:py36-lint] 16 | description = check the code style 17 | basepython = python3.6 18 | commands = 19 | black -v --check {toxinidir} 20 | flake8 {toxinidir}/edgePy 21 | pylint {toxinidir}/edgePy --errors-only --output-format=colorized 22 | 23 | [testenv:py36-type] 24 | description = type check the library 25 | basepython = python3.6 26 | commands = mypy {toxinidir}/edgePy 27 | 28 | [testenv:py36-docs] 29 | description = test building of HTML docs 30 | basepython = python3.6 31 | deps: -rdocs/requirements-docs.txt 32 | commands = sphinx-build docs/source {toxworkdir}/docs/_build -a --color -W -bhtml {posargs} 33 | 34 | [testenv:dev] 35 | description = the official edgePy development environment 36 | envdir = venv 37 | 
basepython = python3.6 38 | usedevelop = True 39 | commands = 40 | python -m pip list --format=columns 41 | python -c 'print("\n\nTo activate type: `source venv/bin/activate`\n\n")' 42 | --------------------------------------------------------------------------------
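# A short usage sketch for the [testenv:dev] environment above, assuming only the settings
# shown (`envdir = venv`, `usedevelop = True`, and tox >= 3.1.2 per the [tox] minversion):
#
#   tox -e dev                  # build the development environment into ./venv as an editable install
#   source venv/bin/activate    # activate it, as the final command above prints
#
# Once activated, `python -m pip list --format=columns` should show edgePy installed in
# development (editable) mode.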