├── .github └── workflows │ └── test.yaml ├── .gitignore ├── .gitpod.yml ├── .readthedocs.yml ├── .travis.yml ├── Dockerfile ├── LICENCE.txt ├── Makefile ├── Manifest.in ├── README.md ├── docs ├── Makefile └── source │ ├── api.rst │ ├── changelog.rst │ ├── concepts.rst │ ├── conf.py │ ├── distributed.rst │ ├── examples.rst │ ├── index.rst │ ├── install.rst │ ├── log_config.rst │ ├── manager_programs.rst │ ├── recipes.rst │ ├── report.rst │ ├── test.rst │ └── usage.rst ├── ngs_toolkit ├── .readthedocs.yml ├── __init__.py ├── analysis.py ├── atacseq.py ├── chipseq.py ├── cnv.py ├── config │ ├── default.yaml │ └── example.yaml ├── constants.py ├── decorators.py ├── demo │ ├── __init__.py │ └── data_generator.py ├── exceptions.py ├── general.py ├── graphics.py ├── parsers.py ├── project_manager.py ├── recipes │ ├── __init__.py │ ├── call_peaks.py │ ├── coverage.py │ ├── deseq2.py │ ├── enrichr.py │ ├── generate_project.py │ ├── lola.py │ ├── merge_signal.py │ ├── ngs_analysis.py │ ├── region_enrichment.py │ └── region_set_frip.py ├── rnaseq.py ├── templates │ ├── __init__.py │ └── report.html ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_analysis.py │ ├── test_atacseq_analysis.py │ ├── test_chipseq_analysis.py │ ├── test_cnv_analysis.py │ ├── test_config.py │ ├── test_decorators.py │ ├── test_differential_analysis.py │ ├── test_differential_enrichment.py │ ├── test_general.py │ ├── test_install.py │ ├── test_logger.py │ ├── test_plot_differential.py │ ├── test_plot_differential_enrichment.py │ ├── test_project_manager.py │ ├── test_recipes.py │ ├── test_regression_tests.py │ ├── test_report.py │ ├── test_rnaseq_analysis.py │ ├── test_sample_input_files.py │ └── test_unsupervised_analysis.py ├── track_manager.py └── utils.py ├── requirements ├── requirements.docs.txt ├── requirements.single_cell.txt ├── requirements.test.txt └── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | - dev 6 | schedule: 7 | # Run every sunday at 2am 8 | - cron: 0 2 * * 0 9 | 10 | jobs: 11 | test: 12 | name: ngs_toolkit 13 | runs-on: ubuntu-20.04 14 | strategy: 15 | matrix: 16 | python_version: ['3.7'] 17 | 18 | steps: 19 | - uses: actions/checkout@v1 20 | - name: Set up Python 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: ${{ matrix.python }} 24 | architecture: x64 25 | - name: Update pip 26 | run: | 27 | sudo apt-get install python3-setuptools 28 | python3 -m pip install --upgrade pip 29 | - name: Install bedtools 2.27.1 30 | run: | 31 | wget http://ftp.br.debian.org/debian/pool/main/b/bedtools/bedtools_2.27.1+dfsg-4_amd64.deb 32 | sudo dpkg -i bedtools_2.27.1+dfsg-4_amd64.deb 33 | - name: Install R 3.6 34 | run: | 35 | sudo apt-get remove -y r-base 36 | sudo apt-get autoremove 37 | sudo apt-get update 38 | sudo apt-get -y install r-base 39 | - name: Install bioconductor libraries 40 | run: | 41 | sudo apt-get update 42 | sudo apt-get -y install r-bioc-deseq2 r-bioc-preprocesscore 43 | - name: Install Combat 44 | run: | 45 | python3 -m pip install git+https://github.com/afrendeiro/combat.git 46 | - name: Install ngs-toolkit 47 | run: python3 -m pip install .[testing] 48 | - name: Lint with flake8 49 | run: | 50 | python3 -m pip install flake8 51 | # stop the build if there are Python syntax errors or undefined names 52 | python3 -m flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 53 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 54 | python3 -m flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 55 | - name: Test with pytest 56 | run: | 57 | python3 -m pytest -n 2 --disable-warnings --show-capture=no --cov ./ --cov-report term --cov-report xml --pyargs ngs_toolkit 58 | - name: Report coverage 59 | env: 60 | CODACY_PROJECT_TOKEN: ${{ secrets.CODACY_PROJECT_TOKEN }} 61 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 62 | run: | 63 | python3 -m coverage xml 64 | python3 -m codecov -f coverage.xml 65 | python3 -m codacy -r coverage.xml 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore test files 2 | .tox 3 | .mypy_cache 4 | _version.py 5 | pytest.log 6 | .coverage* 7 | docs/html 8 | 9 | # Build-related stuff 10 | build/ 11 | dist/ 12 | *.egg-info 13 | 14 | 15 | # toy/experimental files 16 | *.txt 17 | *.csv 18 | *.tsv 19 | *.pkl 20 | *.pickle 21 | *.svg 22 | *.png 23 | *.jpg 24 | *.jpeg 25 | 26 | # ignore eggs 27 | .eggs/ 28 | 29 | # ignore built docs 30 | doc/build/* 31 | 32 | # generic ignore list: 33 | *.lst 34 | 35 | # Compiled source 36 | *.com 37 | *.class 38 | *.dll 39 | *.exe 40 | *.o 41 | *.so 42 | *.pyc 43 | 44 | # Packages 45 | # it's better to unpack these files and commit the raw source 46 | # git has its own built in compression methods 47 | *.7z 48 | *.dmg 49 | *.gz 50 | *.iso 51 | *.jar 52 | *.rar 53 | *.tar 54 | *.zip 55 | 56 | # Logs and databases 57 | *.log 58 | *.sql 59 | *.sqlite 60 | 61 | # OS generated files 62 | .DS_Store 63 | .DS_Store? 64 | ._* 65 | .Spotlight-V100 66 | .Trashes 67 | ehthumbs.db 68 | Thumbs.db 69 | 70 | # Sublime files 71 | *.sublime-* 72 | 73 | # Gedit temporary files 74 | *~ 75 | 76 | # libreoffice lock files: 77 | .~lock* 78 | 79 | # IDE-specific items 80 | .idea/ 81 | 82 | # pytest-related 83 | .cache/ 84 | .coverage* 85 | coverage.xml 86 | 87 | # Reserved files for comparison 88 | *RESERVE* 89 | -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | image: 2 | file: Dockerfile 3 | 4 | tasks: 5 | - init: ln -s ngs_toolkit/config/example.yaml ~/.ngs_toolkit.config.yaml 6 | - command: ipython3 7 | - command: sh 8 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | system_packages: False 23 | install: 24 | - method: pip 25 | path: . 26 | extra_requirements: 27 | - docs 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | os: linux 3 | dist: focal 4 | 5 | python: 6 | - "3.7" 7 | - "3.8" 8 | 9 | cache: 10 | directories: 11 | - $HOME/.cache/pip 12 | 13 | services: 14 | - xvfb 15 | 16 | before_install: 17 | # Install bedtools 18 | - sudo apt-get -y install bedtools samtools gawk 19 | # Install R >= 3.6 20 | - sudo apt-get -y install r-base 21 | # Add bioconductor libraries 22 | - sudo apt-get -y install r-bioc-deseq2 r-bioc-preprocesscore 23 | # Install Combat 24 | - pip install git+https://github.com/afrendeiro/combat.git 25 | 26 | install: 27 | - pip install .[testing] 28 | 29 | before_script: 30 | - export DISPLAY=:99.0 31 | - pip freeze 32 | - R -q -e "suppressMessages(library('DESeq2')); suppressMessages(library('preprocessCore')); sessionInfo()" 33 | 34 | script: 35 | # - pytest -n 2 --disable-warnings --show-capture=no --cov ./ --cov-report term --cov-report xml --pyargs ngs_toolkit 36 | - pytest -n 2 --disable-warnings --cov ./ --cov-report term --cov-report xml --pyargs ngs_toolkit 37 | 38 | after_success: 39 | - coverage xml 40 | - codecov -f coverage.xml 41 | - python-codacy-coverage -r coverage.xml 42 | 43 | deploy: 44 | - provider: pypi 45 | user: afrendeiro 46 | password: 47 | secure: D1KWSJZHlw/v2JLR0ClOiaw4xAuj550NmSVF1jHpIb8An9WrcfJ2YSg4VWoNL8HCMHPTC+EPeaphQa68+RhZTy5fmvifs8qaKnvzADA3Tz514uAA+vjSa3ohI0fZnX3CsI2q8f0Zr45T1O9dgDLecrYnyNqq5iOwcxTNzpxTwTxgSFoj81k3qhJBer4DLG0yVmfG9SsV+V7ApTv3iUp+PZhiGW+duXsTRzFwZAObDfJuMwuT7O9gwSZ7ACm4pXVRk22CzJtjLX/MKT74QY9+eJehtaWfkGRsl9cVqhQZSb2PELLpbXa8sOAdtEcsvg0IlMuFDjPoV5vxgA5PiZL836Ec1Koi+GD5KJY1RFUoXB1Fq3wP4s9mTlSLggVr+C0YZK6XU1hiJp5+YUZycwxtQBIZmLzT+eUDuQnYCdrowqcnqyoWV3Mjd2Aan0Kn5ZlSb73UD+KX+5C9c8CPhrNBo9odDtK8f6Wuz8s6Szz9kbPKQNIaCWD5MTZGN8te5rq/qWjHEayUJprirgRY7eoJgFjOKO15U2+5QyBF9Z4r98WC03FYUr3EExcTQWhGbsqzVY1GsFgSnlNNgbgl3DwIyPprXMn9G8LJjl4thNZqDAs8q2YH6GJkc6DUkXAM4Ma9bS+PsUNJs0vclBXH3utasARF66t5X36KeYo5W8shsj0= 48 | distributions: "sdist bdist_wheel" 49 | on: 50 | branch: master 51 | tags: true 52 | python: 3.7 53 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gitpod/workspace-full-vnc 2 | 3 | USER root 4 | 5 | # Install bedtools 6 | RUN apt-get update \ 7 | && sudo apt-get install -y --no-install-recommends \ 8 | bedtools \ 9 | && rm -rf /tmp/downloaded_packages/ /tmp/*.rds \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Install R and bioconductor libraries 13 | RUN apt-get update \ 14 | && apt-get install -y --no-install-recommends \ 15 | r-base \ 16 | r-bioc-deseq2 \ 17 | r-bioc-preprocesscore \ 18 | && rm -rf /tmp/downloaded_packages/ /tmp/*.rds \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | USER gitpod 22 | 23 | ENV PYTHONPATH=/home/gitpod/.local/lib/python3.7/site-packages/ 24 | 25 | # Install IPython 26 | RUN pip3 install --user ipython 27 | 28 | # Install Python dependencies of ngs-toolkit 29 | RUN pip3 install --user -r \ 30 | https://raw.githubusercontent.com/afrendeiro/toolkit/master/requirements/requirements.txt \ 31 | && pip3 install --user -r \ 32 | https://raw.githubusercontent.com/afrendeiro/toolkit/master/requirements/requirements.test.txt \ 33 | && pip3 install --user git+https://github.com/afrendeiro/combat.git 34 | 35 | # Install library 36 | RUN pip3 install --user \ 37 | git+https://github.com/afrendeiro/toolkit.git#egg=ngs-toolkit[testing] 38 | 39 | ENV PATH="/home/gitpod/.local/bin:${PATH}" 40 | 41 | USER root 42 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := pypitest 2 | 3 | install: 4 | python -m \ 5 | pip \ 6 | install \ 7 | git+https://github.com/afrendeiro/toolkit.git#egg=ngs-toolkit \ 8 | --user 9 | 10 | test: 11 | python -m \ 12 | pytest -n 3 \ 13 | --disable-warnings \ 14 | --show-capture=no \ 15 | --cov=ngs_toolkit \ 16 | --lf \ 17 | --cov-report xml \ 18 | ngs_toolkit/tests/test_*.py 19 | 20 | 21 | test_cov: 22 | python -m \ 23 | pytest \ 24 | --testmon \ 25 | --disable-warnings \ 26 | --show-capture=no \ 27 | ngs_toolkit/tests/test_*.py 28 | 29 | 30 | coverage: test 31 | python -m codecov \ 32 | -f coverage.xml 33 | python -m codacy \ 34 | -r coverage.xml 35 | 36 | docs: 37 | cd docs && $(MAKE) html 38 | xdg-open docs/build/html/index.html 39 | 40 | build: test 41 | python setup.py sdist bdist_wheel 42 | 43 | pypitest: build 44 | twine \ 45 | upload \ 46 | -r pypitest dist/* 47 | 48 | pypi: build 49 | twine \ 50 | upload \ 51 | dist/* 52 | 53 | gh: 54 | docker \ 55 | build \ 56 | -t ngs-toolkit \ 57 | . 58 | docker \ 59 | tag \ 60 | ngs-toolkit \ 61 | docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:latest 62 | docker \ 63 | push \ 64 | docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:latest 65 | 66 | gh-release: install 67 | $(eval VERSION := \ 68 | $(shell \ 69 | python3 \ 70 | -c 'from ngs_toolkit import __version__ as v; print(v)')) 71 | docker \ 72 | build \ 73 | -t ngs-toolkit:$(VERSION) \ 74 | . 75 | docker \ 76 | tag \ 77 | ngs-toolkit \ 78 | docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:$(VERSION) 79 | docker \ 80 | push \ 81 | docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:$(VERSION) 82 | 83 | clean_pyc: 84 | find . -name \*.pyc -delete 85 | 86 | clean_mypy: 87 | rm -rf .mypy_cache/ 88 | 89 | clean_test: 90 | rm -rf .pytest_cache/ 91 | rm -rf /tmp/pytest* 92 | find . -name "__pycache__" -exec rm -rf {} \; 93 | rm -rf .coverage* 94 | rm -rf .tox/ 95 | 96 | clean_cov: clean_test 97 | rm -fr coverage.xml htmlcov 98 | 99 | clean_docs: 100 | rm -fr docs/build/ 101 | 102 | clean_dist: 103 | rm -fr dist/ 104 | 105 | clean_build: 106 | rm -fr build/ 107 | rm -rf ngs_toolkit/_version.py 108 | 109 | clean_eggs: 110 | rm -fr ngs_toolkit.egg-info 111 | rm -fr .eggs 112 | 113 | clean: \ 114 | clean_pyc \ 115 | clean_mypy \ 116 | clean_test \ 117 | clean_cov \ 118 | clean_docs \ 119 | clean_dist \ 120 | clean_build \ 121 | clean_eggs 122 | 123 | all: \ 124 | test \ 125 | coverage \ 126 | docs \ 127 | build \ 128 | pypitest \ 129 | pypi \ 130 | clean 131 | 132 | .PHONY: \ 133 | test \ 134 | coverage \ 135 | docs \ 136 | build \ 137 | pypitest \ 138 | pypi \ 139 | clean_pyc \ 140 | clean_mypy \ 141 | clean_test \ 142 | clean_cov \ 143 | clean_docs \ 144 | clean_dist \ 145 | clean_build \ 146 | clean_eggs \ 147 | clean 148 | -------------------------------------------------------------------------------- /Manifest.in: -------------------------------------------------------------------------------- 1 | include requirements/requirements*.txt 2 | include ngs_toolkit/templates/* 3 | include README.md 4 | include config/*.yaml 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Documentation status](https://readthedocs.org/projects/ngs-toolkit/badge/?version=latest)](http://ngs-toolkit.readthedocs.io/en/latest/?badge=latest) 3 | [![PyPI version](https://badge.fury.io/py/ngs-toolkit.svg)](https://badge.fury.io/py/ngs-toolkit) 4 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/30fcafc027e64b21bf9ddfe8d7f0ff3a)](https://app.codacy.com/app/afrendeiro/toolkit?utm_source=github.com&utm_medium=referral&utm_content=afrendeiro/toolkit&utm_campaign=Badge_Grade_Dashboard) 5 | [![Build Status](https://travis-ci.org/afrendeiro/toolkit.svg?branch=master)](https://travis-ci.org/afrendeiro/toolkit) 6 | [![Gitter](https://badges.gitter.im/ngs-toolkit/Lobby.svg)](https://gitter.im/ngs-toolkit/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) 7 | [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) 8 | 9 | [![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/afrendeiro/toolkit) 10 | 11 | # ngs-toolkit 12 | 13 | This is my NGS analysis toolkit: ``ngs_toolkit``. 14 | 15 | Head to the [documentation](http://ngs-toolkit.readthedocs.io/) to see how to install and use the toolkit, and have a look at the catalogue of available functions. 16 | 17 | Install with: 18 | 19 | ```bash 20 | pip install ngs-toolkit 21 | ``` 22 | 23 | You might need to add a ``--user`` flag to the above command. 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pipelines.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pipelines.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pipelines" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pipelines" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | The great flexibility of ``ngs_toolkit`` comes from the ability to compose workflows using the API. 5 | 6 | It provides a rich but abstract :class:`~ngs_toolkit.analysis.Analysis` object and implements various modules building on it depending on the data type. 7 | 8 | In addition, the :mod:`~ngs_toolkit.general` module contains several analysis-independent methods and the :mod:`~ngs_toolkit.utils` module provides low-level functions of general use. 9 | 10 | ngs_toolkit.analysis 11 | ----------------------- 12 | .. automodule:: ngs_toolkit.analysis 13 | :members: 14 | 15 | ngs_toolkit.atacseq 16 | ----------------------- 17 | .. automodule:: ngs_toolkit.atacseq 18 | :members: 19 | 20 | ngs_toolkit.chipseq 21 | ----------------------- 22 | .. automodule:: ngs_toolkit.chipseq 23 | :members: 24 | 25 | ngs_toolkit.cnv 26 | ----------------------- 27 | .. automodule:: ngs_toolkit.cnv 28 | :members: 29 | 30 | ngs_toolkit.rnaseq 31 | ----------------------- 32 | .. automodule:: ngs_toolkit.rnaseq 33 | :members: 34 | 35 | ngs_toolkit.demo 36 | ----------------------- 37 | .. automodule:: ngs_toolkit.demo.data_generator 38 | :members: 39 | 40 | ngs_toolkit.general 41 | ----------------------- 42 | .. automodule:: ngs_toolkit.general 43 | :members: 44 | 45 | ngs_toolkit.graphics 46 | ----------------------- 47 | .. automodule:: ngs_toolkit.graphics 48 | :members: 49 | 50 | ngs_toolkit.utils 51 | ----------------------- 52 | .. automodule:: ngs_toolkit.utils 53 | :members: 54 | 55 | ngs_toolkit.parsers 56 | ----------------------- 57 | .. automodule:: ngs_toolkit.parsers 58 | :members: 59 | 60 | ngs_toolkit 61 | ----------------------- 62 | .. automodule:: ngs_toolkit 63 | :members: 64 | -------------------------------------------------------------------------------- /docs/source/concepts.rst: -------------------------------------------------------------------------------- 1 | Concepts 2 | ****************************** 3 | 4 | A few notes on the way some of the library and its objects were designed to be used. 5 | 6 | .. _AnalysisObjects: 7 | 8 | Analysis objects 9 | ============================== 10 | 11 | The ``Analysis`` object and its data-type specific dependents are central to the usage of ``ngs-toolkit``. These objects hold attributes and functions relevant to the analysis, such as ``Sample`` objects (and their attributes), Dataframes with numerical values, and others. 12 | 13 | .. _LeveragingOnThePEPFormat: 14 | 15 | Leveraging on the PEP format 16 | ---------------------------- 17 | 18 | One easy and recommended way to instantiate ``Analysis`` objects is with a ``PEP Project`` file. 19 | This has several advantages: 20 | 21 | - Usage of the language-agnostic PEP format to store a project description and interoperability with other tools (see https://github.com/pepkit for other tools); 22 | - Initialization of project-specific variables into the ``Analysis`` object that are derived from the PEP. Examples: analysis samples, genome(s), sample and sample group attributes, sample comparison table. 23 | 24 | The example below shows how this works: 25 | 26 | .. code-block:: python 27 | 28 | >>> from ngs_toolkit import Analysis 29 | >>> an = Analysis(from_pep="my_project/metadata/project_config.yaml") 30 | [INFO] > Setting project's 'sample_attributes' as the analysis 'sample_attributes'. 31 | [INFO] > Setting project's 'group_attributes' as the analysis 'group_attributes'. 32 | [INFO] > Setting project's 'comparison_table' as the analysis 'comparison_table'. 33 | [INFO] > Setting analysis organism as 'mouse'. 34 | [INFO] > Setting analysis genome as 'mm10'. 35 | >>> print(an) 36 | Analysis 'my_project' with 12 samples of organism 'mouse' (mm10). 37 | 38 | .. note:: The verbosity of ``ngs-toolkit`` can be controlled 39 | 40 | See the section on `logging `__ to control the verbosity of ``ngs-toolkit``. 41 | 42 | .. _ReasonableDefaults: 43 | 44 | Reasonable defaults with full customization 45 | ------------------------------------------- 46 | 47 | Functions in the ``Analysis`` object are aware of these attributes and will 48 | use them by default, making calling the functions very simple (other overiding 49 | arguments can be passed though). 50 | 51 | In the example below, we will generate a consensus peak set for ATAC-seq 52 | analyss using the ``get_consensus_sites`` function. This will demonstrate 53 | several things that "come for free": 54 | 55 | .. code-block:: python 56 | 57 | >>> from ngs_toolkit import ATACSeqAnalysis 58 | >>> an = ATACSeqAnalysis(from_pep="my_project/metadata/project_config.yaml") 59 | [INFO] > Setting project's 'sample_attributes' as the analysis 'sample_attributes'. 60 | [INFO] > Setting project's 'group_attributes' as the analysis 'group_attributes'. 61 | [INFO] > Setting project's 'comparison_table' as the analysis 'comparison_table'. 62 | [INFO] > Subsetting samples for samples of type 'ATAC-seq'. 63 | [INFO] > Subsetting comparison_table for comparisons of type 'ATAC-seq'. 64 | [INFO] > Setting analysis organism as 'mouse'. 65 | [INFO] > Setting analysis genome as 'mm10'. 66 | >>> an.get_consensus_sites() 67 | 68 | - even though the PEP project includes samples from several data types (ATAC-, ChIP- and RNA-seq), the current analysis will only consider ATAC-seq samples. 69 | - the necessary files with peak calls for each sample are not specified - ``ngs-toolkit`` knows where to find them; 70 | - a BED file with ENCODE blacklisted regions will not be given, but these regions will be filtered out - ``ngs-toolkit`` will download this and use it. No static files are distributed with the package. 71 | - related to the above, the correct blacklist file is downloaded because the genome assembly for the project is infered from the samples - even though it is not directly specified. 72 | 73 | .. _Workflow: 74 | 75 | Workflow 76 | ------------------------------ 77 | 78 | Most functions of the ``Analysis`` object will take some input (usually a 79 | dataframe), apply some transformation and assign the result to a 80 | variable of the same Analysis object. 81 | 82 | To see what variable has been assigned within a given function check the 83 | relevant function in the `API `__, specifically the 84 | `Variables` value. Some functions will assign attributes that are used almost 85 | ubiquitily. See the `common attributes section `__ for some examples. 86 | 87 | High-level functions will also often assign their outputs to the object 88 | itself. To see which attribute holds it, note the ``Attributes`` section of 89 | the respective function documentation. 90 | Assignment allows the exchange of information between analysis steps without 91 | the user always providing all required inputs, which would make using such a 92 | toolkit quite verbose. 93 | 94 | The example below illustrates this: 95 | 96 | .. code-block:: python 97 | 98 | >>> from ngs_toolkit import ATACSeqAnalysis 99 | >>> an = ATACSeqAnalysis(from_pep="my_project/metadata/project_config.yaml") 100 | >>> print(an) 101 | 'ATAC-seq' analysis 'test-project_ATAC-seq_mm10_1_100_1' with 2 samples of organism 'mouse' (mm10). 102 | >>> an.get_consensus_sites() 103 | >>> an.measure_coverage() 104 | >>> print(an.matrix_raw.head()) 105 | S1_a1 S2_a2 106 | region 107 | chr1:42447241-42447627 955 2211 108 | chr1:44445678-44446750 1939 2122 109 | chr1:44743959-44744926 1264 1443 110 | chr1:90513210-90513978 1262 1354 111 | chr1:93565764-93567191 911 892 112 | >>> an.normalize() 113 | >>> print(an.matrix_norm.head()) 114 | region S1_a1 S2_a2 115 | chr1:42447241-42447627 12.681954 13.822151 116 | chr1:44445678-44446750 13.703582 13.762881 117 | chr1:44743959-44744926 13.086324 13.206576 118 | chr1:90513210-90513978 13.084040 13.114743 119 | chr1:93565764-93567191 12.613915 12.512715 120 | 121 | All three ``get_consensus_sites``, ``measure_coverage`` and ``normalize`` build 122 | on the output of each other, but the user doesn't have to specify the input to 123 | any. Changing either the name of the attribute that stores either output or the 124 | location of files outputed is nonetheless easy. 125 | 126 | Many functions also have a ``save`` argument which will save the result as a 127 | ``CSV`` file. 128 | 129 | .. _CommonAttributes: 130 | 131 | Common attributes 132 | ----------------- 133 | 134 | To allow a uniform usage across different data types and analysis types, 135 | a few but important attributes of the ``Analysis`` object and its derivatives 136 | have naming conventions: 137 | 138 | - ``data_type``: The type of data of the analysis. Matches the object type. 139 | - ``matrix_raw``: A dataframe of raw, unnormalized values of shape (features, samples) 140 | - ``matrix_norm``: A dataframe of normalized values of shape (features, samples) 141 | - ``quantity``: The name of the units of the values measured. E.g. "expression" for RNA-seq or "accessibility" for ATAC-seq 142 | - ``var_unit_name``: The name of the variables measured. E.g. "gene" for RNA-seq or "region" for ATAC-seq or ChIP-seq 143 | - ``norm_method``: The method used to normalize the ``matrix_norm`` dataframe 144 | - ``thresholds``: A dictionary with keys "log_fold_change" and "p_value" storing thresholds used in the analysis 145 | 146 | .. _ComparisonTable: 147 | 148 | Comparison table 149 | =============================== 150 | 151 | ``ngs-toolkit`` has functions to perform supervised differntial comparisons 152 | between groups of samples. The sample groupings are specified in a CSV file called ``comparison_table``. 153 | 154 | An example of a typical "case vs control" comparison table is given below: 155 | 156 | .. csv-table:: Typical example of comparison_table 157 | :header: "comparison_name", "comparison_side", "sample_name", "sample_group" 158 | :widths: 30, 30, 30, 30 159 | 160 | "KOA_vs_WT", "1", "ATAC-seq_KOA_r1", "KO_A" 161 | "KOA_vs_WT", "1", "ATAC-seq_KOA_r2", "KO_A" 162 | "KOA_vs_WT", "0", "ATAC-seq_WT_r1", "WT" 163 | "KOA_vs_WT", "0", "ATAC-seq_WT_r2", "WT" 164 | "KOB_vs_WT", "1", "ATAC-seq_KOB_r1", "KO_B" 165 | "KOB_vs_WT", "1", "ATAC-seq_KOB_r2", "KO_B" 166 | "KOB_vs_WT", "0", "ATAC-seq_WT_r1", "WT" 167 | "KOB_vs_WT", "0", "ATAC-seq_WT_r2", "WT" 168 | 169 | 170 | Each row is reserved for a given sample. Samples of the same group (typically 171 | replicates) should have the same value of "sample_group" and same 172 | "comparison_side". The group of interest (comparison foreground) should have a 173 | value of 1 as "comparison_side" and the background a value of 0. Finally, the 174 | comparison will be labeled with the value of "comparison_name", which should 175 | be constant for all samples in both foreground and background groups. 176 | 177 | 178 | For an all-vs-all group comparison, I recommend labeling all background sample groups as a new group in the following manner: 179 | 180 | .. csv-table:: "All-vs-all" example of comparison table 181 | :header: "comparison_name", "comparison_side", "sample_name", "sample_group" 182 | :widths: 30, 30, 30, 30 183 | 184 | "celltypeA", "1", "ATAC-seq_celltypeA_r1", "ct_A" 185 | "celltypeA", "1", "ATAC-seq_celltypeA_r2", "ct_A" 186 | "celltypeA", "0", "ATAC-seq_celltypeB_r1", "ct_A_background" 187 | "celltypeA", "0", "ATAC-seq_celltypeB_r2", "ct_A_background" 188 | "celltypeA", "0", "ATAC-seq_celltypeC_r1", "ct_A_background" 189 | "celltypeA", "0", "ATAC-seq_celltypeC_r2", "ct_A_background" 190 | "celltypeB", "1", "ATAC-seq_celltypeB_r1", "ct_B" 191 | "celltypeB", "1", "ATAC-seq_celltypeB_r2", "ct_B" 192 | "celltypeB", "0", "ATAC-seq_celltypeA_r1", "ct_B_background" 193 | "celltypeB", "0", "ATAC-seq_celltypeA_r2", "ct_B_background" 194 | "celltypeB", "0", "ATAC-seq_celltypeC_r1", "ct_B_background" 195 | "celltypeB", "0", "ATAC-seq_celltypeC_r2", "ct_B_background" 196 | 197 | 198 | Additional useful columns are `data_type` (to subset comparisons based on type 199 | of NGS data), `comparison_type` to specify the type of comparison to perform 200 | (e.g. one of 'differential' or 'peaks') and `toggle` for subsetting 201 | comparisons to perform. 202 | 203 | 204 | .. note:: **Hyphens and other symbols in comparison_table** 205 | 206 | Since differential comparisons are perfomed using DESeq2, R is used 207 | (throught the Python-R interface library rpy2). 208 | ngs_toolkit will create the required tables by DESeq2 which includes names 209 | of samples and comparisons as dataframe columns. Unfortunately due to the 210 | way R handles column names, these get changed. 211 | 212 | In the future this will be accounted for but for now avoid using hyphens 213 | and any other symbols as values for sample names or groups. 214 | 215 | 216 | .. _LowLevelFunctions: 217 | 218 | Low-level functions - ``utils`` 219 | =============================== 220 | 221 | Functions from Analysis objects are generally pretty high level functions, 222 | often performing several tasks by calling other more general-purpose 223 | functions. However, one of the concepts I really wanted to have is that the 224 | user retains as much control as they wish. 225 | 226 | They may choose to use the high level functions which generally provide 227 | sensible defaults, or retain more control and build their analysis pipeline 228 | from the lower level helper functions. 229 | 230 | One example: calling ``ATACSeqAnalysis.normalize()`` will by default run 3-4 231 | other functions to return a quantile normalized, GC-corrected, log-transformed 232 | output - a fairly complex normalization procedure but made simple by providing 233 | sensible defaults. 234 | 235 | A user may easily change the procedure by choosing one of the ~4 types of 236 | normalization using keyword arguments or implement an alternative method which 237 | can be plugged in to the next step of the analysis. 238 | 239 | In the future the low level functions will be moved to `ngs_toolkit.utils` and 240 | the data type-specific modules will have only classes and functions specific 241 | to those data which are usually more high-level. 242 | -------------------------------------------------------------------------------- /docs/source/distributed.rst: -------------------------------------------------------------------------------- 1 | Distributed computing 2 | ============================= 3 | 4 | divvy 5 | ----------------------------- 6 | 7 | Certain functions in the ``ngs_toolkit`` toolkit can make use of distributed 8 | computing. To achieve this for a variety of computing configurations 9 | it uses the `divvy library `_. 10 | 11 | Divvy provides an abstract way of submitting a job to various job managers by 12 | shipping job templates for each configuration. 13 | 14 | When ``divvy`` starts, a configuration is chosen (the ``compute_configuration`` 15 | attribute) and that template gets filled with the attributes of the job - 16 | the code to be executed, the resouce requirements and others 17 | (e.g. "cores", "mem", "time" attributes). 18 | 19 | To see all supported compute configurations run: 20 | 21 | .. code-block:: bash 22 | 23 | divvy list 24 | 25 | For more information on how to configure ``divvy``, see its documentation: 26 | http://divvy.databio.org/ 27 | 28 | To let ``ngs_toolkit`` know which ``divvy`` configuration to use by default, 29 | modify the following section in the ``ngs_toolkit`` configuration file: 30 | 31 | .. code-block:: yaml 32 | 33 | preferences: 34 | # The next item is the default computing configuration to use from divvy. 35 | # Run "divvy list" to see all options. 36 | # See more here: http://code.databio.org/divvy/ 37 | computing_configuration: 'slurm' 38 | 39 | This will make ``ngs_toolkit`` send jobs to a slurm cluster if wanted. 40 | 41 | All functions that allow running a task in a distributed manner have a 42 | ``distributed`` keyword argument. 43 | 44 | In addition, they also accept additional keyword arguments (`kwargs` in the 45 | function signature) where additional options can be passed. 46 | These options must match fields available to format of the currently selected 47 | ``compute_configuration``. 48 | 49 | Sending jobs and collecting output 50 | ---------------------------------- 51 | 52 | Performing a taks in a distributed manner can therefore be as simple as calling 53 | the desired function with ``distributed=True``. Jobs will be sent to the 54 | job manager of the chosen computing configuration. 55 | 56 | However, since the jobs are often run individually for a sample/group of samples, 57 | functions called with ``distributed=True`` may not return the same output as 58 | ``distributed=False``. 59 | 60 | For that reason, for all such functions, there is a reciprocal function of 61 | identical name as the first prefixed with ``collect``. 62 | 63 | .. code-block:: python 64 | 65 | from ngs_toolkit.demo import generate_project 66 | an = generate_project(sample_input_files=True) 67 | an.measure_coverage(distributed=True) 68 | coverage = collect_coverage() 69 | 70 | Implementing automatic collection of job outputs in part of future plans. 71 | 72 | Example 73 | ----------------------------- 74 | 75 | The :func:`ngs_toolkit.atacseq.ATACSeqAnalysis.measure_coverage` function has 76 | ``distributed`` and ``kwargs`` options. 77 | 78 | This provides code portability and allows customization of various aspects of 79 | the jobs: 80 | 81 | .. code-block:: python 82 | 83 | from ngs_toolkit.demo import generate_project 84 | an = generate_project(sample_input_files=True) 85 | # in serial 86 | cov1 = an.measure_coverage() 87 | # as slurm jobs (because the config computing_configuration is set to 'slurm') 88 | an.measure_coverage(distributed=True) 89 | cov2 = collect_coverage() 90 | # confirm the output is the same 91 | assert (cov2 == cov1).all().all() 92 | 93 | .. code-block:: python 94 | 95 | # as slurm jobs to a particular queue and more memory 96 | an.measure_coverage(distributed=True, partition="longq", mem=24000) 97 | # here 'partition' and 'mem' are attributes of the slurm divvy template 98 | # and not magic attributes 99 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ****************************** 3 | 4 | 5 | Analysis example 6 | ============================== 7 | 8 | The following is an example of how to use ``ngs_toolkit`` in a ATAC-seq project. 9 | While straightforward, it still allows considerable customization due to the modularity of the toolkit and the parametrization of most functions (this example uses default values everywhere nonetheless). 10 | 11 | .. note:: 12 | ``ngs_toolkit`` from version 0.25.0 on uses the `PEP 2.0 specification `_. If you have a PEP made in an earlier version, you must update it in order to use ``ngs_toolkit``>=0.25.0. 13 | 14 | 15 | We have the following `PEP project `_ config YAML file: 16 | 17 | .. code-block:: yaml 18 | 19 | pep_version: "2.0.0" 20 | name: example_project 21 | description: example_project 22 | username: user 23 | email: user@email.com 24 | 25 | sample_table: annotation.csv 26 | subsample_table: 27 | comparison_table: comparison_table.csv 28 | 29 | submission_subdir: submission 30 | results_subdir: data 31 | output_dir: example_project 32 | 33 | pipeline_interfaces: /home/user/workspace/open_pipelines/pipeline_interface.yaml 34 | 35 | sample_attributes: 36 | - sample_name 37 | - genotype 38 | - replicate 39 | group_attributes: 40 | - genotype 41 | - replicate 42 | sample_modifiers: 43 | imply: 44 | - if: 45 | organism: 'human' 46 | then: 47 | genome: 'hg38' 48 | derive: 49 | attributes: [data_source] 50 | sources: 51 | local: data/{sample_name}.bam 52 | bsf: /scratch/lab_bsf/samples/{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam 53 | 54 | 55 | 56 | The following sample annotation CSV file, 'annotation.csv': 57 | 58 | .. csv-table:: Annotation table for example 59 | :header: "sample_name", "protocol", "genotype", "replicate", "organism", flowcell, lane 60 | 61 | "ATAC-seq_KOA_r1", "ATAC-seq", KO_A", "1", "human", "C0AXX", "1" 62 | "ATAC-seq_KOA_r2", "ATAC-seq", KO_A", "2", "human", "C0AXX", "1" 63 | "ATAC-seq_KOB_r1", "ATAC-seq", KO_B", "1", "human", "C0AXX", "1" 64 | "ATAC-seq_KOB_r2", "ATAC-seq", KO_B", "2", "human", "C0AXX", "1" 65 | "ATAC-seq_WT_r1", "ATAC-seq", WT", "1", "human", "C0AXX", "1" 66 | "ATAC-seq_WT_r2", "ATAC-seq", WT", "2", "human", "C0AXX", "1" 67 | 68 | 69 | And the following comparison table, 'comparison_table.csv': 70 | 71 | .. csv-table:: Comparison table for example 72 | :header: "comparison_name", "comparison_side", "sample_name", "sample_group" 73 | :widths: 30, 30, 30, 30 74 | 75 | "KOA_vs_WT", "1", "ATAC-seq_KOA_r1", "KO_A" 76 | "KOA_vs_WT", "1", "ATAC-seq_KOA_r2", "KO_A" 77 | "KOA_vs_WT", "0", "ATAC-seq_WT_r1", "WT" 78 | "KOA_vs_WT", "0", "ATAC-seq_WT_r2", "WT" 79 | "KOB_vs_WT", "1", "ATAC-seq_KOB_r1", "KO_B" 80 | "KOB_vs_WT", "1", "ATAC-seq_KOB_r2", "KO_B" 81 | "KOB_vs_WT", "0", "ATAC-seq_WT_r1", "WT" 82 | "KOB_vs_WT", "0", "ATAC-seq_WT_r2", "WT" 83 | 84 | 85 | 86 | ATAC-seq analysis example 87 | ------------------------------- 88 | 89 | .. code-block:: python 90 | 91 | import os 92 | from ngs_toolkit.atacseq import ATACSeqAnalysis 93 | 94 | # Start project and analysis objects 95 | analysis = ATACSeqAnalysis(from_pep="project_config.yaml") 96 | 97 | # Generate consensus peak set and annotate it 98 | ## get consensus peak set from all samples 99 | analysis.get_consensus_sites() 100 | ## annotate peak set with genomic context 101 | analysis.get_peak_genomic_location() 102 | ## annotate peak set with chromatin context 103 | analysis.get_peak_chromatin_state( 104 | os.path.join( 105 | analysis.data_dir, 106 | "external", 107 | "E032_15_coreMarks_mnemonics.bed")) 108 | ## annotate peak set with genes 109 | analysis.get_peak_gene_annotation() 110 | 111 | # Use accessibility quantitatively 112 | ## get coverage values for each peak in each sample of ATAC-seq 113 | analysis.measure_coverage() 114 | 115 | # Normalize accessibility (quantile normalization + GC correction, requires cqn R library) 116 | analysis.normalize(method="cqn") 117 | 118 | # Annotate normalized accessibility with sample and region info 119 | # # annotate dataframe with peak metadata 120 | analysis.annotate_features() 121 | # # annotate dataframe with sample metadata 122 | analysis.accessibility = analysis.annotate_samples() 123 | 124 | # UNSUPERVISED ANALYSIS 125 | # # plot pairwise sample correlations, 126 | # # perform dimensionality reduction (MDS, PCA) 127 | # # and plot samples in this spaces, annotated with their attributes 128 | analysis.unsupervised_analysis() 129 | 130 | 131 | # SUPERVISED ANALYSIS 132 | # # differential analysis with DESeq2 133 | analysis.differential_analysis() 134 | 135 | # # plot scatter, volcano, MA, heatmaps on the differential regions 136 | # # by groups and with individual samples, with normalized values 137 | # # and scalled values (Z-score). 138 | analysis.plot_differential( 139 | alpha=0.05, 140 | corrected_p_value=True, 141 | fold_change=1) 142 | 143 | # # perform enrichment analysis on differnetial region sets 144 | # # using LOLA, MEME-AME, HOMER and Enrichr 145 | analysis.differential_enrichment( 146 | directional=True, 147 | max_diff=1000, 148 | sort_var="pvalue") 149 | 150 | # # for each type of enrichment results, 151 | # # plot bar and scatter plots of odds ratio vs p-value, 152 | # # heatmaps of enrichment across terms for each comparison 153 | # # and comparison correlation in enrichment terms 154 | analysis.plot_differential_enrichment() 155 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome 2 | ^^^^^^^^ 3 | 4 | ``ngs_toolkit`` is a Python library for the analysis of NGS data. 5 | 6 | Its goals are to provide a highly customizable set of objects and tools that 7 | interact with each other to create data processing and analysis workflows in 8 | both a interactive and scripted way. 9 | 10 | ``ngs-toolkit`` is unique in the following aspects: 11 | 12 | - Includes tried-and-tested (and published) workflows for end-to-end analysis of NGS data, while at the same time allowing high customization; 13 | - Tailored for well-established NGS data types, but supporting arbitrary data types; 14 | - Its target audience are mid-level computational biologists who want to "get it done" and focus on interpretation of results. At the same time, it allows running workflows with minimal programming experience. 15 | 16 | ``ngs-toolkit`` is reaching maturity, with a stable API (from version 0.14.0 on), 17 | improving documentation and increasing test coverage. 18 | 19 | Head to the :doc:`Installation ` to see installation instructions, to 20 | :doc:`Usage ` for quick use, or have a look at the catalogue of available 21 | functions in the :doc:`API `. 22 | 23 | Contents 24 | ^^^^^^^^ 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | install 30 | usage 31 | examples 32 | concepts 33 | log_config 34 | report 35 | distributed 36 | manager_programs 37 | recipes 38 | api 39 | test 40 | changelog 41 | 42 | 43 | Indices and tables 44 | ================== 45 | 46 | * :ref:`genindex` 47 | * :ref:`modindex` 48 | * :ref:`search` 49 | 50 | Links 51 | ^^^^^^^^ 52 | 53 | * Documentation: http://toolkit.readthedocs.io/ 54 | * Issues and source code: https://github.com/afrendeiro/toolkit 55 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | With pip 5 | ============================= 6 | 7 | ``ngs_toolkit`` is available for Python 3 only. Is is tested in Python 3.6 and 3.7. 8 | 9 | To install, simply do: 10 | 11 | .. code-block:: bash 12 | 13 | pip install ngs-toolkit 14 | 15 | you might need to add a ``--user`` flag if not root or running in a virtual environment. 16 | 17 | This will install all the Python dependencies needed too. 18 | See `here `_ a list of all Python dependencies used. 19 | 20 | To install the latest development version: 21 | 22 | .. code-block:: bash 23 | 24 | pip install git+https://github.com/afrendeiro/toolkit.git#egg=ngs-toolkit 25 | 26 | 27 | Using a conda environment 28 | ============================= 29 | 30 | Get the `latest Python 3 installation of miniconda from the conda website `_ and follow the instructions for installation and activation of the environment. 31 | 32 | Setup the bioconda channel: 33 | 34 | .. code-block:: bash 35 | 36 | conda config --add channels defaults 37 | conda config --add channels bioconda 38 | conda config --add channels conda-forge 39 | 40 | Install non-Python dependencies: 41 | 42 | .. code-block:: bash 43 | 44 | conda install -y bedtools==2.27.1 45 | conda install -y ucsc-twobittofa 46 | conda install -y bioconductor-deseq2 47 | conda install -y bioconductor-cqn 48 | 49 | And then install the ``ngs-toolkit`` library with pip (available only through PyPi). 50 | 51 | .. code-block:: bash 52 | 53 | pip install ngs-toolkit 54 | 55 | 56 | Non-Python requirements 57 | ============================= 58 | 59 | 60 | ``ngs_toolkit`` makes use of some non-Python dependencies. 61 | 62 | - `bedtools `_: version should be at least 2.27.1 63 | 64 | The following are highly recommended only for some data or analysis types: 65 | 66 | - `R `_ and some bioconductor libraries (optional): 67 | - `DESeq2 `_ (optional): used for differential testing of genes/regulatory elements and variance stabilization transformation of data. 68 | - `cqn `_ (optional): used for GC-content aware normalization of NGS data. 69 | - `Kent tools `_ (optional): 70 | - the ``twoBitToFa`` binary from UCSC's Kent bioinformatics toolkit is used to convert between the 2bit and FASTA formats. 71 | 72 | For region-based enrichment analysis, you may also want to have the following software installed (entirely optional): 73 | 74 | - `MEME suite `_ 75 | - `HOMER motif analysis `_ 76 | - `LOLA R package `_ 77 | 78 | You can see how to install all requirements in an Ubuntu-based system in the provided `Dockerfile `_. 79 | 80 | 81 | Docker 82 | ============================= 83 | 84 | A Docker image containing ``ngs_toolkit`` and its dependencies is also available: https://hub.docker.com/r/afrendeiro/ngs-toolkit 85 | 86 | To pull the image and run a module for example in this way: 87 | 88 | .. code-block:: bash 89 | 90 | docker pull afrendeiro/ngs-toolkit 91 | docker run ngs-toolkit python3 -m ngs_toolkit.recipes.ngs_analysis --help 92 | 93 | You can also run an interactive session of ``ngs_toolkit`` `based on the docker image on Gitpod `_. 94 | 95 | The Dockerfile that produced the image is available in the github repository: https://github.com/afrendeiro/toolkit/blob/master/Dockerfile 96 | -------------------------------------------------------------------------------- /docs/source/log_config.rst: -------------------------------------------------------------------------------- 1 | Configuration, logging and versioning 2 | ************************************* 3 | 4 | .. _Configuration: 5 | 6 | 7 | Configuration 8 | ============================= 9 | 10 | ``ngs_toolkit`` uses a YAML configuration file. 11 | 12 | While entirely optional, this allows the user to specify preferences, patterns and allows usage across different computing environments. 13 | 14 | The user can provide its own configuration in two ways: 15 | 16 | * In a YAML file located in ``$HOME/.ngs_toolkit.config.yaml``; 17 | * A user provided file given during interactive runtime passed to ``ngs_toolkit.setup_config()``. 18 | 19 | If more than one is given values in the configuration files will be updated in the following order: 20 | 21 | 1. A minimal configuration file from the package data; 22 | 2. The user provided file in ``$HOME/.ngs_toolkit.config.yaml``; 23 | 3. The user provided file passed to ``ngs_toolkit.setup_config()``. 24 | 25 | To see how to structure the YAML file, see section below. 26 | 27 | 28 | 29 | Example configuration files 30 | ----------------------------- 31 | 32 | To see all available configuration fields have a look at the default configuration file: https://github.com/afrendeiro/toolkit/blob/master/ngs_toolkit/config/default.yaml#L1 33 | 34 | For a full example of a fully configured file have a look at the example configuration file: https://github.com/afrendeiro/toolkit/blob/master/ngs_toolkit/config/example.yaml#L1 35 | 36 | However, the configuration file does not need to include all fields. Below is a minimal example of a configuration file. 37 | 38 | .. code-block:: yaml 39 | 40 | username: user 41 | email: user@mail.com 42 | website_root: userwebsite.web.com 43 | preferences: 44 | # For the next item, environment variables are formatted if they are of the form ${VAR} 45 | root_reference_dir: ${USER}/reference_data 46 | root_projects_dir: ${USER}/projects 47 | default_genome_assemblies: 48 | - human: hg38 49 | - mouse: mm10 50 | # Below is the name of the divvy package configuration (http://divvy.databio.org/en/latest/) 51 | computing_configuration: 'slurm' 52 | sample_input_files: 53 | ATAC-seq: 54 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam" 55 | peaks: "{data_dir}/{sample_name}/peaks/{sample_name}_peaks.narrowPeak" 56 | summits: "{data_dir}/{sample_name}/peaks/{sample_name}_summits.narrowPeak" 57 | ChIP-seq: 58 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam" 59 | CNV: 60 | log2_read_counts: "{data_dir}/{sample_name}/{sample_name}_{resolution}/CNAprofiles/log2_read_counts.igv" 61 | RNA-seq: 62 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam" 63 | bitseq_counts: "{data_dir}/{sample_name}/quantification/{sample_name}_bitseq.tsv" 64 | 65 | 66 | .. note:: `Not all elements are required` 67 | 68 | In fact none of it is required, but it is recommended to have a look at the template configuration file and set custom options. 69 | 70 | .. _Logging: 71 | 72 | Logging 73 | ============================= 74 | 75 | ``ngs_toolkit`` will log its operations and errors using the Python standard logging library. 76 | 77 | This will happen by default to standard output (sys.stdout) but also to a file in ``$HOME/.ngs_toolkit.log.txt``. 78 | 79 | The location of the log file and the level of events to be reported can be customized in the ``ngs_toolkit.setup_logger()`` function. 80 | 81 | 82 | .. _Versioning: 83 | 84 | Versioning 85 | ============================= 86 | 87 | ``ngs_toolkit`` will by default timestamp every output it produces (CSV and figure files). 88 | 89 | This behaviour can be controlled independently for tables and figures by setting the respective values of the configuration file: 90 | 91 | .. code-block:: yaml 92 | 93 | preferences: 94 | report: 95 | timestamp_figures: False 96 | timestamp_tables: False 97 | -------------------------------------------------------------------------------- /docs/source/manager_programs.rst: -------------------------------------------------------------------------------- 1 | Manager programs 2 | ****************************** 3 | 4 | `ngs_toolkit` comes with two programs that provide a command line interface (CLI): 5 | - ``projectmanager`` handles the creation and execution of a `looper` project, providing sensible configuration templates and git-enabled tracking of changes. 6 | - ``trackmanager`` handles the creation of a UCSC trackhub or IGV link for ATAC/ChIP-seq data based on bigWig files created by ``looper`` pipelines. 7 | 8 | 9 | Here you can see the command-line usage instructions for the main looper command and for each subcommand: 10 | 11 | 12 | projectmanager 13 | ============================= 14 | 15 | .. code-block:: none 16 | 17 | usage: projectmanager [-h] {create,recipe} ... 18 | 19 | projectmanager - A project manager. 20 | 21 | positional arguments: 22 | {create,recipe} 23 | create Create project. 24 | recipe Run ngs_toolkit recipe for a given project. 25 | 26 | optional arguments: 27 | -h, --help show this help message and exit 28 | 29 | https://github.com/afrendeiro/toolkit 30 | 31 | 32 | 33 | projectmanager::create 34 | ----------------------------- 35 | 36 | .. code-block:: none 37 | 38 | usage: projectmanager create [-h] [-r ROOT_DIR] [-d] [--overwrite] 39 | project_name 40 | 41 | Create project. 42 | 43 | positional arguments: 44 | project_name Project name. 45 | 46 | optional arguments: 47 | -h, --help show this help message and exit 48 | -r ROOT_DIR, --root-dir ROOT_DIR 49 | Root directory to create projects. 50 | -d, --dry-run Don't actually do anything. 51 | --overwrite Don't overwrite any existing directory or file. 52 | 53 | 54 | projectmanager::recipe 55 | ----------------------------- 56 | 57 | .. code-block:: none 58 | 59 | usage: projectmanager recipe [-h] recipe_name project_config 60 | 61 | Run recipe. 62 | 63 | positional arguments: 64 | recipe_name Recipe name. 65 | project_config Project config. 66 | 67 | optional arguments: 68 | -h, --help show this help message and exit 69 | 70 | 71 | trackmanager 72 | ============================= 73 | 74 | .. code-block:: none 75 | 76 | usage: trackmanager [-h] [-a [ATTRIBUTES]] [-c COLOR_ATTRIBUTE] [-r] [-l] 77 | project_config_file 78 | 79 | positional arguments: 80 | project_config_file 81 | 82 | optional arguments: 83 | -h, --help show this help message and exit 84 | -a [ATTRIBUTES], --attrs [ATTRIBUTES] 85 | Sample attributes (annotation sheet columns) to use to 86 | order tracks. Add attributes comma-separated with no 87 | whitespace. 88 | -c COLOR_ATTRIBUTE, --color-attr COLOR_ATTRIBUTE 89 | Sample attribute to use to color tracks with. Default 90 | is first attribute passed. 91 | -r, --overlay-replicates 92 | Whether replicate samples should be overlaied in same 93 | track. Default=False. 94 | -l, --link Whether bigWig files should be soft-linked to the 95 | track database directory. Default=False. 96 | 97 | 98 | .. note:: `Copying vs linking bigWig files files in trackmanager` 99 | 100 | The intention of trackmanager is to create a hierarchy of files in a HTTP server which can be used by genome browsers. 101 | This requires files (and their parent directories) to be readable and executable. 102 | When soft-linking files, they will retain the permission attributes of the original files and this may not be appropriate to serve through a server. 103 | Be aware that copying or linking these files does not always works (manual movement of files might be required). 104 | 105 | 106 | .. note:: `Changing permissions of files and directories in bigwig directory` 107 | 108 | Trackmanager will try to change the permissions of the bigwig files and their parent directories to allow reading and execution by everyone. 109 | Be aware that this does not always works (manual permission changes might be required). 110 | 111 | -------------------------------------------------------------------------------- /docs/source/recipes.rst: -------------------------------------------------------------------------------- 1 | Recipes 2 | =========== 3 | 4 | 5 | ``ngs_toolkit`` provides scripts to perform routine tasks on NGS data - 6 | they are called ``recipes``. 7 | 8 | Recipes are distributed with ``ngs_toolkit`` and can be seen in the 9 | `github repository `_. 10 | 11 | To make it convenient to run the scripts on data from a project, 12 | recipes can also be run with the command 13 | ``projectmanager recipe ``. 14 | 15 | 16 | ngs_toolkit.recipes.ngs_analysis 17 | -------------------------------- 18 | 19 | .. argparse:: 20 | :module: ngs_toolkit.recipes.ngs_analysis 21 | :func: parse_arguments 22 | 23 | 24 | ngs_toolkit.recipes.call_peaks 25 | ------------------------------ 26 | 27 | .. argparse:: 28 | :module: ngs_toolkit.recipes.call_peaks 29 | :func: parse_arguments 30 | 31 | 32 | ngs_toolkit.recipes.coverage 33 | ---------------------------- 34 | 35 | .. argparse:: 36 | :module: ngs_toolkit.recipes.coverage 37 | :func: parse_arguments 38 | 39 | 40 | ngs_toolkit.recipes.deseq2 41 | -------------------------- 42 | 43 | .. argparse:: 44 | :module: ngs_toolkit.recipes.deseq2 45 | :func: parse_arguments 46 | 47 | 48 | ngs_toolkit.recipes.enrichr 49 | --------------------------- 50 | 51 | .. argparse:: 52 | :module: ngs_toolkit.recipes.enrichr 53 | :func: parse_arguments 54 | 55 | 56 | ngs_toolkit.recipes.generate_project 57 | ------------------------------------ 58 | 59 | .. argparse:: 60 | :module: ngs_toolkit.recipes.generate_project 61 | :func: parse_arguments 62 | 63 | 64 | ngs_toolkit.recipes.lola 65 | ------------------------ 66 | 67 | .. argparse:: 68 | :module: ngs_toolkit.recipes.lola 69 | :func: parse_arguments 70 | 71 | 72 | ngs_toolkit.recipes.merge_signal 73 | -------------------------------- 74 | 75 | .. argparse:: 76 | :module: ngs_toolkit.recipes.merge_signal 77 | :func: parse_arguments 78 | 79 | 80 | ngs_toolkit.recipes.region_enrichment 81 | ------------------------------------- 82 | 83 | .. argparse:: 84 | :module: ngs_toolkit.recipes.region_enrichment 85 | :func: parse_arguments 86 | 87 | 88 | ngs_toolkit.recipes.region_set_frip 89 | ----------------------------------- 90 | 91 | .. argparse:: 92 | :module: ngs_toolkit.recipes.region_set_frip 93 | :func: parse_arguments 94 | 95 | -------------------------------------------------------------------------------- /docs/source/report.rst: -------------------------------------------------------------------------------- 1 | Analysis reports 2 | ****************************** 3 | 4 | .. _Report: 5 | 6 | Each analysis object in the ``ngs_toolkit`` will by default record the outputs it produces (e.g. tables, figures). 7 | This allows the collection of all outputs in a standardized way and the generation of an HTML report. 8 | 9 | By default the location of the report will be in: ``/.analysis_report.html`` 10 | 11 | Every time a new output is produced, a new report is generated, in a way that analysis progress can be easily monitored in a user-friendly way by simply refreshing the HTML report file. This continuous generation behaviour can be controlled in the configuration file. 12 | 13 | The recording behaviour can also be controlled independently for tables and figures by setting the respective values of the configuration file: 14 | 15 | .. code-block:: yaml 16 | 17 | preferences: 18 | report: 19 | record_figures: True 20 | record_csv: True 21 | continuous_generation: True 22 | 23 | The report will by default be generated in the root of the project directory, but this can be controlled by manually calling the :func:`ngs_toolkit.analysis.Analysis.generate_report` function at the user's will. 24 | -------------------------------------------------------------------------------- /docs/source/test.rst: -------------------------------------------------------------------------------- 1 | Testing 2 | ============================= 3 | 4 | To make sure everything is correctly configured, the user is encouraged to test the library prior to use. 5 | 6 | In order to do this, install testing requirements and simply run ``pytest``: 7 | 8 | .. code-block:: bash 9 | 10 | pip install ngs-toolkit[testing] 11 | pytest --pyargs ngs_toolkit 12 | 13 | 14 | Pytest will output summary results (`see for example `_) and further outputs can be seen in ``${TMPDIR}/pytest-of-${USER}/`` or ``/tmp/pytest-of-${USER}/`` if $TMPDIR is not defined. 15 | 16 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Quick usage 2 | ============================= 3 | 4 | 5 | Interactive usage through the API 6 | --------------------------------- 7 | 8 | To use a particular class or function from the toolkit, simply import it 9 | following the structure of the library: 10 | 11 | .. code-block:: python 12 | 13 | from ngs_toolkit import ATACSeqAnalysis 14 | from ngs_toolkit.utils import log_p_values 15 | 16 | The :class:`ngs_toolkit.analysis.Analysis` and their data type-specific 17 | children are the main drivers of the workflow, storing attributes and providing 18 | various methods through an OOP interface: 19 | 20 | .. code-block:: python 21 | 22 | from ngs_toolkit.demo import generate_project 23 | 24 | an = generate_project(data_type="ATAC-seq", sample_input_files=True) 25 | an.measure_coverage() 26 | an.normalize() 27 | an.unsupervised_analysis() 28 | an.differential_analysis() 29 | an.plot_differential() 30 | an.get_peak_gene_annotation() 31 | an.annotate_features() 32 | an.differential_enrichment(steps=['enrichr']) 33 | an.plot_differential_enrichment() 34 | 35 | 36 | Running recipes through the command-line interface 37 | -------------------------------------------------- 38 | 39 | ``ngs_toolkit`` also has some command-line programs on some commonly used 40 | workflows (here called ``recipes``), which can be run in the following manner: 41 | 42 | .. code-block:: bash 43 | 44 | PEP=`python -m ngs_toolkit.recipes.generate_project --sample-input-files True` 45 | python -m ngs_toolkit.recipes.ngs_analysis $PEP 46 | 47 | This example is roughly equivalent to the on above with interactive usage. 48 | -------------------------------------------------------------------------------- /ngs_toolkit/.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF and ePub 13 | formats: 14 | - htmlzip 15 | 16 | # Optionally set the version of Python and requirements required to build your docs 17 | python: 18 | version: 3.7 19 | install: 20 | - requirements: requirements/requirements.docs.txt 21 | -------------------------------------------------------------------------------- /ngs_toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | from joblib import Memory 5 | 6 | try: 7 | # Even though there is no "ngs_toolkit/_version" file, 8 | # it should be generated by 9 | # setuptools_scm when building the package 10 | from ngs_toolkit._version import __version__ 11 | except ImportError: 12 | from setuptools_scm import get_version as _get_version 13 | 14 | __version__ = _get_version(root="..", relative_to=__file__) 15 | 16 | 17 | # Setup joblib memory 18 | JOBLIB_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".ngs_toolkit") 19 | MEMORY = Memory(location=JOBLIB_CACHE_DIR, verbose=0) 20 | 21 | 22 | def setup_logger(name="ngs_toolkit", level="INFO", logfile=None): 23 | """ 24 | Set up a logger for the library. 25 | 26 | Parameters 27 | ---------- 28 | 29 | level : :obj:`str`, optional 30 | Level of logging to display. 31 | See possible levels here: 32 | https://docs.python.org/2/library/logging.html#levels 33 | 34 | Defaults to "INFO". 35 | 36 | logfile : :obj:`str`, optional 37 | File to write log to. 38 | 39 | Defaults to "~/.ngs_toolkit.log.txt". 40 | 41 | Returns 42 | ------- 43 | :class:`logging.Logger` 44 | A logger called "ngs_toolkit". 45 | """ 46 | import logging 47 | import os 48 | 49 | _LOGGER = logging.getLogger(name) 50 | _LOGGER.setLevel(logging.DEBUG) 51 | # create file handler which logs even debug messages 52 | if logfile is None: 53 | logfile = os.path.join(os.path.expanduser("~"), "." + name + ".log.txt") 54 | fh = logging.FileHandler(logfile) 55 | fh.setLevel(logging.DEBUG) 56 | # create console handler with a higher log level 57 | ch = logging.StreamHandler() 58 | ch.setLevel(logging.getLevelName(level)) 59 | # create formatter and add it to the handlers 60 | fmt = "ngs_toolkit.v{}:%(module)s:L%(lineno)d ".format(__version__) 61 | fmt += "(%(funcName)s) [%(levelname)s] %(asctime)s > %(message)s" 62 | formatter = logging.Formatter(fmt, datefmt="%Y-%m-%d %H:%M:%S") 63 | fh.setFormatter(formatter) 64 | # fmt = "[%(levelname)s] > %(message)s" 65 | fmt = "ngs_toolkit:%(module)s:L%(lineno)d (%(funcName)s) [%(levelname)s]" 66 | fmt += " > %(message)s" 67 | formatter = logging.Formatter(fmt) 68 | ch.setFormatter(formatter) 69 | # add the handlers to the logger 70 | _LOGGER.addHandler(fh) 71 | _LOGGER.addHandler(ch) 72 | 73 | _LOGGER.debug( 74 | "This is ngs_toolkit (http://ngs-toolkit.rtfd.io), version: {}".format(__version__) 75 | ) 76 | return _LOGGER 77 | 78 | 79 | def setup_config(custom_yaml_config=None): 80 | """ 81 | Set up global library configuration. 82 | 83 | It reads ngs_toolkit's package data to load a default configuration, 84 | tries to update it by reading a file in ``~/.ngs_toolkit.config.yaml`` 85 | if present, and lastly, updates it by reading a possible passed yaml file 86 | ``custom_yaml_config``. 87 | Non-exisiting fields will maintain the previous values, so that the user 88 | needs only to specify the section(s) as needed. 89 | 90 | Parameters 91 | ---------- 92 | custom_yaml_config : :obj:`str`, optional 93 | Path to YAML file with configuration. 94 | To see the structure of the YAML file, see 95 | https://github.com/afrendeiro/toolkit/blob/master/ngs_toolkit/config/default.yaml 96 | 97 | Defaults to :obj:`None`. 98 | 99 | Returns 100 | ------- 101 | :obj:`dict` 102 | Dictionary with configurations. 103 | """ 104 | import pkg_resources 105 | import os 106 | import yaml 107 | from ngs_toolkit.utils import _format_string_with_environment_variables 108 | 109 | default_config_path = "config/default.yaml" 110 | default_config_path = pkg_resources.resource_filename(__name__, default_config_path) 111 | _LOGGER.debug( 112 | "Reading default configuration file distributed" 113 | " with package from '{}'.".format(default_config_path) 114 | ) 115 | try: 116 | _CONFIG = yaml.safe_load(open(default_config_path, "r")) 117 | _LOGGER.debug("Default config: {}".format(_CONFIG)) 118 | except IOError: 119 | _LOGGER.error("Couldn't read configuration file from '{}'.".format(default_config_path)) 120 | _CONFIG = dict() 121 | 122 | user_config_path = os.path.join(os.path.expanduser("~"), ".ngs_toolkit.config.yaml") 123 | if os.path.exists(user_config_path): 124 | # Read up and format user variables 125 | _LOGGER.debug("Found custom user config: {}".format(user_config_path)) 126 | try: 127 | string = open(user_config_path, "r").read() 128 | # filter out comments (to prevent formating stuff there) 129 | string = "\n".join(filter(lambda x: not x.strip().startswith("#"), string.split("\n"))) 130 | string = _format_string_with_environment_variables(string) 131 | 132 | custom_config = yaml.safe_load(string) 133 | _LOGGER.debug("Custom user config: {}".format(custom_config)) 134 | # Update config 135 | _LOGGER.debug( 136 | "Updating configuration with custom file from '{}'.".format(user_config_path) 137 | ) 138 | _CONFIG.update(custom_config) 139 | _LOGGER.debug("Current config: {}".format(custom_config)) 140 | except IOError: 141 | _LOGGER.error( 142 | "Configuration file from '{}' exists but is not readable." 143 | " Ignoring.".format(user_config_path) 144 | ) 145 | else: 146 | _LOGGER.debug( 147 | "To use custom configurations including paths to static files," 148 | " create a '{}' file.".format(user_config_path) 149 | ) 150 | 151 | if custom_yaml_config is not None: 152 | # Read up 153 | try: 154 | custom_config = yaml.safe_load(open(custom_yaml_config, "r")) 155 | _LOGGER.debug("Custom passed config: {}".format(custom_config)) 156 | # Update config 157 | _LOGGER.debug( 158 | "Updating configuration with custom file from '{}'.".format(custom_yaml_config) 159 | ) 160 | _CONFIG.update(custom_config) 161 | _LOGGER.debug("Current config: {}".format(custom_config)) 162 | except IOError as e: 163 | _LOGGER.error( 164 | "Passed configuration from '{}' exists but is not readable.".format( 165 | custom_yaml_config 166 | ) 167 | ) 168 | raise e 169 | 170 | return _CONFIG 171 | 172 | 173 | def setup_graphic_preferences(): 174 | """ 175 | Set up graphic preferences. 176 | 177 | It uses the values under "preferences:graphics:matplotlib:rcParams" 178 | and "preferences:graphics:seaborn:parameters" to matplotlib 179 | and seaborn respectively. 180 | """ 181 | import matplotlib 182 | import seaborn as sns 183 | 184 | graphics = _CONFIG["preferences"]["graphics"] 185 | # matplotlib 186 | rc_params = graphics["matplotlib"]["rcParams"] 187 | matplotlib.rcParams.update(rc_params) 188 | matplotlib.rcParams["svg.fonttype"] = "none" 189 | matplotlib.rc("text", usetex=False) 190 | 191 | # seaborn 192 | seaborn_params = graphics["seaborn"]["parameters"] 193 | sns.set(**seaborn_params) 194 | 195 | 196 | def clear_log(): 197 | import os 198 | 199 | logfile = os.path.join(os.path.expanduser("~"), ".ngs_toolkit.log.txt") 200 | open(logfile, "w") 201 | 202 | 203 | def setup_timestamping(): 204 | if _CONFIG["preferences"]["report"]["record_csv"]: 205 | from ngs_toolkit.decorators import ( 206 | read_csv_timestamped, 207 | to_csv_timestamped, 208 | timestamped_input, 209 | ) 210 | import pandas as pd 211 | 212 | pd.io.parsers.TextFileReader = read_csv_timestamped(pd.io.parsers.TextFileReader) 213 | pd.DataFrame.to_csv = to_csv_timestamped( 214 | pd.DataFrame.to_csv, exclude_functions=["from_dataframe"] 215 | ) 216 | 217 | os.remove = timestamped_input(os.remove) 218 | 219 | 220 | def check_bedtools_version(): 221 | import pybedtools 222 | 223 | version = pybedtools.helpers.settings.bedtools_version 224 | # not existing 225 | v = ".".join([str(x) for x in version]) 226 | msg = "Bedtools does not seem to be installed or is not in $PATH." 227 | if v == "": 228 | _LOGGER.warning(msg) 229 | return None 230 | 231 | # too low version 232 | msg = "Bedtools version '{}' is smaller than 2.26.".format(v) 233 | msg += " Please upgrade to newer version." 234 | if (version[0] < 2) or (version[1] < 26): 235 | _LOGGER.warning(msg) 236 | return None 237 | return v 238 | 239 | 240 | # setup 241 | _LOGGER = setup_logger() 242 | _CONFIG = setup_config() 243 | check_bedtools_version() 244 | setup_graphic_preferences() 245 | setup_timestamping() 246 | 247 | 248 | # Easier API: 249 | # import all children of Analysis class 250 | from ngs_toolkit.analysis import Analysis 251 | from ngs_toolkit.atacseq import ATACSeqAnalysis 252 | from ngs_toolkit.chipseq import ChIPSeqAnalysis 253 | from ngs_toolkit.cnv import CNVAnalysis 254 | from ngs_toolkit.rnaseq import RNASeqAnalysis 255 | -------------------------------------------------------------------------------- /ngs_toolkit/config/default.yaml: -------------------------------------------------------------------------------- 1 | username: 2 | email: 3 | website_root: 4 | supported_data_types: 5 | - ATAC-seq 6 | - ChIP-seq 7 | - RNA-seq 8 | - CNV 9 | preferences: 10 | # For the next item, environment variables are formatted if they are of the form ${VAR} 11 | root_reference_dir: 12 | root_projects_dir: 13 | default_genome_assemblies: 14 | - human: hg38 15 | - mouse: mm10 16 | # The next item is the default computing configuration to use from divvy. 17 | # Run "divvy list" to see all options. 18 | # See more here: http://code.databio.org/divvy/ 19 | computing_configuration: 'default' 20 | report: 21 | record_figures: True 22 | record_csv: True 23 | continuous_generation: True 24 | timestamp_figures: True 25 | timestamp_tables: True 26 | graphics: 27 | matplotlib: 28 | backend: TkAgg # Agg 29 | # key:values under rcParams are used to update matplotlib.rcParams 30 | rcParams: 31 | # this ensures text in plots is exported as text objects 32 | svg.fonttype: "none" 33 | seaborn: 34 | # key:values under parameters are passed to seaborn.set 35 | parameters: 36 | context: "paper" 37 | style: "ticks" 38 | palette: "colorblind" 39 | color_codes: True 40 | figure_saving: 41 | # these arguments are passed to matplotlib.pyplot.savefig 42 | # https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html 43 | format: svg 44 | dpi: 300 45 | bbox_inches: "tight" 46 | close_saved_figures: True 47 | 48 | sample_input_files: 49 | # values in this section can use string formatting 50 | # of the form {variable} to be completed with variables from the sample objects 51 | # Example: 52 | # ATAC-seq: 53 | # aligned_filtered_bam: 54 | # "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam" 55 | ATAC-seq: 56 | aligned_filtered_bam: 57 | peaks: 58 | summits: 59 | ChIP-seq: 60 | aligned_filtered_bam: 61 | ChIPmentation: 62 | aligned_filtered_bam: 63 | CNV: 64 | log2_read_counts: 65 | 1000kb: 66 | 100kb: 67 | 10kb: 68 | RNA-seq: 69 | aligned_filtered_bam: 70 | counts: 71 | 72 | resources: 73 | lola: 74 | region_databases: 75 | # under each section, there should be a list of items 76 | hg19: 77 | - 78 | - 79 | hg38: 80 | - 81 | - 82 | mm10: 83 | - 84 | - 85 | region_set_labeling_columns: 86 | - "collection" 87 | - "description" 88 | - "filename" 89 | - "cellType" 90 | - "tissue" 91 | - "antibody" 92 | - "treatment" 93 | output_column_names: 94 | odds_ratio: "oddsRatio" 95 | log_p_value: "pValueLog" 96 | meme: 97 | motif_databases: 98 | human: 99 | mouse: 100 | vertebrate: 101 | motif_id_mapping: 102 | mouse: 103 | enrichr: 104 | gene_set_libraries: 105 | # this should be a list of items 106 | - "GO_Biological_Process_2015" 107 | - "ChEA_2015" 108 | - "KEGG_2016" 109 | - "ESCAPE" 110 | - "Epigenomics_Roadmap_HM_ChIP-seq" 111 | - "ENCODE_TF_ChIP-seq_2015" 112 | - "ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X" 113 | - "ENCODE_Histone_Modifications_2015" 114 | - "OMIM_Expanded" 115 | - "TF-LOF_Expression_from_GEO" 116 | - "Gene_Perturbations_from_GEO_down" 117 | - "Gene_Perturbations_from_GEO_up" 118 | - "Disease_Perturbations_from_GEO_down" 119 | - "Disease_Perturbations_from_GEO_up" 120 | - "Drug_Perturbations_from_GEO_down" 121 | - "Drug_Perturbations_from_GEO_up" 122 | - "WikiPathways_2016" 123 | - "Reactome_2016" 124 | - "BioCarta_2016" 125 | - "NCI-Nature_2016" 126 | - "BioPlanet_2019" 127 | 128 | executables: 129 | twoBitToFa: twoBitToFa 130 | fasta-dinucleotide-shuffle: fasta-dinucleotide-shuffle 131 | ame: ame 132 | findMotifsGenome.pl: findMotifsGenome.pl 133 | compareMotifs.pl: compareMotifs.pl 134 | -------------------------------------------------------------------------------- /ngs_toolkit/config/example.yaml: -------------------------------------------------------------------------------- 1 | username: arendeiro 2 | email: arendeiro@cemm.oeaw.ac.at 3 | website_root: http://biomedical-sequencing.at/bocklab/arendeiro/ 4 | supported_data_types: 5 | - ATAC-seq 6 | - ChIP-seq 7 | - RNA-seq 8 | - CNV 9 | preferences: 10 | # For the next item, environment variables are formatted if they are of the form ${VAR} 11 | root_reference_dir: /home/${USER}/reference/ 12 | root_projects_dir: /home/${USER}/projects/ 13 | default_genome_assemblies: 14 | - human: hg38 15 | - mouse: mm10 16 | # The next item is the default computing configuration to use from divvy. 17 | # Run "divvy list" to see all options. 18 | # See more here: http://code.databio.org/divvy/ 19 | computing_configuration: 'default' 20 | report: 21 | record_figures: True 22 | record_csv: True 23 | continuous_generation: True 24 | timestamp_figures: True 25 | timestamp_tables: True 26 | graphics: 27 | matplotlib: 28 | backend: TkAgg # Agg 29 | # key:values under rcParams are used to update matplotlib.rcParams 30 | rcParams: 31 | # this ensures text in plots is exported as text objects 32 | svg.fonttype: "none" 33 | seaborn: 34 | # key:values under parameters are passed to seaborn.set 35 | parameters: 36 | context: "paper" 37 | style: "white" 38 | palette: "colorblind" 39 | color_codes: True 40 | figure_saving: 41 | # these arguments are passed to matplotlib.pyplot.savefig 42 | # https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html 43 | format: svg 44 | dpi: 300 45 | bbox_inches: "tight" 46 | close_saved_figures: True 47 | 48 | sample_input_files: 49 | # values in this section can use string formatting 50 | # of the form {variable} to be completed with variables from the sample objects 51 | # Example: 52 | # ATAC-seq: 53 | # aligned_filtered_bam: 54 | # "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam" 55 | ATAC-seq: 56 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam" 57 | peaks: "{data_dir}/{sample_name}/peaks/{sample_name}_peaks.narrowPeak" 58 | summits: "{data_dir}/{sample_name}/peaks/{sample_name}_summits.bed" 59 | ChIP-seq: 60 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam" 61 | ChIPmentation: 62 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam" 63 | CNV: 64 | log2_read_counts: 65 | 1000kb: "{data_dir}/{sample_name}/{sample_name}_1000kb/CNAprofiles/log2_read_counts.igv" 66 | 100kb: "{data_dir}/{sample_name}/{sample_name}_100kb/CNAprofiles/log2_read_counts.igv" 67 | 10kb: "{data_dir}/{sample_name}/{sample_name}_10kb/CNAprofiles/log2_read_counts.igv" 68 | RNA-seq: 69 | aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam" 70 | counts: "{data_dir}/{sample_name}/bowtie1_{genome}/bitSeq/{sample_name}.counts" 71 | 72 | resources: 73 | lola: 74 | region_databases: 75 | # under each section, there should be a list of items 76 | hg19: 77 | - /home/${USER}/resources/regions/LOLACore/hg19/ 78 | - /home/${USER}/resources/regions/customRegionDB/hg19/ 79 | hg38: 80 | - /home/${USER}/resources/regions/LOLACore/hg38/ 81 | - /home/${USER}/resources/regions/customRegionDB/hg38/ 82 | mm10: 83 | - /home/${USER}/resources/regions/LOLACore/mm10/ 84 | - /home/${USER}/resources/regions/customRegionDB/mm10/ 85 | region_set_labeling_columns: 86 | - "collection" 87 | - "description" 88 | - "filename" 89 | - "cellType" 90 | - "tissue" 91 | - "antibody" 92 | - "treatment" 93 | output_column_names: 94 | odds_ratio: "oddsRatio" 95 | log_p_value: "pValueLog" 96 | meme: 97 | motif_databases: 98 | human: /home/${USER}/resources/motifs/motif_databases/HUMAN/HOCOMOCOv10.meme 99 | mouse: /home/${USER}/resources/motifs/motif_databases/MOUSE/uniprobe_mouse.meme 100 | vertebrate: /home/arendeiro/workspace/homer_4.8/data/knownTFs/vertebrates/known.motifs 101 | motif_id_mapping: 102 | mouse: /home/${USER}/resources/motifs/motif_databases/MOUSE/uniprobe_mouse.id_mapping.tsv 103 | enrichr: 104 | gene_set_libraries: 105 | # this should be a list of items 106 | - "GO_Biological_Process_2015" 107 | - "ChEA_2015" 108 | - "KEGG_2016" 109 | - "ESCAPE" 110 | - "Epigenomics_Roadmap_HM_ChIP-seq" 111 | - "ENCODE_TF_ChIP-seq_2015" 112 | - "ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X" 113 | - "ENCODE_Histone_Modifications_2015" 114 | - "OMIM_Expanded" 115 | - "TF-LOF_Expression_from_GEO" 116 | - "Gene_Perturbations_from_GEO_down" 117 | - "Gene_Perturbations_from_GEO_up" 118 | - "Disease_Perturbations_from_GEO_down" 119 | - "Disease_Perturbations_from_GEO_up" 120 | - "Drug_Perturbations_from_GEO_down" 121 | - "Drug_Perturbations_from_GEO_up" 122 | - "WikiPathways_2016" 123 | - "Reactome_2016" 124 | - "BioCarta_2016" 125 | - "NCI-Nature_2016" 126 | - "BioPlanet_2019" 127 | 128 | executables: 129 | twoBitToFa: twoBitToFa 130 | fasta-dinucleotide-shuffle: fasta-dinucleotide-shuffle 131 | ame: ame 132 | findMotifsGenome.pl: findMotifsGenome.pl 133 | compareMotifs.pl: compareMotifs.pl -------------------------------------------------------------------------------- /ngs_toolkit/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | organism_to_species_mapping = { 4 | "human": "hsapiens", 5 | "mouse": "mmusculus", 6 | "yeast": "scerevisiae", 7 | } 8 | organism_to_latest_ensembl_mapping = { 9 | "human": "grch38", 10 | "mouse": "grcm38", 11 | "yeast": "R64", 12 | } 13 | genome_to_organism_mapping = { 14 | "hg38": "human", 15 | "hg19": "human", 16 | "mm10": "mouse" 17 | } 18 | ucsc_to_ensembl_mapping = { 19 | "hg38": "grch38", 20 | "hg19": "grch37", 21 | "mm10": "grcm38", 22 | "mm9": "grcm37", 23 | } 24 | genome_to_ensembl_mapping = ucsc_to_ensembl_mapping 25 | -------------------------------------------------------------------------------- /ngs_toolkit/decorators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from functools import wraps 4 | from ngs_toolkit import _LOGGER 5 | from ngs_toolkit.utils import warn_or_raise 6 | 7 | 8 | def check_has_attributes(attributes=None, object_types=None, permissive=False): 9 | attributes = [] or attributes 10 | object_types = [None] * len(attributes) or object_types 11 | if len(attributes) != len(object_types): 12 | msg = "`attributes` and `object_types` arguments must be the same length." 13 | _LOGGER.error(msg) 14 | raise ValueError(msg) 15 | 16 | def decorator(f): 17 | @wraps(f) 18 | def wrapper(*args, **kwargs): 19 | import pandas as pd 20 | 21 | # check attributes are set 22 | msg = "Analysis '{}' attribute(s) are not set." 23 | has = pd.Series( 24 | [hasattr(args[0], attr) for attr in attributes], 25 | index=attributes) 26 | if not has.all(): 27 | warn_or_raise(AttributeError(msg.format(",".join(has[~has].index))), permissive) 28 | 29 | # check attributes are not None 30 | msg = "Analysis '{}' attribute(s) are None." 31 | not_none = pd.Series( 32 | [getattr(args[0], attr) is not None for attr in attributes], 33 | index=attributes) 34 | if not not_none.all(): 35 | warn_or_raise(AttributeError(msg.format(",".join(not_none[~not_none].index))), permissive) 36 | 37 | # check the type of attribute values matches requested 38 | msg = "Analysis '{}' attribute(s) are not of requested types '{}'." 39 | t_attributes = [a for a, t in zip(attributes, object_types) if t is not None] 40 | t_object_types = [t for a, t in zip(attributes, object_types) if t is not None] 41 | not_type = pd.Series( 42 | [isinstance(getattr(args[0], attr), t) is not None 43 | for attr, t in zip(t_attributes, t_object_types)], 44 | index=t_attributes, dtype=object) 45 | if not not_type.all(): 46 | warn_or_raise( 47 | AttributeError(msg.format( 48 | ",".join(not_type[~not_type].index), 49 | ",".join([str(t) for t in t_object_types]))), 50 | permissive) 51 | 52 | # for iterable types, check length > 0 53 | msg = "Analysis '{}' attribute(s) have 0 elements." 54 | i_attributes = [a for a, t in zip(attributes, object_types) if hasattr(a, "__iter__")] 55 | i_object_types = [t for a, t in zip(attributes, object_types) if hasattr(a, "__iter__")] 56 | not_empty = pd.Series( 57 | [len(getattr(args[0], attr)) > 0 for attr in i_attributes], 58 | index=i_attributes) 59 | if not not_empty.all(): 60 | warn_or_raise( 61 | AttributeError(msg.format( 62 | ",".join(not_empty[~not_empty].index), 63 | ",".join([str(t) for t in i_object_types]))), 64 | permissive) 65 | return f(*args, **kwargs) 66 | return wrapper 67 | return decorator 68 | 69 | 70 | def read_csv_timestamped(f): 71 | from ngs_toolkit.utils import get_this_file_or_timestamped 72 | @wraps(f) 73 | def wrapper(*args, **kwargs): 74 | for i, _ in enumerate(args): 75 | if isinstance(args[i], str): 76 | args = args[:i] + ( 77 | get_this_file_or_timestamped(args[i]),) + args[i + 1:] 78 | return f(*args, **kwargs) 79 | return wrapper 80 | 81 | 82 | def to_csv_timestamped(f, exclude_functions=None): 83 | 84 | # TODO: fix to files without "." (dot) 85 | from ngs_toolkit.utils import ( 86 | record_analysis_output, get_timestamp, 87 | is_analysis_descendent) 88 | from ngs_toolkit import _CONFIG 89 | 90 | @wraps(f) 91 | def wrapper(*args, **kwargs): 92 | if is_analysis_descendent(exclude_functions=exclude_functions): 93 | # Add timestamp 94 | if _CONFIG["preferences"]["report"]["timestamp_tables"]: 95 | if len(args) > 1: 96 | if isinstance(args[1], str): 97 | s = args[1].split(".") 98 | end = s[-1] 99 | body = ".".join(s[:-1]) 100 | args = (args[0], ".".join([body, get_timestamp(), end])) + args[2:] 101 | record_analysis_output(args[1]) 102 | else: 103 | if isinstance(args[0], str): 104 | s = args[0].split(".") 105 | end = s[-1] 106 | body = ".".join(s[:-1]) 107 | args = (".".join([body, get_timestamp(), end])) + args[1:] 108 | record_analysis_output(args[0]) 109 | return f(*args, **kwargs) 110 | return wrapper 111 | 112 | 113 | def timestamped_input(f): 114 | from ngs_toolkit.utils import get_this_file_or_timestamped 115 | 116 | @wraps(f) 117 | def wrapper(file): 118 | return f(get_this_file_or_timestamped(file)) 119 | return wrapper 120 | -------------------------------------------------------------------------------- /ngs_toolkit/demo/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A module providing functions to generate Analysis, Projects and their data. 5 | """ 6 | 7 | from ngs_toolkit.demo.data_generator import ( 8 | generate_count_matrix, generate_data, 9 | generate_project, generate_projects) 10 | -------------------------------------------------------------------------------- /ngs_toolkit/exceptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | class Error(Exception): 5 | """Base class for exceptions in this module.""" 6 | pass 7 | 8 | 9 | class InputError(Error): 10 | """ 11 | Exception raised for errors in the input. 12 | 13 | Attributes 14 | ---------- 15 | expr : 16 | Input expression in which the error occurred 17 | msg : :obj:`str` 18 | Explanation of the error 19 | """ 20 | 21 | def __init__(self, expr, msg): 22 | self.expr = expr 23 | self.msg = msg 24 | 25 | 26 | class UserInputError(InputError): 27 | """ 28 | Exception raised for errors in the input caused by the user. 29 | 30 | Attributes 31 | ---------- 32 | expr : 33 | Input expression in which the error occurred 34 | msg : :obj:`str` 35 | Explanation of the error 36 | """ 37 | pass 38 | 39 | 40 | class NetworkError(Error): 41 | """ 42 | Exception raised for errors in API calls over the internet. 43 | 44 | Attributes 45 | ---------- 46 | expr : 47 | Input expression in which the error occurred 48 | msg : :obj:`str` 49 | Explanation of the error 50 | """ 51 | def __init__(self, expr, msg): 52 | self.expr = expr 53 | self.msg = msg 54 | 55 | 56 | class DependencyError(Error): 57 | """ 58 | Exception raised for errors in the input. 59 | 60 | Attributes 61 | ---------- 62 | expr : 63 | Input expression in which the error occurred 64 | msg : :obj:`str` 65 | Explanation of the error 66 | """ 67 | 68 | def __init__(self, expr, msg): 69 | self.expr = expr 70 | self.msg = msg 71 | 72 | 73 | class DependencyNotFoundError(Error): 74 | """ 75 | Exception raised for errors in the input. 76 | 77 | Attributes 78 | ---------- 79 | expr : 80 | Input expression in which the error occurred 81 | msg : :obj:`str` 82 | Explanation of the error 83 | """ 84 | 85 | def __init__(self, expr, msg): 86 | self.expr = expr 87 | self.msg = msg 88 | -------------------------------------------------------------------------------- /ngs_toolkit/parsers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def parse_ame(ame_output): 9 | """ 10 | Parse results of MEME-AME motif enrichment. 11 | 12 | Parameters 13 | ---------- 14 | ame_output : :obj:`str` 15 | MEME-AME results file. 16 | 17 | Returns 18 | ---------- 19 | pandas.DataFrame 20 | Data frame with enrichment statistics for each found TF motif. 21 | 22 | Raises 23 | ------- 24 | IOError 25 | If directory contain 26 | """ 27 | with open(ame_output, "r") as handle: 28 | lines = handle.readlines() 29 | 30 | output = list() 31 | for line in lines: 32 | # skip header lines 33 | if line[0] not in [str(i) for i in range(10)]: 34 | continue 35 | 36 | # get motif string and the first half of it (simple name) 37 | motif = line.strip().split(" ")[5].split("_")[0] 38 | # get corrected p-value 39 | q_value = float(line.strip().split(" ")[-2]) 40 | # append 41 | output.append((motif, q_value)) 42 | 43 | r = pd.Series(dict(output)).reset_index() 44 | r.columns = ["TF", "p_value"] 45 | return r 46 | 47 | 48 | def parse_homer(homer_dir): 49 | """ 50 | Parse results of HOMER findMotifs.pl de novo motif enrichment. 51 | 52 | Parameters 53 | ---------- 54 | homer_dir : :obj:`str` 55 | Directory with HOMER results. 56 | 57 | Returns 58 | ---------- 59 | pandas.DataFrame 60 | Data frame with enrichment statistics for each found TF motif. 61 | 62 | Raises 63 | ------- 64 | IOError 65 | """ 66 | import glob 67 | import re 68 | 69 | motif_htmls = sorted(glob.glob(os.path.join(homer_dir, "motif*.info.html"))) 70 | 71 | if len(motif_htmls) < 1: 72 | raise IOError("Homer directory does not contain any discovered motifs.") 73 | 74 | output = pd.DataFrame() 75 | for motif_html in motif_htmls: 76 | 77 | motif = int( 78 | re.sub( 79 | ".info.html", 80 | "", 81 | re.sub(os.path.join(homer_dir, "motif"), "", motif_html), 82 | ) 83 | ) 84 | 85 | with open(motif_html, "r") as handle: 86 | content = handle.read() 87 | 88 | # Parse table with motif info 89 | info_table = content[ 90 | re.search("""""", content) 91 | .end(): re.search("
", content) 92 | .start() 93 | ].strip() 94 | 95 | info_table = pd.DataFrame( 96 | [ 97 | x.split("") 98 | for x in info_table.replace("", "").split("") 99 | ] 100 | ) 101 | info_table.columns = ["description", "value"] 102 | info_table["description"] = info_table["description"].str.strip() 103 | info_table["motif"] = motif 104 | 105 | # Add most probable known motif name 106 | info_table["known_motif"] = content[ 107 | re.search("

", content).end(): re.search("

", content).start() 108 | ] 109 | 110 | # append 111 | output = output.append(info_table, ignore_index=True) 112 | 113 | return output.sort_values("motif") 114 | 115 | 116 | def parse_great_enrichment(input_tsv): 117 | """ 118 | Parse output from GREAT enrichment (http://great.stanford.edu). 119 | 120 | Parameters 121 | ---------- 122 | input_tsv : :obj:`str` 123 | TSV file exported from GREAT through the option "All data as .tsv" in "Global Controls". 124 | 125 | Returns 126 | ---------- 127 | pandas.DataFrame 128 | Pandas dataframe with enrichment results. 129 | """ 130 | df = pd.read_csv(input_tsv, sep="\t", skiprows=3) 131 | df.columns = df.columns.str.replace("# ", "") 132 | return df.loc[~df.iloc[:, 0].str.startswith("#")] 133 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/__init__.py: -------------------------------------------------------------------------------- 1 | # from ngs_toolkit.utils import have_unbuffered_output 2 | 3 | # have_unbuffered_output() 4 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/call_peaks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Call peaks for ChIP-seq samples given a comparison table 5 | mapping foreground-background relationships between samples. 6 | """ 7 | 8 | 9 | import sys 10 | 11 | from argparse import ArgumentParser 12 | 13 | import pandas as pd 14 | 15 | from ngs_toolkit.chipseq import ChIPSeqAnalysis 16 | 17 | 18 | def parse_arguments(): 19 | """ 20 | Global options for analysis. 21 | """ 22 | parser = ArgumentParser( 23 | prog="python -m ngs_toolkit.recipes.call_peaks", description=__doc__) 24 | parser.add_argument( 25 | dest="config_file", help="YAML project configuration file.", type=str 26 | ) 27 | parser.add_argument( 28 | "-c", 29 | "--comparison-table", 30 | dest="comparison_table", 31 | default=None, 32 | help="Comparison table to use for peak calling. If not provided will use a file" 33 | "named `comparison_table.csv` in the same directory of the given YAML Project configuration file.", 34 | type=str, 35 | ) 36 | parser.add_argument( 37 | "-t", 38 | "--only-toggle", 39 | action="store_true", 40 | dest="only_toggle", 41 | help="Whether only comparisons with 'toggle' value of '1' or 'True' should be performed.", 42 | ) 43 | parser.add_argument( 44 | "-qc", 45 | "--pass-qc", 46 | action="store_true", 47 | dest="pass_qc", 48 | help="Whether only samples with a 'pass_qc' attribute should be included." 49 | " Default is :obj:`False`.", 50 | ) 51 | parser.add_argument( 52 | "-j", 53 | "--as-jobs", 54 | action="store_true", 55 | dest="as_job", 56 | help="Whether jobs should be created for each sample, or " 57 | "it should run in serial mode.", 58 | ) 59 | parser.add_argument( 60 | "-o", 61 | "--results-output", 62 | default="results", 63 | dest="results_dir", 64 | help="Directory for analysis output files. " 65 | "Default is 'results' under the project root directory.", 66 | type=str, 67 | ) 68 | return parser 69 | 70 | 71 | def main(cli=None): 72 | args = parse_arguments().parse_args(cli) 73 | 74 | # Analysis 75 | print( 76 | "Starting Analysis from PEP configuration file: '{}'".format(args.config_file) 77 | ) 78 | analysis = ChIPSeqAnalysis( 79 | from_pep=args.config_file, results_dir=args.results_dir 80 | ) 81 | chip_data_types = ["ChIP-seq", "ChIPmentation"] 82 | analysis.samples = [s for s in analysis.samples if s.protocol == chip_data_types] 83 | 84 | # Samples 85 | # # filter QC if needed 86 | if args.pass_qc: 87 | analysis.samples = [ 88 | s for s in analysis.samples if s.pass_qc not in ["0", 0, "False", False] 89 | ] 90 | if analysis.samples: 91 | print( 92 | "Samples under consideration: '{}'. ".format( 93 | ",".join([s.name for s in analysis.samples]) 94 | ) 95 | + "Total of {} samples.".format(len([s.name for s in analysis.samples])) 96 | ) 97 | else: 98 | raise ValueError("There were no valid samples for this analysis type!") 99 | 100 | # Comparison table 101 | # # add provided 102 | if args.comparison_table is not None: 103 | analysis.comparison_table = pd.read_csv(args.comparison_table) 104 | # # or make sure analysis has one 105 | else: 106 | if not hasattr(analysis, "comparison_table"): 107 | raise ValueError( 108 | "Analysis doesn't have a 'comparison_table' and this was not provided." 109 | ) 110 | 111 | # # filter comparisons if needed 112 | if args.only_toggle: 113 | print("Filtering out comparisons marked with toggle != 1") 114 | analysis.comparison_table = analysis.comparison_table[ 115 | analysis.comparison_table["toggle"] == 1 116 | ] 117 | 118 | comps = analysis.comparison_table["comparison_name"].unique() 119 | if comps: 120 | print( 121 | "comparisons under consideration: '{}'. ".format(",".join(comps)) 122 | + "Total of {} comparisons.".format(len(comps)) 123 | ) 124 | else: 125 | raise ValueError("There were no valid comparisons in the comparison table!") 126 | 127 | # Call peaks 128 | analysis.call_peaks_from_comparisons(distributed=args.as_jobs) 129 | 130 | # # Get summary of peak calls 131 | # peak_counts = analysis.summarize_peaks_from_comparisons(comparison_table) 132 | # peak_counts.to_csv(os.path.join("results_pipeline", "chipseq_peaks", "peak_count_summary.csv"), index=False) 133 | 134 | 135 | if __name__ == "__main__": 136 | try: 137 | sys.exit(main()) 138 | except KeyboardInterrupt: 139 | print("Program canceled by user!") 140 | sys.exit(1) 141 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A helper script to calculate the read coverage of a BAM file 5 | in regions from a BED file. 6 | Ensures the same order and number of lines as input BED file. 7 | 8 | Software requirements: 9 | 10 | * None 11 | """ 12 | 13 | import os 14 | import sys 15 | 16 | from argparse import ArgumentParser 17 | 18 | import pandas as pd 19 | 20 | from ngs_toolkit.utils import count_reads_in_intervals 21 | from ngs_toolkit.utils import read_bed_file_three_columns 22 | from ngs_toolkit.utils import to_bed_index 23 | 24 | 25 | def parse_arguments(): 26 | """ 27 | Argument Parsing. 28 | """ 29 | parser = ArgumentParser( 30 | prog="python -m ngs_toolkit.recipes.coverage", description=__doc__) 31 | parser.add_argument( 32 | dest="bed_file", 33 | help="Input BED file with regions to quantify.", 34 | ) 35 | parser.add_argument( 36 | dest="bam_file", 37 | help="Input BAM file with reads.", 38 | ) 39 | parser.add_argument( 40 | dest="output_bed", help="Output BED file with counts for each region." 41 | ) 42 | parser.add_argument( 43 | "--no-overwrite", action="store_false", 44 | dest="overwrite", 45 | help="Whether results should not be overwritten if existing." 46 | ) 47 | return parser 48 | 49 | 50 | def main(cli=None): 51 | """Measure coverage of BAM file in BED file regions.""" 52 | print("Parsing CLI.") 53 | args = parse_arguments().parse_args(cli) 54 | 55 | if os.path.exists(args.output_bed) and (not args.overwrite): 56 | print("Output exists and `overwrite` is False, so not doing anything.") 57 | return 0 58 | 59 | print("Getting regions.") 60 | sites_str = to_bed_index(args.bed_file) 61 | print("Quantifying.") 62 | res = count_reads_in_intervals(args.bam_file, sites_str) 63 | 64 | print("Merging with input set.") 65 | # make sure there is an entry for each region in input file 66 | input_bed = read_bed_file_three_columns(args.bed_file).set_index("name") 67 | res = input_bed.join(pd.Series(res, name="sample")).fillna(0) 68 | res.loc[:, "sample"] = res.loc[:, "sample"].astype(int) 69 | 70 | print("Saving results.") 71 | res.to_csv(args.output_bed, index=False, header=False, sep="\t") 72 | 73 | print("Done.") 74 | 75 | 76 | if __name__ == "__main__": 77 | try: 78 | sys.exit(main()) 79 | except KeyboardInterrupt: 80 | print("Program canceled by user!") 81 | sys.exit(1) 82 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/deseq2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Perform differential expression using DESeq2 5 | by comparing sample groups using a formula. 6 | 7 | Software requirements: 8 | 9 | * DESeq2 10 | """ 11 | 12 | import os 13 | import pandas as pd 14 | import sys 15 | 16 | from argparse import ArgumentParser 17 | from ngs_toolkit.general import deseq_analysis 18 | 19 | 20 | def parse_arguments(): 21 | parser = ArgumentParser( 22 | prog="python -m ngs_toolkit.recipes.deseq2", description=__doc__) 23 | parser.add_argument( 24 | dest="work_dir", 25 | help="Working directory. Should contain required files for DESeq2.") 26 | parser.add_argument( 27 | "--output-prefix", 28 | dest="output_prefix", 29 | default="differential_analysis", 30 | type=str, 31 | help="Prefix for output files.") 32 | parser.add_argument( 33 | "--formula", 34 | default="~ sample_group", 35 | type=str, 36 | help="R-style formula for differential expression. Defaults to '~ sample_group'.") 37 | parser.add_argument( 38 | "--alpha", 39 | default=0.05, 40 | type=float, 41 | help="Significance level to call differential expression. All results will be output anyway.") 42 | parser.add_argument( 43 | "-d", 44 | "--dry-run", 45 | action="store_true", 46 | help="Don't actually do anything.") 47 | parser.add_argument( 48 | "--overwrite", 49 | action="store_true", 50 | default=False, 51 | help="Don't overwrite any existing directory or file.") 52 | parser.add_argument( 53 | "--no-save-inputs", 54 | action="store_false", 55 | default=True, 56 | help="Don't write inputs to disk.") 57 | 58 | # To enable the loop to pass args directly on to the pipelines... 59 | 60 | # args = parser.parse_args("--output_prefix differential_analysis --formula '~sample_group' --overwrite /scratch/lab_bock/shared/projects/baf-time_course/results/differential_analysis_ATAC-seq/ARID2_KO".split(" ")) 61 | return parser 62 | 63 | 64 | def main(cli=None): 65 | args = parse_arguments().parse_args(cli) 66 | 67 | # sample annotation 68 | print("Reading experiment_matrix") 69 | experiment_matrix = pd.read_csv( 70 | os.path.join(args.work_dir, "experiment_matrix.csv")) 71 | # comparison table 72 | print("Reading comparison_matrix") 73 | comparison_table = pd.read_csv( 74 | os.path.join(args.work_dir, "comparison_table.csv")) 75 | # count matrix 76 | print("Reading count_matrix") 77 | count_matrix = pd.read_csv( 78 | os.path.join(args.work_dir, "count_matrix.csv"), index_col=0) 79 | 80 | print("Differential expression with DESeq2") 81 | res = deseq_analysis( 82 | count_matrix, 83 | experiment_matrix, 84 | comparison_table, 85 | formula=args.formula, 86 | output_dir=args.work_dir, 87 | output_prefix=args.output_prefix, 88 | overwrite=args.overwrite, alpha=args.alpha, 89 | create_subdirectories=False, 90 | save_inputs=not args.no_save_inputs) 91 | 92 | print("Found {} differentially expressed genes with p < {}.".format( 93 | res[res['pvalue'] < args.alpha].shape[0], args.alpha)) 94 | print("Found {} differentially expressed genes with FDR < {}.".format( 95 | res[res['padj'] < args.alpha].shape[0], args.alpha)) 96 | 97 | 98 | if __name__ == '__main__': 99 | try: 100 | sys.exit(main()) 101 | except KeyboardInterrupt: 102 | print("Program canceled by user!") 103 | sys.exit(1) 104 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/enrichr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A helper script to run enrichment analysis using the Enrichr API on a gene set. 5 | 6 | Software requirements: None 7 | """ 8 | 9 | from argparse import ArgumentParser 10 | import os 11 | import sys 12 | 13 | from ngs_toolkit.general import enrichr 14 | 15 | 16 | def parse_arguments(): 17 | """ 18 | Argument Parsing. 19 | """ 20 | parser = ArgumentParser( 21 | prog="python -m ngs_toolkit.recipes.enrichr", description=__doc__ 22 | ) 23 | parser.add_argument( 24 | dest="input_file", 25 | help="Input file with a gene name per row and no header.", 26 | ) 27 | parser.add_argument( 28 | dest="output_file", help="Output CSV file with results." 29 | ) 30 | parser.add_argument( 31 | "-a", 32 | "--max-attempts", 33 | type=int, 34 | default=5, 35 | dest="max_attempts", 36 | help="Maximum attempts to retry the API before giving up.", 37 | ) 38 | parser.add_argument( 39 | "--no-overwrite", 40 | action="store_false", 41 | dest="overwrite", 42 | help="Whether results should not be overwritten if existing.", 43 | ) 44 | return parser 45 | 46 | 47 | def main(cli=None): 48 | print("Enrichr analysis") 49 | args = parse_arguments().parse_args(cli) 50 | 51 | if os.path.exists(args.output_file) and (not args.overwrite): 52 | print("Output exists and `overwrite` is False, so not doing anything.") 53 | return 0 54 | 55 | print("Reading input file.") 56 | 57 | with open(args.input_file, "r") as handle: 58 | genes = handle.readlines() 59 | 60 | print("Found {} genes in input.".format(len(genes))) 61 | 62 | print("Starting Enrichr analysis.") 63 | res = enrichr( 64 | genes, 65 | gene_set_libraries=None, 66 | kind="genes", 67 | max_attempts=args.max_attempts, 68 | ) 69 | 70 | print("Saving results.") 71 | res.to_csv(args.output_file, index=False) 72 | 73 | print("Done.") 74 | 75 | 76 | if __name__ == "__main__": 77 | try: 78 | sys.exit(main()) 79 | except KeyboardInterrupt: 80 | print("Program canceled by user!") 81 | sys.exit(1) 82 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/generate_project.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A helper script to generate synthetic data for a project in PEP format. 5 | """ 6 | 7 | import sys 8 | 9 | from argparse import ArgumentParser 10 | 11 | from ngs_toolkit.demo import generate_project 12 | from ngs_toolkit.utils import filter_kwargs_by_callable 13 | 14 | 15 | def parse_arguments(): 16 | """ 17 | Argument Parsing. 18 | """ 19 | import inspect 20 | 21 | parser = ArgumentParser( 22 | prog="python -m ngs_toolkit.recipes.generate_project", description=__doc__) 23 | 24 | sig = inspect.signature(generate_project) 25 | for arg in sig.parameters: 26 | if arg in ["kwargs", "initialize"]: 27 | continue 28 | d = sig.parameters[arg].default 29 | if d is None: 30 | parser.add_argument("--" + arg.replace("_", "-")) 31 | else: 32 | parser.add_argument( 33 | "--" + arg.replace("_", "-"), 34 | default=d, type=type(d)) 35 | parser.add_argument("--debug", action="store_true") 36 | return parser 37 | 38 | 39 | def main(cli=None): 40 | """Generate synthetic data for a project in PEP format.""" 41 | args = parse_arguments().parse_args(cli) 42 | if args.debug: 43 | print(args) 44 | kwargs = {k: v for k, v in args.__dict__.items() if v is not None} 45 | kwargs = filter_kwargs_by_callable(kwargs, generate_project) 46 | if args.debug: 47 | print(kwargs) 48 | pep = generate_project(**kwargs, initialize=False) 49 | sys.stdout.write(pep + "\n") 50 | 51 | 52 | if __name__ == "__main__": 53 | try: 54 | sys.exit(main()) 55 | except KeyboardInterrupt: 56 | print("Program canceled by user!") 57 | sys.exit(1) 58 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/lola.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A helper script to run Location Overlap Analysis (LOLA) 5 | of a single region set in various sets of region-based annotations. 6 | 7 | 8 | Software requirements: 9 | 10 | * LOLA 11 | """ 12 | 13 | import os 14 | import sys 15 | 16 | from argparse import ArgumentParser 17 | from ngs_toolkit.general import lola 18 | 19 | 20 | def parse_arguments(): 21 | """ 22 | Argument Parsing. 23 | """ 24 | parser = ArgumentParser( 25 | prog="python -m ngs_toolkit.recipes.lola", description=__doc__) 26 | parser.add_argument(dest="bed_file", help="BED file with query set regions.") 27 | parser.add_argument( 28 | dest="universe_file", 29 | help="BED file with universe where the query set came from.", 30 | ) 31 | parser.add_argument( 32 | dest="output_folder", help="Output directory for produced files." 33 | ) 34 | parser.add_argument(dest="genome", help="Genome assembly of the region set.") 35 | parser.add_argument( 36 | "--overwrite", 37 | action="store_false", 38 | help="Don't overwrite existing output files.", 39 | ) 40 | parser.add_argument( 41 | "-c", 42 | "--cpus", 43 | dest="cpus", 44 | help="Number of CPUS/threads to use for analysis.", 45 | type=int, 46 | ) 47 | return parser 48 | 49 | 50 | def main(cli=None): 51 | print("LOLA analysis") 52 | args = parse_arguments().parse_args(cli) 53 | 54 | output_file = os.path.join(args.output_folder, "allEnrichments.tsv") 55 | if os.path.exists(output_file) and (not args.overwrite): 56 | print("Output exists and `overwrite` is False, so not doing anything.") 57 | return 0 58 | 59 | print("Starting LOLA analysis.") 60 | 61 | lola( 62 | args.bed_file, 63 | args.universe_file, 64 | args.output_folder, 65 | args.genome, 66 | cpus=args.cpus, 67 | ) 68 | 69 | print("Done.") 70 | 71 | 72 | if __name__ == "__main__": 73 | try: 74 | sys.exit(main()) 75 | except KeyboardInterrupt: 76 | print("Program canceled by user!") 77 | sys.exit(1) 78 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/ngs_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Perform full end-to-end analysis of ATAC-seq, ChIP-seq or RNA-seq data. 5 | 6 | Produces quantification matrices, normalizes them, 7 | performes unsupervised and supervised analysis as 8 | well as enrichment analyisis of differential features, 9 | all accompaigned with powerful visualizations. 10 | 11 | Supervised analysis will only be performed if PEP configuration file contains a 12 | `comparison table `_ field. 13 | 14 | In addition, this recipe uses variables provided in the project configuration 15 | file ``project_name``, ``sample_attributes`` and ``group_attributes``. 16 | """ 17 | 18 | 19 | import os 20 | import sys 21 | 22 | from argparse import ArgumentParser 23 | 24 | import matplotlib 25 | import seaborn as sns 26 | 27 | import peppy 28 | 29 | from ngs_toolkit.atacseq import ATACSeqAnalysis 30 | from ngs_toolkit.chipseq import ChIPSeqAnalysis 31 | from ngs_toolkit.rnaseq import RNASeqAnalysis 32 | 33 | 34 | # Set settings 35 | sns.set(context="paper", style="ticks", palette="colorblind", color_codes=True) 36 | matplotlib.rc("text", usetex=False) 37 | 38 | 39 | def parse_arguments(): 40 | """ 41 | Global options for analysis. 42 | """ 43 | parser = ArgumentParser( 44 | prog="python -m ngs_toolkit.recipes.ngs_analysis", description=__doc__) 45 | parser.add_argument( 46 | dest="config_file", help="YAML project configuration file.", type=str) 47 | parser.add_argument( 48 | "-n", 49 | "--analysis-name", 50 | dest="name", 51 | default=None, 52 | help="Name of analysis. Will be the prefix of output_files. " 53 | "By default it will be the name of the Project given in the YAML configuration.", 54 | type=str, 55 | ) 56 | parser.add_argument( 57 | "-o", 58 | "--results-output", 59 | default="results", 60 | dest="results_dir", 61 | help="Directory for analysis output files. " 62 | "Default is 'results' under the project roort directory.", 63 | type=str, 64 | ) 65 | parser.add_argument( 66 | "-t", 67 | "--data-type", 68 | default=None, 69 | choices=["ATAC-seq", "RNA-seq", "ChIP-seq"], 70 | dest="data_type", 71 | help="Data type to restrict analysis to. " 72 | "Default is to run separate analysis for each data type.", 73 | type=str, 74 | ) 75 | parser.add_argument( 76 | "-q", 77 | "--pass-qc", 78 | action="store_true", 79 | dest="pass_qc", 80 | help="Whether only samples with a 'pass_qc' value of '1' " 81 | "in the annotation sheet should be used.", 82 | ) 83 | parser.add_argument( 84 | "-a", "--alpha", default=0.05, dest="alpha", 85 | help="Alpha value of confidence for supervised analysis.", type=str 86 | ) 87 | parser.add_argument( 88 | "-f", 89 | "--fold-change", 90 | default=0, 91 | dest="abs_fold_change", 92 | help="Absolute log2 fold change value for supervised analysis.", 93 | type=str, 94 | ) 95 | return parser 96 | 97 | 98 | def main(cli=None): 99 | args = parse_arguments().parse_args(cli) 100 | 101 | # Start project 102 | print("Starting peppy project with project" 103 | "configuration file: '{}'".format(args.config_file)) 104 | prj = peppy.Project(args.config_file) 105 | print("Changing directory to project root" 106 | "directory: '{}'.".format(prj.metadata.output_dir)) 107 | os.chdir(prj.metadata.output_dir) 108 | if args.pass_qc: 109 | print("Filtering samples out which didn't pass QC" 110 | "as specified in sample annotation in column 'pass_qc'") 111 | prj._samples = [ 112 | s for s in prj._samples 113 | if s.pass_qc not in ["0", 0, "False", False]] 114 | 115 | # ANALYSIS 116 | if args.data_type is None: 117 | print( 118 | "Type of analysis not specified. Will run independent analysis" 119 | "for all types of data in the sample annotation sheet." 120 | ) 121 | data_types = sorted(list(set([s.protocol for s in prj._samples]))) 122 | print("Sample data types: '{}'.".format(",".join(data_types))) 123 | else: 124 | print("Type of analysis specified. Will run only" 125 | "analysis for samples of type '{}'.".format(args.data_type)) 126 | data_types = [args.data_type] 127 | print("Sample data types: '{}'.".format(",".join(data_types))) 128 | if args.name is None: 129 | print( 130 | "Analysis name not specified, will use name in" 131 | "project configuration file: '{}'.".format(prj.project_name) 132 | ) 133 | args.name = prj.project_name 134 | 135 | for data_type in data_types: 136 | print("Starting analysis for samples of type: '{}'.".format(data_type)) 137 | samples = [s for s in prj._samples if (s.protocol == data_type)] 138 | if len(samples) > 0: 139 | print( 140 | "Samples under consideration: '{}'. ".format(",".join([s.name for s in samples])) 141 | + "Total of {} samples.".format(len([s.name for s in samples])) 142 | ) 143 | else: 144 | raise ValueError("There were no valid samples for this analysis type!") 145 | 146 | kwargs = {"prj": prj, "samples": samples, "results_dir": args.results_dir} 147 | if data_type in ["ATAC-seq"]: 148 | print("Initializing ATAC-seq analysis") 149 | analysis = ATACSeqAnalysis( 150 | name=args.name + "_atacseq", **kwargs 151 | ) 152 | elif data_type in ["ChIP-seq"]: 153 | print("Initializing ChIP-seq analysis") 154 | analysis = ChIPSeqAnalysis( 155 | name=args.name + "_chipseq", **kwargs 156 | ) 157 | elif data_type in ["RNA-seq"]: 158 | print("Initializing RNA-seq analysis") 159 | analysis = RNASeqAnalysis( 160 | name=args.name + "_rnaseq", **kwargs 161 | ) 162 | 163 | print("Running main analysis.") 164 | main_analysis_pipeline( 165 | analysis, alpha=args.alpha, abs_fold_change=args.abs_fold_change) 166 | print("`ngs_analysis` recipe completed successfully!") 167 | 168 | 169 | def main_analysis_pipeline(a, alpha=0.05, abs_fold_change=0): 170 | # TODO: annotate with chromatin state 171 | # TODO: handle the genome vs transcriptome ambiguity 172 | 173 | genomes = list(set(s.genome for s in a.samples)) 174 | 175 | if len(genomes) != 1: 176 | raise ValueError( 177 | "Samples under analysis have more than" 178 | "one genome assembly: '{}'.".format("', '".join(genomes)) 179 | ) 180 | 181 | if isinstance(a, ATACSeqAnalysis): 182 | 183 | # GET CONSENSUS PEAK SET, ANNOTATE IT, PLOT 184 | # Get consensus peak set from all samples 185 | a.get_consensus_sites() 186 | a.calculate_peak_support() 187 | 188 | # GET CHROMATIN OPENNESS MEASUREMENTS, PLOT 189 | # Get coverage values for each peak in each sample 190 | a.measure_coverage() 191 | # normalize coverage values 192 | a.normalize(method="vst") 193 | 194 | # Annotate peaks with closest gene 195 | a.get_peak_gene_annotation() 196 | # Annotate peaks with genomic regions 197 | a.get_peak_genomic_location() 198 | # Annotate peaks with chromatin state 199 | 200 | if isinstance(a, RNASeqAnalysis): 201 | # Get gene expression 202 | a.get_gene_expression() 203 | 204 | # Annotate peaks with closest gene, chromatin state, 205 | # genomic location, mean and variance measurements across samples 206 | a.annotate_features() 207 | a.to_pickle() 208 | 209 | # Unsupervised analysis 210 | a.unsupervised_analysis( 211 | plot_max_attr=20, 212 | plot_max_pcs=6, 213 | plot_group_centroids=True, 214 | axis_ticklabels=False, 215 | axis_lines=True, 216 | always_legend=False, 217 | display_corr_values=False, 218 | ) 219 | 220 | # Supervised analysis 221 | if a.comparison_table.empty: 222 | print( 223 | "Comparison table has no comparisons with 'data_type'=='{}'" 224 | "and 'comparison_type'=='differential'.".format( 225 | a.data_type 226 | ) 227 | ) 228 | print("Not performing differential analysis for this data type.") 229 | a.generate_report(pip_versions=True) 230 | a.to_pickle() 231 | return 232 | 233 | a.differential_analysis() 234 | a.to_pickle() 235 | 236 | diff = a.differential_results[ 237 | (a.differential_results["padj"] < alpha) 238 | & (a.differential_results["log2FoldChange"].abs() > abs_fold_change) 239 | ] 240 | if diff.empty: 241 | print( 242 | "Differential analysis contains no significant {}s" 243 | "at alpha {} and absolute fold change {}.".format( 244 | a.var_unit_name, alpha, abs_fold_change 245 | ) 246 | ) 247 | a.generate_report(pip_versions=True) 248 | a.to_pickle() 249 | return 250 | 251 | if diff.groupby("comparison_name").count().shape[0] > 1: 252 | a.differential_overlap(diff) 253 | 254 | a.plot_differential( 255 | alpha=alpha, 256 | corrected_p_value=True, 257 | fold_change=abs_fold_change, 258 | rasterized=True, 259 | robust=True, 260 | group_wise_colours=True, 261 | ) 262 | 263 | a.differential_enrichment( 264 | # TODO: have a way to automatically check what is callable 265 | steps=['enrichr'], 266 | directional=True, 267 | max_diff=1000, 268 | sort_var="pvalue", 269 | distributed=False) 270 | 271 | # TODO: is this actually needed? vvv 272 | # a.collect_differential_enrichment(directional=True, permissive=False) 273 | 274 | a.plot_differential_enrichment(direction_dependent=True, top_n=5) 275 | 276 | a.generate_report(pip_versions=True) 277 | a.to_pickle() 278 | 279 | 280 | if __name__ == "__main__": 281 | try: 282 | sys.exit(main()) 283 | except KeyboardInterrupt: 284 | print("Program canceled by user!") 285 | sys.exit(1) 286 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/region_enrichment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A helper script to run enrichment analysis 5 | of a single region set in region-based set of annotations. 6 | """ 7 | 8 | import os 9 | import pandas as pd 10 | import sys 11 | 12 | from argparse import ArgumentParser 13 | from ngs_toolkit.atacseq import ATACSeqAnalysis 14 | from ngs_toolkit.utils import bed_to_index 15 | 16 | 17 | def parse_arguments(): 18 | """ 19 | Argument Parsing. 20 | """ 21 | parser = ArgumentParser( 22 | prog="python -m ngs_toolkit.recipes.region_enrichment", description=__doc__) 23 | parser.add_argument( 24 | dest="bed_file", 25 | help="BED file with regions.") 26 | parser.add_argument( 27 | dest="pep", 28 | help="The analysis' PEP config file.") 29 | parser.add_argument( 30 | "--output-file", 31 | dest="output_file", 32 | default="region_type_enrichment.csv", 33 | type=str, 34 | help="Output file.") 35 | parser.add_argument( 36 | "--overwrite", 37 | action="store_true", 38 | default=False, 39 | help="Don't overwrite any existing directory or file.") 40 | return parser 41 | 42 | 43 | def main(cli=None): 44 | print("Region type analysis") 45 | # Parse command-line arguments. 46 | args = parse_arguments().parse_args(cli) 47 | if os.path.exists(args.output_file) and (not args.overwrite): 48 | print("Output exists and `overwrite` is False, so not doing anything.") 49 | return 0 50 | 51 | print("Reading up the analysis object.") 52 | a = ATACSeqAnalysis(from_pep=args.pep) 53 | a.load_data() 54 | # ( 55 | # "genomic_region", 56 | # "region_annotation_mapping", 57 | # "region_annotation_b_mapping", 58 | # ), 59 | # ( 60 | # "chromatin_state", 61 | # "chrom_state_annotation_mapping", 62 | # "chrom_state_annotation_b_mapping", 63 | # ), 64 | print("Reading up the BED file.") 65 | df = pd.read_csv(args.bed_file, sep="\t", header=None) 66 | df.columns = ['chrom', 'start', 'end'] 67 | print("Getting the index.") 68 | index = bed_to_index(df) 69 | print("Doing enrichment.") 70 | enr = a.region_context_enrichment(index) 71 | print("Saving.") 72 | enr.to_csv(args.output_file) 73 | print("Done.") 74 | 75 | 76 | if __name__ == '__main__': 77 | try: 78 | sys.exit(main()) 79 | except KeyboardInterrupt: 80 | print("Program canceled by user!") 81 | sys.exit(1) 82 | -------------------------------------------------------------------------------- /ngs_toolkit/recipes/region_set_frip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Compute fraction of reads in peaks (FRiP) based on a consensus set of regions 5 | derived from several samples. 6 | 7 | A consensus region set can be passed, otherwise it will either try to use an 8 | existing one for that analysis or produce one on the fly. 9 | 10 | 11 | Software requirements: 12 | 13 | * awk 14 | * samtools 15 | """ 16 | 17 | 18 | import os 19 | import sys 20 | 21 | from argparse import ArgumentParser 22 | 23 | import pybedtools 24 | 25 | from ngs_toolkit.atacseq import ATACSeqAnalysis 26 | from ngs_toolkit.chipseq import ChIPSeqAnalysis 27 | 28 | 29 | def parse_arguments(): 30 | """ 31 | Global options for analysis. 32 | """ 33 | parser = ArgumentParser( 34 | prog="python -m ngs_toolkit.recipes.region_set_frip", description=__doc__ 35 | ) 36 | parser.add_argument(dest="config_file", help="YAML project configuration file.", type=str) 37 | parser.add_argument( 38 | "-r", 39 | "--region-set", 40 | dest="region_set", 41 | default=None, 42 | help="BED file with region set derived from several samples or Oracle region set. " 43 | "If unset, will try to get the `sites` attribute of an existing analysis object " 44 | "if existing, otherwise will create a region set from the peaks of all samples.", 45 | type=str, 46 | ) 47 | parser.add_argument( 48 | "-q", 49 | "--pass-qc", 50 | action="store_true", 51 | dest="pass_qc", 52 | help="Whether only samples with a 'pass_qc' value of '1' " 53 | "in the annotation sheet should be used.", 54 | ) 55 | parser.add_argument( 56 | "--computing-configuration", 57 | dest="computing_configuration", 58 | help="Which `divvy` computing configuration to use for distributed jobs." 59 | " Type divvy list to see all options. Defaults to the value in the " 60 | "ngs_toolkit configuration.", 61 | ) 62 | parser.add_argument( 63 | "--permissive", 64 | action="store_true", 65 | dest="permissive", 66 | help="If creating regions set, allow sample files to be missing and use what is present.", 67 | ) 68 | return parser 69 | 70 | 71 | def main(cli=None): 72 | args = parse_arguments().parse_args(cli) 73 | 74 | for data_type, clax in [ 75 | ("ATAC-seq", ATACSeqAnalysis), 76 | ("ChIP-seq", ChIPSeqAnalysis), 77 | ]: 78 | an = clax(from_pep=args.config_file) 79 | 80 | if not an.samples: 81 | continue 82 | 83 | if args.pass_qc: 84 | an.samples = [s for s in an.samples if getattr(s, "pass_qc", None) in ["1", "1.0", 1]] 85 | 86 | if data_type == "ChIP-seq" and not hasattr(an, "comparison_table"): 87 | msg = ( 88 | "ChIP-seq analysis must have comparison_table specified in " 89 | "the project config in order to relate" 90 | " foreground and backgound sample groups." 91 | ) 92 | print(msg) 93 | raise ValueError(msg) 94 | 95 | if args.region_set is not None: 96 | print("Loading given region set: '{}'".format(args.region_set)) 97 | an.sites = pybedtools.BedTool(args.region_set) 98 | else: 99 | print("Trying to load existing consensus region set.") 100 | an.load_data(only_these_keys=["sites"]) 101 | 102 | if not hasattr(an, "sites"): 103 | print("Not found. Producing a new consensus region set.") 104 | an.get_consensus_sites(permissive=args.permissive) 105 | else: 106 | print("Using region set in BED format: '{}'".format(an.sites.fn)) 107 | 108 | calculate_region_set_frip( 109 | region_set=an.sites.fn, 110 | samples=an.samples, 111 | computing_configuration=args.computing_configuration, 112 | ) 113 | 114 | 115 | def calculate_region_set_frip(region_set, samples, computing_configuration=None): 116 | """ 117 | """ 118 | from ngs_toolkit.utils import submit_job 119 | 120 | for sample in samples: 121 | sample.sample_root = os.path.join( 122 | sample.project.root_dir, sample.project._config.results_subdir, sample.name 123 | ) 124 | inside_reads = os.path.join(sample.sample_root, "region_set_frip.inside_reads.txt") 125 | all_reads = os.path.join(sample.sample_root, "region_set_frip.all_reads.txt") 126 | 127 | job_name = sample.name + ".region_set_frip" 128 | log_file = os.path.join(sample.sample_root, job_name + ".log") 129 | job_file = os.path.join(sample.sample_root, job_name + ".sh") 130 | sample_stats = os.path.join(sample.sample_root, "stats.tsv") 131 | 132 | cmd = "\n".join( 133 | [ 134 | """samtools view -c -L {} {} > {}""".format( 135 | region_set, sample.aligned_filtered_bam, inside_reads 136 | ), 137 | """samtools view -c {} > {}""".format(sample.aligned_filtered_bam, all_reads), 138 | 'calc(){ awk "BEGIN { print "$*" }"; }', 139 | "IN=`cat {}`".format(inside_reads), 140 | "ALL=`cat {}`".format(all_reads), 141 | "FRIP=`calc $IN/$ALL`", 142 | 'echo "region_set_frip\\t$FRIP\\t." >> {}'.format(sample_stats), 143 | "date", 144 | ] 145 | ) 146 | submit_job( 147 | cmd, 148 | job_file, 149 | log_file, 150 | jobname=job_name, 151 | computing_configuration=computing_configuration, 152 | ) 153 | 154 | 155 | if __name__ == "__main__": 156 | try: 157 | sys.exit(main()) 158 | except KeyboardInterrupt: 159 | print("Program canceled by user!") 160 | sys.exit(1) 161 | -------------------------------------------------------------------------------- /ngs_toolkit/templates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/afrendeiro/toolkit/c21b69dd8c2a58195c4d798ec53e44999cf5cb6a/ngs_toolkit/templates/__init__.py -------------------------------------------------------------------------------- /ngs_toolkit/templates/report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 13 | 14 | ngs-toolkit report for '{{ analysis.name }}' project 15 | 16 | 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 |
72 |
73 |

{{ analysis.name }}

74 | {% if analysis.description is not none %} 75 |

{{ analysis.description }}

76 | {% endif %} 77 |

78 | ngs-toolkit analysis report. 79 |

80 |
81 |
82 | 83 | 84 | 85 |
86 |
87 | 88 |
90 |
91 |

Project description

92 | {% if analysis.description is not none %} 93 |

{{ analysis.description }}

94 | {% endif %} 95 | {% if analysis.data_type is not none %} 96 |

The project main data type is {{ analysis.data_type }}.

97 | {% endif %} 98 | {% if analysis.organism is not none %} 99 |

The project main organism is {{ analysis.organism }}.

100 | {% endif %} 101 | {% if analysis.genome is not none %} 102 |

The project main genome assembly is {{ analysis.genome }}.

103 | {% endif %} 104 | 105 |

106 | 114 |

115 |
116 |
117 |
    118 | {% for key, value in project_repr.items() %} 119 |
  • 120 | {{ key }}: {{ value }} 121 |
  • 122 | {% endfor %} 123 |
124 |
125 |
126 |
127 |

Project contains {{ samples|length }} samples:

128 |
129 |

130 | 138 |

139 |
140 |
141 |
    142 | {% for sample in samples %} 143 | 150 |
    151 |
    152 |
      153 | {% for key, value in sample.items() %} 154 |
    • {{ key }}: {{ value }}
    • 155 | {% endfor %} 156 |
    157 |
    158 |
    159 | {% endfor %} 160 |
161 |
162 |
163 | 164 |
165 |
166 | 167 |
168 |

Analysis report

169 | {% for section, fig_list in images.items() %} 170 |

{{ section }}

171 |
172 | {% for caption, csv in csvs[section] %} 173 |
174 |

175 | Download CSV file: {{ caption }} 176 |

177 |
178 | {% endfor %} 179 |
180 |
181 | {% for caption, fig in fig_list %} 182 |
183 |
184 | 185 | 186 | 187 |
{{ caption }}
188 |
189 |
190 | {% endfor %} 191 |
192 |
193 | {% endfor %} 194 |
195 | 196 |
197 |

Versioning

198 |
    199 |
  • Python version: {{ python_version }}
  • 200 |
  • ngs-toolkit version: {{ library_version }}
  • 201 | 202 | {% if freeze|length > 0 %} 203 |

    204 | 211 |

    212 |
    213 | {% for library in freeze %} 214 |
    215 |
  • {{ library }}
  • 216 |
    217 | {% endfor %} 218 |
    219 | {% endif %} 220 |
221 |
222 |
223 | 224 |
225 |
226 | 227 |
228 | 229 | 233 | 234 | 235 | 236 | 239 | 242 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/afrendeiro/toolkit/c21b69dd8c2a58195c4d798ec53e44999cf5cb6a/ngs_toolkit/tests/__init__.py -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import glob 5 | import os 6 | import shutil 7 | 8 | import numpy as np 9 | import pytest 10 | 11 | from ngs_toolkit.analysis import Analysis 12 | from ngs_toolkit.utils import get_this_file_or_timestamped 13 | from .conftest import file_exists, file_not_empty, COMBAT 14 | 15 | 16 | class TestAnalysis: 17 | def test_analysis_representation(self): 18 | name = "test_analysis" 19 | 20 | an = Analysis(name=name) 21 | assert an.__repr__() == "Analysis '{}'.".format(name) 22 | assert "samples" not in an.__repr__() 23 | 24 | def test_with_object_as(self): 25 | name = "test_analysis" 26 | 27 | an = Analysis(name=name) 28 | with an as _an: 29 | assert an is _an 30 | assert an == _an 31 | assert _an.__repr__() == "Analysis '{}'.".format(name) 32 | assert "samples" not in _an.__repr__() 33 | 34 | def test_analysis_creation(self, tmp_path): 35 | from ngs_toolkit.demo.data_generator import generate_project 36 | 37 | tmp_path = str(tmp_path) 38 | 39 | # Let's make several "reallish" test projects 40 | project_prefix_name = "test-project" 41 | data_types = ["ATAC-seq", "RNA-seq", "ChIP-seq"] # "CNV" 42 | genome_assemblies = [("human", "hg38"), ("mouse", "mm10")] # ("human", "hg19"), 43 | 44 | params = { 45 | "ATAC-seq": { 46 | "n_factors": [1, 2, 3], 47 | "n_features": [100, 1000, 10000], 48 | "n_replicates": [1, 2, 5], 49 | "analysis": "ATACSeqAnalysis", 50 | }, 51 | "ChIP-seq": { 52 | "n_factors": [1, 2, 3], 53 | "n_features": [100, 1000, 10000], 54 | "n_replicates": [1, 2, 5], 55 | "analysis": "ChIPSeqAnalysis", 56 | }, 57 | "RNA-seq": { 58 | "n_factors": [1, 2, 3], 59 | "n_features": [100, 1000, 25000], 60 | "n_replicates": [1, 2, 5], 61 | "analysis": "RNASeqAnalysis", 62 | }, 63 | } 64 | 65 | for data_type in data_types: 66 | n_factors = params[data_type]["n_factors"][0] 67 | n_features = params[data_type]["n_features"][0] 68 | n_replicates = params[data_type]["n_replicates"][0] 69 | for organism, genome_assembly in genome_assemblies: 70 | 71 | project_name = "{}_{}_{}_{}_{}_{}".format( 72 | project_prefix_name, 73 | data_type, 74 | genome_assembly, 75 | n_factors, 76 | n_features, 77 | n_replicates, 78 | ) 79 | 80 | an = generate_project( 81 | output_dir=tmp_path, 82 | project_name=project_name, 83 | organism=organism, 84 | genome_assembly=genome_assembly, 85 | data_type=data_type, 86 | n_factors=n_factors, 87 | n_replicates=n_replicates, 88 | n_features=n_features, 89 | only_metadata=True, 90 | ) 91 | # n_samples = (n_factors * n_replicates) + n_factors 92 | # assert an.__repr__() == ( 93 | # "'{}' analysis '{}' with {} samples of organism '{}' ({}).".format( 94 | # data_type, project_name, n_samples, organism, genome_assembly 95 | # ) 96 | # ) 97 | # assert len(n_factors * 2 * n_replicates * 2) == len(an.prj.samples) == len(an.samples) 98 | assert all([x == y for x, y in zip(an.prj.samples, an.samples)]) 99 | 100 | shutil.rmtree(tmp_path) 101 | 102 | def test_analysis_serialization(self, tmp_path): 103 | 104 | tmp_path = str(tmp_path) 105 | 106 | pickle_file = os.path.join(tmp_path, "analysis.pickle") 107 | a = Analysis(pickle_file=pickle_file) 108 | assert not file_exists(pickle_file) 109 | a.to_pickle() 110 | assert file_exists(pickle_file) 111 | assert file_not_empty(pickle_file) 112 | 113 | previous_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size 114 | a.random = np.random.random((100, 100)) 115 | a.to_pickle() 116 | new_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size 117 | assert new_size > previous_size 118 | 119 | previous_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size 120 | a.random = np.random.random((100, 100)) 121 | a.to_pickle(timestamp=True) 122 | assert len(glob.glob(os.path.join(tmp_path, "*.pickle"))) == 2 123 | 124 | def test_analysis_loading(self, tmp_path): 125 | tmp_path = str(tmp_path) 126 | pickle_file = os.path.join(tmp_path, "pickle") 127 | secret = "I've existed before" 128 | 129 | a = Analysis() 130 | a.pickle_file = pickle_file 131 | a.secret = secret 132 | a.to_pickle() 133 | 134 | a2 = Analysis(from_pickle=pickle_file) 135 | assert a2.secret == secret 136 | 137 | a3 = Analysis() 138 | a3.update(pickle_file) 139 | assert a3.secret == secret 140 | 141 | a4 = Analysis() 142 | a4.pickle_file = pickle_file 143 | a4 = a4.from_pickle() 144 | assert a4.secret == secret 145 | 146 | shutil.rmtree(tmp_path) 147 | 148 | def test__overwride_sample_representation(self, atac_analysis): 149 | 150 | prev = atac_analysis.samples[0].__repr__ 151 | Analysis._overwride_sample_representation() 152 | new = atac_analysis.samples[0].__repr__ 153 | 154 | assert prev != new 155 | 156 | def test__check_data_type_is_supported(self): 157 | assert Analysis._check_data_type_is_supported("ATAC-seq") 158 | assert Analysis._check_data_type_is_supported("ChIP-seq") 159 | assert Analysis._check_data_type_is_supported("RNA-seq") 160 | assert Analysis._check_data_type_is_supported("CNV") 161 | assert not Analysis._check_data_type_is_supported("Microarray") 162 | 163 | def test__get_data_type(self, atac_analysis): 164 | assert atac_analysis._get_data_type() == "ATAC-seq" 165 | assert atac_analysis._get_data_type(data_type="ATAC-seq") == "ATAC-seq" 166 | assert atac_analysis._get_data_type(data_type="RNA-seq") == "RNA-seq" 167 | 168 | with pytest.raises(ValueError): 169 | atac_analysis._get_data_type(data_type="Microarray") 170 | 171 | atac_analysis.data_type = None 172 | with pytest.raises(ValueError): 173 | atac_analysis._get_data_type() 174 | 175 | del atac_analysis.data_type 176 | with pytest.raises(AttributeError): 177 | atac_analysis._get_data_type() 178 | 179 | def test__check_samples_have_file(self, atac_analysis): 180 | with pytest.raises(AttributeError): 181 | atac_analysis._check_samples_have_file("NOTEXISTING") 182 | 183 | # assert not atac_analysis._check_samples_have_file("summits") 184 | 185 | assert not atac_analysis._check_samples_have_file("sample_name") 186 | 187 | def test__get_samples_have_file(self, atac_analysis): 188 | assert not atac_analysis._get_samples_have_file("sample_name") 189 | 190 | def test__get_samples_missing_file(self, atac_analysis): 191 | with pytest.raises(AttributeError): 192 | atac_analysis._get_samples_have_file("NOTEXISTING") 193 | 194 | assert not atac_analysis._get_samples_have_file("sample_name") 195 | 196 | assert not atac_analysis._get_samples_have_file("aligned_filtered_bam") 197 | 198 | def test__get_samples_with_input_file(self, atac_analysis): 199 | with pytest.raises(AttributeError): 200 | atac_analysis._get_samples_with_input_file("NOTEXISTING") 201 | 202 | with pytest.raises(IOError): 203 | atac_analysis._get_samples_with_input_file("sample_name") 204 | 205 | with pytest.raises(IOError): 206 | atac_analysis._get_samples_with_input_file("aligned_filtered_bam") 207 | 208 | assert not atac_analysis._get_samples_with_input_file( 209 | "aligned_filtered_bam", permissive=True 210 | ) 211 | assert not atac_analysis._get_samples_with_input_file("peaks", permissive=True) 212 | assert not atac_analysis._get_samples_with_input_file("summits", permissive=True) 213 | 214 | @pytest.mark.parametrize( 215 | "env_var,string", 216 | [ 217 | ("_${USER}_", "_{}_".format(os.environ.get("USER"))), 218 | # ("_$PATH_", "_{}_".format(os.environ.get("PATH"))), 219 | ], 220 | ) 221 | def test__format_string_with_environment_variables(self, env_var, string): 222 | assert string == Analysis._format_string_with_environment_variables(env_var) 223 | 224 | def test__format_string_with_attributes_simple(self): 225 | t = Analysis() 226 | t.a = 1 227 | t.b = "" 228 | assert "1" == Analysis._format_string_with_attributes(t, "{a}{b}") 229 | 230 | @pytest.mark.parametrize( 231 | "env_var,string", 232 | [("{data_type}", "ATAC-seq"), ("{name}", "test-project_ATAC-seq_human_hg38_1_250_2"),], 233 | ) 234 | def test__format_string_with_attributes(self, atac_analysis, env_var, string): 235 | assert string == atac_analysis._format_string_with_attributes(env_var) 236 | 237 | def test_record_output_file(self, atac_analysis): 238 | assert hasattr(atac_analysis, "output_files") 239 | assert len(atac_analysis.output_files) == 0 240 | atac_analysis.record_output_file("a", name="analysis") 241 | assert hasattr(atac_analysis, "output_files") 242 | assert len(atac_analysis.output_files) == 1 243 | assert atac_analysis.output_files[0][0] == "analysis" 244 | assert atac_analysis.output_files[0][1] == "a" 245 | 246 | 247 | def test_project_with_subprojects(subproject_config): 248 | from ngs_toolkit import Analysis 249 | 250 | a = Analysis(from_pep=subproject_config) 251 | assert len(a.samples) == 0 252 | 253 | a = Analysis(from_pep=subproject_config, amendments=["test_subproject"]) 254 | assert len(a.samples) > 0 255 | 256 | 257 | @pytest.mark.skipif(not COMBAT, reason="Combat not installed") 258 | def test_remove_factor(atac_analysis_many_factors): 259 | import pandas as pd 260 | 261 | a = atac_analysis_many_factors 262 | a.matrix_norm = a.matrix_norm.dropna() 263 | 264 | prefix = os.path.join(a.results_dir, "unsupervised_analysis_{}".format(a.data_type), a.name) 265 | # inspect 266 | a.unsupervised_analysis(output_prefix="before", steps=["pca_association"]) 267 | 268 | f = prefix + ".before.pca.variable_principle_components_association.csv" 269 | p = pd.read_csv(get_this_file_or_timestamped(f)) 270 | 271 | # extract the name of the factor with highest contribution 272 | factor = p.iloc[p.query("pc == 1")["p_value"].idxmin()]["attribute"] 273 | # check if it's significant 274 | assert p.query("attribute == '{}' and pc < 15".format(factor))["p_value"].min() < 0.05 275 | 276 | # remove factor without regard for the other factors 277 | m = a.remove_factor_from_matrix(factor=factor, assign=False, save=False) 278 | a.unsupervised_analysis(matrix=m, output_prefix="after_simple", steps=["pca_association"]) 279 | 280 | f = prefix + ".after_simple.pca.variable_principle_components_association.csv" 281 | p2 = pd.read_csv(get_this_file_or_timestamped(f)) 282 | assert p2.query("attribute == '{}' and pc < 15".format(factor))["p_value"].min() > 0.05 283 | 284 | # remove factor accounting for the other factors 285 | m = a.remove_factor_from_matrix( 286 | factor=factor, 287 | covariates=[x for x in a.group_attributes if x != factor], 288 | assign=False, 289 | save=False, 290 | ) 291 | a.unsupervised_analysis(matrix=m, output_prefix="after_covariates", steps=["pca_association"]) 292 | 293 | f = prefix + ".after_covariates.pca.variable_principle_components_association.csv" 294 | p3 = pd.read_csv(get_this_file_or_timestamped(f)) 295 | assert p3.query("attribute == '{}' and pc < 15".format(factor))["p_value"].min() > 0.05 296 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_chipseq_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import pytest 5 | 6 | from .conftest import file_exists, file_exists_and_not_empty # , CI, RPY2 7 | 8 | 9 | def test_call_peaks_from_comparisons(chipseq_analysis): 10 | chipseq_analysis.call_peaks_from_comparisons() 11 | 12 | for name, comp in chipseq_analysis.comparisons.items(): 13 | files = [ 14 | comp['prefix'] + ".homer.log", 15 | comp['prefix'] + ".homer.sh", 16 | comp['prefix'] + ".macs2.log", 17 | comp['prefix'] + ".macs2.sh", 18 | ] 19 | for f in files: 20 | assert file_exists(f) 21 | 22 | 23 | def test_filter_peaks(chipseq_analysis_with_peaks): 24 | chipseq_analysis_with_peaks.filter_peaks() 25 | 26 | for name, comp in chipseq_analysis_with_peaks.comparisons.items(): 27 | files = [ 28 | comp['prefix'] + "_homer_peaks.factor.filtered.bed", 29 | comp['prefix'] + "_homer_peaks.factor.narrowPeak", 30 | comp['prefix'] + "_homer_peaks.histone.filtered.bed", 31 | comp['prefix'] + "_homer_peaks.histone.narrowPeak", 32 | comp['prefix'] + "_peaks.filtered.bed", 33 | comp['prefix'] + "_peaks.narrowPeak", 34 | ] 35 | for f in files: 36 | assert file_exists_and_not_empty(f) 37 | 38 | 39 | @pytest.mark.xfail 40 | def test_summarize_peaks_from_comparisons(chipseq_analysis_with_peaks): 41 | chipseq_analysis_with_peaks.test_summarize_peaks_from_comparisons() 42 | assert False 43 | 44 | 45 | @pytest.mark.xfail 46 | def test_get_consensus_sites(chipseq_analysis_with_peaks): 47 | chipseq_analysis_with_peaks.test_get_consensus_sites() 48 | assert False 49 | 50 | 51 | @pytest.mark.xfail 52 | def test_calculate_peak_support(chipseq_analysis_with_peaks): 53 | chipseq_analysis_with_peaks.test_calculate_peak_support() 54 | assert False 55 | 56 | 57 | @pytest.mark.xfail 58 | def test_get_supported_peaks(chipseq_analysis_with_peaks): 59 | chipseq_analysis_with_peaks.test_get_supported_peaks() 60 | assert False 61 | 62 | 63 | @pytest.mark.xfail 64 | def test_normalize_by_background(chipseq_analysis_with_peaks): 65 | chipseq_analysis_with_peaks.test_normalize_by_background() 66 | assert False 67 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_cnv_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pytest 6 | import pandas as pd 7 | 8 | from .conftest import file_exists_and_not_empty, STAP, DNACOPY # , CI, RPY2 9 | 10 | 11 | @pytest.mark.xfail 12 | def test__copy_cnv_profile_plots(cnv_analysis): 13 | with cnv_analysis as an: 14 | an._copy_cnv_profile_plots() 15 | assert False 16 | 17 | 18 | def test_get_cnv_data(cnv_analysis_with_inputs): 19 | with cnv_analysis_with_inputs as an: 20 | an.get_cnv_data() 21 | 22 | p = os.path.join(an.results_dir, an.name) 23 | files = [ 24 | p + ".10kb.matrix_raw.csv", 25 | p + ".100kb.matrix_raw.csv", 26 | p + ".1000kb.matrix_raw.csv"] 27 | for f in files: 28 | assert file_exists_and_not_empty(f) 29 | assert pd.read_csv(f, index_col=0).sum().sum() == 0 30 | 31 | 32 | def test_normalize(cnv_analysis): 33 | with cnv_analysis as an: 34 | an.normalize() 35 | 36 | p = os.path.join(an.results_dir, an.name) 37 | files = [ 38 | p + ".10kb.matrix_norm.csv", 39 | p + ".100kb.matrix_norm.csv", 40 | p + ".1000kb.matrix_norm.csv"] 41 | for f in files: 42 | assert file_exists_and_not_empty(f) 43 | assert pd.read_csv(f, index_col=0).sum().sum() != 0 44 | 45 | 46 | def test_plot_all_data(cnv_analysis): 47 | with cnv_analysis as an: 48 | an.plot_all_data(matrix='matrix_raw') 49 | 50 | for res in an.resolutions: 51 | p = os.path.join(an.results_dir, an.name + "." + res + ".all_data.full_data") 52 | files = [ 53 | p + ".fillna.clustermap.svg", 54 | p + ".heatmap.svg"] 55 | for f in files: 56 | assert file_exists_and_not_empty(f) 57 | 58 | 59 | def test_plot_stats_per_chromosome(cnv_analysis): 60 | with cnv_analysis as an: 61 | an.plot_stats_per_chromosome(matrix="matrix_raw") 62 | 63 | for res in an.resolutions: 64 | for t in ['mean', 'variation']: 65 | p = os.path.join(an.results_dir, an.name + "." + res + ".all_data." + t + "_per_chrom") 66 | files = [ 67 | p + ".no_sex_chroms.zscore.svg", 68 | p + ".no_sex_chroms.svg", 69 | p + ".svg"] 70 | for f in files: 71 | assert file_exists_and_not_empty(f) 72 | 73 | 74 | # @pytest.mark.skipif(not STAP or not DNACOPY, reason="STAP and DNACopy R libraries are required to perform segmentation.") 75 | @pytest.mark.xfail 76 | def test_segment_genome(cnv_analysis): 77 | cnv_analysis.segment_genome() 78 | assert False 79 | 80 | 81 | @pytest.mark.xfail 82 | def test_annotate_with_chrom_bands(cnv_analysis): 83 | cnv_analysis.annotate_with_chrom_bands() 84 | assert False 85 | 86 | 87 | @pytest.mark.xfail 88 | def test_plot_segmentation_stats(cnv_analysis): 89 | cnv_analysis.plot_segmentation_stats() 90 | assert False 91 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | def test_config_has_all_required_fields(): 5 | from ngs_toolkit import _CONFIG as local_config 6 | import pkgutil 7 | import yaml 8 | 9 | def _dicts_same_keys(d1, d2): 10 | if type(d1) != type(d2): 11 | return False 12 | 13 | for k in d1.keys(): 14 | if k not in d2: 15 | return False 16 | else: 17 | if type(d1[k]) is dict: 18 | return _dicts_same_keys(d1[k], d2[k]) 19 | else: 20 | return True 21 | 22 | file_config = ( 23 | pkgutil.get_data("ngs_toolkit", "config/default.yaml").decode().strip() 24 | ) 25 | file_config = yaml.load(file_config) 26 | 27 | assert _dicts_same_keys(file_config, local_config) 28 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_decorators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import pytest 5 | 6 | 7 | class Test_check_has_attributes: 8 | # here we use 'get_resources' as en example 9 | # decorated function that won't fail for some other 10 | # reason on a fairly empty analysis object 11 | def test_empty_analysis(self, empty_analysis): 12 | # Make sure it raises AttributeError 13 | with pytest.raises(AttributeError): 14 | empty_analysis.get_resources(steps=[]) 15 | 16 | def test_null_analysis(self, null_analysis): 17 | # Make sure it raises AttributeError 18 | with pytest.raises(AttributeError): 19 | null_analysis.get_resources(steps=[]) 20 | 21 | def test_full_analysis(self, full_analysis): 22 | full_analysis.get_resources(steps=[]) 23 | 24 | # here we use 'calculate_peak_support' as en example 25 | # decorated function. It will however fail for another 26 | # reason due to the fairly empty analysis object (last test) 27 | def test_empty_analysis_2(self, empty_analysis): 28 | # Make sure it raises AttributeError 29 | with pytest.raises(AttributeError): 30 | empty_analysis.calculate_peak_support() 31 | 32 | def test_null_analysis_2(self, null_analysis): 33 | # Make sure it raises AttributeError 34 | with pytest.raises(AttributeError): 35 | null_analysis.calculate_peak_support() 36 | 37 | def test_full_analysis_2(self, atac_analysis): 38 | # This passes on the decorator 39 | # but raises IOError specific to the function 40 | with pytest.raises(IOError): 41 | atac_analysis.calculate_peak_support() 42 | 43 | def test_iterable_attributes(self, atac_analysis): 44 | from ngs_toolkit import Analysis 45 | from ngs_toolkit.decorators import check_has_attributes 46 | 47 | class TestAnalysis(Analysis): 48 | @check_has_attributes(['samples'], [list]) 49 | def test_function(self): 50 | print(self.samples) 51 | return True 52 | 53 | a = TestAnalysis() 54 | 55 | # has not samples set 56 | del a.samples 57 | with pytest.raises(AttributeError): 58 | a.test_function() 59 | 60 | # samples is None 61 | a.samples = None 62 | with pytest.raises(AttributeError): 63 | a.test_function() 64 | 65 | # samples is empty list 66 | a.samples = list() 67 | with pytest.raises(AttributeError): 68 | a.test_function() 69 | 70 | # has samples 71 | a.samples = [1, 2, 3] 72 | assert a.test_function() 73 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_differential_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | from .conftest import file_exists, file_exists_and_not_empty, R, R_REASON 8 | 9 | 10 | # Note: 11 | # The DESeq2 1.24.0 version in Debian archives 12 | # differs from the DESeq2 1.24.0 version in bioconductor version 3.9 13 | # If estimateDispersions with default fitType="parametric" fails, 14 | # (as often happens with the quickly generated synthetic data from tests), 15 | # it tries to use local fit using the locfit package, but in Debian 16 | # version this is not a valid choice of fit, causing failure. 17 | # Due to this, and since I'm using Debian packages for faster testing 18 | # I'm manually setting fitType="mean" for testing only. 19 | 20 | 21 | @pytest.fixture 22 | def outputs(atac_analysis): 23 | output_dir = os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq") 24 | prefix = os.path.join(output_dir, "differential_analysis.") 25 | outs = [ 26 | os.path.join(output_dir, "Factor_A_2vs1"), 27 | os.path.join( 28 | output_dir, 29 | "Factor_A_2vs1", 30 | "differential_analysis.deseq_result.Factor_A_2vs1.csv", 31 | ), 32 | prefix + "comparison_table.tsv", 33 | prefix + "count_matrix.tsv", 34 | prefix + "deseq_result.all_comparisons.csv", 35 | prefix + "experiment_matrix.tsv", 36 | ] 37 | return outs 38 | 39 | 40 | # @pytest.fixture 41 | # def outputs_no_subdirectories(analysis): 42 | # output_dir = os.path.join(analysis.results_dir, "differential_analysis_ATAC-seq") 43 | # prefix = os.path.join(output_dir, "differential_analysis.") 44 | # outputs = [ 45 | # prefix + "deseq_result.Factor_A_2vs1.csv", 46 | # prefix + "comparison_table.tsv", 47 | # prefix + "count_matrix.tsv", 48 | # prefix + "deseq_result.all_comparisons.csv", 49 | # prefix + "experiment_matrix.tsv"] 50 | # return outputs 51 | 52 | 53 | @pytest.mark.skipif( 54 | not R, 55 | reason=R_REASON) 56 | def test_deseq_functionality(): 57 | import pandas as pd 58 | from ngs_toolkit.utils import recarray2pandas_df 59 | 60 | from rpy2.robjects import numpy2ri, pandas2ri, r 61 | from rpy2.robjects.packages import importr 62 | numpy2ri.activate() 63 | pandas2ri.activate() 64 | 65 | importr("DESeq2") 66 | 67 | dds = r.makeExampleDESeqDataSet() 68 | dds = r.estimateSizeFactors(dds) 69 | dds = r.estimateDispersions(dds) 70 | dds = r.nbinomWaldTest(dds) 71 | res = recarray2pandas_df(r("as.data.frame")(r("DESeq2::results")(dds))) 72 | assert isinstance(res, pd.DataFrame) 73 | 74 | dds = r.makeExampleDESeqDataSet() 75 | dds = r.DESeq(dds) 76 | res = recarray2pandas_df(r("as.data.frame")(r("DESeq2::results")(dds))) 77 | assert isinstance(res, pd.DataFrame) 78 | 79 | 80 | @pytest.mark.skipif( 81 | not R, 82 | reason=R_REASON) 83 | class Test_differential_analysis: 84 | def test_simple_design(self, atac_analysis, outputs): 85 | import pandas as pd 86 | 87 | atac_analysis.differential_analysis() 88 | assert file_exists( 89 | os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq") 90 | ) 91 | assert file_exists(outputs[0]) 92 | assert os.path.isdir(outputs[0]) 93 | for output in outputs[1:]: 94 | assert file_exists_and_not_empty(output) 95 | assert hasattr(atac_analysis, "differential_results") 96 | assert isinstance(atac_analysis.differential_results, pd.DataFrame) 97 | assert atac_analysis.differential_results.index.str.startswith("chr").all() 98 | assert atac_analysis.differential_results.index.name == "index" 99 | cols = [ 100 | "baseMean", 101 | "log2FoldChange", 102 | "lfcSE", 103 | "stat", 104 | "pvalue", 105 | "padj", 106 | "comparison_name", 107 | ] 108 | assert atac_analysis.differential_results.columns.tolist() == cols 109 | 110 | def test_complex_design(self, atac_analysis, outputs): 111 | import pandas as pd 112 | 113 | atac_analysis.differential_analysis() 114 | assert file_exists( 115 | os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq") 116 | ) 117 | assert file_exists(outputs[0]) 118 | assert os.path.isdir(outputs[0]) 119 | for output in outputs[1:]: 120 | assert file_exists_and_not_empty(output) 121 | assert hasattr(atac_analysis, "differential_results") 122 | assert isinstance(atac_analysis.differential_results, pd.DataFrame) 123 | assert atac_analysis.differential_results.index.str.startswith("chr").all() 124 | assert atac_analysis.differential_results.index.name == "index" 125 | cols = [ 126 | "baseMean", 127 | "log2FoldChange", 128 | "lfcSE", 129 | "stat", 130 | "pvalue", 131 | "padj", 132 | "comparison_name", 133 | ] 134 | assert atac_analysis.differential_results.columns.tolist() == cols 135 | 136 | # def test_no_subdirectories(self, atac_analysis, outputs): 137 | # atac_analysis.differential_analysis() 138 | # assert file_exists( 139 | # os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq")) 140 | # assert file_exists(outputs[0]) 141 | # assert os.path.isdir(outputs[0]) 142 | # for output in outputs[1:]: 143 | # assert file_exists(output) 144 | # assert os.stat(output).st_size > 0 145 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_differential_enrichment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pytest 6 | from .conftest import file_exists_and_not_empty 7 | 8 | 9 | @pytest.fixture 10 | def outputs(analysis_with_differential): 11 | prefix = os.path.join( 12 | analysis_with_differential.results_dir, "differential_analysis_ATAC-seq", "enrichments" 13 | ) 14 | outputs = [ 15 | os.path.join( 16 | prefix, "Factor_A_2vs1.down/differential_analysis.gene_symbols.txt" 17 | ), 18 | os.path.join(prefix, "Factor_A_2vs1.down/differential_analysis_regions.bed"), 19 | os.path.join(prefix, "Factor_A_2vs1.down/differential_analysis_regions.tsv"), 20 | os.path.join( 21 | prefix, "Factor_A_2vs1.down/differential_analysis.enrichr.csv" 22 | ), 23 | os.path.join(prefix, "Factor_A_2vs1.up/differential_analysis.gene_symbols.txt"), 24 | os.path.join(prefix, "Factor_A_2vs1.up/differential_analysis_regions.bed"), 25 | os.path.join(prefix, "Factor_A_2vs1.up/differential_analysis_regions.tsv"), 26 | os.path.join( 27 | prefix, "Factor_A_2vs1.up/differential_analysis.enrichr.csv" 28 | ), 29 | os.path.join(prefix, "differential_analysis.enrichr.csv"), 30 | ] 31 | return outputs 32 | 33 | 34 | # @pytest.mark.skip(reason="no way of currently testing this") 35 | class Test_differential_enrichment: 36 | def test_no_arguments(self, analysis_with_differential, outputs): 37 | analysis_with_differential.differential_enrichment(steps=["enrichr"]) 38 | for output in outputs: 39 | assert file_exists_and_not_empty(output) 40 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_general.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pybedtools 6 | from ngs_toolkit.general import lola 7 | 8 | import pytest 9 | from .conftest import file_exists_and_not_empty, CI # , RPY2 10 | 11 | 12 | class Test_annotate_samples: 13 | def test_no_arguments(self, analysis_normalized): 14 | analysis_normalized.annotate_samples() 15 | 16 | def test_matrix_raw(self, atac_analysis): 17 | atac_analysis.annotate_samples(matrix="matrix_raw") 18 | 19 | 20 | def test_get_matrix(atac_analysis): 21 | import numpy as np 22 | import pandas as pd 23 | 24 | matrix = atac_analysis.get_matrix(matrix="matrix_raw") 25 | assert np.array_equal(matrix.values, atac_analysis.matrix_raw.values) 26 | assert (matrix == atac_analysis.matrix_raw).all().all() 27 | atac_analysis.dummy = atac_analysis.matrix_raw + 1 28 | matrix = atac_analysis.get_matrix(matrix="dummy") 29 | assert (matrix == (atac_analysis.matrix_raw + 1)).all().all() 30 | 31 | matrix = atac_analysis.get_matrix(matrix=atac_analysis.matrix_raw) 32 | assert np.array_equal(matrix.values, atac_analysis.matrix_raw.values) 33 | assert (matrix == atac_analysis.matrix_raw).all().all() 34 | atac_analysis.dummy = atac_analysis.matrix_raw + 1 35 | matrix = atac_analysis.get_matrix(matrix="dummy") 36 | assert (matrix == (atac_analysis.matrix_raw + 1)).all().all() 37 | 38 | # sample subssetting 39 | matrix = atac_analysis.get_matrix( 40 | matrix="matrix_raw", samples=atac_analysis.samples[:2]) 41 | assert (pd.Series([ 42 | s.name 43 | for s in atac_analysis.samples[:2]]) == matrix.columns).all() 44 | 45 | 46 | # +++ get_genome_reference 47 | # index_fasta 48 | # twobit_to_fasta 49 | # +++ get_blacklist_annotations 50 | # +++ get_tss_annotations 51 | # +++ get_genomic_context 52 | # +++ get_chromosome_sizes 53 | # +++ deseq_analysis 54 | # least_squares_fit 55 | 56 | 57 | def test_differential_from_bivariate_fit(analysis_normalized): 58 | from ngs_toolkit.general import differential_from_bivariate_fit 59 | 60 | with analysis_normalized as an: 61 | out_dir = os.path.join(an.results_dir, "diff") 62 | out_prefix = os.path.join(out_dir, "bivariate_fit") 63 | differential_from_bivariate_fit( 64 | an.comparison_table, an.matrix_norm, 65 | out_dir, out_prefix) 66 | 67 | outs = [ 68 | ".deseq_result.all_comparisons.csv", 69 | ".deseq_result.all_comparisons.scatter.svg", 70 | ".fit_result.Factor_A_2vs1.csv"] 71 | for f in outs: 72 | assert file_exists_and_not_empty(out_prefix + f) 73 | 74 | 75 | @pytest.mark.skipif( 76 | CI, 77 | reason="LOLA testing is not performed on CI.") 78 | class Test_LOLA(): 79 | def test_lola_function(self, tmp_path): 80 | bed = pybedtools.example_bedtool('hg38-base.bed') 81 | univ = bed.slop(l=0, r=10, genome='hg38') 82 | bed_file = bed.fn 83 | universe_file = univ.fn 84 | output_folder = os.path.dirname(tmp_path) 85 | genome = "hg38" 86 | 87 | lola(bed_file, universe_file, output_folder, genome) 88 | 89 | output_files = [ 90 | "allEnrichments.tsv", 91 | "col_codex.tsv"] 92 | for file in output_files: 93 | assert file_exists_and_not_empty(os.path.join(output_folder, file)) 94 | 95 | def test_lola_function_multiple_inputs(self, tmp_path): 96 | import shutil 97 | bed = pybedtools.example_bedtool('hg38-base.bed') 98 | univ = bed.slop(l=0, r=10, genome='hg38') 99 | bed_file = bed.fn 100 | shutil.copy(bed_file, "A.bed") 101 | shutil.copy(bed_file, "B.bed") 102 | universe_file = univ.fn 103 | output_folder = os.path.dirname(tmp_path) 104 | genome = "hg38" 105 | 106 | lola(["A.bed", "B.bed"], universe_file, output_folder, genome) 107 | 108 | output_files = [ 109 | "allEnrichments", 110 | "col_codex"] 111 | for file in output_files: 112 | for i in ['A', 'B']: 113 | assert file_exists_and_not_empty( 114 | os.path.join(output_folder, file + i + ".tsv")) 115 | 116 | 117 | def test_lola_through_differential_enrichment( 118 | self, analysis_with_differential): 119 | with analysis_with_differential as an: 120 | an.differential_enrichment(steps=['lola']) 121 | 122 | output_files = [ 123 | "allEnrichments.tsv", 124 | "col_codex.tsv"] 125 | 126 | for file in output_files: 127 | for direction in ['up', 'down']: 128 | assert file_exists_and_not_empty(os.path.join( 129 | an.results_dir, 130 | "differential_analysis_ATAC-seq/enrichments/Factor_A_2vs1." 131 | + direction, file)) 132 | 133 | def test_lola_through_differential_enrichment_distributed( 134 | self, analysis_with_differential): 135 | with analysis_with_differential as an: 136 | an.differential_enrichment(steps=['lola'], distributed=True) 137 | 138 | output_files = [ 139 | "allEnrichments.tsv", 140 | "col_codex.tsv"] 141 | 142 | for file in output_files: 143 | for direction in ['up', 'down']: 144 | assert file_exists_and_not_empty(os.path.join( 145 | an.results_dir, 146 | "differential_analysis_ATAC-seq/enrichments/Factor_A_2vs1." 147 | + direction, file)) 148 | 149 | # def test_lola__plot_differential_enrichment(self): 150 | # pass 151 | 152 | # meme_ame 153 | 154 | @pytest.mark.skipif( 155 | CI, 156 | reason="HOMER testing is not performed on CI.") 157 | class TestHomer(): 158 | def test_homer_function(self, tmp_path): 159 | from ngs_toolkit.general import homer_motifs 160 | 161 | bed = pybedtools.example_bedtool('hg38-base.bed') 162 | univ = bed.slop(l=0, r=10, genome='hg38') 163 | bed_file = bed.fn 164 | universe_file = univ.fn 165 | output_dir = os.path.dirname(tmp_path) 166 | genome_assembly = "hg38" 167 | 168 | homer_motifs(bed_file, output_dir, genome_assembly) 169 | assert os.path.exists(os.path.join(output_dir, "homerMotifs.all.motifs")) 170 | 171 | 172 | # homer_combine_motifs 173 | # +++ enrichr 174 | # run_enrichment_jobs 175 | 176 | 177 | def test_project_to_geo(atac_analysis_with_unmapped_input_files): 178 | from ngs_toolkit.general import project_to_geo 179 | 180 | with atac_analysis_with_unmapped_input_files as an: 181 | out_dir = os.path.join(an.root_dir, "geo_submission") 182 | annot = project_to_geo( 183 | an.prj, 184 | output_dir=out_dir, steps=['bam', 'peaks'], 185 | computing_configuration="default") 186 | 187 | cols = [ 188 | 'bam_file0', 'bam_file0_md5sum', 189 | # 'bigwig_file', 'bigwig_file_md5sum', 190 | 'peaks_file', 'peaks_file_md5sum'] 191 | assert all(annot.columns == cols) 192 | 193 | outs = [ 194 | "project_to_geo.{}.sh", 195 | "{}.bam", 196 | "{}.bam.md5", 197 | # "{}.bigWig", 198 | # "{}.bigWig.md5", 199 | "{}.peaks.narrowPeak", 200 | "{}.peaks.narrowPeak.md5"] 201 | for sample in an.samples: 202 | for f in outs: 203 | assert file_exists_and_not_empty( 204 | os.path.join(out_dir, f.format(sample.name))) 205 | 206 | 207 | def test_rename_sample_files(atac_analysis_with_input_files): 208 | import pandas as pd 209 | from ngs_toolkit.general import rename_sample_files 210 | 211 | with atac_analysis_with_input_files as an: 212 | 213 | df = pd.DataFrame( 214 | [['S01_A1', 'S02_A1'], ['SXX_ZZ', 'SYY_ZZ']], 215 | index=['old_sample_name', 'new_sample_name']).T 216 | 217 | rename_sample_files(df, results_dir=an.data_dir) 218 | 219 | for sample in ['SXX_ZZ', 'SYY_ZZ']: 220 | outs = [ 221 | os.path.join("mapped", sample + '.trimmed.bowtie2.filtered.bam'), 222 | os.path.join("mapped", sample + '.trimmed.bowtie2.filtered.bam.bai'), 223 | os.path.join("peaks", sample + '_peaks.narrowPeak'), 224 | os.path.join("peaks", sample + '_summits.bed')] 225 | for f in outs: 226 | assert file_exists_and_not_empty(os.path.join(an.data_dir, sample, f)) 227 | 228 | 229 | # +++ query_biomart 230 | 231 | 232 | def test_subtract_principal_component(analysis_normalized): 233 | from ngs_toolkit.general import subtract_principal_component 234 | 235 | with analysis_normalized as an: 236 | 237 | plot = os.path.join(an.root_dir, "subtract_plot.svg") 238 | df = subtract_principal_component( 239 | an.matrix_norm.T, plot_name=plot).T 240 | 241 | assert df.shape == an.matrix_norm.shape 242 | assert file_exists_and_not_empty(plot) 243 | 244 | 245 | # fix_batch_effect_limma 246 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | from .conftest import CI, DEV, BUILD_DIR 8 | 9 | 10 | @pytest.mark.skipif( 11 | not CI, 12 | reason="Development mode, not testing Pypi requirements") 13 | def test_version_matches(): 14 | from ngs_toolkit import __version__ as installed_version 15 | from pkg_resources import get_distribution 16 | 17 | file_version = get_distribution('ngs_toolkit').version 18 | 19 | assert installed_version == file_version 20 | 21 | 22 | @pytest.mark.skipif( 23 | DEV, 24 | reason="Development mode, not testing Pypi requirements") 25 | def test_pypi_requirements_are_importable(): 26 | import requests 27 | import importlib 28 | 29 | package_name = "ngs-toolkit" 30 | url = "https://pypi.python.org/pypi/" + str(package_name) + "/json" 31 | data = requests.get(url).json() 32 | 33 | # handle packages where the package name is different than the Pypi name 34 | replace = { 35 | "setuptools-scm": "setuptools_scm", 36 | "scikit-learn": "sklearn"} 37 | 38 | # not extra requirements 39 | requirements = [ 40 | x.split(" ")[0] 41 | for x in data["info"]["requires_dist"] 42 | if "extra" not in x 43 | ] 44 | for req in requirements: 45 | if req in replace: 46 | requirements.pop(requirements.index(req)) 47 | requirements.append(replace[req]) 48 | 49 | for req in requirements: 50 | try: 51 | importlib.import_module(req) 52 | except ImportError: 53 | assert False 54 | 55 | 56 | def test_all_requirements_are_importable(): 57 | import importlib 58 | 59 | # test only basic requirements (not extras) 60 | path = None 61 | if CI: 62 | reqs = os.path.join(BUILD_DIR, "requirements", "requirements.txt") 63 | if os.path.exists(reqs): 64 | path = reqs 65 | if path is None: 66 | path = os.path.join("requirements", "requirements.txt") 67 | 68 | if not os.path.exists(path): 69 | pytest.skip("Could not locate requirements.txt") 70 | 71 | data = open(path).read().split("\n") 72 | 73 | replace = {"scikit-learn": "sklearn"} 74 | 75 | # handle github stuff 76 | requirements = list() 77 | for x in data: 78 | for sep in ['>=', '<=', '=', ">", "<"]: 79 | x = x.split(sep) 80 | if len(x) == 2: 81 | x = x[0].replace("=", "").replace(">", "").replace("<", "") 82 | else: 83 | x = x[0] 84 | if "extra" not in x: 85 | requirements.append(x) 86 | 87 | # remove commnets 88 | requirements = [x[:x.index(" #")] if "#" in x else x 89 | for x in requirements] 90 | # remove empty lines 91 | requirements = [x for x in requirements if x != ""] 92 | for req in requirements: 93 | if req in replace: 94 | requirements.pop(requirements.index(req)) 95 | requirements.append(replace[req]) 96 | 97 | for req in requirements: 98 | try: 99 | importlib.import_module(req) 100 | except ImportError: 101 | assert False, "Required '%s' module could not be found!" % req 102 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def log(): 9 | return os.path.join(os.path.expanduser("~"), ".ngs_toolkit.log.txt") 10 | 11 | 12 | def test_config_has_all_required_fields(log): 13 | import logging 14 | from ngs_toolkit import _LOGGER 15 | 16 | assert isinstance(_LOGGER, logging.Logger) 17 | previous_size = os.stat(log).st_size 18 | _LOGGER.info("Testing logger") 19 | new_size = os.stat(log).st_size 20 | assert new_size > previous_size 21 | 22 | 23 | def test_clear_log(log): 24 | from ngs_toolkit import clear_log 25 | 26 | previous_size = os.stat(log).st_size 27 | clear_log() 28 | new_size = os.stat(log).st_size 29 | assert new_size < previous_size 30 | assert new_size == 0 31 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_plot_differential.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pytest 6 | from .conftest import file_exists_and_not_empty, R, R_REASON 7 | 8 | 9 | @pytest.fixture 10 | def outputs(analysis_with_differential): 11 | prefix = os.path.join( 12 | analysis_with_differential.results_dir, 13 | "differential_analysis_ATAC-seq", 14 | "differential_analysis.", 15 | ) 16 | outputs = [ 17 | # prefix + "diff_region.samples.clustermap.corr.svg", 18 | # prefix + "diff_region.samples.clustermap.svg", 19 | # prefix + "diff_region.samples.clustermap.z0.svg", 20 | # prefix + "diff_region.samples.sorted.clustermap.svg", 21 | # prefix + "diff_region.samples.sorted.clustermap.z0.svg", 22 | prefix + "log2FoldChange.distribution.per_comparison.svg", 23 | prefix + "log2FoldChange.distribution.svg", 24 | prefix + "ma_plots.svg", 25 | prefix + "number_differential.directional.svg", 26 | prefix + "padj.distribution.per_comparison.svg", 27 | prefix + "padj.distribution.svg", 28 | prefix + "pvalue.distribution.per_comparison.svg", 29 | prefix + "pvalue.distribution.svg", 30 | prefix + "scatter_plots.svg", 31 | prefix + "volcano_plots.svg", 32 | ] 33 | return outputs 34 | 35 | 36 | @pytest.mark.skipif( 37 | not R, 38 | reason=R_REASON) 39 | class Test_plot_differential: 40 | def test_no_arguments(self, analysis_with_differential, outputs): 41 | analysis_with_differential.plot_differential() 42 | for output in outputs: 43 | assert file_exists_and_not_empty(output) 44 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_plot_differential_enrichment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | import pytest 6 | from .conftest import file_exists_and_not_empty, R, R_REASON 7 | 8 | 9 | @pytest.fixture 10 | def outputs(analysis_with_differential_enrichment): 11 | # gene_set_libraries = _CONFIG['resources']['enrichr']['gene_set_libraries'] 12 | gene_set_libraries = ["GO_Biological_Process_2015", "NCI-Nature_2016"] 13 | prefix = os.path.join( 14 | analysis_with_differential_enrichment.results_dir, 15 | "differential_analysis_ATAC-seq", 16 | "enrichments", 17 | "differential_analysis.enrichr.", 18 | ) 19 | outputs = list() 20 | for g in gene_set_libraries: 21 | outputs += [ 22 | prefix + "{}.barplot.top_5.svg".format(g), 23 | prefix + "{}.cluster_specific.Row_z_score.svg".format(g), 24 | prefix + "{}.cluster_specific.svg".format(g), 25 | prefix + "{}.correlation.svg".format(g), 26 | prefix + "{}.zscore_vs_pvalue.scatterplot.svg".format(g), 27 | ] 28 | return outputs 29 | 30 | 31 | @pytest.mark.skipif( 32 | not R, 33 | reason=R_REASON) 34 | class Test_plot_differential_enrichment: 35 | def test_no_arguments(self, analysis_with_differential_enrichment, outputs): 36 | analysis_with_differential_enrichment.plot_differential_enrichment() 37 | for output in outputs: 38 | assert file_exists_and_not_empty(output) 39 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_project_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from .conftest import file_exists 5 | 6 | 7 | def test_cli_parsing(): 8 | import pytest 9 | from ngs_toolkit.project_manager import parse_arguments 10 | 11 | with pytest.raises(SystemExit): 12 | parse_arguments() 13 | for s in ["", "--help", "create", "recipe"]: 14 | with pytest.raises(SystemExit): 15 | parse_arguments(s) 16 | args = parse_arguments("create asd") 17 | assert args.command == "create" 18 | 19 | 20 | def test_project_creation(tmp_path): 21 | from ngs_toolkit import _CONFIG 22 | from ngs_toolkit.project_manager import create_project 23 | import os 24 | import pandas as pd 25 | import shutil 26 | 27 | tmp_path = str(tmp_path) # for Python2 28 | 29 | project_name = "test_project" 30 | annotation_vars = [ 31 | "sample_name", 32 | "toggle", 33 | "pass_qc", 34 | "protocol", 35 | "library", 36 | "cell_line", 37 | "cell_type", 38 | "condition", 39 | "experimental_batch", 40 | "experiment_name", 41 | "replicate", 42 | "organism", 43 | "flowcell", 44 | "lane", 45 | "BSF_name", 46 | "data_source", 47 | ] 48 | 49 | genome_assemblies = { 50 | k: v 51 | for x in _CONFIG["preferences"]["default_genome_assemblies"] 52 | for k, v in x.items() 53 | } 54 | create_project( 55 | project_name, 56 | genome_assemblies=genome_assemblies, 57 | overwrite=True, 58 | root_projects_dir=tmp_path, 59 | ) 60 | 61 | expected_files = [ 62 | os.path.join(tmp_path, project_name, ".git"), 63 | os.path.join(tmp_path, project_name, "metadata"), 64 | os.path.join(tmp_path, project_name, "metadata", "project_config.yaml"), 65 | os.path.join(tmp_path, project_name, "metadata", "annotation.csv"), 66 | os.path.join(tmp_path, project_name, "metadata", "sample_subannotation.csv"), 67 | os.path.join(tmp_path, project_name, "metadata", "comparison_table.csv"), 68 | ] 69 | for f in expected_files: 70 | assert file_exists(f) 71 | 72 | df = pd.read_csv(os.path.join(tmp_path, project_name, "metadata", "annotation.csv")) 73 | assert df.shape == (0, len(annotation_vars)) 74 | assert all(c in df.columns for c in annotation_vars) 75 | 76 | shutil.rmtree(tmp_path) 77 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_recipes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import subprocess 6 | import pytest 7 | import pandas as pd 8 | 9 | from ngs_toolkit import Analysis 10 | from .conftest import CI, DEV, file_exists_and_not_empty 11 | 12 | 13 | def test_region_set_frip(pep): 14 | import pkgutil 15 | 16 | # For this test, we need an analysis object with sample attributes pointing 17 | # to their input files (just like the atac_analysis_with_input_files parent), 18 | # but since it has to be in inside the recipe, so we will temporarily set it 19 | # at the home directory level, for this test only 20 | config = os.path.join(os.path.expanduser("~"), ".ngs_toolkit.config.yaml") 21 | yaml = ( 22 | pkgutil.get_data("ngs_toolkit", "config/example.yaml").decode().strip() 23 | ) 24 | with open(config, "w") as handle: 25 | handle.write(yaml) 26 | 27 | cmd = ( 28 | "{exe} -m ngs_toolkit.recipes.region_set_frip {pep} " 29 | "--computing-configuration default" 30 | ).format(exe=sys.executable, pep=pep) 31 | 32 | o = subprocess.call(cmd.split(" ")) 33 | assert o == 0 34 | 35 | an = Analysis(from_pep=pep) 36 | for sample in an.samples: 37 | files = [ 38 | "region_set_frip.all_reads.txt", 39 | "region_set_frip.inside_reads.txt", 40 | sample.name + ".region_set_frip.log", 41 | sample.name + ".region_set_frip.sh", 42 | "stats.tsv", 43 | ] 44 | for f in files: 45 | assert file_exists_and_not_empty( 46 | os.path.join(sample.sample_root, f) 47 | ) 48 | 49 | os.remove(config) 50 | 51 | 52 | def test_deseq2(tmp_path, atac_analysis_with_input_files): 53 | an = atac_analysis_with_input_files 54 | an.differential_analysis(distributed=True, dry_run=True) 55 | 56 | p = "differential_analysis_ATAC-seq" 57 | comp_dir = os.path.join(an.results_dir, p, "Factor_A_2vs1") 58 | output_prefix = os.path.join(comp_dir, p) 59 | 60 | cmd = ("{} -m ngs_toolkit.recipes.deseq2 --output-prefix {} {}").format( 61 | sys.executable, output_prefix, comp_dir 62 | ) 63 | 64 | o = subprocess.call(cmd.split(" ")) 65 | assert o == 0 66 | 67 | files = [ 68 | # "deseq_job.Factor_A_2vs1.log", 69 | p + ".deseq_result.Factor_A_2vs1.csv", 70 | p + ".deseq_result.all_comparisons.csv", 71 | ] 72 | for f in files: 73 | assert file_exists_and_not_empty(os.path.join(comp_dir, f)) 74 | 75 | 76 | def test_coverage(tmp_path, atac_analysis_with_input_files): 77 | 78 | region_set = atac_analysis_with_input_files.sites.fn 79 | sample = atac_analysis_with_input_files.samples[0] 80 | output = os.path.join(tmp_path, "output.bed") 81 | 82 | cmd = ("{} -m ngs_toolkit.recipes.coverage {} {} {}").format( 83 | sys.executable, region_set, sample.aligned_filtered_bam, output 84 | ) 85 | 86 | o = subprocess.call(cmd.split(" ")) 87 | assert o == 0 88 | 89 | assert file_exists_and_not_empty(output) 90 | 91 | assert pd.read_csv(output, sep="\t", header=None).shape[1] == 4 92 | 93 | 94 | def test_enrichr_good(tmp_path): 95 | genes = ["PAX5", "SOX2"] 96 | input_file = os.path.join(tmp_path, "genes.txt") 97 | output_file = os.path.join(tmp_path, "enrichr.csv") 98 | with open(input_file, "w") as handle: 99 | for g in genes: 100 | handle.write(g + "\n") 101 | cmd = ("{} -m ngs_toolkit.recipes.enrichr {} {}").format( 102 | sys.executable, input_file, output_file 103 | ) 104 | 105 | o = subprocess.call(cmd.split(" ")) 106 | assert o == 0 107 | 108 | assert file_exists_and_not_empty(output_file) 109 | 110 | assert pd.read_csv(output_file).shape[1] == 10 111 | 112 | 113 | def test_enrichr_bad(tmp_path): 114 | # No genes enriched, should return empty dataframe 115 | genes = ["!!~~IMPOSSIBLEGENE~~!!"] 116 | input_file = os.path.join(tmp_path, "impossible_genes.txt") 117 | output_file = os.path.join(tmp_path, "empty_enrichr.csv") 118 | with open(input_file, "w") as handle: 119 | for g in genes: 120 | handle.write(g + "\n") 121 | cmd = ("{} -m ngs_toolkit.recipes.enrichr {} {}").format( 122 | sys.executable, input_file, output_file 123 | ) 124 | 125 | o = subprocess.call(cmd.split(" ")) 126 | assert o == 0 127 | 128 | assert file_exists_and_not_empty(output_file) 129 | 130 | with pytest.raises(pd.errors.EmptyDataError): 131 | pd.read_csv(output_file) 132 | 133 | 134 | @pytest.mark.skipif(CI or DEV, reason="Test too long to be performed on CI.") 135 | def test_ngs_analysis(pep): 136 | cmd = ("{exe} -m ngs_toolkit.recipes.ngs_analysis {pep}").format( 137 | exe=sys.executable, pep=pep 138 | ) 139 | 140 | o = subprocess.call(cmd.split(" ")) 141 | assert o == 0 142 | 143 | 144 | @pytest.mark.skipif(CI or DEV, reason="Test too long to be performed on CI.") 145 | def test_merge_signal(pep): 146 | import pkgutil 147 | from .conftest import file_exists_and_not_empty 148 | 149 | dir_ = os.path.dirname(os.path.dirname(pep)) 150 | output_dir = os.path.join(dir_, "data_merged") 151 | cmd = ( 152 | "{exe} -m ngs_toolkit.recipes.merge_signal " 153 | "-d " 154 | "--attributes A " 155 | "--output-dir {output_dir} " 156 | "{pep}" 157 | ).format(exe=sys.executable, output_dir=output_dir, pep=pep) 158 | 159 | # this requires a config with sample input files 160 | file_config = os.path.join( 161 | os.path.expanduser("~"), ".ngs_toolkit.config.yaml" 162 | ) 163 | content = ( 164 | pkgutil.get_data("ngs_toolkit", "config/default.yaml").decode().strip() 165 | ) 166 | with open(file_config, "w") as handle: 167 | handle.write(content) 168 | 169 | o = subprocess.call(cmd.split(" ")) 170 | assert o == 0 171 | 172 | files = [ 173 | # "A_1.bigWig", 174 | # "A_1.merged.bam", 175 | # "A_1.merged.sorted.bam", 176 | # "A_1.merged.sorted.bam.bai", 177 | "A_1.merge_signal.sh", 178 | # "A_2.bigWig", 179 | # "A_2.merged.bam", 180 | # "A_2.merged.sorted.bam", 181 | # "A_2.merged.sorted.bam.bai", 182 | "A_2.merge_signal.sh", 183 | ] 184 | 185 | for f in files: 186 | assert file_exists_and_not_empty(os.path.join(output_dir, f)) 187 | 188 | os.remove(file_config) 189 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_regression_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | from ngs_toolkit import Analysis 5 | import pybedtools 6 | import pandas as pd 7 | 8 | 9 | def test_pybedtools_to_from_dataframe(): 10 | 11 | class T(Analysis): 12 | 13 | def __init__(self, *args, **kwargs): 14 | super(T, self).__init__( 15 | *args, 16 | **kwargs 17 | ) 18 | self.samples = list() 19 | 20 | def trigger(self): 21 | d = pd.DataFrame( 22 | [['chr1', 1, 10], ['chr2', 1, 10]], 23 | columns=['chrom', 'start', 'end']) 24 | b = pybedtools.BedTool.from_dataframe(d) 25 | return b.to_dataframe() 26 | 27 | t = T() 28 | assert isinstance(t.trigger(), pd.DataFrame) 29 | 30 | 31 | def test_get_right_timestamped_file(tmpdir): 32 | from ngs_toolkit.utils import get_this_file_or_timestamped 33 | 34 | target = os.path.join(tmpdir, "human.grch38.genomic_context.bed") 35 | assert get_this_file_or_timestamped(target) == target 36 | 37 | outs = [ 38 | "human.grch38.genomic_context.2019-09-03-11:46:42.bed", 39 | "human.grch38.genomic_context.exon.2019-09-03-11:46:36.bed", 40 | "human.grch38.genomic_context.genebody.2019-09-03-11:46:36.bed", 41 | "human.grch38.genomic_context.intergenic.2019-09-03-11:46:41.bed", 42 | "human.grch38.genomic_context.intron.2019-09-03-11:46:38.bed", 43 | "human.grch38.genomic_context.promoter.2019-09-03-11:46:36.bed", 44 | "human.grch38.genomic_context.utr3.2019-09-03-11:46:40.bed", 45 | "human.grch38.genomic_context.utr5.2019-09-03-11:46:39.bed"] 46 | outs = [os.path.join(tmpdir, f) for f in outs] 47 | 48 | # Now with several existing files that also match the regex 49 | for f in outs: 50 | with open(f, "w") as handle: 51 | handle.write(f) 52 | 53 | assert get_this_file_or_timestamped(target) == outs[0] 54 | 55 | 56 | def test_bedtools_intersect_to_dataframe(): 57 | import pandas as pd 58 | import pybedtools 59 | 60 | a = pd.DataFrame([ 61 | ['chr1', 9844, 10460], 62 | ['chr1', 180534, 181797]]) 63 | 64 | b = pd.DataFrame([ 65 | ['chr1', 10000, 10800, '9_Het'], 66 | ['chr1', 10800, 16000, '15_Quies'], 67 | ['chr1', 16000, 16200, '1_TssA'], 68 | ['chr1', 16200, 19000, '5_TxWk'], 69 | ['chr1', 19000, 96080, '15_Quies'], 70 | ['chr1', 96276, 96476, '15_Quies'], 71 | ['chr1', 97276, 177200, '15_Quies']]) 72 | 73 | a_ = pybedtools.BedTool.from_dataframe(a) 74 | b_ = pybedtools.BedTool.from_dataframe(b) 75 | 76 | res = a_.intersect(b_, wa=True, wb=True, loj=True) 77 | df = res.to_dataframe() 78 | assert isinstance(df, pd.DataFrame) 79 | assert df.shape == (2, 7) 80 | assert df.iloc[1, -1] == '.' 81 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from .conftest import file_exists, file_exists_and_not_empty 4 | 5 | 6 | def test_generate_report(atac_analysis): 7 | report = atac_analysis._format_string_with_attributes( 8 | "{root_dir}/{name}.analysis_report.html") 9 | assert not file_exists(report) 10 | atac_analysis.generate_report() 11 | assert file_exists_and_not_empty(report) 12 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_rnaseq_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import os 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from .conftest import file_exists, file_exists_and_not_empty 9 | 10 | 11 | def test_rpm_normalization(rnaseq_analysis): 12 | with rnaseq_analysis as analysis: 13 | qnorm = analysis.normalize_rpm(save=False) 14 | assert qnorm.dtypes.all() == np.float 15 | assert hasattr(analysis, "matrix_norm") 16 | rpm_file = os.path.join( 17 | analysis.results_dir, analysis.name + ".matrix_norm.csv" 18 | ) 19 | assert not file_exists(rpm_file) 20 | qnorm = analysis.normalize_rpm(save=True) 21 | assert file_exists_and_not_empty(rpm_file) 22 | assert hasattr(analysis, "matrix_norm") 23 | 24 | 25 | def test_normalize(rnaseq_analysis): 26 | qnorm = rnaseq_analysis.normalize_rpm(save=False) 27 | assert isinstance(qnorm, pd.DataFrame) 28 | assert hasattr(rnaseq_analysis, "matrix_norm") 29 | del rnaseq_analysis.matrix_norm 30 | 31 | qnorm_d = rnaseq_analysis.normalize(method="rpm", save=False) 32 | assert isinstance(qnorm_d, pd.DataFrame) 33 | assert hasattr(rnaseq_analysis, "matrix_norm") 34 | assert np.array_equal(qnorm_d, qnorm) 35 | del rnaseq_analysis.matrix_norm 36 | 37 | qnorm = rnaseq_analysis.normalize_quantiles(save=False) 38 | assert hasattr(rnaseq_analysis, "matrix_norm") 39 | del rnaseq_analysis.matrix_norm 40 | 41 | qnorm_d = rnaseq_analysis.normalize(method="quantile", save=False) 42 | assert isinstance(qnorm_d, pd.DataFrame) 43 | assert hasattr(rnaseq_analysis, "matrix_norm") 44 | assert np.array_equal(qnorm_d, qnorm) 45 | del rnaseq_analysis.matrix_norm 46 | 47 | 48 | def test_annotate_features(rnaseq_analysis): 49 | rnaseq_analysis.get_matrix_stats(matrix="matrix_raw") 50 | rnaseq_analysis.annotate_features(matrix="matrix_raw") 51 | f = os.path.join( 52 | rnaseq_analysis.results_dir, rnaseq_analysis.name + ".matrix_features.csv" 53 | ) 54 | assert hasattr(rnaseq_analysis, "matrix_features") 55 | assert file_exists_and_not_empty(f) 56 | 57 | cols = [ 58 | "mean", 59 | "variance", 60 | "std_deviation", 61 | "dispersion", 62 | "qv2", 63 | "amplitude", 64 | "iqr", 65 | ] # from stats 66 | 67 | assert all([c in rnaseq_analysis.matrix_features.columns.tolist() for c in cols]) 68 | 69 | 70 | def test_plot_expression_characteristics(rnaseq_analysis): 71 | with rnaseq_analysis as analysis: 72 | analysis.normalize() 73 | analysis.plot_expression_characteristics() 74 | assert file_exists(os.path.join(analysis.results_dir, "quality_control")) 75 | 76 | 77 | def test_plot_features(rnaseq_analysis): 78 | from ngs_toolkit.rnaseq import plot_features 79 | with rnaseq_analysis as analysis: 80 | analysis.normalize_rpm() 81 | 82 | analysis.differential_analysis() 83 | 84 | plot_features( 85 | analysis, 86 | knockout_genes=analysis.matrix_norm.mean(1).nlargest(20).index.tolist()) 87 | 88 | outs = [os.path.join(analysis.results_dir, f) for f in [ 89 | "knockout_expression.svg", "results/knockout_expression.sorted.svg"]] 90 | for f in outs: 91 | file_exists_and_not_empty(f) 92 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_sample_input_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import os 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from .conftest import file_exists_and_not_empty 10 | from ngs_toolkit.utils import get_this_file_or_timestamped 11 | 12 | 13 | @pytest.fixture 14 | def a(atac_analysis_with_input_files): 15 | return atac_analysis_with_input_files 16 | 17 | 18 | def test_has_bam_files(a): 19 | v = [file_exists_and_not_empty(s.aligned_filtered_bam) for s in a.samples] 20 | assert all(v) 21 | v = [file_exists_and_not_empty(s.aligned_filtered_bam + ".bai") for s in a.samples] 22 | assert all(v) 23 | 24 | 25 | def test_has_peak_files(a): 26 | v = [file_exists_and_not_empty(s.peaks) for s in a.samples] 27 | assert all(v) 28 | 29 | 30 | def test_has_summit_files(a): 31 | v = [file_exists_and_not_empty(s.summits) for s in a.samples] 32 | assert all(v) 33 | 34 | 35 | class Test_measure_coverage: 36 | def test_no_arguments(self, a): 37 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 38 | 39 | os.remove(mn) 40 | 41 | a.measure_coverage() 42 | 43 | assert file_exists_and_not_empty(mn) 44 | 45 | def test_distributed(self, a): 46 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 47 | 48 | os.remove(mn) 49 | 50 | a.measure_coverage(distributed=True, computing_configuration="localhost") 51 | 52 | # Check job files for each sample exist 53 | fs = list() 54 | for s in a.samples: 55 | f = os.path.join(s.sample_root, "coverage", s.name + ".peak_set_coverage.") 56 | for end in ["sh", "bed"]: 57 | fs.append(f + end) 58 | assert all([file_exists_and_not_empty(f) for f in fs]) 59 | 60 | # # has to be done separately for log files because they'll empty 61 | # # just check for existence 62 | fs = list() 63 | for s in a.samples: 64 | f = os.path.join(s.sample_root, "coverage", s.name + ".peak_set_coverage.") 65 | for end in ["log"]: 66 | fs.append(f + end) 67 | assert all([os.path.exists(f) for f in fs]) 68 | 69 | def test_few_samples(self, a): 70 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 71 | 72 | os.remove(mn) 73 | 74 | a.measure_coverage(samples=a.samples[:2]) 75 | 76 | mn = get_this_file_or_timestamped(mn) 77 | assert file_exists_and_not_empty(mn) 78 | assert pd.read_csv(mn, index_col=0).shape[1] == 2 79 | 80 | def test_one_sample(self, a): 81 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 82 | 83 | os.remove(mn) 84 | 85 | a.measure_coverage(samples=a.samples[:1]) 86 | 87 | mn = get_this_file_or_timestamped(mn) 88 | assert file_exists_and_not_empty(mn) 89 | assert pd.read_csv(mn, index_col=0).shape[1] == 1 90 | 91 | def test_missing_input_no_permissive(self, a): 92 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 93 | 94 | os.remove(mn) 95 | os.remove(a.samples[0].aligned_filtered_bam) 96 | 97 | with pytest.raises(IOError): 98 | a.measure_coverage(samples=a.samples[:1]) 99 | 100 | def test_missing_input_all_samples(self, a): 101 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 102 | 103 | os.remove(mn) 104 | for s in a.samples: 105 | os.remove(s.aligned_filtered_bam) 106 | 107 | with pytest.raises(IOError): 108 | a.measure_coverage() 109 | 110 | def test_missing_input_with_permissive(self, a): 111 | mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv")) 112 | 113 | os.remove(mn) 114 | os.remove(a.samples[0].aligned_filtered_bam) 115 | 116 | a.measure_coverage(samples=a.samples[:2], permissive=True) 117 | 118 | mn = get_this_file_or_timestamped(mn) 119 | assert file_exists_and_not_empty(mn) 120 | assert pd.read_csv(mn, index_col=0).shape[1] == 1 121 | -------------------------------------------------------------------------------- /ngs_toolkit/tests/test_unsupervised_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import shutil 5 | 6 | import pytest 7 | 8 | from .conftest import file_exists, file_not_empty, file_exists_and_not_empty 9 | 10 | 11 | @pytest.fixture 12 | def unsup_outputs(atac_analysis_many_factors): 13 | prefix = os.path.join( 14 | atac_analysis_many_factors.results_dir, 15 | "unsupervised_analysis_{}".format(atac_analysis_many_factors.data_type), 16 | atac_analysis_many_factors.name + ".all_{}s.".format(atac_analysis_many_factors.var_unit_name), 17 | ) 18 | outputs = [ 19 | prefix + "isomap.svg", 20 | prefix + "locallylinearembedding.svg", 21 | prefix + "mds.svg", 22 | prefix + "pca.explained_variance.csv", 23 | prefix + "pca.explained_variance.svg", 24 | prefix + "pca.svg", 25 | prefix + "pca.variable_principle_components_association.csv", 26 | prefix + "pca.variable_principle_components_association.p_value.masked.svg", 27 | prefix + "pca.variable_principle_components_association.p_value.svg", 28 | prefix + "pca.variable_principle_components_association.adj_pvalue.masked.svg", 29 | prefix + "pca.variable_principle_components_association.adj_pvalue.svg", 30 | prefix + "pearson_correlation.clustermap.svg", 31 | prefix + "spearman_correlation.clustermap.svg", 32 | prefix + "spectralembedding.svg", 33 | prefix + "tsne.svg", 34 | ] 35 | return outputs 36 | 37 | 38 | class TestUnsupervisedAnalysis: 39 | def test_no_arguments(self, atac_analysis_many_factors, unsup_outputs): 40 | # no arguments 41 | atac_analysis_many_factors.unsupervised_analysis() 42 | for output in unsup_outputs: 43 | assert file_exists_and_not_empty(output) 44 | 45 | def test_matrix_with_no_group_attributes(self, atac_analysis_many_factors): 46 | atac_analysis_many_factors.group_attributes = [] 47 | with pytest.raises(ValueError): 48 | atac_analysis_many_factors.unsupervised_analysis() 49 | 50 | def test_matrix_with_no_multiindex(self, atac_analysis_many_factors, unsup_outputs): 51 | atac_analysis_many_factors.unsupervised_analysis(matrix="matrix_raw") 52 | for output in unsup_outputs: 53 | assert file_exists_and_not_empty(output) 54 | 55 | def test_matrix_with_no_multiindex_no_sample_attributes(self, atac_analysis): 56 | atac_analysis.sample_attributes = [] 57 | with pytest.raises(ValueError): 58 | atac_analysis.unsupervised_analysis(matrix="matrix_raw") 59 | 60 | def test_matrix_with_no_multiindex2(self, atac_analysis_many_factors): 61 | atac_analysis_many_factors.unsupervised_analysis(matrix="matrix_raw") 62 | assert file_exists( 63 | os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq") 64 | ) 65 | 66 | def test_various_matrices(self, atac_analysis_many_factors, unsup_outputs): 67 | for matrix in ["matrix_raw", "matrix_norm"]: 68 | atac_analysis_many_factors.annotate_samples(matrix=matrix) 69 | atac_analysis_many_factors.unsupervised_analysis() 70 | for output in unsup_outputs: 71 | assert file_exists(output) 72 | assert file_not_empty(output) 73 | shutil.rmtree( 74 | os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq") 75 | ) 76 | # analysis.annotate_samples(matrix="coverage_qnorm") 77 | 78 | def test_too_low_numbers_of_samples_error(self, atac_analysis_many_factors): 79 | for i in range(2): 80 | with pytest.raises(ValueError): 81 | atac_analysis_many_factors.unsupervised_analysis(samples=atac_analysis_many_factors.samples[:i]) 82 | assert file_exists( 83 | os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq") 84 | ) 85 | shutil.rmtree( 86 | os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq") 87 | ) 88 | 89 | def test_low_samples_no_manifolds(self, atac_analysis_many_factors): 90 | import pandas as pd 91 | prefix = os.path.join( 92 | atac_analysis_many_factors.results_dir, 93 | "unsupervised_analysis_ATAC-seq", 94 | atac_analysis_many_factors.name + ".all_{}s.".format(atac_analysis_many_factors.var_unit_name), 95 | ) 96 | outputs2 = [ 97 | prefix + "mds.svg", 98 | prefix + "pca.explained_variance.csv", 99 | prefix + "pca.explained_variance.svg", 100 | # prefix + "pca.svg", 101 | prefix + "pearson_correlation.clustermap.svg", 102 | prefix + "spearman_correlation.clustermap.svg", 103 | prefix + "tsne.svg", 104 | prefix + "pca.variable_principle_components_association.csv", 105 | ] 106 | not_outputs = [ 107 | prefix + "isomap.svg", 108 | prefix + "locallylinearembedding.svg", 109 | prefix + "spectralembedding.svg", 110 | prefix + "pca.variable_principle_components_association.p_value.masked.svg", 111 | prefix 112 | + "pca.variable_principle_components_association.adj_pvalue.masked.svg", 113 | prefix + "pca.variable_principle_components_association.p_value.svg", 114 | prefix + "pca.variable_principle_components_association.adj_pvalue.svg", 115 | ] 116 | # here I'm picking the first and last samples just to make sure 117 | # they are from different values of attributes `a` and `b` 118 | samples = atac_analysis_many_factors.samples 119 | # idx = pd.Series([(s.A, s.B) for s in samples]).drop_duplicates().index.tolist() 120 | # samples = [samples[idx[0]]] + [samples[idx[-1]]] 121 | # assert samples[0].A != samples[1].A 122 | # assert samples[0].B != samples[1].B 123 | atac_analysis_many_factors.unsupervised_analysis( 124 | samples=[samples[0]] + [samples[-1]]) 125 | for output in outputs2: 126 | assert file_exists_and_not_empty(output) 127 | for output in not_outputs: 128 | assert not file_exists(output) 129 | 130 | # def test_high_samples_varying_all_outputs(self, atac_analysis_many_factors, outputs): 131 | # for i in range(4, len(atac_analysis_many_factors.samples), 2): 132 | # print(i) 133 | # atac_analysis_many_factors.unsupervised_analysis(samples=atac_analysis_many_factors.samples[i:]) 134 | # for output in outputs: 135 | # assert file_exists(output) 136 | # assert file_not_empty(output) 137 | # shutil.rmtree(os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq")) 138 | 139 | def test_no_plotting_attributes(self, atac_analysis_many_factors): 140 | with pytest.raises(ValueError): 141 | atac_analysis_many_factors.unsupervised_analysis(attributes_to_plot=[]) 142 | assert file_exists( 143 | os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq") 144 | ) 145 | 146 | def test_various_plotting_attributes(self, analysis_annotated, unsup_outputs): 147 | prefix = os.path.join( 148 | analysis_annotated.results_dir, 149 | "unsupervised_analysis_ATAC-seq", 150 | analysis_annotated.name + ".all_{}s.".format(analysis_annotated.var_unit_name), 151 | ) 152 | not_outputs = [ 153 | prefix + "pca.variable_principle_components_association.p_value.masked.svg", 154 | prefix + "pca.variable_principle_components_association.p_value.svg", 155 | prefix 156 | + "pca.variable_principle_components_association.adj_pvalue.masked.svg", 157 | prefix + "pca.variable_principle_components_association.adj_pvalue.svg", 158 | ] 159 | for i in range(1, len(analysis_annotated.group_attributes)): 160 | analysis_annotated.unsupervised_analysis( 161 | attributes_to_plot=analysis_annotated.group_attributes[:i] 162 | ) 163 | for output in unsup_outputs: 164 | if output not in not_outputs: 165 | assert file_exists_and_not_empty(output) 166 | for output in not_outputs: 167 | assert not file_exists(output) 168 | shutil.rmtree( 169 | os.path.join(analysis_annotated.results_dir, "unsupervised_analysis_ATAC-seq") 170 | ) 171 | 172 | def test_various_output_prefixes_attributes(self, atac_analysis_many_factors, unsup_outputs): 173 | atac_analysis_many_factors.unsupervised_analysis(output_prefix="test") 174 | for output in unsup_outputs: 175 | old_output = "all_{}s".format(atac_analysis_many_factors.var_unit_name) 176 | assert file_exists_and_not_empty( 177 | output.replace(old_output, "test") 178 | ) 179 | 180 | def test_standardized_matrix(self, atac_analysis_many_factors, unsup_outputs): 181 | atac_analysis_many_factors.unsupervised_analysis(standardize_matrix=False) 182 | for output in unsup_outputs: 183 | assert file_exists_and_not_empty(output) 184 | 185 | def test_save_additional(self, atac_analysis_many_factors): 186 | 187 | prefix = os.path.join( 188 | atac_analysis_many_factors.results_dir, 189 | "unsupervised_analysis_{}".format(atac_analysis_many_factors.data_type), 190 | atac_analysis_many_factors.name + ".all_{}s.".format(atac_analysis_many_factors.var_unit_name), 191 | ) 192 | 193 | additional_outputs = [ 194 | prefix + "isomap.embedding.csv", 195 | prefix + "locallylinearembedding.embedding.csv", 196 | prefix + "mds.embedding.csv", 197 | prefix + "spectralembedding.embedding.csv", 198 | prefix + "tsne.embedding.csv", 199 | prefix + "pca.embedding.csv", 200 | prefix + "pca.embedding.csv", 201 | prefix + "pca.loading.csv", 202 | ] 203 | atac_analysis_many_factors.unsupervised_analysis( 204 | save_additional=True) 205 | 206 | for output in additional_outputs: 207 | assert file_exists_and_not_empty(output) 208 | -------------------------------------------------------------------------------- /requirements/requirements.docs.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | sphinx_rtd_theme 3 | #pydata_sphinx_theme 4 | sphinx-issues 5 | sphinx-argparse 6 | -------------------------------------------------------------------------------- /requirements/requirements.single_cell.txt: -------------------------------------------------------------------------------- 1 | scanpy 2 | scanorama 3 | phate 4 | scvelo 5 | loompy 6 | fbpca 7 | MulticoreTSNE # https://github.com/DmitryUlyanov/Multicore-TSNE/issues/32#issuecomment-368588074 8 | python-igraph 9 | louvain>=0.6 10 | leidenalg 11 | fa2 12 | -------------------------------------------------------------------------------- /requirements/requirements.test.txt: -------------------------------------------------------------------------------- 1 | pytest>=4.4.0 2 | coverage>=4.5.2 3 | pytest-cov 4 | codecov 5 | codacy-coverage 6 | pytest-xdist 7 | rpy2>=3.2.0 8 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools_scm>=3.3.3 2 | numpy>=1.15.0 3 | scipy>=1.0.0 4 | fastcluster 5 | pandas>=0.25.0 6 | matplotlib 7 | seaborn>=0.10.0 8 | parmap>=1.5.1 9 | pysam>=0.13 10 | pybedtools>=0.7.10 11 | scikit-learn>=0.19.1 12 | statsmodels>=0.8.0 13 | patsy>=0.4.1 14 | tqdm>=4.19.5 15 | peppy>=0.30.1 16 | divvy>=0.5.0 17 | attmap>=0.12.11 18 | requests>=2.21.0 19 | jinja2>=2.10.1 20 | natsort>=6.0.0 21 | joblib>=0.12.5 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | [options] 4 | setup_requires = 5 | setuptools_scm 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | 5 | 6 | def parse_requirements(req_file): 7 | requirements = open(req_file).read().strip().split("\n") 8 | requirements = [r for r in requirements if not r.startswith("#")] 9 | return [r for r in requirements if "#egg=" not in r] 10 | 11 | 12 | # take care of extra required modules depending on Python version 13 | extra = {} 14 | try: 15 | from setuptools import setup, find_packages 16 | 17 | if sys.version_info < (2, 7): 18 | extra["install_requires"] = ["argparse"] 19 | if sys.version_info >= (3,): 20 | extra["use_2to3"] = True 21 | except ImportError: 22 | from distutils.core import setup 23 | 24 | if sys.version_info < (2, 7): 25 | extra["dependencies"] = ["argparse"] 26 | 27 | # Requirements 28 | requirements = parse_requirements( 29 | "requirements/requirements.txt") 30 | requirements_test = parse_requirements( 31 | "requirements/requirements.test.txt") 32 | requirements_docs = parse_requirements( 33 | "requirements/requirements.docs.txt") 34 | requirements_sc = parse_requirements( 35 | "requirements/requirements.single_cell.txt") 36 | 37 | long_description = open("README.md").read() 38 | 39 | 40 | # setup 41 | setup( 42 | name="ngs_toolkit", 43 | packages=find_packages(), 44 | use_scm_version={ 45 | 'write_to': 'ngs_toolkit/_version.py', 46 | 'write_to_template': '__version__ = "{version}"\n' 47 | }, 48 | entry_points={ 49 | "console_scripts": [ 50 | "projectmanager = ngs_toolkit.project_manager:main", 51 | "trackmanager = ngs_toolkit.track_manager:main"] 52 | }, 53 | description="A toolkit for NGS analysis with Python.", 54 | long_description=long_description, 55 | long_description_content_type="text/markdown", 56 | classifiers=[ 57 | "Programming Language :: Python :: 3 :: Only", 58 | "Programming Language :: Python :: 3.6", 59 | "Programming Language :: Python :: 3.7", 60 | "Programming Language :: Python :: 3.8", 61 | "Development Status :: 4 - Beta", 62 | "License :: OSI Approved :: " 63 | "GNU General Public License v3 or later (GPLv3+)", 64 | "Topic :: Scientific/Engineering :: Bio-Informatics", 65 | ], 66 | keywords="bioinformatics, sequencing, ngs, ngs analysis, " 67 | "ATAC-Seq, ChIP-seq, RNA-seq, project management", 68 | url="https://github.com/afrendeiro/toolkit", 69 | project_urls={ 70 | "Bug Tracker": "https://github.com/afrendeiro/toolkit/issues", 71 | "Documentation": "https://ngs-toolkit.readthedocs.io", 72 | "Source Code": "https://github.com/afrendeiro/toolkit", 73 | }, 74 | author=u"Andre Rendeiro", 75 | author_email="andre.rendeiro@pm.me", 76 | license="GPL3", 77 | setup_requires=['setuptools_scm'], 78 | install_requires=requirements, 79 | tests_require=requirements_test, 80 | extras_require={ 81 | "testing": requirements_test, 82 | "docs": requirements_docs, 83 | "single_cell": requirements_sc}, 84 | package_data={"ngs_toolkit": ["config/*.yaml", "templates/*.html"]}, 85 | data_files=[ 86 | "requirements/requirements.txt", 87 | "requirements/requirements.test.txt", 88 | "requirements/requirements.single_cell.txt", 89 | ], 90 | **extra 91 | ) 92 | --------------------------------------------------------------------------------