├── .github
    └── workflows
    │   └── test.yaml
├── .gitignore
├── .gitpod.yml
├── .readthedocs.yml
├── .travis.yml
├── Dockerfile
├── LICENCE.txt
├── Makefile
├── Manifest.in
├── README.md
├── docs
    ├── Makefile
    └── source
    │   ├── api.rst
    │   ├── changelog.rst
    │   ├── concepts.rst
    │   ├── conf.py
    │   ├── distributed.rst
    │   ├── examples.rst
    │   ├── index.rst
    │   ├── install.rst
    │   ├── log_config.rst
    │   ├── manager_programs.rst
    │   ├── recipes.rst
    │   ├── report.rst
    │   ├── test.rst
    │   └── usage.rst
├── ngs_toolkit
    ├── .readthedocs.yml
    ├── __init__.py
    ├── analysis.py
    ├── atacseq.py
    ├── chipseq.py
    ├── cnv.py
    ├── config
    │   ├── default.yaml
    │   └── example.yaml
    ├── constants.py
    ├── decorators.py
    ├── demo
    │   ├── __init__.py
    │   └── data_generator.py
    ├── exceptions.py
    ├── general.py
    ├── graphics.py
    ├── parsers.py
    ├── project_manager.py
    ├── recipes
    │   ├── __init__.py
    │   ├── call_peaks.py
    │   ├── coverage.py
    │   ├── deseq2.py
    │   ├── enrichr.py
    │   ├── generate_project.py
    │   ├── lola.py
    │   ├── merge_signal.py
    │   ├── ngs_analysis.py
    │   ├── region_enrichment.py
    │   └── region_set_frip.py
    ├── rnaseq.py
    ├── templates
    │   ├── __init__.py
    │   └── report.html
    ├── tests
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_analysis.py
    │   ├── test_atacseq_analysis.py
    │   ├── test_chipseq_analysis.py
    │   ├── test_cnv_analysis.py
    │   ├── test_config.py
    │   ├── test_decorators.py
    │   ├── test_differential_analysis.py
    │   ├── test_differential_enrichment.py
    │   ├── test_general.py
    │   ├── test_install.py
    │   ├── test_logger.py
    │   ├── test_plot_differential.py
    │   ├── test_plot_differential_enrichment.py
    │   ├── test_project_manager.py
    │   ├── test_recipes.py
    │   ├── test_regression_tests.py
    │   ├── test_report.py
    │   ├── test_rnaseq_analysis.py
    │   ├── test_sample_input_files.py
    │   └── test_unsupervised_analysis.py
    ├── track_manager.py
    └── utils.py
├── requirements
    ├── requirements.docs.txt
    ├── requirements.single_cell.txt
    ├── requirements.test.txt
    └── requirements.txt
├── setup.cfg
└── setup.py


/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |     - master
 5 |     - dev
 6 |   schedule:
 7 |     # Run every sunday at 2am
 8 |     - cron: 0 2 * * 0
 9 | 
10 | jobs:
11 |   test:
12 |     name: ngs_toolkit
13 |     runs-on: ubuntu-20.04
14 |     strategy:
15 |       matrix:
16 |         python_version: ['3.7']
17 | 
18 |     steps:
19 |     - uses: actions/checkout@v1
20 |     - name: Set up Python
21 |       uses: actions/setup-python@v1
22 |       with:
23 |           python-version: ${{ matrix.python }}
24 |           architecture: x64
25 |     - name: Update pip
26 |       run: |
27 |         sudo apt-get install python3-setuptools
28 |         python3 -m pip install --upgrade pip
29 |     - name: Install bedtools 2.27.1
30 |       run: |
31 |         wget http://ftp.br.debian.org/debian/pool/main/b/bedtools/bedtools_2.27.1+dfsg-4_amd64.deb
32 |         sudo dpkg -i bedtools_2.27.1+dfsg-4_amd64.deb
33 |     - name: Install R 3.6
34 |       run: |
35 |         sudo apt-get remove -y r-base
36 |         sudo apt-get autoremove
37 |         sudo apt-get update
38 |         sudo apt-get -y install r-base
39 |     - name: Install bioconductor libraries
40 |       run: |
41 |         sudo apt-get update
42 |         sudo apt-get -y install r-bioc-deseq2 r-bioc-preprocesscore
43 |     - name: Install Combat
44 |       run: |
45 |         python3 -m pip install git+https://github.com/afrendeiro/combat.git
46 |     - name: Install ngs-toolkit
47 |       run: python3 -m pip install .[testing]
48 |     - name: Lint with flake8
49 |       run: |
50 |         python3 -m pip install flake8
51 |         # stop the build if there are Python syntax errors or undefined names
52 |         python3 -m flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
53 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
54 |         python3 -m flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
55 |     - name: Test with pytest
56 |       run: |
57 |         python3 -m pytest -n 2 --disable-warnings --show-capture=no --cov ./ --cov-report term --cov-report xml --pyargs ngs_toolkit
58 |     - name: Report coverage
59 |       env:
60 |         CODACY_PROJECT_TOKEN: ${{ secrets.CODACY_PROJECT_TOKEN }}
61 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
62 |       run: |
63 |         python3 -m coverage xml
64 |         python3 -m codecov -f coverage.xml
65 |         python3 -m codacy -r coverage.xml
66 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # ignore test files
 2 | .tox
 3 | .mypy_cache
 4 | _version.py
 5 | pytest.log
 6 | .coverage*
 7 | docs/html
 8 | 
 9 | # Build-related stuff
10 | build/
11 | dist/
12 | *.egg-info
13 | 
14 | 
15 | # toy/experimental files
16 | *.txt
17 | *.csv
18 | *.tsv
19 | *.pkl
20 | *.pickle
21 | *.svg
22 | *.png
23 | *.jpg
24 | *.jpeg
25 | 
26 | # ignore eggs
27 | .eggs/
28 | 
29 | # ignore built docs
30 | doc/build/*
31 | 
32 | # generic ignore list:
33 | *.lst
34 | 
35 | # Compiled source 
36 | *.com
37 | *.class
38 | *.dll
39 | *.exe
40 | *.o
41 | *.so
42 | *.pyc
43 |  
44 | # Packages 
45 | # it's better to unpack these files and commit the raw source
46 | # git has its own built in compression methods
47 | *.7z
48 | *.dmg
49 | *.gz
50 | *.iso
51 | *.jar
52 | *.rar
53 | *.tar
54 | *.zip
55 |  
56 | # Logs and databases 
57 | *.log
58 | *.sql
59 | *.sqlite
60 |  
61 | # OS generated files 
62 | .DS_Store
63 | .DS_Store?
64 | ._*
65 | .Spotlight-V100
66 | .Trashes
67 | ehthumbs.db
68 | Thumbs.db
69 | 
70 | # Sublime files
71 | *.sublime-*
72 | 
73 | # Gedit temporary files 
74 | *~
75 | 
76 | # libreoffice lock files:
77 | .~lock*
78 | 
79 | # IDE-specific items
80 | .idea/
81 | 
82 | # pytest-related
83 | .cache/
84 | .coverage*
85 | coverage.xml
86 | 
87 | # Reserved files for comparison
88 | *RESERVE*
89 | 


--------------------------------------------------------------------------------
/.gitpod.yml:
--------------------------------------------------------------------------------
1 | image:
2 |   file: Dockerfile
3 | 
4 | tasks:
5 |   - init: ln -s ngs_toolkit/config/example.yaml ~/.ngs_toolkit.config.yaml
6 |   - command: ipython3
7 |   - command: sh
8 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/source/conf.py
11 | 
12 | # Build documentation with MkDocs
13 | #mkdocs:
14 | #  configuration: mkdocs.yml
15 | 
16 | # Optionally build your docs in additional formats such as PDF and ePub
17 | formats: all
18 | 
19 | # Optionally set the version of Python and requirements required to build your docs
20 | python:
21 |    version: 3.7
22 |    system_packages: False
23 |    install:
24 |       - method: pip
25 |         path: .
26 |         extra_requirements:
27 |            - docs
28 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | os: linux
 3 | dist: focal
 4 | 
 5 | python:
 6 |   - "3.7"
 7 |   - "3.8"
 8 | 
 9 | cache:
10 |   directories:
11 |     - $HOME/.cache/pip
12 | 
13 | services:
14 |   - xvfb
15 | 
16 | before_install:
17 |   # Install bedtools
18 |   - sudo apt-get -y install bedtools samtools gawk
19 |   # Install R >= 3.6
20 |   - sudo apt-get -y install r-base
21 |   # Add bioconductor libraries
22 |   - sudo apt-get -y install r-bioc-deseq2 r-bioc-preprocesscore
23 |   # Install Combat
24 |   - pip install git+https://github.com/afrendeiro/combat.git
25 | 
26 | install:
27 |   - pip install .[testing]
28 | 
29 | before_script:
30 |   - export DISPLAY=:99.0
31 |   - pip freeze
32 |   - R -q -e "suppressMessages(library('DESeq2')); suppressMessages(library('preprocessCore')); sessionInfo()"
33 | 
34 | script:
35 |   # - pytest -n 2 --disable-warnings --show-capture=no --cov ./ --cov-report term --cov-report xml --pyargs ngs_toolkit
36 |   - pytest -n 2 --disable-warnings --cov ./ --cov-report term --cov-report xml --pyargs ngs_toolkit
37 | 
38 | after_success:
39 |   - coverage xml
40 |   - codecov -f coverage.xml
41 |   - python-codacy-coverage -r coverage.xml
42 | 
43 | deploy:
44 |   - provider: pypi
45 |     user: afrendeiro
46 |     password:
47 |       secure: D1KWSJZHlw/v2JLR0ClOiaw4xAuj550NmSVF1jHpIb8An9WrcfJ2YSg4VWoNL8HCMHPTC+EPeaphQa68+RhZTy5fmvifs8qaKnvzADA3Tz514uAA+vjSa3ohI0fZnX3CsI2q8f0Zr45T1O9dgDLecrYnyNqq5iOwcxTNzpxTwTxgSFoj81k3qhJBer4DLG0yVmfG9SsV+V7ApTv3iUp+PZhiGW+duXsTRzFwZAObDfJuMwuT7O9gwSZ7ACm4pXVRk22CzJtjLX/MKT74QY9+eJehtaWfkGRsl9cVqhQZSb2PELLpbXa8sOAdtEcsvg0IlMuFDjPoV5vxgA5PiZL836Ec1Koi+GD5KJY1RFUoXB1Fq3wP4s9mTlSLggVr+C0YZK6XU1hiJp5+YUZycwxtQBIZmLzT+eUDuQnYCdrowqcnqyoWV3Mjd2Aan0Kn5ZlSb73UD+KX+5C9c8CPhrNBo9odDtK8f6Wuz8s6Szz9kbPKQNIaCWD5MTZGN8te5rq/qWjHEayUJprirgRY7eoJgFjOKO15U2+5QyBF9Z4r98WC03FYUr3EExcTQWhGbsqzVY1GsFgSnlNNgbgl3DwIyPprXMn9G8LJjl4thNZqDAs8q2YH6GJkc6DUkXAM4Ma9bS+PsUNJs0vclBXH3utasARF66t5X36KeYo5W8shsj0=
48 |     distributions: "sdist bdist_wheel"
49 |     on:
50 |       branch: master
51 |       tags: true
52 |       python: 3.7
53 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM gitpod/workspace-full-vnc
 2 | 
 3 | USER root
 4 | 
 5 | # Install bedtools
 6 | RUN apt-get update \
 7 |     && sudo apt-get install -y --no-install-recommends \
 8 |         bedtools \
 9 |     && rm -rf /tmp/downloaded_packages/ /tmp/*.rds \
10 |     && rm -rf /var/lib/apt/lists/*
11 | 
12 | # Install R and bioconductor libraries
13 | RUN apt-get update \
14 |     && apt-get install -y --no-install-recommends \
15 |         r-base \
16 |         r-bioc-deseq2 \
17 |         r-bioc-preprocesscore \
18 |     && rm -rf /tmp/downloaded_packages/ /tmp/*.rds \
19 |     && rm -rf /var/lib/apt/lists/*
20 | 
21 | USER gitpod
22 | 
23 | ENV PYTHONPATH=/home/gitpod/.local/lib/python3.7/site-packages/
24 | 
25 | # Install IPython
26 | RUN pip3 install --user ipython
27 | 
28 | # Install Python dependencies of ngs-toolkit
29 | RUN pip3 install --user -r \
30 |         https://raw.githubusercontent.com/afrendeiro/toolkit/master/requirements/requirements.txt \
31 |     && pip3 install --user -r \
32 |         https://raw.githubusercontent.com/afrendeiro/toolkit/master/requirements/requirements.test.txt \
33 |     && pip3 install --user git+https://github.com/afrendeiro/combat.git
34 | 
35 | # Install library
36 | RUN pip3 install --user \
37 |     git+https://github.com/afrendeiro/toolkit.git#egg=ngs-toolkit[testing]
38 | 
39 | ENV PATH="/home/gitpod/.local/bin:${PATH}"
40 | 
41 | USER root
42 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | .DEFAULT_GOAL := pypitest
  2 | 
  3 | install:
  4 | 	python -m \
  5 | 		pip \
  6 | 		install \
  7 | 		git+https://github.com/afrendeiro/toolkit.git#egg=ngs-toolkit \
  8 | 		--user
  9 | 
 10 | test:
 11 | 	python -m \
 12 | 		pytest -n 3 \
 13 | 		--disable-warnings \
 14 | 		--show-capture=no \
 15 | 		--cov=ngs_toolkit \
 16 | 		--lf \
 17 | 		--cov-report xml \
 18 | 		ngs_toolkit/tests/test_*.py
 19 | 
 20 | 
 21 | test_cov:
 22 | 	python -m \
 23 | 		pytest \
 24 | 		--testmon \
 25 | 		--disable-warnings \
 26 | 		--show-capture=no \
 27 | 		ngs_toolkit/tests/test_*.py
 28 | 
 29 | 
 30 | coverage: test
 31 | 	python -m codecov \
 32 | 		-f coverage.xml
 33 | 	python -m codacy \
 34 | 		-r coverage.xml
 35 | 
 36 | docs:
 37 | 	cd docs && $(MAKE) html
 38 | 	xdg-open docs/build/html/index.html
 39 | 
 40 | build: test
 41 | 	python setup.py sdist bdist_wheel
 42 | 
 43 | pypitest: build
 44 | 	twine \
 45 | 		upload \
 46 | 		-r pypitest dist/*
 47 | 
 48 | pypi: build
 49 | 	twine \
 50 | 		upload \
 51 | 		dist/*
 52 | 
 53 | gh:
 54 | 	docker \
 55 | 		build \
 56 | 		-t ngs-toolkit \
 57 | 		.
 58 | 	docker \
 59 | 		tag \
 60 | 		ngs-toolkit \
 61 | 		docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:latest
 62 | 	docker \
 63 | 		push \
 64 | 		docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:latest
 65 | 
 66 | gh-release: install
 67 | 	$(eval VERSION := \
 68 | 		$(shell \
 69 | 			python3 \
 70 | 			-c 'from ngs_toolkit import __version__ as v; print(v)'))
 71 | 	docker \
 72 | 		build \
 73 | 		-t ngs-toolkit:$(VERSION) \
 74 | 		.
 75 | 	docker \
 76 | 		tag \
 77 | 		ngs-toolkit \
 78 | 		docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:$(VERSION)
 79 | 	docker \
 80 | 		push \
 81 | 		docker.pkg.github.com/afrendeiro/toolkit/ngs-toolkit:$(VERSION)
 82 | 
 83 | clean_pyc:
 84 | 	find . -name \*.pyc -delete
 85 | 
 86 | clean_mypy:
 87 | 	rm -rf .mypy_cache/
 88 | 
 89 | clean_test:
 90 | 	rm -rf .pytest_cache/
 91 | 	rm -rf /tmp/pytest*
 92 | 	find . -name "__pycache__" -exec rm -rf {} \;
 93 | 	rm -rf .coverage*
 94 | 	rm -rf .tox/
 95 | 
 96 | clean_cov: clean_test
 97 | 	rm -fr coverage.xml htmlcov
 98 | 
 99 | clean_docs:
100 | 	rm -fr docs/build/
101 | 
102 | clean_dist:
103 | 	rm -fr dist/
104 | 
105 | clean_build:
106 | 	rm -fr build/
107 | 	rm -rf ngs_toolkit/_version.py
108 | 
109 | clean_eggs:
110 | 	rm -fr ngs_toolkit.egg-info
111 | 	rm -fr .eggs
112 | 
113 | clean: \
114 | 	clean_pyc \
115 | 	clean_mypy \
116 | 	clean_test \
117 | 	clean_cov \
118 | 	clean_docs \
119 | 	clean_dist \
120 | 	clean_build \
121 | 	clean_eggs
122 | 
123 | all: \
124 | 	test \
125 | 	coverage \
126 | 	docs \
127 | 	build \
128 | 	pypitest \
129 | 	pypi \
130 | 	clean
131 | 
132 | .PHONY: \
133 | 	test \
134 | 	coverage \
135 | 	docs \
136 | 	build \
137 | 	pypitest \
138 | 	pypi \
139 | 	clean_pyc \
140 | 	clean_mypy \
141 | 	clean_test \
142 | 	clean_cov \
143 | 	clean_docs \
144 | 	clean_dist \
145 | 	clean_build \
146 | 	clean_eggs \
147 | 	clean
148 | 


--------------------------------------------------------------------------------
/Manifest.in:
--------------------------------------------------------------------------------
1 | include requirements/requirements*.txt
2 | include ngs_toolkit/templates/*
3 | include README.md
4 | include config/*.yaml
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | [![Documentation status](https://readthedocs.org/projects/ngs-toolkit/badge/?version=latest)](http://ngs-toolkit.readthedocs.io/en/latest/?badge=latest)
 3 | [![PyPI version](https://badge.fury.io/py/ngs-toolkit.svg)](https://badge.fury.io/py/ngs-toolkit)
 4 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/30fcafc027e64b21bf9ddfe8d7f0ff3a)](https://app.codacy.com/app/afrendeiro/toolkit?utm_source=github.com&utm_medium=referral&utm_content=afrendeiro/toolkit&utm_campaign=Badge_Grade_Dashboard)
 5 | [![Build Status](https://travis-ci.org/afrendeiro/toolkit.svg?branch=master)](https://travis-ci.org/afrendeiro/toolkit)
 6 | [![Gitter](https://badges.gitter.im/ngs-toolkit/Lobby.svg)](https://gitter.im/ngs-toolkit/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
 7 | [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
 8 | <!--[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)-->
 9 | [![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/afrendeiro/toolkit)
10 | 
11 | # ngs-toolkit
12 | 
13 | This is my NGS analysis toolkit: ``ngs_toolkit``.
14 | 
15 | Head to the [documentation](http://ngs-toolkit.readthedocs.io/) to see how to install and use the toolkit, and have a look at the catalogue of available functions.
16 | 
17 | Install with:
18 | 
19 | ```bash
20 | pip install ngs-toolkit
21 | ```
22 | 
23 | You might need to add a ``--user`` flag to the above command.
24 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
 21 | 
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  applehelp  to make an Apple Help Book"
 34 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 35 | 	@echo "  epub       to make an epub"
 36 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 37 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 38 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 39 | 	@echo "  text       to make text files"
 40 | 	@echo "  man        to make manual pages"
 41 | 	@echo "  texinfo    to make Texinfo files"
 42 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 43 | 	@echo "  gettext    to make PO message catalogs"
 44 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 45 | 	@echo "  xml        to make Docutils-native XML files"
 46 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 47 | 	@echo "  linkcheck  to check all external links for integrity"
 48 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 49 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 50 | 
 51 | clean:
 52 | 	rm -rf $(BUILDDIR)/*
 53 | 
 54 | html:
 55 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 58 | 
 59 | dirhtml:
 60 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 61 | 	@echo
 62 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 63 | 
 64 | singlehtml:
 65 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 66 | 	@echo
 67 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 68 | 
 69 | pickle:
 70 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 71 | 	@echo
 72 | 	@echo "Build finished; now you can process the pickle files."
 73 | 
 74 | json:
 75 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 76 | 	@echo
 77 | 	@echo "Build finished; now you can process the JSON files."
 78 | 
 79 | htmlhelp:
 80 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 81 | 	@echo
 82 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 83 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 84 | 
 85 | qthelp:
 86 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 87 | 	@echo
 88 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 89 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 90 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pipelines.qhcp"
 91 | 	@echo "To view the help file:"
 92 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pipelines.qhc"
 93 | 
 94 | applehelp:
 95 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
 96 | 	@echo
 97 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
 98 | 	@echo "N.B. You won't be able to view it unless you put it in" \
 99 | 	      "~/Library/Documentation/Help or install it in your application" \
100 | 	      "bundle."
101 | 
102 | devhelp:
103 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
104 | 	@echo
105 | 	@echo "Build finished."
106 | 	@echo "To view the help file:"
107 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/pipelines"
108 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pipelines"
109 | 	@echo "# devhelp"
110 | 
111 | epub:
112 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
113 | 	@echo
114 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
115 | 
116 | latex:
117 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
118 | 	@echo
119 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
120 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
121 | 	      "(use \`make latexpdf' here to do that automatically)."
122 | 
123 | latexpdf:
124 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
125 | 	@echo "Running LaTeX files through pdflatex..."
126 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
127 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
128 | 
129 | latexpdfja:
130 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
131 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
132 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
133 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
134 | 
135 | text:
136 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
137 | 	@echo
138 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
139 | 
140 | man:
141 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
142 | 	@echo
143 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
144 | 
145 | texinfo:
146 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
147 | 	@echo
148 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
149 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
150 | 	      "(use \`make info' here to do that automatically)."
151 | 
152 | info:
153 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
154 | 	@echo "Running Texinfo files through makeinfo..."
155 | 	make -C $(BUILDDIR)/texinfo info
156 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
157 | 
158 | gettext:
159 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
160 | 	@echo
161 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
162 | 
163 | changes:
164 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
165 | 	@echo
166 | 	@echo "The overview file is in $(BUILDDIR)/changes."
167 | 
168 | linkcheck:
169 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
170 | 	@echo
171 | 	@echo "Link check complete; look for any errors in the above output " \
172 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
173 | 
174 | doctest:
175 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
176 | 	@echo "Testing of doctests in the sources finished, look at the " \
177 | 	      "results in $(BUILDDIR)/doctest/output.txt."
178 | 
179 | coverage:
180 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
181 | 	@echo "Testing of coverage in the sources finished, look at the " \
182 | 	      "results in $(BUILDDIR)/coverage/python.txt."
183 | 
184 | xml:
185 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
186 | 	@echo
187 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
188 | 
189 | pseudoxml:
190 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
191 | 	@echo
192 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
193 | 


--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
 1 | API
 2 | ===
 3 | 
 4 | The great flexibility of ``ngs_toolkit`` comes from the ability to compose workflows using the API.
 5 | 
 6 | It provides a rich but abstract :class:`~ngs_toolkit.analysis.Analysis` object and implements various modules building on it depending on the data type.
 7 | 
 8 | In addition, the :mod:`~ngs_toolkit.general` module contains several analysis-independent methods and the :mod:`~ngs_toolkit.utils` module provides low-level functions of general use.
 9 | 
10 | ngs_toolkit.analysis
11 | -----------------------
12 | .. automodule:: ngs_toolkit.analysis
13 |     :members:
14 | 
15 | ngs_toolkit.atacseq
16 | -----------------------
17 | .. automodule:: ngs_toolkit.atacseq
18 |     :members:
19 | 
20 | ngs_toolkit.chipseq
21 | -----------------------
22 | .. automodule:: ngs_toolkit.chipseq
23 |     :members:
24 | 
25 | ngs_toolkit.cnv
26 | -----------------------
27 | .. automodule:: ngs_toolkit.cnv
28 |     :members:
29 | 
30 | ngs_toolkit.rnaseq
31 | -----------------------
32 | .. automodule:: ngs_toolkit.rnaseq
33 |     :members:
34 | 
35 | ngs_toolkit.demo
36 | -----------------------
37 | .. automodule:: ngs_toolkit.demo.data_generator
38 |     :members:
39 | 
40 | ngs_toolkit.general
41 | -----------------------
42 | .. automodule:: ngs_toolkit.general
43 |     :members:
44 | 
45 | ngs_toolkit.graphics
46 | -----------------------
47 | .. automodule:: ngs_toolkit.graphics
48 |     :members:
49 | 
50 | ngs_toolkit.utils
51 | -----------------------
52 | .. automodule:: ngs_toolkit.utils
53 |     :members:
54 | 
55 | ngs_toolkit.parsers
56 | -----------------------
57 | .. automodule:: ngs_toolkit.parsers
58 |     :members:
59 | 
60 | ngs_toolkit
61 | -----------------------
62 | .. automodule:: ngs_toolkit
63 |     :members:
64 | 


--------------------------------------------------------------------------------
/docs/source/concepts.rst:
--------------------------------------------------------------------------------
  1 | Concepts
  2 | ******************************
  3 | 
  4 | A few notes on the way some of the library and its objects were designed to be used.
  5 | 
  6 | .. _AnalysisObjects:
  7 | 
  8 | Analysis objects
  9 | ==============================
 10 | 
 11 | The ``Analysis`` object and its data-type specific dependents are central to the usage of ``ngs-toolkit``. These objects hold attributes and functions relevant to the analysis, such as ``Sample`` objects (and their attributes), Dataframes with numerical values, and others.
 12 | 
 13 | .. _LeveragingOnThePEPFormat:
 14 | 
 15 | Leveraging on the PEP format
 16 | ----------------------------
 17 | 
 18 | One easy and recommended way to instantiate ``Analysis`` objects is with a ``PEP Project`` file.
 19 | This has several advantages:
 20 | 
 21 | - Usage of the language-agnostic PEP format to store a project description and interoperability with other tools (see https://github.com/pepkit for other tools);
 22 | - Initialization of project-specific variables into the ``Analysis`` object that are derived from the PEP. Examples: analysis samples, genome(s), sample and sample group attributes, sample comparison table.
 23 | 
 24 | The example below shows how this works:
 25 | 
 26 | .. code-block:: python
 27 | 
 28 | 	>>> from ngs_toolkit import Analysis
 29 | 	>>> an = Analysis(from_pep="my_project/metadata/project_config.yaml")
 30 | 	[INFO] > Setting project's 'sample_attributes' as the analysis 'sample_attributes'.
 31 | 	[INFO] > Setting project's 'group_attributes' as the analysis 'group_attributes'.
 32 | 	[INFO] > Setting project's 'comparison_table' as the analysis 'comparison_table'.
 33 | 	[INFO] > Setting analysis organism as 'mouse'.
 34 | 	[INFO] > Setting analysis genome as 'mm10'.
 35 | 	>>> print(an)
 36 | 	Analysis 'my_project' with 12 samples of organism 'mouse' (mm10).
 37 | 
 38 | .. note:: The verbosity of ``ngs-toolkit`` can be controlled
 39 | 
 40 | 		See the section on `logging <log_config.html#Configuration>`__ to control the verbosity of ``ngs-toolkit``.
 41 | 
 42 | .. _ReasonableDefaults:
 43 | 
 44 | Reasonable defaults with full customization
 45 | -------------------------------------------
 46 | 
 47 | Functions in the ``Analysis`` object are aware of these attributes and will 
 48 | use them by default, making calling the functions very simple (other overiding 
 49 | arguments can be passed though).
 50 | 
 51 | In the example below, we will generate a consensus peak set for ATAC-seq 
 52 | analyss using the ``get_consensus_sites`` function. This will demonstrate 
 53 | several things that "come for free":
 54 | 
 55 | .. code-block:: python
 56 | 
 57 | 	>>> from ngs_toolkit import ATACSeqAnalysis
 58 | 	>>> an = ATACSeqAnalysis(from_pep="my_project/metadata/project_config.yaml")
 59 | 	[INFO] > Setting project's 'sample_attributes' as the analysis 'sample_attributes'.
 60 | 	[INFO] > Setting project's 'group_attributes' as the analysis 'group_attributes'.
 61 | 	[INFO] > Setting project's 'comparison_table' as the analysis 'comparison_table'.
 62 | 	[INFO] > Subsetting samples for samples of type 'ATAC-seq'.
 63 | 	[INFO] > Subsetting comparison_table for comparisons of type 'ATAC-seq'.
 64 | 	[INFO] > Setting analysis organism as 'mouse'.
 65 | 	[INFO] > Setting analysis genome as 'mm10'.
 66 | 	>>> an.get_consensus_sites() 
 67 | 
 68 | - even though the PEP project includes samples from several data types (ATAC-, ChIP- and RNA-seq), the current analysis will only consider ATAC-seq samples.
 69 | - the necessary files with peak calls for each sample are not specified - ``ngs-toolkit`` knows where to find them;
 70 | - a BED file with ENCODE blacklisted regions will not be given, but these regions will be filtered out - ``ngs-toolkit`` will download this and use it. No static files are distributed with the package.
 71 | - related to the above, the correct blacklist file is downloaded because the genome assembly for the project is infered from the samples - even though it is not directly specified.
 72 | 
 73 | .. _Workflow:
 74 | 
 75 | Workflow
 76 | ------------------------------
 77 | 
 78 | Most functions of the ``Analysis`` object will take some input (usually a 
 79 | dataframe), apply some transformation and assign the result to a 
 80 | variable of the same Analysis object.
 81 | 
 82 | To see what variable has been assigned within a given function check the 
 83 | relevant function in the `API <api.html#Configuration>`__, specifically the 
 84 | `Variables` value. Some functions will assign attributes that are used almost
 85 | ubiquitily. See the `common attributes section <concepts.html#CommonAttributes>`__ for some examples.
 86 | 
 87 | High-level functions will also often assign their outputs to the object 
 88 | itself. To see which attribute holds it, note the ``Attributes`` section of 
 89 | the respective function documentation.
 90 | Assignment allows the exchange of information between analysis steps without 
 91 | the user always providing all required inputs, which would make using such a 
 92 | toolkit quite verbose.
 93 | 
 94 | The example below illustrates this:
 95 | 
 96 | .. code-block:: python
 97 | 
 98 | 	>>> from ngs_toolkit import ATACSeqAnalysis
 99 | 	>>> an = ATACSeqAnalysis(from_pep="my_project/metadata/project_config.yaml")
100 | 	>>> print(an)
101 | 	'ATAC-seq' analysis 'test-project_ATAC-seq_mm10_1_100_1' with 2 samples of organism 'mouse' (mm10).
102 | 	>>> an.get_consensus_sites()
103 | 	>>> an.measure_coverage()
104 | 	>>> print(an.matrix_raw.head())
105 | 	                        S1_a1  S2_a2
106 | 	region                              
107 | 	chr1:42447241-42447627    955   2211
108 | 	chr1:44445678-44446750   1939   2122
109 | 	chr1:44743959-44744926   1264   1443
110 | 	chr1:90513210-90513978   1262   1354
111 | 	chr1:93565764-93567191    911    892
112 | 	>>> an.normalize()
113 | 	>>> print(an.matrix_norm.head())
114 | 	region                      S1_a1      S2_a2
115 | 	chr1:42447241-42447627  12.681954  13.822151
116 | 	chr1:44445678-44446750  13.703582  13.762881
117 | 	chr1:44743959-44744926  13.086324  13.206576
118 | 	chr1:90513210-90513978  13.084040  13.114743
119 | 	chr1:93565764-93567191  12.613915  12.512715
120 | 
121 | All three ``get_consensus_sites``, ``measure_coverage`` and ``normalize`` build
122 | on the output of each other, but the user doesn't have to specify the input to
123 | any. Changing either the name of the attribute that stores either output or the
124 | location of files outputed is nonetheless easy.
125 | 
126 | Many functions also have a ``save`` argument which will save the result as a
127 | ``CSV`` file.
128 | 
129 | .. _CommonAttributes:
130 | 
131 | Common attributes
132 | -----------------
133 | 
134 | To allow a uniform usage across different data types and analysis types,
135 | a few but important attributes of the ``Analysis`` object and its derivatives
136 | have naming conventions:
137 | 
138 | - ``data_type``: The type of data of the analysis. Matches the object type.
139 | - ``matrix_raw``: A dataframe of raw, unnormalized values of shape (features, samples)
140 | - ``matrix_norm``: A dataframe of normalized values of shape (features, samples)
141 | - ``quantity``: The name of the units of the values measured. E.g. "expression" for RNA-seq or "accessibility" for ATAC-seq
142 | - ``var_unit_name``: The name of the variables measured. E.g. "gene" for RNA-seq or "region" for ATAC-seq or ChIP-seq
143 | - ``norm_method``: The method used to normalize the ``matrix_norm`` dataframe
144 | - ``thresholds``: A dictionary with keys "log_fold_change" and "p_value" storing thresholds used in the analysis
145 | 
146 | .. _ComparisonTable:
147 | 
148 | Comparison table
149 | ===============================
150 | 
151 | ``ngs-toolkit`` has functions to perform supervised differntial comparisons 
152 | between groups of samples. The sample groupings are specified in a CSV file called ``comparison_table``.
153 | 
154 | An example of a typical "case vs control" comparison table is given below:
155 | 
156 | .. csv-table:: Typical example of comparison_table
157 |    :header: "comparison_name", "comparison_side", "sample_name", "sample_group"
158 |    :widths: 30, 30, 30, 30
159 | 
160 | 	"KOA_vs_WT",	"1",	"ATAC-seq_KOA_r1",	"KO_A"
161 | 	"KOA_vs_WT",	"1",	"ATAC-seq_KOA_r2",	"KO_A"
162 | 	"KOA_vs_WT",	"0",	"ATAC-seq_WT_r1",	"WT"
163 | 	"KOA_vs_WT",	"0",	"ATAC-seq_WT_r2",	"WT"
164 | 	"KOB_vs_WT",	"1",	"ATAC-seq_KOB_r1",	"KO_B"
165 | 	"KOB_vs_WT",	"1",	"ATAC-seq_KOB_r2",	"KO_B"
166 | 	"KOB_vs_WT",	"0",	"ATAC-seq_WT_r1",	"WT"
167 | 	"KOB_vs_WT",	"0",	"ATAC-seq_WT_r2",	"WT"
168 | 
169 | 
170 | Each row is reserved for a given sample. Samples of the same group (typically 
171 | replicates) should have the same value of "sample_group" and same 
172 | "comparison_side". The group of interest (comparison foreground) should have a 
173 | value of 1 as "comparison_side" and the background a value of 0. Finally, the 
174 | comparison will be labeled with the value of "comparison_name", which should 
175 | be constant for all samples in both foreground and background groups.
176 | 
177 | 
178 | For an all-vs-all group comparison, I recommend labeling all background sample groups as a new group in the following manner:
179 | 
180 | .. csv-table:: "All-vs-all" example of comparison table
181 |    :header: "comparison_name", "comparison_side", "sample_name", "sample_group"
182 |    :widths: 30, 30, 30, 30
183 | 
184 | 	"celltypeA",	"1",	"ATAC-seq_celltypeA_r1",	"ct_A"
185 | 	"celltypeA",	"1",	"ATAC-seq_celltypeA_r2",	"ct_A"
186 | 	"celltypeA",	"0",	"ATAC-seq_celltypeB_r1",	"ct_A_background"
187 | 	"celltypeA",	"0",	"ATAC-seq_celltypeB_r2",	"ct_A_background"
188 | 	"celltypeA",	"0",	"ATAC-seq_celltypeC_r1",	"ct_A_background"
189 | 	"celltypeA",	"0",	"ATAC-seq_celltypeC_r2",	"ct_A_background"
190 | 	"celltypeB",	"1",	"ATAC-seq_celltypeB_r1",	"ct_B"
191 | 	"celltypeB",	"1",	"ATAC-seq_celltypeB_r2",	"ct_B"
192 | 	"celltypeB",	"0",	"ATAC-seq_celltypeA_r1",	"ct_B_background"
193 | 	"celltypeB",	"0",	"ATAC-seq_celltypeA_r2",	"ct_B_background"
194 | 	"celltypeB",	"0",	"ATAC-seq_celltypeC_r1",	"ct_B_background"
195 | 	"celltypeB",	"0",	"ATAC-seq_celltypeC_r2",	"ct_B_background"
196 | 
197 | 
198 | Additional useful columns are `data_type` (to subset comparisons based on type 
199 | of NGS data), `comparison_type` to specify the type of comparison to perform
200 | (e.g. one of 'differential' or 'peaks') and `toggle` for subsetting 
201 | comparisons to perform.
202 | 
203 | 
204 | .. note:: **Hyphens and other symbols in comparison_table**
205 | 	
206 | 	Since differential comparisons are perfomed using DESeq2, R is used 
207 | 	(throught the Python-R interface library rpy2).
208 | 	ngs_toolkit will create the required tables by DESeq2 which includes names 
209 | 	of samples and comparisons as dataframe columns. Unfortunately due to the 
210 | 	way R handles column names, these get changed.
211 | 
212 | 	In the future this will be accounted for but for now avoid using hyphens 
213 | 	and any other symbols as values for sample names or groups.
214 | 
215 | 
216 | .. _LowLevelFunctions:
217 | 
218 | Low-level functions - ``utils``
219 | ===============================
220 | 
221 | Functions from Analysis objects are generally pretty high level functions, 
222 | often performing several tasks by calling other more general-purpose 
223 | functions. However, one of the concepts I really wanted to have is that the 
224 | user retains as much control as they wish.
225 | 
226 | They may choose to use the high level functions which generally provide 
227 | sensible defaults, or retain more control and build their analysis pipeline 
228 | from the lower level helper functions.
229 | 
230 | One example: calling ``ATACSeqAnalysis.normalize()`` will by default run 3-4
231 | other functions to return a quantile normalized, GC-corrected, log-transformed
232 | output - a fairly complex normalization procedure but made simple by providing 
233 | sensible defaults.
234 | 
235 | A user may easily change the procedure by choosing one of the ~4 types of
236 | normalization using keyword arguments or implement an alternative method which 
237 | can be plugged in to the next step of the analysis.
238 | 
239 | In the future the low level functions will be moved to `ngs_toolkit.utils` and 
240 | the data type-specific modules will have only classes and functions specific 
241 | to those data which are usually more high-level.
242 | 


--------------------------------------------------------------------------------
/docs/source/distributed.rst:
--------------------------------------------------------------------------------
 1 | Distributed computing
 2 | =============================
 3 | 
 4 | divvy
 5 | -----------------------------
 6 | 
 7 | Certain functions in the ``ngs_toolkit`` toolkit can make use of distributed
 8 | computing. To achieve this for a variety of computing configurations
 9 | it uses the `divvy library <http://divvy.databio.org/>`_.
10 | 
11 | Divvy provides an abstract way of submitting a job to various job managers by
12 | shipping job templates for each configuration.
13 | 
14 | When ``divvy`` starts, a configuration is chosen (the ``compute_configuration``
15 | attribute) and that template gets filled with the attributes of the job -
16 | the code to be executed, the resouce requirements and others
17 | (e.g. "cores", "mem", "time" attributes).
18 | 
19 | To see all supported compute configurations run:
20 | 
21 | .. code-block:: bash
22 | 
23 |     divvy list
24 | 
25 | For more information on how to configure ``divvy``, see its documentation:
26 | http://divvy.databio.org/
27 | 
28 | To let ``ngs_toolkit`` know which ``divvy`` configuration to use by default,
29 | modify the following section in the ``ngs_toolkit`` configuration file:
30 | 
31 | .. code-block:: yaml
32 | 
33 |     preferences:
34 |       # The next item is the default computing configuration to use from divvy.
35 |       # Run "divvy list" to see all options.
36 |       # See more here: http://code.databio.org/divvy/
37 |       computing_configuration: 'slurm'
38 | 
39 | This will make ``ngs_toolkit`` send jobs to a slurm cluster if wanted.
40 | 
41 | All functions that allow running a task in a distributed manner have a
42 | ``distributed`` keyword argument.
43 | 
44 | In addition, they also accept additional keyword arguments (`kwargs` in the
45 | function signature) where additional options can be passed.
46 | These options must match fields available to format of the currently selected
47 | ``compute_configuration``. 
48 | 
49 | Sending jobs and collecting output
50 | ----------------------------------
51 | 
52 | Performing a taks in a distributed manner can therefore be as simple as calling 
53 | the desired function with ``distributed=True``. Jobs will be sent to the
54 | job manager of the chosen computing configuration.
55 | 
56 | However, since the jobs are often run individually for a sample/group of samples,
57 | functions called with ``distributed=True`` may not return the same output as
58 | ``distributed=False``.
59 | 
60 | For that reason, for all such functions, there is a reciprocal function of
61 | identical name as the first prefixed with ``collect``.
62 | 
63 | .. code-block:: python
64 | 
65 |     from ngs_toolkit.demo import generate_project
66 |     an = generate_project(sample_input_files=True)
67 |     an.measure_coverage(distributed=True)
68 |     coverage = collect_coverage()
69 | 
70 | Implementing automatic collection of job outputs in part of future plans.
71 | 
72 | Example
73 | -----------------------------
74 | 
75 | The :func:`ngs_toolkit.atacseq.ATACSeqAnalysis.measure_coverage` function has
76 | ``distributed`` and ``kwargs`` options.
77 | 
78 | This provides code portability and allows customization of various aspects of
79 | the jobs:
80 | 
81 | .. code-block:: python
82 | 
83 |     from ngs_toolkit.demo import generate_project
84 |     an = generate_project(sample_input_files=True)
85 |     # in serial
86 |     cov1 = an.measure_coverage()
87 |     # as slurm jobs (because the config computing_configuration is set to 'slurm')
88 |     an.measure_coverage(distributed=True)
89 |     cov2 = collect_coverage()
90 |     # confirm the output is the same
91 |     assert (cov2 == cov1).all().all()
92 | 
93 | .. code-block:: python
94 | 
95 |     # as slurm jobs to a particular queue and more memory
96 |     an.measure_coverage(distributed=True, partition="longq", mem=24000)
97 |     # here 'partition' and 'mem' are attributes of the slurm divvy template
98 |     # and not magic attributes
99 | 


--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
  1 | Examples
  2 | ******************************
  3 | 
  4 | 
  5 | Analysis example
  6 | ==============================
  7 | 
  8 | The following is an example of how to use ``ngs_toolkit`` in a ATAC-seq project.
  9 | While straightforward, it still allows considerable customization due to the modularity of the toolkit and the parametrization of most functions (this example uses default values everywhere nonetheless).
 10 | 
 11 | .. note::
 12 |     ``ngs_toolkit`` from version 0.25.0 on uses the `PEP 2.0 specification <http://pep.databio.org/en/2.0.0/specification/>`_. If you have a PEP made in an earlier version, you must update it in order to use ``ngs_toolkit``>=0.25.0.
 13 | 
 14 | 
 15 | We have the following `PEP project <http://pep.databio.org/>`_ config YAML file:
 16 | 
 17 | .. code-block:: yaml
 18 | 
 19 |     pep_version: "2.0.0"
 20 |     name: example_project
 21 |     description: example_project
 22 |     username: user
 23 |     email: user@email.com
 24 | 
 25 |     sample_table: annotation.csv
 26 |     subsample_table:
 27 |     comparison_table: comparison_table.csv
 28 | 
 29 |     submission_subdir: submission
 30 |     results_subdir: data
 31 |     output_dir: example_project
 32 | 
 33 |     pipeline_interfaces: /home/user/workspace/open_pipelines/pipeline_interface.yaml
 34 | 
 35 |     sample_attributes:
 36 |       - sample_name
 37 |       - genotype
 38 |       - replicate
 39 |     group_attributes:
 40 |       - genotype
 41 |       - replicate
 42 |     sample_modifiers:
 43 |         imply:
 44 |             - if:
 45 |                 organism: 'human'
 46 |               then:
 47 |                 genome: 'hg38'
 48 |         derive:
 49 |             attributes: [data_source]
 50 |             sources:
 51 |                 local: data/{sample_name}.bam
 52 |                 bsf: /scratch/lab_bsf/samples/{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam
 53 | 
 54 | 
 55 | 
 56 | The following sample annotation CSV file, 'annotation.csv':
 57 | 
 58 | .. csv-table:: Annotation table for example
 59 |    :header: "sample_name", "protocol", "genotype", "replicate", "organism", flowcell, lane
 60 | 
 61 |     "ATAC-seq_KOA_r1",  "ATAC-seq", KO_A",   "1",   "human", "C0AXX",   "1"
 62 |     "ATAC-seq_KOA_r2",  "ATAC-seq", KO_A",   "2",   "human", "C0AXX",   "1"
 63 |     "ATAC-seq_KOB_r1",  "ATAC-seq", KO_B",   "1",   "human", "C0AXX",   "1"
 64 |     "ATAC-seq_KOB_r2",  "ATAC-seq", KO_B",   "2",   "human", "C0AXX",   "1"
 65 |     "ATAC-seq_WT_r1",   "ATAC-seq", WT",     "1",   "human", "C0AXX",   "1"
 66 |     "ATAC-seq_WT_r2",   "ATAC-seq", WT",     "2",   "human", "C0AXX",   "1"
 67 | 
 68 | 
 69 | And the following comparison table, 'comparison_table.csv':
 70 | 
 71 | .. csv-table:: Comparison table for example
 72 |    :header: "comparison_name", "comparison_side", "sample_name", "sample_group"
 73 |    :widths: 30, 30, 30, 30
 74 | 
 75 |     "KOA_vs_WT",    "1",    "ATAC-seq_KOA_r1",  "KO_A"
 76 |     "KOA_vs_WT",    "1",    "ATAC-seq_KOA_r2",  "KO_A"
 77 |     "KOA_vs_WT",    "0",    "ATAC-seq_WT_r1",   "WT"
 78 |     "KOA_vs_WT",    "0",    "ATAC-seq_WT_r2",   "WT"
 79 |     "KOB_vs_WT",    "1",    "ATAC-seq_KOB_r1",  "KO_B"
 80 |     "KOB_vs_WT",    "1",    "ATAC-seq_KOB_r2",  "KO_B"
 81 |     "KOB_vs_WT",    "0",    "ATAC-seq_WT_r1",   "WT"
 82 |     "KOB_vs_WT",    "0",    "ATAC-seq_WT_r2",   "WT"
 83 | 
 84 | 
 85 | 
 86 | ATAC-seq analysis example
 87 | -------------------------------
 88 | 
 89 | .. code-block:: python
 90 | 
 91 |     import os
 92 |     from ngs_toolkit.atacseq import ATACSeqAnalysis
 93 | 
 94 |     # Start project and analysis objects
 95 |     analysis = ATACSeqAnalysis(from_pep="project_config.yaml")
 96 | 
 97 |     # Generate consensus peak set and annotate it
 98 |     ## get consensus peak set from all samples
 99 |     analysis.get_consensus_sites()
100 |     ## annotate peak set with genomic context
101 |     analysis.get_peak_genomic_location()
102 |     ## annotate peak set with chromatin context
103 |     analysis.get_peak_chromatin_state(
104 |         os.path.join(
105 |             analysis.data_dir,
106 |             "external",
107 |             "E032_15_coreMarks_mnemonics.bed"))
108 |     ## annotate peak set with genes
109 |     analysis.get_peak_gene_annotation()
110 | 
111 |     # Use accessibility quantitatively
112 |     ## get coverage values for each peak in each sample of ATAC-seq
113 |     analysis.measure_coverage()
114 | 
115 |     # Normalize accessibility (quantile normalization + GC correction, requires cqn R library)
116 |     analysis.normalize(method="cqn")
117 | 
118 |     # Annotate normalized accessibility with sample and region info
119 |     # # annotate dataframe with peak metadata
120 |     analysis.annotate_features()
121 |     # # annotate dataframe with sample metadata
122 |     analysis.accessibility = analysis.annotate_samples()
123 | 
124 |     # UNSUPERVISED ANALYSIS
125 |     # # plot pairwise sample correlations,
126 |     # # perform dimensionality reduction (MDS, PCA)
127 |     # # and plot samples in this spaces, annotated with their attributes
128 |     analysis.unsupervised_analysis()
129 | 
130 | 
131 |     # SUPERVISED ANALYSIS
132 |     # # differential analysis with DESeq2
133 |     analysis.differential_analysis()
134 | 
135 |     # # plot scatter, volcano, MA, heatmaps on the differential regions
136 |     # # by groups and with individual samples, with normalized values
137 |     # # and scalled values (Z-score).
138 |     analysis.plot_differential(
139 |         alpha=0.05,
140 |         corrected_p_value=True,
141 |         fold_change=1)
142 | 
143 |     # # perform enrichment analysis on differnetial region sets
144 |     # # using LOLA, MEME-AME, HOMER and Enrichr
145 |     analysis.differential_enrichment(
146 |         directional=True,
147 |         max_diff=1000,
148 |         sort_var="pvalue")
149 | 
150 |     # # for each type of enrichment results,
151 |     # # plot bar and scatter plots of odds ratio vs p-value,
152 |     # # heatmaps of enrichment across terms for each comparison
153 |     # # and comparison correlation in enrichment terms
154 |     analysis.plot_differential_enrichment()
155 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome
 2 | ^^^^^^^^
 3 | 
 4 | ``ngs_toolkit`` is a Python library for the analysis of NGS data.
 5 | 
 6 | Its goals are to provide a highly customizable set of objects and tools that
 7 | interact with each other to create data processing and analysis workflows in
 8 | both a interactive and scripted way.
 9 | 
10 | ``ngs-toolkit`` is unique in the following aspects:
11 | 
12 |  - Includes tried-and-tested (and published) workflows for end-to-end analysis of NGS data, while at the same time allowing high customization;
13 |  - Tailored for well-established NGS data types, but supporting arbitrary data types;
14 |  - Its target audience are mid-level computational biologists who want to "get it done" and focus on interpretation of results. At the same time, it allows running workflows with minimal programming experience.
15 | 
16 | ``ngs-toolkit`` is reaching maturity, with a stable API (from version 0.14.0 on),
17 | improving documentation and increasing test coverage.
18 | 
19 | Head to the :doc:`Installation <install>` to see installation instructions, to
20 | :doc:`Usage <usage>` for quick use, or have a look at the catalogue of available
21 | functions in the :doc:`API <api>`.
22 | 
23 | Contents
24 | ^^^^^^^^
25 | 
26 | .. toctree::
27 |     :maxdepth: 1
28 | 
29 |     install
30 |     usage
31 |     examples
32 |     concepts
33 |     log_config
34 |     report
35 |     distributed
36 |     manager_programs
37 |     recipes
38 |     api
39 |     test
40 |     changelog
41 | 
42 | 
43 | Indices and tables
44 | ==================
45 | 
46 | * :ref:`genindex`
47 | * :ref:`modindex`
48 | * :ref:`search`
49 | 
50 | Links
51 | ^^^^^^^^
52 | 
53 |  * Documentation: http://toolkit.readthedocs.io/
54 |  * Issues and source code: https://github.com/afrendeiro/toolkit
55 | 


--------------------------------------------------------------------------------
/docs/source/install.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 3 | 
 4 | With pip
 5 | =============================
 6 | 
 7 | ``ngs_toolkit`` is available for Python 3 only. Is is tested in Python 3.6 and 3.7.
 8 | 
 9 | To install, simply do:
10 | 
11 | .. code-block:: bash
12 | 
13 |    pip install ngs-toolkit
14 | 
15 | you might need to add a ``--user`` flag if not root or running in a virtual environment.
16 | 
17 | This will install all the Python dependencies needed too.
18 | See `here <https://github.com/afrendeiro/toolkit/blob/master/requirements/requirements.txt>`_ a list of all Python dependencies used.
19 | 
20 | To install the latest development version:
21 | 
22 | .. code-block:: bash
23 | 
24 |    pip install git+https://github.com/afrendeiro/toolkit.git#egg=ngs-toolkit
25 | 
26 | 
27 | Using a conda environment
28 | =============================
29 | 
30 | Get the `latest Python 3 installation of miniconda from the conda website <https://docs.conda.io/en/latest/miniconda.html>`_ and follow the instructions for installation and activation of the environment.
31 | 
32 | Setup the bioconda channel:
33 | 
34 | .. code-block:: bash
35 | 
36 | 	conda config --add channels defaults
37 | 	conda config --add channels bioconda
38 | 	conda config --add channels conda-forge
39 | 
40 | Install non-Python dependencies:
41 | 
42 | .. code-block:: bash
43 | 
44 |    conda install -y bedtools==2.27.1
45 |    conda install -y ucsc-twobittofa
46 |    conda install -y bioconductor-deseq2
47 |    conda install -y bioconductor-cqn
48 | 
49 | And then install the ``ngs-toolkit`` library with pip (available only through PyPi).
50 | 
51 | .. code-block:: bash
52 | 
53 | 	pip install ngs-toolkit
54 | 
55 | 
56 | Non-Python requirements
57 | =============================
58 | 
59 | 
60 | ``ngs_toolkit`` makes use of some non-Python dependencies.
61 | 
62 |  - `bedtools <https://bedtools.readthedocs.io/en/latest/>`_: version should be at least 2.27.1
63 | 
64 | The following are highly recommended only for some data or analysis types:
65 | 
66 |  - `R <https://www.r-project.org/>`_ and some bioconductor libraries (optional):
67 |    - `DESeq2 <https://bioconductor.org/packages/release/bioc/html/DESeq2.html>`_ (optional): used for differential testing of genes/regulatory elements and variance stabilization transformation of data.
68 |    - `cqn <https://bioconductor.org/packages/release/bioc/html/cqn.html>`_ (optional): used for GC-content aware normalization of NGS data.
69 |  - `Kent tools <https://github.com/ENCODE-DCC/kentUtils>`_ (optional):
70 |    - the ``twoBitToFa`` binary from UCSC's Kent bioinformatics toolkit is used to convert between the 2bit and FASTA formats.
71 | 
72 | For region-based enrichment analysis, you may also want to have the following software installed (entirely optional):
73 | 
74 |  - `MEME suite <http://meme-suite.org/>`_
75 |  - `HOMER motif analysis <http://homer.ucsd.edu/homer/motif/>`_
76 |  - `LOLA R package <http://code.databio.org/LOLA/>`_
77 | 
78 | You can see how to install all requirements in an Ubuntu-based system in the provided `Dockerfile <https://github.com/afrendeiro/toolkit/blob/master/Dockerfile>`_.
79 | 
80 | 
81 | Docker
82 | =============================
83 | 
84 | A Docker image containing ``ngs_toolkit`` and its dependencies is also available: https://hub.docker.com/r/afrendeiro/ngs-toolkit
85 | 
86 | To pull the image and run a module for example in this way:
87 | 
88 | .. code-block:: bash
89 | 
90 |    docker pull afrendeiro/ngs-toolkit
91 |    docker run ngs-toolkit python3 -m ngs_toolkit.recipes.ngs_analysis --help
92 | 
93 | You can also run an interactive session of ``ngs_toolkit`` `based on the docker image on Gitpod <https://gitpod.io/#https://github.com/afrendeiro/toolkit>`_.
94 | 
95 | The Dockerfile that produced the image is available in the github repository: https://github.com/afrendeiro/toolkit/blob/master/Dockerfile
96 | 


--------------------------------------------------------------------------------
/docs/source/log_config.rst:
--------------------------------------------------------------------------------
 1 | Configuration, logging and versioning
 2 | *************************************
 3 | 
 4 | .. _Configuration:
 5 | 
 6 | 
 7 | Configuration
 8 | =============================
 9 | 
10 | ``ngs_toolkit`` uses a YAML configuration file.
11 | 
12 | While entirely optional, this allows the user to specify preferences, patterns and allows usage across different computing environments.
13 | 
14 | The user can provide its own configuration in two ways:
15 | 
16 | * In a YAML file located in ``$HOME/.ngs_toolkit.config.yaml``;
17 | * A user provided file given during interactive runtime passed to ``ngs_toolkit.setup_config()``.
18 | 
19 | If more than one is given values in the configuration files will be updated in the following order:
20 | 
21 | 1. A minimal configuration file from the package data;
22 | 2. The user provided file in ``$HOME/.ngs_toolkit.config.yaml``;
23 | 3. The user provided file passed to ``ngs_toolkit.setup_config()``.
24 | 
25 | To see how to structure the YAML file, see section below.
26 | 
27 | 
28 | 
29 | Example configuration files
30 | -----------------------------
31 | 
32 | To see all available configuration fields have a look at the default configuration file: https://github.com/afrendeiro/toolkit/blob/master/ngs_toolkit/config/default.yaml#L1
33 | 
34 | For a full example of a fully configured file have a look at the example configuration file: https://github.com/afrendeiro/toolkit/blob/master/ngs_toolkit/config/example.yaml#L1
35 | 
36 | However, the configuration file does not need to include all fields. Below is a minimal example of a configuration file.
37 | 
38 | .. code-block:: yaml
39 | 
40 |     username: user
41 |     email: user@mail.com
42 |     website_root: userwebsite.web.com
43 |     preferences:
44 |       # For the next item, environment variables are formatted if they are of the form ${VAR}
45 |       root_reference_dir: ${USER}/reference_data
46 |       root_projects_dir: ${USER}/projects
47 |       default_genome_assemblies:
48 |         - human: hg38
49 |         - mouse: mm10
50 |       # Below is the name of the divvy package configuration (http://divvy.databio.org/en/latest/)
51 |       computing_configuration: 'slurm'
52 |     sample_input_files:
53 |       ATAC-seq:
54 |         aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam"
55 |         peaks: "{data_dir}/{sample_name}/peaks/{sample_name}_peaks.narrowPeak"
56 |         summits: "{data_dir}/{sample_name}/peaks/{sample_name}_summits.narrowPeak"
57 |       ChIP-seq:
58 |         aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam"
59 |       CNV:
60 |         log2_read_counts: "{data_dir}/{sample_name}/{sample_name}_{resolution}/CNAprofiles/log2_read_counts.igv"
61 |       RNA-seq:
62 |         aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam"
63 |         bitseq_counts: "{data_dir}/{sample_name}/quantification/{sample_name}_bitseq.tsv"
64 | 
65 | 
66 | .. note:: `Not all elements are required`
67 |     
68 |     In fact none of it is required, but it is recommended to have a look at the template configuration file and set custom options.
69 | 
70 | .. _Logging:
71 | 
72 | Logging
73 | =============================
74 | 
75 | ``ngs_toolkit`` will log its operations and errors using the Python standard logging library.
76 | 
77 | This will happen by default to standard output (sys.stdout) but also to a file in ``$HOME/.ngs_toolkit.log.txt``.
78 | 
79 | The location of the log file and the level of events to be reported can be customized in the ``ngs_toolkit.setup_logger()`` function.
80 | 
81 | 
82 | .. _Versioning:
83 | 
84 | Versioning
85 | =============================
86 | 
87 | ``ngs_toolkit`` will by default timestamp every output it produces (CSV and figure files).
88 | 
89 | This behaviour can be controlled independently for tables and figures by setting the respective values of the configuration file:
90 | 
91 | .. code-block:: yaml
92 | 
93 |     preferences:
94 |       report:
95 |         timestamp_figures: False
96 |         timestamp_tables: False
97 | 


--------------------------------------------------------------------------------
/docs/source/manager_programs.rst:
--------------------------------------------------------------------------------
  1 | Manager programs 
  2 | ******************************
  3 | 
  4 | `ngs_toolkit` comes with two programs that provide a command line interface (CLI):
  5 |  - ``projectmanager`` handles the creation and execution of a `looper` project, providing sensible configuration templates and git-enabled tracking of changes.
  6 |  - ``trackmanager`` handles the creation of a UCSC trackhub or IGV link for ATAC/ChIP-seq data based on bigWig files created by ``looper`` pipelines.
  7 | 
  8 | 
  9 | Here you can see the command-line usage instructions for the main looper command and for each subcommand:
 10 | 
 11 | 
 12 | projectmanager
 13 | =============================
 14 | 
 15 | .. code-block:: none
 16 | 
 17 | 	usage: projectmanager [-h] {create,recipe} ...
 18 | 
 19 | 	projectmanager - A project manager.
 20 | 
 21 | 	positional arguments:
 22 | 	  {create,recipe}
 23 | 	    create         Create project.
 24 | 	    recipe         Run ngs_toolkit recipe for a given project.
 25 | 
 26 | 	optional arguments:
 27 | 	  -h, --help       show this help message and exit
 28 | 
 29 | 	https://github.com/afrendeiro/toolkit
 30 | 
 31 | 
 32 | 
 33 | projectmanager::create
 34 | -----------------------------
 35 | 
 36 | .. code-block:: none
 37 | 
 38 | 	usage: projectmanager create [-h] [-r ROOT_DIR] [-d] [--overwrite]
 39 | 	                             project_name
 40 | 
 41 | 	Create project.
 42 | 
 43 | 	positional arguments:
 44 | 	  project_name          Project name.
 45 | 
 46 | 	optional arguments:
 47 | 	  -h, --help            show this help message and exit
 48 | 	  -r ROOT_DIR, --root-dir ROOT_DIR
 49 | 	                        Root directory to create projects.
 50 | 	  -d, --dry-run         Don't actually do anything.
 51 | 	  --overwrite           Don't overwrite any existing directory or file.
 52 | 
 53 | 
 54 | projectmanager::recipe
 55 | -----------------------------
 56 | 
 57 | .. code-block:: none
 58 | 
 59 | 	usage: projectmanager recipe [-h] recipe_name project_config
 60 | 
 61 | 	Run recipe.
 62 | 
 63 | 	positional arguments:
 64 | 	  recipe_name     Recipe name.
 65 | 	  project_config  Project config.
 66 | 
 67 | 	optional arguments:
 68 | 	  -h, --help      show this help message and exit
 69 | 
 70 | 
 71 | trackmanager
 72 | =============================
 73 | 
 74 | .. code-block:: none
 75 | 
 76 | 	usage: trackmanager [-h] [-a [ATTRIBUTES]] [-c COLOR_ATTRIBUTE] [-r] [-l]
 77 | 	                    project_config_file
 78 | 	
 79 | 	positional arguments:
 80 | 	  project_config_file
 81 | 	
 82 | 	optional arguments:
 83 | 	  -h, --help            show this help message and exit
 84 | 	  -a [ATTRIBUTES], --attrs [ATTRIBUTES]
 85 | 	                        Sample attributes (annotation sheet columns) to use to
 86 | 	                        order tracks. Add attributes comma-separated with no
 87 | 	                        whitespace.
 88 | 	  -c COLOR_ATTRIBUTE, --color-attr COLOR_ATTRIBUTE
 89 | 	                        Sample attribute to use to color tracks with. Default
 90 | 	                        is first attribute passed.
 91 | 	  -r, --overlay-replicates
 92 | 	                        Whether replicate samples should be overlaied in same
 93 | 	                        track. Default=False.
 94 | 	  -l, --link            Whether bigWig files should be soft-linked to the
 95 | 	                        track database directory. Default=False.
 96 | 
 97 | 
 98 | .. note:: `Copying vs linking bigWig files files in trackmanager`
 99 | 	
100 | 	The intention of trackmanager is to create a hierarchy of files in a HTTP server which can be used by genome browsers.
101 | 	This requires files (and their parent directories) to be readable and executable.
102 | 	When soft-linking files, they will retain the permission attributes of the original files and this may not be appropriate to serve through a server.
103 | 	Be aware that copying or linking these files does not always works (manual movement of files might be required).
104 | 
105 | 
106 | .. note:: `Changing permissions of files and directories in bigwig directory`
107 | 	
108 | 	Trackmanager will try to change the permissions of the bigwig files and their parent directories to allow reading and execution by everyone.
109 | 	Be aware that this does not always works (manual permission changes might be required).
110 | 
111 | 


--------------------------------------------------------------------------------
/docs/source/recipes.rst:
--------------------------------------------------------------------------------
 1 | Recipes
 2 | ===========
 3 | 
 4 | 
 5 | ``ngs_toolkit`` provides scripts to perform routine tasks on NGS data -
 6 | they are called ``recipes``.
 7 | 
 8 | Recipes are distributed with ``ngs_toolkit`` and can be seen in the
 9 | `github repository <https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/recipes>`_.
10 | 
11 | To make it convenient to run the scripts on data from a project,
12 | recipes can also be run with the command
13 | ``projectmanager recipe <recipe_name> <project_config.yaml>``.
14 | 
15 | 
16 | ngs_toolkit.recipes.ngs_analysis
17 | --------------------------------
18 | 
19 | .. argparse::
20 |    :module: ngs_toolkit.recipes.ngs_analysis
21 |    :func: parse_arguments
22 | 
23 | 
24 | ngs_toolkit.recipes.call_peaks
25 | ------------------------------
26 | 
27 | .. argparse::
28 |    :module: ngs_toolkit.recipes.call_peaks
29 |    :func: parse_arguments
30 | 
31 | 
32 | ngs_toolkit.recipes.coverage
33 | ----------------------------
34 | 
35 | .. argparse::
36 |    :module: ngs_toolkit.recipes.coverage
37 |    :func: parse_arguments
38 | 
39 | 
40 | ngs_toolkit.recipes.deseq2
41 | --------------------------
42 | 
43 | .. argparse::
44 |    :module: ngs_toolkit.recipes.deseq2
45 |    :func: parse_arguments
46 | 
47 | 
48 | ngs_toolkit.recipes.enrichr
49 | ---------------------------
50 | 
51 | .. argparse::
52 |    :module: ngs_toolkit.recipes.enrichr
53 |    :func: parse_arguments
54 | 
55 | 
56 | ngs_toolkit.recipes.generate_project
57 | ------------------------------------
58 | 
59 | .. argparse::
60 |    :module: ngs_toolkit.recipes.generate_project
61 |    :func: parse_arguments
62 | 
63 | 
64 | ngs_toolkit.recipes.lola
65 | ------------------------
66 | 
67 | .. argparse::
68 |    :module: ngs_toolkit.recipes.lola
69 |    :func: parse_arguments
70 | 
71 | 
72 | ngs_toolkit.recipes.merge_signal
73 | --------------------------------
74 | 
75 | .. argparse::
76 |    :module: ngs_toolkit.recipes.merge_signal
77 |    :func: parse_arguments
78 | 
79 | 
80 | ngs_toolkit.recipes.region_enrichment
81 | -------------------------------------
82 | 
83 | .. argparse::
84 |    :module: ngs_toolkit.recipes.region_enrichment
85 |    :func: parse_arguments
86 | 
87 | 
88 | ngs_toolkit.recipes.region_set_frip
89 | -----------------------------------
90 | 
91 | .. argparse::
92 |    :module: ngs_toolkit.recipes.region_set_frip
93 |    :func: parse_arguments
94 | 
95 | 


--------------------------------------------------------------------------------
/docs/source/report.rst:
--------------------------------------------------------------------------------
 1 | Analysis reports
 2 | ******************************
 3 | 
 4 | .. _Report:
 5 | 
 6 | Each analysis object in the ``ngs_toolkit`` will by default record the outputs it produces (e.g. tables, figures).
 7 | This allows the collection of all outputs in a standardized way and the generation of an HTML report.
 8 | 
 9 | By default the location of the report will be in: ``<root_directory>/<analysis_name>.analysis_report.html``
10 | 
11 | Every time a new output is produced, a new report is generated, in a way that analysis progress can be easily monitored in a user-friendly way by simply refreshing the HTML report file. This continuous generation behaviour can be controlled in the configuration file.
12 | 
13 | The recording behaviour can also be controlled independently for tables and figures by setting the respective values of the configuration file:
14 | 
15 | .. code-block:: yaml
16 | 
17 |     preferences:
18 |       report:
19 |         record_figures: True
20 |         record_csv: True
21 |         continuous_generation: True
22 | 
23 | The report will by default be generated in the root of the project directory, but this can be controlled by manually calling the :func:`ngs_toolkit.analysis.Analysis.generate_report` function at the user's will.
24 | 


--------------------------------------------------------------------------------
/docs/source/test.rst:
--------------------------------------------------------------------------------
 1 | Testing
 2 | =============================
 3 | 
 4 | To make sure everything is correctly configured, the user is encouraged to test the library prior to use.
 5 | 
 6 | In order to do this, install testing requirements and simply run ``pytest``:
 7 | 
 8 | .. code-block:: bash
 9 | 
10 |    pip install ngs-toolkit[testing]
11 |    pytest --pyargs ngs_toolkit
12 | 
13 | 
14 | Pytest will output summary results (`see for example <https://travis-ci.org/afrendeiro/toolkit/jobs/580167563>`_) and further outputs can be seen in ``${TMPDIR}/pytest-of-${USER}/`` or ``/tmp/pytest-of-${USER}/`` if $TMPDIR is not defined.
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/source/usage.rst:
--------------------------------------------------------------------------------
 1 | Quick usage
 2 | =============================
 3 | 
 4 | 
 5 | Interactive usage through the API
 6 | ---------------------------------
 7 | 
 8 | To use a particular class or function from the toolkit, simply import it
 9 | following the structure of the library:
10 | 
11 | .. code-block:: python
12 | 
13 |     from ngs_toolkit import ATACSeqAnalysis
14 |     from ngs_toolkit.utils import log_p_values
15 | 
16 | The :class:`ngs_toolkit.analysis.Analysis` and their data type-specific
17 | children are the main drivers of the workflow, storing attributes and providing
18 | various methods through an OOP interface:
19 | 
20 | .. code-block:: python
21 | 
22 |     from ngs_toolkit.demo import generate_project
23 | 
24 |     an = generate_project(data_type="ATAC-seq", sample_input_files=True)
25 |     an.measure_coverage()
26 |     an.normalize()
27 |     an.unsupervised_analysis()
28 |     an.differential_analysis()
29 |     an.plot_differential()
30 |     an.get_peak_gene_annotation()
31 |     an.annotate_features()
32 |     an.differential_enrichment(steps=['enrichr'])
33 |     an.plot_differential_enrichment()
34 | 
35 | 
36 | Running recipes through the command-line interface
37 | --------------------------------------------------
38 | 
39 | ``ngs_toolkit`` also has some command-line programs on some commonly used
40 | workflows (here called ``recipes``), which can be run in the following manner:
41 | 
42 | .. code-block:: bash
43 | 
44 |     PEP=`python -m ngs_toolkit.recipes.generate_project --sample-input-files True`
45 |     python -m ngs_toolkit.recipes.ngs_analysis $PEP
46 | 
47 | This example is roughly equivalent to the on above with interactive usage.
48 | 


--------------------------------------------------------------------------------
/ngs_toolkit/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/source/conf.py
11 | 
12 | # Optionally build your docs in additional formats such as PDF and ePub
13 | formats:
14 |   - htmlzip
15 | 
16 | # Optionally set the version of Python and requirements required to build your docs
17 | python:
18 |   version: 3.7
19 |   install:
20 |     - requirements: requirements/requirements.docs.txt
21 | 


--------------------------------------------------------------------------------
/ngs_toolkit/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | from joblib import Memory
  5 | 
  6 | try:
  7 |     # Even though there is no "ngs_toolkit/_version" file,
  8 |     # it should be generated by
  9 |     # setuptools_scm when building the package
 10 |     from ngs_toolkit._version import __version__
 11 | except ImportError:
 12 |     from setuptools_scm import get_version as _get_version
 13 | 
 14 |     __version__ = _get_version(root="..", relative_to=__file__)
 15 | 
 16 | 
 17 | # Setup joblib memory
 18 | JOBLIB_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".ngs_toolkit")
 19 | MEMORY = Memory(location=JOBLIB_CACHE_DIR, verbose=0)
 20 | 
 21 | 
 22 | def setup_logger(name="ngs_toolkit", level="INFO", logfile=None):
 23 |     """
 24 |     Set up a logger for the library.
 25 | 
 26 |     Parameters
 27 |     ----------
 28 | 
 29 |     level : :obj:`str`, optional
 30 |         Level of logging to display.
 31 |         See possible levels here:
 32 |         https://docs.python.org/2/library/logging.html#levels
 33 | 
 34 |         Defaults to "INFO".
 35 | 
 36 |     logfile : :obj:`str`, optional
 37 |         File to write log to.
 38 | 
 39 |         Defaults to "~/.ngs_toolkit.log.txt".
 40 | 
 41 |     Returns
 42 |     -------
 43 |     :class:`logging.Logger`
 44 |         A logger called "ngs_toolkit".
 45 |     """
 46 |     import logging
 47 |     import os
 48 | 
 49 |     _LOGGER = logging.getLogger(name)
 50 |     _LOGGER.setLevel(logging.DEBUG)
 51 |     # create file handler which logs even debug messages
 52 |     if logfile is None:
 53 |         logfile = os.path.join(os.path.expanduser("~"), "." + name + ".log.txt")
 54 |     fh = logging.FileHandler(logfile)
 55 |     fh.setLevel(logging.DEBUG)
 56 |     # create console handler with a higher log level
 57 |     ch = logging.StreamHandler()
 58 |     ch.setLevel(logging.getLevelName(level))
 59 |     # create formatter and add it to the handlers
 60 |     fmt = "ngs_toolkit.v{}:%(module)s:L%(lineno)d ".format(__version__)
 61 |     fmt += "(%(funcName)s) [%(levelname)s] %(asctime)s > %(message)s"
 62 |     formatter = logging.Formatter(fmt, datefmt="%Y-%m-%d %H:%M:%S")
 63 |     fh.setFormatter(formatter)
 64 |     # fmt = "[%(levelname)s] > %(message)s"
 65 |     fmt = "ngs_toolkit:%(module)s:L%(lineno)d (%(funcName)s) [%(levelname)s]"
 66 |     fmt += " > %(message)s"
 67 |     formatter = logging.Formatter(fmt)
 68 |     ch.setFormatter(formatter)
 69 |     # add the handlers to the logger
 70 |     _LOGGER.addHandler(fh)
 71 |     _LOGGER.addHandler(ch)
 72 | 
 73 |     _LOGGER.debug(
 74 |         "This is ngs_toolkit (http://ngs-toolkit.rtfd.io), version: {}".format(__version__)
 75 |     )
 76 |     return _LOGGER
 77 | 
 78 | 
 79 | def setup_config(custom_yaml_config=None):
 80 |     """
 81 |     Set up global library configuration.
 82 | 
 83 |     It reads ngs_toolkit's package data to load a default configuration,
 84 |     tries to update it by reading a file in ``~/.ngs_toolkit.config.yaml``
 85 |     if present, and lastly, updates it by reading a possible passed yaml file
 86 |     ``custom_yaml_config``.
 87 |     Non-exisiting fields will maintain the previous values, so that the user
 88 |     needs only to specify the section(s) as needed.
 89 | 
 90 |     Parameters
 91 |     ----------
 92 |     custom_yaml_config : :obj:`str`, optional
 93 |         Path to YAML file with configuration.
 94 |         To see the structure of the YAML file, see
 95 |         https://github.com/afrendeiro/toolkit/blob/master/ngs_toolkit/config/default.yaml
 96 | 
 97 |         Defaults to :obj:`None`.
 98 | 
 99 |     Returns
100 |     -------
101 |     :obj:`dict`
102 |         Dictionary with configurations.
103 |     """
104 |     import pkg_resources
105 |     import os
106 |     import yaml
107 |     from ngs_toolkit.utils import _format_string_with_environment_variables
108 | 
109 |     default_config_path = "config/default.yaml"
110 |     default_config_path = pkg_resources.resource_filename(__name__, default_config_path)
111 |     _LOGGER.debug(
112 |         "Reading default configuration file distributed"
113 |         " with package from '{}'.".format(default_config_path)
114 |     )
115 |     try:
116 |         _CONFIG = yaml.safe_load(open(default_config_path, "r"))
117 |         _LOGGER.debug("Default config: {}".format(_CONFIG))
118 |     except IOError:
119 |         _LOGGER.error("Couldn't read configuration file from '{}'.".format(default_config_path))
120 |         _CONFIG = dict()
121 | 
122 |     user_config_path = os.path.join(os.path.expanduser("~"), ".ngs_toolkit.config.yaml")
123 |     if os.path.exists(user_config_path):
124 |         # Read up and format user variables
125 |         _LOGGER.debug("Found custom user config: {}".format(user_config_path))
126 |         try:
127 |             string = open(user_config_path, "r").read()
128 |             # filter out comments (to prevent formating stuff there)
129 |             string = "\n".join(filter(lambda x: not x.strip().startswith("#"), string.split("\n")))
130 |             string = _format_string_with_environment_variables(string)
131 | 
132 |             custom_config = yaml.safe_load(string)
133 |             _LOGGER.debug("Custom user config: {}".format(custom_config))
134 |             # Update config
135 |             _LOGGER.debug(
136 |                 "Updating configuration with custom file from '{}'.".format(user_config_path)
137 |             )
138 |             _CONFIG.update(custom_config)
139 |             _LOGGER.debug("Current config: {}".format(custom_config))
140 |         except IOError:
141 |             _LOGGER.error(
142 |                 "Configuration file from '{}' exists but is not readable."
143 |                 " Ignoring.".format(user_config_path)
144 |             )
145 |     else:
146 |         _LOGGER.debug(
147 |             "To use custom configurations including paths to static files,"
148 |             " create a '{}' file.".format(user_config_path)
149 |         )
150 | 
151 |     if custom_yaml_config is not None:
152 |         # Read up
153 |         try:
154 |             custom_config = yaml.safe_load(open(custom_yaml_config, "r"))
155 |             _LOGGER.debug("Custom passed config: {}".format(custom_config))
156 |             # Update config
157 |             _LOGGER.debug(
158 |                 "Updating configuration with custom file from '{}'.".format(custom_yaml_config)
159 |             )
160 |             _CONFIG.update(custom_config)
161 |             _LOGGER.debug("Current config: {}".format(custom_config))
162 |         except IOError as e:
163 |             _LOGGER.error(
164 |                 "Passed configuration from '{}' exists but is not readable.".format(
165 |                     custom_yaml_config
166 |                 )
167 |             )
168 |             raise e
169 | 
170 |     return _CONFIG
171 | 
172 | 
173 | def setup_graphic_preferences():
174 |     """
175 |     Set up graphic preferences.
176 | 
177 |     It uses the values under "preferences:graphics:matplotlib:rcParams"
178 |     and "preferences:graphics:seaborn:parameters" to matplotlib
179 |     and seaborn respectively.
180 |     """
181 |     import matplotlib
182 |     import seaborn as sns
183 | 
184 |     graphics = _CONFIG["preferences"]["graphics"]
185 |     # matplotlib
186 |     rc_params = graphics["matplotlib"]["rcParams"]
187 |     matplotlib.rcParams.update(rc_params)
188 |     matplotlib.rcParams["svg.fonttype"] = "none"
189 |     matplotlib.rc("text", usetex=False)
190 | 
191 |     # seaborn
192 |     seaborn_params = graphics["seaborn"]["parameters"]
193 |     sns.set(**seaborn_params)
194 | 
195 | 
196 | def clear_log():
197 |     import os
198 | 
199 |     logfile = os.path.join(os.path.expanduser("~"), ".ngs_toolkit.log.txt")
200 |     open(logfile, "w")
201 | 
202 | 
203 | def setup_timestamping():
204 |     if _CONFIG["preferences"]["report"]["record_csv"]:
205 |         from ngs_toolkit.decorators import (
206 |             read_csv_timestamped,
207 |             to_csv_timestamped,
208 |             timestamped_input,
209 |         )
210 |         import pandas as pd
211 | 
212 |         pd.io.parsers.TextFileReader = read_csv_timestamped(pd.io.parsers.TextFileReader)
213 |         pd.DataFrame.to_csv = to_csv_timestamped(
214 |             pd.DataFrame.to_csv, exclude_functions=["from_dataframe"]
215 |         )
216 | 
217 |         os.remove = timestamped_input(os.remove)
218 | 
219 | 
220 | def check_bedtools_version():
221 |     import pybedtools
222 | 
223 |     version = pybedtools.helpers.settings.bedtools_version
224 |     # not existing
225 |     v = ".".join([str(x) for x in version])
226 |     msg = "Bedtools does not seem to be installed or is not in $PATH."
227 |     if v == "":
228 |         _LOGGER.warning(msg)
229 |         return None
230 | 
231 |     # too low version
232 |     msg = "Bedtools version '{}' is smaller than 2.26.".format(v)
233 |     msg += " Please upgrade to newer version."
234 |     if (version[0] < 2) or (version[1] < 26):
235 |         _LOGGER.warning(msg)
236 |         return None
237 |     return v
238 | 
239 | 
240 | # setup
241 | _LOGGER = setup_logger()
242 | _CONFIG = setup_config()
243 | check_bedtools_version()
244 | setup_graphic_preferences()
245 | setup_timestamping()
246 | 
247 | 
248 | # Easier API:
249 | # import all children of Analysis class
250 | from ngs_toolkit.analysis import Analysis
251 | from ngs_toolkit.atacseq import ATACSeqAnalysis
252 | from ngs_toolkit.chipseq import ChIPSeqAnalysis
253 | from ngs_toolkit.cnv import CNVAnalysis
254 | from ngs_toolkit.rnaseq import RNASeqAnalysis
255 | 


--------------------------------------------------------------------------------
/ngs_toolkit/config/default.yaml:
--------------------------------------------------------------------------------
  1 | username:
  2 | email:
  3 | website_root:
  4 | supported_data_types:
  5 |   - ATAC-seq
  6 |   - ChIP-seq
  7 |   - RNA-seq
  8 |   - CNV
  9 | preferences:
 10 |   # For the next item, environment variables are formatted if they are of the form ${VAR}
 11 |   root_reference_dir:
 12 |   root_projects_dir:
 13 |   default_genome_assemblies:
 14 |     - human: hg38
 15 |     - mouse: mm10
 16 |   # The next item is the default computing configuration to use from divvy.
 17 |   # Run "divvy list" to see all options.
 18 |   # See more here: http://code.databio.org/divvy/
 19 |   computing_configuration: 'default'
 20 |   report:
 21 |     record_figures: True
 22 |     record_csv: True
 23 |     continuous_generation: True
 24 |     timestamp_figures: True
 25 |     timestamp_tables: True
 26 |   graphics:
 27 |     matplotlib:
 28 |       backend: TkAgg # Agg
 29 |       # key:values under rcParams are used to update matplotlib.rcParams
 30 |       rcParams:
 31 |         # this ensures text in plots is exported as text objects
 32 |         svg.fonttype: "none"
 33 |     seaborn:
 34 |       # key:values under parameters are passed to seaborn.set
 35 |       parameters:
 36 |         context: "paper"
 37 |         style: "ticks"
 38 |         palette: "colorblind"
 39 |         color_codes: True
 40 |     figure_saving:
 41 |       # these arguments are passed to matplotlib.pyplot.savefig
 42 |       # https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
 43 |       format: svg
 44 |       dpi: 300
 45 |       bbox_inches: "tight"
 46 |     close_saved_figures: True
 47 | 
 48 | sample_input_files:
 49 |   # values in this section can use string formatting
 50 |   # of the form {variable} to be completed with variables from the sample objects
 51 |   # Example:
 52 |   # ATAC-seq:
 53 |   #   aligned_filtered_bam:
 54 |   #     "{data_dir}/{sample_name}/mapped/{sample_name}.bowtie2.filtered.bam"
 55 |   ATAC-seq:
 56 |     aligned_filtered_bam:
 57 |     peaks:
 58 |     summits:
 59 |   ChIP-seq:
 60 |     aligned_filtered_bam:
 61 |   ChIPmentation:
 62 |     aligned_filtered_bam:
 63 |   CNV:
 64 |     log2_read_counts:
 65 |       1000kb:
 66 |       100kb:
 67 |       10kb:
 68 |   RNA-seq:
 69 |     aligned_filtered_bam:
 70 |     counts:
 71 | 
 72 | resources:
 73 |   lola:
 74 |     region_databases:
 75 |       # under each section, there should be a list of items
 76 |       hg19:
 77 |         -
 78 |         -
 79 |       hg38:
 80 |         -
 81 |         -
 82 |       mm10:
 83 |         -
 84 |         -
 85 |     region_set_labeling_columns:
 86 |       - "collection"
 87 |       - "description"
 88 |       - "filename"
 89 |       - "cellType"
 90 |       - "tissue"
 91 |       - "antibody"
 92 |       - "treatment"
 93 |     output_column_names:
 94 |       odds_ratio: "oddsRatio"
 95 |       log_p_value: "pValueLog"
 96 |   meme:
 97 |     motif_databases:
 98 |       human:
 99 |       mouse:
100 |       vertebrate:
101 |     motif_id_mapping:
102 |       mouse:
103 |   enrichr:
104 |     gene_set_libraries:
105 |       # this should be a list of items
106 |       - "GO_Biological_Process_2015"
107 |       - "ChEA_2015"
108 |       - "KEGG_2016"
109 |       - "ESCAPE"
110 |       - "Epigenomics_Roadmap_HM_ChIP-seq"
111 |       - "ENCODE_TF_ChIP-seq_2015"
112 |       - "ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X"
113 |       - "ENCODE_Histone_Modifications_2015"
114 |       - "OMIM_Expanded"
115 |       - "TF-LOF_Expression_from_GEO"
116 |       - "Gene_Perturbations_from_GEO_down"
117 |       - "Gene_Perturbations_from_GEO_up"
118 |       - "Disease_Perturbations_from_GEO_down"
119 |       - "Disease_Perturbations_from_GEO_up"
120 |       - "Drug_Perturbations_from_GEO_down"
121 |       - "Drug_Perturbations_from_GEO_up"
122 |       - "WikiPathways_2016"
123 |       - "Reactome_2016"
124 |       - "BioCarta_2016"
125 |       - "NCI-Nature_2016"
126 |       - "BioPlanet_2019"
127 | 
128 | executables:
129 |   twoBitToFa: twoBitToFa
130 |   fasta-dinucleotide-shuffle: fasta-dinucleotide-shuffle
131 |   ame: ame
132 |   findMotifsGenome.pl: findMotifsGenome.pl
133 |   compareMotifs.pl: compareMotifs.pl
134 | 


--------------------------------------------------------------------------------
/ngs_toolkit/config/example.yaml:
--------------------------------------------------------------------------------
  1 | username: arendeiro
  2 | email: arendeiro@cemm.oeaw.ac.at
  3 | website_root: http://biomedical-sequencing.at/bocklab/arendeiro/
  4 | supported_data_types:
  5 |   - ATAC-seq
  6 |   - ChIP-seq
  7 |   - RNA-seq
  8 |   - CNV
  9 | preferences:
 10 |   # For the next item, environment variables are formatted if they are of the form ${VAR}
 11 |   root_reference_dir: /home/${USER}/reference/
 12 |   root_projects_dir: /home/${USER}/projects/
 13 |   default_genome_assemblies:
 14 |     - human: hg38
 15 |     - mouse: mm10
 16 |   # The next item is the default computing configuration to use from divvy.
 17 |   # Run "divvy list" to see all options.
 18 |   # See more here: http://code.databio.org/divvy/
 19 |   computing_configuration: 'default'
 20 |   report:
 21 |     record_figures: True
 22 |     record_csv: True
 23 |     continuous_generation: True
 24 |     timestamp_figures: True
 25 |     timestamp_tables: True
 26 |   graphics:
 27 |     matplotlib:
 28 |       backend: TkAgg # Agg
 29 |       # key:values under rcParams are used to update matplotlib.rcParams
 30 |       rcParams:
 31 |         # this ensures text in plots is exported as text objects
 32 |         svg.fonttype: "none"
 33 |     seaborn:
 34 |       # key:values under parameters are passed to seaborn.set
 35 |       parameters:
 36 |         context: "paper"
 37 |         style: "white"
 38 |         palette: "colorblind"
 39 |         color_codes: True
 40 |     figure_saving:
 41 |       # these arguments are passed to matplotlib.pyplot.savefig
 42 |       # https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
 43 |       format: svg
 44 |       dpi: 300
 45 |       bbox_inches: "tight"
 46 |     close_saved_figures: True
 47 | 
 48 | sample_input_files:
 49 |   # values in this section can use string formatting
 50 |   # of the form {variable} to be completed with variables from the sample objects
 51 |   # Example:
 52 |   # ATAC-seq:
 53 |   #   aligned_filtered_bam:
 54 |   #     "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam"
 55 |   ATAC-seq:
 56 |     aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam"
 57 |     peaks: "{data_dir}/{sample_name}/peaks/{sample_name}_peaks.narrowPeak"
 58 |     summits: "{data_dir}/{sample_name}/peaks/{sample_name}_summits.bed"
 59 |   ChIP-seq:
 60 |     aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam"
 61 |   ChIPmentation:
 62 |     aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam"
 63 |   CNV:
 64 |     log2_read_counts:
 65 |       1000kb: "{data_dir}/{sample_name}/{sample_name}_1000kb/CNAprofiles/log2_read_counts.igv"
 66 |       100kb: "{data_dir}/{sample_name}/{sample_name}_100kb/CNAprofiles/log2_read_counts.igv"
 67 |       10kb: "{data_dir}/{sample_name}/{sample_name}_10kb/CNAprofiles/log2_read_counts.igv"
 68 |   RNA-seq:
 69 |     aligned_filtered_bam: "{data_dir}/{sample_name}/mapped/{sample_name}.trimmed.bowtie2.filtered.bam"
 70 |     counts: "{data_dir}/{sample_name}/bowtie1_{genome}/bitSeq/{sample_name}.counts"
 71 | 
 72 | resources:
 73 |   lola:
 74 |     region_databases:
 75 |       # under each section, there should be a list of items
 76 |       hg19:
 77 |         - /home/${USER}/resources/regions/LOLACore/hg19/
 78 |         - /home/${USER}/resources/regions/customRegionDB/hg19/
 79 |       hg38:
 80 |         - /home/${USER}/resources/regions/LOLACore/hg38/
 81 |         - /home/${USER}/resources/regions/customRegionDB/hg38/
 82 |       mm10:
 83 |         - /home/${USER}/resources/regions/LOLACore/mm10/
 84 |         - /home/${USER}/resources/regions/customRegionDB/mm10/
 85 |     region_set_labeling_columns:
 86 |       - "collection"
 87 |       - "description"
 88 |       - "filename"
 89 |       - "cellType"
 90 |       - "tissue"
 91 |       - "antibody"
 92 |       - "treatment"
 93 |     output_column_names:
 94 |       odds_ratio: "oddsRatio"
 95 |       log_p_value: "pValueLog"
 96 |   meme:
 97 |     motif_databases:
 98 |       human: /home/${USER}/resources/motifs/motif_databases/HUMAN/HOCOMOCOv10.meme
 99 |       mouse: /home/${USER}/resources/motifs/motif_databases/MOUSE/uniprobe_mouse.meme
100 |       vertebrate: /home/arendeiro/workspace/homer_4.8/data/knownTFs/vertebrates/known.motifs
101 |     motif_id_mapping:
102 |       mouse: /home/${USER}/resources/motifs/motif_databases/MOUSE/uniprobe_mouse.id_mapping.tsv
103 |   enrichr:
104 |     gene_set_libraries:
105 |       # this should be a list of items
106 |       - "GO_Biological_Process_2015"
107 |       - "ChEA_2015"
108 |       - "KEGG_2016"
109 |       - "ESCAPE"
110 |       - "Epigenomics_Roadmap_HM_ChIP-seq"
111 |       - "ENCODE_TF_ChIP-seq_2015"
112 |       - "ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X"
113 |       - "ENCODE_Histone_Modifications_2015"
114 |       - "OMIM_Expanded"
115 |       - "TF-LOF_Expression_from_GEO"
116 |       - "Gene_Perturbations_from_GEO_down"
117 |       - "Gene_Perturbations_from_GEO_up"
118 |       - "Disease_Perturbations_from_GEO_down"
119 |       - "Disease_Perturbations_from_GEO_up"
120 |       - "Drug_Perturbations_from_GEO_down"
121 |       - "Drug_Perturbations_from_GEO_up"
122 |       - "WikiPathways_2016"
123 |       - "Reactome_2016"
124 |       - "BioCarta_2016"
125 |       - "NCI-Nature_2016"
126 |       - "BioPlanet_2019"
127 | 
128 | executables:
129 |   twoBitToFa: twoBitToFa
130 |   fasta-dinucleotide-shuffle: fasta-dinucleotide-shuffle
131 |   ame: ame
132 |   findMotifsGenome.pl: findMotifsGenome.pl
133 |   compareMotifs.pl: compareMotifs.pl


--------------------------------------------------------------------------------
/ngs_toolkit/constants.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | organism_to_species_mapping = {
 4 |     "human": "hsapiens",
 5 |     "mouse": "mmusculus",
 6 |     "yeast": "scerevisiae",
 7 | }
 8 | organism_to_latest_ensembl_mapping = {
 9 |     "human": "grch38",
10 |     "mouse": "grcm38",
11 |     "yeast": "R64",
12 | }
13 | genome_to_organism_mapping = {
14 |     "hg38": "human",
15 |     "hg19": "human",
16 |     "mm10": "mouse"
17 | }
18 | ucsc_to_ensembl_mapping = {
19 |     "hg38": "grch38",
20 |     "hg19": "grch37",
21 |     "mm10": "grcm38",
22 |     "mm9": "grcm37",
23 | }
24 | genome_to_ensembl_mapping = ucsc_to_ensembl_mapping
25 | 


--------------------------------------------------------------------------------
/ngs_toolkit/decorators.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from functools import wraps
  4 | from ngs_toolkit import _LOGGER
  5 | from ngs_toolkit.utils import warn_or_raise
  6 | 
  7 | 
  8 | def check_has_attributes(attributes=None, object_types=None, permissive=False):
  9 |     attributes = [] or attributes
 10 |     object_types = [None] * len(attributes) or object_types
 11 |     if len(attributes) != len(object_types):
 12 |         msg = "`attributes` and `object_types` arguments must be the same length."
 13 |         _LOGGER.error(msg)
 14 |         raise ValueError(msg)
 15 | 
 16 |     def decorator(f):
 17 |         @wraps(f)
 18 |         def wrapper(*args, **kwargs):
 19 |             import pandas as pd
 20 | 
 21 |             # check attributes are set
 22 |             msg = "Analysis '{}' attribute(s) are not set."
 23 |             has = pd.Series(
 24 |                 [hasattr(args[0], attr) for attr in attributes],
 25 |                 index=attributes)
 26 |             if not has.all():
 27 |                 warn_or_raise(AttributeError(msg.format(",".join(has[~has].index))), permissive)
 28 | 
 29 |             # check attributes are not None
 30 |             msg = "Analysis '{}' attribute(s) are None."
 31 |             not_none = pd.Series(
 32 |                 [getattr(args[0], attr) is not None for attr in attributes],
 33 |                 index=attributes)
 34 |             if not not_none.all():
 35 |                 warn_or_raise(AttributeError(msg.format(",".join(not_none[~not_none].index))), permissive)
 36 | 
 37 |             # check the type of attribute values matches requested
 38 |             msg = "Analysis '{}' attribute(s) are not of requested types '{}'."
 39 |             t_attributes = [a for a, t in zip(attributes, object_types) if t is not None]
 40 |             t_object_types = [t for a, t in zip(attributes, object_types) if t is not None]
 41 |             not_type = pd.Series(
 42 |                 [isinstance(getattr(args[0], attr), t) is not None
 43 |                  for attr, t in zip(t_attributes, t_object_types)],
 44 |                 index=t_attributes, dtype=object)
 45 |             if not not_type.all():
 46 |                 warn_or_raise(
 47 |                     AttributeError(msg.format(
 48 |                         ",".join(not_type[~not_type].index),
 49 |                         ",".join([str(t) for t in t_object_types]))),
 50 |                     permissive)
 51 | 
 52 |             # for iterable types, check length > 0
 53 |             msg = "Analysis '{}' attribute(s) have 0 elements."
 54 |             i_attributes = [a for a, t in zip(attributes, object_types) if hasattr(a, "__iter__")]
 55 |             i_object_types = [t for a, t in zip(attributes, object_types) if hasattr(a, "__iter__")]
 56 |             not_empty = pd.Series(
 57 |                 [len(getattr(args[0], attr)) > 0 for attr in i_attributes],
 58 |                 index=i_attributes)
 59 |             if not not_empty.all():
 60 |                 warn_or_raise(
 61 |                     AttributeError(msg.format(
 62 |                         ",".join(not_empty[~not_empty].index),
 63 |                         ",".join([str(t) for t in i_object_types]))),
 64 |                     permissive)
 65 |             return f(*args, **kwargs)
 66 |         return wrapper
 67 |     return decorator
 68 | 
 69 | 
 70 | def read_csv_timestamped(f):
 71 |     from ngs_toolkit.utils import get_this_file_or_timestamped
 72 |     @wraps(f)
 73 |     def wrapper(*args, **kwargs):
 74 |         for i, _ in enumerate(args):
 75 |             if isinstance(args[i], str):
 76 |                 args = args[:i] + (
 77 |                     get_this_file_or_timestamped(args[i]),) + args[i + 1:]
 78 |         return f(*args, **kwargs)
 79 |     return wrapper
 80 | 
 81 | 
 82 | def to_csv_timestamped(f, exclude_functions=None):
 83 | 
 84 |     # TODO: fix to files without "." (dot)
 85 |     from ngs_toolkit.utils import (
 86 |         record_analysis_output, get_timestamp,
 87 |         is_analysis_descendent)
 88 |     from ngs_toolkit import _CONFIG
 89 | 
 90 |     @wraps(f)
 91 |     def wrapper(*args, **kwargs):
 92 |         if is_analysis_descendent(exclude_functions=exclude_functions):
 93 |             # Add timestamp
 94 |             if _CONFIG["preferences"]["report"]["timestamp_tables"]:
 95 |                 if len(args) > 1:
 96 |                     if isinstance(args[1], str):
 97 |                         s = args[1].split(".")
 98 |                         end = s[-1]
 99 |                         body = ".".join(s[:-1])
100 |                         args = (args[0], ".".join([body, get_timestamp(), end])) + args[2:]
101 |                         record_analysis_output(args[1])
102 |                 else:
103 |                     if isinstance(args[0], str):
104 |                         s = args[0].split(".")
105 |                         end = s[-1]
106 |                         body = ".".join(s[:-1])
107 |                         args = (".".join([body, get_timestamp(), end])) + args[1:]
108 |                         record_analysis_output(args[0])
109 |         return f(*args, **kwargs)
110 |     return wrapper
111 | 
112 | 
113 | def timestamped_input(f):
114 |     from ngs_toolkit.utils import get_this_file_or_timestamped
115 | 
116 |     @wraps(f)
117 |     def wrapper(file):
118 |         return f(get_this_file_or_timestamped(file))
119 |     return wrapper
120 | 


--------------------------------------------------------------------------------
/ngs_toolkit/demo/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | A module providing functions to generate Analysis, Projects and their data.
 5 | """
 6 | 
 7 | from ngs_toolkit.demo.data_generator import (
 8 |     generate_count_matrix, generate_data,
 9 |     generate_project, generate_projects)
10 | 


--------------------------------------------------------------------------------
/ngs_toolkit/exceptions.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | class Error(Exception):
 5 |     """Base class for exceptions in this module."""
 6 |     pass
 7 | 
 8 | 
 9 | class InputError(Error):
10 |     """
11 |     Exception raised for errors in the input.
12 | 
13 |     Attributes
14 |     ----------
15 |         expr :
16 |             Input expression in which the error occurred
17 |         msg  : :obj:`str`
18 |             Explanation of the error
19 |     """
20 | 
21 |     def __init__(self, expr, msg):
22 |         self.expr = expr
23 |         self.msg = msg
24 | 
25 | 
26 | class UserInputError(InputError):
27 |     """
28 |     Exception raised for errors in the input caused by the user.
29 | 
30 |     Attributes
31 |     ----------
32 |         expr :
33 |             Input expression in which the error occurred
34 |         msg  : :obj:`str`
35 |             Explanation of the error
36 |     """
37 |     pass
38 | 
39 | 
40 | class NetworkError(Error):
41 |     """
42 |     Exception raised for errors in API calls over the internet.
43 | 
44 |     Attributes
45 |     ----------
46 |         expr :
47 |             Input expression in which the error occurred
48 |         msg  : :obj:`str`
49 |             Explanation of the error
50 |     """
51 |     def __init__(self, expr, msg):
52 |         self.expr = expr
53 |         self.msg = msg
54 | 
55 | 
56 | class DependencyError(Error):
57 |     """
58 |     Exception raised for errors in the input.
59 | 
60 |     Attributes
61 |     ----------
62 |         expr :
63 |             Input expression in which the error occurred
64 |         msg  : :obj:`str`
65 |             Explanation of the error
66 |     """
67 | 
68 |     def __init__(self, expr, msg):
69 |         self.expr = expr
70 |         self.msg = msg
71 | 
72 | 
73 | class DependencyNotFoundError(Error):
74 |     """
75 |     Exception raised for errors in the input.
76 | 
77 |     Attributes
78 |     ----------
79 |         expr :
80 |             Input expression in which the error occurred
81 |         msg  : :obj:`str`
82 |             Explanation of the error
83 |     """
84 | 
85 |     def __init__(self, expr, msg):
86 |         self.expr = expr
87 |         self.msg = msg
88 | 


--------------------------------------------------------------------------------
/ngs_toolkit/parsers.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | 
  5 | import pandas as pd
  6 | 
  7 | 
  8 | def parse_ame(ame_output):
  9 |     """
 10 |     Parse results of MEME-AME motif enrichment.
 11 | 
 12 |     Parameters
 13 |     ----------
 14 |     ame_output : :obj:`str`
 15 |         MEME-AME results file.
 16 | 
 17 |     Returns
 18 |     ----------
 19 |     pandas.DataFrame
 20 |         Data frame with enrichment statistics for each found TF motif.
 21 | 
 22 |     Raises
 23 |     -------
 24 |     IOError
 25 |         If directory contain
 26 |     """
 27 |     with open(ame_output, "r") as handle:
 28 |         lines = handle.readlines()
 29 | 
 30 |     output = list()
 31 |     for line in lines:
 32 |         # skip header lines
 33 |         if line[0] not in [str(i) for i in range(10)]:
 34 |             continue
 35 | 
 36 |         # get motif string and the first half of it (simple name)
 37 |         motif = line.strip().split(" ")[5].split("_")[0]
 38 |         # get corrected p-value
 39 |         q_value = float(line.strip().split(" ")[-2])
 40 |         # append
 41 |         output.append((motif, q_value))
 42 | 
 43 |     r = pd.Series(dict(output)).reset_index()
 44 |     r.columns = ["TF", "p_value"]
 45 |     return r
 46 | 
 47 | 
 48 | def parse_homer(homer_dir):
 49 |     """
 50 |     Parse results of HOMER findMotifs.pl de novo motif enrichment.
 51 | 
 52 |     Parameters
 53 |     ----------
 54 |     homer_dir : :obj:`str`
 55 |         Directory with HOMER results.
 56 | 
 57 |     Returns
 58 |     ----------
 59 |     pandas.DataFrame
 60 |         Data frame with enrichment statistics for each found TF motif.
 61 | 
 62 |     Raises
 63 |     -------
 64 |     IOError
 65 |     """
 66 |     import glob
 67 |     import re
 68 | 
 69 |     motif_htmls = sorted(glob.glob(os.path.join(homer_dir, "motif*.info.html")))
 70 | 
 71 |     if len(motif_htmls) < 1:
 72 |         raise IOError("Homer directory does not contain any discovered motifs.")
 73 | 
 74 |     output = pd.DataFrame()
 75 |     for motif_html in motif_htmls:
 76 | 
 77 |         motif = int(
 78 |             re.sub(
 79 |                 ".info.html",
 80 |                 "",
 81 |                 re.sub(os.path.join(homer_dir, "motif"), "", motif_html),
 82 |             )
 83 |         )
 84 | 
 85 |         with open(motif_html, "r") as handle:
 86 |             content = handle.read()
 87 | 
 88 |         # Parse table with motif info
 89 |         info_table = content[
 90 |             re.search("""<TABLE border="1" cellpading="0" cellspacing="0">""", content)
 91 |             .end(): re.search("</TABLE>", content)
 92 |             .start()
 93 |         ].strip()
 94 | 
 95 |         info_table = pd.DataFrame(
 96 |             [
 97 |                 x.split("</TD><TD>")
 98 |                 for x in info_table.replace("<TR><TD>", "").split("</TD></TR>")
 99 |             ]
100 |         )
101 |         info_table.columns = ["description", "value"]
102 |         info_table["description"] = info_table["description"].str.strip()
103 |         info_table["motif"] = motif
104 | 
105 |         # Add most probable known motif name
106 |         info_table["known_motif"] = content[
107 |             re.search("<H4>", content).end(): re.search("</H4>", content).start()
108 |         ]
109 | 
110 |         # append
111 |         output = output.append(info_table, ignore_index=True)
112 | 
113 |     return output.sort_values("motif")
114 | 
115 | 
116 | def parse_great_enrichment(input_tsv):
117 |     """
118 |     Parse output from GREAT enrichment (http://great.stanford.edu).
119 | 
120 |     Parameters
121 |     ----------
122 |     input_tsv : :obj:`str`
123 |         TSV file exported from GREAT through the option "All data as .tsv" in "Global Controls".
124 | 
125 |     Returns
126 |     ----------
127 |     pandas.DataFrame
128 |         Pandas dataframe with enrichment results.
129 |     """
130 |     df = pd.read_csv(input_tsv, sep="\t", skiprows=3)
131 |     df.columns = df.columns.str.replace("# ", "")
132 |     return df.loc[~df.iloc[:, 0].str.startswith("#")]
133 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/__init__.py:
--------------------------------------------------------------------------------
1 | # from ngs_toolkit.utils import have_unbuffered_output
2 | 
3 | # have_unbuffered_output()
4 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/call_peaks.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Call peaks for ChIP-seq samples given a comparison table
  5 | mapping foreground-background relationships between samples.
  6 | """
  7 | 
  8 | 
  9 | import sys
 10 | 
 11 | from argparse import ArgumentParser
 12 | 
 13 | import pandas as pd
 14 | 
 15 | from ngs_toolkit.chipseq import ChIPSeqAnalysis
 16 | 
 17 | 
 18 | def parse_arguments():
 19 |     """
 20 |     Global options for analysis.
 21 |     """
 22 |     parser = ArgumentParser(
 23 |         prog="python -m ngs_toolkit.recipes.call_peaks", description=__doc__)
 24 |     parser.add_argument(
 25 |         dest="config_file", help="YAML project configuration file.", type=str
 26 |     )
 27 |     parser.add_argument(
 28 |         "-c",
 29 |         "--comparison-table",
 30 |         dest="comparison_table",
 31 |         default=None,
 32 |         help="Comparison table to use for peak calling. If not provided will use a file"
 33 |         "named `comparison_table.csv` in the same directory of the given YAML Project configuration file.",
 34 |         type=str,
 35 |     )
 36 |     parser.add_argument(
 37 |         "-t",
 38 |         "--only-toggle",
 39 |         action="store_true",
 40 |         dest="only_toggle",
 41 |         help="Whether only comparisons with 'toggle' value of '1' or 'True' should be performed.",
 42 |     )
 43 |     parser.add_argument(
 44 |         "-qc",
 45 |         "--pass-qc",
 46 |         action="store_true",
 47 |         dest="pass_qc",
 48 |         help="Whether only samples with a 'pass_qc' attribute should be included."
 49 |         " Default is :obj:`False`.",
 50 |     )
 51 |     parser.add_argument(
 52 |         "-j",
 53 |         "--as-jobs",
 54 |         action="store_true",
 55 |         dest="as_job",
 56 |         help="Whether jobs should be created for each sample, or "
 57 |         "it should run in serial mode.",
 58 |     )
 59 |     parser.add_argument(
 60 |         "-o",
 61 |         "--results-output",
 62 |         default="results",
 63 |         dest="results_dir",
 64 |         help="Directory for analysis output files. "
 65 |         "Default is 'results' under the project root directory.",
 66 |         type=str,
 67 |     )
 68 |     return parser
 69 | 
 70 | 
 71 | def main(cli=None):
 72 |     args = parse_arguments().parse_args(cli)
 73 | 
 74 |     # Analysis
 75 |     print(
 76 |         "Starting Analysis from PEP configuration file: '{}'".format(args.config_file)
 77 |     )
 78 |     analysis = ChIPSeqAnalysis(
 79 |         from_pep=args.config_file, results_dir=args.results_dir
 80 |     )
 81 |     chip_data_types = ["ChIP-seq", "ChIPmentation"]
 82 |     analysis.samples = [s for s in analysis.samples if s.protocol == chip_data_types]
 83 | 
 84 |     # Samples
 85 |     # # filter QC if needed
 86 |     if args.pass_qc:
 87 |         analysis.samples = [
 88 |             s for s in analysis.samples if s.pass_qc not in ["0", 0, "False", False]
 89 |         ]
 90 |     if analysis.samples:
 91 |         print(
 92 |             "Samples under consideration: '{}'. ".format(
 93 |                 ",".join([s.name for s in analysis.samples])
 94 |             )
 95 |             + "Total of {} samples.".format(len([s.name for s in analysis.samples]))
 96 |         )
 97 |     else:
 98 |         raise ValueError("There were no valid samples for this analysis type!")
 99 | 
100 |     # Comparison table
101 |     # # add provided
102 |     if args.comparison_table is not None:
103 |         analysis.comparison_table = pd.read_csv(args.comparison_table)
104 |     # # or make sure analysis has one
105 |     else:
106 |         if not hasattr(analysis, "comparison_table"):
107 |             raise ValueError(
108 |                 "Analysis doesn't have a 'comparison_table' and this was not provided."
109 |             )
110 | 
111 |     # # filter comparisons if needed
112 |     if args.only_toggle:
113 |         print("Filtering out comparisons marked with toggle != 1")
114 |         analysis.comparison_table = analysis.comparison_table[
115 |             analysis.comparison_table["toggle"] == 1
116 |         ]
117 | 
118 |     comps = analysis.comparison_table["comparison_name"].unique()
119 |     if comps:
120 |         print(
121 |             "comparisons under consideration: '{}'. ".format(",".join(comps))
122 |             + "Total of {} comparisons.".format(len(comps))
123 |         )
124 |     else:
125 |         raise ValueError("There were no valid comparisons in the comparison table!")
126 | 
127 |     # Call peaks
128 |     analysis.call_peaks_from_comparisons(distributed=args.as_jobs)
129 | 
130 |     # # Get summary of peak calls
131 |     # peak_counts = analysis.summarize_peaks_from_comparisons(comparison_table)
132 |     # peak_counts.to_csv(os.path.join("results_pipeline", "chipseq_peaks", "peak_count_summary.csv"), index=False)
133 | 
134 | 
135 | if __name__ == "__main__":
136 |     try:
137 |         sys.exit(main())
138 |     except KeyboardInterrupt:
139 |         print("Program canceled by user!")
140 |         sys.exit(1)
141 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/coverage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | A helper script to calculate the read coverage of a BAM file
 5 | in regions from a BED file.
 6 | Ensures the same order and number of lines as input BED file.
 7 | 
 8 | Software requirements:
 9 | 
10 |  * None
11 | """
12 | 
13 | import os
14 | import sys
15 | 
16 | from argparse import ArgumentParser
17 | 
18 | import pandas as pd
19 | 
20 | from ngs_toolkit.utils import count_reads_in_intervals
21 | from ngs_toolkit.utils import read_bed_file_three_columns
22 | from ngs_toolkit.utils import to_bed_index
23 | 
24 | 
25 | def parse_arguments():
26 |     """
27 |     Argument Parsing.
28 |     """
29 |     parser = ArgumentParser(
30 |         prog="python -m ngs_toolkit.recipes.coverage", description=__doc__)
31 |     parser.add_argument(
32 |         dest="bed_file",
33 |         help="Input BED file with regions to quantify.",
34 |     )
35 |     parser.add_argument(
36 |         dest="bam_file",
37 |         help="Input BAM file with reads.",
38 |     )
39 |     parser.add_argument(
40 |         dest="output_bed", help="Output BED file with counts for each region."
41 |     )
42 |     parser.add_argument(
43 |         "--no-overwrite", action="store_false",
44 |         dest="overwrite",
45 |         help="Whether results should not be overwritten if existing."
46 |     )
47 |     return parser
48 | 
49 | 
50 | def main(cli=None):
51 |     """Measure coverage of BAM file in BED file regions."""
52 |     print("Parsing CLI.")
53 |     args = parse_arguments().parse_args(cli)
54 | 
55 |     if os.path.exists(args.output_bed) and (not args.overwrite):
56 |         print("Output exists and `overwrite` is False, so not doing anything.")
57 |         return 0
58 | 
59 |     print("Getting regions.")
60 |     sites_str = to_bed_index(args.bed_file)
61 |     print("Quantifying.")
62 |     res = count_reads_in_intervals(args.bam_file, sites_str)
63 | 
64 |     print("Merging with input set.")
65 |     # make sure there is an entry for each region in input file
66 |     input_bed = read_bed_file_three_columns(args.bed_file).set_index("name")
67 |     res = input_bed.join(pd.Series(res, name="sample")).fillna(0)
68 |     res.loc[:, "sample"] = res.loc[:, "sample"].astype(int)
69 | 
70 |     print("Saving results.")
71 |     res.to_csv(args.output_bed, index=False, header=False, sep="\t")
72 | 
73 |     print("Done.")
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     try:
78 |         sys.exit(main())
79 |     except KeyboardInterrupt:
80 |         print("Program canceled by user!")
81 |         sys.exit(1)
82 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/deseq2.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Perform differential expression using DESeq2
  5 | by comparing sample groups using a formula.
  6 | 
  7 | Software requirements:
  8 | 
  9 |  * DESeq2
 10 | """
 11 | 
 12 | import os
 13 | import pandas as pd
 14 | import sys
 15 | 
 16 | from argparse import ArgumentParser
 17 | from ngs_toolkit.general import deseq_analysis
 18 | 
 19 | 
 20 | def parse_arguments():
 21 |     parser = ArgumentParser(
 22 |         prog="python -m ngs_toolkit.recipes.deseq2", description=__doc__)
 23 |     parser.add_argument(
 24 |         dest="work_dir",
 25 |         help="Working directory. Should contain required files for DESeq2.")
 26 |     parser.add_argument(
 27 |         "--output-prefix",
 28 |         dest="output_prefix",
 29 |         default="differential_analysis",
 30 |         type=str,
 31 |         help="Prefix for output files.")
 32 |     parser.add_argument(
 33 |         "--formula",
 34 |         default="~ sample_group",
 35 |         type=str,
 36 |         help="R-style formula for differential expression. Defaults to '~ sample_group'.")
 37 |     parser.add_argument(
 38 |         "--alpha",
 39 |         default=0.05,
 40 |         type=float,
 41 |         help="Significance level to call differential expression. All results will be output anyway.")
 42 |     parser.add_argument(
 43 |         "-d",
 44 |         "--dry-run",
 45 |         action="store_true",
 46 |         help="Don't actually do anything.")
 47 |     parser.add_argument(
 48 |         "--overwrite",
 49 |         action="store_true",
 50 |         default=False,
 51 |         help="Don't overwrite any existing directory or file.")
 52 |     parser.add_argument(
 53 |         "--no-save-inputs",
 54 |         action="store_false",
 55 |         default=True,
 56 |         help="Don't write inputs to disk.")
 57 | 
 58 |     # To enable the loop to pass args directly on to the pipelines...
 59 | 
 60 |     # args = parser.parse_args("--output_prefix differential_analysis --formula '~sample_group' --overwrite /scratch/lab_bock/shared/projects/baf-time_course/results/differential_analysis_ATAC-seq/ARID2_KO".split(" "))
 61 |     return parser
 62 | 
 63 | 
 64 | def main(cli=None):
 65 |     args = parse_arguments().parse_args(cli)
 66 | 
 67 |     # sample annotation
 68 |     print("Reading experiment_matrix")
 69 |     experiment_matrix = pd.read_csv(
 70 |         os.path.join(args.work_dir, "experiment_matrix.csv"))
 71 |     # comparison table
 72 |     print("Reading comparison_matrix")
 73 |     comparison_table = pd.read_csv(
 74 |         os.path.join(args.work_dir, "comparison_table.csv"))
 75 |     # count matrix
 76 |     print("Reading count_matrix")
 77 |     count_matrix = pd.read_csv(
 78 |         os.path.join(args.work_dir, "count_matrix.csv"), index_col=0)
 79 | 
 80 |     print("Differential expression with DESeq2")
 81 |     res = deseq_analysis(
 82 |         count_matrix,
 83 |         experiment_matrix,
 84 |         comparison_table,
 85 |         formula=args.formula,
 86 |         output_dir=args.work_dir,
 87 |         output_prefix=args.output_prefix,
 88 |         overwrite=args.overwrite, alpha=args.alpha,
 89 |         create_subdirectories=False,
 90 |         save_inputs=not args.no_save_inputs)
 91 | 
 92 |     print("Found {} differentially expressed genes with p < {}.".format(
 93 |         res[res['pvalue'] < args.alpha].shape[0], args.alpha))
 94 |     print("Found {} differentially expressed genes with FDR < {}.".format(
 95 |         res[res['padj'] < args.alpha].shape[0], args.alpha))
 96 | 
 97 | 
 98 | if __name__ == '__main__':
 99 |     try:
100 |         sys.exit(main())
101 |     except KeyboardInterrupt:
102 |         print("Program canceled by user!")
103 |         sys.exit(1)
104 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/enrichr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | A helper script to run enrichment analysis using the Enrichr API on a gene set.
 5 | 
 6 | Software requirements: None
 7 | """
 8 | 
 9 | from argparse import ArgumentParser
10 | import os
11 | import sys
12 | 
13 | from ngs_toolkit.general import enrichr
14 | 
15 | 
16 | def parse_arguments():
17 |     """
18 |     Argument Parsing.
19 |     """
20 |     parser = ArgumentParser(
21 |         prog="python -m ngs_toolkit.recipes.enrichr", description=__doc__
22 |     )
23 |     parser.add_argument(
24 |         dest="input_file",
25 |         help="Input file with a gene name per row and no header.",
26 |     )
27 |     parser.add_argument(
28 |         dest="output_file", help="Output CSV file with results."
29 |     )
30 |     parser.add_argument(
31 |         "-a",
32 |         "--max-attempts",
33 |         type=int,
34 |         default=5,
35 |         dest="max_attempts",
36 |         help="Maximum attempts to retry the API before giving up.",
37 |     )
38 |     parser.add_argument(
39 |         "--no-overwrite",
40 |         action="store_false",
41 |         dest="overwrite",
42 |         help="Whether results should not be overwritten if existing.",
43 |     )
44 |     return parser
45 | 
46 | 
47 | def main(cli=None):
48 |     print("Enrichr analysis")
49 |     args = parse_arguments().parse_args(cli)
50 | 
51 |     if os.path.exists(args.output_file) and (not args.overwrite):
52 |         print("Output exists and `overwrite` is False, so not doing anything.")
53 |         return 0
54 | 
55 |     print("Reading input file.")
56 | 
57 |     with open(args.input_file, "r") as handle:
58 |         genes = handle.readlines()
59 | 
60 |     print("Found {} genes in input.".format(len(genes)))
61 | 
62 |     print("Starting Enrichr analysis.")
63 |     res = enrichr(
64 |         genes,
65 |         gene_set_libraries=None,
66 |         kind="genes",
67 |         max_attempts=args.max_attempts,
68 |     )
69 | 
70 |     print("Saving results.")
71 |     res.to_csv(args.output_file, index=False)
72 | 
73 |     print("Done.")
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     try:
78 |         sys.exit(main())
79 |     except KeyboardInterrupt:
80 |         print("Program canceled by user!")
81 |         sys.exit(1)
82 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/generate_project.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | A helper script to generate synthetic data for a project in PEP format.
 5 | """
 6 | 
 7 | import sys
 8 | 
 9 | from argparse import ArgumentParser
10 | 
11 | from ngs_toolkit.demo import generate_project
12 | from ngs_toolkit.utils import filter_kwargs_by_callable
13 | 
14 | 
15 | def parse_arguments():
16 |     """
17 |     Argument Parsing.
18 |     """
19 |     import inspect
20 | 
21 |     parser = ArgumentParser(
22 |         prog="python -m ngs_toolkit.recipes.generate_project", description=__doc__)
23 | 
24 |     sig = inspect.signature(generate_project)
25 |     for arg in sig.parameters:
26 |         if arg in ["kwargs", "initialize"]:
27 |             continue
28 |         d = sig.parameters[arg].default
29 |         if d is None:
30 |             parser.add_argument("--" + arg.replace("_", "-"))
31 |         else:
32 |             parser.add_argument(
33 |                 "--" + arg.replace("_", "-"),
34 |                 default=d, type=type(d))
35 |     parser.add_argument("--debug", action="store_true")
36 |     return parser
37 | 
38 | 
39 | def main(cli=None):
40 |     """Generate synthetic data for a project in PEP format."""
41 |     args = parse_arguments().parse_args(cli)
42 |     if args.debug:
43 |         print(args)
44 |     kwargs = {k: v for k, v in args.__dict__.items() if v is not None}
45 |     kwargs = filter_kwargs_by_callable(kwargs, generate_project)
46 |     if args.debug:
47 |         print(kwargs)
48 |     pep = generate_project(**kwargs, initialize=False)
49 |     sys.stdout.write(pep + "\n")
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     try:
54 |         sys.exit(main())
55 |     except KeyboardInterrupt:
56 |         print("Program canceled by user!")
57 |         sys.exit(1)
58 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/lola.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | A helper script to run Location Overlap Analysis (LOLA)
 5 | of a single region set in various sets of region-based annotations.
 6 | 
 7 | 
 8 | Software requirements:
 9 | 
10 |  * LOLA
11 | """
12 | 
13 | import os
14 | import sys
15 | 
16 | from argparse import ArgumentParser
17 | from ngs_toolkit.general import lola
18 | 
19 | 
20 | def parse_arguments():
21 |     """
22 |     Argument Parsing.
23 |     """
24 |     parser = ArgumentParser(
25 |         prog="python -m ngs_toolkit.recipes.lola", description=__doc__)
26 |     parser.add_argument(dest="bed_file", help="BED file with query set regions.")
27 |     parser.add_argument(
28 |         dest="universe_file",
29 |         help="BED file with universe where the query set came from.",
30 |     )
31 |     parser.add_argument(
32 |         dest="output_folder", help="Output directory for produced files."
33 |     )
34 |     parser.add_argument(dest="genome", help="Genome assembly of the region set.")
35 |     parser.add_argument(
36 |         "--overwrite",
37 |         action="store_false",
38 |         help="Don't overwrite existing output files.",
39 |     )
40 |     parser.add_argument(
41 |         "-c",
42 |         "--cpus",
43 |         dest="cpus",
44 |         help="Number of CPUS/threads to use for analysis.",
45 |         type=int,
46 |     )
47 |     return parser
48 | 
49 | 
50 | def main(cli=None):
51 |     print("LOLA analysis")
52 |     args = parse_arguments().parse_args(cli)
53 | 
54 |     output_file = os.path.join(args.output_folder, "allEnrichments.tsv")
55 |     if os.path.exists(output_file) and (not args.overwrite):
56 |         print("Output exists and `overwrite` is False, so not doing anything.")
57 |         return 0
58 | 
59 |     print("Starting LOLA analysis.")
60 | 
61 |     lola(
62 |         args.bed_file,
63 |         args.universe_file,
64 |         args.output_folder,
65 |         args.genome,
66 |         cpus=args.cpus,
67 |     )
68 | 
69 |     print("Done.")
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     try:
74 |         sys.exit(main())
75 |     except KeyboardInterrupt:
76 |         print("Program canceled by user!")
77 |         sys.exit(1)
78 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/ngs_analysis.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Perform full end-to-end analysis of ATAC-seq, ChIP-seq or RNA-seq data.
  5 | 
  6 | Produces quantification matrices, normalizes them,
  7 | performes unsupervised and supervised analysis as
  8 | well as enrichment analyisis of differential features,
  9 | all accompaigned with powerful visualizations.
 10 | 
 11 | Supervised analysis will only be performed if PEP configuration file contains a
 12 | `comparison table <https://ngs-toolkit.readthedocs.io/en/latest/comparison_table.html>`_ field.
 13 | 
 14 | In addition, this recipe uses variables provided in the project configuration
 15 | file ``project_name``, ``sample_attributes`` and ``group_attributes``.
 16 | """
 17 | 
 18 | 
 19 | import os
 20 | import sys
 21 | 
 22 | from argparse import ArgumentParser
 23 | 
 24 | import matplotlib
 25 | import seaborn as sns
 26 | 
 27 | import peppy
 28 | 
 29 | from ngs_toolkit.atacseq import ATACSeqAnalysis
 30 | from ngs_toolkit.chipseq import ChIPSeqAnalysis
 31 | from ngs_toolkit.rnaseq import RNASeqAnalysis
 32 | 
 33 | 
 34 | # Set settings
 35 | sns.set(context="paper", style="ticks", palette="colorblind", color_codes=True)
 36 | matplotlib.rc("text", usetex=False)
 37 | 
 38 | 
 39 | def parse_arguments():
 40 |     """
 41 |     Global options for analysis.
 42 |     """
 43 |     parser = ArgumentParser(
 44 |         prog="python -m ngs_toolkit.recipes.ngs_analysis", description=__doc__)
 45 |     parser.add_argument(
 46 |         dest="config_file", help="YAML project configuration file.", type=str)
 47 |     parser.add_argument(
 48 |         "-n",
 49 |         "--analysis-name",
 50 |         dest="name",
 51 |         default=None,
 52 |         help="Name of analysis. Will be the prefix of output_files. "
 53 |         "By default it will be the name of the Project given in the YAML configuration.",
 54 |         type=str,
 55 |     )
 56 |     parser.add_argument(
 57 |         "-o",
 58 |         "--results-output",
 59 |         default="results",
 60 |         dest="results_dir",
 61 |         help="Directory for analysis output files. "
 62 |              "Default is 'results' under the project roort directory.",
 63 |         type=str,
 64 |     )
 65 |     parser.add_argument(
 66 |         "-t",
 67 |         "--data-type",
 68 |         default=None,
 69 |         choices=["ATAC-seq", "RNA-seq", "ChIP-seq"],
 70 |         dest="data_type",
 71 |         help="Data type to restrict analysis to. "
 72 |              "Default is to run separate analysis for each data type.",
 73 |         type=str,
 74 |     )
 75 |     parser.add_argument(
 76 |         "-q",
 77 |         "--pass-qc",
 78 |         action="store_true",
 79 |         dest="pass_qc",
 80 |         help="Whether only samples with a 'pass_qc' value of '1' "
 81 |              "in the annotation sheet should be used.",
 82 |     )
 83 |     parser.add_argument(
 84 |         "-a", "--alpha", default=0.05, dest="alpha",
 85 |         help="Alpha value of confidence for supervised analysis.", type=str
 86 |     )
 87 |     parser.add_argument(
 88 |         "-f",
 89 |         "--fold-change",
 90 |         default=0,
 91 |         dest="abs_fold_change",
 92 |         help="Absolute log2 fold change value for supervised analysis.",
 93 |         type=str,
 94 |     )
 95 |     return parser
 96 | 
 97 | 
 98 | def main(cli=None):
 99 |     args = parse_arguments().parse_args(cli)
100 | 
101 |     # Start project
102 |     print("Starting peppy project with project"
103 |           "configuration file: '{}'".format(args.config_file))
104 |     prj = peppy.Project(args.config_file)
105 |     print("Changing directory to project root"
106 |           "directory: '{}'.".format(prj.metadata.output_dir))
107 |     os.chdir(prj.metadata.output_dir)
108 |     if args.pass_qc:
109 |         print("Filtering samples out which didn't pass QC"
110 |               "as specified in sample annotation in column 'pass_qc'")
111 |         prj._samples = [
112 |             s for s in prj._samples
113 |             if s.pass_qc not in ["0", 0, "False", False]]
114 | 
115 |     # ANALYSIS
116 |     if args.data_type is None:
117 |         print(
118 |             "Type of analysis not specified. Will run independent analysis"
119 |             "for all types of data in the sample annotation sheet."
120 |         )
121 |         data_types = sorted(list(set([s.protocol for s in prj._samples])))
122 |         print("Sample data types: '{}'.".format(",".join(data_types)))
123 |     else:
124 |         print("Type of analysis specified. Will run only"
125 |               "analysis for samples of type '{}'.".format(args.data_type))
126 |         data_types = [args.data_type]
127 |         print("Sample data types: '{}'.".format(",".join(data_types)))
128 |     if args.name is None:
129 |         print(
130 |             "Analysis name not specified, will use name in"
131 |             "project configuration file: '{}'.".format(prj.project_name)
132 |         )
133 |         args.name = prj.project_name
134 | 
135 |     for data_type in data_types:
136 |         print("Starting analysis for samples of type: '{}'.".format(data_type))
137 |         samples = [s for s in prj._samples if (s.protocol == data_type)]
138 |         if len(samples) > 0:
139 |             print(
140 |                 "Samples under consideration: '{}'. ".format(",".join([s.name for s in samples]))
141 |                 + "Total of {} samples.".format(len([s.name for s in samples]))
142 |             )
143 |         else:
144 |             raise ValueError("There were no valid samples for this analysis type!")
145 | 
146 |         kwargs = {"prj": prj, "samples": samples, "results_dir": args.results_dir}
147 |         if data_type in ["ATAC-seq"]:
148 |             print("Initializing ATAC-seq analysis")
149 |             analysis = ATACSeqAnalysis(
150 |                 name=args.name + "_atacseq", **kwargs
151 |             )
152 |         elif data_type in ["ChIP-seq"]:
153 |             print("Initializing ChIP-seq analysis")
154 |             analysis = ChIPSeqAnalysis(
155 |                 name=args.name + "_chipseq", **kwargs
156 |             )
157 |         elif data_type in ["RNA-seq"]:
158 |             print("Initializing RNA-seq analysis")
159 |             analysis = RNASeqAnalysis(
160 |                 name=args.name + "_rnaseq", **kwargs
161 |             )
162 | 
163 |         print("Running main analysis.")
164 |         main_analysis_pipeline(
165 |             analysis, alpha=args.alpha, abs_fold_change=args.abs_fold_change)
166 |         print("`ngs_analysis` recipe completed successfully!")
167 | 
168 | 
169 | def main_analysis_pipeline(a, alpha=0.05, abs_fold_change=0):
170 |     # TODO: annotate with chromatin state
171 |     # TODO: handle the genome vs transcriptome ambiguity
172 | 
173 |     genomes = list(set(s.genome for s in a.samples))
174 | 
175 |     if len(genomes) != 1:
176 |         raise ValueError(
177 |             "Samples under analysis have more than"
178 |             "one genome assembly: '{}'.".format("', '".join(genomes))
179 |         )
180 | 
181 |     if isinstance(a, ATACSeqAnalysis):
182 | 
183 |         # GET CONSENSUS PEAK SET, ANNOTATE IT, PLOT
184 |         # Get consensus peak set from all samples
185 |         a.get_consensus_sites()
186 |         a.calculate_peak_support()
187 | 
188 |         # GET CHROMATIN OPENNESS MEASUREMENTS, PLOT
189 |         # Get coverage values for each peak in each sample
190 |         a.measure_coverage()
191 |         # normalize coverage values
192 |         a.normalize(method="vst")
193 | 
194 |         # Annotate peaks with closest gene
195 |         a.get_peak_gene_annotation()
196 |         # Annotate peaks with genomic regions
197 |         a.get_peak_genomic_location()
198 |         # Annotate peaks with chromatin state
199 | 
200 |     if isinstance(a, RNASeqAnalysis):
201 |         # Get gene expression
202 |         a.get_gene_expression()
203 | 
204 |     # Annotate peaks with closest gene, chromatin state,
205 |     # genomic location, mean and variance measurements across samples
206 |     a.annotate_features()
207 |     a.to_pickle()
208 | 
209 |     # Unsupervised analysis
210 |     a.unsupervised_analysis(
211 |         plot_max_attr=20,
212 |         plot_max_pcs=6,
213 |         plot_group_centroids=True,
214 |         axis_ticklabels=False,
215 |         axis_lines=True,
216 |         always_legend=False,
217 |         display_corr_values=False,
218 |     )
219 | 
220 |     # Supervised analysis
221 |     if a.comparison_table.empty:
222 |         print(
223 |             "Comparison table has no comparisons with 'data_type'=='{}'"
224 |             "and 'comparison_type'=='differential'.".format(
225 |                 a.data_type
226 |             )
227 |         )
228 |         print("Not performing differential analysis for this data type.")
229 |         a.generate_report(pip_versions=True)
230 |         a.to_pickle()
231 |         return
232 | 
233 |     a.differential_analysis()
234 |     a.to_pickle()
235 | 
236 |     diff = a.differential_results[
237 |         (a.differential_results["padj"] < alpha)
238 |         & (a.differential_results["log2FoldChange"].abs() > abs_fold_change)
239 |     ]
240 |     if diff.empty:
241 |         print(
242 |             "Differential analysis contains no significant {}s"
243 |             "at alpha {} and absolute fold change {}.".format(
244 |                 a.var_unit_name, alpha, abs_fold_change
245 |             )
246 |         )
247 |         a.generate_report(pip_versions=True)
248 |         a.to_pickle()
249 |         return
250 | 
251 |     if diff.groupby("comparison_name").count().shape[0] > 1:
252 |         a.differential_overlap(diff)
253 | 
254 |     a.plot_differential(
255 |         alpha=alpha,
256 |         corrected_p_value=True,
257 |         fold_change=abs_fold_change,
258 |         rasterized=True,
259 |         robust=True,
260 |         group_wise_colours=True,
261 |     )
262 | 
263 |     a.differential_enrichment(
264 |         # TODO: have a way to automatically check what is callable
265 |         steps=['enrichr'],
266 |         directional=True,
267 |         max_diff=1000,
268 |         sort_var="pvalue",
269 |         distributed=False)
270 | 
271 |     # TODO: is this actually needed? vvv
272 |     # a.collect_differential_enrichment(directional=True, permissive=False)
273 | 
274 |     a.plot_differential_enrichment(direction_dependent=True, top_n=5)
275 | 
276 |     a.generate_report(pip_versions=True)
277 |     a.to_pickle()
278 | 
279 | 
280 | if __name__ == "__main__":
281 |     try:
282 |         sys.exit(main())
283 |     except KeyboardInterrupt:
284 |         print("Program canceled by user!")
285 |         sys.exit(1)
286 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/region_enrichment.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | A helper script to run enrichment analysis
 5 | of a single region set in region-based set of annotations.
 6 | """
 7 | 
 8 | import os
 9 | import pandas as pd
10 | import sys
11 | 
12 | from argparse import ArgumentParser
13 | from ngs_toolkit.atacseq import ATACSeqAnalysis
14 | from ngs_toolkit.utils import bed_to_index
15 | 
16 | 
17 | def parse_arguments():
18 |     """
19 |     Argument Parsing.
20 |     """
21 |     parser = ArgumentParser(
22 |         prog="python -m ngs_toolkit.recipes.region_enrichment", description=__doc__)
23 |     parser.add_argument(
24 |         dest="bed_file",
25 |         help="BED file with regions.")
26 |     parser.add_argument(
27 |         dest="pep",
28 |         help="The analysis' PEP config file.")
29 |     parser.add_argument(
30 |         "--output-file",
31 |         dest="output_file",
32 |         default="region_type_enrichment.csv",
33 |         type=str,
34 |         help="Output file.")
35 |     parser.add_argument(
36 |         "--overwrite",
37 |         action="store_true",
38 |         default=False,
39 |         help="Don't overwrite any existing directory or file.")
40 |     return parser
41 | 
42 | 
43 | def main(cli=None):
44 |     print("Region type analysis")
45 |     # Parse command-line arguments.
46 |     args = parse_arguments().parse_args(cli)
47 |     if os.path.exists(args.output_file) and (not args.overwrite):
48 |         print("Output exists and `overwrite` is False, so not doing anything.")
49 |         return 0
50 | 
51 |     print("Reading up the analysis object.")
52 |     a = ATACSeqAnalysis(from_pep=args.pep)
53 |     a.load_data()
54 |     # (
55 |     #     "genomic_region",
56 |     #     "region_annotation_mapping",
57 |     #     "region_annotation_b_mapping",
58 |     # ),
59 |     # (
60 |     #     "chromatin_state",
61 |     #     "chrom_state_annotation_mapping",
62 |     #     "chrom_state_annotation_b_mapping",
63 |     # ),
64 |     print("Reading up the BED file.")
65 |     df = pd.read_csv(args.bed_file, sep="\t", header=None)
66 |     df.columns = ['chrom', 'start', 'end']
67 |     print("Getting the index.")
68 |     index = bed_to_index(df)
69 |     print("Doing enrichment.")
70 |     enr = a.region_context_enrichment(index)
71 |     print("Saving.")
72 |     enr.to_csv(args.output_file)
73 |     print("Done.")
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     try:
78 |         sys.exit(main())
79 |     except KeyboardInterrupt:
80 |         print("Program canceled by user!")
81 |         sys.exit(1)
82 | 


--------------------------------------------------------------------------------
/ngs_toolkit/recipes/region_set_frip.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Compute fraction of reads in peaks (FRiP) based on a consensus set of regions
  5 | derived from several samples.
  6 | 
  7 | A consensus region set can be passed, otherwise it will either try to use an
  8 | existing one for that analysis or produce one on the fly.
  9 | 
 10 | 
 11 | Software requirements:
 12 | 
 13 |  * awk
 14 |  * samtools
 15 | """
 16 | 
 17 | 
 18 | import os
 19 | import sys
 20 | 
 21 | from argparse import ArgumentParser
 22 | 
 23 | import pybedtools
 24 | 
 25 | from ngs_toolkit.atacseq import ATACSeqAnalysis
 26 | from ngs_toolkit.chipseq import ChIPSeqAnalysis
 27 | 
 28 | 
 29 | def parse_arguments():
 30 |     """
 31 |     Global options for analysis.
 32 |     """
 33 |     parser = ArgumentParser(
 34 |         prog="python -m ngs_toolkit.recipes.region_set_frip", description=__doc__
 35 |     )
 36 |     parser.add_argument(dest="config_file", help="YAML project configuration file.", type=str)
 37 |     parser.add_argument(
 38 |         "-r",
 39 |         "--region-set",
 40 |         dest="region_set",
 41 |         default=None,
 42 |         help="BED file with region set derived from several samples or Oracle region set. "
 43 |         "If unset, will try to get the `sites` attribute of an existing analysis object "
 44 |         "if existing, otherwise will create a region set from the peaks of all samples.",
 45 |         type=str,
 46 |     )
 47 |     parser.add_argument(
 48 |         "-q",
 49 |         "--pass-qc",
 50 |         action="store_true",
 51 |         dest="pass_qc",
 52 |         help="Whether only samples with a 'pass_qc' value of '1' "
 53 |         "in the annotation sheet should be used.",
 54 |     )
 55 |     parser.add_argument(
 56 |         "--computing-configuration",
 57 |         dest="computing_configuration",
 58 |         help="Which `divvy` computing configuration to use for distributed jobs."
 59 |         " Type divvy list to see all options. Defaults to the value in the "
 60 |         "ngs_toolkit configuration.",
 61 |     )
 62 |     parser.add_argument(
 63 |         "--permissive",
 64 |         action="store_true",
 65 |         dest="permissive",
 66 |         help="If creating regions set, allow sample files to be missing and use what is present.",
 67 |     )
 68 |     return parser
 69 | 
 70 | 
 71 | def main(cli=None):
 72 |     args = parse_arguments().parse_args(cli)
 73 | 
 74 |     for data_type, clax in [
 75 |         ("ATAC-seq", ATACSeqAnalysis),
 76 |         ("ChIP-seq", ChIPSeqAnalysis),
 77 |     ]:
 78 |         an = clax(from_pep=args.config_file)
 79 | 
 80 |         if not an.samples:
 81 |             continue
 82 | 
 83 |         if args.pass_qc:
 84 |             an.samples = [s for s in an.samples if getattr(s, "pass_qc", None) in ["1", "1.0", 1]]
 85 | 
 86 |         if data_type == "ChIP-seq" and not hasattr(an, "comparison_table"):
 87 |             msg = (
 88 |                 "ChIP-seq analysis must have comparison_table specified in "
 89 |                 "the project config in order to relate"
 90 |                 " foreground and backgound sample groups."
 91 |             )
 92 |             print(msg)
 93 |             raise ValueError(msg)
 94 | 
 95 |         if args.region_set is not None:
 96 |             print("Loading given region set: '{}'".format(args.region_set))
 97 |             an.sites = pybedtools.BedTool(args.region_set)
 98 |         else:
 99 |             print("Trying to load existing consensus region set.")
100 |             an.load_data(only_these_keys=["sites"])
101 | 
102 |         if not hasattr(an, "sites"):
103 |             print("Not found. Producing a new consensus region set.")
104 |             an.get_consensus_sites(permissive=args.permissive)
105 |         else:
106 |             print("Using region set in BED format: '{}'".format(an.sites.fn))
107 | 
108 |         calculate_region_set_frip(
109 |             region_set=an.sites.fn,
110 |             samples=an.samples,
111 |             computing_configuration=args.computing_configuration,
112 |         )
113 | 
114 | 
115 | def calculate_region_set_frip(region_set, samples, computing_configuration=None):
116 |     """
117 |     """
118 |     from ngs_toolkit.utils import submit_job
119 | 
120 |     for sample in samples:
121 |         sample.sample_root = os.path.join(
122 |             sample.project.root_dir, sample.project._config.results_subdir, sample.name
123 |         )
124 |         inside_reads = os.path.join(sample.sample_root, "region_set_frip.inside_reads.txt")
125 |         all_reads = os.path.join(sample.sample_root, "region_set_frip.all_reads.txt")
126 | 
127 |         job_name = sample.name + ".region_set_frip"
128 |         log_file = os.path.join(sample.sample_root, job_name + ".log")
129 |         job_file = os.path.join(sample.sample_root, job_name + ".sh")
130 |         sample_stats = os.path.join(sample.sample_root, "stats.tsv")
131 | 
132 |         cmd = "\n".join(
133 |             [
134 |                 """samtools view -c -L {} {} > {}""".format(
135 |                     region_set, sample.aligned_filtered_bam, inside_reads
136 |                 ),
137 |                 """samtools view -c {} > {}""".format(sample.aligned_filtered_bam, all_reads),
138 |                 'calc(){ awk "BEGIN { print "$*" }"; }',
139 |                 "IN=`cat {}`".format(inside_reads),
140 |                 "ALL=`cat {}`".format(all_reads),
141 |                 "FRIP=`calc $IN/$ALL`",
142 |                 'echo "region_set_frip\\t$FRIP\\t." >> {}'.format(sample_stats),
143 |                 "date",
144 |             ]
145 |         )
146 |         submit_job(
147 |             cmd,
148 |             job_file,
149 |             log_file,
150 |             jobname=job_name,
151 |             computing_configuration=computing_configuration,
152 |         )
153 | 
154 | 
155 | if __name__ == "__main__":
156 |     try:
157 |         sys.exit(main())
158 |     except KeyboardInterrupt:
159 |         print("Program canceled by user!")
160 |         sys.exit(1)
161 | 


--------------------------------------------------------------------------------
/ngs_toolkit/templates/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/afrendeiro/toolkit/c21b69dd8c2a58195c4d798ec53e44999cf5cb6a/ngs_toolkit/templates/__init__.py


--------------------------------------------------------------------------------
/ngs_toolkit/templates/report.html:
--------------------------------------------------------------------------------
  1 | <!doctype html>
  2 | <html lang="en">
  3 |   <head>
  4 |     <!-- Required meta tags -->
  5 |     <meta charset="utf-8">
  6 |     <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  7 | 
  8 |     <!-- Bootstrap CSS -->
  9 |     <link rel="stylesheet"
 10 |           href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"
 11 |           integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T"
 12 |           crossorigin="anonymous">
 13 | 
 14 |     <title>ngs-toolkit report for '{{ analysis.name }}' project</title>
 15 | 
 16 |     <style type="text/css">
 17 |         body {
 18 |             padding-bottom: 65px;
 19 |         }
 20 |     </style>
 21 |   </head>
 22 |   <body>
 23 | 
 24 |     <!-- Navbar -->
 25 |     <nav class="navbar navbar-expand-lg navbar-light bg-light sticky-top" style="background-color: #e3f2fd;">
 26 |         <a class="navbar-brand" href="https://github.com/afrendeiro/toolkit">
 27 |         <img src="/docs/4.3/assets/brand/bootstrap-solid.svg"
 28 |              width="30" height="30" class="d-inline-block align-top" alt="">
 29 |             ngs-toolkit
 30 |         </a>
 31 |         <button class="navbar-toggler"
 32 |                 type="button"
 33 |                 data-toggle="collapse"
 34 |                 data-target="#navbarSupportedContent"
 35 |                 aria-controls="navbarSupportedContent"
 36 |                 aria-expanded="false"
 37 |                 aria-label="Toggle navigation">
 38 |         <span class="navbar-toggler-icon"></span>
 39 |         </button>
 40 | 
 41 |         <div class="collapse navbar-collapse" id="navbarSupportedContent">
 42 |         <ul class="navbar-nav mr-auto">
 43 |             <li class="nav-item">
 44 |                 <a class="nav-link" href="#project-description">Project description</a>
 45 |             </li>
 46 | 
 47 |             <li class="nav-item dropdown">
 48 |                 <a class="nav-link dropdown-toggle"
 49 |                    href="#"
 50 |                    id="navbarDropdown"
 51 |                    role="button"
 52 |                    data-toggle="dropdown"
 53 |                    aria-haspopup="true"
 54 |                    aria-expanded="false">
 55 |                 Analysis report
 56 |                 </a>
 57 |                 <div class="dropdown-menu" aria-labelledby="navbarDropdown">
 58 |                     {% for section, fig_list in images.items() %}
 59 |                     <a class="dropdown-item" href="#section-{{ section|replace(' ', '-') }}">{{ section }}</a>
 60 |                     {% endfor %}
 61 |                 </div>
 62 |             </li>
 63 | 
 64 |             <li class="nav-item">
 65 |                 <a class="nav-link" href="#versioning">Versioning</a>
 66 |             </li>
 67 |       </ul>
 68 |     </nav>
 69 | 
 70 |     <!-- Jumbotron -->
 71 |     <div class="jumbotron jumbotron-fluid">
 72 |       <div class="container">
 73 |         <h1 class="display-4">{{ analysis.name }}</h1>
 74 |         {% if analysis.description is not none %}
 75 |         <p class="lead">{{ analysis.description }}</p>
 76 |         {% endif %}
 77 |         <p class="lead">
 78 |             <a href="https://github.com/afrendeiro/toolkit">ngs-toolkit</a> analysis report.
 79 |         </p>
 80 |       </div>
 81 |     </div>
 82 | 
 83 | 
 84 |     <!-- Content -->
 85 |     <div class="container">
 86 |         <div class="row">
 87 | 
 88 |             <div class="container"
 89 |                  id="content">
 90 |                 <div class="container">
 91 |                     <h1 id="project-description">Project description</h1>
 92 |                     {% if analysis.description is not none %}
 93 |                     <p>{{ analysis.description }}</p>
 94 |                     {% endif %}
 95 |                     {% if analysis.data_type is not none %}
 96 |                     <p> The project main data type is {{ analysis.data_type }}.</p>
 97 |                     {% endif %}
 98 |                     {% if analysis.organism is not none %}
 99 |                     <p> The project main organism is {{ analysis.organism }}.</p>
100 |                     {% endif %}
101 |                     {% if analysis.genome is not none %}
102 |                     <p> The project main genome assembly is {{ analysis.genome }}.</p>
103 |                     {% endif %}
104 | 
105 |                     <p>
106 |                       <a class="btn btn-primary"
107 |                          data-toggle="collapse"
108 |                          href="#project"
109 |                          role="button"
110 |                          aria-expanded="false"
111 |                          aria-controls="project">
112 |                         More details
113 |                       </a>
114 |                     </p>
115 |                     <div class="collapse" id="project">
116 |                       <div class="card card-body">
117 |                         <ul>
118 |                             {% for key, value in project_repr.items() %}
119 |                             <li>
120 |                                 {{ key }}: {{ value }}
121 |                             </li>
122 |                             {% endfor %}
123 |                         </ul>
124 |                       </div>
125 |                     </div>
126 |                     <hr>
127 |                     <p> Project contains {{ samples|length }} samples:</p>
128 |                     <div class="row-fluid">
129 |                       <p>
130 |                         <a class="btn btn-primary"
131 |                            data-toggle="collapse"
132 |                            href="#samples"
133 |                            role="button"
134 |                            aria-expanded="false"
135 |                            aria-controls="samples">
136 |                           Show all samples
137 |                         </a>
138 |                       </p>
139 |                       <div class="collapse" id="samples">
140 |                         <div class="card card-body">
141 |                           <ul>
142 |                               {% for sample in samples %}
143 |                                 <a class="btn btn-primary"
144 |                                    data-toggle="collapse"
145 |                                    href="#sample-{{ sample.name }}"
146 |                                    role="button"
147 |                                    aria-expanded="false"
148 |                                    aria-controls="sample-{{ sample.name }}">
149 |                                 {{ sample.name }}</a>
150 |                                 <div class="collapse" id="sample-{{ sample.name }}">
151 |                                     <div class="card card-body">
152 |                                         <ul>
153 |                                             {% for key, value in sample.items() %}
154 |                                             <li>{{ key }}: {{ value }}</li>
155 |                                             {% endfor %}
156 |                                         </ul>
157 |                                     </div>
158 |                                 </div>
159 |                               {% endfor %}
160 |                           </ul>
161 |                         </div>
162 |                       </div>
163 | 
164 |                     </div>
165 |                 </div>
166 | 
167 |                 <div class="container" id="analysis-report">
168 |                     <h1>Analysis report</h1>
169 |                     {% for section, fig_list in images.items() %}
170 |                     <h3 id="section-{{ section|replace(' ', '-') }}">{{ section }}</h3>
171 |                     <div class="row">
172 |                         {% for caption, csv in csvs[section] %}
173 |                             <div id="csv-{{ caption }}" class="col-6">
174 |                                 <p>
175 |                                     <a href="{{ csv }}">Download CSV file: {{ caption }}</a>
176 |                                 </p>
177 |                             </div>
178 |                         {% endfor %}
179 |                     </div>
180 |                     <div class="row">
181 |                         {% for caption, fig in fig_list %}
182 |                             <div id="plot-{{ caption }}" class="col-6">
183 |                                 <figure>
184 |                                     <a href="{{ fig }}">
185 |                                         <img class="rounded float-left" src="{{ fig }}" style="width: 100%"/>
186 |                                     </a>
187 |                                 <figcaption class="figure-caption text-center">{{ caption }}</figcaption>
188 |                                 </figure>
189 |                             </div>
190 |                         {% endfor %}
191 |                     </div>
192 |                     <hr>
193 |                     {% endfor %}
194 |                 </div>
195 | 
196 |                 <div class="container" id="versioning">
197 |                     <h1>Versioning</h1>
198 |                     <ul>
199 |                         <li>Python version: {{ python_version }}</li>
200 |                         <li>ngs-toolkit version: {{ library_version }}</li>
201 | 
202 |                         {% if freeze|length > 0 %}
203 |                         <p>
204 |                           <a class="btn btn-primary"
205 |                              data-toggle="collapse"
206 |                              href="#versioning-additional"
207 |                              role="button"
208 |                              aria-expanded="false"
209 |                              aria-controls="versioning-additional">
210 |                           Additional Python libraries</a>
211 |                         </p>
212 |                         <div class="row collapse" id="versioning-additional">
213 |                             {% for library in freeze %}
214 |                                 <div id="versioning-{{library}}" class="col-2">
215 |                                     <li>{{ library }}</li>
216 |                                 </div>
217 |                             {% endfor %}
218 |                         </div>
219 |                         {% endif %}
220 |                     </ul>
221 |                     <hr>
222 |                 </div>
223 | 
224 |             </div>
225 |         </div>
226 | 
227 |     </div>
228 | 
229 |     <nav class="navbar fixed-bottom navbar-light bg-light">
230 |         <a class="navbar-brand" href="#"></a>
231 |         <p>Automatically generated by <a href="https://github.com/afrendeiro/toolkit">ngs-toolkit</a> at {{ time }}.</p>
232 |     </nav>
233 | 
234 |     <!-- Optional JavaScript -->
235 |     <!-- jQuery first, then Popper.js, then Bootstrap JS -->
236 |     <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js"
237 |             integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo"
238 |             crossorigin="anonymous"></script>
239 |     <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js"
240 |             integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1"
241 |             crossorigin="anonymous"></script>
242 |     <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js"
243 |             integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM"
244 |             crossorigin="anonymous"></script>
245 |   </body>
246 | </html>
247 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/afrendeiro/toolkit/c21b69dd8c2a58195c4d798ec53e44999cf5cb6a/ngs_toolkit/tests/__init__.py


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_analysis.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | import glob
  5 | import os
  6 | import shutil
  7 | 
  8 | import numpy as np
  9 | import pytest
 10 | 
 11 | from ngs_toolkit.analysis import Analysis
 12 | from ngs_toolkit.utils import get_this_file_or_timestamped
 13 | from .conftest import file_exists, file_not_empty, COMBAT
 14 | 
 15 | 
 16 | class TestAnalysis:
 17 |     def test_analysis_representation(self):
 18 |         name = "test_analysis"
 19 | 
 20 |         an = Analysis(name=name)
 21 |         assert an.__repr__() == "Analysis '{}'.".format(name)
 22 |         assert "samples" not in an.__repr__()
 23 | 
 24 |     def test_with_object_as(self):
 25 |         name = "test_analysis"
 26 | 
 27 |         an = Analysis(name=name)
 28 |         with an as _an:
 29 |             assert an is _an
 30 |             assert an == _an
 31 |             assert _an.__repr__() == "Analysis '{}'.".format(name)
 32 |             assert "samples" not in _an.__repr__()
 33 | 
 34 |     def test_analysis_creation(self, tmp_path):
 35 |         from ngs_toolkit.demo.data_generator import generate_project
 36 | 
 37 |         tmp_path = str(tmp_path)
 38 | 
 39 |         # Let's make several "reallish" test projects
 40 |         project_prefix_name = "test-project"
 41 |         data_types = ["ATAC-seq", "RNA-seq", "ChIP-seq"]  # "CNV"
 42 |         genome_assemblies = [("human", "hg38"), ("mouse", "mm10")]  # ("human", "hg19"),
 43 | 
 44 |         params = {
 45 |             "ATAC-seq": {
 46 |                 "n_factors": [1, 2, 3],
 47 |                 "n_features": [100, 1000, 10000],
 48 |                 "n_replicates": [1, 2, 5],
 49 |                 "analysis": "ATACSeqAnalysis",
 50 |             },
 51 |             "ChIP-seq": {
 52 |                 "n_factors": [1, 2, 3],
 53 |                 "n_features": [100, 1000, 10000],
 54 |                 "n_replicates": [1, 2, 5],
 55 |                 "analysis": "ChIPSeqAnalysis",
 56 |             },
 57 |             "RNA-seq": {
 58 |                 "n_factors": [1, 2, 3],
 59 |                 "n_features": [100, 1000, 25000],
 60 |                 "n_replicates": [1, 2, 5],
 61 |                 "analysis": "RNASeqAnalysis",
 62 |             },
 63 |         }
 64 | 
 65 |         for data_type in data_types:
 66 |             n_factors = params[data_type]["n_factors"][0]
 67 |             n_features = params[data_type]["n_features"][0]
 68 |             n_replicates = params[data_type]["n_replicates"][0]
 69 |             for organism, genome_assembly in genome_assemblies:
 70 | 
 71 |                 project_name = "{}_{}_{}_{}_{}_{}".format(
 72 |                     project_prefix_name,
 73 |                     data_type,
 74 |                     genome_assembly,
 75 |                     n_factors,
 76 |                     n_features,
 77 |                     n_replicates,
 78 |                 )
 79 | 
 80 |                 an = generate_project(
 81 |                     output_dir=tmp_path,
 82 |                     project_name=project_name,
 83 |                     organism=organism,
 84 |                     genome_assembly=genome_assembly,
 85 |                     data_type=data_type,
 86 |                     n_factors=n_factors,
 87 |                     n_replicates=n_replicates,
 88 |                     n_features=n_features,
 89 |                     only_metadata=True,
 90 |                 )
 91 |                 # n_samples = (n_factors * n_replicates) + n_factors
 92 |                 # assert an.__repr__() == (
 93 |                 #     "'{}' analysis '{}' with {} samples of organism '{}' ({}).".format(
 94 |                 #         data_type, project_name, n_samples, organism, genome_assembly
 95 |                 #     )
 96 |                 # )
 97 |                 # assert len(n_factors * 2 * n_replicates * 2) == len(an.prj.samples) == len(an.samples)
 98 |                 assert all([x == y for x, y in zip(an.prj.samples, an.samples)])
 99 | 
100 |                 shutil.rmtree(tmp_path)
101 | 
102 |     def test_analysis_serialization(self, tmp_path):
103 | 
104 |         tmp_path = str(tmp_path)
105 | 
106 |         pickle_file = os.path.join(tmp_path, "analysis.pickle")
107 |         a = Analysis(pickle_file=pickle_file)
108 |         assert not file_exists(pickle_file)
109 |         a.to_pickle()
110 |         assert file_exists(pickle_file)
111 |         assert file_not_empty(pickle_file)
112 | 
113 |         previous_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size
114 |         a.random = np.random.random((100, 100))
115 |         a.to_pickle()
116 |         new_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size
117 |         assert new_size > previous_size
118 | 
119 |         previous_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size
120 |         a.random = np.random.random((100, 100))
121 |         a.to_pickle(timestamp=True)
122 |         assert len(glob.glob(os.path.join(tmp_path, "*.pickle"))) == 2
123 | 
124 |     def test_analysis_loading(self, tmp_path):
125 |         tmp_path = str(tmp_path)
126 |         pickle_file = os.path.join(tmp_path, "pickle")
127 |         secret = "I've existed before"
128 | 
129 |         a = Analysis()
130 |         a.pickle_file = pickle_file
131 |         a.secret = secret
132 |         a.to_pickle()
133 | 
134 |         a2 = Analysis(from_pickle=pickle_file)
135 |         assert a2.secret == secret
136 | 
137 |         a3 = Analysis()
138 |         a3.update(pickle_file)
139 |         assert a3.secret == secret
140 | 
141 |         a4 = Analysis()
142 |         a4.pickle_file = pickle_file
143 |         a4 = a4.from_pickle()
144 |         assert a4.secret == secret
145 | 
146 |         shutil.rmtree(tmp_path)
147 | 
148 |     def test__overwride_sample_representation(self, atac_analysis):
149 | 
150 |         prev = atac_analysis.samples[0].__repr__
151 |         Analysis._overwride_sample_representation()
152 |         new = atac_analysis.samples[0].__repr__
153 | 
154 |         assert prev != new
155 | 
156 |     def test__check_data_type_is_supported(self):
157 |         assert Analysis._check_data_type_is_supported("ATAC-seq")
158 |         assert Analysis._check_data_type_is_supported("ChIP-seq")
159 |         assert Analysis._check_data_type_is_supported("RNA-seq")
160 |         assert Analysis._check_data_type_is_supported("CNV")
161 |         assert not Analysis._check_data_type_is_supported("Microarray")
162 | 
163 |     def test__get_data_type(self, atac_analysis):
164 |         assert atac_analysis._get_data_type() == "ATAC-seq"
165 |         assert atac_analysis._get_data_type(data_type="ATAC-seq") == "ATAC-seq"
166 |         assert atac_analysis._get_data_type(data_type="RNA-seq") == "RNA-seq"
167 | 
168 |         with pytest.raises(ValueError):
169 |             atac_analysis._get_data_type(data_type="Microarray")
170 | 
171 |         atac_analysis.data_type = None
172 |         with pytest.raises(ValueError):
173 |             atac_analysis._get_data_type()
174 | 
175 |         del atac_analysis.data_type
176 |         with pytest.raises(AttributeError):
177 |             atac_analysis._get_data_type()
178 | 
179 |     def test__check_samples_have_file(self, atac_analysis):
180 |         with pytest.raises(AttributeError):
181 |             atac_analysis._check_samples_have_file("NOTEXISTING")
182 | 
183 |         # assert not atac_analysis._check_samples_have_file("summits")
184 | 
185 |         assert not atac_analysis._check_samples_have_file("sample_name")
186 | 
187 |     def test__get_samples_have_file(self, atac_analysis):
188 |         assert not atac_analysis._get_samples_have_file("sample_name")
189 | 
190 |     def test__get_samples_missing_file(self, atac_analysis):
191 |         with pytest.raises(AttributeError):
192 |             atac_analysis._get_samples_have_file("NOTEXISTING")
193 | 
194 |         assert not atac_analysis._get_samples_have_file("sample_name")
195 | 
196 |         assert not atac_analysis._get_samples_have_file("aligned_filtered_bam")
197 | 
198 |     def test__get_samples_with_input_file(self, atac_analysis):
199 |         with pytest.raises(AttributeError):
200 |             atac_analysis._get_samples_with_input_file("NOTEXISTING")
201 | 
202 |         with pytest.raises(IOError):
203 |             atac_analysis._get_samples_with_input_file("sample_name")
204 | 
205 |         with pytest.raises(IOError):
206 |             atac_analysis._get_samples_with_input_file("aligned_filtered_bam")
207 | 
208 |         assert not atac_analysis._get_samples_with_input_file(
209 |             "aligned_filtered_bam", permissive=True
210 |         )
211 |         assert not atac_analysis._get_samples_with_input_file("peaks", permissive=True)
212 |         assert not atac_analysis._get_samples_with_input_file("summits", permissive=True)
213 | 
214 |     @pytest.mark.parametrize(
215 |         "env_var,string",
216 |         [
217 |             ("_${USER}_", "_{}_".format(os.environ.get("USER"))),
218 |             # ("_$PATH_", "_{}_".format(os.environ.get("PATH"))),
219 |         ],
220 |     )
221 |     def test__format_string_with_environment_variables(self, env_var, string):
222 |         assert string == Analysis._format_string_with_environment_variables(env_var)
223 | 
224 |     def test__format_string_with_attributes_simple(self):
225 |         t = Analysis()
226 |         t.a = 1
227 |         t.b = ""
228 |         assert "1" == Analysis._format_string_with_attributes(t, "{a}{b}")
229 | 
230 |     @pytest.mark.parametrize(
231 |         "env_var,string",
232 |         [("{data_type}", "ATAC-seq"), ("{name}", "test-project_ATAC-seq_human_hg38_1_250_2"),],
233 |     )
234 |     def test__format_string_with_attributes(self, atac_analysis, env_var, string):
235 |         assert string == atac_analysis._format_string_with_attributes(env_var)
236 | 
237 |     def test_record_output_file(self, atac_analysis):
238 |         assert hasattr(atac_analysis, "output_files")
239 |         assert len(atac_analysis.output_files) == 0
240 |         atac_analysis.record_output_file("a", name="analysis")
241 |         assert hasattr(atac_analysis, "output_files")
242 |         assert len(atac_analysis.output_files) == 1
243 |         assert atac_analysis.output_files[0][0] == "analysis"
244 |         assert atac_analysis.output_files[0][1] == "a"
245 | 
246 | 
247 | def test_project_with_subprojects(subproject_config):
248 |     from ngs_toolkit import Analysis
249 | 
250 |     a = Analysis(from_pep=subproject_config)
251 |     assert len(a.samples) == 0
252 | 
253 |     a = Analysis(from_pep=subproject_config, amendments=["test_subproject"])
254 |     assert len(a.samples) > 0
255 | 
256 | 
257 | @pytest.mark.skipif(not COMBAT, reason="Combat not installed")
258 | def test_remove_factor(atac_analysis_many_factors):
259 |     import pandas as pd
260 | 
261 |     a = atac_analysis_many_factors
262 |     a.matrix_norm = a.matrix_norm.dropna()
263 | 
264 |     prefix = os.path.join(a.results_dir, "unsupervised_analysis_{}".format(a.data_type), a.name)
265 |     # inspect
266 |     a.unsupervised_analysis(output_prefix="before", steps=["pca_association"])
267 | 
268 |     f = prefix + ".before.pca.variable_principle_components_association.csv"
269 |     p = pd.read_csv(get_this_file_or_timestamped(f))
270 | 
271 |     # extract the name of the factor with highest contribution
272 |     factor = p.iloc[p.query("pc == 1")["p_value"].idxmin()]["attribute"]
273 |     # check if it's significant
274 |     assert p.query("attribute == '{}' and pc < 15".format(factor))["p_value"].min() < 0.05
275 | 
276 |     # remove factor without regard for the other factors
277 |     m = a.remove_factor_from_matrix(factor=factor, assign=False, save=False)
278 |     a.unsupervised_analysis(matrix=m, output_prefix="after_simple", steps=["pca_association"])
279 | 
280 |     f = prefix + ".after_simple.pca.variable_principle_components_association.csv"
281 |     p2 = pd.read_csv(get_this_file_or_timestamped(f))
282 |     assert p2.query("attribute == '{}' and pc < 15".format(factor))["p_value"].min() > 0.05
283 | 
284 |     # remove factor accounting for the other factors
285 |     m = a.remove_factor_from_matrix(
286 |         factor=factor,
287 |         covariates=[x for x in a.group_attributes if x != factor],
288 |         assign=False,
289 |         save=False,
290 |     )
291 |     a.unsupervised_analysis(matrix=m, output_prefix="after_covariates", steps=["pca_association"])
292 | 
293 |     f = prefix + ".after_covariates.pca.variable_principle_components_association.csv"
294 |     p3 = pd.read_csv(get_this_file_or_timestamped(f))
295 |     assert p3.query("attribute == '{}' and pc < 15".format(factor))["p_value"].min() > 0.05
296 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_chipseq_analysis.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import pytest
 5 | 
 6 | from .conftest import file_exists, file_exists_and_not_empty  # , CI, RPY2
 7 | 
 8 | 
 9 | def test_call_peaks_from_comparisons(chipseq_analysis):
10 |     chipseq_analysis.call_peaks_from_comparisons()
11 | 
12 |     for name, comp in chipseq_analysis.comparisons.items():
13 |         files = [
14 |             comp['prefix'] + ".homer.log",
15 |             comp['prefix'] + ".homer.sh",
16 |             comp['prefix'] + ".macs2.log",
17 |             comp['prefix'] + ".macs2.sh",
18 |         ]
19 |         for f in files:
20 |             assert file_exists(f)
21 | 
22 | 
23 | def test_filter_peaks(chipseq_analysis_with_peaks):
24 |     chipseq_analysis_with_peaks.filter_peaks()
25 | 
26 |     for name, comp in chipseq_analysis_with_peaks.comparisons.items():
27 |         files = [
28 |             comp['prefix'] + "_homer_peaks.factor.filtered.bed",
29 |             comp['prefix'] + "_homer_peaks.factor.narrowPeak",
30 |             comp['prefix'] + "_homer_peaks.histone.filtered.bed",
31 |             comp['prefix'] + "_homer_peaks.histone.narrowPeak",
32 |             comp['prefix'] + "_peaks.filtered.bed",
33 |             comp['prefix'] + "_peaks.narrowPeak",
34 |         ]
35 |         for f in files:
36 |             assert file_exists_and_not_empty(f)
37 | 
38 | 
39 | @pytest.mark.xfail
40 | def test_summarize_peaks_from_comparisons(chipseq_analysis_with_peaks):
41 |     chipseq_analysis_with_peaks.test_summarize_peaks_from_comparisons()
42 |     assert False
43 | 
44 | 
45 | @pytest.mark.xfail
46 | def test_get_consensus_sites(chipseq_analysis_with_peaks):
47 |     chipseq_analysis_with_peaks.test_get_consensus_sites()
48 |     assert False
49 | 
50 | 
51 | @pytest.mark.xfail
52 | def test_calculate_peak_support(chipseq_analysis_with_peaks):
53 |     chipseq_analysis_with_peaks.test_calculate_peak_support()
54 |     assert False
55 | 
56 | 
57 | @pytest.mark.xfail
58 | def test_get_supported_peaks(chipseq_analysis_with_peaks):
59 |     chipseq_analysis_with_peaks.test_get_supported_peaks()
60 |     assert False
61 | 
62 | 
63 | @pytest.mark.xfail
64 | def test_normalize_by_background(chipseq_analysis_with_peaks):
65 |     chipseq_analysis_with_peaks.test_normalize_by_background()
66 |     assert False
67 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_cnv_analysis.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | 
 5 | import pytest
 6 | import pandas as pd
 7 | 
 8 | from .conftest import file_exists_and_not_empty, STAP, DNACOPY  # , CI, RPY2
 9 | 
10 | 
11 | @pytest.mark.xfail
12 | def test__copy_cnv_profile_plots(cnv_analysis):
13 |     with cnv_analysis as an:
14 |         an._copy_cnv_profile_plots()
15 |     assert False
16 | 
17 | 
18 | def test_get_cnv_data(cnv_analysis_with_inputs):
19 |     with cnv_analysis_with_inputs as an:
20 |         an.get_cnv_data()
21 | 
22 |         p = os.path.join(an.results_dir, an.name)
23 |         files = [
24 |             p + ".10kb.matrix_raw.csv",
25 |             p + ".100kb.matrix_raw.csv",
26 |             p + ".1000kb.matrix_raw.csv"]
27 |         for f in files:
28 |             assert file_exists_and_not_empty(f)
29 |             assert pd.read_csv(f, index_col=0).sum().sum() == 0
30 | 
31 | 
32 | def test_normalize(cnv_analysis):
33 |     with cnv_analysis as an:
34 |         an.normalize()
35 | 
36 |         p = os.path.join(an.results_dir, an.name)
37 |         files = [
38 |             p + ".10kb.matrix_norm.csv",
39 |             p + ".100kb.matrix_norm.csv",
40 |             p + ".1000kb.matrix_norm.csv"]
41 |         for f in files:
42 |             assert file_exists_and_not_empty(f)
43 |             assert pd.read_csv(f, index_col=0).sum().sum() != 0
44 | 
45 | 
46 | def test_plot_all_data(cnv_analysis):
47 |     with cnv_analysis as an:
48 |         an.plot_all_data(matrix='matrix_raw')
49 | 
50 |         for res in an.resolutions:
51 |             p = os.path.join(an.results_dir, an.name + "." + res + ".all_data.full_data")
52 |             files = [
53 |                 p + ".fillna.clustermap.svg",
54 |                 p + ".heatmap.svg"]
55 |             for f in files:
56 |                 assert file_exists_and_not_empty(f)
57 | 
58 | 
59 | def test_plot_stats_per_chromosome(cnv_analysis):
60 |     with cnv_analysis as an:
61 |         an.plot_stats_per_chromosome(matrix="matrix_raw")
62 | 
63 |         for res in an.resolutions:
64 |             for t in ['mean', 'variation']:
65 |                 p = os.path.join(an.results_dir, an.name + "." + res + ".all_data." + t + "_per_chrom")
66 |                 files = [
67 |                     p + ".no_sex_chroms.zscore.svg",
68 |                     p + ".no_sex_chroms.svg",
69 |                     p + ".svg"]
70 |                 for f in files:
71 |                     assert file_exists_and_not_empty(f)
72 | 
73 | 
74 | # @pytest.mark.skipif(not STAP or not DNACOPY, reason="STAP and DNACopy R libraries are required to perform segmentation.")
75 | @pytest.mark.xfail
76 | def test_segment_genome(cnv_analysis):
77 |     cnv_analysis.segment_genome()
78 |     assert False
79 | 
80 | 
81 | @pytest.mark.xfail
82 | def test_annotate_with_chrom_bands(cnv_analysis):
83 |     cnv_analysis.annotate_with_chrom_bands()
84 |     assert False
85 | 
86 | 
87 | @pytest.mark.xfail
88 | def test_plot_segmentation_stats(cnv_analysis):
89 |     cnv_analysis.plot_segmentation_stats()
90 |     assert False
91 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_config.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | def test_config_has_all_required_fields():
 5 |     from ngs_toolkit import _CONFIG as local_config
 6 |     import pkgutil
 7 |     import yaml
 8 | 
 9 |     def _dicts_same_keys(d1, d2):
10 |         if type(d1) != type(d2):
11 |             return False
12 | 
13 |         for k in d1.keys():
14 |             if k not in d2:
15 |                 return False
16 |             else:
17 |                 if type(d1[k]) is dict:
18 |                     return _dicts_same_keys(d1[k], d2[k])
19 |                 else:
20 |                     return True
21 | 
22 |     file_config = (
23 |         pkgutil.get_data("ngs_toolkit", "config/default.yaml").decode().strip()
24 |     )
25 |     file_config = yaml.load(file_config)
26 | 
27 |     assert _dicts_same_keys(file_config, local_config)
28 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_decorators.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import pytest
 5 | 
 6 | 
 7 | class Test_check_has_attributes:
 8 |     # here we use 'get_resources' as en example
 9 |     # decorated function that won't fail for some other
10 |     # reason on a fairly empty analysis object
11 |     def test_empty_analysis(self, empty_analysis):
12 |         # Make sure it raises AttributeError
13 |         with pytest.raises(AttributeError):
14 |             empty_analysis.get_resources(steps=[])
15 | 
16 |     def test_null_analysis(self, null_analysis):
17 |         # Make sure it raises AttributeError
18 |         with pytest.raises(AttributeError):
19 |             null_analysis.get_resources(steps=[])
20 | 
21 |     def test_full_analysis(self, full_analysis):
22 |         full_analysis.get_resources(steps=[])
23 | 
24 |     # here we use 'calculate_peak_support' as en example
25 |     # decorated function. It will however fail for another
26 |     # reason due to the fairly empty analysis object (last test)
27 |     def test_empty_analysis_2(self, empty_analysis):
28 |         # Make sure it raises AttributeError
29 |         with pytest.raises(AttributeError):
30 |             empty_analysis.calculate_peak_support()
31 | 
32 |     def test_null_analysis_2(self, null_analysis):
33 |         # Make sure it raises AttributeError
34 |         with pytest.raises(AttributeError):
35 |             null_analysis.calculate_peak_support()
36 | 
37 |     def test_full_analysis_2(self, atac_analysis):
38 |         # This passes on the decorator
39 |         # but raises IOError specific to the function
40 |         with pytest.raises(IOError):
41 |             atac_analysis.calculate_peak_support()
42 | 
43 |     def test_iterable_attributes(self, atac_analysis):
44 |         from ngs_toolkit import Analysis
45 |         from ngs_toolkit.decorators import check_has_attributes
46 | 
47 |         class TestAnalysis(Analysis):
48 |             @check_has_attributes(['samples'], [list])
49 |             def test_function(self):
50 |                 print(self.samples)
51 |                 return True
52 | 
53 |         a = TestAnalysis()
54 | 
55 |         # has not samples set
56 |         del a.samples
57 |         with pytest.raises(AttributeError):
58 |             a.test_function()
59 | 
60 |         # samples is None
61 |         a.samples = None
62 |         with pytest.raises(AttributeError):
63 |             a.test_function()
64 | 
65 |         # samples is empty list
66 |         a.samples = list()
67 |         with pytest.raises(AttributeError):
68 |             a.test_function()
69 | 
70 |         # has samples
71 |         a.samples = [1, 2, 3]
72 |         assert a.test_function()
73 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_differential_analysis.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | 
  5 | import pytest
  6 | 
  7 | from .conftest import file_exists, file_exists_and_not_empty, R, R_REASON
  8 | 
  9 | 
 10 | # Note:
 11 | # The DESeq2 1.24.0 version in Debian archives
 12 | # differs from the DESeq2 1.24.0 version in bioconductor version 3.9
 13 | # If estimateDispersions with default fitType="parametric" fails,
 14 | # (as often happens with the quickly generated synthetic data from tests),
 15 | # it tries to use local fit using the locfit package, but in Debian
 16 | # version this is not a valid choice of fit, causing failure.
 17 | # Due to this, and since I'm using Debian packages for faster testing
 18 | # I'm manually setting fitType="mean" for testing only.
 19 | 
 20 | 
 21 | @pytest.fixture
 22 | def outputs(atac_analysis):
 23 |     output_dir = os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq")
 24 |     prefix = os.path.join(output_dir, "differential_analysis.")
 25 |     outs = [
 26 |         os.path.join(output_dir, "Factor_A_2vs1"),
 27 |         os.path.join(
 28 |             output_dir,
 29 |             "Factor_A_2vs1",
 30 |             "differential_analysis.deseq_result.Factor_A_2vs1.csv",
 31 |         ),
 32 |         prefix + "comparison_table.tsv",
 33 |         prefix + "count_matrix.tsv",
 34 |         prefix + "deseq_result.all_comparisons.csv",
 35 |         prefix + "experiment_matrix.tsv",
 36 |     ]
 37 |     return outs
 38 | 
 39 | 
 40 | # @pytest.fixture
 41 | # def outputs_no_subdirectories(analysis):
 42 | #     output_dir = os.path.join(analysis.results_dir, "differential_analysis_ATAC-seq")
 43 | #     prefix = os.path.join(output_dir, "differential_analysis.")
 44 | #     outputs = [
 45 | #         prefix + "deseq_result.Factor_A_2vs1.csv",
 46 | #         prefix + "comparison_table.tsv",
 47 | #         prefix + "count_matrix.tsv",
 48 | #         prefix + "deseq_result.all_comparisons.csv",
 49 | #         prefix + "experiment_matrix.tsv"]
 50 | #     return outputs
 51 | 
 52 | 
 53 | @pytest.mark.skipif(
 54 |     not R,
 55 |     reason=R_REASON)
 56 | def test_deseq_functionality():
 57 |     import pandas as pd
 58 |     from ngs_toolkit.utils import recarray2pandas_df
 59 | 
 60 |     from rpy2.robjects import numpy2ri, pandas2ri, r
 61 |     from rpy2.robjects.packages import importr
 62 |     numpy2ri.activate()
 63 |     pandas2ri.activate()
 64 | 
 65 |     importr("DESeq2")
 66 | 
 67 |     dds = r.makeExampleDESeqDataSet()
 68 |     dds = r.estimateSizeFactors(dds)
 69 |     dds = r.estimateDispersions(dds)
 70 |     dds = r.nbinomWaldTest(dds)
 71 |     res = recarray2pandas_df(r("as.data.frame")(r("DESeq2::results")(dds)))
 72 |     assert isinstance(res, pd.DataFrame)
 73 | 
 74 |     dds = r.makeExampleDESeqDataSet()
 75 |     dds = r.DESeq(dds)
 76 |     res = recarray2pandas_df(r("as.data.frame")(r("DESeq2::results")(dds)))
 77 |     assert isinstance(res, pd.DataFrame)
 78 | 
 79 | 
 80 | @pytest.mark.skipif(
 81 |     not R,
 82 |     reason=R_REASON)
 83 | class Test_differential_analysis:
 84 |     def test_simple_design(self, atac_analysis, outputs):
 85 |         import pandas as pd
 86 | 
 87 |         atac_analysis.differential_analysis()
 88 |         assert file_exists(
 89 |             os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq")
 90 |         )
 91 |         assert file_exists(outputs[0])
 92 |         assert os.path.isdir(outputs[0])
 93 |         for output in outputs[1:]:
 94 |             assert file_exists_and_not_empty(output)
 95 |         assert hasattr(atac_analysis, "differential_results")
 96 |         assert isinstance(atac_analysis.differential_results, pd.DataFrame)
 97 |         assert atac_analysis.differential_results.index.str.startswith("chr").all()
 98 |         assert atac_analysis.differential_results.index.name == "index"
 99 |         cols = [
100 |             "baseMean",
101 |             "log2FoldChange",
102 |             "lfcSE",
103 |             "stat",
104 |             "pvalue",
105 |             "padj",
106 |             "comparison_name",
107 |         ]
108 |         assert atac_analysis.differential_results.columns.tolist() == cols
109 | 
110 |     def test_complex_design(self, atac_analysis, outputs):
111 |         import pandas as pd
112 | 
113 |         atac_analysis.differential_analysis()
114 |         assert file_exists(
115 |             os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq")
116 |         )
117 |         assert file_exists(outputs[0])
118 |         assert os.path.isdir(outputs[0])
119 |         for output in outputs[1:]:
120 |             assert file_exists_and_not_empty(output)
121 |         assert hasattr(atac_analysis, "differential_results")
122 |         assert isinstance(atac_analysis.differential_results, pd.DataFrame)
123 |         assert atac_analysis.differential_results.index.str.startswith("chr").all()
124 |         assert atac_analysis.differential_results.index.name == "index"
125 |         cols = [
126 |             "baseMean",
127 |             "log2FoldChange",
128 |             "lfcSE",
129 |             "stat",
130 |             "pvalue",
131 |             "padj",
132 |             "comparison_name",
133 |         ]
134 |         assert atac_analysis.differential_results.columns.tolist() == cols
135 | 
136 |     # def test_no_subdirectories(self, atac_analysis, outputs):
137 |     #     atac_analysis.differential_analysis()
138 |     #     assert file_exists(
139 |     #         os.path.join(atac_analysis.results_dir, "differential_analysis_ATAC-seq"))
140 |     #     assert file_exists(outputs[0])
141 |     #     assert os.path.isdir(outputs[0])
142 |     #     for output in outputs[1:]:
143 |     #         assert file_exists(output)
144 |     #         assert os.stat(output).st_size > 0
145 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_differential_enrichment.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | 
 5 | import pytest
 6 | from .conftest import file_exists_and_not_empty
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def outputs(analysis_with_differential):
11 |     prefix = os.path.join(
12 |         analysis_with_differential.results_dir, "differential_analysis_ATAC-seq", "enrichments"
13 |     )
14 |     outputs = [
15 |         os.path.join(
16 |             prefix, "Factor_A_2vs1.down/differential_analysis.gene_symbols.txt"
17 |         ),
18 |         os.path.join(prefix, "Factor_A_2vs1.down/differential_analysis_regions.bed"),
19 |         os.path.join(prefix, "Factor_A_2vs1.down/differential_analysis_regions.tsv"),
20 |         os.path.join(
21 |             prefix, "Factor_A_2vs1.down/differential_analysis.enrichr.csv"
22 |         ),
23 |         os.path.join(prefix, "Factor_A_2vs1.up/differential_analysis.gene_symbols.txt"),
24 |         os.path.join(prefix, "Factor_A_2vs1.up/differential_analysis_regions.bed"),
25 |         os.path.join(prefix, "Factor_A_2vs1.up/differential_analysis_regions.tsv"),
26 |         os.path.join(
27 |             prefix, "Factor_A_2vs1.up/differential_analysis.enrichr.csv"
28 |         ),
29 |         os.path.join(prefix, "differential_analysis.enrichr.csv"),
30 |     ]
31 |     return outputs
32 | 
33 | 
34 | # @pytest.mark.skip(reason="no way of currently testing this")
35 | class Test_differential_enrichment:
36 |     def test_no_arguments(self, analysis_with_differential, outputs):
37 |         analysis_with_differential.differential_enrichment(steps=["enrichr"])
38 |         for output in outputs:
39 |             assert file_exists_and_not_empty(output)
40 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_general.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | 
  5 | import pybedtools
  6 | from ngs_toolkit.general import lola
  7 | 
  8 | import pytest
  9 | from .conftest import file_exists_and_not_empty, CI  # , RPY2
 10 | 
 11 | 
 12 | class Test_annotate_samples:
 13 |     def test_no_arguments(self, analysis_normalized):
 14 |         analysis_normalized.annotate_samples()
 15 | 
 16 |     def test_matrix_raw(self, atac_analysis):
 17 |         atac_analysis.annotate_samples(matrix="matrix_raw")
 18 | 
 19 | 
 20 | def test_get_matrix(atac_analysis):
 21 |     import numpy as np
 22 |     import pandas as pd
 23 | 
 24 |     matrix = atac_analysis.get_matrix(matrix="matrix_raw")
 25 |     assert np.array_equal(matrix.values, atac_analysis.matrix_raw.values)
 26 |     assert (matrix == atac_analysis.matrix_raw).all().all()
 27 |     atac_analysis.dummy = atac_analysis.matrix_raw + 1
 28 |     matrix = atac_analysis.get_matrix(matrix="dummy")
 29 |     assert (matrix == (atac_analysis.matrix_raw + 1)).all().all()
 30 | 
 31 |     matrix = atac_analysis.get_matrix(matrix=atac_analysis.matrix_raw)
 32 |     assert np.array_equal(matrix.values, atac_analysis.matrix_raw.values)
 33 |     assert (matrix == atac_analysis.matrix_raw).all().all()
 34 |     atac_analysis.dummy = atac_analysis.matrix_raw + 1
 35 |     matrix = atac_analysis.get_matrix(matrix="dummy")
 36 |     assert (matrix == (atac_analysis.matrix_raw + 1)).all().all()
 37 | 
 38 |     # sample subssetting
 39 |     matrix = atac_analysis.get_matrix(
 40 |         matrix="matrix_raw", samples=atac_analysis.samples[:2])
 41 |     assert (pd.Series([
 42 |         s.name
 43 |         for s in atac_analysis.samples[:2]]) == matrix.columns).all()
 44 | 
 45 | 
 46 | # +++ get_genome_reference
 47 | # index_fasta
 48 | # twobit_to_fasta
 49 | # +++ get_blacklist_annotations
 50 | # +++ get_tss_annotations
 51 | # +++ get_genomic_context
 52 | # +++ get_chromosome_sizes
 53 | # +++ deseq_analysis
 54 | # least_squares_fit
 55 | 
 56 | 
 57 | def test_differential_from_bivariate_fit(analysis_normalized):
 58 |     from ngs_toolkit.general import differential_from_bivariate_fit
 59 | 
 60 |     with analysis_normalized as an:
 61 |         out_dir = os.path.join(an.results_dir, "diff")
 62 |         out_prefix = os.path.join(out_dir, "bivariate_fit")
 63 |         differential_from_bivariate_fit(
 64 |             an.comparison_table, an.matrix_norm,
 65 |             out_dir, out_prefix)
 66 | 
 67 |         outs = [
 68 |             ".deseq_result.all_comparisons.csv",
 69 |             ".deseq_result.all_comparisons.scatter.svg",
 70 |             ".fit_result.Factor_A_2vs1.csv"]
 71 |         for f in outs:
 72 |             assert file_exists_and_not_empty(out_prefix + f)
 73 | 
 74 | 
 75 | @pytest.mark.skipif(
 76 |     CI,
 77 |     reason="LOLA testing is not performed on CI.")
 78 | class Test_LOLA():
 79 |     def test_lola_function(self, tmp_path):
 80 |         bed = pybedtools.example_bedtool('hg38-base.bed')
 81 |         univ = bed.slop(l=0, r=10, genome='hg38')
 82 |         bed_file = bed.fn
 83 |         universe_file = univ.fn
 84 |         output_folder = os.path.dirname(tmp_path)
 85 |         genome = "hg38"
 86 | 
 87 |         lola(bed_file, universe_file, output_folder, genome)
 88 | 
 89 |         output_files = [
 90 |             "allEnrichments.tsv",
 91 |             "col_codex.tsv"]
 92 |         for file in output_files:
 93 |             assert file_exists_and_not_empty(os.path.join(output_folder, file))
 94 | 
 95 |     def test_lola_function_multiple_inputs(self, tmp_path):
 96 |         import shutil
 97 |         bed = pybedtools.example_bedtool('hg38-base.bed')
 98 |         univ = bed.slop(l=0, r=10, genome='hg38')
 99 |         bed_file = bed.fn
100 |         shutil.copy(bed_file, "A.bed")
101 |         shutil.copy(bed_file, "B.bed")
102 |         universe_file = univ.fn
103 |         output_folder = os.path.dirname(tmp_path)
104 |         genome = "hg38"
105 | 
106 |         lola(["A.bed", "B.bed"], universe_file, output_folder, genome)
107 | 
108 |         output_files = [
109 |             "allEnrichments",
110 |             "col_codex"]
111 |         for file in output_files:
112 |             for i in ['A', 'B']:
113 |                 assert file_exists_and_not_empty(
114 |                     os.path.join(output_folder, file + i + ".tsv"))
115 | 
116 | 
117 |     def test_lola_through_differential_enrichment(
118 |             self, analysis_with_differential):
119 |         with analysis_with_differential as an:
120 |             an.differential_enrichment(steps=['lola'])
121 | 
122 |         output_files = [
123 |             "allEnrichments.tsv",
124 |             "col_codex.tsv"]
125 | 
126 |         for file in output_files:
127 |             for direction in ['up', 'down']:
128 |                 assert file_exists_and_not_empty(os.path.join(
129 |                     an.results_dir,
130 |                     "differential_analysis_ATAC-seq/enrichments/Factor_A_2vs1."
131 |                     + direction, file))
132 | 
133 |     def test_lola_through_differential_enrichment_distributed(
134 |             self, analysis_with_differential):
135 |         with analysis_with_differential as an:
136 |             an.differential_enrichment(steps=['lola'], distributed=True)
137 | 
138 |         output_files = [
139 |             "allEnrichments.tsv",
140 |             "col_codex.tsv"]
141 | 
142 |         for file in output_files:
143 |             for direction in ['up', 'down']:
144 |                 assert file_exists_and_not_empty(os.path.join(
145 |                     an.results_dir,
146 |                     "differential_analysis_ATAC-seq/enrichments/Factor_A_2vs1."
147 |                     + direction, file))
148 | 
149 |     # def test_lola__plot_differential_enrichment(self):
150 |     #     pass
151 | 
152 | # meme_ame
153 | 
154 | @pytest.mark.skipif(
155 |     CI,
156 |     reason="HOMER testing is not performed on CI.")
157 | class TestHomer():
158 |     def test_homer_function(self, tmp_path):
159 |         from ngs_toolkit.general import homer_motifs
160 | 
161 |         bed = pybedtools.example_bedtool('hg38-base.bed')
162 |         univ = bed.slop(l=0, r=10, genome='hg38')
163 |         bed_file = bed.fn
164 |         universe_file = univ.fn
165 |         output_dir = os.path.dirname(tmp_path)
166 |         genome_assembly = "hg38"
167 | 
168 |         homer_motifs(bed_file, output_dir, genome_assembly)
169 |         assert os.path.exists(os.path.join(output_dir, "homerMotifs.all.motifs"))
170 | 
171 | 
172 | # homer_combine_motifs
173 | # +++ enrichr
174 | # run_enrichment_jobs
175 | 
176 | 
177 | def test_project_to_geo(atac_analysis_with_unmapped_input_files):
178 |     from ngs_toolkit.general import project_to_geo
179 | 
180 |     with atac_analysis_with_unmapped_input_files as an:
181 |         out_dir = os.path.join(an.root_dir, "geo_submission")
182 |         annot = project_to_geo(
183 |             an.prj,
184 |             output_dir=out_dir, steps=['bam', 'peaks'],
185 |             computing_configuration="default")
186 | 
187 |         cols = [
188 |             'bam_file0', 'bam_file0_md5sum',
189 |             # 'bigwig_file', 'bigwig_file_md5sum',
190 |             'peaks_file', 'peaks_file_md5sum']
191 |         assert all(annot.columns == cols)
192 | 
193 |         outs = [
194 |             "project_to_geo.{}.sh",
195 |             "{}.bam",
196 |             "{}.bam.md5",
197 |             # "{}.bigWig",
198 |             # "{}.bigWig.md5",
199 |             "{}.peaks.narrowPeak",
200 |             "{}.peaks.narrowPeak.md5"]
201 |         for sample in an.samples:
202 |             for f in outs:
203 |                 assert file_exists_and_not_empty(
204 |                     os.path.join(out_dir, f.format(sample.name)))
205 | 
206 | 
207 | def test_rename_sample_files(atac_analysis_with_input_files):
208 |     import pandas as pd
209 |     from ngs_toolkit.general import rename_sample_files
210 | 
211 |     with atac_analysis_with_input_files as an:
212 | 
213 |         df = pd.DataFrame(
214 |             [['S01_A1', 'S02_A1'], ['SXX_ZZ', 'SYY_ZZ']],
215 |             index=['old_sample_name', 'new_sample_name']).T
216 | 
217 |         rename_sample_files(df, results_dir=an.data_dir)
218 | 
219 |         for sample in ['SXX_ZZ', 'SYY_ZZ']:
220 |             outs = [
221 |                 os.path.join("mapped", sample + '.trimmed.bowtie2.filtered.bam'),
222 |                 os.path.join("mapped", sample + '.trimmed.bowtie2.filtered.bam.bai'),
223 |                 os.path.join("peaks", sample + '_peaks.narrowPeak'),
224 |                 os.path.join("peaks", sample + '_summits.bed')]
225 |             for f in outs:
226 |                 assert file_exists_and_not_empty(os.path.join(an.data_dir, sample, f))
227 | 
228 | 
229 | # +++ query_biomart
230 | 
231 | 
232 | def test_subtract_principal_component(analysis_normalized):
233 |     from ngs_toolkit.general import subtract_principal_component
234 | 
235 |     with analysis_normalized as an:
236 | 
237 |         plot = os.path.join(an.root_dir, "subtract_plot.svg")
238 |         df = subtract_principal_component(
239 |             an.matrix_norm.T, plot_name=plot).T
240 | 
241 |         assert df.shape == an.matrix_norm.shape
242 |         assert file_exists_and_not_empty(plot)
243 | 
244 | 
245 | # fix_batch_effect_limma
246 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_install.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | 
  5 | import pytest
  6 | 
  7 | from .conftest import CI, DEV, BUILD_DIR
  8 | 
  9 | 
 10 | @pytest.mark.skipif(
 11 |     not CI,
 12 |     reason="Development mode, not testing Pypi requirements")
 13 | def test_version_matches():
 14 |     from ngs_toolkit import __version__ as installed_version
 15 |     from pkg_resources import get_distribution
 16 | 
 17 |     file_version = get_distribution('ngs_toolkit').version
 18 | 
 19 |     assert installed_version == file_version
 20 | 
 21 | 
 22 | @pytest.mark.skipif(
 23 |     DEV,
 24 |     reason="Development mode, not testing Pypi requirements")
 25 | def test_pypi_requirements_are_importable():
 26 |     import requests
 27 |     import importlib
 28 | 
 29 |     package_name = "ngs-toolkit"
 30 |     url = "https://pypi.python.org/pypi/" + str(package_name) + "/json"
 31 |     data = requests.get(url).json()
 32 | 
 33 |     # handle packages where the package name is different than the Pypi name
 34 |     replace = {
 35 |         "setuptools-scm": "setuptools_scm",
 36 |         "scikit-learn": "sklearn"}
 37 | 
 38 |     # not extra requirements
 39 |     requirements = [
 40 |         x.split(" ")[0]
 41 |         for x in data["info"]["requires_dist"]
 42 |         if "extra" not in x
 43 |     ]
 44 |     for req in requirements:
 45 |         if req in replace:
 46 |             requirements.pop(requirements.index(req))
 47 |             requirements.append(replace[req])
 48 | 
 49 |     for req in requirements:
 50 |         try:
 51 |             importlib.import_module(req)
 52 |         except ImportError:
 53 |             assert False
 54 | 
 55 | 
 56 | def test_all_requirements_are_importable():
 57 |     import importlib
 58 | 
 59 |     # test only basic requirements (not extras)
 60 |     path = None
 61 |     if CI:
 62 |         reqs = os.path.join(BUILD_DIR, "requirements", "requirements.txt")
 63 |         if os.path.exists(reqs):
 64 |             path = reqs
 65 |     if path is None:
 66 |         path = os.path.join("requirements", "requirements.txt")
 67 | 
 68 |     if not os.path.exists(path):
 69 |         pytest.skip("Could not locate requirements.txt")
 70 | 
 71 |     data = open(path).read().split("\n")
 72 | 
 73 |     replace = {"scikit-learn": "sklearn"}
 74 | 
 75 |     # handle github stuff
 76 |     requirements = list()
 77 |     for x in data:
 78 |         for sep in ['>=', '<=', '=', ">", "<"]:
 79 |             x = x.split(sep)
 80 |             if len(x) == 2:
 81 |                 x = x[0].replace("=", "").replace(">", "").replace("<", "")
 82 |             else:
 83 |                 x = x[0]
 84 |         if "extra" not in x:
 85 |             requirements.append(x)
 86 | 
 87 |     # remove commnets
 88 |     requirements = [x[:x.index("  #")] if "#" in x else x
 89 |                     for x in requirements]
 90 |     # remove empty lines
 91 |     requirements = [x for x in requirements if x != ""]
 92 |     for req in requirements:
 93 |         if req in replace:
 94 |             requirements.pop(requirements.index(req))
 95 |             requirements.append(replace[req])
 96 | 
 97 |     for req in requirements:
 98 |         try:
 99 |             importlib.import_module(req)
100 |         except ImportError:
101 |             assert False, "Required '%s' module could not be found!" % req
102 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_logger.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import pytest
 5 | 
 6 | 
 7 | @pytest.fixture
 8 | def log():
 9 |     return os.path.join(os.path.expanduser("~"), ".ngs_toolkit.log.txt")
10 | 
11 | 
12 | def test_config_has_all_required_fields(log):
13 |     import logging
14 |     from ngs_toolkit import _LOGGER
15 | 
16 |     assert isinstance(_LOGGER, logging.Logger)
17 |     previous_size = os.stat(log).st_size
18 |     _LOGGER.info("Testing logger")
19 |     new_size = os.stat(log).st_size
20 |     assert new_size > previous_size
21 | 
22 | 
23 | def test_clear_log(log):
24 |     from ngs_toolkit import clear_log
25 | 
26 |     previous_size = os.stat(log).st_size
27 |     clear_log()
28 |     new_size = os.stat(log).st_size
29 |     assert new_size < previous_size
30 |     assert new_size == 0
31 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_plot_differential.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | 
 5 | import pytest
 6 | from .conftest import file_exists_and_not_empty, R, R_REASON
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def outputs(analysis_with_differential):
11 |     prefix = os.path.join(
12 |         analysis_with_differential.results_dir,
13 |         "differential_analysis_ATAC-seq",
14 |         "differential_analysis.",
15 |     )
16 |     outputs = [
17 |         # prefix + "diff_region.samples.clustermap.corr.svg",
18 |         # prefix + "diff_region.samples.clustermap.svg",
19 |         # prefix + "diff_region.samples.clustermap.z0.svg",
20 |         # prefix + "diff_region.samples.sorted.clustermap.svg",
21 |         # prefix + "diff_region.samples.sorted.clustermap.z0.svg",
22 |         prefix + "log2FoldChange.distribution.per_comparison.svg",
23 |         prefix + "log2FoldChange.distribution.svg",
24 |         prefix + "ma_plots.svg",
25 |         prefix + "number_differential.directional.svg",
26 |         prefix + "padj.distribution.per_comparison.svg",
27 |         prefix + "padj.distribution.svg",
28 |         prefix + "pvalue.distribution.per_comparison.svg",
29 |         prefix + "pvalue.distribution.svg",
30 |         prefix + "scatter_plots.svg",
31 |         prefix + "volcano_plots.svg",
32 |     ]
33 |     return outputs
34 | 
35 | 
36 | @pytest.mark.skipif(
37 |     not R,
38 |     reason=R_REASON)
39 | class Test_plot_differential:
40 |     def test_no_arguments(self, analysis_with_differential, outputs):
41 |         analysis_with_differential.plot_differential()
42 |         for output in outputs:
43 |             assert file_exists_and_not_empty(output)
44 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_plot_differential_enrichment.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | 
 5 | import pytest
 6 | from .conftest import file_exists_and_not_empty, R, R_REASON
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def outputs(analysis_with_differential_enrichment):
11 |     # gene_set_libraries = _CONFIG['resources']['enrichr']['gene_set_libraries']
12 |     gene_set_libraries = ["GO_Biological_Process_2015", "NCI-Nature_2016"]
13 |     prefix = os.path.join(
14 |         analysis_with_differential_enrichment.results_dir,
15 |         "differential_analysis_ATAC-seq",
16 |         "enrichments",
17 |         "differential_analysis.enrichr.",
18 |     )
19 |     outputs = list()
20 |     for g in gene_set_libraries:
21 |         outputs += [
22 |             prefix + "{}.barplot.top_5.svg".format(g),
23 |             prefix + "{}.cluster_specific.Row_z_score.svg".format(g),
24 |             prefix + "{}.cluster_specific.svg".format(g),
25 |             prefix + "{}.correlation.svg".format(g),
26 |             prefix + "{}.zscore_vs_pvalue.scatterplot.svg".format(g),
27 |         ]
28 |     return outputs
29 | 
30 | 
31 | @pytest.mark.skipif(
32 |     not R,
33 |     reason=R_REASON)
34 | class Test_plot_differential_enrichment:
35 |     def test_no_arguments(self, analysis_with_differential_enrichment, outputs):
36 |         analysis_with_differential_enrichment.plot_differential_enrichment()
37 |         for output in outputs:
38 |             assert file_exists_and_not_empty(output)
39 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_project_manager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | from .conftest import file_exists
 5 | 
 6 | 
 7 | def test_cli_parsing():
 8 |     import pytest
 9 |     from ngs_toolkit.project_manager import parse_arguments
10 | 
11 |     with pytest.raises(SystemExit):
12 |         parse_arguments()
13 |     for s in ["", "--help", "create", "recipe"]:
14 |         with pytest.raises(SystemExit):
15 |             parse_arguments(s)
16 |     args = parse_arguments("create asd")
17 |     assert args.command == "create"
18 | 
19 | 
20 | def test_project_creation(tmp_path):
21 |     from ngs_toolkit import _CONFIG
22 |     from ngs_toolkit.project_manager import create_project
23 |     import os
24 |     import pandas as pd
25 |     import shutil
26 | 
27 |     tmp_path = str(tmp_path)  # for Python2
28 | 
29 |     project_name = "test_project"
30 |     annotation_vars = [
31 |         "sample_name",
32 |         "toggle",
33 |         "pass_qc",
34 |         "protocol",
35 |         "library",
36 |         "cell_line",
37 |         "cell_type",
38 |         "condition",
39 |         "experimental_batch",
40 |         "experiment_name",
41 |         "replicate",
42 |         "organism",
43 |         "flowcell",
44 |         "lane",
45 |         "BSF_name",
46 |         "data_source",
47 |     ]
48 | 
49 |     genome_assemblies = {
50 |         k: v
51 |         for x in _CONFIG["preferences"]["default_genome_assemblies"]
52 |         for k, v in x.items()
53 |     }
54 |     create_project(
55 |         project_name,
56 |         genome_assemblies=genome_assemblies,
57 |         overwrite=True,
58 |         root_projects_dir=tmp_path,
59 |     )
60 | 
61 |     expected_files = [
62 |         os.path.join(tmp_path, project_name, ".git"),
63 |         os.path.join(tmp_path, project_name, "metadata"),
64 |         os.path.join(tmp_path, project_name, "metadata", "project_config.yaml"),
65 |         os.path.join(tmp_path, project_name, "metadata", "annotation.csv"),
66 |         os.path.join(tmp_path, project_name, "metadata", "sample_subannotation.csv"),
67 |         os.path.join(tmp_path, project_name, "metadata", "comparison_table.csv"),
68 |     ]
69 |     for f in expected_files:
70 |         assert file_exists(f)
71 | 
72 |     df = pd.read_csv(os.path.join(tmp_path, project_name, "metadata", "annotation.csv"))
73 |     assert df.shape == (0, len(annotation_vars))
74 |     assert all(c in df.columns for c in annotation_vars)
75 | 
76 |     shutil.rmtree(tmp_path)
77 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_recipes.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import sys
  5 | import subprocess
  6 | import pytest
  7 | import pandas as pd
  8 | 
  9 | from ngs_toolkit import Analysis
 10 | from .conftest import CI, DEV, file_exists_and_not_empty
 11 | 
 12 | 
 13 | def test_region_set_frip(pep):
 14 |     import pkgutil
 15 | 
 16 |     # For this test, we need an analysis object with sample attributes pointing
 17 |     # to their input files (just like the atac_analysis_with_input_files parent),
 18 |     # but since it has to be in inside the recipe, so we will temporarily set it
 19 |     # at the home directory level, for this test only
 20 |     config = os.path.join(os.path.expanduser("~"), ".ngs_toolkit.config.yaml")
 21 |     yaml = (
 22 |         pkgutil.get_data("ngs_toolkit", "config/example.yaml").decode().strip()
 23 |     )
 24 |     with open(config, "w") as handle:
 25 |         handle.write(yaml)
 26 | 
 27 |     cmd = (
 28 |         "{exe} -m ngs_toolkit.recipes.region_set_frip {pep} "
 29 |         "--computing-configuration default"
 30 |     ).format(exe=sys.executable, pep=pep)
 31 | 
 32 |     o = subprocess.call(cmd.split(" "))
 33 |     assert o == 0
 34 | 
 35 |     an = Analysis(from_pep=pep)
 36 |     for sample in an.samples:
 37 |         files = [
 38 |             "region_set_frip.all_reads.txt",
 39 |             "region_set_frip.inside_reads.txt",
 40 |             sample.name + ".region_set_frip.log",
 41 |             sample.name + ".region_set_frip.sh",
 42 |             "stats.tsv",
 43 |         ]
 44 |         for f in files:
 45 |             assert file_exists_and_not_empty(
 46 |                 os.path.join(sample.sample_root, f)
 47 |             )
 48 | 
 49 |     os.remove(config)
 50 | 
 51 | 
 52 | def test_deseq2(tmp_path, atac_analysis_with_input_files):
 53 |     an = atac_analysis_with_input_files
 54 |     an.differential_analysis(distributed=True, dry_run=True)
 55 | 
 56 |     p = "differential_analysis_ATAC-seq"
 57 |     comp_dir = os.path.join(an.results_dir, p, "Factor_A_2vs1")
 58 |     output_prefix = os.path.join(comp_dir, p)
 59 | 
 60 |     cmd = ("{} -m ngs_toolkit.recipes.deseq2 --output-prefix {} {}").format(
 61 |         sys.executable, output_prefix, comp_dir
 62 |     )
 63 | 
 64 |     o = subprocess.call(cmd.split(" "))
 65 |     assert o == 0
 66 | 
 67 |     files = [
 68 |         # "deseq_job.Factor_A_2vs1.log",
 69 |         p + ".deseq_result.Factor_A_2vs1.csv",
 70 |         p + ".deseq_result.all_comparisons.csv",
 71 |     ]
 72 |     for f in files:
 73 |         assert file_exists_and_not_empty(os.path.join(comp_dir, f))
 74 | 
 75 | 
 76 | def test_coverage(tmp_path, atac_analysis_with_input_files):
 77 | 
 78 |     region_set = atac_analysis_with_input_files.sites.fn
 79 |     sample = atac_analysis_with_input_files.samples[0]
 80 |     output = os.path.join(tmp_path, "output.bed")
 81 | 
 82 |     cmd = ("{} -m ngs_toolkit.recipes.coverage {} {} {}").format(
 83 |         sys.executable, region_set, sample.aligned_filtered_bam, output
 84 |     )
 85 | 
 86 |     o = subprocess.call(cmd.split(" "))
 87 |     assert o == 0
 88 | 
 89 |     assert file_exists_and_not_empty(output)
 90 | 
 91 |     assert pd.read_csv(output, sep="\t", header=None).shape[1] == 4
 92 | 
 93 | 
 94 | def test_enrichr_good(tmp_path):
 95 |     genes = ["PAX5", "SOX2"]
 96 |     input_file = os.path.join(tmp_path, "genes.txt")
 97 |     output_file = os.path.join(tmp_path, "enrichr.csv")
 98 |     with open(input_file, "w") as handle:
 99 |         for g in genes:
100 |             handle.write(g + "\n")
101 |     cmd = ("{} -m ngs_toolkit.recipes.enrichr {} {}").format(
102 |         sys.executable, input_file, output_file
103 |     )
104 | 
105 |     o = subprocess.call(cmd.split(" "))
106 |     assert o == 0
107 | 
108 |     assert file_exists_and_not_empty(output_file)
109 | 
110 |     assert pd.read_csv(output_file).shape[1] == 10
111 | 
112 | 
113 | def test_enrichr_bad(tmp_path):
114 |     # No genes enriched, should return empty dataframe
115 |     genes = ["!!~~IMPOSSIBLEGENE~~!!"]
116 |     input_file = os.path.join(tmp_path, "impossible_genes.txt")
117 |     output_file = os.path.join(tmp_path, "empty_enrichr.csv")
118 |     with open(input_file, "w") as handle:
119 |         for g in genes:
120 |             handle.write(g + "\n")
121 |     cmd = ("{} -m ngs_toolkit.recipes.enrichr {} {}").format(
122 |         sys.executable, input_file, output_file
123 |     )
124 | 
125 |     o = subprocess.call(cmd.split(" "))
126 |     assert o == 0
127 | 
128 |     assert file_exists_and_not_empty(output_file)
129 | 
130 |     with pytest.raises(pd.errors.EmptyDataError):
131 |         pd.read_csv(output_file)
132 | 
133 | 
134 | @pytest.mark.skipif(CI or DEV, reason="Test too long to be performed on CI.")
135 | def test_ngs_analysis(pep):
136 |     cmd = ("{exe} -m ngs_toolkit.recipes.ngs_analysis {pep}").format(
137 |         exe=sys.executable, pep=pep
138 |     )
139 | 
140 |     o = subprocess.call(cmd.split(" "))
141 |     assert o == 0
142 | 
143 | 
144 | @pytest.mark.skipif(CI or DEV, reason="Test too long to be performed on CI.")
145 | def test_merge_signal(pep):
146 |     import pkgutil
147 |     from .conftest import file_exists_and_not_empty
148 | 
149 |     dir_ = os.path.dirname(os.path.dirname(pep))
150 |     output_dir = os.path.join(dir_, "data_merged")
151 |     cmd = (
152 |         "{exe} -m ngs_toolkit.recipes.merge_signal "
153 |         "-d "
154 |         "--attributes A "
155 |         "--output-dir {output_dir} "
156 |         "{pep}"
157 |     ).format(exe=sys.executable, output_dir=output_dir, pep=pep)
158 | 
159 |     # this requires a config with sample input files
160 |     file_config = os.path.join(
161 |         os.path.expanduser("~"), ".ngs_toolkit.config.yaml"
162 |     )
163 |     content = (
164 |         pkgutil.get_data("ngs_toolkit", "config/default.yaml").decode().strip()
165 |     )
166 |     with open(file_config, "w") as handle:
167 |         handle.write(content)
168 | 
169 |     o = subprocess.call(cmd.split(" "))
170 |     assert o == 0
171 | 
172 |     files = [
173 |         # "A_1.bigWig",
174 |         # "A_1.merged.bam",
175 |         # "A_1.merged.sorted.bam",
176 |         # "A_1.merged.sorted.bam.bai",
177 |         "A_1.merge_signal.sh",
178 |         # "A_2.bigWig",
179 |         # "A_2.merged.bam",
180 |         # "A_2.merged.sorted.bam",
181 |         # "A_2.merged.sorted.bam.bai",
182 |         "A_2.merge_signal.sh",
183 |     ]
184 | 
185 |     for f in files:
186 |         assert file_exists_and_not_empty(os.path.join(output_dir, f))
187 | 
188 |     os.remove(file_config)
189 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_regression_tests.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | from ngs_toolkit import Analysis
 5 | import pybedtools
 6 | import pandas as pd
 7 | 
 8 | 
 9 | def test_pybedtools_to_from_dataframe():
10 | 
11 |     class T(Analysis):
12 | 
13 |         def __init__(self, *args, **kwargs):
14 |             super(T, self).__init__(
15 |                 *args,
16 |                 **kwargs
17 |             )
18 |             self.samples = list()
19 | 
20 |         def trigger(self):
21 |             d = pd.DataFrame(
22 |                 [['chr1', 1, 10], ['chr2', 1, 10]],
23 |                 columns=['chrom', 'start', 'end'])
24 |             b = pybedtools.BedTool.from_dataframe(d)
25 |             return b.to_dataframe()
26 | 
27 |     t = T()
28 |     assert isinstance(t.trigger(), pd.DataFrame)
29 | 
30 | 
31 | def test_get_right_timestamped_file(tmpdir):
32 |     from ngs_toolkit.utils import get_this_file_or_timestamped
33 | 
34 |     target = os.path.join(tmpdir, "human.grch38.genomic_context.bed")
35 |     assert get_this_file_or_timestamped(target) == target
36 | 
37 |     outs = [
38 |         "human.grch38.genomic_context.2019-09-03-11:46:42.bed",
39 |         "human.grch38.genomic_context.exon.2019-09-03-11:46:36.bed",
40 |         "human.grch38.genomic_context.genebody.2019-09-03-11:46:36.bed",
41 |         "human.grch38.genomic_context.intergenic.2019-09-03-11:46:41.bed",
42 |         "human.grch38.genomic_context.intron.2019-09-03-11:46:38.bed",
43 |         "human.grch38.genomic_context.promoter.2019-09-03-11:46:36.bed",
44 |         "human.grch38.genomic_context.utr3.2019-09-03-11:46:40.bed",
45 |         "human.grch38.genomic_context.utr5.2019-09-03-11:46:39.bed"]
46 |     outs = [os.path.join(tmpdir, f) for f in outs]
47 | 
48 |     # Now with several existing files that also match the regex
49 |     for f in outs:
50 |         with open(f, "w") as handle:
51 |             handle.write(f)
52 | 
53 |     assert get_this_file_or_timestamped(target) == outs[0]
54 | 
55 | 
56 | def test_bedtools_intersect_to_dataframe():
57 |     import pandas as pd
58 |     import pybedtools
59 | 
60 |     a = pd.DataFrame([
61 |         ['chr1', 9844, 10460],
62 |         ['chr1', 180534, 181797]])
63 | 
64 |     b = pd.DataFrame([
65 |         ['chr1', 10000, 10800, '9_Het'],
66 |         ['chr1', 10800, 16000, '15_Quies'],
67 |         ['chr1', 16000, 16200, '1_TssA'],
68 |         ['chr1', 16200, 19000, '5_TxWk'],
69 |         ['chr1', 19000, 96080, '15_Quies'],
70 |         ['chr1', 96276, 96476, '15_Quies'],
71 |         ['chr1', 97276, 177200, '15_Quies']])
72 | 
73 |     a_ = pybedtools.BedTool.from_dataframe(a)
74 |     b_ = pybedtools.BedTool.from_dataframe(b)
75 | 
76 |     res = a_.intersect(b_, wa=True, wb=True, loj=True)
77 |     df = res.to_dataframe()
78 |     assert isinstance(df, pd.DataFrame)
79 |     assert df.shape == (2, 7)
80 |     assert df.iloc[1, -1] == '.'
81 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_report.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from .conftest import file_exists, file_exists_and_not_empty
 4 | 
 5 | 
 6 | def test_generate_report(atac_analysis):
 7 |     report = atac_analysis._format_string_with_attributes(
 8 |         "{root_dir}/{name}.analysis_report.html")
 9 |     assert not file_exists(report)
10 |     atac_analysis.generate_report()
11 |     assert file_exists_and_not_empty(report)
12 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_rnaseq_analysis.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import os
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | from .conftest import file_exists, file_exists_and_not_empty
 9 | 
10 | 
11 | def test_rpm_normalization(rnaseq_analysis):
12 |     with rnaseq_analysis as analysis:
13 |         qnorm = analysis.normalize_rpm(save=False)
14 |         assert qnorm.dtypes.all() == np.float
15 |         assert hasattr(analysis, "matrix_norm")
16 |         rpm_file = os.path.join(
17 |             analysis.results_dir, analysis.name + ".matrix_norm.csv"
18 |         )
19 |         assert not file_exists(rpm_file)
20 |         qnorm = analysis.normalize_rpm(save=True)
21 |         assert file_exists_and_not_empty(rpm_file)
22 |         assert hasattr(analysis, "matrix_norm")
23 | 
24 | 
25 | def test_normalize(rnaseq_analysis):
26 |     qnorm = rnaseq_analysis.normalize_rpm(save=False)
27 |     assert isinstance(qnorm, pd.DataFrame)
28 |     assert hasattr(rnaseq_analysis, "matrix_norm")
29 |     del rnaseq_analysis.matrix_norm
30 | 
31 |     qnorm_d = rnaseq_analysis.normalize(method="rpm", save=False)
32 |     assert isinstance(qnorm_d, pd.DataFrame)
33 |     assert hasattr(rnaseq_analysis, "matrix_norm")
34 |     assert np.array_equal(qnorm_d, qnorm)
35 |     del rnaseq_analysis.matrix_norm
36 | 
37 |     qnorm = rnaseq_analysis.normalize_quantiles(save=False)
38 |     assert hasattr(rnaseq_analysis, "matrix_norm")
39 |     del rnaseq_analysis.matrix_norm
40 | 
41 |     qnorm_d = rnaseq_analysis.normalize(method="quantile", save=False)
42 |     assert isinstance(qnorm_d, pd.DataFrame)
43 |     assert hasattr(rnaseq_analysis, "matrix_norm")
44 |     assert np.array_equal(qnorm_d, qnorm)
45 |     del rnaseq_analysis.matrix_norm
46 | 
47 | 
48 | def test_annotate_features(rnaseq_analysis):
49 |     rnaseq_analysis.get_matrix_stats(matrix="matrix_raw")
50 |     rnaseq_analysis.annotate_features(matrix="matrix_raw")
51 |     f = os.path.join(
52 |         rnaseq_analysis.results_dir, rnaseq_analysis.name + ".matrix_features.csv"
53 |     )
54 |     assert hasattr(rnaseq_analysis, "matrix_features")
55 |     assert file_exists_and_not_empty(f)
56 | 
57 |     cols = [
58 |         "mean",
59 |         "variance",
60 |         "std_deviation",
61 |         "dispersion",
62 |         "qv2",
63 |         "amplitude",
64 |         "iqr",
65 |     ]  # from stats
66 | 
67 |     assert all([c in rnaseq_analysis.matrix_features.columns.tolist() for c in cols])
68 | 
69 | 
70 | def test_plot_expression_characteristics(rnaseq_analysis):
71 |     with rnaseq_analysis as analysis:
72 |         analysis.normalize()
73 |         analysis.plot_expression_characteristics()
74 |         assert file_exists(os.path.join(analysis.results_dir, "quality_control"))
75 | 
76 | 
77 | def test_plot_features(rnaseq_analysis):
78 |     from ngs_toolkit.rnaseq import plot_features
79 |     with rnaseq_analysis as analysis:
80 |         analysis.normalize_rpm()
81 | 
82 |         analysis.differential_analysis()
83 | 
84 |         plot_features(
85 |             analysis,
86 |             knockout_genes=analysis.matrix_norm.mean(1).nlargest(20).index.tolist())
87 | 
88 |         outs = [os.path.join(analysis.results_dir, f) for f in [
89 |             "knockout_expression.svg", "results/knockout_expression.sorted.svg"]]
90 |         for f in outs:
91 |             file_exists_and_not_empty(f)
92 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_sample_input_files.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | import os
  5 | 
  6 | import pandas as pd
  7 | import pytest
  8 | 
  9 | from .conftest import file_exists_and_not_empty
 10 | from ngs_toolkit.utils import get_this_file_or_timestamped
 11 | 
 12 | 
 13 | @pytest.fixture
 14 | def a(atac_analysis_with_input_files):
 15 |     return atac_analysis_with_input_files
 16 | 
 17 | 
 18 | def test_has_bam_files(a):
 19 |     v = [file_exists_and_not_empty(s.aligned_filtered_bam) for s in a.samples]
 20 |     assert all(v)
 21 |     v = [file_exists_and_not_empty(s.aligned_filtered_bam + ".bai") for s in a.samples]
 22 |     assert all(v)
 23 | 
 24 | 
 25 | def test_has_peak_files(a):
 26 |     v = [file_exists_and_not_empty(s.peaks) for s in a.samples]
 27 |     assert all(v)
 28 | 
 29 | 
 30 | def test_has_summit_files(a):
 31 |     v = [file_exists_and_not_empty(s.summits) for s in a.samples]
 32 |     assert all(v)
 33 | 
 34 | 
 35 | class Test_measure_coverage:
 36 |     def test_no_arguments(self, a):
 37 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
 38 | 
 39 |         os.remove(mn)
 40 | 
 41 |         a.measure_coverage()
 42 | 
 43 |         assert file_exists_and_not_empty(mn)
 44 | 
 45 |     def test_distributed(self, a):
 46 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
 47 | 
 48 |         os.remove(mn)
 49 | 
 50 |         a.measure_coverage(distributed=True, computing_configuration="localhost")
 51 | 
 52 |         # Check job files for each sample exist
 53 |         fs = list()
 54 |         for s in a.samples:
 55 |             f = os.path.join(s.sample_root, "coverage", s.name + ".peak_set_coverage.")
 56 |             for end in ["sh", "bed"]:
 57 |                 fs.append(f + end)
 58 |         assert all([file_exists_and_not_empty(f) for f in fs])
 59 | 
 60 |         # # has to be done separately for log files because they'll empty
 61 |         # # just check for existence
 62 |         fs = list()
 63 |         for s in a.samples:
 64 |             f = os.path.join(s.sample_root, "coverage", s.name + ".peak_set_coverage.")
 65 |             for end in ["log"]:
 66 |                 fs.append(f + end)
 67 |         assert all([os.path.exists(f) for f in fs])
 68 | 
 69 |     def test_few_samples(self, a):
 70 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
 71 | 
 72 |         os.remove(mn)
 73 | 
 74 |         a.measure_coverage(samples=a.samples[:2])
 75 | 
 76 |         mn = get_this_file_or_timestamped(mn)
 77 |         assert file_exists_and_not_empty(mn)
 78 |         assert pd.read_csv(mn, index_col=0).shape[1] == 2
 79 | 
 80 |     def test_one_sample(self, a):
 81 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
 82 | 
 83 |         os.remove(mn)
 84 | 
 85 |         a.measure_coverage(samples=a.samples[:1])
 86 | 
 87 |         mn = get_this_file_or_timestamped(mn)
 88 |         assert file_exists_and_not_empty(mn)
 89 |         assert pd.read_csv(mn, index_col=0).shape[1] == 1
 90 | 
 91 |     def test_missing_input_no_permissive(self, a):
 92 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
 93 | 
 94 |         os.remove(mn)
 95 |         os.remove(a.samples[0].aligned_filtered_bam)
 96 | 
 97 |         with pytest.raises(IOError):
 98 |             a.measure_coverage(samples=a.samples[:1])
 99 | 
100 |     def test_missing_input_all_samples(self, a):
101 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
102 | 
103 |         os.remove(mn)
104 |         for s in a.samples:
105 |             os.remove(s.aligned_filtered_bam)
106 | 
107 |         with pytest.raises(IOError):
108 |             a.measure_coverage()
109 | 
110 |     def test_missing_input_with_permissive(self, a):
111 |         mn = get_this_file_or_timestamped(os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))
112 | 
113 |         os.remove(mn)
114 |         os.remove(a.samples[0].aligned_filtered_bam)
115 | 
116 |         a.measure_coverage(samples=a.samples[:2], permissive=True)
117 | 
118 |         mn = get_this_file_or_timestamped(mn)
119 |         assert file_exists_and_not_empty(mn)
120 |         assert pd.read_csv(mn, index_col=0).shape[1] == 1
121 | 


--------------------------------------------------------------------------------
/ngs_toolkit/tests/test_unsupervised_analysis.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import shutil
  5 | 
  6 | import pytest
  7 | 
  8 | from .conftest import file_exists, file_not_empty, file_exists_and_not_empty
  9 | 
 10 | 
 11 | @pytest.fixture
 12 | def unsup_outputs(atac_analysis_many_factors):
 13 |     prefix = os.path.join(
 14 |         atac_analysis_many_factors.results_dir,
 15 |         "unsupervised_analysis_{}".format(atac_analysis_many_factors.data_type),
 16 |         atac_analysis_many_factors.name + ".all_{}s.".format(atac_analysis_many_factors.var_unit_name),
 17 |     )
 18 |     outputs = [
 19 |         prefix + "isomap.svg",
 20 |         prefix + "locallylinearembedding.svg",
 21 |         prefix + "mds.svg",
 22 |         prefix + "pca.explained_variance.csv",
 23 |         prefix + "pca.explained_variance.svg",
 24 |         prefix + "pca.svg",
 25 |         prefix + "pca.variable_principle_components_association.csv",
 26 |         prefix + "pca.variable_principle_components_association.p_value.masked.svg",
 27 |         prefix + "pca.variable_principle_components_association.p_value.svg",
 28 |         prefix + "pca.variable_principle_components_association.adj_pvalue.masked.svg",
 29 |         prefix + "pca.variable_principle_components_association.adj_pvalue.svg",
 30 |         prefix + "pearson_correlation.clustermap.svg",
 31 |         prefix + "spearman_correlation.clustermap.svg",
 32 |         prefix + "spectralembedding.svg",
 33 |         prefix + "tsne.svg",
 34 |     ]
 35 |     return outputs
 36 | 
 37 | 
 38 | class TestUnsupervisedAnalysis:
 39 |     def test_no_arguments(self, atac_analysis_many_factors, unsup_outputs):
 40 |         # no arguments
 41 |         atac_analysis_many_factors.unsupervised_analysis()
 42 |         for output in unsup_outputs:
 43 |             assert file_exists_and_not_empty(output)
 44 | 
 45 |     def test_matrix_with_no_group_attributes(self, atac_analysis_many_factors):
 46 |         atac_analysis_many_factors.group_attributes = []
 47 |         with pytest.raises(ValueError):
 48 |             atac_analysis_many_factors.unsupervised_analysis()
 49 | 
 50 |     def test_matrix_with_no_multiindex(self, atac_analysis_many_factors, unsup_outputs):
 51 |         atac_analysis_many_factors.unsupervised_analysis(matrix="matrix_raw")
 52 |         for output in unsup_outputs:
 53 |             assert file_exists_and_not_empty(output)
 54 | 
 55 |     def test_matrix_with_no_multiindex_no_sample_attributes(self, atac_analysis):
 56 |         atac_analysis.sample_attributes = []
 57 |         with pytest.raises(ValueError):
 58 |             atac_analysis.unsupervised_analysis(matrix="matrix_raw")
 59 | 
 60 |     def test_matrix_with_no_multiindex2(self, atac_analysis_many_factors):
 61 |         atac_analysis_many_factors.unsupervised_analysis(matrix="matrix_raw")
 62 |         assert file_exists(
 63 |             os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq")
 64 |         )
 65 | 
 66 |     def test_various_matrices(self, atac_analysis_many_factors, unsup_outputs):
 67 |         for matrix in ["matrix_raw", "matrix_norm"]:
 68 |             atac_analysis_many_factors.annotate_samples(matrix=matrix)
 69 |             atac_analysis_many_factors.unsupervised_analysis()
 70 |             for output in unsup_outputs:
 71 |                 assert file_exists(output)
 72 |                 assert file_not_empty(output)
 73 |             shutil.rmtree(
 74 |                 os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq")
 75 |             )
 76 |         # analysis.annotate_samples(matrix="coverage_qnorm")
 77 | 
 78 |     def test_too_low_numbers_of_samples_error(self, atac_analysis_many_factors):
 79 |         for i in range(2):
 80 |             with pytest.raises(ValueError):
 81 |                 atac_analysis_many_factors.unsupervised_analysis(samples=atac_analysis_many_factors.samples[:i])
 82 |             assert file_exists(
 83 |                 os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq")
 84 |             )
 85 |             shutil.rmtree(
 86 |                 os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq")
 87 |             )
 88 | 
 89 |     def test_low_samples_no_manifolds(self, atac_analysis_many_factors):
 90 |         import pandas as pd
 91 |         prefix = os.path.join(
 92 |             atac_analysis_many_factors.results_dir,
 93 |             "unsupervised_analysis_ATAC-seq",
 94 |             atac_analysis_many_factors.name + ".all_{}s.".format(atac_analysis_many_factors.var_unit_name),
 95 |         )
 96 |         outputs2 = [
 97 |             prefix + "mds.svg",
 98 |             prefix + "pca.explained_variance.csv",
 99 |             prefix + "pca.explained_variance.svg",
100 |             # prefix + "pca.svg",
101 |             prefix + "pearson_correlation.clustermap.svg",
102 |             prefix + "spearman_correlation.clustermap.svg",
103 |             prefix + "tsne.svg",
104 |             prefix + "pca.variable_principle_components_association.csv",
105 |         ]
106 |         not_outputs = [
107 |             prefix + "isomap.svg",
108 |             prefix + "locallylinearembedding.svg",
109 |             prefix + "spectralembedding.svg",
110 |             prefix + "pca.variable_principle_components_association.p_value.masked.svg",
111 |             prefix
112 |             + "pca.variable_principle_components_association.adj_pvalue.masked.svg",
113 |             prefix + "pca.variable_principle_components_association.p_value.svg",
114 |             prefix + "pca.variable_principle_components_association.adj_pvalue.svg",
115 |         ]
116 |         # here I'm picking the first and last samples just to make sure
117 |         # they are from different values of attributes `a` and `b`
118 |         samples = atac_analysis_many_factors.samples
119 |         # idx = pd.Series([(s.A, s.B) for s in samples]).drop_duplicates().index.tolist()
120 |         # samples = [samples[idx[0]]] + [samples[idx[-1]]]
121 |         # assert samples[0].A != samples[1].A
122 |         # assert samples[0].B != samples[1].B
123 |         atac_analysis_many_factors.unsupervised_analysis(
124 |             samples=[samples[0]] + [samples[-1]])
125 |         for output in outputs2:
126 |             assert file_exists_and_not_empty(output)
127 |         for output in not_outputs:
128 |             assert not file_exists(output)
129 | 
130 |     # def test_high_samples_varying_all_outputs(self, atac_analysis_many_factors, outputs):
131 |     #     for i in range(4, len(atac_analysis_many_factors.samples), 2):
132 |     #         print(i)
133 |     #         atac_analysis_many_factors.unsupervised_analysis(samples=atac_analysis_many_factors.samples[i:])
134 |     #         for output in outputs:
135 |     #             assert file_exists(output)
136 |     #             assert file_not_empty(output)
137 |     #         shutil.rmtree(os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq"))
138 | 
139 |     def test_no_plotting_attributes(self, atac_analysis_many_factors):
140 |         with pytest.raises(ValueError):
141 |             atac_analysis_many_factors.unsupervised_analysis(attributes_to_plot=[])
142 |         assert file_exists(
143 |             os.path.join(atac_analysis_many_factors.results_dir, "unsupervised_analysis_ATAC-seq")
144 |         )
145 | 
146 |     def test_various_plotting_attributes(self, analysis_annotated, unsup_outputs):
147 |         prefix = os.path.join(
148 |             analysis_annotated.results_dir,
149 |             "unsupervised_analysis_ATAC-seq",
150 |             analysis_annotated.name + ".all_{}s.".format(analysis_annotated.var_unit_name),
151 |         )
152 |         not_outputs = [
153 |             prefix + "pca.variable_principle_components_association.p_value.masked.svg",
154 |             prefix + "pca.variable_principle_components_association.p_value.svg",
155 |             prefix
156 |             + "pca.variable_principle_components_association.adj_pvalue.masked.svg",
157 |             prefix + "pca.variable_principle_components_association.adj_pvalue.svg",
158 |         ]
159 |         for i in range(1, len(analysis_annotated.group_attributes)):
160 |             analysis_annotated.unsupervised_analysis(
161 |                 attributes_to_plot=analysis_annotated.group_attributes[:i]
162 |             )
163 |             for output in unsup_outputs:
164 |                 if output not in not_outputs:
165 |                     assert file_exists_and_not_empty(output)
166 |             for output in not_outputs:
167 |                 assert not file_exists(output)
168 |             shutil.rmtree(
169 |                 os.path.join(analysis_annotated.results_dir, "unsupervised_analysis_ATAC-seq")
170 |             )
171 | 
172 |     def test_various_output_prefixes_attributes(self, atac_analysis_many_factors, unsup_outputs):
173 |         atac_analysis_many_factors.unsupervised_analysis(output_prefix="test")
174 |         for output in unsup_outputs:
175 |             old_output = "all_{}s".format(atac_analysis_many_factors.var_unit_name)
176 |             assert file_exists_and_not_empty(
177 |                 output.replace(old_output, "test")
178 |             )
179 | 
180 |     def test_standardized_matrix(self, atac_analysis_many_factors, unsup_outputs):
181 |         atac_analysis_many_factors.unsupervised_analysis(standardize_matrix=False)
182 |         for output in unsup_outputs:
183 |             assert file_exists_and_not_empty(output)
184 | 
185 |     def test_save_additional(self, atac_analysis_many_factors):
186 | 
187 |         prefix = os.path.join(
188 |             atac_analysis_many_factors.results_dir,
189 |             "unsupervised_analysis_{}".format(atac_analysis_many_factors.data_type),
190 |             atac_analysis_many_factors.name + ".all_{}s.".format(atac_analysis_many_factors.var_unit_name),
191 |         )
192 | 
193 |         additional_outputs = [
194 |             prefix + "isomap.embedding.csv",
195 |             prefix + "locallylinearembedding.embedding.csv",
196 |             prefix + "mds.embedding.csv",
197 |             prefix + "spectralembedding.embedding.csv",
198 |             prefix + "tsne.embedding.csv",
199 |             prefix + "pca.embedding.csv",
200 |             prefix + "pca.embedding.csv",
201 |             prefix + "pca.loading.csv",
202 |         ]
203 |         atac_analysis_many_factors.unsupervised_analysis(
204 |             save_additional=True)
205 | 
206 |         for output in additional_outputs:
207 |             assert file_exists_and_not_empty(output)
208 | 


--------------------------------------------------------------------------------
/requirements/requirements.docs.txt:
--------------------------------------------------------------------------------
1 | Sphinx
2 | sphinx_rtd_theme
3 | #pydata_sphinx_theme
4 | sphinx-issues
5 | sphinx-argparse
6 | 


--------------------------------------------------------------------------------
/requirements/requirements.single_cell.txt:
--------------------------------------------------------------------------------
 1 | scanpy
 2 | scanorama
 3 | phate
 4 | scvelo
 5 | loompy
 6 | fbpca
 7 | MulticoreTSNE  # https://github.com/DmitryUlyanov/Multicore-TSNE/issues/32#issuecomment-368588074
 8 | python-igraph
 9 | louvain>=0.6
10 | leidenalg
11 | fa2
12 | 


--------------------------------------------------------------------------------
/requirements/requirements.test.txt:
--------------------------------------------------------------------------------
1 | pytest>=4.4.0
2 | coverage>=4.5.2
3 | pytest-cov
4 | codecov
5 | codacy-coverage
6 | pytest-xdist
7 | rpy2>=3.2.0
8 | 


--------------------------------------------------------------------------------
/requirements/requirements.txt:
--------------------------------------------------------------------------------
 1 | setuptools_scm>=3.3.3
 2 | numpy>=1.15.0
 3 | scipy>=1.0.0
 4 | fastcluster
 5 | pandas>=0.25.0
 6 | matplotlib
 7 | seaborn>=0.10.0
 8 | parmap>=1.5.1
 9 | pysam>=0.13
10 | pybedtools>=0.7.10
11 | scikit-learn>=0.19.1
12 | statsmodels>=0.8.0
13 | patsy>=0.4.1
14 | tqdm>=4.19.5
15 | peppy>=0.30.1
16 | divvy>=0.5.0
17 | attmap>=0.12.11
18 | requests>=2.21.0
19 | jinja2>=2.10.1
20 | natsort>=6.0.0
21 | joblib>=0.12.5
22 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | [options]
4 | setup_requires =
5 |   setuptools_scm
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | import sys
 4 | 
 5 | 
 6 | def parse_requirements(req_file):
 7 |     requirements = open(req_file).read().strip().split("\n")
 8 |     requirements = [r for r in requirements if not r.startswith("#")]
 9 |     return [r for r in requirements if "#egg=" not in r]
10 | 
11 | 
12 | # take care of extra required modules depending on Python version
13 | extra = {}
14 | try:
15 |     from setuptools import setup, find_packages
16 | 
17 |     if sys.version_info < (2, 7):
18 |         extra["install_requires"] = ["argparse"]
19 |     if sys.version_info >= (3,):
20 |         extra["use_2to3"] = True
21 | except ImportError:
22 |     from distutils.core import setup
23 | 
24 |     if sys.version_info < (2, 7):
25 |         extra["dependencies"] = ["argparse"]
26 | 
27 | # Requirements
28 | requirements = parse_requirements(
29 |     "requirements/requirements.txt")
30 | requirements_test = parse_requirements(
31 |     "requirements/requirements.test.txt")
32 | requirements_docs = parse_requirements(
33 |     "requirements/requirements.docs.txt")
34 | requirements_sc = parse_requirements(
35 |     "requirements/requirements.single_cell.txt")
36 | 
37 | long_description = open("README.md").read()
38 | 
39 | 
40 | # setup
41 | setup(
42 |     name="ngs_toolkit",
43 |     packages=find_packages(),
44 |     use_scm_version={
45 |         'write_to': 'ngs_toolkit/_version.py',
46 |         'write_to_template': '__version__ = "{version}"\n'
47 |     },
48 |     entry_points={
49 |         "console_scripts": [
50 |             "projectmanager = ngs_toolkit.project_manager:main",
51 |             "trackmanager = ngs_toolkit.track_manager:main"]
52 |     },
53 |     description="A toolkit for NGS analysis with Python.",
54 |     long_description=long_description,
55 |     long_description_content_type="text/markdown",
56 |     classifiers=[
57 |         "Programming Language :: Python :: 3 :: Only",
58 |         "Programming Language :: Python :: 3.6",
59 |         "Programming Language :: Python :: 3.7",
60 |         "Programming Language :: Python :: 3.8",
61 |         "Development Status :: 4 - Beta",
62 |         "License :: OSI Approved :: "
63 |         "GNU General Public License v3 or later (GPLv3+)",
64 |         "Topic :: Scientific/Engineering :: Bio-Informatics",
65 |     ],
66 |     keywords="bioinformatics, sequencing, ngs, ngs analysis, "
67 |              "ATAC-Seq, ChIP-seq, RNA-seq, project management",
68 |     url="https://github.com/afrendeiro/toolkit",
69 |     project_urls={
70 |         "Bug Tracker": "https://github.com/afrendeiro/toolkit/issues",
71 |         "Documentation": "https://ngs-toolkit.readthedocs.io",
72 |         "Source Code": "https://github.com/afrendeiro/toolkit",
73 |     },
74 |     author=u"Andre Rendeiro",
75 |     author_email="andre.rendeiro@pm.me",
76 |     license="GPL3",
77 |     setup_requires=['setuptools_scm'],
78 |     install_requires=requirements,
79 |     tests_require=requirements_test,
80 |     extras_require={
81 |         "testing": requirements_test,
82 |         "docs": requirements_docs,
83 |         "single_cell": requirements_sc},
84 |     package_data={"ngs_toolkit": ["config/*.yaml", "templates/*.html"]},
85 |     data_files=[
86 |         "requirements/requirements.txt",
87 |         "requirements/requirements.test.txt",
88 |         "requirements/requirements.single_cell.txt",
89 |     ],
90 |     **extra
91 | )
92 | 


--------------------------------------------------------------------------------