import os

# Absolute path of the directory that contains this test package.
# realpath() resolves symbolic links before the directory part is taken.
test_dir = os.path.dirname(os.path.realpath(__file__))
"""Version information for the sequana rnaseq pipeline package."""
import importlib.metadata as metadata


def get_package_version(package_name: str) -> str:
    """Return the installed version of the distribution *package_name*.

    :param package_name: name of the distribution as recorded in the
        installed package metadata (e.g. ``"sequana-rnaseq"``).
    :return: the version string, or ``"<package_name> not found"`` when no
        distribution of that name is installed.
    """
    try:
        return metadata.version(package_name)
    except metadata.PackageNotFoundError:
        # Deliberately return a placeholder instead of raising so that
        # importing this package never fails because of missing metadata.
        return f"{package_name} not found"


# The distribution is named "sequana-rnaseq" in pyproject.toml. The previous
# lookup used "sequana-lora", which reports the version of another pipeline
# (or "not found" when that pipeline is not installed).
version = get_package_version("sequana-rnaseq")
"""Print a target (design) file skeleton for the RNADiff analysis."""
import glob
import os

import click

version = "1.0"
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])

# Suffix of the feature count files searched for in the input directory.
FEATURE_SUFFIX = "_feature.out"


@click.command(context_settings=CONTEXT_SETTINGS)
@click.version_option(version=version)
@click.option("--feature-counts-directory", default="feature_counts")
def main(feature_counts_directory):
    """Create target file for the RNADiff analysis"""
    # NOTE: the docstring above is the text shown by --help; keep it as is.
    #
    # One tab-separated row is printed per "*_feature.out" file found in
    # *feature_counts_directory*: the label (file name without the suffix),
    # the file name, the label again as a placeholder condition, and "X" as
    # a placeholder replicate.
    pattern = os.path.join(feature_counts_directory, "*" + FEATURE_SUFFIX)

    print("label\tfiles\tcondition\treplicat")
    # glob returns matches in arbitrary order; sort them so that the
    # generated file is reproducible from one run to the next.
    for path in sorted(glob.glob(pattern)):
        filename = os.path.basename(path)
        # Strip only the trailing suffix. str.replace would also remove any
        # earlier occurrence of the suffix inside the sample name.
        label = filename[: -len(FEATURE_SUFFIX)]
        print(f"{label}\t{filename}\t{label}\tX")


if __name__ == "__main__":  # pragma: no cover
    main()
and isort 33 | 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: 3 | workflow_dispatch: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build-n-publish: 10 | name: Build and publish to PyPI and TestPyPI 11 | runs-on: ubuntu-20.04 12 | steps: 13 | - uses: actions/checkout@main 14 | - name: Set up Python 3.11 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.11 18 | 19 | - name: Install package 20 | run: | 21 | pip install build poetry 22 | 23 | - name: Build source tarball 24 | run: | 25 | rm -rf dist; 26 | poetry build 27 | 28 | - name: Publish distribution to 
Test PyPI 29 | uses: pypa/gh-action-pypi-publish@release/v1 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 33 | repository_url: https://test.pypi.org/legacy/ 34 | - name: Publish distribution to PyPI 35 | if: startsWith(github.ref, 'refs/tags') 36 | uses: pypa/gh-action-pypi-publish@release/v1 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/data/README: -------------------------------------------------------------------------------- 1 | # to save space, we will only keep chrI and chrXII in the GFF, fasta and raw 2 | data 3 | 4 | # first the gff 5 | mkdir temp 6 | mkdir Saccer3 7 | 8 | # Get the Saccer3 gff and fasta 9 | 10 | grep "chrI " Saccer3.gff > new.gff 11 | grep "chrXII " Saccer3.gff >> new.gff 12 | cp new.gff ./Saccer3/Saccer3.gff 13 | 14 | # then the fasta using Python 15 | from sequana import FastA 16 | f = FastA("Saccer3.fa") 17 | with open("Saccer3_new.fa", "w") as fout: 18 | for name, seq in zip(f.names, f.sequences): 19 | if name in ['chrI', 'chrXII']: 20 | fout.write(">{}\n{}\n".format(name, seq)) 21 | 22 | cp Saccer3_new.fa ./Saccer3/Saccer3.fa 23 | 24 | 25 | cd Saccer3 26 | bwa index Saccer3.fa 27 | cd .. 
28 | 29 | # extract only reads that mapped onto the chrI or chrXII (to get smaller fastq 30 | files) 31 | 32 | bwa mem Saccer3/Saccer3.fa WT_ATCACG_L001_R1_001.fastq > WT.sam 33 | bwa mem Saccer3/Saccer3.fa KO_ATCACG_L001_R1_001.fastq > KO.sam 34 | 35 | mkdir temp 36 | from sequana import tools 37 | tools.bam_to_mapped_umpaped("WT.sam", "temp") 38 | tools.bam_to_mapped_umpaped("KO.sam", "temp") 39 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - dev 8 | workflow_dispatch: 9 | pull_request: 10 | branches-ignore: [] 11 | schedule: 12 | - cron: '0 0 20 * *' 13 | 14 | jobs: 15 | build-linux: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | max-parallel: 5 19 | matrix: 20 | python: ['3.10', '3.11'] 21 | fail-fast: false 22 | 23 | 24 | steps: 25 | 26 | - name: install graphviz 27 | run: | 28 | sudo apt-get install -y graphviz 29 | 30 | - name: checkout git repo 31 | uses: actions/checkout@v2 32 | 33 | - name: conda/mamba 34 | uses: mamba-org/setup-micromamba@v1 35 | with: 36 | environment-file: environment.yml 37 | create-args: >- 38 | python=${{ matrix.python }} 39 | - name: install package itself 40 | shell: bash -l {0} 41 | run: | 42 | pip install . 
43 | pip install "pulp==2.7.0" --no-deps 44 | 45 | 46 | - name: Install dependencies 47 | shell: bash -l {0} 48 | run: | 49 | pip install coveralls pytest-cov pytest pytest-xdist 50 | 51 | - name: testing 52 | shell: bash -l {0} 53 | run: | 54 | pytest -v --cov-report term-missing --cov=sequana_pipelines.rnaseq 55 | 56 | - name: coveralls 57 | shell: bash -l {0} 58 | run: | 59 | echo $COVERALLS_REPO_TOKEN 60 | coveralls --service=github 61 | env: 62 | GITHUB_TOKEN: ${{ github.token }} 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2016-2019, Sequana Development Team 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core>=1.0.0"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "sequana-rnaseq" 7 | version = "0.20.2" 8 | description = "A RNAseq pipeline from raw reads to feature counts" 9 | authors = ["Sequana Team"] 10 | license = "BSD-3" 11 | repository = "https://github.com/sequana/rnaseq" 12 | readme = "README.rst" 13 | keywords = ["snakemake, sequana, RNAseq, RNADiff, differential analysis"] 14 | classifiers = [ 15 | "Development Status :: 5 - Production/Stable", 16 | "Intended Audience :: Education", 17 | "Intended Audience :: End Users/Desktop", 18 | "Intended Audience :: Developers", 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: BSD License", 21 | "Operating System :: POSIX :: Linux", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Topic :: Software Development :: Libraries :: Python Modules", 26 | "Topic :: Scientific/Engineering :: Bio-Informatics", 27 | "Topic :: Scientific/Engineering :: Information Analysis", 28 | ] 29 | 30 | packages = [ 31 | {include = "sequana_pipelines"} 32 | ] 33 | 34 | 35 | [tool.poetry.dependencies] 36 | python = 
">=3.8,<4.0" 37 | sequana = ">=0.17.3" 38 | sequana_pipetools = ">=1.0.2" 39 | click-completion = "^0.5.2" 40 | pulp = "<2.8.0" 41 | 42 | 43 | [tool.poetry.scripts] 44 | sequana_rnaseq = "sequana_pipelines.rnaseq.main:main" 45 | 46 | 47 | [tool.poetry.group.dev.dependencies] 48 | black = "^23.7.0" 49 | pytest = "^7.4.0" 50 | mock = "^5.1.0" 51 | pytest-mock = "^3.11.1" 52 | pytest-cov = "^4.1.0" 53 | 54 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Sequana rnaseq pipeline documentation 2 | ##################################################### 3 | 4 | |version|, |today|, status: production 5 | 6 | The **rnaseq** pipeline is a `Sequana `_ pipeline. You can find the source code 7 | on `https://github.com/sequana/sequana_rnaseq `_. Would you have issues 8 | about the code, usage or lack of information, please fill a report 9 | on `Sequana itself `_ indicating the pipeline name (We centralized all 10 | pipelines issues on **Sequana** repository only so as to be more responsive). 11 | 12 | If you use **Sequana**, please do not forget to cite us: 13 | 14 | Cokelaer et al, (2017), 'Sequana': a Set of Snakemake NGS pipelines, Journal of 15 | Open Source Software, 2(16), 352, `JOSS DOI doi:10.21105/joss.00352 `_ 16 | 17 | 18 | The Sequana rnaseq pipeline 19 | ============================================== 20 | 21 | .. include:: ../README.rst 22 | 23 | Example 24 | ======= 25 | 26 | This example is taken from the official tutorial on Sequana webiste. 
27 | First, download the samples:: 28 | 29 | wget https://sequana.readthedocs.io/en/master/_downloads/WT_ATCACG_L001_R1_001.fastq.gz 30 | wget https://sequana.readthedocs.io/en/master/_downloads/KO_ATCACG_L001_R1_001.fastq.gz 31 | 32 | 33 | # its genome and annotation 34 | mkdir -p genomes/Saccer3 35 | cd genomes/Saccer3 36 | wget http://hgdownload.cse.ucsc.edu/goldenPath/sacCer3/bigZips/chromFa.tar.gz 37 | tar -xvzf chromFa.tar.gz 38 | cat *.fa > Saccer3.fa 39 | wget http://downloads.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff -O Saccer3.gff 40 | rm -f chr* 41 | cd ../.. 42 | 43 | Then, prepare the script:: 44 | 45 | sequana_rnaseq --input-directory . --genome-directory genomes/Saccer3 --aligner-choice star 46 | cd rnaseq 47 | snakemake -s rnaseq.rules 48 | # or just run the script rnaseq.sh 49 | 50 | What is Sequana ? 51 | ===================== 52 | 53 | **Sequana** is a versatile tool that provides 54 | 55 | #. A Python library dedicated to NGS analysis (e.g., tools to visualise standard NGS formats). 56 | #. A set of Pipelines dedicated to NGS in the form of Snakefiles 57 | #. Standalone applications 58 | #. sequana_coverage eases the 59 | extraction of genomic regions of interest and genome coverage information 60 | #. sequana_taxonomy performs a quick 61 | taxonomy of your FastQ. This requires dedicated databases to be downloaded. 62 | #. Sequanix, a GUI for Snakemake workflows (hence Sequana pipelines as well) 63 | 64 | To join the project, please let us know on `github `_. 65 | 66 | For more information, please see `github `_. 
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /.github/workflows/apptainer.yml: -------------------------------------------------------------------------------- 1 | name: Apptainer Run 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - dev 8 | workflow_dispatch: 9 | pull_request: 10 | branches-ignore: [] 11 | schedule: 12 | - cron: '0 0 20 * *' 13 | 14 | jobs: 15 | build-linux: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | max-parallel: 5 19 | matrix: 20 | python: ['3.10'] 21 | fail-fast: false 22 | 23 | 24 | steps: 25 | 26 | # Clean up unnecessary preinstalled packages to free disk space 27 | - name: Pre-cleanup 28 | run: | 29 | sudo rm -rf /usr/share/dotnet 30 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 31 | 32 | # Cache APT .deb packages 33 | - name: Cache APT archives 34 | uses: actions/cache@v3 35 | with: 36 | path: /var/cache/apt/archives 37 | key: ${{ runner.os }}-apt-cache-v1 38 | 39 | # Cache Apptainer installation 40 | - name: Cache Apptainer install 41 | id: cache-apptainer 42 | uses: actions/cache@v3 43 | with: 44 | path: | 45 | /usr/bin/apptainer 46 | /usr/lib/apptainer 47 | /etc/apptainer 48 | key: ${{ runner.os }}-apptainer-v1 49 | 50 | # Install Apptainer only if not cached 51 | - name: Install Apptainer 52 | if: steps.cache-apptainer.outputs.cache-hit != 'true' 53 | run: | 54 | sudo apt-get update 55 | sudo apt-get install -y software-properties-common 56 | sudo add-apt-repository -y ppa:apptainer/ppa 57 | sudo apt update 58 | sudo apt install -y apptainer 59 | 60 | # Cache Apptainer image cache (~/.apptainer/cache) 61 | - name: Cache Apptainer images 62 | uses: actions/cache@v3 63 | with: 64 | path: ~/.apptainer/cache 65 | key: ${{ runner.os }}-apptainer-images-v1 66 | 67 | # Checkout repository 68 | - name: Checkout repo 69 | uses: actions/checkout@v4 70 | 71 | # 🐍 Set up Python 72 | - name: Set up Python ${{ matrix.python }} 73 | uses: 
actions/setup-python@v5 74 | with: 75 | python-version: ${{ matrix.python }} 76 | 77 | # Install dependencies 78 | - name: Install dependencies 79 | run: | 80 | python -m pip install --upgrade pip 81 | pip install .[testing] 82 | 83 | # Install package and pinned dependency (example: pulp) 84 | - name: Install package itself 85 | run: | 86 | pip install . 87 | pip install "pulp==2.7.0" --no-deps 88 | 89 | # Run tests using Apptainer 90 | - name: Run Apptainer tests 91 | run: | 92 | sequana_rnaseq --aligner-choice bowtie2 \ 93 | --input-directory test/data/ 94 | --apptainer-prefix ~/.apptainer/cache \ 95 | --genome-directory test/data/Saccer3 96 | 97 | cd rnaseq && sh rnaseq.sh 98 | 99 | 100 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 
58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ebisoftware.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ebisoftware.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ebisoftware" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ebisoftware" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. 
The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." 131 | 132 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/data/phiX174.fa: -------------------------------------------------------------------------------- 1 | >gi|9626372|dbj|NC_001422.1_phiX174_no_SNPs_True_Reference 2 | GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT 3 | GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA 4 | ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG 5 | TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA 6 | GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC 7 | TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT 8 | TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT 9 | CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT 10 | TGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG 11 | TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC 12 | GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA 13 | CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAG 14 
| TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT 15 | AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC 16 | CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA 17 | TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC 18 | TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA 19 | CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA 20 | GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT 21 | GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA 22 | ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC 23 | TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT 24 | TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC 25 | ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC 26 | CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT 27 | GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC 28 | CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC 29 | TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG 30 | TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT 31 | TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA 32 | AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT 33 | TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT 34 | ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC 35 | GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC 36 | TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT 37 | TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA 38 | TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG 39 | TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC 40 | 
CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG 41 | GATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCTCGTACGC 42 | CGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT 43 | TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG 44 | CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA 45 | AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT 46 | GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG 47 | GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA 48 | TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT 49 | CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG 50 | TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA 51 | GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC 52 | CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA 53 | TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA 54 | AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC 55 | TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT 56 | CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA 57 | TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG 58 | TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT 59 | CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT 60 | TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC 61 | ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG 62 | TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA 63 | ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG 64 | GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC 65 | CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT 66 | 
GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG 67 | GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT 68 | ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG 69 | CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC 70 | CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC 71 | GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT 72 | CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG 73 | CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA 74 | TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT 75 | TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG 76 | TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC 77 | AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC 78 | TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA 79 | 80 | 81 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/multiqc_config.yaml: -------------------------------------------------------------------------------- 1 | ####################################### 2 | # Example MultiQC Configuration File 3 | ####################################### 4 | 5 | # This file can be saved either in the MultiQC installation 6 | # directory, or as ~/.multiqc_config.yaml 7 | 8 | # Configuration settings are taken from the following locations, in order: 9 | # - Hardcoded in MultiQC (multiqc/utils/config.py) 10 | # - /multiqc_config.yaml 11 | # - ~/.multiqc_config.yaml 12 | # - Command line options 13 | 14 | # Note that all of the values below are set to the MultiQC defaults. 15 | # It's recommended that you delete any that you don't need. 
16 | 17 | 18 | #------------------------------------------------------------------------------ 19 | # TO CHANGE 20 | #------------------------------------------------------------------------------- 21 | 22 | # Title to use for the report. 23 | title: "sequana_rnaseq MultiQC summary" 24 | subtitle: "RNA-seq analysis" 25 | intro_text: "Report summarising cleaning (cutadapt section) and mapping (stat/bowtie sections) of your FastQ files, as well as feature counts (featureCounts section) and quick contaminant search (fastq screen section)" 26 | 27 | # Add generic information to the top of reports 28 | report_header_info: 29 | - Authors: 'Sequana developers' 30 | - Want to know more?: 'See the Sequana and sequana_rnaseq pipeline documentation.' 31 | - Citations: 'If you used Sequanix, Sequana, Sequana_coverage tool, or any Sequana pipelines, please see How to cite ? section. In particular, if you use this report in a publication, please cite Sequana.' 32 | - Contact E-mail: '' 33 | # - Project Type: 'Differential gene expression' 34 | 35 | 36 | #------------------------------------------------------------------------------- 37 | 38 | 39 | 40 | # Prepend sample names with their directory. Useful if analysing the 41 | # same samples with different parameters. 42 | prepend_dirs: False 43 | 44 | # Default output filenames 45 | output_fn_name: multiqc_report.html 46 | data_dir_name: multiqc_data 47 | 48 | # Whether to create the parsed data directory in addition to the report 49 | make_data_dir: True 50 | 51 | # Cleaning options for sample names. Typically, sample names are detected 52 | # from an input filename. If any of these strings are found, they and any 53 | # text to their right will be discarded. 
54 | # For example - file1.fq.gz_trimmed.bam_deduplicated_fastqc.zip 55 | # would be cleaned to 'file1' 56 | # Two options here - fn_clean_exts will replace the defaults, 57 | # extra_fn_clean_exts will append to the defaults 58 | extra_fn_clean_exts: 59 | - .gz 60 | - .fastq 61 | - _R1_.cutadapt 62 | - _R2_.cutadapt 63 | - _R2_001 64 | - _R1_001 65 | - .err 66 | - type: remove 67 | pattern: '.sorted' 68 | - type: regex 69 | pattern: '_S\d+' 70 | 71 | 72 | # Ignore these files / directories / paths when searching for logs 73 | fn_ignore_files: 74 | - .DS_Store 75 | - slurm*out 76 | - "*.js" 77 | 78 | fn_ignore_dirs: 79 | - .sequana 80 | - .snakemake 81 | - multiqc 82 | - logs 83 | 84 | # We want to ignore the 3 strand case to use only the final ones in 85 | # post_analysis/feature_counts 86 | fn_ignore_paths: 87 | - "*/feature_counts/*" 88 | 89 | # Ignore files larger than this when searching for logs (bytes) 90 | log_filesize_limit: 5000000 91 | 92 | # MultiQC skips a couple of debug messages when searching files as the 93 | # log can get very verbose otherwise. Re-enable here to help debugging. 94 | report_readerrors: False 95 | report_imgskips: False 96 | 97 | # Opt-out of remotely checking that you're running the latest version 98 | no_version_check: False 99 | 100 | # How to plot graphs. Different templates can override these settings, but 101 | # the default template can use interactive plots (Javascript using HighCharts) 102 | # or flat plots (images, using MatPlotLib). With interactive plots, the report 103 | # can prevent automatically rendering all graphs if there are lots of samples 104 | # to prevent the browser being locked up when the report opens. 
105 | plots_force_flat: False # Try to use only flat image graphs 106 | plots_force_interactive: False # Try to use only interactive javascript graphs 107 | plots_flat_numseries: 100 # If neither of the above, use flat if > this number of datasets 108 | num_datasets_plot_limit: 50 # If interactive, don't plot on load if > this number of datasets 109 | max_table_rows: 500 # Swap tables for a beeswarm plot above this 110 | 111 | # Overwrite module filename search patterns. See multiqc/utils/search_patterns.yaml 112 | # for the defaults. Remove a default by setting it to null. 113 | sp: 114 | star: 115 | fn: '*Log.final.out' 116 | cutadapt: 117 | fn: 'cutadapt.txt' 118 | fastp: 119 | fn: '*fastp*json' 120 | #rna_seqc/metrics: 121 | # fn: "*metrics.tsv" 122 | #rna_seqc/coverage: 123 | # fn: "*coverage.tsv" 124 | 125 | # Overwrite the defaults of which table columns are visible by default 126 | # 127 | read_count_prefix: '' 128 | read_count_multiplier: 1 129 | 130 | table_columns_visible: 131 | FastQC: 132 | percent_fails: False 133 | total_sequences: True 134 | percent_gc: False 135 | fastp: 136 | pct_duplication: False 137 | after_filtering_gc_content: False 138 | Bowtie 1: 139 | reads_aligned_percentage: False 140 | reads_aligned: False 141 | picard: 142 | PERCENT_DUPLICATION: False 143 | 144 | top_modules: 145 | - fastqc 146 | - fastp 147 | - bowtie1 148 | - bowtie2 149 | - salmon 150 | - star 151 | - featureCounts 152 | 153 | module_order: 154 | - fastqc 155 | - fastp 156 | - rseqc 157 | - markduplicates 158 | - picard 159 | - bowtie1 160 | - bowtie2 161 | - salmon 162 | - star 163 | - featureCounts 164 | 165 | remove_sections: 166 | - fastqc_status_checks 167 | - fastqc_per_base_n_content 168 | 169 | 170 | #fastqc_config: 171 | #fastqc_theoretical_gc: 'mm10_genome' 172 | # 173 | -------------------------------------------------------------------------------- /test/test_main.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import subprocess 3 | import sys 4 | import tempfile 5 | 6 | from click.testing import CliRunner 7 | 8 | from sequana_pipelines.rnaseq.main import main 9 | 10 | from . import test_dir 11 | 12 | sharedir = f"{test_dir}/data" 13 | saccer3 = f"{test_dir}/data/Saccer3/" 14 | conta = f"{test_dir}/data/Saccer3/Saccer3_rRNA.fa" 15 | 16 | 17 | # fast 18 | def test_standalone_subprocess(): 19 | directory = tempfile.TemporaryDirectory() 20 | cmd = """sequana_rnaseq --input-directory {} --working-directory {} """.format(sharedir, directory.name) 21 | subprocess.call(cmd.split()) 22 | 23 | 24 | # slow 25 | def test_standalone_script(): 26 | directory = tempfile.TemporaryDirectory() 27 | 28 | runner = CliRunner() 29 | results = runner.invoke( 30 | main, 31 | [ 32 | "--input-directory", 33 | sharedir, 34 | "--genome-directory", 35 | saccer3, 36 | "--force", 37 | "--aligner-choice", 38 | "bowtie2", 39 | "--feature-counts-feature-type", 40 | "gene,tRNA", 41 | "--working-directory", 42 | directory.name, 43 | "--rRNA-feature", 44 | "rRNA_gene", 45 | ], 46 | ) # ideally should be rRNA but current 47 | assert results.exit_code == 0 48 | 49 | 50 | def test_standalone_script_contaminant(): 51 | directory = tempfile.TemporaryDirectory() 52 | runner = CliRunner() 53 | results = runner.invoke( 54 | main, 55 | [ 56 | "--input-directory", 57 | sharedir, 58 | "--genome-directory", 59 | saccer3, 60 | "--force", 61 | "--aligner-choice", 62 | "bowtie2", 63 | "--feature-counts-feature-type", 64 | "gene", 65 | "--contaminant-file", 66 | conta, 67 | "--working-directory", 68 | directory.name, 69 | ], 70 | ) 71 | assert results.exit_code == 0 72 | 73 | 74 | # fast 75 | def test_version(): 76 | cmd = "sequana_rnaseq --version" 77 | subprocess.call(cmd.split()) 78 | 79 | 80 | # fast 81 | def test_standalone_script_wrong_feature(): 82 | directory = tempfile.TemporaryDirectory() 83 | import sequana_pipelines.rnaseq.main as m 84 | 85 | sys.argv = [ 86 | "test", 87 | "--input-directory", 88 
| sharedir, 89 | "--genome-directory", 90 | saccer3, 91 | "--force", 92 | "--aligner-choice", 93 | "bowtie2", 94 | "--feature-counts-feature-type", 95 | "dummy", 96 | "--working-directory", 97 | directory.name, 98 | "--rRNA-feature", 99 | "rRNA_gene", 100 | ] # ideally should be rRNA but current 101 | try: 102 | m.main() 103 | assert False 104 | except: 105 | assert True 106 | 107 | 108 | # fast 109 | def test_standalone_script_wrong_reference(): 110 | directory = tempfile.TemporaryDirectory() 111 | import sequana_pipelines.rnaseq.main as m 112 | 113 | sys.argv = [ 114 | "test", 115 | "--input-directory", 116 | sharedir, 117 | "--genome-directory", 118 | "dummy", 119 | "--force", 120 | "--aligner-choice", 121 | "bowtie2", 122 | "--working-directory", 123 | directory.name, 124 | "--rRNA-feature", 125 | "rRNA_gene", 126 | ] # ideally should be rRNA but current 127 | try: 128 | m.main() 129 | assert False 130 | except: 131 | assert True 132 | 133 | 134 | # fast 135 | def test_standalone_script_wrong_triming(): 136 | directory = tempfile.TemporaryDirectory() 137 | import sequana_pipelines.rnaseq.main as m 138 | 139 | sys.argv = [ 140 | "test", 141 | "--input-directory", 142 | sharedir, 143 | "--genome-directory", 144 | saccer3, 145 | "--force", 146 | "--aligner-choice", 147 | "bowtie2", 148 | "--software-choice", 149 | "dummy", 150 | "--working-directory", 151 | directory.name, 152 | "--rRNA-feature", 153 | "rRNA_gene", 154 | ] # ideally should be rRNA but current 155 | try: 156 | m.main() 157 | assert False 158 | except SystemExit: 159 | assert True 160 | 161 | 162 | # slow 163 | def test_full(): 164 | 165 | with tempfile.TemporaryDirectory() as directory: 166 | wk = directory 167 | 168 | cmd = f"sequana_rnaseq --input-directory {sharedir} --genome-directory {saccer3} --aligner-choice bowtie2 --working-directory {wk} --force --rRNA-feature rRNA_gene" 169 | subprocess.call(cmd.split()) 170 | 171 | cmd = "sh rnaseq.sh" 172 | 173 | stat = subprocess.call(cmd.split(), 
cwd=wk) 174 | 175 | assert os.path.exists(wk + "/summary.html") 176 | assert os.path.exists(wk + "/multiqc/multiqc_report.html") 177 | 178 | 179 | # slow 180 | def test_full_star(): 181 | 182 | with tempfile.TemporaryDirectory() as directory: 183 | wk = directory 184 | 185 | cmd = f"sequana_rnaseq --input-directory {sharedir} --genome-directory {saccer3} --aligner-choice star --working-directory {wk} --force --rRNA-feature rRNA_gene" 186 | subprocess.call(cmd.split()) 187 | 188 | cmd = "snakemake -s rnaseq.rules --wrapper-prefix https://raw.githubusercontent.com/sequana/sequana-wrappers/ -p --cores 2 " 189 | 190 | stat = subprocess.call(cmd.split(), cwd=wk) 191 | 192 | assert os.path.exists(wk + "/summary.html") 193 | assert os.path.exists(wk + "/multiqc/multiqc_report.html") 194 | 195 | 196 | # slow 197 | def __test_full_salmon(): 198 | 199 | with tempfile.TemporaryDirectory() as directory: 200 | wk = directory 201 | 202 | cmd = f"sequana_rnaseq --input-directory {sharedir} --genome-directory {saccer3} --aligner-choice salmon --working-directory {wk} --force" 203 | subprocess.call(cmd.split()) 204 | 205 | cmd = "snakemake -s rnaseq.rules --wrapper-prefix https://raw.githubusercontent.com/sequana/sequana-wrappers/ -p --cores 2 " 206 | 207 | stat = subprocess.call(cmd.split(), cwd=wk) 208 | 209 | assert os.path.exists(wk + "/summary.html") 210 | assert os.path.exists(wk + "/multiqc/multiqc_report.html") 211 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/schema.yaml: -------------------------------------------------------------------------------- 1 | # Schema validator for the rnaseq pipeline 2 | # author: Thomas Cokelaer 3 | 4 | type: map 5 | mapping: 6 | "sequana_wrappers": 7 | type: str 8 | "input_directory": 9 | type: str 10 | required: True 11 | "input_readtag": 12 | type: str 13 | required: True 14 | "input_pattern": 15 | type: str 16 | required: True 17 | "apptainers": 18 | type: any 19 | 20 | 21 | 
"fastqc": 22 | type: map 23 | mapping: 24 | "skip_fastqc_raw": 25 | type: bool 26 | required: True 27 | "options": 28 | type: str 29 | required: True 30 | "threads": 31 | type: int 32 | required: True 33 | range: { min: 1 } 34 | "resources": 35 | type: any 36 | required: true 37 | 38 | 39 | "general": 40 | type: map 41 | mapping: 42 | "aligner": 43 | type: str 44 | required: True 45 | enum: ["bowtie2", "star", "salmon"] 46 | "genome_directory": 47 | required: True 48 | type: str 49 | "contaminant_file": 50 | type: str 51 | "rRNA_feature": 52 | type: str 53 | "custom_gff": 54 | type: str 55 | 56 | "add_read_group": 57 | type: map 58 | mapping: 59 | "options": 60 | type: str 61 | 62 | "trimming": 63 | type: map 64 | mapping: 65 | "do": 66 | type: bool 67 | required: True 68 | "software_choice": 69 | type: str 70 | enum: [cutadapt,atropos,fastp] 71 | 72 | "fastp": 73 | type: map 74 | mapping: 75 | "options": 76 | type: str 77 | "minimum_length": 78 | required: True 79 | type: int 80 | "adapters": 81 | type: str 82 | required: False 83 | "quality": 84 | type: int 85 | range: {max: 40, min: 0} 86 | required: False 87 | "threads": 88 | type: int 89 | required: True 90 | range: { min: 1 } 91 | "disable_adapter_trimming": 92 | type: bool 93 | "disable_quality_filtering": 94 | type: bool 95 | "resources": 96 | type: any 97 | required: true 98 | 99 | 100 | "cutadapt": 101 | type: map 102 | mapping: 103 | "tool_choice": 104 | type: str 105 | enum: [cutadapt,atropos] 106 | required: True 107 | "adapter_choice": 108 | type: str 109 | required: False 110 | "design_file": 111 | type: str 112 | required: False 113 | pattern: .* 114 | "fwd": 115 | type: str 116 | required: False 117 | "rev": 118 | type: str 119 | required: False 120 | "m": 121 | type: int 122 | range: {min: 0} 123 | required: True 124 | "mode": 125 | type: str 126 | enum: [b, g, a] 127 | required: True 128 | "options": 129 | type: str 130 | required: False 131 | "quality": 132 | type: int 133 | range: {max: 40, 
min: 0} 134 | required: False 135 | "threads": 136 | type: int 137 | required: True 138 | range: { min: 1 } 139 | 140 | "multiqc": 141 | type: map 142 | mapping: 143 | "options": 144 | type: str 145 | "modules": 146 | type: str 147 | "config_file": 148 | type: str 149 | "input_directory": 150 | type: str 151 | "resources": 152 | type: any 153 | required: true 154 | 155 | 156 | "feature_counts": 157 | type: map 158 | mapping: 159 | "do": 160 | type: bool 161 | "options": 162 | type: str 163 | "attribute": 164 | type: str 165 | "feature": 166 | type: str 167 | "extra_attributes": 168 | type: str 169 | "strandness": 170 | required: False 171 | type: any 172 | enum: ['0', '1', '2' , '', 0, 1, 2] 173 | "threads": 174 | type: int 175 | range: { min: 1, max: 8 } 176 | "tolerance": 177 | type: float 178 | range: { min: 0, max: 0.25 } 179 | 'bowtie1_mapping_ref': 180 | type: map 181 | mapping: 182 | "options": 183 | type: str 184 | "threads": 185 | type: int 186 | required: True 187 | range: { min: 1 } 188 | 189 | 'salmon_index': 190 | type: map 191 | mapping: 192 | "options": 193 | type: str 194 | "threads": 195 | type: int 196 | required: True 197 | range: { min: 1 } 198 | "resources": 199 | type: any 200 | required: true 201 | 202 | 203 | 'salmon_mapping': 204 | type: map 205 | mapping: 206 | "options": 207 | type: str 208 | "threads": 209 | type: int 210 | required: True 211 | range: { min: 1 } 212 | "resources": 213 | type: any 214 | required: true 215 | 216 | 'bam_coverage': 217 | type: map 218 | mapping: 219 | "do": 220 | type: bool 221 | "options": 222 | type: str 223 | "binSize": 224 | type: int 225 | "genomeSize": 226 | type: int 227 | "extendReads": 228 | type: int 229 | "minFragmentLength": 230 | type: int 231 | "maxFragmentLength": 232 | type: int 233 | "threads": 234 | type: int 235 | required: True 236 | range: { min: 1, max: 8 } 237 | "resources": 238 | type: any 239 | required: true 240 | 241 | 'mark_duplicates': 242 | type: map 243 | mapping: 244 | "do": 
245 | type: bool 246 | "remove": 247 | type: bool 248 | "tmpdir": 249 | type: str 250 | "threads": 251 | type: int 252 | required: True 253 | range: { min: 1, max: 8 } 254 | "resources": 255 | type: any 256 | required: true 257 | 258 | 'bowtie1_mapping_rna': 259 | type: map 260 | mapping: 261 | "options": 262 | type: str 263 | "threads": 264 | type: int 265 | required: True 266 | range: { min: 1} 267 | "nreads": 268 | type: int 269 | required: True 270 | range: { min: -1, max: 1e15} 271 | 272 | 'igvtools': 273 | type: map 274 | mapping: 275 | "do": 276 | type: bool 277 | "chrom_sizes_file": 278 | type: str 279 | 'rnaseqc': 280 | type: map 281 | mapping: 282 | "do": 283 | type: bool 284 | "gtf_file": 285 | type: str 286 | "options": 287 | type: str 288 | "resources": 289 | type: any 290 | required: true 291 | 'rseqc': 292 | type: map 293 | mapping: 294 | "do": 295 | type: bool 296 | "bed_file": 297 | type: str 298 | 299 | 'bowtie2_mapping': 300 | type: map 301 | mapping: 302 | "options": 303 | type: str 304 | "threads": 305 | type: int 306 | required: True 307 | range: { min: 1} 308 | "genome_size_larger_than_4gb": 309 | type: bool 310 | "resources": 311 | type: any 312 | required: true 313 | 314 | 'bowtie2_index': 315 | type: map 316 | mapping: 317 | "options": 318 | type: str 319 | "threads": 320 | type: int 321 | required: True 322 | range: { min: 1} 323 | "resources": 324 | type: any 325 | required: true 326 | 327 | 'star_mapping': 328 | type: map 329 | mapping: 330 | "options": 331 | type: str 332 | "threads": 333 | type: int 334 | required: True 335 | range: { min: 1} 336 | "resources": 337 | type: any 338 | required: true 339 | "legacy": 340 | type: bool 341 | required: True 342 | 343 | 'star_index': 344 | type: map 345 | mapping: 346 | "options": 347 | type: str 348 | "threads": 349 | type: int 350 | required: True 351 | range: { min: 1} 352 | "resources": 353 | type: any 354 | required: true 355 | 356 | 
-------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Aug 10 16:58:13 2011. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | import sphinx 16 | 17 | sys.path.insert(0, os.path.abspath('sphinxext')) 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | 23 | pkg_name = "sequana_rnaseq" 24 | 25 | # This is for ReadTheDoc 26 | import matplotlib 27 | matplotlib.use('Agg') 28 | 29 | import pkg_resources 30 | version = pkg_resources.require(pkg_name)[0].version 31 | 32 | import matplotlib 33 | import matplotlib.sphinxext 34 | 35 | release = version 36 | author = "Thomas Cokelaer" 37 | title = "Sequana rnaseq pipeline" 38 | copyright = author + ", 2016-2019" 39 | project = 'Sequana rnaseq pipeline' 40 | 41 | import easydev 42 | from easydev import get_path_sphinx_themes 43 | 44 | 45 | # -- General configuration ----------------------------------------------------- 46 | 47 | # If your documentation needs a minimal Sphinx version, state it here. 48 | #needs_sphinx = '1.0' 49 | 50 | # Add any Sphinx extension module names here, as strings. They can be extensions 51 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
52 | 53 | extensions = [ 54 | 'sphinx.ext.autodoc', 55 | ('sphinx.ext.imgmath' # only available for sphinx >= 1.4 56 | if sphinx.version_info[:2] >= (1, 4) 57 | else 'sphinx.ext.pngmath'), 58 | 'sphinx.ext.coverage', 59 | 'sphinx.ext.doctest', 60 | 'sphinx.ext.intersphinx', 61 | 'sphinx.ext.todo', 62 | 'sphinx.ext.ifconfig', 63 | 'sphinx.ext.viewcode', 64 | "numpydoc.numpydoc", 65 | 'matplotlib.sphinxext.plot_directive', 66 | 'sphinx.ext.autosummary', 67 | "sequana.sphinxext.snakemakerule" 68 | ] 69 | # note that the numpy directives is buggy. Example: class and init are not recognised as two entities for the autoclass_content=both here below 70 | 71 | math_number_all = False 72 | 73 | todo_include_todos=True 74 | jscopybutton_path = "copybutton.js" 75 | autoclass_content = 'both' 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ['_templates'] 79 | 80 | # The suffix of source filenames. 81 | source_suffix = '.rst' 82 | 83 | # The encoding of source files. 84 | #source_encoding = 'utf-8-sig' 85 | 86 | # The master toctree document. 87 | master_doc = 'index' 88 | 89 | # General information about the project. 90 | project = project 91 | copyright = copyright 92 | 93 | # The version info for the project you're documenting, acts as replacement for 94 | # |version| and |release|, also used in various other places throughout the 95 | # built documents. 96 | # 97 | # The short X.Y version. 98 | version = 'Current version: ' + str(version) 99 | # The full version, including alpha/beta/rc tags. 100 | release = release 101 | 102 | # The language for content autogenerated by Sphinx. Refer to documentation 103 | # for a list of supported languages. 104 | #language = None 105 | 106 | # There are two options for replacing |today|: either, you set today to some 107 | # non-false value, then it is used: 108 | #today = '' 109 | # Else, today_fmt is used as the format for a strftime call. 
110 | #today_fmt = '%B %d, %Y' 111 | 112 | # List of documents that shouldn't be included in the build. 113 | #unused_docs = [] 114 | 115 | 116 | # List of patterns, relative to source directory, that match files and 117 | # directories to ignore when looking for source files. 118 | exclude_trees = ['_build'] 119 | exclude_patterns = [] 120 | 121 | # The reST default role (used for this markup: `text`) to use for all documents. 122 | #default_role = None 123 | 124 | # If true, '()' will be appended to :func: etc. cross-reference text. 125 | #add_function_parentheses = True 126 | 127 | # If true, the current module name will be prepended to all description 128 | # unit titles (such as .. function::). 129 | add_module_names = False 130 | 131 | # If true, sectionauthor and moduleauthor directives will be shown in the 132 | # output. They are ignored by default. 133 | show_authors = True 134 | 135 | # The name of the Pygments (syntax highlighting) style to use. 136 | pygments_style = 'sphinx' 137 | 138 | # A list of ignored prefixes for module index sorting. 139 | modindex_common_prefix = ["sequana."] 140 | 141 | 142 | # Get rid of spurious warnings due to some interaction between 143 | # autosummary and numpydoc. 
# See https://github.com/phn/pytpm/issues/3#issuecomment-12133978 for more
# details
numpydoc_show_class_members = False


# solution from nilearn
def touch_example_backreferences(app, what, name, obj, options, lines):
    """Create an empty ``<name>.examples`` file if it does not exist yet.

    Connected to the ``autodoc-process-docstring`` event in :func:`setup`.
    Only ``app`` (for ``app.srcdir``) and ``name`` are used; the other
    arguments are part of the callback signature and are ignored.
    """
    # generate empty examples files, so that we don't get
    # inclusion errors if there are no examples for a class / module
    examples_path = os.path.join(app.srcdir, "modules", "generated",
                                 "%s.examples" % name)
    if not os.path.exists(examples_path):
        # touch file
        open(examples_path, 'w').close()


# Add the 'copybutton' javascript, to hide/show the prompt in code
# examples
def setup(app):
    """Register the copybutton script and the examples-file hook."""
    # ``add_javascript`` was renamed to ``add_js_file`` and the old name was
    # later removed from Sphinx; prefer the new name and fall back to the old
    # one so that older Sphinx releases keep working.
    add_js = getattr(app, 'add_js_file', None) or app.add_javascript
    add_js('copybutton.js')
    app.connect('autodoc-process-docstring', touch_example_backreferences)


# -- Options for HTML output ---------------------------------------------------

# The theme to use for HTML and HTML Help pages.  Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
on_rtd = os.environ.get("READTHEDOCS", None) == "True"
if not on_rtd:
    import sphinx_rtd_theme
    html_theme = 'sphinx_rtd_theme'
    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
else:
    html_theme = "default"

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
# the user theme contains the options 'homepage', which is populated here
#html_theme_options = {'homepage': init_sphinx.url}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = [get_path_sphinx_themes()]



# The name for this set of Sphinx documents.  If None, it defaults to
# "<project> v<release> documentation".
195 | #html_title = None 196 | 197 | # A shorter title for the navigation bar. Default is the same as html_title. 198 | html_short_title = "sequana" 199 | 200 | # The name of an image file (relative to this directory) to place at the top 201 | # of the sidebar. 202 | #html_logo = "../../share/data/images/crx_logo.png" 203 | 204 | # The name of an image file (within the static path) to use as favicon of the 205 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 206 | # pixels large. 207 | #html_favicon = "../../share/data/images/crx_logo.ico" 208 | 209 | # Add any paths that contain custom static files (such as style sheets) here, 210 | # relative to this directory. They are copied after the builtin static files, 211 | # so a file named "default.css" will overwrite the builtin "default.css". 212 | 213 | html_static_path = ['_static'] 214 | 215 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 216 | # using the given strftime format. 217 | html_last_updated_fmt = '%b %d, %Y' 218 | 219 | # If true, SmartyPants will be used to convert quotes and dashes to 220 | # typographically correct entities. 221 | #html_use_smartypants = True 222 | 223 | # Custom sidebar templates, maps document names to template names. 224 | html_index = 'index.html' 225 | 226 | #Custom sidebar templates, maps page names to templates. 227 | #html_sidebars = { 228 | # 'index': [ 'indexsidebar.html'], 229 | # 'contents':'indexsidebar.html', 230 | #} 231 | # Additional templates that should be rendered to pages, maps page names to 232 | # template names. 233 | #html_additional_pages = { 'index': 'index.html'} 234 | 235 | 236 | # If false, no module index is generated. 237 | html_use_modindex = True 238 | html_domain_indices = True 239 | # If false, no index is generated. 240 | html_use_index = True 241 | 242 | # If true, the index is split into individual pages for each letter. 
243 | html_split_index = False 244 | 245 | # If true, links to the reST sources are added to the pages. 246 | html_show_sourcelink = True 247 | html_copy_source = True 248 | 249 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 250 | html_show_sphinx = True 251 | 252 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 253 | #html_show_copyright = True 254 | 255 | # If true, an OpenSearch description file will be output, and all pages will 256 | # contain a tag referring to it. The value of this option must be the 257 | # base URL from which the finished HTML is served. 258 | #html_use_opensearch = '' 259 | 260 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 261 | #html_file_suffix = None 262 | 263 | # Output file base name for HTML help builder. 264 | htmlhelp_basename = 'doc' 265 | 266 | 267 | # -- Options for LaTeX output -------------------------------------------------- 268 | 269 | # NOT in original quickstart 270 | pngmath_use_preview = True 271 | 272 | # The font size ('10pt', '11pt' or '12pt'). 273 | latex_font_size = '10pt' 274 | 275 | # Grouping the document tree into LaTeX files. List of tuples 276 | # (source start file, target name, title, author, documentclass [howto/manual]). 277 | latex_documents = [ 278 | ('index', 'main.tex', title, 279 | author, 'manual'), 280 | ] 281 | 282 | latex_elements = { 'inputenc': '\\usepackage[utf8]{inputenc}' } 283 | 284 | # The name of an image file (relative to this directory) to place at the top of 285 | # the title page. 286 | #latex_logo = None 287 | 288 | # For "manual" documents, if this is true, then toplevel headings are parts, 289 | # not chapters. 290 | latex_use_parts = False 291 | 292 | # If true, show page references after internal links. 293 | #latex_show_pagerefs = False 294 | 295 | # If true, show URL addresses after external links. 296 | #latex_show_urls = False 297 | 298 | # Additional stuff for the LaTeX preamble. 
299 | 300 | 301 | # Documents to append as an appendix to all manuals. 302 | #latex_appendices = [] 303 | 304 | # If false, no module index is generated. 305 | #latex_domain_indices = True 306 | 307 | 308 | # -- Options for manual page output -------------------------------------------- 309 | 310 | # One entry per manual page. List of tuples 311 | # (source start file, name, description, authors, manual section). 312 | man_pages = [ 313 | ('index', project, project, 314 | [author], 1) 315 | ] 316 | 317 | 318 | # Example configuration for intersphinx: refer to the Python standard library. 319 | intersphinx_mapping = { 320 | "python": ('http://docs.python.org/', None), 321 | } 322 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/data/rnadiff_one_factor.R: -------------------------------------------------------------------------------- 1 | ################################################### 2 | ### DESeq2_1factor parameters: to be modified by the user 3 | ################################################### 4 | rm(list=ls()) # remove all the objects of the R session 5 | 6 | workspace <- "." # workspace for the R session 7 | 8 | projectName <- "BXXXX" # name of the project (cannot contain any ".") 9 | analysisVersion <- "vN" # name of the analysis version (cannot contain any ".") 10 | 11 | author <- " (Biomics platform - Institut Pasteur)" # author of the statistical report 12 | researcher <- "" # name of the researcher 13 | chief <- "" # name of the head of unit 14 | 15 | varInt <- "condition" # factor of interest 16 | condRef <- "WT" # reference biological condition e.g. 
WT 17 | batch <- NULL # factor on which to adjust the statistical model: NULL (default) or "batch" for example 18 | 19 | outfile <- TRUE # TRUE to export figures, FALSE to display them in R 20 | colors <- c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", # vector of colors of each group on the plots 21 | "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5", 22 | "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", 23 | "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26") 24 | 25 | cooksCutoff <- NULL # outliers detection threshold (NULL to leave DESeq2 choosing it, Inf to keep outliers) 26 | independentFiltering <- TRUE # FALSE to turn off the independent filtering (default is TRUE) 27 | allComp <- TRUE # make all the possible comparisons or only those to the reference level? 28 | alpha <- 0.05 # threshold of statistical significance 29 | adjMethod <- "BH" # p-value adjustment method: "BH" (default) or "BY" 30 | type.trans <- "VST" # transformation for exploratory analysis: "VST" ou "rlog" (if size factors vary very widely) 31 | locfunc <- "median" # "median" (default) or "shorth" with library(genefilter) (to estimate the size factors) 32 | interestingFeatures <- NULL # vector of features for which to plot the expression 33 | featuresToRemove <- c("alignment_not_unique", # names of the features to be removed (default is the HTSeq-count specific lines) 34 | "ambiguous", "no_feature", 35 | "not_aligned", "too_low_aQual") 36 | 37 | fitType <- "parametric" # mean-variance relationship: "parametric" (default) or "local" 38 | 39 | ##################################### 40 | # INPUT FILES 41 | ##################################### 42 | geneLengthFile <- "input_gene_lengths.tsv" # path to the genes lenghts file (default is NULL) 43 | targetFile <- "target.txt" # path to the design/target file 44 | infoFile <- "input_info.tsv" # path to the annotation file (needed if 0 counts not in counts files) 45 | rawDir <- "feature_counts" # path to the directory containing 
raw counts files 46 | 47 | ################################################### 48 | ### code chunk number 1: construction autres parametres et divers chargements 49 | ################################################### 50 | setwd(workspace) 51 | library(RNADiff) 52 | library(knitr) 53 | if (locfunc=="shorth") library(genefilter) 54 | 55 | versionName <- paste(projectName, analysisVersion, sep="-") 56 | ncol <- NULL # largeur des tableaux dans le rapport 57 | 58 | cat("Creation des dossiers d'exports\n") 59 | dir.create("figures", showWarnings=FALSE) 60 | dir.create("tables", showWarnings=FALSE) 61 | 62 | ################################################### 63 | ### code chunk number 2: loadData 64 | ################################################### 65 | cat("Chargement des annotations et longueurs des genes si besoin\n") 66 | if (!is.null(infoFile)) print(head(info <- read.delim(infoFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else info <- NULL 67 | if (!is.null(geneLengthFile)) print(head(glength <- read.table(geneLengthFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else glength <- NULL 68 | 69 | cat("Chargement du target file\n") 70 | print(target <- loadTargetFile(targetFile, varInt=varInt, condRef=condRef)) 71 | conds <- levels(target[,varInt]) 72 | group <- data.frame(group=factor(target[,varInt])) 73 | 74 | cat("Chargement des donnees\n") 75 | counts <- loadCountData(target, rawDir=rawDir, versionName=versionName, featuresToRemove=featuresToRemove) 76 | 77 | cat("Verifier que les echantillons de counts sont dans le meme ordre que le target\n") 78 | print(cbind(target=as.character(target[,1]),counts=colnames(counts))) 79 | 80 | cat("Verifier que les identifiants dans info et glength sont les memes que dans les comptages\n") 81 | checkInfoGlength(counts=counts, info=info, glength=glength) 82 | 83 | ################################################### 84 | ### code chunk number 3: description of raw data 85 | 
################################################### 86 | cat("\nFigure : nombre de reads par echantillon\n") 87 | barplotTC(counts=counts, group=group, col=colors, out=outfile, versionName=versionName) 88 | 89 | cat("Figure : nombre de comptages nuls par echantillon\n") 90 | barplotNul(counts=counts, group=group, col=colors, out=outfile, versionName=versionName) 91 | N <- nrow(counts) - nrow(removeNul(counts)) 92 | cat("\nNombre de genes avec que des comptages nuls :", N,"\n") 93 | 94 | cat("\nFigure : estimation de la densite des comptages de chaque echantillon\n") 95 | densityPlot(counts=counts, group=group, col=colors, out=outfile, versionName=versionName) 96 | 97 | cat("\nFigure + tableau : sequences majoritaires pour chaque echantillon\n") 98 | majSequences <- majSequences(counts=counts, group=group, versionName=versionName, col=colors, out=outfile) 99 | 100 | cat("\nCalcul des SERE\n") 101 | print(sere <- pairwiseSERE(counts, versionName=versionName)) 102 | 103 | cat("\nFigure : pairwise scatterplots of samples\n") 104 | pairwiseScatterPlots(counts=counts, group=group, out=outfile, versionName=versionName) 105 | 106 | ################################################### 107 | ### code chunk number 4: creating DESeqDataSet object, normalization and estimateDispersion 108 | ################################################### 109 | dds <- DESeqDataSetFromMatrix(countData=counts, colData=target, 110 | design=formula(paste("~", ifelse(!is.null(batch), paste(batch,"+"), ""), varInt))) 111 | print(design(dds)) 112 | 113 | cat("Estimation des size factors\n") 114 | dds <- estimateSizeFactors(dds, locfunc=eval(as.name(locfunc))) 115 | print(sf <- sizeFactors(dds)) 116 | cat("\nFigure : diagnostic des size factors\n") 117 | diagSizeFactors(dds=dds, group=group, col=colors, out=outfile, versionName=versionName) 118 | 119 | cat("\nCalcul des dispersions et graph relation mean-dispersion\n") 120 | dds <- estimateDispersions(dds, fitType=fitType) 121 | 
plotDispEstimates(dds=dds, out=outfile, versionName=versionName) 122 | cat("\nFigure : diagnostic de log-normalite des dispersions\n") 123 | diagLogNormalityDisp(dds=dds, out=outfile, versionName=versionName) 124 | 125 | ################################################### 126 | ### code chunk number 5: Boxplot avant et apres normalisation 127 | ################################################### 128 | cat("Figure : boxplots sur comptages bruts et normalises\n") 129 | boxplotCounts(counts=counts(dds), group=group, col=colors, out=outfile, versionName=versionName) 130 | boxplotCounts(counts=counts(dds, normalized=TRUE), group=group, col=colors, type="norm", out=outfile, versionName=versionName) 131 | 132 | ################################################### 133 | ### code chunk number 6: clustering + PCA of samples 134 | ################################################### 135 | cat("Figure : dendrogramme de la classification sur comptages transformes\n") 136 | if (type.trans == "VST") counts.trans <- assay(varianceStabilizingTransformation(dds)) 137 | if (type.trans == "rlog") counts.trans <- assay(rlogTransformation(dds)) 138 | clusterPlot(counts=counts.trans, out=outfile, versionName=versionName) 139 | 140 | cat("Figure : premier plan de l'ACP sur les comptages transformes\n") 141 | PCAPlot(dds=dds, group=group, type.trans=type.trans, col=colors, out=outfile, versionName=versionName) 142 | 143 | ################################################### 144 | ### code chunk number 7: analyse differentielle 145 | ################################################### 146 | cat("Tests statistiques\n") 147 | dds <- nbinomWaldTest(dds) 148 | results <- list() 149 | for (comp in combn(nlevels(colData(dds)[,varInt]), 2, simplify=FALSE)){ 150 | if (!allComp & comp[1]!=1) next 151 | levelRef <- levels(colData(dds)[,varInt])[comp[1]] 152 | levelTest <- levels(colData(dds)[,varInt])[comp[2]] 153 | results[[paste0(levelTest,"_vs_",levelRef)]] <- results(dds, contrast=c(varInt, 
levelTest, levelRef), pAdjustMethod=adjMethod, 154 | cooksCutoff=ifelse(!is.null(cooksCutoff), cooksCutoff, TRUE), 155 | independentFiltering=independentFiltering, alpha=alpha) 156 | cat(paste0("Comparison ", levelTest, " vs ", levelRef, "\n")) 157 | } 158 | 159 | ################################################### 160 | ### code chunk number 8: results of the independent filtering 161 | ################################################### 162 | if(independentFiltering){ 163 | cat("Tableau : independent filtering\n") 164 | print(tabIndepFiltering <- tabIndepFiltering(results, versionName=versionName), quote=FALSE) 165 | } 166 | 167 | ################################################### 168 | ### code chunk number 9: export tables 169 | ################################################### 170 | cat("Export des resultats\n") 171 | complete <- exportComplete.DESeq2(dds=dds, results=results, alpha=alpha, group=group[,1], 172 | cooksCutoff=cooksCutoff, conds=conds, versionName=versionName, 173 | info=info, export=TRUE) 174 | 175 | cat("# genes up, down et total par comparaison\n") 176 | print(nDiffTotal <- nDiffTotal(complete, alpha=alpha, versionName=versionName), quote=FALSE) 177 | 178 | cat("Figure : nb de genes DE selon seuil FDR\n") 179 | nbDiffSeuil(complete=complete, out=outfile, versionName=versionName) 180 | 181 | if (!is.null(geneLengthFile)){ 182 | cat("Export : comptages normalises par la longueur des genes\n") 183 | normGeneLength(counts=counts(dds, normalized=TRUE), glength=glength, versionName=versionName) 184 | geneLengthEffect(counts, complete, glength, out=outfile, versionName=versionName) 185 | } 186 | 187 | ################################################### 188 | ### code chunk number 10: distribution of raw p-values and MA-plot 189 | ################################################### 190 | cat("Figure : distribution des log2(Fold-Changes)\n") 191 | diagLogFC(complete=complete, out=outfile, versionName=versionName) 192 | 193 | cat("Figure : 
histogramme des p-valeurs brutes\n") 194 | histoRawp(complete=complete, out=outfile, versionName=versionName) 195 | 196 | cat("\nFigure : MA-plot\n") 197 | MAplotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName) 198 | 199 | cat("\nFigure : volcano-plot\n") 200 | volcanoPlotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName) 201 | 202 | # this causes trouble quite often 203 | #cat("\nFigure : Venn diagram\n") 204 | #vennDiagramDE(complete=complete, alpha=alpha, out=outfile, versionName=versionName) 205 | 206 | cat("\nFigure : heatmap\n") 207 | heatmapDE(counts.trans=counts.trans, complete=complete, alpha=alpha, out=outfile, 208 | key.xlab=paste0(type.trans, "-centered data"), versionName=versionName) 209 | 210 | cat("\nFigure : interesting features\n") 211 | if (!is.null(interestingFeatures)){ 212 | plotEvolution(mat=log2(counts(dds,normalized=TRUE)+1), features=interestingFeatures, 213 | target=target, varInt1=varInt, colors=colors, ylab=expression(log[2] ~ norm ~ counts + 1), 214 | out=outfile, versionName=versionName) 215 | } 216 | 217 | ################################################### 218 | ### code chunk number 11: sessionInfo and saving 219 | ################################################### 220 | cat("Sauvegarde des resultats\n") 221 | sessionInfo <- sessionInfo() 222 | pckVersionRNADiff <- packageVersion("RNADiff") 223 | pckVersionDESeq2 <- packageVersion("DESeq2") 224 | save.image(file=paste0(versionName, ".RData")) 225 | # export RData for PF2heatmaps 226 | results <- lapply(results, as.data.frame) 227 | pf2heatmaps_objects <- c("varInt", "target", "type.trans", "counts.trans", "results", "info") 228 | save(list=pf2heatmaps_objects, file=paste0(versionName, "_PF2heatmaps.RData"), version=2) 229 | # export RData for PF2toolsFilter 230 | extract_col <- function(comp, info=NULL){ 231 | if (is.null(info)){ 232 | comp[, c("Id","baseMean", "log2FoldChange","padj")] 233 | } else{ 234 | comp[, 
c(1:ncol(info), which(names(comp) %in% c("baseMean", "log2FoldChange","padj")))] 235 | } 236 | } 237 | complete <- lapply(complete, extract_col, info=info) 238 | save(complete, file=paste0(versionName, "_PF2toolsFilter.RData"), version=2) 239 | 240 | ################################################### 241 | ### code chunk number 12: knitr compilation 242 | ################################################### 243 | if (!outfile){ 244 | cat("outfile is FALSE: report and slides cannot be generated\n") 245 | } else{ 246 | cat("Creation du rapport et des slides\n") 247 | knit(system.file("report1factor.Rnw", package="RNADiff"), paste0("report-", versionName, ".tex"), quiet=TRUE) 248 | knit(system.file("slides1factor.Rnw", package="RNADiff"), paste0("slides-", versionName, ".tex"), quiet=TRUE) 249 | cat("Compilation du rapport\n") 250 | system(paste0("pdflatex report-", versionName, ".tex")) 251 | system(paste0("bibtex report-", versionName, ".aux")) 252 | system(paste0("pdflatex report-", versionName, ".tex")) 253 | system(paste0("pdflatex report-", versionName, ".tex")) 254 | } 255 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/main.py: -------------------------------------------------------------------------------- 1 | # 2 | # This file is part of Sequana software 3 | # 4 | # Copyright (c) 2016-2021 - Sequana Development Team 5 | # 6 | # File author(s): 7 | # Thomas Cokelaer 8 | # 9 | # Distributed under the terms of the 3-clause BSD license. 10 | # The full license is in the LICENSE file, distributed with this software. 
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import os
import shutil
import subprocess
import sys

import click_completion
import rich_click as click
from sequana_pipetools import SequanaManager
from sequana_pipetools.options import *

click_completion.init()
NAME = "rnaseq"


help = init_click(
    NAME,
    groups={
        "Pipeline Specific": [
            "--aligner-choice",
            "--contaminant-file",
            "--do-igvtools",
            "--do-bam-coverage",
            "--do-mark-duplicates",
            "--do-rnaseqc",
            "--do-rseqc",
            "--genome-directory",
            "--rnaseqc-gtf-file",
            "--rRNA-feature",
            "--rseqc-bed-file",
            "--skip-rRNA",
            "--skip-gff-check",
            "--trimming-quality",
        ],
    },
)


def _genome_paths(genome_directory):
    """Return ``(directory, name, prefix)`` for a genome directory.

    ``directory`` is the absolute path, ``name`` its last component and
    ``prefix`` is ``directory/name``; the pipeline expects ``prefix + ".fa"``
    and ``prefix + ".gff"`` to exist.
    """
    directory = os.path.abspath(genome_directory)
    name = os.path.basename(directory)
    return directory, name, os.path.join(directory, name)


def _fastp_adapter_options(read1, read2):
    """Build the fastp adapter command-line fragment.

    The two options are separated by a space; without it fastp receives a
    single malformed token when both adapters are provided.
    """
    parts = []
    if read1:
        parts.append(f"--adapter_sequence {read1}")
    if read2:
        parts.append(f"--adapter_sequence_r2 {read2}")
    return " ".join(parts)


def _check_gff(cfg, gff_file, logger):
    """Check that the GFF agrees with the rRNA and feature-counts settings.

    Only meant for a single feature type (no comma in
    ``cfg.feature_counts.feature``). Logs an error and exits with status 1
    when the rRNA feature, the counted feature, the counted attribute or one
    of the extra attributes cannot be found in ``gff_file``.
    """
    # imported lazily, as in the rest of this module
    from sequana.gff3 import GFF3

    gff = GFF3(gff_file)
    df_gff = gff.df  # this takes about one minute on eukaryotes
    valid_features = gff.features  # about 3 seconds

    # first check the rRNA feature
    rRNA_feature = cfg.general.rRNA_feature
    if rRNA_feature and rRNA_feature not in valid_features:
        logger.error(
            "rRNA feature not found in the input GFF ({})".format(gff_file)
            + " This is probably an error. Please check the GFF content and /or"
            " change the feature name with --rRNA-feature based on the content"
            " of your GFF. Valid features are: {}".format(valid_features)
        )
        sys.exit(1)

    # then, check the main feature
    fc_type = cfg.feature_counts.feature
    fc_attr = cfg.feature_counts.attribute

    logger.info("Checking your input GFF file and feature counts options.")
    logger.info(f"You chose '{fc_type}' feature and '{fc_attr}' attribute")

    dd = df_gff[df_gff["genetic_type"] == fc_type]
    n_features = len(dd)
    if n_features == 0:
        logger.error(
            "Found 0 entries for feature '{}'. Please choose a valid feature from: {}".format(
                fc_type, valid_features
            )
        )
        sys.exit(1)
    logger.info(f"Found {n_features} '{fc_type}' entries")

    # now we check the attribute among the entries of the selected feature
    attributes = [key for entry in dd.attributes for key in entry]
    uniq_attributes = set(attributes)
    n_attributes = attributes.count(fc_attr)
    if n_attributes == 0:
        logger.error(
            f"Found 0 entries for attribute '{fc_attr}'. Please choose a valid attribute from: {uniq_attributes}"
        )
        sys.exit(1)

    unique = {entry[fc_attr] for entry in dd.attributes if fc_attr in entry}
    logger.info(f"Found {n_attributes} '{fc_attr}' entries for the attribute [{len(unique)} unique entries]")
    if n_attributes != len(unique):
        logger.warning("Attribute non-unique. Feature counts should handle it")

    extra_attributes = cfg.feature_counts.extra_attributes
    if extra_attributes:
        for extra_attr in extra_attributes.split(","):
            if extra_attr not in uniq_attributes:
                logger.error(f"{extra_attr} not found in the GFF attributes. Try one of {uniq_attributes}")
                sys.exit(1)


# Command-line entry point: translate the click options into the pipeline
# configuration, run the sanity checks on the genome files and let the
# SequanaManager write the working directory.
#
# NOTE: documented with comments rather than a docstring on purpose: click
# would otherwise display the docstring in the --help output.
@click.command(context_settings=help)
@include_options_from(ClickInputOptions)
@include_options_from(ClickSnakemakeOptions, working_directory=NAME)
@include_options_from(ClickSlurmOptions)
@include_options_from(ClickGeneralOptions)
@include_options_from(ClickTrimmingOptions)
@include_options_from(ClickFeatureCountsOptions)
@click.option(
    "--genome-directory",
    "genome_directory",
    show_default=True,
    type=click.Path(dir_okay=True, file_okay=False),
    required=True,
)
@click.option(
    "--aligner-choice",
    "aligner",
    required=True,
    type=click.Choice(["bowtie2", "bowtie1", "star", "salmon"]),
    help="a mapper in bowtie1, bowtie2, star, salmon",
)
@click.option(
    "--rRNA-feature",
    "rRNA",
    default="rRNA",
    help="""Feature name corresponding to the rRNA to be identified in
the input GFF/GTF files. Must exist and be valid. If you do not have any,
you may skip this step using --skip-rRNA or provide a fasta file using --contaminant-file""",
)
@click.option(
    "--skip-rRNA",
    "skip_rRNA",
    is_flag=True,
    help="""skip the mapping on rRNA feature. ignored if --contaminant-file is provided""",
)
@click.option(
    "--contaminant-file",
    default=None,
    show_default=True,
    help="""A fasta file. If used, the rRNA-feature is not used
This option is useful if you have a dedicated list of rRNA feature or a dedicated
fasta file to search for contaminants""",
)
@click.option(
    "--skip-gff-check",
    is_flag=True,
    default=False,
    show_default=True,
    help="""By default we check the coherence between the input
GFF file and related options (e.g. --feature_counts_feature_type and
--feature_counts_attribute options). This may take time e.g. for mouse or human.
Using this option skips the sanity checks""",
)
@click.option(
    "--do-igvtools",
    is_flag=True,
    help="""if set, this will compute TDF files that can be imported in
IGV browser. TDF file allows to quickly visualise the coverage of the mapped
reads.""",
)
@click.option(
    "--do-bam-coverage",
    is_flag=True,
    help="Similar to --do-igvtools using bigwig",
)
@click.option(
    "--do-mark-duplicates",
    is_flag=True,
    help="""Mark duplicates. To be used e.g. with QCs""",
)
@click.option("--do-rnaseqc", is_flag=True, help="do RNA-seq QC using RNAseQC v2")
@click.option(
    "--rnaseqc-gtf-file",
    help="""The GTF file to be used for RNAseQC. Without a valid GTF,
RNAseqQC will not work. You may try sequana gff-to-gtf application.""",
)
@click.option(
    "--do-rseqc",
    is_flag=True,
    help="""do RNA-seq QC using RseQC. This will need a BED file
corresponding to your GFF file. For prokaryotes, the BED file is created on the
fly.""",
)
@click.option("--rseqc-bed-file", help="""The rseQC input bed file.""")
def main(**options):

    if options["from_project"]:
        click.echo("--from-project Not yet implemented")
        sys.exit(1)
    # the real stuff is here
    manager = SequanaManager(options, NAME)
    manager.setup()

    # aliases
    options = manager.options
    cfg = manager.config.config

    from sequana_pipetools import logger

    logger.setLevel(options.level)

    manager.fill_data_options()
    # --------------------------------------------------------- general
    genome_directory, genome_name, prefix_name = _genome_paths(options.genome_directory)
    cfg.general.genome_directory = genome_directory
    cfg.general.aligner = options.aligner

    # the reference must be stored as <genome_directory>/<genome_name>.fa
    fasta = prefix_name + ".fa"
    if not os.path.exists(fasta):
        logger.critical(
            """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory.""".format(
                fasta
            )
        )
        sys.exit(1)

    # mutually exclusive options
    if options.contaminant_file:
        cfg.general.contaminant_file = os.path.abspath(options.contaminant_file)
        logger.warning("You are using a custom FASTA --contaminant-file so --rRNA-feature will be ignored")
        cfg.general.rRNA_feature = None
    elif options.skip_rRNA:
        cfg.general.rRNA_feature = None
    else:
        cfg.general.rRNA_feature = options.rRNA

    # --------------------------------------------------------- trimming
    cfg.trimming.software_choice = options.trimming_software_choice
    cfg.trimming.do = not options.disable_trimming
    # -1 means "not set by the user": each trimmer then gets its own default
    qual = options.trimming_quality

    if options.trimming_software_choice in ["cutadapt", "atropos"]:
        cfg.cutadapt.tool_choice = options.trimming_software_choice
        cfg.cutadapt.fwd = options.trimming_adapter_read1
        cfg.cutadapt.rev = options.trimming_adapter_read2
        cfg.cutadapt.m = options.trimming_minimum_length
        cfg.cutadapt.mode = options.trimming_cutadapt_mode
        cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
        cfg.cutadapt.quality = 30 if qual == -1 else qual
    else:
        cfg.fastp.minimum_length = options.trimming_minimum_length
        cfg.fastp.quality = 15 if qual == -1 else qual
        cfg.fastp.adapters = _fastp_adapter_options(
            options.trimming_adapter_read1, options.trimming_adapter_read2
        )
        cfg.fastp.options = " --cut_tail "
        cfg.fastp.disable_quality_filtering = False
        cfg.fastp.disable_adapter_trimming = False

    # ----------------------------------------------------- feature counts
    cfg.feature_counts.options = options.feature_counts_options
    cfg.feature_counts.strandness = options.feature_counts_strandness
    cfg.feature_counts.attribute = options.feature_counts_attribute
    cfg.feature_counts.feature = options.feature_counts_feature_type
    cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

    # ------------------------------------------------------ optional
    cfg.igvtools.do = options.do_igvtools
    cfg.bam_coverage.do = options.do_bam_coverage
    cfg.mark_duplicates.do = bool(options.do_mark_duplicates)

    # -------------------------------------------------------- RNAseqQC
    cfg.rnaseqc.do = options.do_rnaseqc

    if options.do_rnaseqc:
        if options.rnaseqc_gtf_file is None:
            logger.info(
                "You asked for RNA_seqc QC assessements but no GTF"
                " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                " the gtf file"
            )
            cfg.rnaseqc.do = False
        if options.aligner in ["salmon"]:
            logger.warning(
                "You asked for RNA_seqc QC assessements but no"
                " BAM will be generated by the salmon aligner. Switching off this option. "
            )
            cfg.rnaseqc.do = False

    cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

    cfg.rseqc.do = options.do_rseqc
    cfg.rseqc.bed_file = options.rseqc_bed_file

    # SANITY CHECKS
    # if we build a custom feature_counts set of options (several features
    # separated by commas), no need to check carefully the GFF; if users know
    # what they are doing (--skip-gff-check), no need to check the GFF either
    several_features = "," in cfg.feature_counts.feature
    if options.skip_gff_check is False and not several_features:
        logger.info("Checking your input GFF file and rRNA feature if provided")
        _check_gff(cfg, prefix_name + ".gff", logger)

    # several features: build a GFF restricted to them; it is created in the
    # current directory and copied into the working directory further below
    if several_features:
        logger.info("Building a custom GFF file (custom.gff) using Sequana. Please wait")

        from sequana.gff3 import GFF3

        gff = GFF3(prefix_name + ".gff")
        fc_types = cfg.feature_counts.feature.strip().split(",")
        gff.save_gff_filtered(features=fc_types, filename="custom.gff")
        cfg.general.custom_gff = "custom.gff"

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # custom_gff was added in recent config files only: tolerate its absence
    try:
        custom_gff = cfg.general.custom_gff
    except (AttributeError, KeyError):
        custom_gff = None

    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # best effort, as before, but tell the user instead of hiding it
            logger.warning(f"Could not copy {custom_gff} into {options.workdir}: {err}")


if __name__ == "__main__":
    main()
off the independent filtering (default is TRUE) 29 | alpha <- 0.05 # threshold of statistical significance 30 | adjMethod <- "BH" # p-value adjustment method: "BH" (default) or "BY" 31 | 32 | type.trans <- "VST" # transformation for exploratory analysis: "VST" ou "rlog" (if size factors vary very widely) 33 | locfunc <- "median" # "median" (default) or "shorth" with library(genefilter) (to estimate the size factors) 34 | interestingFeatures <- NULL # vector of features for which to plot the expression 35 | featuresToRemove <- c("alignment_not_unique", # names of the features to be removed (default is the HTSeq-count specific lines) 36 | "ambiguous", "no_feature", 37 | "not_aligned", "too_low_aQual") 38 | 39 | fitType <- "parametric" # mean-variance relationship: "parametric" (default) or "local" 40 | 41 | ##################################### 42 | # INPUT FILES 43 | ##################################### 44 | geneLengthFile <- "input_gene_lengths.tsv" # path to the genes lenghts file (default is NULL) 45 | targetFile <- "target.txt" # path to the design/target file 46 | infoFile <- "input_info.tsv" # path to the annotation file (needed if 0 counts not in counts files) 47 | rawDir <- "feature_counts" # path to the directory containing raw counts files 48 | 49 | ################################################### 50 | ### code chunk number 1: construction autres parametres et divers chargements 51 | ################################################### 52 | setwd(workspace) 53 | library(RNADiff) 54 | library(knitr) 55 | if (locfunc=="shorth") library(genefilter) 56 | 57 | versionName <- paste(projectName, analysisVersion, sep="-") 58 | ncol <- NULL # largeur des tableaux dans le rapport 59 | 60 | cat("Creation des dossiers d'exports\n") 61 | dir.create("figures", showWarnings=FALSE) 62 | dir.create("tables", showWarnings=FALSE) 63 | 64 | ################################################### 65 | ### code chunk number 2: loadData 66 | 
################################################### 67 | cat("Chargement des annotations et longueurs des genes si besoin\n") 68 | if (!is.null(infoFile)) print(head(info <- read.delim(infoFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else info <- NULL 69 | if (!is.null(geneLengthFile)) print(head(glength <- read.table(geneLengthFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else glength <- NULL 70 | 71 | cat("Chargement du target file\n") 72 | print(target <- loadTargetFile(targetFile, varInt=c(varInt1,varInt2), condRef=c(condRef1,condRef2))) 73 | 74 | cat("Chargement des donnees\n") 75 | counts <- loadCountData(target, rawDir=rawDir, versionName=versionName, featuresToRemove=featuresToRemove) 76 | 77 | cat("Verifier que les echantillons de counts sont dans le meme ordre que le target\n") 78 | print(cbind(target=as.character(target[,1]),counts=colnames(counts))) 79 | 80 | cat("Verifier que les identifiants dans info et glength sont les memes que dans les comptages\n") 81 | checkInfoGlength(counts=counts, info=info, glength=glength) 82 | 83 | #################################################### 84 | #### code chunk number 3: description of raw data 85 | #################################################### 86 | cat("\nFigure : nombre de reads par echantillon\n") 87 | barplotTC(counts=counts, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName) 88 | 89 | cat("Figure : nombre de comptages nuls par echantillon\n") 90 | barplotNul(counts=counts, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName) 91 | N <- nrow(counts) - nrow(removeNul(counts)) 92 | cat("\nNombre de genes avec que des comptages nuls :", N,"\n") 93 | 94 | cat("\nFigure : estimation de la densite des comptages de chaque echantillon\n") 95 | densityPlot(counts=counts, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName) 96 | 97 | cat("\nFigure + tableau : sequences majoritaires pour chaque 
echantillon\n") 98 | majSequences <- majSequences(counts=counts, group=target[,c(varInt1,varInt2)], versionName=versionName, col=colors, out=outfile) 99 | 100 | cat("\nCalcul des SERE\n") 101 | print(sere <- pairwiseSERE(counts, versionName=versionName)) 102 | 103 | cat("\nFigure : pairwise scatterplots of samples\n") 104 | pairwiseScatterPlots(counts=counts, group=target[,c(varInt1,varInt2)], out=outfile, versionName=versionName) 105 | 106 | ################################################### 107 | ### code chunk number 4: creating DESeqDataSet object, normalization and estimateDispersion 108 | ################################################### 109 | dds <- DESeqDataSetFromMatrix(countData=counts, colData=target, design=design) 110 | print(design(dds)) 111 | 112 | cat("Estimation des size factors\n") 113 | dds <- estimateSizeFactors(dds, locfunc=eval(as.name(locfunc))) 114 | print(sf <- sizeFactors(dds)) 115 | cat("\nFigure : diagnostic des size factors\n") 116 | diagSizeFactors(dds=dds, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName) 117 | 118 | cat("\nCalcul des dispersions et graph relation mean-dispersion\n") 119 | dds <- estimateDispersions(dds, fitType=fitType) 120 | plotDispEstimates(dds=dds, out=outfile, versionName=versionName) 121 | cat("\nFigure : diagnostic de log-normalite des dispersions\n") 122 | diagLogNormalityDisp(dds=dds, out=outfile, versionName=versionName) 123 | 124 | #################################################### 125 | ### code chunk number 5: Boxplot avant et apres normalisation 126 | #################################################### 127 | cat("Figure : boxplots sur comptages bruts et normalises\n") 128 | boxplotCounts(counts=counts(dds), group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName) 129 | boxplotCounts(counts=counts(dds, normalized=TRUE), group=target[,c(varInt1,varInt2)], col=colors, type="norm", out=outfile, versionName=versionName) 130 | 131 | 
################################################### 132 | ### code chunk number 6: clustering + PCA of samples 133 | ################################################### 134 | cat("Figure : dendrogramme de la classification sur comptages transformes\n") 135 | if (type.trans == "VST") counts.trans <- assay(varianceStabilizingTransformation(dds)) 136 | if (type.trans == "rlog") counts.trans <- assay(rlogTransformation(dds)) 137 | clusterPlot(counts=counts.trans, out=outfile, versionName=versionName) 138 | 139 | cat("Figure : premier plan de l'ACP sur les comptages transformes\n") 140 | PCAPlot(dds=dds, group=target[,c(varInt1,varInt2)], col=colors, type.trans=type.trans, out=outfile, versionName=versionName) 141 | 142 | ################################################### 143 | ### code chunk number 7: analyse differentielle 144 | ################################################### 145 | cat("Tests statistiques\n") 146 | dds <- nbinomWaldTest(dds) 147 | 148 | resultsNames(dds) 149 | # [1] "Intercept" "soucheSEG" "soucheB6" 150 | # [4] "infectionNI" "infectionImoins" "infectionIplus" 151 | # [7] "soucheSEG.infectionNI" "soucheB6.infectionNI" "soucheSEG.infectionImoins" 152 | # [10] "soucheB6.infectionImoins" "soucheSEG.infectionIplus" "soucheB6.infectionIplus" 153 | 154 | to_test <- list("B6-NI_vs_SEG-NI"=c(0,-1,1,0,0,0,-1,1,0,0,0,0), 155 | "B6-Imoins_vs_SEG-Imoins"=c(0,-1,1,0,0,0,0,0,-1,1,0,0), 156 | "(SEG-Iplus_vs_SEG-Imoins)_vs_(B6-Iplus_vs_B6-Imoins)"=c(0,0,0,0,0,0,0,0,-1,1,1,-1)) 157 | 158 | checkContrasts(coefs=resultsNames(dds),contrasts=to_test,versionName=versionName) 159 | 160 | results <- vector("list",length(to_test)); names(results) <- names(to_test); 161 | for (name in names(to_test)){ 162 | results[[name]] <- results(dds, contrast=to_test[[name]], pAdjustMethod=adjMethod, 163 | cooksCutoff=ifelse(!is.null(cooksCutoff), cooksCutoff, TRUE), 164 | independentFiltering=independentFiltering, alpha=alpha) 165 | } 166 | 167 | 
################################################### 168 | ### code chunk number 8: results of the independent filtering 169 | ################################################### 170 | if(independentFiltering){ 171 | cat("Tableau : independent filtering\n") 172 | print(tabIndepFiltering <- tabIndepFiltering(results, versionName=versionName), quote=FALSE) 173 | } 174 | 175 | ################################################### 176 | ### code chunk number 9: export tables 177 | ################################################### 178 | cat("Export des resultats\n") 179 | complete <- exportComplete.DESeq2(dds=dds, results=results, alpha=alpha, cooksCutoff=cooksCutoff, 180 | group=paste(target[,varInt1], target[,varInt2], sep="-"), 181 | conds=unique(paste(target[,varInt1], target[,varInt2], sep="-")), 182 | versionName=versionName, info=info, export=TRUE) 183 | 184 | cat("# genes up, down et total par comparaison\n") 185 | print(nDiffTotal <- nDiffTotal(complete, alpha=alpha, versionName=versionName), quote=FALSE) 186 | 187 | cat("Figure : nb de genes DE selon seuil FDR\n") 188 | nbDiffSeuil(complete=complete, out=outfile, versionName=versionName) 189 | 190 | if (!is.null(geneLengthFile)){ 191 | cat("Export : comptages normalises par la longueur des genes\n") 192 | normGeneLength(counts=counts(dds, normalized=TRUE), glength=glength, versionName=versionName) 193 | geneLengthEffect(counts, complete, glength, out=outfile, versionName=versionName) 194 | } 195 | 196 | ################################################### 197 | ### code chunk number 10: distribution of raw p-values and MA-plot 198 | ################################################### 199 | cat("Figure : distribution des log2(Fold-Changes)\n") 200 | diagLogFC(complete=complete, out=outfile, versionName=versionName) 201 | 202 | cat("Figure : histogramme des p-valeurs brutes\n") 203 | histoRawp(complete=complete, out=outfile, versionName=versionName) 204 | 205 | cat("\nFigure : MA-plot\n") 206 | 
MAplotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName) 207 | 208 | cat("\nFigure : volcano-plot\n") 209 | volcanoPlotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName) 210 | 211 | cat("\nFigure : Venn diagram\n") 212 | vennDiagramDE(complete=complete, alpha=alpha, out=outfile, versionName=versionName) 213 | 214 | cat("\nFigure : heatmap\n") 215 | heatmapDE(counts.trans=counts.trans, complete=complete, alpha=alpha, out=outfile, 216 | key.xlab=paste0(type.trans, "-centered data"), versionName=versionName) 217 | 218 | cat("\nFigure : interesting features\n") 219 | if (!is.null(interestingFeatures)){ 220 | plotEvolution(mat=log2(counts(dds, normalized=TRUE)+1), features=interestingFeatures, 221 | target=target, varInt1=varInt2, varInt2=varInt1, colors=colors, 222 | ylab=expression(log[2] ~ norm ~ counts + 1), out=outfile, versionName=versionName) 223 | } 224 | 225 | ################################################### 226 | ### code chunk number 11: sessionInfo and saving 227 | ################################################### 228 | cat("Sauvegarde des resultats\n") 229 | sessionInfo <- sessionInfo() 230 | pckVersionRNADiff <- packageVersion("RNADiff") 231 | pckVersionDESeq2 <- packageVersion("DESeq2") 232 | save.image(file=paste0(versionName, ".RData")) 233 | # export RData for PF2heatmaps 234 | results <- lapply(results, as.data.frame) 235 | pf2heatmaps_objects <- c("varInt1", "varInt2", "target", "type.trans", "counts.trans", "results", "info") 236 | save(list=pf2heatmaps_objects, file=paste0(versionName, "_PF2heatmaps.RData"), version=2) 237 | # export RData for PF2toolsFilter 238 | extract_col <- function(comp, info=NULL){ 239 | if (is.null(info)){ 240 | comp[, c("Id","baseMean", "log2FoldChange","padj")] 241 | } else{ 242 | comp[, c(1:ncol(info), which(names(comp) %in% c("baseMean", "log2FoldChange","padj")))] 243 | } 244 | } 245 | complete <- lapply(complete, extract_col, info=info) 246 | save(complete, 
file=paste0(versionName, "_PF2toolsFilter.RData"), version=2) 247 | 248 | ################################################### 249 | ### code chunk number 12: knitr compilation 250 | ################################################### 251 | if (!outfile){ 252 | cat("outfile is FALSE: report and slides cannot be generated\n") 253 | } else{ 254 | cat("Creation du rapport et des slides\n") 255 | knit(system.file("reportGLM.Rnw", package="RNADiff"), paste0("report-", versionName, ".tex"), quiet=TRUE) 256 | knit(system.file("slidesGLM.Rnw", package="RNADiff"), paste0("slides-", versionName, ".tex"), quiet=TRUE) 257 | cat("Compilation du rapport\n") 258 | system(paste0("pdflatex report-", versionName, ".tex")) 259 | system(paste0("bibtex report-", versionName, ".aux")) 260 | system(paste0("pdflatex report-", versionName, ".tex")) 261 | system(paste0("pdflatex report-", versionName, ".tex")) 262 | } 263 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/config.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================ 2 | # Config file for RNA-seq 3 | # 4 | # ==================[ Sections for the users ]================================ 5 | # 6 | # One of input_directory, input_pattern and input_samples must be provided 7 | # If input_directory provided, use it otherwise if input_pattern provided, 8 | # use it, otherwise use input_samples. 9 | # ============================================================================ 10 | sequana_wrappers: "v24.8.29" 11 | 12 | input_directory: 13 | input_readtag: _R[12]_ 14 | input_pattern: '*fastq.gz' 15 | exclude_pattern: 16 | 17 | # See sequana_pipetools.readthedocs.io for details about these 2 options 18 | # common prefixes are removed. 
additional prefixes may be removed here 19 | #extra_prefixes_to_strip: [] 20 | # in special cases, sample names can be extracted with a pattern 21 | #sample_pattern: '{sample}.fastq.gz' 22 | apptainers: 23 | sequana_tools: "https://zenodo.org/record/7102074/files/sequana_tools_0.14.3.img" 24 | salmon: "https://zenodo.org/record/5708843/files/salmon_1.3.0.img" 25 | fastqc: "https://zenodo.org/record/7015004/files/fastqc_0.11.9-py3.img" 26 | fastp: "https://zenodo.org/record/7319782/files/fastp_0.23.2.img" 27 | igvtools: "https://zenodo.org/record/7022635/files/igvtools_2.12.0.img" 28 | graphviz: "https://zenodo.org/record/7928262/files/graphviz_7.0.5.img" 29 | multiqc: "https://zenodo.org/record/10205070/files/multiqc_1.16.0.img" 30 | rnaseqc: "https://zenodo.org/record/5799564/files/rnaseqc_2.35.0.img" 31 | 32 | # =========================================== Sections for the users 33 | 34 | ############################################################################# 35 | # Genome section: 36 | # 37 | # :Parameters: 38 | # 39 | # - aligner: either star or bowtie2. 40 | # - genome_directory: directory where all indexes are written. 41 | # - contaminant_file: path to an existing fasta file for ribosomal RNA (to be found in 42 | # genome_directory) 43 | # - rRNA_feature: if contaminant_file is not provided, ribosomal RNA will be extracted 44 | # from GFF using this feature name. It must be found.
45 | general: 46 | aligner: bowtie2 47 | genome_directory: 48 | contaminant_file: 49 | rRNA_feature: rRNA 50 | custom_gff: '' 51 | 52 | 53 | ################################################################# 54 | # FastQC section 55 | # 56 | # :Parameters: 57 | # 58 | # - options: string with any valid FastQC options 59 | # 60 | fastqc: 61 | skip_fastqc_raw: true 62 | options: --nogroup 63 | threads: 4 64 | resources: 65 | mem: 4G 66 | 67 | ####################################################################### 68 | # Quality trimming and adapter removal 69 | # 70 | # for cutadapt, please fill the fwd and rev fields if required. It can be a 71 | # string, or a filename. If a filename, it must be prefixed with "file:" to 72 | # specify that it is a file and not a string. If the tool is cutadapt, the empty 73 | # fwd and rev fields means that no adapters are to be used. 74 | # 75 | # :Parameters: 76 | # 77 | # - fwd: a string or file (prefixed with *file:*) 78 | # - m: 20 means discard trimmed reads that are shorter than 20. 79 | # must be > 0 80 | # - quality: 0 means no trimming, 30 means keep base with quality 81 | # above 30 82 | # - mode: must be set to one of 83 | # - g for 5' 84 | # - a for 3' 85 | # - b for both 5'/3' 86 | # - rev: a string or file (prefixed with *file:*) 87 | # - tool: only cutadapt supported for now 88 | # - threads: number of threads to use (atropos only) 89 | # - options: See cutadapt documentation for details on 90 | # cutadapt.readthedocs.io. 
We change the default value 91 | # of -O to 6 (at least 6 bases are required to match before 92 | # trimming of an adapter) 93 | # 94 | # tool_choice__ = ["atropos", "cutadapt"] 95 | # 96 | # trim-n trims Ns at the end of the read 97 | cutadapt: 98 | tool_choice: cutadapt 99 | fwd: '' 100 | rev: '' 101 | m: 20 # {"strict_min": 0} 102 | mode: b # {"values": ["b","g","a"]} 103 | options: -O 6 --trim-n 104 | quality: 30 # {"range": [0,40]} 105 | threads: 4 106 | 107 | 108 | ############################################################################# 109 | # -Q should disable the quality filter 110 | # 111 | # Quality filtering only limits the N base number (-n, --n_base_limit) 112 | # meaning if 5 Ns are found, the read is discarded, 113 | # -q is the quality value set to Q15 to be qualified; If more than 40% of bases 114 | # are unqualified, the read is discarded. 115 | # You can also filter reads by average quality score using -e QUAL_score 116 | # 117 | # minimum length is set to 15 by default 118 | # 119 | # Adapter trimming is set by default. Can be disabled with -A 120 | # For adapters, this is automatic but you can be specific using 121 | # --adapter_sequence for read1, and --adapter_sequence_r2 for read2. 122 | # The --cut_tail moves a sliding window from tail (3') to front, drop the bases 123 | # in the window if its mean quality is below cut_mean_quality, stop otherwise. 124 | # Use cut_tail_window_size to set the window size (default 4), and 125 | # cut_tail_mean_quality to set the mean quality threshold (default 20) 126 | # Other useful options: --disable_adapter_trimming and --disable_quality_filtering.
127 | # or -n 5 (minimum number of Ns required to discard a read) 128 | fastp: 129 | options: ' --cut_tail ' 130 | minimum_length: 20 131 | adapters: '' 132 | quality: 15 133 | threads: 4 134 | disable_adapter_trimming: false 135 | disable_quality_filtering: false 136 | resources: 137 | mem: 8G 138 | 139 | ####################################################### 140 | # Quality trimming software choice 141 | # 142 | # software_choice__ = ["atropos", "cutadapt", "fastp"] 143 | # 144 | trimming: 145 | software_choice: fastp 146 | do: true 147 | 148 | ############################################################################# 149 | # bowtie1_mapping_rna used to align reads against ribosomal RNA 150 | # 151 | # :Parameters: 152 | # 153 | # - do: if unchecked, this rule is ignored 154 | # - options: any options recognised by bowtie1 tool 155 | # - threads: number of threads to be used 156 | # - nreads: no need to analyse all data to estimate the ribosomal content. 157 | # analyse 100,000 reads by default. Set to -1 to ignore and analyse all data 158 | bowtie1_mapping_rna: 159 | # remove in v1.20 and set automatically to on/off if rRNA/fasta provided 160 | # do: true 161 | options: '' 162 | threads: 4 163 | nreads: 100000 164 | 165 | ############################################################################# 166 | # star_mapping used to align reads against genome file 167 | # 168 | # :Parameters: 169 | # 170 | # - do: if unchecked, this rule is ignored 171 | # - options: any options recognised by rna-star tool. Set limitBAMsortRAM to 30G 172 | # - threads: number of threads to be used 173 | # - legacy: if set to True will use the old 2-pass version from STAR 174 | # used in this pipeline until v0.15.3. 
If you want to use the 175 | # 2-pass mode available in star, you will need star 2.7 and above 176 | # 177 | star_mapping: 178 | options: " --limitBAMsortRAM 30000000000 --outFilterMismatchNoverLmax 0.05 --seedSearchStartLmax 20 " 179 | legacy: True 180 | threads: 4 181 | resources: 182 | mem: 32G 183 | 184 | ############################################################################## 185 | # STAR indexing section 186 | # 187 | # :Parameters: 188 | # 189 | # - options: string with any valid STAR options 190 | star_index: 191 | options: 192 | threads: 4 193 | resources: 194 | mem: 4G 195 | ############################################################################# 196 | # bowtie1_mapping_ref used to align reads against genome file 197 | # 198 | # :Parameters: 199 | # 200 | # - do: if unchecked, this rule is ignored 201 | # - options: any options recognised by bowtie1 tool 202 | # - threads: number of threads to be used 203 | # 204 | bowtie1_mapping_ref: 205 | options: --chunkmbs 400 -m 1 206 | threads: 4 207 | 208 | ############################################################################# 209 | # bowtie2_mapping used to align reads against genome file 210 | # 211 | # :Parameters: 212 | # 213 | # - do: if unchecked, this rule is ignored 214 | # - options: any options recognised by bowtie2 tool 215 | # - threads: number of threads to be used 216 | # 217 | bowtie2_mapping: 218 | #options: "--dovetail --no-mixed --no-discordant " for paired-end data 219 | options: '' 220 | threads: 4 221 | genome_size_larger_than_4gb: false 222 | resources: 223 | mem: 20G 224 | 225 | bowtie2_index: 226 | options: '' 227 | threads: 4 228 | resources: 229 | mem: 20G 230 | 231 | salmon_index: 232 | threads: 2 233 | options: 234 | resources: 235 | mem: 4G 236 | 237 | salmon_mapping: 238 | options: -l A 239 | threads: 4 240 | resources: 241 | mem: 4G 242 | 243 | ############################################################################# 244 | # feature_counts used to count reads 
against features 245 | # 246 | # :Parameters: 247 | # 248 | # - do: if unchecked, this rule is ignored 249 | # - options: any options recognised by feature_counts tool except -s 250 | # - threads: number of threads to be used 251 | # - strandness: (optional) you should provide the strand parameters, given 252 | # from the experimental design. If not provided, we will guess it (see 253 | # tolerance parameter here below) 254 | # - tolerance: if strandness is not provided, we will guess it from 255 | # the data. The metric used is between 0 and 1. It is a ratio between 256 | # strand + and -. If below tolerance, the strand is reversely stranded. If 257 | # above 1-tolerance, it is (forward) stranded. If around 0.5 +- tolerance, 258 | # it is unstranded. Otherwise, it means our guess would not be very 259 | # reliable. A warning will be raised. Note also that if there is no 260 | # consensus across samples, a warning/error may also be raised. tolerance 261 | # is therefore in the range [0-0.25] 262 | # - feature: this is equivalent to the -t option to specify the feature type in GTF 263 | # annotation. For example gene, exon (default). 264 | # - attribute: this is the -g option to specify the attribute type in GTF annotation. 265 | # (gene_id) by default. 266 | # - extra_attributes: any other attributes to store in addition to the main one 267 | # 268 | feature_counts: 269 | do: true 270 | options: '' ## if exon/CDS is used, put -O option 271 | strandness: '' # set to 0, 1, 2 to force the type of strandness 272 | threads: 1 # 273 | tolerance: 0.15 # used to figure out the strandness. no need to change 274 | feature: gene # could be exon, mRNA, etc 275 | attribute: ID # could be ID, gene_id, etc 276 | extra_attributes: # by default, stores only the main attribute, but could add more 277 | 278 | ############################################################################# 279 | # bamCoverage writes files in bigwig format from BAM files.
280 | # This tool takes an alignment of reads or fragments as input (BAM file) and 281 | # generates a coverage track (bigWig or bedGraph) as output. The coverage is 282 | # calculated as the number of reads per bin, where bins are short consecutive 283 | # counting windows of a defined size. It is possible to extended the length of 284 | # the reads to better reflect the actual fragment length. *bamCoverage* offers 285 | # normalization by scaling factor, Reads Per Kilobase per Million mapped reads 286 | # (RPKM), and 1x depth (reads per genome coverage, RPGC). 287 | # 288 | # :Parameters: 289 | # 290 | # - do: if unchecked, this rule is ignored 291 | # - binSize: Size of the bins, in bases, for the output of the 292 | # bigwig/bedgraph file. (default: 50) 293 | # - genomeSize: Report read coverage normalized to 1x sequencing depth 294 | # (also known as Reads Per Genomic Content (RPGC)). 295 | # Sequencing depth is defined as: (total number of 296 | # mapped reads * fragment length) / effective genome 297 | # size. The scaling factor used is the inverse of the 298 | # sequencing depth computed for the sample to match the 299 | # 1x coverage. To use this option, the effective genome 300 | # size has to be indicated after the option. The 301 | # effective genome size is the portion of the genome 302 | # that is mappable. 303 | # - extendReads: This parameter allows the extension of reads to 304 | # fragment size. 305 | # - minFragmentLength: The minimum fragment length needed for read/pair 306 | # inclusion. Note that a value other than 0 will exclude 307 | # all single-end reads. 308 | # - maxFragmentLength: The maximum fragment length needed for read/pair 309 | # inclusion. A value of 0 disables filtering and is 310 | # needed for including single-end and orphan reads. 
311 | # - threads: number of threads to be used 312 | bam_coverage: 313 | do: false 314 | options: "--binSize 10 --effectiveGenomeSize 2150000000" 315 | #extendReads: 65 316 | #minFragmentLength: 0 #Note that a value other than 0 will exclude all single-end reads. 317 | #maxFragmentLength: 0 #A value of 0 disables filtering and is needed for including single-end and orphan reads. 318 | threads: 4 319 | resources: 320 | mem: 20G 321 | 322 | 323 | ########################################################################### 324 | # Creates a tdf files using igvtools 325 | # 326 | # :Parameters: 327 | # 328 | # - chromSize: path to index of reference genome obtain by samtools faidx 329 | igvtools: 330 | do: false 331 | # can be a link to the fasta file or an existing chrom.sizes file 332 | # If none provided, will use the input fasta file 333 | chrom_sizes_file: '' 334 | 335 | 336 | ############################################################################# 337 | # mark_duplicates (picard-tools) allows to mark PCR duplicate in BAM files 338 | # 339 | # :Parameters: 340 | # 341 | # - do: if unchecked, this rule is ignored. Mandatory for RNA-SeQC tool. 342 | # - remove: If true do not write duplicates to the output file instead of writing them with 343 | # appropriate flags set. Default value: false. This option can be set to 'null' to clear 344 | # the default value. 
Possible values: {true, false} 345 | # - tmpdir: write temporary files in this directory (default TMP_DIR=/tmp/, but could be "TMP_DIR=/local/scratch/") 346 | # 347 | mark_duplicates: 348 | do: false 349 | remove: false ## may be True 350 | tmpdir: ./tmp/ 351 | threads: 4 352 | resources: 353 | mem: 34G 354 | 355 | add_read_group: 356 | options: 357 | 358 | ############################################################################# 359 | # RNA-SeQC allows to compute a series of quality control metrics for RNA-seq data 360 | # 361 | # :Parameters: 362 | # 363 | # - do: if unchecked, this rule is ignored 364 | # - ref: Reference Genome in fasta format 365 | # - gtf_file: GTF File defining transcripts (must end in '.gtf') 366 | # You can use the 'sequana gff-to-gtf input.gff' command 367 | # - options: any options recognised by RNA-seQC tool 368 | rnaseqc: 369 | do: false 370 | gtf_file: 371 | options: --coverage 372 | resources: 373 | mem: 8G 374 | 375 | 376 | # if bed_file is not provided, try to create one on the fly 377 | # needs mark_duplicates 378 | rseqc: 379 | do: false 380 | bed_file: 381 | 382 | 383 | ############################################################################# 384 | # MultiQC aggregates results from bioinformatics analyses across many 385 | # samples into a single report. 386 | # 387 | # :Parameters: 388 | # 389 | # - options: any options recognised by multiqc 390 | # - output-directory: Create report in the specified output directory 391 | # - config_file: by default, we use sequana RNA-seq multiqc_config file. 392 | # If you want your own multiqc config, fill this entry 393 | multiqc: 394 | options: -p -f -x *_init_* 395 | modules: '' 396 | input_directory: . 397 | config_file: multiqc_config.yaml 398 | resources: 399 | mem: 8G 400 | 401 | 402 | 403 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | ..
image:: https://badge.fury.io/py/sequana-rnaseq.svg 4 | :target: https://pypi.python.org/pypi/sequana_rnaseq 5 | 6 | .. image:: http://joss.theoj.org/papers/10.21105/joss.00352/status.svg 7 | :target: http://joss.theoj.org/papers/10.21105/joss.00352 8 | :alt: JOSS (journal of open source software) DOI 9 | 10 | .. image:: https://github.com/sequana/rnaseq/actions/workflows/main.yml/badge.svg 11 | :target: https://github.com/sequana/rnaseq/actions/workflows/main.yml 12 | 13 | 14 | 15 | This is the **RNA-seq** pipeline from the `Sequana <https://sequana.readthedocs.io>`_ project 16 | 17 | :Overview: RNASeq analysis from raw data to feature counts 18 | :Input: A set of Fastq Files and genome reference and annotation. 19 | :Output: MultiQC and HTML reports, BAM and bigwig files, feature Counts, script to launch differential analysis 20 | :Status: Production. 21 | :Citation(sequana): Cokelaer et al, (2017), ‘Sequana’: a Set of Snakemake NGS pipelines, Journal of Open Source Software, 2(16), 352, JOSS DOI doi:10.21105/joss.00352 22 | :Citation(pipeline): 23 | .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.4047837.svg 24 | :target: https://doi.org/10.5281/zenodo.4047837 25 | 26 | Installation 27 | ~~~~~~~~~~~~ 28 | 29 | **sequana_rnaseq** is based on Python3, just install the package as follows:: 30 | 31 | pip install sequana_rnaseq --upgrade 32 | 33 | You will need third-party software such as bowtie2/star. However, if you choose to use apptainer/singularity, 34 | then there is nothing to install except apptainer/singularity itself! See below for details. 35 | 36 | 37 | Usage 38 | ~~~~~ 39 | 40 | :: 41 | 42 | sequana_rnaseq --help 43 | sequana_rnaseq --input-directory DATAPATH --genome-directory genome --aligner-choice star 44 | 45 | This creates a directory with the pipeline and configuration file. You will then need 46 | to execute the pipeline:: 47 | 48 | cd rnaseq 49 | sh rnaseq.sh # for a local run 50 | 51 | This launches a snakemake pipeline.
If you are familiar with snakemake, you can 52 | retrieve the pipeline itself and its configuration files and then execute the pipeline yourself with specific parameters:: 53 | 54 | snakemake -s rnaseq.rules --configfile config.yaml --cores 4 --stats stats.txt 55 | 56 | Or use `sequanix <https://sequana.readthedocs.io/en/main/sequanix.html>`_ interface. 57 | 58 | 59 | Usage with apptainer: 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | With apptainer, initiate the working directory as follows:: 63 | 64 | sequana_rnaseq --apptainer-prefix ~/.sequana/apptainers 65 | 66 | Images are downloaded in the directory once and for all; and then:: 67 | 68 | cd rnaseq 69 | sh rnaseq.sh 70 | 71 | If you decide to use snakemake manually, do not forget to add the apptainer-prefix options:: 72 | 73 | snakemake -s rnaseq.rules --configfile config.yaml --cores 4 --apptainer-prefix ~/.sequana/apptainers --apptainer-args "-B /home:/home" 74 | 75 | Usage on cluster with no internet access 76 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 77 | 78 | We use wrappers that are hosted on github: https://github.com/sequana/sequana-wrappers/ . They are copied locally in your home. However if you wish, you can download the repository locally before using the pipeline. For example:: 79 | 80 | export WRAPPERS=/home/user/Wrappers 81 | git clone git@github.com:sequana/sequana-wrappers.git $WRAPPERS 82 | 83 | and define an environmental variable as follows (you should add it in your .profile or .bashrc for long term usage):: 84 | 85 | export SEQUANA_WRAPPERS=git+file://$WRAPPERS 86 | 87 | Requirements 88 | ~~~~~~~~~~~~ 89 | 90 | This pipeline requires many third-party executable(s). Here is a list that 91 | may change.
A message will inform you should you be missing an executable: 92 | 93 | - bowtie 94 | - bowtie2>=2.4.2 95 | - STAR 96 | - featureCounts (subread package) 97 | - picard 98 | - multiqc 99 | - samtools 100 | 101 | Note that bowtie2>=2.4.2 is set to ensure the pipeline can be used with python 3.7-3.8-3.9 and the sequana-wrappers that support bowtie2 with option --threads only (not previous versions). See environment.yml or conda.yaml for latest list of required third-party tools. 102 | 103 | You can install most of the tools using `damona <https://damona.readthedocs.io>`_:: 104 | 105 | damona create --name sequana_tools 106 | damona activate sequana_tools 107 | damona install sequana_tools 108 | 109 | Or use the conda.yaml file available in this repository. If you start a new 110 | environment from scratch, those commands will create the environment and install 111 | all dependencies for you:: 112 | 113 | conda create --name sequana_env python=3.7.3 114 | conda activate sequana_env 115 | conda install -c anaconda qt "pyqt>5" 116 | pip install sequana 117 | pip install sequana_rnaseq 118 | conda install --file https://raw.githubusercontent.com/sequana/rnaseq/main/conda.yaml 119 | 120 | For Linux users, we provide singularity images available through the damona project (https://damona.readthedocs.io). 121 | 122 | 123 | .. image:: https://raw.githubusercontent.com/sequana/sequana_rnaseq/main/sequana_pipelines/rnaseq/dag.png 124 | 125 | 126 | Details 127 | ~~~~~~~~~ 128 | 129 | This pipeline runs an **RNA-seq** analysis of sequencing data. It runs in 130 | parallel on a set of input FastQ files (paired or not). 131 | A brief HTML report is produced together with a MultiQC report. 132 | 133 | This pipeline is complex and requires some expertise for the interpretation. 134 | Many online-resources are available and should help you decipher the output. 135 | 136 | Yet, it should be quite straightforward to execute it as shown above.
The 137 | pipeline uses bowtie1 to look for ribosomal contamination (rRNA). Then, 138 | it cleans the data with cutadapt if you say so (your data may already be 139 | pre-processed). If no adapters are provided (default), reads are 140 | trimmed for low quality bases only. Then, mapping is performed with standard mappers such as 141 | star or bowtie2 (--aligner-choice option). Finally, 142 | feature counts are extracted from the previously generated BAM files. We guess 143 | the strand and save the feature counts into the directory 144 | ./rnadiff/feature_counts. 145 | 146 | The pipeline stops there. However, RNA-seq analyses are usually followed by a differential 147 | gene expression analysis (DGE hereafter). Although the DGE is not part of the pipeline, you can 148 | perform it with standard tools using the data in the ./rnadiff directory. One such 149 | tool is provided within our framework (based on the well known DESeq2 software). 150 | 151 | Using our framework:: 152 | 153 | cd rnadiff 154 | sequana rnadiff --design design.csv --features all_features.out --annotation ANNOT \ 155 | --feature-name FEAT --attribute-name ATTR 156 | 157 | where ANNOT is the annotation file of your analysis, FEAT and ATTR the feature 158 | and attribute used in your analysis (coming from the annotation file). 159 | 160 | This produces a HTML report summarizing your differential analysis. 161 | 162 | Note that you need DESeq2 and other packages installed. You may also use this container: https://zenodo.org/records/5708856 163 | 164 | 165 | Rules and configuration details 166 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 167 | 168 | Here is the `latest documented configuration file <https://raw.githubusercontent.com/sequana/rnaseq/main/sequana_pipelines/rnaseq/config.yaml>`_ 169 | to be used with the pipeline. Each rule used in the pipeline may have a section in the configuration file. 170 | 171 | 172 | .. warning:: the RNAseQC rule is switched off and is not currently functional in 173 | version 0.9.X 174 | 175 | Issues 176 | ~~~~~~ 177 | 178 | In the context of eukaryotes, you will need 32G of memory most probably.
If this is too much, 179 | you can try to restrict the memory. Check out the config.yaml file in the star section. 180 | 181 | 182 | 183 | Changelog 184 | ~~~~~~~~~ 185 | 186 | ========= ==================================================================== 187 | Version Description 188 | ========= ==================================================================== 189 | 0.20.2 * Fix workflow and multiqc parsing 190 | 0.20.1 * Fix wrapper version in the config and fastp rule. 191 | 0.20.0 * Fix regression due to new sequana version 192 | * Update summary html to use new sequana plots 193 | 0.19.3 * fix regression with click to set the default rRNA to 'rRNA' again. 194 | 0.19.2 * fix bowtie1 regression in the log file, paired end case in 195 | multiqc and rnadiff script (regression) 196 | * set genome directory default to None to enforce its usage 197 | 0.19.1 * add rnaseqc container. 198 | * Update rseqc rules (redirection) 199 | * cleanup onsuccess rule 200 | 0.19.0 * Refactorisation to use click 201 | 0.18.1 * fastp multiqc regression. Fixed missing sample names by updating 202 | multiqc_config and adding sample names in the output filename 203 | 0.18.0 * New plots in the HTML reports. Includes version of executables. 204 | 0.17.2 * CHANGES: in star section, added --limitBAMsortRAM and set to 30G 205 | * BUG: Fix missing params (options) in star_mapping rule not taken 206 | into account 207 | 0.17.1 * use new rulegraph / graphviz apptainer 208 | 0.17.0 * fastp step changed to use sequana-wrappers. Slight change in 209 | config file. The reverse and forward adapter options called 210 | rev and fwd have been dropped in favor of a single adapters option. 211 | v0.17.0 config and schema are not compatible with previous 212 | versions. 213 | * Update singularity containers and add new one for fastp 214 | 0.16.1 * fix bug in feature counts automatic strand balance detection. Was 215 | always using the stranded case (2). 
216 | * add singularity workflow for testing 217 | * fix documentation in config.yaml 218 | 0.16.0 * star, salmon, bam_coverage are now in sequana wrappers, updated 219 | the pipeline accordingly 220 | * updated config file and schema to include resources inside the 221 | config file (so as to use new --profile option) 222 | * set singularity images in all rules 223 | * star wrappers has changed significantly to use star 224 | recommandation. To keep using previous way, a legacy option 225 | is available and set to True in this version. 226 | * bamCoverage renamed in bam_coverage in the config file 227 | * multiqc_config removed redundant information and ordered 228 | the output in a coherent way (QC and then analysis) 229 | 0.15.2 * Fix bowtie2 rule to use new wrappers. Use wrappers in 230 | add_read_group and mark_duplicates 231 | 0.15.1 * Adapt to new bowtie2 align wrapper 232 | 0.15.0 * fix typo reported in https://github.com/sequana/rnaseq/issues/12 233 | * fix feature counts plots not showing anymore 234 | * Script for differential analysis is now in the main pipeline 235 | 0.14.2 * fix feature counts plots missing in multiqc results 236 | 0.14.1 * fix regression bug introduced in snakemake 6.9.0 237 | 0.14.0 * Allow the aligners to have dedicated index for each version in the 238 | same genome directory. 239 | * Ribosomal is now estimated on the first 100,000 reads to speed up 240 | analysis 241 | * --indexing and --force-indexing options not required anymore. 242 | Indexing will be done automatically and not redone if present. 243 | * Use of the new sequana-wrappers repository 244 | 0.13.0 * Major update to use the new sequana version and the RNADiff tools. 245 | * remove fastq_screen. One can use sequana_multitax for taxonomic 246 | content and contamination. 247 | * cutadapt is now replaced by fastp, although it can still be used. 248 | * full integration of salmon for prokaryotes and eukaryotes 249 | * user interface has now a --skip-gff-check option. 
Better handling of 250 | input gff with more meaningful messages 251 | * integration of rseqc tool 252 | 0.12.1 * indexing was always set to True in the config after 0.9.16 update. 253 | 0.12.0 * BUG fix: Switch mark_duplicates correctly before feature counts 254 | 0.11.0 * rnadiff one factor is simplified 255 | * When initiating the pipeline, provide information about the GFF 256 | * mark duplicates off by default 257 | * feature_counts has more options in the help. split options into 258 | feature/attribute/extra_attributes. 259 | * HTML reports better strand picture and information about rRNA 260 | * refactorising the main standalone and config file to split feature 261 | counts options into feature and attribute. Sanity checks are now 262 | provided (--feature-counts-attribute, --feature-counts-feature-type) 263 | * can provide a custom GFF not in the genome directory 264 | * can provide several features from the GFF. Then, a custom GFF is 265 | created and used 266 | * fix the --do-igvtools and --do-bam-coverage with better doc 267 | 0.10.0 * 9/12/2020 268 | * Fixed bug in sequana/star_indexing for small genomes (v0.9.7). 269 | Changed the rnaseq requirements to benefit from this bug-fix that 270 | could lead to seg fault with star aligner for small genomes. 271 | * Report improved with strand guess and plot 272 | 0.9.20 * 7/12/2020 273 | * BUG in sequana/star rules v0.9.6. Fixed in this release. 274 | * In config file, bowtie section 'do' option is removed. This is now 275 | set automatically if rRNA_feature or rRNA_file is provided. This 276 | allows us to skip the rRNA mapping entirely if needed. 277 | * fastq_screen should be functional. Default behaviour is off. If 278 | set only phiX174 will be searched for. Users should build their own 279 | configuration file. 280 | * star/bowtie1/bowtie2 have now their own sub-directories in the 281 | genome directory.
282 | * added --run option to start pipeline automatically (if you know 283 | what you are doing) 284 | * rnadiff option has now a default value (one_factor) 285 | * add strandness plot in the HTML summary page 286 | 0.9.19 * Remove the try/except around tolerance (guess of strandness) to 287 | make sure this is provided by the user. Final onsuccess benefits 288 | from faster GFF function (sequana 0.9.4) 289 | 0.9.18 * Fix typo (regression bug) + add tolerance in schema + generic 290 | title in multiqc_config. (oct 2020) 291 | 0.9.17 * add the *tolerance* parameter in the feature_counts rule as a user 292 | parameter (config and pipeline). 293 | 0.9.16 * Best feature_counts is now saved into rnadiff/feature_counts 294 | directory and rnadiff scripts have been updated accordingly 295 | * the most probable feature count option is now computed more 296 | effectively and incorporated inside the Snakemake pipeline (not in 297 | the onsuccess) so that multiqc picks the best one (not the 3 298 | results) 299 | * the target.txt file can be generated inside the pipeline if user 300 | fills the rnadiff/conditions section in the config file 301 | * indexing options are filled automatically when calling 302 | sequana_rnaseq based on the presence/absence of the index 303 | of the aligner being used. 304 | * salmon now integrated and feature counts created (still WIP in 305 | sequana) 306 | 0.9.15 * FastQC on raw data skipped by default (FastQC 307 | for processed data is still available) 308 | * Added paired options (-p) for featureCounts 309 | * Switch back markduplicates to False for now.
310 | 0.9.14 * Use only R1 with bowtie1 311 | * set the memory requirements for mark_duplicates in cluster_config 312 | file 313 | * Set temporary directory for mark_duplicates to be local ./tmp 314 | 0.9.13 * set mark_duplicate to true by default 315 | * use new sequana pipeline manager 316 | * export all features counts in a single file 317 | * custom HTML report 318 | * faster --help calls 319 | * --from-project option added 320 | 0.9.12 * include salmon tool as an alternative to star/bowtie2 321 | * include rnadiff directory with required input for Differential 322 | analysis 323 | 0.9.11 * Automatic guessing of the strandness of the experiment 324 | 0.9.10 * Fix multiqc for RNAseQC rule 325 | 0.9.9 * Fix RNAseQC rule, which is now available. 326 | * Fix ability to use existing rRNA file as input 327 | 0.9.8 * Fix indexing for bowtie1 to not be done if aligner is different 328 | * add new options: --feature-counts-options and --do-rnaseq-qc, 329 | --rRNA-feature 330 | * Based on the input GFF, we now check the validity of the rRNA 331 | feature and feature counts options to check whether the feature 332 | exists in the GFF 333 | * schema is now used to check the config file values 334 | * add a data test for testing and documentation 335 | 0.9.7 * fix typo found in version 0.9.6 336 | 0.9.6 * Fixed empty read tag in the configuration file 337 | * Possibility to switch off cutadapt section 338 | * Fixing bowtie2 rule in sequana and update the pipeline accordingly 339 | * Include a schema file 340 | * output-directory parameter renamed into output_directory (multiqc 341 | section) 342 | * handle stdout correctly in fastqc, bowtie1, bowtie2 rules 343 | 0.9.5 * Fixed https://github.com/sequana/sequana/issues/571 344 | * More cutadapt commands and sanity checks 345 | * Fixed bowtie2 options import in rnaseq.rules 346 | 0.9.4 347 | 0.9.3 if a fastq_screen.conf is provided, we switch the fastq_screen 348 | section ON automatically 349 | 0.9.0 **Major
refactorisation.** 350 | 351 | * remove sartools, kraken rules. 352 | * Indexing is now optional and can be set in the configuration. 353 | * Configuration file is simplified with a general section to enter 354 | the genome location and aligner. 355 | * Fixed rules in sequana (0.8.0) that were not up-to-date with 356 | several executables used in the pipeline including picard, 357 | fastq_screen, etc. See Sequana Changelog for details with respect 358 | to rules changes. 359 | * Copying the feature counts in main directory ready to use for 360 | a differential analysis. 361 | ========= ==================================================================== 362 | -------------------------------------------------------------------------------- /test/data/Saccer3/Saccer3_rRNA.fa: -------------------------------------------------------------------------------- 1 | >chrXII:451574-451785 2 | ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAA 3 | >chrXII:451574-458432 4 | 
ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAAACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGTTCCGG
AATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGGTCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAACTTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGGAAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTTA
ATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTTTAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCTCTAGA
ATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATAACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT 5 | >chrXII:451785-455181 6 | ACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGA
GGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGTTCCGGAATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGGTCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCA
TTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAAC 7 | >chrXII:455181-455413 8 | TTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGG 9 | >chrXII:455413-455571 10 | AAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTT 11 | >chrXII:455571-455932 12 | TTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTT 13 | >chrXII:455932-457732 14 | 
TAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCTCTAGAATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATA 15 | >chrXII:457732-458432 16 | 
ACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT 17 | >chrXII:459675-459796 18 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAATCT 19 | >chrXII:460711-460922 20 | ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAA 21 | >chrXII:460711-467569 22 | 
ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAAACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGTTCCGG
AATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGGTCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAACTTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGGAAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTTA
ATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTTTAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCTCTAGA
ATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATAACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT 23 | >chrXII:460922-464318 24 | ACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGAC
GAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGTTCCGGAATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGGTCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTG
CATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAAC 25 | >chrXII:464318-464550 26 | TTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGG 27 | >chrXII:464550-464708 28 | AAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTT 29 | >chrXII:464708-465069 30 | TTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTT 31 | >chrXII:465069-466869 32 | 
TAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCTCTAGAATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATA 33 | >chrXII:466869-467569 34 | 
ACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT 35 | >chrXII:468812-468931 36 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT 37 | >chrXII:472464-472583 38 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT 39 | >chrXII:482044-482163 40 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT 41 | >chrXII:485696-485815 42 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT 43 | >chrXII:489348-489469 44 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAATCT 45 | -------------------------------------------------------------------------------- /sequana_pipelines/rnaseq/rnaseq.rules: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2016-2021 Sequana Dev Team (https://sequana.readthedocs.io) 3 | # 4 | # The full license is in the LICENSE file, distributed with this software. 
#
# Website: https://github.com/sequana/sequana
# Documentation: http://sequana.readthedocs.io
# Contributors: https://github.com/sequana/sequana/graphs/contributors
##############################################################################
# standard modules
import glob
import os
import shutil
import subprocess

import sequana
from sequana_pipetools import snaketools as sm
import sequana.featurecounts as fc

# ========================================================= The main config file
#
configfile: "config.yaml"


# ================================================== The sequana pipeline manager
#
manager = sm.PipelineManager("rnaseq", config)

# Extra targets collected while the rules below are declared.
expected_output = []
# ========================================= Define output of the pipeline
#
manager.globals = {}

# salmon produces no BAM file, hence no strandness summary in that case.
if manager.config['general']['aligner'] == 'salmon':
    manager.globals['strand_summary'] = None
else:
    manager.globals['strand_summary'] = "outputs/strand_summary.csv"

# First rule of the file, hence the default target. The inputs were the same
# in both aligner branches, so the rule is declared only once.
rule rnaseq:
    input:
        "multiqc/multiqc_report.html",
        ".sequana/rulegraph.svg",
        "post_analysis/rnadiff.sh",


# ========================================= Define genome directory and inputs
# Make sure it is absolute
#
genome_directory = os.path.abspath(manager.config["general"]["genome_directory"])
genome_name = genome_directory.rsplit("/", 1)[1]

# Reference files are expected as <genome_directory>/<genome_name>.{fa,gff}
__prefix_name__ = f"{genome_directory}/{genome_name}"
__fasta_file__ = f"{__prefix_name__}.fa"
__gff_file__ = f"{__prefix_name__}.gff"


# ================================================ Build custom GFF if required
# If we have several features, we need to build a custom GFF file
#
if manager.config['general']['custom_gff']:
    __gff_file__ = manager.config['general']['custom_gff']
    assert os.path.exists(__gff_file__), f"GFF file {__gff_file__} does not exist"

# check existence of fasta and gff before starting;
for this in [__fasta_file__, __gff_file__]:
    if not os.path.exists(this):
        # report the file that is actually missing (used to always name the
        # fasta file, even when the GFF was the missing one)
        raise IOError("File {} not found".format(this))


if manager.config.general.contaminant_file and manager.config.general.rRNA_feature:
    logger.error("Either set contaminant_file or rRNA_feature in the config file, not both.")
    sys.exit(1)


# ==================================== search for specific sequences as contaminants
#
if manager.config.general.contaminant_file:
    # Could be a local file or a file in the genome directory
    __bowtie1_index_rna__fasta = f"{manager.config.general.contaminant_file}"

    # if not found locally, try to find it in the genome_directory path
    if os.path.exists(__bowtie1_index_rna__fasta) is False:
        __bowtie1_index_rna__fasta = f"{genome_directory}/{manager.config.general.contaminant_file}"
        if os.path.exists(__bowtie1_index_rna__fasta) is False:
            logger.error("File {} does not exists. Check your config file".format(__bowtie1_index_rna__fasta))
            sys.exit(1)

    # we will copy the file to keep the information
    os.makedirs("inputs/contamination_file", exist_ok=True)
    shutil.copy(__bowtie1_index_rna__fasta, "inputs/contamination_file")

    # so we need to rename the input
    __bowtie1_index_rna__fasta = "inputs/contamination_file/" + os.path.basename(__bowtie1_index_rna__fasta)

    bowtie1_index_conta__input_reference = __bowtie1_index_rna__fasta
    bowtie1_index_conta__output = f"{__bowtie1_index_rna__fasta}.1.ebwt"
    rule samtools_faidx:
        input:
            __bowtie1_index_rna__fasta
        output:
            __bowtie1_index_rna__fasta + ".fai"
        container:
            config['apptainers']['sequana_tools']
        shell:
            """
            samtools faidx {input[0]}
            """

elif manager.config.general.rRNA_feature:
    # extract the rRNA feature from the GFF file. Build the corresponding FastA
    # file. if not found, a dummy FastA file with AAAAAAAAAAAAAA is built

    bowtie1_index_conta__input_reference = f"{__prefix_name__}_{manager.config.general.rRNA_feature}.fa"
    bowtie1_index_conta__input_gff = f"{__prefix_name__}_{manager.config.general.rRNA_feature}.gff"
    bowtie1_index_conta__output = f"{__prefix_name__}_{manager.config.general.rRNA_feature}.1.ebwt"
    rule extract_fasta:
        input:
            fasta = __fasta_file__,
            gff = __gff_file__
        params:
            feature = config['general']['rRNA_feature']
        output:
            fasta = bowtie1_index_conta__input_reference,
            fai = bowtie1_index_conta__input_reference + ".fai",
            gff = bowtie1_index_conta__input_gff
        log:
            "logs/indexing/get_rRNA.log"
        container:
            config['apptainers']['sequana_tools']
        shell:
            """
            # used to be gawk but awk is more generic.
            awk '{{ if ($3=="{params.feature}") print }}' {input.gff} > {output.gff}
            if [ -s {output.gff} ]
            then
                bedtools getfasta -fi {input.fasta} -bed {output.gff} -fo {output.fasta}
            else :
                echo -e ">empty\\nAAAAAAAAAAAAAA" > {output.fasta}
            fi
            samtools faidx {output.fasta}
            """

# ========================================================= Indexing for rRNA and contamination
#
# redo the indexing whatsoever since it is pretty fast
if manager.config.general.contaminant_file or manager.config.general.rRNA_feature:

    # identify ribosomal contamination or contamination
    rule ribosomal_contamination:
        input:
            reference = bowtie1_index_conta__input_reference,
            fai = bowtie1_index_conta__input_reference + ".fai"
        output:
            bowtie1_index_conta__output
        log:
            "logs/indexing/bowtie1_index_conta.log"
        params:
            options=""
        threads: 2
        container:
            config['apptainers']['sequana_tools']
        wrapper:
            f"{manager.wrappers}/wrappers/bowtie1/build"


# ============================================================================ bowtie2 index
#
if manager.config.general.aligner == "bowtie2":
    if manager.config['bowtie2_mapping']['genome_size_larger_than_4gb']:
        bt2_ext = "bt2l"
    else:
        bt2_ext = "bt2"

    # Suffixes of the six index files. They are built from bt2_ext so that the
    # index rule, the mapping rule and the existence check below all agree
    # (the rules used to hard-code ".bt2" whatever the value of bt2_ext).
    __bowtie2_index__exts = [
        f".{part}.{bt2_ext}" for part in ("1", "2", "3", "4", "rev.1", "rev.2")
    ]

    # These two variables are used elsewhere and in the rule below
    # Index creation may differ from one version to another and one may want to
    # keep track of the index and its version. We try to retrieve the version
    # and if successful, we will add the version as a suffix, otherwise just the
    # name of bowtie2

    # tested on version 2.4.2
    try:
        p = subprocess.Popen(['bowtie2'], stderr=subprocess.PIPE)
        # communicate() drains the pipe while waiting; wait() followed by
        # read() can block if the child fills the pipe buffer.
        _, raw_stderr = p.communicate()
        stderr = raw_stderr.decode().split("\n")
        hits = [line for line in stderr if "version" in line and 'Bowtie' in line]
        bowtie2_version = "_" + hits[0].split("version")[1].split()[0].strip()
    except Exception: # various type of exception may occur here
        logger.warning(f"Could not determine bowtie2 version. Index will be stored in {genome_directory}/bowtie2/")
        bowtie2_version = ""

    bowtie2_index = f"{genome_directory}/bowtie2{bowtie2_version}/{genome_name}"

    # Declare the indexing rule only when the index is not there yet.
    if not os.path.exists(f"{bowtie2_index}.1.{bt2_ext}"):
        rule bowtie2_index:
            input:
                reference=__fasta_file__
            output:
                multiext(bowtie2_index, *__bowtie2_index__exts),
            log:
                "logs/indexing/bowtie2_genome.log"
            params:
                options=config["bowtie2_index"]["options"]
            threads:
                config["bowtie2_index"]["threads"]
            container:
                config['apptainers']['sequana_tools']
            resources:
                **config['bowtie2_index']['resources']
            wrapper:
                f"{manager.wrappers}/wrappers/bowtie2/build"


# ============================================================================ star index
#
elif manager.config.general.aligner == "star":
    # tested on version 2.7.8a
    try:
        p = subprocess.Popen(['STAR', '--version'], stdout=subprocess.PIPE)
        raw_stdout, _ = p.communicate()
        star_version = raw_stdout.decode().strip()
    except Exception: # various type of exception may occur here
        logger.warning(f"Could not determine STAR version. Index will be stored in {genome_directory}/star/")
        star_version = ""


    __star_index__dir__ = genome_directory + f"/star{star_version}"
    __star_index__done = f"{__star_index__dir__}/star.done"

    # The star.done flag file marks an existing index.
    if not os.path.exists(__star_index__done):

        rule star_index:
            input:
                fasta = __fasta_file__
            output:
                done = __star_index__done
            params:
                options= config['star_index']['options'],
                wkdir= __star_index__dir__
            threads:
                config["star_index"]['threads']
            log:
                "logs/indexing/star_genome.log"
            container:
                config['apptainers']['sequana_tools']
            resources:
                **config['star_index']['resources']
            wrapper:
                f"{manager.wrappers}/wrappers/star/index"


# ========================================================================== salmon
#
elif manager.config.general.aligner == "salmon":
    #tested on salmon 1.4.0
    try:
        p = subprocess.Popen(['salmon', '--version'], stdout=subprocess.PIPE)
        raw_stdout, _ = p.communicate()
        salmon_version = raw_stdout.decode().split()[-1]
    except Exception: # various type of exception may occur here
        logger.warning(f"Could not determine salmon version. Index will be stored in {genome_directory}/salmon/")
        salmon_version = ""

    # The salmon.done flag file marks an existing index.
    if not os.path.exists(genome_directory + f"/salmon{salmon_version}/salmon.done"):
        rule salmon_index:
            input:
                fasta=__fasta_file__,
                gff=__gff_file__
            output:
                done=genome_directory + f"/salmon{salmon_version}/salmon.done"
            threads:
                config['salmon_index']['threads']
            # NOTE(review): resources are read from the salmon_mapping section,
            # not salmon_index -- confirm this is intended.
            resources:
                **config["salmon_mapping"]['resources']
            params:
                options=config['salmon_index']['options']
            container:
                config['apptainers']['salmon']
            log:
                "logs/salmon_indexing.log"
            wrapper:
                f"{manager.wrappers}/wrappers/salmon/index"



# ===================================================================== FASTQC on input data set
#
if not manager.config['fastqc']['skip_fastqc_raw']:
    rule fastqc_raw:
        input:
            manager.getrawdata()
        output:
            done = "{sample}/fastqc_raw/fastqc.done"
        params:
            options= config["fastqc"]["options"],
            working_directory= "{sample}/fastqc_raw/"
        threads: config["fastqc"]["threads"]
        container:
            config['apptainers']['fastqc']
        log:
            "{sample}/fastqc_raw/fastqc.log"
        wrapper:
            f"{manager.wrappers}/wrappers/fastqc"

    expected_output.extend(expand("{sample}/fastqc_raw/fastqc.done", sample=manager.samples))


# ================================================================== trimming
valid_trimmer = ['cutadapt', 'fastp', 'atropos']
if manager.config.trimming.software_choice not in valid_trimmer:
    print(f"Invalid choice for trimming tool. Choose one in {valid_trimmer}")
    sys.exit(1)

# __clean_fastq__output holds the reads handed over to the mappers: the raw
# data when trimming is disabled, the trimmed files otherwise.
if manager.config.trimming.do is False:
    __clean_fastq__output = manager.getrawdata()
elif manager.config.trimming.software_choice in ["cutadapt", "atropos"]:
    adapter_tool = manager.config.trimming.software_choice

    fwd = manager.config.cutadapt.fwd
    rev = manager.config.cutadapt.rev

    # Both choices are handled as cutadapt and write into {sample}/cutadapt
    if adapter_tool in ["cutadapt", "atropos"]:
        adapter_tool = "cutadapt"
        __cutadapt__input_fastq = manager.getrawdata()
        __cutadapt__wkdir = "{sample}/cutadapt"
        __cutadapt__output = ["{sample}/cutadapt/{sample}_R1_.clean.fastq.gz"]
        if manager.paired:
            __cutadapt__output += ["{sample}/cutadapt/{sample}_R2_.clean.fastq.gz"]

        # Set the fwd and rev adapters
        __cutadapt__fwd = manager.config.cutadapt.fwd
        __cutadapt__rev = manager.config.cutadapt.rev

        __cutadapt__options = manager.config.cutadapt.options
        __cutadapt__mode = manager.config.cutadapt.mode
        __cutadapt__log = "%s/cutadapt/cutadapt.txt" % manager.sample
        __cutadapt__sample = manager.sample
        __clean_fastq__output = __cutadapt__output
        include: sm.modules["cutadapt"]
elif manager.config.trimming.software_choice == "fastp":

    __clean_fastq__output = ["{sample}/fastp/{sample}_R1_.fastp.fastq.gz"]
    if manager.paired:
        __clean_fastq__output += ["{sample}/fastp/{sample}_R2_.fastp.fastq.gz"]

    _quality = config["fastp"].get("quality", 15)
    _minlen = config["fastp"].get("minimum_length", 20)

    options_fastp = config["fastp"].get("options", "")
    options_fastp += f" --qualified_quality_phred {_quality}"
    options_fastp += f" -l {_minlen}"
    # Each flag needs its own leading space, otherwise it is glued to the
    # previous option (e.g. "-l 20--disable_adapter_trimming").
    if config["fastp"].get("disable_adapter_trimming", False) is True:
        options_fastp += " --disable_adapter_trimming"
    if config["fastp"].get("disable_quality_filtering", False) is True:
        options_fastp += " --disable_quality_filtering"

    rule fastp:
        input:
            sample=manager.getrawdata()
        output:
            trimmed=__clean_fastq__output,
            html="{sample}/fastp/fastp_{sample}.html",
            json="{sample}/fastp/fastp_{sample}.json", # must be named fastp
        log:
            "logs/fastp/{sample}.log"
        params:
            options=options_fastp,
            adapters=config["fastp"]["adapters"]
        threads:
            config["fastp"].get("threads", 4)
        resources:
            **config['fastp']['resources']
        container:
            config['apptainers']['fastp']
        wrapper:
            f"{manager.wrappers}/wrappers/fastp"


# ===================================================== FASTQC fastp results
#
rule fastqc_clean:
    input:
        __clean_fastq__output
    output:
        done = "{sample}/fastqc_clean/fastqc.done"
    params:
        options= config["fastqc"]["options"],
        working_directory= "{sample}/fastqc_clean/"
    threads: config["fastqc"]["threads"]
    log:
        "{sample}/fastqc_clean/fastqc.log"
    resources:
        **config["fastqc"]['resources']
    container:
        config['apptainers']['fastqc']
    wrapper:
        f"{manager.wrappers}/wrappers/fastqc"
expected_output.extend(expand("{sample}/fastqc_clean/fastqc.done", sample=manager.samples))


# ================================= Decompress fastq.gz file before running bowtie1
#
# Only R1 is used for the rRNA/contaminant screening (see the note further
# down). atropos shares the cutadapt branch because the trimming section above
# writes its output into {sample}/cutadapt/ for both tools; nothing in this
# file produces a {sample}/atropos/ directory.
if manager.config.trimming.software_choice in ('cutadapt', 'atropos') and manager.config.trimming.do:
    __unpigz_R1__input = "{sample}/cutadapt/{sample}_R1_.clean.fastq.gz"
elif manager.config.trimming.software_choice == 'fastp' and manager.config.trimming.do:
    __unpigz_R1__input = "{sample}/fastp/{sample}_R1_.fastp.fastq.gz"
else:
    __unpigz_R1__input = manager.getrawdata()


# ========== decompress and sanity check
#
# nreads is a number of reads; a FastQ record spans 4 lines, so it is
# converted into the number of lines given to "head" below (-1 means all).
if int(config['bowtie1_mapping_rna']['nreads']) != -1:
    extra = int(config['bowtie1_mapping_rna']['nreads']) * 4
    config['bowtie1_mapping_rna']['nreads'] = extra

rule sample_rRNA:
    input:
        __unpigz_R1__input
    output:
        fastq=temp("{sample}/data_for_bowtie1/{sample}_R1_.fastq")
    threads: 4
    params:
        nreads = int(config['bowtie1_mapping_rna']['nreads'])
    shell:
        """
        set +o pipefail
        if [[ {params.nreads} == -1 ]]; then
            unpigz -p {threads} -fk --stdout {input[0]} > {output[0]}
        else
            unpigz -p {threads} -fk --stdout {input[0]} | head -n {params.nreads} > {output[0]}
        fi
        """

"""With paired data, alignement on rRNA leads to 0% alignment if we use R1 and
R2. If we use R1 only, the percentage is >0. First reason is that reads are not
trimmed properly. In truth, bowtie2 supports local alignments which means it can
soft-clip non-matching (=adapter) content while still align the local part of
the read that matches the reference. With Bowtie1 the read will probably go
unaligned due to the many mismatches. So we do not include R2 from version
v0.9.14.
"""

# ========================================== bowtie1 mapping to detect rRNA

if manager.config.general.rRNA_feature or manager.config.general.contaminant_file:
    # rRNA. Note the list here below because the rule expects a list (in case it
    # is paired

    rule bowtie1_mapping_rna:
        input:
            fastq= rules.sample_rRNA.output.fastq,
            index=bowtie1_index_conta__output
        output:
            bam = "{sample}/bowtie1_mapping_rna/{sample}_rRNA.bam",
            sorted = "{sample}/bowtie1_mapping_rna/{sample}_rRNA.sorted.bam",
        log:
            "{sample}/bowtie1_mapping_rna/{sample}_bowtie1.log"
        params:
            options=""
        threads:
            config['bowtie1_mapping_rna']['threads']
        container:
            config['apptainers']['sequana_tools']
        wrapper:
            f"{manager.wrappers}/wrappers/bowtie1/align"

    rule fix_bowtie1_log:
        input:
            expand("{sample}/bowtie1_mapping_rna/{sample}_bowtie1.log", sample=manager.samples)
        output:
            "logs/fix_bowtie1/fix_bowtie1.log"
        run:

            for filename in input:
                # we read the file
                with open(filename) as fin:
                    data = fin.readlines()
                # we update the file: every "least one alignment" line is kept
                # and followed by a "least one reported alignment" copy
                with open(filename, "w") as fout:
                    for line in data:
                        if "least one alignment" in line:
                            fout.write(line)
                            fout.write(line.replace("least one alignment", "least one reported alignment"))
                        else:
                            fout.write(line)
            # empty flag file telling snakemake that the logs were patched
            with open(output[0], "w") as fout:
                fout.write("")
    expected_output += ["logs/fix_bowtie1/fix_bowtie1.log"]


# ========================================================== bowtie2 mapping
if manager.config.general.aligner == "bowtie2":

    rule bowtie2_mapping:
        input:
            fastq=__clean_fastq__output,
            # same suffixes as the ones declared by the bowtie2 index section
            idx=multiext(bowtie2_index, *__bowtie2_index__exts),
        output:
            bam="{sample}/bowtie2/{sample}.sorted.bam",
        log:
            "{sample}/bowtie2/{sample}.log"
        params:
            options=config["bowtie2_mapping"]["options"],
        threads:
            config["bowtie2_mapping"]["threads"]
        container:
            config['apptainers']['sequana_tools']
        resources:
            **config['bowtie2_mapping']['resources']
        wrapper:
            f"{manager.wrappers}/wrappers/bowtie2/align"

    __mapping_output = "{sample}/bowtie2/{sample}.sorted.bam"


# ========================================================== star mapping
elif manager.config.general.aligner == "star":
    # Mapper rna-star

    rule star_mapping:
        input:
            fastq= __clean_fastq__output,
            reference=__fasta_file__,
            index= __star_index__done
        output:
            bam = "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam"
        params:
            options=config['star_mapping']['options'],
            # for legacy mapping, set the first and second pass options
            options_first_pass=config['star_mapping']['options'],
            options_second_pass=config['star_mapping']['options'],
            prefix = "{sample}/star_mapping/{sample}",
            legacy=True,
        threads:
            config['star_mapping']['threads']
        log:
            "{sample}/star_mapping/{sample}.log"
        container:
            config['apptainers']['sequana_tools']
        resources:
            **config['star_mapping']['resources']
        wrapper:
            f"{manager.wrappers}/wrappers/star/align"


    expected_output.extend(
        expand(
            "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam",
            sample=manager.samples
        )
    )

    __mapping_output = "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam"

# ========================================================== salmon mapping

elif manager.config.general.aligner == "salmon":
    # to be used later by salmon_to_features
    __salmon_mapping__output_counts = "{sample}/salmon_mapping/{sample}_quant.sf"

    rule salmon_mapping:
        input:
            fastq=__clean_fastq__output,
            index=genome_directory + f"/salmon{salmon_version}/salmon.done"
        output:
            quant="{sample}/salmon_mapping/{sample}_quant.sf"
        params:
            options=config['salmon_mapping']['options']
        threads:
            config['salmon_mapping']['threads']
        resources:
            **config["salmon_mapping"]['resources']
        container:
            config['apptainers']['salmon']
        log:
            "{sample}/salmon_mapping/salmon.log"
        # NOTE(review): unlike every other rule this wrapper path does not use
        # manager.wrappers -- confirm that "add_salmon" is still the wanted
        # location.
        wrapper:
            "add_salmon/wrappers/salmon/align"


    expected_output.extend(expand("{sample}/salmon_mapping/{sample}_quant.sf", sample=manager.samples))
    # There is no BAM created
    __mapping_output = None


# ========================================================== add_read_group
# The input is the output of the mapping
# Add Read group on BAM files
if manager.config.general.aligner not in ['salmon']:
    rule add_read_group:
        input:
            __mapping_output
        output:
            "{sample}/add_read_group/{sample}.sorted.bam"
        log:
            "{sample}/add_read_group/{sample}.log"
        params:
            options=config["add_read_group"]["options"],
            SM="{sample}"
        container:
            config['apptainers']['sequana_tools']
        wrapper:
            f"{manager.wrappers}/wrappers/add_read_group"



# we always add read group so input is the read group output
# output is stored in __final_bam__
# duplicates can be from PCR or if SR, by pure chance.
# if Paired, most likely a PCR origin.
# Mark duplicates
if config["mark_duplicates"]["do"]:
    rule mark_duplicates:
        input:
            "{sample}/add_read_group/{sample}.sorted.bam"
        output:
            bam = "{sample}/mark_duplicates/{sample}.sorted.markdup.bam",
            metrics = "{sample}/mark_duplicates/{sample}.sorted.markdup.metrics",
        log:
            out = "{sample}/mark_duplicates/log.out",
            err = "{sample}/mark_duplicates/log.err"
        params:
            remove_dup = "false",
            tmpdir = "{sample}/mark_duplicates/tmp"
        container:
            config['apptainers']['sequana_tools']
        resources:
            **config['mark_duplicates']['resources']
        wrapper:
            f"{manager.wrappers}/wrappers/mark_duplicates"
    __final_bam__ = "{sample}/mark_duplicates/{sample}.sorted.markdup.bam"
elif manager.config.general.aligner not in ['salmon']:
    __final_bam__ = "{sample}/add_read_group/{sample}.sorted.bam"
else:
    # salmon: there is no BAM file at all
    __final_bam__ = []


# ====================================================================== generating bigwig files
if manager.config.bam_coverage.do is True and config['general']['aligner'] not in ['salmon']:

    rule bam_coverage:
        input: __final_bam__
        output:
            "{sample}/bam_coverage/{sample}.norm.bw"
        params:
            options = config['bam_coverage']["options"]
        log:
            "{sample}/bam_coverage/{sample}.log"
        threads:
            config['bam_coverage']['threads']
        container:
            config['apptainers']['sequana_tools']
        resources:
            **config['bam_coverage']['resources']
        wrapper:
            f"{manager.wrappers}/wrappers/deeptools/bam_coverage"

    expected_output.extend(
        expand(
            "{sample}/bam_coverage/{sample}.norm.bw",
            sample=manager.samples
        )
    )


# ============================================================= generating IGV plots
if manager.config.igvtools.do and config['general']['aligner'] not in ['salmon']:
    # if nothing provided, it must be an empty string; the reference fasta is
    # then used in place of the chrom sizes file
    if not manager.config.igvtools.chrom_sizes_file.strip():
        config["igvtools"]["chrom_sizes_file"] = __fasta_file__

    rule igvtools:
        input: __final_bam__
        output:
            "{sample}/igvtools/{sample}.tdf"
        log:
            "{sample}/igvtools/{sample}.log"
        params:
            chromSize=config['igvtools']['chrom_sizes_file']
        container:
            config['apptainers']['igvtools']
        threads: 4
        shell:
            """
            igvtools count -z 5 -w 25 -f mean,max --includeDuplicates {input} {output} {params.chromSize}
            """
    expected_output.extend(expand("{sample}/igvtools/{sample}.tdf", sample=manager.samples))


# ===================================================================== Feature counts from subread suite
if manager.config.general.aligner == "salmon":
    __feature_counts__input = __salmon_mapping__output_counts
else :
    __feature_counts__input = __final_bam__

fc_outdir = "post_analysis/feature_counts/"

# NOTE(review): when feature_counts.do is false neither branch below runs, so
# __guess_strandness__output / __salmon_to_features__output are not defined in
# this section although the merge rule at the end reads them -- confirm.
if manager.config.feature_counts.do and manager.config.general.aligner not in ['salmon']:
    # Guessing strandness is not always straightforward; Even when we set it;
    # collaborators may want to look at the other options. So, we compute
    # everything with the 3 different options of strandness.
    # We will copy one of them based on our criteria, but all 3 will be
    # available

    feature_type = config['feature_counts']['feature']
    if "," in feature_type: # assume this is a custom GFF file
        feature_type = "custom"

    fc_options = f" {config['feature_counts']['options']} "
    if config['feature_counts']['extra_attributes']:
        fc_options += " --extraAttributes {} ".format(config['feature_counts']['extra_attributes'])

    if manager.paired:
        fc_options += " -p "


    # ======================= calls feature counts 3 times Nsamples here below
    strand = [0,1,2]
    rule feature_counts:
        input:
            bam=__feature_counts__input,
            gff=__gff_file__
        output:
            counts="{sample}/feature_counts/{strand}/{sample}_feature.out",
            summary="{sample}/feature_counts/{strand}/{sample}_feature.out.summary"
        params:
            options=fc_options,
            feature=feature_type,
            attribute=config['feature_counts']["attribute"],
            strandness="{strand}"
        threads:
            config["feature_counts"]['threads']
        container:
            config['apptainers']['sequana_tools']
        log:
            "{sample}/feature_counts/{strand}/feature_counts.log"
        wrapper:
            f"{manager.wrappers}/wrappers/feature_counts"


    # ===================== guessing the strand
    #
    __guess_strandness__output = expand(fc_outdir + "{sample}_feature.out", sample=manager.samples)
    rule guess_strandness:
        """Guessing strandness"""
        input:
            counts = expand("{sample}/feature_counts/{strand}/{sample}_feature.out", sample=manager.samples, strand=[0,1,2])
        output:
            data=__guess_strandness__output,
            summary=manager.globals['strand_summary']
        run:
            # We compute all strandness
            import sequana.featurecounts as fc

            mfc = fc.MultiFeatureCount(rnaseq_folder=".",
                tolerance=manager.config.feature_counts.tolerance)
            mfc.df.to_csv(output.summary)

            try:
                mfc.plot_strandness(savefig=True, output_filename="outputs/strand_summary.png")
            except Exception as err:
                logger.warning("Could not create plot_strandness")

            logger.info(f"strandness inference: {mfc.probable_strand}")
            msg = f"This is {mfc.probable_strand} data (check in the multiqc report)"
            if mfc.probable_strand in [0, 1, 2]:
                choice = mfc.probable_strand
                logger.info(msg)
            else:
                logger.warning("Strandness is apparently neither of 0, 1, 2")
                logger.warning("you will need to copy the feature counts files yourself in ./feature_counts")
                choice = -1

            # If user knows what he/she wants we overwrite the choice.
            # The value is compared with None and "" rather than tested for
            # truthiness, so that an explicit integer 0 is honoured.
            user_strandness = config['feature_counts'].get("strandness")
            if user_strandness is not None and user_strandness != "":
                user_choice = int(user_strandness)
                if user_choice in [0,1,2]:
                    choice = user_choice
                else:
                    logger.error(f"strandness in the config file must be 0,1,2. You gave {user_choice}")
                    sys.exit(1)

            if choice in {0, 1, 2}:
                # keep only the files computed with the selected strandness
                for filename in input.counts:
                    if f"feature_counts/{choice}/" in filename:
                        shell(f"cp {filename} {fc_outdir}")
                        shell(f"cp {filename}.summary {fc_outdir}")
            else:
                # if not clear, we copy everything and users should clean up the directory.
                # The rule only declares the "counts" input (there is no
                # fc0/fc1/fc2). The three strand variants of a sample share
                # the same file name, so the last one copied is the one that
                # remains in fc_outdir.
                for filename in input.counts:
                    shell("cp {} {}".format(filename, fc_outdir))
                    shell("cp {}.summary {}".format(filename, fc_outdir))
elif manager.config.feature_counts.do and manager.config.general.aligner in ['salmon']:

    __salmon_to_features__output = fc_outdir + "{sample}_feature.out"
    rule salmon_to_features:
        input: __salmon_mapping__output_counts
        output: __salmon_to_features__output
        params:
            gff=__gff_file__
        shell:
            """sequana salmon --input {input} --output {output} --gff {params.gff} --attribute ID """
    expected_output += expand(__salmon_to_features__output, sample=manager.samples)


# ==================================================================== Guess strandness

# With salmon the per-sample tables come from salmon_to_features; otherwise
# they are the files selected by guess_strandness.
if manager.config.general.aligner in ['salmon']:
    __guess_strandness__output = expand(__salmon_to_features__output, sample=manager.samples)
rule merge_feature_counts:
    input: __guess_strandness__output
    output: "post_analysis/all_features.out"
    run:
        from sequana.featurecounts import FeatureCountMerger
        fcm = FeatureCountMerger(fof=input)
        fcm.to_tsv(output[0])
expected_output.append("post_analysis/all_features.out")

# ================================================================== rseqc diag tool


if config['rseqc']['do']:

    # RSeQC tools take a BED annotation: convert the GFF once for all samples.
    rule gff2bed:
        input:
            gff=__gff_file__
        output:
            bed="tmp/temp.bed" # config['rseqc'].get('bed_file', "tmp/temp.bed")
        message: "Build BED file from GFF using Sequana"
        run:
            from sequana import GFF3
            g = GFF3(input[0])
            g.to_bed(output[0], 'Name')

    # Run the RSeQC diagnostics on the final BAM of each sample.
    #
    # Redirections: the first command creates the log (">"), the following
    # ones append to it (">>") so that diagnostics of every tool are kept.
    # Commands whose stdout IS the result file only send stderr to the log:
    # "cmd > file &> log" would redirect stdout to the log as well and leave
    # the result file empty.
    rule rseqc:
        input:
            bam=__final_bam__,
            bed=rules.gff2bed.output.bed
        # no need to put all outputs
        output:
            bam_stat= "{sample}/rseqc/{sample}_bam_stat.txt",
            #read_gc="{sample}/rseqc/{sample}.GC.xls",
            geneBody_coverage="{sample}/rseqc/{sample}.geneBodyCoverage.txt"
        params:
            paired="PE" if manager.paired else "SE"
        log:
            "{sample}/rseqc/{sample}.log"
        container:
            config['apptainers']['sequana_tools']
        shell:
            """
            # for paired data only
            inner_distance.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed} > {log} 2>&1

            # For now GC not very useful in the output so commented
            # read_GC.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} &>{log}

            # genebody coverage
            geneBody_coverage.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed} >> {log} 2>&1

            # uses bigwig redundant with geneBody_coverage
            # geneBody_coverage2.py -i {wildcards.sample}/bamCoverage/{wildcards.sample}.norm.bw -o {wildcards}/rseq/{wildcards.sample} -r test.bed &>{log}

            # Not included in the multiqc module so commented for now
            #clipping_profile.py -i {input.bam} -s {params.paired} -o {wildcards.sample}/rseqc/{wildcards.sample}
            read_duplication.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} >> {log} 2>&1
            junction_annotation.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed} >> {log} 2>&1
            junction_saturation.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed} >> {log} 2>&1
            infer_experiment.py -i {input.bam} -r {input.bed} > {wildcards.sample}/rseqc/{wildcards.sample}.infer.txt 2>> {log}

            # bam stats KEEP last since that is the expected output to make sure
            # previous files (not listed in output:) are computed first.
            bam_stat.py -i {input.bam} > {output.bam_stat} 2>> {log}
            """

    # one series is enough. if bam_stats is created, others are also created
    expected_output.extend(
        expand("{sample}/rseqc/{sample}_bam_stat.txt", sample=manager.samples)
    )

# ========================================================== RNAseqc diag tool
# No need for mark_duplicates for RNASEQC . Just use the BAM file
if config["rnaseqc"]["do"] and config['general']['aligner'] != 'salmon':

    # for multiqc, important that output directories are called rnaseqc
    # ("or ''" tolerates a null gtf_file entry in the config)
    __gtf_file__ = (config['rnaseqc']['gtf_file'] or "").strip()

    # Could be a local file. If provided, it must exist.
    if __gtf_file__:
        if not os.path.exists(__gtf_file__):
            logger.error(f"{__gtf_file__} not found")
            sys.exit(1)
    else: # if gtf not provided, maybe available in the genome directory ?
        __gtf_file__ = __prefix_name__ + ".gtf"
        if not os.path.exists(__gtf_file__):
            gdir = config["general"]["genome_directory"]
            from sequana import logger as logs # to not interfere with snakemake
            logs.critical(f"{__gtf_file__} not found in {gdir}. Trying the GFF file")
            __gtf_file__ = __prefix_name__ + ".gff"

    rule rnaseqc_fixup:
        input:
            gtf = __gtf_file__
        output:
            gtf = temp("tmp/test.gtf")
        run:
            # If input GTF has no exon or genes, an error message is printed and
            # no files are created. This seems to be an issue in rnaseqc.
            # So, we create dummy gene and exon
            with open(input.gtf, "r") as fin:
                annotation = fin.read()
            with open(output.gtf, "w") as ff:
                ff.write(annotation)
                # keep the dummy records on their own lines even if the
                # annotation does not end with a newline
                if annotation and not annotation.endswith("\n"):
                    ff.write("\n")
                ff.write('myCHR\tSGD\tgene\t0\t0\t.\t+\t0\tgene_id "dummy"\n')
                ff.write('myCHR\tSGD\texon\t0\t0\t.\t+\t0\texon_id "dummy"\n')

    rule rnaseqc:
        input:
            bam = __final_bam__,
            gtf = rules.rnaseqc_fixup.output.gtf
        output:
            metrics = "{sample}/rnaseqc/{sample}.metrics.tsv"
        log:
            "{sample}/rnaseqc/{sample}.log",
        params:
            directory = "{sample}/rnaseqc",
            options= config['rnaseqc']['options']
        resources:
            **config["rnaseqc"]["resources"]
        container:
            config["apptainers"]["rnaseqc"]
        shell:
            """
            rnaseqc {input.gtf} {input.bam} {params.directory} -s {wildcards.sample} {params.options} &>{log}
            """

    expected_output.extend(expand("{sample}/rnaseqc/{sample}.metrics.tsv", sample=manager.samples))


# ========================================================== multiqc
# The optional user-provided multiqc config file is appended to the options.
multiqc_params_options = config['multiqc']['options']
if manager.config.multiqc.config_file:
    multiqc_params_options += f" -c {manager.config.multiqc.config_file}"


rule multiqc:
    input:
        expected_output
    output:
        "multiqc/multiqc_report.html"
    params:
        options=multiqc_params_options,
        input_directory=config['multiqc']['input_directory'],
        config_file=config['multiqc']['config_file'],
        modules=config['multiqc']['modules']
    log:
        "multiqc/multiqc.log"
    resources:
        **config['multiqc']['resources']
    container:
        config["apptainers"]["multiqc"]
    wrapper:
        f"{manager.wrappers}/wrappers/multiqc"

# ========================================================== rulegraph

rule rulegraph:
    input:
        workflow.snakefile,
    output:
        "rulegraph/rulegraph.dot",
    params:
        configname="config.yaml",
        mapper = {"multiqc": "../multiqc/multiqc_report.html"},
    wrapper:
        f"{manager.wrappers}/wrappers/rulegraph"


rule dot2svg:
    input:
        "rulegraph/rulegraph.dot"
    output:
        ".sequana/rulegraph.svg"
    container:
        config['apptainers']['graphviz']
    shell:
        """dot -Tsvg {input} -o {output}"""


# Prepare the material needed to run 'sequana rnadiff' by hand in
# post_analysis/: a design file guessed from the sample names and a small
# launcher script.
rule prepare_DGE_analysis:
    input:
        features="post_analysis/all_features.out",
    output:
        rnadiff="post_analysis/rnadiff.sh",
        design="post_analysis/design.csv"
    run:

        # ------------------------------------------- RNADIFF
        # 1. save data for the RNADiff analysis
        from sequana.featurecounts import FeatureCount
        try:
            fc = FeatureCount(input.features, guess_design=True)
            fc.design_df.to_csv(output.design, index=False)
        except Exception:
            msg = "Could not build the design.csv file in rnadiff. You will need to create it manually."
            logger.warning(msg)
            readme_lines = [
                msg,
                "The design.csv file must be formatted as follows (for 2 conditions with 3 replicates each):",
                "",
                "label,condition",
                "samplename_1,condition_name_1",
                "samplename_2,condition_name_1",
                "samplename_3,condition_name_1",
                "samplename_4,condition_name_2",
                "samplename_5,condition_name_2",
                "samplename_6,condition_name_2",
                "",
            ]
            with open("post_analysis/README.rst", "w") as fout:
                fout.write("\n".join(readme_lines))
            # design.csv is a declared output: without it the job would fail
            # on a missing output. Write a header-only template to be filled
            # in by the user.
            with open(output.design, "w") as fout:
                fout.write("label,condition\n")

        # 2. save the script
        attribute = config['feature_counts']['attribute']
        feature = config['feature_counts']['feature']
        with open(output.rnadiff, "w") as fout:
            fout.write(
                "#!/bin/sh\n"
                "sequana rnadiff --features all_features.out "
                f" --annotation-file {__gff_file__} --design design.csv"
                f" --feature-name {feature} --attribute-name {attribute}\n"
            )
        os.chmod(output.rnadiff, 0o755)


# Those rules takes a couple of seconds so no need for a cluster
localrules: rulegraph, prepare_DGE_analysis


# Build the final summary report once the whole workflow has succeeded.
# Every plot section is best-effort: a missing multiqc data file is logged
# and skipped rather than aborting the report.
#
# NOTE(review): in this copy of the file the markup inside the report strings
# appears to have been stripped (blank lines where tags and links probably
# were). The text is reproduced as seen; restore the original markup from the
# upstream file before relying on the rendering.
onsuccess:
    import colorlog
    import pandas as pd
    from sequana.modules_report.summary import SequanaReport

    log = colorlog.getLogger("sequana.rnaseq")
    log.setLevel("INFO")
    manager.teardown(
        extra_files_to_remove=["requirements.txt"],
        extra_dirs_to_remove=[".genomes", "tmp", "logs"])
    manager.clean_multiqc("multiqc/multiqc_report.html")

    def _first_existing(*candidates):
        """Return the first existing path among candidates, else None."""
        for candidate in candidates:
            if os.path.exists(candidate):
                return candidate
        return None

    # Most frequent strand value found across samples.
    # NOTE(review): `guess` is not inserted in the report text below (the
    # template has no placeholder for it). featureCounts documents -s 0 as
    # unstranded and 1 as stranded; confirm this mapping before displaying it.
    try:
        df = pd.read_csv(manager.globals['strand_summary'])
        guess = df['strand'].value_counts().idxmax()
        names = {0: 'stranded', 1: 'unstranded', 2: 'reversely stranded'}
        guess = names[guess]
    except Exception as err:
        if config['general']['aligner'] == "salmon":
            logger.info("Salmon aligner used. No strandness information available")
        else:
            logger.warning(err)
        guess = "?"

    reference_name = __fasta_file__.split("/")[-1]
    intro = (
        "\n\nOverview\n\n"
        "\n\n"
        f"The RNA-seq pipeline maps the reads on the provided reference (called {reference_name}). "
        "Features counts were extracted and are available in the feature counts directory; "
        "those files are entry points for differential gene expression analysis. "
        "The differential analysis, if performed, should be available in the DGE analysis directory. "
        "In addition, if enrichment was performed (GO or Kegg pathways), "
        "it should be available in the directory as well."
        "\n\n"
        "\n\n"
        "A multiqc report is available, where various QC and mapping quality plots can be visualised. "
        "Some important plots are also in the HTML page here below."
        "\n\n"
    )

    # ---------------------------------------- ribosomal / contaminant content
    intro += "\n\nribosomal / contaminant content\n\n"
    bowtie1_stats = _first_existing(
        "multiqc/multiqc_report_data/multiqc_bowtie1.txt",
        "multiqc/multiqc_data/multiqc_bowtie1.txt",
    )
    rRNA_done = bowtie1_stats is not None
    try:
        if rRNA_done:
            df = pd.read_csv(bowtie1_stats, sep='\t')
            # percentage truncated to two decimals
            if "reads_aligned_percentage" in df.columns:
                rRNA = int(df.reads_aligned_percentage.mean()*100)/100
            else:
                rRNA = int((100 - df.not_aligned_percentage.mean())*100)/100

            if rRNA < 10:
                level = "low (as expected)"
            elif rRNA < 20:
                level = "moderately low"
            elif rRNA < 50:
                level = "relatively high"
            else:
                level = "very high"
            intro += f"\n\nrRNA content (or contaminant provided) represents {rRNA}%, which is {level}. "
        else:
            intro += "\n\nrRNA content not computed (no rRNA gene or contaminant provided)\n\n"
    except Exception as err:
        logger.warning(err)

    try:
        from sequana.multiqc.plots import Bowtie1Reader
        if rRNA_done:
            br = Bowtie1Reader(bowtie1_stats)
            br.df.Sample = [str(x).replace("_bowtie1","") for x in br.df.Sample]
            fig = br.plot_bar(html_code=True)
            from plotly import offline
            intro += offline.plot(fig, output_type="div", include_plotlyjs=True)
    except Exception as err:
        logger.warning(err)

    # ---------------------------------------------------------- Mapping rate
    intro += "\n\nMapping rate\n\n"
    try:
        if config['general']['aligner'] == "bowtie2":
            from sequana.multiqc.plots import Bowtie2
            layout = "pe" if manager.paired else "se"
            filename = _first_existing(
                f"multiqc/multiqc_data/mqc_bowtie2_{layout}_plot_1.txt",
                f"multiqc/multiqc_report_data/mqc_bowtie2_{layout}_plot_1.txt",
            ) or "multiqc/multiqc_data/multiqc_bowtie2.txt"
            br = Bowtie2(filename)
            fig = br.plot(html_code=True)
            from plotly import offline
            intro += (
                "\n\nThe mapping was performed with bowtie2. Here below are the percentage of mapping "
                "for each sample. See also the multiqc report.\n\n"
            ) + offline.plot(fig, output_type="div", include_plotlyjs=True)

        elif config["general"]["aligner"] == "star":
            from sequana.multiqc.plots import STAR
            filename = _first_existing(
                "multiqc/multiqc_report_data/multiqc_star.txt"
            ) or "multiqc/multiqc_data/multiqc_star.txt"
            br = STAR(filename)
            fig = br.plot(html_code=True)
            from plotly import offline
            intro += (
                "\n\nThe mapping was performed with STAR. Here below are the percentage of mapping "
                "for each sample. See also the multiqc report.\n\n"
            ) + offline.plot(fig, output_type="div", include_plotlyjs=True)
    except Exception as err:
        logger.warning(err)

    # ------------------------------------------------------- Annotation rate
    try:
        from sequana.multiqc.plots import FeatureCounts
        intro += "\n\nAnnotation rate\n\n"
        filename = _first_existing(
            "multiqc/multiqc_report_data/mqc_featureCounts_assignment_plot_1.txt",
            "multiqc/multiqc_data/mqc_featureCounts_assignment_plot_1.txt",
        ) or "multiqc/multiqc_data/multiqc_featurecounts.txt"  # multiqc 1.27

        br = FeatureCounts(filename)
        fig = br.plot(html_code=True)
        from plotly import offline
        intro += (
            "\n\nThe annotation was performed with subread/feature counts software. "
            "Here below is the percentage of reads assigned to the requested feature "
            "(usually gene; see the config file here below).\n\n"
        ) + offline.plot(fig, output_type="div", include_plotlyjs=True)
    except Exception as err:
        logger.warning(err)

    # ------------------------------------------------------------ Strandness
    if manager.globals['strand_summary']:
        intro += "\n\nStrandness\n\n"
        intro += (
            "\n\n\nHere below is a QC plot related to the strandness found for each samples. "
            "The red dotted lines indicate a tolerance. The 0.5 vertical line correspond to an "
            "unstranded case. A value close to 0 indicates a reversely stranded case, and a value "
            "close to 1 indicates a stranded case.\n"
        )

        choice = config["feature_counts"].get("strandness")
        if choice:
            intro += "User decided to set strandness to: {}\n\n".format(choice)
        else:
            intro += "Strandness was guessed from the data.\n\n"

        image = SequanaReport.png_to_embedded_png("strand", "outputs/strand_summary.png",
            style="width:80%; height:40%")
        intro += image

    # -------------------------------------------------- Differential analysis
    intro += (
        "\n\nDifferential analysis\n\n"
        "\n\n"
        "Differentially expressed genes analysis is not performed automatically with this pipeline. "
        "However, information, feature counts, and other materials can be found in the directory "
        "post_analysis where the standalone 'sequana rnadiff' can be used with DeSEq2. "
        "Most probably an analysis is present. If so, please open the directory in rnadiff report."
        "\n\n"
    )

    # Now the final report. add the original command in the HTML report
    data = manager.getmetadata()
    s = SequanaReport(data, intro)

    shell("chmod -R g+w .")
    shell("rm -rf rulegraph")

onerror:
    manager.onerror()