├── sequana_pipelines
    └── rnaseq
    │   ├── data
    │       ├── __init__.py
    │       ├── Saccer3
    │       │   └── __init__.py
    │       ├── README
    │       ├── phiX174.fa
    │       ├── rnadiff_one_factor.R
    │       └── rnadiff_GLM.R
    │   ├── dag.png
    │   ├── tools.txt
    │   ├── __init__.py
    │   ├── create_target.py
    │   ├── multiqc_config.yaml
    │   ├── schema.yaml
    │   ├── main.py
    │   ├── config.yaml
    │   └── rnaseq.rules
├── doc
    ├── images
    │   └── rRNA.png
    ├── index.rst
    ├── Makefile
    └── conf.py
├── test
    ├── __init__.py
    ├── data
    │   ├── KO_R1_.mapped.fastq.gz
    │   ├── WT_R1_.mapped.fastq.gz
    │   └── Saccer3
    │   │   └── Saccer3_rRNA.fa
    └── test_main.py
├── environment.yml
├── .pre-commit-config.yaml
├── .gitignore
├── .github
    └── workflows
    │   ├── pypi.yml
    │   ├── main.yml
    │   └── apptainer.yml
├── LICENSE
├── pyproject.toml
└── README.rst


/sequana_pipelines/rnaseq/data/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/data/Saccer3/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/doc/images/rRNA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sequana/rnaseq/HEAD/doc/images/rRNA.png


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# Directory containing this test package, with symbolic links resolved.
test_dir = os.path.split(os.path.realpath(__file__))[0]
4 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sequana/rnaseq/HEAD/sequana_pipelines/rnaseq/dag.png


--------------------------------------------------------------------------------
/test/data/KO_R1_.mapped.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sequana/rnaseq/HEAD/test/data/KO_R1_.mapped.fastq.gz


--------------------------------------------------------------------------------
/test/data/WT_R1_.mapped.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sequana/rnaseq/HEAD/test/data/WT_R1_.mapped.fastq.gz


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/tools.txt:
--------------------------------------------------------------------------------
 1 | cutadapt
 2 | picard
 3 | bowtie
 4 | bowtie2
 5 | multiqc
 6 | STAR
 7 | featureCounts
 8 | deeptools
 9 | gffread
10 | salmon
11 | fastp
12 | fastqc
13 | samtools
14 | bamtools
15 | bedtools
16 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: sequana_rnaseq
 2 | 
 3 | channels:
 4 | - conda-forge
 5 | - bioconda
 6 | - defaults
 7 | - r
 8 | 
 9 | dependencies:
10 | - cutadapt
11 | - atropos
12 | - bowtie
13 | - samtools>1.7
14 | - bamtools
15 | - bedtools
16 | - bowtie2>=2.3
17 | - fastqc
18 | - subread
19 | - fastp
20 | - deeptools
21 | - salmon
22 | - star
23 | - picard>2.20
24 | - gffread
25 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/__init__.py:
--------------------------------------------------------------------------------
 1 | import importlib.metadata as metadata
 2 | 
 3 | 
 4 | def get_package_version(package_name):
 5 |     try:
 6 |         version = metadata.version(package_name)
 7 |         return version
 8 |     except metadata.PackageNotFoundError:
 9 |         return f"{package_name} not found"
10 | 
11 | 
12 | version = get_package_version("sequana-lora")
13 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/create_target.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import click
 3 | 
 4 | version ="1.0"
 5 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
 6 | 
 7 | @click.command(context_settings=CONTEXT_SETTINGS)
 8 | @click.version_option(version=version)
 9 | @click.option('--feature-counts-directory', default="feature_counts")
10 | def main(feature_counts_directory):
11 |     """Create target file for the RNADiff analysis"""
12 |     filenames = glob.glob("{}/*_feature.out".format(
13 |             feature_counts_directory))
14 |     print("label\tfiles\tcondition\treplicat")
15 |     for filename in filenames:
16 |         label = filename.split("/")[-1].replace("_feature.out", "")
17 |         filename = filename.split("/")[-1]
18 |         print("{}\t{}\t{}\t{}".format(label, filename, label, "X"))
19 | 
20 | 
21 | if __name__ == "__main__": #pragma: no cover
22 |     main()
23 | 
24 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | files: '\.(py|rst|sh)$'
 3 | fail_fast: false
 4 | 
 5 | repos:
 6 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 7 |     rev: v3.2.0
 8 |     hooks:
 9 |     -   id: trailing-whitespace
10 |     -   id: end-of-file-fixer
11 |     -   id: check-yaml
12 |     #-   id: check-executables-have-shebangs
13 |     -   id: check-ast
14 | 
15 | -   repo: https://github.com/pycqa/flake8
16 |     rev: 6.1.0
17 |     hooks:
18 |     -   id: flake8
19 |         args: ["-j8", "--ignore=E203,E501,W503,E722", "--max-line-length=120", "--exit-zero"]
20 | 
21 | -   repo: https://github.com/psf/black
22 |     rev: 22.10.0
23 |     hooks:
24 |     -   id: black
25 |         args: ["--line-length=120"]
26 |         exclude: E501
27 | 
28 | -   repo: https://github.com/pycqa/isort
29 |     rev: 5.12.0
30 |     hooks:
31 |       - id: isort
32 |         args: ["--profile", "black"] # solves conflicts between black and isort
33 | 
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI
 2 | on: 
 3 |   workflow_dispatch:
 4 |   push:
 5 |     tags:
 6 |       - '*'
 7 | 
 8 | jobs:
 9 |   build-n-publish:
10 |     name: Build and publish to PyPI and TestPyPI
11 |     runs-on: ubuntu-20.04
12 |     steps:
13 |     - uses: actions/checkout@main
14 |     - name: Set up Python 3.11
15 |       uses: actions/setup-python@v2
16 |       with:
17 |         python-version: 3.11
18 | 
19 |     - name: Install package 
20 |       run: |
21 |           pip install build poetry
22 | 
23 |     - name: Build source tarball
24 |       run: |
25 |           rm -rf dist;
26 |           poetry build
27 |  
28 |     - name: Publish distribution to Test PyPI
29 |       uses: pypa/gh-action-pypi-publish@release/v1
30 |       with:
31 |         user: __token__
32 |         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
33 |         repository_url: https://test.pypi.org/legacy/
34 |     - name: Publish distribution to PyPI
35 |       if: startsWith(github.ref, 'refs/tags')
36 |       uses: pypa/gh-action-pypi-publish@release/v1
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/data/README:
--------------------------------------------------------------------------------
 1 | # to save space, we will only keep chrI and chrXII in the GFF, fasta and raw
 2 | # data
 3 | 
 4 | # first the gff
 5 | mkdir temp
 6 | mkdir Saccer3
 7 | 
 8 | # Get the Saccer3 gff and fasta
 9 | 
10 | grep "chrI " Saccer3.gff > new.gff
11 | grep "chrXII	" Saccer3.gff >> new.gff
12 | cp new.gff  ./Saccer3/Saccer3.gff
13 | 
14 | # then the fasta using Python
15 | from sequana import FastA
16 | f = FastA("Saccer3.fa")
17 | with open("Saccer3_new.fa", "w") as fout:
18 |     for name, seq in zip(f.names, f.sequences): 
19 |         if name in ['chrI', 'chrXII']: 
20 |             fout.write(">{}\n{}\n".format(name, seq))                                   
21 | 
22 | cp Saccer3_new.fa ./Saccer3/Saccer3.fa
23 | 
24 | 
25 | cd Saccer3
26 | bwa index Saccer3.fa
27 | cd ..
28 | 
29 | # extract only reads that mapped onto the chrI or chrXII (to get smaller fastq
30 | # files)
31 | 
32 | bwa mem Saccer3/Saccer3.fa WT_ATCACG_L001_R1_001.fastq > WT.sam
33 | bwa mem Saccer3/Saccer3.fa KO_ATCACG_L001_R1_001.fastq > KO.sam
34 | 
35 | mkdir -p temp
36 | from sequana import tools
37 | tools.bam_to_mapped_umpaped("WT.sam", "temp")
38 | tools.bam_to_mapped_umpaped("KO.sam", "temp")
39 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - dev
 8 |   workflow_dispatch:
 9 |   pull_request:
10 |     branches-ignore: []
11 |   schedule:
12 |     - cron: '0 0 20 * *'
13 | 
14 | jobs:
15 |   build-linux:
16 |     runs-on: ubuntu-latest
17 |     strategy:
18 |       max-parallel: 5
19 |       matrix:
20 |         python: ['3.10', '3.11']
21 |       fail-fast: false
22 | 
23 | 
24 |     steps:
25 | 
26 |     - name: install graphviz
27 |       run: |
28 |         sudo apt-get install -y graphviz
29 | 
30 |     - name: checkout git repo
31 |       uses: actions/checkout@v2
32 | 
33 |     - name: conda/mamba
34 |       uses: mamba-org/setup-micromamba@v1
35 |       with:
36 |           environment-file: environment.yml
37 |           create-args: >-
38 |             python=${{ matrix.python }}
39 |     - name: install package itself
40 |       shell: bash -l {0}
41 |       run: |
42 |         pip install .
43 |         pip install "pulp==2.7.0" --no-deps
44 | 
45 | 
46 |     - name: Install dependencies
47 |       shell: bash -l {0}
48 |       run: |
49 |         pip install coveralls pytest-cov pytest pytest-xdist
50 | 
51 |     - name: testing
52 |       shell: bash -l {0}
53 |       run: |
54 |         pytest -v --cov-report term-missing --cov=sequana_pipelines.rnaseq
55 | 
56 |     - name: coveralls
57 |       shell: bash -l {0}
58 |       run: |
59 |         echo $COVERALLS_REPO_TOKEN
60 |         coveralls --service=github
61 |       env:
62 |           GITHUB_TOKEN: ${{ github.token }}
63 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2016-2019, Sequana Development Team 
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["poetry-core>=1.0.0"]
 3 | build-backend = "poetry.core.masonry.api"
 4 | 
 5 | [tool.poetry]
 6 | name = "sequana-rnaseq"
 7 | version = "0.20.2"
 8 | description = "A RNAseq pipeline from raw reads to feature counts"
 9 | authors = ["Sequana Team"]
10 | license = "BSD-3"
11 | repository = "https://github.com/sequana/rnaseq"
12 | readme = "README.rst"
13 | keywords = ["snakemake, sequana, RNAseq, RNADiff, differential analysis"]
14 | classifiers = [
15 |         "Development Status :: 5 - Production/Stable",
16 |         "Intended Audience :: Education",
17 |         "Intended Audience :: End Users/Desktop",
18 |         "Intended Audience :: Developers",
19 |         "Intended Audience :: Science/Research",
20 |         "License :: OSI Approved :: BSD License",
21 |         "Operating System :: POSIX :: Linux",
22 |         "Programming Language :: Python :: 3.10",
23 |         "Programming Language :: Python :: 3.11",
24 |         "Programming Language :: Python :: 3.12",
25 |         "Topic :: Software Development :: Libraries :: Python Modules",
26 |         "Topic :: Scientific/Engineering :: Bio-Informatics",
27 |         "Topic :: Scientific/Engineering :: Information Analysis",
28 | ]
29 | 
30 | packages = [
31 |     {include = "sequana_pipelines"}
32 | ]
33 | 
34 | 
35 | [tool.poetry.dependencies]
36 | python = ">=3.8,<4.0"
37 | sequana = ">=0.17.3"
38 | sequana_pipetools = ">=1.0.2"
39 | click-completion = "^0.5.2"
40 | pulp = "<2.8.0"
41 | 
42 | 
43 | [tool.poetry.scripts]
44 | sequana_rnaseq = "sequana_pipelines.rnaseq.main:main"
45 | 
46 | 
47 | [tool.poetry.group.dev.dependencies]
48 | black = "^23.7.0"
49 | pytest = "^7.4.0"
50 | mock = "^5.1.0"
51 | pytest-mock = "^3.11.1"
52 | pytest-cov = "^4.1.0"
53 | 
54 | 


--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
 1 | Sequana rnaseq pipeline documentation
 2 | #####################################################
 3 | 
 4 | |version|, |today|, status: production
 5 | 
 6 | The **rnaseq** pipeline is a `Sequana <https://github.com/sequana/sequana>`_ pipeline. You can find the source code
 7 | on `https://github.com/sequana/rnaseq <https://github.com/sequana/rnaseq/>`_. Should you have issues
 8 | with the code, its usage, or missing information, please file a report
 9 | on `Sequana itself <https://github.com/sequana/sequana/issues>`_ indicating the pipeline name (we centralize all
10 | pipeline issues in the **Sequana** repository so as to be more responsive).
11 | 
12 | If you use **Sequana**, please do not forget to cite us:
13 | 
14 |     Cokelaer et al, (2017), 'Sequana': a Set of Snakemake NGS pipelines, Journal of
15 |     Open Source Software, 2(16), 352, `JOSS DOI doi:10.21105/joss.00352 <http://www.doi2bib.org/bib/10.21105/joss.00352>`_
16 | 
17 | 
18 | The Sequana rnaseq pipeline
19 | ==============================================
20 | 
21 | .. include:: ../README.rst
22 | 
23 | Example
24 | =======
25 | 
26 | This example is taken from the official tutorial on the Sequana website.
27 | First download a sample::
28 | 
29 |     wget https://sequana.readthedocs.io/en/master/_downloads/WT_ATCACG_L001_R1_001.fastq.gz
30 |     wget https://sequana.readthedocs.io/en/master/_downloads/KO_ATCACG_L001_R1_001.fastq.gz
31 | 
32 | 
33 |     # its genome and annotation
34 |     mkdir -p genomes/Saccer3
35 |     cd genomes/Saccer3
36 |     wget http://hgdownload.cse.ucsc.edu/goldenPath/sacCer3/bigZips/chromFa.tar.gz
37 |     tar -xvzf chromFa.tar.gz
38 |     cat *.fa > Saccer3.fa
39 |     wget http://downloads.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff -O Saccer3.gff
40 |     rm -f chr*
41 |     cd ../..
42 | 
43 | Then, prepare the script::
44 | 
45 |     sequana_rnaseq --input-directory . --genome-directory genomes/Saccer3 --aligner-choice STAR
46 |     cd rnaseq
47 |     snakemake -s rnaseq.rules
48 |     # or just run the script rnaseq.sh
49 | 
50 | What is Sequana ?
51 | =====================
52 | 
53 | **Sequana** is a versatile tool that provides
54 | 
55 | #. A Python library dedicated to NGS analysis (e.g., tools to visualise standard NGS formats).
56 | #. A set of Pipelines dedicated to NGS in the form of Snakefiles
57 | #. Standalone applications
58 |     #. sequana_coverage eases the
59 |        extraction of genomic regions of interest and genome coverage information
60 |     #. sequana_taxonomy performs a quick
61 |        taxonomy of your FastQ. This requires dedicated databases to be downloaded.
62 |     #. Sequanix, a GUI for Snakemake workflows (hence Sequana pipelines as well)
63 | 
64 | To join the project, please let us know on `github <https://github.com/sequana/sequana/issues/306>`_.
65 | 
66 | For more information, please see the `Sequana documentation <https://sequana.readthedocs.io>`_.
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/.github/workflows/apptainer.yml:
--------------------------------------------------------------------------------
  1 | name: Apptainer Run
  2 | 
  3 | on:
  4 |   push:
  5 |     branches:
  6 |       - main
  7 |       - dev
  8 |   workflow_dispatch:
  9 |   pull_request:
 10 |     branches-ignore: []
 11 |   schedule:
 12 |     - cron: '0 0 20 * *'
 13 | 
 14 | jobs:
 15 |   build-linux:
 16 |     runs-on: ubuntu-latest
 17 |     strategy:
 18 |       max-parallel: 5
 19 |       matrix:
 20 |         python: ['3.10']
 21 |       fail-fast: false
 22 | 
 23 | 
 24 |     steps:
 25 | 
 26 |     # Clean up unnecessary preinstalled packages to free disk space
 27 |     - name: Pre-cleanup
 28 |       run: |
 29 |         sudo rm -rf /usr/share/dotnet
 30 |         sudo rm -rf "$AGENT_TOOLSDIRECTORY"
 31 | 
 32 |     # Cache APT .deb packages
 33 |     - name: Cache APT archives
 34 |       uses: actions/cache@v3
 35 |       with:
 36 |         path: /var/cache/apt/archives
 37 |         key: ${{ runner.os }}-apt-cache-v1
 38 | 
 39 |     # Cache Apptainer installation
 40 |     - name: Cache Apptainer install
 41 |       id: cache-apptainer
 42 |       uses: actions/cache@v3
 43 |       with:
 44 |         path: |
 45 |           /usr/bin/apptainer
 46 |           /usr/lib/apptainer
 47 |           /etc/apptainer
 48 |         key: ${{ runner.os }}-apptainer-v1
 49 | 
 50 |     # Install Apptainer only if not cached
 51 |     - name: Install Apptainer
 52 |       if: steps.cache-apptainer.outputs.cache-hit != 'true'
 53 |       run: |
 54 |         sudo apt-get update
 55 |         sudo apt-get install -y software-properties-common
 56 |         sudo add-apt-repository -y ppa:apptainer/ppa
 57 |         sudo apt update
 58 |         sudo apt install -y apptainer
 59 | 
 60 |     # Cache Apptainer image cache (~/.apptainer/cache)
 61 |     - name: Cache Apptainer images
 62 |       uses: actions/cache@v3
 63 |       with:
 64 |         path: ~/.apptainer/cache
 65 |         key: ${{ runner.os }}-apptainer-images-v1
 66 | 
 67 |     # Checkout repository
 68 |     - name: Checkout repo
 69 |       uses: actions/checkout@v4
 70 | 
 71 |     # 🐍 Set up Python
 72 |     - name: Set up Python ${{ matrix.python }}
 73 |       uses: actions/setup-python@v5
 74 |       with:
 75 |         python-version: ${{ matrix.python }}
 76 | 
 77 |     #  Install dependencies
 78 |     - name: Install dependencies
 79 |       run: |
 80 |         python -m pip install --upgrade pip
 81 |         pip install .[testing]
 82 | 
 83 |     # Install package and pinned dependency (example: pulp)
 84 |     - name: Install package itself
 85 |       run: |
 86 |         pip install .
 87 |         pip install "pulp==2.7.0" --no-deps
 88 | 
 89 |     # Run tests using Apptainer
 90 |     - name: Run Apptainer tests
 91 |       run: |
 92 |         sequana_rnaseq --aligner-choice bowtie2 \
 93 |                 --input-directory test/data/
 94 |                 --apptainer-prefix ~/.apptainer/cache \
 95 |                 --genome-directory test/data/Saccer3
 96 | 
 97 |         cd rnaseq && sh rnaseq.sh
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | 
 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
 16 | 
 17 | help:
 18 | 	@echo "Please use \`make <target>' where <target> is one of"
 19 | 	@echo "  html       to make standalone HTML files"
 20 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 21 | 	@echo "  singlehtml to make a single large HTML file"
 22 | 	@echo "  pickle     to make pickle files"
 23 | 	@echo "  json       to make JSON files"
 24 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 25 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 26 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 27 | 	@echo "  epub       to make an epub"
 28 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 29 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 30 | 	@echo "  text       to make text files"
 31 | 	@echo "  man        to make manual pages"
 32 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 33 | 	@echo "  linkcheck  to check all external links for integrity"
 34 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 35 | 
 36 | clean:
 37 | 	-rm -rf $(BUILDDIR)/*
 38 | 
 39 | html:
 40 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 41 | 	@echo
 42 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 43 | 
 44 | dirhtml:
 45 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 48 | 
 49 | singlehtml:
 50 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 53 | 
 54 | pickle:
 55 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 56 | 	@echo
 57 | 	@echo "Build finished; now you can process the pickle files."
 58 | 
 59 | json:
 60 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the JSON files."
 63 | 
 64 | htmlhelp:
 65 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 66 | 	@echo
 67 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 68 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 69 | 
 70 | qthelp:
 71 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 72 | 	@echo
 73 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 74 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 75 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ebisoftware.qhcp"
 76 | 	@echo "To view the help file:"
 77 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ebisoftware.qhc"
 78 | 
 79 | devhelp:
 80 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 81 | 	@echo
 82 | 	@echo "Build finished."
 83 | 	@echo "To view the help file:"
 84 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/ebisoftware"
 85 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ebisoftware"
 86 | 	@echo "# devhelp"
 87 | 
 88 | epub:
 89 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 90 | 	@echo
 91 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 92 | 
 93 | latex:
 94 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
 95 | 	@echo
 96 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
 97 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 98 | 	      "(use \`make latexpdf' here to do that automatically)."
 99 | 
100 | latexpdf:
101 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
102 | 	@echo "Running LaTeX files through pdflatex..."
103 | 	make -C $(BUILDDIR)/latex all-pdf
104 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
105 | 
106 | text:
107 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
108 | 	@echo
109 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
110 | 
111 | man:
112 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
113 | 	@echo
114 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
115 | 
116 | changes:
117 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
118 | 	@echo
119 | 	@echo "The overview file is in $(BUILDDIR)/changes."
120 | 
121 | linkcheck:
122 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
123 | 	@echo
124 | 	@echo "Link check complete; look for any errors in the above output " \
125 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
126 | 
127 | doctest:
128 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
129 | 	@echo "Testing of doctests in the sources finished, look at the " \
130 | 	      "results in $(BUILDDIR)/doctest/output.txt."
131 | 
132 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/data/phiX174.fa:
--------------------------------------------------------------------------------
 1 | >gi|9626372|dbj|NC_001422.1_phiX174_no_SNPs_True_Reference
 2 | GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT
 3 | GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA
 4 | ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG
 5 | TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA
 6 | GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC
 7 | TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT
 8 | TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT
 9 | CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT
10 | TGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG
11 | TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC
12 | GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA
13 | CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAG
14 | TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT
15 | AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC
16 | CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA
17 | TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC
18 | TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA
19 | CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA
20 | GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT
21 | GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA
22 | ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC
23 | TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT
24 | TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC
25 | ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC
26 | CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT
27 | GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC
28 | CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC
29 | TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG
30 | TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT
31 | TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA
32 | AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT
33 | TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT
34 | ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC
35 | GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC
36 | TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT
37 | TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA
38 | TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG
39 | TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC
40 | CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG
41 | GATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCTCGTACGC
42 | CGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT
43 | TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG
44 | CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA
45 | AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT
46 | GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG
47 | GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA
48 | TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT
49 | CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG
50 | TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA
51 | GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC
52 | CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA
53 | TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA
54 | AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC
55 | TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT
56 | CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA
57 | TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG
58 | TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT
59 | CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT
60 | TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC
61 | ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG
62 | TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA
63 | ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG
64 | GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC
65 | CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT
66 | GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG
67 | GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT
68 | ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG
69 | CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC
70 | CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC
71 | GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT
72 | CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG
73 | CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA
74 | TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT
75 | TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG
76 | TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC
77 | AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC
78 | TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/multiqc_config.yaml:
--------------------------------------------------------------------------------
  1 | #######################################
  2 | # Example MultiQC Configuration File
  3 | #######################################
  4 | 
  5 | # This file can be saved either in the MultiQC installation
  6 | # directory, or as ~/.multiqc_config.yaml
  7 | 
  8 | # Configuration settings are taken from the following locations, in order:
  9 | # - Hardcoded in MultiQC (multiqc/utils/config.py)
 10 | # - <installation_dir>/multiqc_config.yaml
 11 | # - ~/.multiqc_config.yaml
 12 | # - Command line options
 13 | 
 14 | # Note that all of the values below are set to the MultiQC defaults.
 15 | # It's recommended that you delete any that you don't need.
 16 | 
 17 | 
 18 | #------------------------------------------------------------------------------
 19 | #        TO CHANGE
 20 | #-------------------------------------------------------------------------------
 21 | 
 22 | # Title to use for the report.
 23 | title: "sequana_rnaseq MultiQC summary"
 24 | subtitle: "RNA-seq analysis" 
 25 | intro_text: "Report summarising cleaning (cutadapt section) and mapping (stat/bowtie sections) of your FastQ files, as well as feature counts (featureCounts section) and quick contaminant search (fastq screen section)"
 26 | 
 27 | # Add generic information to the top of reports
 28 | report_header_info:
 29 |    - Authors: '<a href="http://github.com/sequana/sequana">Sequana developers</a>'
 30 |    - Want to know more?: 'See the <a href="http://sequana.readthedocs.io" target="_blank">Sequana</a> and <a href="http://github.com/sequana/sequana_rnaseq/" target="_blank"> sequana_rnaseq pipeline</a> documentation.'
 31 |    - Citations: 'If you used Sequanix, Sequana, Sequana_coverage tool, or any Sequana pipelines, please see <a href="http://sequana.readthedocs.io">How to cite ?</a> section. In particular, if you use this report in a publication, please cite Sequana.'
 32 |    - Contact E-mail: ''
 33 | #   - Project Type: 'Differential gene expression'
 34 | 
 35 | 
 36 | #-------------------------------------------------------------------------------
 37 | 
 38 | 
 39 | 
 40 | # Prepend sample names with their directory. Useful if analysing the
 41 | # same samples with different parameters.
 42 | prepend_dirs: False
 43 | 
 44 | # Default output filenames
 45 | output_fn_name: multiqc_report.html
 46 | data_dir_name: multiqc_data
 47 | 
 48 | # Whether to create the parsed data directory in addition to the report
 49 | make_data_dir: True
 50 | 
 51 | # Cleaning options for sample names. Typically, sample names are detected
 52 | # from an input filename. If any of these strings are found, they and any
 53 | # text to their right will be discarded.
 54 | # For example - file1.fq.gz_trimmed.bam_deduplicated_fastqc.zip
 55 | # would be cleaned to 'file1'
 56 | # Two options here - fn_clean_exts will replace the defaults,
 57 | # extra_fn_clean_exts will append to the defaults
 58 | extra_fn_clean_exts:
 59 |     - .gz
 60 |     - .fastq
 61 |     - _R1_.cutadapt
 62 |     - _R2_.cutadapt
 63 |     - _R2_001
 64 |     - _R1_001
 65 |     - .err
 66 |     - type: remove
 67 |       pattern: '.sorted'
 68 |     - type: regex
 69 |       pattern: '_S\d+'
 70 | 
 71 | 
 72 | # Ignore these files / directories / paths when searching for logs
 73 | fn_ignore_files:
 74 |     - .DS_Store
 75 |     - slurm*out
 76 |     - "*.js"
 77 | 
 78 | fn_ignore_dirs:
 79 |     - .sequana
 80 |     - .snakemake
 81 |     - multiqc
 82 |     - logs
 83 | 
 84 | # We want to ignore the 3 strand case to use only the final ones in
 85 | # post_analysis/feature_counts
 86 | fn_ignore_paths:
 87 |     - "*/feature_counts/*"
 88 | 
 89 | # Ignore files larger than this when searching for logs (bytes)
 90 | log_filesize_limit: 5000000
 91 | 
 92 | # MultiQC skips a couple of debug messages when searching files as the
 93 | # log can get very verbose otherwise. Re-enable here to help debugging.
 94 | report_readerrors: False
 95 | report_imgskips: False
 96 | 
 97 | # Opt-out of remotely checking that you're running the latest version
 98 | no_version_check: False
 99 | 
100 | # How to plot graphs. Different templates can override these settings, but
101 | # the default template can use interactive plots (Javascript using HighCharts)
102 | # or flat plots (images, using MatPlotLib). With interactive plots, the report
103 | # can prevent automatically rendering all graphs if there are lots of samples
104 | # to prevent the browser being locked up when the report opens.
105 | plots_force_flat: False          # Try to use only flat image graphs
106 | plots_force_interactive: False   # Try to use only interactive javascript graphs
107 | plots_flat_numseries: 100        # If neither of the above, use flat if > this number of datasets
108 | num_datasets_plot_limit: 50      # If interactive, don't plot on load if > this number of datasets
109 | max_table_rows: 500              # Swap tables for a beeswarm plot above this
110 | 
111 | # Overwrite module filename search patterns. See multiqc/utils/search_patterns.yaml
112 | # for the defaults. Remove a default by setting it to null.
113 | sp:
114 |     star:
115 |         fn: '*Log.final.out'
116 |     cutadapt:
117 |         fn: 'cutadapt.txt'
118 |     fastp:
119 |         fn: '*fastp*json'
120 |     #rna_seqc/metrics:
121 |     #    fn: "*metrics.tsv"
122 |     #rna_seqc/coverage:
123 |     #    fn: "*coverage.tsv"
124 | 
125 | # Overwrite the defaults of which table columns are visible by default
126 | #
127 | read_count_prefix: ''
128 | read_count_multiplier: 1
129 | 
130 | table_columns_visible:
131 |     FastQC:
132 |         percent_fails: False
133 |         total_sequences: True
134 |         percent_gc: False
135 |     fastp:
136 |         pct_duplication: False
137 |         after_filtering_gc_content: False
138 |     Bowtie 1:
139 |         reads_aligned_percentage: False
140 |         reads_aligned: False
141 |     picard:
142 |         PERCENT_DUPLICATION: False
143 | 
144 | top_modules:
145 |   - fastqc
146 |   - fastp
147 |   - bowtie1
148 |   - bowtie2
149 |   - salmon
150 |   - star
151 |   - featureCounts
152 | 
153 | module_order:
154 |   - fastqc
155 |   - fastp
156 |   - rseqc
157 |   - markduplicates
158 |   - picard
159 |   - bowtie1
160 |   - bowtie2
161 |   - salmon
162 |   - star
163 |   - featureCounts
164 | 
165 | remove_sections:
166 |   - fastqc_status_checks
167 |   - fastqc_per_base_n_content
168 | 
169 | 
170 | #fastqc_config:
171 | #fastqc_theoretical_gc: 'mm10_genome'
172 | #
173 | 


--------------------------------------------------------------------------------
/test/test_main.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import subprocess
  3 | import sys
  4 | import tempfile
  5 | 
  6 | from click.testing import CliRunner
  7 | 
  8 | from sequana_pipelines.rnaseq.main import main
  9 | 
 10 | from . import test_dir
 11 | 
 12 | sharedir = f"{test_dir}/data"  # passed as --input-directory by the tests below
 13 | saccer3 = f"{test_dir}/data/Saccer3/"  # passed as --genome-directory by the tests below
 14 | conta = f"{test_dir}/data/Saccer3/Saccer3_rRNA.fa"  # passed as --contaminant-file
 15 | 
 16 | 
 17 | # fast
 18 | def test_standalone_subprocess():
 19 |     directory = tempfile.TemporaryDirectory()
 20 |     cmd = """sequana_rnaseq --input-directory {} --working-directory {} """.format(sharedir, directory.name)
 21 |     subprocess.call(cmd.split())
 22 | 
 23 | 
 24 | # slow
 25 | def test_standalone_script():
 26 |     directory = tempfile.TemporaryDirectory()
 27 | 
 28 |     runner = CliRunner()
 29 |     results = runner.invoke(
 30 |         main,
 31 |         [
 32 |             "--input-directory",
 33 |             sharedir,
 34 |             "--genome-directory",
 35 |             saccer3,
 36 |             "--force",
 37 |             "--aligner-choice",
 38 |             "bowtie2",
 39 |             "--feature-counts-feature-type",
 40 |             "gene,tRNA",
 41 |             "--working-directory",
 42 |             directory.name,
 43 |             "--rRNA-feature",
 44 |             "rRNA_gene",
 45 |         ],
 46 |     )  # ideally should be rRNA but current
 47 |     assert results.exit_code == 0
 48 | 
 49 | 
 50 | def test_standalone_script_contaminant():
 51 |     directory = tempfile.TemporaryDirectory()
 52 |     runner = CliRunner()
 53 |     results = runner.invoke(
 54 |         main,
 55 |         [
 56 |             "--input-directory",
 57 |             sharedir,
 58 |             "--genome-directory",
 59 |             saccer3,
 60 |             "--force",
 61 |             "--aligner-choice",
 62 |             "bowtie2",
 63 |             "--feature-counts-feature-type",
 64 |             "gene",
 65 |             "--contaminant-file",
 66 |             conta,
 67 |             "--working-directory",
 68 |             directory.name,
 69 |         ],
 70 |     )
 71 |     assert results.exit_code == 0
 72 | 
 73 | 
 74 | # fast
 75 | def test_version():
 76 |     cmd = "sequana_rnaseq --version"
 77 |     subprocess.call(cmd.split())
 78 | 
 79 | 
 80 | # fast
 81 | def test_standalone_script_wrong_feature():
 82 |     directory = tempfile.TemporaryDirectory()
 83 |     import sequana_pipelines.rnaseq.main as m
 84 | 
 85 |     sys.argv = [
 86 |         "test",
 87 |         "--input-directory",
 88 |         sharedir,
 89 |         "--genome-directory",
 90 |         saccer3,
 91 |         "--force",
 92 |         "--aligner-choice",
 93 |         "bowtie2",
 94 |         "--feature-counts-feature-type",
 95 |         "dummy",
 96 |         "--working-directory",
 97 |         directory.name,
 98 |         "--rRNA-feature",
 99 |         "rRNA_gene",
100 |     ]  # ideally should be rRNA but current
101 |     try:
102 |         m.main()
103 |         assert False
104 |     except:
105 |         assert True
106 | 
107 | 
108 | # fast
109 | def test_standalone_script_wrong_reference():
110 |     directory = tempfile.TemporaryDirectory()
111 |     import sequana_pipelines.rnaseq.main as m
112 | 
113 |     sys.argv = [
114 |         "test",
115 |         "--input-directory",
116 |         sharedir,
117 |         "--genome-directory",
118 |         "dummy",
119 |         "--force",
120 |         "--aligner-choice",
121 |         "bowtie2",
122 |         "--working-directory",
123 |         directory.name,
124 |         "--rRNA-feature",
125 |         "rRNA_gene",
126 |     ]  # ideally should be rRNA but current
127 |     try:
128 |         m.main()
129 |         assert False
130 |     except:
131 |         assert True
132 | 
133 | 
134 | # fast
135 | def test_standalone_script_wrong_triming():
136 |     directory = tempfile.TemporaryDirectory()
137 |     import sequana_pipelines.rnaseq.main as m
138 | 
139 |     sys.argv = [
140 |         "test",
141 |         "--input-directory",
142 |         sharedir,
143 |         "--genome-directory",
144 |         saccer3,
145 |         "--force",
146 |         "--aligner-choice",
147 |         "bowtie2",
148 |         "--software-choice",
149 |         "dummy",
150 |         "--working-directory",
151 |         directory.name,
152 |         "--rRNA-feature",
153 |         "rRNA_gene",
154 |     ]  # ideally should be rRNA but current
155 |     try:
156 |         m.main()
157 |         assert False
158 |     except SystemExit:
159 |         assert True
160 | 
161 | 
162 | # slow
163 | def test_full():
164 | 
165 |     with tempfile.TemporaryDirectory() as directory:
166 |         wk = directory
167 | 
168 |         cmd = f"sequana_rnaseq --input-directory {sharedir} --genome-directory {saccer3} --aligner-choice bowtie2 --working-directory {wk} --force --rRNA-feature rRNA_gene"
169 |         subprocess.call(cmd.split())
170 | 
171 |         cmd = "sh rnaseq.sh"
172 | 
173 |         stat = subprocess.call(cmd.split(), cwd=wk)
174 | 
175 |         assert os.path.exists(wk + "/summary.html")
176 |         assert os.path.exists(wk + "/multiqc/multiqc_report.html")
177 | 
178 | 
179 | # slow
180 | def test_full_star():
181 | 
182 |     with tempfile.TemporaryDirectory() as directory:
183 |         wk = directory
184 | 
185 |         cmd = f"sequana_rnaseq --input-directory {sharedir} --genome-directory {saccer3} --aligner-choice star --working-directory {wk} --force --rRNA-feature rRNA_gene"
186 |         subprocess.call(cmd.split())
187 | 
188 |         cmd = "snakemake -s rnaseq.rules --wrapper-prefix https://raw.githubusercontent.com/sequana/sequana-wrappers/  -p --cores 2 "
189 | 
190 |         stat = subprocess.call(cmd.split(), cwd=wk)
191 | 
192 |         assert os.path.exists(wk + "/summary.html")
193 |         assert os.path.exists(wk + "/multiqc/multiqc_report.html")
194 | 
195 | 
196 | # slow
197 | def __test_full_salmon():
198 | 
199 |     with tempfile.TemporaryDirectory() as directory:
200 |         wk = directory
201 | 
202 |         cmd = f"sequana_rnaseq --input-directory {sharedir} --genome-directory {saccer3} --aligner-choice salmon --working-directory {wk} --force"
203 |         subprocess.call(cmd.split())
204 | 
205 |         cmd = "snakemake -s rnaseq.rules --wrapper-prefix https://raw.githubusercontent.com/sequana/sequana-wrappers/  -p --cores 2 "
206 | 
207 |         stat = subprocess.call(cmd.split(), cwd=wk)
208 | 
209 |         assert os.path.exists(wk + "/summary.html")
210 |         assert os.path.exists(wk + "/multiqc/multiqc_report.html")
211 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/schema.yaml:
--------------------------------------------------------------------------------
  1 | # Schema validator for the rnaseq pipeline
  2 | # author: Thomas Cokelaer
  3 | 
  4 | type: map
  5 | mapping:
  6 |     "sequana_wrappers":
  7 |         type: str
  8 |     "input_directory":
  9 |         type: str
 10 |         required: True
 11 |     "input_readtag":
 12 |         type: str
 13 |         required: True
 14 |     "input_pattern":
 15 |         type: str
 16 |         required: True
 17 |     "apptainers":
 18 |         type: any
 19 | 
 20 | 
 21 |     "fastqc":
 22 |         type: map
 23 |         mapping:
 24 |             "skip_fastqc_raw":
 25 |                 type: bool
 26 |                 required: True
 27 |             "options":
 28 |                 type: str
 29 |                 required: True
 30 |             "threads":
 31 |                 type: int
 32 |                 required: True
 33 |                 range: { min: 1 }
 34 |             "resources":
 35 |                 type: any
 36 |                 required: true
 37 | 
 38 | 
 39 |     "general":
 40 |         type: map
 41 |         mapping:
 42 |             "aligner":
 43 |                 type: str
 44 |                 required: True
 45 |                 enum: ["bowtie2", "star", "salmon"]
 46 |             "genome_directory":
 47 |                 required: True
 48 |                 type: str
 49 |             "contaminant_file":
 50 |                 type: str
 51 |             "rRNA_feature":
 52 |                 type: str
 53 |             "custom_gff":
 54 |                 type: str
 55 | 
 56 |     "add_read_group":
 57 |       type: map
 58 |       mapping:
 59 |         "options":
 60 |           type: str
 61 | 
 62 |     "trimming":
 63 |       type: map
 64 |       mapping:
 65 |         "do":
 66 |           type: bool
 67 |           required: True
 68 |         "software_choice":
 69 |           type: str
 70 |           enum: [cutadapt,atropos,fastp]
 71 | 
 72 |     "fastp":
 73 |       type: map
 74 |       mapping:
 75 |         "options":
 76 |             type: str
 77 |         "minimum_length":
 78 |             required: True
 79 |             type: int
 80 |         "adapters":
 81 |             type: str
 82 |             required: False
 83 |         "quality":
 84 |             type: int
 85 |             range: {max: 40, min: 0}
 86 |             required: False
 87 |         "threads":
 88 |             type: int
 89 |             required: True
 90 |             range: { min: 1 }
 91 |         "disable_adapter_trimming":
 92 |             type: bool
 93 |         "disable_quality_filtering":
 94 |             type: bool
 95 |         "resources":
 96 |             type: any
 97 |             required: true
 98 | 
 99 | 
100 |     "cutadapt":
101 |         type: map
102 |         mapping:
103 |             "tool_choice":
104 |                 type: str
105 |                 enum: [cutadapt,atropos]
106 |                 required: True
107 |             "adapter_choice":
108 |                 type: str
109 |                 required: False
110 |             "design_file":
111 |                 type: str
112 |                 required: False
113 |                 pattern: .*
114 |             "fwd":
115 |                 type: str
116 |                 required: False
117 |             "rev":
118 |                 type: str
119 |                 required: False
120 |             "m":
121 |                 type: int
122 |                 range: {min: 0}
123 |                 required: True
124 |             "mode":
125 |                 type: str
126 |                 enum: [b, g, a]
127 |                 required: True
128 |             "options":
129 |                 type: str
130 |                 required: False
131 |             "quality":
132 |                 type: int
133 |                 range: {max: 40, min: 0}
134 |                 required: False
135 |             "threads":
136 |                 type: int
137 |                 required: True
138 |                 range: { min: 1 }
139 | 
140 |     "multiqc":
141 |         type: map
142 |         mapping:
143 |             "options":
144 |                 type: str
145 |             "modules":
146 |                 type: str
147 |             "config_file":
148 |                 type: str
149 |             "input_directory":
150 |                 type: str
151 |             "resources":
152 |                 type: any
153 |                 required: true
154 | 
155 | 
156 |     "feature_counts":
157 |         type: map
158 |         mapping:
159 |             "do":
160 |                 type: bool
161 |             "options":
162 |                 type: str
163 |             "attribute":
164 |                 type: str
165 |             "feature":
166 |                 type: str
167 |             "extra_attributes":
168 |                 type: str
169 |             "strandness":
170 |                 required: False
171 |                 type: any
172 |                 enum: ['0', '1', '2' , '', 0, 1, 2]
173 |             "threads":
174 |                 type: int
175 |                 range: { min: 1, max: 8 }
176 |             "tolerance":
177 |                 type: float
178 |                 range: { min: 0, max: 0.25 }
179 |     'bowtie1_mapping_ref':
180 |         type: map
181 |         mapping:
182 |             "options":
183 |                 type: str
184 |             "threads":
185 |                 type: int
186 |                 required: True
187 |                 range: { min: 1 }
188 | 
189 |     'salmon_index':
190 |       type: map
191 |       mapping: 
192 |         "options":
193 |           type: str
194 |         "threads":
195 |           type: int
196 |           required: True
197 |           range: { min: 1 }
198 |         "resources":
199 |           type: any
200 |           required: true
201 |       
202 | 
203 |     'salmon_mapping':
204 |         type: map
205 |         mapping:
206 |             "options":
207 |                 type: str
208 |             "threads":
209 |                 type: int
210 |                 required: True
211 |                 range: { min: 1 }
212 |             "resources":
213 |                 type: any
214 |                 required: true
215 | 
216 |     'bam_coverage':
217 |         type: map
218 |         mapping:
219 |             "do":
220 |                 type: bool
221 |             "options":
222 |                 type: str
223 |             "binSize":
224 |                 type: int
225 |             "genomeSize":
226 |                 type: int
227 |             "extendReads":
228 |                 type: int
229 |             "minFragmentLength":
230 |                 type: int
231 |             "maxFragmentLength":
232 |                 type: int
233 |             "threads":
234 |                 type: int
235 |                 required: True
236 |                 range: { min: 1, max: 8 }
237 |             "resources":
238 |                 type: any
239 |                 required: true
240 | 
241 |     'mark_duplicates':
242 |         type: map
243 |         mapping:
244 |             "do":
245 |                 type: bool
246 |             "remove":
247 |                 type: bool
248 |             "tmpdir":
249 |                 type: str
250 |             "threads":
251 |                 type: int
252 |                 required: True
253 |                 range: { min: 1, max: 8 }
254 |             "resources":
255 |                 type: any
256 |                 required: true
257 | 
258 |     'bowtie1_mapping_rna':
259 |         type: map
260 |         mapping:
261 |             "options":
262 |                 type: str
263 |             "threads":
264 |                 type: int
265 |                 required: True
266 |                 range: { min: 1}
267 |             "nreads":
268 |                 type: int
269 |                 required: True
270 |                 range: { min: -1, max: 1e15}
271 | 
272 |     'igvtools':
273 |         type: map
274 |         mapping:
275 |             "do":
276 |                 type: bool
277 |             "chrom_sizes_file":
278 |                 type: str
279 |     'rnaseqc':
280 |         type: map
281 |         mapping:
282 |             "do":
283 |                 type: bool
284 |             "gtf_file":
285 |                 type: str
286 |             "options":
287 |                 type: str
288 |             "resources":
289 |                 type: any
290 |                 required: true
291 |     'rseqc':
292 |         type: map
293 |         mapping:
294 |             "do":
295 |                 type: bool
296 |             "bed_file":
297 |                 type: str
298 | 
299 |     'bowtie2_mapping':
300 |         type: map
301 |         mapping:
302 |             "options":
303 |                 type: str
304 |             "threads":
305 |                 type: int
306 |                 required: True
307 |                 range: { min: 1}
308 |             "genome_size_larger_than_4gb":
309 |                 type: bool
310 |             "resources":
311 |                 type: any
312 |                 required: true
313 | 
314 |     'bowtie2_index':
315 |         type: map
316 |         mapping:
317 |             "options":
318 |                 type: str
319 |             "threads":
320 |                 type: int
321 |                 required: True
322 |                 range: { min: 1}
323 |             "resources":
324 |                 type: any
325 |                 required: true
326 | 
327 |     'star_mapping':
328 |         type: map
329 |         mapping:
330 |             "options":
331 |                 type: str
332 |             "threads":
333 |                 type: int
334 |                 required: True
335 |                 range: { min: 1}
336 |             "resources":
337 |                 type: any
338 |                 required: true
339 |             "legacy":
340 |                 type: bool
341 |                 required: True
342 | 
343 |     'star_index':
344 |         type: map
345 |         mapping:
346 |             "options":
347 |                 type: str
348 |             "threads":
349 |                 type: int
350 |                 required: True
351 |                 range: { min: 1}
352 |             "resources":
353 |                 type: any
354 |                 required: true
355 | 
356 | 


--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # documentation build configuration file, created by
  4 | # sphinx-quickstart on Wed Aug 10 16:58:13 2011.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing dir.
  7 | #
  8 | # Note that not all possible configuration values are present in this
  9 | # autogenerated file.
 10 | #
 11 | # All configuration values have a default; values that are commented out
 12 | # serve to show the default.
 13 | 
 14 | import sys, os
 15 | import sphinx
 16 | 
 17 | sys.path.insert(0, os.path.abspath('sphinxext'))
 18 | 
 19 | # If extensions (or modules to document with autodoc) are in another directory,
 20 | # add these directories to sys.path here. If the directory is relative to the
 21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 22 | 
 23 | pkg_name = "sequana_rnaseq"  # distribution name used below to look up the version
 24 | 
 25 | # This is for ReadTheDocs: select matplotlib's 'Agg' backend
 26 | import matplotlib
 27 | matplotlib.use('Agg')
 28 | 
 29 | import pkg_resources
 30 | version = pkg_resources.require(pkg_name)[0].version  # the package must be installed to build the doc
 31 | 
 32 | import matplotlib  # NOTE(review): repeats the import made a few lines above
 33 | import matplotlib.sphinxext
 34 | 
 35 | release = version  # captured before 'version' is rewritten further down in this file
 36 | author = "Thomas Cokelaer"
 37 | title = "Sequana rnaseq pipeline"
 38 | copyright = author + ", 2016-2019"
 39 | project = 'Sequana rnaseq pipeline'
 40 | 
 41 | import easydev
 42 | from easydev import get_path_sphinx_themes
 43 | 
 44 | 
 45 | # -- General configuration -----------------------------------------------------
 46 | 
 47 | # If your documentation needs a minimal Sphinx version, state it here.
 48 | #needs_sphinx = '1.0'
 49 | 
 50 | # Add any Sphinx extension module names here, as strings. They can be extensions
 51 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 52 | 
 53 | extensions = [
 54 |     'sphinx.ext.autodoc',
 55 |     ('sphinx.ext.imgmath'  # only available for sphinx >= 1.4
 56 |                   if sphinx.version_info[:2] >= (1, 4)
 57 |                   else 'sphinx.ext.pngmath'),
 58 |     'sphinx.ext.coverage',
 59 |     'sphinx.ext.doctest',
 60 |     'sphinx.ext.intersphinx',
 61 |     'sphinx.ext.todo',
 62 |     'sphinx.ext.ifconfig',
 63 |     'sphinx.ext.viewcode',
 64 |     "numpydoc.numpydoc",
 65 |     'matplotlib.sphinxext.plot_directive',
 66 |     'sphinx.ext.autosummary',
 67 |     "sequana.sphinxext.snakemakerule"
 68 |     ]
 69 | # Note that the numpy directive is buggy. Example: class and init are not recognised as two entities for the autoclass_content='both' setting below.
 70 | 
 71 | math_number_all = False
 72 | 
 73 | todo_include_todos=True
 74 | jscopybutton_path = "copybutton.js"
 75 | autoclass_content = 'both'
 76 | 
 77 | # Add any paths that contain templates here, relative to this directory.
 78 | templates_path = ['_templates']
 79 | 
 80 | # The suffix of source filenames.
 81 | source_suffix = '.rst'
 82 | 
 83 | # The encoding of source files.
 84 | #source_encoding = 'utf-8-sig'
 85 | 
 86 | # The master toctree document.
 87 | master_doc = 'index'
 88 | 
 89 | # General information about the project.
 90 | project = project
 91 | copyright = copyright
 92 | 
 93 | # The version info for the project you're documenting, acts as replacement for
 94 | # |version| and |release|, also used in various other places throughout the
 95 | # built documents.
 96 | #
 97 | # The short X.Y version.
 98 | version = 'Current version: ' + str(version) 
 99 | # The full version, including alpha/beta/rc tags.
100 | release = release
101 | 
102 | # The language for content autogenerated by Sphinx. Refer to documentation
103 | # for a list of supported languages.
104 | #language = None
105 | 
106 | # There are two options for replacing |today|: either, you set today to some
107 | # non-false value, then it is used:
108 | #today = ''
109 | # Else, today_fmt is used as the format for a strftime call.
110 | #today_fmt = '%B %d, %Y'
111 | 
112 | # List of documents that shouldn't be included in the build.
113 | #unused_docs = []
114 | 
115 | 
116 | # List of patterns, relative to source directory, that match files and
117 | # directories to ignore when looking for source files.
118 | exclude_trees = ['_build']
119 | exclude_patterns = []
120 | 
121 | # The reST default role (used for this markup: `text`) to use for all documents.
122 | #default_role = None
123 | 
124 | # If true, '()' will be appended to :func: etc. cross-reference text.
125 | #add_function_parentheses = True
126 | 
127 | # If true, the current module name will be prepended to all description
128 | # unit titles (such as .. function::).
129 | add_module_names = False
130 | 
131 | # If true, sectionauthor and moduleauthor directives will be shown in the
132 | # output. They are ignored by default.
133 | show_authors = True
134 | 
135 | # The name of the Pygments (syntax highlighting) style to use.
136 | pygments_style = 'sphinx'
137 | 
138 | # A list of ignored prefixes for module index sorting.
139 | modindex_common_prefix = ["sequana."]
140 | 
141 | 
142 | # Get rid of spurious warnings due to some interaction between
143 | # autosummary and numpydoc. See
144 | # https://github.com/phn/pytpm/issues/3#issuecomment-12133978 for more
145 | # details
146 | numpydoc_show_class_members = False
147 | 
148 | 
149 | # solution from nilearn
def touch_example_backreferences(app, what, name, obj, options, lines):
    """Make sure an ``<name>.examples`` file exists for a documented object.

    Connected to the ``autodoc-process-docstring`` event by :func:`setup`, so
    it receives that event's arguments. Only ``app.srcdir`` and ``name`` are
    used; the other parameters are accepted to match the event signature.

    An empty ``modules/generated/<name>.examples`` file is created under the
    Sphinx source directory when it is missing, so that including it does not
    fail for classes / modules that have no examples. An existing file is
    left untouched.
    """
    generated_dir = os.path.join(app.srcdir, "modules", "generated")
    # The directory is not guaranteed to exist (e.g. on a fresh checkout);
    # create it first, otherwise open() raises FileNotFoundError.
    os.makedirs(generated_dir, exist_ok=True)

    examples_path = os.path.join(generated_dir, "%s.examples" % name)
    if not os.path.exists(examples_path):
        # touch file
        with open(examples_path, "w"):
            pass
158 | 
159 | 
160 | 
161 | # Add the 'copybutton' javascript, to hide/show the prompt in code
162 | # examples
163 | def setup(app):
164 |     app.add_javascript('copybutton.js')
165 |     app.connect('autodoc-process-docstring', touch_example_backreferences)
166 | 
167 | 
168 | 
169 | 
170 | 
# -- Options for HTML output ---------------------------------------------------

# HTML theme selection. When the READTHEDOCS environment variable is exactly
# "True", the theme name "default" is used and sphinx_rtd_theme is not
# imported. In every other case the Read the Docs theme is imported and used,
# and its location is added to the theme search path.
on_rtd = os.environ.get("READTHEDOCS") == "True"
if on_rtd:
    html_theme = "default"
else:
    import sphinx_rtd_theme

    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
    html_theme = "sphinx_rtd_theme"
182 | 
# Theme options are theme-specific and customize the look and feel of a theme
# further; see the documentation of each theme for the available options.
# The user theme has a 'homepage' option, which used to be populated here:
# html_theme_options = {'homepage': init_sphinx.url}
# Paths containing custom themes, relative to this directory:
# html_theme_path = [get_path_sphinx_themes()]

# Name of this set of Sphinx documents. When None, it defaults to
# "<project> v<release> documentation".
# html_title = None

# Shorter title shown in the navigation bar (defaults to html_title).
html_short_title = "sequana"

# Image file (relative to this directory) placed at the top of the sidebar.
# html_logo = "../../share/data/images/crx_logo.png"

# Image file (within the static path) used as favicon of the docs. It should
# be a Windows icon file (.ico) of 16x16 or 32x32 pixels.
# html_favicon = "../../share/data/images/crx_logo.ico"

# Paths holding custom static files (style sheets, ...), relative to this
# directory. They are copied after the builtin static files, so a file named
# "default.css" overwrites the builtin "default.css".
html_static_path = ["_static"]

# strftime format of the 'Last updated on:' timestamp inserted at the bottom
# of every page; an empty string disables the timestamp.
html_last_updated_fmt = "%b %d, %Y"

# Whether SmartyPants converts quotes and dashes to typographically correct
# entities.
# html_use_smartypants = True

# NOTE(review): not obviously a Sphinx option; presumably the name of the
# index page template -- confirm it is still read by anything.
html_index = "index.html"

# Custom sidebar templates, mapping page names to templates.
# html_sidebars = {
#     'index': ['indexsidebar.html'],
#     'contents': 'indexsidebar.html',
# }
# Additional templates rendered to pages, mapping page names to template
# names.
# html_additional_pages = {'index': 'index.html'}

# Module index generation (both the older and the newer option name are set).
html_use_modindex = True
html_domain_indices = True

# General index generation.
html_use_index = True

# Keep the index on a single page rather than one page per letter.
html_split_index = False

# Link to the reST sources from the pages, and copy those sources.
html_show_sourcelink = True
html_copy_source = True

# Show "Created using Sphinx" in the HTML footer.
html_show_sphinx = True

# Whether "(C) Copyright ..." is shown in the HTML footer (default is True).
# html_show_copyright = True

# If set, an OpenSearch description file is output and every page contains a
# <link> tag referring to it. The value must be the base URL from which the
# finished HTML is served.
# html_use_opensearch = ''

# File name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None

# Output file base name for the HTML help builder.
htmlhelp_basename = "doc"
265 | 
266 | 
# -- Options for LaTeX output --------------------------------------------------

# Not part of the original sphinx-quickstart template.
pngmath_use_preview = True

# Font size of the LaTeX output: '10pt', '11pt' or '12pt'.
latex_font_size = "10pt"

# How the document tree is grouped into LaTeX files, one tuple per file:
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [("index", "main.tex", title, author, "manual")]

latex_elements = {"inputenc": "\\usepackage[utf8]{inputenc}"}

# Image file (relative to this directory) placed at the top of the title page.
# latex_logo = None

# For "manual" documents: keep top-level headings as chapters, not parts.
latex_use_parts = False

# Whether page references are shown after internal links.
# latex_show_pagerefs = False

# Whether URL addresses are shown after external links.
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
# latex_appendices = []

# Whether a module index is generated.
# latex_domain_indices = True


# -- Options for manual page output --------------------------------------------

# One tuple per manual page:
# (source start file, name, description, authors, manual section).
man_pages = [("index", project, project, [author], 1)]
316 | 
317 | 
# Configuration for intersphinx: refer to the Python standard library.
# The versioned HTTPS address is used directly; the former
# 'http://docs.python.org/' only reaches the inventory through redirects.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
}
322 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/data/rnadiff_one_factor.R:
--------------------------------------------------------------------------------
###################################################
### DESeq2_1factor parameters: to be modified by the user
###
### Everything below this section reads these variables; nothing else in the
### script is meant to be edited for a standard one-factor analysis.
###################################################
rm(list=ls())                                        # remove all the objects of the R session

workspace <- "."                                     # workspace for the R session (passed to setwd() below)

projectName <- "BXXXX"                               # name of the project (cannot contain any ".")
analysisVersion <- "vN"                              # name of the analysis version (cannot contain any ".")

author <- " (Biomics platform - Institut Pasteur)"   # author of the statistical report
researcher <- ""                               # name of the researcher
chief <- ""                                          # name of the head of unit

varInt <- "condition"                               # factor of interest (column of the target file)
condRef <- "WT"                                 # reference biological condition e.g. WT
batch <- NULL                                        # factor on which to adjust the statistical model: NULL (default) or "batch" for example

outfile <- TRUE                                      # TRUE to export figures, FALSE to display them in R (FALSE also disables the report, see chunk 12)
colors <- c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", # vector of colors of each group on the plots
            "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5", 
            "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", 
            "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26")

cooksCutoff <- NULL                                  # outliers detection threshold (NULL to leave DESeq2 choosing it, Inf to keep outliers)
independentFiltering <- TRUE                         # FALSE to turn off the independent filtering (default is TRUE)
allComp <- TRUE                                      # TRUE: all pairwise comparisons; FALSE: only comparisons against the first level of varInt
alpha <- 0.05                                        # threshold of statistical significance
adjMethod <- "BH"                                    # p-value adjustment method: "BH" (default) or "BY"
type.trans <- "VST"                                  # transformation for exploratory analysis: "VST" or "rlog" (if size factors vary very widely)
locfunc <- "median"                                  # "median" (default) or "shorth" with library(genefilter) (to estimate the size factors)
interestingFeatures <- NULL                          # vector of features for which to plot the expression (NULL: no such plot)
featuresToRemove <- c("alignment_not_unique",        # names of the features to be removed (default is the HTSeq-count specific lines)
                      "ambiguous", "no_feature",
                      "not_aligned", "too_low_aQual") 

fitType <- "parametric"                              # mean-variance relationship: "parametric" (default) or "local"

#####################################
# INPUT FILES
#####################################
geneLengthFile <- "input_gene_lengths.tsv"      # path to the gene lengths file (set to NULL to skip the gene-length steps)
targetFile <- "target.txt"                      # path to the design/target file
infoFile <- "input_info.tsv"                    # path to the annotation file (needed if 0 counts not in counts files); NULL to skip
rawDir <- "feature_counts"                   # path to the directory containing raw counts files
 46 | 
###################################################
### code chunk number 1: build the remaining parameters and load the libraries
###################################################
setwd(workspace)
library(RNADiff)
library(knitr)
# genefilter is only needed when the size factors are estimated with "shorth"
if (locfunc=="shorth") library(genefilter)

# identifier passed as versionName= to the RNADiff calls and used as prefix of the saved files
versionName <- paste(projectName, analysisVersion, sep="-")
ncol <- NULL                                         # width of the tables in the report

# message: "creating the export folders"
cat("Creation des dossiers d'exports\n") 
dir.create("figures", showWarnings=FALSE)
dir.create("tables", showWarnings=FALSE)            

###################################################
### code chunk number 2: loadData
###################################################
# message: "loading the annotations and gene lengths if needed"
# Each file is read and its first rows printed; the object is NULL when the
# corresponding path is NULL.
cat("Chargement des annotations et longueurs des genes si besoin\n")
if (!is.null(infoFile)) print(head(info <- read.delim(infoFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else info <- NULL
if (!is.null(geneLengthFile)) print(head(glength <- read.table(geneLengthFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else glength <- NULL

# message: "loading the target file"
cat("Chargement du target file\n")
print(target <- loadTargetFile(targetFile, varInt=varInt, condRef=condRef))
conds <- levels(target[,varInt])
group <- data.frame(group=factor(target[,varInt]))

# message: "loading the data"
cat("Chargement des donnees\n")
counts <- loadCountData(target, rawDir=rawDir, versionName=versionName, featuresToRemove=featuresToRemove)

# message: "check that the samples in counts are in the same order as in the target"
# (visual check: the two columns printed below must match row by row)
cat("Verifier que les echantillons de counts sont dans le meme ordre que le target\n")
print(cbind(target=as.character(target[,1]),counts=colnames(counts)))

# message: "check that the identifiers in info and glength are the same as in the counts"
cat("Verifier que les identifiants dans info et glength sont les memes que dans les comptages\n")
checkInfoGlength(counts=counts, info=info, glength=glength)

###################################################
### code chunk number 3: description of raw data
###################################################
# figure: number of reads per sample
cat("\nFigure : nombre de reads par echantillon\n")
barplotTC(counts=counts, group=group, col=colors, out=outfile, versionName=versionName)

# figure: number of null counts per sample
cat("Figure : nombre de comptages nuls par echantillon\n")
barplotNul(counts=counts, group=group, col=colors, out=outfile, versionName=versionName)
# N = number of rows dropped by removeNul(); reported below as the number of
# genes with only null counts
N <- nrow(counts) - nrow(removeNul(counts))
cat("\nNombre de genes avec que des comptages nuls :", N,"\n")

# figure: density estimate of the counts of each sample
cat("\nFigure : estimation de la densite des comptages de chaque echantillon\n")
densityPlot(counts=counts, group=group, col=colors, out=outfile, versionName=versionName)

# figure + table: most represented sequences of each sample
# (the result is stored under the same name as the function that produced it)
cat("\nFigure + tableau : sequences majoritaires pour chaque echantillon\n")
majSequences <- majSequences(counts=counts, group=group, versionName=versionName, col=colors, out=outfile)

# message: "computing the SERE"
cat("\nCalcul des SERE\n")
print(sere <- pairwiseSERE(counts, versionName=versionName))

cat("\nFigure : pairwise scatterplots of samples\n")
pairwiseScatterPlots(counts=counts, group=group, out=outfile, versionName=versionName)

###################################################
### code chunk number 4: creating DESeqDataSet object, normalization and estimateDispersion
###################################################
# The design formula is "~ <batch> + <varInt>" when batch is set, "~ <varInt>" otherwise.
dds <- DESeqDataSetFromMatrix(countData=counts, colData=target, 
                              design=formula(paste("~", ifelse(!is.null(batch), paste(batch,"+"), ""), varInt)))
print(design(dds))

# message: "estimating the size factors"
# locfunc holds a function name as a string; eval(as.name(...)) turns it into the function itself.
cat("Estimation des size factors\n")
dds <- estimateSizeFactors(dds, locfunc=eval(as.name(locfunc)))
print(sf <- sizeFactors(dds))
# figure: diagnostic of the size factors
cat("\nFigure : diagnostic des size factors\n")
diagSizeFactors(dds=dds, group=group, col=colors, out=outfile, versionName=versionName)

# message: "computing the dispersions and plotting the mean-dispersion relationship"
cat("\nCalcul des dispersions et graph relation mean-dispersion\n")
dds <- estimateDispersions(dds, fitType=fitType)
plotDispEstimates(dds=dds, out=outfile, versionName=versionName)
# figure: diagnostic of the log-normality of the dispersions
cat("\nFigure : diagnostic de log-normalite des dispersions\n")
diagLogNormalityDisp(dds=dds, out=outfile, versionName=versionName)

###################################################
### code chunk number 5: boxplots before and after normalization
###################################################
# figure: boxplots of raw and normalized counts
cat("Figure : boxplots sur comptages bruts et normalises\n")
boxplotCounts(counts=counts(dds), group=group, col=colors, out=outfile, versionName=versionName)
boxplotCounts(counts=counts(dds, normalized=TRUE), group=group, col=colors, type="norm", out=outfile, versionName=versionName)
131 | 
###################################################
### code chunk number 6: clustering + PCA of samples
###################################################
# figure: dendrogram of the clustering on transformed counts
cat("Figure : dendrogramme de la classification sur comptages transformes\n")
# counts.trans is reused later by the heatmap and the PF2heatmaps export, so
# an unsupported type.trans is rejected here with an explicit message rather
# than surfacing afterwards as "object 'counts.trans' not found".
if (type.trans == "VST") {
  counts.trans <- assay(varianceStabilizingTransformation(dds))
} else if (type.trans == "rlog") {
  counts.trans <- assay(rlogTransformation(dds))
} else {
  stop("type.trans must be \"VST\" or \"rlog\", got: ", type.trans)
}
clusterPlot(counts=counts.trans, out=outfile, versionName=versionName)

# figure: first factorial plane of the PCA on transformed counts
cat("Figure : premier plan de l'ACP sur les comptages transformes\n")
PCAPlot(dds=dds, group=group, type.trans=type.trans, col=colors, out=outfile, versionName=versionName)
142 | 
###################################################
### code chunk number 7: differential analysis
###################################################
# message: "statistical tests"
cat("Tests statistiques\n")
dds <- nbinomWaldTest(dds)
results <- list()
# Loop over every pair (i, j), i < j, of level indices of the factor of
# interest. Level i is used as reference and level j as test; each result is
# stored under the name "<test>_vs_<ref>".
for (comp in combn(nlevels(colData(dds)[,varInt]), 2, simplify=FALSE)){
  # when allComp is FALSE, keep only the pairs whose reference is the first level
  # (presumably condRef, set by loadTargetFile -- to be confirmed)
  if (!allComp & comp[1]!=1) next
  levelRef <- levels(colData(dds)[,varInt])[comp[1]]
  levelTest <- levels(colData(dds)[,varInt])[comp[2]]
  # cooksCutoff: the user value when set, TRUE otherwise
  results[[paste0(levelTest,"_vs_",levelRef)]] <- results(dds, contrast=c(varInt, levelTest, levelRef), pAdjustMethod=adjMethod, 
                                                          cooksCutoff=ifelse(!is.null(cooksCutoff), cooksCutoff, TRUE),
                                                          independentFiltering=independentFiltering, alpha=alpha)
  cat(paste0("Comparison ", levelTest, " vs ", levelRef, "\n"))
}

###################################################
### code chunk number 8: results of the independent filtering
###################################################
if(independentFiltering){
  # table: independent filtering
  cat("Tableau : independent filtering\n")
  print(tabIndepFiltering <- tabIndepFiltering(results, versionName=versionName), quote=FALSE)
}

###################################################
### code chunk number 9: export tables
###################################################
# message: "exporting the results"
cat("Export des resultats\n")
complete <- exportComplete.DESeq2(dds=dds, results=results, alpha=alpha, group=group[,1], 
                                  cooksCutoff=cooksCutoff, conds=conds, versionName=versionName,
                                  info=info, export=TRUE)

# message: "number of up, down and total genes per comparison"
cat("# genes up, down et total par comparaison\n")
print(nDiffTotal <- nDiffTotal(complete, alpha=alpha, versionName=versionName), quote=FALSE)

# figure: number of differentially expressed genes as a function of the FDR threshold
cat("Figure : nb de genes DE selon seuil FDR\n")
nbDiffSeuil(complete=complete, out=outfile, versionName=versionName)

# the gene-length steps run only when a gene length file was provided
if (!is.null(geneLengthFile)){
  # export: counts normalized by gene length
  cat("Export : comptages normalises par la longueur des genes\n")
  normGeneLength(counts=counts(dds, normalized=TRUE), glength=glength, versionName=versionName)
  geneLengthEffect(counts, complete, glength, out=outfile, versionName=versionName)
}

###################################################
### code chunk number 10: distribution of raw p-values and MA-plot
###################################################
# figure: distribution of the log2(fold-changes)
cat("Figure : distribution des log2(Fold-Changes)\n")
diagLogFC(complete=complete, out=outfile, versionName=versionName)

# figure: histogram of the raw p-values
cat("Figure : histogramme des p-valeurs brutes\n")
histoRawp(complete=complete, out=outfile, versionName=versionName)

cat("\nFigure : MA-plot\n")
MAplotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName)

cat("\nFigure : volcano-plot\n")
volcanoPlotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName)

# The Venn diagram is disabled: this causes trouble quite often
#cat("\nFigure : Venn diagram\n")
#vennDiagramDE(complete=complete, alpha=alpha, out=outfile, versionName=versionName)

cat("\nFigure : heatmap\n")
heatmapDE(counts.trans=counts.trans, complete=complete, alpha=alpha, out=outfile, 
          key.xlab=paste0(type.trans, "-centered data"), versionName=versionName)

# NOTE: the message below is printed even when interestingFeatures is NULL and
# no plot is produced.
cat("\nFigure : interesting features\n")
if (!is.null(interestingFeatures)){
  # expression of the selected features, as log2(normalized counts + 1)
  plotEvolution(mat=log2(counts(dds,normalized=TRUE)+1), features=interestingFeatures,
                target=target, varInt1=varInt, colors=colors, ylab=expression(log[2] ~ norm ~ counts + 1),
                out=outfile, versionName=versionName)
}
216 | 
###################################################
### code chunk number 11: sessionInfo and saving
###################################################
# message: "saving the results"
cat("Sauvegarde des resultats\n")
# session and package versions are stored as objects so that they are part of
# the workspace image saved just below
sessionInfo <- sessionInfo()
pckVersionRNADiff <- packageVersion("RNADiff")
pckVersionDESeq2 <- packageVersion("DESeq2")
# full workspace image: <versionName>.RData
save.image(file=paste0(versionName, ".RData"))
# export RData for PF2heatmaps
# (the results are converted to plain data frames first; this happens after
# save.image(), so the full image keeps the original result objects)
results <- lapply(results, as.data.frame)
pf2heatmaps_objects <- c("varInt", "target", "type.trans", "counts.trans", "results", "info")
save(list=pf2heatmaps_objects, file=paste0(versionName, "_PF2heatmaps.RData"), version=2)
229 | # export RData for PF2toolsFilter
# Reduce one comparison table to the columns exported for PF2toolsFilter.
#   comp: data frame holding the complete results of one comparison
#   info: annotation data frame, or NULL when no annotation was loaded
# Without annotation the columns Id, baseMean, log2FoldChange and padj are
# returned. With annotation, the first ncol(info) columns of comp are kept,
# followed by whichever of baseMean, log2FoldChange and padj comp contains.
extract_col <- function(comp, info=NULL){
  stat_cols <- c("baseMean", "log2FoldChange", "padj")
  if (!is.null(info)){
    annot_idx <- 1:ncol(info)
    stat_idx <- which(names(comp) %in% stat_cols)
    return(comp[, c(annot_idx, stat_idx)])
  }
  comp[, c("Id", stat_cols)]
}
# keep only the PF2toolsFilter columns of every comparison, then save them
complete <- lapply(complete, extract_col, info=info)
save(complete, file=paste0(versionName, "_PF2toolsFilter.RData"), version=2)

###################################################
### code chunk number 12: knitr compilation
###################################################
# The report needs the exported figures, so nothing is generated when outfile is FALSE.
if (!outfile){
  cat("outfile is FALSE: report and slides cannot be generated\n")
} else{
  # message: "creating the report and the slides"
  cat("Creation du rapport et des slides\n")
  # knit the templates shipped with RNADiff into report-<versionName>.tex and slides-<versionName>.tex
  knit(system.file("report1factor.Rnw", package="RNADiff"), paste0("report-", versionName, ".tex"), quiet=TRUE)
  knit(system.file("slides1factor.Rnw", package="RNADiff"), paste0("slides-", versionName, ".tex"), quiet=TRUE)
  # message: "compiling the report"
  # pdflatex, bibtex, then pdflatex twice more. Only the report is compiled
  # here; the slides .tex file is produced but not compiled.
  cat("Compilation du rapport\n")
  system(paste0("pdflatex report-", versionName, ".tex"))
  system(paste0("bibtex report-", versionName, ".aux"))
  system(paste0("pdflatex report-", versionName, ".tex"))
  system(paste0("pdflatex report-", versionName, ".tex"))
}
255 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/main.py:
--------------------------------------------------------------------------------
  1 | #
  2 | #  This file is part of Sequana software
  3 | #
  4 | #  Copyright (c) 2016-2021 - Sequana Development Team
  5 | #
  6 | #  File author(s):
  7 | #      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
  8 | #
  9 | #  Distributed under the terms of the 3-clause BSD license.
 10 | #  The full license is in the LICENSE file, distributed with this software.
 11 | #
 12 | #  website: https://github.com/sequana/sequana
 13 | #  documentation: http://sequana.readthedocs.io
 14 | #
 15 | ##############################################################################
 16 | import os
 17 | import shutil
 18 | import subprocess
 19 | import sys
 20 | 
 21 | import click_completion
 22 | import rich_click as click
 23 | from sequana_pipetools import SequanaManager
 24 | from sequana_pipetools.options import *
 25 | 
 26 | click_completion.init()
 27 | NAME = "rnaseq"
 28 | 
 29 | 
 30 | help = init_click(
 31 |     NAME,
 32 |     groups={
 33 |         "Pipeline Specific": [
 34 |             "--aligner-choice",
 35 |             "--contaminant-file",
 36 |             "--do-igvtools",
 37 |             "--do-bam-coverage",
 38 |             "--do-mark-duplicates",
 39 |             "--do-rnaseqc",
 40 |             "--do-rseqc",
 41 |             "--genome-directory",
 42 |             "--rnaseqc-gtf-file",
 43 |             "--rRNA-feature",
 44 |             "--rseqc-bed-file",
 45 |             "--skip-rRNA",
 46 |             "--skip-gff-check",
 47 |             "--trimming-quality",
 48 |         ],
 49 |     },
 50 | )
 51 | 
 52 | 
 53 | @click.command(context_settings=help)
 54 | @include_options_from(ClickInputOptions)
 55 | @include_options_from(ClickSnakemakeOptions, working_directory=NAME)
 56 | @include_options_from(ClickSlurmOptions)
 57 | @include_options_from(ClickGeneralOptions)
 58 | @include_options_from(ClickTrimmingOptions)
 59 | @include_options_from(ClickFeatureCountsOptions)
 60 | @click.option(
 61 |     "--genome-directory",
 62 |     "genome_directory",
 63 |     show_default=True,
 64 |     type=click.Path(dir_okay=True, file_okay=False),
 65 |     required=True,
 66 | )
 67 | @click.option(
 68 |     "--aligner-choice",
 69 |     "aligner",
 70 |     required=True,
 71 |     type=click.Choice(["bowtie2", "bowtie1", "star", "salmon"]),
 72 |     help="a mapper in bowtie, bowtie2, star",
 73 | )
 74 | @click.option(
 75 |     "--rRNA-feature",
 76 |     "rRNA",
 77 |     default="rRNA",
 78 |     help="""Feature name corresponding to the rRNA to be identified in
 79 | the input GFF/GTF files. Must exist and be valid. If you do not have any,
 80 | you may skip this step using --skip-rRNA or provide a fasta file using --contaminant-file""",
 81 | )
 82 | @click.option(
 83 |     "--skip-rRNA",
 84 |     "skip_rRNA",
 85 |     is_flag=True,
 86 |     help="""skip the mapping on rRNA feature. ignored if --contaminant-file is provided""",
 87 | )
 88 | @click.option(
 89 |     "--contaminant-file",
 90 |     default=None,
 91 |     show_default=True,
 92 |     help="""A fasta file. If used, the rRNA-feature is not used
 93 | This option is useful if you have a dedicated list of rRNA feature or a dedicated
 94 | fasta file to search for contaminants""",
 95 | )
 96 | @click.option(
 97 |     "--skip-gff-check",
 98 |     is_flag=True,
 99 |     default=False,
100 |     show_default=True,
101 |     help="""By default we check the coherence between the input
102 | GFF file and related options (e.g. --feature_counts_feature_type and
103 | --feature_counts_attribute options). This may take time e.g. for mouse or human.
104 | Using this option skips the sanity checks""",
105 | )
106 | @click.option(
107 |     "--do-igvtools",
108 |     is_flag=True,
109 |     help="""if set, this will compute TDF files that can be imported in
110 | IGV browser. TDF file allows to quickly visualise the coverage of the mapped
111 | reads.""",
112 | )
113 | @click.option(
114 |     "--do-bam-coverage",
115 |     is_flag=True,
116 |     help="Similar to --do-igvtools using bigwig",
117 | )
118 | @click.option(
119 |     "--do-mark-duplicates",
120 |     is_flag=True,
121 |     help="""Mark duplicates. To be used e.g. with QCs""",
122 | )
123 | @click.option("--do-rnaseqc", is_flag=True, help="do RNA-seq QC using RNAseQC v2")
124 | @click.option(
125 |     "--rnaseqc-gtf-file",
126 |     help="""The GTF file to be used for RNAseQC. Without a valid GTF,
127 |     RNAseqQC will not work. You may try sequana gff-to-gtf application.""",
128 | )
129 | @click.option(
130 |     "--do-rseqc",
131 |     is_flag=True,
132 |     help="""do RNA-seq QC using RseQC. This will need a BED file
133 | corresponding to your GFF file. For prokaryotes, the BED file is created on the
134 | fly.""",
135 | )
136 | @click.option("--rseqc-bed-file", help="""The rseQC input bed file.""")
137 | def main(**options):
138 | 
139 |     if options["from_project"]:
140 |         click.echo("--from-project Not yet implemented")
141 |         sys.exit(1)
142 |     # the real stuff is here
143 |     manager = SequanaManager(options, NAME)
144 |     manager.setup()
145 | 
146 |     # aliases
147 |     options = manager.options
148 |     cfg = manager.config.config
149 | 
150 |     from sequana_pipetools import logger
151 | 
152 |     logger.setLevel(options.level)
153 | 
154 |     manager.fill_data_options()
155 |     # --------------------------------------------------------- general
156 |     cfg.general.genome_directory = os.path.abspath(options.genome_directory)
157 |     cfg.general.aligner = options.aligner
158 | 
159 |     # genome name = cfg.genome.genome_directory
160 |     genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
161 |     prefix = cfg.general.genome_directory
162 |     fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
163 |     if os.path.exists(fasta) is False:
164 |         logger.critical(
165 |             """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory.""".format(
166 |                 fasta
167 |             )
168 |         )
169 |         sys.exit()
170 | 
171 |     # mutually exclusive options
172 |     if options.contaminant_file:
173 |         cfg.general.contaminant_file = os.path.abspath(options.contaminant_file)
174 |         logger.warning("You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored")
175 |         cfg.general.rRNA_feature = None
176 |     elif options.skip_rRNA:
177 |         cfg.general.rRNA_feature = None
178 |     else:
179 |         cfg.general.rRNA_feature = options.rRNA
180 | 
181 |     # --------------------------------------------------------- trimming
182 |     cfg.trimming.software_choice = options.trimming_software_choice
183 |     cfg.trimming.do = not options.disable_trimming
184 |     qual = options.trimming_quality
185 | 
186 |     if options.trimming_software_choice in ["cutadapt", "atropos"]:
187 |         cfg.cutadapt.tool_choice = options.trimming_software_choice
188 |         cfg.cutadapt.fwd = options.trimming_adapter_read1
189 |         cfg.cutadapt.rev = options.trimming_adapter_read2
190 |         cfg.cutadapt.m = options.trimming_minimum_length
191 |         cfg.cutadapt.mode = options.trimming_cutadapt_mode
192 |         cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
193 |         cfg.cutadapt.quality = 30 if qual == -1 else qual
194 |     else:
195 |         cfg.fastp.minimum_length = options.trimming_minimum_length
196 |         cfg.fastp.quality = 15 if qual == -1 else qual
197 |         cfg.fastp.adapters = ""
198 |         if options.trimming_adapter_read1:
199 |             cfg.fastp.adapters += f"--adapter_sequence {options.trimming_adapter_read1}"
200 |         if options.trimming_adapter_read2:
201 |             cfg.fastp.adapters += f"--adapter_sequence_r2 {options.trimming_adapter_read2}"
202 | 
203 |         cfg.fastp.options = " --cut_tail "
204 |         cfg.fastp.disable_quality_filtering = False
205 |         cfg.fastp.disable_adapter_trimming = False
206 | 
207 |     # ----------------------------------------------------- feature counts
208 |     cfg.feature_counts.options = options.feature_counts_options
209 |     cfg.feature_counts.strandness = options.feature_counts_strandness
210 |     cfg.feature_counts.attribute = options.feature_counts_attribute
211 |     cfg.feature_counts.feature = options.feature_counts_feature_type
212 |     cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes
213 | 
214 |     # ------------------------------------------------------ optional
215 |     cfg.igvtools.do = options.do_igvtools
216 |     cfg.bam_coverage.do = options.do_bam_coverage
217 |     cfg.mark_duplicates.do = False
218 |     if options.do_mark_duplicates:
219 |         cfg.mark_duplicates.do = True
220 | 
221 |     # -------------------------------------------------------- RNAseqQC
222 |     cfg.rnaseqc.do = options.do_rnaseqc
223 | 
224 |     if options.do_rnaseqc:
225 |         if options.rnaseqc_gtf_file is None:
226 |             logger.info(
227 |                 "You asked for RNA_seqc QC assessements but no GTF"
228 |                 " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
229 |                 " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
230 |                 " the gtf file"
231 |             )
232 |             cfg.rnaseqc.do = False
233 |         if options.aligner in ["salmon"]:
234 |             logger.info(
235 |                 "WARNING"
236 |                 "You asked for RNA_seqc QC assessements but no"
237 |                 " BAM will be generated by the salmon aligner. Switching off this option. "
238 |             )
239 |             cfg.rnaseqc.do = False
240 | 
241 |     cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file
242 | 
243 |     cfg.rseqc.do = options.do_rseqc
244 |     cfg.rseqc.bed_file = options.rseqc_bed_file
245 | 
246 |     # -------------------------------------------------------- RNAdiff
247 | 
248 |     # import sequana_pipelines.rnaseq
249 | 
250 |     # SANITY CHECKS
251 |     # -------------------------------------- do we find rRNA feature in the GFF ?
252 |     # if we do not build a custom feature_counts set of options, no need to
253 |     # check carefully the GFF; if users knows what he is doing; no need to
254 |     # check the GFF either
255 |     if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
256 |         logger.info("Checking your input GFF file and rRNA feature if provided")
257 | 
258 |         from sequana.gff3 import GFF3
259 | 
260 |         genome_directory = os.path.abspath(cfg.general.genome_directory)
261 |         genome_name = genome_directory.rsplit("/", 1)[1]
262 |         prefix_name = genome_directory + "/" + genome_name
263 |         gff_file = prefix_name + ".gff"
264 | 
265 |         gff = GFF3(gff_file)
266 |         df_gff = gff.df  # This takes one minute on eukaryotes. No need to
267 |         valid_features = gff.features  # about 3 seconds
268 |         valid_attributes = gff.get_attributes()  # about 10 seconds
269 | 
270 |         # first check the rRNA feature
271 |         if cfg["general"]["rRNA_feature"] and cfg["general"]["rRNA_feature"] not in valid_features:
272 | 
273 |             logger.error(
274 |                 "rRNA feature not found in the input GFF ({})".format(gff_file)
275 |                 + " This is probably an error. Please check the GFF content and /or"
276 |                 " change the feature name with --rRNA-feature based on the content"
277 |                 " of your GFF. Valid features are: {}".format(valid_features)
278 |             )
279 |             sys.exit()
280 | 
281 |         # then, check the main feature
282 |         fc_type = cfg.feature_counts.feature
283 |         fc_attr = cfg.feature_counts.attribute
284 | 
285 |         logger.info("Checking your input GFF file and feature counts options.")
286 |         logger.info(f"You chose '{fc_type}' feature and '{fc_attr}' attribute")
287 |         # if only one feature (99% of the projet)
288 |         if "," not in fc_type:
289 |             fc_types = [fc_type]
290 | 
291 |         for fc_type in fc_types:
292 |             S = sum(df_gff["genetic_type"] == fc_type)
293 |             if S == 0:
294 |                 logger.error(
295 |                     "Found 0 entries for feature '{}'. Please choose a valid feature from: {}".format(
296 |                         fc_type, valid_features
297 |                     )
298 |                 )
299 |                 sys.exit()
300 |             else:
301 |                 logger.info(f"Found {S} '{fc_type}' entries")
302 | 
303 |             # now we check the attribute:
304 |             dd = df_gff.query("genetic_type==@fc_type")
305 |             attributes = [y for x in dd.attributes for y in x.keys()]
306 |             S = attributes.count(fc_attr)
307 |             if S == 0:
308 |                 uniq_attributes = set(attributes)
309 |                 logger.error(
310 |                     f"Found 0 entries for attribute '{fc_attr}'. Please choose a valid attribute from: {uniq_attributes}"
311 |                 )
312 |                 sys.exit()
313 |             else:
314 |                 unique = set([x[fc_attr] for k, x in dd.attributes.items() if fc_attr in x])
315 |                 logger.info(f"Found {S} '{fc_attr}' entries for the attribute [{len(unique)} unique entries]")
316 | 
317 |             if S != len(unique):
318 |                 logger.warning("Attribute non-unique. Feature counts should handle it")
319 | 
320 |             if options.feature_counts_extra_attributes:
321 |                 for extra_attr in cfg.feature_counts.extra_attributes.split(","):
322 |                     if extra_attr not in set(attributes):
323 |                         logger.error("{extra_attr} not found in the GFF attributes. Try one of {set(attributes)}")
324 |                         sys.exit()
325 | 
326 |     # need to move the custom file into the working directoty
327 |     if "," in cfg.feature_counts.feature:
328 |         logger.info("Building a custom GFF file (custom.gff) using Sequana. Please wait")
329 |         genome_directory = os.path.abspath(cfg.general.genome_directory)
330 |         genome_name = genome_directory.rsplit("/", 1)[1]
331 |         prefix_name = genome_directory + "/" + genome_name
332 | 
333 |         from sequana import GFF3
334 | 
335 |         gff = GFF3(prefix_name + ".gff")
336 |         fc_types = cfg.feature_counts.feature.strip().split(",")
337 |         gff.save_gff_filtered(features=fc_types, filename="custom.gff")
338 |         cfg.general.custom_gff = "custom.gff"
339 | 
340 |     # finalise the command and save it; copy the snakemake. update the config
341 |     # file and save it.
342 |     manager.teardown()
343 | 
344 |     try:  # option added in latest version
345 |         if cfg.general.custom_gff:
346 |             shutil.copy(cfg.general.custom_gff, options.workdir)
347 |     except:
348 |         pass
349 | 
350 | 
351 | if __name__ == "__main__":  # script entry point; skipped when the module is imported
352 |     main()
353 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/data/rnadiff_GLM.R:
--------------------------------------------------------------------------------
  1 | ###################################################
  2 | ### DESeq2_GLM parameters: to be modified by the user
  3 | ###################################################
  4 | rm(list=ls())                                        # remove all the objects of the R session
  5 | 
  6 | workspace <- "."                                     # workspace for the R session
  7 | 
  8 | projectName <- "BXXXX"                               # name of the project (cannot contain any ".")
  9 | analysisVersion <- "vN"                              # name of the analysis version (cannot contain any ".")
 10 | 
 11 | author <- "FILLME (Biomics platform - Institut Pasteur)"   # author of the statistical report
 12 | researcher <- "FILLME"                               #  name of the researcher
 13 | chief <- ""                                          # name of the head of unit
 14 | 
 15 | varInt1 <- "varInt1"                                 # first factor of interest
 16 | varInt2 <- "varInt2"                                 # second factor of interest
 17 | condRef1 <- "condRef1"         			             # reference biological condition for varInt1
 18 | condRef2 <- "condRef2"                               # reference biological condition for varInt2
 19 | design <- ~ varInt1 + varInt2 + varInt1:varInt2      # design of the statistical model (both factors plus their interaction)
 20 | 
 21 | outfile <- TRUE                                      # TRUE to export figures, FALSE to display them in R
 22 | colors <- c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", # vector of colors of each group on the plots
 23 |             "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5", 
 24 |             "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", 
 25 |             "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26")
 26 | 
 27 | cooksCutoff <- NULL                                  # outliers detection threshold (NULL to leave DESeq2 choosing it, Inf to keep outliers)
 28 | independentFiltering <- TRUE                         # FALSE to turn off the independent filtering (default is TRUE)
 29 | alpha <- 0.05                                        # threshold of statistical significance
 30 | adjMethod <- "BH"                                    # p-value adjustment method: "BH" (default) or "BY"
 31 | 
 32 | type.trans <- "VST"                                  # transformation for exploratory analysis: "VST" or "rlog" (if size factors vary very widely)
 33 | locfunc <- "median"                                  # "median" (default) or "shorth" with library(genefilter) (to estimate the size factors)
 34 | interestingFeatures <- NULL                          # vector of features for which to plot the expression
 35 | featuresToRemove <- c("alignment_not_unique",        # names of the features to be removed (default is the HTSeq-count specific lines)
 36 |                       "ambiguous", "no_feature",
 37 |                       "not_aligned", "too_low_aQual") 
 38 | 
 39 | fitType <- "parametric"				                 # mean-variance relationship: "parametric" (default) or "local"
 40 | 
 41 | #####################################
 42 | # INPUT FILES
 43 | #####################################
 44 | geneLengthFile <- "input_gene_lengths.tsv"      # path to the gene lengths file (default is NULL, which skips the gene-length steps)
 45 | targetFile <- "target.txt"                      # path to the design/target file
 46 | infoFile <- "input_info.tsv"                    # path to the annotation file (needed if 0 counts not in counts files)
 47 | rawDir <- "feature_counts"                   # path to the directory containing raw counts files
 48 | 
 49 | ###################################################
 50 | ### code chunk number 1: building the remaining parameters and loading packages
 51 | ###################################################
 52 | setwd(workspace)
 53 | library(RNADiff)
 54 | library(knitr)
 55 | if (locfunc=="shorth") library(genefilter)
 56 | 
 57 | versionName <- paste(projectName, analysisVersion, sep="-")
 58 | ncol <- NULL                                         # width of the tables in the report
 59 | 
 60 | cat("Creation des dossiers d'exports\n") 
 61 | dir.create("figures", showWarnings=FALSE)
 62 | dir.create("tables", showWarnings=FALSE)            
 63 | 
 64 | ###################################################
 65 | ### code chunk number 2: loadData
 66 | ###################################################
 67 | cat("Chargement des annotations et longueurs des genes si besoin\n")
 68 | if (!is.null(infoFile)) print(head(info <- read.delim(infoFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else info <- NULL
 69 | if (!is.null(geneLengthFile)) print(head(glength <- read.table(geneLengthFile, sep="\t", header=TRUE, stringsAsFactors=FALSE))) else glength <- NULL
 70 | 
 71 | cat("Chargement du target file\n")
 72 | print(target <- loadTargetFile(targetFile, varInt=c(varInt1,varInt2), condRef=c(condRef1,condRef2)))
 73 | 
 74 | cat("Chargement des donnees\n")
 75 | counts <- loadCountData(target, rawDir=rawDir, versionName=versionName, featuresToRemove=featuresToRemove)
 76 | 
 77 | cat("Verifier que les echantillons de counts sont dans le meme ordre que le target\n")
 78 | print(cbind(target=as.character(target[,1]),counts=colnames(counts)))
 79 | 
 80 | cat("Verifier que les identifiants dans info et glength sont les memes que dans les comptages\n")
 81 | checkInfoGlength(counts=counts, info=info, glength=glength)
 82 | 
 83 | ####################################################
 84 | #### code chunk number 3: description of raw data
 85 | ####################################################
 86 | cat("\nFigure : nombre de reads par echantillon\n")
 87 | barplotTC(counts=counts, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName)
 88 | 
 89 | cat("Figure : nombre de comptages nuls par echantillon\n")
 90 | barplotNul(counts=counts, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName)
 91 | N <- nrow(counts) - nrow(removeNul(counts))
 92 | cat("\nNombre de genes avec que des comptages nuls :", N,"\n")
 93 | 
 94 | cat("\nFigure : estimation de la densite des comptages de chaque echantillon\n")
 95 | densityPlot(counts=counts, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName)
 96 | 
 97 | cat("\nFigure + tableau : sequences majoritaires pour chaque echantillon\n")
 98 | majSequences <- majSequences(counts=counts, group=target[,c(varInt1,varInt2)], versionName=versionName, col=colors, out=outfile)
 99 | 
100 | cat("\nCalcul des SERE\n")
101 | print(sere <- pairwiseSERE(counts, versionName=versionName))
102 | 
103 | cat("\nFigure : pairwise scatterplots of samples\n")
104 | pairwiseScatterPlots(counts=counts, group=target[,c(varInt1,varInt2)], out=outfile, versionName=versionName)
105 | 
106 | ###################################################
107 | ### code chunk number 4: creating DESeqDataSet object, normalization and estimateDispersion
108 | ###################################################
109 | dds <- DESeqDataSetFromMatrix(countData=counts, colData=target, design=design)
110 | print(design(dds))
111 | 
112 | cat("Estimation des size factors\n")
113 | dds <- estimateSizeFactors(dds, locfunc=eval(as.name(locfunc)))
114 | print(sf <- sizeFactors(dds))
115 | cat("\nFigure : diagnostic des size factors\n")
116 | diagSizeFactors(dds=dds, group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName)
117 | 
118 | cat("\nCalcul des dispersions et graph relation mean-dispersion\n")
119 | dds <- estimateDispersions(dds, fitType=fitType)
120 | plotDispEstimates(dds=dds, out=outfile, versionName=versionName)
121 | cat("\nFigure : diagnostic de log-normalite des dispersions\n")
122 | diagLogNormalityDisp(dds=dds, out=outfile, versionName=versionName)
123 | 
124 | ####################################################
125 | ### code chunk number 5: boxplots before and after normalisation
126 | ####################################################
127 | cat("Figure : boxplots sur comptages bruts et normalises\n")
128 | boxplotCounts(counts=counts(dds), group=target[,c(varInt1,varInt2)], col=colors, out=outfile, versionName=versionName)
129 | boxplotCounts(counts=counts(dds, normalized=TRUE), group=target[,c(varInt1,varInt2)], col=colors, type="norm", out=outfile, versionName=versionName)
130 | 
131 | ###################################################
132 | ### code chunk number 6: clustering + PCA of samples
133 | ###################################################
134 | cat("Figure : dendrogramme de la classification sur comptages transformes\n")
135 | if (type.trans == "VST") counts.trans <- assay(varianceStabilizingTransformation(dds))
136 | if (type.trans == "rlog") counts.trans <- assay(rlogTransformation(dds))
137 | clusterPlot(counts=counts.trans, out=outfile, versionName=versionName)
138 | 
139 | cat("Figure : premier plan de l'ACP sur les comptages transformes\n")
140 | PCAPlot(dds=dds, group=target[,c(varInt1,varInt2)], col=colors, type.trans=type.trans, out=outfile, versionName=versionName)
141 | 
142 | ###################################################
143 | ### code chunk number 7: differential analysis
144 | ###################################################
145 | cat("Tests statistiques\n")
146 | dds <- nbinomWaldTest(dds)
147 | 
148 | resultsNames(dds)
149 | #  [1] "Intercept"                 "soucheSEG"                 "soucheB6"                 
150 | #  [4] "infectionNI"               "infectionImoins"           "infectionIplus"           
151 | #  [7] "soucheSEG.infectionNI"     "soucheB6.infectionNI"      "soucheSEG.infectionImoins"
152 | # [10] "soucheB6.infectionImoins"  "soucheSEG.infectionIplus"  "soucheB6.infectionIplus" 
153 | 
154 | to_test <- list("B6-NI_vs_SEG-NI"=c(0,-1,1,0,0,0,-1,1,0,0,0,0),  # example contrasts: 12 weights each, matching the 12 coefficients in the resultsNames(dds) listing above
155 |                 "B6-Imoins_vs_SEG-Imoins"=c(0,-1,1,0,0,0,0,0,-1,1,0,0),  # NOTE(review): adapt names and weights to the resultsNames(dds) of your own design
156 |                 "(SEG-Iplus_vs_SEG-Imoins)_vs_(B6-Iplus_vs_B6-Imoins)"=c(0,0,0,0,0,0,0,0,-1,1,1,-1))
157 | 
158 | checkContrasts(coefs=resultsNames(dds),contrasts=to_test,versionName=versionName)
159 | 				
160 | results <- vector("list",length(to_test)); names(results) <- names(to_test);  # one named (initially NULL) slot per contrast
161 | for (name in names(to_test)){
162 |   results[[name]] <- results(dds, contrast=to_test[[name]], pAdjustMethod=adjMethod,  # the list 'results' does not hide the function: R skips non-function objects when resolving a call
163 |                              cooksCutoff=ifelse(!is.null(cooksCutoff), cooksCutoff, TRUE),  # NULL is passed as TRUE; presumably DESeq2 then uses its default cutoff - TODO confirm
164 |                              independentFiltering=independentFiltering, alpha=alpha)
165 | }
166 | 
167 | ###################################################
168 | ### code chunk number 8: results of the independent filtering
169 | ###################################################
170 | if(independentFiltering){
171 |   cat("Tableau : independent filtering\n")
172 |   print(tabIndepFiltering <- tabIndepFiltering(results, versionName=versionName), quote=FALSE)
173 | }
174 | 
175 | ###################################################
176 | ### code chunk number 9: export tables
177 | ###################################################
178 | cat("Export des resultats\n")
179 | complete <- exportComplete.DESeq2(dds=dds, results=results, alpha=alpha, cooksCutoff=cooksCutoff,
180 |                                   group=paste(target[,varInt1], target[,varInt2], sep="-"),
181 |                                   conds=unique(paste(target[,varInt1], target[,varInt2], sep="-")),
182 |                                   versionName=versionName, info=info, export=TRUE)
183 | 
184 | cat("# genes up, down et total par comparaison\n")
185 | print(nDiffTotal <- nDiffTotal(complete, alpha=alpha, versionName=versionName), quote=FALSE)
186 | 
187 | cat("Figure : nb de genes DE selon seuil FDR\n")
188 | nbDiffSeuil(complete=complete, out=outfile, versionName=versionName)
189 | 
190 | if (!is.null(geneLengthFile)){
191 |   cat("Export : comptages normalises par la longueur des genes\n")
192 |   normGeneLength(counts=counts(dds, normalized=TRUE), glength=glength, versionName=versionName)
193 |   geneLengthEffect(counts, complete, glength, out=outfile, versionName=versionName)
194 | }
195 | 
196 | ###################################################
197 | ### code chunk number 10: distribution of raw p-values and MA-plot
198 | ###################################################
199 | cat("Figure : distribution des log2(Fold-Changes)\n")
200 | diagLogFC(complete=complete, out=outfile, versionName=versionName)
201 | 
202 | cat("Figure : histogramme des p-valeurs brutes\n")
203 | histoRawp(complete=complete, out=outfile, versionName=versionName)
204 | 
205 | cat("\nFigure : MA-plot\n")
206 | MAplotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName)
207 | 
208 | cat("\nFigure : volcano-plot\n")
209 | volcanoPlotDE(complete=complete, pvalCutoff=alpha, out=outfile, versionName=versionName)
210 | 
211 | cat("\nFigure : Venn diagram\n")
212 | vennDiagramDE(complete=complete, alpha=alpha, out=outfile, versionName=versionName)
213 | 
214 | cat("\nFigure : heatmap\n")
215 | heatmapDE(counts.trans=counts.trans, complete=complete, alpha=alpha, out=outfile,
216 |           key.xlab=paste0(type.trans, "-centered data"), versionName=versionName)
217 | 
218 | cat("\nFigure : interesting features\n")
219 | if (!is.null(interestingFeatures)){
220 |   plotEvolution(mat=log2(counts(dds, normalized=TRUE)+1), features=interestingFeatures,
221 |                 target=target, varInt1=varInt2, varInt2=varInt1, colors=colors,
222 |                 ylab=expression(log[2] ~ norm ~ counts + 1), out=outfile, versionName=versionName)
223 | }
224 | 
225 | ###################################################
226 | ### code chunk number 11: sessionInfo and saving
227 | ###################################################
228 | cat("Sauvegarde des resultats\n")
229 | sessionInfo <- sessionInfo()
230 | pckVersionRNADiff <- packageVersion("RNADiff")
231 | pckVersionDESeq2 <- packageVersion("DESeq2")
232 | save.image(file=paste0(versionName, ".RData"))
233 | # export RData for PF2heatmaps
234 | results <- lapply(results, as.data.frame)
235 | pf2heatmaps_objects <- c("varInt1", "varInt2", "target", "type.trans", "counts.trans", "results", "info")
236 | save(list=pf2heatmaps_objects, file=paste0(versionName, "_PF2heatmaps.RData"), version=2)
237 | # export RData for PF2toolsFilter
238 | extract_col <- function(comp, info=NULL){  # reduce one comparison table to its identifier/annotation columns plus baseMean, log2FoldChange and padj
239 |   if (is.null(info)){  # no annotation table: select the four columns by name
240 |     comp[, c("Id","baseMean", "log2FoldChange","padj")]
241 |   } else{  # ncol(info) still calls the base function although the script sets 'ncol <- NULL' earlier
242 |     comp[, c(1:ncol(info), which(names(comp) %in%  c("baseMean", "log2FoldChange","padj")))]  # assumes the first ncol(info) columns of comp are the annotation columns - TODO confirm
243 |   }
244 | }
245 | complete <- lapply(complete, extract_col, info=info)
246 | save(complete, file=paste0(versionName, "_PF2toolsFilter.RData"), version=2)
247 | 
248 | ###################################################
249 | ### code chunk number 12: knitr compilation
250 | ###################################################
251 | if (!outfile){
252 |   cat("outfile is FALSE: report and slides cannot be generated\n")
253 | } else{
254 |   cat("Creation du rapport et des slides\n")
255 |   knit(system.file("reportGLM.Rnw", package="RNADiff"), paste0("report-", versionName, ".tex"), quiet=TRUE)
256 |   knit(system.file("slidesGLM.Rnw", package="RNADiff"), paste0("slides-", versionName, ".tex"), quiet=TRUE)
257 |   cat("Compilation du rapport\n")
258 |   system(paste0("pdflatex report-", versionName, ".tex"))
259 |   system(paste0("bibtex report-", versionName, ".aux"))
260 |   system(paste0("pdflatex report-", versionName, ".tex"))
261 |   system(paste0("pdflatex report-", versionName, ".tex"))
262 | }
263 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/config.yaml:
--------------------------------------------------------------------------------
  1 | # ============================================================================
  2 | # Config file for RNA-seq
  3 | #
  4 | # ==================[ Sections for the users ]================================
  5 | #
  6 | # One of input_directory, input_pattern and input_samples must be provided
  7 | # If input_directory provided, use it otherwise if input_pattern provided,
  8 | # use it, otherwise use input_samples.
  9 | # ============================================================================
 10 | sequana_wrappers: "v24.8.29"
 11 | 
 12 | input_directory:
 13 | input_readtag: _R[12]_
 14 | input_pattern: '*fastq.gz'
 15 | exclude_pattern:
 16 | 
 17 | # See sequana_pipetools.readthedocs.io for details about these 2 options
 18 | # common prefixes are removed. Additional prefixes may be removed here
 19 | #extra_prefixes_to_strip = []
 20 | # in special cases, sample names can be extracted with a pattern
 21 | #sample_pattern: '{sample}.fastq.gz'
 22 | apptainers:
 23 |   sequana_tools: "https://zenodo.org/record/7102074/files/sequana_tools_0.14.3.img"
 24 |   salmon: "https://zenodo.org/record/5708843/files/salmon_1.3.0.img"
 25 |   fastqc: "https://zenodo.org/record/7015004/files/fastqc_0.11.9-py3.img"
 26 |   fastp: "https://zenodo.org/record/7319782/files/fastp_0.23.2.img"
 27 |   igvtools: "https://zenodo.org/record/7022635/files/igvtools_2.12.0.img"
 28 |   graphviz: "https://zenodo.org/record/7928262/files/graphviz_7.0.5.img"
 29 |   multiqc: "https://zenodo.org/record/10205070/files/multiqc_1.16.0.img"
 30 |   rnaseqc: "https://zenodo.org/record/5799564/files/rnaseqc_2.35.0.img"
 31 | 
 32 | # =========================================== Sections for the users
 33 | 
 34 | #############################################################################
 35 | # Genome section:
 36 | #
 37 | # :Parameters:
 38 | #
 39 | # - aligner: one of bowtie2 (default), star, bowtie1 or salmon.
 40 | # - genome_directory: directory where all indexes are written.
 41 | # - contaminant_file: path to an existing fasta file for ribosomal RNA (to be found in
 42 | #   genome_directory)
 43 | # - rRNA_feature: if contaminant_file is not provided, ribosomal RNA will be extracted
 44 | #     from the GFF using this feature name. It must be present in the GFF.
 45 | general:
 46 |     aligner: bowtie2
 47 |     genome_directory:
 48 |     contaminant_file:
 49 |     rRNA_feature: rRNA
 50 |     custom_gff: ''
 51 | 
 52 | 
 53 | #################################################################
 54 | # FastQC section
 55 | #
 56 | # :Parameters:
 57 | #
 58 | # - options: string with any valid FastQC options
 59 | #
 60 | fastqc:
 61 |     skip_fastqc_raw: true
 62 |     options: --nogroup
 63 |     threads: 4
 64 |     resources:
 65 |         mem: 4G
 66 | 
 67 | #######################################################################
 68 | # Quality trimming and adapter removal
 69 | #
 70 | # for cutadapt, please fill the fwd and rev fields if required. It can be a
 71 | # string, or a filename. If a filename, it must be prefixed with "file:" to
 72 | # specify that it is a file and not a string. If the tool is cutadapt, the empty
 73 | # fwd and rev fields means that no adapters are to be used.
 74 | #
 75 | # :Parameters:
 76 | #
 77 | # - fwd: a string or file (prefixed with *file:*)
 78 | # - m: 20 means discard trimmed reads that are shorter than 20.
 79 | #         must be > 0
 80 | # - quality: 0 means no trimming, 30 means keep base with quality
 81 | #        above 30
 82 | # - mode: must be set to one of
 83 | #     - g for 5'
 84 | #     - a for 3'
 85 | #     - b for both 5'/3'
 86 | # - rev: a string or file (prefixed with *file:*)
 87 | # - tool: only cutadapt supported for now
 88 | # - threads: number of threads to use (atropos only)
 89 | # - options: See cutadapt documentation for details on
 90 | #            cutadapt.readthedocs.io. We change the default value
 91 | #            of -O to 6 (at least 6 bases are required to match before
 92 | #            trimming of an adapter)
 93 | #
 94 | # tool_choice__ = ["atropos", "cutadapt"]
 95 | #
 96 | # trim-n trims Ns at the end of the read
 97 | cutadapt:
 98 |     tool_choice: cutadapt
 99 |     fwd: ''
100 |     rev: ''
101 |     m: 20                       # {"strict_min": 0}
102 |     mode: b                     # {"values": ["b","g","a"]}
103 |     options: -O 6 --trim-n
104 |     quality: 30                 # {"range": [0,40]}
105 |     threads: 4
106 | 
107 | 
108 | #############################################################################
109 | # -Q should disable the quality filter
110 | #
111 | # Quality filtering only limits the N base number (-n, --n_base_limit) 
112 | # meaning if 5 Ns are found, the read is discarded, 
113 | # -q is the quality value, set to Q15, for a base to be qualified; if more than 40% of bases
114 | # are unqualified, the read is discarded.
115 | # You can also filter reads by average quality score using -e QUAL_score
116 | #
117 | # minimum length is set to 15 by default
118 | #
119 | # Adapter trimming is enabled by default. It can be disabled with -A
120 | # For adapters, this is automatic but you can be specific using 
121 | # --adapter_sequence for read1, and --adapter_sequence_r2 for read2.
122 | # The --cut_tail moves a sliding window from tail (3') to front, drop the bases 
123 | # in the window if its mean quality is below cut_mean_quality, stop otherwise. 
124 | # Use cut_tail_window_size to set the window size (default 4), and
125 | # cut_tail_mean_quality to set the mean quality threshold (default 20)
126 | # Other useful options: --disable_adapter_trimming and --disable_quality_filtering.
127 | # or -n 5 (minimum number of Ns required to discard a read)
128 | fastp:
129 |     options: ' --cut_tail '
130 |     minimum_length: 20
131 |     adapters: ''
132 |     quality: 15
133 |     threads: 4
134 |     disable_adapter_trimming: false
135 |     disable_quality_filtering: false
136 |     resources:
137 |         mem: 8G
138 | 
139 | #######################################################
140 | # Quality trimming software choice
141 | #
142 | # software_choice__ = ["atropos", "cutadapt", "fastp"]
143 | #
144 | trimming:
145 |     software_choice: fastp
146 |     do: true
147 | 
148 | #############################################################################
149 | # bowtie1_mapping_rna used to align reads against ribosomal RNA
150 | #
151 | # :Parameters:
152 | #
153 | # - do: if unchecked, this rule is ignored
154 | # - options: any options recognised by bowtie1 tool
155 | # - threads: number of threads to be used
156 | # - nreads: no need to analyse all data to estimate the ribosomal content. 
157 | #   analyse 100,000 reads by default. Set to -1 to ignore and analyse all data
158 | bowtie1_mapping_rna:
159 |     # remove in v1.20 and set automatically to on/off if rRNA/fasta provided
160 |     # do: true
161 |     options: ''
162 |     threads: 4
163 |     nreads: 100000
164 | 
165 | #############################################################################
166 | # star_mapping used to align reads against genome file
167 | #
168 | # :Parameters:
169 | #
170 | # - do: if unchecked, this rule is ignored
171 | # - options: any options recognised by rna-star tool. Set limitBAMsortRAM to 30G
172 | # - threads: number of threads to be used
173 | # - legacy: if set to True will use the old 2-pass version from STAR
174 | #      used in this pipeline until v0.15.3. If you want to use the
175 | #      2-pass mode available in star, you will need star 2.7 and above
176 | # 
177 | star_mapping:
178 |     options: " --limitBAMsortRAM 30000000000 --outFilterMismatchNoverLmax 0.05 --seedSearchStartLmax 20 "
179 |     legacy: True
180 |     threads: 4
181 |     resources:
182 |       mem: 32G
183 | 
184 | ##############################################################################
185 | # STAR indexing section
186 | #
187 | # :Parameters:
188 | #
189 | # - options: string with any valid STAR options
190 | star_index:
191 |     options:
192 |     threads: 4
193 |     resources:
194 |       mem: 4G
195 | #############################################################################
196 | # bowtie1_mapping_ref used to align reads against genome file
197 | #
198 | # :Parameters:
199 | #
200 | # - do: if unchecked, this rule is ignored
201 | # - options: any options recognised by bowtie1 tool
202 | # - threads: number of threads to be used
203 | #
204 | bowtie1_mapping_ref:
205 |     options: --chunkmbs 400 -m 1
206 |     threads: 4
207 | 
208 | #############################################################################
209 | # bowtie2_mapping used to align reads against genome file
210 | #
211 | # :Parameters:
212 | #
213 | # - do: if unchecked, this rule is ignored
214 | # - options: any options recognised by bowtie2 tool
215 | # - threads: number of threads to be used
216 | #
217 | bowtie2_mapping:
218 |     #options: "--dovetail --no-mixed --no-discordant " for paired-end data
219 |     options: ''
220 |     threads: 4
221 |     genome_size_larger_than_4gb: false
222 |     resources:
223 |       mem: 20G
224 | 
225 | bowtie2_index:
226 |     options: ''
227 |     threads: 4
228 |     resources:
229 |       mem: 20G
230 | 
231 | salmon_index:
232 |     threads: 2
233 |     options:
234 |     resources:
235 |         mem: 4G
236 | 
237 | salmon_mapping:
238 |     options: -l A
239 |     threads: 4
240 |     resources:
241 |         mem: 4G
242 | 
243 | #############################################################################
244 | # feature_counts used to count reads against features
245 | #
246 | # :Parameters:
247 | #
248 | # - do: if unchecked, this rule is ignored
249 | # - options: any options recognised by feature_counts tool except -s
250 | # - threads: number of threads to be used
251 | # - strandness: (optional) you should provide the strand parameters, given
252 | #      from the experimental design. If not provided, we will guess it (see
253 | #      tolerance parameter here below)
254 | # - tolerance: if strandness is not provided, we will guess it from
255 | #     the data. The metric used is between 0 and 1. It is a ratio between 
256 | #     strand + and -. If below tolerance, the strand is reversely stranded. If
257 | #     above 1-tolerance, it is (forward) stranded. If around 0.5 +- tolerance,
258 | #     it is unstranded. Otherwise, it means our guess would not be very
259 | #     reliable. A warning will be raised. Note also that if there is no
260 | #     consensus across samples, a warning/error may also be raised. tolerance
261 | #     is therefore in the range [0-0.25]
262 | # - feature: this is equivalent to the -t option to specify the feature type in GTF
263 | #     annotation. For example gene, exon (default). 
264 | # - attribute: this is the -g option to specify the attribute type in GTF annotation.
265 | #   (gene_id) by default.
266 | # - extra_attributes: any other attributes to store in addition to the main one
267 | #
268 | feature_counts:
269 |     do: true
270 |     options: ''      ## if exon/CDS is used, put -O option
271 |     strandness: ''   # set to 0, 1, 2 to force the type of strandness
272 |     threads: 1       # 
273 |     tolerance: 0.15  # use to figure out the strandness. no need to change
274 |     feature: gene    # could be exon, mRNA, etc
275 |     attribute: ID    # could be ID, gene_id, etc
276 |     extra_attributes:    # by default, stores only the main attribute, but could add more
277 | 
278 | #############################################################################
279 | # bamCoverage write file in bigwig format from BAM files.
280 | # This tool takes an alignment of reads or fragments as input (BAM file) and
281 | # generates a coverage track (bigWig or bedGraph) as output. The coverage is
282 | # calculated as the number of reads per bin, where bins are short consecutive
283 | # counting windows of a defined size. It is possible to extend the length of
284 | # the reads to better reflect the actual fragment length. *bamCoverage* offers
285 | # normalization by scaling factor, Reads Per Kilobase per Million mapped reads
286 | # (RPKM), and 1x depth (reads per genome coverage, RPGC).
287 | #
288 | # :Parameters:
289 | #
290 | # - do: if unchecked, this rule is ignored
291 | # - binSize: Size of the bins, in bases, for the output of the
292 | #            bigwig/bedgraph file. (default: 50)
293 | # - genomeSize: Report read coverage normalized to 1x sequencing depth
294 | #                        (also known as Reads Per Genomic Content (RPGC)).
295 | #                        Sequencing depth is defined as: (total number of
296 | #                        mapped reads * fragment length) / effective genome
297 | #                        size. The scaling factor used is the inverse of the
298 | #                        sequencing depth computed for the sample to match the
299 | #                        1x coverage. To use this option, the effective genome
300 | #                        size has to be indicated after the option. The
301 | #                        effective genome size is the portion of the genome
302 | #                        that is mappable.
303 | # - extendReads: This parameter allows the extension of reads to
304 | #                fragment size.
305 | # - minFragmentLength: The minimum fragment length needed for read/pair
306 | #                      inclusion. Note that a value other than 0 will exclude
307 | #                      all single-end reads.
308 | # - maxFragmentLength: The maximum fragment length needed for read/pair
309 | #                      inclusion. A value of 0 disables filtering and is
310 | #                      needed for including single-end and orphan reads.
311 | # - threads: number of threads to be used
312 | bam_coverage:
313 |     do: false
314 |     options: "--binSize 10 --effectiveGenomeSize 2150000000"
315 |     #extendReads: 65
316 |     #minFragmentLength: 0 #Note that a value other than 0 will exclude all single-end reads.
317 |     #maxFragmentLength: 0 #A value of 0 disables filtering and is needed for including single-end and orphan reads.
318 |     threads: 4
319 |     resources:
320 |         mem: 20G
321 | 
322 | 
323 | ###########################################################################
324 | # Creates a tdf files using igvtools
325 | #
326 | # :Parameters:
327 | #
328 | # - chromSize: path to index of reference genome obtain by samtools faidx
329 | igvtools:
330 |     do: false
331 |     # can be a link to the fasta file or an existing chrom.sizes file
332 |     # If none provided, will use the input fasta file
333 |     chrom_sizes_file: ''
334 | 
335 | 
336 | #############################################################################
337 | # mark_duplicates (picard-tools) allows to mark PCR duplicate in BAM files
338 | #
339 | # :Parameters:
340 | #
341 | # - do: if unchecked, this rule is ignored. Mandatory for RNA-SeQC tool.
342 | # - remove: If true do not write duplicates to the output file instead of writing them with
343 | #            appropriate flags set.  Default value: false. This option can be set to 'null' to clear
344 | #            the default value. Possible values: {true, false}
345 | # - tmpdir: write temporary files in this directory (default TMP_DIR=/tmp/, but could be "TMP_DIR=/local/scratch/")
346 | #
347 | mark_duplicates:
348 |     do: false
349 |     remove: false ## may be True
350 |     tmpdir: ./tmp/
351 |     threads: 4
352 |     resources:
353 |       mem: 34G
354 | 
355 | add_read_group:
356 |     options: 
357 | 
358 | #############################################################################
359 | # RNA-SeQC allows to compute a series of quality control metrics for RNA-seq data
360 | #
361 | # :Parameters:
362 | #
363 | # - do: if unchecked, this rule is ignored
364 | # - ref: Reference Genome in fasta format
365 | # - gtf: GTF File defining transcripts (must end in '.gtf')
366 | #        You can use the 'sequana gff-to-gtf input.gff' command
367 | # - options: any options recognised by RNA-seQC tool
368 | rnaseqc:
369 |     do: false
370 |     gtf_file:
371 |     options: --coverage
372 |     resources:
373 |         mem: 8G
374 | 
375 | 
376 | # if bed_file is not provided, try to create one on the fly
377 | # needs mark_duplicates
378 | rseqc:
379 |     do: false
380 |     bed_file:
381 | 
382 | 
383 | #############################################################################
384 | #   MultiQC aggregates results from bioinformatics analyses across many
385 | #   samples into a single report.
386 | #
387 | # :Parameters:
388 | #
389 | # - options: any options recognised by multiqc
390 | # - output-directory: Create report in the specified output directory
391 | # - config_file: by default, we use sequana RNA-seq multiqc_config file. 
392 | #       If you want your own multiqc, fill this entry
393 | multiqc:
394 |     options: -p -f -x *_init_*
395 |     modules: ''
396 |     input_directory: .
397 |     config_file: multiqc_config.yaml
398 |     resources:
399 |         mem: 8G
400 | 
401 | 
402 | 
403 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | .. image:: https://badge.fury.io/py/sequana-rnaseq.svg
  4 |      :target: https://pypi.python.org/pypi/sequana_rnaseq
  5 | 
  6 | .. image:: http://joss.theoj.org/papers/10.21105/joss.00352/status.svg
  7 |     :target: http://joss.theoj.org/papers/10.21105/joss.00352
  8 |     :alt: JOSS (journal of open source software) DOI
  9 | 
 10 | .. image:: https://github.com/sequana/rnaseq/actions/workflows/main.yml/badge.svg
 11 |    :target: https://github.com/sequana/rnaseq/actions/workflows/main.yml
 12 | 
 13 | 
 14 | 
 15 | This is the **RNA-seq** pipeline from the `Sequana <https://sequana.readthedocs.org>`_ project
 16 | 
 17 | :Overview: RNASeq analysis from raw data to feature counts
 18 | :Input: A set of Fastq Files and genome reference and annotation.
 19 | :Output: MultiQC and HTML reports, BAM and bigwig files, feature Counts, script to launch differential analysis
 20 | :Status: Production.
 21 | :Citation(sequana): Cokelaer et al, (2017), ‘Sequana’: a Set of Snakemake NGS pipelines, Journal of Open Source Software, 2(16), 352, JOSS DOI doi:10.21105/joss.00352
 22 | :Citation(pipeline):
 23 |     .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.4047837.svg
 24 |        :target: https://doi.org/10.5281/zenodo.4047837
 25 | 
 26 | Installation
 27 | ~~~~~~~~~~~~
 28 | 
 29 | **sequana_rnaseq** is based on Python3, just install the package as follows::
 30 | 
 31 |     pip install sequana_rnaseq --upgrade
 32 | 
 33 | You will need third-party software such as bowtie2/star. However, if you choose to use apptainer/singularity,
 34 | then there is nothing to install except singularity itself! See below for details.
 35 | 
 36 | 
 37 | Usage
 38 | ~~~~~
 39 | 
 40 | ::
 41 | 
 42 |     sequana_rnaseq --help
 43 |     sequana_rnaseq --input-directory DATAPATH --genome-directory genome --aligner-choice star
 44 | 
 45 | This creates a directory with the pipeline and configuration file. You will then need
 46 | to execute the pipeline::
 47 | 
 48 |     cd rnaseq
 49 |     sh rnaseq.sh  # for a local run
 50 | 
 51 | This launches a snakemake pipeline. If you are familiar with snakemake, you can
 52 | retrieve the pipeline itself and its configuration files and then execute the pipeline yourself with specific parameters::
 53 | 
 54 |     snakemake -s rnaseq.rules -c config.yaml --cores 4 --stats stats.txt
 55 | 
 56 | Or use `sequanix <https://sequana.readthedocs.io/en/main/sequanix.html>`_ interface.
 57 | 
 58 | 
 59 | Usage with apptainer:
 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~
 61 | 
 62 | With apptainer, initiate the working directory as follows::
 63 | 
 64 |     sequana_rnaseq --apptainer-prefix ~/.sequana/apptainers
 65 | 
 66 | Images are downloaded in the directory once for all; and then::
 67 | 
 68 |     cd rnaseq
 69 |     sh rnaseq.sh
 70 | 
 71 | if you decide to use snakemake manually, do not forget to add the apptainer-prefix options::
 72 | 
 73 |     snakemake -s rnaseq.rules -c config.yaml --cores 4 --apptainer-prefix ~/.sequana/apptainers --apptainer-args "-B /home:/home"
 74 | 
 75 | Usage on cluster with no internet access
 76 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 77 | 
 78 | We use wrappers that are hosted on github: https://github.com/cokelaer/sequana-wrappers/ . They are copied locally in your home. However if you wish, you can download the repository locally before using the pipeline. For example::
 79 | 
 80 |     export WRAPPERS=/home/user/Wrappers
 81 |     git clone git@github.com:sequana/sequana-wrappers.git $WRAPPERS
 82 | 
 83 | and define an environmental variable as follows (you should add it in your .profile or .bashrc for long term usage)::
 84 | 
 85 |     export SEQUANA_WRAPPERS=git+file://$WRAPPERS
 86 | 
 87 | Requirements
 88 | ~~~~~~~~~~~~
 89 | 
 90 | This pipeline requires many third-party executables. Here is a list that
 91 | may change. A message will inform you should you be missing an executable:
 92 | 
 93 | - bowtie
 94 | - bowtie2>=2.4.2
 95 | - STAR
 96 | - featureCounts (subread package)
 97 | - picard
 98 | - multiqc
 99 | - samtools
100 | 
101 | Note that bowtie2>=2.4.2 is set to ensure the pipeline can be used with python 3.7-3.8-3.9 and the sequana-wrappers that support bowtie2 with option --threads only (not previous versions). See environment.yml or conda.yaml for the latest list of required third-party tools.
102 | 
103 | You can install most of the tools using `damona <https://damona.readthedocs.io>`_::
104 | 
105 |     damona create --name sequana_tools
106 |     damona activate sequana_tools
107 |     damona install sequana_tools
108 | 
109 | Or use the conda.yaml file available in this repository. If you start a new
110 | environment from scratch, those commands will create the environment and install
111 | all dependencies for you::
112 | 
113 |     conda create --name sequana_env python=3.7.3
114 |     conda activate sequana_env
115 |     conda install -c anaconda qt pyqt>5
116 |     pip install sequana
117 |     pip install sequana_rnaseq
118 |     conda install --file https://raw.githubusercontent.com/sequana/rnaseq/main/conda.yaml
119 | 
120 | For Linux users, we provide singularity images available through the damona project (https://damona.readthedocs.io).
121 | 
122 | 
123 | .. image:: https://raw.githubusercontent.com/sequana/sequana_rnaseq/main/sequana_pipelines/rnaseq/dag.png
124 | 
125 | 
126 | Details
127 | ~~~~~~~~~
128 | 
129 | This pipeline runs a **RNA-seq** analysis of sequencing data. It runs in
130 | parallel on a set of input FastQ files (paired or not).
131 | A brief HTML report is produced together with a MultiQC report.
132 | 
133 | This pipeline is complex and requires some expertise for the interpretation.
134 | Many online-resources are available and should help you deciphering the output.
135 | 
136 | Yet, it should be quite straightforward to execute it as shown above. The
137 | pipeline uses bowtie1 to look for ribosomal contamination (rRNA). Then,
138 | it cleans the data with cutadapt if you say so (your data may already be
139 | pre-processed). If no adapters are provided (default), reads are
140 | trimmed for low quality bases only. Then, mapping is performed with standard mappers such as
141 | star or bowtie2 (--aligner-choice option). Finally,
142 | feature counts are extracted from the previously generated BAM files. We guess
143 | the strand and save the feature counts into the directory
144 | ./rnadiff/feature_counts.
145 | 
146 | The pipeline stops there. However, RNA-seq analyses are followed by a differential
147 | gene expression analysis (DGE hereafter). Although the DGE is not part of the pipeline, you can
148 | perform it with standard tools using the data in the ./rnadiff directory. One such
149 | tool is provided within our framework (based on the well known DESeq2 software).
150 | 
151 | Using our framework::
152 | 
153 |     cd rnadiff
154 |     sequana rnadiff --design design.csv --features all_features.out --annotation ANNOT \
155 |            --feature-name FEAT --attribute-name ATTR
156 | 
157 | where ANNOT is the annotation file of your analysis, FEAT and ATTR are the feature
158 | and attribute used in your analysis (coming from the annotation file).
159 | 
160 | This produces a HTML report summarizing your differential analysis.
161 | 
162 | Note that you need DESeq2 and other packages installed. You may also use this container: https://zenodo.org/records/5708856
163 | 
164 | 
165 | Rules and configuration details
166 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
167 | 
168 | Here is the `latest documented configuration file <https://raw.githubusercontent.com/sequana/sequana_rnaseq/main/sequana_pipelines/rnaseq/config.yaml>`_
169 | to be used with the pipeline. Each rule used in the pipeline may have a section in the configuration file.
170 | 
171 | 
172 | .. warning:: the RNAseQC rule is switched off and is not currently functional in
173 |    version 0.9.X
174 | 
175 | Issues
176 | ~~~~~~
177 | 
178 | In the context of eukaryotes, you will need 32G of memory most probably. If this is too much,
179 | you can try to restrict the memory. Check out the config.yaml file in the star section.
180 | 
181 | 
182 | 
183 | Changelog
184 | ~~~~~~~~~
185 | 
186 | ========= ====================================================================
187 | Version   Description
188 | ========= ====================================================================
189 | 0.20.2    * Fix workflow and multiqc parsing
190 | 0.20.1    * Fix wrapper version in the config and fastp rule.
191 | 0.20.0    * Fix regression due to new sequana version
192 |           * Update summary html to use new sequana plots
193 | 0.19.3    * fix regression with click to set the default rRNA to 'rRNA' again.
194 | 0.19.2    * fix bowtie1 regression in the log file, paired end case in
195 |             multiqc and rnadiff script (regression)
196 |           * set genome directory default to None to enforce its usage
197 | 0.19.1    * add rnaseqc container.
198 |           * Update rseqc rules (redirection)
199 |           * cleanup onsuccess rule
200 | 0.19.0    * Refactorisation to use click
201 | 0.18.1    * fastp multiqc regression. Fixed missing sample names by updating
202 |             multiqc_config and adding sample names in the output filename
203 | 0.18.0    * New plots in the HTML reports. Includes version of executables.
204 | 0.17.2    * CHANGES: in star section, added --limitBAMsortRAM and set to 30G
205 |           * BUG: Fix missing params (options) in star_mapping rule not taken
206 |             into account
207 | 0.17.1    * use new rulegraph / graphviz apptainer
208 | 0.17.0    * fastp step changed to use sequana-wrappers. Slight change in
209 |             config file. The reverse and forward adapter options called
210 |             rev and fwd have been dropped in favor of a single adapters option.
211 |             v0.17.0 config and schema are not compatible with previous
212 |             versions.
213 |           * Update singularity containers and add new one for fastp
214 | 0.16.1    * fix bug in feature counts automatic strand balance detection. Was
215 |             always using the stranded case (2).
216 |           * add singularity workflow for testing
217 |           * fix documentation in config.yaml
218 | 0.16.0    * star, salmon, bam_coverage are now in sequana wrappers, updated
219 |             the pipeline accordingly
220 |           * updated config file and schema to include resources inside the
221 |             config file (so as to use new --profile option)
222 |           * set singularity images in all rules
223 |           * star wrappers has changed significantly to use star
224 |             recommendation. To keep using previous way, a legacy option
225 |             is available and set to True in this version.
226 |           * bamCoverage renamed in bam_coverage in the config file
227 |           * multiqc_config removed redundant information and ordered
228 |             the output in a coherent way (QC and then analysis)
229 | 0.15.2    * Fix bowtie2 rule to use new wrappers. Use wrappers in
230 |             add_read_group and mark_duplicates
231 | 0.15.1    * Adapt to new bowtie2 align wrapper
232 | 0.15.0    * fix typo reported in https://github.com/sequana/rnaseq/issues/12
233 |           * fix feature counts plots not showing anymore
234 |           * Script for differential analysis is now in the main pipeline
235 | 0.14.2    * fix feature counts plots missing in multiqc results
236 | 0.14.1    * fix regression bug introduced in snakemake 6.9.0
237 | 0.14.0    * Allow the aligners to have dedicated index for each version in the
238 |             same genome directory.
239 |           * Ribosomal is now estimated on the first 100,000 reads to speed up
240 |             analysis
241 |           * --indexing and --force-indexing  options not required anymore.
242 |             Indexing will be done automatically and not redone if present.
243 |           * Use of the new sequana-wrappers repository
244 | 0.13.0    * Major update to use the new sequana version and the RNADiff tools.
245 |           * remove fastq_screen. One can use sequana_multitax for taxonomic
246 |             content and contamination.
247 |           * cutadapt is now replaced by fastp, although it can still be used.
248 |           * full integration of salmon for prokaryotes and eukaryotes
249 |           * user interface has now a --skip-gff-check option. Better handling of
250 |             input gff with more meaningful messages
251 |           * integration of rseqc tool
252 | 0.12.1    * indexing was always set to True in the config after 0.9.16 update.
253 | 0.12.0    * BUG fix: Switch mark_duplicates correctly before feature counts
254 | 0.11.0    * rnadiff one factor is simplified
255 |           * When initiating the pipeline, provide information about the GFF
256 |           * mark duplicates off by default
257 |           * feature_counts has more options in the help. split options into
258 |             feature/attribute/extra_attributes.
259 |           * HTML reports better strand picture and information about rRNA
260 |           * refactoring the main standalone and config file to split feature
261 |             counts options into feature and attribute. Sanity checks are now
262 |             provided (--feature-counts-attribute, --feature-counts-feature-type)
263 |           * can provide a custom GFF not in the genome directory
264 |           * can provide several feature from the GFF. Then, a custom GFF is
265 |             created and used
266 |           * fix the --do-igvtools and --do-bam-coverage with better doc
267 | 0.10.0    * 9/12/2020
268 |           * Fixed bug in sequana/star_indexing for small genomes (v0.9.7).
269 |             Changed the rnaseq requirements to benefit from this bug-fix that
270 |             could lead to seg fault with star aligner for small genomes.
271 |           * Report improved with strand guess and plot
272 | 0.9.20    * 7/12/2020
273 |           * BUG in sequana/star rules v0.9.6. Fixed in this release.
274 |           * In config file, bowtie section 'do' option is removed. This is now
275 |             set automatically if rRNA_feature or rRNA_file is provided. This
276 |             allows us to skip the rRNA mapping entirely if needed.
277 |           * fastq_screen should be functional. Default behaviour is off. If
278 |             set, only phiX174 will be searched for. Users should build their own
279 |             configuration file.
280 |           * star/bowtie1/bowtie2 have now their own sub-directories in the
281 |             genome directory.
282 |           * added --run option to start pipeline automatically (if you know
283 |             what you are doing)
284 |           * rnadiff option has now a default value (one_factor)
285 |           * add strandness plot in the HTML summary page
286 | 0.9.19    * Remove the try/except around tolerance (guess of strandness) to
287 |             make sure this is provided by the user. Final onsuccess benefits
288 |             from faster GFF function (sequana 0.9.4)
289 | 0.9.18    * Fix typo (regression bug) + add tolerance in schema + generic
290 |             title in multiqc_config. (oct 2020)
291 | 0.9.17    * add the *tolerance* parameter in the feature_counts rule as a user
292 |             parameter (config and pipeline).
293 | 0.9.16    * Best feature_counts is now saved into rnadiff/feature_counts
294 |             directory and rnadiff scripts have been updated accordingly
295 |           * the most probable feature count option is now computed more
296 |             effectively and incorporated inside the Snakemake pipeline (not in
297 |             the onsuccess) so that multiqc picks the best one (not the 3
298 |             results)
299 |           * the target.txt file can be generated inside the pipeline if user
300 |             fill the rnadiff/conditions section in the config file
301 |           * indexing options are filled automatically when calling
302 |             sequana_rnaseq based on the presence/absence of the index
303 |             of the aligner being used.
304 |           * salmon now integrated and feature counts created (still WIP in
305 |             sequana)
306 | 0.9.15    * FastQC on raw data skipped by default (FastQC
307 |             for processed data is still available)
308 |           * Added paired options (-p) for featureCounts
309 |           * Switch back markduplicates to False for now.
310 | 0.9.14    * Use only R1 with bowtie1
311 |           * set the memory requirements for mark_duplicates in cluster_config
312 |             file
313 |           * Set temporary directory for mark_duplicates to be local ./tmp
314 | 0.9.13    * set mark_duplicate to true by default
315 |           * use new sequana pipeline manager
316 |           * export all features counts in a single file
317 |           * custom HTML report
318 |           * faster --help calls
319 |           * --from-project option added
320 | 0.9.12    * include salmon tool as an alternative to star/bowtie2
321 |           * include rnadiff directory with required input for Differential
322 |             analysis
323 | 0.9.11    * Automatic guessing of the strandness of the experiment
324 | 0.9.10    * Fix multiqc for RNAseQC rule
325 | 0.9.9     * Fix RNAseQC rule, which is now available.
326 |           * Fix ability to use existing rRNA file as input
327 | 0.9.8     * Fix indexing for bowtie1 to not be done if aligner is different
328 |           * add new options: --feature-counts-options and --do-rnaseq-qc,
329 |             --rRNA-feature
330 |           * Based on the input GFF, we now check the validity of the rRNA
331 |             feature and feature counts options to check whether the feature
332 |             exists in the GFF
333 |           * schema is now used to check the config file values
334 |           * add a data test for testing and documentation
335 | 0.9.7     * fix typo found in version 0.9.6
336 | 0.9.6     * Fixed empty read tag in the configuration file
337 |           * Possibility to switch off cutadapt section
338 |           * Fixing bowtie2 rule in sequana and update the pipeline accordingly
339 |           * Include a schema file
340 |           * output-directory parameter renamed into output_directory (multiqc
341 |             section)
342 |           * handle stdout correctly in fastqc, bowtie1, bowtie2 rules
343 | 0.9.5     * Fixed https://github.com/sequana/sequana/issues/571
344 |           * More cutadapt commands and sanity checks
345 |           * Fixed bowtie2 options import in rnaseq.rules
346 | 0.9.4
347 | 0.9.3     if a fastq_screen.conf is provided, we switch the fastqc_screen
348 |           section ON automatically
349 | 0.9.0     **Major refactorisation.**
350 | 
351 |           * remove sartools, kraken rules.
352 |           * Indexing is now optional and can be set in the configuration.
353 |           * Configuration file is simplified  with a general section to enter
354 |             the genome location and aligner.
355 |           * Fixed rules in  sequana (0.8.0) that were not up-to-date with
356 |             several executables used in the  pipeline including picard,
357 |             fastq_screen, etc. See Sequana Changelog for details with respect
358 |             to rules changes.
359 |           * Copying the feature counts in main directory  ready to use for
360 |             a differential analysis.
361 | ========= ====================================================================
362 | 


--------------------------------------------------------------------------------
/test/data/Saccer3/Saccer3_rRNA.fa:
--------------------------------------------------------------------------------
 1 | >chrXII:451574-451785
 2 | ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAA
 3 | >chrXII:451574-458432
 4 | ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAAACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGT
TCCGGAATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGGTCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAACTTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGGAAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGT
TTTTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTTTAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCT
CTAGAATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATAACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT
 5 | >chrXII:451785-455181
 6 | ACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGTTCCGGAATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGG
TCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAAC
 7 | >chrXII:455181-455413
 8 | TTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGG
 9 | >chrXII:455413-455571
10 | AAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTT
11 | >chrXII:455571-455932
12 | TTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTT
13 | >chrXII:455932-457732
14 | TAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCTCTAGAATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATA
15 | >chrXII:457732-458432
16 | ACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT
17 | >chrXII:459675-459796
18 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAATCT
19 | >chrXII:460711-460922
20 | ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAA
21 | >chrXII:460711-467569
22 | ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTACACACTATCATCCTCATCGTATATTATAATAGATATATACAATACATGTTTTTACCCGGATCATAGAATTCTTAAGACAAATAAAATTTATAGAGACTTGTTCAGTCTACTTCTCTCTAAACTAGGCCCCGGCTCCTGCCAGTACCCACTTAGAAAGAAATAAAAAACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGT
TCCGGAATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGGTCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAACTTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGGAAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGT
TTTTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTTTAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCT
CTAGAATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATAACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT
23 | >chrXII:460922-464318
24 | ACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAAGGCTACTCTACTGCTTACAATACCCCGTTGTACATCTAAGTCGTATACAAATGATTTATCCCCACGCAAAATGACATTGCAATTCGCCAGCAAGCACCCAAGGCCTTTCCGCCAAGTGCACCGTTGCTAGCCTGCTATGGTTCAGCGACGCCACAAGGACGCCTTATTCGTATCCATCTATATTGTGTGGAGCAAAGAAATCACCGCGTTCTAGCATGGATTCTGACTTAGAGGCGTTCAGCCATAATCCAGCGGATGGTAGCTTCGCGGCAATGCCTGATCAGACAGCCGCAAAAACCAATTATCCGAATGAACTGTTCCTCTCGTACTAAGTTCAATTACTATTGCGGTAACATTCATCAGTAGGGTAAAACTAACCTGTCTCACGACGGTCTAAACCCAGCTCACGTTCCCTATTAGTGGGTGAACAATCCAACGCTTACCGAATTCTGCTTCGGTATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAATGTCGCTATGAACGCTTGACTGCCACAAGCCAGTTATCCCTGTGGTAACTTTTCTGGCACCTCTAGCCTCAAATTCCGAGGGACTAAAGGATCGATAGGCCACACTTTCATGGTTTGTATTCACACTGAAAATCAAAATCAAGGGGGCTTTTACCCTTTTGTTCTACTGGAGATTTCTGTTCTCCATGAGCCCCCCTTAGGACATCTGCGTTATCGTTTAACAGATGTGCCGCCCCAGCCAAACTCCCCACCTGACAATGTCTTCAACCCGGATCAGCCCCGAATGGGACCTTGAATGCTAGAACGTGGAAAATGAATTCCAGCTCCGCTTCATTGAATAAGTAAAGAAACTATAAAGGTAGTGGTATTTCACTGGCGCCGAAGCTCCCACTTATTCTACACCCTCTATGTCTCTTCACAATGTCAAACTAGAGTCAAGCTCAACAGGGTCTTCTTTCCCCGCTGATTCTGCCAAGCCCGTTCCCTTGGCTGTGGTTTCGCTAGATAGTAGATAGGGACAGTGGGAATCTCGTTAATCCATTCATGCGCGTCACTAATTAGATGACGAGGCATTTGGCTACCTTAAGAGAGTCATAGTTACTCCCGCCGTTTACCCGCGCTTGGTTGAATTTCTTCACTTTGACATTCAGAGCACTGGGCAGAAATCACATTGCGTCAACATCACTTTCTGACCATCGCAATGCTATGTTTTAATTAGACAGTCAGATTCCCCTTGTCCGTACCAGTTCTAAGTTGATCGTTAATTGTAGCAAGCGACGGTCTACAAGAGACCTACCAAGGCCGTCTACAACAAGGCACGCAAGTAGTCCGCCTAGCAGAGCAAGCCCCACCAAGCAGTCCACAAGCACGCCCGCTGCGTCTGACCAAGGCCCTCACTACCCGACCCTTAGAGCCAATCCTTATCCCGAAGTTACGGATCTATTTTGCCGACTTCCCTTATCTACATTATTCTATCAACTAGAGGCTGTTCACCTTGGAGACCTGCTGCGGTTATCAGTACGACCTGGCATGAAAACTATTCCTTCCTGTGGATTTTCACGGGCCGTCACAAGCGCACCGGAGCCAGCAAAGGTGCTGGCCTCTTCCAGCCATAAGACCCCATCTCCGGATAAACCAATTCCGGGGTGATAAGCTGTTAAGAAGAAAAGATAACTCCTCCCAGGGCTCGCGCCGACGTCTCCACATTCAGTTACGTTACCGTGAAGAATCCATATCCAGGTTCCGGAATCTTAACCGGATTCCCTTTCGATGGTGGCCTGCATAAAATCAGGCCTTTGAAACGGAGCTTCCCCATCTCTTAGGATCGACTAACCCACGTCCAACTGCTGTTGACGTGGAACCTTTCCCCACTTCAGTCTTCAAAGTTCTCATTTGAATATTTGCTACTACCACCAAGATCTGCACTAGAGGCCGTTCGACCCGACCTTACGG
TCTAGGCTTCGTCACTGACCTCCACGCCTGCCTACTCGTCAGGGCATCATATCAACCCTGACGGTAGAGTATAGGTAACACGCTTGAGCGCCATCCATTTTCAGGGCTAGTTCATTCGGCCGGTGAGTTGTTACACACTCCTTAGCGGATTCCGACTTCCATGGCCACCGTCCGGCTGTCTAGATGAACTAACACCTTTTGTGGTGTCTGATGAGCGTGTATTCCGGCACCTTAACTCTACGTTCGGTTCATCCCGCATCGCCAGTTCTGCTTACCAAAAATGGCCCACTAAAAGCTCTTCATTCAAATGTCCACGTTCAATTAAGTAACAAGGACTTCTTACATATTTAAAGTTTGAGAATAGGTCAAGGTCATTTCGACCCCGGAACCTCTAATCATTCGCTTTACCTCATAAAACTGATACGAGCTTCTGCTATCCTGAGGGAAACTTCGGCAGGAACCAGCTACTAGATGGTTCGATTAGTCTTTCGCCCCTATACCCAAATTCGACGATCGATTTGCACGTCAGAACCGCTACGAGCCTCCACCAGAGTTTCCTCTGGCTTCACCCTATTCAGGCATAGTTCACCATCTTTCGGGTCCCAACAGCTATGCTCTTACTCAAATCCATCCGAAGACATCAGGATCGGTCGATTGTGCACCTCTTGCGAGGCCCCAACCTACGTTCACTTTCATTACGCGTATGGGTTTTACACCCAAACACTCGCATAGACGTTAGACTCCTTGGTCCGTGTTTCAAGACGGGCGGCATATAACCATTATGCCAGCATCCTTGACTTACGTCGCAGTCCTCAGTCCCAGCTGGCAGTATTCCCACAGGCTATAATACTTACCGAGGCAAGCTACATTCCTATGGATTTATCCTGCCACCAAAACTGATGCTGGCCCAGTGAAATGCGAGATTCCCCTACCCACAAGGAGCAGAGGGCACAAAACACCATGTCTGATCAAATGCCCTTCCCTTTCAACAATTTCACGTACTTTTTCACTCTCTTTTCAAAGTTCTTTTCATCTTTCCATCACTGTACTTGTTCGCTATCGGTCTCTCGCCAATATTTAGCTTTAGATGGAATTTACCACCCACTTAGAGCTGCATTCCCAAACAACTCGACTCTTCGAAGGCACTTTACAAAGAACCGCACTCCTCGCCACACGGGATTCTCACCCTCTATGACGTCCTGTTCCAAGGAACATAGACAAGGAACGGCCCCAAAGTTGCCCTCTCCAAATTACAACTCGGGCACCGAAGGTACCAGATTTCAAATTTGAGCTTTTGCCGCTTCACTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAAC
25 | >chrXII:464318-464550
26 | TTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGTTCCAATACGCTCAGTATAAAAAAAGATTAGCCGCAGTTGGTAAAACCTAAAACGACCGTACTTGCATTATACCTCAAGCACGCAGAGAAACCTCTCTTTGGAAAAAAAACATCCAATGAAAAGGCCAGCAATTTCAAGTTAACTCCAAAGAGTATCACTCACTACCAAACAGAATGTTTGAGAAGG
27 | >chrXII:464550-464708
28 | AAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTT
29 | >chrXII:464708-465069
30 | TTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAAAAATTTAATGAATAGATAAAATTGTTTGTGTTTGTTACCTCTGGGCCCCGATTGCTCGAATGCCCAAAGAAAAAGTTGCAAAGATATGAAAACTCCACAGTGTGTTGTATTGAAACGGTTTTAATTGTCCTATAACAAAAGCACAGAAATCTCTCACCGTTTGGAATAGCAAGAAAGAAACTTACAAGCCTAGCAAGACCGCGCACTTAAGCGCAGGCCCGGCTGGACTCTCCATCTCTTGTCTTCTTGCCCAGTAAAAGCTCTCATGCTCTTGCCAAAACAAAAAAATCCATTTTCAAAATTATTAAATTTCTT
31 | >chrXII:465069-466869
32 | TAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTAGTTCCTCTAAATGACCAAGTTTGTCCAAATTCTCCGCTCTGAGATGGAGTTGCCCCCTTCTCTAAGCAGATCCTGAGGCCTCACTAAGCCATTCAATCGGTACTAGCGACGGGCGGTGTGTACAAAGGGCAGGGACGTAATCAACGCAAGCTGATGACTTGCGCTTACTAGGAATTCCTCGTTGAAGAGCAATAATTACAATGCTCTATCCCCAGCACGACGGAGTTTCACAAGATTACCAAGACCTCTCGGCCAAGGTTAGACTCGCTGGCTCCGTCAGTGTAGCGCGCGTGCGGCCCAGAACGTCTAAGGGCATCACAGACCTGTTATTGCCTCAAACTTCCATCGGCTTGAAACCGATAGTCCCTCTAAGAAGTGGATAACCAGCAAATGCTAGCACCACTATTTAGTAGGTTAAGGTCTCGTTCGTTATCGCAATTAAGCAGACAAATCACTCCACCAACTAAGAACGGCCATGCACCACCACCCACAAAATCAAGAAAGAGCTCTCAATCTGTCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTGGTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGACTTTGATTTCTCGTAAGGTGCCGAGTGGGTCATTAAAAAAACACCACCCGATCCCTAGTCGGCATAGTTTATGGTTAAGACTACGACGGTATCTGATCATCTTCGATCCCCTAACTTTCGTTCTTGATTAATGAAAACGTCCTTGGCAAATGCTTTCGCAGTAGTTAGTCTTCAATAAATCCAAGAATTTCACCTCTGACAATTGAATACTGATGCCCCCGACCGTCCCTATTAATCATTACGATGGTCCTAGAAACCAACAAAATAGAACCAAACGTCCTATTCTATTATTCCATGCTAATATATTCGAGCAATACGCCTGCTTTGAACACTCTAATTTTTTCAAAGTAAAAGTCCTGGTTCGCCAAGAGCCACAAGGACTCAAGGTTAGCCAGAAGGAAAGGCCCCGTTGGAAATCCAGTACACGAAAAAATCGGACCGGCCAACCGGGCCCAAAGTTCAACTACGAGCTTTTTAACTGCAACAACTTTAATATACGCTATTGGAGCTGGAATTACCGCGGCTGCTGGCACCAGACTTGCCCTCCAATTGTTCCTCGTTAAGGTATTTACATTGTACTCATTCCAATTACAAGACCCGAATGGGCCCTGTATCGTTATTTATTGTCACTACCTCCCTGAATTAGGATTGGGTAATTTGCGCGCCTGCTGCCTTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTTATTCCCCGTTACCCGTTGAAACCATGGTAGGCCACTATCCTACCATCGAAAGTTGATAGGGCAGAAATTTGAATGAACCATCGCCAGCACAAGGCCATGCGATTCGAAAAGTTATTATGAATCATCAAAGAGTCCGAAGACATTGATTTTTTATCTAATAAATACATCTCTTCCAAAGGGTCGAGATTTTAAGCATGTATTAGCTCTAGAATTACCACAGTTATACCATGTAGTAAAGGAACTATCAAATAAACGATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTATAAATTGCTTATACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGACTACTGGCAGGATCAACCAGATA
33 | >chrXII:466869-467569
34 | ACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACCGAAATCTCTTTTTTTTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTGGAAACAGCTGAAATTCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTCTCTGTTCAAAAAGCTTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTCTTTGAGTTTCCTCTAATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTACGTTTGATATCGCTGATTTGAGAGGAGGTTACACTTGAAGAATCACAGTCTTGCGACCGGCTATTCAACAAGGCATTCCCCCAAGTTTGAATTCTTTGAAATAGATTGCTATTAGCTAGTAATCCACCAAATCCTTCGCTGCTCACCAATGGAATCGCAAGATGCCCACGATGAGACTGTTCAGGTTAAACGCAAAAGAAACACACTCTGGGAATTTCTTCCCAAATTGTATCTCTCAATACGCATCAACCCATGTCAATTAAACACGCTGTATAGAGACTAGGCAGATCTGACGATCACCTAGCGACTCTCTCCACCGTTTGACGAGGCCATTTACAAAAACATAACGAACGACAAGCCTACTCGAATTCGTTTCCAAACTCTTTTCGAACTTGTCTTCAACTGCTTTCGCAT
35 | >chrXII:468812-468931
36 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT
37 | >chrXII:472464-472583
38 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT
39 | >chrXII:482044-482163
40 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT
41 | >chrXII:485696-485815
42 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAGT
43 | >chrXII:489348-489469
44 | GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAACTGTAGTTAAGCTGGTAAGAGCCTGACCGAGTAGTGTAGTGGGTGACCATACGCGAAACTCAGGTGCTGCAATCT
45 | 


--------------------------------------------------------------------------------
/sequana_pipelines/rnaseq/rnaseq.rules:
--------------------------------------------------------------------------------
   1 | #
   2 | #  Copyright (c) 2016-2021 Sequana Dev Team (https://sequana.readthedocs.io)
   3 | #
   4 | #  The full license is in the LICENSE file, distributed with this software.
   5 | #
   6 | #  Website:       https://github.com/sequana/sequana
   7 | #  Documentation: http://sequana.readthedocs.io
   8 | #  Contributors:  https://github.com/sequana/sequana/graphs/contributors
   9 | ##############################################################################
  10 | # standard modules
  11 | import glob
  12 | import os
  13 | import shutil
  14 | import subprocess
  15 | 
  16 | import sequana
  17 | from sequana_pipetools import snaketools as sm
  18 | import sequana.featurecounts as fc
  19 | 
  20 | # ========================================================= The main config file
  21 | # Snakemake loads this file into the global `config` dictionary used below.
  22 | configfile: "config.yaml"
  23 | 
  24 | 
  25 | # ================================================== The sequana pipeline manager
  26 | #
  27 | manager = sm.PipelineManager("rnaseq", config)
  28 | 
  29 | expected_output = []
  30 | # ========================================= Define output of the pipeline
  31 | # Both branches declare the same final targets; only 'strand_summary' differs.
  32 | manager.globals = {}
  33 | 
  34 | if manager.config['general']['aligner'] == 'salmon':
  35 |     manager.globals['strand_summary'] = None  # no strand summary path is set for salmon
  36 |     rule rnaseq:
  37 |         input:
  38 |             "multiqc/multiqc_report.html",
  39 |             ".sequana/rulegraph.svg",
  40 |             "post_analysis/rnadiff.sh"
  41 | else:
  42 |     manager.globals['strand_summary'] = "outputs/strand_summary.csv"
  43 |     rule rnaseq:
  44 |         input:
  45 |             "multiqc/multiqc_report.html",
  46 |             ".sequana/rulegraph.svg",
  47 |             "post_analysis/rnadiff.sh",
  48 | 
  49 | 
  50 | # ========================================= Define genome directory and inputs
  51 | # Make sure the genome directory is absolute. The FastA and GFF files are expected
  52 | # to be named after the directory itself: <genome_directory>/<genome_name>.{fa,gff}
  53 | genome_directory = os.path.abspath(manager.config["general"]["genome_directory"])
  54 | genome_name = genome_directory.rsplit("/", 1)[1]  # last component of the path
  55 | 
  56 | __prefix_name__ = f"{genome_directory}/{genome_name}"
  57 | __fasta_file__ = f"{__prefix_name__}.fa"
  58 | __gff_file__   = f"{__prefix_name__}.gff"
  59 | 
  60 | 
  61 | # ================================================ Use a custom GFF if required
  62 | # If a custom GFF file is set in the config file, it replaces the default GFF
  63 | # found in the genome directory.
  64 | if manager.config['general']['custom_gff']:
  65 |     __gff_file__   = manager.config['general']['custom_gff']
  66 | assert os.path.exists(__gff_file__), f"GFF file {__gff_file__} does not exist"
  67 | 
  68 | # check existence of fasta and gff before starting; report the file that is missing
  69 | for this in [__fasta_file__, __gff_file__]:
  70 |     if os.path.exists(this) is False:
  71 |         raise IOError(f"File {this} not found")
  72 | 
  73 | # contaminant_file and rRNA_feature are mutually exclusive: stop if both are set
  74 | if manager.config.general.contaminant_file and manager.config.general.rRNA_feature:
  75 |     logger.error("Either set contaminant_file or rRNA_feature in the config file, not both.")
  76 |     sys.exit(1)
  77 | 
  78 | 
  79 | # ==================================== search for specific sequences as contaminants
  80 | # Case 1: the user provides a FastA file of contaminant sequences
  81 | if manager.config.general.contaminant_file:
  82 |     # Could be a local file or a file located in the genome directory
  83 |     __bowtie1_index_rna__fasta = f"{manager.config.general.contaminant_file}"
  84 | 
  85 |     # if not found locally, try to find it in the genome_directory path
  86 |     if os.path.exists(__bowtie1_index_rna__fasta) is False:
  87 |         __bowtie1_index_rna__fasta = f"{genome_directory}/{manager.config.general.contaminant_file}"
  88 |         if os.path.exists(__bowtie1_index_rna__fasta) is False:
  89 |             logger.error("File {} does not exists. Check your config file".format(__bowtie1_index_rna__fasta))
  90 |             sys.exit(1)
  91 | 
  92 |     # copy the file into the working directory to keep track of what was used
  93 |     os.makedirs("inputs/contamination_file", exist_ok=True)
  94 |     shutil.copy(__bowtie1_index_rna__fasta, "inputs/contamination_file")
  95 | 
  96 |     # from now on, refer to the local copy rather than to the original file
  97 |     __bowtie1_index_rna__fasta = "inputs/contamination_file/" + os.path.basename(__bowtie1_index_rna__fasta)
  98 | 
  99 |     bowtie1_index_conta__input_reference = __bowtie1_index_rna__fasta
 100 |     bowtie1_index_conta__output = f"{__bowtie1_index_rna__fasta}.1.ebwt"
 101 |     rule samtools_faidx:  # creates the .fai index of the copied FastA file
 102 |         input:
 103 |             __bowtie1_index_rna__fasta
 104 |         output:
 105 |             __bowtie1_index_rna__fasta + ".fai"
 106 |         container:
 107 |             config['apptainers']['sequana_tools']
 108 |         shell:
 109 |             """
 110 |             samtools faidx {input[0]}
 111 |             """
 112 | 
 113 | elif manager.config.general.rRNA_feature:
 114 |     # Extract the rRNA feature from the GFF file and build the corresponding FastA
 115 |     # file. If the feature is not found, a dummy FastA file with AAAAAAAAAAAAAA is
 116 |     # built instead, so that the FastA output can always be indexed (.fai).
 117 |     bowtie1_index_conta__input_reference = f"{__prefix_name__}_{manager.config.general.rRNA_feature}.fa"
 118 |     bowtie1_index_conta__input_gff = f"{__prefix_name__}_{manager.config.general.rRNA_feature}.gff"
 119 |     bowtie1_index_conta__output = f"{__prefix_name__}_{manager.config.general.rRNA_feature}.1.ebwt"
 120 |     rule extract_fasta:
 121 |         input:
 122 |             fasta = __fasta_file__,
 123 |             gff = __gff_file__
 124 |         params:
 125 |             feature = config['general']['rRNA_feature']
 126 |         output:
 127 |             fasta = bowtie1_index_conta__input_reference,
 128 |             fai = bowtie1_index_conta__input_reference + ".fai",
 129 |             gff = bowtie1_index_conta__input_gff
 130 |         log:
 131 |             "logs/indexing/get_rRNA.log"
 132 |         container:
 133 |             config['apptainers']['sequana_tools']
 134 |         shell:
 135 |             """
 136 |             # used to be gawk but awk is more generic.
 137 |             awk '{{ if ($3=="{params.feature}") print }}' {input.gff} > {output.gff}
 138 |             if [ -s {output.gff} ]
 139 |             then
 140 |                 bedtools getfasta -fi {input.fasta} -bed {output.gff}  -fo {output.fasta}
 141 |             else :
 142 |                 echo -e ">empty\\nAAAAAAAAAAAAAA" > {output.fasta}
 143 |             fi
 144 |             samtools faidx {output.fasta}
 145 |             """
 146 | 
 147 | # ========================================================= Indexing for rRNA and contamination
 148 | #
 149 | # redo the indexing whatsoever since it is pretty fast
 150 | if manager.config.general.contaminant_file or manager.config.general.rRNA_feature:
 151 | 
 152 |     # build the bowtie1 index (bowtie1/build wrapper) of the rRNA or contaminant FastA
 153 |     rule ribosomal_contamination:
 154 |         input:
 155 |             reference = bowtie1_index_conta__input_reference,
 156 |             fai = bowtie1_index_conta__input_reference + ".fai"
 157 |         output:
 158 |             bowtie1_index_conta__output
 159 |         log:
 160 |             "logs/indexing/bowtie1_index_conta.log"
 161 |         params:
 162 |             options=""
 163 |         threads: 2
 164 |         container:
 165 |             config['apptainers']['sequana_tools']
 166 |         wrapper:
 167 |             f"{manager.wrappers}/wrappers/bowtie1/build"
 168 | 
 169 | 
 170 | # ============================================================================ bowtie2 index
 171 | # Large genomes use the "bt2l" index extension instead of "bt2" (see bt2_ext).
 172 | if manager.config.general.aligner == "bowtie2":
 173 |     if manager.config['bowtie2_mapping']['genome_size_larger_than_4gb']:
 174 |         bt2_ext = "bt2l"
 175 |     else:
 176 |         bt2_ext = "bt2"
 177 | 
 178 |     # These two variables are used elsewhere and in the rule below
 179 |     # Index creation may differ from one version to another and one may want to
 180 |     # keep track of the index and its version. We try to retrieve the version
 181 |     # and if successful, we will add the version as a suffix, otherwise just the
 182 |     # name of bowtie2
 183 | 
 184 |     # tested on version 2.4.2
 185 |     try:
 186 |         p = subprocess.Popen(['bowtie2'], stderr=subprocess.PIPE)
 187 |         _, err = p.communicate()  # drains and closes the pipe (wait() + read() may block)
 188 |         stderr = err.decode().split("\n")
 189 |         hits = [line for line in stderr if "version" in line and 'Bowtie' in line]
 190 |         bowtie2_version = "_" + hits[0].split("version")[1].split()[0].strip()
 191 |     except Exception:  # various type of exception may occur here
 192 |         logger.warning(f"Could not determine bowtie2 version. Index will be stored in {genome_directory}/bowtie2/")
 193 |         bowtie2_version = ""
 194 | 
 195 |     bowtie2_index = f"{genome_directory}/bowtie2{bowtie2_version}/{genome_name}"
 196 | 
 197 |     if os.path.exists(f"{bowtie2_index}.1.{bt2_ext}"):
 198 |         pass # index exists, no need to do it, everything should be fine
 199 |     else:
 200 |         rule bowtie2_index:
 201 |             input:
 202 |                 reference=__fasta_file__
 203 |             output:
 204 |                 multiext(
 205 |                     bowtie2_index,
 206 |                     *(f"{ext}.{bt2_ext}" for ext in (".1", ".2", ".3", ".4", ".rev.1", ".rev.2")),
 207 |                 ),
 208 | 
 209 |             log:
 210 |                 "logs/indexing/bowtie2_genome.log"
 211 |             params:
 212 |                 options=config["bowtie2_index"]["options"]
 213 |             threads:
 214 |                 config["bowtie2_index"]["threads"]
 215 |             container:
 216 |                 config['apptainers']['sequana_tools']
 217 |             resources:
 218 |                 **config['bowtie2_index']['resources']
 219 |             wrapper:
 220 |                 f"{manager.wrappers}/wrappers/bowtie2/build"
 221 | 
 222 | 
 223 | # ============================================================================ star index
 224 | #
 225 | elif manager.config.general.aligner  == "star":
 226 |     # tested on version 2.7.8a
 227 |     try:
 228 |         p = subprocess.Popen(['STAR', '--version'], stdout=subprocess.PIPE)
 229 |         # communicate() drains the pipe while waiting; wait() followed by read() may deadlock
 230 |         star_version = p.communicate()[0].decode().strip()
 231 |     except Exception:  # various type of exception may occur here
 232 |         logger.warning(f"Could not determine STAR version. Index will be stored in {genome_directory}/star/")
 233 |         star_version = ""
 234 | 
 235 | 
 236 |     __star_index__dir__ = genome_directory + f"/star{star_version}"
 237 |     __star_index__done =  f"{__star_index__dir__}/star.done"
 238 | 
 239 |     if not os.path.exists(__star_index__done):
 240 | 
 241 |         rule star_index:
 242 |             input:
 243 |                 fasta =  __fasta_file__
 244 |             output:
 245 |                 done = __star_index__done
 246 |             params:
 247 |                 options= config['star_index']['options'],
 248 |                 wkdir= __star_index__dir__
 249 |             threads:
 250 |                 config["star_index"]['threads']
 251 |             log:
 252 |                 "logs/indexing/star_genome.log"
 253 |             container:
 254 |                 config['apptainers']['sequana_tools']
 255 |             resources:
 256 |                 **config['star_index']['resources']
 257 |             wrapper:
 258 |                 f"{manager.wrappers}/wrappers/star/index"
 259 | 
 260 | 
 261 | # ========================================================================== salmon
 262 | #
 263 | elif manager.config.general.aligner == "salmon":
 264 |     #tested on salmon 1.4.0
 265 |     try:
 266 |         p = subprocess.Popen(['salmon', '--version'], stdout=subprocess.PIPE)
 267 |         # communicate() drains the pipe while waiting; wait() followed by read() may deadlock
 268 |         salmon_version = p.communicate()[0].decode().split()[-1]
 269 |     except Exception:  # various type of exception may occur here
 270 |         logger.warning(f"Could not determine salmon version. Index will be stored in {genome_directory}/salmon/")
 271 |         salmon_version = ""
 272 | 
 273 |     if os.path.exists(genome_directory + f"/salmon{salmon_version}/salmon.done"):
 274 |         pass # index exists, no need to do it, everything should be fine
 275 |     else:
 276 |         rule salmon_index:
 277 |             input:
 278 |                 fasta=__fasta_file__,
 279 |                 gff=__gff_file__
 280 |             output:
 281 |                 done=genome_directory + f"/salmon{salmon_version}/salmon.done"
 282 |             threads:
 283 |                 config['salmon_index']['threads']
 284 |             resources:
 285 |                 **config["salmon_mapping"]['resources']  # NOTE(review): read from salmon_mapping, not salmon_index -- confirm this is intended
 286 |             params:
 287 |                 options=config['salmon_index']['options']
 288 |             container:
 289 |                 config['apptainers']['salmon']
 290 |             log:
 291 |                 "logs/salmon_indexing.log"
 292 |             wrapper:
 293 |                 f"{manager.wrappers}/wrappers/salmon/index"
 294 | 
 295 | 
 296 | 
 297 | # ===================================================================== FASTQC on input data set
 298 | #
 299 | if not manager.config['fastqc']['skip_fastqc_raw']:
 300 |     rule fastqc_raw:
 301 |         input:
 302 |             manager.getrawdata()
 303 |         output:
 304 |             done = "{sample}/fastqc_raw/fastqc.done"
 305 |         params:
 306 |             options= config["fastqc"]["options"],
 307 |             working_directory= "{sample}/fastqc_raw/"
 308 |         threads: config["fastqc"]["threads"]
 309 |         container:
 310 |             config['apptainers']['fastqc']
 311 |         log:
 312 |             "{sample}/fastqc_raw/fastqc.log"
 313 |         wrapper:
 314 |             f"{manager.wrappers}/wrappers/fastqc"
 315 | 
 316 |     expected_output.extend(expand("{sample}/fastqc_raw/fastqc.done", sample=manager.samples))
 317 | 
 318 | 
 319 | # ================================================================== trimming
 320 | valid_trimmer = ['cutadapt', 'fastp', 'atropos']
 321 | if manager.config.trimming.software_choice not in valid_trimmer:
 322 |     print(f"Invalid choice for trimming tool. Choose one in {valid_trimmer}")
 323 |     sys.exit(1)
 324 | 
 325 | if manager.config.trimming.do is False:
 326 |     __clean_fastq__output = manager.getrawdata()
 327 | elif manager.config.trimming.software_choice in ["cutadapt", "atropos"]:
 328 |     adapter_tool = manager.config.trimming.software_choice
 329 | 
 330 |     fwd = manager.config.cutadapt.fwd
 331 |     rev = manager.config.cutadapt.rev
 332 | 
 333 |     if adapter_tool in ["cutadapt", "atropos"]:
 334 |         adapter_tool = "cutadapt"
 335 |         __cutadapt__input_fastq = manager.getrawdata()
 336 |         __cutadapt__wkdir = "{sample}/cutadapt"
 337 |         __cutadapt__output = ["{sample}/cutadapt/{sample}_R1_.clean.fastq.gz"]
 338 |         if manager.paired:
 339 |             __cutadapt__output += ["{sample}/cutadapt/{sample}_R2_.clean.fastq.gz"]
 340 | 
 341 |         # Set the fwd and rev adapters
 342 |         __cutadapt__fwd = manager.config.cutadapt.fwd
 343 |         __cutadapt__rev = manager.config.cutadapt.rev
 344 | 
 345 |         __cutadapt__options = manager.config.cutadapt.options
 346 |         __cutadapt__mode = manager.config.cutadapt.mode
 347 |         __cutadapt__log = "%s/cutadapt/cutadapt.txt" % manager.sample
 348 |         __cutadapt__sample = manager.sample
 349 |         __clean_fastq__output = __cutadapt__output
 350 |         include: sm.modules["cutadapt"]
 351 | elif manager.config.trimming.software_choice == "fastp":
 352 | 
 353 |     __clean_fastq__output = ["{sample}/fastp/{sample}_R1_.fastp.fastq.gz"]
 354 |     if manager.paired:
 355 |         __clean_fastq__output += ["{sample}/fastp/{sample}_R2_.fastp.fastq.gz"]
 356 | 
 357 |     _quality = config["fastp"].get("quality", 15)
 358 |     _minlen = config["fastp"].get("minimum_length", 20)
 359 |     # Every flag is appended with a leading space so that it never fuses with the previous token
 360 |     options_fastp = config["fastp"].get("options", "") or ""
 361 |     options_fastp += f" --qualified_quality_phred {_quality}"
 362 |     options_fastp += f" -l {_minlen}"
 363 |     if config["fastp"].get("disable_adapter_trimming", False) is True:
 364 |         options_fastp += " --disable_adapter_trimming"
 365 |     if config["fastp"].get("disable_quality_filtering", False) is True:
 366 |         options_fastp += " --disable_quality_filtering"
 367 | 
 368 |     rule fastp:
 369 |         input:
 370 |             sample=manager.getrawdata()
 371 |         output:
 372 |             trimmed=__clean_fastq__output,
 373 |             html="{sample}/fastp/fastp_{sample}.html",
 374 |             json="{sample}/fastp/fastp_{sample}.json", # must be named fastp
 375 |         log:
 376 |             "logs/fastp/{sample}.log"
 377 |         params:
 378 |             options=options_fastp,
 379 |             adapters=config["fastp"]["adapters"]
 380 |         threads:
 381 |             config["fastp"].get("threads", 4)
 382 |         resources:
 383 |             **config['fastp']['resources']
 384 |         container:
 385 |             config['apptainers']['fastp']
 386 |         wrapper:
 387 |             f"{manager.wrappers}/wrappers/fastp"
 388 | 
 389 | 
 390 | # ===================================================== FASTQC fastp results
 391 | #
 392 | rule fastqc_clean:
 393 |     input:
 394 |         __clean_fastq__output
 395 |     output:
 396 |         done = "{sample}/fastqc_clean/fastqc.done"
 397 |     params:
 398 |         options= config["fastqc"]["options"],
 399 |         working_directory= "{sample}/fastqc_clean/"
 400 |     threads: config["fastqc"]["threads"]
 401 |     log:
 402 |         "{sample}/fastqc_clean/fastqc.log"
 403 |     resources:
 404 |         **config["fastqc"]['resources']
 405 |     container:
 406 |         config['apptainers']['fastqc']
 407 |     wrapper:
 408 |         f"{manager.wrappers}/wrappers/fastqc"
 409 | expected_output.extend(expand("{sample}/fastqc_clean/fastqc.done", sample=manager.samples))
 410 | 
 411 | 
 412 | # ================================= Decompress fastq.gz file before running bowtie1
 413 | #
 414 | if manager.config.trimming.software_choice in ('cutadapt', 'atropos') and manager.config.trimming.do:
 415 |     # atropos is run through the cutadapt module (see the trimming section), so
 416 |     # both tools write their cleaned reads into {sample}/cutadapt/
 417 |     __unpigz_R1__input = "{sample}/cutadapt/{sample}_R1_.clean.fastq.gz"
 418 | elif manager.config.trimming.software_choice == 'fastp' and manager.config.trimming.do:
 419 |     __unpigz_R1__input = "{sample}/fastp/{sample}_R1_.fastp.fastq.gz"
 420 | else:
 421 |     __unpigz_R1__input = manager.getrawdata()
 422 | 
 423 | 
 424 | 
 425 | # ==========  decompress and sanity check
 426 | #
 427 | if int(config['bowtie1_mapping_rna']['nreads']) != -1:
 428 |     extra = int(config['bowtie1_mapping_rna']['nreads']) * 4
 429 |     config['bowtie1_mapping_rna']['nreads'] = extra
 430 | 
 431 | rule sample_rRNA:
 432 |     input:
 433 |         __unpigz_R1__input
 434 |     output:
 435 |         fastq=temp("{sample}/data_for_bowtie1/{sample}_R1_.fastq")
 436 |     threads: 4
 437 |     params:
 438 |         nreads = int(config['bowtie1_mapping_rna']['nreads'])
 439 |     shell:
 440 |         """
 441 |         set +o pipefail
 442 |         if [[ {params.nreads} == -1 ]]; then
 443 |             unpigz -p {threads} -fk --stdout {input[0]} > {output[0]}
 444 |         else
 445 |             unpigz -p {threads} -fk --stdout {input[0]} | head -n {params.nreads} > {output[0]}
 446 |         fi
 447 |         """
 448 | 
 449 | """With paired data, alignment on rRNA leads to 0% alignment if we use R1 and
 450 | R2. If we use R1 only, the percentage is >0. First reason is that reads are not
 451 | trimmed properly. In truth, bowtie2 supports local alignments which means it can
 452 | soft-clip non-matching (=adapter) content while still align the local part of
 453 | the read that matches the reference. With Bowtie1 the read will probably go
 454 | unaligned due to the many mismatches. So we do not include R2 from version
 455 | v0.9.14.
 456 | """
 457 | 
 458 | # ========================================== bowtie1 mapping to detect rRNA
 459 | 
 460 | if manager.config.general.rRNA_feature or manager.config.general.contaminant_file:
 461 |     # rRNA. Note the list here below because the rule expects a list (in case it
 462 |     # is paired
 463 | 
 464 |     rule bowtie1_mapping_rna:
 465 |         input:
 466 |             fastq= rules.sample_rRNA.output.fastq,
 467 |             index=bowtie1_index_conta__output
 468 |         output:
 469 |             bam = "{sample}/bowtie1_mapping_rna/{sample}_rRNA.bam",
 470 |             sorted = "{sample}/bowtie1_mapping_rna/{sample}_rRNA.sorted.bam",
 471 |         log:
 472 |             "{sample}/bowtie1_mapping_rna/{sample}_bowtie1.log"
 473 |         params:
 474 |             options=""
 475 |         threads:
 476 |             config['bowtie1_mapping_rna']['threads']
 477 |         container:
 478 |             config['apptainers']['sequana_tools']
 479 |         wrapper:
 480 |             f"{manager.wrappers}/wrappers/bowtie1/align"
 481 | 
 482 |     rule fix_bowtie1_log:
 483 |         input:
 484 |             expand("{sample}/bowtie1_mapping_rna/{sample}_bowtie1.log", sample=manager.samples)
 485 |         output:
 486 |             "logs/fix_bowtie1/fix_bowtie1.log"
 487 |         run:
 488 |             # Every line holding "least one alignment" is kept and followed by a
 489 |             # copy that reads "least one reported alignment" instead.
 490 |             old, new = "least one alignment", "least one reported alignment"
 491 |             for log_file in input:
 492 |                 # load the whole log first since it is rewritten in place
 493 |                 with open(log_file) as handle:
 494 |                     content = handle.readlines()
 495 |                 with open(log_file, "w") as handle:
 496 |                     for entry in content:
 497 |                         handle.write(entry)
 498 |                         if old in entry:
 499 |                             handle.write(entry.replace(old, new))
 500 |             # write the (empty) file declared as output of this rule
 501 |             with open(output[0], "w") as flag:
 502 |                 flag.write("")
 503 |     expected_output += ["logs/fix_bowtie1/fix_bowtie1.log"]
 504 | 
 505 | 
 506 | # ========================================================== bowtie2 mapping
 507 | if manager.config.general.aligner == "bowtie2":
 508 | 
 509 |     rule bowtie2_mapping:
 510 |         input:
 511 |             fastq=__clean_fastq__output,
 512 |             idx=multiext(
 513 |             bowtie2_index,
 514 |             ".1.bt2",
 515 |             ".2.bt2",
 516 |             ".3.bt2",
 517 |             ".4.bt2",
 518 |             ".rev.1.bt2",
 519 |             ".rev.2.bt2",
 520 |         ),
 521 | 
 522 |         output:
 523 |             bam="{sample}/bowtie2/{sample}.sorted.bam",
 524 |         log:
 525 |             "{sample}/bowtie2/{sample}.log"
 526 |         params:
 527 |             options=config["bowtie2_mapping"]["options"],
 528 |         threads:
 529 |             config["bowtie2_mapping"]["threads"]
 530 |         container:
 531 |             config['apptainers']['sequana_tools']
 532 |         resources:
 533 |              **config['bowtie2_mapping']['resources']
 534 |         wrapper:
 535 |             f"{manager.wrappers}/wrappers/bowtie2/align"
 536 | 
 537 |     __mapping_output = "{sample}/bowtie2/{sample}.sorted.bam"
 538 | 
 539 | 
 540 | # ========================================================== star mapping
 541 | elif manager.config.general.aligner == "star":
 542 |     # Mapper rna-star
 543 | 
 544 |     rule star_mapping:
 545 |         input:
 546 |             fastq= __clean_fastq__output,
 547 |             reference=__fasta_file__,
 548 |             index= __star_index__done
 549 |         output:
 550 |             bam = "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam"
 551 |         params:
 552 |             options=config['star_mapping']['options'],
 553 |             # for legacy mapping, set the first and second pass options
 554 |             options_first_pass=config['star_mapping']['options'],
 555 |             options_second_pass=config['star_mapping']['options'],
 556 |             prefix = "{sample}/star_mapping/{sample}",
 557 |             legacy=True,
 558 |         threads:
 559 |             config['star_mapping']['threads']
 560 |         log:
 561 |             "{sample}/star_mapping/{sample}.log"
 562 |         container:
 563 |             config['apptainers']['sequana_tools']
 564 |         resources:
 565 |             **config['star_mapping']['resources']
 566 |         wrapper:
 567 |             f"{manager.wrappers}/wrappers/star/align"
 568 | 
 569 | 
 570 |     expected_output.extend(
 571 |         expand(
 572 |             "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam",
 573 |             sample=manager.samples
 574 |             )
 575 |     )
 576 | 
 577 |     __mapping_output = "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam"
 578 | 
 579 | # ========================================================== salmon mapping
 580 | 
 581 | elif manager.config.general.aligner == "salmon":
 582 |     # to be used later by salmon_to_features
 583 |     __salmon_mapping__output_counts = "{sample}/salmon_mapping/{sample}_quant.sf"
 584 | 
 585 |     rule salmon_mapping:
 586 |         input:
 587 |             fastq=__clean_fastq__output,
 588 |             index=genome_directory + f"/salmon{salmon_version}/salmon.done"
 589 |         output:
 590 |             quant="{sample}/salmon_mapping/{sample}_quant.sf"
 591 |         params:
 592 |             options=config['salmon_mapping']['options']
 593 |         threads:
 594 |             config['salmon_mapping']['threads']
 595 |         resources:
 596 |             **config["salmon_mapping"]['resources']
 597 |         container:
 598 |             config['apptainers']['salmon']
 599 |         log:
 600 |             "{sample}/salmon_mapping/salmon.log"
 601 |         wrapper:
 602 |             f"{manager.wrappers}/wrappers/salmon/align"  # was hard-coded to "add_salmon/..."; now resolved like every other wrapper
 603 | 
 604 | 
 605 |     expected_output.extend(expand("{sample}/salmon_mapping/{sample}_quant.sf", sample=manager.samples))
 606 |     # There is no BAM created
 607 |     __mapping_output = None
 608 | 
 609 | 
 610 | # ========================================================== add_read_group
 611 | # The input is the output of the mapping
 612 | # Add Read group on BAM files
 613 | if manager.config.general.aligner not in ['salmon']:
 614 |     rule add_read_group:
 615 |         input:
 616 |             __mapping_output
 617 |         output:
 618 |             "{sample}/add_read_group/{sample}.sorted.bam"
 619 |         log:
 620 |             "{sample}/add_read_group/{sample}.log"
 621 |         params:
 622 |             options=config["add_read_group"]["options"],
 623 |             SM="{sample}"
 624 |         container:
 625 |             config['apptainers']['sequana_tools']
 626 |         wrapper:
 627 |             f"{manager.wrappers}/wrappers/add_read_group"
 628 | 
 629 | 
 630 | 
 631 | # we always add read group so input is the read group output
 632 | # output is stored in __final_bam__
 633 | # duplicates can be from PCR or if SR, by pure chance.
 634 | # if Paired, most likely a PCR origin.
 635 | # Mark duplicates
 636 | if config["mark_duplicates"]["do"]:
 637 |     rule mark_duplicates:
 638 |         input:
 639 |             "{sample}/add_read_group/{sample}.sorted.bam"
 640 |         output:
 641 |             bam = "{sample}/mark_duplicates/{sample}.sorted.markdup.bam",
 642 |             metrics = "{sample}/mark_duplicates/{sample}.sorted.markdup.metrics",
 643 |         log:
 644 |             out = "{sample}/mark_duplicates/log.out",
 645 |             err = "{sample}/mark_duplicates/log.err"
 646 |         params:
 647 |             remove_dup = "false",
 648 |             tmpdir = "{sample}/mark_duplicates/tmp"
 649 |         container:
 650 |             config['apptainers']['sequana_tools']
 651 |         resources:
 652 |             **config['mark_duplicates']['resources']
 653 |         wrapper:
 654 |             f"{manager.wrappers}/wrappers/mark_duplicates"
 655 |     __final_bam__ = "{sample}/mark_duplicates/{sample}.sorted.markdup.bam"
 656 | elif manager.config.general.aligner not in ['salmon']:
 657 |     __final_bam__   = "{sample}/add_read_group/{sample}.sorted.bam"
 658 | else:
 659 |     __final_bam__ = []
 660 | 
 661 | 
 662 | # ====================================================================== generating bigwig files
 663 | if manager.config.bam_coverage.do is True and config['general']['aligner'] not in ['salmon']:
 664 | 
 665 |     rule bam_coverage:
 666 |         input: __final_bam__
 667 |         output:
 668 |             "{sample}/bam_coverage/{sample}.norm.bw"
 669 |         params:
 670 |             options = config['bam_coverage']["options"]
 671 |         log:
 672 |             "{sample}/bam_coverage/{sample}.log"
 673 |         threads:
 674 |             config['bam_coverage']['threads']
 675 |         container:
 676 |             config['apptainers']['sequana_tools']
 677 |         resources:
 678 |             **config['bam_coverage']['resources']
 679 |         wrapper:
 680 |             f"{manager.wrappers}/wrappers/deeptools/bam_coverage"
 681 | 
 682 |     expected_output.extend(
 683 |         expand(
 684 |             "{sample}/bam_coverage/{sample}.norm.bw",
 685 |             sample=manager.samples
 686 |             )
 687 |     )
 688 | 
 689 | 
 690 | # ============================================================= generating IGV plots
 691 | if manager.config.igvtools.do and config['general']['aligner'] not in ['salmon']:
 692 |     # if nothing provided, it must be an empty string
 693 |     if manager.config.igvtools.chrom_sizes_file.strip():
 694 |         pass
 695 |     else:
 696 |         config["igvtools"]["chrom_sizes_file"] = __fasta_file__
 697 | 
 698 |     rule igvtools:
 699 |         input: __final_bam__
 700 |         output:
 701 |             "{sample}/igvtools/{sample}.tdf"
 702 |         log:
 703 |             "{sample}/igvtools/{sample}.log"
 704 |         params:
 705 |             chromSize=config['igvtools']['chrom_sizes_file']
 706 |         container:
 707 |             config['apptainers']['igvtools']
 708 |         threads: 4
 709 |         shell:
 710 |             """
 711 |             igvtools count -z 5 -w 25 -f mean,max --includeDuplicates {input} {output} {params.chromSize}
 712 |             """
 713 |     expected_output.extend(expand("{sample}/igvtools/{sample}.tdf", sample=manager.samples))
 714 | 
 715 | 
 716 | # ===================================================================== Feature counts from subread suite
 717 | if manager.config.general.aligner == "salmon":
 718 |     __feature_counts__input = __salmon_mapping__output_counts
 719 | else :
 720 |     __feature_counts__input = __final_bam__
 721 | 
 722 | fc_outdir = "post_analysis/feature_counts/"
 723 | 
 724 | if manager.config.feature_counts.do and manager.config.general.aligner not in ['salmon']:
 725 |     # Guessing strandness is not always straightfoward; Even when we set it;
 726 |     # collaborators may want to look at the other options. So, we compute
 727 |     # everything with the 3 different options of strandness.
 728 |     # We will copy one of them based on our criteria, but all 3 will be
 729 |     # available
 730 | 
 731 |     # A comma-separated list of features is replaced by the label "custom"
 732 |     # (assume this is a custom GFF file); a single feature is used as is.
 733 |     feature_type = config['feature_counts']['feature']
 734 |     if "," in feature_type:
 735 |         feature_type = "custom"
 736 | 
 737 |     # Start from the user options, then append the optional flags
 738 |     fc_options = f" {config['feature_counts']['options']} "
 739 |     if config['feature_counts']['extra_attributes']:
 740 |         fc_options += " --extraAttributes {} ".format(config['feature_counts']['extra_attributes'])
 741 | 
 742 |     # paired-end data
 743 |     if manager.paired:
 744 |         fc_options += " -p "
 745 | 
 746 | 
 747 |     # ======================= calls feature counts 3 times Nsamples here below
 748 |     strand = [0,1,2]
 749 |     rule feature_counts:
 750 |         input:
 751 |             bam=__feature_counts__input,
 752 |             gff=__gff_file__
 753 |         output:
 754 |             counts="{sample}/feature_counts/{strand}/{sample}_feature.out",
 755 |             summary="{sample}/feature_counts/{strand}/{sample}_feature.out.summary"
 756 |         params:
 757 |             options=fc_options,
 758 |             feature=feature_type,
 759 |             attribute=config['feature_counts']["attribute"],
 760 |             strandness="{strand}"
 761 |         threads:
 762 |             config["feature_counts"]['threads']
 763 |         container:
 764 |             config['apptainers']['sequana_tools']
 765 |         log:
 766 |             "{sample}/feature_counts/{strand}/feature_counts.log"
 767 |         wrapper:
 768 |             f"{manager.wrappers}/wrappers/feature_counts"
 769 | 
 770 | 
 771 |     # ===================== guessing the strand
 772 |     #
 773 |     __guess_strandness__output = expand(fc_outdir + "{sample}_feature.out", sample=manager.samples)
 774 |     rule guess_strandness:
 775 |         """Guessing strandness"""
 776 |         input:
 777 |             counts = expand("{sample}/feature_counts/{strand}/{sample}_feature.out", sample=manager.samples, strand=[0,1,2])
 778 |         output:
 779 |             data=__guess_strandness__output,
 780 |             summary=manager.globals['strand_summary']
 781 |         run:
 782 |             # We compute all strandness
 783 |             import sequana.featurecounts as fc
 784 | 
 785 |             mfc = fc.MultiFeatureCount(rnaseq_folder=".",
 786 |                         tolerance=manager.config.feature_counts.tolerance)
 787 |             mfc.df.to_csv(output.summary)
 788 | 
 789 |             try:
 790 |                 mfc.plot_strandness(savefig=True, output_filename="outputs/strand_summary.png")
 791 |             except Exception:  # a failing plot only triggers a warning, the rule goes on
 792 |                 logger.warning("Could not create plot_strandness")
 793 | 
 794 |             logger.info(f"strandness inference: {mfc.probable_strand}")
 795 |             msg = f"This is {mfc.probable_strand} data (check in the multiqc report)"
 796 |             if mfc.probable_strand in [0, 1, 2]:
 797 |                 choice = mfc.probable_strand
 798 |                 logger.info(msg)
 799 |             else:
 800 |                 logger.warning("Strandness is apparently neither of 0, 1, 2")
 801 |                 logger.warning("you will need to copy the feature counts files yourself in ./feature_counts")
 802 |                 choice = -1
 803 | 
 804 |             # If the user knows what he/she wants we overwrite the choice. The value is compared
 805 |             # to None/False/"" instead of relying on truthiness so that an explicit 0 is honoured.
 806 |             user_choice = config["feature_counts"].get("strandness")
 807 |             if user_choice is not None and user_choice is not False and str(user_choice).strip():
 808 |                 user_choice = int(user_choice)
 809 |                 if user_choice in [0,1,2]:
 810 |                     choice = user_choice
 811 |                 else:
 812 |                     logger.error(f"strandness in the config file must be 0,1,2. You gave {user_choice}")
 813 |                     sys.exit(1)
 814 | 
 815 |             if choice in {0, 1, 2}:
 816 |                 for filename in input.counts:
 817 |                     if f"feature_counts/{choice}/" in filename:
 818 |                         shell(f"cp {filename} {fc_outdir}")
 819 |                         shell(f"cp {filename}.summary {fc_outdir}")
 820 |             else:
 821 |                 # if not clear, we copy everything and users should clean up the directory.
 822 |                 # 'counts' is the only named input of this rule: input.fc0/fc1/fc2 were
 823 |                 # never declared and raised an AttributeError. Files of the three
 824 |                 # strands share the same basename, so the last one copied wins.
 825 |                 for filename in input.counts:
 826 |                     shell("cp {} {}".format(filename, fc_outdir))
 827 |                     shell("cp {}.summary {}".format(filename, fc_outdir))
 828 | 
 829 | elif manager.config.feature_counts.do and manager.config.general.aligner in ['salmon']:
 830 | 
 831 |     __salmon_to_features__output = fc_outdir + "{sample}_feature.out"
 832 |     rule salmon_to_features:
 833 |         input: __salmon_mapping__output_counts
 834 |         output: __salmon_to_features__output
 835 |         params:
 836 |             gff=__gff_file__
 837 |         shell:
 838 |             """sequana salmon --input {input} --output {output} --gff {params.gff} --attribute ID   """
 839 |     expected_output += expand(__salmon_to_features__output, sample=manager.samples)
 840 | 
 841 | 
 842 | # ==================================================================== Merge feature counts
 843 | 
 844 | if manager.config.general.aligner in ['salmon']:
 845 |     __guess_strandness__output = expand(__salmon_to_features__output, sample=manager.samples)
 846 | rule merge_feature_counts:
 847 |     input: __guess_strandness__output
 848 |     output: "post_analysis/all_features.out"
 849 |     run:
 850 |         from sequana.featurecounts import FeatureCountMerger
 851 |         fcm = FeatureCountMerger(fof=input)
 852 |         fcm.to_tsv(output[0])
 853 | expected_output.append("post_analysis/all_features.out")
 854 | 
 855 | # ================================================================== rseqc diag tool
 856 | 
 857 | 
 858 | if config['rseqc']['do']:
 859 | 
 860 |     rule gff2bed:
 861 |         input:
 862 |             gff=__gff_file__
 863 |         output:
 864 |             bed="tmp/temp.bed" # config['rseqc'].get('bed_file', "tmp/temp.bed")
 865 |         message: "Build BED file from GFF using Sequana"
 866 |         run:
 867 |             from sequana import GFF3
 868 |             g = GFF3(input[0])
 869 |             g.to_bed(output[0], 'Name')
 870 | 
 871 |     rule rseqc:
 872 |         input:
 873 |             bam=__final_bam__,
 874 |             bed=rules.gff2bed.output.bed
 875 |         # no need to put all outputs. The first command truncates the log, the next ones append to it
 876 |         output:
 877 |             bam_stat= "{sample}/rseqc/{sample}_bam_stat.txt",
 878 |             #read_gc="{sample}/rseqc/{sample}.GC.xls",
 879 |             geneBody_coverage="{sample}/rseqc/{sample}.geneBodyCoverage.txt"
 880 |         params:
 881 |             paired="PE" if manager.paired else "SE"
 882 |         log:
 883 |             "{sample}/rseqc/{sample}.log"
 884 |         container:
 885 |             config['apptainers']['sequana_tools']
 886 |         shell:
 887 |             """
 888 |             # for paired data only
 889 |             inner_distance.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed} &>{log}
 890 | 
 891 |             # For now GC not very useful in the output so commented
 892 |             # read_GC.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} &>{log}
 893 | 
 894 |             # genebody coverage
 895 |             geneBody_coverage.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed} &>>{log}
 896 | 
 897 |             # uses bigwig redundant with geneBody_coverage
 898 |             # geneBody_coverage2.py -i {wildcards.sample}/bamCoverage/{wildcards.sample}.norm.bw  -o {wildcards}/rseq/{wildcards.sample} -r test.bed &>{log}
 899 | 
 900 |             # Not included in the multiqc module so commented for now
 901 |             #clipping_profile.py -i {input.bam} -s {params.paired} -o {wildcards.sample}/rseqc/{wildcards.sample}
 902 |             read_duplication.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} &>>{log}
 903 |             junction_annotation.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample} -r {input.bed}  &>>{log}
 904 |             junction_saturation.py -i {input.bam} -o {wildcards.sample}/rseqc/{wildcards.sample}  -r {input.bed} &>>{log}
 905 |             infer_experiment.py -i {input.bam} -r {input.bed} > {wildcards.sample}/rseqc/{wildcards.sample}.infer.txt 2>>{log}
 906 | 
 907 |             # bam stats  KEEP last since that is the expected output to make sure
 908 |             # previous files (not listed in output:) are computed first.
 909 |             bam_stat.py -i {input.bam} > {output.bam_stat} 2>>{log}
 910 |             """
 911 | 
    # Requesting the bam_stat report of each sample is enough: bam_stat.py is
    # the last command run by the rule above, so the other rseqc files (not
    # listed in its output) have been produced by the time this one exists.
    expected_output.extend(
        expand("{sample}/rseqc/{sample}_bam_stat.txt", sample=manager.samples)
    )
 916 | 
 917 | # ========================================================== RNAseqc diag tool
 918 | # No need for mark_duplicates for RNASEQC . Just use the BAM file
 919 | if config["rnaseqc"]["do"] and config['general']['aligner'] != 'salmon':
 920 | 
 921 | 
 922 |     # for multiqc, important that output directories are called rnaseqc
 923 |     __gtf_file__ = config['rnaseqc']['gtf_file'].strip()
 924 | 
 925 |     # Could be a local file. If provided,
 926 |     if __gtf_file__:
 927 |         if os.path.exists(__gtf_file__) is False:
 928 |             logger.error(f"{__gtf_file__} not found")
 929 |             sys.exit(1)
 930 |     else: # if gtf not provided, maybe available in the genome directory ?
 931 |         __gtf_file__   = __prefix_name__ + ".gtf"
 932 |         if os.path.exists(__gtf_file__) is False:
 933 |             gdir = config["general"]["genome_directory"]
 934 |             from sequana import logger as logs  # to not interfere with snakemake
 935 |             logs.critical(f"{__gtf_file__} not found in {gdir}. Trying the GFF file")
 936 |             __gtf_file__ = __prefix_name__ + ".gff"
 937 | 
 938 | 
 939 |     rule rnaseqc_fixup:
 940 |         input:
 941 |             gtf = __gtf_file__
 942 |         output:
 943 |             gtf = temp("tmp/test.gtf")
 944 |         run:
 945 |             # If input GTF has no exon or genes, an error message is printed and
 946 |             # no files are created. This seems to be an issue in rnaseqc.
 947 |             # So, we create dummy gene and exon
 948 |             with open(output.gtf, "w") as ff:
 949 |                 ff.write(open(input['gtf'], "r").read())
 950 |                 ff.write('myCHR\tSGD\tgene\t0\t0\t.\t+\t0\tgene_id "dummy"\n')
 951 |                 ff.write('myCHR\tSGD\texon\t0\t0\t.\t+\t0\texon_id "dummy"\n')
 952 |                 ff.close()
 953 | 
    # Run rnaseqc for each sample on the BAM designated by __final_bam__, using
    # the patched annotation written by rnaseqc_fixup. Results go to the
    # {sample}/rnaseqc directory; stdout and stderr are captured in the log.
    rule rnaseqc:
        input:
            bam = __final_bam__,
            gtf = rules.rnaseqc_fixup.output.gtf
        output:
            metrics = "{sample}/rnaseqc/{sample}.metrics.tsv"
        log:
            "{sample}/rnaseqc/{sample}.log",
        params:
            directory = "{sample}/rnaseqc",
            options= config['rnaseqc']['options']
        resources:
            **config["rnaseqc"]["resources"]
        container:
            config["apptainers"]["rnaseqc"]
        shell:
            """
            rnaseqc {input.gtf} {input.bam} {params.directory} -s {wildcards.sample} {params.options} &>{log}
            """

    # Request the metrics file of every sample (expected_output is the input
    # of the multiqc rule).
    expected_output.extend(expand("{sample}/rnaseqc/{sample}.metrics.tsv", sample=manager.samples))
 975 | 
 976 | 
# ========================================================== multiqc
# Options handed over to multiqc. When a multiqc configuration file is set in
# the config, it is appended to the options with the -c flag.
multiqc_params_options = config['multiqc']['options']
if manager.config.multiqc.config_file:
    multiqc_params_options += f" -c {manager.config.multiqc.config_file}"



# Build the multiqc report once every file collected in expected_output is
# available. The actual command is delegated to the "multiqc" wrapper.
rule multiqc:
    input:
        expected_output
    output:
       "multiqc/multiqc_report.html"
    params:
        options=multiqc_params_options,
        input_directory=config['multiqc']['input_directory'],
        config_file=config['multiqc']['config_file'],
        modules=config['multiqc']['modules']
    log:
        "multiqc/multiqc.log"
    resources:
        **config['multiqc']['resources']
    container:
        config["apptainers"]["multiqc"]
    wrapper:
       f"{manager.wrappers}/wrappers/multiqc"
1002 | 
1003 | # ========================================================== rulegraph
1004 | 
# Write the rule graph of this Snakefile as a DOT file; the work is delegated
# to the "rulegraph" wrapper.
# NOTE(review): "mapper" presumably links the multiqc node of the graph to its
# HTML report -- confirm against the wrapper.
rule rulegraph:
    input:
        workflow.snakefile,
    output:
        "rulegraph/rulegraph.dot",
    params:
        configname="config.yaml",
        mapper = {"multiqc": "../multiqc/multiqc_report.html"},
    wrapper:
        f"{manager.wrappers}/wrappers/rulegraph"
1015 | 
1016 | 
# Convert the DOT rule graph into the SVG image stored in the .sequana
# directory, using the "dot" command.
rule dot2svg:
    input:
        "rulegraph/rulegraph.dot"
    output:
        ".sequana/rulegraph.svg"
    container:
        config['apptainers']['graphviz']
    shell:
        """dot -Tsvg {input} -o {output}"""
1026 | 
1027 | 
1028 | 
1029 | rule prepare_DGE_analysis:
1030 |     input:
1031 |         features="post_analysis/all_features.out",
1032 |     output:
1033 |         rnadiff="post_analysis/rnadiff.sh",
1034 |         design="post_analysis/design.csv"
1035 |     run:
1036 | 
1037 | 	    # ------------------------------------------- RNADIFF
1038 | 	    # 1. save data for the RNADiff analysis
1039 | 	    from sequana.featurecounts import FeatureCount
1040 | 	    try:
1041 | 	        fc = FeatureCount(input[0], guess_design=True)
1042 | 	        fc.design_df.to_csv(output[1], index=False)
1043 | 	    except:
1044 | 	        msg = "Could not build the design.csv file in rnadiff. You will need to create it manually."
1045 | 	        logger.warning(msg)
1046 | 	        with open("post_analysis/README.rst", "w") as fout:
1047 | 	            fout.write(f"""{msg}
1048 | 	The design.csv file must be formatted as follows (for 2 conditions with 3 replicates each):
1049 | 
1050 | 	label,condition
1051 | 	samplename_1,condition_name_1
1052 | 	samplename_2,condition_name_1
1053 | 	samplename_3,condition_name_1
1054 | 	samplename_4,condition_name_2
1055 | 	samplename_5,condition_name_2
1056 | 	samplename_6,condition_name_2
1057 | 	""")
1058 | 
1059 | 	    # 2. save the script
1060 | 	    with open(output.rnadiff, "w") as fout:
1061 | 	        attribute = config['feature_counts']['attribute']
1062 | 	        feature = config['feature_counts']['feature']
1063 | 	        fout.write("#/bin/sh\nsequana rnadiff --features all_features.out " +
1064 | 	            f" --annotation-file {__gff_file__} --design design.csv --feature-name {feature} --attribute-name {attribute}")
1065 | 	    shell(f"chmod 755 {output.rnadiff}")
1066 | 
1067 | 
1068 | 
# These rules take only a couple of seconds, so there is no need to submit
# them to a cluster.
localrules:  rulegraph, prepare_DGE_analysis
1071 | 
1072 | 
1073 | onsuccess:
1074 |     # Create plots about stats
1075 |     from sequana import logger as log
1076 |     from sequana.modules_report.summary import SequanaReport
1077 | 
1078 |     import colorlog
1079 |     log = colorlog.getLogger("sequana.rnaseq")
1080 |     log.setLevel("INFO")
1081 |     manager.teardown(
1082 |         extra_files_to_remove=["requirements.txt"],
1083 |         extra_dirs_to_remove=[".genomes", "tmp", "logs"])
1084 |     manager.clean_multiqc("multiqc/multiqc_report.html")
1085 | 
1086 | 
1087 |     try:
1088 |         import pandas as pd
1089 |         df = pd.read_csv(manager.globals['strand_summary'])
1090 |         guess = df['strand'].value_counts().idxmax()
1091 |         names = {0: 'stranded', 1: 'unstranded', 2: 'reversely stranded'}
1092 |         guess = names[guess]
1093 |     except Exception as err:
1094 |         if config['general']['aligner'] == "salmon":
1095 |             logger.info("Salmon aligner used. No strandness information available")
1096 |         else:
1097 |             logger.warning(err)
1098 |             guess = "?"
1099 | 
1100 | 
1101 |     intro = f"""
1102 |     <h2>Overview</h2>
1103 |     <p>
1104 |     The RNA-seq pipeline maps the reads on the provided reference (called <i>{__fasta_file__.split("/")[-1]}</i>). Features counts were extracted and are available in the <a href="./post_analysis/feature_counts">feature counts</a> directory; those files are entry points for differential gene expression analysis. The differential analysis, if performed, should be available in the <a href="post_analysis/rnadiff/">DGE analysis directory</a>. In addition, if enrichment was performed (GO or Kegg pathways), it should be available in the <a href="post_analysis"></a> directory as well.
1105 |     </p>
1106 | 
1107 |     <p>A <a href="multiqc/multiqc_report.html">multiqc report</a> is available, where various QC and mapping quality plots can be visualised. Some important plots are also in the HTML page here below.
1108 |     </p>"""
1109 | 
1110 | 
1111 |     rRNA_done = True
1112 |     intro += """<h2>ribosomal / contaminant content</h2>"""
1113 |     try:
1114 |         if os.path.exists("multiqc/multiqc_report_data/multiqc_bowtie1.txt"):
1115 |             df = pd.read_csv("multiqc/multiqc_report_data/multiqc_bowtie1.txt", sep='\t')
1116 |         elif os.path.exists("multiqc/multiqc_data/multiqc_bowtie1.txt"):
1117 |             df = pd.read_csv("multiqc/multiqc_data/multiqc_bowtie1.txt", sep='\t')
1118 |         else:
1119 |             rRNA_done = False
1120 | 
1121 |         if rRNA_done:
1122 |             if "reads_aligned_percentage" in df.columns:
1123 |                 rRNA = int(df.reads_aligned_percentage.mean()*100)/100
1124 |             else:
1125 |                 rRNA = int((100 - df.not_aligned_percentage.mean())*100)/100
1126 | 
1127 |             if rRNA < 10:
1128 |                 intro += f"<p>rRNA content (or contaminant provided) represents {rRNA}%, which is low (as expected). "
1129 |             elif rRNA < 20:
1130 |                 intro += f"<p>rRNA content (or contaminant provided) represents {rRNA}%, which is moderately low. "
1131 |             elif rRNA > 20 and rRNA <50:
1132 |                 rRNA += f"<p>rRNA content (or contaminant provided) represents {rRNA}%, which is relatively high. "
1133 |             elif rRNA >= 50:
1134 |                 rRNA += f"<p>rRNA content (or contaminant provided) represents {rRNA}%, which is very high. "
1135 |         else:
1136 |             intro += f"<p>rRNA content not computed (no rRNA gene or contaminant provided)</p>"
1137 | 
1138 |     except Exception as err:
1139 |         print(err)
1140 |         pass
1141 | 
1142 |     try:
1143 |         from sequana.multiqc.plots import Bowtie1Reader
1144 |         if rRNA_done:
1145 |             filename = "multiqc/multiqc_data/multiqc_bowtie1.txt"
1146 |             if not os.path.exists(filename):
1147 |                 filename = "multiqc/multiqc_report_data/multiqc_bowtie1.txt"
1148 |             br = Bowtie1Reader(filename)
1149 |             br.df.Sample = [str(x).replace("_bowtie1","") for x in br.df.Sample]
1150 |             fig = br.plot_bar(html_code=True)
1151 |             from plotly import offline
1152 |             intro += offline.plot(fig, output_type="div", include_plotlyjs=True)
1153 |     except Exception as err:
1154 |         print(err)
1155 | 
1156 | 
1157 |     # Include the bowtie plot
1158 |     intro += """<h2>Mapping rate</h2>"""
1159 |     if config['general']['aligner'] == "bowtie2":
1160 |         from sequana.multiqc.plots import Bowtie2
1161 |         if not manager.paired:
1162 |             filename = "multiqc/multiqc_data/mqc_bowtie2_se_plot_1.txt"
1163 |             if not os.path.exists(filename):
1164 |                 filename = "multiqc/multiqc_report_data/mqc_bowtie2_se_plot_1.txt"
1165 |                 if not os.path.exists(filename):
1166 |                     filename = "multiqc/multiqc_data/multiqc_bowtie2.txt"
1167 |         else:
1168 |             filename = "multiqc/multiqc_data/mqc_bowtie2_pe_plot_1.txt"
1169 |             if not os.path.exists(filename):
1170 |                 filename = "multiqc/multiqc_report_data/mqc_bowtie2_pe_plot_1.txt"
1171 |                 if not os.path.exists(filename):
1172 |                     filename = "multiqc/multiqc_data/multiqc_bowtie2.txt"
1173 |         br = Bowtie2(filename)
1174 |         fig = br.plot(html_code=True)
1175 |         from  plotly import offline
1176 |         intro += """<p>The mapping was performed with bowtie2. Here below are the percentage of mapping for each sample. See also the multiqc report.</p>""" + offline.plot(fig, output_type="div", include_plotlyjs=True)
1177 | 
1178 |     elif config["general"]["aligner"] == "star":
1179 |         from sequana.multiqc.plots import STAR
1180 |         filename = "multiqc/multiqc_report_data/multiqc_star.txt"
1181 |         if not os.path.exists(filename):
1182 |             filename = "multiqc/multiqc_data/multiqc_star.txt"
1183 |         br = STAR(filename)
1184 |         fig = br.plot(html_code=True)
1185 |         from  plotly import offline
1186 |         intro += """<p>The mapping was performed with STAR. Here below are the percentage of mapping for each sample. See also the multiqc report.</p>""" + offline.plot(fig, output_type="div", include_plotlyjs=True)
1187 | 
1188 |     try:
1189 |         from sequana.multiqc.plots import FeatureCounts
1190 |         intro += """<h2>Annotation rate</h2>"""
1191 |         filename = "multiqc/multiqc_report_data/mqc_featureCounts_assignment_plot_1.txt"
1192 |         if not os.path.exists(filename):
1193 |             filename = "multiqc/multiqc_data/mqc_featureCounts_assignment_plot_1.txt"
1194 |             # multiqc 1.27
1195 |             if not os.path.exists(filename):
1196 |                 filename = "multiqc/multiqc_data/multiqc_featurecounts.txt"
1197 | 
1198 |         br = FeatureCounts(filename)
1199 |         fig = br.plot(html_code=True)
1200 |         from  plotly import offline
1201 |         intro += """<p>The annotation was performed with subread/feature counts software. Here below is the percentage of reads assigned to the requested feature (usually gene; see the config file here below). </p>""" + offline.plot(fig, output_type="div", include_plotlyjs=True)
1202 |     except Exception as err:
1203 |         print(err)
1204 | 
1205 |     if manager.globals['strand_summary']:
1206 |         intro += """<h2>Strandness</h2>"""
1207 |         intro+="""
1208 |         <p>Here below is a QC plot related to the strandness found for each samples. The red dotted lines indicate a tolerance. The 0.5 vertical line correspond to an <b>unstranded</b> case. A value close to 0 indicates a <b>reversely stranded</b> case, and a value close to 1 indicates a <b>stranded</b> case.
1209 |     """.format(config["general"]['aligner'], guess)
1210 | 
1211 |         if "strandness" in config['feature_counts'] and config["feature_counts"]["strandness"]:
1212 |             choice = config["feature_counts"]["strandness"]
1213 |             intro += "User decided to set strandness to: {} </p>".format(choice)
1214 |         else:
1215 |             intro += "Strandness was guessed from the data. </p>"
1216 | 
1217 |         image = SequanaReport.png_to_embedded_png("strand", "outputs/strand_summary.png",
1218 |                      style="width:80%; height:40%")
1219 |         intro += image
1220 | 
1221 |     intro+=    """<h2>Differential analysis</h2>
1222 |     <p>
1223 |     Differentially expressed genes analysis is not performed automatically with this pipeline. However, information, feature counts, and other materials can be found in the directory <a href="./post_analysis/.">post_analysis</a> where the standalone 'sequana rnadiff' can be used with DeSEq2. Most probably an analysis is present. If so, please open the directory in  <a href="post_analysis/rnadiff/">rnadiff</a> report.
1224 |     </p>
1225 |     """
1226 | 
1227 |     # Now the final report. add the original command in the HTML report
1228 |     data = manager.getmetadata()
1229 |     s = SequanaReport(data, intro)
1230 | 
1231 |     shell("chmod -R g+w .")
1232 |     shell("rm -rf rulegraph")
1233 | 
# When the workflow fails, delegate the error handling to the pipeline manager.
onerror:
    manager.onerror()
1236 | 


--------------------------------------------------------------------------------