├── .gitignore ├── 00container └── rnaseq.def ├── LICENSE ├── README.md ├── exercise00 └── README.md ├── exercise01 ├── README.md ├── Snakefile.finished └── aln.sh ├── exercise02 ├── README.md ├── Snakefile ├── Snakefile.finished └── myprofile │ └── config.yaml ├── exercise03 ├── README.md ├── Snakefile ├── Snakefile.finished └── myprofile │ └── config.yaml ├── exercise04 ├── README.md ├── Snakefile ├── Snakefile.finished └── myprofile │ └── config.yaml ├── exercise05 ├── README.md ├── Snakefile └── Snakefile.finished ├── exercise06 ├── README.md ├── Snakefile ├── dag.png └── rulegraph.png ├── setup.sh └── setup.smk /.gitignore: -------------------------------------------------------------------------------- 1 | exercise*/rnaseq 2 | exercise*/00* 3 | exercise*/01qc 4 | exercise*/02aln 5 | exercise*/03track 6 | exercise*/04count 7 | exercise*/05salmon 8 | 00container/*.simg 9 | 00container/*.sif 10 | */config.yml 11 | .snakemake 12 | .*swp 13 | 00fastq 14 | 00ref 15 | logs 16 | bwprofile 17 | -------------------------------------------------------------------------------- /00container/rnaseq.def: -------------------------------------------------------------------------------- 1 | BootStrap: library 2 | From: hpc-staff/base-images/rocky8 3 | 4 | ################################################################################ 5 | %labels 6 | CONTAINER_VERSION=0.8 7 | 8 | ################################################################################ 9 | %post 10 | 11 | # basic packages 12 | dnf update -y 13 | dnf install -y zip wget bzip2 tar 14 | 15 | # conda installation 16 | mkdir -p /opt 17 | cd /opt 18 | wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 19 | bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda 20 | . 
/opt/conda/etc/profile.d/conda.sh 21 | 22 | 23 | # conda environment 24 | conda create -y -n rnaseq 25 | conda activate rnaseq 26 | conda config --env --set channel_priority strict 27 | conda config --env --add channels bioconda 28 | conda config --env --add channels conda-forge 29 | conda install -qy \ 30 | samtools=1.17 \ 31 | bcftools=1.17 \ 32 | fastqc=0.11.9 \ 33 | rseqc=5.0.1 \ 34 | stringtie=2.1.4 \ 35 | trimmomatic=0.39 \ 36 | hisat2=2.2.1 \ 37 | gffutils \ 38 | ucsc-bigwigtowig \ 39 | ucsc-wigtobigwig \ 40 | blas=*=openblas \ 41 | qualimap \ 42 | 'r-base>=4' \ 43 | bioconductor-deseq2 \ 44 | bioconductor-edger \ 45 | bioconductor-rsamtools \ 46 | r-tidyverse 47 | 48 | 49 | # $SINGULARITY_ENVIRONMENT points to an environment file in /.singularity.d/env with high precedence 50 | printf 'PATH="${PATH}:/opt/conda/envs/rnaseq/bin"\n' >> $SINGULARITY_ENVIRONMENT 51 | 52 | 53 | 54 | ### 55 | ### install salmon, and subread from binary packages 56 | ### 57 | 58 | sver=1.10.0 59 | wget --quiet https://github.com/COMBINE-lab/salmon/releases/download/v${sver}/salmon-${sver}_linux_x86_64.tar.gz 60 | mkdir -p salmon/${sver} 61 | tar -xzf salmon-${sver}_linux_x86_64.tar.gz -C salmon/${sver} --strip-components=1 62 | printf 'PATH="/opt/salmon/%s/bin:${PATH}"\n' $sver >> $SINGULARITY_ENVIRONMENT 63 | rm -f salmon-${sver}_linux_x86_64.tar.gz 64 | 65 | rver=2.0.6 66 | wget --quiet -O subread-${rver}-Linux-x86_64.tar.gz \ 67 | https://sourceforge.net/projects/subread/files/subread-${rver}/subread-${rver}-Linux-x86_64.tar.gz/download 68 | mkdir -p subread/${rver} 69 | tar -xzf subread-${rver}-Linux-x86_64.tar.gz -C subread/${rver} --strip-components=1 70 | printf 'PATH="/opt/subread/%s/bin:${PATH}"\n' $rver >> $SINGULARITY_ENVIRONMENT 71 | rm -f subread-${rver}-Linux-x86_64.tar.gz 72 | 73 | 74 | # clean up so image can be shrunk 75 | conda clean -pt --yes 76 | dnf clean all 77 | 78 | # set up the environment for the container 79 | ################################################################################ 80 | %environment 81 | 82 | export PS1="Singularity.$SINGULARITY_CONTAINER> $PS1" 83 | export LANG=en_US.UTF-8 84 | 85 | ################################################################################ 86 | %runscript 87 | 88 | # In this case using the runscript only for self documentation. 89 | 90 | cat < sinteractive --cpus-per-task=12 --mem=24g --gres=lscratch:20 18 | ... 19 | user@cn1234> ## change to a suitable directory somewhere in /data 20 | user@cn1234> cd /data/$USER 21 | user@cn1234> module load git 22 | user@cn1234> git clone https://github.com/NIH-HPC/snakemake-class.git 23 | user@cn1234> cd snakemake-class 24 | ``` 25 | 26 | Setup will be different on other systems. 27 | 28 | When you are ready run the setup script to fetch data and create all files 29 | necessary for the exercies. 30 | 31 | ``` 32 | user@cn1234> ./setup.sh 33 | ... 34 | +------------------------------------------------------------------------------+ 35 | | | 36 | | Class materials have been set up successfully | 37 | | | 38 | +------------------------------------------------------------------------------+ 39 | ``` 40 | 41 | The `setup.smk` in the root directory of the repo takes care of all the setup 42 | required for class. It also serves as another example for a Snakemake workflow. 
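If the generated files are ever removed (for example with the `clean` rule), the setup workflow can also be invoked directly; a minimal sketch of what `setup.sh` does under the hood, assuming the snakemake module is loaded and the `bwprofile` directory already exists:

```console
user@cn1234> export SNAKEMAKE_PROFILE=$PWD/bwprofile
user@cn1234> snakemake -s setup.smk setup
```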
43 | 44 | ## Exercises 45 | 46 | * [Exercise 0 - Singularity refresher](/exercise00/) 47 | * [Exercise 1 - Basic snakemake rules](/exercise01/) 48 | * [Exercise 2 - Parallelizing pipelines](/exercise02/) 49 | * [Exercise 3 - Adding more rules](/exercise03/) 50 | * [Exercise 4 - Configuring workflows](/exercise04/) 51 | * [Exercise 5 - Running on an HPC cluster](/exercise05/) * [Exercise 6 - Visualizing workflows](/exercise06/) 52 | -------------------------------------------------------------------------------- /exercise00/README.md: -------------------------------------------------------------------------------- 1 | ## Exercise 00 - Singularity refresher. 2 | 3 | In this tutorial we will be using tools packaged into a singularity container 4 | with our workflows. This helps portability and reproducibility of workflows. 5 | Exercise 00 is a brief introduction to singularity. While not strictly required 6 | for this class, it is useful to understand the basics. 7 | 8 | For more details see the official [Singularity documentation](https://sylabs.io/guides/latest/user-guide/) 9 | and our [Singularity overview](https://hpc.nih.gov/apps/singularity.html). 10 | 11 | The container used in the exercises is `library://wresch/classes/rnaseq` and 12 | should have been cached locally during setup. After running the setup script it 13 | can be found in the `00container` directory in the root of the repository along with 14 | its definition file: 15 | 16 | ``` 17 | REPO_ROOT/00container/ 18 | -rwxr-xr-x 1 user group 1.3G Feb 10 12:45 2354d2ff28bcf0b42c57fae398b4c9b5.simg 19 | lrwxrwxrwx 1 user group 37 Sep 26 14:31 rnaseq.sif -> 2354d2ff28bcf0b42c57fae398b4c9b5.simg 20 | -rw-r--r-- 1 user group 3.1K Feb 10 20:02 rnaseq.def 21 | ``` 22 | 23 | It contains a number of tools that can be used in an RNA-Seq analysis (hisat2, 24 | salmon, samtools, subread, R, ...). If you wanted to manually fetch this 25 | container you could do so with 26 | 27 | ```console 28 | user@cn1234> singularity pull library://wresch/classes/rnaseq 29 | ``` 30 | 31 | ### What is Singularity 32 | 33 | > SingularityCE is a container platform. It allows you to create and run 34 | > containers that package up pieces of software in a way that is portable and 35 | > reproducible. You can build a container using SingularityCE on your laptop, and 36 | > then run it on many of the largest HPC clusters in the world, local university 37 | > or company clusters, a single server, in the cloud, or on a workstation down 38 | > the hall. Your container is a single file, and you don’t have to worry about 39 | > how to install all the software you need on each different operating system. 40 | > 41 | > _From_ [SingularityCE introduction](https://docs.sylabs.io/guides/latest/user-guide/introduction.html) 42 | 43 | Containers in general package software and all of its dependencies into a single unit 44 | that can be executed in a more or less isolated environment. Singularity and 45 | the closely related Apptainer in particular are an implementation of container 46 | technology geared towards scientific computing. 47 | 48 | ### Recap of basic singularity commands 49 | 50 | If you are running this on the NIH HPC cluster, please make sure this and all 51 | the following exercises are done within an interactive session with the 52 | singularity and snakemake modules loaded: 53 | 54 | ```console 55 | loginnode$ sinteractive --cpus-per-task=12 --gres=lscratch:20 --mem=24g 56 | ...
57 | user@cn1234> module load singularity snakemake 58 | [+] Loading singularity 4.0.3 on cn4301 59 | [+] Loading snakemake 7.32.4 60 | 61 | user@cn1234> 62 | ``` 63 | 64 | Singularity allows you to create and use single-file containers. You can run 65 | programs inside a container transparently: I/O can be redirected, arguments 66 | passed, and files accessed; ports on the host system can be reached from within a container; 67 | and a user retains their identity inside the container. 68 | 69 | The singularity command includes many subcommands and options. Use 70 | 71 | ```console 72 | user@cn1234> singularity help 73 | ``` 74 | 75 | to get more information about this tool, which is the main way of interacting 76 | with singularity containers. 77 | 78 | Singularity container files, in addition to being used with the singularity command, 79 | can also be made executable. When executed in this way, the runscript inside the 80 | container is executed, which in our example prints a simple message. 81 | 82 | ```console 83 | user@cn1234> # define a variable containing the full path to the container for convenience 84 | user@cn1234> container=$(cd .. && pwd)/00container/rnaseq.sif 85 | user@cn1234> $container 86 | ------------------------------------------------------------ 87 | rnaseq - rnaseq pipeline tools version 0.8 88 | ------------------------------------------------------------ 89 | 90 | This container encapsulates tools for RNA-Seq analysis. 91 | It is intended for creating reproducible pipelines. 92 | 93 | ``` 94 | 95 | `singularity exec` is used to execute programs inside the container 96 | 97 | ```console 98 | user@cn1234> singularity exec $container samtools index 99 | Usage: samtools index [-bc] [-m INT] <in.bam> [out.index] 100 | Options: 101 | -b Generate BAI-format index for BAM files [default] 102 | -c Generate CSI-format index for BAM files 103 | -m INT Set minimum interval size for CSI indices to 2^INT [14] 104 | -@ INT Sets the number of threads [none] 105 | ``` 106 | 107 | By default, programs running inside a container have minimal access 108 | to the file systems outside the container. Let's start a shell inside our 109 | example container and try to access the /fdb directory, for example: 110 | 111 | ```console 112 | user@cn1234> singularity shell $container 113 | Singularity> ls -lh /fdb 114 | ls: cannot access '/fdb': No such file or directory 115 | ``` 116 | 117 | The only external paths accessible are your home directory and the current 118 | directory (`/data/username/snakemake-class/exercise00` in this example): 119 | 120 | ```console 121 | Singularity> pwd 122 | /data/username/snakemake-class/exercise00 123 | Singularity> cd ~ 124 | Singularity> pwd 125 | /home/username 126 | Singularity> ls 127 | Desktop bin temp ... 128 | Singularity> exit 129 | user@cn1234> 130 | ``` 131 | 132 | However, other paths can be made visible to processes inside the container via 133 | what is called a bind mount. Bind mounts can be defined on the command line with `-B` 134 | or by setting the `SINGULARITY_BINDPATH` variable.
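Exporting the variable yourself has the same effect as listing the same paths with `-B`; a small sketch with placeholder paths:

```console
user@cn1234> # equivalent to: singularity shell -B /fdb,/data $container
user@cn1234> export SINGULARITY_BINDPATH="/fdb,/data"
user@cn1234> singularity shell $container
```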
For example, using `-B` on the command line: 135 | 136 | ```console 137 | user@cn1234> singularity shell -B /fdb/STAR_indices $container 138 | Singularity> ls -l /fdb 139 | total 0 140 | drwxr-xr-x 2 user group 4096 Apr 12 11:35 STAR_indices 141 | Singularity> exit 142 | 143 | user@cn1234> ## use outside_path:inside_path to make paths visible under a different path 144 | to containerized processes 145 | user@cn1234> singularity shell -B /fdb/STAR_indices:/star,/fdb/salmon:/salmon $container 146 | Singularity> ls -l /fdb 147 | ls: cannot access '/fdb': No such file or directory 148 | Singularity> ls -l /star 149 | total 58 150 | ... 151 | drwxrwxr-x 2 user group 4096 Sep 26 2018 2.5.4a 152 | ... 153 | ``` 154 | 155 | We provide a convenient script you can source to set up a bindpath 156 | variable that will make all our file systems accessible at the same paths 157 | inside the container 158 | 159 | ```console 160 | user@cn1234> source /usr/local/current/singularity/app_conf/sing_binds 161 | user@cn1234> echo $SINGULARITY_BINDPATH 162 | /gs10,/gs11,/gs12,/vf,/spin1,/data,/fdb,/gpfs,/lscratch 163 | user@cn1234> singularity shell $container 164 | Singularity> ls /fdb 165 | 00_TO_BE_DELETED T2T dbscSNV humann purge_haplotigs 166 | ... 167 | Singularity> exit 168 | ``` 169 | 170 | For the workflows in this tutorial we will use the `SINGULARITY_BINDPATH` 171 | defined by our helper script so that paths inside and outside the container 172 | will be identical. If you only use relative paths this is not in fact 173 | necessary. 174 | 175 | -------------------------------------------------------------------------------- /exercise01/README.md: -------------------------------------------------------------------------------- 1 | ## Exercise 01 - basics of Snakemake rules 2 | 3 | **Goal:** turn a shell script (`aln.sh` - alignment with hisat2) into a single 4 | rule Snakemake file (`Snakefile`). One possible solution is given in 5 | **Snakefile.finished**. 6 | 7 | 8 | :information_source: Snakefiles are basically Python with some extra constructs for defining 9 | rule-based workflows. That means that **Snakefiles are whitespace sensitive** - 10 | indentation matters and tabs and spaces can't be mixed. Code in these examples 11 | uses spaces - no tabs. Please adjust your editors accordingly. 12 | 13 | The script `aln.sh` takes a fastq file as an argument and aligns it to the 14 | S. cerevisiae genome. We use the tools installed in the container that 15 | was downloaded during the setup. If you haven't done so in the previous 16 | exercise, please set up all the required bind mounts with 17 | 18 | ```console 19 | user@cn1234> source /usr/local/current/singularity/app_conf/sing_binds 20 | ``` 21 | 22 | Then run the script for one sample 23 | 24 | ```console 25 | user@cn1234> cat aln.sh 26 | #! /bin/bash 27 | 28 | fq=$1 29 | bam=02aln/$(basename $fq .fastq.gz).bam 30 | 31 | idx=00ref/hisat_index/R64-1-1 32 | mkdir -p 02aln 33 | hisat2 -k 4 -x $idx -U $fq --threads 4 \ 34 | | samtools sort -T $bam -O BAM \ 35 | > $bam 36 | samtools index $bam 37 | 38 | user@cn1234> export CONTAINER="$(cd ..
&& pwd)/00container/rnaseq.sif" 39 | user@cn1234> singularity exec "$CONTAINER" ./aln.sh 00fastq/ERR458495.fastq.gz 40 | 41 | 1066501 reads; of these: 42 | 1066501 (100.00%) were unpaired; of these: 43 | 37947 (3.56%) aligned 0 times 44 | 912472 (85.56%) aligned exactly 1 time 45 | 116082 (10.88%) aligned >1 times 46 | 96.44% overall alignment rate 47 | 48 | user@cn1234> rm -rf 02aln 49 | ``` 50 | 51 | Let's turn this into a simple Snakefile by formalizing the input and 52 | output file types. Note that by convention snakemake looks for a file named 53 | `Snakefile` if it is not given a different filename with the `-s` option. 54 | 55 | A simple snakefile with a general rule for how to create bam files in the 56 | `02aln` directory from compressed fastq files in the `00fastq` directory might 57 | look similar to the following: 58 | 59 | ```python 60 | # this file is 'Snakefile' 61 | rule hisat2: 62 | input: fq = "00fastq/{sample}.fastq.gz", 63 | idx = "00ref/hisat_index/R64-1-1" 64 | output: bam = "02aln/{sample}.bam", 65 | bai = "02aln/{sample}.bam.bai" 66 | shell: 67 | """ 68 | hisat2 -k 4 -x {input.idx} -U {input.fq} --threads 4 \ 69 | | samtools sort -T {output.bam} -O BAM \ 70 | > {output.bam} 71 | samtools index {output.bam} 72 | """ 73 | ``` 74 | 75 | When snakemake is asked to create a specific file it searches for a rule with 76 | an output pattern that matches the file name and then executes the rule, which 77 | in this example is shell code. The input and output file names for the specific 78 | file are available as string substitutions within the shell code section. Note that 79 | specifying `--cores` is required. 80 | 81 | ```console 82 | user@cn1234> snakemake --cores=4 -np 02aln/ERR458502.bam 83 | Building DAG of jobs... 84 | Job stats: 85 | job count 86 | ------ ------- 87 | hisat2 1 88 | total 1 89 | 90 | 91 | [Thu Sep 26 17:13:36 2024] 92 | rule hisat2: 93 | input: 00fastq/ERR458502.fastq.gz, 00ref/hisat_index/R64-1-1 94 | output: 02aln/ERR458502.bam, 02aln/ERR458502.bam.bai 95 | jobid: 0 96 | reason: Missing output files: 02aln/ERR458502.bam 97 | wildcards: sample=ERR458502 98 | resources: tmpdir=/tmp 99 | 100 | 101 | hisat2 -k 4 -x 00ref/hisat_index/R64-1-1 -U 00fastq/ERR458502.fastq.gz --threads 4 | samtools sort -T tmp/ERR458502 -O BAM > 02aln/ERR458502.bam 102 | samtools index 02aln/ERR458502.bam 103 | 104 | Job stats: 105 | job count 106 | ------ ------- 107 | hisat2 1 108 | total 1 109 | 110 | Reasons: 111 | (check individual jobs above for details) 112 | missing output files: 113 | hisat2 114 | 115 | This was a dry-run (flag -n). The order of jobs does not reflect the order of execution. 116 | ``` 117 | 118 | This can't actually be run yet because hisat and samtools are not 119 | available on our path. However we can teach snakemake to run a particular 120 | rule inside a singularity container by making one small change: 121 | 122 | ```python 123 | rule hisat2: 124 | input: fq = "00fastq/{sample}.fastq.gz", 125 | idx = "00ref/hisat_index/R64-1-1" 126 | output: bam = "02aln/{sample}.bam", 127 | bai = "02aln/{sample}.bam.bai" 128 | singularity: 129 | "library://wresch/classes/rnaseq:0.8" 130 | shell: 131 | """ 132 | hisat2 -k 4 -x {input.idx} -U {input.fq} --threads 4 \ 133 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 134 | > {output.bam} 135 | samtools index {output.bam} 136 | """ 137 | ``` 138 | 139 | By default, snakemake ignores the `singularity` directive. 
The 140 | `--use-singularity` option is required to enable use of singularity and the 141 | `singularity` executable has to be available on the path. Additional 142 | singularity options can be passed with `--singularity-args` and the 143 | location of pulled containers can be specified with `--singularity-prefix` which 144 | defaults to `.snakemake/singularity`. We already have the container pulled 145 | down in `../00container/` so we'll use that instead of creating another copy. 146 | We are using the environment variable `$SINGULARITY_BINDPATH` so 147 | for now we don't need `--singularity-args` 148 | 149 | 150 | ```console 151 | user@cn1234> snakemake --cores 8 --use-singularity 02aln/ERR458502.bam \ 152 | --singularity-prefix=../00container 153 | 154 | Building DAG of jobs... 155 | Using shell: /usr/bin/bash 156 | Provided cores: 8 157 | Rules claiming more threads will be scaled down. 158 | Job stats: 159 | job count 160 | ------ ------- 161 | hisat2 1 162 | total 1 163 | 164 | Select jobs to execute... 165 | 166 | [Thu Sep 26 17:19:17 2024] 167 | rule hisat2: 168 | input: 00fastq/ERR458502.fastq.gz, 00ref/hisat_index/R64-1-1 169 | output: 02aln/ERR458502.bam, 02aln/ERR458502.bam.bai 170 | jobid: 0 171 | reason: Missing output files: 02aln/ERR458502.bam 172 | wildcards: sample=ERR458502 173 | resources: tmpdir=/tmp 174 | 175 | Activating singularity image /spin1/users/wresch/code/class_materials/snakemake-class/00container/2354d2ff28bcf0b42c57fae398b4c9b5.simg 176 | 1853031 reads; of these: 177 | 1853031 (100.00%) were unpaired; of these: 178 | 49450 (2.67%) aligned 0 times 179 | 1655519 (89.34%) aligned exactly 1 time 180 | 148062 (7.99%) aligned >1 times 181 | 97.33% overall alignment rate 182 | [Thu Sep 26 17:19:29 2024] 183 | Finished job 0. 184 | 1 of 1 steps (100%) done 185 | Complete log: .snakemake/log/2024-09-26T171916.905065.snakemake.log 186 | 187 | 188 | ``` 189 | 190 | If we ask snakemake to produce the same output file again, it recognizes 191 | that nothing has to be done because the output files are newer than 192 | the input files: 193 | 194 | ```console 195 | user@cn1234> snakemake --cores 8 --use-singularity 02aln/ERR458502.bam \ 196 | --singularity-prefix=../00container 197 | Building DAG of jobs... 198 | Nothing to be done. 199 | ``` 200 | 201 | If the modification time on one of the input files is changed to a more 202 | recent time than the output files with `touch`, the output files are 203 | regenerated. `snakemake --summary` can be used to get a summary of 204 | all outputs and their status. 205 | 206 | ```console 207 | user@cn1234> touch 00fastq/ERR458502.fastq.gz 208 | user@cn1234> snakemake --summary 02aln/ERR458502.bam 209 | Building DAG of jobs... 210 | output_file date rule version log-file(s) status plan 211 | 02aln/ERR458502.bam Thu Sep 26 17:19:29 2024 hisat2 - updated input files update pending 212 | 02aln/ERR458502.bam.bai Thu Sep 26 17:19:29 2024 hisat2 - updated input files update pending 213 | 214 | 215 | user@cn1234> snakemake --cores 8 --use-singularity 02aln/ERR458502.bam \ 216 | --singularity-prefix=../00container 217 | Building DAG of jobs... 218 | Using shell: /bin/bash 219 | Provided cores: 1 220 | Rules claiming more threads will be scaled down. 221 | Job counts: 222 | count jobs 223 | 1 hisat2 224 | 1 225 | ... 226 | ``` 227 | 228 | Since snakemake 7.8 reruns are also triggered if parameters, code, input file 229 | set, or software stack changed. 
For example if we make a small change in the 230 | hisat2 rule by adding a new empty line we can see that 231 | 232 | ```console 233 | user@cn1234> snakemake --summary 02aln/ERR458502.bam 234 | Building DAG of jobs... 235 | output_file date rule version log-file(s) status plan 236 | 02aln/ERR458502.bam Thu Sep 26 17:21:58 2024 hisat2 - rule implementation changed update pending 237 | 02aln/ERR458502.bam.bai Thu Sep 26 17:21:58 2024 hisat2 - rule implementation changed update pending 238 | ``` 239 | 240 | 241 | Now let's add a rule for cleaning up all generated files at the end of the 242 | snakefile. A rule to clean up generated files should in general not be the 243 | first rule in a Snakefile since the first rule is the default rule snakemake 244 | runs when no rule or concrete file is provided as an argument. 245 | 246 | ```python 247 | rule clean: 248 | shell: 249 | """ 250 | rm -rf 02aln 251 | """ 252 | ``` 253 | 254 | Snakemake can be asked to run a rule as long as it does not have any wildcard 255 | inputs: 256 | 257 | ```console 258 | user@cn1234> snakemake -pn clean 259 | Building DAG of jobs... 260 | 261 | rule clean: 262 | jobid: 0 263 | 264 | 265 | rm -rf 02aln 266 | ``` 267 | 268 | This does not work with a rule that has wildcard inputs b/c that is an abstract 269 | rule and snakemake would have no way to know what the actual input and output 270 | files should be: 271 | 272 | ```console 273 | user@cn1234> snakemake -pn hisat2 274 | WorkflowError: 275 | Target rules may not contain wildcards. Please specify concrete files or a rule without wildcards. 276 | ``` 277 | 278 | So, we have to call snakemake with multiple targets to generate multiple 279 | alignments: 280 | 281 | ```console 282 | user@cn1234> snakemake --cores 8 --use-singularity \ 283 | --singularity-prefix=../00container \ 284 | 02aln/ERR458495.bam 02aln/ERR458502.bam 285 | ``` 286 | 287 | Or better - create a default rule (i.e. first rule in the file) that lists 288 | each of the desired output files as an input but does not do any actual work. 289 | Such a rule is conventionally called `all`: 290 | 291 | ```console 292 | rule all: 293 | input: "02aln/ERR458495.bam", 294 | "02aln/ERR458502.bam" 295 | ``` 296 | 297 | Now, if we do a `snakemake all`, or just `snakemake`, snakemake will run the 298 | all rule which requires two inputs but has no action. It then searches for 299 | rules to generate those input files and, in this case, finds that hisat2 can generate 300 | both of the input files and executes the rule twice with the different input files: 301 | 302 | ```console 303 | user@cn1234> snakemake --use-singularity --cores=8 --singularity-prefix=../00container all 304 | Building DAG of jobs... 305 | Using shell: /bin/bash 306 | Provided cores: 1 307 | Rules claiming more threads will be scaled down. 308 | Job counts: 309 | count jobs 310 | 1 all 311 | 2 hisat2 312 | 3 313 | ... 
314 | ``` 315 | 316 | 317 | -------------------------------------------------------------------------------- /exercise01/Snakefile.finished: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | rule all: 4 | input: "02aln/ERR458495.bam", 5 | "02aln/ERR458502.bam" 6 | 7 | rule clean: 8 | shell: 9 | """ 10 | rm -rf 02aln 11 | """ 12 | 13 | rule hisat2: 14 | input: fq = "00fastq/{sample}.fastq.gz", 15 | idx = "00ref/hisat_index/R64-1-1" 16 | output: bam = "02aln/{sample}.bam", 17 | bai = "02aln/{sample}.bam.bai" 18 | singularity: 19 | "library://wresch/classes/rnaseq:0.8" 20 | shell: 21 | """ 22 | hisat2 -k 4 -x {input.idx} -U {input.fq} --threads 4 \ 23 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 24 | > {output.bam} 25 | samtools index {output.bam} 26 | """ 27 | -------------------------------------------------------------------------------- /exercise01/aln.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | fq=$1 4 | bam=02aln/$(basename $fq .fastq.gz).bam 5 | 6 | idx=00ref/hisat_index/R64-1-1 7 | mkdir -p 02aln 8 | hisat2 -k 4 -x $idx -U $fq --threads 4 \ 9 | | samtools sort -T $bam -O BAM \ 10 | > $bam 11 | samtools index $bam 12 | -------------------------------------------------------------------------------- /exercise02/README.md: -------------------------------------------------------------------------------- 1 | ## Exercise 02 - Parallelizing pipelines (1) 2 | 3 | **Goal:** Formalize the number of CPUs each rule requires for each rule as well as 4 | parameters for tools used. Run workflow in parallel. Use the file `Snakefile` as a starting 5 | point. `Snakefile.finished` is a possible solution. 6 | 7 | ### Threads 8 | 9 | In order to run individual tasks in parallel, snakemake needs to know how many 10 | CPUs each rule would like to use and how many CPUs are available in total. In 11 | our example, we would like to run hisat2 with 4 threads. If we allow snakemake 12 | to use 8 CPUs, it could run two alignments in parallel. If fewer than 4 CPUs 13 | were available, the number of threads for the rule would be scaled down 14 | accordingly. The number of CPUs each rule would like to use is specified 15 | in the `threads` section and is available in the shell block as `{threads}`. 16 | For example, here is the modified hisat2 rule: 17 | 18 | 19 | ```python 20 | rule hisat2: 21 | input: fq = "00fastq/{sample}.fastq.gz", 22 | idx = "00ref/hisat_index/R64-1-1" 23 | output: bam = "02aln/{sample}.bam", 24 | bai = "02aln/{sample}.bam.bai" 25 | threads: 4 26 | singularity: 27 | "library://wresch/classes/rnaseq:0.8" 28 | shell: 29 | """ 30 | mkdir -p 02aln 31 | hisat2 -k 4 -x {input.idx} -U {input.fq} --threads {threads} \ 32 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 33 | > {output.bam} 34 | samtools index {output.bam} 35 | """ 36 | ``` 37 | 38 | Now, the workflow can run in parallel: 39 | 40 | ```console 41 | user@cn1234> snakemake --cores 8 --use-singularity --singularity-prefix=../00container 42 | ``` 43 | 44 | ### Resources 45 | 46 | In the resources section arbitrary resources (other than threads) required by 47 | the rule can be specified. This can be used to specify, for example, the amount 48 | of memory used by the rule (and in fact the resource name `mem_mb` is special 49 | b/c it is used by the kubernetes runner). But it can also be used to limit the 50 | number of concurrent I/O intensive jobs or include the walltime limit. 
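As an illustration, a hypothetical rule could declare a made-up resource such as `download_slots` to throttle concurrent downloads; this rule is not part of the exercise, just a sketch:

```python
# if snakemake is invoked with --resources download_slots=2, at most two of
# these jobs will run at the same time, regardless of available CPUs
rule fetch_fastq:
    output: "00fastq/{sample}.fastq.gz"
    resources: download_slots = 1
    shell: "wget -O {output} ftp://example.org/fastq/{wildcards.sample}.fastq.gz"
```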
The total 51 | amount of resources available can be set on the command line. All resources must be 52 | integers. 53 | 54 | ```python 55 | rule hisat2: 56 | input: fq = "00fastq/{sample}.fastq.gz", 57 | idx = "00ref/hisat_index/R64-1-1" 58 | output: bam = "02aln/{sample}.bam", 59 | bai = "02aln/{sample}.bam.bai" 60 | threads: 4 61 | resources: mem_mb = 6144 62 | singularity: 63 | "library://wresch/classes/rnaseq:0.8" 64 | shell: 65 | """ 66 | mkdir -p 02aln 67 | hisat2 -k 4 -x {input.idx} -U {input.fq} --threads {threads} \ 68 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 69 | > {output.bam} 70 | samtools index {output.bam} 71 | """ 72 | ``` 73 | 74 | Note that there are different ways in which resources such as memory can 75 | be specified. For example, cluster-specific resources could also be specified 76 | in a separate cluster config file, though that approach is deprecated. 77 | 78 | ```console 79 | user@cn1234> snakemake --cores 8 --resources mem_mb=12288 --use-singularity \ 80 | --singularity-prefix=../00container 81 | ``` 82 | 83 | ### Rule parameters 84 | 85 | The `params` section of a rule is a good place for storing arguments 86 | for the commands used in the shell section, tool versions, and other non-input 87 | parameters. For example: 88 | 89 | ```python 90 | rule hisat2: 91 | input: fq = "00fastq/{sample}.fastq.gz", 92 | idx = "00ref/hisat_index/R64-1-1" 93 | output: bam = "02aln/{sample}.bam", 94 | bai = "02aln/{sample}.bam.bai" 95 | threads: 4 96 | params: hisat = "-k 4" 97 | resources: mem_mb = 6144 98 | singularity: 99 | "library://wresch/classes/rnaseq:0.8" 100 | shell: 101 | """ 102 | mkdir -p 02aln 103 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 104 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 105 | > {output.bam} 106 | samtools index {output.bam} 107 | """ 108 | ``` 109 | 110 | ### Profiles 111 | 112 | It can get pretty cumbersome to re-type all the parameters for 113 | snakemake. Snakemake profiles can set default values for command line 114 | parameters. A profile is a directory containing at a minimum a config.yaml 115 | with keys corresponding to command line flags. A profile specified by 116 | name on the command line is searched for in `$HOME/.config/snakemake`. Alternatively, 117 | profiles can be specified by path. There is a simple profile available in 118 | this directory, which specifies all the required singularity settings so 119 | we don't have to include them on the command line any more. Note that 120 | profiles can be used to do much more sophisticated configuration and for 121 | later exercises we will use the biowulf-specific profile downloaded 122 | by the setup script.
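For instance, a profile could also pin other defaults; a hypothetical sketch in which every key mirrors the corresponding command line flag (only the two singularity keys are actually needed for this exercise):

```yaml
use-singularity: true
singularity-prefix: ../00container
cores: 8
printshellcmds: true
keep-going: true
```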
123 | 124 | ```console 125 | user@cn1234> cat myprofile/config.yaml 126 | use-singularity: true 127 | singularity-prefix: ../00container 128 | 129 | user@cn1234> snakemake --cores 8 --profile ./myprofile 130 | ``` 131 | -------------------------------------------------------------------------------- /exercise02/Snakefile: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | rule all: 4 | input: "02aln/ERR458495.bam", 5 | "02aln/ERR458502.bam" 6 | 7 | rule clean: 8 | shell: 9 | """ 10 | rm -rf 02aln 11 | """ 12 | 13 | rule hisat2: 14 | input: fq = "00fastq/{sample}.fastq.gz", 15 | idx = "00ref/hisat_index/R64-1-1" 16 | output: bam = "02aln/{sample}.bam", 17 | bai = "02aln/{sample}.bam.bai" 18 | singularity: 19 | "library://wresch/classes/rnaseq:0.8" 20 | shell: 21 | """ 22 | hisat2 -k 4 -x {input.idx} -U {input.fq} --threads 4 \ 23 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 24 | > {output.bam} 25 | samtools index {output.bam} 26 | """ 27 | -------------------------------------------------------------------------------- /exercise02/Snakefile.finished: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | rule all: 4 | input: "02aln/ERR458495.bam", 5 | "02aln/ERR458502.bam" 6 | 7 | rule clean: 8 | shell: 9 | """ 10 | rm -rf 02aln 11 | """ 12 | 13 | rule hisat2: 14 | input: fq = "00fastq/{sample}.fastq.gz", 15 | idx = "00ref/hisat_index/R64-1-1" 16 | output: bam = "02aln/{sample}.bam", 17 | bai = "02aln/{sample}.bam.bai" 18 | threads: 4 19 | resources: mem_mb = 6144 20 | params: hisat = "-k 4" 21 | singularity: 22 | "library://wresch/classes/rnaseq:0.8" 23 | shell: 24 | """ 25 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 26 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 27 | > {output.bam} 28 | samtools index {output.bam} 29 | """ 30 | -------------------------------------------------------------------------------- /exercise02/myprofile/config.yaml: -------------------------------------------------------------------------------- 1 | use-singularity: true 2 | singularity-prefix: ../00container 3 | -------------------------------------------------------------------------------- /exercise03/README.md: -------------------------------------------------------------------------------- 1 | ## Add a new rule to count RNA-Seq reads per gene 2 | 3 | **Goal:** Add a new rule that summarizes the count of RNA-Seq reads per gene 4 | for all 6 samples. The starting point, as before, is `Snakefile` and an example 5 | of a finished workflow is provided in `Snakefile.finished` 6 | 7 | 8 | ### Add the featureCounts rule 9 | 10 | `featureCounts` from the subread package is used to count the number of reads 11 | from each alignment mapping to each gene. It's **inputs** are a bam file and 12 | the annotation file `00ref/R64-1-1.genes.gtf`. It's **output** is a per sample 13 | count file in the `04count` directory. It's expected to use 4GB of memory and 14 | run 2 threads. 
15 | 16 | Here is a possible implementation of this rule: 17 | 18 | ```python 19 | rule count: 20 | input: bam = "02aln/{sample}.bam", 21 | annot = "00ref/R64-1-1.genes.gtf" 22 | output: "04count/{sample}" 23 | threads: 2 24 | resources: mem_mb = 4096 25 | singularity: 26 | "library://wresch/classes/rnaseq:0.8" 27 | shell: 28 | """ 29 | featureCounts -a {input.annot} -o {output} \ 30 | -T {threads} --minOverlap 10 {input.bam} 31 | """ 32 | ``` 33 | 34 | Now, since the count files are the final output of the pipeline as it stands, the 35 | `all` rule has to be changed to request the generation of the count files. 36 | Alignment files don't have to be specified any more b/c snakemake will 37 | automatically determine that alignments are required to generate count files. 38 | The new `all` rule would therefore be 39 | 40 | ```python 41 | rule all: 42 | input: "04count/ERR458495", 43 | "04count/ERR458502" 44 | ``` 45 | 46 | ### Extend the pipeline to include all 6 samples 47 | 48 | Snakefiles are essentially python, and arbitrary python code can be used in 49 | many places. Let's take advantage of this and extend the workflow to include 50 | all fastq files present in the `00fastq` directory: 51 | 52 | ```python 53 | import os.path 54 | from glob import glob 55 | 56 | # use the glob function to get all fastq files from the 00fastq directory 57 | # extract the sample name from each path (assumes that there is one fastq per sample) 58 | samples = [os.path.basename(a).replace(".fastq.gz", "") for a in glob("00fastq/*.fastq.gz")] 59 | 60 | # the all rule creates a list of all count files that should be generated based on 61 | # the list of samples 62 | rule all: 63 | input: expand("04count/{s}", s=samples) 64 | ``` 65 | 66 | Now the workflow can be run on all samples in parallel, again using a profile that 67 | sets the correct options for using the singularity container. 68 | 69 | ```console 70 | user@cn1234> snakemake --profile ./myprofile --cores 12 --resources mem_mb=12288 71 | Building DAG of jobs... 72 | Using shell: /bin/bash 73 | Provided cores: 12 74 | Rules claiming more threads will be scaled down. 75 | Provided resources: mem_mb=12288 76 | Job counts: 77 | count jobs 78 | 1 all 79 | 6 count 80 | 6 hisat2 81 | 13 82 | ... 83 | ``` 84 | -------------------------------------------------------------------------------- /exercise03/Snakefile: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 3 - incorporate the featureCounts chunk as a rule 5 | ### 6 | 7 | rule all: 8 | input: "02aln/ERR458495.bam", 9 | "02aln/ERR458502.bam" 10 | 11 | rule clean: 12 | shell: 13 | """ 14 | rm -rf 02aln 15 | """ 16 | 17 | ### 18 | ### FeatureCounts chunk - make this into a rule and incorporate it into the pipeline 19 | ### 20 | 21 | # Input files are a bam file and the annotation file '00ref/R64-1-1.genes.gtf'. 22 | # Output for each sample should go to directory 04count 23 | # featureCounts should be run with 2 threads and is expected to use less than 4GB 24 | # of memory.
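# Hint: the rule header will look much like the hisat2 rule below, roughly
#   rule count:
#       input: bam = "02aln/{sample}.bam",
#              annot = "00ref/R64-1-1.genes.gtf"
#       output: "04count/{sample}"
#       threads: 2
#       resources: mem_mb = 4096
# (see Snakefile.finished for one complete solution)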
25 | 26 | shell: 27 | """ 28 | featureCounts -a {input.annot} -o {output} \ 29 | -T {threads} --minOverlap 10 {input.bam} 30 | """ 31 | 32 | rule hisat2: 33 | input: fq = "00fastq/{sample}.fastq.gz", 34 | idx = "00ref/hisat_index/R64-1-1" 35 | output: bam = "02aln/{sample}.bam", 36 | bai = "02aln/{sample}.bam.bai" 37 | threads: 4 38 | resources: mem_mb = 6144 39 | params: hisat = "-k 4" 40 | singularity: 41 | "library://wresch/classes/rnaseq:0.8" 42 | shell: 43 | """ 44 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 45 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 46 | > {output.bam} 47 | samtools index {output.bam} 48 | """ 49 | -------------------------------------------------------------------------------- /exercise03/Snakefile.finished: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 3 - incorporate the featureCounts chunk as a rule 5 | ### 6 | import os.path 7 | from glob import glob 8 | 9 | samples = [os.path.basename(a).replace(".fastq.gz", "") for a in glob("00fastq/*.fastq.gz")] 10 | 11 | rule all: 12 | input: expand("04count/{s}", s=samples) 13 | 14 | rule clean: 15 | shell: 16 | """ 17 | rm -rf 02aln 04count 18 | """ 19 | 20 | ### 21 | ### FeatureCounts chunk - make this into a rule and incorporate it into the pipeline 22 | ### 23 | 24 | rule count: 25 | input: bam = "02aln/{sample}.bam", 26 | annot = "00ref/R64-1-1.genes.gtf" 27 | output: "04count/{sample}" 28 | threads: 2 29 | resources: mem_mb = 4096 30 | singularity: 31 | "library://wresch/classes/rnaseq:0.8" 32 | shell: 33 | """ 34 | featureCounts -a {input.annot} -o {output} \ 35 | -T {threads} --minOverlap 10 {input.bam} 36 | """ 37 | 38 | rule hisat2: 39 | input: fq = "00fastq/{sample}.fastq.gz", 40 | idx = "00ref/hisat_index/R64-1-1" 41 | output: bam = "02aln/{sample}.bam", 42 | bai = "02aln/{sample}.bam.bai" 43 | threads: 4 44 | resources: mem_mb = 6144 45 | params: hisat = "-k 4" 46 | singularity: 47 | "library://wresch/classes/rnaseq:0.8" 48 | shell: 49 | """ 50 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 51 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 52 | > {output.bam} 53 | samtools index {output.bam} 54 | """ 55 | 56 | -------------------------------------------------------------------------------- /exercise03/myprofile/config.yaml: -------------------------------------------------------------------------------- 1 | use-singularity: true 2 | singularity-prefix: ../00container 3 | -------------------------------------------------------------------------------- /exercise04/README.md: -------------------------------------------------------------------------------- 1 | ## Configuration of workflows 2 | 3 | **Goal:** Extract sample information and the path to the hisat index from 4 | a configuration file in yaml. As in the other exercises, the starting 5 | file is `Snakefile` and the final product is `Snakefile.finished`. 6 | 7 | Snakemake workflows can make use of configuration files in [yaml](https://yaml.org) 8 | or [json](https://www.json.org/) format. The configuration file can be specified 9 | at the top level of the Snakefile. For example: 10 | 11 | ```python 12 | configfile: "config.yml" 13 | 14 | rule all: 15 | input: expand("04count/{s}", s=samples) 16 | ``` 17 | 18 | or on the command line. 19 | 20 | The config file is parsed and made available as a global dictionary named 21 | `config`.
So, given the following config file 22 | 23 | ```yaml 24 | samples: 25 | ERR458502: 26 | gt: SNF2 27 | rep: 1 28 | ERR458509: 29 | gt: SNF2 30 | rep: 2 31 | ERR458516: 32 | gt: SNF2 33 | rep: 3 34 | ERR458495: 35 | gt: WT 36 | rep: 1 37 | ERR458880: 38 | gt: WT 39 | rep: 2 40 | ERR458887: 41 | gt: WT 42 | rep: 3 43 | reference: 44 | ensembl_ver: 88 45 | genome_build: R64-1-1 46 | hisat_index: 00ref/hisat_index/R64-1-1 47 | genome_file: 00ref/R64-1-1.fa 48 | cdna_file: 00ref/R64-1-1.cdna_nc.fa 49 | ``` 50 | 51 | the `config` dict would look like this: 52 | 53 | ```python 54 | {'reference': {'cdna_file': '00ref/R64-1-1.cdna_nc.fa', 55 | 'ensembl_ver': 88, 56 | 'genome_build': 'R64-1-1', 57 | 'genome_file': '00ref/R64-1-1.fa', 58 | 'hisat_index': '00ref/hisat_index/R64-1-1'}, 59 | 'samples': {'ERR458495': {'gt': 'WT', 'rep': 1}, 60 | 'ERR458502': {'gt': 'SNF2', 'rep': 1}, 61 | 'ERR458509': {'gt': 'SNF2', 'rep': 2}, 62 | 'ERR458516': {'gt': 'SNF2', 'rep': 3}, 63 | 'ERR458880': {'gt': 'WT', 'rep': 2}, 64 | 'ERR458887': {'gt': 'WT', 'rep': 3}}} 65 | ``` 66 | 67 | The sample list can be extracted like so: 68 | ```python 69 | configfile: "config.yml" 70 | samples = config["samples"].keys() 71 | 72 | rule all: 73 | input: expand("04count/{s}", s=samples) 74 | ``` 75 | 76 | and the hisat index in the hisat rule like this: 77 | ```python 78 | rule hisat2: 79 | input: fq = "00fastq/{sample}.fastq.gz", 80 | idx = config["reference"]["hisat_index"] 81 | ... 82 | ``` 83 | -------------------------------------------------------------------------------- /exercise04/Snakefile: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 4 - using a configuration file for extracting hisat index path and sample ids 5 | ### 6 | 7 | # config file is loaded as a global dict named 'config' 8 | # extract a list of samples names from the config dict and assign it to 'samples' 9 | configfile: "config.yml" 10 | 11 | rule all: 12 | input: expand("04count/{s}", s=samples) 13 | 14 | rule clean: 15 | shell: 16 | """ 17 | rm -rf 02aln 04count 18 | """ 19 | 20 | rule count: 21 | input: bam = "02aln/{sample}.bam", 22 | annot = "00ref/R64-1-1.genes.gtf" 23 | output: "04count/{sample}" 24 | threads: 2 25 | resources: mem_mb = 4096 26 | singularity: 27 | "library://wresch/classes/rnaseq:0.8" 28 | shell: 29 | """ 30 | featureCounts -a {input.annot} -o {output} \ 31 | -T {threads} --minOverlap 10 {input.bam} 32 | """ 33 | 34 | # extract the location of the hisat index from the config dict 35 | rule hisat2: 36 | input: fq = "00fastq/{sample}.fastq.gz", 37 | idx = "00ref/hisat_index/R64-1-1" 38 | output: bam = "02aln/{sample}.bam", 39 | bai = "02aln/{sample}.bam.bai" 40 | threads: 4 41 | resources: mem_mb = 6144 42 | params: hisat = "-k 4" 43 | singularity: 44 | "library://wresch/classes/rnaseq:0.8" 45 | shell: 46 | """ 47 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 48 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 49 | > {output.bam} 50 | samtools index {output.bam} 51 | """ 52 | 53 | -------------------------------------------------------------------------------- /exercise04/Snakefile.finished: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 4 - using a configuration file for extracting hisat index path and sample ids 5 | ### 6 | 7 | # config file is loaded as a global dict named 'config' 8 | configfile: "config.yml" 9 | samples = 
config["samples"].keys() 10 | 11 | rule all: 12 | input: expand("04count/{s}", s=samples) 13 | 14 | rule clean: 15 | shell: 16 | """ 17 | rm -rf 02aln 04count 18 | """ 19 | 20 | rule count: 21 | input: bam = "02aln/{sample}.bam", 22 | annot = "00ref/R64-1-1.genes.gtf" 23 | output: "04count/{sample}" 24 | threads: 2 25 | resources: mem_mb = 4096 26 | singularity: 27 | "library://wresch/classes/rnaseq:0.8" 28 | shell: 29 | """ 30 | featureCounts -a {input.annot} -o {output} \ 31 | -T {threads} --minOverlap 10 {input.bam} 32 | """ 33 | 34 | ## hisat index should be obtained from the global configuration file 35 | rule hisat2: 36 | input: fq = "00fastq/{sample}.fastq.gz", 37 | idx = config["reference"]["hisat_index"] 38 | output: bam = "02aln/{sample}.bam", 39 | bai = "02aln/{sample}.bam.bai" 40 | threads: 4 41 | resources: mem_mb = 6144 42 | params: hisat = "-k 4" 43 | singularity: 44 | "library://wresch/classes/rnaseq:0.8" 45 | shell: 46 | """ 47 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 48 | | samtools sort -T tmp/{wildcards.sample} -O BAM \ 49 | > {output.bam} 50 | samtools index {output.bam} 51 | """ 52 | 53 | -------------------------------------------------------------------------------- /exercise04/myprofile/config.yaml: -------------------------------------------------------------------------------- 1 | use-singularity: true 2 | singularity-prefix: ../00container 3 | -------------------------------------------------------------------------------- /exercise05/README.md: -------------------------------------------------------------------------------- 1 | ## Parallelizing workflows across the cluster 2 | 3 | **Goal:** Run the workflow in parallel by submitting some jobs to the cluster 4 | as batch jobs. As before, use `Snakefile` as a starting point. 5 | 6 | The changes required to the Snakefile in this case are minimal. We explicitly 7 | declare which rules are to be executed on the same host as the main snakemake 8 | process even if submitting other jobs as cluster batch jobs. At the top level 9 | of the Snakefile: 10 | 11 | ```python 12 | localrules: all, clean 13 | ``` 14 | 15 | Snakemake can be taught to submit batch jobs by providing a template string 16 | that has access to many of the properties of jobs (threads, resources, params) 17 | specified in the Snakefile, as well as parameters defined in a cluster config 18 | file. This can be done with plain snakemake. However, there is a Biowulf-specific 19 | [profile](https://github.com/NIH-HPC/snakemake_profile) that takes care of of job submission 20 | and job status checking without taxing the slurm scheduler. Rather than refining 21 | the profile from the previous exercise, we will switch to the Biowulf profile. 22 | Instead of specifying the profile manually every time we run snakemake 23 | we'll set an environment variable 24 | 25 | ```console 26 | user@cn1234> export SNAKEMAKE_PROFILE="$(cd .. 
&& pwd)/bwprofile" 27 | user@cn1234> cat $SNAKEMAKE_PROFILE/config.yaml 28 | restart-times: 0 29 | jobscript: slurm_jobscript.sh 30 | cluster: bw_submit.py 31 | cluster-status: bw_status.py 32 | cluster-cancel: scancel 33 | max-jobs-per-second: 1 34 | max-status-checks-per-second: 1 35 | local-cores: 4 36 | latency-wait: 240 37 | jobs: 50 38 | keep-going: True 39 | rerun-incomplete: True 40 | # note that the space before the -- is necessary - otherwise cluster 41 | # execution fails as --cleanenv gets interpreted as a new option since 42 | # snakemake passes this on as --singularity-args "--cleanenv" - i.e. without 43 | # a `=` 44 | singularity-args: " --cleanenv" 45 | use-singularity: true 46 | singularity-prefix: /data/user/snakemake-class/00container 47 | ``` 48 | 49 | The last line will be specific to your class directory. Use `snakemake --help` 50 | to understand the different options specified in the profile. 51 | In brief, here are some of the most relevant options: 52 | 53 | - `-k`, `--keep-going`: By default, snakemake will quit if a job fails (after waiting 54 | for running jobs to finish). `-k` will make snakemake continue with independent jobs. 55 | - `-w`, `--latency-wait`, `--output-wait`: The amount of time snakemake will wait for 56 | output files to appear after a job has finished. This defaults to a low 5s. On the 57 | shared file systems latency may be higher. Raising it to 240s is a bit excessive, but 58 | it doesn't really hurt too much. 59 | - `--local-cores`: The number of CPUs available for local rules. 60 | - `--max-jobs-per-second`: Max number of jobs to submit per second. Please be kind 61 | to the batch scheduler. 62 | - `--cluster`: The template string used to submit each (non-local) job. 63 | - `--jobs`: The number of jobs to run concurrently. 64 | - `--cluster-config`: The cluster config file. 65 | 66 | :information_source: Please **do not run snakemake workflows on the login node**, 67 | even if submitting jobs as batch jobs. Run the main process as a 68 | batch job itself or, if the workflow runs quickly enough, from an sinteractive 69 | session. 70 | 71 | So in our example (after adding the `localrules` declaration described earlier and 72 | setting `$SNAKEMAKE_PROFILE`): 73 | 74 | ```console 75 | user@cn1234> snakemake 76 | Building DAG of jobs... 77 | Using shell: /usr/bin/bash 78 | Provided cluster nodes: 50 79 | Job stats: 80 | job count 81 | ------ ------- 82 | all 1 83 | count 6 84 | hisat2 6 85 | total 13 86 | 87 | Select jobs to execute... 88 | 89 | [Thu Sep 26 18:48:20 2024] 90 | rule hisat2: 91 | input: 00fastq/ERR458495.fastq.gz, 00ref/hisat_index/R64-1-1 92 | output: 02aln/ERR458495.bam, 02aln/ERR458495.bam.bai 93 | jobid: 8 94 | reason: Missing output files: 02aln/ERR458495.bam 95 | wildcards: sample=ERR458495 96 | threads: 4 97 | resources: mem_mb=6144, mem_mib=5860, disk_mb=1000, disk_mib=954, tmpdir=<TBD> 98 | 99 | hisat2: submission command "sbatch --cpus-per-task=4 --mem=6144 --time=120 --gres=lscratch:1 --output=logs/hisat2-%j.out --partition=quick /spin1/users/wresch/code/class_materials/snakemake-class/exercise05/.snakemake/tmp.cxnvsixj/snakejob.hisat2.8.sh 100 | Submitted job 8 with external jobid '36450266'. 101 | 102 | ...
103 | ``` 104 | 105 | 106 | -------------------------------------------------------------------------------- /exercise05/Snakefile: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 5 - run tasks except for all and clean on the cluster as batch jobs 5 | ### 6 | # 7 | # use a cluster config file. Note that we are requesting lscratch for all jobs 8 | # and are instructing singularity to mount /lscratch/$SLURM_JOB_ID as /tmp in 9 | # the container. Ensure that all and clean run locally, not as batch jobs 10 | 11 | configfile: "config.yml" 12 | samples = config["samples"].keys() 13 | 14 | rule all: 15 | input: expand("04count/{s}", s=samples) 16 | 17 | rule clean: 18 | shell: 19 | """ 20 | rm -rf 02aln 04count 21 | """ 22 | 23 | rule count: 24 | input: bam = "02aln/{sample}.bam", 25 | annot = "00ref/R64-1-1.genes.gtf" 26 | output: "04count/{sample}" 27 | threads: 2 28 | resources: mem_mb = 4096 29 | singularity: 30 | "library://wresch/classes/rnaseq:0.8" 31 | shell: 32 | """ 33 | featureCounts -a {input.annot} -o {output} \ 34 | -T {threads} --minOverlap 10 {input.bam} 35 | """ 36 | 37 | rule hisat2: 38 | input: fq = "00fastq/{sample}.fastq.gz", 39 | idx = config["reference"]["hisat_index"] 40 | output: bam = "02aln/{sample}.bam", 41 | bai = "02aln/{sample}.bam.bai" 42 | threads: 4 43 | resources: mem_mb = 6144 44 | params: hisat = "-k 4" 45 | singularity: 46 | "library://wresch/classes/rnaseq:0.8" 47 | shell: 48 | """ 49 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 50 | | samtools sort -T /tmp/{wildcards.sample} -O BAM \ 51 | > {output.bam} 52 | samtools index {output.bam} 53 | ls -lh /tmp 54 | """ 55 | 56 | -------------------------------------------------------------------------------- /exercise05/Snakefile.finished: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 5 - run tasks except for all and clean on the cluster as batch jobs 5 | ### 6 | # 7 | # use a cluster config file. Note that we are requesting lscratch for all jobs 8 | # and are instructing singularity to mount /lscratch/$SLURM_JOB_ID as /tmp in 9 | # the container. 
Ensure that all and clean run locally, not as batch jobs 10 | 11 | configfile: "config.yml" 12 | samples = config["samples"].keys() 13 | 14 | localrules: all, clean 15 | 16 | rule all: 17 | input: expand("04count/{s}", s=samples) 18 | 19 | rule clean: 20 | shell: 21 | """ 22 | rm -rf 02aln 04count 23 | """ 24 | 25 | rule count: 26 | input: bam = "02aln/{sample}.bam", 27 | annot = "00ref/R64-1-1.genes.gtf" 28 | output: "04count/{sample}" 29 | threads: 2 30 | resources: mem_mb = 4096 31 | singularity: 32 | "library://wresch/classes/rnaseq:0.8" 33 | shell: 34 | """ 35 | featureCounts -a {input.annot} -o {output} \ 36 | -T {threads} --minOverlap 10 {input.bam} 37 | """ 38 | 39 | rule hisat2: 40 | input: fq = "00fastq/{sample}.fastq.gz", 41 | idx = config["reference"]["hisat_index"] 42 | output: bam = "02aln/{sample}.bam", 43 | bai = "02aln/{sample}.bam.bai" 44 | threads: 4 45 | resources: mem_mb = 6144 46 | params: hisat = "-k 4" 47 | singularity: 48 | "library://wresch/classes/rnaseq:0.8" 49 | shell: 50 | """ 51 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 52 | | samtools sort -T /tmp/{wildcards.sample} -O BAM \ 53 | > {output.bam} 54 | samtools index {output.bam} 55 | ls -lh /tmp 56 | """ 57 | 58 | -------------------------------------------------------------------------------- /exercise06/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Visualizing a larger workflow 3 | 4 | This `Snakefile` includes a whole RNA-Seq workflow from fastq to counts. Please 5 | note that this workflow is meant for illustration and should not be considered 6 | a best practices workflow. 7 | 8 | With the profile and the config files, this workflow can be run on the cluster with 9 | 10 | ```console 11 | user@cn1234> snakemake 12 | ``` 13 | 14 | There are two ways to visualize the workflow. A rulegraph illustrates the relationship of the 15 | rules to each other. Each rule is only shown once. 16 | 17 | ```console 18 | user@cn1234> module load graphviz 19 | user@cn1234> snakemake --rulegraph | dot -Tpng > rulegraph.png 20 | ``` 21 | 22 | ![workflow rulegraph](rulegraph.png) 23 | 24 | To display all the jobs that would be run to generate all output files and how 25 | they relate to each other, use the following: 26 | 27 | ```console 28 | user@cn1234> snakemake --dag | dot -Tpng > dag.png 29 | ``` 30 | 31 | ![workflow dag](dag.png) 32 | -------------------------------------------------------------------------------- /exercise06/Snakefile: -------------------------------------------------------------------------------- 1 | # vim: set ft=python: 2 | 3 | ### 4 | ### Exercise 6 - read through the snakefile and create a dag/rulegraph 5 | ### 6 | 7 | configfile: "config.yml" 8 | samples = config["samples"].keys() 9 | 10 | localrules: all, merge_counts, clean 11 | 12 | rule all: 13 | input: expand("01qc/{s}_fastqc.html", s=samples), 14 | expand("02aln/{s}.bam", s=samples), 15 | expand("01qc/{s}.infer_experiment", s=samples), 16 | "04count/merged", 17 | expand("05salmon/{s}", s=samples) 18 | 19 | rule clean: 20 | shell: 21 | """ 22 | rm -rf 01qc 02aln 03track 04count 05salmon 23 | """ 24 | 25 | rule fastqc: 26 | """ 27 | the shell portion of rules run in a singularity container 28 | is passed to singularity as a string.
Therefore shell 29 | variables have to be escaped or code moved to an external 30 | script 31 | """ 32 | input: "00fastq/{sample}.fastq.gz" 33 | output: pkg = "01qc/{sample}_fastqc.zip", 34 | summary = "01qc/{sample}_fastqc_summary.txt", 35 | data = "01qc/{sample}_fastqc_data.txt", 36 | html = "01qc/{sample}_fastqc.html" 37 | threads: 1 38 | resources: mem_mb = 2048 39 | singularity: 40 | "library://wresch/classes/rnaseq:0.8" 41 | shell: 42 | """ 43 | odir=01qc/{wildcards.sample}_fastqc 44 | fastqc -o 01qc --extract {input} 45 | mv $odir/summary.txt {output.summary} 46 | mv $odir/fastqc_data.txt {output.data} 47 | rm -rf $odir 48 | """ 49 | 50 | rule hisat2: 51 | input: fq = "00fastq/{sample}.fastq.gz", 52 | idx = config["reference"]["hisat_index"] 53 | output: bam = "02aln/{sample}.bam", 54 | bai = "02aln/{sample}.bam.bai" 55 | threads: 4 56 | resources: mem_mb = 6144 57 | params: hisat = "-k 4" 58 | singularity: 59 | "library://wresch/classes/rnaseq:0.8" 60 | shell: 61 | """ 62 | hisat2 {params.hisat} -x {input.idx} -U {input.fq} --threads {threads} \ 63 | | samtools sort -T /tmp/{wildcards.sample} -O BAM \ 64 | > {output.bam} 65 | samtools index {output.bam} 66 | ls -lh /tmp 67 | """ 68 | 69 | 70 | rule rseqc: 71 | """ 72 | post-process the sorted bam file. 73 | Copy bam file to /tmp and operate there. This is done b/c /tmp is 74 | a bind mount of /lscratch/$SLURM_JOB_ID. 75 | """ 76 | input: bam = "02aln/{sample}.bam", 77 | bai = "02aln/{sample}.bam.bai", 78 | bed = "00ref/R64-1-1.genes.bed12", 79 | gs = "00ref/chromosomes" 80 | output: ie = "01qc/{sample}.infer_experiment", 81 | gb = "01qc/{sample}.geneBodyCoverage.txt", 82 | gb2 = "01qc/{sample}.geneBodyCoverage.r", 83 | gb3 = "01qc/{sample}.geneBodyCoverage.curves.pdf", 84 | wig = "03track/{sample}.wig" 85 | threads: 1 86 | resources: mem_mb = 4096 87 | params: tmp_bam = lambda wc: "/tmp/{s}.bam".format(s=wc.sample) 88 | singularity: 89 | "library://wresch/classes/rnaseq:0.8" 90 | shell: 91 | """ 92 | cp {input.bam} {input.bai} /tmp 93 | infer_experiment.py -i {params.tmp_bam} -r {input.bed} > {output.ie} 94 | geneBody_coverage.py -i {params.tmp_bam} -r {input.bed} \ 95 | -o 01qc/{wildcards.sample} 96 | bam2wig.py -i {params.tmp_bam} -s {input.gs} \ 97 | -o $(echo {output.wig} | sed 's/.wig//') -t 1000000 -u 98 | """ 99 | 100 | 101 | rule count: 102 | input: bam = "02aln/{sample}.bam", 103 | annot = "00ref/R64-1-1.genes.gtf" 104 | output: "04count/{sample}" 105 | threads: 2 106 | resources: mem_mb = 4096 107 | singularity: 108 | "library://wresch/classes/rnaseq:0.8" 109 | shell: 110 | """ 111 | featureCounts -a {input.annot} -o {output} \ 112 | -T {threads} --minOverlap 10 {input.bam} 113 | """ 114 | 115 | rule merge_counts: 116 | input: expand("04count/{s}", s=samples) 117 | output: "04count/merged" 118 | threads: 1 119 | shell: 120 | """ 121 | set -x 122 | rm -f {output} 123 | for f in {input}; do 124 | if [[ -e {output} ]]; then 125 | paste {output} <(tail -n +2 $f | awk '{{print $7"\t"$1}}') \ 126 | | awk -F'\t' 'BEGIN{{OFS="\t"}} $NF != $1 {{exit 1}} {{NF-=1; print}}' \ 127 | > {output}.tmp && mv {output}.tmp {output} 128 | else 129 | tail -n +2 $f > {output} 130 | fi 131 | done 132 | """ 133 | 134 | rule salmon_index: 135 | input: config["reference"]["cdna_file"] 136 | output: "00ref/salmon_index" 137 | threads: 4 138 | resources: mem_mb = 6144 139 | singularity: 140 | "library://wresch/classes/rnaseq:0.8" 141 | shell: 142 | """ 143 | rm -rf {output} 144 | salmon index -t {input} -k 21 -i {output} -p {threads} 145 | """ 146 | 
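# salmon_quant (below) quantifies each sample against the index built by
# salmon_index above; '-l U' declares an unstranded single-end library and the
# reads are streamed in via process substitution instead of being decompressed
# to disk first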
rule salmon_quant:
    input: fq = "00fastq/{sample}.fastq.gz",
           idx = "00ref/salmon_index"
    output: "05salmon/{sample}/quant.sf"
    threads: 4
    resources: mem_mb = 4096
    singularity:
        "library://wresch/classes/rnaseq:0.8"
    shell:
        """
        # salmon expects an output *directory*; quant.sf is written inside it
        salmon quant -i {input.idx} -l U -r <(zcat {input.fq}) \
            -p {threads} -o $(dirname {output})
        """
--------------------------------------------------------------------------------
/exercise06/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NIH-HPC/snakemake-class/5f1b9bf4fcc0130bd9fff22cf9c7be6504c11959/exercise06/dag.png
--------------------------------------------------------------------------------
/exercise06/rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NIH-HPC/snakemake-class/5f1b9bf4fcc0130bd9fff22cf9c7be6504c11959/exercise06/rulegraph.png
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
#! /bin/bash

function fail() {
    echo "FAIL: $@" >&2
    exit 1
}

function info() {
    echo "INFO: $@"
}

module load singularity || fail "*** Please run the setup script in an sinteractive session ***"
module load snakemake/7 || fail "Could not load snakemake 7 module"
module load git || fail "Could not load git module"

## set up all the necessary bind mounts for transparent access to /home, /data, ...
source /usr/local/current/singularity/app_conf/sing_binds

## fetch the biowulf profile
if [[ ! -d bwprofile ]]
then
    info "Fetching snakemake profile for Biowulf from https://github.com/NIH-HPC/snakemake_profile"
    git clone https://github.com/NIH-HPC/snakemake_profile.git bwprofile &> /dev/null \
        || fail "unable to clone profile repo"
    echo "use-singularity: true" >> bwprofile/config.yaml
    echo "singularity-prefix: $PWD/00container" >> bwprofile/config.yaml
else
    info "Snakemake profile for biowulf already downloaded"
fi
info "The profile has been configured to use singularity"

# running the setup workflow will also cache the latest rnaseq container
export SNAKEMAKE_PROFILE=${PWD}/bwprofile
if snakemake -s setup.smk setup
then
    cat <<-EOF
+------------------------------------------------------------------------------+
|                                                                              |
|      Class materials have been set up successfully                           |
|                                                                              |
+------------------------------------------------------------------------------+
EOF
    # create a symlink to the most recent .simg file in 00container
    pushd 00container
    latest="$(ls -1 -t *.simg | head -n 1)"
    rm -f rnaseq.sif && ln -s "${latest}" rnaseq.sif
else
    cat <<-EOF
+------------------------------------------------------------------------------+
|                                                                              |
|      An error occurred during setup                                          |
|                                                                              |
+------------------------------------------------------------------------------+
EOF
fi
--------------------------------------------------------------------------------
/setup.smk:
--------------------------------------------------------------------------------
# vim: set ft=python:

# Samples from https://www.ebi.ac.uk/ena/data/view/PRJEB5348
# to use for this example
SAMPLES = ["ERR458495",
           "ERR458502",
"ERR458509", 8 | "ERR458516", 9 | "ERR458880", 10 | "ERR458887"] 11 | 12 | 13 | ### 14 | ### main driver rule 15 | ### 16 | localrules: setup, clean 17 | 18 | rule setup: 19 | input: "00fastq/ERP004763_sample_table.tsv", 20 | "00fastq/samples.yml", 21 | expand("00fastq/{sample}.fastq.gz", sample=SAMPLES), 22 | expand("exercise{n:02d}/config.yml", n=range(4, 7)), 23 | "00ref/R64-1-1.fa", 24 | "00ref/hisat_index/R64-1-1", 25 | "00ref/R64-1-1.cdna_nc.fa", 26 | "00ref/R64-1-1.genes.gtf", 27 | "00ref/ref.yml", 28 | "00ref/R64-1-1.tran2gene.tsv", 29 | "00ref/R64-1-1.genes.bed12", 30 | "00ref/chromosomes", 31 | shell: 32 | """ 33 | for ex in exercise0{{1..6}}; do 34 | rm -rf $ex/00ref $ex/00fastq 35 | cp -lr 00ref 00fastq $ex 36 | done 37 | """ 38 | 39 | rule clean: 40 | shell: 41 | """ 42 | rm -rf 00ref 00fastq logs exercise*/logs 43 | rm -rf exercise*/00* exercise*/rnaseq exercise*/config.yml 44 | rm -rf exercise*/02aln exercise*/04count 45 | rm -rf exercise*/slurm-* exercise*/.snakemake 46 | rm -rf exercise06/{{.cache,.java,.fontconfig}} 47 | rm -rf exercise06/{{01qc,03track,05salmon}} 48 | """ 49 | 50 | 51 | ### 52 | ### data 53 | ### 54 | 55 | localrules: fetch_sample_desc, sample_table, fetch_fastq, config_yml 56 | 57 | rule fetch_sample_desc: 58 | """fetch the sample description table; use local repo if possible""" 59 | output: "00fastq/ERP004763_sample_table.tsv" 60 | shell: 61 | """ 62 | if [[ -f /data/classes/snakemake/{output} ]]; then 63 | cp /data/classes/snakemake/{output} {output} 64 | else 65 | wget -O {output} \ 66 | https://ndownloader.figshare.com/files/2194841 67 | fi 68 | """ 69 | 70 | rule sample_table: 71 | input: "00fastq/ERP004763_sample_table.tsv" 72 | output: "00fastq/samples.yml" 73 | shell: 74 | """ 75 | echo "samples:" > {output} 76 | pattern="$(echo {SAMPLES} | tr -s ' ' '|')" 77 | egrep "$pattern" {input} \ 78 | | sort -k3,4 \ 79 | | awk '{{printf(" %s:\\n gt: %s\\n rep: %s\\n", $1, $3, $4)}}' \ 80 | >> {output} 81 | """ 82 | 83 | rule fetch_fastq: 84 | """fetch fastq; use local repo if possible""" 85 | output: "00fastq/{sample}.fastq.gz" 86 | shell: 87 | """ 88 | if [[ -f /data/classes/snakemake/{output} ]]; then 89 | cp /data/classes/snakemake/{output} {output} 90 | else 91 | url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq" 92 | sample="{wildcards.sample}" 93 | wget -O {output} "$url/${{sample:0:6}}/$sample/${{sample}}.fastq.gz" 94 | fi 95 | """ 96 | 97 | rule config_yml: 98 | input: "00fastq/samples.yml", "00ref/ref.yml" 99 | output: "{prefix}/config.yml" 100 | shell: 101 | """ 102 | cat {input} >> {output} 103 | """ 104 | 105 | 106 | ### 107 | ### reference data 108 | ### 109 | 110 | localrules: fetch_genome, fetch_transcriptome, fetch_gtf, gtf2bed12, make_transcript_gene_map, chroms, ref_yml 111 | 112 | ENSEMBL_RELEASE = 112 113 | ENSEMBL_URL = "ftp://ftp.ensembl.org/pub/release-{}".format(ENSEMBL_RELEASE) 114 | 115 | genome_build = "R64-1-1" 116 | 117 | rule fetch_genome: 118 | output: "00ref/R64-1-1.fa" 119 | shell: 120 | """ 121 | wget {ENSEMBL_URL}/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz 122 | gunzip Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz 123 | mv Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa {output} 124 | """ 125 | 126 | rule fetch_transcriptome: 127 | output: "00ref/R64-1-1.cdna_nc.fa" 128 | shell: 129 | """ 130 | wget {ENSEMBL_URL}/fasta/saccharomyces_cerevisiae/cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz 131 | wget 

###
### reference data
###

localrules: fetch_genome, fetch_transcriptome, fetch_gtf, gtf2bed12, make_transcript_gene_map, chroms, ref_yml

ENSEMBL_RELEASE = 112
ENSEMBL_URL = "ftp://ftp.ensembl.org/pub/release-{}".format(ENSEMBL_RELEASE)

genome_build = "R64-1-1"

rule fetch_genome:
    output: "00ref/R64-1-1.fa"
    shell:
        """
        wget {ENSEMBL_URL}/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz
        gunzip Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz
        mv Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa {output}
        """

rule fetch_transcriptome:
    output: "00ref/R64-1-1.cdna_nc.fa"
    shell:
        """
        wget {ENSEMBL_URL}/fasta/saccharomyces_cerevisiae/cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz
        wget {ENSEMBL_URL}/fasta/saccharomyces_cerevisiae/ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz
        gunzip Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz
        gunzip Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz
        cat Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa \
            Saccharomyces_cerevisiae.R64-1-1.ncrna.fa \
            | awk '/^>/ {{NF=1}} {{print}}' > {output}
        rm Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa \
            Saccharomyces_cerevisiae.R64-1-1.ncrna.fa
        """

rule fetch_gtf:
    output: "00ref/R64-1-1.genes.gtf"
    shell:
        """
        wget "{ENSEMBL_URL}/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.{ENSEMBL_RELEASE}.gtf.gz"
        gunzip Saccharomyces_cerevisiae.R64-1-1.{ENSEMBL_RELEASE}.gtf.gz
        mv Saccharomyces_cerevisiae.R64-1-1.{ENSEMBL_RELEASE}.gtf {output}
        """

rule gtf2bed12:
    input: "00ref/R64-1-1.genes.gtf"
    output: "00ref/R64-1-1.genes.bed12"
    run:
        # collect transcript and exon records from the GTF
        transcripts = {}
        i = 0
        for line in open(input[0]):
            if line.startswith('#'):
                continue
            chrom, _, feat, s, e, score, strand, _, attr = line.split('\t')
            if feat not in ('transcript', 'exon'):
                continue
            s = int(s) - 1
            e = int(e)
            assert s < e
            tid = None
            for a in attr.split(';'):
                if a.strip().startswith('transcript_id'):
                    tid = a.split('"')[1]
                    break
            if tid is None:
                raise ValueError
            if feat == 'transcript':
                i += 1
                transcripts[tid] = {'c': chrom, 's': s, 'e': e,
                                    'strand': strand, 'exons': [], 'tid': tid, 'i': i}
            else:
                transcripts[tid]['exons'].append((s, e))
        # write BED12; block starts are relative to the transcript start
        with open(output[0], "w") as of:
            for tid, t in sorted(transcripts.items(), key=lambda x: x[1]['i']):
                of.write("{c}\t{s}\t{e}\t{tid}\t0\t{strand}".format(**t))
                of.write("\t{s}\t{e}\t0,0,0\t{en}".format(en=len(t['exons']), **t))
                exons = sorted(t['exons'])
                of.write("\t{bsz}\t{bs}\n".format(
                    bsz=",".join(str(e - s) for s, e in exons),
                    bs=",".join(str(s - t['s']) for s, _ in exons)))

rule make_transcript_gene_map:
    input: "00ref/R64-1-1.genes.gtf"
    output: "00ref/R64-1-1.tran2gene.tsv"
    threads: 1
    run:
        # map every transcript_id to its gene_id
        transcripts = []
        for line in open(input[0]):
            if line.startswith('#'):
                continue
            chrom, _, feat, s, e, score, strand, _, attr = line.split('\t')
            if feat != 'transcript':
                continue
            tid = None
            gid = None
            for a in attr.split(';'):
                if a.strip().startswith('transcript_id'):
                    tid = a.split('"')[1]
                if a.strip().startswith('gene_id'):
                    gid = a.split('"')[1]
            if tid is None or gid is None:
                raise ValueError
            transcripts.append((tid, gid))
        with open(output[0], "w") as of:
            for tid, gid in transcripts:
                of.write("{}\t{}\n".format(tid, gid))

rule chroms:
    output: "00ref/chromosomes"
    shell:
        """
        curl -s {ENSEMBL_URL}/fasta/saccharomyces_cerevisiae/dna_index/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz.fai \
            | cut -f1,2 > {output}
        """

rule ref_yml:
    output: "00ref/ref.yml"
    shell:
        """
        echo "reference:" > {output}
        echo "  ensembl_ver: {ENSEMBL_RELEASE}" >> {output}
        echo "  genome_build: R64-1-1" >> {output}
        echo "  genome_file: 00ref/R64-1-1.fa" >> {output}
        echo "  cdna_file: 00ref/R64-1-1.cdna_nc.fa" >> {output}
        echo "  hisat_index: 00ref/hisat_index/R64-1-1" >> {output}
        """

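# Illustrative aside (comments only, since config.yml does not yet exist while
# setup.smk runs): ref.yml and samples.yml are concatenated by the config_yml
# rule into each exercise's config.yml, so an exercise Snakefile can pick up
# reference paths and sample metadata from a single file, roughly like this
# (variable names here are hypothetical; the exercise Snakefiles may differ):
#
#     configfile: "config.yml"
#     samples = sorted(config["samples"])              # ["ERR458495", ...]
#     hisat_index = config["reference"]["hisat_index"] # "00ref/hisat_index/R64-1-1"
#     cdna_file = config["reference"]["cdna_file"]     # "00ref/R64-1-1.cdna_nc.fa"
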
rule make_hisat_index:
    input: "00ref/R64-1-1.fa"
    output: idxf1 = "00ref/hisat_index/R64-1-1.1.ht2",
            name = "00ref/hisat_index/R64-1-1"
    threads: 4
    resources: mem_mb = 24 * 1024
    singularity:
        "library://wresch/classes/rnaseq:0.8"
    shell:
        """
        hisat2-build -p {threads} {input} {output.name} \
            && touch {output.name}
        """

--------------------------------------------------------------------------------
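
One detail of `make_hisat_index` worth spelling out: `hisat2-build` writes several files named `<basename>.N.ht2`, and the rule additionally touches an empty file with the bare basename. That placeholder is the path recorded as `hisat_index` in `ref.yml`, so downstream rules (such as the `hisat2` rule in `exercise06/Snakefile`) can depend on one stable path and pass it directly to `-x`. A minimal sketch of the same pattern, with hypothetical file and rule names not taken from the repository:

```
rule index:
    input: "genome.fa"
    output: shard1 = "index/genome.1.ht2",   # one of the real index files
            base = "index/genome"            # empty placeholder named after the index basename
    shell:
        """
        hisat2-build {input} {output.base} && touch {output.base}
        """

rule align:
    input: idx = rules.index.output.base,    # depend on the placeholder, not the shards
           fq = "{sample}.fastq.gz"
    output: "{sample}.bam"
    shell:
        """
        hisat2 -x {input.idx} -U {input.fq} | samtools sort -O BAM > {output}
        """
```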