├── README.md ├── bio └── ngs │ └── rules │ ├── annotation │ ├── hmmer.rules │ └── prodigal.rules │ ├── assembly │ ├── merge.rules │ ├── ray.rules │ └── report.rules │ ├── binning │ └── concoct.rules │ ├── blast │ └── rpsblast.rules │ ├── mapping │ ├── bowtie2.rules │ ├── report.rules │ └── samtools.rules │ ├── quality_control │ └── fastqc.rules │ └── trimming │ └── trimmomatic.rules ├── common └── rules │ ├── compression.rules │ └── track_dir.rules └── scheduling ├── Snakefile_qsub.py └── Snakefile_sbatch.py /README.md: -------------------------------------------------------------------------------- 1 | Snakemake Workflows 2 | =================== 3 | An experimental repo with [Snakemake] rules and workflows for Next Generation 4 | Sequencing, specifically for metagenomics. The directory structure is similar 5 | to [snakemake-workflows] from the Snakemake author himself so it may later be 6 | merged once the rules prove useful. 7 | 8 | 9 | Requirements 10 | ============ 11 | - [Snakemake] version 3.1 12 | 13 | [Snakemake]: https://bitbucket.org/johanneskoester/snakemake 14 | [Snakemake-workflows]: https://bitbucket.org/johanneskoester/snakemake 15 | -------------------------------------------------------------------------------- /bio/ngs/rules/annotation/hmmer.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | from snakemake.exceptions import MissingInputException 9 | 10 | 11 | rule hmmer_run: 12 | """ 13 | Uses HMmer to run query sequence against given database with given 14 | parameters. 15 | """ 16 | input: 17 | aa=lambda wildcards: config["hmmer_rules"]["query_aas"][wildcards.protein_aa] 18 | output: 19 | hmmer_out="hmmer/{parameters}/{db}/{protein_aa}/hmmer.out", 20 | hmmer_tsv="hmmer/{parameters}/{db}/{protein_aa}/hmmer.tsv" 21 | params: 22 | hmmer_params=lambda wildcards: config["hmmer_rules"]["hmmer_params"][wildcards.parameters], 23 | db=lambda wildcards: config["hmmer_rules"]["databases"][wildcards.db] 24 | shell: 25 | """ 26 | {config[hmmer_rules][load_env]} 27 | hmmscan \ 28 | {params.hmmer_params} \ 29 | --tblout {output.hmmer_tsv} \ 30 | {params.db} \ 31 | {input.aa} \ 32 | > {output.hmmer_out} 33 | """ 34 | 35 | 36 | rule hmmer_run_all: 37 | input: 38 | outs=expand("hmmer/{parameters}/{db}/{protein_aa}/hmmer.out", 39 | parameters=config["hmmer_rules"]["hmmer_params"], 40 | db=config["hmmer_rules"]["databases"], 41 | protein_aa=config["hmmer_rules"]["query_aas"]), 42 | tsvs=expand("hmmer/{parameters}/{db}/{protein_aa}/hmmer.tsv", 43 | parameters=config["hmmer_rules"]["hmmer_params"], 44 | db=config["hmmer_rules"]["databases"], 45 | protein_aa=config["hmmer_rules"]["query_aas"]) 46 | -------------------------------------------------------------------------------- /bio/ngs/rules/annotation/prodigal.rules: -------------------------------------------------------------------------------- 1 | from snakemake.exceptions import MissingInputException 2 | 3 | rule prodigal_run: 4 | input: 5 | asm=lambda wildcards: config["prodigal_rules"]["assemblies"][wildcards.assembly] 6 | output: 7 | aa="annotation/prodigal/{parameters}/{assembly}/proteins/proteins.faa", 8 | gff="annotation/prodigal/{parameters}/{assembly}/proteins/proteins.gff" 9 | log: 10 | "annotation/prodigal/{parameters}/{assembly}/proteins/proteins.log" 11 | params: 12 | prodigal_params=lambda wildcards: 
config["prodigal_rules"]["prodigal_params"][wildcards.parameters] 13 | shell: 14 | """ 15 | {config[prodigal_rules][load_env]} 16 | prodigal -i {input.asm} \ 17 | -a {output.aa} \ 18 | -f gff \ 19 | {params.prodigal_params} > {output.gff} \ 20 | 2> {log} 21 | """ 22 | 23 | rule prodigal_run_all: 24 | input: 25 | protein_aas=expand("annotation/prodigal/{parameters}/{assembly}/proteins/proteins.faa", 26 | parameters=config["prodigal_rules"]["prodigal_params"], 27 | assembly=config["prodigal_rules"]["assemblies"]), 28 | protein_gffs=expand("annotation/prodigal/{parameters}/{assembly}/proteins/proteins.gff", 29 | parameters=config["prodigal_rules"]["prodigal_params"], 30 | assembly=config["prodigal_rules"]["assemblies"]) 31 | -------------------------------------------------------------------------------- /bio/ngs/rules/assembly/merge.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | 9 | rule merge_newbler: 10 | input: 11 | lambda wildcards: config["assembly_merge_rules"]["merge"][wildcards.merge] 12 | output: 13 | "assembly/newbler/{merge}/454AllContigs.fna" 14 | shell: 15 | """ 16 | {config[assembly_merge_rules][load_env]} 17 | bash -x $METASSEMBLE_DIR/scripts/assembly/merge-asm-newbler.sh \ 18 | assembly/newbler/{wildcards.merge} {input} 19 | """ 20 | 21 | 22 | rule merge_newbler_all: 23 | input: 24 | expand("assembly/newbler/{merge}/454AllContigs.fna", 25 | merge=config["assembly_merge_rules"]["merge"]) 26 | -------------------------------------------------------------------------------- /bio/ngs/rules/assembly/ray.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | 9 | def create_ray_read_input_str(unit): 10 | if len(unit) == 2: 11 | return "-p {unit[0]} {unit[1]}".format(unit=unit) 12 | elif len(unit) == 1: 13 | return "-s {unit[0]}".format(unit=unit) 14 | else: 15 | raise(Exception("Units should either be paired library or single read library.")) 16 | 17 | rule ray_assembly: 18 | input: 19 | lambda wildcards: sum([config["ray_rules"]["units"][unit] for unit in config["ray_rules"]["samples"][wildcards.sample]], []) 20 | output: 21 | "assembly/ray/{assembly_params}/{sample}/out_{kmer}/Contigs.fasta" 22 | params: 23 | custom=lambda wildcards: config["ray_rules"]["assembly_params"][wildcards.assembly_params], 24 | read_input_str=lambda wildcards: " ".join([create_ray_read_input_str(config["ray_rules"]["units"][unit]) for unit in config["ray_rules"]["samples"][wildcards.sample]]) 25 | shell: 26 | """ 27 | {config[ray_rules][load_env]} 28 | rm -rf assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/tmp 29 | {config[ray_rules][mpi_cmd]} Ray {params.custom} \ 30 | {params.read_input_str} \ 31 | -o assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/tmp 32 | mv assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/tmp/* \ 33 | assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/ 34 | """ 35 | 36 | rule ray_assembly_all: 37 | input: 38 | expand("assembly/ray/{assembly_params}/{sample}/out_{kmer}/Contigs.fasta", assembly_params=config["ray_rules"]["assembly_params"], sample=config["ray_rules"]["samples"], kmer=config["ray_rules"]["kmers"]) 39 
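A minimal config sketch for the Ray rules above, inferred from the keys the rule code reads (load_env, mpi_cmd, assembly_params, kmers, samples, units); the paths and values shown are illustrative placeholders only, in the same style as the bowtie2.rules config example further down:

    {
        "ray_rules": {
            "load_env": "",
            "mpi_cmd": "mpiexec -n 16",
            "assembly_params": {"default": ""},
            "kmers": [31, 51],
            "samples": {"A": ["A"]},
            "units": {"A": ["path/to/A_R1.fastq.gz", "path/to/A_R2.fastq.gz"]}
        }
    }

Note that the kmer wildcard only appears in the output path of ray_assembly; any -k setting passed to Ray itself would have to be part of assembly_params.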
| -------------------------------------------------------------------------------- /bio/ngs/rules/assembly/report.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for analysing fasta files with FastQC. 7 | 8 | For usage, include this in your workflow. 9 | """ 10 | 11 | 12 | __author__ = "Ino de Bruijn (http://ino.pm)" 13 | __license__ = "MIT" 14 | 15 | 16 | rule assembly_report: 17 | """Creates an assembly HTML report + json for given assemblies. Uses masmvali (github.com/inodb/masmvali)""" 18 | input: 19 | asms=lambda wildcards: sorted(config["assembly_report_rules"]["assemblies"]) 20 | output: 21 | report="report/assemblies/index.html", 22 | json="report/assemblies/data.json" 23 | shell: 24 | """ 25 | {config[assembly_report_rules][load_env]} 26 | python <(cat < {output.json} 54 | cat > {output.report} < 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 |

[report heredoc HTML, tags lost in extraction: navbar titled "Assembly Stats" and a results table with columns name, cutoff, nr_contigs, totbases, l50, n50, max_length]
118 | 119 | 120 | EOF 121 | """ 122 | -------------------------------------------------------------------------------- /bio/ngs/rules/binning/concoct.rules: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from snakemake.exceptions import MissingInputException 4 | from snakemake.utils import report 5 | 6 | # Check values in config file 7 | CONFIG_REQS = ["assemblies", "mapper", "mapping_params", "concoct_params", "scripts_dir"] 8 | if "concoct_rules" not in config: 9 | raise(Exception("concoct_rules key not in config file")) 10 | for cr in CONFIG_REQS: 11 | if cr not in config["concoct_rules"]: 12 | raise(Exception("{cr} not in concoct_rules config file".format(cr=cr))) 13 | 14 | 15 | # add 10K cutup as references for bowtie2 to map against 16 | config["bowtie2_rules"].setdefault("references", {}).update({a + "_10K": "concoct/{a}/cutup/contigs_10K.fasta".format(a=a) for a in config["concoct_rules"]["assemblies"]}) 17 | 18 | rule concoct_cutup_10K: 19 | input: 20 | lambda wildcards: config["concoct_rules"]["assemblies"][wildcards.assembly] 21 | output: 22 | "concoct/{assembly}/cutup/contigs_10K.fasta" 23 | params: 24 | chunk_size="10000", 25 | overlap="0" 26 | threads: 1 27 | shell: 28 | """ 29 | {config[concoct_rules][load_env]} 30 | python {config[concoct_rules][scripts_dir]}/cut_up_fasta.py -c {params.chunk_size} -o {params.overlap} \ 31 | -m {input} > {output} 32 | """ 33 | 34 | rule concoct_cutup_10K_all: 35 | input: 36 | expand("concoct/{assembly}/cutup/contigs_10K.fasta", assembly=config["concoct_rules"]["assemblies"]) 37 | 38 | 39 | rule concoct_map_10K_all: 40 | input: 41 | expand("mapping/{mapper}/{mapping_params}/{assembly}/samples/{sample}.sorted.removeduplicates.bam", 42 | assembly=config["bowtie2_rules"]["references"], 43 | sample=config["bowtie2_rules"]["samples"], 44 | mapping_params=config["concoct_rules"]["mapping_params"], 45 | mapper=config["concoct_rules"]["mapper"]) 46 | 47 | 48 | rule concoct_generate_coverage_table_10K: 49 | input: 50 | asm="concoct/{assembly}/cutup/contigs_10K.fasta", 51 | bedcovs=expand("mapping/{mapper}/{mapping_params}/{{assembly}}_10K/samples/{samples}.sorted.removeduplicates.coverage.tsv", 52 | samples=sorted(config["bowtie2_rules"]["samples"]), 53 | mapper=config["concoct_rules"]["mapper"], 54 | mapping_params=config["concoct_rules"]["mapping_params"]) 55 | output: 56 | "concoct/{assembly}/input/concoct_inputtable.tsv", 57 | "concoct/{assembly}/input/concoct_inputtableR.tsv" 58 | params: 59 | sample_names=sorted(config["bowtie2_rules"]["samples"]) 60 | shell: 61 | """ 62 | {config[concoct_rules][load_env]} 63 | python {config[concoct_rules][scripts_dir]}/gen_input_table.py --isbedfiles \ 64 | --samplenames <(for s in {params.sample_names}; do echo $s; done) \ 65 | {input.asm} {input.bedcovs} \ 66 | > {output[0]} && \ 67 | cut -f1,3- {output[0]} > {output[1]} 68 | """ 69 | 70 | 71 | rule concoct_inputtable_10K_all: 72 | input: 73 | expand("concoct/{assembly}/input/concoct_inputtableR.tsv", assembly=config["concoct_rules"]["assemblies"]) 74 | 75 | 76 | rule concoct_run_10K: 77 | """ 78 | Run CONCOCT 79 | """ 80 | input: 81 | asm="concoct/{assembly}/cutup/contigs_10K.fasta", 82 | input_table="concoct/{assembly}/input/concoct_inputtableR.tsv" 83 | output: 84 | clustering="concoct/{assembly}/output/{cparams}/clustering.csv" 85 | params: 86 | output_folder="concoct/{assembly}/output/{cparams}/", 87 | concoct_params=lambda wildcards: 
config["concoct_rules"]["concoct_params"][wildcards.cparams] 88 | shell: 89 | """ 90 | {config[concoct_rules][load_env]} 91 | concoct {params.concoct_params} \ 92 | --coverage_file {input.input_table} \ 93 | --composition_file {input.asm} \ 94 | -b {params.output_folder} && \ 95 | ln -fs $(basename {params.output_folder}clustering_gt*.csv) \ 96 | {output.clustering} && \ 97 | touch -h {output.clustering} 98 | """ 99 | 100 | 101 | rule concoct_run_10K_all: 102 | """ 103 | Run CONCOCT on all assemblies over all parameters specified in the config file. 104 | """ 105 | input: 106 | expand("concoct/{assembly}/output/{concoct_params}/clustering.csv", 107 | assembly=config["concoct_rules"]["assemblies"], 108 | concoct_params=config["concoct_rules"]["concoct_params"]) 109 | 110 | 111 | # add 10K cutup as assemblies for prodigal to predict genes for 112 | config["prodigal_rules"]["assemblies"] = {a + "_10K": "concoct/{a}/cutup/contigs_10K.fasta".format(a=a) for a in config["concoct_rules"]["assemblies"]} 113 | 114 | # add prodigal predicted genes as query for rpsblast 115 | config["rpsblast_rules"]["query_aas"] = {a: "annotation/prodigal/default-meta/{a}/proteins/proteins.faa".format(a=a) for a in config["prodigal_rules"]["assemblies"]} 116 | 117 | # add prodigal predicted genes as query for hmmer 118 | config["hmmer_rules"]["query_aas"] = config["rpsblast_rules"]["query_aas"] 119 | 120 | rule concoct_eval_cog_table: 121 | """ 122 | Generate COG table from rpsblast output and concoct binning results 123 | """ 124 | input: 125 | clust="concoct/{assembly}/output/{concoct_params}/clustering.csv", 126 | rpsblast="blast/rpsblast/default-concoct/cog/{assembly}_10K/rpsblast.out" 127 | output: 128 | "concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.tsv" 129 | shell: 130 | """ 131 | {config[concoct_rules][load_env]} 132 | python {config[concoct_rules][scripts_dir]}/COG_table.py \ 133 | -b {input.rpsblast} \ 134 | -m {config[concoct_rules][scripts_dir]}/../scgs/scg_cogs_min0.97_max1.03_unique_genera.txt \ 135 | -c {input.clust} \ 136 | --cdd_cog_file {config[concoct_rules][scripts_dir]}/../scgs/cdd_to_cog.tsv \ 137 | > {output} 138 | """ 139 | 140 | rule concoct_extract_approved_scg_bins: 141 | input: 142 | scg_tsvs=expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.tsv", 143 | assembly=sorted(config["concoct_rules"]["assemblies"]), 144 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])), 145 | asms=expand("concoct/{assembly}/cutup/contigs_10K.fasta", 146 | assembly=sorted(config["concoct_rules"]["assemblies"]), 147 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])) 148 | output: 149 | dynamic("concoct/approved_scg_bins/{cluster_name}.fa") 150 | params: 151 | names=expand("{assembly}_{concoct_params}", 152 | assembly=sorted(config["concoct_rules"]["assemblies"]), 153 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])), 154 | groups=expand("{assembly}", 155 | assembly=sorted(config["concoct_rules"]["assemblies"]), 156 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])) 157 | shell: 158 | """ 159 | {config[concoct_rules][load_env]} 160 | python {config[concoct_rules][scripts_dir]}/extract_scg_bins.py \ 161 | --output_folder concoct/approved_scg_bins \ 162 | --scg_tsvs {input.scg_tsvs} \ 163 | --fasta_files {input.asms} \ 164 | --names {params.names} \ 165 | --groups {params.groups} \ 166 | --max_missing_scg 5 \ 167 | --max_multicopy_scg 2 168 | """ 169 | 170 | rule concoct_extract_approved_scg_bins_all: 171 
| input: 172 | dynamic("concoct/approved_scg_bins/{cluster_name}.fa") 173 | 174 | 175 | rule concoct_dnadiff_dist_matrix: 176 | """Get distance matrix from approved SCG bins""" 177 | input: 178 | clusters=dynamic("concoct/approved_scg_bins/{cluster_name}.fa") 179 | output: 180 | "concoct/dnadiff_dist_matrix/dist_matrix.tsv", 181 | "concoct/dnadiff_dist_matrix/hclust_heatmap.pdf", 182 | "concoct/dnadiff_dist_matrix/hclust_dendrogram.pdf" 183 | run: 184 | sorted_input = sorted(input.clusters) 185 | shell(""" 186 | {config[concoct_rules][load_env]} 187 | python {config[concoct_rules][scripts_dir]}/dnadiff_dist_matrix.py \ 188 | concoct/dnadiff_dist_matrix {sorted_input} 189 | """) 190 | 191 | 192 | rule concoct_dnadiff_dist_matrix_report: 193 | input: 194 | dnadiff_output=rules.concoct_dnadiff_dist_matrix.output, 195 | readme_rst=glob.glob("report/concoct/dnadiff_dist_matrix/README.rst") 196 | output: 197 | "report/concoct/dnadiff_dist_matrix/index.html" 198 | params: 199 | readme_html="report/concoct/dnadiff_dist_matrix/README.html" 200 | shell: 201 | """ 202 | cp --parents {input.dnadiff_output} report/ 203 | ( 204 | echo '' 205 | for p in $(for i in {input.dnadiff_output}; do basename $i; done | sort); do 206 | echo "$p
" 207 | done 208 | for f in {input.readme_rst}; do 209 | echo "" 210 | rst2html.py $f > {params.readme_html} 211 | done 212 | echo '' 213 | ) > {output} 214 | """ 215 | 216 | 217 | rule concoct_eval_cog_plot: 218 | """ 219 | Plot COGs using COG table 220 | """ 221 | input: 222 | "concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.tsv" 223 | output: 224 | "concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf" 225 | shell: 226 | """ 227 | {config[concoct_rules][load_env]} 228 | Rscript {config[concoct_rules][scripts_dir]}/COGPlot.R \ 229 | -s {input} \ 230 | -o {output} 231 | """ 232 | 233 | 234 | rule concoct_eval_cog_plot_all: 235 | input: 236 | expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf", 237 | assembly=config["concoct_rules"]["assemblies"], 238 | concoct_params=config["concoct_rules"]["concoct_params"]) 239 | 240 | 241 | 242 | rule concoct_eval_cog_report: 243 | input: 244 | expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf", 245 | assembly=config["concoct_rules"]["assemblies"], 246 | concoct_params=config["concoct_rules"]["concoct_params"]) 247 | output: 248 | "report/concoct/cog_plots.html" 249 | shell: 250 | """ 251 | cp --parents {input} report/ 252 | ( 253 | echo '' 254 | for p in $(for i in {input}; do echo $i | cut -d/ -f2-; done | sort); do 255 | echo "$p
" 256 | done 257 | echo '' 258 | ) > {output} 259 | """ 260 | 261 | rule concoct_eval_report: 262 | input: 263 | cog_html=rules.concoct_eval_cog_report.output, 264 | dnadiff_html=rules.concoct_dnadiff_dist_matrix_report.output 265 | output: 266 | "report/concoct/index.html" 267 | shell: 268 | """ 269 | ( 270 | echo '' 271 | echo "COG Plots
" 272 | echo "DNA Diff matrix as constructed with MUMmer
" 273 | echo '' 274 | ) > {output} 275 | """ 276 | 277 | 278 | rule concoct_eval_cog_report_flashy: 279 | input: 280 | cog_plots=expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf", 281 | assembly=config["concoct_rules"]["assemblies"], 282 | concoct_params=config["concoct_rules"]["concoct_params"]) 283 | output: 284 | html="report/concoct/cog_plots_flashy.html" 285 | run: 286 | dict_cp = {"{a}-{cp}".format(a=cp.split("/")[1],cp=cp.split("/")[4]):cp for cp in input.cog_plots} 287 | cp_ids = "\n".join(["- " + cp + "_" for cp in sorted(dict_cp.keys())]) 288 | report(""" 289 | ========= 290 | SCG Plots 291 | ========= 292 | {cp_ids} 293 | """, output.html, **dict_cp) 294 | -------------------------------------------------------------------------------- /bio/ngs/rules/blast/rpsblast.rules: -------------------------------------------------------------------------------- 1 | rule rpsblast_run: 2 | """ 3 | Uses GNU Parallel to run query sequence against given database with given 4 | rpsblast parameters. 5 | """ 6 | input: 7 | aa=lambda wildcards: config["rpsblast_rules"]["query_aas"][wildcards.protein_aa] 8 | output: 9 | rps_out="blast/rpsblast/{parameters}/{db}/{protein_aa}/rpsblast.out" 10 | params: 11 | rpsblast_params=lambda wildcards: config["rpsblast_rules"]["rpsblast_params"][wildcards.parameters].replace('"',"'"), 12 | db=lambda wildcards: config["rpsblast_rules"]["databases"][wildcards.db], 13 | parallel_params=config["rpsblast_rules"].get("parallel_params", "") 14 | shell: 15 | """ 16 | {config[rpsblast_rules][load_env]} 17 | cat {input.aa} | \ 18 | parallel {params.parallel_params} \ 19 | --pipe -k --recstart '>' --no-notice \ 20 | rpsblast "{params.rpsblast_params}" \ 21 | -query - \ 22 | -db {params.db} \ 23 | > {output.rps_out} 24 | """ 25 | 26 | 27 | rule rpsblast_run_all: 28 | input: 29 | expand("blast/rpsblast/{parameters}/{db}/{protein_aa}/rpsblast.out", 30 | parameters=config["rpsblast_rules"]["rpsblast_params"], 31 | db=config["rpsblast_rules"]["databases"], 32 | protein_aa=config["rpsblast_rules"]["query_aas"]) 33 | -------------------------------------------------------------------------------- /bio/ngs/rules/mapping/bowtie2.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | """ 5 | Read mapping with Bowtie2 6 | 7 | For usage, include this in your workflow. 8 | 9 | Expects the global variable config 10 | (see https://bitbucket.org/johanneskoester/snakemake/wiki/Documentation#markdown-header-configuration) 11 | of at least the following structure, assuming that the desired reference sequence is some genome 12 | to be found under the given path, and two units A and B have been sequenced with Illumina, 13 | the first paired and the second single end: 14 | 15 | { 16 | "bowtie2_rules": { 17 | "references": { 18 | "genome": "path/to/genome.fasta" 19 | }, 20 | "samples": { 21 | "A": ["A"], 22 | "B": ["B"] 23 | }, 24 | "units": { 25 | "A": 26 | ["path/to/A_R1.fastq.gz", "path/to/A_R2.fastq.gz"], 27 | "B": 28 | ["path/to/B.fastq.gz"] 29 | }, 30 | "platform": "Illumina", 31 | "mapping_params": { 32 | "default": "" 33 | } 34 | } 35 | } 36 | 37 | Note the separation between samples and units that allows to have more than 38 | one sequencing run for each sample, or multiple lanes per sample. 
39 | """ 40 | 41 | 42 | __author__ = "Johannes Köster (http://johanneskoester.bitbucket.org), Ino de Bruijn" 43 | __license__ = "MIT" 44 | 45 | 46 | UNIT_TO_SAMPLE = { 47 | unit: sample for sample, units in config["bowtie2_rules"]["samples"].items() 48 | for unit in units} 49 | 50 | def create_bowtie2_read_input_str(unit): 51 | if len(unit) == 2: 52 | return "-1 {unit[0]} -2 {unit[1]}".format(unit=unit) 53 | elif len(unit) == 1: 54 | return "-r {unit[0]}".format(unit=unit) 55 | else: 56 | raise(Exception("Units should either be paired library or single read library.")) 57 | 58 | from snakemake.exceptions import MissingInputException 59 | 60 | 61 | rule bowtie2_index: 62 | input: 63 | lambda wildcards: config["bowtie2_rules"]["references"][wildcards.reference] 64 | output: 65 | expand("mapping/bowtie2/{{reference}}.{index}.bt2", index=range(1,5)), 66 | expand("mapping/bowtie2/{{reference}}.rev.{index}.bt2", index=range(1,3)) 67 | params: 68 | prefix="mapping/bowtie2/{reference}" 69 | shell: 70 | """ 71 | {config[bowtie2_rules][load_env]} 72 | bowtie2-build {input} {params.prefix} 73 | """ 74 | 75 | 76 | rule bowtie2_map: 77 | input: 78 | lambda wildcards: config["bowtie2_rules"]["units"][wildcards.unit], 79 | expand("mapping/bowtie2/{{reference}}.{index}.bt2", index=range(1,5)), 80 | expand("mapping/bowtie2/{{reference}}.rev.{index}.bt2", index=range(1,3)) 81 | output: 82 | "mapping/bowtie2/{mapping_params}/{reference}/units/{unit,\w+}.bam" 83 | params: 84 | sample=lambda wildcards: UNIT_TO_SAMPLE[wildcards.unit], 85 | custom=lambda wildcards: config["bowtie2_rules"]["mapping_params"][wildcards.mapping_params], 86 | read_input_str=lambda wildcards: create_bowtie2_read_input_str(config["bowtie2_rules"]["units"][wildcards.unit]), 87 | ref_idx_base="mapping/bowtie2/{reference}", 88 | sam="mapping/bowtie2/{mapping_params}/{reference}/units/{unit,\w+}.sam" 89 | log: 90 | "mapping/bowtie2/{mapping_params}/{reference}/units/{unit,\w+}.log" 91 | threads: 4 92 | shell: 93 | """ 94 | {config[bowtie2_rules][load_env]} 95 | bowtie2 {params.custom} \ 96 | --rg-id '{wildcards.unit}' \ 97 | --rg 'SM:{params.sample}\\tPL:{config[bowtie2_rules][platform]}' \ 98 | -x {params.ref_idx_base} \ 99 | -p {threads} {params.read_input_str} \ 100 | -S {params.sam} \ 101 | 2> {log} && \ 102 | samtools view -Sbh {params.sam} > {output} && \ 103 | rm {params.sam} 104 | """ 105 | -------------------------------------------------------------------------------- /bio/ngs/rules/mapping/report.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | 9 | rule mapping_report: 10 | """Creates an assembly HTML report + json for given mappings. Order of 11 | given bowtie2 and markduplicates log/metrics files should be similar""" 12 | input: 13 | b2_logs=lambda wildcards: sorted(config["mapping_report_rules"]["bowtie2_logs"]), 14 | md_logs=lambda wildcards: sorted(config["mapping_report_rules"]["markduplicates_metrics"]) 15 | output: 16 | report="report/mapping/index.html", 17 | json="report/mapping/data.json" 18 | shell: 19 | """ 20 | {config[mapping_report_rules][load_env]} 21 | python <(cat < {output.json} 49 | cat > {output.report} < 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |

[report heredoc HTML, tags lost in extraction: navbar titled "Mapping Stats" and a results table with columns name, nr_reads, overall_alignment_rate, percent_duplication]
101 | 102 | 103 | EOF 104 | """ 105 | -------------------------------------------------------------------------------- /bio/ngs/rules/mapping/samtools.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for modifying SAM or BAM files. Need samtools in your path and config 7 | needs to set picard location with jars in 'picard_jars'. 8 | 9 | For usage, include this in your workflow. 10 | """ 11 | 12 | import os 13 | 14 | 15 | __author__ = "Johannes Köster, Ino de Bruijn" 16 | __license__ = "MIT" 17 | 18 | 19 | rule bam_index: 20 | input: 21 | "{prefix}.bam" 22 | output: 23 | "{prefix}.bam.bai" 24 | shell: 25 | """ 26 | {config[samtools_rules][load_env]} 27 | samtools index {input} 28 | """ 29 | 30 | 31 | rule bam_sort: 32 | input: 33 | "{prefix}.bam" 34 | output: 35 | "{prefix}.sorted.bam" 36 | shell: 37 | """ 38 | {config[samtools_rules][load_env]} 39 | samtools sort {input} {wildcards.prefix}.sorted 40 | """ 41 | 42 | 43 | rule bam_sort_name: 44 | input: 45 | "{prefix}.bam" 46 | output: 47 | "{prefix}.namesorted.bam" 48 | shell: 49 | """ 50 | {config[samtools_rules][load_env]} 51 | samtools sort -n {input} {wildcards.prefix}.namesorted 52 | """ 53 | 54 | 55 | rule sam_to_bam: 56 | input: 57 | "{prefix}.sam" 58 | output: 59 | "{prefix}.bam" 60 | shell: 61 | """ 62 | {config[samtools_rules][load_env]} 63 | samtools view -Sbh {input} > {output} 64 | """ 65 | 66 | 67 | rule bam_stats: 68 | input: 69 | "{prefix}.bam" 70 | output: 71 | "{prefix}.stats.txt" 72 | shell: 73 | """ 74 | {config[samtools_rules][load_env]} 75 | samtools idxstats {input} > {output} 76 | """ 77 | 78 | 79 | rule bam_measure_insert_size: 80 | input: 81 | "{prefix}.sorted.bam" 82 | output: 83 | txt="{prefix}.insert_size.txt", 84 | pdf="{prefix}.insert_size_histogram.pdf" 85 | shell: 86 | """ 87 | java -jar {config[samtools_rules][picard_jars]}/CollectInsertSizeMetrics.jar \ 88 | INPUT={input} \ 89 | OUTPUT={output.txt} \ 90 | HISTOGRAM_FILE={output.pdf} 91 | """ 92 | 93 | 94 | rule fasta_index: 95 | input: 96 | "{prefix}.{suffix}" 97 | output: 98 | "{prefix}.{suffix,(fasta|fa)}.fai" 99 | shell: 100 | """ 101 | {config[samtools_rules][load_env]} 102 | samtools faidx {input} 103 | """ 104 | 105 | 106 | rule fasta_dict: 107 | input: 108 | "{prefix}.fasta" 109 | output: 110 | "{prefix}.dict" 111 | shell: 112 | """ 113 | java -jar {config[samtools_rules][picard_jars]}/CreateSequenceDictionary.jar \ 114 | REFERENCE={input} \ 115 | OUTPUT={output} 116 | """ 117 | 118 | 119 | rule remove_mark_duplicates: 120 | input: 121 | "{prefix}.sorted.bam" 122 | output: 123 | "{prefix}.sorted.removeduplicates.bam", 124 | "{prefix}.sorted.removeduplicates.metrics" 125 | log: 126 | "{prefix}.sorted.removeduplicates.log" 127 | params: 128 | java_opt="-Xms2g -Xmx32g -XX:MaxPermSize=2g -XX:+CMSClassUnloadingEnabled" 129 | shell: 130 | """ 131 | java {params.java_opt} -XX:ParallelGCThreads={threads} \ 132 | -jar {config[samtools_rules][picard_jars]}/MarkDuplicates.jar \ 133 | INPUT={input} \ 134 | OUTPUT={output[0]} \ 135 | METRICS_FILE={output[1]} \ 136 | AS=TRUE \ 137 | VALIDATION_STRINGENCY=LENIENT \ 138 | MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ 139 | REMOVE_DUPLICATES=TRUE 2> {log} 140 | """ 141 | 142 | 143 | rule bedtools_coverage: 144 | input: 145 | "{prefix}.bam" 146 | output: 147 | "{prefix}.coverage.tsv" 148 | shell: 149 | """ 150 | {config[samtools_rules][load_env]} 151 | genomeCoverageBed -ibam {input} > 
{output} 152 | """ 153 | 154 | 155 | rule sample_merge: 156 | """ 157 | Merge bam files for multiple units into one for the given sample. 158 | If the sample has only one unit, a symlink will be created. 159 | """ 160 | input: 161 | lambda wildcards: expand( 162 | "mapping/bowtie2/{mapping_params}/{reference}/units/{unit}.sorted.removeduplicates.bam", 163 | unit=config["bowtie2_rules"]["samples"][wildcards.sample], 164 | mapping_params=wildcards.mapping_params, 165 | reference=wildcards.reference) 166 | output: 167 | "mapping/bowtie2/{mapping_params}/{reference}/samples/{sample}.sorted.removeduplicates.bam" 168 | run: 169 | if len(input) > 1: 170 | shell("{config[samtools_rules][load_env]} && " 171 | "samtools merge {output} {input}") 172 | else: 173 | shell("ln -fs ../units/{basename} {{output}} && touch -h " 174 | "{{output}}".format(basename=os.path.basename(input[0]))) 175 | -------------------------------------------------------------------------------- /bio/ngs/rules/quality_control/fastqc.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for analysing fasta files with FastQC. 7 | 8 | For usage, include this in your workflow. 9 | """ 10 | 11 | 12 | __author__ = "Johannes Köster (http://johanneskoester.bitbucket.org), Ino de Bruijn (http://ino.pm)" 13 | __license__ = "MIT" 14 | 15 | 16 | # Check values in config file 17 | CONFIG_REQS = ["reads"] 18 | if "fastqc_rules" not in config: 19 | raise(Exception("fastqc_rules key not in config file")) 20 | for cr in CONFIG_REQS: 21 | if cr not in config["fastqc_rules"]: 22 | raise(Exception("{cr} not in config file".format(cr=cr))) 23 | 24 | 25 | def get_fasta_basename(filename): 26 | """Return basename of fasta/fastq file""" 27 | filename = os.path.basename(filename) 28 | possible_ext = [".fastq", ".fq.gz", ".fastq.gz", ".fasta", ".fa", ".fa.gz", 29 | ".fasta.gz"] 30 | for e in possible_ext: 31 | if filename.endswith(e): 32 | return filename[:-len(e)] 33 | return filename 34 | 35 | import os 36 | 37 | 38 | rule fastqc: 39 | """Generates fastqc output for given fastq or fastq.gz file. 
The reads can 40 | be specified in the config file but this is not necessary.""" 41 | input: 42 | lambda wildcards: \ 43 | ["fastqc/{}".format(os.path.basename(r)) for r in config["fastqc_rules"]["reads"] \ 44 | if get_fasta_basename(r) == os.path.basename(wildcards.prefix)]\ 45 | or glob.glob(wildcards.prefix + ".fa*") \ 46 | or "{}.{{fastq.gz,fastq}}".format(wildcards.prefix) 47 | output: 48 | "{prefix}_fastqc.zip", 49 | "{prefix}_fastqc.html" 50 | shell: 51 | """ 52 | {config[fastqc_rules][load_env]} 53 | fastqc {input} 54 | """ 55 | 56 | 57 | rule create_read_symlink: 58 | """Create symbolic links for given reads""" 59 | input: 60 | lambda wildcards: config["fastqc_rules"]["reads"][wildcards.reads] 61 | output: 62 | "fastqc/{reads}" 63 | shell: 64 | """ 65 | ln -s $(readlink -f {input}) {output} 66 | """ 67 | 68 | 69 | rule fastqc_all: 70 | input: 71 | links=expand("fastqc/{reads}", reads=config["fastqc_rules"]["reads"]), 72 | htmls=expand("fastqc/{reads}_fastqc.html", reads=[get_fasta_basename(r) for r in config["fastqc_rules"]["reads"]]), 73 | zips=expand("fastqc/{reads}_fastqc.zip", reads=[get_fasta_basename(r) for r in config["fastqc_rules"]["reads"]]) 74 | 75 | 76 | import glob 77 | 78 | rule fastqc_report: 79 | input: 80 | htmls=sorted(rules.fastqc_all.input.htmls), 81 | zips=sorted(rules.fastqc_all.input.zips), 82 | readme_rst=glob.glob("report/fastqc/README.rst") 83 | output: 84 | report="report/fastqc/index.html", 85 | json="report/fastqc/data.json" 86 | params: 87 | htmls_basename=[os.path.basename(h) for h in sorted(rules.fastqc_all.input.htmls)], 88 | readme_html="report/fastqc/README.html" 89 | shell: 90 | """ 91 | cp --parents {input.htmls} report/ 92 | htmls=( {params.htmls_basename} ) 93 | zips=( {input.zips} ) 94 | ( 95 | cat < 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 |

[report heredoc HTML, tags lost in extraction: navbar titled "FastQC" and a summary table with columns Reads, Total Seq, PASS, WARN, FAIL]
151 | EOF 152 | for f in {input.readme_rst}; do 153 | echo "
" 154 | rst2html.py $f > {params.readme_html} 155 | done 156 | echo '' 157 | ) > {output.report} 158 | ( 159 | echo "[" 160 | for i in $(seq 0 $((${{#htmls[@]}}-1))); do 161 | echo "{{" 162 | echo '"url":"${{htmls[$i]}}"'",' 163 | echo '"PASS":'$(unzip -p ${{zips[$i]}} '*/summary.txt' | grep -c PASS), 164 | echo '"WARN":'$(unzip -p ${{zips[$i]}} '*/summary.txt' | grep -c WARN), 165 | echo '"FAIL":'$(unzip -p ${{zips[$i]}} '*/summary.txt' | grep -c FAIL), 166 | paste <(unzip -p ${{zips[$i]}} '*/summary.txt' | cut -f2) <(unzip -p ${{zips[$i]}} '*/summary.txt' | cut -f1) | awk -v ORS=",\n" -v FS="\t" -v OFS=":" '{{print "\\""$1"\\"", "\\""$2"\\""}}' 167 | echo '"Total Sequences":'$(unzip -p ${{zips[$i]}} '*/fastqc_data.txt' | grep "Total Sequences" | cut -f2) 168 | if [[ $i -ne $((${{#htmls[@]}}-1)) ]]; then 169 | echo "}}," 170 | else 171 | echo "}}" 172 | fi 173 | done 174 | echo "]" 175 | ) > {output.json} 176 | """ 177 | 178 | 179 | rule fastqc_clean: 180 | """Remove FastQC dir""" 181 | shell: 182 | """ 183 | rm -rf fastqc/ 184 | """ 185 | -------------------------------------------------------------------------------- /bio/ngs/rules/trimming/trimmomatic.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for trimming NGS reads with trimmomatic 7 | (http://www.usadellab.org/cms/?page=trimmomatic) 8 | 9 | For usage, include this in your workflow. 10 | """ 11 | 12 | 13 | __author__ = "Ino de Bruijn (http://ino.pm)" 14 | __license__ = "MIT" 15 | 16 | 17 | # Check values in config file 18 | CONFIG_REQS = ["reads", "trim_params"] 19 | if "trimmomatic_rules" not in config: 20 | raise(Exception("trimmomatic_rules key not in config file")) 21 | for cr in CONFIG_REQS: 22 | if cr not in config["trimmomatic_rules"]: 23 | raise(Exception("{cr} not in config file".format(cr=cr))) 24 | 25 | rule trimmomatic_pe: 26 | """Trims given paired-end reads with given parameters""" 27 | input: 28 | lambda wildcards: config["trimmomatic_rules"]["reads"][wildcards.reads] 29 | output: 30 | "trimmomatic/{trim_params}/{reads}_1P.fastq.gz", 31 | "trimmomatic/{trim_params}/{reads}_2P.fastq.gz", 32 | "trimmomatic/{trim_params}/{reads}_1U.fastq.gz", 33 | "trimmomatic/{trim_params}/{reads}_2U.fastq.gz" 34 | params: 35 | trim_params=lambda wildcards: config["trimmomatic_rules"]["trim_params"][wildcards.trim_params] 36 | shell: 37 | """ 38 | time java -jar {config[trimmomatic_rules][jar]} PE \ 39 | {input} {output[0]} {output[2]} {output[1]} {output[3]} \ 40 | {params.trim_params} 41 | """ 42 | 43 | rule trimmomatic_all: 44 | """Trim all reads with all supplied trimming parameters""" 45 | input: 46 | trimmed_reads=expand("trimmomatic/{trim_params}/{reads}_{ext}.fastq.gz", reads=config["trimmomatic_rules"]["reads"], 47 | trim_params=config["trimmomatic_rules"]["trim_params"], 48 | ext=["1P","2P","1U","2U"]) 49 | -------------------------------------------------------------------------------- /common/rules/compression.rules: -------------------------------------------------------------------------------- 1 | rule gzip: 2 | input: "{prefix}" 3 | output: "{prefix}.gz" 4 | shell: """ 5 | gzip {input} 6 | """ 7 | -------------------------------------------------------------------------------- /common/rules/track_dir.rules: -------------------------------------------------------------------------------- 1 | rule track_dir: 2 | input: "{dir}/" 3 | output: "{dir}_track.txt" 4 | shell: """ 5 | find {input} 
-type f | xargs ls -l > {output} 6 | """ 7 | -------------------------------------------------------------------------------- /scheduling/Snakefile_qsub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Submit this clustering script for sbatch to snakemake with: 4 | 5 | snakemake -j 99 --debug --immediate-submit --cluster 'Snakefile-qsub.py {dependencies}' 6 | """ 7 | import argparse 8 | import sys 9 | import subprocess 10 | import os 11 | import math 12 | import errno 13 | import json 14 | from snakemake.utils import read_job_properties 15 | 16 | def make_dir(directory): 17 | """Make directory unless existing. Ignore error in the latter case.""" 18 | try: 19 | os.makedirs(directory) 20 | except OSError as exception: 21 | if exception.errno != errno.EEXIST: 22 | raise 23 | 24 | 25 | class SnakeJob: 26 | """Snakemake can generate bash scripts that can be sumbitted by a 27 | scheduler. This class reads the bash script and stores the number of the 28 | rule, name of bash file and the supplied input files.""" 29 | def __init__(self, snakebashfile, dependencies=None, config=None): 30 | self.scriptname = snakebashfile 31 | job_properties = read_job_properties(snakebashfile) 32 | self.rule = job_properties['rule'] 33 | self.ifiles = job_properties['input'] 34 | self.ofiles = job_properties['output'] 35 | if dependencies == None or len(dependencies) < 1: 36 | self.dependencies = None 37 | else: 38 | # expects snakemake like list of numbers 39 | self.dependencies = dependencies 40 | assert len(self.dependencies) >= 1 41 | self.config = config 42 | 43 | class UndefinedJobRule(Exception): 44 | """Exception in case an sbatch job has no defined resource usage in the 45 | code.""" 46 | def __init__(self, msg): 47 | self.msg = msg 48 | 49 | 50 | class SnakeJobQsub(SnakeJob): 51 | def __init__(self, snakebashfile, dependencies=None, config=None): 52 | SnakeJob.__init__(self, snakebashfile, dependencies, config) 53 | if self.dependencies == None: 54 | self.dep_str = '' 55 | else: 56 | self.dep_str = '-hold_jid ' + ','.join(["%s" % d for d in self.dependencies]) 57 | 58 | def schedule(self): 59 | """Schedules a snakemake job with sbatch and determines resource usage 60 | based on input files.""" 61 | if len(self.ofiles) > 0: 62 | # create the output directory, so slurm output can go there 63 | make_dir(os.path.dirname(os.path.abspath(self.ofiles[0]))) 64 | 65 | schedule_rule = "schedule_{0}".format(self.rule) 66 | if schedule_rule in self.config: 67 | rule_conf = self.config[schedule_rule] 68 | # If rule_conf is referring to another scheduling rule, use those 69 | # resources instead 70 | try: 71 | if rule_conf.startswith("schedule_"): 72 | rule_conf = self.config[rule_conf] 73 | except KeyError: 74 | raise UndefinedJobRule('No schedule config found for {0}'.format(rule_conf)) 75 | except AttributeError: 76 | pass 77 | 78 | attributes = { 79 | 'dep_str': self.dep_str, 80 | 'job_name': 'snakemake_{0}'.format(self.rule), 81 | 'qsub_job_path': self.config['qsub_general']['wrapper_script'], 82 | 'script_name': self.scriptname, 83 | 'queue': rule_conf['queue'], 84 | 'threads': rule_conf['threads'], 85 | 'log_file': self.ofiles[0] + '-qsub.out' if len(self.ofiles) > 0 else 'snakemake-{0}-qsub.out'.format(self.rule), 86 | 'err_file': self.ofiles[0] + '-qsub.err' if len(self.ofiles) > 0 else 'snakemake-{0}-qsub.err'.format(self.rule), 87 | 'extra_parameters': rule_conf.get('extra_parameters', "") 88 | } 89 | qsub_cmd = """qsub -o {log_file} -e 
{err_file} {dep_str} -q {queue} -pe smp {threads} \ 90 | -N {job_name} {extra_parameters} {qsub_job_path} \ 91 | '{script_name}'""".format(**attributes) 92 | else: 93 | raise UndefinedJobRule('No schedule config found for schedule_{0}'.format(self.rule)) 94 | return 2 95 | 96 | print(qsub_cmd, file=sys.stderr) 97 | popenrv = subprocess.Popen(qsub_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate() 98 | 99 | # Snakemake expects only id of submitted job on stdout for scheduling 100 | # with {dependencies} 101 | try: 102 | print("%i" % int(popenrv[0].split()[2])) 103 | except ValueError: 104 | print("Not a submitted job: %s" % popenrv[0]) 105 | sys.exit(2) 106 | 107 | if __name__ == '__main__': 108 | parser = argparse.ArgumentParser(description=__doc__, 109 | formatter_class=argparse.RawDescriptionHelpFormatter) 110 | parser.add_argument("dependencies", nargs="*", help="{{dependencies}} string given by snakemake\n") 111 | parser.add_argument("snakescript", help="Snakemake generated shell script with commands to execute snakemake rule\n") 112 | args = parser.parse_args() 113 | 114 | #print("Passed bidniz:", args.snakescript, args.dependencies, file=sys.stderr) 115 | #print("Passed args:", args, file=sys.stderr) 116 | sj = SnakeJobQsub(args.snakescript, dependencies=args.dependencies, config=json.load(open("config_qsub.json"))) 117 | try: 118 | sj.schedule() 119 | except UndefinedJobRule as err: 120 | print(err.msg, file=sys.stderr) 121 | sys.exit(2) 122 | -------------------------------------------------------------------------------- /scheduling/Snakefile_sbatch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Submit this clustering script for sbatch to snakemake with: 4 | 5 | snakemake -j 99 --debug --immediate-submit --cluster 'Snakefile-sbatch.py {dependencies}' 6 | """ 7 | import argparse 8 | import sys 9 | import subprocess 10 | import os 11 | import math 12 | import errno 13 | import json 14 | from snakemake.utils import read_job_properties 15 | 16 | def make_dir(directory): 17 | """Make directory unless existing. Ignore error in the latter case.""" 18 | try: 19 | os.makedirs(directory) 20 | except OSError as exception: 21 | if exception.errno != errno.EEXIST: 22 | raise 23 | 24 | 25 | class SnakeJob: 26 | """Snakemake can generate bash scripts that can be sumbitted by a 27 | scheduler. 
This class reads the bash script and stores the number of the 28 | rule, name of bash file and the supplied input files.""" 29 | def __init__(self, snakebashfile, dependencies=None, config=None): 30 | self.scriptname = snakebashfile 31 | job_properties = read_job_properties(snakebashfile) 32 | self.rule = job_properties['rule'] 33 | self.ifiles = job_properties['input'] 34 | self.ofiles = job_properties['output'] 35 | if dependencies == None or len(dependencies) < 1: 36 | self.dependencies = None 37 | else: 38 | # expects snakemake like list of numbers 39 | self.dependencies = dependencies 40 | assert len(self.dependencies) >= 1 41 | self.config = config 42 | 43 | class UndefinedJobRule(Exception): 44 | """Exception in case an sbatch job has no defined resource usage in the 45 | code.""" 46 | def __init__(self, msg): 47 | self.msg = msg 48 | 49 | 50 | class SnakeJobSbatch(SnakeJob): 51 | def __init__(self, snakebashfile, dependencies=None, config=None): 52 | SnakeJob.__init__(self, snakebashfile, dependencies, config) 53 | if self.dependencies == None: 54 | self.dep_str = '' 55 | else: 56 | self.dep_str = '-d ' + ','.join(["afterok:%s" % d for d in self.dependencies]) 57 | 58 | def schedule(self): 59 | """Schedules a snakemake job with sbatch and determines resource usage 60 | based on input files.""" 61 | if len(self.ofiles) > 0: 62 | # create the output directory, so slurm output can go there 63 | make_dir(os.path.dirname(os.path.abspath(self.ofiles[0]))) 64 | 65 | schedule_rule = "schedule_{0}".format(self.rule) 66 | if schedule_rule in self.config: 67 | rule_conf = self.config[schedule_rule] 68 | # If rule_conf is referring to another scheduling rule, use those 69 | # resources instead 70 | try: 71 | if rule_conf.startswith("schedule_"): 72 | rule_conf = self.config[rule_conf] 73 | except KeyError: 74 | raise UndefinedJobRule('No schedule config found for {0}'.format(rule_conf)) 75 | except AttributeError: 76 | pass 77 | 78 | attributes = { 79 | 'dep_str': self.dep_str, 80 | 'job_name': 'snakemake_{0}'.format(self.rule), 81 | 'sbatch_job_path': self.config['sbatch_general']['wrapper_script'], 82 | 'script_name': self.scriptname, 83 | 'days': rule_conf['days'], 84 | 'hours': rule_conf['hours'], 85 | 'minutes': rule_conf['minutes'], 86 | 'partition': rule_conf['partition'], 87 | 'cores': rule_conf['cores'], 88 | 'account': self.config['sbatch_general']['account'], 89 | 'log_file': self.ofiles[0] + '-slurm.out' if len(self.ofiles) > 0 else 'snakemake-{0}-slurm.out'.format(self.rule), 90 | 'extra_parameters': rule_conf.get('extra_parameters', "") 91 | } 92 | sbatch_cmd = """sbatch --output={log_file} {dep_str} -A {account} -p {partition} -n {cores} -t {days}-{hours}:{minutes}:00 \ 93 | -J {job_name} {extra_parameters} {sbatch_job_path} \ 94 | '{script_name}'""".format(**attributes) 95 | else: 96 | raise UndefinedJobRule('No schedule config found for schedule_{0}'.format(self.rule)) 97 | return 2 98 | 99 | print(sbatch_cmd, file=sys.stderr) 100 | popenrv = subprocess.Popen(sbatch_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate() 101 | 102 | # Snakemake expects only id of submitted job on stdout for scheduling 103 | # with {dependencies} 104 | try: 105 | print("%i" % int(popenrv[0].split()[-1])) 106 | except ValueError: 107 | print("Not a submitted job: %s" % popenrv[0]) 108 | sys.exit(2) 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser(description=__doc__, 112 | formatter_class=argparse.RawDescriptionHelpFormatter) 113 | 
parser.add_argument("dependencies", nargs="*", help="{{dependencies}} string given by snakemake\n") 114 | parser.add_argument("snakescript", help="Snakemake generated shell script with commands to execute snakemake rule\n") 115 | args = parser.parse_args() 116 | 117 | #print("Passed bidniz:", args.snakescript, args.dependencies, file=sys.stderr) 118 | #print("Passed args:", args, file=sys.stderr) 119 | sj = SnakeJobSbatch(args.snakescript, dependencies=args.dependencies, config=json.load(open("config_sbatch.json"))) 120 | try: 121 | sj.schedule() 122 | except UndefinedJobRule as err: 123 | print(err.msg, file=sys.stderr) 124 | sys.exit(2) 125 | --------------------------------------------------------------------------------
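Both schedulers read their resource settings from a JSON file in the working directory (config_qsub.json / config_sbatch.json) and look up one "schedule_<rulename>" entry per Snakemake rule; an entry may also be a string naming another schedule_* entry whose resources should be reused. A sketch of config_sbatch.json, with account, wrapper path and rule names as placeholders only:

    {
        "sbatch_general": {
            "account": "my_account",
            "wrapper_script": "path/to/sbatch-wrapper.sh"
        },
        "schedule_ray_assembly": {
            "days": "0", "hours": "12", "minutes": "00",
            "partition": "node", "cores": 16,
            "extra_parameters": ""
        },
        "schedule_bowtie2_map": "schedule_ray_assembly"
    }

config_qsub.json is analogous: a "qsub_general" section with "wrapper_script", and per-rule entries carrying "queue" and "threads" instead of the days/hours/minutes/partition/cores fields.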