├── README.md ├── bio └── ngs │ └── rules │ ├── annotation │ ├── hmmer.rules │ └── prodigal.rules │ ├── assembly │ ├── merge.rules │ ├── ray.rules │ └── report.rules │ ├── binning │ └── concoct.rules │ ├── blast │ └── rpsblast.rules │ ├── mapping │ ├── bowtie2.rules │ ├── report.rules │ └── samtools.rules │ ├── quality_control │ └── fastqc.rules │ └── trimming │ └── trimmomatic.rules ├── common └── rules │ ├── compression.rules │ └── track_dir.rules └── scheduling ├── Snakefile_qsub.py └── Snakefile_sbatch.py /README.md: -------------------------------------------------------------------------------- 1 | Snakemake Workflows 2 | =================== 3 | An experimental repo with [Snakemake] rules and workflows for Next Generation 4 | Sequencing, specifically for metagenomics. The directory structure is similar 5 | to [snakemake-workflows] from the Snakemake author himself so it may later be 6 | merged once the rules prove useful. 7 | 8 | 9 | Requirements 10 | ============ 11 | - [Snakemake] version 3.1 12 | 13 | [Snakemake]: https://bitbucket.org/johanneskoester/snakemake 14 | [Snakemake-workflows]: https://bitbucket.org/johanneskoester/snakemake 15 | -------------------------------------------------------------------------------- /bio/ngs/rules/annotation/hmmer.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | from snakemake.exceptions import MissingInputException 9 | 10 | 11 | rule hmmer_run: 12 | """ 13 | Uses HMmer to run query sequence against given database with given 14 | parameters. 15 | """ 16 | input: 17 | aa=lambda wildcards: config["hmmer_rules"]["query_aas"][wildcards.protein_aa] 18 | output: 19 | hmmer_out="hmmer/{parameters}/{db}/{protein_aa}/hmmer.out", 20 | hmmer_tsv="hmmer/{parameters}/{db}/{protein_aa}/hmmer.tsv" 21 | params: 22 | hmmer_params=lambda wildcards: config["hmmer_rules"]["hmmer_params"][wildcards.parameters], 23 | db=lambda wildcards: config["hmmer_rules"]["databases"][wildcards.db] 24 | shell: 25 | """ 26 | {config[hmmer_rules][load_env]} 27 | hmmscan \ 28 | {params.hmmer_params} \ 29 | --tblout {output.hmmer_tsv} \ 30 | {params.db} \ 31 | {input.aa} \ 32 | > {output.hmmer_out} 33 | """ 34 | 35 | 36 | rule hmmer_run_all: 37 | input: 38 | outs=expand("hmmer/{parameters}/{db}/{protein_aa}/hmmer.out", 39 | parameters=config["hmmer_rules"]["hmmer_params"], 40 | db=config["hmmer_rules"]["databases"], 41 | protein_aa=config["hmmer_rules"]["query_aas"]), 42 | tsvs=expand("hmmer/{parameters}/{db}/{protein_aa}/hmmer.tsv", 43 | parameters=config["hmmer_rules"]["hmmer_params"], 44 | db=config["hmmer_rules"]["databases"], 45 | protein_aa=config["hmmer_rules"]["query_aas"]) 46 | -------------------------------------------------------------------------------- /bio/ngs/rules/annotation/prodigal.rules: -------------------------------------------------------------------------------- 1 | from snakemake.exceptions import MissingInputException 2 | 3 | rule prodigal_run: 4 | input: 5 | asm=lambda wildcards: config["prodigal_rules"]["assemblies"][wildcards.assembly] 6 | output: 7 | aa="annotation/prodigal/{parameters}/{assembly}/proteins/proteins.faa", 8 | gff="annotation/prodigal/{parameters}/{assembly}/proteins/proteins.gff" 9 | log: 10 | "annotation/prodigal/{parameters}/{assembly}/proteins/proteins.log" 11 | params: 12 | prodigal_params=lambda wildcards: 
config["prodigal_rules"]["prodigal_params"][wildcards.parameters] 13 | shell: 14 | """ 15 | {config[prodigal_rules][load_env]} 16 | prodigal -i {input.asm} \ 17 | -a {output.aa} \ 18 | -f gff \ 19 | {params.prodigal_params} > {output.gff} \ 20 | 2> {log} 21 | """ 22 | 23 | rule prodigal_run_all: 24 | input: 25 | protein_aas=expand("annotation/prodigal/{parameters}/{assembly}/proteins/proteins.faa", 26 | parameters=config["prodigal_rules"]["prodigal_params"], 27 | assembly=config["prodigal_rules"]["assemblies"]), 28 | protein_gffs=expand("annotation/prodigal/{parameters}/{assembly}/proteins/proteins.gff", 29 | parameters=config["prodigal_rules"]["prodigal_params"], 30 | assembly=config["prodigal_rules"]["assemblies"]) 31 | -------------------------------------------------------------------------------- /bio/ngs/rules/assembly/merge.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | 9 | rule merge_newbler: 10 | input: 11 | lambda wildcards: config["assembly_merge_rules"]["merge"][wildcards.merge] 12 | output: 13 | "assembly/newbler/{merge}/454AllContigs.fna" 14 | shell: 15 | """ 16 | {config[assembly_merge_rules][load_env]} 17 | bash -x $METASSEMBLE_DIR/scripts/assembly/merge-asm-newbler.sh \ 18 | assembly/newbler/{wildcards.merge} {input} 19 | """ 20 | 21 | 22 | rule merge_newbler_all: 23 | input: 24 | expand("assembly/newbler/{merge}/454AllContigs.fna", 25 | merge=config["assembly_merge_rules"]["merge"]) 26 | -------------------------------------------------------------------------------- /bio/ngs/rules/assembly/ray.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | 9 | def create_ray_read_input_str(unit): 10 | if len(unit) == 2: 11 | return "-p {unit[0]} {unit[1]}".format(unit=unit) 12 | elif len(unit) == 1: 13 | return "-s {unit[0]}".format(unit=unit) 14 | else: 15 | raise(Exception("Units should either be paired library or single read library.")) 16 | 17 | rule ray_assembly: 18 | input: 19 | lambda wildcards: sum([config["ray_rules"]["units"][unit] for unit in config["ray_rules"]["samples"][wildcards.sample]], []) 20 | output: 21 | "assembly/ray/{assembly_params}/{sample}/out_{kmer}/Contigs.fasta" 22 | params: 23 | custom=lambda wildcards: config["ray_rules"]["assembly_params"][wildcards.assembly_params], 24 | read_input_str=lambda wildcards: " ".join([create_ray_read_input_str(config["ray_rules"]["units"][unit]) for unit in config["ray_rules"]["samples"][wildcards.sample]]) 25 | shell: 26 | """ 27 | {config[ray_rules][load_env]} 28 | rm -rf assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/tmp 29 | {config[ray_rules][mpi_cmd]} Ray {params.custom} \ 30 | {params.read_input_str} \ 31 | -o assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/tmp 32 | mv assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/tmp/* \ 33 | assembly/ray/{wildcards.assembly_params}/{wildcards.sample}/out_{wildcards.kmer}/ 34 | """ 35 | 36 | rule ray_assembly_all: 37 | input: 38 | expand("assembly/ray/{assembly_params}/{sample}/out_{kmer}/Contigs.fasta", assembly_params=config["ray_rules"]["assembly_params"], sample=config["ray_rules"]["samples"], kmer=config["ray_rules"]["kmers"]) 39 
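A minimal config sketch for the Ray rules above, inferred from the keys the rule code reads (load_env, mpi_cmd, assembly_params, kmers, samples, units); the paths and values shown are illustrative placeholders only, in the same style as the bowtie2.rules config example further down:

    {
        "ray_rules": {
            "load_env": "",
            "mpi_cmd": "mpiexec -n 16",
            "assembly_params": {"default": ""},
            "kmers": [31, 51],
            "samples": {"A": ["A"]},
            "units": {"A": ["path/to/A_R1.fastq.gz", "path/to/A_R2.fastq.gz"]}
        }
    }

Note that the kmer wildcard only appears in the output path of ray_assembly; any -k setting passed to Ray itself would have to be part of assembly_params.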
| -------------------------------------------------------------------------------- /bio/ngs/rules/assembly/report.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for analysing fasta files with FastQC. 7 | 8 | For usage, include this in your workflow. 9 | """ 10 | 11 | 12 | __author__ = "Ino de Bruijn (http://ino.pm)" 13 | __license__ = "MIT" 14 | 15 | 16 | rule assembly_report: 17 | """Creates an assembly HTML report + json for given assemblies. Uses masmvali (github.com/inodb/masmvali)""" 18 | input: 19 | asms=lambda wildcards: sorted(config["assembly_report_rules"]["assemblies"]) 20 | output: 21 | report="report/assemblies/index.html", 22 | json="report/assemblies/data.json" 23 | shell: 24 | """ 25 | {config[assembly_report_rules][load_env]} 26 | python <(cat < {output.json} 54 | cat > {output.report} < 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 |

[report heredoc HTML, tags lost in extraction: navbar titled "Assembly Stats" and a results table with columns name, cutoff, nr_contigs, totbases, l50, n50, max_length]
118 | 119 | 120 | EOF 121 | """ 122 | -------------------------------------------------------------------------------- /bio/ngs/rules/binning/concoct.rules: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from snakemake.exceptions import MissingInputException 4 | from snakemake.utils import report 5 | 6 | # Check values in config file 7 | CONFIG_REQS = ["assemblies", "mapper", "mapping_params", "concoct_params", "scripts_dir"] 8 | if "concoct_rules" not in config: 9 | raise(Exception("concoct_rules key not in config file")) 10 | for cr in CONFIG_REQS: 11 | if cr not in config["concoct_rules"]: 12 | raise(Exception("{cr} not in concoct_rules config file".format(cr=cr))) 13 | 14 | 15 | # add 10K cutup as references for bowtie2 to map against 16 | config["bowtie2_rules"].setdefault("references", {}).update({a + "_10K": "concoct/{a}/cutup/contigs_10K.fasta".format(a=a) for a in config["concoct_rules"]["assemblies"]}) 17 | 18 | rule concoct_cutup_10K: 19 | input: 20 | lambda wildcards: config["concoct_rules"]["assemblies"][wildcards.assembly] 21 | output: 22 | "concoct/{assembly}/cutup/contigs_10K.fasta" 23 | params: 24 | chunk_size="10000", 25 | overlap="0" 26 | threads: 1 27 | shell: 28 | """ 29 | {config[concoct_rules][load_env]} 30 | python {config[concoct_rules][scripts_dir]}/cut_up_fasta.py -c {params.chunk_size} -o {params.overlap} \ 31 | -m {input} > {output} 32 | """ 33 | 34 | rule concoct_cutup_10K_all: 35 | input: 36 | expand("concoct/{assembly}/cutup/contigs_10K.fasta", assembly=config["concoct_rules"]["assemblies"]) 37 | 38 | 39 | rule concoct_map_10K_all: 40 | input: 41 | expand("mapping/{mapper}/{mapping_params}/{assembly}/samples/{sample}.sorted.removeduplicates.bam", 42 | assembly=config["bowtie2_rules"]["references"], 43 | sample=config["bowtie2_rules"]["samples"], 44 | mapping_params=config["concoct_rules"]["mapping_params"], 45 | mapper=config["concoct_rules"]["mapper"]) 46 | 47 | 48 | rule concoct_generate_coverage_table_10K: 49 | input: 50 | asm="concoct/{assembly}/cutup/contigs_10K.fasta", 51 | bedcovs=expand("mapping/{mapper}/{mapping_params}/{{assembly}}_10K/samples/{samples}.sorted.removeduplicates.coverage.tsv", 52 | samples=sorted(config["bowtie2_rules"]["samples"]), 53 | mapper=config["concoct_rules"]["mapper"], 54 | mapping_params=config["concoct_rules"]["mapping_params"]) 55 | output: 56 | "concoct/{assembly}/input/concoct_inputtable.tsv", 57 | "concoct/{assembly}/input/concoct_inputtableR.tsv" 58 | params: 59 | sample_names=sorted(config["bowtie2_rules"]["samples"]) 60 | shell: 61 | """ 62 | {config[concoct_rules][load_env]} 63 | python {config[concoct_rules][scripts_dir]}/gen_input_table.py --isbedfiles \ 64 | --samplenames <(for s in {params.sample_names}; do echo $s; done) \ 65 | {input.asm} {input.bedcovs} \ 66 | > {output[0]} && \ 67 | cut -f1,3- {output[0]} > {output[1]} 68 | """ 69 | 70 | 71 | rule concoct_inputtable_10K_all: 72 | input: 73 | expand("concoct/{assembly}/input/concoct_inputtableR.tsv", assembly=config["concoct_rules"]["assemblies"]) 74 | 75 | 76 | rule concoct_run_10K: 77 | """ 78 | Run CONCOCT 79 | """ 80 | input: 81 | asm="concoct/{assembly}/cutup/contigs_10K.fasta", 82 | input_table="concoct/{assembly}/input/concoct_inputtableR.tsv" 83 | output: 84 | clustering="concoct/{assembly}/output/{cparams}/clustering.csv" 85 | params: 86 | output_folder="concoct/{assembly}/output/{cparams}/", 87 | concoct_params=lambda wildcards: 
config["concoct_rules"]["concoct_params"][wildcards.cparams] 88 | shell: 89 | """ 90 | {config[concoct_rules][load_env]} 91 | concoct {params.concoct_params} \ 92 | --coverage_file {input.input_table} \ 93 | --composition_file {input.asm} \ 94 | -b {params.output_folder} && \ 95 | ln -fs $(basename {params.output_folder}clustering_gt*.csv) \ 96 | {output.clustering} && \ 97 | touch -h {output.clustering} 98 | """ 99 | 100 | 101 | rule concoct_run_10K_all: 102 | """ 103 | Run CONCOCT on all assemblies over all parameters specified in the config file. 104 | """ 105 | input: 106 | expand("concoct/{assembly}/output/{concoct_params}/clustering.csv", 107 | assembly=config["concoct_rules"]["assemblies"], 108 | concoct_params=config["concoct_rules"]["concoct_params"]) 109 | 110 | 111 | # add 10K cutup as assemblies for prodigal to predict genes for 112 | config["prodigal_rules"]["assemblies"] = {a + "_10K": "concoct/{a}/cutup/contigs_10K.fasta".format(a=a) for a in config["concoct_rules"]["assemblies"]} 113 | 114 | # add prodigal predicted genes as query for rpsblast 115 | config["rpsblast_rules"]["query_aas"] = {a: "annotation/prodigal/default-meta/{a}/proteins/proteins.faa".format(a=a) for a in config["prodigal_rules"]["assemblies"]} 116 | 117 | # add prodigal predicted genes as query for hmmer 118 | config["hmmer_rules"]["query_aas"] = config["rpsblast_rules"]["query_aas"] 119 | 120 | rule concoct_eval_cog_table: 121 | """ 122 | Generate COG table from rpsblast output and concoct binning results 123 | """ 124 | input: 125 | clust="concoct/{assembly}/output/{concoct_params}/clustering.csv", 126 | rpsblast="blast/rpsblast/default-concoct/cog/{assembly}_10K/rpsblast.out" 127 | output: 128 | "concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.tsv" 129 | shell: 130 | """ 131 | {config[concoct_rules][load_env]} 132 | python {config[concoct_rules][scripts_dir]}/COG_table.py \ 133 | -b {input.rpsblast} \ 134 | -m {config[concoct_rules][scripts_dir]}/../scgs/scg_cogs_min0.97_max1.03_unique_genera.txt \ 135 | -c {input.clust} \ 136 | --cdd_cog_file {config[concoct_rules][scripts_dir]}/../scgs/cdd_to_cog.tsv \ 137 | > {output} 138 | """ 139 | 140 | rule concoct_extract_approved_scg_bins: 141 | input: 142 | scg_tsvs=expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.tsv", 143 | assembly=sorted(config["concoct_rules"]["assemblies"]), 144 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])), 145 | asms=expand("concoct/{assembly}/cutup/contigs_10K.fasta", 146 | assembly=sorted(config["concoct_rules"]["assemblies"]), 147 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])) 148 | output: 149 | dynamic("concoct/approved_scg_bins/{cluster_name}.fa") 150 | params: 151 | names=expand("{assembly}_{concoct_params}", 152 | assembly=sorted(config["concoct_rules"]["assemblies"]), 153 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])), 154 | groups=expand("{assembly}", 155 | assembly=sorted(config["concoct_rules"]["assemblies"]), 156 | concoct_params=sorted(config["concoct_rules"]["concoct_params"])) 157 | shell: 158 | """ 159 | {config[concoct_rules][load_env]} 160 | python {config[concoct_rules][scripts_dir]}/extract_scg_bins.py \ 161 | --output_folder concoct/approved_scg_bins \ 162 | --scg_tsvs {input.scg_tsvs} \ 163 | --fasta_files {input.asms} \ 164 | --names {params.names} \ 165 | --groups {params.groups} \ 166 | --max_missing_scg 5 \ 167 | --max_multicopy_scg 2 168 | """ 169 | 170 | rule concoct_extract_approved_scg_bins_all: 171 
| input: 172 | dynamic("concoct/approved_scg_bins/{cluster_name}.fa") 173 | 174 | 175 | rule concoct_dnadiff_dist_matrix: 176 | """Get distance matrix from approved SCG bins""" 177 | input: 178 | clusters=dynamic("concoct/approved_scg_bins/{cluster_name}.fa") 179 | output: 180 | "concoct/dnadiff_dist_matrix/dist_matrix.tsv", 181 | "concoct/dnadiff_dist_matrix/hclust_heatmap.pdf", 182 | "concoct/dnadiff_dist_matrix/hclust_dendrogram.pdf" 183 | run: 184 | sorted_input = sorted(input.clusters) 185 | shell(""" 186 | {config[concoct_rules][load_env]} 187 | python {config[concoct_rules][scripts_dir]}/dnadiff_dist_matrix.py \ 188 | concoct/dnadiff_dist_matrix {sorted_input} 189 | """) 190 | 191 | 192 | rule concoct_dnadiff_dist_matrix_report: 193 | input: 194 | dnadiff_output=rules.concoct_dnadiff_dist_matrix.output, 195 | readme_rst=glob.glob("report/concoct/dnadiff_dist_matrix/README.rst") 196 | output: 197 | "report/concoct/dnadiff_dist_matrix/index.html" 198 | params: 199 | readme_html="report/concoct/dnadiff_dist_matrix/README.html" 200 | shell: 201 | """ 202 | cp --parents {input.dnadiff_output} report/ 203 | ( 204 | echo '' 205 | for p in $(for i in {input.dnadiff_output}; do basename $i; done | sort); do 206 | echo "$p
" 207 | done 208 | for f in {input.readme_rst}; do 209 | echo "" 210 | rst2html.py $f > {params.readme_html} 211 | done 212 | echo '' 213 | ) > {output} 214 | """ 215 | 216 | 217 | rule concoct_eval_cog_plot: 218 | """ 219 | Plot COGs using COG table 220 | """ 221 | input: 222 | "concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.tsv" 223 | output: 224 | "concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf" 225 | shell: 226 | """ 227 | {config[concoct_rules][load_env]} 228 | Rscript {config[concoct_rules][scripts_dir]}/COGPlot.R \ 229 | -s {input} \ 230 | -o {output} 231 | """ 232 | 233 | 234 | rule concoct_eval_cog_plot_all: 235 | input: 236 | expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf", 237 | assembly=config["concoct_rules"]["assemblies"], 238 | concoct_params=config["concoct_rules"]["concoct_params"]) 239 | 240 | 241 | 242 | rule concoct_eval_cog_report: 243 | input: 244 | expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf", 245 | assembly=config["concoct_rules"]["assemblies"], 246 | concoct_params=config["concoct_rules"]["concoct_params"]) 247 | output: 248 | "report/concoct/cog_plots.html" 249 | shell: 250 | """ 251 | cp --parents {input} report/ 252 | ( 253 | echo '' 254 | for p in $(for i in {input}; do echo $i | cut -d/ -f2-; done | sort); do 255 | echo "$p
" 256 | done 257 | echo '' 258 | ) > {output} 259 | """ 260 | 261 | rule concoct_eval_report: 262 | input: 263 | cog_html=rules.concoct_eval_cog_report.output, 264 | dnadiff_html=rules.concoct_dnadiff_dist_matrix_report.output 265 | output: 266 | "report/concoct/index.html" 267 | shell: 268 | """ 269 | ( 270 | echo '' 271 | echo "COG Plots
" 272 | echo "DNA Diff matrix as constructed with MUMmer
" 273 | echo '' 274 | ) > {output} 275 | """ 276 | 277 | 278 | rule concoct_eval_cog_report_flashy: 279 | input: 280 | cog_plots=expand("concoct/{assembly}/evaluation/scg/{concoct_params}/clustering_scg.pdf", 281 | assembly=config["concoct_rules"]["assemblies"], 282 | concoct_params=config["concoct_rules"]["concoct_params"]) 283 | output: 284 | html="report/concoct/cog_plots_flashy.html" 285 | run: 286 | dict_cp = {"{a}-{cp}".format(a=cp.split("/")[1],cp=cp.split("/")[4]):cp for cp in input.cog_plots} 287 | cp_ids = "\n".join(["- " + cp + "_" for cp in sorted(dict_cp.keys())]) 288 | report(""" 289 | ========= 290 | SCG Plots 291 | ========= 292 | {cp_ids} 293 | """, output.html, **dict_cp) 294 | -------------------------------------------------------------------------------- /bio/ngs/rules/blast/rpsblast.rules: -------------------------------------------------------------------------------- 1 | rule rpsblast_run: 2 | """ 3 | Uses GNU Parallel to run query sequence against given database with given 4 | rpsblast parameters. 5 | """ 6 | input: 7 | aa=lambda wildcards: config["rpsblast_rules"]["query_aas"][wildcards.protein_aa] 8 | output: 9 | rps_out="blast/rpsblast/{parameters}/{db}/{protein_aa}/rpsblast.out" 10 | params: 11 | rpsblast_params=lambda wildcards: config["rpsblast_rules"]["rpsblast_params"][wildcards.parameters].replace('"',"'"), 12 | db=lambda wildcards: config["rpsblast_rules"]["databases"][wildcards.db], 13 | parallel_params=config["rpsblast_rules"].get("parallel_params", "") 14 | shell: 15 | """ 16 | {config[rpsblast_rules][load_env]} 17 | cat {input.aa} | \ 18 | parallel {params.parallel_params} \ 19 | --pipe -k --recstart '>' --no-notice \ 20 | rpsblast "{params.rpsblast_params}" \ 21 | -query - \ 22 | -db {params.db} \ 23 | > {output.rps_out} 24 | """ 25 | 26 | 27 | rule rpsblast_run_all: 28 | input: 29 | expand("blast/rpsblast/{parameters}/{db}/{protein_aa}/rpsblast.out", 30 | parameters=config["rpsblast_rules"]["rpsblast_params"], 31 | db=config["rpsblast_rules"]["databases"], 32 | protein_aa=config["rpsblast_rules"]["query_aas"]) 33 | -------------------------------------------------------------------------------- /bio/ngs/rules/mapping/bowtie2.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | """ 5 | Read mapping with Bowtie2 6 | 7 | For usage, include this in your workflow. 8 | 9 | Expects the global variable config 10 | (see https://bitbucket.org/johanneskoester/snakemake/wiki/Documentation#markdown-header-configuration) 11 | of at least the following structure, assuming that the desired reference sequence is some genome 12 | to be found under the given path, and two units A and B have been sequenced with Illumina, 13 | the first paired and the second single end: 14 | 15 | { 16 | "bowtie2_rules": { 17 | "references": { 18 | "genome": "path/to/genome.fasta" 19 | }, 20 | "samples": { 21 | "A": ["A"], 22 | "B": ["B"] 23 | }, 24 | "units": { 25 | "A": 26 | ["path/to/A_R1.fastq.gz", "path/to/A_R2.fastq.gz"], 27 | "B": 28 | ["path/to/B.fastq.gz"] 29 | }, 30 | "platform": "Illumina", 31 | "mapping_params": { 32 | "default": "" 33 | } 34 | } 35 | } 36 | 37 | Note the separation between samples and units that allows to have more than 38 | one sequencing run for each sample, or multiple lanes per sample. 
39 | """ 40 | 41 | 42 | __author__ = "Johannes Köster (http://johanneskoester.bitbucket.org), Ino de Bruijn" 43 | __license__ = "MIT" 44 | 45 | 46 | UNIT_TO_SAMPLE = { 47 | unit: sample for sample, units in config["bowtie2_rules"]["samples"].items() 48 | for unit in units} 49 | 50 | def create_bowtie2_read_input_str(unit): 51 | if len(unit) == 2: 52 | return "-1 {unit[0]} -2 {unit[1]}".format(unit=unit) 53 | elif len(unit) == 1: 54 | return "-r {unit[0]}".format(unit=unit) 55 | else: 56 | raise(Exception("Units should either be paired library or single read library.")) 57 | 58 | from snakemake.exceptions import MissingInputException 59 | 60 | 61 | rule bowtie2_index: 62 | input: 63 | lambda wildcards: config["bowtie2_rules"]["references"][wildcards.reference] 64 | output: 65 | expand("mapping/bowtie2/{{reference}}.{index}.bt2", index=range(1,5)), 66 | expand("mapping/bowtie2/{{reference}}.rev.{index}.bt2", index=range(1,3)) 67 | params: 68 | prefix="mapping/bowtie2/{reference}" 69 | shell: 70 | """ 71 | {config[bowtie2_rules][load_env]} 72 | bowtie2-build {input} {params.prefix} 73 | """ 74 | 75 | 76 | rule bowtie2_map: 77 | input: 78 | lambda wildcards: config["bowtie2_rules"]["units"][wildcards.unit], 79 | expand("mapping/bowtie2/{{reference}}.{index}.bt2", index=range(1,5)), 80 | expand("mapping/bowtie2/{{reference}}.rev.{index}.bt2", index=range(1,3)) 81 | output: 82 | "mapping/bowtie2/{mapping_params}/{reference}/units/{unit,\w+}.bam" 83 | params: 84 | sample=lambda wildcards: UNIT_TO_SAMPLE[wildcards.unit], 85 | custom=lambda wildcards: config["bowtie2_rules"]["mapping_params"][wildcards.mapping_params], 86 | read_input_str=lambda wildcards: create_bowtie2_read_input_str(config["bowtie2_rules"]["units"][wildcards.unit]), 87 | ref_idx_base="mapping/bowtie2/{reference}", 88 | sam="mapping/bowtie2/{mapping_params}/{reference}/units/{unit,\w+}.sam" 89 | log: 90 | "mapping/bowtie2/{mapping_params}/{reference}/units/{unit,\w+}.log" 91 | threads: 4 92 | shell: 93 | """ 94 | {config[bowtie2_rules][load_env]} 95 | bowtie2 {params.custom} \ 96 | --rg-id '{wildcards.unit}' \ 97 | --rg 'SM:{params.sample}\\tPL:{config[bowtie2_rules][platform]}' \ 98 | -x {params.ref_idx_base} \ 99 | -p {threads} {params.read_input_str} \ 100 | -S {params.sam} \ 101 | 2> {log} && \ 102 | samtools view -Sbh {params.sam} > {output} && \ 103 | rm {params.sam} 104 | """ 105 | -------------------------------------------------------------------------------- /bio/ngs/rules/mapping/report.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | __author__ = "Ino de Bruijn" 6 | __license__ = "MIT" 7 | 8 | 9 | rule mapping_report: 10 | """Creates an assembly HTML report + json for given mappings. Order of 11 | given bowtie2 and markduplicates log/metrics files should be similar""" 12 | input: 13 | b2_logs=lambda wildcards: sorted(config["mapping_report_rules"]["bowtie2_logs"]), 14 | md_logs=lambda wildcards: sorted(config["mapping_report_rules"]["markduplicates_metrics"]) 15 | output: 16 | report="report/mapping/index.html", 17 | json="report/mapping/data.json" 18 | shell: 19 | """ 20 | {config[mapping_report_rules][load_env]} 21 | python <(cat < {output.json} 49 | cat > {output.report} < 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |

[report heredoc HTML, tags lost in extraction: navbar titled "Mapping Stats" and a results table with columns name, nr_reads, overall_alignment_rate, percent_duplication]
101 | 102 | 103 | EOF 104 | """ 105 | -------------------------------------------------------------------------------- /bio/ngs/rules/mapping/samtools.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for modifying SAM or BAM files. Need samtools in your path and config 7 | needs to set picard location with jars in 'picard_jars'. 8 | 9 | For usage, include this in your workflow. 10 | """ 11 | 12 | import os 13 | 14 | 15 | __author__ = "Johannes Köster, Ino de Bruijn" 16 | __license__ = "MIT" 17 | 18 | 19 | rule bam_index: 20 | input: 21 | "{prefix}.bam" 22 | output: 23 | "{prefix}.bam.bai" 24 | shell: 25 | """ 26 | {config[samtools_rules][load_env]} 27 | samtools index {input} 28 | """ 29 | 30 | 31 | rule bam_sort: 32 | input: 33 | "{prefix}.bam" 34 | output: 35 | "{prefix}.sorted.bam" 36 | shell: 37 | """ 38 | {config[samtools_rules][load_env]} 39 | samtools sort {input} {wildcards.prefix}.sorted 40 | """ 41 | 42 | 43 | rule bam_sort_name: 44 | input: 45 | "{prefix}.bam" 46 | output: 47 | "{prefix}.namesorted.bam" 48 | shell: 49 | """ 50 | {config[samtools_rules][load_env]} 51 | samtools sort -n {input} {wildcards.prefix}.namesorted 52 | """ 53 | 54 | 55 | rule sam_to_bam: 56 | input: 57 | "{prefix}.sam" 58 | output: 59 | "{prefix}.bam" 60 | shell: 61 | """ 62 | {config[samtools_rules][load_env]} 63 | samtools view -Sbh {input} > {output} 64 | """ 65 | 66 | 67 | rule bam_stats: 68 | input: 69 | "{prefix}.bam" 70 | output: 71 | "{prefix}.stats.txt" 72 | shell: 73 | """ 74 | {config[samtools_rules][load_env]} 75 | samtools idxstats {input} > {output} 76 | """ 77 | 78 | 79 | rule bam_measure_insert_size: 80 | input: 81 | "{prefix}.sorted.bam" 82 | output: 83 | txt="{prefix}.insert_size.txt", 84 | pdf="{prefix}.insert_size_histogram.pdf" 85 | shell: 86 | """ 87 | java -jar {config[samtools_rules][picard_jars]}/CollectInsertSizeMetrics.jar \ 88 | INPUT={input} \ 89 | OUTPUT={output.txt} \ 90 | HISTOGRAM_FILE={output.pdf} 91 | """ 92 | 93 | 94 | rule fasta_index: 95 | input: 96 | "{prefix}.{suffix}" 97 | output: 98 | "{prefix}.{suffix,(fasta|fa)}.fai" 99 | shell: 100 | """ 101 | {config[samtools_rules][load_env]} 102 | samtools faidx {input} 103 | """ 104 | 105 | 106 | rule fasta_dict: 107 | input: 108 | "{prefix}.fasta" 109 | output: 110 | "{prefix}.dict" 111 | shell: 112 | """ 113 | java -jar {config[samtools_rules][picard_jars]}/CreateSequenceDictionary.jar \ 114 | REFERENCE={input} \ 115 | OUTPUT={output} 116 | """ 117 | 118 | 119 | rule remove_mark_duplicates: 120 | input: 121 | "{prefix}.sorted.bam" 122 | output: 123 | "{prefix}.sorted.removeduplicates.bam", 124 | "{prefix}.sorted.removeduplicates.metrics" 125 | log: 126 | "{prefix}.sorted.removeduplicates.log" 127 | params: 128 | java_opt="-Xms2g -Xmx32g -XX:MaxPermSize=2g -XX:+CMSClassUnloadingEnabled" 129 | shell: 130 | """ 131 | java {params.java_opt} -XX:ParallelGCThreads={threads} \ 132 | -jar {config[samtools_rules][picard_jars]}/MarkDuplicates.jar \ 133 | INPUT={input} \ 134 | OUTPUT={output[0]} \ 135 | METRICS_FILE={output[1]} \ 136 | AS=TRUE \ 137 | VALIDATION_STRINGENCY=LENIENT \ 138 | MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ 139 | REMOVE_DUPLICATES=TRUE 2> {log} 140 | """ 141 | 142 | 143 | rule bedtools_coverage: 144 | input: 145 | "{prefix}.bam" 146 | output: 147 | "{prefix}.coverage.tsv" 148 | shell: 149 | """ 150 | {config[samtools_rules][load_env]} 151 | genomeCoverageBed -ibam {input} > 
{output} 152 | """ 153 | 154 | 155 | rule sample_merge: 156 | """ 157 | Merge bam files for multiple units into one for the given sample. 158 | If the sample has only one unit, a symlink will be created. 159 | """ 160 | input: 161 | lambda wildcards: expand( 162 | "mapping/bowtie2/{mapping_params}/{reference}/units/{unit}.sorted.removeduplicates.bam", 163 | unit=config["bowtie2_rules"]["samples"][wildcards.sample], 164 | mapping_params=wildcards.mapping_params, 165 | reference=wildcards.reference) 166 | output: 167 | "mapping/bowtie2/{mapping_params}/{reference}/samples/{sample}.sorted.removeduplicates.bam" 168 | run: 169 | if len(input) > 1: 170 | shell("{config[samtools_rules][load_env]} && " 171 | "samtools merge {output} {input}") 172 | else: 173 | shell("ln -fs ../units/{basename} {{output}} && touch -h " 174 | "{{output}}".format(basename=os.path.basename(input[0]))) 175 | -------------------------------------------------------------------------------- /bio/ngs/rules/quality_control/fastqc.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for analysing fasta files with FastQC. 7 | 8 | For usage, include this in your workflow. 9 | """ 10 | 11 | 12 | __author__ = "Johannes Köster (http://johanneskoester.bitbucket.org), Ino de Bruijn (http://ino.pm)" 13 | __license__ = "MIT" 14 | 15 | 16 | # Check values in config file 17 | CONFIG_REQS = ["reads"] 18 | if "fastqc_rules" not in config: 19 | raise(Exception("fastqc_rules key not in config file")) 20 | for cr in CONFIG_REQS: 21 | if cr not in config["fastqc_rules"]: 22 | raise(Exception("{cr} not in config file".format(cr=cr))) 23 | 24 | 25 | def get_fasta_basename(filename): 26 | """Return basename of fasta/fastq file""" 27 | filename = os.path.basename(filename) 28 | possible_ext = [".fastq", ".fq.gz", ".fastq.gz", ".fasta", ".fa", ".fa.gz", 29 | ".fasta.gz"] 30 | for e in possible_ext: 31 | if filename.endswith(e): 32 | return filename[:-len(e)] 33 | return filename 34 | 35 | import os 36 | 37 | 38 | rule fastqc: 39 | """Generates fastqc output for given fastq or fastq.gz file. 
The reads can 40 | be specified in the config file but this is not necessary.""" 41 | input: 42 | lambda wildcards: \ 43 | ["fastqc/{}".format(os.path.basename(r)) for r in config["fastqc_rules"]["reads"] \ 44 | if get_fasta_basename(r) == os.path.basename(wildcards.prefix)]\ 45 | or glob.glob(wildcards.prefix + ".fa*") \ 46 | or "{}.{{fastq.gz,fastq}}".format(wildcards.prefix) 47 | output: 48 | "{prefix}_fastqc.zip", 49 | "{prefix}_fastqc.html" 50 | shell: 51 | """ 52 | {config[fastqc_rules][load_env]} 53 | fastqc {input} 54 | """ 55 | 56 | 57 | rule create_read_symlink: 58 | """Create symbolic links for given reads""" 59 | input: 60 | lambda wildcards: config["fastqc_rules"]["reads"][wildcards.reads] 61 | output: 62 | "fastqc/{reads}" 63 | shell: 64 | """ 65 | ln -s $(readlink -f {input}) {output} 66 | """ 67 | 68 | 69 | rule fastqc_all: 70 | input: 71 | links=expand("fastqc/{reads}", reads=config["fastqc_rules"]["reads"]), 72 | htmls=expand("fastqc/{reads}_fastqc.html", reads=[get_fasta_basename(r) for r in config["fastqc_rules"]["reads"]]), 73 | zips=expand("fastqc/{reads}_fastqc.zip", reads=[get_fasta_basename(r) for r in config["fastqc_rules"]["reads"]]) 74 | 75 | 76 | import glob 77 | 78 | rule fastqc_report: 79 | input: 80 | htmls=sorted(rules.fastqc_all.input.htmls), 81 | zips=sorted(rules.fastqc_all.input.zips), 82 | readme_rst=glob.glob("report/fastqc/README.rst") 83 | output: 84 | report="report/fastqc/index.html", 85 | json="report/fastqc/data.json" 86 | params: 87 | htmls_basename=[os.path.basename(h) for h in sorted(rules.fastqc_all.input.htmls)], 88 | readme_html="report/fastqc/README.html" 89 | shell: 90 | """ 91 | cp --parents {input.htmls} report/ 92 | htmls=( {params.htmls_basename} ) 93 | zips=( {input.zips} ) 94 | ( 95 | cat < 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 |

[report heredoc HTML, tags lost in extraction: navbar titled "FastQC" and a summary table with columns Reads, Total Seq, PASS, WARN, FAIL]
151 | EOF 152 | for f in {input.readme_rst}; do 153 | echo "
" 154 | rst2html.py $f > {params.readme_html} 155 | done 156 | echo '' 157 | ) > {output.report} 158 | ( 159 | echo "[" 160 | for i in $(seq 0 $((${{#htmls[@]}}-1))); do 161 | echo "{{" 162 | echo '"url":"${{htmls[$i]}}"'",' 163 | echo '"PASS":'$(unzip -p ${{zips[$i]}} '*/summary.txt' | grep -c PASS), 164 | echo '"WARN":'$(unzip -p ${{zips[$i]}} '*/summary.txt' | grep -c WARN), 165 | echo '"FAIL":'$(unzip -p ${{zips[$i]}} '*/summary.txt' | grep -c FAIL), 166 | paste <(unzip -p ${{zips[$i]}} '*/summary.txt' | cut -f2) <(unzip -p ${{zips[$i]}} '*/summary.txt' | cut -f1) | awk -v ORS=",\n" -v FS="\t" -v OFS=":" '{{print "\\""$1"\\"", "\\""$2"\\""}}' 167 | echo '"Total Sequences":'$(unzip -p ${{zips[$i]}} '*/fastqc_data.txt' | grep "Total Sequences" | cut -f2) 168 | if [[ $i -ne $((${{#htmls[@]}}-1)) ]]; then 169 | echo "}}," 170 | else 171 | echo "}}" 172 | fi 173 | done 174 | echo "]" 175 | ) > {output.json} 176 | """ 177 | 178 | 179 | rule fastqc_clean: 180 | """Remove FastQC dir""" 181 | shell: 182 | """ 183 | rm -rf fastqc/ 184 | """ 185 | -------------------------------------------------------------------------------- /bio/ngs/rules/trimming/trimmomatic.rules: -------------------------------------------------------------------------------- 1 | # vim: syntax=python tabstop=4 expandtab 2 | # coding: utf-8 3 | 4 | 5 | """ 6 | Rules for trimming NGS reads with trimmomatic 7 | (http://www.usadellab.org/cms/?page=trimmomatic) 8 | 9 | For usage, include this in your workflow. 10 | """ 11 | 12 | 13 | __author__ = "Ino de Bruijn (http://ino.pm)" 14 | __license__ = "MIT" 15 | 16 | 17 | # Check values in config file 18 | CONFIG_REQS = ["reads", "trim_params"] 19 | if "trimmomatic_rules" not in config: 20 | raise(Exception("trimmomatic_rules key not in config file")) 21 | for cr in CONFIG_REQS: 22 | if cr not in config["trimmomatic_rules"]: 23 | raise(Exception("{cr} not in config file".format(cr=cr))) 24 | 25 | rule trimmomatic_pe: 26 | """Trims given paired-end reads with given parameters""" 27 | input: 28 | lambda wildcards: config["trimmomatic_rules"]["reads"][wildcards.reads] 29 | output: 30 | "trimmomatic/{trim_params}/{reads}_1P.fastq.gz", 31 | "trimmomatic/{trim_params}/{reads}_2P.fastq.gz", 32 | "trimmomatic/{trim_params}/{reads}_1U.fastq.gz", 33 | "trimmomatic/{trim_params}/{reads}_2U.fastq.gz" 34 | params: 35 | trim_params=lambda wildcards: config["trimmomatic_rules"]["trim_params"][wildcards.trim_params] 36 | shell: 37 | """ 38 | time java -jar {config[trimmomatic_rules][jar]} PE \ 39 | {input} {output[0]} {output[2]} {output[1]} {output[3]} \ 40 | {params.trim_params} 41 | """ 42 | 43 | rule trimmomatic_all: 44 | """Trim all reads with all supplied trimming parameters""" 45 | input: 46 | trimmed_reads=expand("trimmomatic/{trim_params}/{reads}_{ext}.fastq.gz", reads=config["trimmomatic_rules"]["reads"], 47 | trim_params=config["trimmomatic_rules"]["trim_params"], 48 | ext=["1P","2P","1U","2U"]) 49 | -------------------------------------------------------------------------------- /common/rules/compression.rules: -------------------------------------------------------------------------------- 1 | rule gzip: 2 | input: "{prefix}" 3 | output: "{prefix}.gz" 4 | shell: """ 5 | gzip {input} 6 | """ 7 | -------------------------------------------------------------------------------- /common/rules/track_dir.rules: -------------------------------------------------------------------------------- 1 | rule track_dir: 2 | input: "{dir}/" 3 | output: "{dir}_track.txt" 4 | shell: """ 5 | find {input} 
-type f | xargs ls -l > {output} 6 | """ 7 | -------------------------------------------------------------------------------- /scheduling/Snakefile_qsub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Submit this clustering script for sbatch to snakemake with: 4 | 5 | snakemake -j 99 --debug --immediate-submit --cluster 'Snakefile-qsub.py {dependencies}' 6 | """ 7 | import argparse 8 | import sys 9 | import subprocess 10 | import os 11 | import math 12 | import errno 13 | import json 14 | from snakemake.utils import read_job_properties 15 | 16 | def make_dir(directory): 17 | """Make directory unless existing. Ignore error in the latter case.""" 18 | try: 19 | os.makedirs(directory) 20 | except OSError as exception: 21 | if exception.errno != errno.EEXIST: 22 | raise 23 | 24 | 25 | class SnakeJob: 26 | """Snakemake can generate bash scripts that can be sumbitted by a 27 | scheduler. This class reads the bash script and stores the number of the 28 | rule, name of bash file and the supplied input files.""" 29 | def __init__(self, snakebashfile, dependencies=None, config=None): 30 | self.scriptname = snakebashfile 31 | job_properties = read_job_properties(snakebashfile) 32 | self.rule = job_properties['rule'] 33 | self.ifiles = job_properties['input'] 34 | self.ofiles = job_properties['output'] 35 | if dependencies == None or len(dependencies) < 1: 36 | self.dependencies = None 37 | else: 38 | # expects snakemake like list of numbers 39 | self.dependencies = dependencies 40 | assert len(self.dependencies) >= 1 41 | self.config = config 42 | 43 | class UndefinedJobRule(Exception): 44 | """Exception in case an sbatch job has no defined resource usage in the 45 | code.""" 46 | def __init__(self, msg): 47 | self.msg = msg 48 | 49 | 50 | class SnakeJobQsub(SnakeJob): 51 | def __init__(self, snakebashfile, dependencies=None, config=None): 52 | SnakeJob.__init__(self, snakebashfile, dependencies, config) 53 | if self.dependencies == None: 54 | self.dep_str = '' 55 | else: 56 | self.dep_str = '-hold_jid ' + ','.join(["%s" % d for d in self.dependencies]) 57 | 58 | def schedule(self): 59 | """Schedules a snakemake job with sbatch and determines resource usage 60 | based on input files.""" 61 | if len(self.ofiles) > 0: 62 | # create the output directory, so slurm output can go there 63 | make_dir(os.path.dirname(os.path.abspath(self.ofiles[0]))) 64 | 65 | schedule_rule = "schedule_{0}".format(self.rule) 66 | if schedule_rule in self.config: 67 | rule_conf = self.config[schedule_rule] 68 | # If rule_conf is referring to another scheduling rule, use those 69 | # resources instead 70 | try: 71 | if rule_conf.startswith("schedule_"): 72 | rule_conf = self.config[rule_conf] 73 | except KeyError: 74 | raise UndefinedJobRule('No schedule config found for {0}'.format(rule_conf)) 75 | except AttributeError: 76 | pass 77 | 78 | attributes = { 79 | 'dep_str': self.dep_str, 80 | 'job_name': 'snakemake_{0}'.format(self.rule), 81 | 'qsub_job_path': self.config['qsub_general']['wrapper_script'], 82 | 'script_name': self.scriptname, 83 | 'queue': rule_conf['queue'], 84 | 'threads': rule_conf['threads'], 85 | 'log_file': self.ofiles[0] + '-qsub.out' if len(self.ofiles) > 0 else 'snakemake-{0}-qsub.out'.format(self.rule), 86 | 'err_file': self.ofiles[0] + '-qsub.err' if len(self.ofiles) > 0 else 'snakemake-{0}-qsub.err'.format(self.rule), 87 | 'extra_parameters': rule_conf.get('extra_parameters', "") 88 | } 89 | qsub_cmd = """qsub -o {log_file} -e 
{err_file} {dep_str} -q {queue} -pe smp {threads} \ 90 | -N {job_name} {extra_parameters} {qsub_job_path} \ 91 | '{script_name}'""".format(**attributes) 92 | else: 93 | raise UndefinedJobRule('No schedule config found for schedule_{0}'.format(self.rule)) 94 | return 2 95 | 96 | print(qsub_cmd, file=sys.stderr) 97 | popenrv = subprocess.Popen(qsub_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate() 98 | 99 | # Snakemake expects only id of submitted job on stdout for scheduling 100 | # with {dependencies} 101 | try: 102 | print("%i" % int(popenrv[0].split()[2])) 103 | except ValueError: 104 | print("Not a submitted job: %s" % popenrv[0]) 105 | sys.exit(2) 106 | 107 | if __name__ == '__main__': 108 | parser = argparse.ArgumentParser(description=__doc__, 109 | formatter_class=argparse.RawDescriptionHelpFormatter) 110 | parser.add_argument("dependencies", nargs="*", help="{{dependencies}} string given by snakemake\n") 111 | parser.add_argument("snakescript", help="Snakemake generated shell script with commands to execute snakemake rule\n") 112 | args = parser.parse_args() 113 | 114 | #print("Passed bidniz:", args.snakescript, args.dependencies, file=sys.stderr) 115 | #print("Passed args:", args, file=sys.stderr) 116 | sj = SnakeJobQsub(args.snakescript, dependencies=args.dependencies, config=json.load(open("config_qsub.json"))) 117 | try: 118 | sj.schedule() 119 | except UndefinedJobRule as err: 120 | print(err.msg, file=sys.stderr) 121 | sys.exit(2) 122 | -------------------------------------------------------------------------------- /scheduling/Snakefile_sbatch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Submit this clustering script for sbatch to snakemake with: 4 | 5 | snakemake -j 99 --debug --immediate-submit --cluster 'Snakefile-sbatch.py {dependencies}' 6 | """ 7 | import argparse 8 | import sys 9 | import subprocess 10 | import os 11 | import math 12 | import errno 13 | import json 14 | from snakemake.utils import read_job_properties 15 | 16 | def make_dir(directory): 17 | """Make directory unless existing. Ignore error in the latter case.""" 18 | try: 19 | os.makedirs(directory) 20 | except OSError as exception: 21 | if exception.errno != errno.EEXIST: 22 | raise 23 | 24 | 25 | class SnakeJob: 26 | """Snakemake can generate bash scripts that can be sumbitted by a 27 | scheduler. 
This class reads the bash script and stores the number of the 28 | rule, name of bash file and the supplied input files.""" 29 | def __init__(self, snakebashfile, dependencies=None, config=None): 30 | self.scriptname = snakebashfile 31 | job_properties = read_job_properties(snakebashfile) 32 | self.rule = job_properties['rule'] 33 | self.ifiles = job_properties['input'] 34 | self.ofiles = job_properties['output'] 35 | if dependencies == None or len(dependencies) < 1: 36 | self.dependencies = None 37 | else: 38 | # expects snakemake like list of numbers 39 | self.dependencies = dependencies 40 | assert len(self.dependencies) >= 1 41 | self.config = config 42 | 43 | class UndefinedJobRule(Exception): 44 | """Exception in case an sbatch job has no defined resource usage in the 45 | code.""" 46 | def __init__(self, msg): 47 | self.msg = msg 48 | 49 | 50 | class SnakeJobSbatch(SnakeJob): 51 | def __init__(self, snakebashfile, dependencies=None, config=None): 52 | SnakeJob.__init__(self, snakebashfile, dependencies, config) 53 | if self.dependencies == None: 54 | self.dep_str = '' 55 | else: 56 | self.dep_str = '-d ' + ','.join(["afterok:%s" % d for d in self.dependencies]) 57 | 58 | def schedule(self): 59 | """Schedules a snakemake job with sbatch and determines resource usage 60 | based on input files.""" 61 | if len(self.ofiles) > 0: 62 | # create the output directory, so slurm output can go there 63 | make_dir(os.path.dirname(os.path.abspath(self.ofiles[0]))) 64 | 65 | schedule_rule = "schedule_{0}".format(self.rule) 66 | if schedule_rule in self.config: 67 | rule_conf = self.config[schedule_rule] 68 | # If rule_conf is referring to another scheduling rule, use those 69 | # resources instead 70 | try: 71 | if rule_conf.startswith("schedule_"): 72 | rule_conf = self.config[rule_conf] 73 | except KeyError: 74 | raise UndefinedJobRule('No schedule config found for {0}'.format(rule_conf)) 75 | except AttributeError: 76 | pass 77 | 78 | attributes = { 79 | 'dep_str': self.dep_str, 80 | 'job_name': 'snakemake_{0}'.format(self.rule), 81 | 'sbatch_job_path': self.config['sbatch_general']['wrapper_script'], 82 | 'script_name': self.scriptname, 83 | 'days': rule_conf['days'], 84 | 'hours': rule_conf['hours'], 85 | 'minutes': rule_conf['minutes'], 86 | 'partition': rule_conf['partition'], 87 | 'cores': rule_conf['cores'], 88 | 'account': self.config['sbatch_general']['account'], 89 | 'log_file': self.ofiles[0] + '-slurm.out' if len(self.ofiles) > 0 else 'snakemake-{0}-slurm.out'.format(self.rule), 90 | 'extra_parameters': rule_conf.get('extra_parameters', "") 91 | } 92 | sbatch_cmd = """sbatch --output={log_file} {dep_str} -A {account} -p {partition} -n {cores} -t {days}-{hours}:{minutes}:00 \ 93 | -J {job_name} {extra_parameters} {sbatch_job_path} \ 94 | '{script_name}'""".format(**attributes) 95 | else: 96 | raise UndefinedJobRule('No schedule config found for schedule_{0}'.format(self.rule)) 97 | return 2 98 | 99 | print(sbatch_cmd, file=sys.stderr) 100 | popenrv = subprocess.Popen(sbatch_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate() 101 | 102 | # Snakemake expects only id of submitted job on stdout for scheduling 103 | # with {dependencies} 104 | try: 105 | print("%i" % int(popenrv[0].split()[-1])) 106 | except ValueError: 107 | print("Not a submitted job: %s" % popenrv[0]) 108 | sys.exit(2) 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser(description=__doc__, 112 | formatter_class=argparse.RawDescriptionHelpFormatter) 113 | 
parser.add_argument("dependencies", nargs="*", help="{{dependencies}} string given by snakemake\n") 114 | parser.add_argument("snakescript", help="Snakemake generated shell script with commands to execute snakemake rule\n") 115 | args = parser.parse_args() 116 | 117 | #print("Passed bidniz:", args.snakescript, args.dependencies, file=sys.stderr) 118 | #print("Passed args:", args, file=sys.stderr) 119 | sj = SnakeJobSbatch(args.snakescript, dependencies=args.dependencies, config=json.load(open("config_sbatch.json"))) 120 | try: 121 | sj.schedule() 122 | except UndefinedJobRule as err: 123 | print(err.msg, file=sys.stderr) 124 | sys.exit(2) 125 | --------------------------------------------------------------------------------
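Both schedulers read their resource settings from a JSON file in the working directory (config_qsub.json / config_sbatch.json) and look up one "schedule_<rulename>" entry per Snakemake rule; an entry may also be a string naming another schedule_* entry whose resources should be reused. A sketch of config_sbatch.json, with account, wrapper path and rule names as placeholders only:

    {
        "sbatch_general": {
            "account": "my_account",
            "wrapper_script": "path/to/sbatch-wrapper.sh"
        },
        "schedule_ray_assembly": {
            "days": "0", "hours": "12", "minutes": "00",
            "partition": "node", "cores": 16,
            "extra_parameters": ""
        },
        "schedule_bowtie2_map": "schedule_ray_assembly"
    }

config_qsub.json is analogous: a "qsub_general" section with "wrapper_script", and per-rule entries carrying "queue" and "threads" instead of the days/hours/minutes/partition/cores fields.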