├── README.md
├── example-01
    └── Makefile
├── example-02
    └── Makefile
├── example-03
    └── Snakefile
├── example-04
    └── Snakefile
├── example-05
    └── Snakefile
├── example-06
    └── Snakefile
└── example-07
    ├── Snakefile
    ├── Snakefile_plot
    ├── process_sims.r
    └── sim.slim

/README.md:
--------------------------------------------------------------------------------
1 | ## Materials for my blog post Understanding Snakemake
2 | 
3 | This Git repository contains the Make, Snakemake, R, and SLiM files necessary
4 | to follow the examples of my blog post, [Understanding
5 | Snakemake](https://vincebuffalo.com/blog/2020/03/04/understanding-snakemake.html).
6 | 
--------------------------------------------------------------------------------
/example-01/Makefile:
--------------------------------------------------------------------------------
1 | # Build a table of sequence lengths for the Drosophila melanogaster
2 | # BDGP6.28 genome.  Recipe lines must begin with a TAB character.
3 | 
4 | # `all` names a goal, not a file: never consider it up to date just because
5 | # a file called `all` happens to exist.
6 | .PHONY: all
7 | 
8 | # Delete a target whose recipe failed, so that a truncated download or a
9 | # half-written table is not mistaken for a finished file on the next run.
10 | .DELETE_ON_ERROR:
11 | 
12 | all: Dmel_BDGP6.28_seqlens.tsv
13 | 
14 | # Download the gzipped genome FASTA from Ensembl release 99.
15 | Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz:
16 | 	wget ftp://ftp.ensembl.org/pub/release-99/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz
17 | 
18 | # One row per sequence: name, TAB, length.  `$$` is an escaped `$`, so awk
19 | # (not make) sees $name and $seq.  File names are spelled out in full here.
20 | Dmel_BDGP6.28_seqlens.tsv: Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz
21 | 	bioawk -c fastx '{print $$name "\t" length($$seq)}' Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz > Dmel_BDGP6.28_seqlens.tsv
22 | 
--------------------------------------------------------------------------------
/example-02/Makefile:
--------------------------------------------------------------------------------
1 | # Build a table of sequence lengths for the Drosophila melanogaster
2 | # BDGP6.28 genome.  Recipe lines must begin with a TAB character.
3 | 
4 | # `all` names a goal, not a file: never consider it up to date just because
5 | # a file called `all` happens to exist.
6 | .PHONY: all
7 | 
8 | # Delete a target whose recipe failed, so that a truncated download or a
9 | # half-written table is not mistaken for a finished file on the next run.
10 | .DELETE_ON_ERROR:
11 | 
12 | all: Dmel_BDGP6.28_seqlens.tsv
13 | 
14 | # Download the gzipped genome FASTA from Ensembl release 99.
15 | Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz:
16 | 	wget ftp://ftp.ensembl.org/pub/release-99/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz
17 | 
18 | # One row per sequence: name, TAB, length.  `$<` is the first prerequisite
19 | # and `$@` is the target; `$$` is an escaped `$` that reaches awk as `$`.
20 | Dmel_BDGP6.28_seqlens.tsv: Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz
21 | 	bioawk -c fastx '{print $$name "\t" length($$seq)}' $< > $@
22 | 
--------------------------------------------------------------------------------
/example-03/Snakefile:
-------------------------------------------------------------------------------- 1 | rule all: 2 | input: 3 | "Dmel_BDGP6.28_seqlens.tsv" 4 | 5 | rule genome: 6 | output: 7 | "Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz" 8 | shell: 9 | "wget ftp://ftp.ensembl.org/pub/release-99/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz" 10 | 11 | rule seqlens: 12 | input: 13 | "Drosophila_melanogaster.BDGP6.28.dna.toplevel.fa.gz" 14 | output: 15 | "Dmel_BDGP6.28_seqlens.tsv" 16 | shell: 17 | """bioawk -c fastx '{{print $name "\t" length($seq)}}' {input} > {output}""" 18 | 19 | 20 | -------------------------------------------------------------------------------- /example-04/Snakefile: -------------------------------------------------------------------------------- 1 | chrom_filename = "Drosophila_melanogaster.BDGP6.28.dna.chromosome.{chrom}.fa.gz" 2 | 3 | chroms = ['2L', '2R', '3L', '3R', 'X', '4'] 4 | 5 | chrom_fa_files = expand(chrom_filename, chrom=chroms) 6 | 7 | print(chrom_fa_files) 8 | -------------------------------------------------------------------------------- /example-05/Snakefile: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | Ns = [100] 3 | selcoefs = 10**np.linspace(-3, -1, 3) 4 | rbps = 10**np.linspace(-8, -7, 2) 5 | nreps = np.arange(20) 6 | 7 | sim_results_pattern = "sim_{N}N_{selcoef}s_{rbp}rbp_{rep}rep.tsv" 8 | 9 | sim_results = expand(sim_results_pattern, 10 | N=Ns, selcoef=selcoefs, 11 | rbp=rbps, rep=nreps) 12 | 13 | print(sim_results) 14 | 15 | -------------------------------------------------------------------------------- /example-06/Snakefile: -------------------------------------------------------------------------------- 1 | 2 | results = "file_{sample}.txt" 3 | 4 | all_results = expand(results, sample = [1, 2, 3]) 5 | 6 | rule all: 7 | input: 8 | all_results 9 | 10 | rule sims: 11 | input: 12 | output: 13 | "file_{sample_name}.txt" 14 | run: 
15 | with open(output[0], 'w') as f: 16 | f.write(f"the sample name is {wildcards.sample_name}") 17 | -------------------------------------------------------------------------------- /example-07/Snakefile: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | Ns = [100] 3 | selcoefs = 10**np.linspace(-3, -1, 3) 4 | rbps = 10**np.linspace(-8, -7, 2) 5 | nreps = np.arange(40) 6 | 7 | sim_results_pattern = "sim_{N}N_{selcoef}s_{rbp}rbp_{rep}rep.tsv" 8 | 9 | sim_results = expand(sim_results_pattern, 10 | N=Ns, selcoef=selcoefs, 11 | rbp=rbps, rep=nreps) 12 | 13 | rule all: 14 | input: 15 | sim_results 16 | 17 | rule sims: 18 | input: 19 | output: 20 | sim_results_pattern 21 | shell: 22 | ("slim -d s={wildcards.selcoef} -d rbp={wildcards.rbp} " + 23 | "-d N={wildcards.N} -d rep={wildcards.rep} sim.slim") 24 | 25 | -------------------------------------------------------------------------------- /example-07/Snakefile_plot: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | Ns = [100] 3 | selcoefs = 10**np.linspace(-3, -1, 3) 4 | rbps = 10**np.linspace(-8, -7, 2) 5 | nreps = np.arange(20) 6 | 7 | sim_results_pattern = "sim_{N}N_{selcoef}s_{rbp}rbp_{rep}rep.tsv" 8 | 9 | sim_results = expand(sim_results_pattern, 10 | N=Ns, selcoef=selcoefs, 11 | rbp=rbps, rep=nreps) 12 | 13 | rule all: 14 | input: 15 | "recurrent_sweeps.pdf" 16 | 17 | rule sims: 18 | input: 19 | output: 20 | sim_results_pattern 21 | shell: 22 | ("slim -d s={wildcards.selcoef} -d rbp={wildcards.rbp} " + 23 | "-d N={wildcards.N} -d rep={wildcards.rep} sim.slim") 24 | 25 | rule plot: 26 | input: 27 | sim_results 28 | output: 29 | "recurrent_sweeps.pdf" 30 | shell: 31 | "Rscript process_sims.r" 32 | 33 | rule clean: 34 | shell: 35 | "rm -f {sim_results}" 36 | "rm -f recurrent_sweeps.pdf" 37 | -------------------------------------------------------------------------------- /example-07/process_sims.r: 
--------------------------------------------------------------------------------
1 | # Summarize the SLiM simulation results (sim_*.tsv files in the working
2 | # directory) and plot average heterozygosity along the simulated region.
3 | # The figure is written to recurrent_sweeps.pdf.
4 | library(tidyverse)
5 | 
6 | # Anchored at both ends so only names of the form sim_<...>.tsv are used.
7 | sims <- list.files(".", pattern = "^sim_.*\\.tsv$")
8 | if (length(sims) == 0) {
9 |   stop("No sim_*.tsv files found in ", getwd(), call. = FALSE)
10 | }
11 | 
12 | # Extract the simulation parameters encoded in result file names.
13 | #
14 | # x: character vector of names such as "sim_100N_0.01s_1e-08rbp_3rep.tsv".
15 | # Returns a tibble with one row per name and columns N, s, rbp and rep.
16 | # Stops with an error naming every file that does not follow that scheme.
17 | parse_name <- function(x) {
18 |   pattern <- "^sim_([^N]+)N_([^s]+)s_([^r]+)rbp_(\\d+)rep\\.tsv$"
19 |   # Column 1 is the whole match, columns 2-5 the four captured fields.
20 |   mat <- str_match(x, pattern)
21 |   unparsed <- is.na(mat[, 1])
22 |   if (any(unparsed)) {
23 |     stop("Cannot parse simulation file name(s): ",
24 |          paste(x[unparsed], collapse = ", "), call. = FALSE)
25 |   }
26 |   tibble(
27 |     N = as.integer(mat[, 2]),
28 |     s = as.numeric(mat[, 3]),
29 |     rbp = as.numeric(mat[, 4]),
30 |     rep = as.integer(mat[, 5])
31 |   )
32 | }
33 | 
34 | # Midpoints of interval labels as produced by cut().
35 | #
36 | # x: factor (or character vector) of labels such as "(0,10]" or "[0,10]".
37 | # Returns a numeric vector with one midpoint per element of x; NA stays NA.
38 | midpoint <- function(x) {
39 |   x <- as.factor(x)
40 |   # Parse each distinct label once, then look the result up per element.
41 |   bounds <- strsplit(levels(x), ",", fixed = TRUE)
42 |   mids <- vapply(bounds, function(b) {
43 |     mean(as.numeric(gsub("(\\)|\\]|\\[|\\()", "", b)))
44 |   }, numeric(1))
45 |   mids[as.integer(x)]
46 | }
47 | 
48 | # Assign positions to equal-width bins covering [0, total_length].
49 | #
50 | # x: numeric positions; nbins: number of bins; total_length: right edge of
51 | # the last bin.  Returns a factor of interval labels, one per element of x.
52 | bin_region <- function(x, nbins, total_length) {
53 |   # nbins bins need nbins + 1 break points.
54 |   breaks <- seq(0, total_length, length.out = nbins + 1)
55 |   # include.lowest keeps position 0 in the first bin instead of turning it
56 |   # into NA; dig.lab keeps enough digits in the labels for midpoint() to
57 |   # recover the break points (the default is 3 significant digits).
58 |   cut(x, breaks, include.lowest = TRUE, dig.lab = 12)
59 | }
60 | 
61 | # One row per (simulation file, mutation): the parameters parsed from the
62 | # file name next to the pos and het columns read from the file.
63 | d <- tibble(sims = sims) %>%
64 |   mutate(params = map(sims, parse_name)) %>%
65 |   mutate(results = map(sims, read_tsv, col_types = "id")) %>%
66 |   unnest(c(params, results))
67 | 
68 | # Average heterozygosity per bin for every parameter combination.  50e6 is
69 | # the length of the region simulated by sim.slim.
70 | ds <- d %>%
71 |   mutate(bins = bin_region(pos, 140, 50e6),
72 |          midpoint = midpoint(bins)) %>%
73 |   mutate(s = as.factor(s)) %>%
74 |   group_by(N, s, rbp, midpoint) %>%
75 |   summarize(het = mean(het)) %>%
76 |   ungroup()
77 | 
78 | p <- ggplot(ds, aes(midpoint, het, color = s)) +
79 |   geom_point() +
80 |   facet_wrap(~ rbp) +
81 |   geom_smooth(se = FALSE, span = 0.5) +
82 |   ylab("average heterozygosity") +
83 |   xlab("position")
84 | 
85 | ggsave("recurrent_sweeps.pdf", p, width = 7, height = 4)
86 | 
--------------------------------------------------------------------------------
/example-07/sim.slim:
--------------------------------------------------------------------------------
1 | // The constants N, s, rbp and rep are not defined in this script; they are
2 | // supplied on the command line with -d (see the Snakefile's sims rule).
3 | initialize() {
4 |     initializeMutationRate(1e-7);
5 |     // m1 has a fixed selection coefficient of 0.0, m2 a fixed coefficient s
6 |     initializeMutationType("m1", 0.5, "f", 0.0);
7 |     initializeMutationType("m2", 0.5, "f", s);
8 | 
9 |     // g1 and g3 only receive m1 mutations; g2 receives m1 and m2 equally
10 |     initializeGenomicElementType("g1", m1, 1);
11 |     initializeGenomicElementType("g2", c(m1, m2), c(1, 1));
12 |     initializeGenomicElementType("g3", m1, 1);
13 | 
14 |     // the rest of the whole region
15 |     gene_length = 10000;
16 |     initializeGenomicElement(g1, 0, 25e+6 - gene_length - 1);
17 |     initializeGenomicElement(g3, 25e+6 + gene_length, 50e6 - 1);
18 | 
19 |     // a genic region
20 |     initializeGenomicElement(g2, 25e+6 - gene_length, 25e+6 + gene_length - 1);
21 |     initializeRecombinationRate(rbp);
22 |     m1.convertToSubstitution = T;
23 |     m2.convertToSubstitution = T;
24 | 
25 |     // must agree with the sim_{N}N_{selcoef}s_{rbp}rbp_{rep}rep.tsv pattern
26 |     // that the Snakefile expects this run to produce
27 |     base_filename = ("sim_" + N + "N_" +
28 |                      s + "s_" + rbp + "rbp_" +
29 |                      rep + "rep.tsv");
30 |     defineConstant("filename", base_filename);
31 | 
32 | }
33 | 
34 | 1 early() {
35 |     sim.addSubpop("p1", N);
36 |     // move the output block s1 to generation 10*N
37 |     burnin = 10*N;
38 |     sim.rescheduleScriptBlock(s1, start=burnin, end=burnin);
39 | }
40 | 
41 | // Write position and heterozygosity 2*y*(1-y) of every m1 mutation, where
42 | // y is the mutation's frequency in p1.
43 | s1 late() {
44 |     neutral = sim.mutationsOfType(m1);
45 |     y = sim.mutationFrequencies(p1, neutral);
46 |     het = 2*(y * (1-y));
47 |     // header plus one "pos<TAB>het" line per mutation, written in a single
48 |     // call instead of reopening the file once per mutation
49 |     writeFile(filename, c("pos\thet", neutral.position + "\t" + het));
50 | }
51 | 
--------------------------------------------------------------------------------